From 45333fa9d68d5bf8af7a811a45fff809d01cfe8b Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 3 Jul 2024 09:43:20 -0400 Subject: [PATCH 01/68] basic benchmarks --- CMakeLists.txt | 19 +++++----- include/cufinufft/common.h | 4 ++ include/cufinufft/impl.h | 66 +++++++++++++++++++++------------ perftest/cuda/CMakeLists.txt | 1 + perftest/cuda/bench.sh | 13 +++++++ perftest/cuda/cuperftest.cu | 41 +++++++++++--------- src/cuda/1d/cufinufft1d.cu | 3 +- src/cuda/3d/spread3d_wrapper.cu | 16 +++++--- src/cuda/CMakeLists.txt | 17 ++++++++- src/cuda/common.cu | 28 ++++++++++++++ src/cuda/spreadinterp.cpp | 2 +- test/cuda/CMakeLists.txt | 8 ++++ 12 files changed, 156 insertions(+), 62 deletions(-) create mode 100644 perftest/cuda/bench.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index f53d6e28b..a6389f2ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.23) project(finufft VERSION 2.2.0 LANGUAGES C CXX) @@ -23,7 +23,7 @@ if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) endif () set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)") set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library") - +set(FINUFFT_CUDA_ARCHITECTURES "all-major" CACHE STRING "CUDA architectures to build for (e.g. 
60;70;75;)") # All options go here # sphinx tag (don't remove): @cmake_opts_start option(FINUFFT_BUILD_EXAMPLES "Whether to build the FINUFFT examples" OFF) @@ -219,30 +219,29 @@ if (FINUFFT_USE_CUDA) if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) message("FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to '60;70;75;'") message("See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.") - set(CMAKE_CUDA_ARCHITECTURES "60;70;75" CACHE STRING "" FORCE) endif () enable_language(CUDA) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) - if (BUILD_TESTING AND FINUFFT_BUILD_TESTS) + if (BUILD_TESTING OR FINUFFT_BUILD_TESTS) add_subdirectory(perftest/cuda) + add_subdirectory(test/cuda) endif () - list(APPEND INSTALL_TARGETS cufinufft cufinufft_static) endif () # Add tests defined in their own directory -if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CPU) +if (FINUFFT_USE_CPU AND (BUILD_TESTING OR FINUFFT_BUILD_TESTS)) add_subdirectory(test) add_subdirectory(perftest) endif () -if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CUDA) - add_subdirectory(test/cuda) +if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CPU) + add_subdirectory(examples) endif () -if (FINUFFT_BUILD_EXAMPLES) - add_subdirectory(examples) +if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_GPU) + add_subdirectory(examples/cuda) endif () if (FINUFFT_BUILD_FORTRAN) diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index 7bddc188e..b45519a50 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -32,6 +32,10 @@ template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, T *fwkerhalf, finufft_spread_opts opts); +template +std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, + int bin_size_z); + } // namespace common } // namespace cufinufft #endif diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h 
index 826319516..aa58c8dee 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -53,6 +53,7 @@ static void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) { } break; case 3: { switch (opts->gpu_method) { + case 0: case 1: case 2: { opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; @@ -109,17 +110,16 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } // Mult-GPU support: set the CUDA Device ID: - const int device_id = opts == NULL ? 0 : opts->gpu_device_id; + const int device_id = opts == nullptr ? 0 : opts->gpu_device_id; cufinufft::utils::WithCudaDevice device_swapper(device_id); /* allocate the plan structure, assign address to user pointer. */ - cufinufft_plan_t *d_plan = new cufinufft_plan_t; - *d_plan_ptr = d_plan; + auto *d_plan = new cufinufft_plan_t; + *d_plan_ptr = d_plan; // Zero out your struct, (sets all pointers to NULL) memset(d_plan, 0, sizeof(*d_plan)); - /* If a user has not supplied their own options, assign defaults for them. */ - if (opts == NULL) { // use default opts + if (opts == nullptr) { // use default opts cufinufft_default_opts(&(d_plan->opts)); } else { // or read from what's passed in d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect @@ -138,26 +138,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; - - /* Automatically set GPU method. */ - if (d_plan->opts.gpu_method == 0) { - /* For type 1, we default to method 2 (SM) since this is generally faster. - * However, in the special case of _double precision_ in _three dimensions_ - * with more than _three digits of precision_, there is note enough shared - * memory for this to work. As a result, we will default to method 1 (GM) in - * this special case. - * - * For type 2, we always default to method 1 (GM). 
*/ - if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3)) - d_plan->opts.gpu_method = 2; - else if (type == 1 && tol < 1e-3) - d_plan->opts.gpu_method = 1; - else if (type == 2) - d_plan->opts.gpu_method = 1; - } - - /* Setup Spreader */ using namespace cufinufft::common; + /* Setup Spreader */ + // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { delete *d_plan_ptr; @@ -180,6 +163,41 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran if (dim > 2) set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, d_plan->opts.gpu_obinsizez); + + // dynamically request the maximum amount of shared memory available + // for the spreader + + /* Automatically set GPU method. */ + if (d_plan->opts.gpu_method == 0) { + /* For type 1, we default to method 2 (SM) since this is generally faster. + * However, in the special case of _double precision_ in _three dimensions_ + * with more than _three digits of precision_, there is note enough shared + * memory for this to work. As a result, we will default to method 1 (GM) in + * this special case. + * + * For type 2, we always default to method 1 (GM). 
*/ + + // query the device for the amount of shared memory available + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, + device_id); + RETURN_IF_CUDA_ERROR + // compute the amount of shared memory required for the method + const auto shared_mem_required = + shared_memory_required(dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + printf("Shared memory available: %d KB, required: %d KB\n", shared_mem_per_block, + shared_mem_required); + if ((shared_mem_required > shared_mem_per_block)) { + d_plan->opts.gpu_method = 1; + printf("choosing method 1\n"); + } else { + d_plan->opts.gpu_method = 2; + printf("choosing method 2\n"); + } + printf("using method %d\n", d_plan->opts.gpu_method); + } + int fftsign = (iflag >= 0) ? 1 : -1; d_plan->nf1 = nf1; diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index 9d817d5f6..5f1079fde 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -1,3 +1,4 @@ add_executable(cuperftest cuperftest.cu) target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cuperftest PUBLIC cufinufft) +#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) \ No newline at end of file diff --git a/perftest/cuda/bench.sh b/perftest/cuda/bench.sh new file mode 100644 index 000000000..9832e1088 --- /dev/null +++ b/perftest/cuda/bench.sh @@ -0,0 +1,13 @@ +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e1 --N3 1e1 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e1 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4 +#./cuperftest --prec d 
--n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e4 --N2 1e4 --N3 1e4 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e7 --N2 1e7 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e8 --M 2e6 --method 0 --tol 1e-10 diff --git a/perftest/cuda/cuperftest.cu b/perftest/cuda/cuperftest.cu index f72ffb3e6..85118f1f8 100644 --- a/perftest/cuda/cuperftest.cu +++ b/perftest/cuda/cuperftest.cu @@ -275,24 +275,29 @@ template void run_test(test_options_t &test_opts) { } const int64_t nupts_tot = M * test_opts.n_runs * ntransf; - - printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); - printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), - h2d_timer.mean(), h2d_timer.std()); - printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), makeplan_timer.tot(), - makeplan_timer.mean(), makeplan_timer.std()); - printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), - setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / setpts_timer.tot(), - setpts_timer.tot() * 1E6 / nupts_tot); - printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), - execute_timer.mean(), execute_timer.std(), - nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / nupts_tot); - printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), - d2h_timer.mean(), d2h_timer.std()); - printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), - amortized_timer.mean(), amortized_timer.std(), - nupts_tot * 1000 / 
amortized_timer.tot(), - amortized_timer.tot() * 1E6 / nupts_tot); + // + // printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); + // printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), + // h2d_timer.mean(), h2d_timer.std()); + // printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), + // makeplan_timer.tot(), + // makeplan_timer.mean(), makeplan_timer.std()); + // printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), + // setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / + // setpts_timer.tot(), setpts_timer.tot() * 1E6 / nupts_tot); + // printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), + // execute_timer.mean(), execute_timer.std(), + // nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / + // nupts_tot); + // printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), + // d2h_timer.mean(), d2h_timer.std()); + // printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), + // amortized_timer.mean(), amortized_timer.std(), + // nupts_tot * 1000 / amortized_timer.tot(), + // amortized_timer.tot() * 1E6 / nupts_tot); + // print numpts / s + printf("setpts pts/s: %g\n", float(nupts_tot) * 1000 / setpts_timer.tot()); + printf("execute pts/s: %g\n", float(nupts_tot) * 1000 / execute_timer.tot()); } int main(int argc, char *argv[]) { diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 26eaff491..4ecb3b283 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -5,11 +5,10 @@ #include #include -#include +#include #include #include -#include #include #include diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index fa67f95f8..c25393e1a 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -536,14 +536,17 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ size_t sharedplanorysize = 
(bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" - << sharedplanorysize << ")" << std::endl; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + // if (sharedplanorysize > 49152) { + // std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" + // << sharedplanorysize << ")" << std::endl; + // return FINUFFT_ERR_INSUFFICIENT_SHMEM; + // } for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth) { + cudaFuncSetAttribute(spread_3d_subprob, + cudaFuncAttributeMaxDynamicSharedMemorySize, + sharedplanorysize); spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, @@ -551,6 +554,9 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ numbins[0], numbins[1], numbins[2], d_idxnupts); RETURN_IF_CUDA_ERROR } else { + cudaFuncSetAttribute(spread_3d_subprob, + cudaFuncAttributeMaxDynamicSharedMemorySize, + sharedplanorysize); spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 62d6c901c..d2928858b 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -24,26 +24,38 @@ set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE) add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC}) target_include_directories(cufinufft_common_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) -set_property(TARGET cufinufft_common_objects PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties( + cufinufft_common_objects PROPERTIES + POSITION_INDEPENDENT_CODE ON + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} +) 
add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC}) target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) set_property(TARGET cufinufft_objects PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties( + cufinufft_objects PROPERTIES + POSITION_INDEPENDENT_CODE ON + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} +) add_library(cufinufft SHARED $ $ ) +target_include_directories(cufinufft PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cufinufft CUDA::cudart CUDA::cufft CUDA::nvToolsExt) set_target_properties( cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} ) add_library(cufinufft_static STATIC $ $ ) +target_include_directories(cufinufft_static PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) if(WIN32) target_link_libraries(cufinufft_static PUBLIC CUDA::cudart CUDA::cufft CUDA::nvToolsExt) else() @@ -51,7 +63,8 @@ else() endif() set_target_properties( cufinufft_static PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" ) file(GLOB CUFINUFFT_PUBLIC_HEADERS "${CMAKE_SOURCE_DIR}/include/cufinufft*.h") diff --git a/src/cuda/common.cu b/src/cuda/common.cu index c6bf8315d..7709cdf74 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -199,6 +199,28 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex +std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, + int bin_size_z) { + printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns, + bin_size_x, bin_size_y, bin_size_z); + int adjusted_ns = bin_size_x + ((ns + 1) / 2) * 2; + + if (dim == 1) { + return adjusted_ns * sizeof(cuda_complex); + } + + adjusted_ns *= (bin_size_y + ((ns + 1) / 2) * 2); + + if (dim == 2) { + return adjusted_ns * sizeof(cuda_complex); + } + + adjusted_ns *= (bin_size_z + ((ns + 1) / 2) * 2); + + return 
adjusted_ns * sizeof(cuda_complex); +} + template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, std::complex *a, float *fwkerhalf, finufft_spread_opts opts); @@ -227,5 +249,11 @@ template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, finufft_spread_opts opts); template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, finufft_spread_opts opts); + +template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, + int bin_size_y, int bin_size_z); +template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, + int bin_size_y, int bin_size_z); + } // namespace common } // namespace cufinufft diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index 6ff91f8ca..b01d1c98f 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -69,7 +69,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet ier = FINUFFT_WARN_EPS_TOO_SMALL; } opts.nspread = ns; - opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner) + opts.ES_halfwidth = T(ns * .5); // constants to help ker eval (except Horner) opts.ES_c = 4.0 / (T)(ns * ns); T betaoverns = 2.30; // gives decent betas for default sigma=2.0 diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 23b3346da..8d77d9fdc 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -7,6 +7,14 @@ foreach(srcfile ${test_src}) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(${executable} PUBLIC cufinufft m) + set_target_properties(${executable} PROPERTIES + LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + ) + message(STATUS "Adding test ${executable}" + " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" + " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}" + ) endforeach() function(add_tests PREC REQ_TOL CHECK_TOL) From 
b95a0826a6adfcbc1c81cd46576b3006633124b4 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 3 Jul 2024 22:13:31 -0400 Subject: [PATCH 02/68] added plotting script --- CMakeLists.txt | 2 +- perftest/cuda/bench.py | 106 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 perftest/cuda/bench.py diff --git a/CMakeLists.txt b/CMakeLists.txt index a6389f2ec..15e6161a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) endif () set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)") set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library") -set(FINUFFT_CUDA_ARCHITECTURES "all-major" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)") +set(FINUFFT_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)") # All options go here # sphinx tag (don't remove): @cmake_opts_start option(FINUFFT_BUILD_EXAMPLES "Whether to build the FINUFFT examples" OFF) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py new file mode 100644 index 000000000..8812f10a4 --- /dev/null +++ b/perftest/cuda/bench.py @@ -0,0 +1,106 @@ +import matplotlib.pyplot as plt +import os +import subprocess +import pandas as pd +import numpy as np + +cwd = os.getcwd() + + +# function that runs a command line command and returns the output +# it also takes a list of arguments to pass to the command +def run_command(command, args): + # convert command and args to a string + try: + cmd = [command] + args + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + return result.stdout + except subprocess.CalledProcessError as e: + print('stdout output:\n', e.stdout) + print('stderr output:\n', e.stderr) + print("Error executing command:", e) + + +# function that builds a string from a dictionary of arguments + +def 
build_args(args): + args_list = [] + for key, value in args.items(): + args_list.append(key + " " + value) + return ' '.join(args_list) + + +# function + +# example command to run: +# nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 +# example arguments +args = {"--prec": "f", + "--n_runs": "1", + "--method": "1", + "--N1": "256", + # "--N2": "256", + # "--N3": "256", + "--M": "1E8", + "--tol": "1E-6"} +# iterate over tol from 1E-6 to 1E-1 +data = { + 'method': [], + 'throughput': [], + 'tolerance': [] +} +for i in range(1, 7): + args["--tol"] = "1E-" + str(i) + print("Running with tol = 1E-" + str(i)) + for method in ['2', '1']: + if method == '0': + data['method'].append('auto') + elif method == '1': + data['method'].append('GM') + elif method == '2': + data['method'].append('SM') + print("Method " + data['method'][-1]) + cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest", build_args(args)] + run_command("nsys", cmd) + cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", + "--format=csv", "--output", "cuperftest"] + csv = run_command("nsys", cmd) + print(csv) + dt = pd.read_csv("./cuperftest_cuda_gpu_kern_sum.csv") + # sort dt by column "Time (%)" + dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] + dt = dt.sort_values(by="Time (%)", ascending=False) + # drop all the rows with spread not in "Name" + time = dt["Avg (ns)"].values[0] + # pt/s + throughput = float(args['--M']) * 1_000_000_000 / time + data['throughput'].append(throughput) + data['tolerance'].append(args['--tol']) + +df = pd.DataFrame(data) + +# Pivot the DataFrame +pivot_df = df.pivot(index='tolerance', columns='method', values='throughput') +# Plot +pivot_df.plot(kind='bar', figsize=(10, 7)) +# Find the minimum throughput value +min_throughput = 
df['throughput'].min() + +# Calculate the smallest power of 10 +min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) + +# Adjust the plot's y-axis limits +plt.ylim(df['throughput'].min()*.95, df['throughput'].max() * 1.05) # Adding 10% for upper margin + +plt.xlabel('Tolerance') +plt.ylabel('Throughput') +plt.title('Throughput by Tolerance and Method') +plt.legend(title='Method') +plt.tight_layout() +plt.show() +plt.xlabel("Tolerance") +plt.ylabel("Points/s") +plt.savefig("bench.png") +plt.savefig("bench.svg") +plt.savefig("bench.pdf") +plt.show() \ No newline at end of file From ae55ca5b96b7167831ccdc0a2e2211b6297753bf Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 8 Jul 2024 12:09:20 -0400 Subject: [PATCH 03/68] optimised plotting --- perftest/cuda/bench.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 8812f10a4..5857b5ede 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -65,20 +65,27 @@ def build_args(args): cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", "--format=csv", "--output", "cuperftest"] csv = run_command("nsys", cmd) - print(csv) - dt = pd.read_csv("./cuperftest_cuda_gpu_kern_sum.csv") - # sort dt by column "Time (%)" - dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] - dt = dt.sort_values(by="Time (%)", ascending=False) + # print(csv) + dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") + # print(dt) + # sum the "Total Time" column of the ones that contain "fft" in name + # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]) + total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum() + print(f'total_fft: {total_fft}') # drop all the rows with spread not in "Name" - time = dt["Avg (ns)"].values[0] + dt = 
dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] + # print(dt) + # sort dt by column "Time (%)" + total_spread = dt['Duration (ns)'].sum() - total_fft + print(f'total_spread: {total_spread}') # pt/s - throughput = float(args['--M']) * 1_000_000_000 / time + throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread + print(f'throughput: {throughput}') data['throughput'].append(throughput) data['tolerance'].append(args['--tol']) df = pd.DataFrame(data) - +print(df) # Pivot the DataFrame pivot_df = df.pivot(index='tolerance', columns='method', values='throughput') # Plot @@ -90,7 +97,7 @@ def build_args(args): min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) # Adjust the plot's y-axis limits -plt.ylim(df['throughput'].min()*.95, df['throughput'].max() * 1.05) # Adding 10% for upper margin +plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.09) # Adding 10% for upper margin plt.xlabel('Tolerance') plt.ylabel('Throughput') From 16e27f0575a930633803c13ea274fd8182c4a064 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 8 Jul 2024 12:28:34 -0400 Subject: [PATCH 04/68] fixed plotting and metrics --- perftest/cuda/bench.py | 15 +++++++++++--- perftest/cuda/cuperftest.cu | 41 ++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 5857b5ede..88ef0679b 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -3,7 +3,7 @@ import subprocess import pandas as pd import numpy as np - +import io cwd = os.getcwd() @@ -61,10 +61,19 @@ def build_args(args): data['method'].append('SM') print("Method " + data['method'][-1]) cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest", build_args(args)] - run_command("nsys", cmd) + stdout = run_command("nsys", cmd) + # skip all lines starting with # in stdout + stdout = [x for x in stdout.splitlines() if not 
x.startswith("#")][:7] + stdout = '\n'.join(stdout) + # convert stdout to a dataframe from csv string + dt = pd.read_csv(io.StringIO(stdout), sep=',') + setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() + exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() + print(f'setpts pts/s: {setpts}') + print(f'exec pts/s: {exec}') cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", "--format=csv", "--output", "cuperftest"] - csv = run_command("nsys", cmd) + stdout = run_command("nsys", cmd) # print(csv) dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") # print(dt) diff --git a/perftest/cuda/cuperftest.cu b/perftest/cuda/cuperftest.cu index 85118f1f8..f72ffb3e6 100644 --- a/perftest/cuda/cuperftest.cu +++ b/perftest/cuda/cuperftest.cu @@ -275,29 +275,24 @@ template void run_test(test_options_t &test_opts) { } const int64_t nupts_tot = M * test_opts.n_runs * ntransf; - // - // printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); - // printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), - // h2d_timer.mean(), h2d_timer.std()); - // printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), - // makeplan_timer.tot(), - // makeplan_timer.mean(), makeplan_timer.std()); - // printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), - // setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / - // setpts_timer.tot(), setpts_timer.tot() * 1E6 / nupts_tot); - // printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), - // execute_timer.mean(), execute_timer.std(), - // nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / - // nupts_tot); - // printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), - // d2h_timer.mean(), d2h_timer.std()); - // printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), - // 
amortized_timer.mean(), amortized_timer.std(), - // nupts_tot * 1000 / amortized_timer.tot(), - // amortized_timer.tot() * 1E6 / nupts_tot); - // print numpts / s - printf("setpts pts/s: %g\n", float(nupts_tot) * 1000 / setpts_timer.tot()); - printf("execute pts/s: %g\n", float(nupts_tot) * 1000 / execute_timer.tot()); + + printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); + printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), + h2d_timer.mean(), h2d_timer.std()); + printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), makeplan_timer.tot(), + makeplan_timer.mean(), makeplan_timer.std()); + printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), + setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / setpts_timer.tot(), + setpts_timer.tot() * 1E6 / nupts_tot); + printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), + execute_timer.mean(), execute_timer.std(), + nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / nupts_tot); + printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), + d2h_timer.mean(), d2h_timer.std()); + printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), + amortized_timer.mean(), amortized_timer.std(), + nupts_tot * 1000 / amortized_timer.tot(), + amortized_timer.tot() * 1E6 / nupts_tot); } int main(int argc, char *argv[]) { From 49d1f21c095704277932b3f3c204ab0f70fc58f3 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 8 Jul 2024 15:28:32 -0400 Subject: [PATCH 05/68] fixed the plot script --- include/cufinufft/impl.h | 2 +- perftest/cuda/bench.py | 53 ++++++++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index aa58c8dee..a53f58c82 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -42,7 +42,7 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, static 
void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) { switch (dim) { case 1: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex; + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16384 : opts->gpu_binsizex; opts->gpu_binsizey = 1; opts->gpu_binsizez = 1; } break; diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 88ef0679b..def6e8303 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -13,6 +13,7 @@ def run_command(command, args): # convert command and args to a string try: cmd = [command] + args + print("Running command:", ' '.join(cmd)) result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) return result.stdout except subprocess.CalledProcessError as e: @@ -26,8 +27,9 @@ def run_command(command, args): def build_args(args): args_list = [] for key, value in args.items(): - args_list.append(key + " " + value) - return ' '.join(args_list) + args_list.append(key) + args_list.append(value) + return args_list # function @@ -36,9 +38,9 @@ def build_args(args): # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments args = {"--prec": "f", - "--n_runs": "1", + "--n_runs": "5", "--method": "1", - "--N1": "256", + "--N1": "65536", # "--N2": "256", # "--N3": "256", "--M": "1E8", @@ -47,7 +49,9 @@ def build_args(args): data = { 'method': [], 'throughput': [], - 'tolerance': [] + 'tolerance': [], + # 'setpts': [], + 'exec': [], } for i in range(1, 7): args["--tol"] = "1E-" + str(i) @@ -60,15 +64,17 @@ def build_args(args): elif method == '2': data['method'].append('SM') print("Method " + data['method'][-1]) - cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest", build_args(args)] + cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) stdout = run_command("nsys", 
cmd) # skip all lines starting with # in stdout + conf = [x for x in stdout.splitlines() if x.startswith("#")] + print('\n'.join(conf)) stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] stdout = '\n'.join(stdout) # convert stdout to a dataframe from csv string dt = pd.read_csv(io.StringIO(stdout), sep=',') - setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() - exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() + setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value + exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value print(f'setpts pts/s: {setpts}') print(f'exec pts/s: {exec}') cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", @@ -84,6 +90,7 @@ def build_args(args): # drop all the rows with spread not in "Name" dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] # print(dt) + # exit(0) # sort dt by column "Time (%)" total_spread = dt['Duration (ns)'].sum() - total_fft print(f'total_spread: {total_spread}') @@ -92,30 +99,46 @@ def build_args(args): print(f'throughput: {throughput}') data['throughput'].append(throughput) data['tolerance'].append(args['--tol']) + # data['setpts'].append(setpts) + data['exec'].append(exec) + df = pd.DataFrame(data) -print(df) # Pivot the DataFrame -pivot_df = df.pivot(index='tolerance', columns='method', values='throughput') +pivot_df = df.pivot(index='tolerance', columns='method') +# print(pivot_df) +# scale the throughput SM by GM +pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] +# pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM'] +# scale setpts SM by GM +pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] +# pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM'] +# remove the GM column +pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) 
+pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) # Plot pivot_df.plot(kind='bar', figsize=(10, 7)) # Find the minimum throughput value -min_throughput = df['throughput'].min() +min_val = min(df['throughput'].min(), df['exec'].min()) +max_val = max(df['throughput'].max(), df['exec'].max()) +plt.ylim(.8, 1.2) # Calculate the smallest power of 10 -min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) +# min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) # Adjust the plot's y-axis limits -plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.09) # Adding 10% for upper margin +# plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009) # Adding 10% for upper margin +# plot an horizontal line at 1 with label "GM" +plt.axhline(y=1, color='k', linestyle='--', label='GM') plt.xlabel('Tolerance') -plt.ylabel('Throughput') +plt.ylabel('Throughput (% of GM)') plt.title('Throughput by Tolerance and Method') plt.legend(title='Method') plt.tight_layout() plt.show() plt.xlabel("Tolerance") -plt.ylabel("Points/s") +plt.ylabel("Points/s (% of GM)") plt.savefig("bench.png") plt.savefig("bench.svg") plt.savefig("bench.pdf") From 2fdae684b2a6044f1d5bca9302666779b6272fd5 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 8 Jul 2024 19:19:48 -0400 Subject: [PATCH 06/68] bin_size_x is as function of the shared memory available --- include/cufinufft/common.h | 28 ++++++++++++++ include/cufinufft/impl.h | 36 +----------------- perftest/cuda/bench.py | 47 ++++++++++++++++++----- src/cuda/1d/spread1d_wrapper.cu | 14 ++++--- src/cuda/3d/spread3d_wrapper.cu | 18 +++------ src/cuda/common.cu | 67 ++++++++++++++++++++++++++++++++- 6 files changed, 146 insertions(+), 64 deletions(-) diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index b45519a50..33d8a0d86 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -36,6 +36,34 @@ template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int 
bin_size_z); +template +void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts); + +template +auto cufinufft_set_shared_memory(V *kernel, const int dim, + const cufinufft_plan_t &d_plan) { + int device_id; + cudaGetDevice(&device_id); + const auto shared_mem_required = + shared_memory_required(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex, + d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez); + int shared_mem_per_block{}; + const auto err = cudaDeviceGetAttribute( + &shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + if (err != cudaSuccess) { + return err; + } + if (shared_mem_required > shared_mem_per_block) { + fprintf(stderr, + "Error: Shared memory required per block is %zu bytes, but the device " + "supports only %d bytes.\n", + shared_mem_required, shared_mem_per_block); + return err; + } + return cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_mem_required); +} + } // namespace common } // namespace cufinufft #endif diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index a53f58c82..4a1c6ae31 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -39,40 +39,6 @@ template int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -static void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) { - switch (dim) { - case 1: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16384 : opts->gpu_binsizex; - opts->gpu_binsizey = 1; - opts->gpu_binsizez = 1; - } break; - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; - opts->gpu_binsizez = 1; - } break; - case 3: { - switch (opts->gpu_method) { - case 0: - case 1: - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 
16 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez; - } break; - case 4: { - opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex; - opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey; - opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez; - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez; - } break; - } - } break; - } -} - template int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntransf, T tol, cufinufft_plan_t **d_plan_ptr, cufinufft_opts *opts) { @@ -153,7 +119,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->mt = nmodes[1]; d_plan->mu = nmodes[2]; - cufinufft_setup_binsize(type, dim, &d_plan->opts); + cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, d_plan->opts.gpu_obinsizex); diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index def6e8303..1e1f4838e 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -15,7 +15,7 @@ def run_command(command, args): cmd = [command] + args print("Running command:", ' '.join(cmd)) result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - return result.stdout + return result.stdout, result.stderr except subprocess.CalledProcessError as e: print('stdout output:\n', e.stdout) print('stderr output:\n', e.stderr) @@ -38,9 +38,9 @@ def build_args(args): # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments args = {"--prec": "f", - "--n_runs": "5", - "--method": "1", - "--N1": "65536", + 
"--n_runs": "10", + "--method": "0", + "--N1": "16777216", # "--N2": "256", # "--N3": "256", "--M": "1E8", @@ -53,10 +53,26 @@ def build_args(args): # 'setpts': [], 'exec': [], } +warmup = {"--prec": "f", + "--n_runs": "1", + "--method": "0", + "--N1": "256", + # "--N2": "256", + # "--N3": "256", + "--M": "256", + "--tol": "1E-1"} +cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(warmup) +print("Warmup") +stdout, stderr = run_command("nsys", cmd) +print("Benchmarking") +if stderr != '': + print(stderr) + exit(0) for i in range(1, 7): args["--tol"] = "1E-" + str(i) print("Running with tol = 1E-" + str(i)) for method in ['2', '1']: + args["--method"] = method if method == '0': data['method'].append('auto') elif method == '1': @@ -65,7 +81,10 @@ def build_args(args): data['method'].append('SM') print("Method " + data['method'][-1]) cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) - stdout = run_command("nsys", cmd) + stdout, stderr = run_command("nsys", cmd) + if stderr != '': + print(stderr) + exit(0) # skip all lines starting with # in stdout conf = [x for x in stdout.splitlines() if x.startswith("#")] print('\n'.join(conf)) @@ -79,7 +98,10 @@ def build_args(args): print(f'exec pts/s: {exec}') cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", "--format=csv", "--output", "cuperftest"] - stdout = run_command("nsys", cmd) + stdout, _ = run_command("nsys", cmd) + # remove format from cmd + cmd = cmd[:-3] + # print(run_command("nsys", cmd)) # print(csv) dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") # print(dt) @@ -94,6 +116,9 @@ def build_args(args): # sort dt by column "Time (%)" total_spread = dt['Duration (ns)'].sum() - total_fft print(f'total_spread: {total_spread}') + if total_fft > total_spread: + print("Warning: total_fft > 
total_spread") + # exit(0) # pt/s throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread print(f'throughput: {throughput}') @@ -116,12 +141,16 @@ def build_args(args): # remove the GM column pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) + +print(pivot_df) # Plot pivot_df.plot(kind='bar', figsize=(10, 7)) # Find the minimum throughput value -min_val = min(df['throughput'].min(), df['exec'].min()) -max_val = max(df['throughput'].max(), df['exec'].max()) -plt.ylim(.8, 1.2) +min_val = min(pivot_df[('exec', 'SM')].min(), pivot_df[('throughput', 'SM')].min(), 1) +max_val = max(pivot_df[('exec', 'SM')].max(), pivot_df[('throughput', 'SM')].max(), 0) +print(min_val, max_val) +plt.ylim(min_val * .99, max_val * 1.01) +# plt.ylim(.8, 1.2) # Calculate the smallest power of 10 # min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index 26fd5024c..36fa2bef9 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -251,15 +252,14 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = - (bin_size_x + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread1d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + const auto sharedplanorysize = + shared_memory_required(1, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan); + RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, 
d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, @@ -268,6 +268,8 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) } } else { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan); + RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index c25393e1a..6c851389c 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -7,9 +7,11 @@ #include #include +#include #include #include #include + using namespace cufinufft::common; using namespace cufinufft::memtransfer; @@ -536,17 +538,10 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - // if (sharedplanorysize > 49152) { - // std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" - // << sharedplanorysize << ")" << std::endl; - // return FINUFFT_ERR_INSUFFICIENT_SHMEM; - // } - for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth) { - cudaFuncSetAttribute(spread_3d_subprob, - cudaFuncAttributeMaxDynamicSharedMemorySize, - sharedplanorysize); + cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan); + RETURN_IF_CUDA_ERROR spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, @@ -554,9 +549,8 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ numbins[0], numbins[1], numbins[2], d_idxnupts); RETURN_IF_CUDA_ERROR } else { - cudaFuncSetAttribute(spread_3d_subprob, - cudaFuncAttributeMaxDynamicSharedMemorySize, - 
sharedplanorysize); + cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan); + RETURN_IF_CUDA_ERROR spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 7709cdf74..5e32cb101 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -202,8 +202,8 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int bin_size_z) { - printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns, - bin_size_x, bin_size_y, bin_size_z); + // printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns, + // bin_size_x, bin_size_y, bin_size_z); int adjusted_ns = bin_size_x + ((ns + 1) / 2) * 2; if (dim == 1) { @@ -221,6 +221,65 @@ std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size return adjusted_ns * sizeof(cuda_complex); } +template +void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { + int shared_mem_per_block{}, device_id{}; + switch (dim) { + case 1: { + switch (opts->gpu_method) { + case 0: + case 1: + case 2: + if (opts->gpu_binsizex < 0) { + cudaGetDevice(&device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + const int bin_size = + shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; + // find the power of 2 that is less than bin_size + const int exponent = std::log2(bin_size); + opts->gpu_binsizex = 1 << (exponent - 1); + // printf("bin_size: %d, gpu_binsizex: %d\n", bin_size, 
+ // opts->gpu_binsizex); + } + break; + } + opts->gpu_binsizey = 1; + opts->gpu_binsizez = 1; + } break; + case 2: { + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; + opts->gpu_binsizez = 1; + } break; + case 3: { + switch (opts->gpu_method) { + case 0: + case 1: + case 2: { + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey; + opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez; + } break; + case 4: { + opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex; + opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey; + opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez; + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey; + opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 
4 : opts->gpu_binsizez; + } break; + } + } break; + } +} + template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, std::complex *a, float *fwkerhalf, finufft_spread_opts opts); @@ -255,5 +314,9 @@ template std::size_t shared_memory_required(int dim, int ns, int bin_size template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int bin_size_z); +template void cufinufft_setup_binsize(int type, int ns, int dim, + cufinufft_opts *opts); +template void cufinufft_setup_binsize(int type, int ns, int dim, + cufinufft_opts *opts); } // namespace common } // namespace cufinufft From c0d992377dc66808d53b0bc5ebe9f8aae3f33fa4 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 8 Jul 2024 19:22:17 -0400 Subject: [PATCH 07/68] bin_size_x is as function of the shared memory available --- perftest/cuda/bench.py | 1 + 1 file changed, 1 insertion(+) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 1e1f4838e..5269a3f45 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -56,6 +56,7 @@ def build_args(args): warmup = {"--prec": "f", "--n_runs": "1", "--method": "0", + "--sort": "0", "--N1": "256", # "--N2": "256", # "--N3": "256", From 907797c82fe6ce839385348644f77d11cd5b4a34 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 9 Jul 2024 14:19:23 -0400 Subject: [PATCH 08/68] minor optimizations in 1D --- .../contrib/ker_horner_allw_loop.inc | 362 +++++++++--------- include/cufinufft/spreadinterp.h | 1 + perftest/cuda/bench.py | 13 +- src/cuda/1d/spreadinterp1d.cuh | 285 +++++++------- src/cuda/common.cu | 4 +- 5 files changed, 337 insertions(+), 328 deletions(-) diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index 32f2cff00..f905c14f0 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -2,215 +2,215 @@ // Authors: Alex Barnett & Ludvig af Klinteberg. 
// (C) 2018, The Simons Foundation, Inc. if (w==2) { - CUFINUFFT_FLT c0[] = {4.5147043243215343E+01, 4.5147043243215336E+01}; - CUFINUFFT_FLT c1[] = {5.7408070938221300E+01, -5.7408070938221293E+01}; - CUFINUFFT_FLT c2[] = {-1.8395117920046662E+00, -1.8395117920046617E+00}; - CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01}; - CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00}; - CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00}; + constexpr CUFINUFFT_FLT c0[] = {4.5147043243215343E+01, 4.5147043243215336E+01}; + constexpr CUFINUFFT_FLT c1[] = {5.7408070938221300E+01, -5.7408070938221293E+01}; + constexpr CUFINUFFT_FLT c2[] = {-1.8395117920046662E+00, -1.8395117920046617E+00}; + constexpr CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01}; + constexpr CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00}; + constexpr CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00}; for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); } else if (w==3) { - CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02}; - CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, -3.1653018868907077E+02}; - CUFINUFFT_FLT c2[] = {1.7742692790454473E+02, -3.3149255274727807E+02, 1.7742692791117116E+02}; - CUFINUFFT_FLT c3[] = {-1.5357716116473128E+01, -5.1917435849174007E-16, 1.5357716122720189E+01}; - CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01}; - CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; - CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; + constexpr CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02}; + constexpr CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, 
-3.1653018868907077E+02}; + constexpr CUFINUFFT_FLT c2[] = {1.7742692790454473E+02, -3.3149255274727807E+02, 1.7742692791117116E+02}; + constexpr CUFINUFFT_FLT c3[] = {-1.5357716116473128E+01, -5.1917435849174007E-16, 1.5357716122720189E+01}; + constexpr CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01}; + constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; + constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); } else if (w==4) { - CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; - CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03}; - CUFINUFFT_FLT c2[] = {1.4186910680718343E+03, -1.3995339862725584E+03, -1.3995339862725591E+03, 1.4186910680718338E+03}; - CUFINUFFT_FLT c3[] = {5.1133995502497419E+02, -1.4191608683682987E+03, 1.4191608683682980E+03, -5.1133995502497419E+02}; - CUFINUFFT_FLT c4[] = {-4.8293622641173549E+01, 3.9393732546136526E+01, 3.9393732546137308E+01, -4.8293622641173634E+01}; - CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01}; - CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; - CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; + constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; + constexpr CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03}; + constexpr CUFINUFFT_FLT c2[] = 
{1.4186910680718343E+03, -1.3995339862725584E+03, -1.3995339862725591E+03, 1.4186910680718338E+03}; + constexpr CUFINUFFT_FLT c3[] = {5.1133995502497419E+02, -1.4191608683682987E+03, 1.4191608683682980E+03, -5.1133995502497419E+02}; + constexpr CUFINUFFT_FLT c4[] = {-4.8293622641173549E+01, 3.9393732546136526E+01, 3.9393732546137308E+01, -4.8293622641173634E+01}; + constexpr CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01}; + constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; + constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); } else if (w==5) { - CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; - CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03}; - CUFINUFFT_FLT c2[] = {3.6092689177271218E+03, 7.7501368899498566E+03, -2.2704627332474989E+04, 7.7501368899498684E+03, 3.6092689177271227E+03}; - CUFINUFFT_FLT c3[] = {1.9990077310495410E+03, -3.8875294641277278E+03, 3.8628399128660033E-12, 3.8875294641277342E+03, -1.9990077310495410E+03}; - CUFINUFFT_FLT c4[] = {4.0071733590403858E+02, -1.5861137916762520E+03, 2.3839858699098813E+03, -1.5861137916762589E+03, 4.0071733590403880E+02}; - CUFINUFFT_FLT c5[] = {-9.1301168206167731E+01, 1.2316471075214690E+02, 1.0425607383569405E-11, -1.2316471075215136E+02, 9.1301168206167446E+01}; - CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, -1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01}; - CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 
2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; - CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; + constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; + constexpr CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03}; + constexpr CUFINUFFT_FLT c2[] = {3.6092689177271218E+03, 7.7501368899498566E+03, -2.2704627332474989E+04, 7.7501368899498684E+03, 3.6092689177271227E+03}; + constexpr CUFINUFFT_FLT c3[] = {1.9990077310495410E+03, -3.8875294641277278E+03, 3.8628399128660033E-12, 3.8875294641277342E+03, -1.9990077310495410E+03}; + constexpr CUFINUFFT_FLT c4[] = {4.0071733590403858E+02, -1.5861137916762520E+03, 2.3839858699098813E+03, -1.5861137916762589E+03, 4.0071733590403880E+02}; + constexpr CUFINUFFT_FLT c5[] = {-9.1301168206167731E+01, 1.2316471075214690E+02, 1.0425607383569405E-11, -1.2316471075215136E+02, 9.1301168206167446E+01}; + constexpr CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, -1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01}; + constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; + constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); } else if (w==6) { - CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; - CUFINUFFT_FLT 
c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03}; - CUFINUFFT_FLT c2[] = {1.0023404568475091E+04, 9.0916650498360163E+04, -1.0095927514054625E+05, -1.0095927514054641E+05, 9.0916650498360133E+04, 1.0023404568484631E+04}; - CUFINUFFT_FLT c3[] = {7.2536109410387417E+03, 4.8347162752603172E+03, -5.0512736602018493E+04, 5.0512736602018464E+04, -4.8347162752602935E+03, -7.2536109410297549E+03}; - CUFINUFFT_FLT c4[] = {2.7021878300949775E+03, -7.8773465553971982E+03, 5.2105876478344171E+03, 5.2105876478344435E+03, -7.8773465553972501E+03, 2.7021878301048719E+03}; - CUFINUFFT_FLT c5[] = {3.2120291706547602E+02, -1.8229189469937089E+03, 3.7928113414428362E+03, -3.7928113414427862E+03, 1.8229189469936987E+03, -3.2120291705638107E+02}; - CUFINUFFT_FLT c6[] = {-1.2051267090537493E+02, 2.2400507411396228E+02, -1.2506575852544464E+02, -1.2506575852534223E+02, 2.2400507411397808E+02, -1.2051267089640046E+02}; - CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01}; - CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00}; - CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00}; + constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; + constexpr CUFINUFFT_FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03}; + constexpr CUFINUFFT_FLT c2[] = {1.0023404568475091E+04, 9.0916650498360163E+04, -1.0095927514054625E+05, 
-1.0095927514054641E+05, 9.0916650498360133E+04, 1.0023404568484631E+04}; + constexpr CUFINUFFT_FLT c3[] = {7.2536109410387417E+03, 4.8347162752603172E+03, -5.0512736602018493E+04, 5.0512736602018464E+04, -4.8347162752602935E+03, -7.2536109410297549E+03}; + constexpr CUFINUFFT_FLT c4[] = {2.7021878300949775E+03, -7.8773465553971982E+03, 5.2105876478344171E+03, 5.2105876478344435E+03, -7.8773465553972501E+03, 2.7021878301048719E+03}; + constexpr CUFINUFFT_FLT c5[] = {3.2120291706547602E+02, -1.8229189469937089E+03, 3.7928113414428362E+03, -3.7928113414427862E+03, 1.8229189469936987E+03, -3.2120291705638107E+02}; + constexpr CUFINUFFT_FLT c6[] = {-1.2051267090537493E+02, 2.2400507411396228E+02, -1.2506575852544464E+02, -1.2506575852534223E+02, 2.2400507411397808E+02, -1.2051267089640046E+02}; + constexpr CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01}; + constexpr CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00}; + constexpr CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00}; for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==7) { - CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03}; - CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, -1.5290160332958067E+04}; - CUFINUFFT_FLT c2[] = {2.4458227486779248E+04, 5.3904618484139408E+05, 
2.4315566181017426E+05, -1.6133959371974319E+06, 2.4315566181017403E+05, 5.3904618484139384E+05, 2.4458227486795098E+04}; - CUFINUFFT_FLT c3[] = {2.1166189345881645E+04, 1.3382732160223144E+05, -3.3113450969689671E+05, -6.5160817568418758E-10, 3.3113450969689724E+05, -1.3382732160223127E+05, -2.1166189345866882E+04}; - CUFINUFFT_FLT c4[] = {1.0542795672344866E+04, -7.0739172265096213E+03, -6.5563293056048453E+04, 1.2429734005960147E+05, -6.5563293056048846E+04, -7.0739172265096058E+03, 1.0542795672361211E+04}; - CUFINUFFT_FLT c5[] = {2.7903491906228414E+03, -1.0975382873973065E+04, 1.3656979541144814E+04, 1.2638008605419305E-09, -1.3656979541144177E+04, 1.0975382873973065E+04, -2.7903491906078302E+03}; - CUFINUFFT_FLT c6[] = {1.6069721418053450E+02, -1.5518707872250775E+03, 4.3634273936637373E+03, -5.9891976420593228E+03, 4.3634273936637110E+03, -1.5518707872251396E+03, 1.6069721419533406E+02}; - CUFINUFFT_FLT c7[] = {-1.2289277373867886E+02, 2.8583630927743752E+02, -2.8318194617301111E+02, -8.6523823682922648E-10, 2.8318194617373905E+02, -2.8583630927755564E+02, 1.2289277375320185E+02}; - CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01}; - CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01}; - CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00}; + constexpr CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03}; + constexpr CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 
3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, -1.5290160332958067E+04}; + constexpr CUFINUFFT_FLT c2[] = {2.4458227486779248E+04, 5.3904618484139408E+05, 2.4315566181017426E+05, -1.6133959371974319E+06, 2.4315566181017403E+05, 5.3904618484139384E+05, 2.4458227486795098E+04}; + constexpr CUFINUFFT_FLT c3[] = {2.1166189345881645E+04, 1.3382732160223144E+05, -3.3113450969689671E+05, -6.5160817568418758E-10, 3.3113450969689724E+05, -1.3382732160223127E+05, -2.1166189345866882E+04}; + constexpr CUFINUFFT_FLT c4[] = {1.0542795672344866E+04, -7.0739172265096213E+03, -6.5563293056048453E+04, 1.2429734005960147E+05, -6.5563293056048846E+04, -7.0739172265096058E+03, 1.0542795672361211E+04}; + constexpr CUFINUFFT_FLT c5[] = {2.7903491906228414E+03, -1.0975382873973065E+04, 1.3656979541144814E+04, 1.2638008605419305E-09, -1.3656979541144177E+04, 1.0975382873973065E+04, -2.7903491906078302E+03}; + constexpr CUFINUFFT_FLT c6[] = {1.6069721418053450E+02, -1.5518707872250775E+03, 4.3634273936637373E+03, -5.9891976420593228E+03, 4.3634273936637110E+03, -1.5518707872251396E+03, 1.6069721419533406E+02}; + constexpr CUFINUFFT_FLT c7[] = {-1.2289277373867886E+02, 2.8583630927743752E+02, -2.8318194617301111E+02, -8.6523823682922648E-10, 2.8318194617373905E+02, -2.8583630927755564E+02, 1.2289277375320185E+02}; + constexpr CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01}; + constexpr CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01}; + constexpr CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00}; for (int i=0; 
i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==8) { - CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03}; - CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04}; - CUFINUFFT_FLT c2[] = {5.4488498478251720E+04, 2.4101183255475122E+06, 6.4554051283428278E+06, -8.9200440393090621E+06, -8.9200440393090658E+06, 6.4554051283428278E+06, 2.4101183255475122E+06, 5.4488498478251720E+04}; - CUFINUFFT_FLT c3[] = {5.3926359802542131E+04, 9.0469037926849292E+05, -6.0897036277696094E+05, -3.0743852105800072E+06, 3.0743852105800039E+06, 6.0897036277696339E+05, -9.0469037926849292E+05, -5.3926359802542116E+04}; - CUFINUFFT_FLT c4[] = {3.2444118016247583E+04, 1.3079802224392195E+05, -5.8652889370128501E+05, 4.2333306008153502E+05, 4.2333306008153904E+05, -5.8652889370128524E+05, 1.3079802224392162E+05, 3.2444118016247587E+04}; - CUFINUFFT_FLT c5[] = {1.1864306345505289E+04, -2.2700360645707628E+04, -5.0713607251413239E+04, 1.8308704458211805E+05, -1.8308704458211269E+05, 5.0713607251412053E+04, 2.2700360645707922E+04, -1.1864306345505289E+04}; - CUFINUFFT_FLT c6[] = {2.2812256770903182E+03, -1.1569135767378117E+04, 2.0942387020799080E+04, -1.1661592834949530E+04, -1.1661592834949715E+04, 2.0942387020801576E+04, -1.1569135767377431E+04, 2.2812256770903446E+03}; - CUFINUFFT_FLT c7[] = {8.5503535636805026E+00, -9.7513976461269635E+02, 3.8242995179157779E+03, -6.9201295567256420E+03, 6.9201295567222760E+03, -3.8242995179195914E+03, 9.7513976461218783E+02, -8.5503535636857091E+00}; - CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, 
-3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02}; - CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01}; - CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, -1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01}; + constexpr CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03}; + constexpr CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04}; + constexpr CUFINUFFT_FLT c2[] = {5.4488498478251720E+04, 2.4101183255475122E+06, 6.4554051283428278E+06, -8.9200440393090621E+06, -8.9200440393090658E+06, 6.4554051283428278E+06, 2.4101183255475122E+06, 5.4488498478251720E+04}; + constexpr CUFINUFFT_FLT c3[] = {5.3926359802542131E+04, 9.0469037926849292E+05, -6.0897036277696094E+05, -3.0743852105800072E+06, 3.0743852105800039E+06, 6.0897036277696339E+05, -9.0469037926849292E+05, -5.3926359802542116E+04}; + constexpr CUFINUFFT_FLT c4[] = {3.2444118016247583E+04, 1.3079802224392195E+05, -5.8652889370128501E+05, 4.2333306008153502E+05, 4.2333306008153904E+05, -5.8652889370128524E+05, 1.3079802224392162E+05, 3.2444118016247587E+04}; + constexpr CUFINUFFT_FLT c5[] = {1.1864306345505289E+04, -2.2700360645707628E+04, -5.0713607251413239E+04, 1.8308704458211805E+05, -1.8308704458211269E+05, 5.0713607251412053E+04, 2.2700360645707922E+04, -1.1864306345505289E+04}; + constexpr CUFINUFFT_FLT c6[] 
= {2.2812256770903182E+03, -1.1569135767378117E+04, 2.0942387020799080E+04, -1.1661592834949530E+04, -1.1661592834949715E+04, 2.0942387020801576E+04, -1.1569135767377431E+04, 2.2812256770903446E+03}; + constexpr CUFINUFFT_FLT c7[] = {8.5503535636805026E+00, -9.7513976461269635E+02, 3.8242995179157779E+03, -6.9201295567256420E+03, 6.9201295567222760E+03, -3.8242995179195914E+03, 9.7513976461218783E+02, -8.5503535636857091E+00}; + constexpr CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, -3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02}; + constexpr CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01}; + constexpr CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, -1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01}; for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==9) { - CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04}; - CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, -3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04}; - CUFINUFFT_FLT c2[] = {1.1335001341875963E+05, 9.0726133144784775E+06, 5.3501544534038082E+07, -2.6789524644150439E+05, -1.2483923718899380E+08, -2.6789524644173466E+05, 5.3501544534038067E+07, 
9.0726133144785129E+06, 1.1335001341875964E+05}; - CUFINUFFT_FLT c3[] = {1.2489113703229750E+05, 4.3035547171861930E+06, 6.3021978510598894E+06, -2.6014941986659020E+07, 2.8258041381448560E-08, 2.6014941986659355E+07, -6.3021978510598978E+06, -4.3035547171862079E+06, -1.2489113703229750E+05}; - CUFINUFFT_FLT c4[] = {8.6425493435991229E+04, 1.0891182836653332E+06, -2.0713033564200329E+06, -2.8994941183505855E+06, 7.5905338661207352E+06, -2.8994941183504057E+06, -2.0713033564200525E+06, 1.0891182836653360E+06, 8.6425493435991244E+04}; - CUFINUFFT_FLT c5[] = {3.8657354724013807E+04, 7.9936390113327987E+04, -7.0458265546792350E+05, 1.0151095605715724E+06, 8.7808418931366203E-08, -1.0151095605718571E+06, 7.0458265546792292E+05, -7.9936390113333473E+04, -3.8657354724013807E+04}; - CUFINUFFT_FLT c6[] = {1.0779131453134632E+04, -3.3466718311303863E+04, -1.3245366619006214E+04, 1.8238470515351585E+05, -2.9285656292984058E+05, 1.8238470515350348E+05, -1.3245366619016511E+04, -3.3466718311298035E+04, 1.0779131453134652E+04}; - CUFINUFFT_FLT c7[] = {1.4992527030548451E+03, -9.7024371533906651E+03, 2.3216330734046409E+04, -2.3465262819075571E+04, -3.7031099746142328E-08, 2.3465262819179152E+04, -2.3216330734079289E+04, 9.7024371533883768E+03, -1.4992527030548429E+03}; - CUFINUFFT_FLT c8[] = {-7.9857427421137089E+01, -4.0585588534737309E+02, 2.6054813773474157E+03, -6.1806593581211082E+03, 8.0679596873751289E+03, -6.1806593581509942E+03, 2.6054813773256465E+03, -4.0585588535330419E+02, -7.9857427421164303E+01}; - CUFINUFFT_FLT c9[] = {-7.1572272057931258E+01, 2.2785637019446185E+02, -3.9109820765219445E+02, 3.3597424707607246E+02, 1.7793576396134983E-08, -3.3597424727519928E+02, 3.9109820766111056E+02, -2.2785637019102543E+02, 7.1572272057951565E+01}; - CUFINUFFT_FLT c10[] = {-9.8886360698029030E+00, 3.5359026948517517E+01, -8.5251867695464824E+01, 1.4285748015591199E+02, -1.6935269673908536E+02, 1.4285748008591776E+02, -8.5251867720434134E+01, 3.5359026945818123E+01, 
-9.8886360698009241E+00}; - CUFINUFFT_FLT c11[] = {5.4050464453063796E-01, -1.7215219066697895E+00, 2.8631741265441102E+00, -2.3817977385844018E+00, -1.0173343205540475E-08, 2.3817977172440110E+00, -2.8631741497139487E+00, 1.7215219081941548E+00, -5.4050464453541269E-01}; + constexpr CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04}; + constexpr CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, -3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04}; + constexpr CUFINUFFT_FLT c2[] = {1.1335001341875963E+05, 9.0726133144784775E+06, 5.3501544534038082E+07, -2.6789524644150439E+05, -1.2483923718899380E+08, -2.6789524644173466E+05, 5.3501544534038067E+07, 9.0726133144785129E+06, 1.1335001341875964E+05}; + constexpr CUFINUFFT_FLT c3[] = {1.2489113703229750E+05, 4.3035547171861930E+06, 6.3021978510598894E+06, -2.6014941986659020E+07, 2.8258041381448560E-08, 2.6014941986659355E+07, -6.3021978510598978E+06, -4.3035547171862079E+06, -1.2489113703229750E+05}; + constexpr CUFINUFFT_FLT c4[] = {8.6425493435991229E+04, 1.0891182836653332E+06, -2.0713033564200329E+06, -2.8994941183505855E+06, 7.5905338661207352E+06, -2.8994941183504057E+06, -2.0713033564200525E+06, 1.0891182836653360E+06, 8.6425493435991244E+04}; + constexpr CUFINUFFT_FLT c5[] = {3.8657354724013807E+04, 7.9936390113327987E+04, -7.0458265546792350E+05, 1.0151095605715724E+06, 8.7808418931366203E-08, -1.0151095605718571E+06, 7.0458265546792292E+05, -7.9936390113333473E+04, -3.8657354724013807E+04}; + constexpr CUFINUFFT_FLT c6[] = {1.0779131453134632E+04, -3.3466718311303863E+04, -1.3245366619006214E+04, 1.8238470515351585E+05, -2.9285656292984058E+05, 1.8238470515350348E+05, 
-1.3245366619016511E+04, -3.3466718311298035E+04, 1.0779131453134652E+04}; + constexpr CUFINUFFT_FLT c7[] = {1.4992527030548451E+03, -9.7024371533906651E+03, 2.3216330734046409E+04, -2.3465262819075571E+04, -3.7031099746142328E-08, 2.3465262819179152E+04, -2.3216330734079289E+04, 9.7024371533883768E+03, -1.4992527030548429E+03}; + constexpr CUFINUFFT_FLT c8[] = {-7.9857427421137089E+01, -4.0585588534737309E+02, 2.6054813773474157E+03, -6.1806593581211082E+03, 8.0679596873751289E+03, -6.1806593581509942E+03, 2.6054813773256465E+03, -4.0585588535330419E+02, -7.9857427421164303E+01}; + constexpr CUFINUFFT_FLT c9[] = {-7.1572272057931258E+01, 2.2785637019446185E+02, -3.9109820765219445E+02, 3.3597424707607246E+02, 1.7793576396134983E-08, -3.3597424727519928E+02, 3.9109820766111056E+02, -2.2785637019102543E+02, 7.1572272057951565E+01}; + constexpr CUFINUFFT_FLT c10[] = {-9.8886360698029030E+00, 3.5359026948517517E+01, -8.5251867695464824E+01, 1.4285748015591199E+02, -1.6935269673908536E+02, 1.4285748008591776E+02, -8.5251867720434134E+01, 3.5359026945818123E+01, -9.8886360698009241E+00}; + constexpr CUFINUFFT_FLT c11[] = {5.4050464453063796E-01, -1.7215219066697895E+00, 2.8631741265441102E+00, -2.3817977385844018E+00, -1.0173343205540475E-08, 2.3817977172440110E+00, -2.8631741497139487E+00, 1.7215219081941548E+00, -5.4050464453541269E-01}; for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); } else if (w==10) { - CUFINUFFT_FLT c0[] = {2.2594586605749279E+04, 1.3595989066786604E+07, 4.4723032442444921E+08, 3.3781755837397542E+09, 8.6836783895849838E+09, 8.6836783895849819E+09, 3.3781755837397518E+09, 4.4723032442444921E+08, 1.3595989066786485E+07, 2.2594586605749315E+04}; - CUFINUFFT_FLT c1[] = {1.0729981697645642E+05, 3.0651490267742988E+07, 5.9387966085130477E+08, 2.4434902657508340E+09, 2.0073077861288924E+09, -2.0073077861288958E+09, 
-2.4434902657508330E+09, -5.9387966085130465E+08, -3.0651490267742820E+07, -1.0729981697645631E+05}; - CUFINUFFT_FLT c2[] = {2.2340399734184594E+05, 3.0258214643190444E+07, 3.1512411458738214E+08, 4.3618276932319784E+08, -7.8178848450497377E+08, -7.8178848450497079E+08, 4.3618276932319820E+08, 3.1512411458738226E+08, 3.0258214643190306E+07, 2.2340399734184553E+05}; - CUFINUFFT_FLT c3[] = {2.6917433004353492E+05, 1.6875651476661235E+07, 7.4664745481963485E+07, -9.5882157211117983E+07, -2.0622994435532546E+08, 2.0622994435532695E+08, 9.5882157211117893E+07, -7.4664745481963441E+07, -1.6875651476661157E+07, -2.6917433004353417E+05}; - CUFINUFFT_FLT c4[] = {2.0818422772177903E+05, 5.6084730690362593E+06, 1.4435118192352918E+06, -4.0063869969543688E+07, 3.2803674392747905E+07, 3.2803674392747425E+07, -4.0063869969546065E+07, 1.4435118192351861E+06, 5.6084730690362072E+06, 2.0818422772177853E+05}; - CUFINUFFT_FLT c5[] = {1.0781139496011089E+05, 9.9202615851199115E+05, -3.3266265543962144E+06, -4.8557049011465441E+05, 1.0176155522771550E+07, -1.0176155522773480E+07, 4.8557049011624791E+05, 3.3266265543963145E+06, -9.9202615851196367E+05, -1.0781139496011069E+05}; - CUFINUFFT_FLT c6[] = {3.7380102688153507E+04, 1.2716675000354149E+04, -6.2163527451780590E+05, 1.4157962667182824E+06, -8.4419693137806712E+05, -8.4419693137792684E+05, 1.4157962667183836E+06, -6.2163527451768133E+05, 1.2716675000338953E+04, 3.7380102688153551E+04}; - CUFINUFFT_FLT c7[] = {8.1238936393894865E+03, -3.4872365530450799E+04, 2.3913680325180554E+04, 1.2428850301840073E+05, -3.2158255329732876E+05, 3.2158255329921009E+05, -1.2428850301906197E+05, -2.3913680325219862E+04, 3.4872365530457639E+04, -8.1238936393893855E+03}; - CUFINUFFT_FLT c8[] = {7.8515926628983277E+02, -6.6607899119362401E+03, 2.0167398338517272E+04, -2.8951401344174039E+04, 1.4622828141519254E+04, 1.4622828143473866E+04, -2.8951401346529910E+04, 2.0167398338405819E+04, -6.6607899119515532E+03, 7.8515926628964587E+02}; - CUFINUFFT_FLT 
c9[] = {-1.0147176570533524E+02, -3.5304284183527621E+01, 1.3576976854816689E+03, -4.3921059353471846E+03, 7.3232085265419046E+03, -7.3232085280635902E+03, 4.3921059363220147E+03, -1.3576976854281722E+03, 3.5304284184270628E+01, 1.0147176570551520E+02}; - CUFINUFFT_FLT c10[] = {-4.3161545259395531E+01, 1.5498490982051828E+02, -3.1771250772612478E+02, 3.7215448793727404E+02, -1.7181762882439287E+02, -1.7181763008770599E+02, 3.7215448759715150E+02, -3.1771250770992856E+02, 1.5498490982321766E+02, -4.3161545259481535E+01}; - CUFINUFFT_FLT c11[] = {-4.2916172038404330E+00, 1.7402146068709751E+01, -4.7947588102062113E+01, 9.2697697983158491E+01, -1.2821427595919303E+02, 1.2821427694451660E+02, -9.2697698629471930E+01, 4.7947588133767717E+01, -1.7402146075416606E+01, 4.2916172038784923E+00}; - CUFINUFFT_FLT c12[] = {3.5357495062947814E-01, -1.2828127005767840E+00, 2.4090120532215455E+00, -2.6448901913160028E+00, 1.1811546776400381E+00, 1.1811568523765217E+00, -2.6448918925210712E+00, 2.4090119216851607E+00, -1.2828127015358992E+00, 3.5357495059093369E-01}; + constexpr CUFINUFFT_FLT c0[] = {2.2594586605749279E+04, 1.3595989066786604E+07, 4.4723032442444921E+08, 3.3781755837397542E+09, 8.6836783895849838E+09, 8.6836783895849819E+09, 3.3781755837397518E+09, 4.4723032442444921E+08, 1.3595989066786485E+07, 2.2594586605749315E+04}; + constexpr CUFINUFFT_FLT c1[] = {1.0729981697645642E+05, 3.0651490267742988E+07, 5.9387966085130477E+08, 2.4434902657508340E+09, 2.0073077861288924E+09, -2.0073077861288958E+09, -2.4434902657508330E+09, -5.9387966085130465E+08, -3.0651490267742820E+07, -1.0729981697645631E+05}; + constexpr CUFINUFFT_FLT c2[] = {2.2340399734184594E+05, 3.0258214643190444E+07, 3.1512411458738214E+08, 4.3618276932319784E+08, -7.8178848450497377E+08, -7.8178848450497079E+08, 4.3618276932319820E+08, 3.1512411458738226E+08, 3.0258214643190306E+07, 2.2340399734184553E+05}; + constexpr CUFINUFFT_FLT c3[] = {2.6917433004353492E+05, 1.6875651476661235E+07, 
7.4664745481963485E+07, -9.5882157211117983E+07, -2.0622994435532546E+08, 2.0622994435532695E+08, 9.5882157211117893E+07, -7.4664745481963441E+07, -1.6875651476661157E+07, -2.6917433004353417E+05}; + constexpr CUFINUFFT_FLT c4[] = {2.0818422772177903E+05, 5.6084730690362593E+06, 1.4435118192352918E+06, -4.0063869969543688E+07, 3.2803674392747905E+07, 3.2803674392747425E+07, -4.0063869969546065E+07, 1.4435118192351861E+06, 5.6084730690362072E+06, 2.0818422772177853E+05}; + constexpr CUFINUFFT_FLT c5[] = {1.0781139496011089E+05, 9.9202615851199115E+05, -3.3266265543962144E+06, -4.8557049011465441E+05, 1.0176155522771550E+07, -1.0176155522773480E+07, 4.8557049011624791E+05, 3.3266265543963145E+06, -9.9202615851196367E+05, -1.0781139496011069E+05}; + constexpr CUFINUFFT_FLT c6[] = {3.7380102688153507E+04, 1.2716675000354149E+04, -6.2163527451780590E+05, 1.4157962667182824E+06, -8.4419693137806712E+05, -8.4419693137792684E+05, 1.4157962667183836E+06, -6.2163527451768133E+05, 1.2716675000338953E+04, 3.7380102688153551E+04}; + constexpr CUFINUFFT_FLT c7[] = {8.1238936393894865E+03, -3.4872365530450799E+04, 2.3913680325180554E+04, 1.2428850301840073E+05, -3.2158255329732876E+05, 3.2158255329921009E+05, -1.2428850301906197E+05, -2.3913680325219862E+04, 3.4872365530457639E+04, -8.1238936393893855E+03}; + constexpr CUFINUFFT_FLT c8[] = {7.8515926628983277E+02, -6.6607899119362401E+03, 2.0167398338517272E+04, -2.8951401344174039E+04, 1.4622828141519254E+04, 1.4622828143473866E+04, -2.8951401346529910E+04, 2.0167398338405819E+04, -6.6607899119515532E+03, 7.8515926628964587E+02}; + constexpr CUFINUFFT_FLT c9[] = {-1.0147176570533524E+02, -3.5304284183527621E+01, 1.3576976854816689E+03, -4.3921059353471846E+03, 7.3232085265419046E+03, -7.3232085280635902E+03, 4.3921059363220147E+03, -1.3576976854281722E+03, 3.5304284184270628E+01, 1.0147176570551520E+02}; + constexpr CUFINUFFT_FLT c10[] = {-4.3161545259395531E+01, 1.5498490982051828E+02, -3.1771250772612478E+02, 
3.7215448793727404E+02, -1.7181762882439287E+02, -1.7181763008770599E+02, 3.7215448759715150E+02, -3.1771250770992856E+02, 1.5498490982321766E+02, -4.3161545259481535E+01}; + constexpr CUFINUFFT_FLT c11[] = {-4.2916172038404330E+00, 1.7402146068709751E+01, -4.7947588102062113E+01, 9.2697697983158491E+01, -1.2821427595919303E+02, 1.2821427694451660E+02, -9.2697698629471930E+01, 4.7947588133767717E+01, -1.7402146075416606E+01, 4.2916172038784923E+00}; + constexpr CUFINUFFT_FLT c12[] = {3.5357495062947814E-01, -1.2828127005767840E+00, 2.4090120532215455E+00, -2.6448901913160028E+00, 1.1811546776400381E+00, 1.1811568523765217E+00, -2.6448918925210712E+00, 2.4090119216851607E+00, -1.2828127015358992E+00, 3.5357495059093369E-01}; for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); } else if (w==11) { - CUFINUFFT_FLT c0[] = {3.7794653219809574E+04, 3.4782300224660799E+07, 1.6188020733727567E+09, 1.7196758809615021E+10, 6.3754384857724678E+10, 9.7196447559193558E+10, 6.3754384857724640E+10, 1.7196758809615005E+10, 1.6188020733727570E+09, 3.4782300224660806E+07, 3.7794653219808897E+04}; - CUFINUFFT_FLT c1[] = {1.8969206922085880E+05, 8.4769319065313682E+07, 2.4230555767723408E+09, 1.5439732722639105E+10, 2.7112836839612309E+10, 2.9154817084916870E-06, -2.7112836839612320E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05}; - CUFINUFFT_FLT c2[] = {4.2138380313901423E+05, 9.2050522922791898E+07, 1.5259983101266611E+09, 4.7070559561237154E+09, -1.2448027572952452E+09, -1.0161446790279312E+10, -1.2448027572952352E+09, 4.7070559561237249E+09, 1.5259983101266615E+09, 9.2050522922791868E+07, 4.2138380313901143E+05}; - CUFINUFFT_FLT c3[] = {5.4814313598122017E+05, 5.8085130777589574E+07, 4.9484006166551107E+08, 1.6222124676640958E+08, -2.0440440381345322E+09, -1.0628188648962249E-06, 
2.0440440381345263E+09, -1.6222124676641047E+08, -4.9484006166551083E+08, -5.8085130777589560E+07, -5.4814313598121691E+05}; - CUFINUFFT_FLT c4[] = {4.6495183529254969E+05, 2.3067199578027174E+07, 6.9832590192482829E+07, -2.2024799260683161E+08, -1.2820270942587741E+08, 5.1017181199130940E+08, -1.2820270942587276E+08, -2.2024799260684022E+08, 6.9832590192482591E+07, 2.3067199578027155E+07, 4.6495183529254753E+05}; - CUFINUFFT_FLT c5[] = {2.7021781043532968E+05, 5.6764510325100143E+06, -5.5650761736747762E+06, -3.9907385617900737E+07, 7.2453390663686648E+07, 3.7361048615190248E-06, -7.2453390663685605E+07, 3.9907385617898554E+07, 5.5650761736747930E+06, -5.6764510325100180E+06, -2.7021781043532834E+05}; - CUFINUFFT_FLT c6[] = {1.0933249308680615E+05, 6.9586821127986431E+05, -3.6860240321940281E+06, 2.7428169457723838E+06, 8.3392008440598147E+06, -1.6402201025051240E+07, 8.3392008440649221E+06, 2.7428169457788388E+06, -3.6860240321937916E+06, 6.9586821127989038E+05, 1.0933249308680584E+05}; - CUFINUFFT_FLT c7[] = {3.0203516161820480E+04, -3.6879059542777912E+04, -4.1141031216801296E+05, 1.4111389975270075E+06, -1.5914376635392811E+06, 6.6766157119460594E-07, 1.5914376635341521E+06, -1.4111389975270815E+06, 4.1141031216760987E+05, 3.6879059542751726E+04, -3.0203516161820367E+04}; - CUFINUFFT_FLT c8[] = {5.1670143574922804E+03, -2.8613147115365118E+04, 4.3560195427108687E+04, 4.8438679581840552E+04, -2.5856630639330545E+05, 3.7994883866097208E+05, -2.5856630640124826E+05, 4.8438679578319818E+04, 4.3560195426824532E+04, -2.8613147115371667E+04, 5.1670143574923577E+03}; - CUFINUFFT_FLT c9[] = {3.0888018539742444E+02, -3.7949446187516196E+03, 1.4313303205035631E+04, -2.6681600236925929E+04, 2.3856005161221132E+04, -2.3276789125970764E-06, -2.3856005160840708E+04, 2.6681600234072768E+04, -1.4313303205083184E+04, 3.7949446187479048E+03, -3.0888018539723868E+02}; - CUFINUFFT_FLT c10[] = {-8.3747489794255131E+01, 1.1948077479810485E+02, 4.8528498025870488E+02, 
-2.5024391115619069E+03, 5.3511195350414373E+03, -6.7655484152307990E+03, 5.3511195328171416E+03, -2.5024391120801879E+03, 4.8528498023710927E+02, 1.1948077481025226E+02, -8.3747489794331599E+01}; - CUFINUFFT_FLT c11[] = {-2.2640047135555928E+01, 9.0840898549317998E+01, -2.1597187568776889E+02, 3.1511229085836396E+02, -2.4856618287164540E+02, 1.6489710183426948E-06, 2.4856618404233313E+02, -3.1511228957061689E+02, 2.1597187534632059E+02, -9.0840898568829203E+01, 2.2640047135641577E+01}; - CUFINUFFT_FLT c12[] = {-1.6306382885945303E+00, 7.3325946569413265E+00, -2.3241017814397217E+01, 5.1715493697385526E+01, -8.2673003927086967E+01, 9.6489715222659115E+01, -8.2673013187251925E+01, 5.1715492855550593E+01, -2.3241018165160245E+01, 7.3325946421432624E+00, -1.6306382886373367E+00}; - CUFINUFFT_FLT c13[] = {2.4409286936442823E-01, -7.8803147249892458E-01, 1.6467143668339987E+00, -2.1898241453519685E+00, 1.6350102449767006E+00, -1.1782931558589478E-06, -1.6350139430218933E+00, 2.1898230913723329E+00, -1.6467144225690411E+00, 7.8803147709023735E-01, -2.4409286927983653E-01}; + constexpr CUFINUFFT_FLT c0[] = {3.7794653219809574E+04, 3.4782300224660799E+07, 1.6188020733727567E+09, 1.7196758809615021E+10, 6.3754384857724678E+10, 9.7196447559193558E+10, 6.3754384857724640E+10, 1.7196758809615005E+10, 1.6188020733727570E+09, 3.4782300224660806E+07, 3.7794653219808897E+04}; + constexpr CUFINUFFT_FLT c1[] = {1.8969206922085880E+05, 8.4769319065313682E+07, 2.4230555767723408E+09, 1.5439732722639105E+10, 2.7112836839612309E+10, 2.9154817084916870E-06, -2.7112836839612320E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05}; + constexpr CUFINUFFT_FLT c2[] = {4.2138380313901423E+05, 9.2050522922791898E+07, 1.5259983101266611E+09, 4.7070559561237154E+09, -1.2448027572952452E+09, -1.0161446790279312E+10, -1.2448027572952352E+09, 4.7070559561237249E+09, 1.5259983101266615E+09, 9.2050522922791868E+07, 4.2138380313901143E+05}; + 
constexpr CUFINUFFT_FLT c3[] = {5.4814313598122017E+05, 5.8085130777589574E+07, 4.9484006166551107E+08, 1.6222124676640958E+08, -2.0440440381345322E+09, -1.0628188648962249E-06, 2.0440440381345263E+09, -1.6222124676641047E+08, -4.9484006166551083E+08, -5.8085130777589560E+07, -5.4814313598121691E+05}; + constexpr CUFINUFFT_FLT c4[] = {4.6495183529254969E+05, 2.3067199578027174E+07, 6.9832590192482829E+07, -2.2024799260683161E+08, -1.2820270942587741E+08, 5.1017181199130940E+08, -1.2820270942587276E+08, -2.2024799260684022E+08, 6.9832590192482591E+07, 2.3067199578027155E+07, 4.6495183529254753E+05}; + constexpr CUFINUFFT_FLT c5[] = {2.7021781043532968E+05, 5.6764510325100143E+06, -5.5650761736747762E+06, -3.9907385617900737E+07, 7.2453390663686648E+07, 3.7361048615190248E-06, -7.2453390663685605E+07, 3.9907385617898554E+07, 5.5650761736747930E+06, -5.6764510325100180E+06, -2.7021781043532834E+05}; + constexpr CUFINUFFT_FLT c6[] = {1.0933249308680615E+05, 6.9586821127986431E+05, -3.6860240321940281E+06, 2.7428169457723838E+06, 8.3392008440598147E+06, -1.6402201025051240E+07, 8.3392008440649221E+06, 2.7428169457788388E+06, -3.6860240321937916E+06, 6.9586821127989038E+05, 1.0933249308680584E+05}; + constexpr CUFINUFFT_FLT c7[] = {3.0203516161820480E+04, -3.6879059542777912E+04, -4.1141031216801296E+05, 1.4111389975270075E+06, -1.5914376635392811E+06, 6.6766157119460594E-07, 1.5914376635341521E+06, -1.4111389975270815E+06, 4.1141031216760987E+05, 3.6879059542751726E+04, -3.0203516161820367E+04}; + constexpr CUFINUFFT_FLT c8[] = {5.1670143574922804E+03, -2.8613147115365118E+04, 4.3560195427108687E+04, 4.8438679581840552E+04, -2.5856630639330545E+05, 3.7994883866097208E+05, -2.5856630640124826E+05, 4.8438679578319818E+04, 4.3560195426824532E+04, -2.8613147115371667E+04, 5.1670143574923577E+03}; + constexpr CUFINUFFT_FLT c9[] = {3.0888018539742444E+02, -3.7949446187516196E+03, 1.4313303205035631E+04, -2.6681600236925929E+04, 2.3856005161221132E+04, -2.3276789125970764E-06, 
-2.3856005160840708E+04, 2.6681600234072768E+04, -1.4313303205083184E+04, 3.7949446187479048E+03, -3.0888018539723868E+02}; + constexpr CUFINUFFT_FLT c10[] = {-8.3747489794255131E+01, 1.1948077479810485E+02, 4.8528498025870488E+02, -2.5024391115619069E+03, 5.3511195350414373E+03, -6.7655484152307990E+03, 5.3511195328171416E+03, -2.5024391120801879E+03, 4.8528498023710927E+02, 1.1948077481025226E+02, -8.3747489794331599E+01}; + constexpr CUFINUFFT_FLT c11[] = {-2.2640047135555928E+01, 9.0840898549317998E+01, -2.1597187568776889E+02, 3.1511229085836396E+02, -2.4856618287164540E+02, 1.6489710183426948E-06, 2.4856618404233313E+02, -3.1511228957061689E+02, 2.1597187534632059E+02, -9.0840898568829203E+01, 2.2640047135641577E+01}; + constexpr CUFINUFFT_FLT c12[] = {-1.6306382885945303E+00, 7.3325946569413265E+00, -2.3241017814397217E+01, 5.1715493697385526E+01, -8.2673003927086967E+01, 9.6489715222659115E+01, -8.2673013187251925E+01, 5.1715492855550593E+01, -2.3241018165160245E+01, 7.3325946421432624E+00, -1.6306382886373367E+00}; + constexpr CUFINUFFT_FLT c13[] = {2.4409286936442823E-01, -7.8803147249892458E-01, 1.6467143668339987E+00, -2.1898241453519685E+00, 1.6350102449767006E+00, -1.1782931558589478E-06, -1.6350139430218933E+00, 2.1898230913723329E+00, -1.6467144225690411E+00, 7.8803147709023735E-01, -2.4409286927983653E-01}; for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else if (w==12) { - CUFINUFFT_FLT c0[] = {6.1722991679853112E+04, 8.4789650417103723E+07, 5.4431675199498730E+09, 7.8788892335272293E+10, 4.0355760945670062E+11, 8.8071481911347974E+11, 8.8071481911347998E+11, 4.0355760945670068E+11, 7.8788892335272491E+10, 5.4431675199498854E+09, 8.4789650417103767E+07, 6.1722991679871629E+04}; - CUFINUFFT_FLT c1[] = {3.2561466099406150E+05, 2.2112758120210624E+08, 8.9911609880089817E+09, 8.3059508064200928E+10, 
2.3965569143469864E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210624E+08, -3.2561466099404282E+05}; - CUFINUFFT_FLT c2[] = {7.6621098001581465E+05, 2.6026568260310274E+08, 6.4524338253008652E+09, 3.3729904113826797E+10, 2.8555202212474010E+10, -6.8998572040731583E+10, -6.8998572040731506E+10, 2.8555202212474064E+10, 3.3729904113826805E+10, 6.4524338253008747E+09, 2.6026568260310277E+08, 7.6621098001583852E+05}; - CUFINUFFT_FLT c3[] = {1.0657807616803222E+06, 1.8144472126890999E+08, 2.5524827004349856E+09, 5.2112383911371746E+09, -1.0268350564014614E+10, -1.4763245309081245E+10, 1.4763245309081299E+10, 1.0268350564014664E+10, -5.2112383911371031E+09, -2.5524827004349875E+09, -1.8144472126890990E+08, -1.0657807616803090E+06}; - CUFINUFFT_FLT c4[] = {9.7829638830158743E+05, 8.2222351241519973E+07, 5.5676911894064891E+08, -4.8739037675424922E+08, -2.7153428193077750E+09, 2.5627633609246840E+09, 2.5627633609247112E+09, -2.7153428193078070E+09, -4.8739037675429451E+08, 5.5676911894064677E+08, 8.2222351241519928E+07, 9.7829638830161165E+05}; - CUFINUFFT_FLT c5[] = {6.2536876825113979E+05, 2.4702814073680263E+07, 4.1488431554845832E+07, -2.9274790542418414E+08, 1.0742154109193267E+08, 6.2185168968029702E+08, -6.2185168968023658E+08, -1.0742154109185636E+08, 2.9274790542422676E+08, -4.1488431554844096E+07, -2.4702814073680244E+07, -6.2536876825112442E+05}; - CUFINUFFT_FLT c6[] = {2.8527714307528478E+05, 4.6266378435690189E+06, -1.0665598090791209E+07, -2.6048960239906937E+07, 9.1597254427339226E+07, -5.9794495983323507E+07, -5.9794495983287223E+07, 9.1597254427330941E+07, -2.6048960239925586E+07, -1.0665598090793334E+07, 4.6266378435690831E+06, 2.8527714307530422E+05}; - CUFINUFFT_FLT c7[] = {9.2873647411234240E+04, 3.6630046787428786E+05, -3.1271047224731087E+06, 4.8612412939261831E+06, 3.3820440907802135E+06, -1.6880127953711823E+07, 1.6880127953682471E+07, 
-3.3820440907974164E+06, -4.8612412939092657E+06, 3.1271047224737639E+06, -3.6630046787430649E+05, -9.2873647411216807E+04}; - CUFINUFFT_FLT c8[] = {2.0817947751046187E+04, -5.5660303410280452E+04, -1.9519783923293054E+05, 1.0804817251338358E+06, -1.8264985852948832E+06, 9.7602844964432076E+05, 9.7602844962242560E+05, -1.8264985853129351E+06, 1.0804817251129062E+06, -1.9519783923449527E+05, -5.5660303410338929E+04, 2.0817947751063308E+04}; - CUFINUFFT_FLT c9[] = {2.7986023314784748E+03, -1.9404411093600604E+04, 4.3922624999853564E+04, -7.6450317375817094E+03, -1.5273911976404345E+05, 3.3223441450299282E+05, -3.3223441454103496E+05, 1.5273911977621692E+05, 7.6450317497551932E+03, -4.3922624998426982E+04, 1.9404411093646668E+04, -2.7986023314644040E+03}; - CUFINUFFT_FLT c10[] = {6.7849020474186844E+01, -1.7921351307934926E+03, 8.4980694693463538E+03, -1.9742624859078383E+04, 2.4620674878200782E+04, -1.1676544885779787E+04, -1.1676544871958942E+04, 2.4620674838120303E+04, -1.9742624835582923E+04, 8.4980694640771490E+03, -1.7921351307934922E+03, 6.7849020488748664E+01}; - CUFINUFFT_FLT c11[] = {-5.4577020998847871E+01, 1.3637112866755427E+02, 4.5513615487589092E+01, -1.1174001343792290E+03, 3.2018769324922364E+03, -5.0580351333780654E+03, 5.0580351424313239E+03, -3.2018769362383905E+03, 1.1174000937955741E+03, -4.5513610843875405E+01, -1.3637112870657899E+02, 5.4577021011919037E+01}; - CUFINUFFT_FLT c12[] = {-1.0538365872424132E+01, 4.6577222490846609E+01, -1.2606964180937365E+02, 2.1881091191930210E+02, -2.3273402308837001E+02, 1.0274273857329082E+02, 1.0274268020620094E+02, -2.3273404553726701E+02, 2.1881091276113446E+02, -1.2606964815819696E+02, 4.6577222438230805E+01, -1.0538365860846021E+01}; - CUFINUFFT_FLT c13[] = {-4.6087004128022252E-01, 2.5969759424153827E+00, -9.6946930749915676E+00, 2.4990050007153755E+01, -4.6013920149683365E+01, 6.2056948047986317E+01, -6.2056981293939970E+01, 4.6013908245461884E+01, -2.4990038356462701E+01, 9.6946952377382889E+00, 
-2.5969759165384922E+00, 4.6087004737535314E-01}; + constexpr CUFINUFFT_FLT c0[] = {6.1722991679853112E+04, 8.4789650417103723E+07, 5.4431675199498730E+09, 7.8788892335272293E+10, 4.0355760945670062E+11, 8.8071481911347974E+11, 8.8071481911347998E+11, 4.0355760945670068E+11, 7.8788892335272491E+10, 5.4431675199498854E+09, 8.4789650417103767E+07, 6.1722991679871629E+04}; + constexpr CUFINUFFT_FLT c1[] = {3.2561466099406150E+05, 2.2112758120210624E+08, 8.9911609880089817E+09, 8.3059508064200928E+10, 2.3965569143469864E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210624E+08, -3.2561466099404282E+05}; + constexpr CUFINUFFT_FLT c2[] = {7.6621098001581465E+05, 2.6026568260310274E+08, 6.4524338253008652E+09, 3.3729904113826797E+10, 2.8555202212474010E+10, -6.8998572040731583E+10, -6.8998572040731506E+10, 2.8555202212474064E+10, 3.3729904113826805E+10, 6.4524338253008747E+09, 2.6026568260310277E+08, 7.6621098001583852E+05}; + constexpr CUFINUFFT_FLT c3[] = {1.0657807616803222E+06, 1.8144472126890999E+08, 2.5524827004349856E+09, 5.2112383911371746E+09, -1.0268350564014614E+10, -1.4763245309081245E+10, 1.4763245309081299E+10, 1.0268350564014664E+10, -5.2112383911371031E+09, -2.5524827004349875E+09, -1.8144472126890990E+08, -1.0657807616803090E+06}; + constexpr CUFINUFFT_FLT c4[] = {9.7829638830158743E+05, 8.2222351241519973E+07, 5.5676911894064891E+08, -4.8739037675424922E+08, -2.7153428193077750E+09, 2.5627633609246840E+09, 2.5627633609247112E+09, -2.7153428193078070E+09, -4.8739037675429451E+08, 5.5676911894064677E+08, 8.2222351241519928E+07, 9.7829638830161165E+05}; + constexpr CUFINUFFT_FLT c5[] = {6.2536876825113979E+05, 2.4702814073680263E+07, 4.1488431554845832E+07, -2.9274790542418414E+08, 1.0742154109193267E+08, 6.2185168968029702E+08, -6.2185168968023658E+08, -1.0742154109185636E+08, 2.9274790542422676E+08, -4.1488431554844096E+07, -2.4702814073680244E+07, 
-6.2536876825112442E+05}; + constexpr CUFINUFFT_FLT c6[] = {2.8527714307528478E+05, 4.6266378435690189E+06, -1.0665598090791209E+07, -2.6048960239906937E+07, 9.1597254427339226E+07, -5.9794495983323507E+07, -5.9794495983287223E+07, 9.1597254427330941E+07, -2.6048960239925586E+07, -1.0665598090793334E+07, 4.6266378435690831E+06, 2.8527714307530422E+05}; + constexpr CUFINUFFT_FLT c7[] = {9.2873647411234240E+04, 3.6630046787428786E+05, -3.1271047224731087E+06, 4.8612412939261831E+06, 3.3820440907802135E+06, -1.6880127953711823E+07, 1.6880127953682471E+07, -3.3820440907974164E+06, -4.8612412939092657E+06, 3.1271047224737639E+06, -3.6630046787430649E+05, -9.2873647411216807E+04}; + constexpr CUFINUFFT_FLT c8[] = {2.0817947751046187E+04, -5.5660303410280452E+04, -1.9519783923293054E+05, 1.0804817251338358E+06, -1.8264985852948832E+06, 9.7602844964432076E+05, 9.7602844962242560E+05, -1.8264985853129351E+06, 1.0804817251129062E+06, -1.9519783923449527E+05, -5.5660303410338929E+04, 2.0817947751063308E+04}; + constexpr CUFINUFFT_FLT c9[] = {2.7986023314784748E+03, -1.9404411093600604E+04, 4.3922624999853564E+04, -7.6450317375817094E+03, -1.5273911976404345E+05, 3.3223441450299282E+05, -3.3223441454103496E+05, 1.5273911977621692E+05, 7.6450317497551932E+03, -4.3922624998426982E+04, 1.9404411093646668E+04, -2.7986023314644040E+03}; + constexpr CUFINUFFT_FLT c10[] = {6.7849020474186844E+01, -1.7921351307934926E+03, 8.4980694693463538E+03, -1.9742624859078383E+04, 2.4620674878200782E+04, -1.1676544885779787E+04, -1.1676544871958942E+04, 2.4620674838120303E+04, -1.9742624835582923E+04, 8.4980694640771490E+03, -1.7921351307934922E+03, 6.7849020488748664E+01}; + constexpr CUFINUFFT_FLT c11[] = {-5.4577020998847871E+01, 1.3637112866755427E+02, 4.5513615487589092E+01, -1.1174001343792290E+03, 3.2018769324922364E+03, -5.0580351333780654E+03, 5.0580351424313239E+03, -3.2018769362383905E+03, 1.1174000937955741E+03, -4.5513610843875405E+01, -1.3637112870657899E+02, 
5.4577021011919037E+01}; + constexpr CUFINUFFT_FLT c12[] = {-1.0538365872424132E+01, 4.6577222490846609E+01, -1.2606964180937365E+02, 2.1881091191930210E+02, -2.3273402308837001E+02, 1.0274273857329082E+02, 1.0274268020620094E+02, -2.3273404553726701E+02, 2.1881091276113446E+02, -1.2606964815819696E+02, 4.6577222438230805E+01, -1.0538365860846021E+01}; + constexpr CUFINUFFT_FLT c13[] = {-4.6087004128022252E-01, 2.5969759424153827E+00, -9.6946930749915676E+00, 2.4990050007153755E+01, -4.6013920149683365E+01, 6.2056948047986317E+01, -6.2056981293939970E+01, 4.6013908245461884E+01, -2.4990038356462701E+01, 9.6946952377382889E+00, -2.5969759165384922E+00, 4.6087004737535314E-01}; for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else if (w==13) { - CUFINUFFT_FLT c0[] = {9.8715725867495639E+04, 1.9828875496808118E+08, 1.7196758809614998E+10, 3.3083776881353607E+11, 2.2668873993375444E+12, 6.7734720591167598E+12, 9.6695220682534824E+12, 6.7734720591167471E+12, 2.2668873993375439E+12, 3.3083776881353534E+11, 1.7196758809614998E+10, 1.9828875496807906E+08, 9.8715725867495537E+04}; - CUFINUFFT_FLT c1[] = {5.4491110456935503E+05, 5.4903670125539362E+08, 3.0879465445278172E+10, 3.9588436413399951E+11, 1.6860562536749778E+12, 2.4256447893117881E+12, 3.7318165868693593E-04, -2.4256447893117856E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538874E+08, -5.4491110456935491E+05}; - CUFINUFFT_FLT c2[] = {1.3504711883426066E+06, 6.9286979077463162E+08, 2.4618123595484562E+10, 1.9493985627722598E+11, 3.9422703517046326E+11, -1.8678883613919931E+11, -8.5538079834550146E+11, -1.8678883613919705E+11, 3.9422703517046338E+11, 1.9493985627722586E+11, 2.4618123595484554E+10, 6.9286979077462578E+08, 1.3504711883426069E+06}; - CUFINUFFT_FLT c3[] = {1.9937206140846494E+06, 5.2512029493765986E+08, 
1.1253303793811754E+10, 4.6205527735932175E+10, -1.1607472377983284E+10, -1.6305241755642325E+11, 1.3350300616010507E-04, 1.6305241755642365E+11, 1.1607472377982744E+10, -4.6205527735932228E+10, -1.1253303793811750E+10, -5.2512029493765610E+08, -1.9937206140846484E+06}; - CUFINUFFT_FLT c4[] = {1.9607419630386413E+06, 2.6425362558103889E+08, 3.1171259341747255E+09, 2.9839860297840505E+09, -1.9585031917561890E+10, -5.0666917387055302E+09, 3.6568794485482079E+10, -5.0666917387051382E+09, -1.9585031917561581E+10, 2.9839860297839398E+09, 3.1171259341747217E+09, 2.6425362558103737E+08, 1.9607419630386410E+06}; - CUFINUFFT_FLT c5[] = {1.3593773865640301E+06, 9.1556445104158148E+07, 4.7074012944133490E+08, -1.1192579335657711E+09, -2.1090780087868552E+09, 5.2270306737949314E+09, 1.0058570913473114E-03, -5.2270306737942495E+09, 2.1090780087878082E+09, 1.1192579335658059E+09, -4.7074012944133729E+08, -9.1556445104157895E+07, -1.3593773865640303E+06}; - CUFINUFFT_FLT c6[] = {6.8417206432039186E+05, 2.1561705510027003E+07, 7.5785249892988410E+06, -2.7456096030230397E+08, 3.4589095671043062E+08, 4.0256106808852541E+08, -1.0074306926606210E+09, 4.0256106809059316E+08, 3.4589095670995283E+08, -2.7456096030234104E+08, 7.5785249893005500E+06, 2.1561705510027427E+07, 6.8417206432039267E+05}; - CUFINUFFT_FLT c7[] = {2.5248269397037479E+05, 3.0985559672615193E+06, -1.1816517087617906E+07, -8.2958498770340970E+06, 8.0546642347242445E+07, -1.0594657799535300E+08, -4.1868673222825360E-04, 1.0594657799426495E+08, -8.0546642347729877E+07, 8.2958498770339396E+06, 1.1816517087613177E+07, -3.0985559672620757E+06, -2.5248269397037491E+05}; - CUFINUFFT_FLT c8[] = {6.7530100970876083E+04, 1.2373362326659705E+05, -2.1245597183259744E+06, 5.1047323238916462E+06, -1.4139444405955642E+06, -1.1818267554953648E+07, 2.0121548577168033E+07, -1.1818267556967378E+07, -1.4139444400679788E+06, 5.1047323236808330E+06, -2.1245597183310925E+06, 1.2373362326704434E+05, 6.7530100970875879E+04}; - CUFINUFFT_FLT 
c9[] = {1.2421368748960791E+04, -5.0576243646949319E+04, -4.8878193435000605E+04, 6.5307896868984913E+05, -1.5497610128277773E+06, 1.5137725915373438E+06, 2.4159142842753925E-04, -1.5137725925842635E+06, 1.5497610128277773E+06, -6.5307896858028776E+05, 4.8878193437283131E+04, 5.0576243646456518E+04, -1.2421368748960884E+04}; - CUFINUFFT_FLT c10[] = {1.2904654687546160E+03, -1.1169946055063081E+04, 3.3275109714208906E+04, -3.1765222279764806E+04, -5.9810981980285695E+04, 2.2355863005975721E+05, -3.1083591689740209E+05, 2.2355863472015061E+05, -5.9810982676856896E+04, -3.1765222445615127E+04, 3.3275109711790254E+04, -1.1169946054458416E+04, 1.2904654687550794E+03}; - CUFINUFFT_FLT c11[] = {-1.9043622268985253E+01, -6.8296542226098870E+02, 4.2702512255472038E+03, -1.2165497337805051E+04, 1.9423733200245264E+04, -1.6010024156865491E+04, -1.8587318864580292E-04, 1.6010021504569266E+04, -1.9423732997327170E+04, 1.2165497443946821E+04, -4.2702512314786209E+03, 6.8296542157807858E+02, 1.9043622268681840E+01}; - CUFINUFFT_FLT c12[] = {-3.0093984465812213E+01, 9.8972865698526618E+01, -9.7437039087669007E+01, -3.5079927282955276E+02, 1.5699250476860170E+03, -3.1287441993042225E+03, 3.8692185175061472E+03, -3.1287462825609659E+03, 1.5699252631952513E+03, -3.5079945803284346E+02, -9.7437044419281492E+01, 9.8972866145746991E+01, -3.0093984466256714E+01}; - CUFINUFFT_FLT c13[] = {-4.3050286009571908E+00, 2.1108975820085092E+01, -6.4297196365104938E+01, 1.2922885252832501E+02, -1.6991814421468084E+02, 1.2655005406584399E+02, -2.7552199668252238E-05, -1.2655093214380580E+02, 1.6991796275475141E+02, -1.2922893349406868E+02, 6.4297198822227926E+01, -2.1108976183295965E+01, 4.3050286010617569E+00}; - CUFINUFFT_FLT c14[] = {-1.0957333744888972E-01, 7.2949316377828033E-01, -3.4300810538238449E+00, 1.0470062030552395E+01, -2.2292087310650142E+01, 3.4570674930666925E+01, -3.9923385381532697E+01, 3.4573472104415345E+01, -2.2292369892227434E+01, 1.0470053799441445E+01, 
-3.4300825281782954E+00, 7.2949352704193948E-01, -1.0957333730383595E-01}; + constexpr CUFINUFFT_FLT c0[] = {9.8715725867495639E+04, 1.9828875496808118E+08, 1.7196758809614998E+10, 3.3083776881353607E+11, 2.2668873993375444E+12, 6.7734720591167598E+12, 9.6695220682534824E+12, 6.7734720591167471E+12, 2.2668873993375439E+12, 3.3083776881353534E+11, 1.7196758809614998E+10, 1.9828875496807906E+08, 9.8715725867495537E+04}; + constexpr CUFINUFFT_FLT c1[] = {5.4491110456935503E+05, 5.4903670125539362E+08, 3.0879465445278172E+10, 3.9588436413399951E+11, 1.6860562536749778E+12, 2.4256447893117881E+12, 3.7318165868693593E-04, -2.4256447893117856E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538874E+08, -5.4491110456935491E+05}; + constexpr CUFINUFFT_FLT c2[] = {1.3504711883426066E+06, 6.9286979077463162E+08, 2.4618123595484562E+10, 1.9493985627722598E+11, 3.9422703517046326E+11, -1.8678883613919931E+11, -8.5538079834550146E+11, -1.8678883613919705E+11, 3.9422703517046338E+11, 1.9493985627722586E+11, 2.4618123595484554E+10, 6.9286979077462578E+08, 1.3504711883426069E+06}; + constexpr CUFINUFFT_FLT c3[] = {1.9937206140846494E+06, 5.2512029493765986E+08, 1.1253303793811754E+10, 4.6205527735932175E+10, -1.1607472377983284E+10, -1.6305241755642325E+11, 1.3350300616010507E-04, 1.6305241755642365E+11, 1.1607472377982744E+10, -4.6205527735932228E+10, -1.1253303793811750E+10, -5.2512029493765610E+08, -1.9937206140846484E+06}; + constexpr CUFINUFFT_FLT c4[] = {1.9607419630386413E+06, 2.6425362558103889E+08, 3.1171259341747255E+09, 2.9839860297840505E+09, -1.9585031917561890E+10, -5.0666917387055302E+09, 3.6568794485482079E+10, -5.0666917387051382E+09, -1.9585031917561581E+10, 2.9839860297839398E+09, 3.1171259341747217E+09, 2.6425362558103737E+08, 1.9607419630386410E+06}; + constexpr CUFINUFFT_FLT c5[] = {1.3593773865640301E+06, 9.1556445104158148E+07, 4.7074012944133490E+08, -1.1192579335657711E+09, -2.1090780087868552E+09, 
5.2270306737949314E+09, 1.0058570913473114E-03, -5.2270306737942495E+09, 2.1090780087878082E+09, 1.1192579335658059E+09, -4.7074012944133729E+08, -9.1556445104157895E+07, -1.3593773865640303E+06}; + constexpr CUFINUFFT_FLT c6[] = {6.8417206432039186E+05, 2.1561705510027003E+07, 7.5785249892988410E+06, -2.7456096030230397E+08, 3.4589095671043062E+08, 4.0256106808852541E+08, -1.0074306926606210E+09, 4.0256106809059316E+08, 3.4589095670995283E+08, -2.7456096030234104E+08, 7.5785249893005500E+06, 2.1561705510027427E+07, 6.8417206432039267E+05}; + constexpr CUFINUFFT_FLT c7[] = {2.5248269397037479E+05, 3.0985559672615193E+06, -1.1816517087617906E+07, -8.2958498770340970E+06, 8.0546642347242445E+07, -1.0594657799535300E+08, -4.1868673222825360E-04, 1.0594657799426495E+08, -8.0546642347729877E+07, 8.2958498770339396E+06, 1.1816517087613177E+07, -3.0985559672620757E+06, -2.5248269397037491E+05}; + constexpr CUFINUFFT_FLT c8[] = {6.7530100970876083E+04, 1.2373362326659705E+05, -2.1245597183259744E+06, 5.1047323238916462E+06, -1.4139444405955642E+06, -1.1818267554953648E+07, 2.0121548577168033E+07, -1.1818267556967378E+07, -1.4139444400679788E+06, 5.1047323236808330E+06, -2.1245597183310925E+06, 1.2373362326704434E+05, 6.7530100970875879E+04}; + constexpr CUFINUFFT_FLT c9[] = {1.2421368748960791E+04, -5.0576243646949319E+04, -4.8878193435000605E+04, 6.5307896868984913E+05, -1.5497610128277773E+06, 1.5137725915373438E+06, 2.4159142842753925E-04, -1.5137725925842635E+06, 1.5497610128277773E+06, -6.5307896858028776E+05, 4.8878193437283131E+04, 5.0576243646456518E+04, -1.2421368748960884E+04}; + constexpr CUFINUFFT_FLT c10[] = {1.2904654687546160E+03, -1.1169946055063081E+04, 3.3275109714208906E+04, -3.1765222279764806E+04, -5.9810981980285695E+04, 2.2355863005975721E+05, -3.1083591689740209E+05, 2.2355863472015061E+05, -5.9810982676856896E+04, -3.1765222445615127E+04, 3.3275109711790254E+04, -1.1169946054458416E+04, 1.2904654687550794E+03}; + constexpr CUFINUFFT_FLT c11[] = 
{-1.9043622268985253E+01, -6.8296542226098870E+02, 4.2702512255472038E+03, -1.2165497337805051E+04, 1.9423733200245264E+04, -1.6010024156865491E+04, -1.8587318864580292E-04, 1.6010021504569266E+04, -1.9423732997327170E+04, 1.2165497443946821E+04, -4.2702512314786209E+03, 6.8296542157807858E+02, 1.9043622268681840E+01}; + constexpr CUFINUFFT_FLT c12[] = {-3.0093984465812213E+01, 9.8972865698526618E+01, -9.7437039087669007E+01, -3.5079927282955276E+02, 1.5699250476860170E+03, -3.1287441993042225E+03, 3.8692185175061472E+03, -3.1287462825609659E+03, 1.5699252631952513E+03, -3.5079945803284346E+02, -9.7437044419281492E+01, 9.8972866145746991E+01, -3.0093984466256714E+01}; + constexpr CUFINUFFT_FLT c13[] = {-4.3050286009571908E+00, 2.1108975820085092E+01, -6.4297196365104938E+01, 1.2922885252832501E+02, -1.6991814421468084E+02, 1.2655005406584399E+02, -2.7552199668252238E-05, -1.2655093214380580E+02, 1.6991796275475141E+02, -1.2922893349406868E+02, 6.4297198822227926E+01, -2.1108976183295965E+01, 4.3050286010617569E+00}; + constexpr CUFINUFFT_FLT c14[] = {-1.0957333744888972E-01, 7.2949316377828033E-01, -3.4300810538238449E+00, 1.0470062030552395E+01, -2.2292087310650142E+01, 3.4570674930666925E+01, -3.9923385381532697E+01, 3.4573472104415345E+01, -2.2292369892227434E+01, 1.0470053799441445E+01, -3.4300825281782954E+00, 7.2949352704193948E-01, -1.0957333730383595E-01}; for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); } else if (w==14) { - CUFINUFFT_FLT c0[] = {1.5499533202966300E+05, 4.4723032442444748E+08, 5.1495083701694786E+10, 1.2904576022918081E+12, 1.1534950432785512E+13, 4.5650102198520516E+13, 8.8830582190032688E+13, 8.8830582190032672E+13, 4.5650102198520516E+13, 1.1534950432785535E+13, 1.2904576022918081E+12, 5.1495083701695145E+10, 4.4723032442444843E+08, 1.5499533202970150E+05}; - CUFINUFFT_FLT 
c1[] = {8.9188339002980455E+05, 1.3065352538728631E+09, 9.9400185225815582E+10, 1.7136059013402410E+12, 1.0144146621675832E+13, 2.3034036018490723E+13, 1.4630967270448867E+13, -1.4630967270448859E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402410E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979524E+05}; - CUFINUFFT_FLT c2[] = {2.3170473769379673E+06, 1.7532505043698246E+09, 8.6523535958354294E+10, 9.7455289065487329E+11, 3.2977972139362295E+12, 1.7874626001697771E+12, -6.1480918082634004E+12, -6.1480918082633994E+12, 1.7874626001697695E+12, 3.2977972139362256E+12, 9.7455289065487366E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; - CUFINUFFT_FLT c3[] = {3.6089249230396431E+06, 1.4278058213962190E+09, 4.4296625537022438E+10, 2.9466624630419812E+11, 3.1903621584503357E+11, -9.8834691411254529E+11, -1.1072264714919219E+12, 1.1072264714919253E+12, 9.8834691411255261E+11, -3.1903621584503473E+11, -2.9466624630419775E+11, -4.4296625537022629E+10, -1.4278058213962216E+09, -3.6089249230396645E+06}; - CUFINUFFT_FLT c4[] = {3.7733555140851741E+06, 7.8376718099107432E+08, 1.4443117772349600E+10, 4.3197433307419121E+10, -7.6585042240582489E+10, -1.8569640140761731E+11, 2.0385335192658878E+11, 2.0385335192657968E+11, -1.8569640140762405E+11, -7.6585042240578430E+10, 4.3197433307418945E+10, 1.4443117772349699E+10, 7.8376718099107552E+08, 3.7733555140852556E+06}; - CUFINUFFT_FLT c5[] = {2.8079157920112349E+06, 3.0340753492383713E+08, 2.9498136661747351E+09, -6.2820200387927818E+08, -2.2372008390622681E+10, 1.5217518660587118E+10, 4.0682590266889229E+10, -4.0682590266876595E+10, -1.5217518660581694E+10, 2.2372008390624306E+10, 6.2820200387922049E+08, -2.9498136661747746E+09, -3.0340753492383796E+08, -2.8079157920112382E+06}; - CUFINUFFT_FLT c6[] = {1.5361613559533113E+06, 8.3513615594416350E+07, 3.0077547202707732E+08, -1.3749596754069650E+09, -6.6733027297582805E+08, 5.9590333632825184E+09, 
-4.3025685566887646E+09, -4.3025685566943264E+09, 5.9590333632825480E+09, -6.6733027297550666E+08, -1.3749596754065177E+09, 3.0077547202710402E+08, 8.3513615594416887E+07, 1.5361613559533583E+06}; - CUFINUFFT_FLT c7[] = {6.2759409419592936E+05, 1.5741723594963074E+07, -1.5632610223404476E+07, -1.9294824907080847E+08, 4.4643806532363749E+08, 1.5178998383416286E+07, -9.6771139892184162E+08, 9.6771139891756535E+08, -1.5178998386503356E+07, -4.4643806533349395E+08, 1.9294824907058707E+08, 1.5632610223392753E+07, -1.5741723594962660E+07, -6.2759409419590654E+05}; - CUFINUFFT_FLT c8[] = {1.9151404903933575E+05, 1.7156606891565928E+06, -9.7733523156610541E+06, 4.2982266236283993E+06, 5.1660907884816565E+07, -1.1279400211055294E+08, 6.4701089573887214E+07, 6.4701089567399226E+07, -1.1279400211297083E+08, 5.1660907891780980E+07, 4.2982266233826252E+06, -9.7733523156971950E+06, 1.7156606891561027E+06, 1.9151404903936631E+05}; - CUFINUFFT_FLT c9[] = {4.2715272622844830E+04, -2.2565910608684317E+03, -1.1769776156829668E+06, 4.0078399908543471E+06, -3.8951858064309461E+06, -5.0944610762301283E+06, 1.6765992441460442E+07, -1.6765992436785825E+07, 5.0944610781778852E+06, 3.8951858054570677E+06, -4.0078399907569592E+06, 1.1769776157156830E+06, 2.2565910609040961E+03, -4.2715272622820310E+04}; - CUFINUFFT_FLT c10[] = {6.4806786522791654E+03, -3.5474227032931303E+04, 1.8237100723206047E+04, 3.0934714627485734E+05, -1.0394703921956274E+06, 1.4743920336239333E+06, -7.3356882129423053E+05, -7.3356882916659222E+05, 1.4743920340662012E+06, -1.0394703928590287E+06, 3.0934714634119731E+05, 1.8237100680361433E+04, -3.5474227032996088E+04, 6.4806786523011797E+03}; - CUFINUFFT_FLT c11[] = {4.9913632908432180E+02, -5.5416668526903932E+03, 2.0614058707628108E+04, -3.2285139177838235E+04, -5.3099560012237780E+03, 1.1559000312360718E+05, -2.2569743818692098E+05, 2.2569743267254104E+05, -1.1559000606061178E+05, 5.3099530192621614E+03, 3.2285139062955688E+04, -2.0614058671415001E+04, 
5.5416668535488525E+03, -4.9913632906175445E+02}; - CUFINUFFT_FLT c12[] = {-3.3076333188770995E+01, -1.8970588549665433E+02, 1.8160423465108606E+03, -6.3715702906684537E+03, 1.2525623712293716E+04, -1.4199809613604592E+04, 6.4441857815348694E+03, 6.4441852068443368E+03, -1.4199811050333730E+04, 1.2525626046977848E+04, -6.3715705510753096E+03, 1.8160422724294601E+03, -1.8970588700494130E+02, -3.3076333169380085E+01}; - CUFINUFFT_FLT c13[] = {-1.4394533627757088E+01, 5.7000699312246105E+01, -1.0101141802233408E+02, -3.2954042015367456E+01, 6.1417873351558330E+02, -1.6177281811377129E+03, 2.4593356854220169E+03, -2.4593356782637338E+03, 1.6177289006539679E+03, -6.1417987494681950E+02, 3.2954142200289709E+01, 1.0101142888658896E+02, -5.7000698890466253E+01, 1.4394533639134110E+01}; - CUFINUFFT_FLT c14[] = {-1.5925952286169334E+00, 8.5113929411519127E+00, -2.8993517494090959E+01, 6.6373419665690747E+01, -1.0329523947888029E+02, 1.0280172537525394E+02, -4.3894765605046906E+01, -4.3897466711581743E+01, 1.0280269421314661E+02, -1.0329529425338121E+02, 6.6373405476301841E+01, -2.8993535416845578E+01, 8.5113925602355138E+00, -1.5925952196632756E+00}; - CUFINUFFT_FLT c15[] = {1.5984868375087002E-02, 1.2876155307218357E-01, -9.8359379953002779E-01, 3.7711056267887488E+00, -9.4307026856950991E+00, 1.6842022255882348E+01, -2.2310401016395307E+01, 2.2307954998498516E+01, -1.6843279237301534E+01, 9.4308852877255891E+00, -3.7711056267887488E+00, 9.8361025494556609E-01, -1.2876093931172500E-01, -1.5984859319657936E-02}; + constexpr CUFINUFFT_FLT c0[] = {1.5499533202966300E+05, 4.4723032442444748E+08, 5.1495083701694786E+10, 1.2904576022918081E+12, 1.1534950432785512E+13, 4.5650102198520516E+13, 8.8830582190032688E+13, 8.8830582190032672E+13, 4.5650102198520516E+13, 1.1534950432785535E+13, 1.2904576022918081E+12, 5.1495083701695145E+10, 4.4723032442444843E+08, 1.5499533202970150E+05}; + constexpr CUFINUFFT_FLT c1[] = {8.9188339002980455E+05, 1.3065352538728631E+09, 
9.9400185225815582E+10, 1.7136059013402410E+12, 1.0144146621675832E+13, 2.3034036018490723E+13, 1.4630967270448867E+13, -1.4630967270448859E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402410E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979524E+05}; + constexpr CUFINUFFT_FLT c2[] = {2.3170473769379673E+06, 1.7532505043698246E+09, 8.6523535958354294E+10, 9.7455289065487329E+11, 3.2977972139362295E+12, 1.7874626001697771E+12, -6.1480918082634004E+12, -6.1480918082633994E+12, 1.7874626001697695E+12, 3.2977972139362256E+12, 9.7455289065487366E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; + constexpr CUFINUFFT_FLT c3[] = {3.6089249230396431E+06, 1.4278058213962190E+09, 4.4296625537022438E+10, 2.9466624630419812E+11, 3.1903621584503357E+11, -9.8834691411254529E+11, -1.1072264714919219E+12, 1.1072264714919253E+12, 9.8834691411255261E+11, -3.1903621584503473E+11, -2.9466624630419775E+11, -4.4296625537022629E+10, -1.4278058213962216E+09, -3.6089249230396645E+06}; + constexpr CUFINUFFT_FLT c4[] = {3.7733555140851741E+06, 7.8376718099107432E+08, 1.4443117772349600E+10, 4.3197433307419121E+10, -7.6585042240582489E+10, -1.8569640140761731E+11, 2.0385335192658878E+11, 2.0385335192657968E+11, -1.8569640140762405E+11, -7.6585042240578430E+10, 4.3197433307418945E+10, 1.4443117772349699E+10, 7.8376718099107552E+08, 3.7733555140852556E+06}; + constexpr CUFINUFFT_FLT c5[] = {2.8079157920112349E+06, 3.0340753492383713E+08, 2.9498136661747351E+09, -6.2820200387927818E+08, -2.2372008390622681E+10, 1.5217518660587118E+10, 4.0682590266889229E+10, -4.0682590266876595E+10, -1.5217518660581694E+10, 2.2372008390624306E+10, 6.2820200387922049E+08, -2.9498136661747746E+09, -3.0340753492383796E+08, -2.8079157920112382E+06}; + constexpr CUFINUFFT_FLT c6[] = {1.5361613559533113E+06, 8.3513615594416350E+07, 3.0077547202707732E+08, -1.3749596754069650E+09, -6.6733027297582805E+08, 5.9590333632825184E+09, 
-4.3025685566887646E+09, -4.3025685566943264E+09, 5.9590333632825480E+09, -6.6733027297550666E+08, -1.3749596754065177E+09, 3.0077547202710402E+08, 8.3513615594416887E+07, 1.5361613559533583E+06}; + constexpr CUFINUFFT_FLT c7[] = {6.2759409419592936E+05, 1.5741723594963074E+07, -1.5632610223404476E+07, -1.9294824907080847E+08, 4.4643806532363749E+08, 1.5178998383416286E+07, -9.6771139892184162E+08, 9.6771139891756535E+08, -1.5178998386503356E+07, -4.4643806533349395E+08, 1.9294824907058707E+08, 1.5632610223392753E+07, -1.5741723594962660E+07, -6.2759409419590654E+05}; + constexpr CUFINUFFT_FLT c8[] = {1.9151404903933575E+05, 1.7156606891565928E+06, -9.7733523156610541E+06, 4.2982266236283993E+06, 5.1660907884816565E+07, -1.1279400211055294E+08, 6.4701089573887214E+07, 6.4701089567399226E+07, -1.1279400211297083E+08, 5.1660907891780980E+07, 4.2982266233826252E+06, -9.7733523156971950E+06, 1.7156606891561027E+06, 1.9151404903936631E+05}; + constexpr CUFINUFFT_FLT c9[] = {4.2715272622844830E+04, -2.2565910608684317E+03, -1.1769776156829668E+06, 4.0078399908543471E+06, -3.8951858064309461E+06, -5.0944610762301283E+06, 1.6765992441460442E+07, -1.6765992436785825E+07, 5.0944610781778852E+06, 3.8951858054570677E+06, -4.0078399907569592E+06, 1.1769776157156830E+06, 2.2565910609040961E+03, -4.2715272622820310E+04}; + constexpr CUFINUFFT_FLT c10[] = {6.4806786522791654E+03, -3.5474227032931303E+04, 1.8237100723206047E+04, 3.0934714627485734E+05, -1.0394703921956274E+06, 1.4743920336239333E+06, -7.3356882129423053E+05, -7.3356882916659222E+05, 1.4743920340662012E+06, -1.0394703928590287E+06, 3.0934714634119731E+05, 1.8237100680361433E+04, -3.5474227032996088E+04, 6.4806786523011797E+03}; + constexpr CUFINUFFT_FLT c11[] = {4.9913632908432180E+02, -5.5416668526903932E+03, 2.0614058707628108E+04, -3.2285139177838235E+04, -5.3099560012237780E+03, 1.1559000312360718E+05, -2.2569743818692098E+05, 2.2569743267254104E+05, -1.1559000606061178E+05, 5.3099530192621614E+03, 
3.2285139062955688E+04, -2.0614058671415001E+04, 5.5416668535488525E+03, -4.9913632906175445E+02}; + constexpr CUFINUFFT_FLT c12[] = {-3.3076333188770995E+01, -1.8970588549665433E+02, 1.8160423465108606E+03, -6.3715702906684537E+03, 1.2525623712293716E+04, -1.4199809613604592E+04, 6.4441857815348694E+03, 6.4441852068443368E+03, -1.4199811050333730E+04, 1.2525626046977848E+04, -6.3715705510753096E+03, 1.8160422724294601E+03, -1.8970588700494130E+02, -3.3076333169380085E+01}; + constexpr CUFINUFFT_FLT c13[] = {-1.4394533627757088E+01, 5.7000699312246105E+01, -1.0101141802233408E+02, -3.2954042015367456E+01, 6.1417873351558330E+02, -1.6177281811377129E+03, 2.4593356854220169E+03, -2.4593356782637338E+03, 1.6177289006539679E+03, -6.1417987494681950E+02, 3.2954142200289709E+01, 1.0101142888658896E+02, -5.7000698890466253E+01, 1.4394533639134110E+01}; + constexpr CUFINUFFT_FLT c14[] = {-1.5925952286169334E+00, 8.5113929411519127E+00, -2.8993517494090959E+01, 6.6373419665690747E+01, -1.0329523947888029E+02, 1.0280172537525394E+02, -4.3894765605046906E+01, -4.3897466711581743E+01, 1.0280269421314661E+02, -1.0329529425338121E+02, 6.6373405476301841E+01, -2.8993535416845578E+01, 8.5113925602355138E+00, -1.5925952196632756E+00}; + constexpr CUFINUFFT_FLT c15[] = {1.5984868375087002E-02, 1.2876155307218357E-01, -9.8359379953002779E-01, 3.7711056267887488E+00, -9.4307026856950991E+00, 1.6842022255882348E+01, -2.2310401016395307E+01, 2.2307954998498516E+01, -1.6843279237301534E+01, 9.4308852877255891E+00, -3.7711056267887488E+00, 9.8361025494556609E-01, -1.2876093931172500E-01, -1.5984859319657936E-02}; for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); } else if (w==15) { - CUFINUFFT_FLT c0[] = {2.3939707792241831E+05, 9.7700272582690263E+08, 1.4715933396485272E+11, 4.7242424833337188E+12, 
5.3987426629953602E+13, 2.7580474290566097E+14, 7.0693378336533425E+14, 9.6196578554477812E+14, 7.0693378336533450E+14, 2.7580474290566138E+14, 5.3987426629953812E+13, 4.7242424833337275E+12, 1.4715933396485272E+11, 9.7700272582690227E+08, 2.3939707792241947E+05}; - CUFINUFFT_FLT c1[] = {1.4314487885226030E+06, 2.9961416925358467E+09, 3.0273361232748438E+11, 6.8507333793903594E+12, 5.4192702756911008E+13, 1.7551587948105312E+14, 2.1874615668430153E+14, 7.1650878467891699E-02, -2.1874615668430153E+14, -1.7551587948105331E+14, -5.4192702756911164E+13, -6.8507333793903701E+12, -3.0273361232748431E+11, -2.9961416925358462E+09, -1.4314487885226023E+06}; - CUFINUFFT_FLT c2[] = {3.8829497354762917E+06, 4.2473082696966438E+09, 2.8414312556015527E+11, 4.3688281331121411E+12, 2.1823119508000523E+13, 3.2228098609392012E+13, -2.1833085454691871E+13, -7.3750710225100922E+13, -2.1833085454691941E+13, 3.2228098609392000E+13, 2.1823119508000590E+13, 4.3688281331121475E+12, 2.8414312556015521E+11, 4.2473082696966453E+09, 3.8829497354762908E+06}; - CUFINUFFT_FLT c3[] = {6.3495763451755773E+06, 3.6841035003733954E+09, 1.5965774278321045E+11, 1.5630338683778196E+12, 3.8749058615819282E+12, -2.7319740087723496E+12, -1.3233342822865416E+13, 1.2094759019991106E-03, 1.3233342822865408E+13, 2.7319740087723706E+12, -3.8749058615819390E+12, -1.5630338683778196E+12, -1.5965774278321036E+11, -3.6841035003733935E+09, -6.3495763451755773E+06}; - CUFINUFFT_FLT c4[] = {7.0146619045520434E+06, 2.1782897863065772E+09, 5.8897780310148148E+10, 3.1953009601770453E+11, 4.0651527030852091E+08, -1.6379148273275527E+12, -1.1568753136999574E+11, 2.7451653250461855E+12, -1.1568753137002715E+11, -1.6379148273276675E+12, 4.0651527030276263E+08, 3.1953009601770386E+11, 5.8897780310148087E+10, 2.1782897863065767E+09, 7.0146619045520416E+06}; - CUFINUFFT_FLT c5[] = {5.5580012413990181E+06, 9.2345162185944223E+08, 1.4522950934020067E+10, 2.7025952371212223E+10, -1.2304576967641710E+11, -1.0116752717201025E+11, 
3.8517418245457495E+11, 1.1720185410178396E-01, -3.8517418245448737E+11, 1.0116752717220248E+11, 1.2304576967643900E+11, -2.7025952371215157E+10, -1.4522950934020073E+10, -9.2345162185944128E+08, -5.5580012413990190E+06}; - CUFINUFFT_FLT c6[] = {3.2693972344231787E+06, 2.8610260147425157E+08, 2.2348528403750129E+09, -3.4574515574239435E+09, -1.7480626463586948E+10, 3.1608597465528339E+10, 1.9879262560041798E+10, -6.6148013553832657E+10, 1.9879262560029728E+10, 3.1608597465497307E+10, -1.7480626463581020E+10, -3.4574515574192748E+09, 2.2348528403750839E+09, 2.8610260147425318E+08, 3.2693972344231806E+06}; - CUFINUFFT_FLT c7[] = {1.4553539959296260E+06, 6.4136842048383795E+07, 1.3622336582061595E+08, -1.2131510424646864E+09, 6.4322366984170294E+08, 4.5078753872136936E+09, -7.1689413747181644E+09, -1.1786171556070136E-02, 7.1689413746620741E+09, -4.5078753875125484E+09, -6.4322366985783029E+08, 1.2131510424602287E+09, -1.3622336582069945E+08, -6.4136842048384361E+07, -1.4553539959296270E+06}; - CUFINUFFT_FLT c8[] = {4.9358776531681529E+05, 9.7772970960589685E+06, -2.3511574237970300E+07, -1.0142613816602133E+08, 3.9421144218642426E+08, -2.8449115593954617E+08, -5.7549243245203042E+08, 1.1608781631399941E+09, -5.7549243247572994E+08, -2.8449115597919518E+08, 3.9421144214433813E+08, -1.0142613816466759E+08, -2.3511574237996321E+07, 9.7772970960581861E+06, 4.9358776531681448E+05}; - CUFINUFFT_FLT c9[] = {1.2660319987326673E+05, 7.7519511328176421E+05, -6.5244610661542173E+06, 9.0878257489026226E+06, 2.3116605620370809E+07, -8.7079594480778053E+07, 9.5542733720576629E+07, 4.2723164545317951E-02, -9.5542733670714036E+07, 8.7079594586736053E+07, -2.3116605561938088E+07, -9.0878257517268714E+06, 6.5244610661359569E+06, -7.7519511328043276E+05, -1.2660319987326747E+05}; - CUFINUFFT_FLT c10[] = {2.3793325531458449E+04, -4.2305332803592217E+04, -5.2884156986641441E+05, 2.5307340140247596E+06, -4.0404175229102052E+06, -1.7519991511035681E+05, 1.0146438775036881E+07, 
-1.5828545434039038E+07, 1.0146438771144925E+07, -1.7520004460626876E+05, -4.0404175749208611E+06, 2.5307340154400147E+06, -5.2884156982771575E+05, -4.2305332803462676E+04, 2.3793325531458788E+04}; - CUFINUFFT_FLT c11[] = {2.9741655196842516E+03, -2.0687056404176896E+04, 3.3295507782231041E+04, 1.0661145714339131E+05, -5.6644238113375264E+05, 1.0874811579280477E+06, -9.6561272951275646E+05, -5.1287199081408294E-03, 9.6561272024221742E+05, -1.0874812519522079E+06, 5.6644242684715183E+05, -1.0661145918131116E+05, -3.3295507839673090E+04, 2.0687056403552484E+04, -2.9741655196846054E+03}; - CUFINUFFT_FLT c12[] = {1.5389176594851995E+02, -2.3864418514303975E+03, 1.0846266940782971E+04, -2.2940053288728755E+04, 1.4780109856545603E+04, 4.2663625334078126E+04, -1.3047651001642903E+05, 1.7468402233671257E+05, -1.3047651921148783E+05, 4.2663543727874072E+04, 1.4780033422571960E+04, -2.2940053360564565E+04, 1.0846266911599001E+04, -2.3864418523423406E+03, 1.5389176594715920E+02}; - CUFINUFFT_FLT c13[] = {-2.3857631312189291E+01, -1.9651605604649610E+01, 6.4183085202559698E+02, -2.8648428618202479E+03, 6.8249256924540387E+03, -9.7944454945500202E+03, 7.6177717113307281E+03, 1.2047808031005401E-02, -7.6177543637173221E+03, 9.7944303211006554E+03, -6.8249067869823548E+03, 2.8648410033462715E+03, -6.4183084900019139E+02, 1.9651606442715156E+01, 2.3857631312384541E+01}; - CUFINUFFT_FLT c14[] = {-6.1348505741956316E+00, 2.7872916029950378E+01, -6.5819949282243059E+01, 5.1366943137229264E+01, 1.7214074364107390E+02, -6.9658313160417026E+02, 1.3192072946885612E+03, -1.6053709652649356E+03, 1.3192033489278531E+03, -6.9663899461741221E+02, 1.7211498258980890E+02, 5.1367587332701412E+01, -6.5819942079787495E+01, 2.7872915852722411E+01, -6.1348505745937754E+00}; - CUFINUFFT_FLT c15[] = {-4.9671584494050897E-01, 3.0617548962871655E+00, -1.1650680501534040E+01, 3.0081518778147480E+01, -5.4027643304315461E+01, 6.6072752684824721E+01, -4.7155420133398515E+01, -5.6540863480770403E-03, 
4.7158681490594240E+01, -6.6050534688928863E+01, 5.4059169757207428E+01, -3.0081909461561551E+01, 1.1650669885136919E+01, -3.0617550621683702E+00, 4.9671584460032286E-01}; - CUFINUFFT_FLT c16[] = {4.3460787769280373E-03, -1.3199805974685097E-02, -1.9413550415167488E-01, 1.1330353009743728E+00, -3.4412627904689330E+00, 7.1628360506506050E+00, -1.1104833360853762E+01, 1.2402582581952625E+01, -1.1114919494696498E+01, 7.0930736249049993E+00, -3.4864402649728556E+00, 1.1323392526753271E+00, -1.9415335680557039E-01, -1.3200242030886846E-02, 4.3460779753541788E-03}; + constexpr CUFINUFFT_FLT c0[] = {2.3939707792241831E+05, 9.7700272582690263E+08, 1.4715933396485272E+11, 4.7242424833337188E+12, 5.3987426629953602E+13, 2.7580474290566097E+14, 7.0693378336533425E+14, 9.6196578554477812E+14, 7.0693378336533450E+14, 2.7580474290566138E+14, 5.3987426629953812E+13, 4.7242424833337275E+12, 1.4715933396485272E+11, 9.7700272582690227E+08, 2.3939707792241947E+05}; + constexpr CUFINUFFT_FLT c1[] = {1.4314487885226030E+06, 2.9961416925358467E+09, 3.0273361232748438E+11, 6.8507333793903594E+12, 5.4192702756911008E+13, 1.7551587948105312E+14, 2.1874615668430153E+14, 7.1650878467891699E-02, -2.1874615668430153E+14, -1.7551587948105331E+14, -5.4192702756911164E+13, -6.8507333793903701E+12, -3.0273361232748431E+11, -2.9961416925358462E+09, -1.4314487885226023E+06}; + constexpr CUFINUFFT_FLT c2[] = {3.8829497354762917E+06, 4.2473082696966438E+09, 2.8414312556015527E+11, 4.3688281331121411E+12, 2.1823119508000523E+13, 3.2228098609392012E+13, -2.1833085454691871E+13, -7.3750710225100922E+13, -2.1833085454691941E+13, 3.2228098609392000E+13, 2.1823119508000590E+13, 4.3688281331121475E+12, 2.8414312556015521E+11, 4.2473082696966453E+09, 3.8829497354762908E+06}; + constexpr CUFINUFFT_FLT c3[] = {6.3495763451755773E+06, 3.6841035003733954E+09, 1.5965774278321045E+11, 1.5630338683778196E+12, 3.8749058615819282E+12, -2.7319740087723496E+12, -1.3233342822865416E+13, 1.2094759019991106E-03, 
1.3233342822865408E+13, 2.7319740087723706E+12, -3.8749058615819390E+12, -1.5630338683778196E+12, -1.5965774278321036E+11, -3.6841035003733935E+09, -6.3495763451755773E+06}; + constexpr CUFINUFFT_FLT c4[] = {7.0146619045520434E+06, 2.1782897863065772E+09, 5.8897780310148148E+10, 3.1953009601770453E+11, 4.0651527030852091E+08, -1.6379148273275527E+12, -1.1568753136999574E+11, 2.7451653250461855E+12, -1.1568753137002715E+11, -1.6379148273276675E+12, 4.0651527030276263E+08, 3.1953009601770386E+11, 5.8897780310148087E+10, 2.1782897863065767E+09, 7.0146619045520416E+06}; + constexpr CUFINUFFT_FLT c5[] = {5.5580012413990181E+06, 9.2345162185944223E+08, 1.4522950934020067E+10, 2.7025952371212223E+10, -1.2304576967641710E+11, -1.0116752717201025E+11, 3.8517418245457495E+11, 1.1720185410178396E-01, -3.8517418245448737E+11, 1.0116752717220248E+11, 1.2304576967643900E+11, -2.7025952371215157E+10, -1.4522950934020073E+10, -9.2345162185944128E+08, -5.5580012413990190E+06}; + constexpr CUFINUFFT_FLT c6[] = {3.2693972344231787E+06, 2.8610260147425157E+08, 2.2348528403750129E+09, -3.4574515574239435E+09, -1.7480626463586948E+10, 3.1608597465528339E+10, 1.9879262560041798E+10, -6.6148013553832657E+10, 1.9879262560029728E+10, 3.1608597465497307E+10, -1.7480626463581020E+10, -3.4574515574192748E+09, 2.2348528403750839E+09, 2.8610260147425318E+08, 3.2693972344231806E+06}; + constexpr CUFINUFFT_FLT c7[] = {1.4553539959296260E+06, 6.4136842048383795E+07, 1.3622336582061595E+08, -1.2131510424646864E+09, 6.4322366984170294E+08, 4.5078753872136936E+09, -7.1689413747181644E+09, -1.1786171556070136E-02, 7.1689413746620741E+09, -4.5078753875125484E+09, -6.4322366985783029E+08, 1.2131510424602287E+09, -1.3622336582069945E+08, -6.4136842048384361E+07, -1.4553539959296270E+06}; + constexpr CUFINUFFT_FLT c8[] = {4.9358776531681529E+05, 9.7772970960589685E+06, -2.3511574237970300E+07, -1.0142613816602133E+08, 3.9421144218642426E+08, -2.8449115593954617E+08, -5.7549243245203042E+08, 
1.1608781631399941E+09, -5.7549243247572994E+08, -2.8449115597919518E+08, 3.9421144214433813E+08, -1.0142613816466759E+08, -2.3511574237996321E+07, 9.7772970960581861E+06, 4.9358776531681448E+05}; + constexpr CUFINUFFT_FLT c9[] = {1.2660319987326673E+05, 7.7519511328176421E+05, -6.5244610661542173E+06, 9.0878257489026226E+06, 2.3116605620370809E+07, -8.7079594480778053E+07, 9.5542733720576629E+07, 4.2723164545317951E-02, -9.5542733670714036E+07, 8.7079594586736053E+07, -2.3116605561938088E+07, -9.0878257517268714E+06, 6.5244610661359569E+06, -7.7519511328043276E+05, -1.2660319987326747E+05}; + constexpr CUFINUFFT_FLT c10[] = {2.3793325531458449E+04, -4.2305332803592217E+04, -5.2884156986641441E+05, 2.5307340140247596E+06, -4.0404175229102052E+06, -1.7519991511035681E+05, 1.0146438775036881E+07, -1.5828545434039038E+07, 1.0146438771144925E+07, -1.7520004460626876E+05, -4.0404175749208611E+06, 2.5307340154400147E+06, -5.2884156982771575E+05, -4.2305332803462676E+04, 2.3793325531458788E+04}; + constexpr CUFINUFFT_FLT c11[] = {2.9741655196842516E+03, -2.0687056404176896E+04, 3.3295507782231041E+04, 1.0661145714339131E+05, -5.6644238113375264E+05, 1.0874811579280477E+06, -9.6561272951275646E+05, -5.1287199081408294E-03, 9.6561272024221742E+05, -1.0874812519522079E+06, 5.6644242684715183E+05, -1.0661145918131116E+05, -3.3295507839673090E+04, 2.0687056403552484E+04, -2.9741655196846054E+03}; + constexpr CUFINUFFT_FLT c12[] = {1.5389176594851995E+02, -2.3864418514303975E+03, 1.0846266940782971E+04, -2.2940053288728755E+04, 1.4780109856545603E+04, 4.2663625334078126E+04, -1.3047651001642903E+05, 1.7468402233671257E+05, -1.3047651921148783E+05, 4.2663543727874072E+04, 1.4780033422571960E+04, -2.2940053360564565E+04, 1.0846266911599001E+04, -2.3864418523423406E+03, 1.5389176594715920E+02}; + constexpr CUFINUFFT_FLT c13[] = {-2.3857631312189291E+01, -1.9651605604649610E+01, 6.4183085202559698E+02, -2.8648428618202479E+03, 6.8249256924540387E+03, -9.7944454945500202E+03, 
7.6177717113307281E+03, 1.2047808031005401E-02, -7.6177543637173221E+03, 9.7944303211006554E+03, -6.8249067869823548E+03, 2.8648410033462715E+03, -6.4183084900019139E+02, 1.9651606442715156E+01, 2.3857631312384541E+01}; + constexpr CUFINUFFT_FLT c14[] = {-6.1348505741956316E+00, 2.7872916029950378E+01, -6.5819949282243059E+01, 5.1366943137229264E+01, 1.7214074364107390E+02, -6.9658313160417026E+02, 1.3192072946885612E+03, -1.6053709652649356E+03, 1.3192033489278531E+03, -6.9663899461741221E+02, 1.7211498258980890E+02, 5.1367587332701412E+01, -6.5819942079787495E+01, 2.7872915852722411E+01, -6.1348505745937754E+00}; + constexpr CUFINUFFT_FLT c15[] = {-4.9671584494050897E-01, 3.0617548962871655E+00, -1.1650680501534040E+01, 3.0081518778147480E+01, -5.4027643304315461E+01, 6.6072752684824721E+01, -4.7155420133398515E+01, -5.6540863480770403E-03, 4.7158681490594240E+01, -6.6050534688928863E+01, 5.4059169757207428E+01, -3.0081909461561551E+01, 1.1650669885136919E+01, -3.0617550621683702E+00, 4.9671584460032286E-01}; + constexpr CUFINUFFT_FLT c16[] = {4.3460787769280373E-03, -1.3199805974685097E-02, -1.9413550415167488E-01, 1.1330353009743728E+00, -3.4412627904689330E+00, 7.1628360506506050E+00, -1.1104833360853762E+01, 1.2402582581952625E+01, -1.1114919494696498E+01, 7.0930736249049993E+00, -3.4864402649728556E+00, 1.1323392526753271E+00, -1.9415335680557039E-01, -1.3200242030886846E-02, 4.3460779753541788E-03}; for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); } else if (w==16) { - CUFINUFFT_FLT c0[] = {3.6434551345571090E+05, 2.0744705928579485E+09, 4.0355760945670044E+11, 1.6364575388763037E+13, 2.3514830376056556E+14, 1.5192201717462535E+15, 4.9956173084674140E+15, 8.9287666945127430E+15, 8.9287666945127430E+15, 4.9956173084674140E+15, 1.5192201717462535E+15, 
2.3514830376056556E+14, 1.6364575388763041E+13, 4.0355760945670050E+11, 2.0744705928579490E+09, 3.6434551345570857E+05}; - CUFINUFFT_FLT c1[] = {2.2576246485480363E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, 2.5606844387131055E+13, 2.6313738449330159E+14, 1.1495095100701462E+15, 2.1932582707747572E+15, 1.2860244365132600E+15, -1.2860244365132588E+15, -2.1932582707747572E+15, -1.1495095100701462E+15, -2.6313738449330169E+14, -2.5606844387131066E+13, -8.7873753526056323E+11, -6.6499571180086451E+09, -2.2576246485480368E+06}; - CUFINUFFT_FLT c2[] = {6.3730995546265068E+06, 9.9060026035198040E+09, 8.8097248605448987E+11, 1.7953384130753676E+13, 1.2398425545001648E+14, 3.0749346493041212E+14, 1.0259777520247089E+14, -5.5291976457534288E+14, -5.5291976457534375E+14, 1.0259777520247070E+14, 3.0749346493041225E+14, 1.2398425545001656E+14, 1.7953384130753684E+13, 8.8097248605449011E+11, 9.9060026035198078E+09, 6.3730995546265068E+06}; - CUFINUFFT_FLT c3[] = {1.0896915393078225E+07, 9.0890343524593887E+09, 5.3565169504010028E+11, 7.3004206720038701E+12, 2.9692333044160082E+13, 1.6051737468109645E+13, -9.1273329108089531E+13, -8.5999306918502797E+13, 8.5999306918501641E+13, 9.1273329108090062E+13, -1.6051737468109594E+13, -2.9692333044160074E+13, -7.3004206720038711E+12, -5.3565169504010034E+11, -9.0890343524593887E+09, -1.0896915393078221E+07}; - CUFINUFFT_FLT c4[] = {1.2655725616100587E+07, 5.7342804054544201E+09, 2.1822836608899588E+11, 1.8300700858999731E+12, 2.7770431049858564E+12, -8.5034969223847109E+12, -1.2846668467422201E+13, 1.6519076896574611E+13, 1.6519076896573730E+13, -1.2846668467421688E+13, -8.5034969223849521E+12, 2.7770431049858491E+12, 1.8300700858999692E+12, 2.1822836608899588E+11, 5.7342804054544220E+09, 1.2655725616100591E+07}; - CUFINUFFT_FLT c5[] = {1.0609303958036324E+07, 2.6255609052371726E+09, 6.1673589426039383E+10, 2.6044432099084976E+11, -3.5431628074578320E+11, -1.6077602129636006E+12, 1.5534405614729011E+12, 2.8019935380861670E+12, 
-2.8019935380844810E+12, -1.5534405614727644E+12, 1.6077602129636335E+12, 3.5431628074576636E+11, -2.6044432099085037E+11, -6.1673589426039368E+10, -2.6255609052371726E+09, -1.0609303958036324E+07}; - CUFINUFFT_FLT c6[] = {6.6544809363384563E+06, 8.9490403680928385E+08, 1.1882638725190760E+10, 8.1552898137784090E+09, -1.2575562817891687E+11, 2.7074695075842178E+10, 3.9453789461922034E+11, -3.1679644857435541E+11, -3.1679644857440692E+11, 3.9453789461951154E+11, 2.7074695076007500E+10, -1.2575562817885344E+11, 8.1552898137852116E+09, 1.1882638725191153E+10, 8.9490403680928493E+08, 6.6544809363384582E+06}; - CUFINUFFT_FLT c7[] = {3.1906872142824987E+06, 2.2785946180651781E+08, 1.3744578972809656E+09, -4.3997172592913818E+09, -9.2011130754125404E+09, 3.4690551711826530E+10, -9.4227043395316906E+09, -5.9308465069991577E+10, 5.9308465068943581E+10, 9.4227043392705956E+09, -3.4690551712022408E+10, 9.2011130753675175E+09, 4.3997172592866106E+09, -1.3744578972812984E+09, -2.2785946180652174E+08, -3.1906872142824973E+06}; - CUFINUFFT_FLT c8[] = {1.1821527096621725E+06, 4.2281234059839047E+07, 2.8723226058821958E+07, -8.3553955857311106E+08, 1.2447304829054153E+09, 2.1955280944846683E+09, -7.0514195725593920E+09, 4.3745141235010500E+09, 4.3745141236655197E+09, -7.0514195727234411E+09, 2.1955280942826533E+09, 1.2447304829048812E+09, -8.3553955857841730E+08, 2.8723226058853466E+07, 4.2281234059838966E+07, 1.1821527096621748E+06}; - CUFINUFFT_FLT c9[] = {3.3854610744280228E+05, 5.2176984975098642E+06, -2.0677283564981934E+07, -3.5831818966960624E+07, 2.6599346104854527E+08, -3.7992777983589816E+08, -1.3426914439904341E+08, 9.1752051209279442E+08, -9.1752051188087845E+08, 1.3426914452369988E+08, 3.7992777987329507E+08, -2.6599346107659298E+08, 3.5831818968129277E+07, 2.0677283565073237E+07, -5.2176984975084374E+06, -3.3854610744280077E+05}; - CUFINUFFT_FLT c10[] = {7.3893334077309293E+04, 2.6983804209740972E+05, -3.6415998560880083E+06, 8.4025485863333493E+06, 
4.9278860779347531E+06, -5.1437033824108891E+07, 8.7603898602732122E+07, -4.6199497846299231E+07, -4.6199498219926819E+07, 8.7603898832003579E+07, -5.1437033801464774E+07, 4.9278861005788362E+06, 8.4025485870409794E+06, -3.6415998559663831E+06, 2.6983804209585470E+05, 7.3893334077307591E+04}; - CUFINUFFT_FLT c11[] = {1.1778892113374410E+04, -4.0077190109195144E+04, -1.8372552183899941E+05, 1.3262878359201169E+06, -2.9738540144900386E+06, 1.9493508843214174E+06, 4.1881949043266159E+06, -1.1066749441324197E+07, 1.1066749225224417E+07, -4.1881949989500660E+06, -1.9493509811827433E+06, 2.9738539876374160E+06, -1.3262878392766861E+06, 1.8372552166916840E+05, 4.0077190106541901E+04, -1.1778892113374635E+04}; - CUFINUFFT_FLT c12[] = {1.2019749667905517E+03, -1.0378455845905968E+04, 2.6333352626226591E+04, 1.7117060824677988E+04, -2.5133287788479996E+05, 6.4713912423136400E+05, -8.1634971996757365E+05, 3.8623850687193515E+05, 3.8623887467457692E+05, -8.1634999581952032E+05, 6.4713888515965885E+05, -2.5133289397614688E+05, 1.7117056658162492E+04, 2.6333352590306949E+04, -1.0378455846607170E+04, 1.2019749667886601E+03}; - CUFINUFFT_FLT c13[] = {3.1189837633271310E+01, -8.9083493666530228E+02, 4.9454294721013366E+03, -1.3124691362129612E+04, 1.5834782149156119E+04, 6.9607783053915546E+03, -5.9789949050326162E+04, 1.0841720290002371E+05, -1.0841726183381994E+05, 5.9790023686287932E+04, -6.9607416211385053E+03, -1.5834800728954084E+04, 1.3124692508510609E+04, -4.9454294244132070E+03, 8.9083493795553227E+02, -3.1189837630675466E+01}; - CUFINUFFT_FLT c14[] = {-1.2975319073318561E+01, 1.8283698900397550E+01, 1.7684013462935113E+02, -1.1059907069976271E+03, 3.1998196269059799E+03, -5.5988285845467362E+03, 5.9248624962359208E+03, -2.5987075415506133E+03, -2.5989297031998472E+03, 5.9249309327755627E+03, -5.5988287659129119E+03, 3.1998292347735460E+03, -1.1059914993060199E+03, 1.7684017599586443E+02, 1.8283697951655380E+01, -1.2975319075406015E+01}; - CUFINUFFT_FLT c15[] = 
{-2.3155118737567935E+00, 1.1938503501764195E+01, -3.4150613932459848E+01, 4.8896713096147266E+01, 1.5844216816345641E+01, -2.4277080939345015E+02, 6.0146058115394737E+02, -8.8748160721868635E+02, 8.8732832343048744E+02, -6.0146927810646923E+02, 2.4275722040513463E+02, -1.5849652411671842E+01, -4.8897528435446198E+01, 3.4150596946224454E+01, -1.1938504032584051E+01, 2.3155118728820292E+00}; - CUFINUFFT_FLT c16[] = {-1.5401723736175238E-01, 9.8067757197686212E-01, -4.1901188293318530E+00, 1.2150691895619683E+01, -2.4764820628534302E+01, 3.6081462800085532E+01, -3.4534922277532473E+01, 1.2910251318703700E+01, 1.3098525817101535E+01, -3.4588714991360455E+01, 3.5973877372429698E+01, -2.4775747273530602E+01, 1.2149010873312557E+01, -4.1901467369287460E+00, 9.8067700766883559E-01, -1.5401723876450651E-01}; - CUFINUFFT_FLT c17[] = {1.1808835457017667E-02, -2.5443945538745794E-02, -1.3157119144786456E-04, 2.5877310634925382E-01, -1.0920774586473376E+00, 2.6473618304294715E+00, -4.4448325935254926E+00, 6.8292491990998831E+00, -6.8300632710034588E+00, 4.4643703192113184E+00, -2.6384070394901351E+00, 1.0890246890089277E+00, -2.5849326913239973E-01, 1.4031610447463365E-04, 2.5444280926035151E-02, -1.1808834729180664E-02}; + constexpr CUFINUFFT_FLT c0[] = {3.6434551345571090E+05, 2.0744705928579485E+09, 4.0355760945670044E+11, 1.6364575388763037E+13, 2.3514830376056556E+14, 1.5192201717462535E+15, 4.9956173084674140E+15, 8.9287666945127430E+15, 8.9287666945127430E+15, 4.9956173084674140E+15, 1.5192201717462535E+15, 2.3514830376056556E+14, 1.6364575388763041E+13, 4.0355760945670050E+11, 2.0744705928579490E+09, 3.6434551345570857E+05}; + constexpr CUFINUFFT_FLT c1[] = {2.2576246485480363E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, 2.5606844387131055E+13, 2.6313738449330159E+14, 1.1495095100701462E+15, 2.1932582707747572E+15, 1.2860244365132600E+15, -1.2860244365132588E+15, -2.1932582707747572E+15, -1.1495095100701462E+15, -2.6313738449330169E+14, -2.5606844387131066E+13, 
-8.7873753526056323E+11, -6.6499571180086451E+09, -2.2576246485480368E+06}; + constexpr CUFINUFFT_FLT c2[] = {6.3730995546265068E+06, 9.9060026035198040E+09, 8.8097248605448987E+11, 1.7953384130753676E+13, 1.2398425545001648E+14, 3.0749346493041212E+14, 1.0259777520247089E+14, -5.5291976457534288E+14, -5.5291976457534375E+14, 1.0259777520247070E+14, 3.0749346493041225E+14, 1.2398425545001656E+14, 1.7953384130753684E+13, 8.8097248605449011E+11, 9.9060026035198078E+09, 6.3730995546265068E+06}; + constexpr CUFINUFFT_FLT c3[] = {1.0896915393078225E+07, 9.0890343524593887E+09, 5.3565169504010028E+11, 7.3004206720038701E+12, 2.9692333044160082E+13, 1.6051737468109645E+13, -9.1273329108089531E+13, -8.5999306918502797E+13, 8.5999306918501641E+13, 9.1273329108090062E+13, -1.6051737468109594E+13, -2.9692333044160074E+13, -7.3004206720038711E+12, -5.3565169504010034E+11, -9.0890343524593887E+09, -1.0896915393078221E+07}; + constexpr CUFINUFFT_FLT c4[] = {1.2655725616100587E+07, 5.7342804054544201E+09, 2.1822836608899588E+11, 1.8300700858999731E+12, 2.7770431049858564E+12, -8.5034969223847109E+12, -1.2846668467422201E+13, 1.6519076896574611E+13, 1.6519076896573730E+13, -1.2846668467421688E+13, -8.5034969223849521E+12, 2.7770431049858491E+12, 1.8300700858999692E+12, 2.1822836608899588E+11, 5.7342804054544220E+09, 1.2655725616100591E+07}; + constexpr CUFINUFFT_FLT c5[] = {1.0609303958036324E+07, 2.6255609052371726E+09, 6.1673589426039383E+10, 2.6044432099084976E+11, -3.5431628074578320E+11, -1.6077602129636006E+12, 1.5534405614729011E+12, 2.8019935380861670E+12, -2.8019935380844810E+12, -1.5534405614727644E+12, 1.6077602129636335E+12, 3.5431628074576636E+11, -2.6044432099085037E+11, -6.1673589426039368E+10, -2.6255609052371726E+09, -1.0609303958036324E+07}; + constexpr CUFINUFFT_FLT c6[] = {6.6544809363384563E+06, 8.9490403680928385E+08, 1.1882638725190760E+10, 8.1552898137784090E+09, -1.2575562817891687E+11, 2.7074695075842178E+10, 3.9453789461922034E+11, 
-3.1679644857435541E+11, -3.1679644857440692E+11, 3.9453789461951154E+11, 2.7074695076007500E+10, -1.2575562817885344E+11, 8.1552898137852116E+09, 1.1882638725191153E+10, 8.9490403680928493E+08, 6.6544809363384582E+06}; + constexpr CUFINUFFT_FLT c7[] = {3.1906872142824987E+06, 2.2785946180651781E+08, 1.3744578972809656E+09, -4.3997172592913818E+09, -9.2011130754125404E+09, 3.4690551711826530E+10, -9.4227043395316906E+09, -5.9308465069991577E+10, 5.9308465068943581E+10, 9.4227043392705956E+09, -3.4690551712022408E+10, 9.2011130753675175E+09, 4.3997172592866106E+09, -1.3744578972812984E+09, -2.2785946180652174E+08, -3.1906872142824973E+06}; + constexpr CUFINUFFT_FLT c8[] = {1.1821527096621725E+06, 4.2281234059839047E+07, 2.8723226058821958E+07, -8.3553955857311106E+08, 1.2447304829054153E+09, 2.1955280944846683E+09, -7.0514195725593920E+09, 4.3745141235010500E+09, 4.3745141236655197E+09, -7.0514195727234411E+09, 2.1955280942826533E+09, 1.2447304829048812E+09, -8.3553955857841730E+08, 2.8723226058853466E+07, 4.2281234059838966E+07, 1.1821527096621748E+06}; + constexpr CUFINUFFT_FLT c9[] = {3.3854610744280228E+05, 5.2176984975098642E+06, -2.0677283564981934E+07, -3.5831818966960624E+07, 2.6599346104854527E+08, -3.7992777983589816E+08, -1.3426914439904341E+08, 9.1752051209279442E+08, -9.1752051188087845E+08, 1.3426914452369988E+08, 3.7992777987329507E+08, -2.6599346107659298E+08, 3.5831818968129277E+07, 2.0677283565073237E+07, -5.2176984975084374E+06, -3.3854610744280077E+05}; + constexpr CUFINUFFT_FLT c10[] = {7.3893334077309293E+04, 2.6983804209740972E+05, -3.6415998560880083E+06, 8.4025485863333493E+06, 4.9278860779347531E+06, -5.1437033824108891E+07, 8.7603898602732122E+07, -4.6199497846299231E+07, -4.6199498219926819E+07, 8.7603898832003579E+07, -5.1437033801464774E+07, 4.9278861005788362E+06, 8.4025485870409794E+06, -3.6415998559663831E+06, 2.6983804209585470E+05, 7.3893334077307591E+04}; + constexpr CUFINUFFT_FLT c11[] = {1.1778892113374410E+04, 
-4.0077190109195144E+04, -1.8372552183899941E+05, 1.3262878359201169E+06, -2.9738540144900386E+06, 1.9493508843214174E+06, 4.1881949043266159E+06, -1.1066749441324197E+07, 1.1066749225224417E+07, -4.1881949989500660E+06, -1.9493509811827433E+06, 2.9738539876374160E+06, -1.3262878392766861E+06, 1.8372552166916840E+05, 4.0077190106541901E+04, -1.1778892113374635E+04}; + constexpr CUFINUFFT_FLT c12[] = {1.2019749667905517E+03, -1.0378455845905968E+04, 2.6333352626226591E+04, 1.7117060824677988E+04, -2.5133287788479996E+05, 6.4713912423136400E+05, -8.1634971996757365E+05, 3.8623850687193515E+05, 3.8623887467457692E+05, -8.1634999581952032E+05, 6.4713888515965885E+05, -2.5133289397614688E+05, 1.7117056658162492E+04, 2.6333352590306949E+04, -1.0378455846607170E+04, 1.2019749667886601E+03}; + constexpr CUFINUFFT_FLT c13[] = {3.1189837633271310E+01, -8.9083493666530228E+02, 4.9454294721013366E+03, -1.3124691362129612E+04, 1.5834782149156119E+04, 6.9607783053915546E+03, -5.9789949050326162E+04, 1.0841720290002371E+05, -1.0841726183381994E+05, 5.9790023686287932E+04, -6.9607416211385053E+03, -1.5834800728954084E+04, 1.3124692508510609E+04, -4.9454294244132070E+03, 8.9083493795553227E+02, -3.1189837630675466E+01}; + constexpr CUFINUFFT_FLT c14[] = {-1.2975319073318561E+01, 1.8283698900397550E+01, 1.7684013462935113E+02, -1.1059907069976271E+03, 3.1998196269059799E+03, -5.5988285845467362E+03, 5.9248624962359208E+03, -2.5987075415506133E+03, -2.5989297031998472E+03, 5.9249309327755627E+03, -5.5988287659129119E+03, 3.1998292347735460E+03, -1.1059914993060199E+03, 1.7684017599586443E+02, 1.8283697951655380E+01, -1.2975319075406015E+01}; + constexpr CUFINUFFT_FLT c15[] = {-2.3155118737567935E+00, 1.1938503501764195E+01, -3.4150613932459848E+01, 4.8896713096147266E+01, 1.5844216816345641E+01, -2.4277080939345015E+02, 6.0146058115394737E+02, -8.8748160721868635E+02, 8.8732832343048744E+02, -6.0146927810646923E+02, 2.4275722040513463E+02, -1.5849652411671842E+01, 
-4.8897528435446198E+01, 3.4150596946224454E+01, -1.1938504032584051E+01, 2.3155118728820292E+00}; + constexpr CUFINUFFT_FLT c16[] = {-1.5401723736175238E-01, 9.8067757197686212E-01, -4.1901188293318530E+00, 1.2150691895619683E+01, -2.4764820628534302E+01, 3.6081462800085532E+01, -3.4534922277532473E+01, 1.2910251318703700E+01, 1.3098525817101535E+01, -3.4588714991360455E+01, 3.5973877372429698E+01, -2.4775747273530602E+01, 1.2149010873312557E+01, -4.1901467369287460E+00, 9.8067700766883559E-01, -1.5401723876450651E-01}; + constexpr CUFINUFFT_FLT c17[] = {1.1808835457017667E-02, -2.5443945538745794E-02, -1.3157119144786456E-04, 2.5877310634925382E-01, -1.0920774586473376E+00, 2.6473618304294715E+00, -4.4448325935254926E+00, 6.8292491990998831E+00, -6.8300632710034588E+00, 4.4643703192113184E+00, -2.6384070394901351E+00, 1.0890246890089277E+00, -2.5849326913239973E-01, 1.4031610447463365E-04, 2.5444280926035151E-02, -1.1808834729180664E-02}; for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); } else printf("width not implemented!\n"); diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index da1c59930..d2f1ecd2d 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -2,6 +2,7 @@ #define __CUSPREADINTERP_H__ #include +#include #include #include diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 5269a3f45..bb288af0b 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -40,8 +40,10 @@ def build_args(args): args = {"--prec": "f", "--n_runs": "10", "--method": "0", - "--N1": "16777216", - # "--N2": "256", + "--sort": "1", + # "--N1": "16777216", + "--N1": "256", + "--N2": "256", # "--N3": "256", "--M": "1E8", "--tol": "1E-6"} @@ -56,7 +58,6 @@ def build_args(args): warmup = 
{"--prec": "f", "--n_runs": "1", "--method": "0", - "--sort": "0", "--N1": "256", # "--N2": "256", # "--N3": "256", @@ -142,13 +143,13 @@ def build_args(args): # remove the GM column pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) - +pivot_df.drop(('exec', 'SM'), axis=1, inplace=True) print(pivot_df) # Plot pivot_df.plot(kind='bar', figsize=(10, 7)) # Find the minimum throughput value -min_val = min(pivot_df[('exec', 'SM')].min(), pivot_df[('throughput', 'SM')].min(), 1) -max_val = max(pivot_df[('exec', 'SM')].max(), pivot_df[('throughput', 'SM')].max(), 0) +min_val = min(pivot_df[('throughput', 'SM')].min(), 1) +max_val = max(pivot_df[('throughput', 'SM')].max(), 0) print(min_val, max_val) plt.ylim(min_val * .99, max_val * 1.01) # plt.ylim(.8, 1.2) diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 24b4fb9d2..9a536ec9c 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -5,9 +5,11 @@ #include #include +#include #include #include #include + using namespace cufinufft::utils; namespace cufinufft { @@ -15,164 +17,167 @@ namespace spreadinterp { /* ------------------------ 1d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, - T es_c, T es_beta, T sigma, const int *idxnupts) { - int xx, ix; - T ker1[MAX_NSPREAD]; - - T x_rescaled; - cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - cnow = c[idxnupts[i]]; - int xstart = ceil(x_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); - else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - - for (xx = 
xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - T kervalue = ker1[xx - xstart]; - atomicAdd(&fw[ix].x, cnow.x * kervalue); - atomicAdd(&fw[ix].y, cnow.y * kervalue); - } +template +__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, + cuda_complex *fw, int M, int ns, int nf1, T es_c, + T es_beta, T sigma, const int *idxnupts) { + int xx, ix; + T ker1[MAX_NSPREAD]; + + T x_rescaled; + cuda_complex cnow; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + cnow = c[idxnupts[i]]; + int xstart = ceil(x_rescaled - ns / 2.0); + int xend = floor(x_rescaled + ns / 2.0); + + T x1 = (T)xstart - x_rescaled; + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + + for (xx = xstart; xx <= xend; xx++) { + ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + T kervalue = ker1[xx - xstart]; + atomicAdd(&fw[ix].x, cnow.x * kervalue); + atomicAdd(&fw[ix].y, cnow.y * kervalue); } + } } /* Kernels for SubProb Method */ // SubProb properties -template -__global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbinx, int *bin_size, const T *x, - int *sortidx) { - int binx; - int oldidx; - T x_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 
0 : binx; - oldidx = atomicAdd(&bin_size[binx], 1); - sortidx[i] = oldidx; - if (binx >= nbinx) { - sortidx[i] = -binx; - } +template +__global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbinx, + int *bin_size, const T *x, int *sortidx) { + int binx; + int oldidx; + T x_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + oldidx = atomicAdd(&bin_size[binx], 1); + sortidx[i] = oldidx; + if (binx >= nbinx) { + sortidx[i] = -binx; } + } } -template -__global__ void calc_inverse_of_global_sort_idx_1d(int M, int bin_size_x, int nbinx, const int *bin_startpts, - const int *sortidx, const T *x, int *index, int nf1) { - int binx; - T x_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - - index[bin_startpts[binx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_idx_1d( + int M, int bin_size_x, int nbinx, const int *bin_startpts, const int *sortidx, + const T *x, int *index, int nf1) { + int binx; + T x_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 
0 : binx; + + index[bin_startpts[binx] + sortidx[i]] = i; + } } -template -__global__ void spread_1d_subprob(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, - T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, - int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, xend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)); - T ker1[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; +template +__global__ void spread_1d_subprob( + const T *x, const cuda_complex *c, cuda_complex *fw, int M, uint8_t ns, int nf1, + T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, + int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, const int *idxnupts) { + extern __shared__ char sharedbuf[]; + auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; + + int xstart, xend, ix; + const int subpidx = blockIdx.x; + const int bidx = subprob_to_bin[subpidx]; + const int binsubp_idx = subpidx - subprobstartpts[bidx]; + const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + const int xoffset = (bidx % nbinx) * bin_size_x; + const auto ns_2 = (ns + 1) / 2; + const int N = bin_size_x + 2 * ns_2; + + T ker1[MAX_NSPREAD]; + + for (int i = threadIdx.x; i < N; i += 
blockDim.x) { + fwshared[i].x = T(0); + fwshared[i].y = T(0); + } + __syncthreads(); + + for (auto i = threadIdx.x; i < nupts; i += blockDim.x) { + const auto idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto cnow = c[idxnupts[idx]]; + + xstart = ceil(x_rescaled - ns / 2.0) - xoffset; + xend = floor(x_rescaled + ns / 2.0) - xoffset; + + const T x1 = T(xstart + xoffset) - x_rescaled; + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + for (int xx = xstart; xx <= xend; xx++) { + ix = xx + ns_2; + if (ix >= (bin_size_x + ns_2) || ix < 0) break; + atomicAdd(&fwshared[ix].x, cnow.x * ker1[xx - xstart]); + atomicAdd(&fwshared[ix].y, cnow.y * ker1[xx - xstart]); } - __syncthreads(); - - T x_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - cnow = c[idxnupts[idx]]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); - else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - atomicAdd(&fwshared[ix].x, cnow.x * ker1[xx - xstart]); - atomicAdd(&fwshared[ix].y, cnow.y * ker1[xx - xstart]); - } - } - __syncthreads(); - /* write to global memory */ - for (int k = threadIdx.x; k < N; k += blockDim.x) { - ix = xoffset - ceil(ns / 2.0) + k; - if (ix < (nf1 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? 
ix - nf1 : ix); - atomicAdd(&fw[ix].x, fwshared[k].x); - atomicAdd(&fw[ix].y, fwshared[k].y); - } + } + __syncthreads(); + /* write to global memory */ + for (int k = threadIdx.x; k < N; k += blockDim.x) { + ix = xoffset - ns_2 + k; + if (ix < (nf1 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + atomicAdd(&fw[ix].x, fwshared[k].x); + atomicAdd(&fw[ix].y, fwshared[k].y); } + } } /* --------------------- 1d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, +template +__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, + const cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { - T ker1[MAX_NSPREAD]; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - - int xstart = ceil(x_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - - T x1 = (T)xstart - x_rescaled; - if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); - else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[ix].x * kervalue1; - cnow.y += fw[ix].y * kervalue1; - } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + T ker1[MAX_NSPREAD]; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + + int xstart = ceil(x_rescaled - ns / 2.0); + int xend = floor(x_rescaled + ns / 2.0); + cuda_complex cnow; + cnow.x = 0.0; + cnow.y = 0.0; + + T x1 = (T)xstart - x_rescaled; + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + + for (int xx = xstart; xx <= xend; xx++) { + int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + T kervalue1 = ker1[xx - xstart]; + cnow.x += fw[ix].x * kervalue1; + cnow.y += fw[ix].y * kervalue1; } + c[idxnupts[i]].x = cnow.x; + c[idxnupts[i]].y = cnow.y; + } } } // namespace spreadinterp diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 5e32cb101..f5661b1dd 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -227,8 +227,10 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { switch (dim) { case 1: { switch (opts->gpu_method) { - case 0: case 1: + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 
1024 : opts->gpu_binsizex; + break; + case 0: case 2: if (opts->gpu_binsizex < 0) { cudaGetDevice(&device_id); From 60f478033ccc9fe9ac5258e1c64a5c91230c9c40 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 12 Jul 2024 15:25:12 -0400 Subject: [PATCH 09/68] otpimized nupts driven --- include/cufinufft/contrib/helper_cuda.h | 15 +-- .../contrib/ker_horner_allw_loop.inc | 28 ++++-- include/cufinufft/spreadinterp.h | 53 +++++++++-- perftest/cuda/bench.py | 18 ++-- src/cuda/1d/spread1d_wrapper.cu | 92 ++++++++++--------- src/cuda/1d/spreadinterp1d.cuh | 69 +++++++++----- src/cuda/CMakeLists.txt | 16 ++++ 7 files changed, 199 insertions(+), 92 deletions(-) diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h index 3dade898e..c3a31bd2b 100644 --- a/include/cufinufft/contrib/helper_cuda.h +++ b/include/cufinufft/contrib/helper_cuda.h @@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream, return pool_supported ? 
cudaFreeAsync(devPtr, stream) : cudaFree(devPtr); } -#define RETURN_IF_CUDA_ERROR \ - { \ - cudaError_t err = cudaGetLastError(); \ - if (err != cudaSuccess) { \ - printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \ - return FINUFFT_ERR_CUDA_FAILURE; \ - } \ +#define RETURN_IF_CUDA_ERROR \ + { \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) { \ + printf("[%s] Error: %s in %s at line %d\n", __func__, cudaGetErrorString(err), \ + __FILE__, __LINE__); \ + return FINUFFT_ERR_CUDA_FAILURE; \ + } \ } #define CUDA_FREE_AND_NULL(val, stream, pool_supported) \ diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index f905c14f0..c9c5e2ca2 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -8,7 +8,9 @@ constexpr CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01}; constexpr CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00}; constexpr CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00}; - for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + for (int i = 0; i < 2; i++) { + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, c5[i], c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==3) { constexpr CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02}; constexpr CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, -3.1653018868907077E+02}; @@ -17,7 +19,9 @@ constexpr CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01}; constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; - for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + 
z*(c4[i] + z*(c5[i] + z*(c6[i])))))); +for (int i=0; i<3; i++) { + ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); +} } else if (w==4) { constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; constexpr CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03}; @@ -27,7 +31,9 @@ constexpr CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01}; constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + for (int i=0; i<4; i++) { + ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==5) { constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; constexpr CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03}; @@ -38,7 +44,9 @@ constexpr CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, -1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01}; constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; - for (int i=0; i<5; i++) 
ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + for (int i=0; i<5; i++) { + ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==6) { constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; constexpr CUFINUFFT_FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03}; @@ -50,7 +58,9 @@ constexpr CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01}; constexpr CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00}; constexpr CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00}; - for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + for (int i=0; i<6; i++) { + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c9[i], c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==7) { constexpr CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03}; constexpr CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, -1.5290160332958067E+04}; @@ -63,7 +73,9 
@@ constexpr CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01}; constexpr CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01}; constexpr CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00}; - for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + for (int i=0; i<7; i++) { + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==8) { constexpr CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03}; constexpr CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04}; @@ -76,7 +88,9 @@ constexpr CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, -3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02}; constexpr CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01}; constexpr CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, 
-1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + for (int i = 0; i < 8; i++) { + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==9) { constexpr CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04}; constexpr CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, -3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04}; diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index d2f1ecd2d..7fd098925 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -9,10 +9,39 @@ namespace cufinufft { namespace spreadinterp { -template static __forceinline__ __device__ T fold_rescale(T x, int N) { - static constexpr const auto x2pi = T(0.159154943091895345554011992339482617); - const T result = x * x2pi + T(0.5); - return (result - floor(result)) * T(N); +template +constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) { + constexpr const auto x2pi = T(0.159154943091895345554011992339482617); + constexpr const auto half = T(0.5); +#if defined(__CUDA_ARCH__) + if constexpr (std::is_same_v) { + auto result = __fmaf_rn(x, x2pi, half); + result = __fsub_rd(result, truncf(result)); + return __fmul_rd(result, static_cast(N)); + } else if constexpr (std::is_same_v) { + auto result 
= __fma_rn(x, x2pi, half); + result = __dsub_rd(result, trunc(result)); + return __dmul_rd(result, static_cast(N)); + } else { + static_assert(std::is_same_v || std::is_same_v, + "Only float and double are supported."); + } +#else + const auto result = std::fma(x, x2pi, half); + return (result - std::trunc(result)) * static_cast(N); +#endif +} + +template +static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) { + if constexpr (std::is_same_v) { + return __fmaf_rn(a, b, c); + } else if constexpr (std::is_same_v) { + return __fma_rn(a, b, c); + } else { + static_assert(std::is_same_v || std::is_same_v, + "Only float and double are supported."); + } } template @@ -23,11 +52,11 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) approximation to prolate spheroidal wavefunction (PSWF) of order 0. This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ { - if (abs(x) >= opts.ES_halfwidth) + if (abs(x) >= T(opts.ES_halfwidth)) // if spreading/FT careful, shouldn't need this if, but causes no speed hit return 0.0; else - return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x)); + return exp(T(opts.ES_beta) * sqrt(T(1.0) - T(opts.ES_c) * x * x)); } template @@ -53,7 +82,17 @@ static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, cons This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ { - T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1] +#ifdef __CUDA_ARCH__ + __builtin_assume(w >= 2); + if constexpr (std::is_same_v) { + __builtin_assume(w <= 7); + } + if constexpr (std::is_same_v) { + __builtin_assume(w <= 16); + } +#endif + const auto z = fma(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1] + // T z = 2 * x + w - 1.0; // insert the auto-generated code which expects z, w args, writes to ker... 
if (upsampfac == 2.0) { // floating point equality is fine here using FLT = T; diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index bb288af0b..dbcaed87f 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -41,10 +41,12 @@ def build_args(args): "--n_runs": "10", "--method": "0", "--sort": "1", - # "--N1": "16777216", - "--N1": "256", - "--N2": "256", + "--N1": "16777216", + # "--N2": "256", + # "--N1": "256", + # "--N2": "256", # "--N3": "256", + "--kerevalmethod": "1", "--M": "1E8", "--tol": "1E-6"} # iterate over tol from 1E-6 to 1E-1 @@ -135,21 +137,21 @@ def build_args(args): pivot_df = df.pivot(index='tolerance', columns='method') # print(pivot_df) # scale the throughput SM by GM -pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] +# pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] # pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM'] # scale setpts SM by GM -pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] +# pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] # pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM'] # remove the GM column -pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) +# pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) pivot_df.drop(('exec', 'SM'), axis=1, inplace=True) print(pivot_df) # Plot pivot_df.plot(kind='bar', figsize=(10, 7)) # Find the minimum throughput value -min_val = min(pivot_df[('throughput', 'SM')].min(), 1) -max_val = max(pivot_df[('throughput', 'SM')].max(), 0) +min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min()) +max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max()) print(min_val, max_val) plt.ylim(min_val * .99, max_val * 1.01) # plt.ylim(.8, 1.2) diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index 36fa2bef9..e958bfea3 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ 
b/src/cuda/1d/spread1d_wrapper.cu @@ -16,6 +16,7 @@ using namespace cufinufft::common; using namespace cufinufft::memtransfer; #include "spreadinterp1d.cuh" +#include namespace cufinufft { namespace spreadinterp { @@ -51,10 +52,30 @@ int cuspread1d(cufinufft_plan_t *d_plan, int blksize) return ier; } +template struct cmp : public thrust::binary_function { + + cmp(const T *kx) : kx(kx) {} + + __host__ __device__ bool operator()(const int a, const int b) const { + return fold_rescale(kx[a], 1) < fold_rescale(kx[b], 1); + } + +private: + const T *kx; +}; + template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { auto &stream = d_plan->stream; - + if (d_plan->opts.gpu_sort && d_plan->opts.gpu_method == 1) { + int *d_idxnupts = d_plan->idxnupts; + thrust::sequence(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M); + RETURN_IF_CUDA_ERROR + thrust::sort(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M, + cmp{d_plan->kx}); + RETURN_IF_CUDA_ERROR + return 0; + } if (d_plan->opts.gpu_sort) { int bin_size_x = d_plan->opts.gpu_binsizex; if (bin_size_x < 0) { @@ -84,17 +105,16 @@ int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { thrust::device_ptr d_ptr(d_binsize); thrust::device_ptr d_result(d_binstartpts); thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + RETURN_IF_CUDA_ERROR calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1); RETURN_IF_CUDA_ERROR } else { int *d_idxnupts = d_plan->idxnupts; - trivial_global_sort_index_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, - d_idxnupts); + thrust::sequence(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M); RETURN_IF_CUDA_ERROR } - return 0; } @@ -134,7 +154,6 @@ int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blks RETURN_IF_CUDA_ERROR } } - return 0; } @@ -146,33 +165,29 @@ int 
cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) which only needs to be done once. */ { - auto &stream = d_plan->stream; - int ier; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - int bin_size_x = d_plan->opts.gpu_binsizex; + const auto maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + const auto bin_size_x = d_plan->opts.gpu_binsizex; if (bin_size_x < 0) { std::cerr << "[cuspread1d_subprob_prop] error: invalid binsize (binsizex) = (" << bin_size_x << ")\n"; return FINUFFT_ERR_BINSIZE_NOTVALID; } - int numbins = ceil((T)nf1 / bin_size_x); - - T *d_kx = d_plan->kx; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; + const auto numbins = (nf1 + bin_size_x - 1) / bin_size_x; + const auto d_kx = d_plan->kx; + const auto d_binsize = d_plan->binsize; + const auto d_binstartpts = d_plan->binstartpts; + const auto d_sortidx = d_plan->sortidx; + const auto d_numsubprob = d_plan->numsubprob; + const auto d_subprobstartpts = d_plan->subprobstartpts; + const auto d_idxnupts = d_plan->idxnupts; + const auto stream = d_plan->stream; int *d_subprob_to_bin = nullptr; - if ((ier = - checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream)))) - return ier; + cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream); + RETURN_IF_CUDA_ERROR calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx); RETURN_IF_CUDA_ERROR @@ -193,30 +208,25 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) d_ptr = thrust::device_pointer_cast(d_numsubprob); d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + RETURN_IF_CUDA_ERROR - if ((ier = 
checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) - return ier; + cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream); + RETURN_IF_CUDA_ERROR - int totalnumsubprob; - if ((ier = - checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], - sizeof(int), cudaMemcpyDeviceToHost, stream)))) - return ier; + int totalnumsubprob{}; + cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), + cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); - if ((ier = checkCudaErrors( - cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream, - d_plan->supports_pools)))) - return ier; + RETURN_IF_CUDA_ERROR + + cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream, + d_plan->supports_pools); + RETURN_IF_CUDA_ERROR + map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>( d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_subprob_to_bin); - return FINUFFT_ERR_CUDA_FAILURE; - } - - assert(d_subprob_to_bin != NULL); + RETURN_IF_CUDA_ERROR + assert(d_subprob_to_bin != nullptr); cudaFreeWrapper(d_plan->subprob_to_bin, stream, d_plan->supports_pools); d_plan->subprob_to_bin = d_subprob_to_bin; d_plan->totalnumsubprob = totalnumsubprob; diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 9a536ec9c..68656c124 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -10,6 +10,8 @@ #include #include +#include + using namespace cufinufft::utils; namespace cufinufft { @@ -21,26 +23,33 @@ template __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { - int xx, ix; - T ker1[MAX_NSPREAD]; - T x_rescaled; - cuda_complex cnow; + auto ker1 = (T 
__restrict__ *)alloca(sizeof(T) * ns); + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - cnow = c[idxnupts[i]]; - int xstart = ceil(x_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto cnow = c[idxnupts[i]]; + const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { + if constexpr (std::is_same_v) { + const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); + return int2{xstart, xend}; + } + if constexpr (std::is_same_v) { + const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); + const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); + return int2{xstart, xend}; + } + }(); + const T x1 = (T)xstart - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + for (auto xx = xstart; xx <= xend; xx++) { + auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); T kervalue = ker1[xx - xstart]; atomicAdd(&fw[ix].x, cnow.x * kervalue); atomicAdd(&fw[ix].y, cnow.y * kervalue); @@ -87,16 +96,21 @@ __global__ void calc_inverse_of_global_sort_idx_1d( } } +template +__forceinline__ __device__ cuda_complex mul(const cuda_complex &a, const T b) { + return {a.x * b, a.y * b}; +} + template __global__ void spread_1d_subprob( const T *x, const cuda_complex *c, cuda_complex *fw, int M, uint8_t ns, int nf1, T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, const int *idxnupts) { + const int *numsubprob, int maxsubprobsize, int nbinx, int *idxnupts) { extern __shared__ char sharedbuf[]; - auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; + alignas(256) auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; - int xstart, xend, ix; + int ix; const int subpidx = blockIdx.x; const int bidx = subprob_to_bin[subpidx]; const int binsubp_idx = subpidx - subprobstartpts[bidx]; @@ -106,11 +120,11 @@ __global__ void spread_1d_subprob( const auto ns_2 = (ns + 1) / 2; const int N = bin_size_x + 2 * ns_2; - T ker1[MAX_NSPREAD]; + // dynamic stack allocation + auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns); for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = T(0); - fwshared[i].y = T(0); + fwshared[i] = {0, 0}; } __syncthreads(); @@ -119,8 +133,18 @@ __global__ void spread_1d_subprob( const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); const auto cnow = c[idxnupts[idx]]; - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; + const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { + if constexpr (std::is_same_v) { + const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); + return int2{xstart, xend}; + } + if 
constexpr (std::is_same_v) { + const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); + const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); + return int2{xstart, xend}; + } + }(); const T x1 = T(xstart + xoffset) - x_rescaled; if constexpr (KEREVALMETH == 1) @@ -130,8 +154,9 @@ __global__ void spread_1d_subprob( for (int xx = xstart; xx <= xend; xx++) { ix = xx + ns_2; if (ix >= (bin_size_x + ns_2) || ix < 0) break; - atomicAdd(&fwshared[ix].x, cnow.x * ker1[xx - xstart]); - atomicAdd(&fwshared[ix].y, cnow.y * ker1[xx - xstart]); + const auto result = mul(cnow, ker1[xx - xstart]); + atomicAdd(&fwshared[ix].x, result.x); + atomicAdd(&fwshared[ix].y, result.y); } } __syncthreads(); diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index d2928858b..d8b192e8b 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -13,15 +13,28 @@ set(PRECISION_DEPENDENT_SRC memtransfer_wrapper.cu deconvolve_wrapper.cu cufinufft.cu common.cu ) +set(HELPER_MATH_URL "https://raw.githubusercontent.com/NVIDIA/cuda-samples/master/Common/helper_math.h") +set(HELPER_MATH_FILE "${CMAKE_BINARY_DIR}/helper_math.h") +if(NOT EXISTS ${HELPER_MATH_FILE}) + file(DOWNLOAD ${HELPER_MATH_URL} ${HELPER_MATH_FILE}) +endif() + set(CUFINUFFT_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/contrib + ${CMAKE_BINARY_DIR} $ $ $ ) set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE) +# flush denormals to zero and enable verbose PTXAS output +set(FINUFFT_CUDA_FLAGS + -ftz=true -fmad=true -restrict -Xptxas=-v --extra-device-vectorization -res-usage + -Wdouble-promotion -lineinfo --extended-lambda --expt-relaxed-constexpr +) + add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC}) target_include_directories(cufinufft_common_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) set_target_properties( @@ -30,6 +43,8 @@ set_target_properties( CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} ) 
+target_compile_options(cufinufft_common_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) + add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC}) target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) set_property(TARGET cufinufft_objects PROPERTY POSITION_INDEPENDENT_CODE ON) @@ -38,6 +53,7 @@ set_target_properties( POSITION_INDEPENDENT_CODE ON CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} ) +target_compile_options(cufinufft_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) add_library(cufinufft SHARED $ From 35dcc666197a0cfb3d4ab29b3b728b86b057050e Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 15 Jul 2024 16:17:20 -0400 Subject: [PATCH 10/68] Optimized 1D and 2D --- .../contrib/ker_horner_allw_loop.inc | 6 +- include/cufinufft/utils.h | 13 + perftest/cuda/bench.py | 23 +- src/cuda/1d/spread1d_wrapper.cu | 1 + src/cuda/1d/spreadinterp1d.cuh | 43 +- src/cuda/2d/interp2d_wrapper.cu | 16 +- src/cuda/2d/spread2d_wrapper.cu | 15 +- src/cuda/2d/spreadinterp2d.cuh | 568 +++++++++--------- src/cuda/common.cu | 87 ++- 9 files changed, 407 insertions(+), 365 deletions(-) diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index c9c5e2ca2..1178a8544 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -20,7 +20,7 @@ constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; for (int i=0; i<3; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==4) { constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 
5.4284366850213223E+02}; @@ -32,7 +32,7 @@ for (int i=0; i<3; i++) { constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; for (int i=0; i<4; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==5) { constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; @@ -45,7 +45,7 @@ for (int i=0; i<3; i++) { constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; for (int i=0; i<5; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==6) { constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 3455b99c0..b0a77aec7 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -68,6 +68,19 @@ template T infnorm(int n, std::complex *a) { } return sqrt(nrm); } + +#ifdef __CUDA_ARCH__ +__forceinline__ __device__ auto interval(const int ns, const float x) { + 
const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); + return int2{xstart, xend}; +} +__forceinline__ __device__ auto interval(const int ns, const double x) { + const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x)); + const auto xend = __double2int_rd(__fma_rd(ns, .5, x)); + return int2{xstart, xend}; +} +#endif } // namespace utils } // namespace cufinufft diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index dbcaed87f..db7e73873 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -37,14 +37,13 @@ def build_args(args): # example command to run: # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments -args = {"--prec": "f", - "--n_runs": "10", +args = {"--prec": "d", + "--n_runs": "5", "--method": "0", "--sort": "1", - "--N1": "16777216", - # "--N2": "256", - # "--N1": "256", - # "--N2": "256", + # "--N1": "16777216", + "--N1": "256", + "--N2": "256", # "--N3": "256", "--kerevalmethod": "1", "--M": "1E8", @@ -93,6 +92,10 @@ def build_args(args): conf = [x for x in stdout.splitlines() if x.startswith("#")] print('\n'.join(conf)) stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] + if stdout[0].startswith("bin"): + print(stdout[0]) + stdout = stdout[1:] + stdout = '\n'.join(stdout) # convert stdout to a dataframe from csv string dt = pd.read_csv(io.StringIO(stdout), sep=',') @@ -153,7 +156,7 @@ def build_args(args): min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min()) max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max()) print(min_val, max_val) -plt.ylim(min_val * .99, max_val * 1.01) +plt.ylim(min_val * .90, max_val * 1.1) # plt.ylim(.8, 1.2) # Calculate the smallest power of 10 @@ -163,15 +166,15 @@ def build_args(args): # plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 
1.009) # Adding 10% for upper margin # plot an horizontal line at 1 with label "GM" -plt.axhline(y=1, color='k', linestyle='--', label='GM') +# plt.axhline(y=1, color='k', linestyle='--', label='GM') plt.xlabel('Tolerance') -plt.ylabel('Throughput (% of GM)') +plt.ylabel('Throughput') plt.title('Throughput by Tolerance and Method') plt.legend(title='Method') plt.tight_layout() plt.show() plt.xlabel("Tolerance") -plt.ylabel("Points/s (% of GM)") +plt.ylabel("Points/s") plt.savefig("bench.png") plt.savefig("bench.svg") plt.savefig("bench.pdf") diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index e958bfea3..4e7f4ea0b 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -268,6 +268,7 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan); RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 68656c124..f94ffd7eb 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -23,26 +23,15 @@ template __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { - + // dynamic stack allocation to reduce stack usage auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns); for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); const auto cnow = c[idxnupts[i]]; - const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { - if constexpr (std::is_same_v) { - const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); - return int2{xstart, xend}; - } - if constexpr (std::is_same_v) { - 
const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); - const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); - return int2{xstart, xend}; - } - }(); - const T x1 = (T)xstart - x_rescaled; + const auto [xstart, xend] = interval(ns, x_rescaled); + const T x1 = (T)xstart - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else @@ -126,27 +115,17 @@ __global__ void spread_1d_subprob( for (int i = threadIdx.x; i < N; i += blockDim.x) { fwshared[i] = {0, 0}; } + + const T ns_2f = ns * T(.5); + __syncthreads(); for (auto i = threadIdx.x; i < nupts; i += blockDim.x) { - const auto idx = ptstart + i; - const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - const auto cnow = c[idxnupts[idx]]; - - const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { - if constexpr (std::is_same_v) { - const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); - return int2{xstart, xend}; - } - if constexpr (std::is_same_v) { - const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); - const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); - return int2{xstart, xend}; - } - }(); - - const T x1 = T(xstart + xoffset) - x_rescaled; + const auto idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto cnow = c[idxnupts[idx]]; + const auto [xstart, xend] = interval(ns, x_rescaled); + const T x1 = T(xstart + xoffset) - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu index 533788482..eda0d579b 100644 --- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu @@ -4,10 +4,12 @@ #include #include +#include #include #include using namespace cufinufft::memtransfer; +using namespace cufinufft::common; #include "spreadinterp2d.cuh" @@ -120,17 
+122,14 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int *d_subprob_to_bin = d_plan->subprob_to_bin; int totalnumsubprob = d_plan->totalnumsubprob; - T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * - (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - - if (sharedplanorysize > 49152) { - std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + T sigma = d_plan->opts.upsampfac; + const auto sharedplanorysize = + shared_memory_required(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); interp_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, @@ -140,6 +139,7 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } } else { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); interp_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 69b2ba956..d361791b0 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -273,16 +274,14 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * - (bin_size_y + 2 * (int)ceil(ns / 2.0)) * - sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread2d_subprob] error: 
not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + const auto sharedplanorysize = + shared_memory_required(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan); + RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, @@ -292,6 +291,8 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } } else { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan); + RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 558984ea1..62a430ca5 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -15,314 +15,314 @@ namespace spreadinterp { /* ------------------------ 2d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - int xstart, ystart, xend, yend; - int xx, yy, ix, iy; - int outidx; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x_rescaled, y_rescaled; - T kervalue1, kervalue2; - cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - cnow = c[idxnupts[i]]; - - xstart = ceil(x_rescaled - ns / 2.0); - ystart = 
ceil(y_rescaled - ns / 2.0); - xend = floor(x_rescaled + ns / 2.0); - yend = floor(y_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (yy = ystart; yy <= yend; yy++) { - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - outidx = ix + iy * nf1; - kervalue1 = ker1[xx - xstart]; - kervalue2 = ker2[yy - ystart]; - atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2); - } - } +template +__global__ void spread_2d_nupts_driven( + const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto cnow = c[idxnupts[i]]; + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + + const auto x1 = (T)xstart - x_rescaled; + const auto y1 = (T)ystart - y_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } + + for (auto yy = ystart; yy <= yend; yy++) { + for (auto xx = xstart; xx <= xend; xx++) { + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + const auto outidx = ix + iy * nf1; + const auto kervalue1 = ker1[xx - xstart]; + const auto kervalue2 = ker2[yy - ystart]; + atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2); + atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2); + } + } + } } /* Kernels for SubProb Method */ // SubProb properties -template -__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, int bin_size_y, int nbinx, int nbiny, +template +__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, + int bin_size_y, int nbinx, int nbiny, int *bin_size, T *x, T *y, int *sortidx) { - int binidx, binx, biny; - int oldidx; - T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binidx = binx + biny * nbinx; - oldidx = atomicAdd(&bin_size[binidx], 1); - sortidx[i] = oldidx; - if (binx >= nbinx || biny >= nbiny) { - sortidx[i] = -biny; - } + int binidx, binx, biny; + int oldidx; + T x_rescaled, y_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 
0 : biny; + binidx = binx + biny * nbinx; + oldidx = atomicAdd(&bin_size[binidx], 1); + sortidx[i] = oldidx; + if (binx >= nbinx || biny >= nbiny) { + sortidx[i] = -biny; } + } } -template -__global__ void calc_inverse_of_global_sort_index_2d(int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, - const int *bin_startpts, const int *sortidx, const T *x, - const T *y, int *index, int nf1, int nf2) { - int binx, biny; - int binidx; - T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binidx = binx + biny * nbinx; - - index[bin_startpts[binidx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_index_2d( + int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, const int *bin_startpts, + const int *sortidx, const T *x, const T *y, int *index, int nf1, int nf2) { + int binx, biny; + int binidx; + T x_rescaled, y_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 
0 : biny; + binidx = binx + biny * nbinx; + + index[bin_startpts[binidx] + sortidx[i]] = i; + } } -template -__global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; - } - __syncthreads(); - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - cnow = c[idxnupts[idx]]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - 
eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - iy = yy + ceil(ns / 2.0); - if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0) - break; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - T kervalue2 = ker2[yy - ystart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2); - } - } +template +__global__ void spread_2d_subprob( + const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + + const int subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const int xoffset = (bidx % nbinx) * bin_size_x; + const int yoffset = (bidx / nbinx) * bin_size_y; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i] = {0, 0}; + } + __syncthreads(); + + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + const int idx = ptstart + i; + const auto 
x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + const auto cnow = c[idxnupts[idx]]; + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + xstart -= xoffset; + ystart -= yoffset; + xend -= xoffset; + yend -= yoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } - __syncthreads(); - /* write to global memory */ - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); - } + for (int yy = ystart; yy <= yend; yy++) { + const auto iy = yy + ns_2; + if (iy >= (bin_size_y + rounded_ns) || iy < 0) break; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + if (ix >= (bin_size_x + rounded_ns) || ix < 0) break; + const auto outidx = ix + iy * (bin_size_x + rounded_ns); + const auto kervalue = ker1[xx - xstart] * ker2[yy - ystart]; + const auto resx = cnow.x * kervalue; + const auto resy = cnow.y * kervalue; + atomicAdd(&fwshared[outidx].x, resx); + atomicAdd(&fwshared[outidx].y, resy); + } + } + } + + __syncthreads(); + /* write to global memory */ + for (int k = threadIdx.x; k < N; k += blockDim.x) { + const auto i = k % (bin_size_x + rounded_ns); + const auto j = k / (bin_size_x + rounded_ns); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); + const auto outidx = ix + iy * nf1; + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); + atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); } + } } /* --------------------- 2d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - T y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); - int inidx = ix + iy * nf1; - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[inidx].x * kervalue1 * kervalue2; - cnow.y += fw[inidx].y * kervalue1 * kervalue2; - } - } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; +template +__global__ void interp_2d_nupts_driven( + const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + + T x1 = (T)xstart - x_rescaled; + T y1 = (T)ystart - y_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } + + cuda_complex cnow{0, 0}; + for (int yy = ystart; yy <= yend; yy++) { + const T kervalue2 = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); + const auto inidx = ix + iy * nf1; + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fw[inidx].x * kervalue1 * kervalue2; + cnow.y += fw[inidx].y * kervalue1 * kervalue2; + } + } + c[idxnupts[i]].x = cnow.x; + c[idxnupts[i]].y = cnow.y; + } } /* Kernels for Subprob Method */ -template -__global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, - const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; - } +template +__global__ void interp_2d_subprob( + const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + const auto subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const auto xoffset = (bidx % nbinx) * bin_size_x; + const auto yoffset = (bidx / nbinx) * bin_size_y; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + + for (int k = threadIdx.x; k < N; k += blockDim.x) { + int i = k % (bin_size_x + rounded_ns); + int j = k / (bin_size_x + rounded_ns); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); + const auto outidx = ix + int(iy * nf1); + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + fwshared[sharedidx].x = fw[outidx].x; + fwshared[sharedidx].y = fw[outidx].y; } - __syncthreads(); - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - cnow.x = 0.0; - cnow.y = 0.0; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - iy = yy + ceil(ns / 2.0); - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; - cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; - } - } - c[idxnupts[idx]] = cnow; + } + __syncthreads(); + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + cuda_complex cnow{0, 0}; + + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + xend -= xoffset; + yend -= 
yoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + } + + for (int yy = ystart; yy <= yend; yy++) { + const auto kervalue2 = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + const auto iy = yy + ns_2; + const auto outidx = ix + iy * (bin_size_x + rounded_ns); + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; + cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; + } } + c[idxnupts[idx]] = cnow; + } } } // namespace spreadinterp diff --git a/src/cuda/common.cu b/src/cuda/common.cu index f5661b1dd..e7ce65b52 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -221,18 +221,68 @@ std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size return adjusted_ns * sizeof(cuda_complex); } +// Function to find bin_size_x == bin_size_y where bin_size_x * bin_size_y < MemSize +template int find_bin_size(std::size_t MemSize, int dim, int ns) { + int binsize = 1; // Start with the smallest possible bin size + + while (true) { + // Calculate the shared memory required for the current bin_size_x and bin_size_y + std::size_t required_memory = + shared_memory_required(dim, ns, binsize, binsize, binsize); + + // Check if the required memory is less than the available memory + if (required_memory > MemSize) { + // If the condition is met, return the current bin_size_x + return binsize - 1; + } + + // Increment bin_size_x for the next iteration + binsize++; + } +} + template void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { int shared_mem_per_block{}, device_id{}; switch (dim) { case 1: { - switch (opts->gpu_method) { - case 1: - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 
1024 : opts->gpu_binsizex; - break; - case 0: - case 2: - if (opts->gpu_binsizex < 0) { + if (opts->gpu_binsizex < 0) { + cudaGetDevice(&device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + const int bin_size = + shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; + // find the power of 2 that is less than bin_size + // this makes the bin_size use the maximum shared memory available + opts->gpu_binsizex = bin_size; + const auto shared_mem_required = shared_memory_required( + dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); + // printf("binsizex: %d, shared_mem_required %ld (bytes)\n", + // opts->gpu_binsizex, + // shared_mem_required); + } + opts->gpu_binsizey = 1; + opts->gpu_binsizez = 1; + } break; + case 2: { + if (opts->gpu_binsizex < 0 || opts->gpu_binsizey < 0) { + switch (opts->gpu_method) { + case 0: + case 2: { + opts->gpu_binsizex = 32; + opts->gpu_binsizey = 32; + // fall through otherwise + if (opts->gpu_method && ns > 2) { + break; + } + } + case 1: { cudaGetDevice(&device_id); if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); @@ -242,22 +292,17 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); } - const int bin_size = - shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; - // find the power of 2 that is less than bin_size - const int exponent = std::log2(bin_size); - opts->gpu_binsizex = 1 << (exponent - 1); - // printf("bin_size: %d, gpu_binsizex: %d\n", bin_size, - // opts->gpu_binsizex); + 
+ const auto binsize = find_bin_size(shared_mem_per_block, dim, ns); + opts->gpu_binsizex = binsize; + opts->gpu_binsizey = binsize; + } break; } - break; } - opts->gpu_binsizey = 1; - opts->gpu_binsizez = 1; - } break; - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; + // const auto shared_mem_required = shared_memory_required( + // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); + // printf("binsizex: %d, binsizey: %d, shared_mem_required %ld (bytes)\n", + // opts->gpu_binsizex, opts->gpu_binsizey, shared_mem_required); opts->gpu_binsizez = 1; } break; case 3: { From 366295d41c54837250d728da6b1ef590002d1a40 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 18 Jul 2024 15:18:27 -0400 Subject: [PATCH 11/68] 3D integer operations --- perftest/cuda/bench.py | 6 +- src/cuda/1d/cufinufft1d.cu | 3 - src/cuda/1d/interp1d_wrapper.cu | 4 - src/cuda/1d/spread1d_wrapper.cu | 1 - src/cuda/1d/spreadinterp1d.cuh | 1 - src/cuda/2d/cufinufft2d.cu | 6 +- src/cuda/2d/interp2d_wrapper.cu | 3 - src/cuda/2d/spread2d_wrapper.cu | 3 - src/cuda/3d/cufinufft3d.cu | 3 - src/cuda/3d/interp3d_wrapper.cu | 24 +- src/cuda/3d/spread3d_wrapper.cu | 15 +- src/cuda/3d/spreadinterp3d.cuh | 1010 ++++++++++++++++--------------- src/cuda/common.cu | 15 +- 13 files changed, 549 insertions(+), 545 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index db7e73873..8a9e757a3 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -37,14 +37,14 @@ def build_args(args): # example command to run: # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments -args = {"--prec": "d", +args = {"--prec": "f", "--n_runs": "5", "--method": "0", "--sort": "1", # "--N1": "16777216", "--N1": "256", "--N2": "256", - # "--N3": "256", + "--N3": "256", 
"--kerevalmethod": "1", "--M": "1E8", "--tol": "1E-6"} @@ -82,6 +82,8 @@ def build_args(args): data['method'].append('GM') elif method == '2': data['method'].append('SM') + elif method == '4': + data['method'].append('BLOCK') print("Method " + data['method'][-1]) cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) stdout, stderr = run_command("nsys", cmd) diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 4ecb3b283..a17b6f044 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -1,9 +1,6 @@ #include #include #include -#include -#include -#include #include #include diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu index cd3637c8b..2bf69f6a2 100644 --- a/src/cuda/1d/interp1d_wrapper.cu +++ b/src/cuda/1d/interp1d_wrapper.cu @@ -1,14 +1,10 @@ #include #include -#include #include -#include #include #include -using namespace cufinufft::memtransfer; - #include "spreadinterp1d.cuh" namespace cufinufft { diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index 4e7f4ea0b..824da42c9 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -1,6 +1,5 @@ #include #include -#include #include #include diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index f94ffd7eb..b6c511555 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -5,7 +5,6 @@ #include #include -#include #include #include #include diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index afc801b7f..f7f7b1559 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -1,14 +1,10 @@ -#include +#include #include #include -#include -#include - #include #include #include -#include #include using namespace cufinufft::deconvolve; diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu index eda0d579b..0d3d3ff9b 100644 
--- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu @@ -1,14 +1,11 @@ -#include #include #include #include #include -#include #include -using namespace cufinufft::memtransfer; using namespace cufinufft::common; #include "spreadinterp2d.cuh" diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index d361791b0..244d25b03 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -1,5 +1,4 @@ #include -#include #include #include @@ -8,14 +7,12 @@ #include #include -#include #include #include #include "spreadinterp2d.cuh" using namespace cufinufft::common; -using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index ea0ef4a86..5977e6d5f 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -1,13 +1,10 @@ #include #include -#include -#include #include #include #include -#include #include #include diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu index b42231d86..91379d3ae 100644 --- a/src/cuda/3d/interp3d_wrapper.cu +++ b/src/cuda/3d/interp3d_wrapper.cu @@ -1,15 +1,15 @@ -#include #include #include #include +#include "spreadinterp3d.cuh" +#include #include #include -#include "spreadinterp3d.cuh" - using namespace cufinufft::memtransfer; +using namespace cufinufft::common; namespace cufinufft { namespace spreadinterp { @@ -123,19 +123,16 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ int *d_subprob_to_bin = d_plan->subprob_to_bin; int totalnumsubprob = d_plan->totalnumsubprob; - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * - (bin_size_y + 2 * ceil(ns / 2.0)) * - (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << 
"[cuinterp3d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + T sigma = d_plan->spopts.upsampfac; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + const auto sharedplanorysize = + shared_memory_required(3, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth == 1) { + cufinufft_set_shared_memory(interp_3d_subprob, 3, *d_plan); interp_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, @@ -143,6 +140,7 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ numbins[0], numbins[1], numbins[2], d_idxnupts); RETURN_IF_CUDA_ERROR } else { + cufinufft_set_shared_memory(interp_3d_subprob, 3, *d_plan); interp_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index 6c851389c..bf78ed905 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -1,5 +1,4 @@ #include -#include #include #include @@ -8,12 +7,10 @@ #include #include -#include #include #include using namespace cufinufft::common; -using namespace cufinufft::memtransfer; #include "spreadinterp3d.cuh" @@ -532,12 +529,12 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ int totalnumsubprob = d_plan->totalnumsubprob; int *d_subprob_to_bin = d_plan->subprob_to_bin; - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * - (bin_size_y + 2 * ceil(ns / 2.0)) * - (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); + T sigma = 
d_plan->spopts.upsampfac; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + const auto sharedplanorysize = + shared_memory_required(3, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth) { cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan); diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index 838816a56..dc722ddc3 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -10,548 +10,568 @@ #include #include +using namespace cufinufft::utils; + namespace cufinufft { namespace spreadinterp { /* ---------------------- 3d Spreading Kernels -------------------------------*/ /* Kernels for bin sort NUpts */ -template -__global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int bin_size_x, int bin_size_y, - int bin_size_z, int nbinx, int nbiny, int nbinz, int *bin_size, const T *x, +template +__global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int bin_size_x, + int bin_size_y, int bin_size_z, int nbinx, + int nbiny, int nbinz, int *bin_size, const T *x, const T *y, const T *z, int *sortidx) { - int binidx, binx, biny, binz; - int oldidx; - T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - - binz = floor(z_rescaled / bin_size_z); - binz = binz >= nbinz ? binz - 1 : binz; - binz = binz < 0 ? 
0 : binz; - binidx = binx + biny * nbinx + binz * nbinx * nbiny; - oldidx = atomicAdd(&bin_size[binidx], 1); - sortidx[i] = oldidx; - } + int binidx, binx, biny, binz; + int oldidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 0 : biny; + + binz = floor(z_rescaled / bin_size_z); + binz = binz >= nbinz ? binz - 1 : binz; + binz = binz < 0 ? 0 : binz; + binidx = binx + biny * nbinx + binz * nbinx * nbiny; + oldidx = atomicAdd(&bin_size[binidx], 1); + sortidx[i] = oldidx; + } } -template -__global__ void calc_inverse_of_global_sort_index_3d(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx, - int nbiny, int nbinz, const int *bin_startpts, const int *sortidx, - const T *x, const T *y, const T *z, int *index, - int nf1, int nf2, int nf3) { - int binx, biny, binz; - int binidx; - T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binz = floor(z_rescaled / bin_size_z); - binz = binz >= nbinz ? binz - 1 : binz; - binz = binz < 0 ? 
0 : binz; - binidx = common::calc_global_index_v2(binx, biny, binz, nbinx, nbiny, nbinz); - - index[bin_startpts[binidx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_index_3d( + int M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx, int nbiny, + int nbinz, const int *bin_startpts, const int *sortidx, const T *x, const T *y, + const T *z, int *index, int nf1, int nf2, int nf3) { + int binx, biny, binz; + int binidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 0 : biny; + binz = floor(z_rescaled / bin_size_z); + binz = binz >= nbinz ? binz - 1 : binz; + binz = binz < 0 ? 
0 : binz; + binidx = common::calc_global_index_v2(binx, biny, binz, nbinx, nbiny, nbinz); + + index[bin_startpts[binidx] + sortidx[i]] = i; + } } /* Kernels for NUptsdriven method */ -template -__global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, - T sigma, const int *idxnupts) { - int xx, yy, zz, ix, iy, iz; - int outidx; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - T ker1val, ker2val, ker3val; - - T x_rescaled, y_rescaled, z_rescaled; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - z_rescaled = fold_rescale(z[idxnupts[i]], nf3); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int zstart = ceil(z_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - int zend = floor(z_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - T z1 = (T)zstart - z_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - eval_kernel_vec_horner(ker3, z1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); - } +template +__global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, + const cuda_complex *c, cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, + T es_beta, T sigma, const int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += 
blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto z_rescaled = fold_rescale(z[idxnupts[i]], nf3); + + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + const auto [zstart, zend] = interval(ns, z_rescaled); + + const auto x1 = T(xstart) - x_rescaled; + const auto y1 = T(ystart) - y_rescaled; + const auto z1 = T(zstart) - z_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); + } - for (zz = zstart; zz <= zend; zz++) { - ker3val = ker3[zz - zstart]; - for (yy = ystart; yy <= yend; yy++) { - ker2val = ker2[yy - ystart]; - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); - outidx = ix + iy * nf1 + iz * nf1 * nf2; - ker1val = ker1[xx - xstart]; - T kervalue = ker1val * ker2val * ker3val; - atomicAdd(&fw[outidx].x, c[idxnupts[i]].x * kervalue); - atomicAdd(&fw[outidx].y, c[idxnupts[i]].y * kervalue); - } - } + for (int zz = zstart; zz <= zend; zz++) { + const auto ker3val = ker3[zz - zstart]; + for (int yy = ystart; yy <= yend; yy++) { + const auto ker2val = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + const int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? 
zz - nf3 : zz); + const int outidx = ix + iy * nf1 + iz * nf1 * nf2; + const auto ker1val = ker1[xx - xstart]; + const auto kervalue = ker1val * ker2val * ker3val; + atomicAdd(&fw[outidx].x, c[idxnupts[i]].x * kervalue); + atomicAdd(&fw[outidx].y, c[idxnupts[i]].y * kervalue); } + } } + } } /* Kernels for Subprob method */ -template -__global__ void spread_3d_subprob(T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, - int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size, - int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin, - int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, - int nbinz, int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - const int bidx = subprob_to_bin[blockIdx.x]; - const int binsubp_idx = blockIdx.x - subprobstartpts[bidx]; - const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - const int xoffset = (bidx % nbinx) * bin_size_x; - const int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; - const int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)); - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; +template +__global__ void spread_3d_subprob( + T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, + int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size, + int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin, + int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + int nbinz, int *idxnupts) { + extern __shared__ char sharedbuf[]; + auto fwshared = (cuda_complex *)sharedbuf; + + const int bidx = subprob_to_bin[blockIdx.x]; + 
const int binsubp_idx = blockIdx.x - subprobstartpts[bidx]; + const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const int xoffset = (bidx % nbinx) * bin_size_x; + const int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; + const int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + + const int N = + (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns) * (bin_size_z + rounded_ns); + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i] = {0, 0}; + } + __syncthreads(); + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; + + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + const int nuptsidx = idxnupts[ptstart + i]; + const auto x_rescaled = fold_rescale(x[nuptsidx], nf1); + const auto y_rescaled = fold_rescale(y[nuptsidx], nf2); + const auto z_rescaled = fold_rescale(z[nuptsidx], nf3); + const auto cnow = c[nuptsidx]; + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + auto [zstart, zend] = interval(ns, z_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + const T z1 = T(zstart) - z_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + zstart -= zoffset; + + xend -= xoffset; + yend -= yoffset; + zend -= zoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); } - __syncthreads(); - - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - T 
ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - const int nuptsidx = idxnupts[ptstart + i]; - const T x_rescaled = fold_rescale(x[nuptsidx], nf1); - const T y_rescaled = fold_rescale(y[nuptsidx], nf2); - const T z_rescaled = fold_rescale(z[nuptsidx], nf3); - cuda_complex cnow = c[nuptsidx]; - - const int xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - const int ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - const int zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - - const int xend = floor(x_rescaled + ns / 2.0) - xoffset; - const int yend = floor(y_rescaled + ns / 2.0) - yoffset; - const int zend = floor(z_rescaled + ns / 2.0) - zoffset; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); - } - for (int zz = zstart; zz <= zend; zz++) { - const T kervalue3 = ker3[zz - zstart]; - const int iz = zz + ceil(ns / 2.0); - if (iz >= (bin_size_z + (int)ceil(ns / 2.0) * 2) || iz < 0) - break; - for (int yy = ystart; yy <= yend; yy++) { - const T kervalue2 = ker2[yy - ystart]; - const int iy = yy + ceil(ns / 2.0); - if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0) - break; - for (int xx = xstart; xx <= xend; xx++) { - const int ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - const int outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) + - iz * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - const T kervalue1 = ker1[xx - xstart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); - 
atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); - } - } + for (int zz = zstart; zz <= zend; zz++) { + const T kervalue3 = ker3[zz - zstart]; + const int iz = zz + ns_2; + if (iz >= (bin_size_z + (int)rounded_ns) || iz < 0) break; + for (int yy = ystart; yy <= yend; yy++) { + const T kervalue2 = ker2[yy - ystart]; + const int iy = yy + ns_2; + if (iy >= (bin_size_y + (int)rounded_ns) || iy < 0) break; + for (int xx = xstart; xx <= xend; xx++) { + const int ix = xx + ns_2; + if (ix >= (bin_size_x + (int)rounded_ns) || ix < 0) break; + const int outidx = ix + iy * (bin_size_x + rounded_ns) + + iz * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + const auto kervalue = ker1[xx - xstart] * kervalue2 * kervalue3; + const auto resx = cnow.x * kervalue; + const auto resy = cnow.y * kervalue; + atomicAdd(&fwshared[outidx].x, resx); + atomicAdd(&fwshared[outidx].y, resy); } + } } - __syncthreads(); - - /* write to global memory */ - for (int n = threadIdx.x; n < N; n += blockDim.x) { - const int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - const int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0)); - const int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0))); - - int ix = xoffset - ceil(ns / 2.0) + i; - int iy = yoffset - ceil(ns / 2.0) + j; - int iz = zoffset - ceil(ns / 2.0) + k; - - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0)) && iz < (nf3 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? 
iz - nf3 : iz); - const int outidx = ix + iy * nf1 + iz * nf1 * nf2; - const int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) + - k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); - } + } + __syncthreads(); + + /* write to global memory */ + for (int n = threadIdx.x; n < N; n += blockDim.x) { + const int i = n % (bin_size_x + rounded_ns); + const int j = (n / (bin_size_x + rounded_ns)) % (bin_size_y + rounded_ns); + const int k = n / ((bin_size_x + rounded_ns) * (bin_size_y + rounded_ns)); + + int ix = xoffset - ns_2 + i; + int iy = yoffset - ns_2 + j; + int iz = zoffset - ns_2 + k; + + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2) && iz < (nf3 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); + const int outidx = ix + iy * nf1 + iz * nf1 * nf2; + const int sharedidx = i + j * (bin_size_x + rounded_ns) + + k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); + atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); } + } } /* Kernels for BlockGather Method */ -template -__global__ void locate_nupts_to_bins_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, - int nobiny, int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, - int *bin_size, const T *x, const T *y, const T *z, int *sortidx, - int nf1, int nf2, int nf3) { - int binidx, binx, biny, binz; - int oldidx; - T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - biny = floor(y_rescaled / bin_size_y); - binz = floor(z_rescaled 
/ bin_size_z); - binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); - biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); - binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); - - binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, - binsperobinz); - oldidx = atomicAdd(&bin_size[binidx], 1); - sortidx[i] = oldidx; - } +template +__global__ void locate_nupts_to_bins_ghost( + int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, int nobiny, + int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, int *bin_size, + const T *x, const T *y, const T *z, int *sortidx, int nf1, int nf2, int nf3) { + int binidx, binx, biny, binz; + int oldidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + biny = floor(y_rescaled / bin_size_y); + binz = floor(z_rescaled / bin_size_z); + binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); + biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); + binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); + + binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, + binsperobinx, binsperobiny, binsperobinz); + oldidx = atomicAdd(&bin_size[binidx], 1); + sortidx[i] = oldidx; + } } -template -__global__ void calc_inverse_of_global_sort_index_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z, - int nobinx, int nobiny, int nobinz, int binsperobinx, - int binsperobiny, int binsperobinz, int *bin_startpts, - const int *sortidx, const T *x, const T *y, const T *z, - int *index, int nf1, int nf2, int nf3) { - int binx, biny, 
binz; - int binidx; - T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - biny = floor(y_rescaled / bin_size_y); - binz = floor(z_rescaled / bin_size_z); - binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); - biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); - binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); - - binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, - binsperobinz); - - index[bin_startpts[binidx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_index_ghost( + int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, int nobiny, + int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, int *bin_startpts, + const int *sortidx, const T *x, const T *y, const T *z, int *index, int nf1, int nf2, + int nf3) { + int binx, biny, binz; + int binidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + biny = floor(y_rescaled / bin_size_y); + binz = floor(z_rescaled / bin_size_z); + binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); + biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); + binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); + + binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, + binsperobinx, binsperobiny, binsperobinz); + + index[bin_startpts[binidx] + 
sortidx[i]] = i; + } } -template -__global__ void spread_3d_block_gather(const T *x, const T *y, const T *z, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, - T sigma, const int *binstartpts, int obin_size_x, int obin_size_y, - int obin_size_z, int binsperobin, int *subprob_to_bin, - const int *subprobstartpts, int maxsubprobsize, int nobinx, int nobiny, - int nobinz, const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, zstart, xend, yend, zend; - int xstartnew, ystartnew, zstartnew, xendnew, yendnew, zendnew; - int subpidx = blockIdx.x; - int obidx = subprob_to_bin[subpidx]; - int bidx = obidx * binsperobin; - - int obinsubp_idx = subpidx - subprobstartpts[obidx]; - int ix, iy, iz; - int outidx; - int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; - int nupts = - min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - obinsubp_idx * maxsubprobsize); - - int xoffset = (obidx % nobinx) * obin_size_x; - int yoffset = (obidx / nobinx) % nobiny * obin_size_y; - int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z; - - int N = obin_size_x * obin_size_y * obin_size_z; - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; +template +__global__ void spread_3d_block_gather( + const T *x, const T *y, const T *z, const cuda_complex *c, cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, + const int *binstartpts, int obin_size_x, int obin_size_y, int obin_size_z, + int binsperobin, int *subprob_to_bin, const int *subprobstartpts, int maxsubprobsize, + int nobinx, int nobiny, int nobinz, const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + + int xstart, ystart, zstart, xend, yend, zend; 
+ int xstartnew, ystartnew, zstartnew, xendnew, yendnew, zendnew; + int subpidx = blockIdx.x; + int obidx = subprob_to_bin[subpidx]; + int bidx = obidx * binsperobin; + + int obinsubp_idx = subpidx - subprobstartpts[obidx]; + int ix, iy, iz; + int outidx; + int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; + int nupts = min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - + obinsubp_idx * maxsubprobsize); + + int xoffset = (obidx % nobinx) * obin_size_x; + int yoffset = (obidx / nobinx) % nobiny * obin_size_y; + int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z; + + int N = obin_size_x * obin_size_y * obin_size_z; + + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i].x = 0.0; + fwshared[i].y = 0.0; + } + __syncthreads(); + + T x_rescaled, y_rescaled, z_rescaled; + cuda_complex cnow; + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + int nidx = idxnupts[ptstart + i]; + int b = nidx / M; + int box[3]; + for (int d = 0; d < 3; d++) { + box[d] = b % 3; + if (box[d] == 1) box[d] = -1; + if (box[d] == 2) box[d] = 1; + b = b / 3; + } + int ii = nidx % M; + x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1; + y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2; + z_rescaled = fold_rescale(z[ii], nf3) + box[2] * nf3; + cnow = c[ii]; + + xstart = ceil(x_rescaled - ns / 2.0) - xoffset; + ystart = ceil(y_rescaled - ns / 2.0) - yoffset; + zstart = ceil(z_rescaled - ns / 2.0) - zoffset; + xend = floor(x_rescaled + ns / 2.0) - xoffset; + yend = floor(y_rescaled + ns / 2.0) - yoffset; + zend = floor(z_rescaled + ns / 2.0) - zoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); + eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); + eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); + } else { + eval_kernel_vec(ker1, xstart + xoffset - 
x_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); } - __syncthreads(); - - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int nidx = idxnupts[ptstart + i]; - int b = nidx / M; - int box[3]; - for (int d = 0; d < 3; d++) { - box[d] = b % 3; - if (box[d] == 1) - box[d] = -1; - if (box[d] == 2) - box[d] = 1; - b = b / 3; - } - int ii = nidx % M; - x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1; - y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2; - z_rescaled = fold_rescale(z[ii], nf3) + box[2] * nf3; - cnow = c[ii]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - zend = floor(z_rescaled + ns / 2.0) - zoffset; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); - } - xstartnew = xstart < 0 ? 0 : xstart; - ystartnew = ystart < 0 ? 0 : ystart; - zstartnew = zstart < 0 ? 0 : zstart; - xendnew = xend >= obin_size_x ? obin_size_x - 1 : xend; - yendnew = yend >= obin_size_y ? obin_size_y - 1 : yend; - zendnew = zend >= obin_size_z ? 
obin_size_z - 1 : zend; - - for (int zz = zstartnew; zz <= zendnew; zz++) { - T kervalue3 = ker3[zz - zstart]; - for (int yy = ystartnew; yy <= yendnew; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstartnew; xx <= xendnew; xx++) { - outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; - T kervalue1 = ker1[xx - xstart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); - } - } + xstartnew = xstart < 0 ? 0 : xstart; + ystartnew = ystart < 0 ? 0 : ystart; + zstartnew = zstart < 0 ? 0 : zstart; + xendnew = xend >= obin_size_x ? obin_size_x - 1 : xend; + yendnew = yend >= obin_size_y ? obin_size_y - 1 : yend; + zendnew = zend >= obin_size_z ? obin_size_z - 1 : zend; + + for (int zz = zstartnew; zz <= zendnew; zz++) { + T kervalue3 = ker3[zz - zstart]; + for (int yy = ystartnew; yy <= yendnew; yy++) { + T kervalue2 = ker2[yy - ystart]; + for (int xx = xstartnew; xx <= xendnew; xx++) { + outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; + T kervalue1 = ker1[xx - xstart]; + atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); + atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); } + } } - __syncthreads(); - /* write to global memory */ - for (int n = threadIdx.x; n < N; n += blockDim.x) { - int i = n % obin_size_x; - int j = (n / obin_size_x) % obin_size_y; - int k = n / (obin_size_x * obin_size_y); - - ix = xoffset + i; - iy = yoffset + j; - iz = zoffset + k; - outidx = ix + iy * nf1 + iz * nf1 * nf2; - atomicAdd(&fw[outidx].x, fwshared[n].x); - atomicAdd(&fw[outidx].y, fwshared[n].y); - } + } + __syncthreads(); + /* write to global memory */ + for (int n = threadIdx.x; n < N; n += blockDim.x) { + int i = n % obin_size_x; + int j = (n / obin_size_x) % obin_size_y; + int k = n / (obin_size_x * obin_size_y); + + ix = xoffset + i; + iy = yoffset + j; + iz = zoffset + k; + outidx = ix 
+ iy * nf1 + iz * nf1 * nf2; + atomicAdd(&fw[outidx].x, fwshared[n].x); + atomicAdd(&fw[outidx].y, fwshared[n].y); + } } /* ---------------------- 3d Interpolation Kernels ---------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_3d_nupts_driven(const T *x, const T *y, const T *z, cuda_complex *c, - const cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, - T es_beta, T sigma, int *idxnupts) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - T y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - T z_rescaled = fold_rescale(z[idxnupts[i]], nf3); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int zstart = ceil(z_rescaled - ns / 2.0); - - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - int zend = floor(z_rescaled + ns / 2.0); - - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart - z_rescaled, ns, es_c, es_beta); - } +template +__global__ void interp_3d_nupts_driven( + const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * 
gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto z_rescaled = fold_rescale(z[idxnupts[i]], nf3); + + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + const auto [zstart, zend] = interval(ns, z_rescaled); + + const auto x1 = T(xstart) - x_rescaled; + const auto y1 = T(ystart) - y_rescaled; + const auto z1 = T(zstart) - z_rescaled; + + cuda_complex cnow{0, 0}; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); + } - for (int zz = zstart; zz <= zend; zz++) { - T kervalue3 = ker3[zz - zstart]; - int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - int inidx = ix + iy * nf1 + iz * nf2 * nf1; - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[inidx].x * kervalue1 * kervalue2 * kervalue3; - cnow.y += fw[inidx].y * kervalue1 * kervalue2 * kervalue3; - } - } + for (int zz = zstart; zz <= zend; zz++) { + const auto kervalue3 = ker3[zz - zstart]; + int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); + for (int yy = ystart; yy <= yend; yy++) { + const auto kervalue2 = ker2[yy - ystart]; + int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + for (int xx = xstart; xx <= xend; xx++) { + const int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); + const int inidx = ix + iy * nf1 + iz * nf2 * nf1; + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fw[inidx].x * kervalue1 * kervalue2 * kervalue3; + cnow.y += fw[inidx].y * kervalue1 * kervalue2 * kervalue3; } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + } } + c[idxnupts[i]].x = cnow.x; + c[idxnupts[i]].y = cnow.y; + } } /* Kernels for SubProb Method */ -template -__global__ void interp_3d_subprob(const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, - int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, - const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, - int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz, - const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend, zstart, zend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy, iz; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; - int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)); - - for (int n = threadIdx.x; n < N; n += blockDim.x) { - int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0)); - int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0))); - - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - iz = zoffset - ceil(ns / 2.0) + k; - if (ix < (nf1 + ceil(ns / 
2.0)) && iy < (nf2 + ceil(ns / 2.0)) && iz < (nf3 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); - outidx = ix + iy * nf1 + iz * nf1 * nf2; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) + - k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; - } +template +__global__ void interp_3d_subprob( + const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, + const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, + int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + auto fwshared = (cuda_complex *)sharedbuf; + + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; + + const auto subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const auto xoffset = (bidx % nbinx) * bin_size_x; + const auto yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; + const auto zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + + const int N = + (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns) * (bin_size_z + rounded_ns); + + for (int n = threadIdx.x; n < N; n += blockDim.x) { + int i = n % (bin_size_x + rounded_ns); + 
int j = (n / (bin_size_x + rounded_ns)) % (bin_size_y + rounded_ns); + int k = n / ((bin_size_x + rounded_ns) * (bin_size_y + rounded_ns)); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + auto iz = zoffset - ns_2 + k; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2) && iz < (nf3 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); + const auto outidx = ix + iy * nf1 + iz * nf1 * nf2; + int sharedidx = i + j * (bin_size_x + rounded_ns) + + k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + fwshared[sharedidx].x = fw[outidx].x; + fwshared[sharedidx].y = fw[outidx].y; + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + const int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + const auto z_rescaled = fold_rescale(z[idxnupts[idx]], nf3); + cuda_complex cnow{0, 0}; + + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + auto [zstart, zend] = interval(ns, z_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + const T z1 = T(zstart) - z_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + zstart -= zoffset; + + xend -= xoffset; + yend -= yoffset; + zend -= zoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); } - __syncthreads(); - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += 
blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - z_rescaled = fold_rescale(z[idxnupts[idx]], nf3); - cnow.x = 0.0; - cnow.y = 0.0; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - zend = floor(z_rescaled + ns / 2.0) - zoffset; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); - } - for (int zz = zstart; zz <= zend; zz++) { - T kervalue3 = ker3[zz - zstart]; - iz = zz + ceil(ns / 2.0); - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - iy = yy + ceil(ns / 2.0); - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) + - iz * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fwshared[outidx].x * kervalue1 * kervalue2 * kervalue3; - cnow.y += fwshared[outidx].y * kervalue1 * kervalue2 * kervalue3; - } - } + for (int zz = zstart; zz <= zend; zz++) { + const auto kervalue3 = ker3[zz - zstart]; + const auto iz = zz + ns_2; + for (int yy = ystart; yy <= yend; yy++) { + const auto kervalue2 = ker2[yy - ystart]; + const auto iy = yy + ns_2; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + const auto outidx = ix + iy * (bin_size_x + rounded_ns) + + iz 
* (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fwshared[outidx].x * kervalue1 * kervalue2 * kervalue3; + cnow.y += fwshared[outidx].y * kervalue1 * kervalue2 * kervalue3; } - c[idxnupts[idx]].x = cnow.x; - c[idxnupts[idx]].y = cnow.y; + } } + c[idxnupts[idx]].x = cnow.x; + c[idxnupts[idx]].y = cnow.y; + } } } // namespace spreadinterp diff --git a/src/cuda/common.cu b/src/cuda/common.cu index e7ce65b52..1552076ee 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -310,9 +310,18 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { case 0: case 1: case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez; + if (opts->gpu_binsizex < 0 || opts->gpu_binsizey < 0 || opts->gpu_binsizez < 0) { + opts->gpu_binsizex = 16; + opts->gpu_binsizey = 16; + opts->gpu_binsizez = 2; + // const auto shared_mem_required = shared_memory_required( + // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, + // opts->gpu_binsizez); + // printf( + // "binsizex: %d, binsizey: %d, binsizez: %d shared_mem_required %ld + // (bytes)\n", opts->gpu_binsizex, opts->gpu_binsizey, + // opts->gpu_binsizez, shared_mem_required); + } } break; case 4: { opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 
8 : opts->gpu_obinsizex; From 24bf6beb68e88c05ea2c9fa1bbb23eb4a787fb51 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 18 Jul 2024 15:43:57 -0400 Subject: [PATCH 12/68] 3D SM and GM optimized --- perftest/cuda/bench.py | 2 +- src/cuda/common.cu | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 8a9e757a3..7af6b0bc1 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -74,7 +74,7 @@ def build_args(args): for i in range(1, 7): args["--tol"] = "1E-" + str(i) print("Running with tol = 1E-" + str(i)) - for method in ['2', '1']: + for method in ['4', '2']: args["--method"] = method if method == '0': data['method'].append('auto') diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 1552076ee..64c5639dc 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -277,11 +277,7 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { case 2: { opts->gpu_binsizex = 32; opts->gpu_binsizey = 32; - // fall through otherwise - if (opts->gpu_method && ns > 2) { - break; - } - } + } break; case 1: { cudaGetDevice(&device_id); if (const auto err = cudaGetLastError(); err != cudaSuccess) { From 960117a33109b60001797cf2045992a04f3a8406 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 18 Jul 2024 16:26:58 -0400 Subject: [PATCH 13/68] bump cuda version --- Jenkinsfile | 2 +- perftest/cuda/bench.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6600c1cc3..c733a9436 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,7 +9,7 @@ pipeline { stage('main') { agent { dockerfile { - filename 'tools/cufinufft/docker/cuda11.2/Dockerfile-x86_64' + filename 'tools/cufinufft/docker/cuda12.0/Dockerfile-x86_64' args '--gpus 2' label 'v100' } diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 7af6b0bc1..8a9e757a3 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -74,7 +74,7 @@ def 
build_args(args): for i in range(1, 7): args["--tol"] = "1E-" + str(i) print("Running with tol = 1E-" + str(i)) - for method in ['4', '2']: + for method in ['2', '1']: args["--method"] = method if method == '0': data['method'].append('auto') From c1b14c66b34e737dc6ce48a2ab2e7d997c0b0187 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 11:22:39 -0400 Subject: [PATCH 14/68] changed matlab to generate necessary cuda upsampfact files --- devel/gen_all_horner_C_code.m | 8 ++++---- devel/gen_ker_horner_loop_C_code.m | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/devel/gen_all_horner_C_code.m b/devel/gen_all_horner_C_code.m index 5ac28cb95..360725570 100644 --- a/devel/gen_all_horner_C_code.m +++ b/devel/gen_all_horner_C_code.m @@ -9,11 +9,11 @@ opts = struct(); ws = 2:16; -upsampfac = 2; % sigma (upsampling): either 2 (default) or low (eg 5/4). -opts.wpad = true; % pad kernel eval to multiple of 4 +upsampfac = 1.25; % sigma (upsampling): either 2 (default) or low (eg 5/4). 
+opts.wpad = false; % pad kernel eval to multiple of 4 -if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop.c','w'); -else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop.c','w'); +if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop.inc','w'); +else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc','w'); end fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n')); fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n')); diff --git a/devel/gen_ker_horner_loop_C_code.m b/devel/gen_ker_horner_loop_C_code.m index 12fe74baa..9c0b6d1ed 100644 --- a/devel/gen_ker_horner_loop_C_code.m +++ b/devel/gen_ker_horner_loop_C_code.m @@ -35,7 +35,7 @@ width = w; end for n=1:d % loop over poly coeff powers - s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1)); + s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1)); for i=2:width % loop over segments s = sprintf('%s, %.16E', s, C(n,i)); end From f300d2d8839cdc51381ef6516c18e25aeb1060ab Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 11:26:29 -0400 Subject: [PATCH 15/68] added new coeffs --- .../contrib/ker_horner_allw_loop.inc | 389 ++++++++---------- .../ker_lowupsampfac_horner_allw_loop.inc | 192 +++++++++ 2 files changed, 375 insertions(+), 206 deletions(-) create mode 100644 include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index 1178a8544..953c4618b 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -1,230 +1,207 @@ // Code generated by gen_all_horner_C_code.m in finufft/devel // Authors: Alex Barnett & Ludvig af Klinteberg. -// (C) 2018, The Simons Foundation, Inc. +// (C) The Simons Foundation, Inc. 
if (w==2) { - constexpr CUFINUFFT_FLT c0[] = {4.5147043243215343E+01, 4.5147043243215336E+01}; - constexpr CUFINUFFT_FLT c1[] = {5.7408070938221300E+01, -5.7408070938221293E+01}; - constexpr CUFINUFFT_FLT c2[] = {-1.8395117920046662E+00, -1.8395117920046617E+00}; - constexpr CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01}; - constexpr CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00}; - constexpr CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00}; - for (int i = 0; i < 2; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, c5[i], c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = {4.5147043243215343E+01, 4.5147043243215350E+01}; + constexpr FLT c1[] = {5.7408070938221307E+01, -5.7408070938221300E+01}; + constexpr FLT c2[] = {-1.8395117920046544E+00, -1.8395117920046602E+00}; + constexpr FLT c3[] = {-2.0382426253182064E+01, 2.0382426253182086E+01}; + constexpr FLT c4[] = {-2.0940804433577389E+00, -2.0940804433577398E+00}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); } else if (w==3) { - constexpr CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02}; - constexpr CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, -3.1653018868907077E+02}; - constexpr CUFINUFFT_FLT c2[] = {1.7742692790454473E+02, -3.3149255274727807E+02, 1.7742692791117116E+02}; - constexpr CUFINUFFT_FLT c3[] = {-1.5357716116473128E+01, -5.1917435849174007E-16, 1.5357716122720189E+01}; - constexpr CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01}; - constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; - constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; -for (int i=0; i<3; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c6[i], c5[i]), c4[i]), c3[i]), 
c2[i]), c1[i]), c0[i]); -} + constexpr FLT c0[] = {1.5653991189315130E+02, 8.8006872410780375E+02, 1.5653991189967169E+02}; + constexpr FLT c1[] = {3.1653018869611083E+02, 2.7828437114531882E-14, -3.1653018868907077E+02}; + constexpr FLT c2[] = {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117128E+02}; + constexpr FLT c3[] = {-1.5357716116473071E+01, 1.0675641863333163E-13, 1.5357716122720211E+01}; + constexpr FLT c4[] = {-3.7757583061523640E+01, 5.3222970968867450E+01, -3.7757583054647341E+01}; + constexpr FLT c5[] = {-3.9654011076088449E+00, 4.9521033695040343E-14, 3.9654011139270429E+00}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); } else if (w==4) { - constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; - constexpr CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03}; - constexpr CUFINUFFT_FLT c2[] = {1.4186910680718343E+03, -1.3995339862725584E+03, -1.3995339862725591E+03, 1.4186910680718338E+03}; - constexpr CUFINUFFT_FLT c3[] = {5.1133995502497419E+02, -1.4191608683682987E+03, 1.4191608683682980E+03, -5.1133995502497419E+02}; - constexpr CUFINUFFT_FLT c4[] = {-4.8293622641173549E+01, 3.9393732546136526E+01, 3.9393732546137308E+01, -4.8293622641173634E+01}; - constexpr CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01}; - constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; - constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; - for (int i=0; i<4; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = 
{5.4284366850213223E+02, 1.0073871433088407E+04, 1.0073871433088407E+04, 5.4284366850213269E+02}; + constexpr FLT c1[] = {1.4650917259256942E+03, 6.1905285583602899E+03, -6.1905285583602899E+03, -1.4650917259256942E+03}; + constexpr FLT c2[] = {1.4186910680718349E+03, -1.3995339862725573E+03, -1.3995339862725571E+03, 1.4186910680718345E+03}; + constexpr FLT c3[] = {5.1133995502497481E+02, -1.4191608683682980E+03, 1.4191608683682985E+03, -5.1133995502497402E+02}; + constexpr FLT c4[] = {-4.8293622641173705E+01, 3.9393732546135901E+01, 3.9393732546136945E+01, -4.8293622641173727E+01}; + constexpr FLT c5[] = {-7.8386867802392203E+01, 1.4918904800408794E+02, -1.4918904800408947E+02, 7.8386867802392203E+01}; + constexpr FLT c6[] = {-1.0039212571700403E+01, 5.0626747735617119E+00, 5.0626747735622777E+00, -1.0039212571700599E+01}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); } else if (w==5) { - constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; - constexpr CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03}; - constexpr CUFINUFFT_FLT c2[] = {3.6092689177271218E+03, 7.7501368899498566E+03, -2.2704627332474989E+04, 7.7501368899498684E+03, 3.6092689177271227E+03}; - constexpr CUFINUFFT_FLT c3[] = {1.9990077310495410E+03, -3.8875294641277278E+03, 3.8628399128660033E-12, 3.8875294641277342E+03, -1.9990077310495410E+03}; - constexpr CUFINUFFT_FLT c4[] = {4.0071733590403858E+02, -1.5861137916762520E+03, 2.3839858699098813E+03, -1.5861137916762589E+03, 4.0071733590403880E+02}; - constexpr CUFINUFFT_FLT c5[] = {-9.1301168206167731E+01, 1.2316471075214690E+02, 1.0425607383569405E-11, -1.2316471075215136E+02, 9.1301168206167446E+01}; - constexpr CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, 
-1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01}; - constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; - constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; - for (int i=0; i<5; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = {9.9223677575398506E+02, 3.7794697666613349E+04, 9.8715771010760567E+04, 3.7794697666613327E+04, 9.9223677575398540E+02}; + constexpr FLT c1[] = {3.0430174925083834E+03, 3.7938404259811425E+04, -4.1880997701304513E-12, -3.7938404259811403E+04, -3.0430174925083829E+03}; + constexpr FLT c2[] = {3.6092689177271232E+03, 7.7501368899498630E+03, -2.2704627332475000E+04, 7.7501368899498721E+03, 3.6092689177271213E+03}; + constexpr FLT c3[] = {1.9990077310495410E+03, -3.8875294641277214E+03, 1.6137850891850780E-11, 3.8875294641277346E+03, -1.9990077310495410E+03}; + constexpr FLT c4[] = {4.0071733590403909E+02, -1.5861137916762543E+03, 2.3839858699098786E+03, -1.5861137916762577E+03, 4.0071733590403909E+02}; + constexpr FLT c5[] = {-9.1301168206167233E+01, 1.2316471075215087E+02, 1.9401736511657983E-12, -1.2316471075215495E+02, 9.1301168206166977E+01}; + constexpr FLT c6[] = {-5.5339722671222894E+01, 1.1960590540262304E+02, -1.5249941358312140E+02, 1.1960590540262024E+02, -5.5339722671224088E+01}; + constexpr FLT c7[] = {-3.3762488150349581E+00, 2.2839981873006558E+00, 8.2819625836083788E-12, -2.2839981872910400E+00, 3.3762488150351579E+00}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); } else if (w==6) { - constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 
8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; - constexpr CUFINUFFT_FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03}; - constexpr CUFINUFFT_FLT c2[] = {1.0023404568475091E+04, 9.0916650498360163E+04, -1.0095927514054625E+05, -1.0095927514054641E+05, 9.0916650498360133E+04, 1.0023404568484631E+04}; - constexpr CUFINUFFT_FLT c3[] = {7.2536109410387417E+03, 4.8347162752603172E+03, -5.0512736602018493E+04, 5.0512736602018464E+04, -4.8347162752602935E+03, -7.2536109410297549E+03}; - constexpr CUFINUFFT_FLT c4[] = {2.7021878300949775E+03, -7.8773465553971982E+03, 5.2105876478344171E+03, 5.2105876478344435E+03, -7.8773465553972501E+03, 2.7021878301048719E+03}; - constexpr CUFINUFFT_FLT c5[] = {3.2120291706547602E+02, -1.8229189469937089E+03, 3.7928113414428362E+03, -3.7928113414427862E+03, 1.8229189469936987E+03, -3.2120291705638107E+02}; - constexpr CUFINUFFT_FLT c6[] = {-1.2051267090537493E+02, 2.2400507411396228E+02, -1.2506575852544464E+02, -1.2506575852534223E+02, 2.2400507411397808E+02, -1.2051267089640046E+02}; - constexpr CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01}; - constexpr CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00}; - constexpr CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00}; - for (int i=0; i<6; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c9[i], c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = {2.0553833234911899E+03, 1.5499537739913145E+05, 8.1177907023291232E+05, 
8.1177907023291232E+05, 1.5499537739913145E+05, 2.0553833235005700E+03}; + constexpr FLT c1[] = {7.1269776034442684E+03, 2.0581923258843319E+05, 3.1559612614917679E+05, -3.1559612614917639E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}; + constexpr FLT c2[] = {1.0023404568475091E+04, 9.0916650498360206E+04, -1.0095927514054631E+05, -1.0095927514054631E+05, 9.0916650498360163E+04, 1.0023404568484637E+04}; + constexpr FLT c3[] = {7.2536109410387444E+03, 4.8347162752603444E+03, -5.0512736602018485E+04, 5.0512736602018602E+04, -4.8347162752602972E+03, -7.2536109410297577E+03}; + constexpr FLT c4[] = {2.7021878300949775E+03, -7.8773465553972374E+03, 5.2105876478343516E+03, 5.2105876478343944E+03, -7.8773465553972464E+03, 2.7021878301048723E+03}; + constexpr FLT c5[] = {3.2120291706547630E+02, -1.8229189469936912E+03, 3.7928113414428476E+03, -3.7928113414427171E+03, 1.8229189469937239E+03, -3.2120291705638328E+02}; + constexpr FLT c6[] = {-1.2051267090537345E+02, 2.2400507411399769E+02, -1.2506575852547746E+02, -1.2506575852531816E+02, 2.2400507411399730E+02, -1.2051267089640162E+02}; + constexpr FLT c7[] = {-4.5977202613346755E+01, 1.1536880606857032E+02, -1.7819720186492938E+02, 1.7819720186504426E+02, -1.1536880606851560E+02, 4.5977202622148354E+01}; + constexpr FLT c8[] = {-1.5631081288822022E+00, 7.1037430590520445E-01, -6.9838401262032682E-02, -6.9838401199524530E-02, 7.1037430591562767E-01, -1.5631081203751171E+00}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); } else if (w==7) { - constexpr CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03}; - constexpr CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, 
-1.5290160332958067E+04}; - constexpr CUFINUFFT_FLT c2[] = {2.4458227486779248E+04, 5.3904618484139408E+05, 2.4315566181017426E+05, -1.6133959371974319E+06, 2.4315566181017403E+05, 5.3904618484139384E+05, 2.4458227486795098E+04}; - constexpr CUFINUFFT_FLT c3[] = {2.1166189345881645E+04, 1.3382732160223144E+05, -3.3113450969689671E+05, -6.5160817568418758E-10, 3.3113450969689724E+05, -1.3382732160223127E+05, -2.1166189345866882E+04}; - constexpr CUFINUFFT_FLT c4[] = {1.0542795672344866E+04, -7.0739172265096213E+03, -6.5563293056048453E+04, 1.2429734005960147E+05, -6.5563293056048846E+04, -7.0739172265096058E+03, 1.0542795672361211E+04}; - constexpr CUFINUFFT_FLT c5[] = {2.7903491906228414E+03, -1.0975382873973065E+04, 1.3656979541144814E+04, 1.2638008605419305E-09, -1.3656979541144177E+04, 1.0975382873973065E+04, -2.7903491906078302E+03}; - constexpr CUFINUFFT_FLT c6[] = {1.6069721418053450E+02, -1.5518707872250775E+03, 4.3634273936637373E+03, -5.9891976420593228E+03, 4.3634273936637110E+03, -1.5518707872251396E+03, 1.6069721419533406E+02}; - constexpr CUFINUFFT_FLT c7[] = {-1.2289277373867886E+02, 2.8583630927743752E+02, -2.8318194617301111E+02, -8.6523823682922648E-10, 2.8318194617373905E+02, -2.8583630927755564E+02, 1.2289277375320185E+02}; - constexpr CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01}; - constexpr CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01}; - constexpr CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00}; - for (int i=0; i<7; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], 
c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = {3.9948351830487582E+03, 5.4715865608590841E+05, 5.0196413492771825E+06, 9.8206709220713321E+06, 5.0196413492771871E+06, 5.4715865608590853E+05, 3.9948351830642619E+03}; + constexpr FLT c1[] = {1.5290160332974698E+04, 8.7628248584320419E+05, 3.4421061790934466E+06, 6.5103105025927563E-10, -3.4421061790934466E+06, -8.7628248584320443E+05, -1.5290160332958061E+04}; + constexpr FLT c2[] = {2.4458227486779258E+04, 5.3904618484139442E+05, 2.4315566181017563E+05, -1.6133959371974308E+06, 2.4315566181017424E+05, 5.3904618484139396E+05, 2.4458227486795091E+04}; + constexpr FLT c3[] = {2.1166189345881652E+04, 1.3382732160223150E+05, -3.3113450969689601E+05, 2.5683270626620309E-10, 3.3113450969689793E+05, -1.3382732160223130E+05, -2.1166189345866896E+04}; + constexpr FLT c4[] = {1.0542795672344870E+04, -7.0739172265096349E+03, -6.5563293056048627E+04, 1.2429734005960199E+05, -6.5563293056048671E+04, -7.0739172265096395E+03, 1.0542795672361222E+04}; + constexpr FLT c5[] = {2.7903491906228451E+03, -1.0975382873972989E+04, 1.3656979541145318E+04, 4.9801640867456605E-10, -1.3656979541144143E+04, 1.0975382873973054E+04, -2.7903491906078325E+03}; + constexpr FLT c6[] = {1.6069721418054232E+02, -1.5518707872249406E+03, 4.3634273936649897E+03, -5.9891976420600004E+03, 4.3634273936636964E+03, -1.5518707872250636E+03, 1.6069721419532380E+02}; + constexpr FLT c7[] = {-1.2289277373866669E+02, 2.8583630927761948E+02, -2.8318194617245649E+02, -3.5832266061541795E-11, 2.8318194617438041E+02, -2.8583630927744588E+02, 1.2289277375319726E+02}; + constexpr FLT c8[] = {-3.2270164914244575E+01, 9.1892112257588494E+01, -1.6710678096380749E+02, 2.0317049305436126E+02, -1.6710678096299210E+02, 9.1892112257580479E+01, -3.2270164900216493E+01}; + constexpr FLT c9[] = {-1.4761409684320093E-01, -9.1862771282699351E-01, 1.2845147740384601E+00, -5.0335941641611417E-10, -1.2845147731561353E+00, 
9.1862771293147938E-01, 1.4761410890830065E-01}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==8) { - constexpr CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03}; - constexpr CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04}; - constexpr CUFINUFFT_FLT c2[] = {5.4488498478251720E+04, 2.4101183255475122E+06, 6.4554051283428278E+06, -8.9200440393090621E+06, -8.9200440393090658E+06, 6.4554051283428278E+06, 2.4101183255475122E+06, 5.4488498478251720E+04}; - constexpr CUFINUFFT_FLT c3[] = {5.3926359802542131E+04, 9.0469037926849292E+05, -6.0897036277696094E+05, -3.0743852105800072E+06, 3.0743852105800039E+06, 6.0897036277696339E+05, -9.0469037926849292E+05, -5.3926359802542116E+04}; - constexpr CUFINUFFT_FLT c4[] = {3.2444118016247583E+04, 1.3079802224392195E+05, -5.8652889370128501E+05, 4.2333306008153502E+05, 4.2333306008153904E+05, -5.8652889370128524E+05, 1.3079802224392162E+05, 3.2444118016247587E+04}; - constexpr CUFINUFFT_FLT c5[] = {1.1864306345505289E+04, -2.2700360645707628E+04, -5.0713607251413239E+04, 1.8308704458211805E+05, -1.8308704458211269E+05, 5.0713607251412053E+04, 2.2700360645707922E+04, -1.1864306345505289E+04}; - constexpr CUFINUFFT_FLT c6[] = {2.2812256770903182E+03, -1.1569135767378117E+04, 2.0942387020799080E+04, -1.1661592834949530E+04, -1.1661592834949715E+04, 2.0942387020801576E+04, -1.1569135767377431E+04, 2.2812256770903446E+03}; - constexpr CUFINUFFT_FLT c7[] = {8.5503535636805026E+00, -9.7513976461269635E+02, 3.8242995179157779E+03, -6.9201295567256420E+03, 6.9201295567222760E+03, 
-3.8242995179195914E+03, 9.7513976461218783E+02, -8.5503535636857091E+00}; - constexpr CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, -3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02}; - constexpr CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01}; - constexpr CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, -1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01}; - for (int i = 0; i < 8; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = {7.3898000697448142E+03, 1.7297637497600052E+06, 2.5578341605285820E+07, 8.4789650417103425E+07, 8.4789650417103410E+07, 2.5578341605285831E+07, 1.7297637497600054E+06, 7.3898000697448097E+03}; + constexpr FLT c1[] = {3.0719636811267621E+04, 3.1853145713323932E+06, 2.3797981861403704E+07, 2.4569731244678468E+07, -2.4569731244678475E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267595E+04}; + constexpr FLT c2[] = {5.4488498478251728E+04, 2.4101183255475122E+06, 6.4554051283428418E+06, -8.9200440393090658E+06, -8.9200440393090583E+06, 6.4554051283428296E+06, 2.4101183255475126E+06, 5.4488498478251728E+04}; + constexpr FLT c3[] = {5.3926359802542138E+04, 9.0469037926849385E+05, -6.0897036277695757E+05, -3.0743852105799988E+06, 3.0743852105800197E+06, 6.0897036277696723E+05, -9.0469037926849280E+05, -5.3926359802542152E+04}; + constexpr FLT c4[] = {3.2444118016247576E+04, 1.3079802224392162E+05, -5.8652889370128687E+05, 4.2333306008153327E+05, 4.2333306008153543E+05, 
-5.8652889370128710E+05, 1.3079802224392179E+05, 3.2444118016247601E+04}; + constexpr FLT c5[] = {1.1864306345505300E+04, -2.2700360645707835E+04, -5.0713607251411129E+04, 1.8308704458211461E+05, -1.8308704458211147E+05, 5.0713607251410089E+04, 2.2700360645707704E+04, -1.1864306345505296E+04}; + constexpr FLT c6[] = {2.2812256770903396E+03, -1.1569135767377908E+04, 2.0942387020802456E+04, -1.1661592834947036E+04, -1.1661592834946512E+04, 2.0942387020804370E+04, -1.1569135767377549E+04, 2.2812256770903291E+03}; + constexpr FLT c7[] = {8.5503535636977634E+00, -9.7513976461196773E+02, 3.8242995179186414E+03, -6.9201295567263214E+03, 6.9201295567309990E+03, -3.8242995179140653E+03, 9.7513976461263269E+02, -8.5503535636935535E+00}; + constexpr FLT c8[] = {-1.0230637348345098E+02, 2.8246898554249236E+02, -3.8638201738252542E+02, 1.9106407992706994E+02, 1.9106407993520349E+02, -3.8638201738414602E+02, 2.8246898554297724E+02, -1.0230637348344338E+02}; + constexpr FLT c9[] = {-1.9200143062942033E+01, 6.1692257626381128E+01, -1.2981109187954436E+02, 1.8681284209765820E+02, -1.8681284209914423E+02, 1.2981109187880136E+02, -6.1692257626381128E+01, 1.9200143062947838E+01}; + constexpr FLT c10[] = {3.7894993761363543E-01, -1.7334408835887836E+00, 2.5271184092462979E+00, -1.2600963912775105E+00, -1.2600963880718390E+00, 2.5271184126204269E+00, -1.7334408829982433E+00, 3.7894993761427903E-01}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==9) { - constexpr CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04}; - constexpr CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, 
-3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04}; - constexpr CUFINUFFT_FLT c2[] = {1.1335001341875963E+05, 9.0726133144784775E+06, 5.3501544534038082E+07, -2.6789524644150439E+05, -1.2483923718899380E+08, -2.6789524644173466E+05, 5.3501544534038067E+07, 9.0726133144785129E+06, 1.1335001341875964E+05}; - constexpr CUFINUFFT_FLT c3[] = {1.2489113703229750E+05, 4.3035547171861930E+06, 6.3021978510598894E+06, -2.6014941986659020E+07, 2.8258041381448560E-08, 2.6014941986659355E+07, -6.3021978510598978E+06, -4.3035547171862079E+06, -1.2489113703229750E+05}; - constexpr CUFINUFFT_FLT c4[] = {8.6425493435991229E+04, 1.0891182836653332E+06, -2.0713033564200329E+06, -2.8994941183505855E+06, 7.5905338661207352E+06, -2.8994941183504057E+06, -2.0713033564200525E+06, 1.0891182836653360E+06, 8.6425493435991244E+04}; - constexpr CUFINUFFT_FLT c5[] = {3.8657354724013807E+04, 7.9936390113327987E+04, -7.0458265546792350E+05, 1.0151095605715724E+06, 8.7808418931366203E-08, -1.0151095605718571E+06, 7.0458265546792292E+05, -7.9936390113333473E+04, -3.8657354724013807E+04}; - constexpr CUFINUFFT_FLT c6[] = {1.0779131453134632E+04, -3.3466718311303863E+04, -1.3245366619006214E+04, 1.8238470515351585E+05, -2.9285656292984058E+05, 1.8238470515350348E+05, -1.3245366619016511E+04, -3.3466718311298035E+04, 1.0779131453134652E+04}; - constexpr CUFINUFFT_FLT c7[] = {1.4992527030548451E+03, -9.7024371533906651E+03, 2.3216330734046409E+04, -2.3465262819075571E+04, -3.7031099746142328E-08, 2.3465262819179152E+04, -2.3216330734079289E+04, 9.7024371533883768E+03, -1.4992527030548429E+03}; - constexpr CUFINUFFT_FLT c8[] = {-7.9857427421137089E+01, -4.0585588534737309E+02, 2.6054813773474157E+03, -6.1806593581211082E+03, 8.0679596873751289E+03, -6.1806593581509942E+03, 2.6054813773256465E+03, -4.0585588535330419E+02, -7.9857427421164303E+01}; - constexpr CUFINUFFT_FLT c9[] = {-7.1572272057931258E+01, 2.2785637019446185E+02, -3.9109820765219445E+02, 
3.3597424707607246E+02, 1.7793576396134983E-08, -3.3597424727519928E+02, 3.9109820766111056E+02, -2.2785637019102543E+02, 7.1572272057951565E+01}; - constexpr CUFINUFFT_FLT c10[] = {-9.8886360698029030E+00, 3.5359026948517517E+01, -8.5251867695464824E+01, 1.4285748015591199E+02, -1.6935269673908536E+02, 1.4285748008591776E+02, -8.5251867720434134E+01, 3.5359026945818123E+01, -9.8886360698009241E+00}; - constexpr CUFINUFFT_FLT c11[] = {5.4050464453063796E-01, -1.7215219066697895E+00, 2.8631741265441102E+00, -2.3817977385844018E+00, -1.0173343205540475E-08, 2.3817977172440110E+00, -2.8631741497139487E+00, 1.7215219081941548E+00, -5.4050464453541269E-01}; - for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + constexpr FLT c0[] = {1.3136365370186153E+04, 5.0196413492771843E+06, 1.1303327711722577E+08, 5.8225443924996734E+08, 9.7700272582690716E+08, 5.8225443924996805E+08, 1.1303327711722578E+08, 5.0196413492772263E+06, 1.3136365370186144E+04}; + constexpr FLT c1[] = {5.8623313038274369E+04, 1.0326318537280345E+07, 1.2898448324824868E+08, 3.0522863709830379E+08, 7.2435840302079811E-08, -3.0522863709830397E+08, -1.2898448324824865E+08, -1.0326318537280394E+07, -5.8623313038274347E+04}; + constexpr FLT c2[] = {1.1335001341875960E+05, 9.0726133144784812E+06, 5.3501544534038134E+07, -2.6789524644140172E+05, -1.2483923718899371E+08, -2.6789524644173466E+05, 5.3501544534038089E+07, 9.0726133144785147E+06, 1.1335001341875963E+05}; + constexpr FLT c3[] = {1.2489113703229754E+05, 4.3035547171861976E+06, 6.3021978510599164E+06, -2.6014941986658975E+07, 5.3074599277157087E-08, 2.6014941986659400E+07, -6.3021978510598680E+06, -4.3035547171862088E+06, -1.2489113703229751E+05}; + constexpr FLT c4[] = {8.6425493435991244E+04, 1.0891182836653311E+06, -2.0713033564200432E+06, -2.8994941183505901E+06, 7.5905338661206560E+06, -2.8994941183505324E+06, 
-2.0713033564200350E+06, 1.0891182836653385E+06, 8.6425493435991288E+04}; + constexpr FLT c5[] = {3.8657354724013800E+04, 7.9936390113329253E+04, -7.0458265546791849E+05, 1.0151095605715540E+06, 7.5990350518026299E-08, -1.0151095605718379E+06, 7.0458265546793933E+05, -7.9936390113333939E+04, -3.8657354724013821E+04}; + constexpr FLT c6[] = {1.0779131453134645E+04, -3.3466718311300116E+04, -1.3245366618985940E+04, 1.8238470515354761E+05, -2.9285656292981049E+05, 1.8238470515352563E+05, -1.3245366618989963E+04, -3.3466718311299133E+04, 1.0779131453134627E+04}; + constexpr FLT c7[] = {1.4992527030548656E+03, -9.7024371533879767E+03, 2.3216330734078529E+04, -2.3465262819038293E+04, -4.5678067266366728E-08, 2.3465262819229152E+04, -2.3216330734050898E+04, 9.7024371533899721E+03, -1.4992527030548690E+03}; + constexpr FLT c8[] = {-7.9857427421152821E+01, -4.0585588534976301E+02, 2.6054813773370911E+03, -6.1806593581469824E+03, 8.0679596873459095E+03, -6.1806593581737125E+03, 2.6054813773390433E+03, -4.0585588535087578E+02, -7.9857427421118601E+01}; + constexpr FLT c9[] = {-7.1572272057928345E+01, 2.2785637019390455E+02, -3.9109820766111051E+02, 3.3597424707310040E+02, -1.3908671051550088E-08, -3.3597424727519922E+02, 3.9109820767448468E+02, -2.2785637019111829E+02, 7.1572272057948652E+01}; + constexpr FLT c10[] = {-9.8886360697883688E+00, 3.5359026950204516E+01, -8.5251867695464611E+01, 1.4285748013461193E+02, -1.6935269664190733E+02, 1.4285748014610570E+02, -8.5251867686017064E+01, 3.5359026947336602E+01, -9.8886360697963340E+00}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==10) { - constexpr CUFINUFFT_FLT c0[] = {2.2594586605749279E+04, 1.3595989066786604E+07, 4.4723032442444921E+08, 3.3781755837397542E+09, 8.6836783895849838E+09, 8.6836783895849819E+09, 3.3781755837397518E+09, 4.4723032442444921E+08, 1.3595989066786485E+07, 
2.2594586605749315E+04}; - constexpr CUFINUFFT_FLT c1[] = {1.0729981697645642E+05, 3.0651490267742988E+07, 5.9387966085130477E+08, 2.4434902657508340E+09, 2.0073077861288924E+09, -2.0073077861288958E+09, -2.4434902657508330E+09, -5.9387966085130465E+08, -3.0651490267742820E+07, -1.0729981697645631E+05}; - constexpr CUFINUFFT_FLT c2[] = {2.2340399734184594E+05, 3.0258214643190444E+07, 3.1512411458738214E+08, 4.3618276932319784E+08, -7.8178848450497377E+08, -7.8178848450497079E+08, 4.3618276932319820E+08, 3.1512411458738226E+08, 3.0258214643190306E+07, 2.2340399734184553E+05}; - constexpr CUFINUFFT_FLT c3[] = {2.6917433004353492E+05, 1.6875651476661235E+07, 7.4664745481963485E+07, -9.5882157211117983E+07, -2.0622994435532546E+08, 2.0622994435532695E+08, 9.5882157211117893E+07, -7.4664745481963441E+07, -1.6875651476661157E+07, -2.6917433004353417E+05}; - constexpr CUFINUFFT_FLT c4[] = {2.0818422772177903E+05, 5.6084730690362593E+06, 1.4435118192352918E+06, -4.0063869969543688E+07, 3.2803674392747905E+07, 3.2803674392747425E+07, -4.0063869969546065E+07, 1.4435118192351861E+06, 5.6084730690362072E+06, 2.0818422772177853E+05}; - constexpr CUFINUFFT_FLT c5[] = {1.0781139496011089E+05, 9.9202615851199115E+05, -3.3266265543962144E+06, -4.8557049011465441E+05, 1.0176155522771550E+07, -1.0176155522773480E+07, 4.8557049011624791E+05, 3.3266265543963145E+06, -9.9202615851196367E+05, -1.0781139496011069E+05}; - constexpr CUFINUFFT_FLT c6[] = {3.7380102688153507E+04, 1.2716675000354149E+04, -6.2163527451780590E+05, 1.4157962667182824E+06, -8.4419693137806712E+05, -8.4419693137792684E+05, 1.4157962667183836E+06, -6.2163527451768133E+05, 1.2716675000338953E+04, 3.7380102688153551E+04}; - constexpr CUFINUFFT_FLT c7[] = {8.1238936393894865E+03, -3.4872365530450799E+04, 2.3913680325180554E+04, 1.2428850301840073E+05, -3.2158255329732876E+05, 3.2158255329921009E+05, -1.2428850301906197E+05, -2.3913680325219862E+04, 3.4872365530457639E+04, -8.1238936393893855E+03}; - constexpr 
CUFINUFFT_FLT c8[] = {7.8515926628983277E+02, -6.6607899119362401E+03, 2.0167398338517272E+04, -2.8951401344174039E+04, 1.4622828141519254E+04, 1.4622828143473866E+04, -2.8951401346529910E+04, 2.0167398338405819E+04, -6.6607899119515532E+03, 7.8515926628964587E+02}; - constexpr CUFINUFFT_FLT c9[] = {-1.0147176570533524E+02, -3.5304284183527621E+01, 1.3576976854816689E+03, -4.3921059353471846E+03, 7.3232085265419046E+03, -7.3232085280635902E+03, 4.3921059363220147E+03, -1.3576976854281722E+03, 3.5304284184270628E+01, 1.0147176570551520E+02}; - constexpr CUFINUFFT_FLT c10[] = {-4.3161545259395531E+01, 1.5498490982051828E+02, -3.1771250772612478E+02, 3.7215448793727404E+02, -1.7181762882439287E+02, -1.7181763008770599E+02, 3.7215448759715150E+02, -3.1771250770992856E+02, 1.5498490982321766E+02, -4.3161545259481535E+01}; - constexpr CUFINUFFT_FLT c11[] = {-4.2916172038404330E+00, 1.7402146068709751E+01, -4.7947588102062113E+01, 9.2697697983158491E+01, -1.2821427595919303E+02, 1.2821427694451660E+02, -9.2697698629471930E+01, 4.7947588133767717E+01, -1.7402146075416606E+01, 4.2916172038784923E+00}; - constexpr CUFINUFFT_FLT c12[] = {3.5357495062947814E-01, -1.2828127005767840E+00, 2.4090120532215455E+00, -2.6448901913160028E+00, 1.1811546776400381E+00, 1.1811568523765217E+00, -2.6448918925210712E+00, 2.4090119216851607E+00, -1.2828127015358992E+00, 3.5357495059093369E-01}; - for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + constexpr FLT c0[] = {2.2594586605749224E+04, 1.3595989066786611E+07, 4.4723032442444932E+08, 3.3781755837397552E+09, 8.6836783895849857E+09, 8.6836783895849838E+09, 3.3781755837397523E+09, 4.4723032442444944E+08, 1.3595989066786496E+07, 2.2594586605749344E+04}; + constexpr FLT c1[] = {1.0729981697645644E+05, 3.0651490267742995E+07, 5.9387966085130477E+08, 2.4434902657508349E+09, 2.0073077861288934E+09, 
-2.0073077861288950E+09, -2.4434902657508330E+09, -5.9387966085130477E+08, -3.0651490267742828E+07, -1.0729981697645634E+05}; + constexpr FLT c2[] = {2.2340399734184612E+05, 3.0258214643190462E+07, 3.1512411458738238E+08, 4.3618276932319850E+08, -7.8178848450497270E+08, -7.8178848450497031E+08, 4.3618276932319820E+08, 3.1512411458738214E+08, 3.0258214643190324E+07, 2.2340399734184553E+05}; + constexpr FLT c3[] = {2.6917433004353492E+05, 1.6875651476661246E+07, 7.4664745481963649E+07, -9.5882157211117968E+07, -2.0622994435532477E+08, 2.0622994435532823E+08, 9.5882157211118430E+07, -7.4664745481963366E+07, -1.6875651476661157E+07, -2.6917433004353428E+05}; + constexpr FLT c4[] = {2.0818422772177897E+05, 5.6084730690362593E+06, 1.4435118192351859E+06, -4.0063869969544269E+07, 3.2803674392747816E+07, 3.2803674392746560E+07, -4.0063869969546124E+07, 1.4435118192352206E+06, 5.6084730690362155E+06, 2.0818422772177868E+05}; + constexpr FLT c5[] = {1.0781139496011086E+05, 9.9202615851199278E+05, -3.3266265543961083E+06, -4.8557049011452327E+05, 1.0176155522772400E+07, -1.0176155522773268E+07, 4.8557049011599307E+05, 3.3266265543962419E+06, -9.9202615851196356E+05, -1.0781139496011072E+05}; + constexpr FLT c6[] = {3.7380102688153638E+04, 1.2716675000361241E+04, -6.2163527451762755E+05, 1.4157962667184302E+06, -8.4419693137719855E+05, -8.4419693137682532E+05, 1.4157962667184921E+06, -6.2163527451772091E+05, 1.2716675000342160E+04, 3.7380102688153478E+04}; + constexpr FLT c7[] = {8.1238936393894573E+03, -3.4872365530440075E+04, 2.3913680325287874E+04, 1.2428850301835715E+05, -3.2158255329711520E+05, 3.2158255329964001E+05, -1.2428850301842803E+05, -2.3913680325138281E+04, 3.4872365530466821E+04, -8.1238936393894610E+03}; + constexpr FLT c8[] = {7.8515926628982811E+02, -6.6607899119346384E+03, 2.0167398338412942E+04, -2.8951401344643764E+04, 1.4622828141516249E+04, 1.4622828142773422E+04, -2.8951401346273171E+04, 2.0167398338466974E+04, -6.6607899119428766E+03, 
7.8515926628979298E+02}; + constexpr FLT c9[] = {-1.0147176570538747E+02, -3.5304284178326540E+01, 1.3576976855470537E+03, -4.3921059355373945E+03, 7.3232085265656797E+03, -7.3232085282537992E+03, 4.3921059362506849E+03, -1.3576976853984515E+03, 3.5304284186128150E+01, 1.0147176570552679E+02}; + constexpr FLT c10[] = {-4.3161545259359876E+01, 1.5498490982726668E+02, -3.1771250761814974E+02, 3.7215448796966825E+02, -1.7181762811175784E+02, -1.7181762918070896E+02, 3.7215448823960344E+02, -3.1771250765054128E+02, 1.5498490982861634E+02, -4.3161545259484186E+01}; + constexpr FLT c11[] = {-4.2916172038642904E+00, 1.7402146073587435E+01, -4.7947588063038118E+01, 9.2697697961204668E+01, -1.2821427624698006E+02, 1.2821427667135228E+02, -9.2697698383138089E+01, 4.7947588092305367E+01, -1.7402146072063207E+01, 4.2916172038214455E+00}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); } else if (w==11) { - constexpr CUFINUFFT_FLT c0[] = {3.7794653219809574E+04, 3.4782300224660799E+07, 1.6188020733727567E+09, 1.7196758809615021E+10, 6.3754384857724678E+10, 9.7196447559193558E+10, 6.3754384857724640E+10, 1.7196758809615005E+10, 1.6188020733727570E+09, 3.4782300224660806E+07, 3.7794653219808897E+04}; - constexpr CUFINUFFT_FLT c1[] = {1.8969206922085880E+05, 8.4769319065313682E+07, 2.4230555767723408E+09, 1.5439732722639105E+10, 2.7112836839612309E+10, 2.9154817084916870E-06, -2.7112836839612320E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05}; - constexpr CUFINUFFT_FLT c2[] = {4.2138380313901423E+05, 9.2050522922791898E+07, 1.5259983101266611E+09, 4.7070559561237154E+09, -1.2448027572952452E+09, -1.0161446790279312E+10, -1.2448027572952352E+09, 4.7070559561237249E+09, 1.5259983101266615E+09, 9.2050522922791868E+07, 4.2138380313901143E+05}; - constexpr CUFINUFFT_FLT c3[] = {5.4814313598122017E+05, 
5.8085130777589574E+07, 4.9484006166551107E+08, 1.6222124676640958E+08, -2.0440440381345322E+09, -1.0628188648962249E-06, 2.0440440381345263E+09, -1.6222124676641047E+08, -4.9484006166551083E+08, -5.8085130777589560E+07, -5.4814313598121691E+05}; - constexpr CUFINUFFT_FLT c4[] = {4.6495183529254969E+05, 2.3067199578027174E+07, 6.9832590192482829E+07, -2.2024799260683161E+08, -1.2820270942587741E+08, 5.1017181199130940E+08, -1.2820270942587276E+08, -2.2024799260684022E+08, 6.9832590192482591E+07, 2.3067199578027155E+07, 4.6495183529254753E+05}; - constexpr CUFINUFFT_FLT c5[] = {2.7021781043532968E+05, 5.6764510325100143E+06, -5.5650761736747762E+06, -3.9907385617900737E+07, 7.2453390663686648E+07, 3.7361048615190248E-06, -7.2453390663685605E+07, 3.9907385617898554E+07, 5.5650761736747930E+06, -5.6764510325100180E+06, -2.7021781043532834E+05}; - constexpr CUFINUFFT_FLT c6[] = {1.0933249308680615E+05, 6.9586821127986431E+05, -3.6860240321940281E+06, 2.7428169457723838E+06, 8.3392008440598147E+06, -1.6402201025051240E+07, 8.3392008440649221E+06, 2.7428169457788388E+06, -3.6860240321937916E+06, 6.9586821127989038E+05, 1.0933249308680584E+05}; - constexpr CUFINUFFT_FLT c7[] = {3.0203516161820480E+04, -3.6879059542777912E+04, -4.1141031216801296E+05, 1.4111389975270075E+06, -1.5914376635392811E+06, 6.6766157119460594E-07, 1.5914376635341521E+06, -1.4111389975270815E+06, 4.1141031216760987E+05, 3.6879059542751726E+04, -3.0203516161820367E+04}; - constexpr CUFINUFFT_FLT c8[] = {5.1670143574922804E+03, -2.8613147115365118E+04, 4.3560195427108687E+04, 4.8438679581840552E+04, -2.5856630639330545E+05, 3.7994883866097208E+05, -2.5856630640124826E+05, 4.8438679578319818E+04, 4.3560195426824532E+04, -2.8613147115371667E+04, 5.1670143574923577E+03}; - constexpr CUFINUFFT_FLT c9[] = {3.0888018539742444E+02, -3.7949446187516196E+03, 1.4313303205035631E+04, -2.6681600236925929E+04, 2.3856005161221132E+04, -2.3276789125970764E-06, -2.3856005160840708E+04, 2.6681600234072768E+04, 
-1.4313303205083184E+04, 3.7949446187479048E+03, -3.0888018539723868E+02}; - constexpr CUFINUFFT_FLT c10[] = {-8.3747489794255131E+01, 1.1948077479810485E+02, 4.8528498025870488E+02, -2.5024391115619069E+03, 5.3511195350414373E+03, -6.7655484152307990E+03, 5.3511195328171416E+03, -2.5024391120801879E+03, 4.8528498023710927E+02, 1.1948077481025226E+02, -8.3747489794331599E+01}; - constexpr CUFINUFFT_FLT c11[] = {-2.2640047135555928E+01, 9.0840898549317998E+01, -2.1597187568776889E+02, 3.1511229085836396E+02, -2.4856618287164540E+02, 1.6489710183426948E-06, 2.4856618404233313E+02, -3.1511228957061689E+02, 2.1597187534632059E+02, -9.0840898568829203E+01, 2.2640047135641577E+01}; - constexpr CUFINUFFT_FLT c12[] = {-1.6306382885945303E+00, 7.3325946569413265E+00, -2.3241017814397217E+01, 5.1715493697385526E+01, -8.2673003927086967E+01, 9.6489715222659115E+01, -8.2673013187251925E+01, 5.1715492855550593E+01, -2.3241018165160245E+01, 7.3325946421432624E+00, -1.6306382886373367E+00}; - constexpr CUFINUFFT_FLT c13[] = {2.4409286936442823E-01, -7.8803147249892458E-01, 1.6467143668339987E+00, -2.1898241453519685E+00, 1.6350102449767006E+00, -1.1782931558589478E-06, -1.6350139430218933E+00, 2.1898230913723329E+00, -1.6467144225690411E+00, 7.8803147709023735E-01, -2.4409286927983653E-01}; - for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + constexpr FLT c0[] = {3.7794653219809712E+04, 3.4782300224660814E+07, 1.6188020733727572E+09, 1.7196758809615025E+10, 6.3754384857724686E+10, 9.7196447559193588E+10, 6.3754384857724686E+10, 1.7196758809615013E+10, 1.6188020733727574E+09, 3.4782300224660836E+07, 3.7794653219808912E+04}; + constexpr FLT c1[] = {1.8969206922085886E+05, 8.4769319065313712E+07, 2.4230555767723413E+09, 1.5439732722639107E+10, 2.7112836839612331E+10, 7.5382856415600940E-06, -2.7112836839612324E+10, 
-1.5439732722639109E+10, -2.4230555767723413E+09, -8.4769319065313712E+07, -1.8969206922085691E+05}; + constexpr FLT c2[] = {4.2138380313901440E+05, 9.2050522922791913E+07, 1.5259983101266618E+09, 4.7070559561237240E+09, -1.2448027572952247E+09, -1.0161446790279316E+10, -1.2448027572952359E+09, 4.7070559561237249E+09, 1.5259983101266608E+09, 9.2050522922791883E+07, 4.2138380313901132E+05}; + constexpr FLT c3[] = {5.4814313598122029E+05, 5.8085130777589604E+07, 4.9484006166551131E+08, 1.6222124676641059E+08, -2.0440440381345210E+09, 1.6029666825264191E-05, 2.0440440381345406E+09, -1.6222124676640612E+08, -4.9484006166551065E+08, -5.8085130777589574E+07, -5.4814313598121749E+05}; + constexpr FLT c4[] = {4.6495183529254969E+05, 2.3067199578027170E+07, 6.9832590192482471E+07, -2.2024799260683393E+08, -1.2820270942588173E+08, 5.1017181199129957E+08, -1.2820270942587103E+08, -2.2024799260683718E+08, 6.9832590192482680E+07, 2.3067199578027181E+07, 4.6495183529254753E+05}; + constexpr FLT c5[] = {2.7021781043532956E+05, 5.6764510325100170E+06, -5.5650761736746123E+06, -3.9907385617899098E+07, 7.2453390663685441E+07, 1.3807321808330796E-06, -7.2453390663686499E+07, 3.9907385617896959E+07, 5.5650761736744791E+06, -5.6764510325100273E+06, -2.7021781043532840E+05}; + constexpr FLT c6[] = {1.0933249308680632E+05, 6.9586821127988759E+05, -3.6860240321936086E+06, 2.7428169457744057E+06, 8.3392008440658972E+06, -1.6402201025049815E+07, 8.3392008440622678E+06, 2.7428169457778567E+06, -3.6860240321934861E+06, 6.9586821127989655E+05, 1.0933249308680571E+05}; + constexpr FLT c7[] = {3.0203516161820731E+04, -3.6879059542738614E+04, -4.1141031216769724E+05, 1.4111389975281695E+06, -1.5914376635274226E+06, 6.7631682826831895E-06, 1.5914376635404355E+06, -1.4111389975219201E+06, 4.1141031216798135E+05, 3.6879059542753101E+04, -3.0203516161820640E+04}; + constexpr FLT c8[] = {5.1670143574923986E+03, -2.8613147115359603E+04, 4.3560195427027051E+04, 4.8438679581734432E+04, 
-2.5856630639957223E+05, 3.7994883866286115E+05, -2.5856630639708077E+05, 4.8438679579228658E+04, 4.3560195427174098E+04, -2.8613147115353891E+04, 5.1670143574923814E+03}; + constexpr FLT c9[] = {3.0888018539742438E+02, -3.7949446187486474E+03, 1.4313303205130735E+04, -2.6681600236165083E+04, 2.3856005159699442E+04, -1.9072153968212169E-06, -2.3856005160079862E+04, 2.6681600234262976E+04, -1.4313303204940523E+04, 3.7949446187568205E+03, -3.0888018539723868E+02}; + constexpr FLT c10[] = {-8.3747489794178762E+01, 1.1948077481430271E+02, 4.8528498043145930E+02, -2.5024391100070475E+03, 5.3511195380863319E+03, -6.7655484103934950E+03, 5.3511195323636521E+03, -2.5024391101798296E+03, 4.8528498086337265E+02, 1.1948077483184566E+02, -8.3747489794339316E+01}; + constexpr FLT c11[] = {-2.2640047135393669E+01, 9.0840898559070766E+01, -2.1597187557069051E+02, 3.1511228970473707E+02, -2.4856618213020064E+02, -2.0962600056762836E-06, 2.4856618232531096E+02, -3.1511228707801843E+02, 2.1597187541459934E+02, -9.0840898577362736E+01, 2.2640047135479467E+01}; + constexpr FLT c12[] = {-1.6306382885603201E+00, 7.3325946574893264E+00, -2.3241017691629008E+01, 5.1715493346619120E+01, -8.2673008978082819E+01, 9.6489716906321945E+01, -8.2673008978083388E+01, 5.1715493276466965E+01, -2.3241017744243891E+01, 7.3325946602297218E+00, -1.6306382886202573E+00}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); } else if (w==12) { - constexpr CUFINUFFT_FLT c0[] = {6.1722991679853112E+04, 8.4789650417103723E+07, 5.4431675199498730E+09, 7.8788892335272293E+10, 4.0355760945670062E+11, 8.8071481911347974E+11, 8.8071481911347998E+11, 4.0355760945670068E+11, 7.8788892335272491E+10, 5.4431675199498854E+09, 8.4789650417103767E+07, 6.1722991679871629E+04}; - constexpr CUFINUFFT_FLT c1[] = {3.2561466099406150E+05, 2.2112758120210624E+08, 8.9911609880089817E+09, 
8.3059508064200928E+10, 2.3965569143469864E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210624E+08, -3.2561466099404282E+05}; - constexpr CUFINUFFT_FLT c2[] = {7.6621098001581465E+05, 2.6026568260310274E+08, 6.4524338253008652E+09, 3.3729904113826797E+10, 2.8555202212474010E+10, -6.8998572040731583E+10, -6.8998572040731506E+10, 2.8555202212474064E+10, 3.3729904113826805E+10, 6.4524338253008747E+09, 2.6026568260310277E+08, 7.6621098001583852E+05}; - constexpr CUFINUFFT_FLT c3[] = {1.0657807616803222E+06, 1.8144472126890999E+08, 2.5524827004349856E+09, 5.2112383911371746E+09, -1.0268350564014614E+10, -1.4763245309081245E+10, 1.4763245309081299E+10, 1.0268350564014664E+10, -5.2112383911371031E+09, -2.5524827004349875E+09, -1.8144472126890990E+08, -1.0657807616803090E+06}; - constexpr CUFINUFFT_FLT c4[] = {9.7829638830158743E+05, 8.2222351241519973E+07, 5.5676911894064891E+08, -4.8739037675424922E+08, -2.7153428193077750E+09, 2.5627633609246840E+09, 2.5627633609247112E+09, -2.7153428193078070E+09, -4.8739037675429451E+08, 5.5676911894064677E+08, 8.2222351241519928E+07, 9.7829638830161165E+05}; - constexpr CUFINUFFT_FLT c5[] = {6.2536876825113979E+05, 2.4702814073680263E+07, 4.1488431554845832E+07, -2.9274790542418414E+08, 1.0742154109193267E+08, 6.2185168968029702E+08, -6.2185168968023658E+08, -1.0742154109185636E+08, 2.9274790542422676E+08, -4.1488431554844096E+07, -2.4702814073680244E+07, -6.2536876825112442E+05}; - constexpr CUFINUFFT_FLT c6[] = {2.8527714307528478E+05, 4.6266378435690189E+06, -1.0665598090791209E+07, -2.6048960239906937E+07, 9.1597254427339226E+07, -5.9794495983323507E+07, -5.9794495983287223E+07, 9.1597254427330941E+07, -2.6048960239925586E+07, -1.0665598090793334E+07, 4.6266378435690831E+06, 2.8527714307530422E+05}; - constexpr CUFINUFFT_FLT c7[] = {9.2873647411234240E+04, 3.6630046787428786E+05, -3.1271047224731087E+06, 4.8612412939261831E+06, 
3.3820440907802135E+06, -1.6880127953711823E+07, 1.6880127953682471E+07, -3.3820440907974164E+06, -4.8612412939092657E+06, 3.1271047224737639E+06, -3.6630046787430649E+05, -9.2873647411216807E+04}; - constexpr CUFINUFFT_FLT c8[] = {2.0817947751046187E+04, -5.5660303410280452E+04, -1.9519783923293054E+05, 1.0804817251338358E+06, -1.8264985852948832E+06, 9.7602844964432076E+05, 9.7602844962242560E+05, -1.8264985853129351E+06, 1.0804817251129062E+06, -1.9519783923449527E+05, -5.5660303410338929E+04, 2.0817947751063308E+04}; - constexpr CUFINUFFT_FLT c9[] = {2.7986023314784748E+03, -1.9404411093600604E+04, 4.3922624999853564E+04, -7.6450317375817094E+03, -1.5273911976404345E+05, 3.3223441450299282E+05, -3.3223441454103496E+05, 1.5273911977621692E+05, 7.6450317497551932E+03, -4.3922624998426982E+04, 1.9404411093646668E+04, -2.7986023314644040E+03}; - constexpr CUFINUFFT_FLT c10[] = {6.7849020474186844E+01, -1.7921351307934926E+03, 8.4980694693463538E+03, -1.9742624859078383E+04, 2.4620674878200782E+04, -1.1676544885779787E+04, -1.1676544871958942E+04, 2.4620674838120303E+04, -1.9742624835582923E+04, 8.4980694640771490E+03, -1.7921351307934922E+03, 6.7849020488748664E+01}; - constexpr CUFINUFFT_FLT c11[] = {-5.4577020998847871E+01, 1.3637112866755427E+02, 4.5513615487589092E+01, -1.1174001343792290E+03, 3.2018769324922364E+03, -5.0580351333780654E+03, 5.0580351424313239E+03, -3.2018769362383905E+03, 1.1174000937955741E+03, -4.5513610843875405E+01, -1.3637112870657899E+02, 5.4577021011919037E+01}; - constexpr CUFINUFFT_FLT c12[] = {-1.0538365872424132E+01, 4.6577222490846609E+01, -1.2606964180937365E+02, 2.1881091191930210E+02, -2.3273402308837001E+02, 1.0274273857329082E+02, 1.0274268020620094E+02, -2.3273404553726701E+02, 2.1881091276113446E+02, -1.2606964815819696E+02, 4.6577222438230805E+01, -1.0538365860846021E+01}; - constexpr CUFINUFFT_FLT c13[] = {-4.6087004128022252E-01, 2.5969759424153827E+00, -9.6946930749915676E+00, 2.4990050007153755E+01, 
-4.6013920149683365E+01, 6.2056948047986317E+01, -6.2056981293939970E+01, 4.6013908245461884E+01, -2.4990038356462701E+01, 9.6946952377382889E+00, -2.5969759165384922E+00, 4.6087004737535314E-01}; + constexpr FLT c0[] = {6.1722991679853279E+04, 8.4789650417103827E+07, 5.4431675199498749E+09, 7.8788892335272308E+10, 4.0355760945670074E+11, 8.8071481911347998E+11, 8.8071481911348035E+11, 4.0355760945670081E+11, 7.8788892335272507E+10, 5.4431675199498901E+09, 8.4789650417103752E+07, 6.1722991679871782E+04}; + constexpr FLT c1[] = {3.2561466099406185E+05, 2.2112758120210630E+08, 8.9911609880089836E+09, 8.3059508064200958E+10, 2.3965569143469873E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201111E+10, -8.9911609880090008E+09, -2.2112758120210621E+08, -3.2561466099404270E+05}; + constexpr FLT c2[] = {7.6621098001581512E+05, 2.6026568260310283E+08, 6.4524338253008652E+09, 3.3729904113826836E+10, 2.8555202212474079E+10, -6.8998572040731476E+10, -6.8998572040731461E+10, 2.8555202212474102E+10, 3.3729904113826820E+10, 6.4524338253008747E+09, 2.6026568260310283E+08, 7.6621098001583782E+05}; + constexpr FLT c3[] = {1.0657807616803222E+06, 1.8144472126891005E+08, 2.5524827004349880E+09, 5.2112383911371851E+09, -1.0268350564014641E+10, -1.4763245309081160E+10, 1.4763245309081381E+10, 1.0268350564014679E+10, -5.2112383911371050E+09, -2.5524827004349866E+09, -1.8144472126890993E+08, -1.0657807616803094E+06}; + constexpr FLT c4[] = {9.7829638830158766E+05, 8.2222351241520002E+07, 5.5676911894064677E+08, -4.8739037675425845E+08, -2.7153428193078089E+09, 2.5627633609246616E+09, 2.5627633609247270E+09, -2.7153428193078089E+09, -4.8739037675429344E+08, 5.5676911894064772E+08, 8.2222351241519988E+07, 9.7829638830161223E+05}; + constexpr FLT c5[] = {6.2536876825113979E+05, 2.4702814073680259E+07, 4.1488431554846764E+07, -2.9274790542417943E+08, 1.0742154109192364E+08, 6.2185168968026125E+08, -6.2185168968025279E+08, -1.0742154109186378E+08, 
2.9274790542422217E+08, -4.1488431554844894E+07, -2.4702814073680248E+07, -6.2536876825112430E+05}; + constexpr FLT c6[] = {2.8527714307528501E+05, 4.6266378435690925E+06, -1.0665598090789001E+07, -2.6048960239884529E+07, 9.1597254427304730E+07, -5.9794495983325504E+07, -5.9794495983230442E+07, 9.1597254427350238E+07, -2.6048960239922173E+07, -1.0665598090794679E+07, 4.6266378435690831E+06, 2.8527714307530370E+05}; + constexpr FLT c7[] = {9.2873647411234633E+04, 3.6630046787437343E+05, -3.1271047224703613E+06, 4.8612412939389814E+06, 3.3820440907783178E+06, -1.6880127953644276E+07, 1.6880127953794900E+07, -3.3820440907782884E+06, -4.8612412938910574E+06, 3.1271047224760642E+06, -3.6630046787425788E+05, -9.2873647411217215E+04}; + constexpr FLT c8[] = {2.0817947751046311E+04, -5.5660303410283603E+04, -1.9519783923352187E+05, 1.0804817251249440E+06, -1.8264985852847320E+06, 9.7602844964054180E+05, 9.7602844964026869E+05, -1.8264985852578641E+06, 1.0804817251242315E+06, -1.9519783923298802E+05, -5.5660303410281354E+04, 2.0817947751063894E+04}; + constexpr FLT c9[] = {2.7986023314783351E+03, -1.9404411093657811E+04, 4.3922625001185028E+04, -7.6450317330166517E+03, -1.5273911976404343E+05, 3.3223441450907954E+05, -3.3223441450755787E+05, 1.5273911981578072E+05, 7.6450317512768770E+03, -4.3922624998712294E+04, 1.9404411093676386E+04, -2.7986023314643107E+03}; + constexpr FLT c10[] = {6.7849020474217255E+01, -1.7921351307610907E+03, 8.4980694701237535E+03, -1.9742624848712727E+04, 2.4620674811515193E+04, -1.1676544936917096E+04, -1.1676544845699163E+04, 2.4620674862652242E+04, -1.9742624819688928E+04, 8.4980694644226842E+03, -1.7921351307503089E+03, 6.7849020488654887E+01}; + constexpr FLT c11[] = {-5.4577020998540995E+01, 1.3637112871144197E+02, 4.5513617165591533E+01, -1.1174001347694452E+03, 3.2018768920645603E+03, -5.0580352089258022E+03, 5.0580351705274497E+03, -3.2018769484133886E+03, 1.1174001005075061E+03, -4.5513609907370189E+01, -1.3637112869192950E+02, 
5.4577021011650153E+01}; + constexpr FLT c12[] = {-1.0538365872663764E+01, 4.6577222493036992E+01, -1.2606964247581806E+02, 2.1881090265912360E+02, -2.3273404104747246E+02, 1.0274271612440927E+02, 1.0274271612440242E+02, -2.3273400063947102E+02, 2.1881092482740195E+02, -1.2606964693052080E+02, 4.6577222495229805E+01, -1.0538365860486415E+01}; + constexpr FLT c13[] = {-4.6087004138254672E-01, 2.5969759057927089E+00, -9.6946928123584506E+00, 2.4990051638288470E+01, -4.6013914134428035E+01, 6.2056955095902744E+01, -6.2056967309552682E+01, 4.6013924603270830E+01, -2.4990037679831403E+01, 9.6946951024178141E+00, -2.5969758989770559E+00, 4.6087004739949022E-01}; for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else if (w==13) { - constexpr CUFINUFFT_FLT c0[] = {9.8715725867495639E+04, 1.9828875496808118E+08, 1.7196758809614998E+10, 3.3083776881353607E+11, 2.2668873993375444E+12, 6.7734720591167598E+12, 9.6695220682534824E+12, 6.7734720591167471E+12, 2.2668873993375439E+12, 3.3083776881353534E+11, 1.7196758809614998E+10, 1.9828875496807906E+08, 9.8715725867495537E+04}; - constexpr CUFINUFFT_FLT c1[] = {5.4491110456935503E+05, 5.4903670125539362E+08, 3.0879465445278172E+10, 3.9588436413399951E+11, 1.6860562536749778E+12, 2.4256447893117881E+12, 3.7318165868693593E-04, -2.4256447893117856E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538874E+08, -5.4491110456935491E+05}; - constexpr CUFINUFFT_FLT c2[] = {1.3504711883426066E+06, 6.9286979077463162E+08, 2.4618123595484562E+10, 1.9493985627722598E+11, 3.9422703517046326E+11, -1.8678883613919931E+11, -8.5538079834550146E+11, -1.8678883613919705E+11, 3.9422703517046338E+11, 1.9493985627722586E+11, 2.4618123595484554E+10, 6.9286979077462578E+08, 1.3504711883426069E+06}; - constexpr CUFINUFFT_FLT c3[] = {1.9937206140846494E+06, 
5.2512029493765986E+08, 1.1253303793811754E+10, 4.6205527735932175E+10, -1.1607472377983284E+10, -1.6305241755642325E+11, 1.3350300616010507E-04, 1.6305241755642365E+11, 1.1607472377982744E+10, -4.6205527735932228E+10, -1.1253303793811750E+10, -5.2512029493765610E+08, -1.9937206140846484E+06}; - constexpr CUFINUFFT_FLT c4[] = {1.9607419630386413E+06, 2.6425362558103889E+08, 3.1171259341747255E+09, 2.9839860297840505E+09, -1.9585031917561890E+10, -5.0666917387055302E+09, 3.6568794485482079E+10, -5.0666917387051382E+09, -1.9585031917561581E+10, 2.9839860297839398E+09, 3.1171259341747217E+09, 2.6425362558103737E+08, 1.9607419630386410E+06}; - constexpr CUFINUFFT_FLT c5[] = {1.3593773865640301E+06, 9.1556445104158148E+07, 4.7074012944133490E+08, -1.1192579335657711E+09, -2.1090780087868552E+09, 5.2270306737949314E+09, 1.0058570913473114E-03, -5.2270306737942495E+09, 2.1090780087878082E+09, 1.1192579335658059E+09, -4.7074012944133729E+08, -9.1556445104157895E+07, -1.3593773865640303E+06}; - constexpr CUFINUFFT_FLT c6[] = {6.8417206432039186E+05, 2.1561705510027003E+07, 7.5785249892988410E+06, -2.7456096030230397E+08, 3.4589095671043062E+08, 4.0256106808852541E+08, -1.0074306926606210E+09, 4.0256106809059316E+08, 3.4589095670995283E+08, -2.7456096030234104E+08, 7.5785249893005500E+06, 2.1561705510027427E+07, 6.8417206432039267E+05}; - constexpr CUFINUFFT_FLT c7[] = {2.5248269397037479E+05, 3.0985559672615193E+06, -1.1816517087617906E+07, -8.2958498770340970E+06, 8.0546642347242445E+07, -1.0594657799535300E+08, -4.1868673222825360E-04, 1.0594657799426495E+08, -8.0546642347729877E+07, 8.2958498770339396E+06, 1.1816517087613177E+07, -3.0985559672620757E+06, -2.5248269397037491E+05}; - constexpr CUFINUFFT_FLT c8[] = {6.7530100970876083E+04, 1.2373362326659705E+05, -2.1245597183259744E+06, 5.1047323238916462E+06, -1.4139444405955642E+06, -1.1818267554953648E+07, 2.0121548577168033E+07, -1.1818267556967378E+07, -1.4139444400679788E+06, 5.1047323236808330E+06, 
-2.1245597183310925E+06, 1.2373362326704434E+05, 6.7530100970875879E+04}; - constexpr CUFINUFFT_FLT c9[] = {1.2421368748960791E+04, -5.0576243646949319E+04, -4.8878193435000605E+04, 6.5307896868984913E+05, -1.5497610128277773E+06, 1.5137725915373438E+06, 2.4159142842753925E-04, -1.5137725925842635E+06, 1.5497610128277773E+06, -6.5307896858028776E+05, 4.8878193437283131E+04, 5.0576243646456518E+04, -1.2421368748960884E+04}; - constexpr CUFINUFFT_FLT c10[] = {1.2904654687546160E+03, -1.1169946055063081E+04, 3.3275109714208906E+04, -3.1765222279764806E+04, -5.9810981980285695E+04, 2.2355863005975721E+05, -3.1083591689740209E+05, 2.2355863472015061E+05, -5.9810982676856896E+04, -3.1765222445615127E+04, 3.3275109711790254E+04, -1.1169946054458416E+04, 1.2904654687550794E+03}; - constexpr CUFINUFFT_FLT c11[] = {-1.9043622268985253E+01, -6.8296542226098870E+02, 4.2702512255472038E+03, -1.2165497337805051E+04, 1.9423733200245264E+04, -1.6010024156865491E+04, -1.8587318864580292E-04, 1.6010021504569266E+04, -1.9423732997327170E+04, 1.2165497443946821E+04, -4.2702512314786209E+03, 6.8296542157807858E+02, 1.9043622268681840E+01}; - constexpr CUFINUFFT_FLT c12[] = {-3.0093984465812213E+01, 9.8972865698526618E+01, -9.7437039087669007E+01, -3.5079927282955276E+02, 1.5699250476860170E+03, -3.1287441993042225E+03, 3.8692185175061472E+03, -3.1287462825609659E+03, 1.5699252631952513E+03, -3.5079945803284346E+02, -9.7437044419281492E+01, 9.8972866145746991E+01, -3.0093984466256714E+01}; - constexpr CUFINUFFT_FLT c13[] = {-4.3050286009571908E+00, 2.1108975820085092E+01, -6.4297196365104938E+01, 1.2922885252832501E+02, -1.6991814421468084E+02, 1.2655005406584399E+02, -2.7552199668252238E-05, -1.2655093214380580E+02, 1.6991796275475141E+02, -1.2922893349406868E+02, 6.4297198822227926E+01, -2.1108976183295965E+01, 4.3050286010617569E+00}; - constexpr CUFINUFFT_FLT c14[] = {-1.0957333744888972E-01, 7.2949316377828033E-01, -3.4300810538238449E+00, 1.0470062030552395E+01, 
-2.2292087310650142E+01, 3.4570674930666925E+01, -3.9923385381532697E+01, 3.4573472104415345E+01, -2.2292369892227434E+01, 1.0470053799441445E+01, -3.4300825281782954E+00, 7.2949352704193948E-01, -1.0957333730383595E-01}; + constexpr FLT c0[] = {9.8715725867495858E+04, 1.9828875496808127E+08, 1.7196758809615005E+10, 3.3083776881353601E+11, 2.2668873993375454E+12, 6.7734720591167598E+12, 9.6695220682534863E+12, 6.7734720591167490E+12, 2.2668873993375454E+12, 3.3083776881353540E+11, 1.7196758809615013E+10, 1.9828875496807912E+08, 9.8715725867495596E+04}; + constexpr FLT c1[] = {5.4491110456935561E+05, 5.4903670125539398E+08, 3.0879465445278194E+10, 3.9588436413399976E+11, 1.6860562536749780E+12, 2.4256447893117891E+12, 5.2271652473787576E-04, -2.4256447893117861E+12, -1.6860562536749771E+12, -3.9588436413399896E+11, -3.0879465445278202E+10, -5.4903670125538874E+08, -5.4491110456935479E+05}; + constexpr FLT c2[] = {1.3504711883426080E+06, 6.9286979077463174E+08, 2.4618123595484570E+10, 1.9493985627722617E+11, 3.9422703517046405E+11, -1.8678883613919846E+11, -8.5538079834550037E+11, -1.8678883613919666E+11, 3.9422703517046375E+11, 1.9493985627722595E+11, 2.4618123595484570E+10, 6.9286979077462602E+08, 1.3504711883426073E+06}; + constexpr FLT c3[] = {1.9937206140846505E+06, 5.2512029493766004E+08, 1.1253303793811764E+10, 4.6205527735932259E+10, -1.1607472377982828E+10, -1.6305241755642276E+11, 1.6137900538478137E-04, 1.6305241755642496E+11, 1.1607472377982767E+10, -4.6205527735932159E+10, -1.1253303793811754E+10, -5.2512029493765628E+08, -1.9937206140846501E+06}; + constexpr FLT c4[] = {1.9607419630386413E+06, 2.6425362558103913E+08, 3.1171259341747184E+09, 2.9839860297840395E+09, -1.9585031917561905E+10, -5.0666917387060509E+09, 3.6568794485482040E+10, -5.0666917387052479E+09, -1.9585031917561382E+10, 2.9839860297839293E+09, 3.1171259341747251E+09, 2.6425362558103746E+08, 1.9607419630386424E+06}; + constexpr FLT c5[] = {1.3593773865640303E+06, 9.1556445104158297E+07, 
4.7074012944133645E+08, -1.1192579335656993E+09, -2.1090780087868536E+09, 5.2270306737954664E+09, 5.5914317801530834E-04, -5.2270306737946453E+09, 2.1090780087878797E+09, 1.1192579335657849E+09, -4.7074012944133860E+08, -9.1556445104157880E+07, -1.3593773865640303E+06}; + constexpr FLT c6[] = {6.8417206432039291E+05, 2.1561705510027312E+07, 7.5785249893027432E+06, -2.7456096030220407E+08, 3.4589095671070045E+08, 4.0256106808935356E+08, -1.0074306926604354E+09, 4.0256106809054130E+08, 3.4589095671009880E+08, -2.7456096030236250E+08, 7.5785249893008731E+06, 2.1561705510027334E+07, 6.8417206432039256E+05}; + constexpr FLT c7[] = {2.5248269397037590E+05, 3.0985559672617475E+06, -1.1816517087615140E+07, -8.2958498769974122E+06, 8.0546642347458601E+07, -1.0594657799513456E+08, 2.0249720264016184E-04, 1.0594657799514198E+08, -8.0546642347324282E+07, 8.2958498771580132E+06, 1.1816517087620620E+07, -3.0985559672620827E+06, -2.5248269397037590E+05}; + constexpr FLT c8[] = {6.7530100970876185E+04, 1.2373362326675311E+05, -2.1245597183288219E+06, 5.1047323238642653E+06, -1.4139444406972022E+06, -1.1818267556148527E+07, 2.0121548578311723E+07, -1.1818267556689126E+07, -1.4139444399964837E+06, 5.1047323237335468E+06, -2.1245597183262822E+06, 1.2373362326715943E+05, 6.7530100970876825E+04}; + constexpr FLT c9[] = {1.2421368748960511E+04, -5.0576243646858849E+04, -4.8878193436522284E+04, 6.5307896871419600E+05, -1.5497610128521242E+06, 1.5137725913425679E+06, 9.4288709689637382E-06, -1.5137725926086102E+06, 1.5497610130712469E+06, -6.5307896859246108E+05, 4.8878193441087336E+04, 5.0576243646517250E+04, -1.2421368748960882E+04}; + constexpr FLT c10[] = {1.2904654687548632E+03, -1.1169946054771519E+04, 3.3275109715936509E+04, -3.1765222282529230E+04, -5.9810982046625119E+04, 2.2355863065128919E+05, -3.1083591717381903E+05, 2.2355863453495159E+05, -5.9810982317515191E+04, -3.1765222420737289E+04, 3.3275109716627514E+04, -1.1169946054393644E+04, 1.2904654687550840E+03}; + constexpr 
FLT c11[] = {-1.9043622268214964E+01, -6.8296542209517031E+02, 4.2702512258593224E+03, -1.2165497344048174E+04, 1.9423733117203814E+04, -1.6010024763745962E+04, 3.4546242756821764E-04, 1.6010021562009399E+04, -1.9423732921465795E+04, 1.2165497485154361E+04, -4.2702512258593424E+03, 6.8296542155861471E+02, 1.9043622268233225E+01}; + constexpr FLT c12[] = {-3.0093984466084923E+01, 9.8972865759901183E+01, -9.7437038386122609E+01, -3.5079929976821143E+02, 1.5699249129925884E+03, -3.1287450613413444E+03, 3.8692192717886201E+03, -3.1287461388880197E+03, 1.5699252721748373E+03, -3.5079941874733129E+02, -9.7437038807041006E+01, 9.8972866294818274E+01, -3.0093984465708520E+01}; + constexpr FLT c13[] = {-4.3050286012574066E+00, 2.1108975856232256E+01, -6.4297196943170974E+01, 1.2922884719917388E+02, -1.6991815434264092E+02, 1.2654996803592717E+02, -1.3650372630766216E-04, -1.2655097304483594E+02, 1.6991801475807023E+02, -1.2922895886683040E+02, 6.4297199778482565E+01, -2.1108976173160116E+01, 4.3050286010444170E+00}; + constexpr FLT c14[] = {-1.0957333734356203E-01, 7.2949328697697935E-01, -3.4300803257592030E+00, 1.0470037850609911E+01, -2.2292132783546631E+01, 3.4570970759468082E+01, -3.9923502981338281E+01, 3.4573363471454584E+01, -2.2292171023236033E+01, 1.0470076090299283E+01, -3.4300793014818574E+00, 7.2949361239845723E-01, -1.0957333723937021E-01}; for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); } else if (w==14) { - constexpr CUFINUFFT_FLT c0[] = {1.5499533202966300E+05, 4.4723032442444748E+08, 5.1495083701694786E+10, 1.2904576022918081E+12, 1.1534950432785512E+13, 4.5650102198520516E+13, 8.8830582190032688E+13, 8.8830582190032672E+13, 4.5650102198520516E+13, 1.1534950432785535E+13, 1.2904576022918081E+12, 5.1495083701695145E+10, 4.4723032442444843E+08, 1.5499533202970150E+05}; - constexpr CUFINUFFT_FLT 
c1[] = {8.9188339002980455E+05, 1.3065352538728631E+09, 9.9400185225815582E+10, 1.7136059013402410E+12, 1.0144146621675832E+13, 2.3034036018490723E+13, 1.4630967270448867E+13, -1.4630967270448859E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402410E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979524E+05}; - constexpr CUFINUFFT_FLT c2[] = {2.3170473769379673E+06, 1.7532505043698246E+09, 8.6523535958354294E+10, 9.7455289065487329E+11, 3.2977972139362295E+12, 1.7874626001697771E+12, -6.1480918082634004E+12, -6.1480918082633994E+12, 1.7874626001697695E+12, 3.2977972139362256E+12, 9.7455289065487366E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; - constexpr CUFINUFFT_FLT c3[] = {3.6089249230396431E+06, 1.4278058213962190E+09, 4.4296625537022438E+10, 2.9466624630419812E+11, 3.1903621584503357E+11, -9.8834691411254529E+11, -1.1072264714919219E+12, 1.1072264714919253E+12, 9.8834691411255261E+11, -3.1903621584503473E+11, -2.9466624630419775E+11, -4.4296625537022629E+10, -1.4278058213962216E+09, -3.6089249230396645E+06}; - constexpr CUFINUFFT_FLT c4[] = {3.7733555140851741E+06, 7.8376718099107432E+08, 1.4443117772349600E+10, 4.3197433307419121E+10, -7.6585042240582489E+10, -1.8569640140761731E+11, 2.0385335192658878E+11, 2.0385335192657968E+11, -1.8569640140762405E+11, -7.6585042240578430E+10, 4.3197433307418945E+10, 1.4443117772349699E+10, 7.8376718099107552E+08, 3.7733555140852556E+06}; - constexpr CUFINUFFT_FLT c5[] = {2.8079157920112349E+06, 3.0340753492383713E+08, 2.9498136661747351E+09, -6.2820200387927818E+08, -2.2372008390622681E+10, 1.5217518660587118E+10, 4.0682590266889229E+10, -4.0682590266876595E+10, -1.5217518660581694E+10, 2.2372008390624306E+10, 6.2820200387922049E+08, -2.9498136661747746E+09, -3.0340753492383796E+08, -2.8079157920112382E+06}; - constexpr CUFINUFFT_FLT c6[] = {1.5361613559533113E+06, 8.3513615594416350E+07, 3.0077547202707732E+08, -1.3749596754069650E+09, 
-6.6733027297582805E+08, 5.9590333632825184E+09, -4.3025685566887646E+09, -4.3025685566943264E+09, 5.9590333632825480E+09, -6.6733027297550666E+08, -1.3749596754065177E+09, 3.0077547202710402E+08, 8.3513615594416887E+07, 1.5361613559533583E+06}; - constexpr CUFINUFFT_FLT c7[] = {6.2759409419592936E+05, 1.5741723594963074E+07, -1.5632610223404476E+07, -1.9294824907080847E+08, 4.4643806532363749E+08, 1.5178998383416286E+07, -9.6771139892184162E+08, 9.6771139891756535E+08, -1.5178998386503356E+07, -4.4643806533349395E+08, 1.9294824907058707E+08, 1.5632610223392753E+07, -1.5741723594962660E+07, -6.2759409419590654E+05}; - constexpr CUFINUFFT_FLT c8[] = {1.9151404903933575E+05, 1.7156606891565928E+06, -9.7733523156610541E+06, 4.2982266236283993E+06, 5.1660907884816565E+07, -1.1279400211055294E+08, 6.4701089573887214E+07, 6.4701089567399226E+07, -1.1279400211297083E+08, 5.1660907891780980E+07, 4.2982266233826252E+06, -9.7733523156971950E+06, 1.7156606891561027E+06, 1.9151404903936631E+05}; - constexpr CUFINUFFT_FLT c9[] = {4.2715272622844830E+04, -2.2565910608684317E+03, -1.1769776156829668E+06, 4.0078399908543471E+06, -3.8951858064309461E+06, -5.0944610762301283E+06, 1.6765992441460442E+07, -1.6765992436785825E+07, 5.0944610781778852E+06, 3.8951858054570677E+06, -4.0078399907569592E+06, 1.1769776157156830E+06, 2.2565910609040961E+03, -4.2715272622820310E+04}; - constexpr CUFINUFFT_FLT c10[] = {6.4806786522791654E+03, -3.5474227032931303E+04, 1.8237100723206047E+04, 3.0934714627485734E+05, -1.0394703921956274E+06, 1.4743920336239333E+06, -7.3356882129423053E+05, -7.3356882916659222E+05, 1.4743920340662012E+06, -1.0394703928590287E+06, 3.0934714634119731E+05, 1.8237100680361433E+04, -3.5474227032996088E+04, 6.4806786523011797E+03}; - constexpr CUFINUFFT_FLT c11[] = {4.9913632908432180E+02, -5.5416668526903932E+03, 2.0614058707628108E+04, -3.2285139177838235E+04, -5.3099560012237780E+03, 1.1559000312360718E+05, -2.2569743818692098E+05, 2.2569743267254104E+05, 
-1.1559000606061178E+05, 5.3099530192621614E+03, 3.2285139062955688E+04, -2.0614058671415001E+04, 5.5416668535488525E+03, -4.9913632906175445E+02}; - constexpr CUFINUFFT_FLT c12[] = {-3.3076333188770995E+01, -1.8970588549665433E+02, 1.8160423465108606E+03, -6.3715702906684537E+03, 1.2525623712293716E+04, -1.4199809613604592E+04, 6.4441857815348694E+03, 6.4441852068443368E+03, -1.4199811050333730E+04, 1.2525626046977848E+04, -6.3715705510753096E+03, 1.8160422724294601E+03, -1.8970588700494130E+02, -3.3076333169380085E+01}; - constexpr CUFINUFFT_FLT c13[] = {-1.4394533627757088E+01, 5.7000699312246105E+01, -1.0101141802233408E+02, -3.2954042015367456E+01, 6.1417873351558330E+02, -1.6177281811377129E+03, 2.4593356854220169E+03, -2.4593356782637338E+03, 1.6177289006539679E+03, -6.1417987494681950E+02, 3.2954142200289709E+01, 1.0101142888658896E+02, -5.7000698890466253E+01, 1.4394533639134110E+01}; - constexpr CUFINUFFT_FLT c14[] = {-1.5925952286169334E+00, 8.5113929411519127E+00, -2.8993517494090959E+01, 6.6373419665690747E+01, -1.0329523947888029E+02, 1.0280172537525394E+02, -4.3894765605046906E+01, -4.3897466711581743E+01, 1.0280269421314661E+02, -1.0329529425338121E+02, 6.6373405476301841E+01, -2.8993535416845578E+01, 8.5113925602355138E+00, -1.5925952196632756E+00}; - constexpr CUFINUFFT_FLT c15[] = {1.5984868375087002E-02, 1.2876155307218357E-01, -9.8359379953002779E-01, 3.7711056267887488E+00, -9.4307026856950991E+00, 1.6842022255882348E+01, -2.2310401016395307E+01, 2.2307954998498516E+01, -1.6843279237301534E+01, 9.4308852877255891E+00, -3.7711056267887488E+00, 9.8361025494556609E-01, -1.2876093931172500E-01, -1.5984859319657936E-02}; + constexpr FLT c0[] = {1.5499533202966311E+05, 4.4723032442444772E+08, 5.1495083701694801E+10, 1.2904576022918081E+12, 1.1534950432785514E+13, 4.5650102198520523E+13, 8.8830582190032719E+13, 8.8830582190032734E+13, 4.5650102198520523E+13, 1.1534950432785541E+13, 1.2904576022918088E+12, 5.1495083701695160E+10, 
4.4723032442444867E+08, 1.5499533202970124E+05}; + constexpr FLT c1[] = {8.9188339002980455E+05, 1.3065352538728638E+09, 9.9400185225815598E+10, 1.7136059013402412E+12, 1.0144146621675834E+13, 2.3034036018490723E+13, 1.4630967270448885E+13, -1.4630967270448867E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402415E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979419E+05}; + constexpr FLT c2[] = {2.3170473769379673E+06, 1.7532505043698251E+09, 8.6523535958354309E+10, 9.7455289065487476E+11, 3.2977972139362329E+12, 1.7874626001697834E+12, -6.1480918082633936E+12, -6.1480918082634014E+12, 1.7874626001697737E+12, 3.2977972139362251E+12, 9.7455289065487329E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; + constexpr FLT c3[] = {3.6089249230396431E+06, 1.4278058213962200E+09, 4.4296625537022446E+10, 2.9466624630419830E+11, 3.1903621584503467E+11, -9.8834691411254578E+11, -1.1072264714919094E+12, 1.1072264714919380E+12, 9.8834691411255481E+11, -3.1903621584503326E+11, -2.9466624630419788E+11, -4.4296625537022636E+10, -1.4278058213962224E+09, -3.6089249230396668E+06}; + constexpr FLT c4[] = {3.7733555140851745E+06, 7.8376718099107444E+08, 1.4443117772349586E+10, 4.3197433307418678E+10, -7.6585042240583893E+10, -1.8569640140762125E+11, 2.0385335192658521E+11, 2.0385335192658505E+11, -1.8569640140762244E+11, -7.6585042240577591E+10, 4.3197433307418831E+10, 1.4443117772349697E+10, 7.8376718099107611E+08, 3.7733555140852574E+06}; + constexpr FLT c5[] = {2.8079157920112340E+06, 3.0340753492383713E+08, 2.9498136661747241E+09, -6.2820200387946582E+08, -2.2372008390623741E+10, 1.5217518660587065E+10, 4.0682590266890762E+10, -4.0682590266874344E+10, -1.5217518660581593E+10, 2.2372008390624836E+10, 6.2820200387926054E+08, -2.9498136661747794E+09, -3.0340753492383808E+08, -2.8079157920112382E+06}; + constexpr FLT c6[] = {1.5361613559533129E+06, 8.3513615594416931E+07, 3.0077547202709264E+08, 
-1.3749596754065564E+09, -6.6733027297578251E+08, 5.9590333632812872E+09, -4.3025685566868906E+09, -4.3025685566947279E+09, 5.9590333632843285E+09, -6.6733027297604084E+08, -1.3749596754066198E+09, 3.0077547202708143E+08, 8.3513615594416305E+07, 1.5361613559533581E+06}; + constexpr FLT c7[] = {6.2759409419593017E+05, 1.5741723594963871E+07, -1.5632610223386128E+07, -1.9294824907063219E+08, 4.4643806532504034E+08, 1.5178998384579189E+07, -9.6771139891231704E+08, 9.6771139892423606E+08, -1.5178998381071322E+07, -4.4643806533015347E+08, 1.9294824907069016E+08, 1.5632610223408137E+07, -1.5741723594963046E+07, -6.2759409419590794E+05}; + constexpr FLT c8[] = {1.9151404903933618E+05, 1.7156606891565623E+06, -9.7733523156695794E+06, 4.2982266232611798E+06, 5.1660907884888940E+07, -1.1279400211171694E+08, 6.4701089576848499E+07, 6.4701089570801638E+07, -1.1279400210612530E+08, 5.1660907893511616E+07, 4.2982266235306170E+06, -9.7733523156822342E+06, 1.7156606891565854E+06, 1.9151404903936735E+05}; + constexpr FLT c9[] = {4.2715272622844263E+04, -2.2565910611002505E+03, -1.1769776156928577E+06, 4.0078399906352242E+06, -3.8951858073074366E+06, -5.0944610789569877E+06, 1.6765992441849992E+07, -1.6765992434448514E+07, 5.0944610797360903E+06, 3.8951858063335577E+06, -4.0078399906595708E+06, 1.1769776157202481E+06, 2.2565910608803192E+03, -4.2715272622819932E+04}; + constexpr FLT c10[] = {6.4806786522801558E+03, -3.5474227032715331E+04, 1.8237100734263218E+04, 3.0934714642964909E+05, -1.0394703930801603E+06, 1.4743920316337310E+06, -7.3356881642929500E+05, -7.3356882324020052E+05, 1.4743920364765557E+06, -1.0394703915764539E+06, 3.0934714676135289E+05, 1.8237100683125096E+04, -3.5474227032952876E+04, 6.4806786523017845E+03}; + constexpr FLT c11[] = {4.9913632908494827E+02, -5.5416668522806276E+03, 2.0614058722611946E+04, -3.2285139157855901E+04, -5.3099566255893524E+03, 1.1559000150525174E+05, -2.2569743273246771E+05, 2.2569743457059452E+05, -1.1559000428242185E+05, 
5.3099542679931265E+03, 3.2285138893125553E+04, -2.0614058670789782E+04, 5.5416668532562171E+03, -4.9913632906264002E+02}; + constexpr FLT c12[] = {-3.3076333188696488E+01, -1.8970588558436827E+02, 1.8160423493169353E+03, -6.3715703265863249E+03, 1.2525624646166696E+04, -1.4199807314837786E+04, 6.4441944019082612E+03, 6.4441857815347785E+03, -1.4199805590763088E+04, 1.2525627375951648E+04, -6.3715703355659844E+03, 1.8160422864600705E+03, -1.8970588672434647E+02, -3.3076333168693779E+01}; + constexpr FLT c13[] = {-1.4394533628062636E+01, 5.7000699174526638E+01, -1.0101142144442984E+02, -3.2954074617159108E+01, 6.1417869930814436E+02, -1.6177306801656998E+03, 2.4593354137960296E+03, -2.4593361954696252E+03, 1.6177288934831954E+03, -6.1417959264939657E+02, 3.2954074617159108E+01, 1.0101142929606195E+02, -5.7000698932570963E+01, 1.4394533639244566E+01}; + constexpr FLT c14[] = {-1.5925952284527973E+00, 8.5113930275160214E+00, -2.8993510636695618E+01, 6.6373557362227814E+01, -1.0329536491693236E+02, 1.0280181071020283E+02, -4.3891122033571499E+01, -4.3893656778687756E+01, 1.0280325289276884E+02, -1.0329444716438918E+02, 6.6373666618482872E+01, -2.8993528390837142E+01, 8.5113926647511526E+00, -1.5925952190335899E+00}; + constexpr FLT c15[] = {1.5984868634272537E-02, 1.2876168577716327E-01, -9.8358742969178536E-01, 3.7710928871122080E+00, -9.4315137784350505E+00, 1.6840408563519507E+01, -2.2308532530501328E+01, 2.2310146222863779E+01, -1.6843058416240989E+01, 9.4311230950209399E+00, -3.7712287769953385E+00, 9.8360653920659347E-01, -1.2876103884046056E-01, -1.5984859595043394E-02}; for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); } else if (w==15) { - constexpr CUFINUFFT_FLT c0[] = {2.3939707792241831E+05, 9.7700272582690263E+08, 1.4715933396485272E+11, 4.7242424833337188E+12, 
5.3987426629953602E+13, 2.7580474290566097E+14, 7.0693378336533425E+14, 9.6196578554477812E+14, 7.0693378336533450E+14, 2.7580474290566138E+14, 5.3987426629953812E+13, 4.7242424833337275E+12, 1.4715933396485272E+11, 9.7700272582690227E+08, 2.3939707792241947E+05}; - constexpr CUFINUFFT_FLT c1[] = {1.4314487885226030E+06, 2.9961416925358467E+09, 3.0273361232748438E+11, 6.8507333793903594E+12, 5.4192702756911008E+13, 1.7551587948105312E+14, 2.1874615668430153E+14, 7.1650878467891699E-02, -2.1874615668430153E+14, -1.7551587948105331E+14, -5.4192702756911164E+13, -6.8507333793903701E+12, -3.0273361232748431E+11, -2.9961416925358462E+09, -1.4314487885226023E+06}; - constexpr CUFINUFFT_FLT c2[] = {3.8829497354762917E+06, 4.2473082696966438E+09, 2.8414312556015527E+11, 4.3688281331121411E+12, 2.1823119508000523E+13, 3.2228098609392012E+13, -2.1833085454691871E+13, -7.3750710225100922E+13, -2.1833085454691941E+13, 3.2228098609392000E+13, 2.1823119508000590E+13, 4.3688281331121475E+12, 2.8414312556015521E+11, 4.2473082696966453E+09, 3.8829497354762908E+06}; - constexpr CUFINUFFT_FLT c3[] = {6.3495763451755773E+06, 3.6841035003733954E+09, 1.5965774278321045E+11, 1.5630338683778196E+12, 3.8749058615819282E+12, -2.7319740087723496E+12, -1.3233342822865416E+13, 1.2094759019991106E-03, 1.3233342822865408E+13, 2.7319740087723706E+12, -3.8749058615819390E+12, -1.5630338683778196E+12, -1.5965774278321036E+11, -3.6841035003733935E+09, -6.3495763451755773E+06}; - constexpr CUFINUFFT_FLT c4[] = {7.0146619045520434E+06, 2.1782897863065772E+09, 5.8897780310148148E+10, 3.1953009601770453E+11, 4.0651527030852091E+08, -1.6379148273275527E+12, -1.1568753136999574E+11, 2.7451653250461855E+12, -1.1568753137002715E+11, -1.6379148273276675E+12, 4.0651527030276263E+08, 3.1953009601770386E+11, 5.8897780310148087E+10, 2.1782897863065767E+09, 7.0146619045520416E+06}; - constexpr CUFINUFFT_FLT c5[] = {5.5580012413990181E+06, 9.2345162185944223E+08, 1.4522950934020067E+10, 2.7025952371212223E+10, 
-1.2304576967641710E+11, -1.0116752717201025E+11, 3.8517418245457495E+11, 1.1720185410178396E-01, -3.8517418245448737E+11, 1.0116752717220248E+11, 1.2304576967643900E+11, -2.7025952371215157E+10, -1.4522950934020073E+10, -9.2345162185944128E+08, -5.5580012413990190E+06}; - constexpr CUFINUFFT_FLT c6[] = {3.2693972344231787E+06, 2.8610260147425157E+08, 2.2348528403750129E+09, -3.4574515574239435E+09, -1.7480626463586948E+10, 3.1608597465528339E+10, 1.9879262560041798E+10, -6.6148013553832657E+10, 1.9879262560029728E+10, 3.1608597465497307E+10, -1.7480626463581020E+10, -3.4574515574192748E+09, 2.2348528403750839E+09, 2.8610260147425318E+08, 3.2693972344231806E+06}; - constexpr CUFINUFFT_FLT c7[] = {1.4553539959296260E+06, 6.4136842048383795E+07, 1.3622336582061595E+08, -1.2131510424646864E+09, 6.4322366984170294E+08, 4.5078753872136936E+09, -7.1689413747181644E+09, -1.1786171556070136E-02, 7.1689413746620741E+09, -4.5078753875125484E+09, -6.4322366985783029E+08, 1.2131510424602287E+09, -1.3622336582069945E+08, -6.4136842048384361E+07, -1.4553539959296270E+06}; - constexpr CUFINUFFT_FLT c8[] = {4.9358776531681529E+05, 9.7772970960589685E+06, -2.3511574237970300E+07, -1.0142613816602133E+08, 3.9421144218642426E+08, -2.8449115593954617E+08, -5.7549243245203042E+08, 1.1608781631399941E+09, -5.7549243247572994E+08, -2.8449115597919518E+08, 3.9421144214433813E+08, -1.0142613816466759E+08, -2.3511574237996321E+07, 9.7772970960581861E+06, 4.9358776531681448E+05}; - constexpr CUFINUFFT_FLT c9[] = {1.2660319987326673E+05, 7.7519511328176421E+05, -6.5244610661542173E+06, 9.0878257489026226E+06, 2.3116605620370809E+07, -8.7079594480778053E+07, 9.5542733720576629E+07, 4.2723164545317951E-02, -9.5542733670714036E+07, 8.7079594586736053E+07, -2.3116605561938088E+07, -9.0878257517268714E+06, 6.5244610661359569E+06, -7.7519511328043276E+05, -1.2660319987326747E+05}; - constexpr CUFINUFFT_FLT c10[] = {2.3793325531458449E+04, -4.2305332803592217E+04, -5.2884156986641441E+05, 
2.5307340140247596E+06, -4.0404175229102052E+06, -1.7519991511035681E+05, 1.0146438775036881E+07, -1.5828545434039038E+07, 1.0146438771144925E+07, -1.7520004460626876E+05, -4.0404175749208611E+06, 2.5307340154400147E+06, -5.2884156982771575E+05, -4.2305332803462676E+04, 2.3793325531458788E+04}; - constexpr CUFINUFFT_FLT c11[] = {2.9741655196842516E+03, -2.0687056404176896E+04, 3.3295507782231041E+04, 1.0661145714339131E+05, -5.6644238113375264E+05, 1.0874811579280477E+06, -9.6561272951275646E+05, -5.1287199081408294E-03, 9.6561272024221742E+05, -1.0874812519522079E+06, 5.6644242684715183E+05, -1.0661145918131116E+05, -3.3295507839673090E+04, 2.0687056403552484E+04, -2.9741655196846054E+03}; - constexpr CUFINUFFT_FLT c12[] = {1.5389176594851995E+02, -2.3864418514303975E+03, 1.0846266940782971E+04, -2.2940053288728755E+04, 1.4780109856545603E+04, 4.2663625334078126E+04, -1.3047651001642903E+05, 1.7468402233671257E+05, -1.3047651921148783E+05, 4.2663543727874072E+04, 1.4780033422571960E+04, -2.2940053360564565E+04, 1.0846266911599001E+04, -2.3864418523423406E+03, 1.5389176594715920E+02}; - constexpr CUFINUFFT_FLT c13[] = {-2.3857631312189291E+01, -1.9651605604649610E+01, 6.4183085202559698E+02, -2.8648428618202479E+03, 6.8249256924540387E+03, -9.7944454945500202E+03, 7.6177717113307281E+03, 1.2047808031005401E-02, -7.6177543637173221E+03, 9.7944303211006554E+03, -6.8249067869823548E+03, 2.8648410033462715E+03, -6.4183084900019139E+02, 1.9651606442715156E+01, 2.3857631312384541E+01}; - constexpr CUFINUFFT_FLT c14[] = {-6.1348505741956316E+00, 2.7872916029950378E+01, -6.5819949282243059E+01, 5.1366943137229264E+01, 1.7214074364107390E+02, -6.9658313160417026E+02, 1.3192072946885612E+03, -1.6053709652649356E+03, 1.3192033489278531E+03, -6.9663899461741221E+02, 1.7211498258980890E+02, 5.1367587332701412E+01, -6.5819942079787495E+01, 2.7872915852722411E+01, -6.1348505745937754E+00}; - constexpr CUFINUFFT_FLT c15[] = {-4.9671584494050897E-01, 3.0617548962871655E+00, 
-1.1650680501534040E+01, 3.0081518778147480E+01, -5.4027643304315461E+01, 6.6072752684824721E+01, -4.7155420133398515E+01, -5.6540863480770403E-03, 4.7158681490594240E+01, -6.6050534688928863E+01, 5.4059169757207428E+01, -3.0081909461561551E+01, 1.1650669885136919E+01, -3.0617550621683702E+00, 4.9671584460032286E-01}; - constexpr CUFINUFFT_FLT c16[] = {4.3460787769280373E-03, -1.3199805974685097E-02, -1.9413550415167488E-01, 1.1330353009743728E+00, -3.4412627904689330E+00, 7.1628360506506050E+00, -1.1104833360853762E+01, 1.2402582581952625E+01, -1.1114919494696498E+01, 7.0930736249049993E+00, -3.4864402649728556E+00, 1.1323392526753271E+00, -1.9415335680557039E-01, -1.3200242030886846E-02, 4.3460779753541788E-03}; + constexpr FLT c0[] = {2.3939707792242090E+05, 9.7700272582690299E+08, 1.4715933396485275E+11, 4.7242424833337236E+12, 5.3987426629953617E+13, 2.7580474290566103E+14, 7.0693378336533425E+14, 9.6196578554477850E+14, 7.0693378336533425E+14, 2.7580474290566153E+14, 5.3987426629953828E+13, 4.7242424833337285E+12, 1.4715933396485275E+11, 9.7700272582690418E+08, 2.3939707792242119E+05}; + constexpr FLT c1[] = {1.4314487885226035E+06, 2.9961416925358462E+09, 3.0273361232748425E+11, 6.8507333793903604E+12, 5.4192702756911016E+13, 1.7551587948105316E+14, 2.1874615668430153E+14, 5.4722295550654096E-02, -2.1874615668430156E+14, -1.7551587948105334E+14, -5.4192702756911172E+13, -6.8507333793903730E+12, -3.0273361232748438E+11, -2.9961416925358448E+09, -1.4314487885226023E+06}; + constexpr FLT c2[] = {3.8829497354762922E+06, 4.2473082696966453E+09, 2.8414312556015533E+11, 4.3688281331121431E+12, 2.1823119508000547E+13, 3.2228098609392133E+13, -2.1833085454691801E+13, -7.3750710225100750E+13, -2.1833085454691875E+13, 3.2228098609392070E+13, 2.1823119508000590E+13, 4.3688281331121470E+12, 2.8414312556015527E+11, 4.2473082696966438E+09, 3.8829497354762908E+06}; + constexpr FLT c3[] = {6.3495763451755792E+06, 3.6841035003733959E+09, 1.5965774278321054E+11, 
1.5630338683778213E+12, 3.8749058615819409E+12, -2.7319740087722651E+12, -1.3233342822865350E+13, 1.2682483963161023E-01, 1.3233342822865453E+13, 2.7319740087724204E+12, -3.8749058615819307E+12, -1.5630338683778201E+12, -1.5965774278321042E+11, -3.6841035003733950E+09, -6.3495763451755783E+06}; + constexpr FLT c4[] = {7.0146619045520453E+06, 2.1782897863065763E+09, 5.8897780310148117E+10, 3.1953009601770477E+11, 4.0651527030195397E+08, -1.6379148273275671E+12, -1.1568753137013023E+11, 2.7451653250461045E+12, -1.1568753137006947E+11, -1.6379148273276748E+12, 4.0651527030228132E+08, 3.1953009601770502E+11, 5.8897780310148155E+10, 2.1782897863065772E+09, 7.0146619045520453E+06}; + constexpr FLT c5[] = {5.5580012413990172E+06, 9.2345162185944211E+08, 1.4522950934020031E+10, 2.7025952371212032E+10, -1.2304576967641461E+11, -1.0116752717201025E+11, 3.8517418245450385E+11, 1.3143739157465117E-02, -3.8517418245443384E+11, 1.0116752717219414E+11, 1.2304576967643431E+11, -2.7025952371216137E+10, -1.4522950934020092E+10, -9.2345162185944176E+08, -5.5580012413990181E+06}; + constexpr FLT c6[] = {3.2693972344231815E+06, 2.8610260147425276E+08, 2.2348528403751349E+09, -3.4574515574230409E+09, -1.7480626463581440E+10, 3.1608597465590984E+10, 1.9879262560063576E+10, -6.6148013553869423E+10, 1.9879262560078850E+10, 3.1608597465530212E+10, -1.7480626463573368E+10, -3.4574515574202504E+09, 2.2348528403750744E+09, 2.8610260147425228E+08, 3.2693972344231787E+06}; + constexpr FLT c7[] = {1.4553539959296281E+06, 6.4136842048384696E+07, 1.3622336582072574E+08, -1.2131510424637468E+09, 6.4322366984755766E+08, 4.5078753872548027E+09, -7.1689413747004452E+09, 3.2111361580040181E-03, 7.1689413747369127E+09, -4.5078753874649162E+09, -6.4322366984639454E+08, 1.2131510424612916E+09, -1.3622336582064471E+08, -6.4136842048384838E+07, -1.4553539959296265E+06}; + constexpr FLT c8[] = {4.9358776531681791E+05, 9.7772970960583091E+06, -2.3511574237971250E+07, -1.0142613816625430E+08, 
3.9421144217985487E+08, -2.8449115594571364E+08, -5.7549243248595941E+08, 1.1608781630719392E+09, -5.7549243238966489E+08, -2.8449115596289498E+08, 3.9421144214631909E+08, -1.0142613816300942E+08, -2.3511574237913735E+07, 9.7772970960591603E+06, 4.9358776531681628E+05}; + constexpr FLT c9[] = {1.2660319987326709E+05, 7.7519511328105081E+05, -6.5244610661542164E+06, 9.0878257490973976E+06, 2.3116605621149909E+07, -8.7079594477661625E+07, 9.5542733670714021E+07, -3.4623017322338634E-02, -9.5542733658248380E+07, 8.7079594589852452E+07, -2.3116605559600774E+07, -9.0878257518242579E+06, 6.5244610661450867E+06, -7.7519511328086059E+05, -1.2660319987326671E+05}; + constexpr FLT c10[] = {2.3793325531461589E+04, -4.2305332802771904E+04, -5.2884156975031609E+05, 2.5307340145554747E+06, -4.0404175204335153E+06, -1.7519988538994591E+05, 1.0146438798034744E+07, -1.5828545528861172E+07, 1.0146438794496680E+07, -1.7520001842407117E+05, -4.0404175643064296E+06, 2.5307340160591919E+06, -5.2884156977243477E+05, -4.2305332802771285E+04, 2.3793325531458995E+04}; + constexpr FLT c11[] = {2.9741655196857741E+03, -2.0687056403629973E+04, 3.3295507834673197E+04, 1.0661145690364030E+05, -5.6644238449031080E+05, 1.0874811673184116E+06, -9.6561276275880623E+05, -7.6207036577648435E-02, 9.6561275636531680E+05, -1.0874812580259521E+06, 5.6644242612787138E+05, -1.0661145858193116E+05, -3.3295507822185595E+04, 2.0687056403005630E+04, -2.9741655196852739E+03}; + constexpr FLT c12[] = {1.5389176594840404E+02, -2.3864418517811582E+03, 1.0846266965476148E+04, -2.2940053899336592E+04, 1.4780105833703366E+04, 4.2663634529139046E+04, -1.3047650082135458E+05, 1.7468394417865420E+05, -1.3047642955960588E+05, 4.2663569014305380E+04, 1.4780038020101238E+04, -2.2940052498526344E+04, 1.0846266965476338E+04, -2.3864418513602504E+03, 1.5389176594853458E+02}; + constexpr FLT c13[] = {-2.3857631312306911E+01, -1.9651606200276817E+01, 6.4183084244784663E+02, -2.8648428291977302E+03, 6.8249248253356263E+03, 
-9.7944434082514545E+03, 7.6177566999585488E+03, -4.8285923071218206E-02, -7.6177709934185850E+03, 9.7944219680614005E+03, -6.8249060651693289E+03, 2.8648407633460843E+03, -6.4183085466149657E+02, 1.9651606115081155E+01, 2.3857631312306911E+01}; + constexpr FLT c14[] = {-6.1348505726741482E+00, 2.7872916302350376E+01, -6.5819898558168433E+01, 5.1367134246654771E+01, 1.7214275703496423E+02, -6.9657243183240860E+02, 1.3192259272931558E+03, -1.6054145588281010E+03, 1.3192138654025996E+03, -6.9662907027505264E+02, 1.7212038135392731E+02, 5.1368095701697484E+01, -6.5819904020980715E+01, 2.7872916473063263E+01, -6.1348505738411490E+00}; + constexpr FLT c15[] = {-4.9671584422774523E-01, 3.0617550953446120E+00, -1.1650665638577927E+01, 3.0081331929557447E+01, -5.4030564936801589E+01, 6.6075844179663960E+01, -4.7176211285519123E+01, -3.4313439732287163E-02, 4.7173085818207042E+01, -6.6061100127341888E+01, 5.4056655794367416E+01, -3.0081722612971500E+01, 1.1650665638577902E+01, -3.0617553939307713E+00, 4.9671584448693240E-01}; + constexpr FLT c16[] = {4.3460783761337983E-03, -1.3199934226522787E-02, -1.9412503880258877E-01, 1.1325756464362078E+00, -3.4439944517155450E+00, 7.1653575841078521E+00, -1.1108195405465501E+01, 1.2348789868125033E+01, -1.1088023137785596E+01, 7.0939141360622937E+00, -3.4847592426682690E+00, 1.1324705825441117E+00, -1.9413837699275374E-01, -1.3199908576142469E-02, 4.3460782759542488E-03}; for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); } else if (w==16) { - constexpr CUFINUFFT_FLT c0[] = {3.6434551345571090E+05, 2.0744705928579485E+09, 4.0355760945670044E+11, 1.6364575388763037E+13, 2.3514830376056556E+14, 1.5192201717462535E+15, 4.9956173084674140E+15, 8.9287666945127430E+15, 8.9287666945127430E+15, 4.9956173084674140E+15, 1.5192201717462535E+15, 
2.3514830376056556E+14, 1.6364575388763041E+13, 4.0355760945670050E+11, 2.0744705928579490E+09, 3.6434551345570857E+05}; - constexpr CUFINUFFT_FLT c1[] = {2.2576246485480363E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, 2.5606844387131055E+13, 2.6313738449330159E+14, 1.1495095100701462E+15, 2.1932582707747572E+15, 1.2860244365132600E+15, -1.2860244365132588E+15, -2.1932582707747572E+15, -1.1495095100701462E+15, -2.6313738449330169E+14, -2.5606844387131066E+13, -8.7873753526056323E+11, -6.6499571180086451E+09, -2.2576246485480368E+06}; - constexpr CUFINUFFT_FLT c2[] = {6.3730995546265068E+06, 9.9060026035198040E+09, 8.8097248605448987E+11, 1.7953384130753676E+13, 1.2398425545001648E+14, 3.0749346493041212E+14, 1.0259777520247089E+14, -5.5291976457534288E+14, -5.5291976457534375E+14, 1.0259777520247070E+14, 3.0749346493041225E+14, 1.2398425545001656E+14, 1.7953384130753684E+13, 8.8097248605449011E+11, 9.9060026035198078E+09, 6.3730995546265068E+06}; - constexpr CUFINUFFT_FLT c3[] = {1.0896915393078225E+07, 9.0890343524593887E+09, 5.3565169504010028E+11, 7.3004206720038701E+12, 2.9692333044160082E+13, 1.6051737468109645E+13, -9.1273329108089531E+13, -8.5999306918502797E+13, 8.5999306918501641E+13, 9.1273329108090062E+13, -1.6051737468109594E+13, -2.9692333044160074E+13, -7.3004206720038711E+12, -5.3565169504010034E+11, -9.0890343524593887E+09, -1.0896915393078221E+07}; - constexpr CUFINUFFT_FLT c4[] = {1.2655725616100587E+07, 5.7342804054544201E+09, 2.1822836608899588E+11, 1.8300700858999731E+12, 2.7770431049858564E+12, -8.5034969223847109E+12, -1.2846668467422201E+13, 1.6519076896574611E+13, 1.6519076896573730E+13, -1.2846668467421688E+13, -8.5034969223849521E+12, 2.7770431049858491E+12, 1.8300700858999692E+12, 2.1822836608899588E+11, 5.7342804054544220E+09, 1.2655725616100591E+07}; - constexpr CUFINUFFT_FLT c5[] = {1.0609303958036324E+07, 2.6255609052371726E+09, 6.1673589426039383E+10, 2.6044432099084976E+11, -3.5431628074578320E+11, -1.6077602129636006E+12, 
1.5534405614729011E+12, 2.8019935380861670E+12, -2.8019935380844810E+12, -1.5534405614727644E+12, 1.6077602129636335E+12, 3.5431628074576636E+11, -2.6044432099085037E+11, -6.1673589426039368E+10, -2.6255609052371726E+09, -1.0609303958036324E+07}; - constexpr CUFINUFFT_FLT c6[] = {6.6544809363384563E+06, 8.9490403680928385E+08, 1.1882638725190760E+10, 8.1552898137784090E+09, -1.2575562817891687E+11, 2.7074695075842178E+10, 3.9453789461922034E+11, -3.1679644857435541E+11, -3.1679644857440692E+11, 3.9453789461951154E+11, 2.7074695076007500E+10, -1.2575562817885344E+11, 8.1552898137852116E+09, 1.1882638725191153E+10, 8.9490403680928493E+08, 6.6544809363384582E+06}; - constexpr CUFINUFFT_FLT c7[] = {3.1906872142824987E+06, 2.2785946180651781E+08, 1.3744578972809656E+09, -4.3997172592913818E+09, -9.2011130754125404E+09, 3.4690551711826530E+10, -9.4227043395316906E+09, -5.9308465069991577E+10, 5.9308465068943581E+10, 9.4227043392705956E+09, -3.4690551712022408E+10, 9.2011130753675175E+09, 4.3997172592866106E+09, -1.3744578972812984E+09, -2.2785946180652174E+08, -3.1906872142824973E+06}; - constexpr CUFINUFFT_FLT c8[] = {1.1821527096621725E+06, 4.2281234059839047E+07, 2.8723226058821958E+07, -8.3553955857311106E+08, 1.2447304829054153E+09, 2.1955280944846683E+09, -7.0514195725593920E+09, 4.3745141235010500E+09, 4.3745141236655197E+09, -7.0514195727234411E+09, 2.1955280942826533E+09, 1.2447304829048812E+09, -8.3553955857841730E+08, 2.8723226058853466E+07, 4.2281234059838966E+07, 1.1821527096621748E+06}; - constexpr CUFINUFFT_FLT c9[] = {3.3854610744280228E+05, 5.2176984975098642E+06, -2.0677283564981934E+07, -3.5831818966960624E+07, 2.6599346104854527E+08, -3.7992777983589816E+08, -1.3426914439904341E+08, 9.1752051209279442E+08, -9.1752051188087845E+08, 1.3426914452369988E+08, 3.7992777987329507E+08, -2.6599346107659298E+08, 3.5831818968129277E+07, 2.0677283565073237E+07, -5.2176984975084374E+06, -3.3854610744280077E+05}; - constexpr CUFINUFFT_FLT c10[] = 
{7.3893334077309293E+04, 2.6983804209740972E+05, -3.6415998560880083E+06, 8.4025485863333493E+06, 4.9278860779347531E+06, -5.1437033824108891E+07, 8.7603898602732122E+07, -4.6199497846299231E+07, -4.6199498219926819E+07, 8.7603898832003579E+07, -5.1437033801464774E+07, 4.9278861005788362E+06, 8.4025485870409794E+06, -3.6415998559663831E+06, 2.6983804209585470E+05, 7.3893334077307591E+04}; - constexpr CUFINUFFT_FLT c11[] = {1.1778892113374410E+04, -4.0077190109195144E+04, -1.8372552183899941E+05, 1.3262878359201169E+06, -2.9738540144900386E+06, 1.9493508843214174E+06, 4.1881949043266159E+06, -1.1066749441324197E+07, 1.1066749225224417E+07, -4.1881949989500660E+06, -1.9493509811827433E+06, 2.9738539876374160E+06, -1.3262878392766861E+06, 1.8372552166916840E+05, 4.0077190106541901E+04, -1.1778892113374635E+04}; - constexpr CUFINUFFT_FLT c12[] = {1.2019749667905517E+03, -1.0378455845905968E+04, 2.6333352626226591E+04, 1.7117060824677988E+04, -2.5133287788479996E+05, 6.4713912423136400E+05, -8.1634971996757365E+05, 3.8623850687193515E+05, 3.8623887467457692E+05, -8.1634999581952032E+05, 6.4713888515965885E+05, -2.5133289397614688E+05, 1.7117056658162492E+04, 2.6333352590306949E+04, -1.0378455846607170E+04, 1.2019749667886601E+03}; - constexpr CUFINUFFT_FLT c13[] = {3.1189837633271310E+01, -8.9083493666530228E+02, 4.9454294721013366E+03, -1.3124691362129612E+04, 1.5834782149156119E+04, 6.9607783053915546E+03, -5.9789949050326162E+04, 1.0841720290002371E+05, -1.0841726183381994E+05, 5.9790023686287932E+04, -6.9607416211385053E+03, -1.5834800728954084E+04, 1.3124692508510609E+04, -4.9454294244132070E+03, 8.9083493795553227E+02, -3.1189837630675466E+01}; - constexpr CUFINUFFT_FLT c14[] = {-1.2975319073318561E+01, 1.8283698900397550E+01, 1.7684013462935113E+02, -1.1059907069976271E+03, 3.1998196269059799E+03, -5.5988285845467362E+03, 5.9248624962359208E+03, -2.5987075415506133E+03, -2.5989297031998472E+03, 5.9249309327755627E+03, -5.5988287659129119E+03, 
3.1998292347735460E+03, -1.1059914993060199E+03, 1.7684017599586443E+02, 1.8283697951655380E+01, -1.2975319075406015E+01}; - constexpr CUFINUFFT_FLT c15[] = {-2.3155118737567935E+00, 1.1938503501764195E+01, -3.4150613932459848E+01, 4.8896713096147266E+01, 1.5844216816345641E+01, -2.4277080939345015E+02, 6.0146058115394737E+02, -8.8748160721868635E+02, 8.8732832343048744E+02, -6.0146927810646923E+02, 2.4275722040513463E+02, -1.5849652411671842E+01, -4.8897528435446198E+01, 3.4150596946224454E+01, -1.1938504032584051E+01, 2.3155118728820292E+00}; - constexpr CUFINUFFT_FLT c16[] = {-1.5401723736175238E-01, 9.8067757197686212E-01, -4.1901188293318530E+00, 1.2150691895619683E+01, -2.4764820628534302E+01, 3.6081462800085532E+01, -3.4534922277532473E+01, 1.2910251318703700E+01, 1.3098525817101535E+01, -3.4588714991360455E+01, 3.5973877372429698E+01, -2.4775747273530602E+01, 1.2149010873312557E+01, -4.1901467369287460E+00, 9.8067700766883559E-01, -1.5401723876450651E-01}; - constexpr CUFINUFFT_FLT c17[] = {1.1808835457017667E-02, -2.5443945538745794E-02, -1.3157119144786456E-04, 2.5877310634925382E-01, -1.0920774586473376E+00, 2.6473618304294715E+00, -4.4448325935254926E+00, 6.8292491990998831E+00, -6.8300632710034588E+00, 4.4643703192113184E+00, -2.6384070394901351E+00, 1.0890246890089277E+00, -2.5849326913239973E-01, 1.4031610447463365E-04, 2.5444280926035151E-02, -1.1808834729180664E-02}; + constexpr FLT c0[] = {3.6434551345571154E+05, 2.0744705928579516E+09, 4.0355760945670056E+11, 1.6364575388763043E+13, 2.3514830376056566E+14, 1.5192201717462540E+15, 4.9956173084674150E+15, 8.9287666945127440E+15, 8.9287666945127440E+15, 4.9956173084674160E+15, 1.5192201717462542E+15, 2.3514830376056566E+14, 1.6364575388763049E+13, 4.0355760945670068E+11, 2.0744705928579512E+09, 3.6434551345570991E+05}; + constexpr FLT c1[] = {2.2576246485480345E+06, 6.6499571180086479E+09, 8.7873753526056311E+11, 2.5606844387131062E+13, 2.6313738449330162E+14, 1.1495095100701470E+15, 
2.1932582707747572E+15, 1.2860244365132608E+15, -1.2860244365132600E+15, -2.1932582707747580E+15, -1.1495095100701462E+15, -2.6313738449330162E+14, -2.5606844387131066E+13, -8.7873753526056299E+11, -6.6499571180086479E+09, -2.2576246485480345E+06}; + constexpr FLT c2[] = {6.3730995546265058E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, 1.7953384130753688E+13, 1.2398425545001667E+14, 3.0749346493041262E+14, 1.0259777520247212E+14, -5.5291976457534244E+14, -5.5291976457534294E+14, 1.0259777520247097E+14, 3.0749346493041212E+14, 1.2398425545001659E+14, 1.7953384130753672E+13, 8.8097248605448987E+11, 9.9060026035198078E+09, 6.3730995546265077E+06}; + constexpr FLT c3[] = {1.0896915393078227E+07, 9.0890343524593887E+09, 5.3565169504010052E+11, 7.3004206720038770E+12, 2.9692333044160145E+13, 1.6051737468109752E+13, -9.1273329108089609E+13, -8.5999306918501562E+13, 8.5999306918502812E+13, 9.1273329108090391E+13, -1.6051737468109348E+13, -2.9692333044160059E+13, -7.3004206720038691E+12, -5.3565169504010046E+11, -9.0890343524593925E+09, -1.0896915393078225E+07}; + constexpr FLT c4[] = {1.2655725616100591E+07, 5.7342804054544220E+09, 2.1822836608899585E+11, 1.8300700858999712E+12, 2.7770431049857900E+12, -8.5034969223848574E+12, -1.2846668467422469E+13, 1.6519076896573322E+13, 1.6519076896573414E+13, -1.2846668467422033E+13, -8.5034969223850078E+12, 2.7770431049858350E+12, 1.8300700858999753E+12, 2.1822836608899594E+11, 5.7342804054544239E+09, 1.2655725616100593E+07}; + constexpr FLT c5[] = {1.0609303958036318E+07, 2.6255609052371716E+09, 6.1673589426039268E+10, 2.6044432099085120E+11, -3.5431628074578119E+11, -1.6077602129631777E+12, 1.5534405614726155E+12, 2.8019935380863682E+12, -2.8019935380852476E+12, -1.5534405614728257E+12, 1.6077602129636682E+12, 3.5431628074579871E+11, -2.6044432099085229E+11, -6.1673589426039368E+10, -2.6255609052371745E+09, -1.0609303958036322E+07}; + constexpr FLT c6[] = {6.6544809363384582E+06, 8.9490403680928528E+08, 
1.1882638725190987E+10, 8.1552898137820768E+09, -1.2575562817884897E+11, 2.7074695075942204E+10, 3.9453789461929230E+11, -3.1679644857371918E+11, -3.1679644857384814E+11, 3.9453789461920764E+11, 2.7074695075779831E+10, -1.2575562817882477E+11, 8.1552898137801113E+09, 1.1882638725190844E+10, 8.9490403680928373E+08, 6.6544809363384526E+06}; + constexpr FLT c7[] = {3.1906872142825029E+06, 2.2785946180651915E+08, 1.3744578972811413E+09, -4.3997172592843504E+09, -9.2011130753862667E+09, 3.4690551711764793E+10, -9.4227043392778511E+09, -5.9308465069355759E+10, 5.9308465069781982E+10, 9.4227043396369877E+09, -3.4690551711565643E+10, 9.2011130754329739E+09, 4.3997172592904301E+09, -1.3744578972811375E+09, -2.2785946180652067E+08, -3.1906872142825001E+06}; + constexpr FLT c8[] = {1.1821527096621764E+06, 4.2281234059839748E+07, 2.8723226058752719E+07, -8.3553955857505906E+08, 1.2447304828865275E+09, 2.1955280942222519E+09, -7.0514195727878428E+09, 4.3745141232918625E+09, 4.3745141237316084E+09, -7.0514195722924280E+09, 2.1955280943332024E+09, 1.2447304828901291E+09, -8.3553955857124400E+08, 2.8723226058927339E+07, 4.2281234059842363E+07, 1.1821527096621776E+06}; + constexpr FLT c9[] = {3.3854610744279926E+05, 5.2176984975088174E+06, -2.0677283565109752E+07, -3.5831818967739724E+07, 2.6599346107970935E+08, -3.7992777963644773E+08, -1.3426914477301279E+08, 9.1752051236703849E+08, -9.1752051203046608E+08, 1.3426914449876857E+08, 3.7992777988576066E+08, -2.6599346104854524E+08, 3.5831818969687484E+07, 2.0677283565073233E+07, -5.2176984975085324E+06, -3.3854610744279926E+05}; + constexpr FLT c10[] = {7.3893334077310792E+04, 2.6983804209766653E+05, -3.6415998560216571E+06, 8.4025485866871737E+06, 4.9278860835956605E+06, -5.1437033778820507E+07, 8.7603898248918146E+07, -4.6199497914231867E+07, -4.6199497948197275E+07, 8.7603898697554156E+07, -5.1437033767498761E+07, 4.9278861543586710E+06, 8.4025485891638417E+06, -3.6415998559774463E+06, 2.6983804209732520E+05, 
7.3893334077308697E+04}; + constexpr FLT c11[] = {1.1778892113376965E+04, -4.0077190108567142E+04, -1.8372552169915423E+05, 1.3262878389569877E+06, -2.9738540196046322E+06, 1.9493506557541618E+06, 4.1881949490808225E+06, -1.1066749801915919E+07, 1.1066748877418302E+07, -4.1881948928182255E+06, -1.9493507634843190E+06, 2.9738539997848324E+06, -1.3262878392766670E+06, 1.8372552166918706E+05, 4.0077190106849979E+04, -1.1778892113376709E+04}; + constexpr FLT c12[] = {1.2019749667900676E+03, -1.0378455845063749E+04, 2.6333352662141660E+04, 1.7117059675298591E+04, -2.5133289742429825E+05, 6.4713895872015413E+05, -8.1634975674778735E+05, 3.8623909535608569E+05, 3.8623887467451266E+05, -8.1634966479713970E+05, 6.4713897711029404E+05, -2.5133289282677229E+05, 1.7117063267120848E+04, 2.6333352680101594E+04, -1.0378455843660833E+04, 1.2019749667921026E+03}; + constexpr FLT c13[] = {3.1189837631121321E+01, -8.9083493701244504E+02, 4.9454293991649774E+03, -1.3124692742151998E+04, 1.5834795298841136E+04, 6.9608292767098355E+03, -5.9790200829217545E+04, 1.0841735230501879E+05, -1.0841732371809872E+05, 5.9789914960016831E+04, -6.9607435159496199E+03, -1.5834797085523640E+04, 1.3124692295481371E+04, -4.9454294410403490E+03, 8.9083493766674769E+02, -3.1189837632399257E+01}; + constexpr FLT c14[] = {-1.2975319072478742E+01, 1.8283699094028595E+01, 1.7684019694555272E+02, -1.1059902320249000E+03, 3.1998244780238201E+03, -5.5987981589200417E+03, 5.9247600879368474E+03, -2.5988290685215188E+03, -2.5988178806809206E+03, 5.9249852432272892E+03, -5.5987701893187350E+03, 3.1998552445852642E+03, -1.1059895327848767E+03, 1.7684022972243278E+02, 1.8283699179384410E+01, -1.2975319072812146E+01}; + constexpr FLT c15[] = {-2.3155118729306223E+00, 1.1938503369059017E+01, -3.4150537494399323E+01, 4.8897188710734866E+01, 1.5839596560322873E+01, -2.4289147960969117E+02, 6.0143231605823757E+02, -8.8772403477020873E+02, 8.8712611928432557E+02, -6.0139861536721287E+02, 2.4281211991792659E+02, 
-1.5853729108169823E+01, -4.8898479664625256E+01, 3.4150529001281690E+01, -1.1938504563403686E+01, 2.3155118727038264E+00}; + constexpr FLT c16[] = {-1.5401723836370515E-01, 9.8067787978090881E-01, -4.1900810719931050E+00, 1.2149798852514468E+01, -2.4780790340446881E+01, 3.6014221907804398E+01, -3.4588714991383583E+01, 1.3071629460227753E+01, 1.2883354961750646E+01, -3.4615611348253751E+01, 3.5973877372428277E+01, -2.4777428295844171E+01, 1.2151059619254390E+01, -4.1901237542037384E+00, 9.8067813628521039E-01, -1.5401723766235165E-01}; + constexpr FLT c17[] = {1.1808834947531816E-02, -2.5444032491006262E-02, -1.4707353726716647E-04, 2.5840423001794482E-01, -1.0910598687678679E+00, 2.6514321899473572E+00, -4.5034457705829842E+00, 6.8479728528821520E+00, -6.8634402190500978E+00, 4.4285511554539836E+00, -2.6424773990080204E+00, 1.0878035811535636E+00, -2.5882398584322625E-01, 1.3196868749378181E-04, 2.5444131865017927E-02, -1.1808835384234016E-02}; for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); } else printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc new file mode 100644 index 000000000..358a1bdbf --- /dev/null +++ b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc @@ -0,0 +1,192 @@ +// Code generated by gen_all_horner_C_code.m in finufft/devel +// Authors: Alex Barnett & Ludvig af Klinteberg. +// (C) The Simons Foundation, Inc. 
+ if (w==2) { + constexpr FLT c0[] = {2.3711015472112535E+01, 2.3711015472112539E+01}; + constexpr FLT c1[] = {2.5079742199350566E+01, -2.5079742199350566E+01}; + constexpr FLT c2[] = {-3.5023281580177019E+00, -3.5023281580177028E+00}; + constexpr FLT c3[] = {-7.3894949249195596E+00, 7.3894949249195649E+00}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); + } else if (w==3) { + constexpr FLT c0[] = {5.9620016143346866E+01, 2.4110216701187517E+02, 5.9620016148621886E+01}; + constexpr FLT c1[] = {9.7575520958604287E+01, 6.0625609804989280E-15, -9.7575520952908548E+01}; + constexpr FLT c2[] = {3.5838417859768519E+01, -7.3472145274965385E+01, 3.5838417865129472E+01}; + constexpr FLT c3[] = {-1.0721643298166459E+01, 2.2269719700859066E-14, 1.0721643303220411E+01}; + constexpr FLT c4[] = {-7.0570630207138105E+00, 9.1538553399011651E+00, -7.0570630151506615E+00}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); + } else if (w==4) { + constexpr FLT c0[] = {1.2612470018753703E+02, 1.1896204292999123E+03, 1.1896204292999125E+03, 1.2612470018753706E+02}; + constexpr FLT c1[] = {2.6158034850676631E+02, 5.6161104654809833E+02, -5.6161104654809833E+02, -2.6158034850676631E+02}; + constexpr FLT c2[] = {1.7145379463699527E+02, -1.6695967127766502E+02, -1.6695967127766531E+02, 1.7145379463699518E+02}; + constexpr FLT c3[] = {2.3525961965887934E+01, -1.0057439659768855E+02, 1.0057439659768869E+02, -2.3525961965887870E+01}; + constexpr FLT c4[] = {-1.5608307370340814E+01, 9.5627412100261218E+00, 9.5627412100261768E+00, -1.5608307370340912E+01}; + constexpr FLT c5[] = {-4.5715207776748672E+00, 7.9904373067896399E+00, -7.9904373067894170E+00, 4.5715207776748832E+00}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==5) { + constexpr FLT c0[] = {2.4106943677442635E+02, 4.3538384278025578E+03, 9.3397486707382068E+03, 4.3538384278025542E+03, 
2.4106943677442635E+02}; + constexpr FLT c1[] = {5.8781364250328284E+02, 3.4742855804122032E+03, -2.2247045611533172E-13, -3.4742855804122019E+03, -5.8781364250328272E+02}; + constexpr FLT c2[] = {5.1234107167555874E+02, 3.5219546517037230E+02, -1.7076861141633149E+03, 3.5219546517037259E+02, 5.1234107167555862E+02}; + constexpr FLT c3[] = {1.7540956907856085E+02, -3.5792356187777011E+02, 1.0950032210404113E-12, 3.5792356187777193E+02, -1.7540956907856062E+02}; + constexpr FLT c4[] = {-2.1768066955080412E-01, -7.8322173187697160E+01, 1.3904039464934533E+02, -7.8322173187696521E+01, -2.1768066955089899E-01}; + constexpr FLT c5[] = {-1.4207955403641282E+01, 1.6019466986222039E+01, 6.2864597222035853E-14, -1.6019466986221275E+01, 1.4207955403641282E+01}; + constexpr FLT c6[] = {-2.1966493586752702E+00, 5.0672636163198259E+00, -6.7340544905090631E+00, 5.0672636163192113E+00, -2.1966493586753031E+00}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + } else if (w==6) { + constexpr FLT c0[] = {4.3011762559089192E+02, 1.3368828836127082E+04, 4.9861340433371268E+04, 4.9861340433371290E+04, 1.3368828836127082E+04, 4.3011762559835182E+02}; + constexpr FLT c1[] = {1.1857225840065146E+03, 1.4112553227730619E+04, 1.5410005180819442E+04, -1.5410005180819426E+04, -1.4112553227730617E+04, -1.1857225839984601E+03}; + constexpr FLT c2[] = {1.2460481448413077E+03, 4.3127030215084988E+03, -5.5438591621431215E+03, -5.5438591621431233E+03, 4.3127030215084969E+03, 1.2460481448488895E+03}; + constexpr FLT c3[] = {6.0825549344387821E+02, -3.4106010789546866E+02, -1.9775725023673151E+03, 1.9775725023673224E+03, 3.4106010789547190E+02, -6.0825549343673049E+02}; + constexpr FLT c4[] = {1.1264961069783713E+02, -3.9740822717990801E+02, 2.7557540616463564E+02, 2.7557540616463149E+02, -3.9740822717990505E+02, 1.1264961070570472E+02}; + constexpr FLT c5[] = {-1.5387906304333869E+01, -3.2640579296386335E+01, 1.1683718215647407E+02, 
-1.1683718215647050E+02, 3.2640579296386335E+01, 1.5387906311562686E+01}; + constexpr FLT c6[] = {-9.3947198873910107E+00, 1.5069930500884340E+01, -8.0900452409585597E+00, -8.0900452409573536E+00, 1.5069930500885983E+01, -9.3947198802582648E+00}; + constexpr FLT c7[] = {-5.6048841964528473E-01, 2.3377422080932533E+00, -4.2391567591829169E+00, 4.2391567591861783E+00, -2.3377422080911803E+00, 5.6048842664328347E-01}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==7) { + constexpr FLT c0[] = {7.2950392616203362E+02, 3.6439117038309523E+04, 2.1220891582018451E+05, 3.6180058567561547E+05, 2.1220891582018466E+05, 3.6439117038309538E+04, 7.2950392617434579E+02}; + constexpr FLT c1[] = {2.2197790785452585E+03, 4.6392067080426263E+04, 1.1568051746995676E+05, -2.6471374827810822E-11, -1.1568051746995673E+05, -4.6392067080426248E+04, -2.2197790785319785E+03}; + constexpr FLT c2[] = {2.6796845075663950E+03, 2.0921129984587253E+04, 3.9399551345633640E+01, -4.7251335435527413E+04, 3.9399551345568185E+01, 2.0921129984587242E+04, 2.6796845075789138E+03}; + constexpr FLT c3[] = {1.6253748990844513E+03, 2.6138488347211651E+03, -1.0037546705421486E+04, 4.9207207296884551E-11, 1.0037546705421528E+04, -2.6138488347211514E+03, -1.6253748990726617E+03}; + constexpr FLT c4[] = {4.9106375852553407E+02, -8.6668269315415375E+02, -1.0513434716617946E+03, 2.8444456471590820E+03, -1.0513434716617835E+03, -8.6668269315414682E+02, 4.9106375853851517E+02}; + constexpr FLT c5[] = {4.0739167949763470E+01, -2.8515155742293291E+02, 3.9930326803802245E+02, 9.3897520950192402E-12, -3.9930326803800614E+02, 2.8515155742293899E+02, -4.0739167937836122E+01}; + constexpr FLT c6[] = {-1.7148987139838134E+01, 7.5799002551925454E-01, 6.3260304953181709E+01, -1.0529869309159973E+02, 6.3260304953170241E+01, 7.5799002552861849E-01, -1.7148987128070043E+01}; + constexpr FLT c7[] = {-4.5424411501048008E+00, 
9.8749254058339080E+00, -9.6456179777422530E+00, 1.4220101775868667E-11, 9.6456179778363111E+00, -9.8749254058241132E+00, 4.5424411616515830E+00}; + constexpr FLT c8[] = {-5.0793946806705008E-02, 7.3273813711596381E-01, -2.0117140545159620E+00, 2.6999257940738310E+00, -2.0117140545257630E+00, 7.3273813712090197E-01, -5.0793935652734865E-02}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + } else if (w==8) { + constexpr FLT c0[] = {1.1895823653767156E+03, 9.0980236725237002E+04, 7.7438826909537544E+05, 2.0077596413122714E+06, 2.0077596413122721E+06, 7.7438826909537590E+05, 9.0980236725237002E+04, 1.1895823653767152E+03}; + constexpr FLT c1[] = {3.9313191526977803E+03, 1.3318570706800825E+05, 5.7275848637687659E+05, 4.6250273225257988E+05, -4.6250273225258006E+05, -5.7275848637687659E+05, -1.3318570706800825E+05, -3.9313191526977798E+03}; + constexpr FLT c2[] = {5.2976026193612415E+03, 7.5628970871188474E+04, 1.0073339198368331E+05, -1.8165150843791279E+05, -1.8165150843791300E+05, 1.0073339198368324E+05, 7.5628970871188460E+04, 5.2976026193612397E+03}; + constexpr FLT c3[] = {3.7552239608473869E+03, 1.8376340228970930E+04, -2.3878081117551392E+04, -4.6296734056047753E+04, 4.6296734056048466E+04, 2.3878081117551716E+04, -1.8376340228970901E+04, -3.7552239608473869E+03}; + constexpr FLT c4[] = {1.4742862505418659E+03, 1.2842168112180084E+02, -9.1969665138397813E+03, 7.5990739935236888E+03, 7.5990739935236415E+03, -9.1969665138397813E+03, 1.2842168112182003E+02, 1.4742862505418657E+03}; + constexpr FLT c5[] = {2.8158981009344376E+02, -8.8613607108855138E+02, 5.3457145342334591E+01, 2.1750989694613118E+03, -2.1750989694611812E+03, -5.3457145342138865E+01, 8.8613607108855138E+02, -2.8158981009344376E+02}; + constexpr FLT c6[] = {-1.4786862436220549E+00, -1.3935442261829297E+02, 3.2599325739090762E+02, -1.9541889343354751E+02, -1.9541889343356968E+02, 3.2599325739086612E+02, 
-1.3935442261828183E+02, -1.4786862436238759E+00}; + constexpr FLT c7[] = {-1.1542034522900533E+01, 1.2000512051415985E+01, 1.9687328710253290E+01, -6.3962883082497100E+01, 6.3962883082831397E+01, -1.9687328710065113E+01, -1.2000512051397745E+01, 1.1542034522901620E+01}; + constexpr FLT c8[] = {-1.7448292513541994E+00, 4.8577330433876664E+00, -6.8794163043749101E+00, 3.4611708986529197E+00, 3.4611708984979552E+00, -6.8794163042722616E+00, 4.8577330434089125E+00, -1.7448292513539221E+00}; + constexpr FLT c9[] = {1.5044951479000782E-01, 9.6230159355094672E-02, -7.0399250408500635E-01, 1.3251401130885254E+00, -1.3251401130188682E+00, 7.0399250409661596E-01, -9.6230159344936325E-02, -1.5044951478914617E-01}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==9) { + constexpr FLT c0[] = {1.8793738965777031E+03, 2.1220891582018440E+05, 2.5233246441351655E+06, 9.2877384983420707E+06, 1.4015330434461467E+07, 9.2877384983420800E+06, 2.5233246441351655E+06, 2.1220891582018536E+05, 1.8793738965777065E+03}; + constexpr FLT c1[] = {6.6675066501609354E+03, 3.4704155240987014E+05, 2.2890184838322564E+06, 3.8705035445351237E+06, 1.1717532248112299E-10, -3.8705035445351265E+06, -2.2890184838322559E+06, -3.4704155240987102E+05, -6.6675066501609354E+03}; + constexpr FLT c2[] = {9.8412775404612330E+03, 2.3171563090202375E+05, 6.8167589492092282E+05, -2.1140963571671949E+05, -1.4236515118873832E+06, -2.1140963571672430E+05, 6.8167589492092212E+05, 2.3171563090202416E+05, 9.8412775404612275E+03}; + constexpr FLT c3[] = {7.8762358364031061E+03, 7.6500585979636191E+04, 1.2434778984075345E+04, -2.8572091469429957E+05, 1.1900185890455270E-09, 2.8572091469430370E+05, -1.2434778984074723E+04, -7.6500585979636191E+04, -7.8762358364031033E+03}; + constexpr FLT c4[] = {3.6941911906762075E+03, 9.9232929169976032E+03, -3.3472877669901907E+04, -1.4082384858050133E+04, 
6.7911966136974472E+04, -1.4082384858045889E+04, -3.3472877669901856E+04, 9.9232929169977433E+03, 3.6941911906762098E+03}; + constexpr FLT c5[] = {9.8900189723050323E+02, -1.2736589324621348E+03, -5.0407308390125609E+03, 9.8914296140178049E+03, 6.1223023135982708E-10, -9.8914296140230235E+03, 5.0407308390128219E+03, 1.2736589324621673E+03, -9.8900189723050403E+02}; + constexpr FLT c6[] = {1.1165868717716108E+02, -5.9057035448559543E+02, 5.5860705835625356E+02, 9.1996097522935008E+02, -2.0290255886368843E+03, 9.1996097522906575E+02, 5.5860705835607132E+02, -5.9057035448565603E+02, 1.1165868717715755E+02}; + constexpr FLT c7[] = {-1.3142584300867490E+01, -4.2852762793261455E+01, 1.8188640945803897E+02, -2.1362000457586478E+02, 1.1194928851903786E-10, 2.1362000457739751E+02, -1.8188640945787162E+02, 4.2852762793424958E+01, 1.3142584300868396E+01}; + constexpr FLT c8[] = {-5.8088068374876212E+00, 1.0201832931297655E+01, -3.5220973552653217E-01, -2.6632420897260161E+01, 4.2737607183076172E+01, -2.6632420895005694E+01, -3.5220973526763744E-01, 1.0201832931314263E+01, -5.8088068374874551E+00}; + constexpr FLT c9[] = {-4.0642645973149144E-01, 1.8389772328590479E+00, -3.5549484956004700E+00, 3.2273562224626624E+00, 2.3066481718890602E-10, -3.2273562263634674E+00, 3.5549484956933464E+00, -1.8389772328126097E+00, 4.0642645973247782E-01}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==10) { + constexpr FLT c0[] = {2.8923571298063644E+03, 4.6856831608341972E+05, 7.5304732752870098E+06, 3.7576537584215805E+07, 7.9591606307847947E+07, 7.9591606307847947E+07, 3.7576537584215775E+07, 7.5304732752870088E+06, 4.6856831608341815E+05, 2.8923571298063584E+03}; + constexpr FLT c1[] = {1.0919387804943195E+04, 8.3976685277206486E+05, 7.9494027659552386E+06, 2.1606786285174560E+07, 1.4625897641453253E+07, -1.4625897641453268E+07, -2.1606786285174556E+07, 
-7.9494027659552386E+06, -8.3976685277206241E+05, -1.0919387804943173E+04}; + constexpr FLT c2[] = {1.7418455635504146E+04, 6.3489952164419868E+05, 3.1358985409389907E+06, 2.2547438801903715E+06, -6.0429762783920690E+06, -6.0429762783920504E+06, 2.2547438801903636E+06, 3.1358985409389869E+06, 6.3489952164419682E+05, 1.7418455635504106E+04}; + constexpr FLT c3[] = {1.5396188098732166E+04, 2.5490607173283477E+05, 4.2818880748176732E+05, -9.5435463094349112E+05, -1.2004850139039194E+06, 1.2004850139039543E+06, 9.5435463094349764E+05, -4.2818880748176464E+05, -2.5490607173283392E+05, -1.5396188098732144E+04}; + constexpr FLT c4[] = {8.2616700456447434E+03, 5.2880641964112423E+04, -6.1165055141129313E+04, -2.1590299490710214E+05, 2.1595822052158226E+05, 2.1595822052158433E+05, -2.1590299490713206E+05, -6.1165055141130644E+04, 5.2880641964112234E+04, 8.2616700456447343E+03}; + constexpr FLT c5[] = {2.7267169079066489E+03, 2.4572549134030178E+03, -2.6065821571076271E+04, 1.3919259807562572E+04, 4.6802084705703302E+04, -4.6802084705714791E+04, -1.3919259807544826E+04, 2.6065821571078101E+04, -2.4572549134029523E+03, -2.7267169079066462E+03}; + constexpr FLT c6[] = {5.0402062537834655E+02, -1.3640153425625094E+03, -1.4063198459010243E+03, 7.0858129627832977E+03, -4.8375233777539070E+03, -4.8375233777688618E+03, 7.0858129627894568E+03, -1.4063198459013925E+03, -1.3640153425628407E+03, 5.0402062537833399E+02}; + constexpr FLT c7[] = {2.4199726682552246E+01, -2.8393731159230907E+02, 5.1652001352658374E+02, 7.4578914842690025E+01, -1.1556759026394043E+03, 1.1556759026669868E+03, -7.4578914836335017E+01, -5.1652001352477316E+02, 2.8393731159271266E+02, -2.4199726682540764E+01}; + constexpr FLT c8[] = {-1.0545675122358718E+01, -3.0306758891736707E+00, 7.2305523762002423E+01, -1.3808908570315674E+02, 7.6293213390392353E+01, 7.6293213419941608E+01, -1.3808908572000124E+02, 7.2305523762424571E+01, -3.0306758892308885E+00, -1.0545675122367939E+01}; + constexpr FLT c9[] = 
{-2.1836930570445361E+00, 5.4992367507340179E+00, -4.5624617242018264E+00, -6.6492709812433128E+00, 2.0339240340948546E+01, -2.0339240355994509E+01, 6.6492709998185751E+00, 4.5624617253163429E+00, -5.4992367508385041E+00, 2.1836930570532433E+00}; + constexpr FLT c10[] = {-9.1748741454156318E-02, 5.2562451749078731E-01, -1.4144257942386596E+00, 1.8629579002072614E+00, -9.0169873685258095E-01, -9.0169875903814667E-01, 1.8629579050577161E+00, -1.4144257935638165E+00, 5.2562451754351402E-01, -9.1748741461736935E-02}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if (w==11) { + constexpr FLT c0[] = {4.3537972057094375E+03, 9.8872306817881158E+05, 2.0938056062983297E+07, 1.3701428307175839E+08, 3.8828289972017384E+08, 5.4292197128519225E+08, 3.8828289972017366E+08, 1.3701428307175839E+08, 2.0938056062983308E+07, 9.8872306817881158E+05, 4.3537972057093921E+03}; + constexpr FLT c1[] = {1.7371472778611500E+04, 1.9155790709433779E+06, 2.4914432724618737E+07, 9.7792160665338382E+07, 1.3126779387874995E+08, -1.1645321713027108E-08, -1.3126779387875001E+08, -9.7792160665338382E+07, -2.4914432724618725E+07, -1.9155790709433777E+06, -1.7371472778611380E+04}; + constexpr FLT c2[] = {2.9650558537745463E+04, 1.6014973065836846E+06, 1.1867448782239098E+07, 2.0812212822540630E+07, -1.1749875870571045E+07, -4.5121922350041404E+07, -1.1749875870570999E+07, 2.0812212822540656E+07, 1.1867448782239093E+07, 1.6014973065836844E+06, 2.9650558537745292E+04}; + constexpr FLT c3[] = {2.8505604980264405E+04, 7.4166660874053370E+05, 2.5711466441825363E+06, -1.2146931938153724E+06, -8.3931576510115806E+06, 5.8947555067017928E-08, 8.3931576510117110E+06, 1.2146931938154269E+06, -2.5711466441825293E+06, -7.4166660874053300E+05, -2.8505604980264299E+04}; + constexpr FLT c4[] = {1.7045632829988484E+04, 1.9785834209758099E+05, 8.6361403553703407E+04, -1.0584472412325807E+06, 
-1.3367486018954750E+05, 1.7818009619468113E+06, -1.3367486018952320E+05, -1.0584472412325810E+06, 8.6361403553705750E+04, 1.9785834209758116E+05, 1.7045632829988426E+04}; + constexpr FLT c5[] = {6.5462464716912891E+03, 2.5347576368078731E+04, -7.5810878908802741E+04, -8.0774039751698409E+04, 2.5492801112953416E+05, 3.1373949311406158E-08, -2.5492801112952997E+05, 8.0774039751677527E+04, 7.5810878908807950E+04, -2.5347576368078797E+04, -6.5462464716912691E+03}; + constexpr FLT c6[] = {1.5684149291082226E+03, -1.0302687059850266E+03, -1.3446845770824604E+04, 2.0814393480318489E+04, 1.4366994276506950E+04, -4.4581342385966971E+04, 1.4366994276487216E+04, 2.0814393480327166E+04, -1.3446845770825106E+04, -1.0302687059851414E+03, 1.5684149291082156E+03}; + constexpr FLT c7[] = {1.9398419323286674E+02, -8.7329293867233980E+02, 2.4796533428845552E+02, 3.2905701326708659E+03, -4.8989871768521243E+03, 2.5910474731743909E-08, 4.8989871768931434E+03, -3.2905701326280059E+03, -2.4796533428623073E+02, 8.7329293867272952E+02, -1.9398419323288715E+02}; + constexpr FLT c8[] = {-4.2288232505094108E+00, -9.9574929618070513E+01, 2.9563077145679659E+02, -1.9453049353627330E+02, -4.0107401575324394E+02, 7.9532514191794951E+02, -4.0107401576649818E+02, -1.9453049352309569E+02, 2.9563077145970482E+02, -9.9574929617658114E+01, -4.2288232504962613E+00}; + constexpr FLT c9[] = {-5.3741131162116726E+00, 5.5350606001924518E+00, 1.9153744596147146E+01, -6.3189447496716646E+01, 6.6921287671707859E+01, -1.3450045688823196E-08, -6.6921287609294978E+01, 6.3189447455108059E+01, -1.9153744593546609E+01, -5.5350606002853286E+00, 5.3741131162113103E+00}; + constexpr FLT c10[] = {-7.0359426507051681E-01, 2.2229112760631806E+00, -3.2054079730741187E+00, 8.3392535011476268E-02, 6.8879260445103929E+00, -1.0795498350223303E+01, 6.8879260559828390E+00, 8.3392524213879743E-02, -3.2054079670004838E+00, 2.2229112761686296E+00, -7.0359426507381639E-01}; + constexpr FLT c11[] = {5.2648094862911970E-02, 
9.9912561370710071E-02, -4.3913938793989010E-01, 7.9792986880755179E-01, -6.9191820607752896E-01, -3.1086723020887482E-08, 6.9191819251103082E-01, -7.9792986253876474E-01, 4.3913938485313375E-01, -9.9912561580306161E-02, -5.2648094876606648E-02}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + } else if (w==12) { + constexpr FLT c0[] = {6.4299692685485479E+03, 2.0077596413122746E+06, 5.4904521978991687E+07, 4.5946106674819386E+08, 1.6835469840840111E+09, 3.1308386544851584E+09, 3.1308386544851594E+09, 1.6835469840840116E+09, 4.5946106674819499E+08, 5.4904521978991836E+07, 2.0077596413122742E+06, 6.4299692685634491E+03}; + constexpr FLT c1[] = {2.6965848540274084E+04, 4.1625245902732192E+06, 7.2097002594596982E+07, 3.8505085985474664E+08, 7.9479013671674263E+08, 4.7870231281824070E+08, -4.7870231281824070E+08, -7.9479013671674287E+08, -3.8505085985474682E+08, -7.2097002594597101E+07, -4.1625245902732182E+06, -2.6965848540258085E+04}; + constexpr FLT c2[] = {4.8869694409905118E+04, 3.7863371066322499E+06, 3.9530526716552719E+07, 1.1475134266581047E+08, 4.6311261797931008E+07, -2.0442837194260687E+08, -2.0442837194260764E+08, 4.6311261797930703E+07, 1.1475134266581020E+08, 3.9530526716552772E+07, 3.7863371066322499E+06, 4.8869694409920470E+04}; + constexpr FLT c3[] = {5.0530564260114013E+04, 1.9615784087727305E+06, 1.1044597342441026E+07, 7.9812418612436997E+06, -3.4042228324588403E+07, -3.3301805987927672E+07, 3.3301805987928241E+07, 3.4042228324588865E+07, -7.9812418612435153E+06, -1.1044597342440989E+07, -1.9615784087727298E+06, -5.0530564260099913E+04}; + constexpr FLT c4[] = {3.3081876469965486E+04, 6.2011956881368393E+05, 1.3086001239863783E+06, -3.1165484297367223E+06, -5.1982996003441429E+06, 6.3530947749620415E+06, 6.3530947749622557E+06, -5.1982996003440823E+06, -3.1165484297365877E+06, 1.3086001239863841E+06, 
6.2011956881368428E+05, 3.3081876469981347E+04}; + constexpr FLT c5[] = {1.4308966168506786E+04, 1.1375573205951968E+05, -1.0318195403423737E+05, -6.6892418721464148E+05, 5.9223570255464804E+05, 1.1093685152670993E+06, -1.1093685152665814E+06, -5.9223570255454781E+05, 6.6892418721485860E+05, 1.0318195403423111E+05, -1.1375573205951942E+05, -1.4308966168492359E+04}; + constexpr FLT c6[] = {4.0848961919701046E+03, 7.5033277163530902E+03, -5.2578904182708357E+04, 6.3431596330007251E+03, 1.5984798504282974E+05, -1.2521363434086266E+05, -1.2521363434064612E+05, 1.5984798504277965E+05, 6.3431596327688303E+03, -5.2578904182719976E+04, 7.5033277163531166E+03, 4.0848961919843532E+03}; + constexpr FLT c7[] = {7.1658797373677851E+02, -1.5499947984091114E+03, -4.5490740453145772E+03, 1.4520122796449663E+04, -3.7896465827621914E+03, -2.3597107892496744E+04, 2.3597107892730306E+04, 3.7896465829102508E+03, -1.4520122796250829E+04, 4.5490740453377412E+03, 1.5499947984094479E+03, -7.1658797372277252E+02}; + constexpr FLT c8[] = {5.2022749592536726E+01, -4.0624258132612465E+02, 5.2256582979411519E+02, 9.3282469962228390E+02, -2.8710622268636553E+03, 1.7594166900407929E+03, 1.7594166904608542E+03, -2.8710622266536416E+03, 9.3282469976057041E+02, 5.2256582978430436E+02, -4.0624258132566132E+02, 5.2022749606076808E+01}; + constexpr FLT c9[] = {-7.0341875498933257E+00, -2.3043166228613529E+01, 1.2279331781902621E+02, -1.6714687552668008E+02, -4.4746498567249184E+01, 3.6060905998808425E+02, -3.6060905975626497E+02, 4.4746498638578188E+01, 1.6714687551479193E+02, -1.2279331779450688E+02, 2.3043166229077912E+01, 7.0341875614883520E+00}; + constexpr FLT c10[] = {-2.1556100132578342E+00, 4.1361104015055048E+00, 1.8107701824759481E+00, -2.1223400283067541E+01, 3.5820961921268712E+01, -1.8782945757357222E+01, -1.8782945295761856E+01, 3.5820961970532480E+01, -2.1223400227730256E+01, 1.8107701446846367E+00, 4.1361104022646886E+00, -2.1556100021360516E+00}; + constexpr FLT c11[] = 
{-1.1440899376747989E-01, 7.0567641591059616E-01, -1.4530217944402339E+00, 1.0571984630250064E+00, 1.4389000408734942E+00, -4.2241734506571262E+00, 4.2241732732256922E+00, -1.4389001658681779E+00, -1.0571984849752754E+00, 1.4530218273656557E+00, -7.0567641625357191E-01, 1.1440900438178589E-01}; + constexpr FLT c12[] = {-1.4486009664532199E-02, 2.9387825785133236E-03, -1.0265970208873806E-01, 2.6748270027876714E-01, -3.3606433030575705E-01, 1.5850134054436241E-01, 1.5850148084990595E-01, -3.3606430399846576E-01, 2.6748282743067825E-01, -1.0265974511212309E-01, 2.9387825100049524E-03, -1.4486000362352570E-02}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==13) { + constexpr FLT c0[] = {9.3397060605267925E+03, 3.9447202186643188E+06, 1.3701428307175836E+08, 1.4375660883001420E+09, 6.6384519128895750E+09, 1.5848048271166540E+10, 2.1031560281976685E+10, 1.5848048271166515E+10, 6.6384519128895721E+09, 1.4375660883001390E+09, 1.3701428307175830E+08, 3.9447202186642904E+06, 9.3397060605267870E+03}; + constexpr FLT c1[] = {4.0984512931817779E+04, 8.6828943763566837E+06, 1.9558432133067667E+08, 1.3674961320373521E+09, 3.9251291128182445E+09, 4.5116631434426517E+09, -5.2784645410468957E-07, -4.5116631434426460E+09, -3.9251291128182430E+09, -1.3674961320373495E+09, -1.9558432133067659E+08, -8.6828943763566315E+06, -4.0984512931817771E+04}; + constexpr FLT c2[] = {7.8379538318778941E+04, 8.4928073133582622E+06, 1.1992091153966446E+08, 5.0561697705436689E+08, 6.1845897311594033E+08, -5.1306326495404607E+08, -1.4790096327029381E+09, -5.1306326495404249E+08, 6.1845897311593974E+08, 5.0561697705436635E+08, 1.1992091153966436E+08, 8.4928073133582175E+06, 7.8379538318778941E+04}; + constexpr FLT c3[] = {8.6417670227040027E+04, 4.8250267333349725E+06, 3.9836803808039062E+07, 7.5026052902191281E+07, -7.7565422849559024E+07, 
-2.5393835488011667E+08, 3.3249826368607219E-06, 2.5393835488012213E+08, 7.7565422849558040E+07, -7.5026052902191922E+07, -3.9836803808038987E+07, -4.8250267333349492E+06, -8.6417670227040042E+04}; + constexpr FLT c4[] = {6.1161604972829395E+04, 1.7331203720075563E+06, 7.0216196997559210E+06, -3.6027138646115125E+06, -3.1775875626363419E+07, 1.6544480876799976E+06, 4.9816566960117713E+07, 1.6544480876825110E+06, -3.1775875626362957E+07, -3.6027138646109658E+06, 7.0216196997559462E+06, 1.7331203720075507E+06, 6.1161604972829424E+04}; + constexpr FLT c5[] = {2.9177164557155927E+04, 3.9318079134661297E+05, 3.1307448297762702E+05, -2.7571366584958737E+06, -9.8421840747392213E+05, 6.8469173866723683E+06, 2.8271164666996988E-07, -6.8469173866687613E+06, 9.8421840747752984E+05, 2.7571366584952055E+06, -3.1307448297760193E+05, -3.9318079134661169E+05, -2.9177164557155942E+04}; + constexpr FLT c6[] = {9.5097815505886592E+03, 4.8799940773717601E+04, -1.2734023162442955E+05, -2.5472337176560360E+05, 6.3596049196317361E+05, 2.2361868201724227E+05, -1.0716559939672153E+06, 2.2361868202200226E+05, 6.3596049196156661E+05, -2.5472337176510989E+05, -1.2734023162441404E+05, 4.8799940773715760E+04, 9.5097815505886429E+03}; + constexpr FLT c7[] = {2.0601715730545525E+03, 1.9365931141588459E+02, -2.5304303117500138E+04, 2.9151392447016315E+04, 5.9055020355996137E+04, -1.1784846181768291E+05, 2.6154044742765007E-06, 1.1784846181457305E+05, -5.9055020356659290E+04, -2.9151392447180453E+04, 2.5304303117533978E+04, -1.9365931141453160E+02, -2.0601715730545707E+03}; + constexpr FLT c8[] = {2.5975061893406377E+02, -1.0025387650570891E+03, -6.8642481197673135E+02, 6.7515314203707721E+03, -7.0772939651788483E+03, -6.5444514138990871E+03, 1.6566898963252905E+04, -6.5444514157945678E+03, -7.0772939632859488E+03, 6.7515314204902643E+03, -6.8642481194565551E+02, -1.0025387650535661E+03, 2.5975061893407650E+02}; + constexpr FLT c9[] = {5.8705282128692158E+00, -1.4424362302794552E+02, 
3.3390627212323119E+02, 4.8151337259952918E+01, -1.1431733956368030E+03, 1.4557114776348812E+03, -3.3159944254032091E-07, -1.4557114806782522E+03, 1.1431733967780669E+03, -4.8151337378834590E+01, -3.3390627213511937E+02, 1.4424362302320881E+02, -5.8705282128605081E+00}; + constexpr FLT c10[] = {-4.0954969508851224E+00, -1.2634947171672739E+00, 3.8134139827368251E+01, -8.4115524684139231E+01, 4.2766848660349709E+01, 1.0573434367831015E+02, -1.9636661091449494E+02, 1.0573435467021281E+02, 4.2766847947710779E+01, -8.4115525105243464E+01, 3.8134139870558698E+01, -1.2634947126121756E+00, -4.0954969508837991E+00}; + constexpr FLT c11[] = {-6.2702735485690120E-01, 1.8595467760284645E+00, -1.3027978720941771E+00, -4.9265267037365117E+00, 1.3906831814366365E+01, -1.3753763493382712E+01, 2.6871064791607931E-07, 1.3753755542502716E+01, -1.3906831747296087E+01, 4.9265273573671839E+00, 1.3027978458757612E+00, -1.8595467797630605E+00, 6.2702735484380401E-01}; + constexpr FLT c12[] = {-4.8290636698016143E-02, 1.7531876457248552E-01, -5.0041296501579524E-01, 6.3665129689096389E-01, -1.2477021972354120E-02, -1.2061605995627183E+00, 1.8595304429529254E+00, -1.2061634758265700E+00, -1.2475794298747987E-02, 6.3665098120347430E-01, -5.0041293542010268E-01, 1.7531876909405444E-01, -4.8290636687311379E-02}; + constexpr FLT c13[] = {2.2894665623763296E-02, -7.1358251863425162E-03, -1.4950753078549017E-02, 7.0611554068321924E-02, -1.2311301880976686E-01, 1.0342486048127918E-01, -6.8988570158793749E-07, -1.0342802294420825E-01, 1.2311280070887519E-01, -7.0611922113576600E-02, 1.4950741151156504E-02, 7.1358201810974436E-03, -2.2894665619603353E-02}; + for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + } else if (w==14) { + constexpr FLT c0[] = {1.3368785683552924E+04, 7.5304732752870303E+06, 3.2765764524435025E+08, 4.2418096936485295E+09, 
2.4197690538177547E+10, 7.2227640697189728E+10, 1.2261475327356721E+11, 1.2261475327356729E+11, 7.2227640697189728E+10, 2.4197690538177608E+10, 4.2418096936485305E+09, 3.2765764524435204E+08, 7.5304732752870284E+06, 1.3368785683578022E+04}; + constexpr FLT c1[] = {6.1154444023081698E+04, 1.7488686085101545E+07, 5.0279014009863281E+08, 4.4777867842655859E+09, 1.6916819861812075E+10, 2.8971884004562843E+10, 1.6054555293734529E+10, -1.6054555293734520E+10, -2.8971884004562851E+10, -1.6916819861812094E+10, -4.4777867842655849E+09, -5.0279014009863436E+08, -1.7488686085101552E+07, -6.1154444023056109E+04}; + constexpr FLT c2[] = {1.2279790808348054E+05, 1.8230319600271538E+07, 3.3815815633684015E+08, 1.9369899011251259E+09, 3.9743454154781294E+09, 7.4954544638351953E+08, -7.0173920607394953E+09, -7.0173920607394981E+09, 7.4954544638350523E+08, 3.9743454154781094E+09, 1.9369899011251252E+09, 3.3815815633684099E+08, 1.8230319600271549E+07, 1.2279790808350702E+05}; + constexpr FLT c3[] = {1.4339321200624772E+05, 1.1200899688172197E+07, 1.2799140125169736E+08, 4.0176966726270700E+08, 7.9146174555817381E+07, -1.1719748245183482E+09, -9.6919138198233318E+08, 9.6919138198235631E+08, 1.1719748245183690E+09, -7.9146174555820629E+07, -4.0176966726270568E+08, -1.2799140125169775E+08, -1.1200899688172201E+07, -1.4339321200622563E+05}; + constexpr FLT c4[] = {1.0866548538632697E+05, 4.4565213401510660E+06, 2.8354150929531515E+07, 2.2805067924010411E+07, -1.2058223609888455E+08, -1.2775415620367479E+08, 1.9261201640091833E+08, 1.9261201640092278E+08, -1.2775415620368402E+08, -1.2058223609887798E+08, 2.2805067924010262E+07, 2.8354150929531977E+07, 4.4565213401510660E+06, 1.0866548538635395E+05}; + constexpr FLT c5[] = {5.6346565047794371E+04, 1.1743908345502394E+06, 3.0601086667308519E+06, -7.2274020134796854E+06, -1.6220595157138506E+07, 2.0773587344464455E+07, 2.8183198298702076E+07, -2.8183198298697799E+07, -2.0773587344463386E+07, 1.6220595157145990E+07, 7.2274020134800859E+06, 
-3.0601086667311694E+06, -1.1743908345502326E+06, -5.6346565047771030E+04}; + constexpr FLT c6[] = {2.0435142564639620E+04, 1.9450977300079435E+05, -1.1234667576916210E+05, -1.5205767549239143E+06, 1.0515640561116433E+06, 3.7458351782459249E+06, -3.3794074240140119E+06, -3.3794074240169711E+06, 3.7458351782412329E+06, 1.0515640561062016E+06, -1.5205767549244103E+06, -1.1234667576906871E+05, 1.9450977300078108E+05, 2.0435142564663318E+04}; + constexpr FLT c7[] = {5.1491366053560578E+03, 1.4735748500446980E+04, -8.1689482343558659E+04, -3.5176894225535718E+04, 3.7034248411029513E+05, -1.9109669530087037E+05, -5.2637978465954703E+05, 5.2637978466513811E+05, 1.9109669530731969E+05, -3.7034248412243859E+05, 3.5176894226134398E+04, 8.1689482343736949E+04, -1.4735748500440675E+04, -5.1491366053330503E+03}; + constexpr FLT c8[] = {8.5138795113642539E+02, -1.2978618911724870E+03, -8.7500873646799319E+03, 2.1319159614070901E+04, 7.6586611596445446E+03, -6.2424139814276627E+04, 4.2620771484048986E+04, 4.2620771487400976E+04, -6.2424139811762492E+04, 7.6586611726886877E+03, 2.1319159614126653E+04, -8.7500873648028410E+03, -1.2978618911666397E+03, 8.5138795115875746E+02}; + constexpr FLT c9[] = {7.2176142041601707E+01, -4.5543406154804239E+02, 2.8301959889246939E+02, 2.1994171513294418E+03, -4.5082500681007541E+03, 4.7658016701186381E+02, 7.1044827179414842E+03, -7.1044827207946446E+03, -4.7658016510975699E+02, 4.5082500692420190E+03, -2.1994171509014677E+03, -2.8301959872009093E+02, 4.5543406154544186E+02, -7.2176142022434362E+01}; + constexpr FLT c10[] = {-3.1135380162987940E+00, -3.8554406978579038E+01, 1.4396028115898400E+02, -1.1260050343554748E+02, -3.0073664795307559E+02, 7.2079162583931463E+02, -4.1195307853504261E+02, -4.1195308389061950E+02, 7.2079161951195317E+02, -3.0073665201295637E+02, -1.1260050330597517E+02, 1.4396028109959775E+02, -3.8554406977567140E+01, -3.1135379980017595E+00}; + constexpr FLT c11[] = {-1.6022934776926798E+00, 1.8678197421256739E+00, 
8.3368944138930399E+00, -3.0791579027234270E+01, 3.4749714150762280E+01, 1.2322523792409507E+01, -7.3924012166427417E+01, 7.3924001493712765E+01, -1.2322523909478123E+01, -3.4749718994457659E+01, 3.0791578402870758E+01, -8.3368943163363198E+00, -1.8678197396867300E+00, 1.6022934951962213E+00}; + constexpr FLT c12[] = {-1.9362061844377096E-01, 6.3024467546449237E-01, -9.3262282246103156E-01, -4.8908745811188170E-01, 4.0479355563504544E+00, -6.2829791472071852E+00, 3.1767781035894589E+00, 3.1767769811448687E+00, -6.2829724125407163E+00, 4.0479411685726534E+00, -4.8908752826470542E-01, -9.3262301538118120E-01, 6.3024467436836862E-01, -1.9362060312354304E-01}; + constexpr FLT c13[] = {1.8785913715361053E-02, 3.1605272623671174E-02, -1.3655798799707175E-01, 2.5016548497515428E-01, -1.6654380378010236E-01, -2.1682631004979175E-01, 6.1785823408636587E-01, -6.1786412281044067E-01, 2.1682412904087514E-01, 1.6654140467029407E-01, -2.5016543044993139E-01, 1.3655803570664179E-01, -3.1605272197692873E-02, -1.8785905270673971E-02}; + constexpr FLT c14[] = {-1.2896545121493665E-02, -3.7106960851979211E-03, 5.8859140039070395E-04, 1.3987190631712249E-02, -3.5710919113872190E-02, 4.3405397573933885E-02, -2.0030939379906375E-02, -2.0032731865340953E-02, 4.3401439168598052E-02, -3.5712796955756618E-02, 1.3987489379284932E-02, 5.8862874383716927E-04, -3.7106965853333437E-03, -1.2896537371347905E-02}; + for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); + } else if (w==15) { + constexpr FLT c0[] = {1.8887777774374495E+04, 1.4015330434461441E+07, 7.5498683300180113E+08, 1.1900937739619959E+10, 8.2530965279375427E+10, 3.0178246269069617E+11, 6.3775691457119177E+11, 8.1471473119305627E+11, 6.3775691457119177E+11, 3.0178246269069659E+11, 8.2530965279375626E+10, 1.1900937739619970E+10, 7.5498683300180113E+08, 
1.4015330434461441E+07, 1.8887777774374499E+04}; + constexpr FLT c1[] = {8.9780907163796350E+04, 3.4167636285297170E+07, 1.2346880033823483E+09, 1.3719272724135921E+10, 6.5858241494816727E+10, 1.5266999939989542E+11, 1.5687794513790732E+11, 8.2054309331652521E-05, -1.5687794513790729E+11, -1.5266999939989551E+11, -6.5858241494816811E+10, -1.3719272724135935E+10, -1.2346880033823485E+09, -3.4167636285297155E+07, -8.9780907163796262E+04}; + constexpr FLT c2[] = {1.8850321233130724E+05, 3.7693640983013548E+07, 8.9846818051570022E+08, 6.7094088040439663E+09, 1.9743296615199219E+10, 1.8072727219391186E+10, -2.0634615374559433E+10, -4.9654335197177406E+10, -2.0634615374559402E+10, 1.8072727219391071E+10, 1.9743296615199223E+10, 6.7094088040439653E+09, 8.9846818051569998E+08, 3.7693640983013526E+07, 1.8850321233130703E+05}; + constexpr FLT c3[] = {2.3185006533495741E+05, 2.4789475362741619E+07, 3.7751696829092431E+08, 1.7167916788178215E+09, 1.9832401267745426E+09, -3.4881359830883756E+09, -7.8785602379628572E+09, 9.7140016072625200E-05, 7.8785602379629736E+09, 3.4881359830884337E+09, -1.9832401267745149E+09, -1.7167916788178086E+09, -3.7751696829092413E+08, -2.4789475362741601E+07, -2.3185006533495741E+05}; + constexpr FLT c4[] = {1.8672970114818294E+05, 1.0741068109706741E+07, 9.8017949708492860E+07, 2.0291084954252207E+08, -2.7857869294215119E+08, -9.4112677968749356E+08, 1.7886520649348873E+08, 1.4579673547892964E+09, 1.7886520649342585E+08, -9.4112677968752539E+08, -2.7857869294214994E+08, 2.0291084954251558E+08, 9.8017949708492786E+07, 1.0741068109706741E+07, 1.8672970114818294E+05}; + constexpr FLT c5[] = {1.0411891611891470E+05, 3.1771463075269503E+06, 1.4880104152842240E+07, -6.8136965447559115E+06, -8.7072998215433106E+07, 1.8024116531034056E+06, 1.9067730799617344E+08, 4.2457739417067258E-05, -1.9067730799613068E+08, -1.8024116529409259E+06, 8.7072998215441659E+07, 6.8136965447553769E+06, -1.4880104152842039E+07, -3.1771463075269512E+06, 
-1.0411891611891471E+05}; + constexpr FLT c6[] = {4.1300641422694804E+04, 6.3217168592498475E+05, 7.7343707634861500E+05, -5.4575962381464886E+06, -3.7387211063140454E+06, 1.8451583614096310E+07, 3.0480804947991944E+06, -2.7500445095909819E+07, 3.0480804948348333E+06, 1.8451583614054784E+07, -3.7387211062913244E+06, -5.4575962381459959E+06, 7.7343707634824759E+05, 6.3217168592497776E+05, 4.1300641422694753E+04}; + constexpr FLT c7[] = {1.1710443348523793E+04, 7.5405449195728594E+04, -1.6634736996463325E+05, -5.6069290801800112E+05, 1.1540571564075467E+06, 1.0209821661192341E+06, -2.9641921942296810E+06, 3.3808352628184138E-05, 2.9641921942798980E+06, -1.0209821662794619E+06, -1.1540571563939669E+06, 5.6069290802062431E+05, 1.6634736996474760E+05, -7.5405449195719484E+04, -1.1710443348523821E+04}; + constexpr FLT c8[] = {2.3142324239350878E+03, 2.1710560541685127E+03, -3.6929625713073510E+04, 2.6143898219454975E+04, 1.4046980089280056E+05, -2.1033190113776314E+05, -1.1132269821056565E+05, 3.7491447377567255E+05, -1.1132269820392072E+05, -2.1033190119832297E+05, 1.4046980086087715E+05, 2.6143898218932318E+04, -3.6929625712961781E+04, 2.1710560541720374E+03, 2.3142324239350669E+03}; + constexpr FLT c9[] = {2.8879718294280184E+02, -9.2801372612475961E+02, -1.9817144426574330E+03, 9.9004179204792053E+03, -5.7928269087620147E+03, -2.1083466263505023E+04, 3.3285501948595454E+04, -2.7485328636422507E-05, -3.3285501965333991E+04, 2.1083466366979632E+04, 5.7928269521300508E+03, -9.9004179216204702E+03, 1.9817144428595318E+03, 9.2801372612847467E+02, -2.8879718294283089E+02}; + constexpr FLT c10[] = {1.3121871131812668E+01, -1.5978845116799533E+02, 2.7429718922951372E+02, 4.4598059414156506E+02, -1.8917609553066516E+03, 1.5303002688244715E+03, 1.7542368497545090E+03, -3.9411530602516441E+03, 1.7542369316431223E+03, 1.5303002442924305E+03, -1.8917609584163495E+03, 4.4598059457347478E+02, 2.7429718902435877E+02, -1.5978845117002061E+02, 1.3121871131803672E+01}; + constexpr FLT 
c11[] = {-2.4286151057240977E+00, -6.7839829107457454E+00, 4.6999223071396322E+01, -7.4896070961958642E+01, -3.2010113081168477E+01, 2.5022928265034139E+02, -2.8786059319143976E+02, -7.6634590881515742E-06, 2.8786055354435149E+02, -2.5022938574837804E+02, 3.2010133958326769E+01, 7.4896073537458122E+01, -4.6999222973839679E+01, 6.7839829144042234E+00, 2.4286151057002718E+00}; + constexpr FLT c12[] = {-5.4810555663540994E-01, 1.1436870829533889E+00, 8.2471503038810468E-01, -8.5602133190676231E+00, 1.5631626747736027E+01, -6.4979530690388971E+00, -1.8737705444912390E+01, 3.3283700586432069E+01, -1.8737671771580779E+01, -6.4980608237023150E+00, 1.5631576518348636E+01, -8.5602150728872868E+00, 8.2471496023535673E-01, 1.1436870829534245E+00, -5.4810555666110816E-01}; + constexpr FLT c13[] = {-1.4554612894071435E-02, 1.7022157798828938E-01, -3.7563883252838998E-01, 2.0131137597017346E-01, 8.3554102633770899E-01, -2.1191293316246047E+00, 1.9960663397068628E+00, -2.3728355667610635E-05, -1.9960994910423950E+00, 2.1191258420103383E+00, -8.3552532307350946E-01, -2.0131366602953590E-01, 3.7563888705361287E-01, -1.7022157564540871E-01, 1.4554612874103701E-02}; + constexpr FLT c14[] = {-1.2348455954758902E-02, 2.6143546776172359E-03, -2.9252135300577905E-02, 7.5391681327619392E-02, -8.7984403647335341E-02, 1.3344627281489669E-03, 1.5252941418184685E-01, -2.3235937480302737E-01, 1.5257226311939021E-01, 1.3278049251030887E-03, -8.7990378598784807E-02, 7.5392790961460260E-02, -2.9252188648358976E-02, 2.6143533439228375E-03, -1.2348455958015002E-02}; + constexpr FLT c15[] = {1.4214685601398354E-02, -1.2364336624800189E-03, 1.2892619016815934E-03, 1.6178062163508013E-03, -8.2136742192079667E-03, 1.3906385413195475E-02, -1.1450713230272313E-02, -3.7721726447119798E-06, 1.1423376007684534E-02, -1.3922509066323734E-02, 8.2263143670307064E-03, -1.6156663488059737E-03, -1.2892038432598459E-03, 1.2364357359950825E-03, -1.4214685605448193E-02}; + for (int i=0; i<15; i++) ker[i] = c0[i] + 
z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); + } else if (w==16) { + constexpr FLT c0[] = {2.6374086784014766E+04, 2.5501413681212697E+07, 1.6835469840840111E+09, 3.1953580806547901E+10, 2.6584910126662793E+11, 1.1715858191494631E+12, 3.0181658330343154E+12, 4.7888775408612793E+12, 4.7888775408612793E+12, 3.0181658330343149E+12, 1.1715858191494631E+12, 2.6584910126662802E+11, 3.1953580806547905E+10, 1.6835469840840123E+09, 2.5501413681212693E+07, 2.6374086784014838E+04}; + constexpr FLT c1[] = {1.2991568388123445E+05, 6.4986154651133671E+07, 2.9142305012947264E+09, 3.9748054433728172E+10, 2.3649443248440253E+11, 7.0471088240421252E+11, 1.0533888905987035E+12, 5.4832304482297614E+11, -5.4832304482297620E+11, -1.0533888905987037E+12, -7.0471088240421265E+11, -2.3649443248440253E+11, -3.9748054433728172E+10, -2.9142305012947268E+09, -6.4986154651133649E+07, -1.2991568388123452E+05}; + constexpr FLT c2[] = {2.8421223836872837E+05, 7.5448503558118597E+07, 2.2710828032883873E+09, 2.1491603403163834E+10, 8.4299374042308197E+10, 1.3384457365769531E+11, 1.8630012765538406E+09, -2.4384536789321063E+11, -2.4384536789321036E+11, 1.8630012765533686E+09, 1.3384457365769537E+11, 8.4299374042308105E+10, 2.1491603403163818E+10, 2.2710828032883859E+09, 7.5448503558118537E+07, 2.8421223836872837E+05}; + constexpr FLT c3[] = {3.6653021243297530E+05, 5.2693428548387125E+07, 1.0410094433021290E+09, 6.3986267576853638E+09, 1.3313926739756351E+10, -2.7909761561126175E+09, -3.9911638977027939E+10, -2.9236947704012280E+10, 2.9236947704013081E+10, 3.9911638977028137E+10, 2.7909761561130028E+09, -1.3313926739756271E+10, -6.3986267576853542E+09, -1.0410094433021282E+09, -5.2693428548387118E+07, -3.6653021243297530E+05}; + constexpr FLT c4[] = {3.1185660915838124E+05, 2.4564274645530283E+07, 3.0509279143241888E+08, 1.0432225146182600E+09, 
6.4966284440289930E+07, -4.2483903608015141E+09, -3.1778261722520151E+09, 5.9880587942837610E+09, 5.9880587942838221E+09, -3.1778261722524805E+09, -4.2483903608015366E+09, 6.4966284440239742E+07, 1.0432225146182716E+09, 3.0509279143241870E+08, 2.4564274645530298E+07, 3.1185660915838124E+05}; + constexpr FLT c5[] = {1.8544733523229556E+05, 7.9824949938292857E+06, 5.6880943382648587E+07, 5.4097201999261037E+07, -3.0776449202831459E+08, -3.7659931821870732E+08, 6.8797698944740057E+08, 7.5429896889854825E+08, -7.5429896889813769E+08, -6.8797698944685316E+08, 3.7659931821880990E+08, 3.0776449202837443E+08, -5.4097201999261037E+07, -5.6880943382648058E+07, -7.9824949938292904E+06, -1.8544733523229562E+05}; + constexpr FLT c6[] = {7.9472339236673346E+04, 1.8159676553648554E+06, 5.7259818806757703E+06, -1.2786136236414703E+07, -3.8677490873126298E+07, 4.7651450515746824E+07, 9.0723760109486386E+07, -9.4532949239712372E+07, -9.4532949239553988E+07, 9.0723760109301269E+07, 4.7651450515691362E+07, -3.8677490873146154E+07, -1.2786136236417659E+07, 5.7259818806749191E+06, 1.8159676553648303E+06, 7.9472339236673288E+04}; + constexpr FLT c7[] = {2.4831718998299966E+04, 2.7536301841718081E+05, -5.1045953355375612E+04, -2.6996387880195463E+06, 1.1656554632389303E+06, 9.1521923450131379E+06, -6.8198180924866442E+06, -1.2555197000819867E+07, 1.2555197001241650E+07, 6.8198180927697066E+06, -9.1521923448700085E+06, -1.1656554631878142E+06, 2.6996387880213680E+06, 5.1045953356119258E+04, -2.7536301841717307E+05, -2.4831718998299926E+04}; + constexpr FLT c8[] = {5.6060763597396308E+03, 2.2154740880106889E+04, -1.0243462874801211E+05, -1.1802198892514131E+05, 6.4061699367996352E+05, -1.1166716767206143E+05, -1.4153578101430011E+06, 1.0790712966724981E+06, 1.0790712967259965E+06, -1.4153578105201155E+06, -1.1166716749694763E+05, 6.4061699367337034E+05, -1.1802198891465126E+05, -1.0243462874806672E+05, 2.2154740880108289E+04, 5.6060763597395980E+03}; + constexpr FLT c9[] = 
{8.7271993222052015E+02, -7.0074676858636565E+02, -1.2528372958260919E+04, 2.3643101058174649E+04, 3.1699060176870429E+04, -1.1270133590467999E+05, 3.6872846694334214E+04, 1.5168911740364679E+05, -1.5168911743408049E+05, -3.6872846682160729E+04, 1.1270133589250650E+05, -3.1699060125133125E+04, -2.3643101053990013E+04, 1.2528372958926657E+04, 7.0074676859379576E+02, -8.7271993222046206E+02}; + constexpr FLT c10[] = {7.8842259458809167E+01, -4.2070880912368045E+02, -1.0535142084668550E+02, 3.3375056840527291E+03, -4.9426353391946941E+03, -3.6567309106352213E+03, 1.5199085303756190E+04, -9.4972223386509122E+03, -9.4972222612539845E+03, 1.5199085250589107E+04, -3.6567308608802218E+03, -4.9426353295200679E+03, 3.3375056868169195E+03, -1.0535142136497778E+02, -4.2070880912233122E+02, 7.8842259458809863E+01}; + constexpr FLT c11[] = {8.9833076822322541E-02, -4.4163371176090656E+01, 1.2880771155499514E+02, 2.8722193371824223E+00, -5.7164633743445722E+02, 9.0417612969072786E+02, 1.1220387898916500E+00, -1.4190926236781661E+03, 1.4190921497862169E+03, -1.1219395160922474E+00, -9.0417626783116691E+02, 5.7164631339646269E+02, -2.8722233955477368E+00, -1.2880771178913139E+02, 4.4163371168774162E+01, -8.9833076836661779E-02}; + constexpr FLT c12[] = {-1.0900468357478950E+00, -1.1264666525354303E-01, 1.1810668147959248E+01, -3.0289105313513339E+01, 1.5494580774353590E+01, 6.0129886123389447E+01, -1.2330199171381130E+02, 6.7114507519752891E+01, 6.7114417724195803E+01, -1.2330220722314033E+02, 6.0129944490502041E+01, 1.5494578529464169E+01, -3.0289104892597450E+01, 1.1810668147959559E+01, -1.1264666963803399E-01, -1.0900468357479236E+00}; + constexpr FLT c13[] = {-1.1763610120003680E-01, 4.2939195911805172E-01, -2.7950209959937194E-01, -1.7354549670508441E+00, 5.1182015415147619E+00, -5.0538827161604676E+00, -2.1270036462171213E+00, 1.0709458682620088E+01, -1.0709612225647817E+01, 2.1267942693611270E+00, 5.0538338615607357E+00, -5.1181806038291624E+00, 1.7354571480597607E+00, 
2.7950229043765212E-01, -4.2939195443229039E-01, 1.1763610122666045E-01}; + constexpr FLT c14[] = {-1.8020499668410097E-02, 3.6694580839244442E-02, -1.1331134794057113E-01, 1.3971228975695787E-01, 8.1734604430561311E-02, -5.4464516301492671E-01, 7.9646109231150031E-01, -3.9024149191964747E-01, -3.9020325223035940E-01, 7.9644613359376126E-01, -5.4458780348100966E-01, 8.1735287282159258E-02, 1.3971280189565236E-01, -1.1331156133169454E-01, 3.6694584840328316E-02, -1.8020499652780946E-02}; + constexpr FLT c15[] = {1.4589783473923206E-02, -7.8885429103313365E-04, -4.4856766056362643E-03, 1.8116483572926646E-02, -3.0574294775135746E-02, 1.8967420978453962E-02, 2.4666137072064612E-02, -6.8017929307730221E-02, 6.7615302446897660E-02, -2.4691085605299815E-02, -1.9038882601578176E-02, 3.0552398456072709E-02, -1.8118938614760938E-02, 4.4854443719491892E-03, 7.8884755210919307E-04, -1.4589783498222219E-02}; + constexpr FLT c16[] = {-1.0467998078291846E-02, -3.2140608463710125E-04, 5.2959666930518063E-04, -1.5769844275261027E-04, -1.4331371817542763E-03, 3.7100687637655694E-03, -3.8742310984482158E-03, 1.6810223071268796E-03, 1.6547563335702548E-03, -3.9924279794162345E-03, 3.6969357769948610E-03, -1.4380620517984166E-03, -1.5934006609813836E-04, 5.2953895598459668E-04, -3.2140848935911386E-04, -1.0467998075160606E-02}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); + } else + printf("width not implemented!\n"); From db0457ab75d39e6d65c717c82cbc971440c83bad Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 12:00:09 -0400 Subject: [PATCH 16/68] restoring .m from master --- devel/gen_all_horner_C_code.m | 26 +++++++------------------- devel/gen_ker_horner_loop_C_code.m | 4 ++-- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/devel/gen_all_horner_C_code.m 
b/devel/gen_all_horner_C_code.m index 754d91d61..009e05ea4 100644 --- a/devel/gen_all_horner_C_code.m +++ b/devel/gen_all_horner_C_code.m @@ -10,26 +10,14 @@ clear opts = struct(); -ws = 2:16; -upsampfac = 1.25; % sigma (upsampling): either 2 (default) or low (eg 5/4). -opts.wpad = false; % pad kernel eval to multiple of 4 +for upsampfac = [2.0, 1.25]; % sigma: either 2 (default) or low (eg 5/4) + fprintf('upsampfac = %g...\n',upsampfac) + + ws = 2:16; + opts.wpad = true; % pad kernel eval to multiple of 4 -if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop.inc','w'); -else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc','w'); -end -fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n')); -fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n')); -for j=1:numel(ws) - w = ws(j) - if upsampfac==2 % hardwire the betas for this default case - betaoverws = [2.20 2.26 2.38 2.30]; % matches setup_spreader - beta = betaoverws(min(4,w-1)) * w; % uses last entry for w>=5 - d = w + 2 + (w<=8); % between 2-3 more degree than w - else % use formulae, must match params in setup_spreader... 
- gamma=0.97; % safety factor - betaoverws = gamma*pi*(1-1/(2*upsampfac)); % from cutoff freq formula - beta = betaoverws * w; - d = w + 1 + (w<=8); % less, since beta smaller, smoother + if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop_constexpr.c','w'); + else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.c','w'); end fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n')); fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n')); diff --git a/devel/gen_ker_horner_loop_C_code.m b/devel/gen_ker_horner_loop_C_code.m index d1a635536..e2dd1b75a 100644 --- a/devel/gen_ker_horner_loop_C_code.m +++ b/devel/gen_ker_horner_loop_C_code.m @@ -37,8 +37,8 @@ else width = w; end -for n=1:d % loop over poly coeff powers - s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1)); +for n=1:d+1 % loop over poly coeff powers + s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1)); for i=2:width % loop over segments s = sprintf('%s, %.16E', s, C(n,i)); end From d0ce11e718f257a4bf1bef94ff014f7a64b8e323 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 12:11:05 -0400 Subject: [PATCH 17/68] updated hook --- .pre-commit-config.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3b839e6f5..ac25e3f63 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,12 @@ repos: - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v18.1.6' + rev: 'v18.1.8' hooks: - id: clang-format + types_or: [c++, c, cuda] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace From 798717d9a6f97f4c5f0c6904de89b10d06e3fa61 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 12:44:50 -0400 Subject: [PATCH 18/68] updated coefficients --- devel/gen_all_horner_C_code.m | 
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devel/gen_all_horner_C_code.m b/devel/gen_all_horner_C_code.m index baf6590cf..51aa4e4e1 100644 --- a/devel/gen_all_horner_C_code.m +++ b/devel/gen_all_horner_C_code.m @@ -16,8 +16,8 @@ ws = 2:16; opts.wpad = false; % pad kernel eval to multiple of 4 - if upsampfac==2, fid = fopen('../inclue/cuda/contrib/ker_horner_allw_loop_constexpr.inc','w'); - else, fid = fopen('../inclue/cuda/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w'); + if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w'); + else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w'); end fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n')); fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n')); From 282baf50ea3a4d000e1cc3ed40d940424d54639a Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 12:46:37 -0400 Subject: [PATCH 19/68] new coeffs --- .../ker_horner_allw_loop_constexpr.inc | 205 +++++++++++++++++ ...owupsampfac_horner_allw_loop_constexpr.inc | 171 +++++++++++++++ src/ker_horner_allw_loop.inc | 207 ++++++++++++++++++ 3 files changed, 583 insertions(+) create mode 100644 include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc create mode 100644 include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc create mode 100644 src/ker_horner_allw_loop.inc diff --git a/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc b/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc new file mode 100644 index 000000000..1f4c59e2a --- /dev/null +++ b/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc @@ -0,0 +1,205 @@ +// Code generated by gen_all_horner_C_code.m in finufft/devel +// Authors: Alex Barnett & Ludvig af Klinteberg. +// (C) The Simons Foundation, Inc. 
+ if (w==2) { + constexpr FLT c0[] = {5.5428559551548406E-01, 5.5428559551548395E-01}; + constexpr FLT c1[] = {7.0481840008800778E-01, -7.0481840008800811E-01}; + constexpr FLT c2[] = {-2.2584311526143548E-02, -2.2584311526143607E-02}; + constexpr FLT c3[] = {-2.5024197515954211E-01, 2.5024197515954211E-01}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); + } else if (w==3) { + constexpr FLT c0[] = {1.7787237246937579E-01, 1.0000000000000013E+00, 1.7787237247678464E-01}; + constexpr FLT c1[] = {3.5966530797581003E-01, -4.2425842671825248E-17, -3.5966530796781060E-01}; + constexpr FLT c2[] = {2.0160576446392536E-01, -3.7666666666667331E-01, 2.0160576447145470E-01}; + constexpr FLT c3[] = {-1.7450587318669351E-02, 2.2939218956436377E-17, 1.7450587325767743E-02}; + constexpr FLT c4[] = {-4.2902993854032963E-02, 6.0475925925925586E-02, -4.2902993846219546E-02}; + constexpr FLT c5[] = {-4.5057857403453909E-03, 6.6232851036457955E-18, 4.5057857475245110E-03}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==4) { + constexpr FLT c0[] = {3.9828257752799377E-02, 7.3911656575585805E-01, 7.3911656575585805E-01, 3.9828257752799433E-02}; + constexpr FLT c1[] = {1.0749328817387334E-01, 4.5419700247912287E-01, -4.5419700247912287E-01, -1.0749328817387330E-01}; + constexpr FLT c2[] = {1.0408888748149289E-01, -1.0268333881994456E-01, -1.0268333881994476E-01, 1.0408888748149285E-01}; + constexpr FLT c3[] = {3.7516840869185789E-02, -1.0412335657155622E-01, 1.0412335657155641E-01, -3.7516840869185733E-02}; + constexpr FLT c4[] = {-3.5432868834529888E-03, 2.8903049344237370E-03, 2.8903049344238003E-03, -3.5432868834529676E-03}; + constexpr FLT c5[] = {-5.7512181801490673E-03, 1.0945950376831730E-02, -1.0945950376831654E-02, 5.7512181801490829E-03}; + constexpr FLT c6[] = {-7.3657365672905430E-04, 3.7144674885200340E-04, 3.7144674885207063E-04, -7.3657365672907728E-04}; + for (int i=0; 
i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + } else if (w==5) { + constexpr FLT c0[] = {1.0051451410391413E-02, 3.8286382489474308E-01, 1.0000000000000009E+00, 3.8286382489474252E-01, 1.0051451410391420E-02}; + constexpr FLT c1[] = {3.0826052021380446E-02, 3.8431958613457984E-01, -4.7102147373384796E-32, -3.8431958613457951E-01, -3.0826052021380446E-02}; + constexpr FLT c2[] = {3.6562231959204314E-02, 7.8509612097392906E-02, -2.3000000000000059E-01, 7.8509612097392906E-02, 3.6562231959204300E-02}; + constexpr FLT c3[] = {2.0250135419918262E-02, -3.9381037339048602E-02, 1.0193845429304082E-16, 3.9381037339048686E-02, -2.0250135419918248E-02}; + constexpr FLT c4[] = {4.0593041193018580E-03, -1.6067481167759540E-02, 2.4150000000000074E-02, -1.6067481167759530E-02, 4.0593041193018597E-03}; + constexpr FLT c5[] = {-9.2488937959280210E-04, 1.2476700479675494E-03, 1.0406437805617128E-16, -1.2476700479676270E-03, 9.2488937959280405E-04}; + constexpr FLT c6[] = {-5.6059657038176136E-04, 1.2116190166774866E-03, -1.5448333333332675E-03, 1.2116190166775878E-03, -5.6059657038176342E-04}; + constexpr FLT c7[] = {-3.4201716508558499E-05, 2.3137115416428607E-05, 3.6450914717742488E-17, -2.3137115416288715E-05, 3.4201716508574924E-05}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==6) { + constexpr FLT c0[] = {2.0875119883113440E-03, 1.5741818314646622E-01, 8.2446837122968764E-01, 8.2446837122968819E-01, 1.5741818314646633E-01, 2.0875119883208737E-03}; + constexpr FLT c1[] = {7.2383827471879086E-03, 2.0903648995439439E-01, 3.2052935784357633E-01, -3.2052935784357606E-01, -2.0903648995439447E-01, -7.2383827471776260E-03}; + constexpr FLT c2[] = {1.0180085126333453E-02, 9.2337811484269047E-02, -1.0253741712233820E-01, -1.0253741712233828E-01, 9.2337811484268964E-02, 1.0180085126343144E-02}; + constexpr FLT c3[] = 
{7.3669955501269460E-03, 4.9102900025223507E-03, -5.1302324979469405E-02, 5.1302324979469550E-02, -4.9102900025223160E-03, -7.3669955501178214E-03}; + constexpr FLT c4[] = {2.7444270008043898E-03, -8.0004810696544734E-03, 5.2920367975573743E-03, 5.2920367975574090E-03, -8.0004810696544873E-03, 2.7444270008144425E-03}; + constexpr FLT c5[] = {3.2622379114949894E-04, -1.8514138516535197E-03, 3.8520985619445234E-03, -3.8520985619444454E-03, 1.8514138516535119E-03, -3.2622379114026425E-04}; + constexpr FLT c6[] = {-1.2239646122606432E-04, 2.2750660293442782E-04, -1.2702072030317145E-04, -1.2702072030306984E-04, 2.2750660293439860E-04, -1.2239646121695236E-04}; + constexpr FLT c7[] = {-4.6695893922776242E-05, 1.1717219021520763E-04, -1.8098268625859964E-04, 1.8098268625869589E-04, -1.1717219021517810E-04, 4.6695893931711504E-05}; + constexpr FLT c8[] = {-1.5875418082745247E-06, 7.2147850127730698E-07, -7.0930078293142108E-08, -7.0930078245872243E-08, 7.2147850127811706E-07, -1.5875417996312271E-06}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + } else if (w==7) { + constexpr FLT c0[] = {4.0677823488318067E-04, 5.5714997521829540E-02, 5.1113018541287825E-01, 1.0000000000000002E+00, 5.1113018541287869E-01, 5.5714997521829561E-02, 4.0677823488475981E-04}; + constexpr FLT c1[] = {1.5569364307494555E-03, 8.9228372765634056E-02, 3.5049603091348180E-01, -1.8840858949353919E-32, -3.5049603091348197E-01, -8.9228372765634029E-02, -1.5569364307477620E-03}; + constexpr FLT c2[] = {2.4904843753404838E-03, 5.4888936725282375E-02, 2.4759577399513382E-02, -1.6428571428571445E-01, 2.4759577399513264E-02, 5.4888936725282340E-02, 2.4904843753420954E-03}; + constexpr FLT c3[] = {2.1552691780265232E-03, 1.3627105791872422E-02, -3.3718114813591167E-02, 1.0435679823191637E-16, 3.3718114813591278E-02, -1.3627105791872396E-02, -2.1552691780250210E-03}; + constexpr FLT c4[] = {1.0735311014902868E-03, 
-7.2030895675484117E-04, -6.6760503000563741E-03, 1.2656705539358732E-02, -6.6760503000563680E-03, -7.2030895675483119E-04, 1.0735311014919520E-03}; + constexpr FLT c5[] = {2.8413019973530626E-04, -1.1175797418592351E-03, 1.3906361031252640E-03, 1.0099777883094147E-16, -1.3906361031252017E-03, 1.1175797418592505E-03, -2.8413019973377792E-04}; + constexpr FLT c6[] = {1.6363160465889005E-05, -1.5802085209242310E-04, 4.4431051893374396E-04, -6.0985626028865780E-04, 4.4431051893376408E-04, -1.5802085209243416E-04, 1.6363160467394339E-05}; + constexpr FLT c7[] = {-1.2513684117291295E-05, 2.9105578584781478E-05, -2.8835295309364819E-05, 6.9093005849597210E-17, 2.8835295309456306E-05, -2.9105578584752466E-05, 1.2513684118770622E-05}; + constexpr FLT c8[] = {-3.2859430043343403E-06, 9.3570096164232078E-06, -1.7015821249906871E-05, 2.0688046128660197E-05, -1.7015821249876886E-05, 9.3570096164290557E-06, -3.2859430029058764E-06}; + constexpr FLT c9[] = {-1.5030958477935016E-08, -9.3540219413709317E-08, 1.3079704875560537E-07, 3.0755088144886539E-17, -1.3079704870024676E-07, 9.3540219430316894E-08, 1.5030959705830809E-08}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==8) { + constexpr FLT c0[] = {7.5442178667264049E-05, 1.7659090182402852E-02, 2.6112828482312650E-01, 8.6561421087578294E-01, 8.6561421087578294E-01, 2.6112828482312650E-01, 1.7659090182402856E-02, 7.5442178667263913E-05}; + constexpr FLT c1[] = {3.1361556564941527E-04, 3.2518751351035657E-02, 2.4295266212395961E-01, 2.5083142126627195E-01, -2.5083142126627200E-01, -2.4295266212395961E-01, -3.2518751351035664E-02, -3.1361556564941506E-04}; + constexpr FLT c2[] = {5.5627094085228170E-04, 2.4604803324737457E-02, 6.5902977410162822E-02, -9.1064379250067565E-02, -9.1064379250067648E-02, 6.5902977410162836E-02, 2.4604803324737447E-02, 5.5627094085228149E-04}; + constexpr FLT c3[] = 
{5.5053208919074741E-04, 9.2359485489686977E-03, -6.2169545154249764E-03, -3.1386277864020387E-02, 3.1386277864020692E-02, 6.2169545154250301E-03, -9.2359485489686925E-03, -5.5053208919074741E-04}; + constexpr FLT c4[] = {3.3122072653963820E-04, 1.3353118718124376E-03, -5.9878504390516807E-03, 4.3217905833729843E-03, 4.3217905833729184E-03, -5.9878504390516564E-03, 1.3353118718124411E-03, 3.3122072653963842E-04}; + constexpr FLT c5[] = {1.2112223749399388E-04, -2.3174709024353528E-04, -5.1773322458159945E-04, 1.8691284471382664E-03, -1.8691284471382276E-03, 5.1773322458165388E-04, 2.3174709024353332E-04, -1.2112223749399391E-04}; + constexpr FLT c6[] = {2.3288943339077962E-05, -1.1810885265513022E-04, 2.1380000655379686E-04, -1.1905274322668279E-04, -1.1905274322667877E-04, 2.1380000655378596E-04, -1.1810885265513386E-04, 2.3288943339077766E-05}; + constexpr FLT c7[] = {8.7290223704935849E-08, -9.9551635569432461E-06, 3.9042123573714734E-05, -7.0647330846704962E-05, 7.0647330846826175E-05, -3.9042123573667747E-05, 9.9551635569490195E-06, -8.7290223704824623E-08}; + constexpr FLT c8[] = {-1.0444417486661213E-06, 2.8837147790326586E-06, -3.9445588398358951E-06, 1.9505656879624058E-06, 1.9505656880227840E-06, -3.9445588398203690E-06, 2.8837147790369691E-06, -1.0444417486660073E-06}; + constexpr FLT c9[] = {-1.9601350641688945E-07, 6.2981383505868899E-07, -1.3252363384761618E-06, 1.9071649677058813E-06, -1.9071649677363285E-06, 1.3252363385149127E-06, -6.2981383505419114E-07, 1.9601350641697053E-07}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==9) { + constexpr FLT c0[] = {1.3445576990655693E-05, 5.1377966678943553E-03, 1.1569392196071671E-01, 5.9595989228910695E-01, 1.0000000000000004E+00, 5.9595989228910784E-01, 1.1569392196071673E-01, 5.1377966678943874E-03, 1.3445576990655681E-05}; + constexpr FLT c1[] = {6.0003223623206657E-05, 
1.0569385595664990E-02, 1.3202059711663530E-01, 3.1241329121161582E-01, -8.4851685343650422E-17, -3.1241329121161615E-01, -1.3202059711663522E-01, -1.0569385595665032E-02, -6.0003223623206596E-05}; + constexpr FLT c2[] = {1.1601811379064824E-04, 9.2861699099147151E-03, 5.4760895870332324E-02, -2.7420112488894219E-04, -1.2777777777777805E-01, -2.7420112488935430E-04, 5.4760895870332296E-02, 9.2861699099147359E-03, 1.1601811379064817E-04}; + constexpr FLT c3[] = {1.2783089927061688E-04, 4.4048543606096807E-03, 6.4505427512762566E-03, -2.6627297241817574E-02, 1.0570032264240285E-16, 2.6627297241817935E-02, -6.4505427512762245E-03, -4.4048543606096877E-03, -1.2783089927061688E-04}; + constexpr FLT c4[] = {8.8459828362140127E-05, 1.1147546008569559E-03, -2.1200589329645782E-03, -2.9677441441083273E-03, 7.7692043895744413E-03, -2.9677441441080211E-03, -2.1200589329645678E-03, 1.1147546008569583E-03, 8.8459828362140168E-05}; + constexpr FLT c5[] = {3.9567294647305465E-05, 8.1817980646548672E-05, -7.2116754318327786E-04, 1.0390038161997466E-03, 1.3960675422467541E-16, -1.0390038161998867E-03, 7.2116754318328556E-04, -8.1817980646550122E-05, -3.9567294647305431E-05}; + constexpr FLT c6[] = {1.1032857092605887E-05, -3.4254477931955853E-05, -1.3557143976035256E-05, 1.8667778536557664E-04, -2.9974999576614188E-04, 1.8667778536546106E-04, -1.3557143976042615E-05, -3.4254477931959885E-05, 1.1032857092605841E-05}; + constexpr FLT c7[] = {1.5345430093717796E-06, -9.9308189188274098E-06, 2.3762810604639151E-05, -2.4017602201954516E-05, 1.1627785359675844E-17, 2.4017602202115669E-05, -2.3762810604628780E-05, 9.9308189188319669E-06, -1.5345430093718216E-06}; + constexpr FLT c8[] = {-8.1737159283255726E-08, -4.1540916378247392E-07, 2.6668107554223020E-06, -6.3261434127908313E-06, 8.2578681449311880E-06, -6.3261434126076934E-06, 2.6668107554440373E-06, -4.1540916378676467E-07, -8.1737159283249333E-08}; + constexpr FLT c9[] = {-7.3256982980608342E-08, 2.3321978963880019E-07, 
-4.0030411105333760E-07, 3.4388260968054864E-07, 6.5677795522570459E-17, -3.4388260990751890E-07, 4.0030411105333760E-07, -2.3321978963499429E-07, 7.3256982980640781E-08}; + constexpr FLT c10[] = {-1.0121400696579195E-08, 3.6191328862414928E-08, -8.7258577118961372E-08, 1.4622014477867198E-07, -1.7333902174790525E-07, 1.4622014483401952E-07, -8.7258577100106683E-08, 3.6191328859901120E-08, -1.0121400696606260E-08}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if (w==10) { + constexpr FLT c0[] = {2.3186292807626266E-06, 1.3952040327729876E-03, 4.5894237568906843E-02, 3.4666431215091636E-01, 8.9110862394332080E-01, 8.9110862394332024E-01, 3.4666431215091614E-01, 4.5894237568906843E-02, 1.3952040327729804E-03, 2.3186292807626329E-06}; + constexpr FLT c1[] = {1.1010978063160391E-05, 3.1454190365986022E-03, 6.0943215953720313E-02, 2.5074802988370321E-01, 2.0598750885032702E-01, -2.0598750885032710E-01, -2.5074802988370315E-01, -6.0943215953720306E-02, -3.1454190365985909E-03, -1.1010978063160380E-05}; + constexpr FLT c2[] = {2.2925449299630732E-05, 3.1050615653861980E-03, 3.2337657329423494E-02, 4.4760550762170469E-02, -8.0226193254406428E-02, -8.0226193254406289E-02, 4.4760550762170441E-02, 3.2337657329423480E-02, 3.1050615653861868E-03, 2.2925449299630681E-05}; + constexpr FLT c3[] = {2.7622345748507540E-05, 1.7317590416004974E-03, 7.6620063086756569E-03, -9.8393115612840278E-03, -2.1163068654269049E-02, 2.1163068654269510E-02, 9.8393115612841128E-03, -7.6620063086756491E-03, -1.7317590416004913E-03, -2.7622345748507479E-05}; + constexpr FLT c4[] = {2.1363614860997117E-05, 5.7553475552091617E-04, 1.4813144535930287E-04, -4.1113061120761924E-03, 3.3662735809591683E-03, 3.3662735809590794E-03, -4.1113061120762826E-03, 1.4813144535930759E-04, 5.7553475552091368E-04, 2.1363614860997080E-05}; + constexpr FLT c5[] = {1.1063475580065299E-05, 
1.0180053030149723E-04, -3.4137441280837177E-04, -4.9828659222651745E-05, 1.0442648308817235E-03, -1.0442648308817467E-03, 4.9828659222713965E-05, 3.4137441280837177E-04, -1.0180053030149541E-04, -1.1063475580065281E-05}; + constexpr FLT c6[] = {3.8359011440648869E-06, 1.3049698816919587E-06, -6.3791463619208982E-05, 1.4528730872072194E-04, -8.6630472952355992E-05, -8.6630472952398913E-05, 1.4528730872073633E-04, -6.3791463619214471E-05, 1.3049698816901833E-06, 3.8359011440648767E-06}; + constexpr FLT c7[] = {8.3366418668164326E-07, -3.5785601754616355E-06, 2.4539930904858821E-06, 1.2754336575782058E-05, -3.3000414536039571E-05, 3.3000414536273711E-05, -1.2754336575693992E-05, -2.4539930904800897E-06, 3.5785601754627781E-06, -8.3366418668163871E-07}; + constexpr FLT c8[] = {8.0572098823818712E-08, -6.8352224328357488E-07, 2.0695541423376112E-06, -2.9709579576770532E-06, 1.5005770225996294E-06, 1.5005770226481292E-06, -2.9709579578116679E-06, 2.0695541423438809E-06, -6.8352224328404986E-07, 8.0572098823810798E-08}; + constexpr FLT c9[] = {-1.0412910456843575E-08, -3.6228831474008107E-09, 1.3932530225640674E-07, -4.5071262434444286E-07, 7.5149884418348562E-07, -7.5149884428313110E-07, 4.5071262441364111E-07, -1.3932530225017888E-07, 3.6228831478332996E-09, 1.0412910456861821E-08}; + constexpr FLT c10[] = {-4.4291858216944146E-09, 1.5904364893350153E-08, -3.2603275106346107E-08, 3.8190045632066571E-08, -1.7631718176528265E-08, -1.7631718292171639E-08, 3.8190045621381707E-08, -3.2603275098803994E-08, 1.5904364893978648E-08, -4.4291858217073890E-09}; + constexpr FLT c11[] = {-4.4040059170580565E-10, 1.7857872825180656E-09, -4.9203237617335969E-09, 9.5125262125165431E-09, -1.3157194779492521E-08, 1.3157194812996001E-08, -9.5125262191888681E-09, 4.9203237596041585E-09, -1.7857872834763311E-09, 4.4040059170802652E-10}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + 
z*(c11[i]))))))))))); + } else if (w==11) { + constexpr FLT c0[] = {3.8884809238313434E-07, 3.5785567372179951E-04, 1.6654951019551330E-02, 1.7692785324424570E-01, 6.5593328211813162E-01, 9.9999999999999978E-01, 6.5593328211813129E-01, 1.7692785324424565E-01, 1.6654951019551330E-02, 3.5785567372179962E-04, 3.8884809238312539E-07}; + constexpr FLT c1[] = {1.9516358260453364E-06, 8.7214421096705593E-04, 2.4929466432368100E-02, 1.5885079249667189E-01, 2.7894884556454935E-01, 9.4204294746769595E-33, -2.7894884556454941E-01, -1.5885079249667189E-01, -2.4929466432368097E-02, -8.7214421096705604E-04, -1.9516358260453169E-06}; + constexpr FLT c2[] = {4.3353827605930511E-06, 9.4705645354715550E-04, 1.5700144896729017E-02, 4.8428271550326758E-02, -1.2807080799297165E-02, -1.0454545454545448E-01, -1.2807080799297061E-02, 4.8428271550326821E-02, 1.5700144896729006E-02, 9.4705645354715518E-04, 4.3353827605930215E-06}; + constexpr FLT c3[] = {5.6395387871289846E-06, 5.9760549110825473E-04, 5.0911332059142295E-03, 1.6690038662948304E-03, -2.1030028251697912E-02, 1.4335617874817167E-16, 2.1030028251698141E-02, -1.6690038662947660E-03, -5.0911332059142200E-03, -5.9760549110825429E-04, -5.6395387871289508E-06}; + constexpr FLT c4[] = {4.7836299264887200E-06, 2.3732554180006408E-04, 7.1846854433598795E-04, -2.2660086673713248E-03, -1.3190061226035158E-03, 5.2488730277989188E-03, -1.3190061226033569E-03, -2.2660086673713374E-03, 7.1846854433598557E-04, 2.3732554180006421E-04, 4.7836299264886963E-06}; + constexpr FLT c5[] = {2.7801202330030064E-06, 5.8401836435976300E-05, -5.7255962675850168E-05, -4.1058481683291448E-04, 7.4543249761827859E-04, 6.7099534430837577E-17, -7.4543249761823186E-04, 4.1058481683291448E-04, 5.7255962675853089E-05, -5.8401836435976178E-05, -2.7801202330029924E-06}; + constexpr FLT c6[] = {1.1248609988572041E-06, 7.1593996360419040E-06, -3.7923443960739119E-05, 2.8219312687371359E-05, 8.5797383067823588E-05, -1.6875309167105302E-04, 8.5797383067779691E-05, 
2.8219312687392853E-05, -3.7923443960740034E-05, 7.1593996360418057E-06, 1.1248609988571978E-06}; + constexpr FLT c7[] = {3.1074712008817516E-07, -3.7942806006679305E-07, -4.2327710785708026E-06, 1.4518421536643064E-05, -1.6373413879605298E-05, 3.0222646636983358E-17, 1.6373413879621934E-05, -1.4518421536591986E-05, 4.2327710785753580E-06, 3.7942806006705484E-07, -3.1074712008817235E-07}; + constexpr FLT c8[] = {5.3160526822194444E-08, -2.9438470061321741E-07, 4.4816653817789122E-07, 4.9835853873945607E-07, -2.6602444110833864E-06, 3.9090815375281113E-06, -2.6602444110225165E-06, 4.9835853874269618E-07, 4.4816653818193273E-07, -2.9438470061323123E-07, 5.3160526822193583E-08}; + constexpr FLT c9[] = {3.1778958300854393E-09, -3.9044067083483707E-08, 1.4726158788365547E-07, -2.7451209287062293E-07, 2.4544112217999958E-07, 8.6199548859978872E-18, -2.4544112207758621E-07, 2.7451209285678326E-07, -1.4726158788296347E-07, 3.9044067083624268E-08, -3.1778958300829052E-09}; + constexpr FLT c10[] = {-8.6163117991617490E-10, 1.2292710054271969E-09, 4.9928263052430922E-09, -2.5746199362556884E-08, 5.5054682151312924E-08, -6.9606951358406722E-08, 5.5054682230504105E-08, -2.5746199365699604E-08, 4.9928263093284604E-09, 1.2292710054468060E-09, -8.6163117991862728E-10}; + constexpr FLT c11[] = {-2.3293080872726303E-10, 9.3461130390718653E-10, -2.2220140857286656E-09, 3.2420144232604506E-09, -2.5573586459741160E-09, -3.4362247560151687E-17, 2.5573586170134590E-09, -3.2420144222311963E-09, 2.2220140843090244E-09, -9.3461130382733279E-10, 2.3293080872885788E-10}; + constexpr FLT c12[] = {-1.6776727231079557E-11, 7.5440974150049303E-11, -2.3911386677196792E-10, 5.3207180787495740E-10, -8.5057641018270776E-10, 9.9272876082686339E-10, -8.5057644693357476E-10, 5.3207181195839291E-10, -2.3911386485786361E-10, 7.5440974126123504E-11, -1.6776727231328710E-11}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + 
z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==12) { + constexpr FLT c0[] = {6.3667715563015689E-08, 8.7461142088576888E-05, 5.6146669497086589E-03, 8.1271316412301370E-02, 4.1627261402765736E-01, 9.0846375182673755E-01, 9.0846375182673755E-01, 4.1627261402765736E-01, 8.1271316412301550E-02, 5.6146669497086719E-03, 8.7461142088576929E-05, 6.3667715563034801E-08}; + constexpr FLT c1[] = {3.3587389488258588E-07, 2.2809471090022899E-04, 9.2744480587562007E-03, 8.5676487647659991E-02, 2.4720659158040625E-01, 1.7472997738462001E-01, -1.7472997738461990E-01, -2.4720659158040617E-01, -8.5676487647660143E-02, -9.2744480587562180E-03, -2.2809471090022899E-04, -3.3587389488256608E-07}; + constexpr FLT c2[] = {7.9035220764954472E-07, 2.6846594761214740E-04, 6.6557324960729147E-03, 3.4792641812076718E-02, 2.9454899103693762E-02, -7.1172529707069221E-02, -7.1172529707069207E-02, 2.9454899103693671E-02, 3.4792641812076690E-02, 6.6557324960729242E-03, 2.6846594761214740E-04, 7.9035220764956886E-07}; + constexpr FLT c3[] = {1.0993606197695965E-06, 1.8716155179384050E-04, 2.6329045000561364E-03, 5.3754303637600113E-03, -1.0591878410592502E-02, -1.5228395084945664E-02, 1.5228395084945803E-02, 1.0591878410592646E-02, -5.3754303637599376E-03, -2.6329045000561364E-03, -1.8716155179384044E-04, -1.0993606197695836E-06}; + constexpr FLT c4[] = {1.0091198513153346E-06, 8.4812954286468477E-05, 5.7431140218944460E-04, -5.0274672420766203E-04, -2.8008958990917627E-03, 2.6435090762445433E-03, 2.6435090762445819E-03, -2.8008958990918187E-03, -5.0274672420767580E-04, 5.7431140218944276E-04, 8.4812954286468423E-05, 1.0091198513153598E-06}; + constexpr FLT c5[] = {6.4507244019416584E-07, 2.5481132674301279E-05, 4.2795619387511420E-05, -3.0197159708156643E-04, 1.1080610219049720E-04, 6.4144454802694492E-04, -6.4144454802681275E-04, -1.1080610219045053E-04, 3.0197159708157808E-04, -4.2795619387511908E-05, -2.5481132674301286E-05, -6.4507244019414964E-07}; + constexpr FLT 
c6[] = {2.9426545129495891E-07, 4.7724106401925034E-06, -1.1001642128368358E-05, -2.6869692251292103E-05, 9.4483235217708846E-05, -6.1678458203322752E-05, -6.1678458203283029E-05, 9.4483235217638725E-05, -2.6869692251319154E-05, -1.1001642128368348E-05, 4.7724106401924525E-06, 2.9426545129497845E-07}; + constexpr FLT c7[] = {9.5799843879057487E-08, 3.7784160107136394E-07, -3.2256313018476217E-06, 5.0144058082843800E-06, 3.4886031174309006E-06, -1.7411974954245794E-05, 1.7411974954244114E-05, -3.4886031173677615E-06, -5.0144058082412084E-06, 3.2256313018490718E-06, -3.7784160107127161E-07, -9.5799843879039593E-08}; + constexpr FLT c8[] = {2.1473864761677802E-08, -5.7414008446850441E-08, -2.0134799316446491E-07, 1.1145247706131597E-06, -1.8840465966107854E-06, 1.0067804561094662E-06, 1.0067804560969447E-06, -1.8840465965985945E-06, 1.1145247706194121E-06, -2.0134799316567892E-07, -5.7414008446903526E-08, 2.1473864761695718E-08}; + constexpr FLT c9[] = {2.8867786924320735E-09, -2.0015791402048098E-08, 4.5306507660172584E-08, -7.8859059608423767E-09, -1.5755151471717741E-07, 3.4270221893522085E-07, -3.4270221891584534E-07, 1.5755151474485673E-07, 7.8859059608423767E-09, -4.5306507656885666E-08, 2.0015791402102159E-08, -2.8867786924173336E-09}; + constexpr FLT c10[] = {6.9986758892026879E-11, -1.8486004428526375E-09, 8.7658205612213605E-09, -2.0364661368255434E-08, 2.5396405431717686E-08, -1.2044441164754235E-08, -1.2044441145898965E-08, 2.5396405393379069E-08, -2.0364661337458944E-08, 8.7658205594930229E-09, -1.8486004428624741E-09, 6.9986758906941889E-11}; + constexpr FLT c11[] = {-5.6296594747629561E-11, 1.4066781276164117E-10, 4.6947620156299098E-11, -1.1526063766721083E-09, 3.3027593515457814E-09, -5.2174001597719162E-09, 5.2174001336505757E-09, -3.3027593563725673E-09, 1.1526063504088099E-09, -4.6947618665684182E-11, -1.4066781273945818E-10, 5.6296594761077256E-11}; + constexpr FLT c12[] = {-1.0870401168253040E-11, 4.8044744351982426E-11, -1.3004175788815863E-10, 
2.2570502267192305E-10, -2.4006684875388499E-10, 1.0598000131166063E-10, 1.0597991964307358E-10, -2.4006682833673746E-10, 2.2570504206821193E-10, -1.3004176149306233E-10, 4.8044744304130286E-11, -1.0870401156071839E-11}; + constexpr FLT c13[] = {-4.7539080498592749E-13, 2.6787995976616703E-12, -1.0000145739993567E-11, 2.5777400861531429E-11, -4.7463672955972831E-11, 6.4012227921839136E-11, -6.4012266007267373E-11, 4.7463669782187146E-11, -2.5777397687745743E-11, 1.0000149112140858E-11, -2.6787995744161696E-12, 4.7539081133001201E-13}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + } else if (w==13) { + constexpr FLT c0[] = {1.0208956054983696E-08, 2.0506572462261995E-05, 1.7784497194617906E-03, 3.4214490279693019E-02, 2.3443634373410047E-01, 7.0049708882252804E-01, 9.9999999999999956E-01, 7.0049708882252670E-01, 2.3443634373410041E-01, 3.4214490279692922E-02, 1.7784497194617906E-03, 2.0506572462261785E-05, 1.0208956054983676E-08}; + constexpr FLT c1[] = {5.6353468219321995E-08, 5.6780128053894686E-05, 3.1934841481628326E-03, 4.0941461360716927E-02, 1.7436810648693357E-01, 2.5085467225681696E-01, -6.3638764007737755E-17, -2.5085467225681662E-01, -1.7436810648693341E-01, -4.0941461360716816E-02, -3.1934841481628326E-03, -5.6780128053894232E-05, -5.6353468219321988E-08}; + constexpr FLT c2[] = {1.3966266158866427E-07, 7.1655019336418755E-05, 2.5459504018621182E-03, 2.0160236969440644E-02, 4.0770064165298429E-02, -1.9317276988534509E-02, -8.8461538461538661E-02, -1.9317276988534381E-02, 4.0770064165298395E-02, 2.0160236969440602E-02, 2.5459504018621160E-03, 7.1655019336418200E-05, 1.3966266158866422E-07}; + constexpr FLT c3[] = {2.0618605552701903E-07, 5.4306747658367697E-05, 1.1637911071900936E-03, 4.7784706844645319E-03, -1.2004184173788884E-03, -1.6862510515565966E-02, 1.4394808111083350E-16, 
1.6862510515566146E-02, 1.2004184173788636E-03, -4.7784706844645379E-03, -1.1637911071900920E-03, -5.4306747658367331E-05, -2.0618605552701909E-07}; + constexpr FLT c4[] = {2.0277547837406105E-07, 2.7328509487415503E-05, 3.2236608098850310E-04, 3.0859705461356495E-04, -2.0254394973524947E-03, -5.2398574644553877E-04, 3.7818616294949463E-03, -5.2398574644547762E-04, -2.0254394973524895E-03, 3.0859705461357378E-04, 3.2236608098850327E-04, 2.7328509487415384E-05, 2.0277547837406108E-07}; + constexpr FLT c5[] = {1.4058372037094490E-07, 9.4685595066536085E-06, 4.8682874512158502E-05, -1.1575111217134651E-04, -2.1811605515759046E-04, 5.4056763477041119E-04, 1.1213866287069097E-16, -5.4056763477029453E-04, 2.1811605515769156E-04, 1.1575111217135234E-04, -4.8682874512158861E-05, -9.4685595066535949E-06, -1.4058372037094498E-07}; + constexpr FLT c6[] = {7.0755520230584385E-08, 2.2298625886400277E-06, 7.8375383352022143E-07, -2.8394470622676381E-05, 3.5771256766257562E-05, 4.1631950912211130E-05, -1.0418619302467684E-04, 4.1631950912333557E-05, 3.5771256766183768E-05, -2.8394470622671916E-05, 7.8375383351933331E-07, 2.2298625886400294E-06, 7.0755520230584346E-08}; + constexpr FLT c7[] = {2.6111186487625245E-08, 3.2044561720738826E-07, -1.2220373462313589E-06, -8.5793794342228941E-07, 8.3299507234112700E-06, -1.0956754351178954E-05, 9.4610283796409485E-17, 1.0956754351115859E-05, -8.3299507234215327E-06, 8.5793794342144989E-07, 1.2220373462321896E-06, -3.2044561720741346E-07, -2.6111186487625302E-08}; + constexpr FLT c8[] = {6.9838095920570498E-09, 1.2796250155222958E-08, -2.1971713837900942E-07, 5.2791981730307194E-07, -1.4622692107334488E-07, -1.2222183756556175E-06, 2.0809248310569844E-06, -1.2222183756925741E-06, -1.4622692099063203E-07, 5.2791981730006307E-07, -2.1971713837856465E-07, 1.2796250155283016E-08, 6.9838095920570937E-09}; + constexpr FLT c9[] = {1.2845897306280646E-09, -5.2304801922802769E-09, -5.0548716982175665E-09, 6.7539942924545603E-08, 
-1.6027276234256162E-07, 1.5655092165632365E-07, 4.6828140259346451E-17, -1.5655092173659360E-07, 1.6027276234809749E-07, -6.7539942912781904E-08, 5.0548716984338105E-09, 5.2304801922379145E-09, -1.2845897306280857E-09}; + constexpr FLT c10[] = {1.3345700642131601E-10, -1.1551704392349950E-09, 3.4412362345673782E-09, -3.2850871078054311E-09, -6.1855158542452699E-09, 2.3119925642302808E-08, -3.2145944181567604E-08, 2.3119926027259106E-08, -6.1855159240088862E-09, -3.2850871247748739E-09, 3.4412362345280933E-09, -1.1551704391858975E-09, 1.3345700642134581E-10}; + constexpr FLT c11[] = {-1.9694481417663767E-12, -7.0630732018717419E-11, 4.4161967766895751E-10, -1.2581280884757252E-09, 2.0087583285653241E-09, -1.6557203488425082E-09, 5.7014219382328511E-17, 1.6557200410648860E-09, -2.0087583339599462E-09, 1.2581281082796833E-09, -4.4161967789965090E-10, 7.0630731978790794E-11, 1.9694481417229703E-12}; + constexpr FLT c12[] = {-3.1122514901291979E-12, 1.0235548893351873E-11, -1.0076717787418374E-11, -3.6278872085836478E-11, 1.6235812713334426E-10, -3.2356766327511469E-10, 4.0014573853281197E-10, -3.2356772044312440E-10, 1.6235817511363862E-10, -3.6278891226911122E-11, -1.0076717627909611E-11, 1.0235548938213992E-11, -3.1122514900941893E-12}; + constexpr FLT c13[] = {-4.4521627553052389E-13, 2.1830423195977186E-12, -6.6494700502871459E-12, 1.3364548102385267E-11, -1.7572530897780217E-11, 1.3087527392509343E-11, -1.4854086432767967E-17, -1.3087613084722882E-11, 1.7572508681280409E-11, -1.3364552466340585E-11, 6.6494701742631489E-12, -2.1830423513665695E-12, 4.4521627553052389E-13}; + constexpr FLT c14[] = {-1.1331825591762625E-14, 7.5442537823437382E-14, -3.5473113067901070E-13, 1.0827924393926043E-12, -2.3053993601726267E-12, 3.5752731472827676E-12, -4.1288118242378826E-12, 3.5755029357484062E-12, -2.3054273074184593E-12, 1.0827837446939142E-12, -3.5473109186339628E-13, 7.5442574213081941E-14, -1.1331825564518091E-14}; + for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] 
+ z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); + } else if (w==14) { + constexpr FLT c0[] = {1.6070755785071491E-09, 4.6371263117318300E-06, 5.3392892770691468E-04, 1.3380163586766329E-02, 1.1960061568997656E-01, 4.7332499268789285E-01, 9.2104360429933863E-01, 9.2104360429933885E-01, 4.7332499268789302E-01, 1.1960061568997683E-01, 1.3380163586766332E-02, 5.3392892770691837E-04, 4.6371263117318342E-06, 1.6070755785075502E-09}; + constexpr FLT c1[] = {9.2475302076758674E-09, 1.3546865389183953E-05, 1.0306349751547578E-03, 1.7767594411827761E-02, 1.0518000824290019E-01, 2.3882936521395404E-01, 1.5170179567585843E-01, -1.5170179567585837E-01, -2.3882936521395398E-01, -1.0518000824290036E-01, -1.7767594411827754E-02, -1.0306349751547613E-03, -1.3546865389183977E-05, -9.2475302076757731E-09}; + constexpr FLT c2[] = {2.4024402573674993E-08, 1.8178651135370012E-05, 8.9712289901830596E-04, 1.0104692380253478E-02, 3.4193348251104483E-02, 1.8533380680638794E-02, -6.3746746886473832E-02, -6.3746746886473860E-02, 1.8533380680638745E-02, 3.4193348251104413E-02, 1.0104692380253471E-02, 8.9712289901830889E-04, 1.8178651135370046E-05, 2.4024402573675768E-08}; + constexpr FLT c3[] = {3.7419288907183495E-08, 1.4804264337309617E-05, 4.5929141335173144E-04, 3.0552592910038168E-03, 3.3079403387824323E-03, -1.0247716289024879E-02, -1.1480323948535117E-02, 1.1480323948535463E-02, 1.0247716289025027E-02, -3.3079403387824271E-03, -3.0552592910038120E-03, -4.5929141335173334E-04, -1.4804264337309643E-05, -3.7419288907183766E-08}; + constexpr FLT c4[] = {3.9124194363163287E-08, 8.1265227753122953E-06, 1.4975407030324905E-04, 4.4789439277602894E-04, -7.9407521150521383E-04, -1.9254008995687184E-03, 2.1136619999320748E-03, 2.1136619999320141E-03, -1.9254008995687132E-03, -7.9407521150514292E-04, 4.4789439277602867E-04, 1.4975407030325005E-04, 8.1265227753123105E-06, 
3.9124194363164148E-08}; + constexpr FLT c5[] = {2.9113992252245385E-08, 3.1458937074171823E-06, 3.0585266291431613E-05, -6.5135387342551234E-06, -2.3196510408355524E-04, 1.5778347828067563E-04, 4.2181913759748168E-04, -4.2181913759742725E-04, -1.5778347828060562E-04, 2.3196510408355524E-04, 6.5135387342551234E-06, -3.0585266291432040E-05, -3.1458937074171887E-06, -2.9113992252245408E-08}; + constexpr FLT c6[] = {1.5927753226313472E-08, 8.6591441391883797E-07, 3.1186030532599549E-06, -1.4256326863802477E-05, -6.9192418278078229E-06, 6.1786486497582421E-05, -4.4611361914704291E-05, -4.4611361914610670E-05, 6.1786486497541994E-05, -6.9192418278024798E-06, -1.4256326863804276E-05, 3.1186030532598494E-06, 8.6591441391883161E-07, 1.5927753226313945E-08}; + constexpr FLT c7[] = {6.5072355972925020E-09, 1.6321871905299654E-07, -1.6208737249918160E-07, -2.0005919851675986E-06, 4.6289117401651821E-06, 1.5738407907104777E-07, -1.0033756087313552E-05, 1.0033756087535249E-05, -1.5738407898383816E-07, -4.6289117402341052E-06, 2.0005919851709152E-06, 1.6208737249923451E-07, -1.6321871905299225E-07, -6.5072355972922787E-09}; + constexpr FLT c8[] = {1.9857214221989366E-09, 1.7788899565181922E-08, -1.0133541198312604E-07, 4.4566342395340293E-08, 5.3564828266574526E-07, -1.1695093255338883E-06, 6.7085595118984104E-07, 6.7085595114069746E-07, -1.1695093255217181E-06, 5.3564828276835377E-07, 4.4566342396873204E-08, -1.0133541198326502E-07, 1.7788899565180526E-08, 1.9857214221992563E-09}; + constexpr FLT c9[] = {4.4289508956510332E-10, -2.3397558741938982E-11, -1.2203541602658680E-08, 4.1555456455006879E-08, -4.0387396856849884E-08, -5.2822132653130956E-08, 1.7383889351097292E-07, -1.7383889353173241E-07, 5.2822132672506464E-08, 4.0387396834706444E-08, -4.1555456455698865E-08, 1.2203541602950610E-08, 2.3397558742361335E-11, -4.4289508956485253E-10}; + constexpr FLT c10[] = {6.7195187479843226E-11, -3.6781600571171619E-10, 1.8909214083296717E-10, 3.2074788122994124E-09, 
-1.0777792237807384E-08, 1.5287295377979802E-08, -7.6060392723093131E-09, -7.6060391755201933E-09, 1.5287295398091755E-08, -1.0777792217695420E-08, 3.2074788146563205E-09, 1.8909214044014493E-10, -3.6781600571662634E-10, 6.7195187480068943E-11}; + constexpr FLT c11[] = {5.1753158905822061E-12, -5.7459004384753609E-11, 2.1373772914288248E-10, -3.3474981614755248E-10, -5.5056523013581392E-11, 1.1984997345151211E-09, -2.3401534609898206E-09, 2.3401534737665714E-09, -1.1984997515507915E-09, 5.5056487167718091E-11, 3.3474981678638774E-10, -2.1373772871699109E-10, 5.7459004393903842E-11, -5.1753158903480283E-12}; + constexpr FLT c12[] = {-3.4295334316135217E-13, -1.9669734020395281E-12, 1.8829710516667924E-11, -6.6063898621267923E-11, 1.2987243021035191E-10, -1.4723142988261286E-10, 6.6816662742079877E-11, 6.6816650491789053E-11, -1.4723143192432656E-10, 1.2987247614892944E-10, -6.6063898621269021E-11, 1.8829709886607818E-11, -1.9669734162457477E-12, -3.4295334295692199E-13}; + constexpr FLT c13[] = {-1.4925032356367256E-13, 5.9101412900182951E-13, -1.0473414103260276E-12, -3.4168877521962931E-13, 6.3681343308181771E-12, -1.6773485918159645E-11, 2.5499676364679485E-11, -2.5499722384571941E-11, 1.6773473223016897E-11, -6.3681501997466111E-12, 3.4168877521962931E-13, 1.0473414909104298E-12, -5.9101412551500433E-13, 1.4925032367414924E-13}; + constexpr FLT c14[] = {-1.6512890188764807E-14, 8.8250735109913167E-14, -3.0062084749515021E-13, 6.8819378623923325E-13, -1.0710378278007934E-12, 1.0658930503703208E-12, -4.5535006559156473E-13, -4.5529417109990688E-13, 1.0659116818675222E-12, -1.0710247857527394E-12, 6.8819549412647750E-13, -3.0062091542248455E-13, 8.8250729803090660E-14, -1.6512890092223385E-14}; + constexpr FLT c15[] = {1.6573977440105294E-16, 1.3350735743743382E-15, -1.0198606577404851E-14, 3.9099634678793536E-14, -9.7801981044810947E-14, 1.7461338478760738E-13, -2.3137912816883565E-13, 2.3133990246879147E-13, -1.7463221312362809E-13, 9.7795403196649327E-14, 
-3.9099513984331611E-14, 1.0198764988885690E-14, -1.3350660309704511E-15, -1.6573967886539614E-16}; + for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); + } else if (w==15) { + constexpr FLT c0[] = {2.4886236238313534E-10, 1.0156314710024854E-06, 1.5297772142853732E-04, 4.9110296377727252E-03, 5.6121982134094042E-02, 2.8670951404936740E-01, 7.3488453954210731E-01, 1.0000000000000018E+00, 7.3488453954210708E-01, 2.8670951404936784E-01, 5.6121982134094188E-02, 4.9110296377727321E-03, 1.5297772142853737E-04, 1.0156314710024854E-06, 2.4886236238313394E-10}; + constexpr FLT c1[] = {1.4880454274285384E-09, 3.1146031777409673E-06, 3.1470309742465694E-04, 7.1215977556942766E-03, 5.6335374470954679E-02, 1.8245542837228418E-01, 2.2739494478010200E-01, -4.2425842671825266E-17, -2.2739494478010208E-01, -1.8245542837228432E-01, -5.6335374470954783E-02, -7.1215977556942861E-03, -3.1470309742465694E-04, -3.1146031777409668E-06, -1.4880454274285366E-09}; + constexpr FLT c2[] = {4.0364738474324423E-09, 4.4152383936309442E-06, 2.9537757977456596E-04, 4.5415629108243238E-03, 2.2685962261788550E-02, 3.3502333548319392E-02, -2.2696322242195994E-02, -7.6666666666667133E-02, -2.2696322242195945E-02, 3.3502333548319260E-02, 2.2685962261788570E-02, 4.5415629108243273E-03, 2.9537757977456591E-04, 4.4152383936309416E-06, 4.0364738474324407E-09}; + constexpr FLT c3[] = {6.6006259688120961E-09, 3.8297656275654657E-06, 1.6597029248061439E-04, 1.6248331197066942E-03, 4.0281119347581979E-03, -2.8399908290139206E-03, -1.3756562885831705E-02, 1.0758125681708418E-16, 1.3756562885831904E-02, 2.8399908290139895E-03, -4.0281119347581771E-03, -1.6248331197066914E-03, -1.6597029248061437E-04, -3.8297656275654657E-06, -6.6006259688120969E-09}; + constexpr FLT c4[] = {7.2920076887968825E-09, 2.2644150332986910E-06, 
6.1226481435400985E-05, 3.3216368068303816E-04, 4.2258807580024870E-07, -1.7026747228854500E-03, -1.2026158633582243E-04, 2.8537037037044089E-03, -1.2026158633584264E-04, -1.7026747228853732E-03, 4.2258807580182180E-07, 3.3216368068303642E-04, 6.1226481435401053E-05, 2.2644150332986919E-06, 7.2920076887968842E-09}; + constexpr FLT c5[] = {5.7777535593445574E-09, 9.5996306286140537E-07, 1.5097159537535560E-05, 2.8094504791464212E-05, -1.2791075475386364E-04, -1.0516749004210079E-04, 4.0040320377530828E-04, 5.4844446833709888E-17, -4.0040320377525385E-04, 1.0516749004229523E-04, 1.2791075475386559E-04, -2.8094504791467126E-05, -1.5097159537535560E-05, -9.5996306286140579E-07, -5.7777535593445582E-09}; + constexpr FLT c6[] = {3.3986627004323950E-09, 2.9741452947022275E-07, 2.3232144780590118E-06, -3.5941523174497321E-06, -1.8171775676701533E-05, 3.2858338560981214E-05, 2.0665249075258455E-05, -6.8763374485615104E-05, 2.0665249075221676E-05, 3.2858338560934424E-05, -1.8171775676683576E-05, -3.5941523174470280E-06, 2.3232144780590435E-06, 2.9741452947022206E-07, 3.3986627004323950E-09}; + constexpr FLT c7[] = {1.5128957992049987E-09, 6.6672685257784247E-08, 1.4160936684823307E-07, -1.2611166225385906E-06, 6.6865545481897967E-07, 4.6861078169740899E-06, -7.4523870622442393E-06, 5.1688954219266444E-17, 7.4523870623463821E-06, -4.6861078171739939E-06, -6.6865545481690963E-07, 1.2611166225370325E-06, -1.4160936684824530E-07, -6.6672685257784551E-08, -1.5128957992049987E-09}; + constexpr FLT c8[] = {5.1310324414219292E-10, 1.0163871982745590E-08, -2.4441175134592830E-08, -1.0543632600171378E-07, 4.0979777876715675E-07, -2.9573937051194202E-07, -5.9824625884543558E-07, 1.2067769776847866E-06, -5.9824625879665336E-07, -2.9573937049659643E-07, 4.0979777875267863E-07, -1.0543632599876183E-07, -2.4441175134530762E-08, 1.0163871982746284E-08, 5.1310324414219364E-10}; + constexpr FLT c9[] = {1.3160883866734095E-10, 8.0584478671564817E-10, -6.7824252838686685E-09, 
9.4471403089230076E-09, 2.4030590211824177E-08, -9.0522548480936782E-08, 9.9320303339648267E-08, 1.4827374781995408E-17, -9.9320303311968964E-08, 9.0522548602725694E-08, -2.4030590184836860E-08, -9.4471403124694187E-09, 6.7824252839146209E-09, -8.0584478671585931E-10, -1.3160883866734196E-10}; + constexpr FLT c10[] = {2.4734066313995269E-11, -4.3978001545632529E-11, -5.4975091406435660E-10, 2.6307942070348926E-09, -4.2001676281559915E-09, -1.8212709350780177E-10, 1.0547608795803518E-08, -1.6454374555673015E-08, 1.0547608746152108E-08, -1.8212708345187657E-10, -4.2001676312984721E-09, 2.6307942087632753E-09, -5.4975091402508072E-10, -4.3978001545363347E-11, 2.4734066313995970E-11}; + constexpr FLT c11[] = {3.0917581107111067E-12, -2.1504981481527399E-11, 3.4611945838654282E-11, 1.1082666500276105E-10, -5.8883840899000033E-10, 1.1304779661881485E-09, -1.0037911406820197E-09, -5.7884986037117854E-17, 1.0037911398302301E-09, -1.1304781086488634E-09, 5.8883842723235649E-10, -1.1082666592552764E-10, -3.4611945887454015E-11, 2.1504981480972878E-11, -3.0917581107111891E-12}; + constexpr FLT c12[] = {1.5997634038655269E-13, -2.4807970173617968E-12, 1.1275106610326804E-11, -2.3847055813595321E-11, 1.5364454138408298E-11, 4.4350534757580891E-11, -1.3563510404683277E-10, 1.8159081432580251E-10, -1.3563508771311925E-10, 4.4350484735577755E-11, 1.5364420705333068E-11, -2.3847054665131313E-11, 1.1275106670142851E-11, -2.4807970168633410E-12, 1.5997634038739785E-13}; + constexpr FLT c13[] = {-2.4800914618527656E-14, -2.0428592368367617E-14, 6.6720756177865110E-13, -2.9781122281459938E-12, 7.0947566948544657E-12, -1.0181675867287212E-11, 7.9189142537208719E-12, -1.4497056804736912E-17, -7.9189459915777383E-12, 1.0181666345930152E-11, -7.0947487603902491E-12, 2.9781098973971301E-12, -6.6720754938105074E-13, 2.0428592180708626E-14, 2.4800914617770965E-14}; + constexpr FLT c14[] = {-6.3774103672726629E-15, 2.8974955370030088E-14, -6.8422346755457550E-14, 5.3399811794037740E-14, 
1.7893441503609519E-13, -7.2418549150581294E-13, 1.3713697997539906E-12, -1.6687145216540105E-12, 1.3713520998316439E-12, -7.2416872315832831E-13, 1.7893006768675052E-13, 5.3400626922038687E-14, -6.8422339477528482E-14, 2.8974955559559462E-14, -6.3774103666804019E-15}; + constexpr FLT c15[] = {-5.1635500202709335E-16, 3.1828105471276549E-15, -1.2111383721117860E-14, 3.1272734620510859E-14, -5.6176935449952714E-14, 6.8640388687474512E-14, -4.9039125333789703E-14, -3.5058680377244798E-17, 4.9029469776856299E-14, -6.8666790600965935E-14, 5.6189548021197700E-14, -3.1272749707318549E-14, 1.2111366748459164E-14, -3.1828106649933298E-15, 5.1635500199831522E-16}; + constexpr FLT c16[] = {4.5179133600663468E-18, -1.3721818586136237E-17, -2.0190809683029299E-16, 1.1787611877454253E-15, -3.5963787346199218E-15, 7.4622525856292898E-15, -1.1451676136812928E-14, 1.2941737777564503E-14, -1.1457648327763603E-14, 7.4174611535501039E-15, -3.6182145577673462E-15, 1.1783995902489914E-15, -2.0188185185104562E-16, -1.3721704675617759E-17, 4.5179136270619547E-18}; + for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); + } else if (w==16) { + constexpr FLT c0[] = {3.7973138383475505E-11, 2.1620729770457867E-07, 4.2059935922517660E-05, 1.7055631615451750E-03, 2.4507833223051390E-02, 1.5833750021928361E-01, 5.2065761855025572E-01, 9.3058177132107800E-01, 9.3058177132107822E-01, 5.2065761855025583E-01, 1.5833750021928361E-01, 2.4507833223051407E-02, 1.7055631615451757E-03, 4.2059935922517680E-05, 2.1620729770457854E-07, 3.7973138383475363E-11}; + constexpr FLT c1[] = {2.3529614069937368E-10, 6.9307767643753084E-07, 9.1584555859393273E-05, 2.6688190455647263E-03, 2.7424935799146805E-02, 1.1980519064171602E-01, 2.2858769149343988E-01, 1.3403316930972969E-01, -1.3403316930972969E-01, -2.2858769149343988E-01, 
-1.1980519064171603E-01, -2.7424935799146809E-02, -2.6688190455647263E-03, -9.1584555859393273E-05, -6.9307767643753063E-07, -2.3529614069937291E-10}; + constexpr FLT c2[] = {6.6422278409342484E-10, 1.0324321112746625E-06, 9.1817488865684769E-05, 1.8711533829047168E-03, 1.2921996060610234E-02, 3.2047854205940321E-02, 1.0693035516337747E-02, -5.7626889750985358E-02, -5.7626889750985420E-02, 1.0693035516337622E-02, 3.2047854205940300E-02, 1.2921996060610227E-02, 1.8711533829047159E-03, 9.1817488865684728E-05, 1.0324321112746625E-06, 6.6422278409342453E-10}; + constexpr FLT c3[] = {1.1357078950958115E-09, 9.4728532805183455E-07, 5.5827161828283907E-05, 7.6087086075588353E-04, 3.0946204357507638E-03, 1.6729582927767952E-03, -9.5127691406672668E-03, -8.9630953638633881E-03, 8.9630953638635737E-03, 9.5127691406674039E-03, -1.6729582927767412E-03, -3.0946204357507521E-03, -7.6087086075588267E-04, -5.5827161828283886E-05, -9.4728532805183402E-07, -1.1357078950958119E-09}; + constexpr FLT c4[] = {1.3190161602522571E-09, 5.9764321317063336E-07, 2.2744388605472980E-05, 1.9073517322668089E-04, 2.8943142766413201E-04, -8.8625893129445465E-04, -1.3389167739520302E-03, 1.7216657535080475E-03, 1.7216657535079566E-03, -1.3389167739519974E-03, -8.8625893129445302E-04, 2.8943142766413342E-04, 1.9073517322668089E-04, 2.2744388605472997E-05, 5.9764321317063368E-07, 1.3190161602522571E-09}; + constexpr FLT c5[] = {1.1057322032863292E-09, 2.7364351668058875E-07, 6.4277990516969732E-06, 2.7144256967440253E-05, -3.6927862875708149E-05, -1.6756539822663250E-04, 1.6190404775924360E-04, 2.9203183363577429E-04, -2.9203183363574707E-04, -1.6190404775915027E-04, 1.6756539822663250E-04, 3.6927862875712038E-05, -2.7144256967440009E-05, -6.4277990516969918E-06, -2.7364351668058875E-07, -1.1057322032863296E-09}; + constexpr FLT c6[] = {6.9354916180818945E-10, 9.3269475195063855E-08, 1.2384428187212403E-06, 8.4996778392803041E-07, -1.3106613626284104E-05, 2.8218026704026646E-06, 
4.1119875273776001E-05, -3.3017437945353985E-05, -3.3017437945415066E-05, 4.1119875273714446E-05, 2.8218026703990287E-06, -1.3106613626289508E-05, 8.4996778392747454E-07, 1.2384428187212240E-06, 9.3269475195063643E-08, 6.9354916180818914E-10}; + constexpr FLT c7[] = {3.3254260763956042E-10, 2.3748169129617104E-08, 1.4324995919586480E-07, -4.5855119979446571E-07, -9.5896649524100645E-07, 3.6155491755001142E-06, -9.8206137491315186E-07, -6.1812989819835450E-06, 6.1812989820611756E-06, 9.8206137497544330E-07, -3.6155491754721922E-06, 9.5896649524660746E-07, 4.5855119979503682E-07, -1.4324995919584492E-07, -2.3748169129616922E-08, -3.3254260763956068E-10}; + constexpr FLT c8[] = {1.2320735888479529E-10, 4.4066719437554910E-09, 2.9936173156462927E-09, -8.7082338359679101E-08, 1.2972939456291547E-07, 2.2882425903046301E-07, -7.3491924909334631E-07, 4.5592445674903059E-07, 4.5592445658978770E-07, -7.3491924903833956E-07, 2.2882425902441689E-07, 1.2972939456293178E-07, -8.7082338359266715E-08, 2.9936173156449473E-09, 4.4066719437557416E-09, 1.2320735888479524E-10}; + constexpr FLT c9[] = {3.5284250010876628E-11, 5.4380355945640250E-10, -2.1550460241694361E-09, -3.7344953348928088E-09, 2.7722604311846508E-08, -3.9597167021230792E-08, -1.3993916628542531E-08, 9.5626629210101709E-08, -9.5626629290371673E-08, 1.3993916670061478E-08, 3.9597167019846826E-08, -2.7722604310808535E-08, 3.7344953348928088E-09, 2.1550460241924123E-09, -5.4380355945618072E-10, -3.5284250010876789E-11}; + constexpr FLT c10[] = {7.7013760205813290E-12, 2.8123297626332877E-11, -3.7953802132437611E-10, 8.7573780453214681E-10, 5.1359846908750478E-10, -5.3609157480923598E-09, 9.1303305149265196E-09, -4.8150450778386211E-09, -4.8150450602405480E-09, 9.1303305006281353E-09, -5.3609157342653948E-09, 5.1359846657352753E-10, 8.7573780480711250E-10, -3.7953802133297068E-10, 2.8123297626237416E-11, 7.7013760205811319E-12}; + constexpr FLT c11[] = {1.2276300481459368E-12, -4.1769601372671798E-12, 
-1.9148402800715177E-11, 1.3822953630779855E-10, -3.0994364017547768E-10, 2.0316700893505159E-10, 4.3650568116859601E-10, -1.1534087567294806E-09, 1.1534086455717957E-09, -4.3650568244627625E-10, -2.0316701046115955E-10, 3.0994364003351358E-10, -1.3822953650299937E-10, 1.9148402794060861E-11, 4.1769601372325045E-12, -1.2276300481460517E-12}; + constexpr FLT c12[] = {1.2527329159215257E-13, -1.0816725479918068E-12, 2.7445378707133412E-12, 1.7839886378835549E-12, -2.6194655703148228E-11, 6.7446666417949068E-11, -8.5082142817277568E-11, 4.0255080062661886E-11, 4.0254965726647763E-11, -8.5082126483561454E-11, 6.7446671522236455E-11, -2.6194657362041918E-11, 1.7839889409505645E-12, 2.7445378607441180E-12, -1.0816725479139360E-12, 1.2527329159224173E-13}; + constexpr FLT c13[] = {3.2506946752710786E-15, -9.2845381849289691E-14, 5.1542691616877330E-13, -1.3678932005895992E-12, 1.6503397946393055E-12, 7.2548932254614457E-13, -6.2314806405069215E-12, 1.1299375277421538E-11, -1.1299433992456742E-11, 6.2314647715784883E-12, -7.2550201768889120E-13, -1.6503403897241219E-12, 1.3678930766135958E-12, -5.1542690377117294E-13, 9.2845381940092428E-14, -3.2506946753893115E-15}; + constexpr FLT c14[] = {-1.3523251101878356E-15, 1.9055798839533079E-15, 1.8430813184053169E-14, -1.1526987096958319E-13, 3.3349122385594633E-13, -5.8352048227061829E-13, 6.1751861733538967E-13, -2.7104853725824153E-13, -2.7103052681092733E-13, 6.1751644366071028E-13, -5.8351023494715043E-13, 3.3348982649365648E-13, -1.1526961866805939E-13, 1.8430809545089241E-14, 1.9055798650003023E-15, -1.3523251102248507E-15}; + constexpr FLT c15[] = {-2.4132931360656334E-16, 1.2442654599774185E-15, -3.5592598733275504E-15, 5.0956447378324209E-15, 1.6446732556150498E-15, -2.5290498540837812E-14, 6.2712721591286338E-14, -9.2666673089509217E-14, 9.2581824882952367E-14, -6.2712118118977746E-14, 2.5288160085642670E-14, -1.6451258598462044E-15, -5.0958559531403920E-15, 3.5592532728491847E-15, -1.2442654894438389E-15, 
2.4132931361645452E-16}; + constexpr FLT c16[] = {-1.6052119916687038E-17, 1.0220930228231101E-16, -4.3668420339021406E-16, 1.2658361982998821E-15, -2.5907177687935505E-15, 3.7311262928168221E-15, -3.4997038937045781E-15, 1.4124231584693148E-15, 1.3706178218468559E-15, -3.5056760846448971E-15, 3.7363519598930578E-15, -2.5923974474980012E-15, 1.2658945204780770E-15, -4.3668985335150679E-16, 1.0220927950027870E-16, -1.6052119872193216E-17}; + constexpr FLT c17[] = {1.2307507877258324E-18, -2.6518352923945508E-18, -1.0105982127470271E-20, 2.6958700270869167E-17, -1.1513299715471039E-16, 2.7882272296911513E-16, -4.6961519239790030E-16, 6.5796739812484873E-16, -6.7025909677113713E-16, 4.6238478142949540E-16, -2.8307058941305305E-16, 1.1494093936336214E-16, -2.6999653770494898E-17, 1.1474040843416029E-20, 2.6518435669432360E-18, -1.2307508200482882E-18}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); + } else + printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc new file mode 100644 index 000000000..e2fa229b7 --- /dev/null +++ b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc @@ -0,0 +1,171 @@ +// Code generated by gen_all_horner_C_code.m in finufft/devel +// Authors: Alex Barnett & Ludvig af Klinteberg. +// (C) The Simons Foundation, Inc. 
+ if (w==2) { + constexpr FLT c0[] = {6.1209111871385702E-01, 6.1209111871385702E-01}; + constexpr FLT c1[] = {6.4742429432896431E-01, -6.4742429432896442E-01}; + constexpr FLT c2[] = {-9.0411309581634847E-02, -9.0411309581634750E-02}; + constexpr FLT c3[] = {-1.9075708590566751E-01, 1.9075708590566753E-01}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); + } else if (w==3) { + constexpr FLT c0[] = {2.4728112933307078E-01, 1.0000000000000044E+00, 2.4728112935494964E-01}; + constexpr FLT c1[] = {4.0470611346184543E-01, 2.1212921335912390E-17, -4.0470611343822160E-01}; + constexpr FLT c2[] = {1.4864411342268655E-01, -3.0473448739822773E-01, 1.4864411344492173E-01}; + constexpr FLT c3[] = {-4.4469294619149627E-02, 1.3598904496642886E-16, 4.4469294640111616E-02}; + constexpr FLT c4[] = {-2.9270010751775037E-02, 3.7966707032750659E-02, -2.9270010728701147E-02}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); + } else if (w==4) { + constexpr FLT c0[] = {8.4048892491849839E-02, 7.9275732207620875E-01, 7.9275732207620908E-01, 8.4048892491849811E-02}; + constexpr FLT c1[] = {1.7431588385887239E-01, 3.7425489538028417E-01, -3.7425489538028422E-01, -1.7431588385887242E-01}; + constexpr FLT c2[] = {1.1425598262146337E-01, -1.1126112046907141E-01, -1.1126112046907137E-01, 1.1425598262146335E-01}; + constexpr FLT c3[] = {1.5677587697716072E-02, -6.7022293289915616E-02, 6.7022293289915727E-02, -1.5677587697716041E-02}; + constexpr FLT c4[] = {-1.0401300825285629E-02, 6.3725646657139309E-03, 6.3725646657139005E-03, -1.0401300825285625E-02}; + constexpr FLT c5[] = {-3.0464394190490617E-03, 5.3247889205097435E-03, -5.3247889205097279E-03, 3.0464394190490305E-03}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==5) { + constexpr FLT c0[] = {2.5811126752233307E-02, 4.6616226852477344E-01, 1.0000000000000007E+00, 4.6616226852477305E-01, 
2.5811126752233318E-02}; + constexpr FLT c1[] = {6.2936773057387055E-02, 3.7198919402374020E-01, 2.1212921335912559E-17, -3.7198919402374009E-01, -6.2936773057387055E-02}; + constexpr FLT c2[] = {5.4855980576944567E-02, 3.7709308632020676E-02, -1.8284069243892637E-01, 3.7709308632020731E-02, 5.4855980576944567E-02}; + constexpr FLT c3[] = {1.8780973157032140E-02, -3.8322611720715660E-02, 1.4047484462204681E-16, 3.8322611720715834E-02, -1.8780973157032116E-02}; + constexpr FLT c4[] = {-2.3306908700105430E-05, -8.3858973028989436E-03, 1.4886952481383787E-02, -8.3858973028988499E-03, -2.3306908700106227E-05}; + constexpr FLT c5[] = {-1.5212353034889806E-03, 1.7151925122365422E-03, 1.0734071182258885E-16, -1.7151925122365888E-03, 1.5212353034889806E-03}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==6) { + constexpr FLT c0[] = {7.3992041846532818E-03, 2.2998056434514028E-01, 8.5775196559356059E-01, 8.5775196559356115E-01, 2.2998056434514028E-01, 7.3992041847816166E-03}; + constexpr FLT c1[] = {2.0397684222696250E-02, 2.4277466601214742E-01, 2.6509440217151281E-01, -2.6509440217151231E-01, -2.4277466601214739E-01, -2.0397684222557694E-02}; + constexpr FLT c2[] = {2.1435449512033435E-02, 7.4190333865239946E-02, -9.5369600014193256E-02, -9.5369600014193381E-02, 7.4190333865239905E-02, 2.1435449512163876E-02}; + constexpr FLT c3[] = {1.0463664645794037E-02, -5.8671703446042224E-03, -3.4019677093840447E-02, 3.4019677093840760E-02, 5.8671703446042771E-03, -1.0463664645671082E-02}; + constexpr FLT c4[] = {1.9378826192716972E-03, -6.8365127179467735E-03, 4.7406536657957962E-03, 4.7406536657958473E-03, -6.8365127179467848E-03, 1.9378826194070377E-03}; + constexpr FLT c5[] = {-2.6471424081647417E-04, -5.6150758897069279E-04, 2.0099203466671291E-03, -2.0099203466670359E-03, 5.6150758897070829E-04, 2.6471424094083520E-04}; + constexpr FLT c6[] = {-1.6161497824910217E-04, 2.5924418389355766E-04, 
-1.3917099193215483E-04, -1.3917099193211840E-04, 2.5924418389357192E-04, -1.6161497812639921E-04}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + } else if (w==7) { + constexpr FLT c0[] = {2.0163149398992283E-03, 1.0071602557045130E-01, 5.8653557849806126E-01, 1.0000000000000002E+00, 5.8653557849806159E-01, 1.0071602557045131E-01, 2.0163149399332597E-03}; + constexpr FLT c1[] = {6.1353661835569211E-03, 1.2822551681002711E-01, 3.1973557271594344E-01, -2.1212921335912596E-17, -3.1973557271594366E-01, -1.2822551681002711E-01, -6.1353661835202118E-03}; + constexpr FLT c2[] = {7.4065234100227761E-03, 5.7825030729344404E-02, 1.0889852837592919E-04, -1.3060049459923276E-01, 1.0889852837575314E-04, 5.7825030729344355E-02, 7.4065234100573725E-03}; + constexpr FLT c3[] = {4.4924606632387705E-03, 7.2245566707421303E-03, -2.7743312484355583E-02, 1.0559644416237177E-16, 2.7743312484355832E-02, -7.2245566707420826E-03, -4.4924606632061881E-03}; + constexpr FLT c4[] = {1.3572774007773842E-03, -2.3954706749181320E-03, -2.9058644824981098E-03, 7.8619155407045772E-03, -2.9058644824980807E-03, -2.3954706749181507E-03, 1.3572774008132615E-03}; + constexpr FLT c5[] = {1.1260116639581618E-04, -7.8814564904709067E-04, 1.1036556706849172E-03, -3.0492924261508591E-17, -1.1036556706849482E-03, 7.8814564904710227E-04, -1.1260116636284763E-04}; + constexpr FLT c6[] = {-4.7399003259805808E-05, 2.0950491943152726E-06, 1.7484854214667859E-04, -2.9104069274769336E-04, 1.7484854214659272E-04, 2.0950491943114936E-06, -4.7399003227280901E-05}; + constexpr FLT c7[] = {-1.2555096177146811E-05, 2.7293834771974277E-05, -2.6660039700396876E-05, 5.1878356274645480E-17, 2.6660039700612832E-05, -2.7293834771939816E-05, 1.2555096209061404E-05}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==8) { + constexpr FLT c0[] = {5.2827275612461462E-04, 
4.0402734444109238E-02, 3.4389230803369686E-01, 8.9161099745784866E-01, 8.9161099745784866E-01, 3.4389230803369708E-01, 4.0402734444109252E-02, 5.2827275612461408E-04}; + constexpr FLT c1[] = {1.7458301875074096E-03, 5.9145446836664541E-02, 2.5435204236257858E-01, 2.0538938722823222E-01, -2.0538938722823233E-01, -2.5435204236257858E-01, -5.9145446836664547E-02, -1.7458301875074094E-03}; + constexpr FLT c2[] = {2.3525728171808306E-03, 3.3585505340219701E-02, 4.4733940386002209E-02, -8.0668262921248624E-02, -8.0668262921248748E-02, 4.4733940386002119E-02, 3.3585505340219687E-02, 2.3525728171808311E-03}; + constexpr FLT c3[] = {1.6676293877589678E-03, 8.1606118103203940E-03, -1.0603838868224419E-02, -2.0559571166483725E-02, 2.0559571166484002E-02, 1.0603838868224510E-02, -8.1606118103203749E-03, -1.6676293877589678E-03}; + constexpr FLT c4[] = {6.5470478006265378E-04, 5.7029826102775656E-05, -4.0842122325118182E-03, 3.3746160664395084E-03, 3.3746160664396086E-03, -4.0842122325118321E-03, 5.7029826102778678E-05, 6.5470478006265432E-04}; + constexpr FLT c5[] = {1.2504911757628686E-04, -3.9351755557266000E-04, 2.3739384784447216E-05, 9.6592347103022203E-04, -9.6592347103013649E-04, -2.3739384784439440E-05, 3.9351755557266586E-04, -1.2504911757628702E-04}; + constexpr FLT c6[] = {-6.5665874015798238E-07, -6.1884865695206891E-05, 1.4476791315356577E-04, -8.6782118193344350E-05, -8.6782118193318939E-05, 1.4476791315358196E-04, -6.1884865695214169E-05, -6.5665874015806602E-07}; + constexpr FLT c7[] = {-5.1256159860509675E-06, 5.3292178505898186E-06, 8.7427989025457230E-06, -2.8404799465047339E-05, 2.8404799465135336E-05, -8.7427989024875505E-06, -5.3292178505782125E-06, 5.1256159860509675E-06}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==9) { + constexpr FLT c0[] = {1.3409415535124456E-04, 1.5141199617983757E-02, 1.8004032483820079E-01, 6.6268423293859657E-01, 
1.0000000000000004E+00, 6.6268423293859746E-01, 1.8004032483820084E-01, 1.5141199617983828E-02, 1.3409415535124450E-04}; + constexpr FLT c1[] = {4.7572953640583401E-04, 2.4761567630011042E-02, 1.6332247709293549E-01, 2.7616213278983226E-01, -4.2425842671825223E-17, -2.7616213278983237E-01, -1.6332247709293549E-01, -2.4761567630011111E-02, -4.7572953640583401E-04}; + constexpr FLT c2[] = {7.0217948741779855E-04, 1.6533012331430421E-02, 4.8637875368588490E-02, -1.5084170630533007E-02, -1.0157816246606997E-01, -1.5084170630533338E-02, 4.8637875368588449E-02, 1.6533012331430445E-02, 7.0217948741779833E-04}; + constexpr FLT c3[] = {5.6197289626769645E-04, 5.4583505067803007E-03, 8.8722695781044485E-04, -2.0386313118366230E-02, 1.4346537772579219E-16, 2.0386313118366597E-02, -8.8722695781040203E-04, -5.4583505067802999E-03, -5.6197289626769645E-04}; + constexpr FLT c4[] = {2.6358216867957524E-04, 7.0803132065997147E-04, -2.3883045659485441E-03, -1.0047843626593360E-03, 4.8455486978739078E-03, -1.0047843626590051E-03, -2.3883045659485362E-03, 7.0803132065996898E-04, 2.6358216867957530E-04}; + constexpr FLT c5[] = {7.0565721004957831E-05, -9.0876125855045856E-05, -3.5965836571493702E-04, 7.0575785995728897E-04, 5.6006957738110937E-17, -7.0575785995746006E-04, 3.5965836571493702E-04, 9.0876125855046818E-05, -7.0565721004957980E-05}; + constexpr FLT c6[] = {7.9668965137354764E-06, -4.2137454928171943E-05, 3.9856859670063718E-05, 6.5639620808911507E-05, -1.4477186949841611E-04, 6.5639620808762402E-05, 3.9856859670072629E-05, -4.2137454928186349E-05, 7.9668965137352681E-06}; + constexpr FLT c7[] = {-9.3772917893888351E-07, -3.0575635011675480E-06, 1.2977675432514170E-05, -1.5241881422267232E-05, 5.6444540850624641E-17, 1.5241881422464882E-05, -1.2977675432482811E-05, 3.0575635011824812E-06, 9.3772917893893782E-07}; + constexpr FLT c8[] = {-4.1446092652958961E-07, 7.2790527337844100E-07, -2.5130319764268858E-08, -1.9002349621010172E-06, 3.0493470976000790E-06, 
-1.9002349619116138E-06, -2.5130319761051126E-08, 7.2790527337217009E-07, -4.1446092652952507E-07}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + } else if (w==10) { + constexpr FLT c0[] = {3.3157481538170295E-05, 5.3715860775974443E-03, 8.6328042282845782E-02, 4.3077092326437988E-01, 9.1242439930731112E-01, 9.1242439930731112E-01, 4.3077092326437971E-01, 8.6328042282845754E-02, 5.3715860775974227E-03, 3.3157481538170322E-05}; + constexpr FLT c1[] = {1.2517797191066981E-04, 9.6269418565961412E-03, 9.1130577457178452E-02, 2.4769645835465362E-01, 1.6766875916810517E-01, -1.6766875916810536E-01, -2.4769645835465354E-01, -9.1130577457178424E-02, -9.6269418565961117E-03, -1.2517797191066951E-04}; + constexpr FLT c2[] = {1.9968216068682153E-04, 7.2783782301876591E-03, 3.5949398124193940E-02, 2.5847993600195553E-02, -6.9275634160640490E-02, -6.9275634160640504E-02, 2.5847993600195445E-02, 3.5949398124193913E-02, 7.2783782301876375E-03, 1.9968216068682094E-04}; + constexpr FLT c3[] = {1.7649923565147242E-04, 2.9221990881931090E-03, 4.9086823797165058E-03, -1.0940556313145914E-02, -1.3762152424114656E-02, 1.3762152424114910E-02, 1.0940556313146081E-02, -4.9086823797164919E-03, -2.9221990881930998E-03, -1.7649923565147204E-04}; + constexpr FLT c4[] = {9.4710355505531920E-05, 6.0621452710061727E-04, -7.0118560592788729E-04, -2.4750745659639179E-03, 2.4757076628501668E-03, 2.4757076628502063E-03, -2.4750745659640264E-03, -7.0118560592788274E-04, 6.0621452710061163E-04, 9.4710355505531771E-05}; + constexpr FLT c5[] = {3.1258610702677804E-05, 2.8169545035126350E-05, -2.9881406711974808E-04, 1.5956798534243302E-04, 5.3653099874326161E-04, -5.3653099874339388E-04, -1.5956798534226972E-04, 2.9881406711975192E-04, -2.8169545035121488E-05, -3.1258610702677743E-05}; + constexpr FLT c6[] = {5.7780052154065432E-06, -1.5636835808661990E-05, -1.6121807313036067E-05, 8.1230533420465018E-05, 
-5.5456530742754838E-05, -5.5456530742851827E-05, 8.1230533420445272E-05, -1.6121807313045130E-05, -1.5636835808665131E-05, 5.7780052154064593E-06}; + constexpr FLT c7[] = {2.7742147829406768E-07, -3.2550081973304980E-06, 5.9212960378031332E-06, 8.5495977199682674E-07, -1.3248468528032551E-05, 1.3248468528215217E-05, -8.5495977185729702E-07, -5.9212960377964950E-06, 3.2550081973313239E-06, -2.7742147829400097E-07}; + constexpr FLT c8[] = {-1.2089379439825852E-07, -3.4743143855784781E-08, 8.2889801006379481E-07, -1.5830293785226849E-06, 8.7461219388985494E-07, 8.7461219397529632E-07, -1.5830293786451511E-06, 8.2889801008534534E-07, -3.4743143855462353E-08, -1.2089379439833804E-07}; + constexpr FLT c9[] = {-2.5033479260872450E-08, 6.3042298326687954E-08, -5.2303271559903752E-08, -7.6226091757998386E-08, 2.3316553102767969E-07, -2.3316553111902137E-07, 7.6226091879787297E-08, 5.2303271554367896E-08, -6.3042298324957995E-08, 2.5033479260965031E-08}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==11) { + constexpr FLT c0[] = {8.0191950887587638E-06, 1.8211144887695905E-03, 3.8565497751765702E-02, 2.5236459439543663E-01, 7.1517256669690443E-01, 1.0000000000000002E+00, 7.1517256669690443E-01, 2.5236459439543651E-01, 3.8565497751765723E-02, 1.8211144887695927E-03, 8.0191950887586707E-06}; + constexpr FLT c1[] = {3.1996260415636073E-05, 3.5282769389657661E-03, 4.5889527487056492E-02, 1.8012194355267480E-01, 2.4178022040260394E-01, 2.1212921335912587E-17, -2.4178022040260411E-01, -1.8012194355267488E-01, -4.5889527487056492E-02, -3.5282769389657648E-03, -3.1996260415635850E-05}; + constexpr FLT c2[] = {5.4612928019025183E-05, 2.9497743530118290E-03, 2.1858479505161201E-02, 3.8333708936616528E-02, -2.1641923687039297E-02, -8.3109405654057292E-02, -2.1641923687039287E-02, 3.8333708936616487E-02, 2.1858479505161187E-02, 2.9497743530118290E-03, 
5.4612928019024885E-05}; + constexpr FLT c3[] = {5.2504054888010150E-05, 1.3660648269306127E-03, 4.7357572177382694E-03, -2.2373255422688926E-03, -1.5459233729560824E-02, -3.0584997651941540E-18, 1.5459233729561050E-02, 2.2373255422689746E-03, -4.7357572177382599E-03, -1.3660648269306129E-03, -5.2504054888009953E-05}; + constexpr FLT c4[] = {3.1396100602888584E-05, 3.6443237253636144E-04, 1.5906780001786821E-04, -1.9495384184342716E-03, -2.4621376046556434E-04, 3.2818730060399505E-03, -2.4621376046541547E-04, -1.9495384184342974E-03, 1.5906780001787157E-04, 3.6443237253636144E-04, 3.1396100602888483E-05}; + constexpr FLT c5[] = {1.2057435171015750E-05, 4.6687328398363315E-05, -1.3963494372747466E-04, -1.4877651674418741E-04, 4.6954815721697059E-04, 7.1576260535837041E-17, -4.6954815721696283E-04, 1.4877651674414852E-04, 1.3963494372747659E-04, -4.6687328398363071E-05, -1.2057435171015728E-05}; + constexpr FLT c6[] = {2.8888404081262488E-06, -1.8976367884800935E-06, -2.4767547607257735E-05, 3.8337725458133611E-05, 2.6462355617055980E-05, -8.2113719362939881E-05, 2.6462355617066876E-05, 3.8337725458138978E-05, -2.4767547607262269E-05, -1.8976367884805327E-06, 2.8888404081262340E-06}; + constexpr FLT c7[] = {3.5729663467786725E-07, -1.6085054296206689E-06, 4.5672370507959851E-07, 6.0608527683273524E-06, -9.0233724844644286E-06, -4.5070818825954386E-17, 9.0233724845159214E-06, -6.0608527682667218E-06, -4.5672370507254818E-07, 1.6085054296207723E-06, -3.5729663467788907E-07}; + constexpr FLT c8[] = {-7.7890073973236871E-09, -1.8340559948709468E-07, 5.4451797328971916E-07, -3.5830285713854766E-07, -7.3873233537913819E-07, 1.4648976903075259E-06, -7.3873233536710514E-07, -3.5830285713236262E-07, 5.4451797329704790E-07, -1.8340559948689703E-07, -7.7890073973081013E-09}; + constexpr FLT c9[] = {-9.8984999695252047E-09, 1.0194946774280524E-08, 3.5279000677512062E-08, -1.1638771469313311E-07, 1.2326133617211816E-07, -2.5669371006274292E-17, -1.2326133615551060E-07, 
1.1638771463500659E-07, -3.5279000676820083E-08, -1.0194946774410270E-08, 9.8984999695130418E-09}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==12) { + constexpr FLT c0[] = {1.9028495068410023E-06, 5.9416527261081913E-04, 1.6248140264385581E-02, 1.3597036436097915E-01, 4.9821957378204840E-01, 9.2652305802242962E-01, 9.2652305802242962E-01, 4.9821957378204840E-01, 1.3597036436097937E-01, 1.6248140264385626E-02, 5.9416527261081924E-04, 1.9028495068454171E-06}; + constexpr FLT c1[] = {7.9801239249145923E-06, 1.2318344820958854E-03, 2.1335987794357199E-02, 1.1394981969310448E-01, 2.3520579283187484E-01, 1.4166451219687695E-01, -1.4166451219687687E-01, -2.3520579283187476E-01, -1.1394981969310460E-01, -2.1335987794357230E-02, -1.2318344820958847E-03, -7.9801239249098540E-06}; + constexpr FLT c2[] = {1.4462226804444730E-05, 1.1205076408888257E-03, 1.1698445222077612E-02, 3.3958877046121660E-02, 1.3705098421608795E-02, -6.0497400607811481E-02, -6.0497400607811579E-02, 1.3705098421608806E-02, 3.3958877046121591E-02, 1.1698445222077622E-02, 1.1205076408888255E-03, 1.4462226804449267E-05}; + constexpr FLT c3[] = {1.4953735432776090E-05, 5.8049865432805142E-04, 3.2684769908807722E-03, 2.3619245295514353E-03, -1.0074268581043095E-02, -9.8551520939611746E-03, 9.8551520939615059E-03, 1.0074268581043251E-02, -2.3619245295513252E-03, -3.2684769908807648E-03, -5.8049865432805098E-04, -1.4953735432771914E-05}; + constexpr FLT c4[] = {9.7900673700200676E-06, 1.8351475200221906E-04, 3.8725987583789238E-04, -9.2229408802588448E-04, -1.5383560041742387E-03, 1.8800996948122926E-03, 1.8800996948123033E-03, -1.5383560041742409E-03, -9.2229408802591614E-04, 3.8725987583789064E-04, 1.8351475200221903E-04, 9.7900673700247601E-06}; + constexpr FLT c5[] = {4.2345162286123928E-06, 3.3664241555334181E-05, -3.0535096226552352E-05, -1.9795772057290591E-04, 
1.7526295499606013E-04, 3.2830037656743561E-04, -3.2830037656734232E-04, -1.7526295499599014E-04, 1.9795772057292925E-04, 3.0535096226555273E-05, -3.3664241555334181E-05, -4.2345162286081255E-06}; + constexpr FLT c6[] = {1.2088615636792351E-06, 2.2204932634073669E-06, -1.5559909809157569E-05, 1.8771595438708362E-06, 4.7304527720902187E-05, -3.7055029721502823E-05, -3.7055029721506354E-05, 4.7304527720948991E-05, 1.8771595438366184E-06, -1.5559909809165219E-05, 2.2204932634074313E-06, 1.2088615636834544E-06}; + constexpr FLT c7[] = {2.1206307767331379E-07, -4.5869687934383747E-07, -1.3462277877507893E-06, 4.2970047520348418E-06, -1.1214870287581008E-06, -6.9831974682071699E-06, 6.9831974683366982E-06, 1.1214870288087690E-06, -4.2970047519748465E-06, 1.3462277877599186E-06, 4.5869687934394192E-07, -2.1206307766917122E-07}; + constexpr FLT c8[] = {1.5395324498807062E-08, -1.2022118042093087E-07, 1.5464523856613661E-07, 2.7605497716337475E-07, -8.4964626033234966E-07, 5.2067203458077506E-07, 5.2067203461734952E-07, -8.4964626032018743E-07, 2.7605497716040193E-07, 1.5464523856098652E-07, -1.2022118042095769E-07, 1.5395324502815322E-08}; + constexpr FLT c9[] = {-2.0816585198648028E-09, -6.8192670389370156E-09, 3.6338774649049193E-08, -4.9464520974759579E-08, -1.3242031035521981E-08, 1.0671664854533778E-07, -1.0671664854533778E-07, 1.3242031024450263E-08, 4.9464520977527511E-08, -3.6338774639015446E-08, 6.8192670391856967E-09, 2.0816585232951501E-09}; + constexpr FLT c10[] = {-6.3791929313390708E-10, 1.2240176132927394E-09, 5.3586930472778203E-10, -6.2807355748408205E-09, 1.0600657362033408E-08, -5.5585207892891946E-09, -5.5585208232281016E-09, 1.0600657414513137E-08, -6.2807355547288652E-09, 5.3586929184356377E-10, 1.2240176133909372E-09, -6.3791928984134277E-10}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if (w==13) { + constexpr FLT c0[] = 
{4.4408051211162946E-07, 1.8756193861873427E-04, 6.5146989208011716E-03, 6.8352802598867876E-02, 3.1564238810082484E-01, 7.5353649746793960E-01, 9.9999999999999956E-01, 7.5353649746793838E-01, 3.1564238810082484E-01, 6.8352802598867710E-02, 6.5146989208011707E-03, 1.8756193861873272E-04, 4.4408051211162761E-07}; + constexpr FLT c1[] = {1.9487148068106057E-06, 4.1285069961250701E-04, 9.2995630713278762E-03, 6.5021145064983563E-02, 1.8663042875530009E-01, 2.1451870821533808E-01, 1.8840858949353919E-32, -2.1451870821533794E-01, -1.8663042875529998E-01, -6.5021145064983438E-02, -9.2995630713278762E-03, -4.1285069961250425E-04, -1.9487148068106044E-06}; + constexpr FLT c2[] = {3.7267581324409626E-06, 4.0381251792508734E-04, 5.7019503038218408E-03, 2.4040868593456825E-02, 2.9406233528281710E-02, -2.4394921635639378E-02, -7.0323343245740924E-02, -2.4394921635639052E-02, 2.9406233528281724E-02, 2.4040868593456791E-02, 5.7019503038218382E-03, 4.0381251792508501E-04, 3.7267581324409626E-06}; + constexpr FLT c3[] = {4.1089519307370168E-06, 2.2941839162878727E-04, 1.8941440042457443E-03, 3.5673079836347822E-03, -3.6880489041048953E-03, -1.2074156718545214E-02, 7.1013810712957114E-17, 1.2074156718545436E-02, 3.6880489041048944E-03, -3.5673079836347674E-03, -1.8941440042457413E-03, -2.2941839162878624E-04, -4.1089519307370151E-06}; + constexpr FLT c4[] = {2.9080869014384424E-06, 8.2405696428180906E-05, 3.3386109283452779E-04, -1.7130036080580219E-04, -1.5108662980936900E-03, 7.8665018928679242E-05, 2.3686576883603073E-03, 7.8665018928764622E-05, -1.5108662980936485E-03, -1.7130036080580737E-04, 3.3386109283452861E-04, 8.2405696428180703E-05, 2.9080869014384429E-06}; + constexpr FLT c5[] = {1.3873038503072801E-06, 1.8694798962849948E-05, 1.4885937076477316E-05, -1.3109520271106624E-04, -4.6797213058790025E-05, 3.2555441892430825E-04, 6.5502537691746230E-17, -3.2555441892416048E-04, 4.6797213058875582E-05, 1.3109520271106819E-04, -1.4885937076477316E-05, -1.8694798962849962E-05, 
-1.3873038503072801E-06}; + constexpr FLT c6[] = {4.5216719173889445E-07, 2.3203195635245624E-06, -6.0547210914038460E-06, -1.2111482379340961E-05, 3.0238388566383385E-05, 1.0632529352081665E-05, -5.0954659549722746E-05, 1.0632529352250802E-05, 3.0238388566313227E-05, -1.2111482379347288E-05, -6.0547210914040671E-06, 2.3203195635247352E-06, 4.5216719173889350E-07}; + constexpr FLT c7[] = {9.7956192761412821E-08, 9.2080334896449358E-09, -1.2031586234326618E-06, 1.3860784486076025E-06, 2.8079238803293383E-06, -5.6034103145907796E-06, 1.6113788341939994E-17, 5.6034103146040687E-06, -2.8079238803054550E-06, -1.3860784485997179E-06, 1.2031586234342167E-06, -9.2080334898128650E-09, -9.7956192761411458E-08}; + constexpr FLT c8[] = {1.2350515865275843E-08, -4.7668301905167552E-08, -3.2637845350597966E-08, 3.2101904613347501E-07, -3.3650826994957826E-07, -3.1117289066304045E-07, 7.8771611535813792E-07, -3.1117289069990237E-07, -3.3650826984246136E-07, 3.2101904612282309E-07, -3.2637845349600439E-08, -4.7668301904853071E-08, 1.2350515865276535E-08}; + constexpr FLT c9[] = {2.7912946705592266E-10, -6.8584366111657433E-09, 1.5876438439662156E-08, 2.2894800381734934E-09, -5.4355139631893104E-08, 6.9215572156100812E-08, 1.6320619156148685E-17, -6.9215572241906639E-08, 5.4355139637428967E-08, -2.2894800215659153E-09, -1.5876438439575659E-08, 6.8584366109657170E-09, -2.7912946705524691E-10}; + constexpr FLT c10[] = {-1.9473100882503891E-10, -6.0076128424585684E-11, 1.8131864354130518E-09, -3.9994904462490394E-09, 2.0334605597831887E-09, 5.0274131974512103E-09, -9.3367591026663196E-09, 5.0274136044049357E-09, 2.0334605333861501E-09, -3.9994904745315308E-09, 1.8131864358844393E-09, -6.0076128154532669E-11, -1.9473100882561411E-10}; + constexpr FLT c11[] = {-2.9813639427701670E-11, 8.8416967305832406E-11, -6.1944900155883343E-11, -2.3424446318938161E-10, 6.6123632509207570E-10, -6.5395825305270265E-10, -7.6394712006965382E-17, 6.5395802534269801E-10, -6.6123633886256970E-10, 
2.3424448263843040E-10, 6.1944899055662456E-11, -8.8416967554269098E-11, 2.9813639428048382E-11}; + for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + } else if (w==14) { + constexpr FLT c0[] = {1.0213002307223062E-07, 5.7528591418445639E-05, 2.5031206020280088E-03, 3.2405046511689233E-02, 1.8485678142025513E-01, 5.5177865704975304E-01, 9.3670793123951734E-01, 9.3670793123951712E-01, 5.5177865704975315E-01, 1.8485678142025547E-01, 3.2405046511689239E-02, 2.5031206020280179E-03, 5.7528591418445801E-05, 1.0213002307242253E-07}; + constexpr FLT c1[] = {4.6718564624239767E-07, 1.3360375098030156E-04, 3.8410346178215306E-03, 3.4207779106833425E-02, 1.2923501383683489E-01, 2.2132894130184291E-01, 1.2264779624530273E-01, -1.2264779624530257E-01, -2.2132894130184308E-01, -1.2923501383683503E-01, -3.4207779106833425E-02, -3.8410346178215393E-03, -1.3360375098030178E-04, -4.6718564624220264E-07}; + constexpr FLT c2[] = {9.3810713124204527E-07, 1.3926941499858519E-04, 2.5833386162539013E-03, 1.4797516242328850E-02, 3.0361769467151970E-02, 5.7261067343619262E-03, -5.3608938764866873E-02, -5.3608938764866894E-02, 5.7261067343618603E-03, 3.0361769467151870E-02, 1.4797516242328836E-02, 2.5833386162539061E-03, 1.3926941499858543E-04, 9.3810713124224814E-07}; + constexpr FLT c3[] = {1.0954436997682021E-06, 8.5568590196649221E-05, 9.7778250562911601E-04, 3.0692948752812804E-03, 6.0463237460738756E-04, -8.9532302111318181E-03, -7.4040784665309846E-03, 7.4040784665312838E-03, 8.9532302111319968E-03, -6.0463237460737487E-04, -3.0692948752812708E-03, -9.7778250562911818E-04, -8.5568590196649329E-05, -1.0954436997680333E-06}; + constexpr FLT c4[] = {8.3014334976692641E-07, 3.4045323043173900E-05, 2.1660980714121239E-04, 1.7421792587401689E-04, -9.2118064021561887E-04, -9.7597008655075522E-04, 1.4714477548413631E-03, 1.4714477548414121E-03, -9.7597008655073809E-04, 
-9.2118064021559762E-04, 1.7421792587402266E-04, 2.1660980714121363E-04, 3.4045323043173968E-05, 8.3014334976713224E-07}; + constexpr FLT c5[] = {4.3045614796951587E-07, 8.9716871724550274E-06, 2.3377513570381849E-05, -5.5213296993546423E-05, -1.2391624765752083E-04, 1.5869855385555775E-04, 2.1530382494154427E-04, -2.1530382494144317E-04, -1.5869855385557331E-04, 1.2391624765755973E-04, 5.5213296993542533E-05, -2.3377513570381968E-05, -8.9716871724550325E-06, -4.3045614796933747E-07}; + constexpr FLT c6[] = {1.5611302559652642E-07, 1.4859455506706785E-06, -8.5826557923722616E-07, -1.1616353402592630E-05, 8.0333594878995593E-06, 2.8616079443375728E-05, -2.5816776957707699E-05, -2.5816776957707652E-05, 2.8616079443268301E-05, 8.0333594878977314E-06, -1.1616353402591744E-05, -8.5826557923811989E-07, 1.4859455506706314E-06, 1.5611302559670737E-07}; + constexpr FLT c7[] = {3.9336515129721532E-08, 1.1257285216182540E-07, -6.2406181937560562E-07, -2.6873173855233150E-07, 2.8292088258393860E-06, -1.4598715516905790E-06, -4.0212462690723253E-06, 4.0212462691823422E-06, 1.4598715517761175E-06, -2.8292088259133913E-06, 2.6873173855647969E-07, 6.2406181937648769E-07, -1.1257285216174059E-07, -3.9336515129545720E-08}; + constexpr FLT c8[] = {6.5041263396088790E-09, -9.9149367808853263E-09, -6.6845758889620994E-08, 1.6286641992901855E-07, 5.8507874943424797E-08, -4.7688540978638226E-07, 3.2559878511421460E-07, 3.2559878519979701E-07, -4.7688540972525423E-07, 5.8507875026096430E-08, 1.6286641993325022E-07, -6.6845758889870313E-08, -9.9149367809131923E-09, 6.5041263397795280E-09}; + constexpr FLT c9[] = {5.5138523621090170E-10, -3.4792607432658830E-09, 2.1621109687111844E-09, 1.6802313210571416E-08, -3.4440501484206901E-08, 3.6408051867813727E-09, 5.4274262350067578E-08, -5.4274262322388281E-08, -3.6408052006210212E-09, 3.4440501481438969E-08, -1.6802313213339344E-08, -2.1621109679759532E-09, 3.4792607432902108E-09, -5.5138523606396516E-10}; + constexpr FLT c10[] = 
{-2.3785683828448576E-11, -2.9453404124114860E-10, 1.0997757897423152E-09, -8.6020468987368310E-10, -2.2974592934948612E-09, 5.5064437603692059E-09, -3.1470905819229834E-09, -3.1470905272434506E-09, 5.5064436867561607E-09, -2.2974592840673907E-09, -8.6020468484567061E-10, 1.0997757884067548E-09, -2.9453404129270796E-10, -2.3785683688822786E-11}; + constexpr FLT c11[] = {-1.2240623323339709E-11, 1.4269095096874458E-11, 6.3689195980296716E-11, -2.3523039255622989E-10, 2.6546832331592691E-10, 9.4137182189250380E-11, -5.6473803777133577E-10, 5.6473799518218520E-10, -9.4137157913436917E-11, -2.6546835890448598E-10, 2.3523039312408576E-10, -6.3689194329967738E-11, -1.4269094997055950E-11, 1.2240623457297303E-11}; + constexpr FLT c12[] = {-1.4791529085565623E-12, 4.8147158180813514E-12, -7.1247159181258048E-12, -3.7363568005007135E-12, 3.0923958877552072E-11, -4.7998366007614543E-11, 2.4268802632733111E-11, 2.4268880217882715E-11, -4.7998325173324774E-11, 3.0923998690985708E-11, -3.7363589698227313E-12, -7.1247171622956968E-12, 4.8147157313484649E-12, -1.4791527915262285E-12}; + for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==15) { + constexpr FLT c0[] = {2.3183302143948793E-08, 1.7202745817468655E-05, 9.2668857465754784E-04, 1.4607490553401936E-02, 1.0130044556641116E-01, 3.7041488405244677E-01, 7.8279781886019206E-01, 1.0000000000000018E+00, 7.8279781886019228E-01, 3.7041488405244727E-01, 1.0130044556641139E-01, 1.4607490553401959E-02, 9.2668857465754882E-04, 1.7202745817468652E-05, 2.3183302143948763E-08}; + constexpr FLT c1[] = {1.1019919454791572E-07, 4.1938159428224126E-05, 1.5154850601194973E-03, 1.6839357628952684E-02, 8.0835952724673255E-02, 1.8739074372244105E-01, 1.9255567517255739E-01, -9.4204294746769593E-32, -1.9255567517255723E-01, -1.8739074372244108E-01, -8.0835952724673352E-02, 
-1.6839357628952709E-02, -1.5154850601194973E-03, -4.1938159428224126E-05, -1.1019919454791572E-07}; + constexpr FLT c2[] = {2.3137327105312791E-07, 4.6266060425611204E-05, 1.1028009511991974E-03, 8.2352859806754802E-03, 2.4233386066663413E-02, 2.2182889945939449E-02, -2.5327411650384993E-02, -6.0946897479642256E-02, -2.5327411650385129E-02, 2.2182889945939359E-02, 2.4233386066663424E-02, 8.2352859806754854E-03, 1.1028009511991970E-03, 4.6266060425611204E-05, 2.3137327105312783E-07}; + constexpr FLT c3[] = {2.8457821671573274E-07, 3.0427184404092299E-05, 4.6337319534911844E-04, 2.1072304367244932E-03, 2.4342755210407531E-03, -4.2814200474568563E-03, -9.6703299158782657E-03, 1.8176153030403361E-16, 9.6703299158783507E-03, 4.2814200474569379E-03, -2.4342755210407076E-03, -2.1072304367244859E-03, -4.6337319534911817E-04, -3.0427184404092296E-05, -2.8457821671573279E-07}; + constexpr FLT c4[] = {2.2919642176438702E-07, 1.3183839322480003E-05, 1.2030953406839325E-04, 2.4905754342428421E-04, -3.4193403196993951E-04, -1.1551611179404738E-03, 2.1954335627567210E-04, 1.7895433812201793E-03, 2.1954335627571010E-04, -1.1551611179404326E-03, -3.4193403196995387E-04, 2.4905754342428610E-04, 1.2030953406839360E-04, 1.3183839322480008E-05, 2.2919642176438720E-07}; + constexpr FLT c5[] = {1.2779800356186583E-07, 3.8997040140349313E-06, 1.8264189394307498E-05, -8.3632912035128204E-06, -1.0687544349164653E-04, 2.2123224044726536E-06, 2.3404180714514772E-04, 6.5064979845545577E-17, -2.3404180714503106E-04, -2.2123224042782134E-06, 1.0687544349166598E-04, 8.3632912035006689E-06, -1.8264189394307559E-05, -3.8997040140349338E-06, -1.2779800356186589E-07}; + constexpr FLT c6[] = {5.0693377499403691E-08, 7.7594237801400426E-07, 9.4933483676717755E-07, -6.6987818302423087E-06, -4.5889941143373546E-06, 2.2647907184667538E-05, 3.7412856035449417E-06, -3.3754692339426772E-05, 3.7412856034892404E-06, 2.2647907184654951E-05, -4.5889941143014083E-06, -6.6987818302351157E-06, 
9.4933483676684456E-07, 7.7594237801399991E-07, 5.0693377499403691E-08}; + constexpr FLT c7[] = {1.4373673262756881E-08, 9.2554419735729795E-08, -2.0417866965615742E-07, -6.8820764686271727E-07, 1.4165168644096691E-06, 1.2531774951198972E-06, -3.6383191328570317E-06, 5.9333697238861927E-17, 3.6383191329076855E-06, -1.2531774952992520E-06, -1.4165168643945163E-06, 6.8820764685908223E-07, 2.0417866965620961E-07, -9.2554419735731158E-08, -1.4373673262756913E-08}; + constexpr FLT c8[] = {2.8405432421064598E-09, 2.6648052024128211E-09, -4.5328290134778586E-08, 3.2089634828694367E-08, 1.7241593348808383E-07, -2.5816631656161770E-07, -1.3664009513726493E-07, 4.6017883216168089E-07, -1.3664009510064915E-07, -2.5816631656773852E-07, 1.7241593343152281E-07, 3.2089634835965337E-08, -4.5328290134523662E-08, 2.6648052024185691E-09, 2.8405432421065198E-09}; + constexpr FLT c9[] = {3.5447644664522991E-10, -1.1390658479562114E-09, -2.4324028601311552E-09, 1.2152005527725076E-08, -7.1102518341828894E-09, -2.5878341862165437E-08, 4.0855407178225425E-08, -6.7229636689436406E-18, -4.0855407139474409E-08, 2.5878341989490202E-08, 7.1102518840056246E-09, -1.2152005535163887E-08, 2.4324028601311552E-09, 1.1390658479600971E-09, -3.5447644664517713E-10}; + constexpr FLT c10[] = {1.6106092880607926E-11, -1.9612809866225313E-10, 3.3667881388500915E-10, 5.4740705815843633E-10, -2.3219918220819429E-09, 1.8783264389538617E-09, 2.1531915835821252E-09, -4.8374637778167195E-09, 2.1531915732119103E-09, 1.8783264455530896E-09, -2.3219918255386980E-09, 5.4740706350069505E-10, 3.3667881394392907E-10, -1.9612809866164026E-10, 1.6106092880601619E-11}; + constexpr FLT c11[] = {-2.9809392328002639E-12, -8.3268200084267327E-12, 5.7687950483526562E-11, -9.1929198156856840E-11, -3.9289938224686938E-11, 3.0713724621937891E-10, -3.5332675603861928E-10, -4.7176615708722248E-17, 3.5332675632254561E-10, -3.0713734445835836E-10, 3.9289964949381516E-11, 9.1929194004414145E-11, -5.7687950660981567E-11, 
8.3268199995541140E-12, 2.9809392327699276E-12}; + constexpr FLT c12[] = {-6.7275763613050405E-13, 1.4037883809519618E-12, 1.0122748224833392E-12, -1.0507010409950668E-11, 1.9186635811522471E-11, -7.9758147674463026E-12, -2.2999207389706864E-11, 4.0853090072343795E-11, -2.2999199222849929E-11, -7.9758923525966314E-12, 1.9186574560087790E-11, -1.0507007219772089E-11, 1.0122747905815843E-12, 1.4037883779612130E-12, -6.7275763610714771E-13}; + for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==16) { + constexpr FLT c0[] = {5.2012152104084075E-09, 5.0291159580938685E-06, 3.3201112337137914E-04, 6.3015433246683345E-03, 5.2427915343763419E-02, 2.3104762006593382E-01, 5.9521037322997228E-01, 9.4441119081353919E-01, 9.4441119081353897E-01, 5.9521037322997228E-01, 2.3104762006593382E-01, 5.2427915343763426E-02, 6.3015433246683362E-03, 3.3201112337137925E-04, 5.0291159580938685E-06, 5.2012152104083968E-09}; + constexpr FLT c1[] = {2.5620581163903698E-08, 1.2815874111792785E-05, 5.7471335914300648E-04, 7.8386860177525539E-03, 4.6638901641906975E-02, 1.3897554029141568E-01, 2.0773808644544139E-01, 1.0813440420918323E-01, -1.0813440420918335E-01, -2.0773808644544151E-01, -1.3897554029141571E-01, -4.6638901641906962E-02, -7.8386860177525539E-03, -5.7471335914300648E-04, -1.2815874111792780E-05, -2.5620581163903678E-08}; + constexpr FLT c2[] = {5.6049296769722407E-08, 1.4879146623074265E-05, 4.4787865139353408E-04, 4.2383440773521713E-03, 1.6624620601556200E-02, 2.6395394769117682E-02, 3.6740117889108559E-04, -4.8088574473126838E-02, -4.8088574473126817E-02, 3.6740117889110039E-04, 2.6395394769117647E-02, 1.6624620601556183E-02, 4.2383440773521705E-03, 4.4787865139353381E-04, 1.4879146623074262E-05, 5.6049296769722367E-08}; + constexpr FLT c3[] = {7.2283166867263369E-08, 1.0391634193778174E-05, 2.0529674430143886E-04, 
1.2618687081127949E-03, 2.6256301814801142E-03, -5.5040645592548403E-04, -7.8709464111364428E-03, -5.7657980103485666E-03, 5.7657980103488684E-03, 7.8709464111365764E-03, 5.5040645592556046E-04, -2.6256301814800891E-03, -1.2618687081127923E-03, -2.0529674430143870E-04, -1.0391634193778174E-05, -7.2283166867263382E-08}; + constexpr FLT c4[] = {6.1501023800531295E-08, 4.8443034242391149E-06, 6.0167136036954489E-05, 2.0573318254801955E-04, 1.2811955521425743E-05, -8.3782209201439741E-04, -6.2669687707126603E-04, 1.1809008871739588E-03, 1.1809008871740102E-03, -6.2669687707129801E-04, -8.3782209201439957E-04, 1.2811955521424802E-05, 2.0573318254801969E-04, 6.0167136036954442E-05, 4.8443034242391132E-06, 6.1501023800531308E-08}; + constexpr FLT c5[] = {3.6571939291734573E-08, 1.5742222553115388E-06, 1.1217451065775747E-05, 1.0668471374318139E-05, -6.0694020243058218E-05, -7.4268888177597524E-05, 1.3567546096387106E-04, 1.4875477215044619E-04, -1.4875477215041898E-04, -1.3567546096383994E-04, 7.4268888177628640E-05, 6.0694020243062108E-05, -1.0668471374318139E-05, -1.1217451065775808E-05, -1.5742222553115373E-06, -3.6571939291734560E-08}; + constexpr FLT c6[] = {1.5672684443241293E-08, 3.5812571134853537E-07, 1.1292168823203332E-06, -2.5215449854185100E-06, -7.6275609266365118E-06, 9.3973092319789718E-06, 1.7891569285072030E-05, -1.8642776809419116E-05, -1.8642776809435267E-05, 1.7891569285119396E-05, 9.3973092319861496E-06, -7.6275609266374249E-06, -2.5215449854180577E-06, 1.1292168823202796E-06, 3.5812571134853394E-07, 1.5672684443241266E-08}; + constexpr FLT c7[] = {4.8970459380161511E-09, 5.4304148291621772E-08, -1.0066736763205116E-08, -5.3239387743771190E-07, 2.2987809872388434E-07, 1.8048974519458305E-06, -1.3449315565530231E-06, -2.4760016203656832E-06, 2.4760016205558345E-06, 1.3449315566530894E-06, -1.8048974519264694E-06, -2.2987809871496018E-07, 5.3239387743957950E-07, 1.0066736763205477E-08, -5.4304148291620039E-08, -4.8970459380161527E-09}; + constexpr FLT 
c8[] = {1.1055703983904693E-09, 4.3691209554215673E-09, -2.0201061499499309E-08, -2.3275033898522544E-08, 1.2633562932172848E-07, -2.2021804055583841E-08, -2.7912172397333448E-07, 2.1280289571270167E-07, 2.1280289561471954E-07, -2.7912172398563377E-07, -2.2021804043311624E-08, 1.2633562932175524E-07, -2.3275033897953490E-08, -2.0201061499405642E-08, 4.3691209554208717E-09, 1.1055703983904937E-09}; + constexpr FLT c9[] = {1.7210848751142109E-10, -1.3819378018358974E-10, -2.4707116696395418E-09, 4.6626394240840718E-09, 6.2513494821407377E-09, -2.2225751663756647E-08, 7.2716681831167356E-09, 2.9914504875425248E-08, -2.9914504880961111E-08, -7.2716681858846656E-09, 2.2225751666524578E-08, -6.2513494807567727E-09, -4.6626394246030589E-09, 2.4707116695638564E-09, 1.3819378018734865E-10, -1.7210848751139469E-10}; + constexpr FLT c10[] = {1.5548426850891040E-11, -8.2967690037353030E-11, -2.0776280196441915E-11, 6.5818716237227360E-10, -9.7473365318544434E-10, -7.2114132190269774E-10, 2.9974008768194548E-09, -1.8729406654385533E-09, -1.8729407980520035E-09, 2.9974009543459026E-09, -7.2114130179071973E-10, -9.7473365601368880E-10, 6.5818716417921449E-10, -2.0776280166982969E-11, -8.2967690036279040E-11, 1.5548426850876794E-11}; + constexpr FLT c11[] = {1.7715918253734007E-14, -8.7094275492396390E-12, 2.5402078548167017E-11, 5.6643084712743339E-13, -1.1273398069226705E-10, 1.7831197627554656E-10, 2.2124056737037060E-13, -2.7985821416111004E-10, 2.7985826569398559E-10, -2.2122821651802181E-13, -1.7831199885666961E-10, 1.1273397622040666E-10, -5.6643203607501166E-13, -2.5402078628021660E-11, 8.7094275492396907E-12, -1.7715918256992908E-14}; + constexpr FLT c12[] = {-2.1496737418348056E-13, -2.2214973543773537E-14, 2.3291735079229971E-12, -5.9732922869516132E-12, 3.0556730493177866E-12, 1.1858129781605648E-11, -2.4316397039401376E-11, 1.3235569405286772E-11, 1.3235463236132106E-11, -2.4316413373117597E-11, 1.1858131823320733E-11, 3.0556730493176707E-12, -5.9732919041302971E-12, 
2.3291735916652542E-12, -2.2214974665309464E-14, -2.1496737416109420E-13}; + constexpr FLT c13[] = {-2.3198933254093550E-14, 8.4680085604099498E-14, -5.5120431569756550E-14, -3.4224865085091971E-13, 1.0093479536840142E-12, -9.9670676529397927E-13, -4.1953479545762892E-13, 2.1120282165025634E-12, -2.1120647150379602E-12, 4.1949829692223215E-13, 9.9668454879417257E-13, -1.0093487471304360E-12, 3.4224795658530073E-13, 5.5120400575755698E-14, -8.4680084102827573E-14, 2.3198933260903755E-14}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + } else + printf("width not implemented!\n"); diff --git a/src/ker_horner_allw_loop.inc b/src/ker_horner_allw_loop.inc new file mode 100644 index 000000000..953c4618b --- /dev/null +++ b/src/ker_horner_allw_loop.inc @@ -0,0 +1,207 @@ +// Code generated by gen_all_horner_C_code.m in finufft/devel +// Authors: Alex Barnett & Ludvig af Klinteberg. +// (C) The Simons Foundation, Inc. 
+ if (w==2) { + constexpr FLT c0[] = {4.5147043243215343E+01, 4.5147043243215350E+01}; + constexpr FLT c1[] = {5.7408070938221307E+01, -5.7408070938221300E+01}; + constexpr FLT c2[] = {-1.8395117920046544E+00, -1.8395117920046602E+00}; + constexpr FLT c3[] = {-2.0382426253182064E+01, 2.0382426253182086E+01}; + constexpr FLT c4[] = {-2.0940804433577389E+00, -2.0940804433577398E+00}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); + } else if (w==3) { + constexpr FLT c0[] = {1.5653991189315130E+02, 8.8006872410780375E+02, 1.5653991189967169E+02}; + constexpr FLT c1[] = {3.1653018869611083E+02, 2.7828437114531882E-14, -3.1653018868907077E+02}; + constexpr FLT c2[] = {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117128E+02}; + constexpr FLT c3[] = {-1.5357716116473071E+01, 1.0675641863333163E-13, 1.5357716122720211E+01}; + constexpr FLT c4[] = {-3.7757583061523640E+01, 5.3222970968867450E+01, -3.7757583054647341E+01}; + constexpr FLT c5[] = {-3.9654011076088449E+00, 4.9521033695040343E-14, 3.9654011139270429E+00}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==4) { + constexpr FLT c0[] = {5.4284366850213223E+02, 1.0073871433088407E+04, 1.0073871433088407E+04, 5.4284366850213269E+02}; + constexpr FLT c1[] = {1.4650917259256942E+03, 6.1905285583602899E+03, -6.1905285583602899E+03, -1.4650917259256942E+03}; + constexpr FLT c2[] = {1.4186910680718349E+03, -1.3995339862725573E+03, -1.3995339862725571E+03, 1.4186910680718345E+03}; + constexpr FLT c3[] = {5.1133995502497481E+02, -1.4191608683682980E+03, 1.4191608683682985E+03, -5.1133995502497402E+02}; + constexpr FLT c4[] = {-4.8293622641173705E+01, 3.9393732546135901E+01, 3.9393732546136945E+01, -4.8293622641173727E+01}; + constexpr FLT c5[] = {-7.8386867802392203E+01, 1.4918904800408794E+02, -1.4918904800408947E+02, 7.8386867802392203E+01}; + constexpr FLT c6[] = {-1.0039212571700403E+01, 
5.0626747735617119E+00, 5.0626747735622777E+00, -1.0039212571700599E+01}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + } else if (w==5) { + constexpr FLT c0[] = {9.9223677575398506E+02, 3.7794697666613349E+04, 9.8715771010760567E+04, 3.7794697666613327E+04, 9.9223677575398540E+02}; + constexpr FLT c1[] = {3.0430174925083834E+03, 3.7938404259811425E+04, -4.1880997701304513E-12, -3.7938404259811403E+04, -3.0430174925083829E+03}; + constexpr FLT c2[] = {3.6092689177271232E+03, 7.7501368899498630E+03, -2.2704627332475000E+04, 7.7501368899498721E+03, 3.6092689177271213E+03}; + constexpr FLT c3[] = {1.9990077310495410E+03, -3.8875294641277214E+03, 1.6137850891850780E-11, 3.8875294641277346E+03, -1.9990077310495410E+03}; + constexpr FLT c4[] = {4.0071733590403909E+02, -1.5861137916762543E+03, 2.3839858699098786E+03, -1.5861137916762577E+03, 4.0071733590403909E+02}; + constexpr FLT c5[] = {-9.1301168206167233E+01, 1.2316471075215087E+02, 1.9401736511657983E-12, -1.2316471075215495E+02, 9.1301168206166977E+01}; + constexpr FLT c6[] = {-5.5339722671222894E+01, 1.1960590540262304E+02, -1.5249941358312140E+02, 1.1960590540262024E+02, -5.5339722671224088E+01}; + constexpr FLT c7[] = {-3.3762488150349581E+00, 2.2839981873006558E+00, 8.2819625836083788E-12, -2.2839981872910400E+00, 3.3762488150351579E+00}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==6) { + constexpr FLT c0[] = {2.0553833234911899E+03, 1.5499537739913145E+05, 8.1177907023291232E+05, 8.1177907023291232E+05, 1.5499537739913145E+05, 2.0553833235005700E+03}; + constexpr FLT c1[] = {7.1269776034442684E+03, 2.0581923258843319E+05, 3.1559612614917679E+05, -3.1559612614917639E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}; + constexpr FLT c2[] = {1.0023404568475091E+04, 9.0916650498360206E+04, -1.0095927514054631E+05, -1.0095927514054631E+05, 
9.0916650498360163E+04, 1.0023404568484637E+04}; + constexpr FLT c3[] = {7.2536109410387444E+03, 4.8347162752603444E+03, -5.0512736602018485E+04, 5.0512736602018602E+04, -4.8347162752602972E+03, -7.2536109410297577E+03}; + constexpr FLT c4[] = {2.7021878300949775E+03, -7.8773465553972374E+03, 5.2105876478343516E+03, 5.2105876478343944E+03, -7.8773465553972464E+03, 2.7021878301048723E+03}; + constexpr FLT c5[] = {3.2120291706547630E+02, -1.8229189469936912E+03, 3.7928113414428476E+03, -3.7928113414427171E+03, 1.8229189469937239E+03, -3.2120291705638328E+02}; + constexpr FLT c6[] = {-1.2051267090537345E+02, 2.2400507411399769E+02, -1.2506575852547746E+02, -1.2506575852531816E+02, 2.2400507411399730E+02, -1.2051267089640162E+02}; + constexpr FLT c7[] = {-4.5977202613346755E+01, 1.1536880606857032E+02, -1.7819720186492938E+02, 1.7819720186504426E+02, -1.1536880606851560E+02, 4.5977202622148354E+01}; + constexpr FLT c8[] = {-1.5631081288822022E+00, 7.1037430590520445E-01, -6.9838401262032682E-02, -6.9838401199524530E-02, 7.1037430591562767E-01, -1.5631081203751171E+00}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + } else if (w==7) { + constexpr FLT c0[] = {3.9948351830487582E+03, 5.4715865608590841E+05, 5.0196413492771825E+06, 9.8206709220713321E+06, 5.0196413492771871E+06, 5.4715865608590853E+05, 3.9948351830642619E+03}; + constexpr FLT c1[] = {1.5290160332974698E+04, 8.7628248584320419E+05, 3.4421061790934466E+06, 6.5103105025927563E-10, -3.4421061790934466E+06, -8.7628248584320443E+05, -1.5290160332958061E+04}; + constexpr FLT c2[] = {2.4458227486779258E+04, 5.3904618484139442E+05, 2.4315566181017563E+05, -1.6133959371974308E+06, 2.4315566181017424E+05, 5.3904618484139396E+05, 2.4458227486795091E+04}; + constexpr FLT c3[] = {2.1166189345881652E+04, 1.3382732160223150E+05, -3.3113450969689601E+05, 2.5683270626620309E-10, 3.3113450969689793E+05, -1.3382732160223130E+05, 
-2.1166189345866896E+04}; + constexpr FLT c4[] = {1.0542795672344870E+04, -7.0739172265096349E+03, -6.5563293056048627E+04, 1.2429734005960199E+05, -6.5563293056048671E+04, -7.0739172265096395E+03, 1.0542795672361222E+04}; + constexpr FLT c5[] = {2.7903491906228451E+03, -1.0975382873972989E+04, 1.3656979541145318E+04, 4.9801640867456605E-10, -1.3656979541144143E+04, 1.0975382873973054E+04, -2.7903491906078325E+03}; + constexpr FLT c6[] = {1.6069721418054232E+02, -1.5518707872249406E+03, 4.3634273936649897E+03, -5.9891976420600004E+03, 4.3634273936636964E+03, -1.5518707872250636E+03, 1.6069721419532380E+02}; + constexpr FLT c7[] = {-1.2289277373866669E+02, 2.8583630927761948E+02, -2.8318194617245649E+02, -3.5832266061541795E-11, 2.8318194617438041E+02, -2.8583630927744588E+02, 1.2289277375319726E+02}; + constexpr FLT c8[] = {-3.2270164914244575E+01, 9.1892112257588494E+01, -1.6710678096380749E+02, 2.0317049305436126E+02, -1.6710678096299210E+02, 9.1892112257580479E+01, -3.2270164900216493E+01}; + constexpr FLT c9[] = {-1.4761409684320093E-01, -9.1862771282699351E-01, 1.2845147740384601E+00, -5.0335941641611417E-10, -1.2845147731561353E+00, 9.1862771293147938E-01, 1.4761410890830065E-01}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==8) { + constexpr FLT c0[] = {7.3898000697448142E+03, 1.7297637497600052E+06, 2.5578341605285820E+07, 8.4789650417103425E+07, 8.4789650417103410E+07, 2.5578341605285831E+07, 1.7297637497600054E+06, 7.3898000697448097E+03}; + constexpr FLT c1[] = {3.0719636811267621E+04, 3.1853145713323932E+06, 2.3797981861403704E+07, 2.4569731244678468E+07, -2.4569731244678475E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267595E+04}; + constexpr FLT c2[] = {5.4488498478251728E+04, 2.4101183255475122E+06, 6.4554051283428418E+06, -8.9200440393090658E+06, -8.9200440393090583E+06, 6.4554051283428296E+06, 
2.4101183255475126E+06, 5.4488498478251728E+04}; + constexpr FLT c3[] = {5.3926359802542138E+04, 9.0469037926849385E+05, -6.0897036277695757E+05, -3.0743852105799988E+06, 3.0743852105800197E+06, 6.0897036277696723E+05, -9.0469037926849280E+05, -5.3926359802542152E+04}; + constexpr FLT c4[] = {3.2444118016247576E+04, 1.3079802224392162E+05, -5.8652889370128687E+05, 4.2333306008153327E+05, 4.2333306008153543E+05, -5.8652889370128710E+05, 1.3079802224392179E+05, 3.2444118016247601E+04}; + constexpr FLT c5[] = {1.1864306345505300E+04, -2.2700360645707835E+04, -5.0713607251411129E+04, 1.8308704458211461E+05, -1.8308704458211147E+05, 5.0713607251410089E+04, 2.2700360645707704E+04, -1.1864306345505296E+04}; + constexpr FLT c6[] = {2.2812256770903396E+03, -1.1569135767377908E+04, 2.0942387020802456E+04, -1.1661592834947036E+04, -1.1661592834946512E+04, 2.0942387020804370E+04, -1.1569135767377549E+04, 2.2812256770903291E+03}; + constexpr FLT c7[] = {8.5503535636977634E+00, -9.7513976461196773E+02, 3.8242995179186414E+03, -6.9201295567263214E+03, 6.9201295567309990E+03, -3.8242995179140653E+03, 9.7513976461263269E+02, -8.5503535636935535E+00}; + constexpr FLT c8[] = {-1.0230637348345098E+02, 2.8246898554249236E+02, -3.8638201738252542E+02, 1.9106407992706994E+02, 1.9106407993520349E+02, -3.8638201738414602E+02, 2.8246898554297724E+02, -1.0230637348344338E+02}; + constexpr FLT c9[] = {-1.9200143062942033E+01, 6.1692257626381128E+01, -1.2981109187954436E+02, 1.8681284209765820E+02, -1.8681284209914423E+02, 1.2981109187880136E+02, -6.1692257626381128E+01, 1.9200143062947838E+01}; + constexpr FLT c10[] = {3.7894993761363543E-01, -1.7334408835887836E+00, 2.5271184092462979E+00, -1.2600963912775105E+00, -1.2600963880718390E+00, 2.5271184126204269E+00, -1.7334408829982433E+00, 3.7894993761427903E-01}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if (w==9) { 
+ constexpr FLT c0[] = {1.3136365370186153E+04, 5.0196413492771843E+06, 1.1303327711722577E+08, 5.8225443924996734E+08, 9.7700272582690716E+08, 5.8225443924996805E+08, 1.1303327711722578E+08, 5.0196413492772263E+06, 1.3136365370186144E+04}; + constexpr FLT c1[] = {5.8623313038274369E+04, 1.0326318537280345E+07, 1.2898448324824868E+08, 3.0522863709830379E+08, 7.2435840302079811E-08, -3.0522863709830397E+08, -1.2898448324824865E+08, -1.0326318537280394E+07, -5.8623313038274347E+04}; + constexpr FLT c2[] = {1.1335001341875960E+05, 9.0726133144784812E+06, 5.3501544534038134E+07, -2.6789524644140172E+05, -1.2483923718899371E+08, -2.6789524644173466E+05, 5.3501544534038089E+07, 9.0726133144785147E+06, 1.1335001341875963E+05}; + constexpr FLT c3[] = {1.2489113703229754E+05, 4.3035547171861976E+06, 6.3021978510599164E+06, -2.6014941986658975E+07, 5.3074599277157087E-08, 2.6014941986659400E+07, -6.3021978510598680E+06, -4.3035547171862088E+06, -1.2489113703229751E+05}; + constexpr FLT c4[] = {8.6425493435991244E+04, 1.0891182836653311E+06, -2.0713033564200432E+06, -2.8994941183505901E+06, 7.5905338661206560E+06, -2.8994941183505324E+06, -2.0713033564200350E+06, 1.0891182836653385E+06, 8.6425493435991288E+04}; + constexpr FLT c5[] = {3.8657354724013800E+04, 7.9936390113329253E+04, -7.0458265546791849E+05, 1.0151095605715540E+06, 7.5990350518026299E-08, -1.0151095605718379E+06, 7.0458265546793933E+05, -7.9936390113333939E+04, -3.8657354724013821E+04}; + constexpr FLT c6[] = {1.0779131453134645E+04, -3.3466718311300116E+04, -1.3245366618985940E+04, 1.8238470515354761E+05, -2.9285656292981049E+05, 1.8238470515352563E+05, -1.3245366618989963E+04, -3.3466718311299133E+04, 1.0779131453134627E+04}; + constexpr FLT c7[] = {1.4992527030548656E+03, -9.7024371533879767E+03, 2.3216330734078529E+04, -2.3465262819038293E+04, -4.5678067266366728E-08, 2.3465262819229152E+04, -2.3216330734050898E+04, 9.7024371533899721E+03, -1.4992527030548690E+03}; + constexpr FLT c8[] = 
{-7.9857427421152821E+01, -4.0585588534976301E+02, 2.6054813773370911E+03, -6.1806593581469824E+03, 8.0679596873459095E+03, -6.1806593581737125E+03, 2.6054813773390433E+03, -4.0585588535087578E+02, -7.9857427421118601E+01}; + constexpr FLT c9[] = {-7.1572272057928345E+01, 2.2785637019390455E+02, -3.9109820766111051E+02, 3.3597424707310040E+02, -1.3908671051550088E-08, -3.3597424727519922E+02, 3.9109820767448468E+02, -2.2785637019111829E+02, 7.1572272057948652E+01}; + constexpr FLT c10[] = {-9.8886360697883688E+00, 3.5359026950204516E+01, -8.5251867695464611E+01, 1.4285748013461193E+02, -1.6935269664190733E+02, 1.4285748014610570E+02, -8.5251867686017064E+01, 3.5359026947336602E+01, -9.8886360697963340E+00}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if (w==10) { + constexpr FLT c0[] = {2.2594586605749224E+04, 1.3595989066786611E+07, 4.4723032442444932E+08, 3.3781755837397552E+09, 8.6836783895849857E+09, 8.6836783895849838E+09, 3.3781755837397523E+09, 4.4723032442444944E+08, 1.3595989066786496E+07, 2.2594586605749344E+04}; + constexpr FLT c1[] = {1.0729981697645644E+05, 3.0651490267742995E+07, 5.9387966085130477E+08, 2.4434902657508349E+09, 2.0073077861288934E+09, -2.0073077861288950E+09, -2.4434902657508330E+09, -5.9387966085130477E+08, -3.0651490267742828E+07, -1.0729981697645634E+05}; + constexpr FLT c2[] = {2.2340399734184612E+05, 3.0258214643190462E+07, 3.1512411458738238E+08, 4.3618276932319850E+08, -7.8178848450497270E+08, -7.8178848450497031E+08, 4.3618276932319820E+08, 3.1512411458738214E+08, 3.0258214643190324E+07, 2.2340399734184553E+05}; + constexpr FLT c3[] = {2.6917433004353492E+05, 1.6875651476661246E+07, 7.4664745481963649E+07, -9.5882157211117968E+07, -2.0622994435532477E+08, 2.0622994435532823E+08, 9.5882157211118430E+07, -7.4664745481963366E+07, -1.6875651476661157E+07, -2.6917433004353428E+05}; + constexpr FLT c4[] = 
{2.0818422772177897E+05, 5.6084730690362593E+06, 1.4435118192351859E+06, -4.0063869969544269E+07, 3.2803674392747816E+07, 3.2803674392746560E+07, -4.0063869969546124E+07, 1.4435118192352206E+06, 5.6084730690362155E+06, 2.0818422772177868E+05}; + constexpr FLT c5[] = {1.0781139496011086E+05, 9.9202615851199278E+05, -3.3266265543961083E+06, -4.8557049011452327E+05, 1.0176155522772400E+07, -1.0176155522773268E+07, 4.8557049011599307E+05, 3.3266265543962419E+06, -9.9202615851196356E+05, -1.0781139496011072E+05}; + constexpr FLT c6[] = {3.7380102688153638E+04, 1.2716675000361241E+04, -6.2163527451762755E+05, 1.4157962667184302E+06, -8.4419693137719855E+05, -8.4419693137682532E+05, 1.4157962667184921E+06, -6.2163527451772091E+05, 1.2716675000342160E+04, 3.7380102688153478E+04}; + constexpr FLT c7[] = {8.1238936393894573E+03, -3.4872365530440075E+04, 2.3913680325287874E+04, 1.2428850301835715E+05, -3.2158255329711520E+05, 3.2158255329964001E+05, -1.2428850301842803E+05, -2.3913680325138281E+04, 3.4872365530466821E+04, -8.1238936393894610E+03}; + constexpr FLT c8[] = {7.8515926628982811E+02, -6.6607899119346384E+03, 2.0167398338412942E+04, -2.8951401344643764E+04, 1.4622828141516249E+04, 1.4622828142773422E+04, -2.8951401346273171E+04, 2.0167398338466974E+04, -6.6607899119428766E+03, 7.8515926628979298E+02}; + constexpr FLT c9[] = {-1.0147176570538747E+02, -3.5304284178326540E+01, 1.3576976855470537E+03, -4.3921059355373945E+03, 7.3232085265656797E+03, -7.3232085282537992E+03, 4.3921059362506849E+03, -1.3576976853984515E+03, 3.5304284186128150E+01, 1.0147176570552679E+02}; + constexpr FLT c10[] = {-4.3161545259359876E+01, 1.5498490982726668E+02, -3.1771250761814974E+02, 3.7215448796966825E+02, -1.7181762811175784E+02, -1.7181762918070896E+02, 3.7215448823960344E+02, -3.1771250765054128E+02, 1.5498490982861634E+02, -4.3161545259484186E+01}; + constexpr FLT c11[] = {-4.2916172038642904E+00, 1.7402146073587435E+01, -4.7947588063038118E+01, 9.2697697961204668E+01, 
-1.2821427624698006E+02, 1.2821427667135228E+02, -9.2697698383138089E+01, 4.7947588092305367E+01, -1.7402146072063207E+01, 4.2916172038214455E+00}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + } else if (w==11) { + constexpr FLT c0[] = {3.7794653219809712E+04, 3.4782300224660814E+07, 1.6188020733727572E+09, 1.7196758809615025E+10, 6.3754384857724686E+10, 9.7196447559193588E+10, 6.3754384857724686E+10, 1.7196758809615013E+10, 1.6188020733727574E+09, 3.4782300224660836E+07, 3.7794653219808912E+04}; + constexpr FLT c1[] = {1.8969206922085886E+05, 8.4769319065313712E+07, 2.4230555767723413E+09, 1.5439732722639107E+10, 2.7112836839612331E+10, 7.5382856415600940E-06, -2.7112836839612324E+10, -1.5439732722639109E+10, -2.4230555767723413E+09, -8.4769319065313712E+07, -1.8969206922085691E+05}; + constexpr FLT c2[] = {4.2138380313901440E+05, 9.2050522922791913E+07, 1.5259983101266618E+09, 4.7070559561237240E+09, -1.2448027572952247E+09, -1.0161446790279316E+10, -1.2448027572952359E+09, 4.7070559561237249E+09, 1.5259983101266608E+09, 9.2050522922791883E+07, 4.2138380313901132E+05}; + constexpr FLT c3[] = {5.4814313598122029E+05, 5.8085130777589604E+07, 4.9484006166551131E+08, 1.6222124676641059E+08, -2.0440440381345210E+09, 1.6029666825264191E-05, 2.0440440381345406E+09, -1.6222124676640612E+08, -4.9484006166551065E+08, -5.8085130777589574E+07, -5.4814313598121749E+05}; + constexpr FLT c4[] = {4.6495183529254969E+05, 2.3067199578027170E+07, 6.9832590192482471E+07, -2.2024799260683393E+08, -1.2820270942588173E+08, 5.1017181199129957E+08, -1.2820270942587103E+08, -2.2024799260683718E+08, 6.9832590192482680E+07, 2.3067199578027181E+07, 4.6495183529254753E+05}; + constexpr FLT c5[] = {2.7021781043532956E+05, 5.6764510325100170E+06, -5.5650761736746123E+06, -3.9907385617899098E+07, 7.2453390663685441E+07, 1.3807321808330796E-06, 
-7.2453390663686499E+07, 3.9907385617896959E+07, 5.5650761736744791E+06, -5.6764510325100273E+06, -2.7021781043532840E+05}; + constexpr FLT c6[] = {1.0933249308680632E+05, 6.9586821127988759E+05, -3.6860240321936086E+06, 2.7428169457744057E+06, 8.3392008440658972E+06, -1.6402201025049815E+07, 8.3392008440622678E+06, 2.7428169457778567E+06, -3.6860240321934861E+06, 6.9586821127989655E+05, 1.0933249308680571E+05}; + constexpr FLT c7[] = {3.0203516161820731E+04, -3.6879059542738614E+04, -4.1141031216769724E+05, 1.4111389975281695E+06, -1.5914376635274226E+06, 6.7631682826831895E-06, 1.5914376635404355E+06, -1.4111389975219201E+06, 4.1141031216798135E+05, 3.6879059542753101E+04, -3.0203516161820640E+04}; + constexpr FLT c8[] = {5.1670143574923986E+03, -2.8613147115359603E+04, 4.3560195427027051E+04, 4.8438679581734432E+04, -2.5856630639957223E+05, 3.7994883866286115E+05, -2.5856630639708077E+05, 4.8438679579228658E+04, 4.3560195427174098E+04, -2.8613147115353891E+04, 5.1670143574923814E+03}; + constexpr FLT c9[] = {3.0888018539742438E+02, -3.7949446187486474E+03, 1.4313303205130735E+04, -2.6681600236165083E+04, 2.3856005159699442E+04, -1.9072153968212169E-06, -2.3856005160079862E+04, 2.6681600234262976E+04, -1.4313303204940523E+04, 3.7949446187568205E+03, -3.0888018539723868E+02}; + constexpr FLT c10[] = {-8.3747489794178762E+01, 1.1948077481430271E+02, 4.8528498043145930E+02, -2.5024391100070475E+03, 5.3511195380863319E+03, -6.7655484103934950E+03, 5.3511195323636521E+03, -2.5024391101798296E+03, 4.8528498086337265E+02, 1.1948077483184566E+02, -8.3747489794339316E+01}; + constexpr FLT c11[] = {-2.2640047135393669E+01, 9.0840898559070766E+01, -2.1597187557069051E+02, 3.1511228970473707E+02, -2.4856618213020064E+02, -2.0962600056762836E-06, 2.4856618232531096E+02, -3.1511228707801843E+02, 2.1597187541459934E+02, -9.0840898577362736E+01, 2.2640047135479467E+01}; + constexpr FLT c12[] = {-1.6306382885603201E+00, 7.3325946574893264E+00, -2.3241017691629008E+01, 
5.1715493346619120E+01, -8.2673008978082819E+01, 9.6489716906321945E+01, -8.2673008978083388E+01, 5.1715493276466965E+01, -2.3241017744243891E+01, 7.3325946602297218E+00, -1.6306382886202573E+00}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==12) { + constexpr FLT c0[] = {6.1722991679853279E+04, 8.4789650417103827E+07, 5.4431675199498749E+09, 7.8788892335272308E+10, 4.0355760945670074E+11, 8.8071481911347998E+11, 8.8071481911348035E+11, 4.0355760945670081E+11, 7.8788892335272507E+10, 5.4431675199498901E+09, 8.4789650417103752E+07, 6.1722991679871782E+04}; + constexpr FLT c1[] = {3.2561466099406185E+05, 2.2112758120210630E+08, 8.9911609880089836E+09, 8.3059508064200958E+10, 2.3965569143469873E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201111E+10, -8.9911609880090008E+09, -2.2112758120210621E+08, -3.2561466099404270E+05}; + constexpr FLT c2[] = {7.6621098001581512E+05, 2.6026568260310283E+08, 6.4524338253008652E+09, 3.3729904113826836E+10, 2.8555202212474079E+10, -6.8998572040731476E+10, -6.8998572040731461E+10, 2.8555202212474102E+10, 3.3729904113826820E+10, 6.4524338253008747E+09, 2.6026568260310283E+08, 7.6621098001583782E+05}; + constexpr FLT c3[] = {1.0657807616803222E+06, 1.8144472126891005E+08, 2.5524827004349880E+09, 5.2112383911371851E+09, -1.0268350564014641E+10, -1.4763245309081160E+10, 1.4763245309081381E+10, 1.0268350564014679E+10, -5.2112383911371050E+09, -2.5524827004349866E+09, -1.8144472126890993E+08, -1.0657807616803094E+06}; + constexpr FLT c4[] = {9.7829638830158766E+05, 8.2222351241520002E+07, 5.5676911894064677E+08, -4.8739037675425845E+08, -2.7153428193078089E+09, 2.5627633609246616E+09, 2.5627633609247270E+09, -2.7153428193078089E+09, -4.8739037675429344E+08, 5.5676911894064772E+08, 8.2222351241519988E+07, 9.7829638830161223E+05}; + 
constexpr FLT c5[] = {6.2536876825113979E+05, 2.4702814073680259E+07, 4.1488431554846764E+07, -2.9274790542417943E+08, 1.0742154109192364E+08, 6.2185168968026125E+08, -6.2185168968025279E+08, -1.0742154109186378E+08, 2.9274790542422217E+08, -4.1488431554844894E+07, -2.4702814073680248E+07, -6.2536876825112430E+05}; + constexpr FLT c6[] = {2.8527714307528501E+05, 4.6266378435690925E+06, -1.0665598090789001E+07, -2.6048960239884529E+07, 9.1597254427304730E+07, -5.9794495983325504E+07, -5.9794495983230442E+07, 9.1597254427350238E+07, -2.6048960239922173E+07, -1.0665598090794679E+07, 4.6266378435690831E+06, 2.8527714307530370E+05}; + constexpr FLT c7[] = {9.2873647411234633E+04, 3.6630046787437343E+05, -3.1271047224703613E+06, 4.8612412939389814E+06, 3.3820440907783178E+06, -1.6880127953644276E+07, 1.6880127953794900E+07, -3.3820440907782884E+06, -4.8612412938910574E+06, 3.1271047224760642E+06, -3.6630046787425788E+05, -9.2873647411217215E+04}; + constexpr FLT c8[] = {2.0817947751046311E+04, -5.5660303410283603E+04, -1.9519783923352187E+05, 1.0804817251249440E+06, -1.8264985852847320E+06, 9.7602844964054180E+05, 9.7602844964026869E+05, -1.8264985852578641E+06, 1.0804817251242315E+06, -1.9519783923298802E+05, -5.5660303410281354E+04, 2.0817947751063894E+04}; + constexpr FLT c9[] = {2.7986023314783351E+03, -1.9404411093657811E+04, 4.3922625001185028E+04, -7.6450317330166517E+03, -1.5273911976404343E+05, 3.3223441450907954E+05, -3.3223441450755787E+05, 1.5273911981578072E+05, 7.6450317512768770E+03, -4.3922624998712294E+04, 1.9404411093676386E+04, -2.7986023314643107E+03}; + constexpr FLT c10[] = {6.7849020474217255E+01, -1.7921351307610907E+03, 8.4980694701237535E+03, -1.9742624848712727E+04, 2.4620674811515193E+04, -1.1676544936917096E+04, -1.1676544845699163E+04, 2.4620674862652242E+04, -1.9742624819688928E+04, 8.4980694644226842E+03, -1.7921351307503089E+03, 6.7849020488654887E+01}; + constexpr FLT c11[] = {-5.4577020998540995E+01, 1.3637112871144197E+02, 
4.5513617165591533E+01, -1.1174001347694452E+03, 3.2018768920645603E+03, -5.0580352089258022E+03, 5.0580351705274497E+03, -3.2018769484133886E+03, 1.1174001005075061E+03, -4.5513609907370189E+01, -1.3637112869192950E+02, 5.4577021011650153E+01}; + constexpr FLT c12[] = {-1.0538365872663764E+01, 4.6577222493036992E+01, -1.2606964247581806E+02, 2.1881090265912360E+02, -2.3273404104747246E+02, 1.0274271612440927E+02, 1.0274271612440242E+02, -2.3273400063947102E+02, 2.1881092482740195E+02, -1.2606964693052080E+02, 4.6577222495229805E+01, -1.0538365860486415E+01}; + constexpr FLT c13[] = {-4.6087004138254672E-01, 2.5969759057927089E+00, -9.6946928123584506E+00, 2.4990051638288470E+01, -4.6013914134428035E+01, 6.2056955095902744E+01, -6.2056967309552682E+01, 4.6013924603270830E+01, -2.4990037679831403E+01, 9.6946951024178141E+00, -2.5969758989770559E+00, 4.6087004739949022E-01}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + } else if (w==13) { + constexpr FLT c0[] = {9.8715725867495858E+04, 1.9828875496808127E+08, 1.7196758809615005E+10, 3.3083776881353601E+11, 2.2668873993375454E+12, 6.7734720591167598E+12, 9.6695220682534863E+12, 6.7734720591167490E+12, 2.2668873993375454E+12, 3.3083776881353540E+11, 1.7196758809615013E+10, 1.9828875496807912E+08, 9.8715725867495596E+04}; + constexpr FLT c1[] = {5.4491110456935561E+05, 5.4903670125539398E+08, 3.0879465445278194E+10, 3.9588436413399976E+11, 1.6860562536749780E+12, 2.4256447893117891E+12, 5.2271652473787576E-04, -2.4256447893117861E+12, -1.6860562536749771E+12, -3.9588436413399896E+11, -3.0879465445278202E+10, -5.4903670125538874E+08, -5.4491110456935479E+05}; + constexpr FLT c2[] = {1.3504711883426080E+06, 6.9286979077463174E+08, 2.4618123595484570E+10, 1.9493985627722617E+11, 3.9422703517046405E+11, -1.8678883613919846E+11, -8.5538079834550037E+11, 
-1.8678883613919666E+11, 3.9422703517046375E+11, 1.9493985627722595E+11, 2.4618123595484570E+10, 6.9286979077462602E+08, 1.3504711883426073E+06}; + constexpr FLT c3[] = {1.9937206140846505E+06, 5.2512029493766004E+08, 1.1253303793811764E+10, 4.6205527735932259E+10, -1.1607472377982828E+10, -1.6305241755642276E+11, 1.6137900538478137E-04, 1.6305241755642496E+11, 1.1607472377982767E+10, -4.6205527735932159E+10, -1.1253303793811754E+10, -5.2512029493765628E+08, -1.9937206140846501E+06}; + constexpr FLT c4[] = {1.9607419630386413E+06, 2.6425362558103913E+08, 3.1171259341747184E+09, 2.9839860297840395E+09, -1.9585031917561905E+10, -5.0666917387060509E+09, 3.6568794485482040E+10, -5.0666917387052479E+09, -1.9585031917561382E+10, 2.9839860297839293E+09, 3.1171259341747251E+09, 2.6425362558103746E+08, 1.9607419630386424E+06}; + constexpr FLT c5[] = {1.3593773865640303E+06, 9.1556445104158297E+07, 4.7074012944133645E+08, -1.1192579335656993E+09, -2.1090780087868536E+09, 5.2270306737954664E+09, 5.5914317801530834E-04, -5.2270306737946453E+09, 2.1090780087878797E+09, 1.1192579335657849E+09, -4.7074012944133860E+08, -9.1556445104157880E+07, -1.3593773865640303E+06}; + constexpr FLT c6[] = {6.8417206432039291E+05, 2.1561705510027312E+07, 7.5785249893027432E+06, -2.7456096030220407E+08, 3.4589095671070045E+08, 4.0256106808935356E+08, -1.0074306926604354E+09, 4.0256106809054130E+08, 3.4589095671009880E+08, -2.7456096030236250E+08, 7.5785249893008731E+06, 2.1561705510027334E+07, 6.8417206432039256E+05}; + constexpr FLT c7[] = {2.5248269397037590E+05, 3.0985559672617475E+06, -1.1816517087615140E+07, -8.2958498769974122E+06, 8.0546642347458601E+07, -1.0594657799513456E+08, 2.0249720264016184E-04, 1.0594657799514198E+08, -8.0546642347324282E+07, 8.2958498771580132E+06, 1.1816517087620620E+07, -3.0985559672620827E+06, -2.5248269397037590E+05}; + constexpr FLT c8[] = {6.7530100970876185E+04, 1.2373362326675311E+05, -2.1245597183288219E+06, 5.1047323238642653E+06, 
-1.4139444406972022E+06, -1.1818267556148527E+07, 2.0121548578311723E+07, -1.1818267556689126E+07, -1.4139444399964837E+06, 5.1047323237335468E+06, -2.1245597183262822E+06, 1.2373362326715943E+05, 6.7530100970876825E+04}; + constexpr FLT c9[] = {1.2421368748960511E+04, -5.0576243646858849E+04, -4.8878193436522284E+04, 6.5307896871419600E+05, -1.5497610128521242E+06, 1.5137725913425679E+06, 9.4288709689637382E-06, -1.5137725926086102E+06, 1.5497610130712469E+06, -6.5307896859246108E+05, 4.8878193441087336E+04, 5.0576243646517250E+04, -1.2421368748960882E+04}; + constexpr FLT c10[] = {1.2904654687548632E+03, -1.1169946054771519E+04, 3.3275109715936509E+04, -3.1765222282529230E+04, -5.9810982046625119E+04, 2.2355863065128919E+05, -3.1083591717381903E+05, 2.2355863453495159E+05, -5.9810982317515191E+04, -3.1765222420737289E+04, 3.3275109716627514E+04, -1.1169946054393644E+04, 1.2904654687550840E+03}; + constexpr FLT c11[] = {-1.9043622268214964E+01, -6.8296542209517031E+02, 4.2702512258593224E+03, -1.2165497344048174E+04, 1.9423733117203814E+04, -1.6010024763745962E+04, 3.4546242756821764E-04, 1.6010021562009399E+04, -1.9423732921465795E+04, 1.2165497485154361E+04, -4.2702512258593424E+03, 6.8296542155861471E+02, 1.9043622268233225E+01}; + constexpr FLT c12[] = {-3.0093984466084923E+01, 9.8972865759901183E+01, -9.7437038386122609E+01, -3.5079929976821143E+02, 1.5699249129925884E+03, -3.1287450613413444E+03, 3.8692192717886201E+03, -3.1287461388880197E+03, 1.5699252721748373E+03, -3.5079941874733129E+02, -9.7437038807041006E+01, 9.8972866294818274E+01, -3.0093984465708520E+01}; + constexpr FLT c13[] = {-4.3050286012574066E+00, 2.1108975856232256E+01, -6.4297196943170974E+01, 1.2922884719917388E+02, -1.6991815434264092E+02, 1.2654996803592717E+02, -1.3650372630766216E-04, -1.2655097304483594E+02, 1.6991801475807023E+02, -1.2922895886683040E+02, 6.4297199778482565E+01, -2.1108976173160116E+01, 4.3050286010444170E+00}; + constexpr FLT c14[] = {-1.0957333734356203E-01, 
7.2949328697697935E-01, -3.4300803257592030E+00, 1.0470037850609911E+01, -2.2292132783546631E+01, 3.4570970759468082E+01, -3.9923502981338281E+01, 3.4573363471454584E+01, -2.2292171023236033E+01, 1.0470076090299283E+01, -3.4300793014818574E+00, 7.2949361239845723E-01, -1.0957333723937021E-01}; + for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); + } else if (w==14) { + constexpr FLT c0[] = {1.5499533202966311E+05, 4.4723032442444772E+08, 5.1495083701694801E+10, 1.2904576022918081E+12, 1.1534950432785514E+13, 4.5650102198520523E+13, 8.8830582190032719E+13, 8.8830582190032734E+13, 4.5650102198520523E+13, 1.1534950432785541E+13, 1.2904576022918088E+12, 5.1495083701695160E+10, 4.4723032442444867E+08, 1.5499533202970124E+05}; + constexpr FLT c1[] = {8.9188339002980455E+05, 1.3065352538728638E+09, 9.9400185225815598E+10, 1.7136059013402412E+12, 1.0144146621675834E+13, 2.3034036018490723E+13, 1.4630967270448885E+13, -1.4630967270448867E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402415E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979419E+05}; + constexpr FLT c2[] = {2.3170473769379673E+06, 1.7532505043698251E+09, 8.6523535958354309E+10, 9.7455289065487476E+11, 3.2977972139362329E+12, 1.7874626001697834E+12, -6.1480918082633936E+12, -6.1480918082634014E+12, 1.7874626001697737E+12, 3.2977972139362251E+12, 9.7455289065487329E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; + constexpr FLT c3[] = {3.6089249230396431E+06, 1.4278058213962200E+09, 4.4296625537022446E+10, 2.9466624630419830E+11, 3.1903621584503467E+11, -9.8834691411254578E+11, -1.1072264714919094E+12, 1.1072264714919380E+12, 9.8834691411255481E+11, -3.1903621584503326E+11, -2.9466624630419788E+11, -4.4296625537022636E+10, -1.4278058213962224E+09, -3.6089249230396668E+06}; + 
constexpr FLT c4[] = {3.7733555140851745E+06, 7.8376718099107444E+08, 1.4443117772349586E+10, 4.3197433307418678E+10, -7.6585042240583893E+10, -1.8569640140762125E+11, 2.0385335192658521E+11, 2.0385335192658505E+11, -1.8569640140762244E+11, -7.6585042240577591E+10, 4.3197433307418831E+10, 1.4443117772349697E+10, 7.8376718099107611E+08, 3.7733555140852574E+06}; + constexpr FLT c5[] = {2.8079157920112340E+06, 3.0340753492383713E+08, 2.9498136661747241E+09, -6.2820200387946582E+08, -2.2372008390623741E+10, 1.5217518660587065E+10, 4.0682590266890762E+10, -4.0682590266874344E+10, -1.5217518660581593E+10, 2.2372008390624836E+10, 6.2820200387926054E+08, -2.9498136661747794E+09, -3.0340753492383808E+08, -2.8079157920112382E+06}; + constexpr FLT c6[] = {1.5361613559533129E+06, 8.3513615594416931E+07, 3.0077547202709264E+08, -1.3749596754065564E+09, -6.6733027297578251E+08, 5.9590333632812872E+09, -4.3025685566868906E+09, -4.3025685566947279E+09, 5.9590333632843285E+09, -6.6733027297604084E+08, -1.3749596754066198E+09, 3.0077547202708143E+08, 8.3513615594416305E+07, 1.5361613559533581E+06}; + constexpr FLT c7[] = {6.2759409419593017E+05, 1.5741723594963871E+07, -1.5632610223386128E+07, -1.9294824907063219E+08, 4.4643806532504034E+08, 1.5178998384579189E+07, -9.6771139891231704E+08, 9.6771139892423606E+08, -1.5178998381071322E+07, -4.4643806533015347E+08, 1.9294824907069016E+08, 1.5632610223408137E+07, -1.5741723594963046E+07, -6.2759409419590794E+05}; + constexpr FLT c8[] = {1.9151404903933618E+05, 1.7156606891565623E+06, -9.7733523156695794E+06, 4.2982266232611798E+06, 5.1660907884888940E+07, -1.1279400211171694E+08, 6.4701089576848499E+07, 6.4701089570801638E+07, -1.1279400210612530E+08, 5.1660907893511616E+07, 4.2982266235306170E+06, -9.7733523156822342E+06, 1.7156606891565854E+06, 1.9151404903936735E+05}; + constexpr FLT c9[] = {4.2715272622844263E+04, -2.2565910611002505E+03, -1.1769776156928577E+06, 4.0078399906352242E+06, -3.8951858073074366E+06, 
-5.0944610789569877E+06, 1.6765992441849992E+07, -1.6765992434448514E+07, 5.0944610797360903E+06, 3.8951858063335577E+06, -4.0078399906595708E+06, 1.1769776157202481E+06, 2.2565910608803192E+03, -4.2715272622819932E+04}; + constexpr FLT c10[] = {6.4806786522801558E+03, -3.5474227032715331E+04, 1.8237100734263218E+04, 3.0934714642964909E+05, -1.0394703930801603E+06, 1.4743920316337310E+06, -7.3356881642929500E+05, -7.3356882324020052E+05, 1.4743920364765557E+06, -1.0394703915764539E+06, 3.0934714676135289E+05, 1.8237100683125096E+04, -3.5474227032952876E+04, 6.4806786523017845E+03}; + constexpr FLT c11[] = {4.9913632908494827E+02, -5.5416668522806276E+03, 2.0614058722611946E+04, -3.2285139157855901E+04, -5.3099566255893524E+03, 1.1559000150525174E+05, -2.2569743273246771E+05, 2.2569743457059452E+05, -1.1559000428242185E+05, 5.3099542679931265E+03, 3.2285138893125553E+04, -2.0614058670789782E+04, 5.5416668532562171E+03, -4.9913632906264002E+02}; + constexpr FLT c12[] = {-3.3076333188696488E+01, -1.8970588558436827E+02, 1.8160423493169353E+03, -6.3715703265863249E+03, 1.2525624646166696E+04, -1.4199807314837786E+04, 6.4441944019082612E+03, 6.4441857815347785E+03, -1.4199805590763088E+04, 1.2525627375951648E+04, -6.3715703355659844E+03, 1.8160422864600705E+03, -1.8970588672434647E+02, -3.3076333168693779E+01}; + constexpr FLT c13[] = {-1.4394533628062636E+01, 5.7000699174526638E+01, -1.0101142144442984E+02, -3.2954074617159108E+01, 6.1417869930814436E+02, -1.6177306801656998E+03, 2.4593354137960296E+03, -2.4593361954696252E+03, 1.6177288934831954E+03, -6.1417959264939657E+02, 3.2954074617159108E+01, 1.0101142929606195E+02, -5.7000698932570963E+01, 1.4394533639244566E+01}; + constexpr FLT c14[] = {-1.5925952284527973E+00, 8.5113930275160214E+00, -2.8993510636695618E+01, 6.6373557362227814E+01, -1.0329536491693236E+02, 1.0280181071020283E+02, -4.3891122033571499E+01, -4.3893656778687756E+01, 1.0280325289276884E+02, -1.0329444716438918E+02, 6.6373666618482872E+01, 
-2.8993528390837142E+01, 8.5113926647511526E+00, -1.5925952190335899E+00}; + constexpr FLT c15[] = {1.5984868634272537E-02, 1.2876168577716327E-01, -9.8358742969178536E-01, 3.7710928871122080E+00, -9.4315137784350505E+00, 1.6840408563519507E+01, -2.2308532530501328E+01, 2.2310146222863779E+01, -1.6843058416240989E+01, 9.4311230950209399E+00, -3.7712287769953385E+00, 9.8360653920659347E-01, -1.2876103884046056E-01, -1.5984859595043394E-02}; + for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); + } else if (w==15) { + constexpr FLT c0[] = {2.3939707792242090E+05, 9.7700272582690299E+08, 1.4715933396485275E+11, 4.7242424833337236E+12, 5.3987426629953617E+13, 2.7580474290566103E+14, 7.0693378336533425E+14, 9.6196578554477850E+14, 7.0693378336533425E+14, 2.7580474290566153E+14, 5.3987426629953828E+13, 4.7242424833337285E+12, 1.4715933396485275E+11, 9.7700272582690418E+08, 2.3939707792242119E+05}; + constexpr FLT c1[] = {1.4314487885226035E+06, 2.9961416925358462E+09, 3.0273361232748425E+11, 6.8507333793903604E+12, 5.4192702756911016E+13, 1.7551587948105316E+14, 2.1874615668430153E+14, 5.4722295550654096E-02, -2.1874615668430156E+14, -1.7551587948105334E+14, -5.4192702756911172E+13, -6.8507333793903730E+12, -3.0273361232748438E+11, -2.9961416925358448E+09, -1.4314487885226023E+06}; + constexpr FLT c2[] = {3.8829497354762922E+06, 4.2473082696966453E+09, 2.8414312556015533E+11, 4.3688281331121431E+12, 2.1823119508000547E+13, 3.2228098609392133E+13, -2.1833085454691801E+13, -7.3750710225100750E+13, -2.1833085454691875E+13, 3.2228098609392070E+13, 2.1823119508000590E+13, 4.3688281331121470E+12, 2.8414312556015527E+11, 4.2473082696966438E+09, 3.8829497354762908E+06}; + constexpr FLT c3[] = {6.3495763451755792E+06, 3.6841035003733959E+09, 1.5965774278321054E+11, 1.5630338683778213E+12, 
3.8749058615819409E+12, -2.7319740087722651E+12, -1.3233342822865350E+13, 1.2682483963161023E-01, 1.3233342822865453E+13, 2.7319740087724204E+12, -3.8749058615819307E+12, -1.5630338683778201E+12, -1.5965774278321042E+11, -3.6841035003733950E+09, -6.3495763451755783E+06}; + constexpr FLT c4[] = {7.0146619045520453E+06, 2.1782897863065763E+09, 5.8897780310148117E+10, 3.1953009601770477E+11, 4.0651527030195397E+08, -1.6379148273275671E+12, -1.1568753137013023E+11, 2.7451653250461045E+12, -1.1568753137006947E+11, -1.6379148273276748E+12, 4.0651527030228132E+08, 3.1953009601770502E+11, 5.8897780310148155E+10, 2.1782897863065772E+09, 7.0146619045520453E+06}; + constexpr FLT c5[] = {5.5580012413990172E+06, 9.2345162185944211E+08, 1.4522950934020031E+10, 2.7025952371212032E+10, -1.2304576967641461E+11, -1.0116752717201025E+11, 3.8517418245450385E+11, 1.3143739157465117E-02, -3.8517418245443384E+11, 1.0116752717219414E+11, 1.2304576967643431E+11, -2.7025952371216137E+10, -1.4522950934020092E+10, -9.2345162185944176E+08, -5.5580012413990181E+06}; + constexpr FLT c6[] = {3.2693972344231815E+06, 2.8610260147425276E+08, 2.2348528403751349E+09, -3.4574515574230409E+09, -1.7480626463581440E+10, 3.1608597465590984E+10, 1.9879262560063576E+10, -6.6148013553869423E+10, 1.9879262560078850E+10, 3.1608597465530212E+10, -1.7480626463573368E+10, -3.4574515574202504E+09, 2.2348528403750744E+09, 2.8610260147425228E+08, 3.2693972344231787E+06}; + constexpr FLT c7[] = {1.4553539959296281E+06, 6.4136842048384696E+07, 1.3622336582072574E+08, -1.2131510424637468E+09, 6.4322366984755766E+08, 4.5078753872548027E+09, -7.1689413747004452E+09, 3.2111361580040181E-03, 7.1689413747369127E+09, -4.5078753874649162E+09, -6.4322366984639454E+08, 1.2131510424612916E+09, -1.3622336582064471E+08, -6.4136842048384838E+07, -1.4553539959296265E+06}; + constexpr FLT c8[] = {4.9358776531681791E+05, 9.7772970960583091E+06, -2.3511574237971250E+07, -1.0142613816625430E+08, 3.9421144217985487E+08, 
-2.8449115594571364E+08, -5.7549243248595941E+08, 1.1608781630719392E+09, -5.7549243238966489E+08, -2.8449115596289498E+08, 3.9421144214631909E+08, -1.0142613816300942E+08, -2.3511574237913735E+07, 9.7772970960591603E+06, 4.9358776531681628E+05}; + constexpr FLT c9[] = {1.2660319987326709E+05, 7.7519511328105081E+05, -6.5244610661542164E+06, 9.0878257490973976E+06, 2.3116605621149909E+07, -8.7079594477661625E+07, 9.5542733670714021E+07, -3.4623017322338634E-02, -9.5542733658248380E+07, 8.7079594589852452E+07, -2.3116605559600774E+07, -9.0878257518242579E+06, 6.5244610661450867E+06, -7.7519511328086059E+05, -1.2660319987326671E+05}; + constexpr FLT c10[] = {2.3793325531461589E+04, -4.2305332802771904E+04, -5.2884156975031609E+05, 2.5307340145554747E+06, -4.0404175204335153E+06, -1.7519988538994591E+05, 1.0146438798034744E+07, -1.5828545528861172E+07, 1.0146438794496680E+07, -1.7520001842407117E+05, -4.0404175643064296E+06, 2.5307340160591919E+06, -5.2884156977243477E+05, -4.2305332802771285E+04, 2.3793325531458995E+04}; + constexpr FLT c11[] = {2.9741655196857741E+03, -2.0687056403629973E+04, 3.3295507834673197E+04, 1.0661145690364030E+05, -5.6644238449031080E+05, 1.0874811673184116E+06, -9.6561276275880623E+05, -7.6207036577648435E-02, 9.6561275636531680E+05, -1.0874812580259521E+06, 5.6644242612787138E+05, -1.0661145858193116E+05, -3.3295507822185595E+04, 2.0687056403005630E+04, -2.9741655196852739E+03}; + constexpr FLT c12[] = {1.5389176594840404E+02, -2.3864418517811582E+03, 1.0846266965476148E+04, -2.2940053899336592E+04, 1.4780105833703366E+04, 4.2663634529139046E+04, -1.3047650082135458E+05, 1.7468394417865420E+05, -1.3047642955960588E+05, 4.2663569014305380E+04, 1.4780038020101238E+04, -2.2940052498526344E+04, 1.0846266965476338E+04, -2.3864418513602504E+03, 1.5389176594853458E+02}; + constexpr FLT c13[] = {-2.3857631312306911E+01, -1.9651606200276817E+01, 6.4183084244784663E+02, -2.8648428291977302E+03, 6.8249248253356263E+03, -9.7944434082514545E+03, 
7.6177566999585488E+03, -4.8285923071218206E-02, -7.6177709934185850E+03, 9.7944219680614005E+03, -6.8249060651693289E+03, 2.8648407633460843E+03, -6.4183085466149657E+02, 1.9651606115081155E+01, 2.3857631312306911E+01}; + constexpr FLT c14[] = {-6.1348505726741482E+00, 2.7872916302350376E+01, -6.5819898558168433E+01, 5.1367134246654771E+01, 1.7214275703496423E+02, -6.9657243183240860E+02, 1.3192259272931558E+03, -1.6054145588281010E+03, 1.3192138654025996E+03, -6.9662907027505264E+02, 1.7212038135392731E+02, 5.1368095701697484E+01, -6.5819904020980715E+01, 2.7872916473063263E+01, -6.1348505738411490E+00}; + constexpr FLT c15[] = {-4.9671584422774523E-01, 3.0617550953446120E+00, -1.1650665638577927E+01, 3.0081331929557447E+01, -5.4030564936801589E+01, 6.6075844179663960E+01, -4.7176211285519123E+01, -3.4313439732287163E-02, 4.7173085818207042E+01, -6.6061100127341888E+01, 5.4056655794367416E+01, -3.0081722612971500E+01, 1.1650665638577902E+01, -3.0617553939307713E+00, 4.9671584448693240E-01}; + constexpr FLT c16[] = {4.3460783761337983E-03, -1.3199934226522787E-02, -1.9412503880258877E-01, 1.1325756464362078E+00, -3.4439944517155450E+00, 7.1653575841078521E+00, -1.1108195405465501E+01, 1.2348789868125033E+01, -1.1088023137785596E+01, 7.0939141360622937E+00, -3.4847592426682690E+00, 1.1324705825441117E+00, -1.9413837699275374E-01, -1.3199908576142469E-02, 4.3460782759542488E-03}; + for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); + } else if (w==16) { + constexpr FLT c0[] = {3.6434551345571154E+05, 2.0744705928579516E+09, 4.0355760945670056E+11, 1.6364575388763043E+13, 2.3514830376056566E+14, 1.5192201717462540E+15, 4.9956173084674150E+15, 8.9287666945127440E+15, 8.9287666945127440E+15, 4.9956173084674160E+15, 1.5192201717462542E+15, 2.3514830376056566E+14, 
1.6364575388763049E+13, 4.0355760945670068E+11, 2.0744705928579512E+09, 3.6434551345570991E+05}; + constexpr FLT c1[] = {2.2576246485480345E+06, 6.6499571180086479E+09, 8.7873753526056311E+11, 2.5606844387131062E+13, 2.6313738449330162E+14, 1.1495095100701470E+15, 2.1932582707747572E+15, 1.2860244365132608E+15, -1.2860244365132600E+15, -2.1932582707747580E+15, -1.1495095100701462E+15, -2.6313738449330162E+14, -2.5606844387131066E+13, -8.7873753526056299E+11, -6.6499571180086479E+09, -2.2576246485480345E+06}; + constexpr FLT c2[] = {6.3730995546265058E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, 1.7953384130753688E+13, 1.2398425545001667E+14, 3.0749346493041262E+14, 1.0259777520247212E+14, -5.5291976457534244E+14, -5.5291976457534294E+14, 1.0259777520247097E+14, 3.0749346493041212E+14, 1.2398425545001659E+14, 1.7953384130753672E+13, 8.8097248605448987E+11, 9.9060026035198078E+09, 6.3730995546265077E+06}; + constexpr FLT c3[] = {1.0896915393078227E+07, 9.0890343524593887E+09, 5.3565169504010052E+11, 7.3004206720038770E+12, 2.9692333044160145E+13, 1.6051737468109752E+13, -9.1273329108089609E+13, -8.5999306918501562E+13, 8.5999306918502812E+13, 9.1273329108090391E+13, -1.6051737468109348E+13, -2.9692333044160059E+13, -7.3004206720038691E+12, -5.3565169504010046E+11, -9.0890343524593925E+09, -1.0896915393078225E+07}; + constexpr FLT c4[] = {1.2655725616100591E+07, 5.7342804054544220E+09, 2.1822836608899585E+11, 1.8300700858999712E+12, 2.7770431049857900E+12, -8.5034969223848574E+12, -1.2846668467422469E+13, 1.6519076896573322E+13, 1.6519076896573414E+13, -1.2846668467422033E+13, -8.5034969223850078E+12, 2.7770431049858350E+12, 1.8300700858999753E+12, 2.1822836608899594E+11, 5.7342804054544239E+09, 1.2655725616100593E+07}; + constexpr FLT c5[] = {1.0609303958036318E+07, 2.6255609052371716E+09, 6.1673589426039268E+10, 2.6044432099085120E+11, -3.5431628074578119E+11, -1.6077602129631777E+12, 1.5534405614726155E+12, 2.8019935380863682E+12, -2.8019935380852476E+12, 
-1.5534405614728257E+12, 1.6077602129636682E+12, 3.5431628074579871E+11, -2.6044432099085229E+11, -6.1673589426039368E+10, -2.6255609052371745E+09, -1.0609303958036322E+07}; + constexpr FLT c6[] = {6.6544809363384582E+06, 8.9490403680928528E+08, 1.1882638725190987E+10, 8.1552898137820768E+09, -1.2575562817884897E+11, 2.7074695075942204E+10, 3.9453789461929230E+11, -3.1679644857371918E+11, -3.1679644857384814E+11, 3.9453789461920764E+11, 2.7074695075779831E+10, -1.2575562817882477E+11, 8.1552898137801113E+09, 1.1882638725190844E+10, 8.9490403680928373E+08, 6.6544809363384526E+06}; + constexpr FLT c7[] = {3.1906872142825029E+06, 2.2785946180651915E+08, 1.3744578972811413E+09, -4.3997172592843504E+09, -9.2011130753862667E+09, 3.4690551711764793E+10, -9.4227043392778511E+09, -5.9308465069355759E+10, 5.9308465069781982E+10, 9.4227043396369877E+09, -3.4690551711565643E+10, 9.2011130754329739E+09, 4.3997172592904301E+09, -1.3744578972811375E+09, -2.2785946180652067E+08, -3.1906872142825001E+06}; + constexpr FLT c8[] = {1.1821527096621764E+06, 4.2281234059839748E+07, 2.8723226058752719E+07, -8.3553955857505906E+08, 1.2447304828865275E+09, 2.1955280942222519E+09, -7.0514195727878428E+09, 4.3745141232918625E+09, 4.3745141237316084E+09, -7.0514195722924280E+09, 2.1955280943332024E+09, 1.2447304828901291E+09, -8.3553955857124400E+08, 2.8723226058927339E+07, 4.2281234059842363E+07, 1.1821527096621776E+06}; + constexpr FLT c9[] = {3.3854610744279926E+05, 5.2176984975088174E+06, -2.0677283565109752E+07, -3.5831818967739724E+07, 2.6599346107970935E+08, -3.7992777963644773E+08, -1.3426914477301279E+08, 9.1752051236703849E+08, -9.1752051203046608E+08, 1.3426914449876857E+08, 3.7992777988576066E+08, -2.6599346104854524E+08, 3.5831818969687484E+07, 2.0677283565073233E+07, -5.2176984975085324E+06, -3.3854610744279926E+05}; + constexpr FLT c10[] = {7.3893334077310792E+04, 2.6983804209766653E+05, -3.6415998560216571E+06, 8.4025485866871737E+06, 4.9278860835956605E+06, 
-5.1437033778820507E+07, 8.7603898248918146E+07, -4.6199497914231867E+07, -4.6199497948197275E+07, 8.7603898697554156E+07, -5.1437033767498761E+07, 4.9278861543586710E+06, 8.4025485891638417E+06, -3.6415998559774463E+06, 2.6983804209732520E+05, 7.3893334077308697E+04}; + constexpr FLT c11[] = {1.1778892113376965E+04, -4.0077190108567142E+04, -1.8372552169915423E+05, 1.3262878389569877E+06, -2.9738540196046322E+06, 1.9493506557541618E+06, 4.1881949490808225E+06, -1.1066749801915919E+07, 1.1066748877418302E+07, -4.1881948928182255E+06, -1.9493507634843190E+06, 2.9738539997848324E+06, -1.3262878392766670E+06, 1.8372552166918706E+05, 4.0077190106849979E+04, -1.1778892113376709E+04}; + constexpr FLT c12[] = {1.2019749667900676E+03, -1.0378455845063749E+04, 2.6333352662141660E+04, 1.7117059675298591E+04, -2.5133289742429825E+05, 6.4713895872015413E+05, -8.1634975674778735E+05, 3.8623909535608569E+05, 3.8623887467451266E+05, -8.1634966479713970E+05, 6.4713897711029404E+05, -2.5133289282677229E+05, 1.7117063267120848E+04, 2.6333352680101594E+04, -1.0378455843660833E+04, 1.2019749667921026E+03}; + constexpr FLT c13[] = {3.1189837631121321E+01, -8.9083493701244504E+02, 4.9454293991649774E+03, -1.3124692742151998E+04, 1.5834795298841136E+04, 6.9608292767098355E+03, -5.9790200829217545E+04, 1.0841735230501879E+05, -1.0841732371809872E+05, 5.9789914960016831E+04, -6.9607435159496199E+03, -1.5834797085523640E+04, 1.3124692295481371E+04, -4.9454294410403490E+03, 8.9083493766674769E+02, -3.1189837632399257E+01}; + constexpr FLT c14[] = {-1.2975319072478742E+01, 1.8283699094028595E+01, 1.7684019694555272E+02, -1.1059902320249000E+03, 3.1998244780238201E+03, -5.5987981589200417E+03, 5.9247600879368474E+03, -2.5988290685215188E+03, -2.5988178806809206E+03, 5.9249852432272892E+03, -5.5987701893187350E+03, 3.1998552445852642E+03, -1.1059895327848767E+03, 1.7684022972243278E+02, 1.8283699179384410E+01, -1.2975319072812146E+01}; + constexpr FLT c15[] = {-2.3155118729306223E+00, 
1.1938503369059017E+01, -3.4150537494399323E+01, 4.8897188710734866E+01, 1.5839596560322873E+01, -2.4289147960969117E+02, 6.0143231605823757E+02, -8.8772403477020873E+02, 8.8712611928432557E+02, -6.0139861536721287E+02, 2.4281211991792659E+02, -1.5853729108169823E+01, -4.8898479664625256E+01, 3.4150529001281690E+01, -1.1938504563403686E+01, 2.3155118727038264E+00}; + constexpr FLT c16[] = {-1.5401723836370515E-01, 9.8067787978090881E-01, -4.1900810719931050E+00, 1.2149798852514468E+01, -2.4780790340446881E+01, 3.6014221907804398E+01, -3.4588714991383583E+01, 1.3071629460227753E+01, 1.2883354961750646E+01, -3.4615611348253751E+01, 3.5973877372428277E+01, -2.4777428295844171E+01, 1.2151059619254390E+01, -4.1901237542037384E+00, 9.8067813628521039E-01, -1.5401723766235165E-01}; + constexpr FLT c17[] = {1.1808834947531816E-02, -2.5444032491006262E-02, -1.4707353726716647E-04, 2.5840423001794482E-01, -1.0910598687678679E+00, 2.6514321899473572E+00, -4.5034457705829842E+00, 6.8479728528821520E+00, -6.8634402190500978E+00, 4.4285511554539836E+00, -2.6424773990080204E+00, 1.0878035811535636E+00, -2.5882398584322625E-01, 1.3196868749378181E-04, 2.5444131865017927E-02, -1.1808835384234016E-02}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); + } else + printf("width not implemented!\n"); From 12822a218ffbf6c8090a2343440b6d1c5bae81d0 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 14:40:42 -0400 Subject: [PATCH 20/68] updated cufinufft to new coeff --- .../contrib/ker_horner_allw_loop.inc | 346 +++++++++--------- .../ker_horner_allw_loop_constexpr.inc | 205 ----------- .../ker_lowupsampfac_horner_allw_loop.inc | 317 ++++++++-------- ...owupsampfac_horner_allw_loop_constexpr.inc | 171 --------- include/cufinufft/spreadinterp.h | 22 +- 5 files changed, 
329 insertions(+), 732 deletions(-) delete mode 100644 include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc delete mode 100644 include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index 953c4618b..1f4c59e2a 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -2,206 +2,204 @@ // Authors: Alex Barnett & Ludvig af Klinteberg. // (C) The Simons Foundation, Inc. if (w==2) { - constexpr FLT c0[] = {4.5147043243215343E+01, 4.5147043243215350E+01}; - constexpr FLT c1[] = {5.7408070938221307E+01, -5.7408070938221300E+01}; - constexpr FLT c2[] = {-1.8395117920046544E+00, -1.8395117920046602E+00}; - constexpr FLT c3[] = {-2.0382426253182064E+01, 2.0382426253182086E+01}; - constexpr FLT c4[] = {-2.0940804433577389E+00, -2.0940804433577398E+00}; - for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); + constexpr FLT c0[] = {5.5428559551548406E-01, 5.5428559551548395E-01}; + constexpr FLT c1[] = {7.0481840008800778E-01, -7.0481840008800811E-01}; + constexpr FLT c2[] = {-2.2584311526143548E-02, -2.2584311526143607E-02}; + constexpr FLT c3[] = {-2.5024197515954211E-01, 2.5024197515954211E-01}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); } else if (w==3) { - constexpr FLT c0[] = {1.5653991189315130E+02, 8.8006872410780375E+02, 1.5653991189967169E+02}; - constexpr FLT c1[] = {3.1653018869611083E+02, 2.7828437114531882E-14, -3.1653018868907077E+02}; - constexpr FLT c2[] = {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117128E+02}; - constexpr FLT c3[] = {-1.5357716116473071E+01, 1.0675641863333163E-13, 1.5357716122720211E+01}; - constexpr FLT c4[] = {-3.7757583061523640E+01, 5.3222970968867450E+01, -3.7757583054647341E+01}; - constexpr FLT c5[] = {-3.9654011076088449E+00, 
4.9521033695040343E-14, 3.9654011139270429E+00}; + constexpr FLT c0[] = {1.7787237246937579E-01, 1.0000000000000013E+00, 1.7787237247678464E-01}; + constexpr FLT c1[] = {3.5966530797581003E-01, -4.2425842671825248E-17, -3.5966530796781060E-01}; + constexpr FLT c2[] = {2.0160576446392536E-01, -3.7666666666667331E-01, 2.0160576447145470E-01}; + constexpr FLT c3[] = {-1.7450587318669351E-02, 2.2939218956436377E-17, 1.7450587325767743E-02}; + constexpr FLT c4[] = {-4.2902993854032963E-02, 6.0475925925925586E-02, -4.2902993846219546E-02}; + constexpr FLT c5[] = {-4.5057857403453909E-03, 6.6232851036457955E-18, 4.5057857475245110E-03}; for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); } else if (w==4) { - constexpr FLT c0[] = {5.4284366850213223E+02, 1.0073871433088407E+04, 1.0073871433088407E+04, 5.4284366850213269E+02}; - constexpr FLT c1[] = {1.4650917259256942E+03, 6.1905285583602899E+03, -6.1905285583602899E+03, -1.4650917259256942E+03}; - constexpr FLT c2[] = {1.4186910680718349E+03, -1.3995339862725573E+03, -1.3995339862725571E+03, 1.4186910680718345E+03}; - constexpr FLT c3[] = {5.1133995502497481E+02, -1.4191608683682980E+03, 1.4191608683682985E+03, -5.1133995502497402E+02}; - constexpr FLT c4[] = {-4.8293622641173705E+01, 3.9393732546135901E+01, 3.9393732546136945E+01, -4.8293622641173727E+01}; - constexpr FLT c5[] = {-7.8386867802392203E+01, 1.4918904800408794E+02, -1.4918904800408947E+02, 7.8386867802392203E+01}; - constexpr FLT c6[] = {-1.0039212571700403E+01, 5.0626747735617119E+00, 5.0626747735622777E+00, -1.0039212571700599E+01}; + constexpr FLT c0[] = {3.9828257752799377E-02, 7.3911656575585805E-01, 7.3911656575585805E-01, 3.9828257752799433E-02}; + constexpr FLT c1[] = {1.0749328817387334E-01, 4.5419700247912287E-01, -4.5419700247912287E-01, -1.0749328817387330E-01}; + constexpr FLT c2[] = {1.0408888748149289E-01, -1.0268333881994456E-01, -1.0268333881994476E-01, 1.0408888748149285E-01}; + constexpr FLT 
c3[] = {3.7516840869185789E-02, -1.0412335657155622E-01, 1.0412335657155641E-01, -3.7516840869185733E-02}; + constexpr FLT c4[] = {-3.5432868834529888E-03, 2.8903049344237370E-03, 2.8903049344238003E-03, -3.5432868834529676E-03}; + constexpr FLT c5[] = {-5.7512181801490673E-03, 1.0945950376831730E-02, -1.0945950376831654E-02, 5.7512181801490829E-03}; + constexpr FLT c6[] = {-7.3657365672905430E-04, 3.7144674885200340E-04, 3.7144674885207063E-04, -7.3657365672907728E-04}; for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); } else if (w==5) { - constexpr FLT c0[] = {9.9223677575398506E+02, 3.7794697666613349E+04, 9.8715771010760567E+04, 3.7794697666613327E+04, 9.9223677575398540E+02}; - constexpr FLT c1[] = {3.0430174925083834E+03, 3.7938404259811425E+04, -4.1880997701304513E-12, -3.7938404259811403E+04, -3.0430174925083829E+03}; - constexpr FLT c2[] = {3.6092689177271232E+03, 7.7501368899498630E+03, -2.2704627332475000E+04, 7.7501368899498721E+03, 3.6092689177271213E+03}; - constexpr FLT c3[] = {1.9990077310495410E+03, -3.8875294641277214E+03, 1.6137850891850780E-11, 3.8875294641277346E+03, -1.9990077310495410E+03}; - constexpr FLT c4[] = {4.0071733590403909E+02, -1.5861137916762543E+03, 2.3839858699098786E+03, -1.5861137916762577E+03, 4.0071733590403909E+02}; - constexpr FLT c5[] = {-9.1301168206167233E+01, 1.2316471075215087E+02, 1.9401736511657983E-12, -1.2316471075215495E+02, 9.1301168206166977E+01}; - constexpr FLT c6[] = {-5.5339722671222894E+01, 1.1960590540262304E+02, -1.5249941358312140E+02, 1.1960590540262024E+02, -5.5339722671224088E+01}; - constexpr FLT c7[] = {-3.3762488150349581E+00, 2.2839981873006558E+00, 8.2819625836083788E-12, -2.2839981872910400E+00, 3.3762488150351579E+00}; + constexpr FLT c0[] = {1.0051451410391413E-02, 3.8286382489474308E-01, 1.0000000000000009E+00, 3.8286382489474252E-01, 1.0051451410391420E-02}; + constexpr FLT c1[] = {3.0826052021380446E-02, 3.8431958613457984E-01, 
-4.7102147373384796E-32, -3.8431958613457951E-01, -3.0826052021380446E-02}; + constexpr FLT c2[] = {3.6562231959204314E-02, 7.8509612097392906E-02, -2.3000000000000059E-01, 7.8509612097392906E-02, 3.6562231959204300E-02}; + constexpr FLT c3[] = {2.0250135419918262E-02, -3.9381037339048602E-02, 1.0193845429304082E-16, 3.9381037339048686E-02, -2.0250135419918248E-02}; + constexpr FLT c4[] = {4.0593041193018580E-03, -1.6067481167759540E-02, 2.4150000000000074E-02, -1.6067481167759530E-02, 4.0593041193018597E-03}; + constexpr FLT c5[] = {-9.2488937959280210E-04, 1.2476700479675494E-03, 1.0406437805617128E-16, -1.2476700479676270E-03, 9.2488937959280405E-04}; + constexpr FLT c6[] = {-5.6059657038176136E-04, 1.2116190166774866E-03, -1.5448333333332675E-03, 1.2116190166775878E-03, -5.6059657038176342E-04}; + constexpr FLT c7[] = {-3.4201716508558499E-05, 2.3137115416428607E-05, 3.6450914717742488E-17, -2.3137115416288715E-05, 3.4201716508574924E-05}; for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); } else if (w==6) { - constexpr FLT c0[] = {2.0553833234911899E+03, 1.5499537739913145E+05, 8.1177907023291232E+05, 8.1177907023291232E+05, 1.5499537739913145E+05, 2.0553833235005700E+03}; - constexpr FLT c1[] = {7.1269776034442684E+03, 2.0581923258843319E+05, 3.1559612614917679E+05, -3.1559612614917639E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}; - constexpr FLT c2[] = {1.0023404568475091E+04, 9.0916650498360206E+04, -1.0095927514054631E+05, -1.0095927514054631E+05, 9.0916650498360163E+04, 1.0023404568484637E+04}; - constexpr FLT c3[] = {7.2536109410387444E+03, 4.8347162752603444E+03, -5.0512736602018485E+04, 5.0512736602018602E+04, -4.8347162752602972E+03, -7.2536109410297577E+03}; - constexpr FLT c4[] = {2.7021878300949775E+03, -7.8773465553972374E+03, 5.2105876478343516E+03, 5.2105876478343944E+03, -7.8773465553972464E+03, 2.7021878301048723E+03}; - constexpr FLT c5[] = 
{3.2120291706547630E+02, -1.8229189469936912E+03, 3.7928113414428476E+03, -3.7928113414427171E+03, 1.8229189469937239E+03, -3.2120291705638328E+02}; - constexpr FLT c6[] = {-1.2051267090537345E+02, 2.2400507411399769E+02, -1.2506575852547746E+02, -1.2506575852531816E+02, 2.2400507411399730E+02, -1.2051267089640162E+02}; - constexpr FLT c7[] = {-4.5977202613346755E+01, 1.1536880606857032E+02, -1.7819720186492938E+02, 1.7819720186504426E+02, -1.1536880606851560E+02, 4.5977202622148354E+01}; - constexpr FLT c8[] = {-1.5631081288822022E+00, 7.1037430590520445E-01, -6.9838401262032682E-02, -6.9838401199524530E-02, 7.1037430591562767E-01, -1.5631081203751171E+00}; + constexpr FLT c0[] = {2.0875119883113440E-03, 1.5741818314646622E-01, 8.2446837122968764E-01, 8.2446837122968819E-01, 1.5741818314646633E-01, 2.0875119883208737E-03}; + constexpr FLT c1[] = {7.2383827471879086E-03, 2.0903648995439439E-01, 3.2052935784357633E-01, -3.2052935784357606E-01, -2.0903648995439447E-01, -7.2383827471776260E-03}; + constexpr FLT c2[] = {1.0180085126333453E-02, 9.2337811484269047E-02, -1.0253741712233820E-01, -1.0253741712233828E-01, 9.2337811484268964E-02, 1.0180085126343144E-02}; + constexpr FLT c3[] = {7.3669955501269460E-03, 4.9102900025223507E-03, -5.1302324979469405E-02, 5.1302324979469550E-02, -4.9102900025223160E-03, -7.3669955501178214E-03}; + constexpr FLT c4[] = {2.7444270008043898E-03, -8.0004810696544734E-03, 5.2920367975573743E-03, 5.2920367975574090E-03, -8.0004810696544873E-03, 2.7444270008144425E-03}; + constexpr FLT c5[] = {3.2622379114949894E-04, -1.8514138516535197E-03, 3.8520985619445234E-03, -3.8520985619444454E-03, 1.8514138516535119E-03, -3.2622379114026425E-04}; + constexpr FLT c6[] = {-1.2239646122606432E-04, 2.2750660293442782E-04, -1.2702072030317145E-04, -1.2702072030306984E-04, 2.2750660293439860E-04, -1.2239646121695236E-04}; + constexpr FLT c7[] = {-4.6695893922776242E-05, 1.1717219021520763E-04, -1.8098268625859964E-04, 1.8098268625869589E-04, 
-1.1717219021517810E-04, 4.6695893931711504E-05}; + constexpr FLT c8[] = {-1.5875418082745247E-06, 7.2147850127730698E-07, -7.0930078293142108E-08, -7.0930078245872243E-08, 7.2147850127811706E-07, -1.5875417996312271E-06}; for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); } else if (w==7) { - constexpr FLT c0[] = {3.9948351830487582E+03, 5.4715865608590841E+05, 5.0196413492771825E+06, 9.8206709220713321E+06, 5.0196413492771871E+06, 5.4715865608590853E+05, 3.9948351830642619E+03}; - constexpr FLT c1[] = {1.5290160332974698E+04, 8.7628248584320419E+05, 3.4421061790934466E+06, 6.5103105025927563E-10, -3.4421061790934466E+06, -8.7628248584320443E+05, -1.5290160332958061E+04}; - constexpr FLT c2[] = {2.4458227486779258E+04, 5.3904618484139442E+05, 2.4315566181017563E+05, -1.6133959371974308E+06, 2.4315566181017424E+05, 5.3904618484139396E+05, 2.4458227486795091E+04}; - constexpr FLT c3[] = {2.1166189345881652E+04, 1.3382732160223150E+05, -3.3113450969689601E+05, 2.5683270626620309E-10, 3.3113450969689793E+05, -1.3382732160223130E+05, -2.1166189345866896E+04}; - constexpr FLT c4[] = {1.0542795672344870E+04, -7.0739172265096349E+03, -6.5563293056048627E+04, 1.2429734005960199E+05, -6.5563293056048671E+04, -7.0739172265096395E+03, 1.0542795672361222E+04}; - constexpr FLT c5[] = {2.7903491906228451E+03, -1.0975382873972989E+04, 1.3656979541145318E+04, 4.9801640867456605E-10, -1.3656979541144143E+04, 1.0975382873973054E+04, -2.7903491906078325E+03}; - constexpr FLT c6[] = {1.6069721418054232E+02, -1.5518707872249406E+03, 4.3634273936649897E+03, -5.9891976420600004E+03, 4.3634273936636964E+03, -1.5518707872250636E+03, 1.6069721419532380E+02}; - constexpr FLT c7[] = {-1.2289277373866669E+02, 2.8583630927761948E+02, -2.8318194617245649E+02, -3.5832266061541795E-11, 2.8318194617438041E+02, -2.8583630927744588E+02, 1.2289277375319726E+02}; - constexpr FLT c8[] = {-3.2270164914244575E+01, 
9.1892112257588494E+01, -1.6710678096380749E+02, 2.0317049305436126E+02, -1.6710678096299210E+02, 9.1892112257580479E+01, -3.2270164900216493E+01}; - constexpr FLT c9[] = {-1.4761409684320093E-01, -9.1862771282699351E-01, 1.2845147740384601E+00, -5.0335941641611417E-10, -1.2845147731561353E+00, 9.1862771293147938E-01, 1.4761410890830065E-01}; + constexpr FLT c0[] = {4.0677823488318067E-04, 5.5714997521829540E-02, 5.1113018541287825E-01, 1.0000000000000002E+00, 5.1113018541287869E-01, 5.5714997521829561E-02, 4.0677823488475981E-04}; + constexpr FLT c1[] = {1.5569364307494555E-03, 8.9228372765634056E-02, 3.5049603091348180E-01, -1.8840858949353919E-32, -3.5049603091348197E-01, -8.9228372765634029E-02, -1.5569364307477620E-03}; + constexpr FLT c2[] = {2.4904843753404838E-03, 5.4888936725282375E-02, 2.4759577399513382E-02, -1.6428571428571445E-01, 2.4759577399513264E-02, 5.4888936725282340E-02, 2.4904843753420954E-03}; + constexpr FLT c3[] = {2.1552691780265232E-03, 1.3627105791872422E-02, -3.3718114813591167E-02, 1.0435679823191637E-16, 3.3718114813591278E-02, -1.3627105791872396E-02, -2.1552691780250210E-03}; + constexpr FLT c4[] = {1.0735311014902868E-03, -7.2030895675484117E-04, -6.6760503000563741E-03, 1.2656705539358732E-02, -6.6760503000563680E-03, -7.2030895675483119E-04, 1.0735311014919520E-03}; + constexpr FLT c5[] = {2.8413019973530626E-04, -1.1175797418592351E-03, 1.3906361031252640E-03, 1.0099777883094147E-16, -1.3906361031252017E-03, 1.1175797418592505E-03, -2.8413019973377792E-04}; + constexpr FLT c6[] = {1.6363160465889005E-05, -1.5802085209242310E-04, 4.4431051893374396E-04, -6.0985626028865780E-04, 4.4431051893376408E-04, -1.5802085209243416E-04, 1.6363160467394339E-05}; + constexpr FLT c7[] = {-1.2513684117291295E-05, 2.9105578584781478E-05, -2.8835295309364819E-05, 6.9093005849597210E-17, 2.8835295309456306E-05, -2.9105578584752466E-05, 1.2513684118770622E-05}; + constexpr FLT c8[] = {-3.2859430043343403E-06, 9.3570096164232078E-06, 
-1.7015821249906871E-05, 2.0688046128660197E-05, -1.7015821249876886E-05, 9.3570096164290557E-06, -3.2859430029058764E-06}; + constexpr FLT c9[] = {-1.5030958477935016E-08, -9.3540219413709317E-08, 1.3079704875560537E-07, 3.0755088144886539E-17, -1.3079704870024676E-07, 9.3540219430316894E-08, 1.5030959705830809E-08}; for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==8) { - constexpr FLT c0[] = {7.3898000697448142E+03, 1.7297637497600052E+06, 2.5578341605285820E+07, 8.4789650417103425E+07, 8.4789650417103410E+07, 2.5578341605285831E+07, 1.7297637497600054E+06, 7.3898000697448097E+03}; - constexpr FLT c1[] = {3.0719636811267621E+04, 3.1853145713323932E+06, 2.3797981861403704E+07, 2.4569731244678468E+07, -2.4569731244678475E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267595E+04}; - constexpr FLT c2[] = {5.4488498478251728E+04, 2.4101183255475122E+06, 6.4554051283428418E+06, -8.9200440393090658E+06, -8.9200440393090583E+06, 6.4554051283428296E+06, 2.4101183255475126E+06, 5.4488498478251728E+04}; - constexpr FLT c3[] = {5.3926359802542138E+04, 9.0469037926849385E+05, -6.0897036277695757E+05, -3.0743852105799988E+06, 3.0743852105800197E+06, 6.0897036277696723E+05, -9.0469037926849280E+05, -5.3926359802542152E+04}; - constexpr FLT c4[] = {3.2444118016247576E+04, 1.3079802224392162E+05, -5.8652889370128687E+05, 4.2333306008153327E+05, 4.2333306008153543E+05, -5.8652889370128710E+05, 1.3079802224392179E+05, 3.2444118016247601E+04}; - constexpr FLT c5[] = {1.1864306345505300E+04, -2.2700360645707835E+04, -5.0713607251411129E+04, 1.8308704458211461E+05, -1.8308704458211147E+05, 5.0713607251410089E+04, 2.2700360645707704E+04, -1.1864306345505296E+04}; - constexpr FLT c6[] = {2.2812256770903396E+03, -1.1569135767377908E+04, 2.0942387020802456E+04, -1.1661592834947036E+04, -1.1661592834946512E+04, 2.0942387020804370E+04, 
-1.1569135767377549E+04, 2.2812256770903291E+03}; - constexpr FLT c7[] = {8.5503535636977634E+00, -9.7513976461196773E+02, 3.8242995179186414E+03, -6.9201295567263214E+03, 6.9201295567309990E+03, -3.8242995179140653E+03, 9.7513976461263269E+02, -8.5503535636935535E+00}; - constexpr FLT c8[] = {-1.0230637348345098E+02, 2.8246898554249236E+02, -3.8638201738252542E+02, 1.9106407992706994E+02, 1.9106407993520349E+02, -3.8638201738414602E+02, 2.8246898554297724E+02, -1.0230637348344338E+02}; - constexpr FLT c9[] = {-1.9200143062942033E+01, 6.1692257626381128E+01, -1.2981109187954436E+02, 1.8681284209765820E+02, -1.8681284209914423E+02, 1.2981109187880136E+02, -6.1692257626381128E+01, 1.9200143062947838E+01}; - constexpr FLT c10[] = {3.7894993761363543E-01, -1.7334408835887836E+00, 2.5271184092462979E+00, -1.2600963912775105E+00, -1.2600963880718390E+00, 2.5271184126204269E+00, -1.7334408829982433E+00, 3.7894993761427903E-01}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + constexpr FLT c0[] = {7.5442178667264049E-05, 1.7659090182402852E-02, 2.6112828482312650E-01, 8.6561421087578294E-01, 8.6561421087578294E-01, 2.6112828482312650E-01, 1.7659090182402856E-02, 7.5442178667263913E-05}; + constexpr FLT c1[] = {3.1361556564941527E-04, 3.2518751351035657E-02, 2.4295266212395961E-01, 2.5083142126627195E-01, -2.5083142126627200E-01, -2.4295266212395961E-01, -3.2518751351035664E-02, -3.1361556564941506E-04}; + constexpr FLT c2[] = {5.5627094085228170E-04, 2.4604803324737457E-02, 6.5902977410162822E-02, -9.1064379250067565E-02, -9.1064379250067648E-02, 6.5902977410162836E-02, 2.4604803324737447E-02, 5.5627094085228149E-04}; + constexpr FLT c3[] = {5.5053208919074741E-04, 9.2359485489686977E-03, -6.2169545154249764E-03, -3.1386277864020387E-02, 3.1386277864020692E-02, 6.2169545154250301E-03, -9.2359485489686925E-03, -5.5053208919074741E-04}; + constexpr FLT c4[] = 
{3.3122072653963820E-04, 1.3353118718124376E-03, -5.9878504390516807E-03, 4.3217905833729843E-03, 4.3217905833729184E-03, -5.9878504390516564E-03, 1.3353118718124411E-03, 3.3122072653963842E-04}; + constexpr FLT c5[] = {1.2112223749399388E-04, -2.3174709024353528E-04, -5.1773322458159945E-04, 1.8691284471382664E-03, -1.8691284471382276E-03, 5.1773322458165388E-04, 2.3174709024353332E-04, -1.2112223749399391E-04}; + constexpr FLT c6[] = {2.3288943339077962E-05, -1.1810885265513022E-04, 2.1380000655379686E-04, -1.1905274322668279E-04, -1.1905274322667877E-04, 2.1380000655378596E-04, -1.1810885265513386E-04, 2.3288943339077766E-05}; + constexpr FLT c7[] = {8.7290223704935849E-08, -9.9551635569432461E-06, 3.9042123573714734E-05, -7.0647330846704962E-05, 7.0647330846826175E-05, -3.9042123573667747E-05, 9.9551635569490195E-06, -8.7290223704824623E-08}; + constexpr FLT c8[] = {-1.0444417486661213E-06, 2.8837147790326586E-06, -3.9445588398358951E-06, 1.9505656879624058E-06, 1.9505656880227840E-06, -3.9445588398203690E-06, 2.8837147790369691E-06, -1.0444417486660073E-06}; + constexpr FLT c9[] = {-1.9601350641688945E-07, 6.2981383505868899E-07, -1.3252363384761618E-06, 1.9071649677058813E-06, -1.9071649677363285E-06, 1.3252363385149127E-06, -6.2981383505419114E-07, 1.9601350641697053E-07}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==9) { - constexpr FLT c0[] = {1.3136365370186153E+04, 5.0196413492771843E+06, 1.1303327711722577E+08, 5.8225443924996734E+08, 9.7700272582690716E+08, 5.8225443924996805E+08, 1.1303327711722578E+08, 5.0196413492772263E+06, 1.3136365370186144E+04}; - constexpr FLT c1[] = {5.8623313038274369E+04, 1.0326318537280345E+07, 1.2898448324824868E+08, 3.0522863709830379E+08, 7.2435840302079811E-08, -3.0522863709830397E+08, -1.2898448324824865E+08, -1.0326318537280394E+07, -5.8623313038274347E+04}; - constexpr FLT c2[] = 
{1.1335001341875960E+05, 9.0726133144784812E+06, 5.3501544534038134E+07, -2.6789524644140172E+05, -1.2483923718899371E+08, -2.6789524644173466E+05, 5.3501544534038089E+07, 9.0726133144785147E+06, 1.1335001341875963E+05}; - constexpr FLT c3[] = {1.2489113703229754E+05, 4.3035547171861976E+06, 6.3021978510599164E+06, -2.6014941986658975E+07, 5.3074599277157087E-08, 2.6014941986659400E+07, -6.3021978510598680E+06, -4.3035547171862088E+06, -1.2489113703229751E+05}; - constexpr FLT c4[] = {8.6425493435991244E+04, 1.0891182836653311E+06, -2.0713033564200432E+06, -2.8994941183505901E+06, 7.5905338661206560E+06, -2.8994941183505324E+06, -2.0713033564200350E+06, 1.0891182836653385E+06, 8.6425493435991288E+04}; - constexpr FLT c5[] = {3.8657354724013800E+04, 7.9936390113329253E+04, -7.0458265546791849E+05, 1.0151095605715540E+06, 7.5990350518026299E-08, -1.0151095605718379E+06, 7.0458265546793933E+05, -7.9936390113333939E+04, -3.8657354724013821E+04}; - constexpr FLT c6[] = {1.0779131453134645E+04, -3.3466718311300116E+04, -1.3245366618985940E+04, 1.8238470515354761E+05, -2.9285656292981049E+05, 1.8238470515352563E+05, -1.3245366618989963E+04, -3.3466718311299133E+04, 1.0779131453134627E+04}; - constexpr FLT c7[] = {1.4992527030548656E+03, -9.7024371533879767E+03, 2.3216330734078529E+04, -2.3465262819038293E+04, -4.5678067266366728E-08, 2.3465262819229152E+04, -2.3216330734050898E+04, 9.7024371533899721E+03, -1.4992527030548690E+03}; - constexpr FLT c8[] = {-7.9857427421152821E+01, -4.0585588534976301E+02, 2.6054813773370911E+03, -6.1806593581469824E+03, 8.0679596873459095E+03, -6.1806593581737125E+03, 2.6054813773390433E+03, -4.0585588535087578E+02, -7.9857427421118601E+01}; - constexpr FLT c9[] = {-7.1572272057928345E+01, 2.2785637019390455E+02, -3.9109820766111051E+02, 3.3597424707310040E+02, -1.3908671051550088E-08, -3.3597424727519922E+02, 3.9109820767448468E+02, -2.2785637019111829E+02, 7.1572272057948652E+01}; - constexpr FLT c10[] = {-9.8886360697883688E+00, 
3.5359026950204516E+01, -8.5251867695464611E+01, 1.4285748013461193E+02, -1.6935269664190733E+02, 1.4285748014610570E+02, -8.5251867686017064E+01, 3.5359026947336602E+01, -9.8886360697963340E+00}; + constexpr FLT c0[] = {1.3445576990655693E-05, 5.1377966678943553E-03, 1.1569392196071671E-01, 5.9595989228910695E-01, 1.0000000000000004E+00, 5.9595989228910784E-01, 1.1569392196071673E-01, 5.1377966678943874E-03, 1.3445576990655681E-05}; + constexpr FLT c1[] = {6.0003223623206657E-05, 1.0569385595664990E-02, 1.3202059711663530E-01, 3.1241329121161582E-01, -8.4851685343650422E-17, -3.1241329121161615E-01, -1.3202059711663522E-01, -1.0569385595665032E-02, -6.0003223623206596E-05}; + constexpr FLT c2[] = {1.1601811379064824E-04, 9.2861699099147151E-03, 5.4760895870332324E-02, -2.7420112488894219E-04, -1.2777777777777805E-01, -2.7420112488935430E-04, 5.4760895870332296E-02, 9.2861699099147359E-03, 1.1601811379064817E-04}; + constexpr FLT c3[] = {1.2783089927061688E-04, 4.4048543606096807E-03, 6.4505427512762566E-03, -2.6627297241817574E-02, 1.0570032264240285E-16, 2.6627297241817935E-02, -6.4505427512762245E-03, -4.4048543606096877E-03, -1.2783089927061688E-04}; + constexpr FLT c4[] = {8.8459828362140127E-05, 1.1147546008569559E-03, -2.1200589329645782E-03, -2.9677441441083273E-03, 7.7692043895744413E-03, -2.9677441441080211E-03, -2.1200589329645678E-03, 1.1147546008569583E-03, 8.8459828362140168E-05}; + constexpr FLT c5[] = {3.9567294647305465E-05, 8.1817980646548672E-05, -7.2116754318327786E-04, 1.0390038161997466E-03, 1.3960675422467541E-16, -1.0390038161998867E-03, 7.2116754318328556E-04, -8.1817980646550122E-05, -3.9567294647305431E-05}; + constexpr FLT c6[] = {1.1032857092605887E-05, -3.4254477931955853E-05, -1.3557143976035256E-05, 1.8667778536557664E-04, -2.9974999576614188E-04, 1.8667778536546106E-04, -1.3557143976042615E-05, -3.4254477931959885E-05, 1.1032857092605841E-05}; + constexpr FLT c7[] = {1.5345430093717796E-06, -9.9308189188274098E-06, 
2.3762810604639151E-05, -2.4017602201954516E-05, 1.1627785359675844E-17, 2.4017602202115669E-05, -2.3762810604628780E-05, 9.9308189188319669E-06, -1.5345430093718216E-06}; + constexpr FLT c8[] = {-8.1737159283255726E-08, -4.1540916378247392E-07, 2.6668107554223020E-06, -6.3261434127908313E-06, 8.2578681449311880E-06, -6.3261434126076934E-06, 2.6668107554440373E-06, -4.1540916378676467E-07, -8.1737159283249333E-08}; + constexpr FLT c9[] = {-7.3256982980608342E-08, 2.3321978963880019E-07, -4.0030411105333760E-07, 3.4388260968054864E-07, 6.5677795522570459E-17, -3.4388260990751890E-07, 4.0030411105333760E-07, -2.3321978963499429E-07, 7.3256982980640781E-08}; + constexpr FLT c10[] = {-1.0121400696579195E-08, 3.6191328862414928E-08, -8.7258577118961372E-08, 1.4622014477867198E-07, -1.7333902174790525E-07, 1.4622014483401952E-07, -8.7258577100106683E-08, 3.6191328859901120E-08, -1.0121400696606260E-08}; for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==10) { - constexpr FLT c0[] = {2.2594586605749224E+04, 1.3595989066786611E+07, 4.4723032442444932E+08, 3.3781755837397552E+09, 8.6836783895849857E+09, 8.6836783895849838E+09, 3.3781755837397523E+09, 4.4723032442444944E+08, 1.3595989066786496E+07, 2.2594586605749344E+04}; - constexpr FLT c1[] = {1.0729981697645644E+05, 3.0651490267742995E+07, 5.9387966085130477E+08, 2.4434902657508349E+09, 2.0073077861288934E+09, -2.0073077861288950E+09, -2.4434902657508330E+09, -5.9387966085130477E+08, -3.0651490267742828E+07, -1.0729981697645634E+05}; - constexpr FLT c2[] = {2.2340399734184612E+05, 3.0258214643190462E+07, 3.1512411458738238E+08, 4.3618276932319850E+08, -7.8178848450497270E+08, -7.8178848450497031E+08, 4.3618276932319820E+08, 3.1512411458738214E+08, 3.0258214643190324E+07, 2.2340399734184553E+05}; - constexpr FLT c3[] = {2.6917433004353492E+05, 1.6875651476661246E+07, 7.4664745481963649E+07, 
-9.5882157211117968E+07, -2.0622994435532477E+08, 2.0622994435532823E+08, 9.5882157211118430E+07, -7.4664745481963366E+07, -1.6875651476661157E+07, -2.6917433004353428E+05}; - constexpr FLT c4[] = {2.0818422772177897E+05, 5.6084730690362593E+06, 1.4435118192351859E+06, -4.0063869969544269E+07, 3.2803674392747816E+07, 3.2803674392746560E+07, -4.0063869969546124E+07, 1.4435118192352206E+06, 5.6084730690362155E+06, 2.0818422772177868E+05}; - constexpr FLT c5[] = {1.0781139496011086E+05, 9.9202615851199278E+05, -3.3266265543961083E+06, -4.8557049011452327E+05, 1.0176155522772400E+07, -1.0176155522773268E+07, 4.8557049011599307E+05, 3.3266265543962419E+06, -9.9202615851196356E+05, -1.0781139496011072E+05}; - constexpr FLT c6[] = {3.7380102688153638E+04, 1.2716675000361241E+04, -6.2163527451762755E+05, 1.4157962667184302E+06, -8.4419693137719855E+05, -8.4419693137682532E+05, 1.4157962667184921E+06, -6.2163527451772091E+05, 1.2716675000342160E+04, 3.7380102688153478E+04}; - constexpr FLT c7[] = {8.1238936393894573E+03, -3.4872365530440075E+04, 2.3913680325287874E+04, 1.2428850301835715E+05, -3.2158255329711520E+05, 3.2158255329964001E+05, -1.2428850301842803E+05, -2.3913680325138281E+04, 3.4872365530466821E+04, -8.1238936393894610E+03}; - constexpr FLT c8[] = {7.8515926628982811E+02, -6.6607899119346384E+03, 2.0167398338412942E+04, -2.8951401344643764E+04, 1.4622828141516249E+04, 1.4622828142773422E+04, -2.8951401346273171E+04, 2.0167398338466974E+04, -6.6607899119428766E+03, 7.8515926628979298E+02}; - constexpr FLT c9[] = {-1.0147176570538747E+02, -3.5304284178326540E+01, 1.3576976855470537E+03, -4.3921059355373945E+03, 7.3232085265656797E+03, -7.3232085282537992E+03, 4.3921059362506849E+03, -1.3576976853984515E+03, 3.5304284186128150E+01, 1.0147176570552679E+02}; - constexpr FLT c10[] = {-4.3161545259359876E+01, 1.5498490982726668E+02, -3.1771250761814974E+02, 3.7215448796966825E+02, -1.7181762811175784E+02, -1.7181762918070896E+02, 3.7215448823960344E+02, 
-3.1771250765054128E+02, 1.5498490982861634E+02, -4.3161545259484186E+01}; - constexpr FLT c11[] = {-4.2916172038642904E+00, 1.7402146073587435E+01, -4.7947588063038118E+01, 9.2697697961204668E+01, -1.2821427624698006E+02, 1.2821427667135228E+02, -9.2697698383138089E+01, 4.7947588092305367E+01, -1.7402146072063207E+01, 4.2916172038214455E+00}; + constexpr FLT c0[] = {2.3186292807626266E-06, 1.3952040327729876E-03, 4.5894237568906843E-02, 3.4666431215091636E-01, 8.9110862394332080E-01, 8.9110862394332024E-01, 3.4666431215091614E-01, 4.5894237568906843E-02, 1.3952040327729804E-03, 2.3186292807626329E-06}; + constexpr FLT c1[] = {1.1010978063160391E-05, 3.1454190365986022E-03, 6.0943215953720313E-02, 2.5074802988370321E-01, 2.0598750885032702E-01, -2.0598750885032710E-01, -2.5074802988370315E-01, -6.0943215953720306E-02, -3.1454190365985909E-03, -1.1010978063160380E-05}; + constexpr FLT c2[] = {2.2925449299630732E-05, 3.1050615653861980E-03, 3.2337657329423494E-02, 4.4760550762170469E-02, -8.0226193254406428E-02, -8.0226193254406289E-02, 4.4760550762170441E-02, 3.2337657329423480E-02, 3.1050615653861868E-03, 2.2925449299630681E-05}; + constexpr FLT c3[] = {2.7622345748507540E-05, 1.7317590416004974E-03, 7.6620063086756569E-03, -9.8393115612840278E-03, -2.1163068654269049E-02, 2.1163068654269510E-02, 9.8393115612841128E-03, -7.6620063086756491E-03, -1.7317590416004913E-03, -2.7622345748507479E-05}; + constexpr FLT c4[] = {2.1363614860997117E-05, 5.7553475552091617E-04, 1.4813144535930287E-04, -4.1113061120761924E-03, 3.3662735809591683E-03, 3.3662735809590794E-03, -4.1113061120762826E-03, 1.4813144535930759E-04, 5.7553475552091368E-04, 2.1363614860997080E-05}; + constexpr FLT c5[] = {1.1063475580065299E-05, 1.0180053030149723E-04, -3.4137441280837177E-04, -4.9828659222651745E-05, 1.0442648308817235E-03, -1.0442648308817467E-03, 4.9828659222713965E-05, 3.4137441280837177E-04, -1.0180053030149541E-04, -1.1063475580065281E-05}; + constexpr FLT c6[] = 
{3.8359011440648869E-06, 1.3049698816919587E-06, -6.3791463619208982E-05, 1.4528730872072194E-04, -8.6630472952355992E-05, -8.6630472952398913E-05, 1.4528730872073633E-04, -6.3791463619214471E-05, 1.3049698816901833E-06, 3.8359011440648767E-06}; + constexpr FLT c7[] = {8.3366418668164326E-07, -3.5785601754616355E-06, 2.4539930904858821E-06, 1.2754336575782058E-05, -3.3000414536039571E-05, 3.3000414536273711E-05, -1.2754336575693992E-05, -2.4539930904800897E-06, 3.5785601754627781E-06, -8.3366418668163871E-07}; + constexpr FLT c8[] = {8.0572098823818712E-08, -6.8352224328357488E-07, 2.0695541423376112E-06, -2.9709579576770532E-06, 1.5005770225996294E-06, 1.5005770226481292E-06, -2.9709579578116679E-06, 2.0695541423438809E-06, -6.8352224328404986E-07, 8.0572098823810798E-08}; + constexpr FLT c9[] = {-1.0412910456843575E-08, -3.6228831474008107E-09, 1.3932530225640674E-07, -4.5071262434444286E-07, 7.5149884418348562E-07, -7.5149884428313110E-07, 4.5071262441364111E-07, -1.3932530225017888E-07, 3.6228831478332996E-09, 1.0412910456861821E-08}; + constexpr FLT c10[] = {-4.4291858216944146E-09, 1.5904364893350153E-08, -3.2603275106346107E-08, 3.8190045632066571E-08, -1.7631718176528265E-08, -1.7631718292171639E-08, 3.8190045621381707E-08, -3.2603275098803994E-08, 1.5904364893978648E-08, -4.4291858217073890E-09}; + constexpr FLT c11[] = {-4.4040059170580565E-10, 1.7857872825180656E-09, -4.9203237617335969E-09, 9.5125262125165431E-09, -1.3157194779492521E-08, 1.3157194812996001E-08, -9.5125262191888681E-09, 4.9203237596041585E-09, -1.7857872834763311E-09, 4.4040059170802652E-10}; for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); } else if (w==11) { - constexpr FLT c0[] = {3.7794653219809712E+04, 3.4782300224660814E+07, 1.6188020733727572E+09, 1.7196758809615025E+10, 6.3754384857724686E+10, 9.7196447559193588E+10, 6.3754384857724686E+10, 
1.7196758809615013E+10, 1.6188020733727574E+09, 3.4782300224660836E+07, 3.7794653219808912E+04}; - constexpr FLT c1[] = {1.8969206922085886E+05, 8.4769319065313712E+07, 2.4230555767723413E+09, 1.5439732722639107E+10, 2.7112836839612331E+10, 7.5382856415600940E-06, -2.7112836839612324E+10, -1.5439732722639109E+10, -2.4230555767723413E+09, -8.4769319065313712E+07, -1.8969206922085691E+05}; - constexpr FLT c2[] = {4.2138380313901440E+05, 9.2050522922791913E+07, 1.5259983101266618E+09, 4.7070559561237240E+09, -1.2448027572952247E+09, -1.0161446790279316E+10, -1.2448027572952359E+09, 4.7070559561237249E+09, 1.5259983101266608E+09, 9.2050522922791883E+07, 4.2138380313901132E+05}; - constexpr FLT c3[] = {5.4814313598122029E+05, 5.8085130777589604E+07, 4.9484006166551131E+08, 1.6222124676641059E+08, -2.0440440381345210E+09, 1.6029666825264191E-05, 2.0440440381345406E+09, -1.6222124676640612E+08, -4.9484006166551065E+08, -5.8085130777589574E+07, -5.4814313598121749E+05}; - constexpr FLT c4[] = {4.6495183529254969E+05, 2.3067199578027170E+07, 6.9832590192482471E+07, -2.2024799260683393E+08, -1.2820270942588173E+08, 5.1017181199129957E+08, -1.2820270942587103E+08, -2.2024799260683718E+08, 6.9832590192482680E+07, 2.3067199578027181E+07, 4.6495183529254753E+05}; - constexpr FLT c5[] = {2.7021781043532956E+05, 5.6764510325100170E+06, -5.5650761736746123E+06, -3.9907385617899098E+07, 7.2453390663685441E+07, 1.3807321808330796E-06, -7.2453390663686499E+07, 3.9907385617896959E+07, 5.5650761736744791E+06, -5.6764510325100273E+06, -2.7021781043532840E+05}; - constexpr FLT c6[] = {1.0933249308680632E+05, 6.9586821127988759E+05, -3.6860240321936086E+06, 2.7428169457744057E+06, 8.3392008440658972E+06, -1.6402201025049815E+07, 8.3392008440622678E+06, 2.7428169457778567E+06, -3.6860240321934861E+06, 6.9586821127989655E+05, 1.0933249308680571E+05}; - constexpr FLT c7[] = {3.0203516161820731E+04, -3.6879059542738614E+04, -4.1141031216769724E+05, 1.4111389975281695E+06, 
-1.5914376635274226E+06, 6.7631682826831895E-06, 1.5914376635404355E+06, -1.4111389975219201E+06, 4.1141031216798135E+05, 3.6879059542753101E+04, -3.0203516161820640E+04}; - constexpr FLT c8[] = {5.1670143574923986E+03, -2.8613147115359603E+04, 4.3560195427027051E+04, 4.8438679581734432E+04, -2.5856630639957223E+05, 3.7994883866286115E+05, -2.5856630639708077E+05, 4.8438679579228658E+04, 4.3560195427174098E+04, -2.8613147115353891E+04, 5.1670143574923814E+03}; - constexpr FLT c9[] = {3.0888018539742438E+02, -3.7949446187486474E+03, 1.4313303205130735E+04, -2.6681600236165083E+04, 2.3856005159699442E+04, -1.9072153968212169E-06, -2.3856005160079862E+04, 2.6681600234262976E+04, -1.4313303204940523E+04, 3.7949446187568205E+03, -3.0888018539723868E+02}; - constexpr FLT c10[] = {-8.3747489794178762E+01, 1.1948077481430271E+02, 4.8528498043145930E+02, -2.5024391100070475E+03, 5.3511195380863319E+03, -6.7655484103934950E+03, 5.3511195323636521E+03, -2.5024391101798296E+03, 4.8528498086337265E+02, 1.1948077483184566E+02, -8.3747489794339316E+01}; - constexpr FLT c11[] = {-2.2640047135393669E+01, 9.0840898559070766E+01, -2.1597187557069051E+02, 3.1511228970473707E+02, -2.4856618213020064E+02, -2.0962600056762836E-06, 2.4856618232531096E+02, -3.1511228707801843E+02, 2.1597187541459934E+02, -9.0840898577362736E+01, 2.2640047135479467E+01}; - constexpr FLT c12[] = {-1.6306382885603201E+00, 7.3325946574893264E+00, -2.3241017691629008E+01, 5.1715493346619120E+01, -8.2673008978082819E+01, 9.6489716906321945E+01, -8.2673008978083388E+01, 5.1715493276466965E+01, -2.3241017744243891E+01, 7.3325946602297218E+00, -1.6306382886202573E+00}; + constexpr FLT c0[] = {3.8884809238313434E-07, 3.5785567372179951E-04, 1.6654951019551330E-02, 1.7692785324424570E-01, 6.5593328211813162E-01, 9.9999999999999978E-01, 6.5593328211813129E-01, 1.7692785324424565E-01, 1.6654951019551330E-02, 3.5785567372179962E-04, 3.8884809238312539E-07}; + constexpr FLT c1[] = {1.9516358260453364E-06, 
8.7214421096705593E-04, 2.4929466432368100E-02, 1.5885079249667189E-01, 2.7894884556454935E-01, 9.4204294746769595E-33, -2.7894884556454941E-01, -1.5885079249667189E-01, -2.4929466432368097E-02, -8.7214421096705604E-04, -1.9516358260453169E-06}; + constexpr FLT c2[] = {4.3353827605930511E-06, 9.4705645354715550E-04, 1.5700144896729017E-02, 4.8428271550326758E-02, -1.2807080799297165E-02, -1.0454545454545448E-01, -1.2807080799297061E-02, 4.8428271550326821E-02, 1.5700144896729006E-02, 9.4705645354715518E-04, 4.3353827605930215E-06}; + constexpr FLT c3[] = {5.6395387871289846E-06, 5.9760549110825473E-04, 5.0911332059142295E-03, 1.6690038662948304E-03, -2.1030028251697912E-02, 1.4335617874817167E-16, 2.1030028251698141E-02, -1.6690038662947660E-03, -5.0911332059142200E-03, -5.9760549110825429E-04, -5.6395387871289508E-06}; + constexpr FLT c4[] = {4.7836299264887200E-06, 2.3732554180006408E-04, 7.1846854433598795E-04, -2.2660086673713248E-03, -1.3190061226035158E-03, 5.2488730277989188E-03, -1.3190061226033569E-03, -2.2660086673713374E-03, 7.1846854433598557E-04, 2.3732554180006421E-04, 4.7836299264886963E-06}; + constexpr FLT c5[] = {2.7801202330030064E-06, 5.8401836435976300E-05, -5.7255962675850168E-05, -4.1058481683291448E-04, 7.4543249761827859E-04, 6.7099534430837577E-17, -7.4543249761823186E-04, 4.1058481683291448E-04, 5.7255962675853089E-05, -5.8401836435976178E-05, -2.7801202330029924E-06}; + constexpr FLT c6[] = {1.1248609988572041E-06, 7.1593996360419040E-06, -3.7923443960739119E-05, 2.8219312687371359E-05, 8.5797383067823588E-05, -1.6875309167105302E-04, 8.5797383067779691E-05, 2.8219312687392853E-05, -3.7923443960740034E-05, 7.1593996360418057E-06, 1.1248609988571978E-06}; + constexpr FLT c7[] = {3.1074712008817516E-07, -3.7942806006679305E-07, -4.2327710785708026E-06, 1.4518421536643064E-05, -1.6373413879605298E-05, 3.0222646636983358E-17, 1.6373413879621934E-05, -1.4518421536591986E-05, 4.2327710785753580E-06, 3.7942806006705484E-07, 
-3.1074712008817235E-07}; + constexpr FLT c8[] = {5.3160526822194444E-08, -2.9438470061321741E-07, 4.4816653817789122E-07, 4.9835853873945607E-07, -2.6602444110833864E-06, 3.9090815375281113E-06, -2.6602444110225165E-06, 4.9835853874269618E-07, 4.4816653818193273E-07, -2.9438470061323123E-07, 5.3160526822193583E-08}; + constexpr FLT c9[] = {3.1778958300854393E-09, -3.9044067083483707E-08, 1.4726158788365547E-07, -2.7451209287062293E-07, 2.4544112217999958E-07, 8.6199548859978872E-18, -2.4544112207758621E-07, 2.7451209285678326E-07, -1.4726158788296347E-07, 3.9044067083624268E-08, -3.1778958300829052E-09}; + constexpr FLT c10[] = {-8.6163117991617490E-10, 1.2292710054271969E-09, 4.9928263052430922E-09, -2.5746199362556884E-08, 5.5054682151312924E-08, -6.9606951358406722E-08, 5.5054682230504105E-08, -2.5746199365699604E-08, 4.9928263093284604E-09, 1.2292710054468060E-09, -8.6163117991862728E-10}; + constexpr FLT c11[] = {-2.3293080872726303E-10, 9.3461130390718653E-10, -2.2220140857286656E-09, 3.2420144232604506E-09, -2.5573586459741160E-09, -3.4362247560151687E-17, 2.5573586170134590E-09, -3.2420144222311963E-09, 2.2220140843090244E-09, -9.3461130382733279E-10, 2.3293080872885788E-10}; + constexpr FLT c12[] = {-1.6776727231079557E-11, 7.5440974150049303E-11, -2.3911386677196792E-10, 5.3207180787495740E-10, -8.5057641018270776E-10, 9.9272876082686339E-10, -8.5057644693357476E-10, 5.3207181195839291E-10, -2.3911386485786361E-10, 7.5440974126123504E-11, -1.6776727231328710E-11}; for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); } else if (w==12) { - constexpr FLT c0[] = {6.1722991679853279E+04, 8.4789650417103827E+07, 5.4431675199498749E+09, 7.8788892335272308E+10, 4.0355760945670074E+11, 8.8071481911347998E+11, 8.8071481911348035E+11, 4.0355760945670081E+11, 7.8788892335272507E+10, 5.4431675199498901E+09, 8.4789650417103752E+07, 
6.1722991679871782E+04}; - constexpr FLT c1[] = {3.2561466099406185E+05, 2.2112758120210630E+08, 8.9911609880089836E+09, 8.3059508064200958E+10, 2.3965569143469873E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201111E+10, -8.9911609880090008E+09, -2.2112758120210621E+08, -3.2561466099404270E+05}; - constexpr FLT c2[] = {7.6621098001581512E+05, 2.6026568260310283E+08, 6.4524338253008652E+09, 3.3729904113826836E+10, 2.8555202212474079E+10, -6.8998572040731476E+10, -6.8998572040731461E+10, 2.8555202212474102E+10, 3.3729904113826820E+10, 6.4524338253008747E+09, 2.6026568260310283E+08, 7.6621098001583782E+05}; - constexpr FLT c3[] = {1.0657807616803222E+06, 1.8144472126891005E+08, 2.5524827004349880E+09, 5.2112383911371851E+09, -1.0268350564014641E+10, -1.4763245309081160E+10, 1.4763245309081381E+10, 1.0268350564014679E+10, -5.2112383911371050E+09, -2.5524827004349866E+09, -1.8144472126890993E+08, -1.0657807616803094E+06}; - constexpr FLT c4[] = {9.7829638830158766E+05, 8.2222351241520002E+07, 5.5676911894064677E+08, -4.8739037675425845E+08, -2.7153428193078089E+09, 2.5627633609246616E+09, 2.5627633609247270E+09, -2.7153428193078089E+09, -4.8739037675429344E+08, 5.5676911894064772E+08, 8.2222351241519988E+07, 9.7829638830161223E+05}; - constexpr FLT c5[] = {6.2536876825113979E+05, 2.4702814073680259E+07, 4.1488431554846764E+07, -2.9274790542417943E+08, 1.0742154109192364E+08, 6.2185168968026125E+08, -6.2185168968025279E+08, -1.0742154109186378E+08, 2.9274790542422217E+08, -4.1488431554844894E+07, -2.4702814073680248E+07, -6.2536876825112430E+05}; - constexpr FLT c6[] = {2.8527714307528501E+05, 4.6266378435690925E+06, -1.0665598090789001E+07, -2.6048960239884529E+07, 9.1597254427304730E+07, -5.9794495983325504E+07, -5.9794495983230442E+07, 9.1597254427350238E+07, -2.6048960239922173E+07, -1.0665598090794679E+07, 4.6266378435690831E+06, 2.8527714307530370E+05}; - constexpr FLT c7[] = {9.2873647411234633E+04, 
3.6630046787437343E+05, -3.1271047224703613E+06, 4.8612412939389814E+06, 3.3820440907783178E+06, -1.6880127953644276E+07, 1.6880127953794900E+07, -3.3820440907782884E+06, -4.8612412938910574E+06, 3.1271047224760642E+06, -3.6630046787425788E+05, -9.2873647411217215E+04}; - constexpr FLT c8[] = {2.0817947751046311E+04, -5.5660303410283603E+04, -1.9519783923352187E+05, 1.0804817251249440E+06, -1.8264985852847320E+06, 9.7602844964054180E+05, 9.7602844964026869E+05, -1.8264985852578641E+06, 1.0804817251242315E+06, -1.9519783923298802E+05, -5.5660303410281354E+04, 2.0817947751063894E+04}; - constexpr FLT c9[] = {2.7986023314783351E+03, -1.9404411093657811E+04, 4.3922625001185028E+04, -7.6450317330166517E+03, -1.5273911976404343E+05, 3.3223441450907954E+05, -3.3223441450755787E+05, 1.5273911981578072E+05, 7.6450317512768770E+03, -4.3922624998712294E+04, 1.9404411093676386E+04, -2.7986023314643107E+03}; - constexpr FLT c10[] = {6.7849020474217255E+01, -1.7921351307610907E+03, 8.4980694701237535E+03, -1.9742624848712727E+04, 2.4620674811515193E+04, -1.1676544936917096E+04, -1.1676544845699163E+04, 2.4620674862652242E+04, -1.9742624819688928E+04, 8.4980694644226842E+03, -1.7921351307503089E+03, 6.7849020488654887E+01}; - constexpr FLT c11[] = {-5.4577020998540995E+01, 1.3637112871144197E+02, 4.5513617165591533E+01, -1.1174001347694452E+03, 3.2018768920645603E+03, -5.0580352089258022E+03, 5.0580351705274497E+03, -3.2018769484133886E+03, 1.1174001005075061E+03, -4.5513609907370189E+01, -1.3637112869192950E+02, 5.4577021011650153E+01}; - constexpr FLT c12[] = {-1.0538365872663764E+01, 4.6577222493036992E+01, -1.2606964247581806E+02, 2.1881090265912360E+02, -2.3273404104747246E+02, 1.0274271612440927E+02, 1.0274271612440242E+02, -2.3273400063947102E+02, 2.1881092482740195E+02, -1.2606964693052080E+02, 4.6577222495229805E+01, -1.0538365860486415E+01}; - constexpr FLT c13[] = {-4.6087004138254672E-01, 2.5969759057927089E+00, -9.6946928123584506E+00, 2.4990051638288470E+01, 
-4.6013914134428035E+01, 6.2056955095902744E+01, -6.2056967309552682E+01, 4.6013924603270830E+01, -2.4990037679831403E+01, 9.6946951024178141E+00, -2.5969758989770559E+00, 4.6087004739949022E-01}; + constexpr FLT c0[] = {6.3667715563015689E-08, 8.7461142088576888E-05, 5.6146669497086589E-03, 8.1271316412301370E-02, 4.1627261402765736E-01, 9.0846375182673755E-01, 9.0846375182673755E-01, 4.1627261402765736E-01, 8.1271316412301550E-02, 5.6146669497086719E-03, 8.7461142088576929E-05, 6.3667715563034801E-08}; + constexpr FLT c1[] = {3.3587389488258588E-07, 2.2809471090022899E-04, 9.2744480587562007E-03, 8.5676487647659991E-02, 2.4720659158040625E-01, 1.7472997738462001E-01, -1.7472997738461990E-01, -2.4720659158040617E-01, -8.5676487647660143E-02, -9.2744480587562180E-03, -2.2809471090022899E-04, -3.3587389488256608E-07}; + constexpr FLT c2[] = {7.9035220764954472E-07, 2.6846594761214740E-04, 6.6557324960729147E-03, 3.4792641812076718E-02, 2.9454899103693762E-02, -7.1172529707069221E-02, -7.1172529707069207E-02, 2.9454899103693671E-02, 3.4792641812076690E-02, 6.6557324960729242E-03, 2.6846594761214740E-04, 7.9035220764956886E-07}; + constexpr FLT c3[] = {1.0993606197695965E-06, 1.8716155179384050E-04, 2.6329045000561364E-03, 5.3754303637600113E-03, -1.0591878410592502E-02, -1.5228395084945664E-02, 1.5228395084945803E-02, 1.0591878410592646E-02, -5.3754303637599376E-03, -2.6329045000561364E-03, -1.8716155179384044E-04, -1.0993606197695836E-06}; + constexpr FLT c4[] = {1.0091198513153346E-06, 8.4812954286468477E-05, 5.7431140218944460E-04, -5.0274672420766203E-04, -2.8008958990917627E-03, 2.6435090762445433E-03, 2.6435090762445819E-03, -2.8008958990918187E-03, -5.0274672420767580E-04, 5.7431140218944276E-04, 8.4812954286468423E-05, 1.0091198513153598E-06}; + constexpr FLT c5[] = {6.4507244019416584E-07, 2.5481132674301279E-05, 4.2795619387511420E-05, -3.0197159708156643E-04, 1.1080610219049720E-04, 6.4144454802694492E-04, -6.4144454802681275E-04, -1.1080610219045053E-04, 
3.0197159708157808E-04, -4.2795619387511908E-05, -2.5481132674301286E-05, -6.4507244019414964E-07}; + constexpr FLT c6[] = {2.9426545129495891E-07, 4.7724106401925034E-06, -1.1001642128368358E-05, -2.6869692251292103E-05, 9.4483235217708846E-05, -6.1678458203322752E-05, -6.1678458203283029E-05, 9.4483235217638725E-05, -2.6869692251319154E-05, -1.1001642128368348E-05, 4.7724106401924525E-06, 2.9426545129497845E-07}; + constexpr FLT c7[] = {9.5799843879057487E-08, 3.7784160107136394E-07, -3.2256313018476217E-06, 5.0144058082843800E-06, 3.4886031174309006E-06, -1.7411974954245794E-05, 1.7411974954244114E-05, -3.4886031173677615E-06, -5.0144058082412084E-06, 3.2256313018490718E-06, -3.7784160107127161E-07, -9.5799843879039593E-08}; + constexpr FLT c8[] = {2.1473864761677802E-08, -5.7414008446850441E-08, -2.0134799316446491E-07, 1.1145247706131597E-06, -1.8840465966107854E-06, 1.0067804561094662E-06, 1.0067804560969447E-06, -1.8840465965985945E-06, 1.1145247706194121E-06, -2.0134799316567892E-07, -5.7414008446903526E-08, 2.1473864761695718E-08}; + constexpr FLT c9[] = {2.8867786924320735E-09, -2.0015791402048098E-08, 4.5306507660172584E-08, -7.8859059608423767E-09, -1.5755151471717741E-07, 3.4270221893522085E-07, -3.4270221891584534E-07, 1.5755151474485673E-07, 7.8859059608423767E-09, -4.5306507656885666E-08, 2.0015791402102159E-08, -2.8867786924173336E-09}; + constexpr FLT c10[] = {6.9986758892026879E-11, -1.8486004428526375E-09, 8.7658205612213605E-09, -2.0364661368255434E-08, 2.5396405431717686E-08, -1.2044441164754235E-08, -1.2044441145898965E-08, 2.5396405393379069E-08, -2.0364661337458944E-08, 8.7658205594930229E-09, -1.8486004428624741E-09, 6.9986758906941889E-11}; + constexpr FLT c11[] = {-5.6296594747629561E-11, 1.4066781276164117E-10, 4.6947620156299098E-11, -1.1526063766721083E-09, 3.3027593515457814E-09, -5.2174001597719162E-09, 5.2174001336505757E-09, -3.3027593563725673E-09, 1.1526063504088099E-09, -4.6947618665684182E-11, -1.4066781273945818E-10, 
5.6296594761077256E-11}; + constexpr FLT c12[] = {-1.0870401168253040E-11, 4.8044744351982426E-11, -1.3004175788815863E-10, 2.2570502267192305E-10, -2.4006684875388499E-10, 1.0598000131166063E-10, 1.0597991964307358E-10, -2.4006682833673746E-10, 2.2570504206821193E-10, -1.3004176149306233E-10, 4.8044744304130286E-11, -1.0870401156071839E-11}; + constexpr FLT c13[] = {-4.7539080498592749E-13, 2.6787995976616703E-12, -1.0000145739993567E-11, 2.5777400861531429E-11, -4.7463672955972831E-11, 6.4012227921839136E-11, -6.4012266007267373E-11, 4.7463669782187146E-11, -2.5777397687745743E-11, 1.0000149112140858E-11, -2.6787995744161696E-12, 4.7539081133001201E-13}; for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else if (w==13) { - constexpr FLT c0[] = {9.8715725867495858E+04, 1.9828875496808127E+08, 1.7196758809615005E+10, 3.3083776881353601E+11, 2.2668873993375454E+12, 6.7734720591167598E+12, 9.6695220682534863E+12, 6.7734720591167490E+12, 2.2668873993375454E+12, 3.3083776881353540E+11, 1.7196758809615013E+10, 1.9828875496807912E+08, 9.8715725867495596E+04}; - constexpr FLT c1[] = {5.4491110456935561E+05, 5.4903670125539398E+08, 3.0879465445278194E+10, 3.9588436413399976E+11, 1.6860562536749780E+12, 2.4256447893117891E+12, 5.2271652473787576E-04, -2.4256447893117861E+12, -1.6860562536749771E+12, -3.9588436413399896E+11, -3.0879465445278202E+10, -5.4903670125538874E+08, -5.4491110456935479E+05}; - constexpr FLT c2[] = {1.3504711883426080E+06, 6.9286979077463174E+08, 2.4618123595484570E+10, 1.9493985627722617E+11, 3.9422703517046405E+11, -1.8678883613919846E+11, -8.5538079834550037E+11, -1.8678883613919666E+11, 3.9422703517046375E+11, 1.9493985627722595E+11, 2.4618123595484570E+10, 6.9286979077462602E+08, 1.3504711883426073E+06}; - constexpr FLT c3[] = {1.9937206140846505E+06, 5.2512029493766004E+08, 
1.1253303793811764E+10, 4.6205527735932259E+10, -1.1607472377982828E+10, -1.6305241755642276E+11, 1.6137900538478137E-04, 1.6305241755642496E+11, 1.1607472377982767E+10, -4.6205527735932159E+10, -1.1253303793811754E+10, -5.2512029493765628E+08, -1.9937206140846501E+06}; - constexpr FLT c4[] = {1.9607419630386413E+06, 2.6425362558103913E+08, 3.1171259341747184E+09, 2.9839860297840395E+09, -1.9585031917561905E+10, -5.0666917387060509E+09, 3.6568794485482040E+10, -5.0666917387052479E+09, -1.9585031917561382E+10, 2.9839860297839293E+09, 3.1171259341747251E+09, 2.6425362558103746E+08, 1.9607419630386424E+06}; - constexpr FLT c5[] = {1.3593773865640303E+06, 9.1556445104158297E+07, 4.7074012944133645E+08, -1.1192579335656993E+09, -2.1090780087868536E+09, 5.2270306737954664E+09, 5.5914317801530834E-04, -5.2270306737946453E+09, 2.1090780087878797E+09, 1.1192579335657849E+09, -4.7074012944133860E+08, -9.1556445104157880E+07, -1.3593773865640303E+06}; - constexpr FLT c6[] = {6.8417206432039291E+05, 2.1561705510027312E+07, 7.5785249893027432E+06, -2.7456096030220407E+08, 3.4589095671070045E+08, 4.0256106808935356E+08, -1.0074306926604354E+09, 4.0256106809054130E+08, 3.4589095671009880E+08, -2.7456096030236250E+08, 7.5785249893008731E+06, 2.1561705510027334E+07, 6.8417206432039256E+05}; - constexpr FLT c7[] = {2.5248269397037590E+05, 3.0985559672617475E+06, -1.1816517087615140E+07, -8.2958498769974122E+06, 8.0546642347458601E+07, -1.0594657799513456E+08, 2.0249720264016184E-04, 1.0594657799514198E+08, -8.0546642347324282E+07, 8.2958498771580132E+06, 1.1816517087620620E+07, -3.0985559672620827E+06, -2.5248269397037590E+05}; - constexpr FLT c8[] = {6.7530100970876185E+04, 1.2373362326675311E+05, -2.1245597183288219E+06, 5.1047323238642653E+06, -1.4139444406972022E+06, -1.1818267556148527E+07, 2.0121548578311723E+07, -1.1818267556689126E+07, -1.4139444399964837E+06, 5.1047323237335468E+06, -2.1245597183262822E+06, 1.2373362326715943E+05, 6.7530100970876825E+04}; - constexpr FLT 
c9[] = {1.2421368748960511E+04, -5.0576243646858849E+04, -4.8878193436522284E+04, 6.5307896871419600E+05, -1.5497610128521242E+06, 1.5137725913425679E+06, 9.4288709689637382E-06, -1.5137725926086102E+06, 1.5497610130712469E+06, -6.5307896859246108E+05, 4.8878193441087336E+04, 5.0576243646517250E+04, -1.2421368748960882E+04}; - constexpr FLT c10[] = {1.2904654687548632E+03, -1.1169946054771519E+04, 3.3275109715936509E+04, -3.1765222282529230E+04, -5.9810982046625119E+04, 2.2355863065128919E+05, -3.1083591717381903E+05, 2.2355863453495159E+05, -5.9810982317515191E+04, -3.1765222420737289E+04, 3.3275109716627514E+04, -1.1169946054393644E+04, 1.2904654687550840E+03}; - constexpr FLT c11[] = {-1.9043622268214964E+01, -6.8296542209517031E+02, 4.2702512258593224E+03, -1.2165497344048174E+04, 1.9423733117203814E+04, -1.6010024763745962E+04, 3.4546242756821764E-04, 1.6010021562009399E+04, -1.9423732921465795E+04, 1.2165497485154361E+04, -4.2702512258593424E+03, 6.8296542155861471E+02, 1.9043622268233225E+01}; - constexpr FLT c12[] = {-3.0093984466084923E+01, 9.8972865759901183E+01, -9.7437038386122609E+01, -3.5079929976821143E+02, 1.5699249129925884E+03, -3.1287450613413444E+03, 3.8692192717886201E+03, -3.1287461388880197E+03, 1.5699252721748373E+03, -3.5079941874733129E+02, -9.7437038807041006E+01, 9.8972866294818274E+01, -3.0093984465708520E+01}; - constexpr FLT c13[] = {-4.3050286012574066E+00, 2.1108975856232256E+01, -6.4297196943170974E+01, 1.2922884719917388E+02, -1.6991815434264092E+02, 1.2654996803592717E+02, -1.3650372630766216E-04, -1.2655097304483594E+02, 1.6991801475807023E+02, -1.2922895886683040E+02, 6.4297199778482565E+01, -2.1108976173160116E+01, 4.3050286010444170E+00}; - constexpr FLT c14[] = {-1.0957333734356203E-01, 7.2949328697697935E-01, -3.4300803257592030E+00, 1.0470037850609911E+01, -2.2292132783546631E+01, 3.4570970759468082E+01, -3.9923502981338281E+01, 3.4573363471454584E+01, -2.2292171023236033E+01, 1.0470076090299283E+01, 
-3.4300793014818574E+00, 7.2949361239845723E-01, -1.0957333723937021E-01}; + constexpr FLT c0[] = {1.0208956054983696E-08, 2.0506572462261995E-05, 1.7784497194617906E-03, 3.4214490279693019E-02, 2.3443634373410047E-01, 7.0049708882252804E-01, 9.9999999999999956E-01, 7.0049708882252670E-01, 2.3443634373410041E-01, 3.4214490279692922E-02, 1.7784497194617906E-03, 2.0506572462261785E-05, 1.0208956054983676E-08}; + constexpr FLT c1[] = {5.6353468219321995E-08, 5.6780128053894686E-05, 3.1934841481628326E-03, 4.0941461360716927E-02, 1.7436810648693357E-01, 2.5085467225681696E-01, -6.3638764007737755E-17, -2.5085467225681662E-01, -1.7436810648693341E-01, -4.0941461360716816E-02, -3.1934841481628326E-03, -5.6780128053894232E-05, -5.6353468219321988E-08}; + constexpr FLT c2[] = {1.3966266158866427E-07, 7.1655019336418755E-05, 2.5459504018621182E-03, 2.0160236969440644E-02, 4.0770064165298429E-02, -1.9317276988534509E-02, -8.8461538461538661E-02, -1.9317276988534381E-02, 4.0770064165298395E-02, 2.0160236969440602E-02, 2.5459504018621160E-03, 7.1655019336418200E-05, 1.3966266158866422E-07}; + constexpr FLT c3[] = {2.0618605552701903E-07, 5.4306747658367697E-05, 1.1637911071900936E-03, 4.7784706844645319E-03, -1.2004184173788884E-03, -1.6862510515565966E-02, 1.4394808111083350E-16, 1.6862510515566146E-02, 1.2004184173788636E-03, -4.7784706844645379E-03, -1.1637911071900920E-03, -5.4306747658367331E-05, -2.0618605552701909E-07}; + constexpr FLT c4[] = {2.0277547837406105E-07, 2.7328509487415503E-05, 3.2236608098850310E-04, 3.0859705461356495E-04, -2.0254394973524947E-03, -5.2398574644553877E-04, 3.7818616294949463E-03, -5.2398574644547762E-04, -2.0254394973524895E-03, 3.0859705461357378E-04, 3.2236608098850327E-04, 2.7328509487415384E-05, 2.0277547837406108E-07}; + constexpr FLT c5[] = {1.4058372037094490E-07, 9.4685595066536085E-06, 4.8682874512158502E-05, -1.1575111217134651E-04, -2.1811605515759046E-04, 5.4056763477041119E-04, 1.1213866287069097E-16, -5.4056763477029453E-04, 
2.1811605515769156E-04, 1.1575111217135234E-04, -4.8682874512158861E-05, -9.4685595066535949E-06, -1.4058372037094498E-07}; + constexpr FLT c6[] = {7.0755520230584385E-08, 2.2298625886400277E-06, 7.8375383352022143E-07, -2.8394470622676381E-05, 3.5771256766257562E-05, 4.1631950912211130E-05, -1.0418619302467684E-04, 4.1631950912333557E-05, 3.5771256766183768E-05, -2.8394470622671916E-05, 7.8375383351933331E-07, 2.2298625886400294E-06, 7.0755520230584346E-08}; + constexpr FLT c7[] = {2.6111186487625245E-08, 3.2044561720738826E-07, -1.2220373462313589E-06, -8.5793794342228941E-07, 8.3299507234112700E-06, -1.0956754351178954E-05, 9.4610283796409485E-17, 1.0956754351115859E-05, -8.3299507234215327E-06, 8.5793794342144989E-07, 1.2220373462321896E-06, -3.2044561720741346E-07, -2.6111186487625302E-08}; + constexpr FLT c8[] = {6.9838095920570498E-09, 1.2796250155222958E-08, -2.1971713837900942E-07, 5.2791981730307194E-07, -1.4622692107334488E-07, -1.2222183756556175E-06, 2.0809248310569844E-06, -1.2222183756925741E-06, -1.4622692099063203E-07, 5.2791981730006307E-07, -2.1971713837856465E-07, 1.2796250155283016E-08, 6.9838095920570937E-09}; + constexpr FLT c9[] = {1.2845897306280646E-09, -5.2304801922802769E-09, -5.0548716982175665E-09, 6.7539942924545603E-08, -1.6027276234256162E-07, 1.5655092165632365E-07, 4.6828140259346451E-17, -1.5655092173659360E-07, 1.6027276234809749E-07, -6.7539942912781904E-08, 5.0548716984338105E-09, 5.2304801922379145E-09, -1.2845897306280857E-09}; + constexpr FLT c10[] = {1.3345700642131601E-10, -1.1551704392349950E-09, 3.4412362345673782E-09, -3.2850871078054311E-09, -6.1855158542452699E-09, 2.3119925642302808E-08, -3.2145944181567604E-08, 2.3119926027259106E-08, -6.1855159240088862E-09, -3.2850871247748739E-09, 3.4412362345280933E-09, -1.1551704391858975E-09, 1.3345700642134581E-10}; + constexpr FLT c11[] = {-1.9694481417663767E-12, -7.0630732018717419E-11, 4.4161967766895751E-10, -1.2581280884757252E-09, 2.0087583285653241E-09, 
-1.6557203488425082E-09, 5.7014219382328511E-17, 1.6557200410648860E-09, -2.0087583339599462E-09, 1.2581281082796833E-09, -4.4161967789965090E-10, 7.0630731978790794E-11, 1.9694481417229703E-12}; + constexpr FLT c12[] = {-3.1122514901291979E-12, 1.0235548893351873E-11, -1.0076717787418374E-11, -3.6278872085836478E-11, 1.6235812713334426E-10, -3.2356766327511469E-10, 4.0014573853281197E-10, -3.2356772044312440E-10, 1.6235817511363862E-10, -3.6278891226911122E-11, -1.0076717627909611E-11, 1.0235548938213992E-11, -3.1122514900941893E-12}; + constexpr FLT c13[] = {-4.4521627553052389E-13, 2.1830423195977186E-12, -6.6494700502871459E-12, 1.3364548102385267E-11, -1.7572530897780217E-11, 1.3087527392509343E-11, -1.4854086432767967E-17, -1.3087613084722882E-11, 1.7572508681280409E-11, -1.3364552466340585E-11, 6.6494701742631489E-12, -2.1830423513665695E-12, 4.4521627553052389E-13}; + constexpr FLT c14[] = {-1.1331825591762625E-14, 7.5442537823437382E-14, -3.5473113067901070E-13, 1.0827924393926043E-12, -2.3053993601726267E-12, 3.5752731472827676E-12, -4.1288118242378826E-12, 3.5755029357484062E-12, -2.3054273074184593E-12, 1.0827837446939142E-12, -3.5473109186339628E-13, 7.5442574213081941E-14, -1.1331825564518091E-14}; for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); } else if (w==14) { - constexpr FLT c0[] = {1.5499533202966311E+05, 4.4723032442444772E+08, 5.1495083701694801E+10, 1.2904576022918081E+12, 1.1534950432785514E+13, 4.5650102198520523E+13, 8.8830582190032719E+13, 8.8830582190032734E+13, 4.5650102198520523E+13, 1.1534950432785541E+13, 1.2904576022918088E+12, 5.1495083701695160E+10, 4.4723032442444867E+08, 1.5499533202970124E+05}; - constexpr FLT c1[] = {8.9188339002980455E+05, 1.3065352538728638E+09, 9.9400185225815598E+10, 1.7136059013402412E+12, 1.0144146621675834E+13, 2.3034036018490723E+13, 
1.4630967270448885E+13, -1.4630967270448867E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402415E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979419E+05}; - constexpr FLT c2[] = {2.3170473769379673E+06, 1.7532505043698251E+09, 8.6523535958354309E+10, 9.7455289065487476E+11, 3.2977972139362329E+12, 1.7874626001697834E+12, -6.1480918082633936E+12, -6.1480918082634014E+12, 1.7874626001697737E+12, 3.2977972139362251E+12, 9.7455289065487329E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; - constexpr FLT c3[] = {3.6089249230396431E+06, 1.4278058213962200E+09, 4.4296625537022446E+10, 2.9466624630419830E+11, 3.1903621584503467E+11, -9.8834691411254578E+11, -1.1072264714919094E+12, 1.1072264714919380E+12, 9.8834691411255481E+11, -3.1903621584503326E+11, -2.9466624630419788E+11, -4.4296625537022636E+10, -1.4278058213962224E+09, -3.6089249230396668E+06}; - constexpr FLT c4[] = {3.7733555140851745E+06, 7.8376718099107444E+08, 1.4443117772349586E+10, 4.3197433307418678E+10, -7.6585042240583893E+10, -1.8569640140762125E+11, 2.0385335192658521E+11, 2.0385335192658505E+11, -1.8569640140762244E+11, -7.6585042240577591E+10, 4.3197433307418831E+10, 1.4443117772349697E+10, 7.8376718099107611E+08, 3.7733555140852574E+06}; - constexpr FLT c5[] = {2.8079157920112340E+06, 3.0340753492383713E+08, 2.9498136661747241E+09, -6.2820200387946582E+08, -2.2372008390623741E+10, 1.5217518660587065E+10, 4.0682590266890762E+10, -4.0682590266874344E+10, -1.5217518660581593E+10, 2.2372008390624836E+10, 6.2820200387926054E+08, -2.9498136661747794E+09, -3.0340753492383808E+08, -2.8079157920112382E+06}; - constexpr FLT c6[] = {1.5361613559533129E+06, 8.3513615594416931E+07, 3.0077547202709264E+08, -1.3749596754065564E+09, -6.6733027297578251E+08, 5.9590333632812872E+09, -4.3025685566868906E+09, -4.3025685566947279E+09, 5.9590333632843285E+09, -6.6733027297604084E+08, -1.3749596754066198E+09, 3.0077547202708143E+08, 
8.3513615594416305E+07, 1.5361613559533581E+06}; - constexpr FLT c7[] = {6.2759409419593017E+05, 1.5741723594963871E+07, -1.5632610223386128E+07, -1.9294824907063219E+08, 4.4643806532504034E+08, 1.5178998384579189E+07, -9.6771139891231704E+08, 9.6771139892423606E+08, -1.5178998381071322E+07, -4.4643806533015347E+08, 1.9294824907069016E+08, 1.5632610223408137E+07, -1.5741723594963046E+07, -6.2759409419590794E+05}; - constexpr FLT c8[] = {1.9151404903933618E+05, 1.7156606891565623E+06, -9.7733523156695794E+06, 4.2982266232611798E+06, 5.1660907884888940E+07, -1.1279400211171694E+08, 6.4701089576848499E+07, 6.4701089570801638E+07, -1.1279400210612530E+08, 5.1660907893511616E+07, 4.2982266235306170E+06, -9.7733523156822342E+06, 1.7156606891565854E+06, 1.9151404903936735E+05}; - constexpr FLT c9[] = {4.2715272622844263E+04, -2.2565910611002505E+03, -1.1769776156928577E+06, 4.0078399906352242E+06, -3.8951858073074366E+06, -5.0944610789569877E+06, 1.6765992441849992E+07, -1.6765992434448514E+07, 5.0944610797360903E+06, 3.8951858063335577E+06, -4.0078399906595708E+06, 1.1769776157202481E+06, 2.2565910608803192E+03, -4.2715272622819932E+04}; - constexpr FLT c10[] = {6.4806786522801558E+03, -3.5474227032715331E+04, 1.8237100734263218E+04, 3.0934714642964909E+05, -1.0394703930801603E+06, 1.4743920316337310E+06, -7.3356881642929500E+05, -7.3356882324020052E+05, 1.4743920364765557E+06, -1.0394703915764539E+06, 3.0934714676135289E+05, 1.8237100683125096E+04, -3.5474227032952876E+04, 6.4806786523017845E+03}; - constexpr FLT c11[] = {4.9913632908494827E+02, -5.5416668522806276E+03, 2.0614058722611946E+04, -3.2285139157855901E+04, -5.3099566255893524E+03, 1.1559000150525174E+05, -2.2569743273246771E+05, 2.2569743457059452E+05, -1.1559000428242185E+05, 5.3099542679931265E+03, 3.2285138893125553E+04, -2.0614058670789782E+04, 5.5416668532562171E+03, -4.9913632906264002E+02}; - constexpr FLT c12[] = {-3.3076333188696488E+01, -1.8970588558436827E+02, 1.8160423493169353E+03, 
-6.3715703265863249E+03, 1.2525624646166696E+04, -1.4199807314837786E+04, 6.4441944019082612E+03, 6.4441857815347785E+03, -1.4199805590763088E+04, 1.2525627375951648E+04, -6.3715703355659844E+03, 1.8160422864600705E+03, -1.8970588672434647E+02, -3.3076333168693779E+01}; - constexpr FLT c13[] = {-1.4394533628062636E+01, 5.7000699174526638E+01, -1.0101142144442984E+02, -3.2954074617159108E+01, 6.1417869930814436E+02, -1.6177306801656998E+03, 2.4593354137960296E+03, -2.4593361954696252E+03, 1.6177288934831954E+03, -6.1417959264939657E+02, 3.2954074617159108E+01, 1.0101142929606195E+02, -5.7000698932570963E+01, 1.4394533639244566E+01}; - constexpr FLT c14[] = {-1.5925952284527973E+00, 8.5113930275160214E+00, -2.8993510636695618E+01, 6.6373557362227814E+01, -1.0329536491693236E+02, 1.0280181071020283E+02, -4.3891122033571499E+01, -4.3893656778687756E+01, 1.0280325289276884E+02, -1.0329444716438918E+02, 6.6373666618482872E+01, -2.8993528390837142E+01, 8.5113926647511526E+00, -1.5925952190335899E+00}; - constexpr FLT c15[] = {1.5984868634272537E-02, 1.2876168577716327E-01, -9.8358742969178536E-01, 3.7710928871122080E+00, -9.4315137784350505E+00, 1.6840408563519507E+01, -2.2308532530501328E+01, 2.2310146222863779E+01, -1.6843058416240989E+01, 9.4311230950209399E+00, -3.7712287769953385E+00, 9.8360653920659347E-01, -1.2876103884046056E-01, -1.5984859595043394E-02}; + constexpr FLT c0[] = {1.6070755785071491E-09, 4.6371263117318300E-06, 5.3392892770691468E-04, 1.3380163586766329E-02, 1.1960061568997656E-01, 4.7332499268789285E-01, 9.2104360429933863E-01, 9.2104360429933885E-01, 4.7332499268789302E-01, 1.1960061568997683E-01, 1.3380163586766332E-02, 5.3392892770691837E-04, 4.6371263117318342E-06, 1.6070755785075502E-09}; + constexpr FLT c1[] = {9.2475302076758674E-09, 1.3546865389183953E-05, 1.0306349751547578E-03, 1.7767594411827761E-02, 1.0518000824290019E-01, 2.3882936521395404E-01, 1.5170179567585843E-01, -1.5170179567585837E-01, -2.3882936521395398E-01, 
-1.0518000824290036E-01, -1.7767594411827754E-02, -1.0306349751547613E-03, -1.3546865389183977E-05, -9.2475302076757731E-09}; + constexpr FLT c2[] = {2.4024402573674993E-08, 1.8178651135370012E-05, 8.9712289901830596E-04, 1.0104692380253478E-02, 3.4193348251104483E-02, 1.8533380680638794E-02, -6.3746746886473832E-02, -6.3746746886473860E-02, 1.8533380680638745E-02, 3.4193348251104413E-02, 1.0104692380253471E-02, 8.9712289901830889E-04, 1.8178651135370046E-05, 2.4024402573675768E-08}; + constexpr FLT c3[] = {3.7419288907183495E-08, 1.4804264337309617E-05, 4.5929141335173144E-04, 3.0552592910038168E-03, 3.3079403387824323E-03, -1.0247716289024879E-02, -1.1480323948535117E-02, 1.1480323948535463E-02, 1.0247716289025027E-02, -3.3079403387824271E-03, -3.0552592910038120E-03, -4.5929141335173334E-04, -1.4804264337309643E-05, -3.7419288907183766E-08}; + constexpr FLT c4[] = {3.9124194363163287E-08, 8.1265227753122953E-06, 1.4975407030324905E-04, 4.4789439277602894E-04, -7.9407521150521383E-04, -1.9254008995687184E-03, 2.1136619999320748E-03, 2.1136619999320141E-03, -1.9254008995687132E-03, -7.9407521150514292E-04, 4.4789439277602867E-04, 1.4975407030325005E-04, 8.1265227753123105E-06, 3.9124194363164148E-08}; + constexpr FLT c5[] = {2.9113992252245385E-08, 3.1458937074171823E-06, 3.0585266291431613E-05, -6.5135387342551234E-06, -2.3196510408355524E-04, 1.5778347828067563E-04, 4.2181913759748168E-04, -4.2181913759742725E-04, -1.5778347828060562E-04, 2.3196510408355524E-04, 6.5135387342551234E-06, -3.0585266291432040E-05, -3.1458937074171887E-06, -2.9113992252245408E-08}; + constexpr FLT c6[] = {1.5927753226313472E-08, 8.6591441391883797E-07, 3.1186030532599549E-06, -1.4256326863802477E-05, -6.9192418278078229E-06, 6.1786486497582421E-05, -4.4611361914704291E-05, -4.4611361914610670E-05, 6.1786486497541994E-05, -6.9192418278024798E-06, -1.4256326863804276E-05, 3.1186030532598494E-06, 8.6591441391883161E-07, 1.5927753226313945E-08}; + constexpr FLT c7[] = 
{6.5072355972925020E-09, 1.6321871905299654E-07, -1.6208737249918160E-07, -2.0005919851675986E-06, 4.6289117401651821E-06, 1.5738407907104777E-07, -1.0033756087313552E-05, 1.0033756087535249E-05, -1.5738407898383816E-07, -4.6289117402341052E-06, 2.0005919851709152E-06, 1.6208737249923451E-07, -1.6321871905299225E-07, -6.5072355972922787E-09}; + constexpr FLT c8[] = {1.9857214221989366E-09, 1.7788899565181922E-08, -1.0133541198312604E-07, 4.4566342395340293E-08, 5.3564828266574526E-07, -1.1695093255338883E-06, 6.7085595118984104E-07, 6.7085595114069746E-07, -1.1695093255217181E-06, 5.3564828276835377E-07, 4.4566342396873204E-08, -1.0133541198326502E-07, 1.7788899565180526E-08, 1.9857214221992563E-09}; + constexpr FLT c9[] = {4.4289508956510332E-10, -2.3397558741938982E-11, -1.2203541602658680E-08, 4.1555456455006879E-08, -4.0387396856849884E-08, -5.2822132653130956E-08, 1.7383889351097292E-07, -1.7383889353173241E-07, 5.2822132672506464E-08, 4.0387396834706444E-08, -4.1555456455698865E-08, 1.2203541602950610E-08, 2.3397558742361335E-11, -4.4289508956485253E-10}; + constexpr FLT c10[] = {6.7195187479843226E-11, -3.6781600571171619E-10, 1.8909214083296717E-10, 3.2074788122994124E-09, -1.0777792237807384E-08, 1.5287295377979802E-08, -7.6060392723093131E-09, -7.6060391755201933E-09, 1.5287295398091755E-08, -1.0777792217695420E-08, 3.2074788146563205E-09, 1.8909214044014493E-10, -3.6781600571662634E-10, 6.7195187480068943E-11}; + constexpr FLT c11[] = {5.1753158905822061E-12, -5.7459004384753609E-11, 2.1373772914288248E-10, -3.3474981614755248E-10, -5.5056523013581392E-11, 1.1984997345151211E-09, -2.3401534609898206E-09, 2.3401534737665714E-09, -1.1984997515507915E-09, 5.5056487167718091E-11, 3.3474981678638774E-10, -2.1373772871699109E-10, 5.7459004393903842E-11, -5.1753158903480283E-12}; + constexpr FLT c12[] = {-3.4295334316135217E-13, -1.9669734020395281E-12, 1.8829710516667924E-11, -6.6063898621267923E-11, 1.2987243021035191E-10, -1.4723142988261286E-10, 
6.6816662742079877E-11, 6.6816650491789053E-11, -1.4723143192432656E-10, 1.2987247614892944E-10, -6.6063898621269021E-11, 1.8829709886607818E-11, -1.9669734162457477E-12, -3.4295334295692199E-13}; + constexpr FLT c13[] = {-1.4925032356367256E-13, 5.9101412900182951E-13, -1.0473414103260276E-12, -3.4168877521962931E-13, 6.3681343308181771E-12, -1.6773485918159645E-11, 2.5499676364679485E-11, -2.5499722384571941E-11, 1.6773473223016897E-11, -6.3681501997466111E-12, 3.4168877521962931E-13, 1.0473414909104298E-12, -5.9101412551500433E-13, 1.4925032367414924E-13}; + constexpr FLT c14[] = {-1.6512890188764807E-14, 8.8250735109913167E-14, -3.0062084749515021E-13, 6.8819378623923325E-13, -1.0710378278007934E-12, 1.0658930503703208E-12, -4.5535006559156473E-13, -4.5529417109990688E-13, 1.0659116818675222E-12, -1.0710247857527394E-12, 6.8819549412647750E-13, -3.0062091542248455E-13, 8.8250729803090660E-14, -1.6512890092223385E-14}; + constexpr FLT c15[] = {1.6573977440105294E-16, 1.3350735743743382E-15, -1.0198606577404851E-14, 3.9099634678793536E-14, -9.7801981044810947E-14, 1.7461338478760738E-13, -2.3137912816883565E-13, 2.3133990246879147E-13, -1.7463221312362809E-13, 9.7795403196649327E-14, -3.9099513984331611E-14, 1.0198764988885690E-14, -1.3350660309704511E-15, -1.6573967886539614E-16}; for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); } else if (w==15) { - constexpr FLT c0[] = {2.3939707792242090E+05, 9.7700272582690299E+08, 1.4715933396485275E+11, 4.7242424833337236E+12, 5.3987426629953617E+13, 2.7580474290566103E+14, 7.0693378336533425E+14, 9.6196578554477850E+14, 7.0693378336533425E+14, 2.7580474290566153E+14, 5.3987426629953828E+13, 4.7242424833337285E+12, 1.4715933396485275E+11, 9.7700272582690418E+08, 2.3939707792242119E+05}; - constexpr FLT c1[] = {1.4314487885226035E+06, 
2.9961416925358462E+09, 3.0273361232748425E+11, 6.8507333793903604E+12, 5.4192702756911016E+13, 1.7551587948105316E+14, 2.1874615668430153E+14, 5.4722295550654096E-02, -2.1874615668430156E+14, -1.7551587948105334E+14, -5.4192702756911172E+13, -6.8507333793903730E+12, -3.0273361232748438E+11, -2.9961416925358448E+09, -1.4314487885226023E+06}; - constexpr FLT c2[] = {3.8829497354762922E+06, 4.2473082696966453E+09, 2.8414312556015533E+11, 4.3688281331121431E+12, 2.1823119508000547E+13, 3.2228098609392133E+13, -2.1833085454691801E+13, -7.3750710225100750E+13, -2.1833085454691875E+13, 3.2228098609392070E+13, 2.1823119508000590E+13, 4.3688281331121470E+12, 2.8414312556015527E+11, 4.2473082696966438E+09, 3.8829497354762908E+06}; - constexpr FLT c3[] = {6.3495763451755792E+06, 3.6841035003733959E+09, 1.5965774278321054E+11, 1.5630338683778213E+12, 3.8749058615819409E+12, -2.7319740087722651E+12, -1.3233342822865350E+13, 1.2682483963161023E-01, 1.3233342822865453E+13, 2.7319740087724204E+12, -3.8749058615819307E+12, -1.5630338683778201E+12, -1.5965774278321042E+11, -3.6841035003733950E+09, -6.3495763451755783E+06}; - constexpr FLT c4[] = {7.0146619045520453E+06, 2.1782897863065763E+09, 5.8897780310148117E+10, 3.1953009601770477E+11, 4.0651527030195397E+08, -1.6379148273275671E+12, -1.1568753137013023E+11, 2.7451653250461045E+12, -1.1568753137006947E+11, -1.6379148273276748E+12, 4.0651527030228132E+08, 3.1953009601770502E+11, 5.8897780310148155E+10, 2.1782897863065772E+09, 7.0146619045520453E+06}; - constexpr FLT c5[] = {5.5580012413990172E+06, 9.2345162185944211E+08, 1.4522950934020031E+10, 2.7025952371212032E+10, -1.2304576967641461E+11, -1.0116752717201025E+11, 3.8517418245450385E+11, 1.3143739157465117E-02, -3.8517418245443384E+11, 1.0116752717219414E+11, 1.2304576967643431E+11, -2.7025952371216137E+10, -1.4522950934020092E+10, -9.2345162185944176E+08, -5.5580012413990181E+06}; - constexpr FLT c6[] = {3.2693972344231815E+06, 2.8610260147425276E+08, 
2.2348528403751349E+09, -3.4574515574230409E+09, -1.7480626463581440E+10, 3.1608597465590984E+10, 1.9879262560063576E+10, -6.6148013553869423E+10, 1.9879262560078850E+10, 3.1608597465530212E+10, -1.7480626463573368E+10, -3.4574515574202504E+09, 2.2348528403750744E+09, 2.8610260147425228E+08, 3.2693972344231787E+06}; - constexpr FLT c7[] = {1.4553539959296281E+06, 6.4136842048384696E+07, 1.3622336582072574E+08, -1.2131510424637468E+09, 6.4322366984755766E+08, 4.5078753872548027E+09, -7.1689413747004452E+09, 3.2111361580040181E-03, 7.1689413747369127E+09, -4.5078753874649162E+09, -6.4322366984639454E+08, 1.2131510424612916E+09, -1.3622336582064471E+08, -6.4136842048384838E+07, -1.4553539959296265E+06}; - constexpr FLT c8[] = {4.9358776531681791E+05, 9.7772970960583091E+06, -2.3511574237971250E+07, -1.0142613816625430E+08, 3.9421144217985487E+08, -2.8449115594571364E+08, -5.7549243248595941E+08, 1.1608781630719392E+09, -5.7549243238966489E+08, -2.8449115596289498E+08, 3.9421144214631909E+08, -1.0142613816300942E+08, -2.3511574237913735E+07, 9.7772970960591603E+06, 4.9358776531681628E+05}; - constexpr FLT c9[] = {1.2660319987326709E+05, 7.7519511328105081E+05, -6.5244610661542164E+06, 9.0878257490973976E+06, 2.3116605621149909E+07, -8.7079594477661625E+07, 9.5542733670714021E+07, -3.4623017322338634E-02, -9.5542733658248380E+07, 8.7079594589852452E+07, -2.3116605559600774E+07, -9.0878257518242579E+06, 6.5244610661450867E+06, -7.7519511328086059E+05, -1.2660319987326671E+05}; - constexpr FLT c10[] = {2.3793325531461589E+04, -4.2305332802771904E+04, -5.2884156975031609E+05, 2.5307340145554747E+06, -4.0404175204335153E+06, -1.7519988538994591E+05, 1.0146438798034744E+07, -1.5828545528861172E+07, 1.0146438794496680E+07, -1.7520001842407117E+05, -4.0404175643064296E+06, 2.5307340160591919E+06, -5.2884156977243477E+05, -4.2305332802771285E+04, 2.3793325531458995E+04}; - constexpr FLT c11[] = {2.9741655196857741E+03, -2.0687056403629973E+04, 3.3295507834673197E+04, 
1.0661145690364030E+05, -5.6644238449031080E+05, 1.0874811673184116E+06, -9.6561276275880623E+05, -7.6207036577648435E-02, 9.6561275636531680E+05, -1.0874812580259521E+06, 5.6644242612787138E+05, -1.0661145858193116E+05, -3.3295507822185595E+04, 2.0687056403005630E+04, -2.9741655196852739E+03}; - constexpr FLT c12[] = {1.5389176594840404E+02, -2.3864418517811582E+03, 1.0846266965476148E+04, -2.2940053899336592E+04, 1.4780105833703366E+04, 4.2663634529139046E+04, -1.3047650082135458E+05, 1.7468394417865420E+05, -1.3047642955960588E+05, 4.2663569014305380E+04, 1.4780038020101238E+04, -2.2940052498526344E+04, 1.0846266965476338E+04, -2.3864418513602504E+03, 1.5389176594853458E+02}; - constexpr FLT c13[] = {-2.3857631312306911E+01, -1.9651606200276817E+01, 6.4183084244784663E+02, -2.8648428291977302E+03, 6.8249248253356263E+03, -9.7944434082514545E+03, 7.6177566999585488E+03, -4.8285923071218206E-02, -7.6177709934185850E+03, 9.7944219680614005E+03, -6.8249060651693289E+03, 2.8648407633460843E+03, -6.4183085466149657E+02, 1.9651606115081155E+01, 2.3857631312306911E+01}; - constexpr FLT c14[] = {-6.1348505726741482E+00, 2.7872916302350376E+01, -6.5819898558168433E+01, 5.1367134246654771E+01, 1.7214275703496423E+02, -6.9657243183240860E+02, 1.3192259272931558E+03, -1.6054145588281010E+03, 1.3192138654025996E+03, -6.9662907027505264E+02, 1.7212038135392731E+02, 5.1368095701697484E+01, -6.5819904020980715E+01, 2.7872916473063263E+01, -6.1348505738411490E+00}; - constexpr FLT c15[] = {-4.9671584422774523E-01, 3.0617550953446120E+00, -1.1650665638577927E+01, 3.0081331929557447E+01, -5.4030564936801589E+01, 6.6075844179663960E+01, -4.7176211285519123E+01, -3.4313439732287163E-02, 4.7173085818207042E+01, -6.6061100127341888E+01, 5.4056655794367416E+01, -3.0081722612971500E+01, 1.1650665638577902E+01, -3.0617553939307713E+00, 4.9671584448693240E-01}; - constexpr FLT c16[] = {4.3460783761337983E-03, -1.3199934226522787E-02, -1.9412503880258877E-01, 1.1325756464362078E+00, 
-3.4439944517155450E+00, 7.1653575841078521E+00, -1.1108195405465501E+01, 1.2348789868125033E+01, -1.1088023137785596E+01, 7.0939141360622937E+00, -3.4847592426682690E+00, 1.1324705825441117E+00, -1.9413837699275374E-01, -1.3199908576142469E-02, 4.3460782759542488E-03}; + constexpr FLT c0[] = {2.4886236238313534E-10, 1.0156314710024854E-06, 1.5297772142853732E-04, 4.9110296377727252E-03, 5.6121982134094042E-02, 2.8670951404936740E-01, 7.3488453954210731E-01, 1.0000000000000018E+00, 7.3488453954210708E-01, 2.8670951404936784E-01, 5.6121982134094188E-02, 4.9110296377727321E-03, 1.5297772142853737E-04, 1.0156314710024854E-06, 2.4886236238313394E-10}; + constexpr FLT c1[] = {1.4880454274285384E-09, 3.1146031777409673E-06, 3.1470309742465694E-04, 7.1215977556942766E-03, 5.6335374470954679E-02, 1.8245542837228418E-01, 2.2739494478010200E-01, -4.2425842671825266E-17, -2.2739494478010208E-01, -1.8245542837228432E-01, -5.6335374470954783E-02, -7.1215977556942861E-03, -3.1470309742465694E-04, -3.1146031777409668E-06, -1.4880454274285366E-09}; + constexpr FLT c2[] = {4.0364738474324423E-09, 4.4152383936309442E-06, 2.9537757977456596E-04, 4.5415629108243238E-03, 2.2685962261788550E-02, 3.3502333548319392E-02, -2.2696322242195994E-02, -7.6666666666667133E-02, -2.2696322242195945E-02, 3.3502333548319260E-02, 2.2685962261788570E-02, 4.5415629108243273E-03, 2.9537757977456591E-04, 4.4152383936309416E-06, 4.0364738474324407E-09}; + constexpr FLT c3[] = {6.6006259688120961E-09, 3.8297656275654657E-06, 1.6597029248061439E-04, 1.6248331197066942E-03, 4.0281119347581979E-03, -2.8399908290139206E-03, -1.3756562885831705E-02, 1.0758125681708418E-16, 1.3756562885831904E-02, 2.8399908290139895E-03, -4.0281119347581771E-03, -1.6248331197066914E-03, -1.6597029248061437E-04, -3.8297656275654657E-06, -6.6006259688120969E-09}; + constexpr FLT c4[] = {7.2920076887968825E-09, 2.2644150332986910E-06, 6.1226481435400985E-05, 3.3216368068303816E-04, 4.2258807580024870E-07, -1.7026747228854500E-03, 
-1.2026158633582243E-04, 2.8537037037044089E-03, -1.2026158633584264E-04, -1.7026747228853732E-03, 4.2258807580182180E-07, 3.3216368068303642E-04, 6.1226481435401053E-05, 2.2644150332986919E-06, 7.2920076887968842E-09}; + constexpr FLT c5[] = {5.7777535593445574E-09, 9.5996306286140537E-07, 1.5097159537535560E-05, 2.8094504791464212E-05, -1.2791075475386364E-04, -1.0516749004210079E-04, 4.0040320377530828E-04, 5.4844446833709888E-17, -4.0040320377525385E-04, 1.0516749004229523E-04, 1.2791075475386559E-04, -2.8094504791467126E-05, -1.5097159537535560E-05, -9.5996306286140579E-07, -5.7777535593445582E-09}; + constexpr FLT c6[] = {3.3986627004323950E-09, 2.9741452947022275E-07, 2.3232144780590118E-06, -3.5941523174497321E-06, -1.8171775676701533E-05, 3.2858338560981214E-05, 2.0665249075258455E-05, -6.8763374485615104E-05, 2.0665249075221676E-05, 3.2858338560934424E-05, -1.8171775676683576E-05, -3.5941523174470280E-06, 2.3232144780590435E-06, 2.9741452947022206E-07, 3.3986627004323950E-09}; + constexpr FLT c7[] = {1.5128957992049987E-09, 6.6672685257784247E-08, 1.4160936684823307E-07, -1.2611166225385906E-06, 6.6865545481897967E-07, 4.6861078169740899E-06, -7.4523870622442393E-06, 5.1688954219266444E-17, 7.4523870623463821E-06, -4.6861078171739939E-06, -6.6865545481690963E-07, 1.2611166225370325E-06, -1.4160936684824530E-07, -6.6672685257784551E-08, -1.5128957992049987E-09}; + constexpr FLT c8[] = {5.1310324414219292E-10, 1.0163871982745590E-08, -2.4441175134592830E-08, -1.0543632600171378E-07, 4.0979777876715675E-07, -2.9573937051194202E-07, -5.9824625884543558E-07, 1.2067769776847866E-06, -5.9824625879665336E-07, -2.9573937049659643E-07, 4.0979777875267863E-07, -1.0543632599876183E-07, -2.4441175134530762E-08, 1.0163871982746284E-08, 5.1310324414219364E-10}; + constexpr FLT c9[] = {1.3160883866734095E-10, 8.0584478671564817E-10, -6.7824252838686685E-09, 9.4471403089230076E-09, 2.4030590211824177E-08, -9.0522548480936782E-08, 9.9320303339648267E-08, 
1.4827374781995408E-17, -9.9320303311968964E-08, 9.0522548602725694E-08, -2.4030590184836860E-08, -9.4471403124694187E-09, 6.7824252839146209E-09, -8.0584478671585931E-10, -1.3160883866734196E-10}; + constexpr FLT c10[] = {2.4734066313995269E-11, -4.3978001545632529E-11, -5.4975091406435660E-10, 2.6307942070348926E-09, -4.2001676281559915E-09, -1.8212709350780177E-10, 1.0547608795803518E-08, -1.6454374555673015E-08, 1.0547608746152108E-08, -1.8212708345187657E-10, -4.2001676312984721E-09, 2.6307942087632753E-09, -5.4975091402508072E-10, -4.3978001545363347E-11, 2.4734066313995970E-11}; + constexpr FLT c11[] = {3.0917581107111067E-12, -2.1504981481527399E-11, 3.4611945838654282E-11, 1.1082666500276105E-10, -5.8883840899000033E-10, 1.1304779661881485E-09, -1.0037911406820197E-09, -5.7884986037117854E-17, 1.0037911398302301E-09, -1.1304781086488634E-09, 5.8883842723235649E-10, -1.1082666592552764E-10, -3.4611945887454015E-11, 2.1504981480972878E-11, -3.0917581107111891E-12}; + constexpr FLT c12[] = {1.5997634038655269E-13, -2.4807970173617968E-12, 1.1275106610326804E-11, -2.3847055813595321E-11, 1.5364454138408298E-11, 4.4350534757580891E-11, -1.3563510404683277E-10, 1.8159081432580251E-10, -1.3563508771311925E-10, 4.4350484735577755E-11, 1.5364420705333068E-11, -2.3847054665131313E-11, 1.1275106670142851E-11, -2.4807970168633410E-12, 1.5997634038739785E-13}; + constexpr FLT c13[] = {-2.4800914618527656E-14, -2.0428592368367617E-14, 6.6720756177865110E-13, -2.9781122281459938E-12, 7.0947566948544657E-12, -1.0181675867287212E-11, 7.9189142537208719E-12, -1.4497056804736912E-17, -7.9189459915777383E-12, 1.0181666345930152E-11, -7.0947487603902491E-12, 2.9781098973971301E-12, -6.6720754938105074E-13, 2.0428592180708626E-14, 2.4800914617770965E-14}; + constexpr FLT c14[] = {-6.3774103672726629E-15, 2.8974955370030088E-14, -6.8422346755457550E-14, 5.3399811794037740E-14, 1.7893441503609519E-13, -7.2418549150581294E-13, 1.3713697997539906E-12, -1.6687145216540105E-12, 
1.3713520998316439E-12, -7.2416872315832831E-13, 1.7893006768675052E-13, 5.3400626922038687E-14, -6.8422339477528482E-14, 2.8974955559559462E-14, -6.3774103666804019E-15}; + constexpr FLT c15[] = {-5.1635500202709335E-16, 3.1828105471276549E-15, -1.2111383721117860E-14, 3.1272734620510859E-14, -5.6176935449952714E-14, 6.8640388687474512E-14, -4.9039125333789703E-14, -3.5058680377244798E-17, 4.9029469776856299E-14, -6.8666790600965935E-14, 5.6189548021197700E-14, -3.1272749707318549E-14, 1.2111366748459164E-14, -3.1828106649933298E-15, 5.1635500199831522E-16}; + constexpr FLT c16[] = {4.5179133600663468E-18, -1.3721818586136237E-17, -2.0190809683029299E-16, 1.1787611877454253E-15, -3.5963787346199218E-15, 7.4622525856292898E-15, -1.1451676136812928E-14, 1.2941737777564503E-14, -1.1457648327763603E-14, 7.4174611535501039E-15, -3.6182145577673462E-15, 1.1783995902489914E-15, -2.0188185185104562E-16, -1.3721704675617759E-17, 4.5179136270619547E-18}; for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); } else if (w==16) { - constexpr FLT c0[] = {3.6434551345571154E+05, 2.0744705928579516E+09, 4.0355760945670056E+11, 1.6364575388763043E+13, 2.3514830376056566E+14, 1.5192201717462540E+15, 4.9956173084674150E+15, 8.9287666945127440E+15, 8.9287666945127440E+15, 4.9956173084674160E+15, 1.5192201717462542E+15, 2.3514830376056566E+14, 1.6364575388763049E+13, 4.0355760945670068E+11, 2.0744705928579512E+09, 3.6434551345570991E+05}; - constexpr FLT c1[] = {2.2576246485480345E+06, 6.6499571180086479E+09, 8.7873753526056311E+11, 2.5606844387131062E+13, 2.6313738449330162E+14, 1.1495095100701470E+15, 2.1932582707747572E+15, 1.2860244365132608E+15, -1.2860244365132600E+15, -2.1932582707747580E+15, -1.1495095100701462E+15, -2.6313738449330162E+14, -2.5606844387131066E+13, -8.7873753526056299E+11, 
-6.6499571180086479E+09, -2.2576246485480345E+06}; - constexpr FLT c2[] = {6.3730995546265058E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, 1.7953384130753688E+13, 1.2398425545001667E+14, 3.0749346493041262E+14, 1.0259777520247212E+14, -5.5291976457534244E+14, -5.5291976457534294E+14, 1.0259777520247097E+14, 3.0749346493041212E+14, 1.2398425545001659E+14, 1.7953384130753672E+13, 8.8097248605448987E+11, 9.9060026035198078E+09, 6.3730995546265077E+06}; - constexpr FLT c3[] = {1.0896915393078227E+07, 9.0890343524593887E+09, 5.3565169504010052E+11, 7.3004206720038770E+12, 2.9692333044160145E+13, 1.6051737468109752E+13, -9.1273329108089609E+13, -8.5999306918501562E+13, 8.5999306918502812E+13, 9.1273329108090391E+13, -1.6051737468109348E+13, -2.9692333044160059E+13, -7.3004206720038691E+12, -5.3565169504010046E+11, -9.0890343524593925E+09, -1.0896915393078225E+07}; - constexpr FLT c4[] = {1.2655725616100591E+07, 5.7342804054544220E+09, 2.1822836608899585E+11, 1.8300700858999712E+12, 2.7770431049857900E+12, -8.5034969223848574E+12, -1.2846668467422469E+13, 1.6519076896573322E+13, 1.6519076896573414E+13, -1.2846668467422033E+13, -8.5034969223850078E+12, 2.7770431049858350E+12, 1.8300700858999753E+12, 2.1822836608899594E+11, 5.7342804054544239E+09, 1.2655725616100593E+07}; - constexpr FLT c5[] = {1.0609303958036318E+07, 2.6255609052371716E+09, 6.1673589426039268E+10, 2.6044432099085120E+11, -3.5431628074578119E+11, -1.6077602129631777E+12, 1.5534405614726155E+12, 2.8019935380863682E+12, -2.8019935380852476E+12, -1.5534405614728257E+12, 1.6077602129636682E+12, 3.5431628074579871E+11, -2.6044432099085229E+11, -6.1673589426039368E+10, -2.6255609052371745E+09, -1.0609303958036322E+07}; - constexpr FLT c6[] = {6.6544809363384582E+06, 8.9490403680928528E+08, 1.1882638725190987E+10, 8.1552898137820768E+09, -1.2575562817884897E+11, 2.7074695075942204E+10, 3.9453789461929230E+11, -3.1679644857371918E+11, -3.1679644857384814E+11, 3.9453789461920764E+11, 2.7074695075779831E+10, 
-1.2575562817882477E+11, 8.1552898137801113E+09, 1.1882638725190844E+10, 8.9490403680928373E+08, 6.6544809363384526E+06}; - constexpr FLT c7[] = {3.1906872142825029E+06, 2.2785946180651915E+08, 1.3744578972811413E+09, -4.3997172592843504E+09, -9.2011130753862667E+09, 3.4690551711764793E+10, -9.4227043392778511E+09, -5.9308465069355759E+10, 5.9308465069781982E+10, 9.4227043396369877E+09, -3.4690551711565643E+10, 9.2011130754329739E+09, 4.3997172592904301E+09, -1.3744578972811375E+09, -2.2785946180652067E+08, -3.1906872142825001E+06}; - constexpr FLT c8[] = {1.1821527096621764E+06, 4.2281234059839748E+07, 2.8723226058752719E+07, -8.3553955857505906E+08, 1.2447304828865275E+09, 2.1955280942222519E+09, -7.0514195727878428E+09, 4.3745141232918625E+09, 4.3745141237316084E+09, -7.0514195722924280E+09, 2.1955280943332024E+09, 1.2447304828901291E+09, -8.3553955857124400E+08, 2.8723226058927339E+07, 4.2281234059842363E+07, 1.1821527096621776E+06}; - constexpr FLT c9[] = {3.3854610744279926E+05, 5.2176984975088174E+06, -2.0677283565109752E+07, -3.5831818967739724E+07, 2.6599346107970935E+08, -3.7992777963644773E+08, -1.3426914477301279E+08, 9.1752051236703849E+08, -9.1752051203046608E+08, 1.3426914449876857E+08, 3.7992777988576066E+08, -2.6599346104854524E+08, 3.5831818969687484E+07, 2.0677283565073233E+07, -5.2176984975085324E+06, -3.3854610744279926E+05}; - constexpr FLT c10[] = {7.3893334077310792E+04, 2.6983804209766653E+05, -3.6415998560216571E+06, 8.4025485866871737E+06, 4.9278860835956605E+06, -5.1437033778820507E+07, 8.7603898248918146E+07, -4.6199497914231867E+07, -4.6199497948197275E+07, 8.7603898697554156E+07, -5.1437033767498761E+07, 4.9278861543586710E+06, 8.4025485891638417E+06, -3.6415998559774463E+06, 2.6983804209732520E+05, 7.3893334077308697E+04}; - constexpr FLT c11[] = {1.1778892113376965E+04, -4.0077190108567142E+04, -1.8372552169915423E+05, 1.3262878389569877E+06, -2.9738540196046322E+06, 1.9493506557541618E+06, 4.1881949490808225E+06, 
-1.1066749801915919E+07, 1.1066748877418302E+07, -4.1881948928182255E+06, -1.9493507634843190E+06, 2.9738539997848324E+06, -1.3262878392766670E+06, 1.8372552166918706E+05, 4.0077190106849979E+04, -1.1778892113376709E+04}; - constexpr FLT c12[] = {1.2019749667900676E+03, -1.0378455845063749E+04, 2.6333352662141660E+04, 1.7117059675298591E+04, -2.5133289742429825E+05, 6.4713895872015413E+05, -8.1634975674778735E+05, 3.8623909535608569E+05, 3.8623887467451266E+05, -8.1634966479713970E+05, 6.4713897711029404E+05, -2.5133289282677229E+05, 1.7117063267120848E+04, 2.6333352680101594E+04, -1.0378455843660833E+04, 1.2019749667921026E+03}; - constexpr FLT c13[] = {3.1189837631121321E+01, -8.9083493701244504E+02, 4.9454293991649774E+03, -1.3124692742151998E+04, 1.5834795298841136E+04, 6.9608292767098355E+03, -5.9790200829217545E+04, 1.0841735230501879E+05, -1.0841732371809872E+05, 5.9789914960016831E+04, -6.9607435159496199E+03, -1.5834797085523640E+04, 1.3124692295481371E+04, -4.9454294410403490E+03, 8.9083493766674769E+02, -3.1189837632399257E+01}; - constexpr FLT c14[] = {-1.2975319072478742E+01, 1.8283699094028595E+01, 1.7684019694555272E+02, -1.1059902320249000E+03, 3.1998244780238201E+03, -5.5987981589200417E+03, 5.9247600879368474E+03, -2.5988290685215188E+03, -2.5988178806809206E+03, 5.9249852432272892E+03, -5.5987701893187350E+03, 3.1998552445852642E+03, -1.1059895327848767E+03, 1.7684022972243278E+02, 1.8283699179384410E+01, -1.2975319072812146E+01}; - constexpr FLT c15[] = {-2.3155118729306223E+00, 1.1938503369059017E+01, -3.4150537494399323E+01, 4.8897188710734866E+01, 1.5839596560322873E+01, -2.4289147960969117E+02, 6.0143231605823757E+02, -8.8772403477020873E+02, 8.8712611928432557E+02, -6.0139861536721287E+02, 2.4281211991792659E+02, -1.5853729108169823E+01, -4.8898479664625256E+01, 3.4150529001281690E+01, -1.1938504563403686E+01, 2.3155118727038264E+00}; - constexpr FLT c16[] = {-1.5401723836370515E-01, 9.8067787978090881E-01, -4.1900810719931050E+00, 
1.2149798852514468E+01, -2.4780790340446881E+01, 3.6014221907804398E+01, -3.4588714991383583E+01, 1.3071629460227753E+01, 1.2883354961750646E+01, -3.4615611348253751E+01, 3.5973877372428277E+01, -2.4777428295844171E+01, 1.2151059619254390E+01, -4.1901237542037384E+00, 9.8067813628521039E-01, -1.5401723766235165E-01}; - constexpr FLT c17[] = {1.1808834947531816E-02, -2.5444032491006262E-02, -1.4707353726716647E-04, 2.5840423001794482E-01, -1.0910598687678679E+00, 2.6514321899473572E+00, -4.5034457705829842E+00, 6.8479728528821520E+00, -6.8634402190500978E+00, 4.4285511554539836E+00, -2.6424773990080204E+00, 1.0878035811535636E+00, -2.5882398584322625E-01, 1.3196868749378181E-04, 2.5444131865017927E-02, -1.1808835384234016E-02}; + constexpr FLT c0[] = {3.7973138383475505E-11, 2.1620729770457867E-07, 4.2059935922517660E-05, 1.7055631615451750E-03, 2.4507833223051390E-02, 1.5833750021928361E-01, 5.2065761855025572E-01, 9.3058177132107800E-01, 9.3058177132107822E-01, 5.2065761855025583E-01, 1.5833750021928361E-01, 2.4507833223051407E-02, 1.7055631615451757E-03, 4.2059935922517680E-05, 2.1620729770457854E-07, 3.7973138383475363E-11}; + constexpr FLT c1[] = {2.3529614069937368E-10, 6.9307767643753084E-07, 9.1584555859393273E-05, 2.6688190455647263E-03, 2.7424935799146805E-02, 1.1980519064171602E-01, 2.2858769149343988E-01, 1.3403316930972969E-01, -1.3403316930972969E-01, -2.2858769149343988E-01, -1.1980519064171603E-01, -2.7424935799146809E-02, -2.6688190455647263E-03, -9.1584555859393273E-05, -6.9307767643753063E-07, -2.3529614069937291E-10}; + constexpr FLT c2[] = {6.6422278409342484E-10, 1.0324321112746625E-06, 9.1817488865684769E-05, 1.8711533829047168E-03, 1.2921996060610234E-02, 3.2047854205940321E-02, 1.0693035516337747E-02, -5.7626889750985358E-02, -5.7626889750985420E-02, 1.0693035516337622E-02, 3.2047854205940300E-02, 1.2921996060610227E-02, 1.8711533829047159E-03, 9.1817488865684728E-05, 1.0324321112746625E-06, 6.6422278409342453E-10}; + constexpr FLT c3[] = 
{1.1357078950958115E-09, 9.4728532805183455E-07, 5.5827161828283907E-05, 7.6087086075588353E-04, 3.0946204357507638E-03, 1.6729582927767952E-03, -9.5127691406672668E-03, -8.9630953638633881E-03, 8.9630953638635737E-03, 9.5127691406674039E-03, -1.6729582927767412E-03, -3.0946204357507521E-03, -7.6087086075588267E-04, -5.5827161828283886E-05, -9.4728532805183402E-07, -1.1357078950958119E-09}; + constexpr FLT c4[] = {1.3190161602522571E-09, 5.9764321317063336E-07, 2.2744388605472980E-05, 1.9073517322668089E-04, 2.8943142766413201E-04, -8.8625893129445465E-04, -1.3389167739520302E-03, 1.7216657535080475E-03, 1.7216657535079566E-03, -1.3389167739519974E-03, -8.8625893129445302E-04, 2.8943142766413342E-04, 1.9073517322668089E-04, 2.2744388605472997E-05, 5.9764321317063368E-07, 1.3190161602522571E-09}; + constexpr FLT c5[] = {1.1057322032863292E-09, 2.7364351668058875E-07, 6.4277990516969732E-06, 2.7144256967440253E-05, -3.6927862875708149E-05, -1.6756539822663250E-04, 1.6190404775924360E-04, 2.9203183363577429E-04, -2.9203183363574707E-04, -1.6190404775915027E-04, 1.6756539822663250E-04, 3.6927862875712038E-05, -2.7144256967440009E-05, -6.4277990516969918E-06, -2.7364351668058875E-07, -1.1057322032863296E-09}; + constexpr FLT c6[] = {6.9354916180818945E-10, 9.3269475195063855E-08, 1.2384428187212403E-06, 8.4996778392803041E-07, -1.3106613626284104E-05, 2.8218026704026646E-06, 4.1119875273776001E-05, -3.3017437945353985E-05, -3.3017437945415066E-05, 4.1119875273714446E-05, 2.8218026703990287E-06, -1.3106613626289508E-05, 8.4996778392747454E-07, 1.2384428187212240E-06, 9.3269475195063643E-08, 6.9354916180818914E-10}; + constexpr FLT c7[] = {3.3254260763956042E-10, 2.3748169129617104E-08, 1.4324995919586480E-07, -4.5855119979446571E-07, -9.5896649524100645E-07, 3.6155491755001142E-06, -9.8206137491315186E-07, -6.1812989819835450E-06, 6.1812989820611756E-06, 9.8206137497544330E-07, -3.6155491754721922E-06, 9.5896649524660746E-07, 4.5855119979503682E-07, 
-1.4324995919584492E-07, -2.3748169129616922E-08, -3.3254260763956068E-10}; + constexpr FLT c8[] = {1.2320735888479529E-10, 4.4066719437554910E-09, 2.9936173156462927E-09, -8.7082338359679101E-08, 1.2972939456291547E-07, 2.2882425903046301E-07, -7.3491924909334631E-07, 4.5592445674903059E-07, 4.5592445658978770E-07, -7.3491924903833956E-07, 2.2882425902441689E-07, 1.2972939456293178E-07, -8.7082338359266715E-08, 2.9936173156449473E-09, 4.4066719437557416E-09, 1.2320735888479524E-10}; + constexpr FLT c9[] = {3.5284250010876628E-11, 5.4380355945640250E-10, -2.1550460241694361E-09, -3.7344953348928088E-09, 2.7722604311846508E-08, -3.9597167021230792E-08, -1.3993916628542531E-08, 9.5626629210101709E-08, -9.5626629290371673E-08, 1.3993916670061478E-08, 3.9597167019846826E-08, -2.7722604310808535E-08, 3.7344953348928088E-09, 2.1550460241924123E-09, -5.4380355945618072E-10, -3.5284250010876789E-11}; + constexpr FLT c10[] = {7.7013760205813290E-12, 2.8123297626332877E-11, -3.7953802132437611E-10, 8.7573780453214681E-10, 5.1359846908750478E-10, -5.3609157480923598E-09, 9.1303305149265196E-09, -4.8150450778386211E-09, -4.8150450602405480E-09, 9.1303305006281353E-09, -5.3609157342653948E-09, 5.1359846657352753E-10, 8.7573780480711250E-10, -3.7953802133297068E-10, 2.8123297626237416E-11, 7.7013760205811319E-12}; + constexpr FLT c11[] = {1.2276300481459368E-12, -4.1769601372671798E-12, -1.9148402800715177E-11, 1.3822953630779855E-10, -3.0994364017547768E-10, 2.0316700893505159E-10, 4.3650568116859601E-10, -1.1534087567294806E-09, 1.1534086455717957E-09, -4.3650568244627625E-10, -2.0316701046115955E-10, 3.0994364003351358E-10, -1.3822953650299937E-10, 1.9148402794060861E-11, 4.1769601372325045E-12, -1.2276300481460517E-12}; + constexpr FLT c12[] = {1.2527329159215257E-13, -1.0816725479918068E-12, 2.7445378707133412E-12, 1.7839886378835549E-12, -2.6194655703148228E-11, 6.7446666417949068E-11, -8.5082142817277568E-11, 4.0255080062661886E-11, 4.0254965726647763E-11, 
-8.5082126483561454E-11, 6.7446671522236455E-11, -2.6194657362041918E-11, 1.7839889409505645E-12, 2.7445378607441180E-12, -1.0816725479139360E-12, 1.2527329159224173E-13}; + constexpr FLT c13[] = {3.2506946752710786E-15, -9.2845381849289691E-14, 5.1542691616877330E-13, -1.3678932005895992E-12, 1.6503397946393055E-12, 7.2548932254614457E-13, -6.2314806405069215E-12, 1.1299375277421538E-11, -1.1299433992456742E-11, 6.2314647715784883E-12, -7.2550201768889120E-13, -1.6503403897241219E-12, 1.3678930766135958E-12, -5.1542690377117294E-13, 9.2845381940092428E-14, -3.2506946753893115E-15}; + constexpr FLT c14[] = {-1.3523251101878356E-15, 1.9055798839533079E-15, 1.8430813184053169E-14, -1.1526987096958319E-13, 3.3349122385594633E-13, -5.8352048227061829E-13, 6.1751861733538967E-13, -2.7104853725824153E-13, -2.7103052681092733E-13, 6.1751644366071028E-13, -5.8351023494715043E-13, 3.3348982649365648E-13, -1.1526961866805939E-13, 1.8430809545089241E-14, 1.9055798650003023E-15, -1.3523251102248507E-15}; + constexpr FLT c15[] = {-2.4132931360656334E-16, 1.2442654599774185E-15, -3.5592598733275504E-15, 5.0956447378324209E-15, 1.6446732556150498E-15, -2.5290498540837812E-14, 6.2712721591286338E-14, -9.2666673089509217E-14, 9.2581824882952367E-14, -6.2712118118977746E-14, 2.5288160085642670E-14, -1.6451258598462044E-15, -5.0958559531403920E-15, 3.5592532728491847E-15, -1.2442654894438389E-15, 2.4132931361645452E-16}; + constexpr FLT c16[] = {-1.6052119916687038E-17, 1.0220930228231101E-16, -4.3668420339021406E-16, 1.2658361982998821E-15, -2.5907177687935505E-15, 3.7311262928168221E-15, -3.4997038937045781E-15, 1.4124231584693148E-15, 1.3706178218468559E-15, -3.5056760846448971E-15, 3.7363519598930578E-15, -2.5923974474980012E-15, 1.2658945204780770E-15, -4.3668985335150679E-16, 1.0220927950027870E-16, -1.6052119872193216E-17}; + constexpr FLT c17[] = {1.2307507877258324E-18, -2.6518352923945508E-18, -1.0105982127470271E-20, 2.6958700270869167E-17, -1.1513299715471039E-16, 
2.7882272296911513E-16, -4.6961519239790030E-16, 6.5796739812484873E-16, -6.7025909677113713E-16, 4.6238478142949540E-16, -2.8307058941305305E-16, 1.1494093936336214E-16, -2.6999653770494898E-17, 1.1474040843416029E-20, 2.6518435669432360E-18, -1.2307508200482882E-18}; for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); } else printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc b/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc deleted file mode 100644 index 1f4c59e2a..000000000 --- a/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc +++ /dev/null @@ -1,205 +0,0 @@ -// Code generated by gen_all_horner_C_code.m in finufft/devel -// Authors: Alex Barnett & Ludvig af Klinteberg. -// (C) The Simons Foundation, Inc. 
- if (w==2) { - constexpr FLT c0[] = {5.5428559551548406E-01, 5.5428559551548395E-01}; - constexpr FLT c1[] = {7.0481840008800778E-01, -7.0481840008800811E-01}; - constexpr FLT c2[] = {-2.2584311526143548E-02, -2.2584311526143607E-02}; - constexpr FLT c3[] = {-2.5024197515954211E-01, 2.5024197515954211E-01}; - for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); - } else if (w==3) { - constexpr FLT c0[] = {1.7787237246937579E-01, 1.0000000000000013E+00, 1.7787237247678464E-01}; - constexpr FLT c1[] = {3.5966530797581003E-01, -4.2425842671825248E-17, -3.5966530796781060E-01}; - constexpr FLT c2[] = {2.0160576446392536E-01, -3.7666666666667331E-01, 2.0160576447145470E-01}; - constexpr FLT c3[] = {-1.7450587318669351E-02, 2.2939218956436377E-17, 1.7450587325767743E-02}; - constexpr FLT c4[] = {-4.2902993854032963E-02, 6.0475925925925586E-02, -4.2902993846219546E-02}; - constexpr FLT c5[] = {-4.5057857403453909E-03, 6.6232851036457955E-18, 4.5057857475245110E-03}; - for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==4) { - constexpr FLT c0[] = {3.9828257752799377E-02, 7.3911656575585805E-01, 7.3911656575585805E-01, 3.9828257752799433E-02}; - constexpr FLT c1[] = {1.0749328817387334E-01, 4.5419700247912287E-01, -4.5419700247912287E-01, -1.0749328817387330E-01}; - constexpr FLT c2[] = {1.0408888748149289E-01, -1.0268333881994456E-01, -1.0268333881994476E-01, 1.0408888748149285E-01}; - constexpr FLT c3[] = {3.7516840869185789E-02, -1.0412335657155622E-01, 1.0412335657155641E-01, -3.7516840869185733E-02}; - constexpr FLT c4[] = {-3.5432868834529888E-03, 2.8903049344237370E-03, 2.8903049344238003E-03, -3.5432868834529676E-03}; - constexpr FLT c5[] = {-5.7512181801490673E-03, 1.0945950376831730E-02, -1.0945950376831654E-02, 5.7512181801490829E-03}; - constexpr FLT c6[] = {-7.3657365672905430E-04, 3.7144674885200340E-04, 3.7144674885207063E-04, -7.3657365672907728E-04}; - for (int i=0; 
i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); - } else if (w==5) { - constexpr FLT c0[] = {1.0051451410391413E-02, 3.8286382489474308E-01, 1.0000000000000009E+00, 3.8286382489474252E-01, 1.0051451410391420E-02}; - constexpr FLT c1[] = {3.0826052021380446E-02, 3.8431958613457984E-01, -4.7102147373384796E-32, -3.8431958613457951E-01, -3.0826052021380446E-02}; - constexpr FLT c2[] = {3.6562231959204314E-02, 7.8509612097392906E-02, -2.3000000000000059E-01, 7.8509612097392906E-02, 3.6562231959204300E-02}; - constexpr FLT c3[] = {2.0250135419918262E-02, -3.9381037339048602E-02, 1.0193845429304082E-16, 3.9381037339048686E-02, -2.0250135419918248E-02}; - constexpr FLT c4[] = {4.0593041193018580E-03, -1.6067481167759540E-02, 2.4150000000000074E-02, -1.6067481167759530E-02, 4.0593041193018597E-03}; - constexpr FLT c5[] = {-9.2488937959280210E-04, 1.2476700479675494E-03, 1.0406437805617128E-16, -1.2476700479676270E-03, 9.2488937959280405E-04}; - constexpr FLT c6[] = {-5.6059657038176136E-04, 1.2116190166774866E-03, -1.5448333333332675E-03, 1.2116190166775878E-03, -5.6059657038176342E-04}; - constexpr FLT c7[] = {-3.4201716508558499E-05, 2.3137115416428607E-05, 3.6450914717742488E-17, -2.3137115416288715E-05, 3.4201716508574924E-05}; - for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==6) { - constexpr FLT c0[] = {2.0875119883113440E-03, 1.5741818314646622E-01, 8.2446837122968764E-01, 8.2446837122968819E-01, 1.5741818314646633E-01, 2.0875119883208737E-03}; - constexpr FLT c1[] = {7.2383827471879086E-03, 2.0903648995439439E-01, 3.2052935784357633E-01, -3.2052935784357606E-01, -2.0903648995439447E-01, -7.2383827471776260E-03}; - constexpr FLT c2[] = {1.0180085126333453E-02, 9.2337811484269047E-02, -1.0253741712233820E-01, -1.0253741712233828E-01, 9.2337811484268964E-02, 1.0180085126343144E-02}; - constexpr FLT c3[] = 
{7.3669955501269460E-03, 4.9102900025223507E-03, -5.1302324979469405E-02, 5.1302324979469550E-02, -4.9102900025223160E-03, -7.3669955501178214E-03}; - constexpr FLT c4[] = {2.7444270008043898E-03, -8.0004810696544734E-03, 5.2920367975573743E-03, 5.2920367975574090E-03, -8.0004810696544873E-03, 2.7444270008144425E-03}; - constexpr FLT c5[] = {3.2622379114949894E-04, -1.8514138516535197E-03, 3.8520985619445234E-03, -3.8520985619444454E-03, 1.8514138516535119E-03, -3.2622379114026425E-04}; - constexpr FLT c6[] = {-1.2239646122606432E-04, 2.2750660293442782E-04, -1.2702072030317145E-04, -1.2702072030306984E-04, 2.2750660293439860E-04, -1.2239646121695236E-04}; - constexpr FLT c7[] = {-4.6695893922776242E-05, 1.1717219021520763E-04, -1.8098268625859964E-04, 1.8098268625869589E-04, -1.1717219021517810E-04, 4.6695893931711504E-05}; - constexpr FLT c8[] = {-1.5875418082745247E-06, 7.2147850127730698E-07, -7.0930078293142108E-08, -7.0930078245872243E-08, 7.2147850127811706E-07, -1.5875417996312271E-06}; - for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); - } else if (w==7) { - constexpr FLT c0[] = {4.0677823488318067E-04, 5.5714997521829540E-02, 5.1113018541287825E-01, 1.0000000000000002E+00, 5.1113018541287869E-01, 5.5714997521829561E-02, 4.0677823488475981E-04}; - constexpr FLT c1[] = {1.5569364307494555E-03, 8.9228372765634056E-02, 3.5049603091348180E-01, -1.8840858949353919E-32, -3.5049603091348197E-01, -8.9228372765634029E-02, -1.5569364307477620E-03}; - constexpr FLT c2[] = {2.4904843753404838E-03, 5.4888936725282375E-02, 2.4759577399513382E-02, -1.6428571428571445E-01, 2.4759577399513264E-02, 5.4888936725282340E-02, 2.4904843753420954E-03}; - constexpr FLT c3[] = {2.1552691780265232E-03, 1.3627105791872422E-02, -3.3718114813591167E-02, 1.0435679823191637E-16, 3.3718114813591278E-02, -1.3627105791872396E-02, -2.1552691780250210E-03}; - constexpr FLT c4[] = {1.0735311014902868E-03, 
-7.2030895675484117E-04, -6.6760503000563741E-03, 1.2656705539358732E-02, -6.6760503000563680E-03, -7.2030895675483119E-04, 1.0735311014919520E-03}; - constexpr FLT c5[] = {2.8413019973530626E-04, -1.1175797418592351E-03, 1.3906361031252640E-03, 1.0099777883094147E-16, -1.3906361031252017E-03, 1.1175797418592505E-03, -2.8413019973377792E-04}; - constexpr FLT c6[] = {1.6363160465889005E-05, -1.5802085209242310E-04, 4.4431051893374396E-04, -6.0985626028865780E-04, 4.4431051893376408E-04, -1.5802085209243416E-04, 1.6363160467394339E-05}; - constexpr FLT c7[] = {-1.2513684117291295E-05, 2.9105578584781478E-05, -2.8835295309364819E-05, 6.9093005849597210E-17, 2.8835295309456306E-05, -2.9105578584752466E-05, 1.2513684118770622E-05}; - constexpr FLT c8[] = {-3.2859430043343403E-06, 9.3570096164232078E-06, -1.7015821249906871E-05, 2.0688046128660197E-05, -1.7015821249876886E-05, 9.3570096164290557E-06, -3.2859430029058764E-06}; - constexpr FLT c9[] = {-1.5030958477935016E-08, -9.3540219413709317E-08, 1.3079704875560537E-07, 3.0755088144886539E-17, -1.3079704870024676E-07, 9.3540219430316894E-08, 1.5030959705830809E-08}; - for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==8) { - constexpr FLT c0[] = {7.5442178667264049E-05, 1.7659090182402852E-02, 2.6112828482312650E-01, 8.6561421087578294E-01, 8.6561421087578294E-01, 2.6112828482312650E-01, 1.7659090182402856E-02, 7.5442178667263913E-05}; - constexpr FLT c1[] = {3.1361556564941527E-04, 3.2518751351035657E-02, 2.4295266212395961E-01, 2.5083142126627195E-01, -2.5083142126627200E-01, -2.4295266212395961E-01, -3.2518751351035664E-02, -3.1361556564941506E-04}; - constexpr FLT c2[] = {5.5627094085228170E-04, 2.4604803324737457E-02, 6.5902977410162822E-02, -9.1064379250067565E-02, -9.1064379250067648E-02, 6.5902977410162836E-02, 2.4604803324737447E-02, 5.5627094085228149E-04}; - constexpr FLT c3[] = 
{5.5053208919074741E-04, 9.2359485489686977E-03, -6.2169545154249764E-03, -3.1386277864020387E-02, 3.1386277864020692E-02, 6.2169545154250301E-03, -9.2359485489686925E-03, -5.5053208919074741E-04}; - constexpr FLT c4[] = {3.3122072653963820E-04, 1.3353118718124376E-03, -5.9878504390516807E-03, 4.3217905833729843E-03, 4.3217905833729184E-03, -5.9878504390516564E-03, 1.3353118718124411E-03, 3.3122072653963842E-04}; - constexpr FLT c5[] = {1.2112223749399388E-04, -2.3174709024353528E-04, -5.1773322458159945E-04, 1.8691284471382664E-03, -1.8691284471382276E-03, 5.1773322458165388E-04, 2.3174709024353332E-04, -1.2112223749399391E-04}; - constexpr FLT c6[] = {2.3288943339077962E-05, -1.1810885265513022E-04, 2.1380000655379686E-04, -1.1905274322668279E-04, -1.1905274322667877E-04, 2.1380000655378596E-04, -1.1810885265513386E-04, 2.3288943339077766E-05}; - constexpr FLT c7[] = {8.7290223704935849E-08, -9.9551635569432461E-06, 3.9042123573714734E-05, -7.0647330846704962E-05, 7.0647330846826175E-05, -3.9042123573667747E-05, 9.9551635569490195E-06, -8.7290223704824623E-08}; - constexpr FLT c8[] = {-1.0444417486661213E-06, 2.8837147790326586E-06, -3.9445588398358951E-06, 1.9505656879624058E-06, 1.9505656880227840E-06, -3.9445588398203690E-06, 2.8837147790369691E-06, -1.0444417486660073E-06}; - constexpr FLT c9[] = {-1.9601350641688945E-07, 6.2981383505868899E-07, -1.3252363384761618E-06, 1.9071649677058813E-06, -1.9071649677363285E-06, 1.3252363385149127E-06, -6.2981383505419114E-07, 1.9601350641697053E-07}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==9) { - constexpr FLT c0[] = {1.3445576990655693E-05, 5.1377966678943553E-03, 1.1569392196071671E-01, 5.9595989228910695E-01, 1.0000000000000004E+00, 5.9595989228910784E-01, 1.1569392196071673E-01, 5.1377966678943874E-03, 1.3445576990655681E-05}; - constexpr FLT c1[] = {6.0003223623206657E-05, 
1.0569385595664990E-02, 1.3202059711663530E-01, 3.1241329121161582E-01, -8.4851685343650422E-17, -3.1241329121161615E-01, -1.3202059711663522E-01, -1.0569385595665032E-02, -6.0003223623206596E-05}; - constexpr FLT c2[] = {1.1601811379064824E-04, 9.2861699099147151E-03, 5.4760895870332324E-02, -2.7420112488894219E-04, -1.2777777777777805E-01, -2.7420112488935430E-04, 5.4760895870332296E-02, 9.2861699099147359E-03, 1.1601811379064817E-04}; - constexpr FLT c3[] = {1.2783089927061688E-04, 4.4048543606096807E-03, 6.4505427512762566E-03, -2.6627297241817574E-02, 1.0570032264240285E-16, 2.6627297241817935E-02, -6.4505427512762245E-03, -4.4048543606096877E-03, -1.2783089927061688E-04}; - constexpr FLT c4[] = {8.8459828362140127E-05, 1.1147546008569559E-03, -2.1200589329645782E-03, -2.9677441441083273E-03, 7.7692043895744413E-03, -2.9677441441080211E-03, -2.1200589329645678E-03, 1.1147546008569583E-03, 8.8459828362140168E-05}; - constexpr FLT c5[] = {3.9567294647305465E-05, 8.1817980646548672E-05, -7.2116754318327786E-04, 1.0390038161997466E-03, 1.3960675422467541E-16, -1.0390038161998867E-03, 7.2116754318328556E-04, -8.1817980646550122E-05, -3.9567294647305431E-05}; - constexpr FLT c6[] = {1.1032857092605887E-05, -3.4254477931955853E-05, -1.3557143976035256E-05, 1.8667778536557664E-04, -2.9974999576614188E-04, 1.8667778536546106E-04, -1.3557143976042615E-05, -3.4254477931959885E-05, 1.1032857092605841E-05}; - constexpr FLT c7[] = {1.5345430093717796E-06, -9.9308189188274098E-06, 2.3762810604639151E-05, -2.4017602201954516E-05, 1.1627785359675844E-17, 2.4017602202115669E-05, -2.3762810604628780E-05, 9.9308189188319669E-06, -1.5345430093718216E-06}; - constexpr FLT c8[] = {-8.1737159283255726E-08, -4.1540916378247392E-07, 2.6668107554223020E-06, -6.3261434127908313E-06, 8.2578681449311880E-06, -6.3261434126076934E-06, 2.6668107554440373E-06, -4.1540916378676467E-07, -8.1737159283249333E-08}; - constexpr FLT c9[] = {-7.3256982980608342E-08, 2.3321978963880019E-07, 
-4.0030411105333760E-07, 3.4388260968054864E-07, 6.5677795522570459E-17, -3.4388260990751890E-07, 4.0030411105333760E-07, -2.3321978963499429E-07, 7.3256982980640781E-08}; - constexpr FLT c10[] = {-1.0121400696579195E-08, 3.6191328862414928E-08, -8.7258577118961372E-08, 1.4622014477867198E-07, -1.7333902174790525E-07, 1.4622014483401952E-07, -8.7258577100106683E-08, 3.6191328859901120E-08, -1.0121400696606260E-08}; - for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==10) { - constexpr FLT c0[] = {2.3186292807626266E-06, 1.3952040327729876E-03, 4.5894237568906843E-02, 3.4666431215091636E-01, 8.9110862394332080E-01, 8.9110862394332024E-01, 3.4666431215091614E-01, 4.5894237568906843E-02, 1.3952040327729804E-03, 2.3186292807626329E-06}; - constexpr FLT c1[] = {1.1010978063160391E-05, 3.1454190365986022E-03, 6.0943215953720313E-02, 2.5074802988370321E-01, 2.0598750885032702E-01, -2.0598750885032710E-01, -2.5074802988370315E-01, -6.0943215953720306E-02, -3.1454190365985909E-03, -1.1010978063160380E-05}; - constexpr FLT c2[] = {2.2925449299630732E-05, 3.1050615653861980E-03, 3.2337657329423494E-02, 4.4760550762170469E-02, -8.0226193254406428E-02, -8.0226193254406289E-02, 4.4760550762170441E-02, 3.2337657329423480E-02, 3.1050615653861868E-03, 2.2925449299630681E-05}; - constexpr FLT c3[] = {2.7622345748507540E-05, 1.7317590416004974E-03, 7.6620063086756569E-03, -9.8393115612840278E-03, -2.1163068654269049E-02, 2.1163068654269510E-02, 9.8393115612841128E-03, -7.6620063086756491E-03, -1.7317590416004913E-03, -2.7622345748507479E-05}; - constexpr FLT c4[] = {2.1363614860997117E-05, 5.7553475552091617E-04, 1.4813144535930287E-04, -4.1113061120761924E-03, 3.3662735809591683E-03, 3.3662735809590794E-03, -4.1113061120762826E-03, 1.4813144535930759E-04, 5.7553475552091368E-04, 2.1363614860997080E-05}; - constexpr FLT c5[] = {1.1063475580065299E-05, 
1.0180053030149723E-04, -3.4137441280837177E-04, -4.9828659222651745E-05, 1.0442648308817235E-03, -1.0442648308817467E-03, 4.9828659222713965E-05, 3.4137441280837177E-04, -1.0180053030149541E-04, -1.1063475580065281E-05}; - constexpr FLT c6[] = {3.8359011440648869E-06, 1.3049698816919587E-06, -6.3791463619208982E-05, 1.4528730872072194E-04, -8.6630472952355992E-05, -8.6630472952398913E-05, 1.4528730872073633E-04, -6.3791463619214471E-05, 1.3049698816901833E-06, 3.8359011440648767E-06}; - constexpr FLT c7[] = {8.3366418668164326E-07, -3.5785601754616355E-06, 2.4539930904858821E-06, 1.2754336575782058E-05, -3.3000414536039571E-05, 3.3000414536273711E-05, -1.2754336575693992E-05, -2.4539930904800897E-06, 3.5785601754627781E-06, -8.3366418668163871E-07}; - constexpr FLT c8[] = {8.0572098823818712E-08, -6.8352224328357488E-07, 2.0695541423376112E-06, -2.9709579576770532E-06, 1.5005770225996294E-06, 1.5005770226481292E-06, -2.9709579578116679E-06, 2.0695541423438809E-06, -6.8352224328404986E-07, 8.0572098823810798E-08}; - constexpr FLT c9[] = {-1.0412910456843575E-08, -3.6228831474008107E-09, 1.3932530225640674E-07, -4.5071262434444286E-07, 7.5149884418348562E-07, -7.5149884428313110E-07, 4.5071262441364111E-07, -1.3932530225017888E-07, 3.6228831478332996E-09, 1.0412910456861821E-08}; - constexpr FLT c10[] = {-4.4291858216944146E-09, 1.5904364893350153E-08, -3.2603275106346107E-08, 3.8190045632066571E-08, -1.7631718176528265E-08, -1.7631718292171639E-08, 3.8190045621381707E-08, -3.2603275098803994E-08, 1.5904364893978648E-08, -4.4291858217073890E-09}; - constexpr FLT c11[] = {-4.4040059170580565E-10, 1.7857872825180656E-09, -4.9203237617335969E-09, 9.5125262125165431E-09, -1.3157194779492521E-08, 1.3157194812996001E-08, -9.5125262191888681E-09, 4.9203237596041585E-09, -1.7857872834763311E-09, 4.4040059170802652E-10}; - for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + 
z*(c11[i]))))))))))); - } else if (w==11) { - constexpr FLT c0[] = {3.8884809238313434E-07, 3.5785567372179951E-04, 1.6654951019551330E-02, 1.7692785324424570E-01, 6.5593328211813162E-01, 9.9999999999999978E-01, 6.5593328211813129E-01, 1.7692785324424565E-01, 1.6654951019551330E-02, 3.5785567372179962E-04, 3.8884809238312539E-07}; - constexpr FLT c1[] = {1.9516358260453364E-06, 8.7214421096705593E-04, 2.4929466432368100E-02, 1.5885079249667189E-01, 2.7894884556454935E-01, 9.4204294746769595E-33, -2.7894884556454941E-01, -1.5885079249667189E-01, -2.4929466432368097E-02, -8.7214421096705604E-04, -1.9516358260453169E-06}; - constexpr FLT c2[] = {4.3353827605930511E-06, 9.4705645354715550E-04, 1.5700144896729017E-02, 4.8428271550326758E-02, -1.2807080799297165E-02, -1.0454545454545448E-01, -1.2807080799297061E-02, 4.8428271550326821E-02, 1.5700144896729006E-02, 9.4705645354715518E-04, 4.3353827605930215E-06}; - constexpr FLT c3[] = {5.6395387871289846E-06, 5.9760549110825473E-04, 5.0911332059142295E-03, 1.6690038662948304E-03, -2.1030028251697912E-02, 1.4335617874817167E-16, 2.1030028251698141E-02, -1.6690038662947660E-03, -5.0911332059142200E-03, -5.9760549110825429E-04, -5.6395387871289508E-06}; - constexpr FLT c4[] = {4.7836299264887200E-06, 2.3732554180006408E-04, 7.1846854433598795E-04, -2.2660086673713248E-03, -1.3190061226035158E-03, 5.2488730277989188E-03, -1.3190061226033569E-03, -2.2660086673713374E-03, 7.1846854433598557E-04, 2.3732554180006421E-04, 4.7836299264886963E-06}; - constexpr FLT c5[] = {2.7801202330030064E-06, 5.8401836435976300E-05, -5.7255962675850168E-05, -4.1058481683291448E-04, 7.4543249761827859E-04, 6.7099534430837577E-17, -7.4543249761823186E-04, 4.1058481683291448E-04, 5.7255962675853089E-05, -5.8401836435976178E-05, -2.7801202330029924E-06}; - constexpr FLT c6[] = {1.1248609988572041E-06, 7.1593996360419040E-06, -3.7923443960739119E-05, 2.8219312687371359E-05, 8.5797383067823588E-05, -1.6875309167105302E-04, 8.5797383067779691E-05, 
2.8219312687392853E-05, -3.7923443960740034E-05, 7.1593996360418057E-06, 1.1248609988571978E-06}; - constexpr FLT c7[] = {3.1074712008817516E-07, -3.7942806006679305E-07, -4.2327710785708026E-06, 1.4518421536643064E-05, -1.6373413879605298E-05, 3.0222646636983358E-17, 1.6373413879621934E-05, -1.4518421536591986E-05, 4.2327710785753580E-06, 3.7942806006705484E-07, -3.1074712008817235E-07}; - constexpr FLT c8[] = {5.3160526822194444E-08, -2.9438470061321741E-07, 4.4816653817789122E-07, 4.9835853873945607E-07, -2.6602444110833864E-06, 3.9090815375281113E-06, -2.6602444110225165E-06, 4.9835853874269618E-07, 4.4816653818193273E-07, -2.9438470061323123E-07, 5.3160526822193583E-08}; - constexpr FLT c9[] = {3.1778958300854393E-09, -3.9044067083483707E-08, 1.4726158788365547E-07, -2.7451209287062293E-07, 2.4544112217999958E-07, 8.6199548859978872E-18, -2.4544112207758621E-07, 2.7451209285678326E-07, -1.4726158788296347E-07, 3.9044067083624268E-08, -3.1778958300829052E-09}; - constexpr FLT c10[] = {-8.6163117991617490E-10, 1.2292710054271969E-09, 4.9928263052430922E-09, -2.5746199362556884E-08, 5.5054682151312924E-08, -6.9606951358406722E-08, 5.5054682230504105E-08, -2.5746199365699604E-08, 4.9928263093284604E-09, 1.2292710054468060E-09, -8.6163117991862728E-10}; - constexpr FLT c11[] = {-2.3293080872726303E-10, 9.3461130390718653E-10, -2.2220140857286656E-09, 3.2420144232604506E-09, -2.5573586459741160E-09, -3.4362247560151687E-17, 2.5573586170134590E-09, -3.2420144222311963E-09, 2.2220140843090244E-09, -9.3461130382733279E-10, 2.3293080872885788E-10}; - constexpr FLT c12[] = {-1.6776727231079557E-11, 7.5440974150049303E-11, -2.3911386677196792E-10, 5.3207180787495740E-10, -8.5057641018270776E-10, 9.9272876082686339E-10, -8.5057644693357476E-10, 5.3207181195839291E-10, -2.3911386485786361E-10, 7.5440974126123504E-11, -1.6776727231328710E-11}; - for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + 
z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==12) { - constexpr FLT c0[] = {6.3667715563015689E-08, 8.7461142088576888E-05, 5.6146669497086589E-03, 8.1271316412301370E-02, 4.1627261402765736E-01, 9.0846375182673755E-01, 9.0846375182673755E-01, 4.1627261402765736E-01, 8.1271316412301550E-02, 5.6146669497086719E-03, 8.7461142088576929E-05, 6.3667715563034801E-08}; - constexpr FLT c1[] = {3.3587389488258588E-07, 2.2809471090022899E-04, 9.2744480587562007E-03, 8.5676487647659991E-02, 2.4720659158040625E-01, 1.7472997738462001E-01, -1.7472997738461990E-01, -2.4720659158040617E-01, -8.5676487647660143E-02, -9.2744480587562180E-03, -2.2809471090022899E-04, -3.3587389488256608E-07}; - constexpr FLT c2[] = {7.9035220764954472E-07, 2.6846594761214740E-04, 6.6557324960729147E-03, 3.4792641812076718E-02, 2.9454899103693762E-02, -7.1172529707069221E-02, -7.1172529707069207E-02, 2.9454899103693671E-02, 3.4792641812076690E-02, 6.6557324960729242E-03, 2.6846594761214740E-04, 7.9035220764956886E-07}; - constexpr FLT c3[] = {1.0993606197695965E-06, 1.8716155179384050E-04, 2.6329045000561364E-03, 5.3754303637600113E-03, -1.0591878410592502E-02, -1.5228395084945664E-02, 1.5228395084945803E-02, 1.0591878410592646E-02, -5.3754303637599376E-03, -2.6329045000561364E-03, -1.8716155179384044E-04, -1.0993606197695836E-06}; - constexpr FLT c4[] = {1.0091198513153346E-06, 8.4812954286468477E-05, 5.7431140218944460E-04, -5.0274672420766203E-04, -2.8008958990917627E-03, 2.6435090762445433E-03, 2.6435090762445819E-03, -2.8008958990918187E-03, -5.0274672420767580E-04, 5.7431140218944276E-04, 8.4812954286468423E-05, 1.0091198513153598E-06}; - constexpr FLT c5[] = {6.4507244019416584E-07, 2.5481132674301279E-05, 4.2795619387511420E-05, -3.0197159708156643E-04, 1.1080610219049720E-04, 6.4144454802694492E-04, -6.4144454802681275E-04, -1.1080610219045053E-04, 3.0197159708157808E-04, -4.2795619387511908E-05, -2.5481132674301286E-05, -6.4507244019414964E-07}; - constexpr FLT 
c6[] = {2.9426545129495891E-07, 4.7724106401925034E-06, -1.1001642128368358E-05, -2.6869692251292103E-05, 9.4483235217708846E-05, -6.1678458203322752E-05, -6.1678458203283029E-05, 9.4483235217638725E-05, -2.6869692251319154E-05, -1.1001642128368348E-05, 4.7724106401924525E-06, 2.9426545129497845E-07}; - constexpr FLT c7[] = {9.5799843879057487E-08, 3.7784160107136394E-07, -3.2256313018476217E-06, 5.0144058082843800E-06, 3.4886031174309006E-06, -1.7411974954245794E-05, 1.7411974954244114E-05, -3.4886031173677615E-06, -5.0144058082412084E-06, 3.2256313018490718E-06, -3.7784160107127161E-07, -9.5799843879039593E-08}; - constexpr FLT c8[] = {2.1473864761677802E-08, -5.7414008446850441E-08, -2.0134799316446491E-07, 1.1145247706131597E-06, -1.8840465966107854E-06, 1.0067804561094662E-06, 1.0067804560969447E-06, -1.8840465965985945E-06, 1.1145247706194121E-06, -2.0134799316567892E-07, -5.7414008446903526E-08, 2.1473864761695718E-08}; - constexpr FLT c9[] = {2.8867786924320735E-09, -2.0015791402048098E-08, 4.5306507660172584E-08, -7.8859059608423767E-09, -1.5755151471717741E-07, 3.4270221893522085E-07, -3.4270221891584534E-07, 1.5755151474485673E-07, 7.8859059608423767E-09, -4.5306507656885666E-08, 2.0015791402102159E-08, -2.8867786924173336E-09}; - constexpr FLT c10[] = {6.9986758892026879E-11, -1.8486004428526375E-09, 8.7658205612213605E-09, -2.0364661368255434E-08, 2.5396405431717686E-08, -1.2044441164754235E-08, -1.2044441145898965E-08, 2.5396405393379069E-08, -2.0364661337458944E-08, 8.7658205594930229E-09, -1.8486004428624741E-09, 6.9986758906941889E-11}; - constexpr FLT c11[] = {-5.6296594747629561E-11, 1.4066781276164117E-10, 4.6947620156299098E-11, -1.1526063766721083E-09, 3.3027593515457814E-09, -5.2174001597719162E-09, 5.2174001336505757E-09, -3.3027593563725673E-09, 1.1526063504088099E-09, -4.6947618665684182E-11, -1.4066781273945818E-10, 5.6296594761077256E-11}; - constexpr FLT c12[] = {-1.0870401168253040E-11, 4.8044744351982426E-11, -1.3004175788815863E-10, 
2.2570502267192305E-10, -2.4006684875388499E-10, 1.0598000131166063E-10, 1.0597991964307358E-10, -2.4006682833673746E-10, 2.2570504206821193E-10, -1.3004176149306233E-10, 4.8044744304130286E-11, -1.0870401156071839E-11}; - constexpr FLT c13[] = {-4.7539080498592749E-13, 2.6787995976616703E-12, -1.0000145739993567E-11, 2.5777400861531429E-11, -4.7463672955972831E-11, 6.4012227921839136E-11, -6.4012266007267373E-11, 4.7463669782187146E-11, -2.5777397687745743E-11, 1.0000149112140858E-11, -2.6787995744161696E-12, 4.7539081133001201E-13}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); - } else if (w==13) { - constexpr FLT c0[] = {1.0208956054983696E-08, 2.0506572462261995E-05, 1.7784497194617906E-03, 3.4214490279693019E-02, 2.3443634373410047E-01, 7.0049708882252804E-01, 9.9999999999999956E-01, 7.0049708882252670E-01, 2.3443634373410041E-01, 3.4214490279692922E-02, 1.7784497194617906E-03, 2.0506572462261785E-05, 1.0208956054983676E-08}; - constexpr FLT c1[] = {5.6353468219321995E-08, 5.6780128053894686E-05, 3.1934841481628326E-03, 4.0941461360716927E-02, 1.7436810648693357E-01, 2.5085467225681696E-01, -6.3638764007737755E-17, -2.5085467225681662E-01, -1.7436810648693341E-01, -4.0941461360716816E-02, -3.1934841481628326E-03, -5.6780128053894232E-05, -5.6353468219321988E-08}; - constexpr FLT c2[] = {1.3966266158866427E-07, 7.1655019336418755E-05, 2.5459504018621182E-03, 2.0160236969440644E-02, 4.0770064165298429E-02, -1.9317276988534509E-02, -8.8461538461538661E-02, -1.9317276988534381E-02, 4.0770064165298395E-02, 2.0160236969440602E-02, 2.5459504018621160E-03, 7.1655019336418200E-05, 1.3966266158866422E-07}; - constexpr FLT c3[] = {2.0618605552701903E-07, 5.4306747658367697E-05, 1.1637911071900936E-03, 4.7784706844645319E-03, -1.2004184173788884E-03, -1.6862510515565966E-02, 1.4394808111083350E-16, 
1.6862510515566146E-02, 1.2004184173788636E-03, -4.7784706844645379E-03, -1.1637911071900920E-03, -5.4306747658367331E-05, -2.0618605552701909E-07}; - constexpr FLT c4[] = {2.0277547837406105E-07, 2.7328509487415503E-05, 3.2236608098850310E-04, 3.0859705461356495E-04, -2.0254394973524947E-03, -5.2398574644553877E-04, 3.7818616294949463E-03, -5.2398574644547762E-04, -2.0254394973524895E-03, 3.0859705461357378E-04, 3.2236608098850327E-04, 2.7328509487415384E-05, 2.0277547837406108E-07}; - constexpr FLT c5[] = {1.4058372037094490E-07, 9.4685595066536085E-06, 4.8682874512158502E-05, -1.1575111217134651E-04, -2.1811605515759046E-04, 5.4056763477041119E-04, 1.1213866287069097E-16, -5.4056763477029453E-04, 2.1811605515769156E-04, 1.1575111217135234E-04, -4.8682874512158861E-05, -9.4685595066535949E-06, -1.4058372037094498E-07}; - constexpr FLT c6[] = {7.0755520230584385E-08, 2.2298625886400277E-06, 7.8375383352022143E-07, -2.8394470622676381E-05, 3.5771256766257562E-05, 4.1631950912211130E-05, -1.0418619302467684E-04, 4.1631950912333557E-05, 3.5771256766183768E-05, -2.8394470622671916E-05, 7.8375383351933331E-07, 2.2298625886400294E-06, 7.0755520230584346E-08}; - constexpr FLT c7[] = {2.6111186487625245E-08, 3.2044561720738826E-07, -1.2220373462313589E-06, -8.5793794342228941E-07, 8.3299507234112700E-06, -1.0956754351178954E-05, 9.4610283796409485E-17, 1.0956754351115859E-05, -8.3299507234215327E-06, 8.5793794342144989E-07, 1.2220373462321896E-06, -3.2044561720741346E-07, -2.6111186487625302E-08}; - constexpr FLT c8[] = {6.9838095920570498E-09, 1.2796250155222958E-08, -2.1971713837900942E-07, 5.2791981730307194E-07, -1.4622692107334488E-07, -1.2222183756556175E-06, 2.0809248310569844E-06, -1.2222183756925741E-06, -1.4622692099063203E-07, 5.2791981730006307E-07, -2.1971713837856465E-07, 1.2796250155283016E-08, 6.9838095920570937E-09}; - constexpr FLT c9[] = {1.2845897306280646E-09, -5.2304801922802769E-09, -5.0548716982175665E-09, 6.7539942924545603E-08, 
-1.6027276234256162E-07, 1.5655092165632365E-07, 4.6828140259346451E-17, -1.5655092173659360E-07, 1.6027276234809749E-07, -6.7539942912781904E-08, 5.0548716984338105E-09, 5.2304801922379145E-09, -1.2845897306280857E-09}; - constexpr FLT c10[] = {1.3345700642131601E-10, -1.1551704392349950E-09, 3.4412362345673782E-09, -3.2850871078054311E-09, -6.1855158542452699E-09, 2.3119925642302808E-08, -3.2145944181567604E-08, 2.3119926027259106E-08, -6.1855159240088862E-09, -3.2850871247748739E-09, 3.4412362345280933E-09, -1.1551704391858975E-09, 1.3345700642134581E-10}; - constexpr FLT c11[] = {-1.9694481417663767E-12, -7.0630732018717419E-11, 4.4161967766895751E-10, -1.2581280884757252E-09, 2.0087583285653241E-09, -1.6557203488425082E-09, 5.7014219382328511E-17, 1.6557200410648860E-09, -2.0087583339599462E-09, 1.2581281082796833E-09, -4.4161967789965090E-10, 7.0630731978790794E-11, 1.9694481417229703E-12}; - constexpr FLT c12[] = {-3.1122514901291979E-12, 1.0235548893351873E-11, -1.0076717787418374E-11, -3.6278872085836478E-11, 1.6235812713334426E-10, -3.2356766327511469E-10, 4.0014573853281197E-10, -3.2356772044312440E-10, 1.6235817511363862E-10, -3.6278891226911122E-11, -1.0076717627909611E-11, 1.0235548938213992E-11, -3.1122514900941893E-12}; - constexpr FLT c13[] = {-4.4521627553052389E-13, 2.1830423195977186E-12, -6.6494700502871459E-12, 1.3364548102385267E-11, -1.7572530897780217E-11, 1.3087527392509343E-11, -1.4854086432767967E-17, -1.3087613084722882E-11, 1.7572508681280409E-11, -1.3364552466340585E-11, 6.6494701742631489E-12, -2.1830423513665695E-12, 4.4521627553052389E-13}; - constexpr FLT c14[] = {-1.1331825591762625E-14, 7.5442537823437382E-14, -3.5473113067901070E-13, 1.0827924393926043E-12, -2.3053993601726267E-12, 3.5752731472827676E-12, -4.1288118242378826E-12, 3.5755029357484062E-12, -2.3054273074184593E-12, 1.0827837446939142E-12, -3.5473109186339628E-13, 7.5442574213081941E-14, -1.1331825564518091E-14}; - for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] 
+ z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); - } else if (w==14) { - constexpr FLT c0[] = {1.6070755785071491E-09, 4.6371263117318300E-06, 5.3392892770691468E-04, 1.3380163586766329E-02, 1.1960061568997656E-01, 4.7332499268789285E-01, 9.2104360429933863E-01, 9.2104360429933885E-01, 4.7332499268789302E-01, 1.1960061568997683E-01, 1.3380163586766332E-02, 5.3392892770691837E-04, 4.6371263117318342E-06, 1.6070755785075502E-09}; - constexpr FLT c1[] = {9.2475302076758674E-09, 1.3546865389183953E-05, 1.0306349751547578E-03, 1.7767594411827761E-02, 1.0518000824290019E-01, 2.3882936521395404E-01, 1.5170179567585843E-01, -1.5170179567585837E-01, -2.3882936521395398E-01, -1.0518000824290036E-01, -1.7767594411827754E-02, -1.0306349751547613E-03, -1.3546865389183977E-05, -9.2475302076757731E-09}; - constexpr FLT c2[] = {2.4024402573674993E-08, 1.8178651135370012E-05, 8.9712289901830596E-04, 1.0104692380253478E-02, 3.4193348251104483E-02, 1.8533380680638794E-02, -6.3746746886473832E-02, -6.3746746886473860E-02, 1.8533380680638745E-02, 3.4193348251104413E-02, 1.0104692380253471E-02, 8.9712289901830889E-04, 1.8178651135370046E-05, 2.4024402573675768E-08}; - constexpr FLT c3[] = {3.7419288907183495E-08, 1.4804264337309617E-05, 4.5929141335173144E-04, 3.0552592910038168E-03, 3.3079403387824323E-03, -1.0247716289024879E-02, -1.1480323948535117E-02, 1.1480323948535463E-02, 1.0247716289025027E-02, -3.3079403387824271E-03, -3.0552592910038120E-03, -4.5929141335173334E-04, -1.4804264337309643E-05, -3.7419288907183766E-08}; - constexpr FLT c4[] = {3.9124194363163287E-08, 8.1265227753122953E-06, 1.4975407030324905E-04, 4.4789439277602894E-04, -7.9407521150521383E-04, -1.9254008995687184E-03, 2.1136619999320748E-03, 2.1136619999320141E-03, -1.9254008995687132E-03, -7.9407521150514292E-04, 4.4789439277602867E-04, 1.4975407030325005E-04, 8.1265227753123105E-06, 
3.9124194363164148E-08}; - constexpr FLT c5[] = {2.9113992252245385E-08, 3.1458937074171823E-06, 3.0585266291431613E-05, -6.5135387342551234E-06, -2.3196510408355524E-04, 1.5778347828067563E-04, 4.2181913759748168E-04, -4.2181913759742725E-04, -1.5778347828060562E-04, 2.3196510408355524E-04, 6.5135387342551234E-06, -3.0585266291432040E-05, -3.1458937074171887E-06, -2.9113992252245408E-08}; - constexpr FLT c6[] = {1.5927753226313472E-08, 8.6591441391883797E-07, 3.1186030532599549E-06, -1.4256326863802477E-05, -6.9192418278078229E-06, 6.1786486497582421E-05, -4.4611361914704291E-05, -4.4611361914610670E-05, 6.1786486497541994E-05, -6.9192418278024798E-06, -1.4256326863804276E-05, 3.1186030532598494E-06, 8.6591441391883161E-07, 1.5927753226313945E-08}; - constexpr FLT c7[] = {6.5072355972925020E-09, 1.6321871905299654E-07, -1.6208737249918160E-07, -2.0005919851675986E-06, 4.6289117401651821E-06, 1.5738407907104777E-07, -1.0033756087313552E-05, 1.0033756087535249E-05, -1.5738407898383816E-07, -4.6289117402341052E-06, 2.0005919851709152E-06, 1.6208737249923451E-07, -1.6321871905299225E-07, -6.5072355972922787E-09}; - constexpr FLT c8[] = {1.9857214221989366E-09, 1.7788899565181922E-08, -1.0133541198312604E-07, 4.4566342395340293E-08, 5.3564828266574526E-07, -1.1695093255338883E-06, 6.7085595118984104E-07, 6.7085595114069746E-07, -1.1695093255217181E-06, 5.3564828276835377E-07, 4.4566342396873204E-08, -1.0133541198326502E-07, 1.7788899565180526E-08, 1.9857214221992563E-09}; - constexpr FLT c9[] = {4.4289508956510332E-10, -2.3397558741938982E-11, -1.2203541602658680E-08, 4.1555456455006879E-08, -4.0387396856849884E-08, -5.2822132653130956E-08, 1.7383889351097292E-07, -1.7383889353173241E-07, 5.2822132672506464E-08, 4.0387396834706444E-08, -4.1555456455698865E-08, 1.2203541602950610E-08, 2.3397558742361335E-11, -4.4289508956485253E-10}; - constexpr FLT c10[] = {6.7195187479843226E-11, -3.6781600571171619E-10, 1.8909214083296717E-10, 3.2074788122994124E-09, 
-1.0777792237807384E-08, 1.5287295377979802E-08, -7.6060392723093131E-09, -7.6060391755201933E-09, 1.5287295398091755E-08, -1.0777792217695420E-08, 3.2074788146563205E-09, 1.8909214044014493E-10, -3.6781600571662634E-10, 6.7195187480068943E-11}; - constexpr FLT c11[] = {5.1753158905822061E-12, -5.7459004384753609E-11, 2.1373772914288248E-10, -3.3474981614755248E-10, -5.5056523013581392E-11, 1.1984997345151211E-09, -2.3401534609898206E-09, 2.3401534737665714E-09, -1.1984997515507915E-09, 5.5056487167718091E-11, 3.3474981678638774E-10, -2.1373772871699109E-10, 5.7459004393903842E-11, -5.1753158903480283E-12}; - constexpr FLT c12[] = {-3.4295334316135217E-13, -1.9669734020395281E-12, 1.8829710516667924E-11, -6.6063898621267923E-11, 1.2987243021035191E-10, -1.4723142988261286E-10, 6.6816662742079877E-11, 6.6816650491789053E-11, -1.4723143192432656E-10, 1.2987247614892944E-10, -6.6063898621269021E-11, 1.8829709886607818E-11, -1.9669734162457477E-12, -3.4295334295692199E-13}; - constexpr FLT c13[] = {-1.4925032356367256E-13, 5.9101412900182951E-13, -1.0473414103260276E-12, -3.4168877521962931E-13, 6.3681343308181771E-12, -1.6773485918159645E-11, 2.5499676364679485E-11, -2.5499722384571941E-11, 1.6773473223016897E-11, -6.3681501997466111E-12, 3.4168877521962931E-13, 1.0473414909104298E-12, -5.9101412551500433E-13, 1.4925032367414924E-13}; - constexpr FLT c14[] = {-1.6512890188764807E-14, 8.8250735109913167E-14, -3.0062084749515021E-13, 6.8819378623923325E-13, -1.0710378278007934E-12, 1.0658930503703208E-12, -4.5535006559156473E-13, -4.5529417109990688E-13, 1.0659116818675222E-12, -1.0710247857527394E-12, 6.8819549412647750E-13, -3.0062091542248455E-13, 8.8250729803090660E-14, -1.6512890092223385E-14}; - constexpr FLT c15[] = {1.6573977440105294E-16, 1.3350735743743382E-15, -1.0198606577404851E-14, 3.9099634678793536E-14, -9.7801981044810947E-14, 1.7461338478760738E-13, -2.3137912816883565E-13, 2.3133990246879147E-13, -1.7463221312362809E-13, 9.7795403196649327E-14, 
-3.9099513984331611E-14, 1.0198764988885690E-14, -1.3350660309704511E-15, -1.6573967886539614E-16}; - for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); - } else if (w==15) { - constexpr FLT c0[] = {2.4886236238313534E-10, 1.0156314710024854E-06, 1.5297772142853732E-04, 4.9110296377727252E-03, 5.6121982134094042E-02, 2.8670951404936740E-01, 7.3488453954210731E-01, 1.0000000000000018E+00, 7.3488453954210708E-01, 2.8670951404936784E-01, 5.6121982134094188E-02, 4.9110296377727321E-03, 1.5297772142853737E-04, 1.0156314710024854E-06, 2.4886236238313394E-10}; - constexpr FLT c1[] = {1.4880454274285384E-09, 3.1146031777409673E-06, 3.1470309742465694E-04, 7.1215977556942766E-03, 5.6335374470954679E-02, 1.8245542837228418E-01, 2.2739494478010200E-01, -4.2425842671825266E-17, -2.2739494478010208E-01, -1.8245542837228432E-01, -5.6335374470954783E-02, -7.1215977556942861E-03, -3.1470309742465694E-04, -3.1146031777409668E-06, -1.4880454274285366E-09}; - constexpr FLT c2[] = {4.0364738474324423E-09, 4.4152383936309442E-06, 2.9537757977456596E-04, 4.5415629108243238E-03, 2.2685962261788550E-02, 3.3502333548319392E-02, -2.2696322242195994E-02, -7.6666666666667133E-02, -2.2696322242195945E-02, 3.3502333548319260E-02, 2.2685962261788570E-02, 4.5415629108243273E-03, 2.9537757977456591E-04, 4.4152383936309416E-06, 4.0364738474324407E-09}; - constexpr FLT c3[] = {6.6006259688120961E-09, 3.8297656275654657E-06, 1.6597029248061439E-04, 1.6248331197066942E-03, 4.0281119347581979E-03, -2.8399908290139206E-03, -1.3756562885831705E-02, 1.0758125681708418E-16, 1.3756562885831904E-02, 2.8399908290139895E-03, -4.0281119347581771E-03, -1.6248331197066914E-03, -1.6597029248061437E-04, -3.8297656275654657E-06, -6.6006259688120969E-09}; - constexpr FLT c4[] = {7.2920076887968825E-09, 2.2644150332986910E-06, 
6.1226481435400985E-05, 3.3216368068303816E-04, 4.2258807580024870E-07, -1.7026747228854500E-03, -1.2026158633582243E-04, 2.8537037037044089E-03, -1.2026158633584264E-04, -1.7026747228853732E-03, 4.2258807580182180E-07, 3.3216368068303642E-04, 6.1226481435401053E-05, 2.2644150332986919E-06, 7.2920076887968842E-09}; - constexpr FLT c5[] = {5.7777535593445574E-09, 9.5996306286140537E-07, 1.5097159537535560E-05, 2.8094504791464212E-05, -1.2791075475386364E-04, -1.0516749004210079E-04, 4.0040320377530828E-04, 5.4844446833709888E-17, -4.0040320377525385E-04, 1.0516749004229523E-04, 1.2791075475386559E-04, -2.8094504791467126E-05, -1.5097159537535560E-05, -9.5996306286140579E-07, -5.7777535593445582E-09}; - constexpr FLT c6[] = {3.3986627004323950E-09, 2.9741452947022275E-07, 2.3232144780590118E-06, -3.5941523174497321E-06, -1.8171775676701533E-05, 3.2858338560981214E-05, 2.0665249075258455E-05, -6.8763374485615104E-05, 2.0665249075221676E-05, 3.2858338560934424E-05, -1.8171775676683576E-05, -3.5941523174470280E-06, 2.3232144780590435E-06, 2.9741452947022206E-07, 3.3986627004323950E-09}; - constexpr FLT c7[] = {1.5128957992049987E-09, 6.6672685257784247E-08, 1.4160936684823307E-07, -1.2611166225385906E-06, 6.6865545481897967E-07, 4.6861078169740899E-06, -7.4523870622442393E-06, 5.1688954219266444E-17, 7.4523870623463821E-06, -4.6861078171739939E-06, -6.6865545481690963E-07, 1.2611166225370325E-06, -1.4160936684824530E-07, -6.6672685257784551E-08, -1.5128957992049987E-09}; - constexpr FLT c8[] = {5.1310324414219292E-10, 1.0163871982745590E-08, -2.4441175134592830E-08, -1.0543632600171378E-07, 4.0979777876715675E-07, -2.9573937051194202E-07, -5.9824625884543558E-07, 1.2067769776847866E-06, -5.9824625879665336E-07, -2.9573937049659643E-07, 4.0979777875267863E-07, -1.0543632599876183E-07, -2.4441175134530762E-08, 1.0163871982746284E-08, 5.1310324414219364E-10}; - constexpr FLT c9[] = {1.3160883866734095E-10, 8.0584478671564817E-10, -6.7824252838686685E-09, 
9.4471403089230076E-09, 2.4030590211824177E-08, -9.0522548480936782E-08, 9.9320303339648267E-08, 1.4827374781995408E-17, -9.9320303311968964E-08, 9.0522548602725694E-08, -2.4030590184836860E-08, -9.4471403124694187E-09, 6.7824252839146209E-09, -8.0584478671585931E-10, -1.3160883866734196E-10}; - constexpr FLT c10[] = {2.4734066313995269E-11, -4.3978001545632529E-11, -5.4975091406435660E-10, 2.6307942070348926E-09, -4.2001676281559915E-09, -1.8212709350780177E-10, 1.0547608795803518E-08, -1.6454374555673015E-08, 1.0547608746152108E-08, -1.8212708345187657E-10, -4.2001676312984721E-09, 2.6307942087632753E-09, -5.4975091402508072E-10, -4.3978001545363347E-11, 2.4734066313995970E-11}; - constexpr FLT c11[] = {3.0917581107111067E-12, -2.1504981481527399E-11, 3.4611945838654282E-11, 1.1082666500276105E-10, -5.8883840899000033E-10, 1.1304779661881485E-09, -1.0037911406820197E-09, -5.7884986037117854E-17, 1.0037911398302301E-09, -1.1304781086488634E-09, 5.8883842723235649E-10, -1.1082666592552764E-10, -3.4611945887454015E-11, 2.1504981480972878E-11, -3.0917581107111891E-12}; - constexpr FLT c12[] = {1.5997634038655269E-13, -2.4807970173617968E-12, 1.1275106610326804E-11, -2.3847055813595321E-11, 1.5364454138408298E-11, 4.4350534757580891E-11, -1.3563510404683277E-10, 1.8159081432580251E-10, -1.3563508771311925E-10, 4.4350484735577755E-11, 1.5364420705333068E-11, -2.3847054665131313E-11, 1.1275106670142851E-11, -2.4807970168633410E-12, 1.5997634038739785E-13}; - constexpr FLT c13[] = {-2.4800914618527656E-14, -2.0428592368367617E-14, 6.6720756177865110E-13, -2.9781122281459938E-12, 7.0947566948544657E-12, -1.0181675867287212E-11, 7.9189142537208719E-12, -1.4497056804736912E-17, -7.9189459915777383E-12, 1.0181666345930152E-11, -7.0947487603902491E-12, 2.9781098973971301E-12, -6.6720754938105074E-13, 2.0428592180708626E-14, 2.4800914617770965E-14}; - constexpr FLT c14[] = {-6.3774103672726629E-15, 2.8974955370030088E-14, -6.8422346755457550E-14, 5.3399811794037740E-14, 
1.7893441503609519E-13, -7.2418549150581294E-13, 1.3713697997539906E-12, -1.6687145216540105E-12, 1.3713520998316439E-12, -7.2416872315832831E-13, 1.7893006768675052E-13, 5.3400626922038687E-14, -6.8422339477528482E-14, 2.8974955559559462E-14, -6.3774103666804019E-15}; - constexpr FLT c15[] = {-5.1635500202709335E-16, 3.1828105471276549E-15, -1.2111383721117860E-14, 3.1272734620510859E-14, -5.6176935449952714E-14, 6.8640388687474512E-14, -4.9039125333789703E-14, -3.5058680377244798E-17, 4.9029469776856299E-14, -6.8666790600965935E-14, 5.6189548021197700E-14, -3.1272749707318549E-14, 1.2111366748459164E-14, -3.1828106649933298E-15, 5.1635500199831522E-16}; - constexpr FLT c16[] = {4.5179133600663468E-18, -1.3721818586136237E-17, -2.0190809683029299E-16, 1.1787611877454253E-15, -3.5963787346199218E-15, 7.4622525856292898E-15, -1.1451676136812928E-14, 1.2941737777564503E-14, -1.1457648327763603E-14, 7.4174611535501039E-15, -3.6182145577673462E-15, 1.1783995902489914E-15, -2.0188185185104562E-16, -1.3721704675617759E-17, 4.5179136270619547E-18}; - for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); - } else if (w==16) { - constexpr FLT c0[] = {3.7973138383475505E-11, 2.1620729770457867E-07, 4.2059935922517660E-05, 1.7055631615451750E-03, 2.4507833223051390E-02, 1.5833750021928361E-01, 5.2065761855025572E-01, 9.3058177132107800E-01, 9.3058177132107822E-01, 5.2065761855025583E-01, 1.5833750021928361E-01, 2.4507833223051407E-02, 1.7055631615451757E-03, 4.2059935922517680E-05, 2.1620729770457854E-07, 3.7973138383475363E-11}; - constexpr FLT c1[] = {2.3529614069937368E-10, 6.9307767643753084E-07, 9.1584555859393273E-05, 2.6688190455647263E-03, 2.7424935799146805E-02, 1.1980519064171602E-01, 2.2858769149343988E-01, 1.3403316930972969E-01, -1.3403316930972969E-01, -2.2858769149343988E-01, 
-1.1980519064171603E-01, -2.7424935799146809E-02, -2.6688190455647263E-03, -9.1584555859393273E-05, -6.9307767643753063E-07, -2.3529614069937291E-10}; - constexpr FLT c2[] = {6.6422278409342484E-10, 1.0324321112746625E-06, 9.1817488865684769E-05, 1.8711533829047168E-03, 1.2921996060610234E-02, 3.2047854205940321E-02, 1.0693035516337747E-02, -5.7626889750985358E-02, -5.7626889750985420E-02, 1.0693035516337622E-02, 3.2047854205940300E-02, 1.2921996060610227E-02, 1.8711533829047159E-03, 9.1817488865684728E-05, 1.0324321112746625E-06, 6.6422278409342453E-10}; - constexpr FLT c3[] = {1.1357078950958115E-09, 9.4728532805183455E-07, 5.5827161828283907E-05, 7.6087086075588353E-04, 3.0946204357507638E-03, 1.6729582927767952E-03, -9.5127691406672668E-03, -8.9630953638633881E-03, 8.9630953638635737E-03, 9.5127691406674039E-03, -1.6729582927767412E-03, -3.0946204357507521E-03, -7.6087086075588267E-04, -5.5827161828283886E-05, -9.4728532805183402E-07, -1.1357078950958119E-09}; - constexpr FLT c4[] = {1.3190161602522571E-09, 5.9764321317063336E-07, 2.2744388605472980E-05, 1.9073517322668089E-04, 2.8943142766413201E-04, -8.8625893129445465E-04, -1.3389167739520302E-03, 1.7216657535080475E-03, 1.7216657535079566E-03, -1.3389167739519974E-03, -8.8625893129445302E-04, 2.8943142766413342E-04, 1.9073517322668089E-04, 2.2744388605472997E-05, 5.9764321317063368E-07, 1.3190161602522571E-09}; - constexpr FLT c5[] = {1.1057322032863292E-09, 2.7364351668058875E-07, 6.4277990516969732E-06, 2.7144256967440253E-05, -3.6927862875708149E-05, -1.6756539822663250E-04, 1.6190404775924360E-04, 2.9203183363577429E-04, -2.9203183363574707E-04, -1.6190404775915027E-04, 1.6756539822663250E-04, 3.6927862875712038E-05, -2.7144256967440009E-05, -6.4277990516969918E-06, -2.7364351668058875E-07, -1.1057322032863296E-09}; - constexpr FLT c6[] = {6.9354916180818945E-10, 9.3269475195063855E-08, 1.2384428187212403E-06, 8.4996778392803041E-07, -1.3106613626284104E-05, 2.8218026704026646E-06, 
4.1119875273776001E-05, -3.3017437945353985E-05, -3.3017437945415066E-05, 4.1119875273714446E-05, 2.8218026703990287E-06, -1.3106613626289508E-05, 8.4996778392747454E-07, 1.2384428187212240E-06, 9.3269475195063643E-08, 6.9354916180818914E-10}; - constexpr FLT c7[] = {3.3254260763956042E-10, 2.3748169129617104E-08, 1.4324995919586480E-07, -4.5855119979446571E-07, -9.5896649524100645E-07, 3.6155491755001142E-06, -9.8206137491315186E-07, -6.1812989819835450E-06, 6.1812989820611756E-06, 9.8206137497544330E-07, -3.6155491754721922E-06, 9.5896649524660746E-07, 4.5855119979503682E-07, -1.4324995919584492E-07, -2.3748169129616922E-08, -3.3254260763956068E-10}; - constexpr FLT c8[] = {1.2320735888479529E-10, 4.4066719437554910E-09, 2.9936173156462927E-09, -8.7082338359679101E-08, 1.2972939456291547E-07, 2.2882425903046301E-07, -7.3491924909334631E-07, 4.5592445674903059E-07, 4.5592445658978770E-07, -7.3491924903833956E-07, 2.2882425902441689E-07, 1.2972939456293178E-07, -8.7082338359266715E-08, 2.9936173156449473E-09, 4.4066719437557416E-09, 1.2320735888479524E-10}; - constexpr FLT c9[] = {3.5284250010876628E-11, 5.4380355945640250E-10, -2.1550460241694361E-09, -3.7344953348928088E-09, 2.7722604311846508E-08, -3.9597167021230792E-08, -1.3993916628542531E-08, 9.5626629210101709E-08, -9.5626629290371673E-08, 1.3993916670061478E-08, 3.9597167019846826E-08, -2.7722604310808535E-08, 3.7344953348928088E-09, 2.1550460241924123E-09, -5.4380355945618072E-10, -3.5284250010876789E-11}; - constexpr FLT c10[] = {7.7013760205813290E-12, 2.8123297626332877E-11, -3.7953802132437611E-10, 8.7573780453214681E-10, 5.1359846908750478E-10, -5.3609157480923598E-09, 9.1303305149265196E-09, -4.8150450778386211E-09, -4.8150450602405480E-09, 9.1303305006281353E-09, -5.3609157342653948E-09, 5.1359846657352753E-10, 8.7573780480711250E-10, -3.7953802133297068E-10, 2.8123297626237416E-11, 7.7013760205811319E-12}; - constexpr FLT c11[] = {1.2276300481459368E-12, -4.1769601372671798E-12, 
-1.9148402800715177E-11, 1.3822953630779855E-10, -3.0994364017547768E-10, 2.0316700893505159E-10, 4.3650568116859601E-10, -1.1534087567294806E-09, 1.1534086455717957E-09, -4.3650568244627625E-10, -2.0316701046115955E-10, 3.0994364003351358E-10, -1.3822953650299937E-10, 1.9148402794060861E-11, 4.1769601372325045E-12, -1.2276300481460517E-12}; - constexpr FLT c12[] = {1.2527329159215257E-13, -1.0816725479918068E-12, 2.7445378707133412E-12, 1.7839886378835549E-12, -2.6194655703148228E-11, 6.7446666417949068E-11, -8.5082142817277568E-11, 4.0255080062661886E-11, 4.0254965726647763E-11, -8.5082126483561454E-11, 6.7446671522236455E-11, -2.6194657362041918E-11, 1.7839889409505645E-12, 2.7445378607441180E-12, -1.0816725479139360E-12, 1.2527329159224173E-13}; - constexpr FLT c13[] = {3.2506946752710786E-15, -9.2845381849289691E-14, 5.1542691616877330E-13, -1.3678932005895992E-12, 1.6503397946393055E-12, 7.2548932254614457E-13, -6.2314806405069215E-12, 1.1299375277421538E-11, -1.1299433992456742E-11, 6.2314647715784883E-12, -7.2550201768889120E-13, -1.6503403897241219E-12, 1.3678930766135958E-12, -5.1542690377117294E-13, 9.2845381940092428E-14, -3.2506946753893115E-15}; - constexpr FLT c14[] = {-1.3523251101878356E-15, 1.9055798839533079E-15, 1.8430813184053169E-14, -1.1526987096958319E-13, 3.3349122385594633E-13, -5.8352048227061829E-13, 6.1751861733538967E-13, -2.7104853725824153E-13, -2.7103052681092733E-13, 6.1751644366071028E-13, -5.8351023494715043E-13, 3.3348982649365648E-13, -1.1526961866805939E-13, 1.8430809545089241E-14, 1.9055798650003023E-15, -1.3523251102248507E-15}; - constexpr FLT c15[] = {-2.4132931360656334E-16, 1.2442654599774185E-15, -3.5592598733275504E-15, 5.0956447378324209E-15, 1.6446732556150498E-15, -2.5290498540837812E-14, 6.2712721591286338E-14, -9.2666673089509217E-14, 9.2581824882952367E-14, -6.2712118118977746E-14, 2.5288160085642670E-14, -1.6451258598462044E-15, -5.0958559531403920E-15, 3.5592532728491847E-15, -1.2442654894438389E-15, 
2.4132931361645452E-16}; - constexpr FLT c16[] = {-1.6052119916687038E-17, 1.0220930228231101E-16, -4.3668420339021406E-16, 1.2658361982998821E-15, -2.5907177687935505E-15, 3.7311262928168221E-15, -3.4997038937045781E-15, 1.4124231584693148E-15, 1.3706178218468559E-15, -3.5056760846448971E-15, 3.7363519598930578E-15, -2.5923974474980012E-15, 1.2658945204780770E-15, -4.3668985335150679E-16, 1.0220927950027870E-16, -1.6052119872193216E-17}; - constexpr FLT c17[] = {1.2307507877258324E-18, -2.6518352923945508E-18, -1.0105982127470271E-20, 2.6958700270869167E-17, -1.1513299715471039E-16, 2.7882272296911513E-16, -4.6961519239790030E-16, 6.5796739812484873E-16, -6.7025909677113713E-16, 4.6238478142949540E-16, -2.8307058941305305E-16, 1.1494093936336214E-16, -2.6999653770494898E-17, 1.1474040843416029E-20, 2.6518435669432360E-18, -1.2307508200482882E-18}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); - } else - printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc index 358a1bdbf..e2fa229b7 100644 --- a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc @@ -2,191 +2,170 @@ // Authors: Alex Barnett & Ludvig af Klinteberg. // (C) The Simons Foundation, Inc. 
if (w==2) { - constexpr FLT c0[] = {2.3711015472112535E+01, 2.3711015472112539E+01}; - constexpr FLT c1[] = {2.5079742199350566E+01, -2.5079742199350566E+01}; - constexpr FLT c2[] = {-3.5023281580177019E+00, -3.5023281580177028E+00}; - constexpr FLT c3[] = {-7.3894949249195596E+00, 7.3894949249195649E+00}; + constexpr FLT c0[] = {6.1209111871385702E-01, 6.1209111871385702E-01}; + constexpr FLT c1[] = {6.4742429432896431E-01, -6.4742429432896442E-01}; + constexpr FLT c2[] = {-9.0411309581634847E-02, -9.0411309581634750E-02}; + constexpr FLT c3[] = {-1.9075708590566751E-01, 1.9075708590566753E-01}; for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); } else if (w==3) { - constexpr FLT c0[] = {5.9620016143346866E+01, 2.4110216701187517E+02, 5.9620016148621886E+01}; - constexpr FLT c1[] = {9.7575520958604287E+01, 6.0625609804989280E-15, -9.7575520952908548E+01}; - constexpr FLT c2[] = {3.5838417859768519E+01, -7.3472145274965385E+01, 3.5838417865129472E+01}; - constexpr FLT c3[] = {-1.0721643298166459E+01, 2.2269719700859066E-14, 1.0721643303220411E+01}; - constexpr FLT c4[] = {-7.0570630207138105E+00, 9.1538553399011651E+00, -7.0570630151506615E+00}; + constexpr FLT c0[] = {2.4728112933307078E-01, 1.0000000000000044E+00, 2.4728112935494964E-01}; + constexpr FLT c1[] = {4.0470611346184543E-01, 2.1212921335912390E-17, -4.0470611343822160E-01}; + constexpr FLT c2[] = {1.4864411342268655E-01, -3.0473448739822773E-01, 1.4864411344492173E-01}; + constexpr FLT c3[] = {-4.4469294619149627E-02, 1.3598904496642886E-16, 4.4469294640111616E-02}; + constexpr FLT c4[] = {-2.9270010751775037E-02, 3.7966707032750659E-02, -2.9270010728701147E-02}; for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); } else if (w==4) { - constexpr FLT c0[] = {1.2612470018753703E+02, 1.1896204292999123E+03, 1.1896204292999125E+03, 1.2612470018753706E+02}; - constexpr FLT c1[] = {2.6158034850676631E+02, 5.6161104654809833E+02, 
-5.6161104654809833E+02, -2.6158034850676631E+02}; - constexpr FLT c2[] = {1.7145379463699527E+02, -1.6695967127766502E+02, -1.6695967127766531E+02, 1.7145379463699518E+02}; - constexpr FLT c3[] = {2.3525961965887934E+01, -1.0057439659768855E+02, 1.0057439659768869E+02, -2.3525961965887870E+01}; - constexpr FLT c4[] = {-1.5608307370340814E+01, 9.5627412100261218E+00, 9.5627412100261768E+00, -1.5608307370340912E+01}; - constexpr FLT c5[] = {-4.5715207776748672E+00, 7.9904373067896399E+00, -7.9904373067894170E+00, 4.5715207776748832E+00}; + constexpr FLT c0[] = {8.4048892491849839E-02, 7.9275732207620875E-01, 7.9275732207620908E-01, 8.4048892491849811E-02}; + constexpr FLT c1[] = {1.7431588385887239E-01, 3.7425489538028417E-01, -3.7425489538028422E-01, -1.7431588385887242E-01}; + constexpr FLT c2[] = {1.1425598262146337E-01, -1.1126112046907141E-01, -1.1126112046907137E-01, 1.1425598262146335E-01}; + constexpr FLT c3[] = {1.5677587697716072E-02, -6.7022293289915616E-02, 6.7022293289915727E-02, -1.5677587697716041E-02}; + constexpr FLT c4[] = {-1.0401300825285629E-02, 6.3725646657139309E-03, 6.3725646657139005E-03, -1.0401300825285625E-02}; + constexpr FLT c5[] = {-3.0464394190490617E-03, 5.3247889205097435E-03, -5.3247889205097279E-03, 3.0464394190490305E-03}; for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); } else if (w==5) { - constexpr FLT c0[] = {2.4106943677442635E+02, 4.3538384278025578E+03, 9.3397486707382068E+03, 4.3538384278025542E+03, 2.4106943677442635E+02}; - constexpr FLT c1[] = {5.8781364250328284E+02, 3.4742855804122032E+03, -2.2247045611533172E-13, -3.4742855804122019E+03, -5.8781364250328272E+02}; - constexpr FLT c2[] = {5.1234107167555874E+02, 3.5219546517037230E+02, -1.7076861141633149E+03, 3.5219546517037259E+02, 5.1234107167555862E+02}; - constexpr FLT c3[] = {1.7540956907856085E+02, -3.5792356187777011E+02, 1.0950032210404113E-12, 3.5792356187777193E+02, -1.7540956907856062E+02}; - constexpr FLT 
c4[] = {-2.1768066955080412E-01, -7.8322173187697160E+01, 1.3904039464934533E+02, -7.8322173187696521E+01, -2.1768066955089899E-01}; - constexpr FLT c5[] = {-1.4207955403641282E+01, 1.6019466986222039E+01, 6.2864597222035853E-14, -1.6019466986221275E+01, 1.4207955403641282E+01}; - constexpr FLT c6[] = {-2.1966493586752702E+00, 5.0672636163198259E+00, -6.7340544905090631E+00, 5.0672636163192113E+00, -2.1966493586753031E+00}; - for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + constexpr FLT c0[] = {2.5811126752233307E-02, 4.6616226852477344E-01, 1.0000000000000007E+00, 4.6616226852477305E-01, 2.5811126752233318E-02}; + constexpr FLT c1[] = {6.2936773057387055E-02, 3.7198919402374020E-01, 2.1212921335912559E-17, -3.7198919402374009E-01, -6.2936773057387055E-02}; + constexpr FLT c2[] = {5.4855980576944567E-02, 3.7709308632020676E-02, -1.8284069243892637E-01, 3.7709308632020731E-02, 5.4855980576944567E-02}; + constexpr FLT c3[] = {1.8780973157032140E-02, -3.8322611720715660E-02, 1.4047484462204681E-16, 3.8322611720715834E-02, -1.8780973157032116E-02}; + constexpr FLT c4[] = {-2.3306908700105430E-05, -8.3858973028989436E-03, 1.4886952481383787E-02, -8.3858973028988499E-03, -2.3306908700106227E-05}; + constexpr FLT c5[] = {-1.5212353034889806E-03, 1.7151925122365422E-03, 1.0734071182258885E-16, -1.7151925122365888E-03, 1.5212353034889806E-03}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); } else if (w==6) { - constexpr FLT c0[] = {4.3011762559089192E+02, 1.3368828836127082E+04, 4.9861340433371268E+04, 4.9861340433371290E+04, 1.3368828836127082E+04, 4.3011762559835182E+02}; - constexpr FLT c1[] = {1.1857225840065146E+03, 1.4112553227730619E+04, 1.5410005180819442E+04, -1.5410005180819426E+04, -1.4112553227730617E+04, -1.1857225839984601E+03}; - constexpr FLT c2[] = {1.2460481448413077E+03, 4.3127030215084988E+03, -5.5438591621431215E+03, 
-5.5438591621431233E+03, 4.3127030215084969E+03, 1.2460481448488895E+03}; - constexpr FLT c3[] = {6.0825549344387821E+02, -3.4106010789546866E+02, -1.9775725023673151E+03, 1.9775725023673224E+03, 3.4106010789547190E+02, -6.0825549343673049E+02}; - constexpr FLT c4[] = {1.1264961069783713E+02, -3.9740822717990801E+02, 2.7557540616463564E+02, 2.7557540616463149E+02, -3.9740822717990505E+02, 1.1264961070570472E+02}; - constexpr FLT c5[] = {-1.5387906304333869E+01, -3.2640579296386335E+01, 1.1683718215647407E+02, -1.1683718215647050E+02, 3.2640579296386335E+01, 1.5387906311562686E+01}; - constexpr FLT c6[] = {-9.3947198873910107E+00, 1.5069930500884340E+01, -8.0900452409585597E+00, -8.0900452409573536E+00, 1.5069930500885983E+01, -9.3947198802582648E+00}; - constexpr FLT c7[] = {-5.6048841964528473E-01, 2.3377422080932533E+00, -4.2391567591829169E+00, 4.2391567591861783E+00, -2.3377422080911803E+00, 5.6048842664328347E-01}; - for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + constexpr FLT c0[] = {7.3992041846532818E-03, 2.2998056434514028E-01, 8.5775196559356059E-01, 8.5775196559356115E-01, 2.2998056434514028E-01, 7.3992041847816166E-03}; + constexpr FLT c1[] = {2.0397684222696250E-02, 2.4277466601214742E-01, 2.6509440217151281E-01, -2.6509440217151231E-01, -2.4277466601214739E-01, -2.0397684222557694E-02}; + constexpr FLT c2[] = {2.1435449512033435E-02, 7.4190333865239946E-02, -9.5369600014193256E-02, -9.5369600014193381E-02, 7.4190333865239905E-02, 2.1435449512163876E-02}; + constexpr FLT c3[] = {1.0463664645794037E-02, -5.8671703446042224E-03, -3.4019677093840447E-02, 3.4019677093840760E-02, 5.8671703446042771E-03, -1.0463664645671082E-02}; + constexpr FLT c4[] = {1.9378826192716972E-03, -6.8365127179467735E-03, 4.7406536657957962E-03, 4.7406536657958473E-03, -6.8365127179467848E-03, 1.9378826194070377E-03}; + constexpr FLT c5[] = {-2.6471424081647417E-04, -5.6150758897069279E-04, 
2.0099203466671291E-03, -2.0099203466670359E-03, 5.6150758897070829E-04, 2.6471424094083520E-04}; + constexpr FLT c6[] = {-1.6161497824910217E-04, 2.5924418389355766E-04, -1.3917099193215483E-04, -1.3917099193211840E-04, 2.5924418389357192E-04, -1.6161497812639921E-04}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); } else if (w==7) { - constexpr FLT c0[] = {7.2950392616203362E+02, 3.6439117038309523E+04, 2.1220891582018451E+05, 3.6180058567561547E+05, 2.1220891582018466E+05, 3.6439117038309538E+04, 7.2950392617434579E+02}; - constexpr FLT c1[] = {2.2197790785452585E+03, 4.6392067080426263E+04, 1.1568051746995676E+05, -2.6471374827810822E-11, -1.1568051746995673E+05, -4.6392067080426248E+04, -2.2197790785319785E+03}; - constexpr FLT c2[] = {2.6796845075663950E+03, 2.0921129984587253E+04, 3.9399551345633640E+01, -4.7251335435527413E+04, 3.9399551345568185E+01, 2.0921129984587242E+04, 2.6796845075789138E+03}; - constexpr FLT c3[] = {1.6253748990844513E+03, 2.6138488347211651E+03, -1.0037546705421486E+04, 4.9207207296884551E-11, 1.0037546705421528E+04, -2.6138488347211514E+03, -1.6253748990726617E+03}; - constexpr FLT c4[] = {4.9106375852553407E+02, -8.6668269315415375E+02, -1.0513434716617946E+03, 2.8444456471590820E+03, -1.0513434716617835E+03, -8.6668269315414682E+02, 4.9106375853851517E+02}; - constexpr FLT c5[] = {4.0739167949763470E+01, -2.8515155742293291E+02, 3.9930326803802245E+02, 9.3897520950192402E-12, -3.9930326803800614E+02, 2.8515155742293899E+02, -4.0739167937836122E+01}; - constexpr FLT c6[] = {-1.7148987139838134E+01, 7.5799002551925454E-01, 6.3260304953181709E+01, -1.0529869309159973E+02, 6.3260304953170241E+01, 7.5799002552861849E-01, -1.7148987128070043E+01}; - constexpr FLT c7[] = {-4.5424411501048008E+00, 9.8749254058339080E+00, -9.6456179777422530E+00, 1.4220101775868667E-11, 9.6456179778363111E+00, -9.8749254058241132E+00, 4.5424411616515830E+00}; - constexpr FLT c8[] = 
{-5.0793946806705008E-02, 7.3273813711596381E-01, -2.0117140545159620E+00, 2.6999257940738310E+00, -2.0117140545257630E+00, 7.3273813712090197E-01, -5.0793935652734865E-02}; - for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + constexpr FLT c0[] = {2.0163149398992283E-03, 1.0071602557045130E-01, 5.8653557849806126E-01, 1.0000000000000002E+00, 5.8653557849806159E-01, 1.0071602557045131E-01, 2.0163149399332597E-03}; + constexpr FLT c1[] = {6.1353661835569211E-03, 1.2822551681002711E-01, 3.1973557271594344E-01, -2.1212921335912596E-17, -3.1973557271594366E-01, -1.2822551681002711E-01, -6.1353661835202118E-03}; + constexpr FLT c2[] = {7.4065234100227761E-03, 5.7825030729344404E-02, 1.0889852837592919E-04, -1.3060049459923276E-01, 1.0889852837575314E-04, 5.7825030729344355E-02, 7.4065234100573725E-03}; + constexpr FLT c3[] = {4.4924606632387705E-03, 7.2245566707421303E-03, -2.7743312484355583E-02, 1.0559644416237177E-16, 2.7743312484355832E-02, -7.2245566707420826E-03, -4.4924606632061881E-03}; + constexpr FLT c4[] = {1.3572774007773842E-03, -2.3954706749181320E-03, -2.9058644824981098E-03, 7.8619155407045772E-03, -2.9058644824980807E-03, -2.3954706749181507E-03, 1.3572774008132615E-03}; + constexpr FLT c5[] = {1.1260116639581618E-04, -7.8814564904709067E-04, 1.1036556706849172E-03, -3.0492924261508591E-17, -1.1036556706849482E-03, 7.8814564904710227E-04, -1.1260116636284763E-04}; + constexpr FLT c6[] = {-4.7399003259805808E-05, 2.0950491943152726E-06, 1.7484854214667859E-04, -2.9104069274769336E-04, 1.7484854214659272E-04, 2.0950491943114936E-06, -4.7399003227280901E-05}; + constexpr FLT c7[] = {-1.2555096177146811E-05, 2.7293834771974277E-05, -2.6660039700396876E-05, 5.1878356274645480E-17, 2.6660039700612832E-05, -2.7293834771939816E-05, 1.2555096209061404E-05}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + 
z*(c7[i]))))))); } else if (w==8) { - constexpr FLT c0[] = {1.1895823653767156E+03, 9.0980236725237002E+04, 7.7438826909537544E+05, 2.0077596413122714E+06, 2.0077596413122721E+06, 7.7438826909537590E+05, 9.0980236725237002E+04, 1.1895823653767152E+03}; - constexpr FLT c1[] = {3.9313191526977803E+03, 1.3318570706800825E+05, 5.7275848637687659E+05, 4.6250273225257988E+05, -4.6250273225258006E+05, -5.7275848637687659E+05, -1.3318570706800825E+05, -3.9313191526977798E+03}; - constexpr FLT c2[] = {5.2976026193612415E+03, 7.5628970871188474E+04, 1.0073339198368331E+05, -1.8165150843791279E+05, -1.8165150843791300E+05, 1.0073339198368324E+05, 7.5628970871188460E+04, 5.2976026193612397E+03}; - constexpr FLT c3[] = {3.7552239608473869E+03, 1.8376340228970930E+04, -2.3878081117551392E+04, -4.6296734056047753E+04, 4.6296734056048466E+04, 2.3878081117551716E+04, -1.8376340228970901E+04, -3.7552239608473869E+03}; - constexpr FLT c4[] = {1.4742862505418659E+03, 1.2842168112180084E+02, -9.1969665138397813E+03, 7.5990739935236888E+03, 7.5990739935236415E+03, -9.1969665138397813E+03, 1.2842168112182003E+02, 1.4742862505418657E+03}; - constexpr FLT c5[] = {2.8158981009344376E+02, -8.8613607108855138E+02, 5.3457145342334591E+01, 2.1750989694613118E+03, -2.1750989694611812E+03, -5.3457145342138865E+01, 8.8613607108855138E+02, -2.8158981009344376E+02}; - constexpr FLT c6[] = {-1.4786862436220549E+00, -1.3935442261829297E+02, 3.2599325739090762E+02, -1.9541889343354751E+02, -1.9541889343356968E+02, 3.2599325739086612E+02, -1.3935442261828183E+02, -1.4786862436238759E+00}; - constexpr FLT c7[] = {-1.1542034522900533E+01, 1.2000512051415985E+01, 1.9687328710253290E+01, -6.3962883082497100E+01, 6.3962883082831397E+01, -1.9687328710065113E+01, -1.2000512051397745E+01, 1.1542034522901620E+01}; - constexpr FLT c8[] = {-1.7448292513541994E+00, 4.8577330433876664E+00, -6.8794163043749101E+00, 3.4611708986529197E+00, 3.4611708984979552E+00, -6.8794163042722616E+00, 4.8577330434089125E+00, 
-1.7448292513539221E+00}; - constexpr FLT c9[] = {1.5044951479000782E-01, 9.6230159355094672E-02, -7.0399250408500635E-01, 1.3251401130885254E+00, -1.3251401130188682E+00, 7.0399250409661596E-01, -9.6230159344936325E-02, -1.5044951478914617E-01}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + constexpr FLT c0[] = {5.2827275612461462E-04, 4.0402734444109238E-02, 3.4389230803369686E-01, 8.9161099745784866E-01, 8.9161099745784866E-01, 3.4389230803369708E-01, 4.0402734444109252E-02, 5.2827275612461408E-04}; + constexpr FLT c1[] = {1.7458301875074096E-03, 5.9145446836664541E-02, 2.5435204236257858E-01, 2.0538938722823222E-01, -2.0538938722823233E-01, -2.5435204236257858E-01, -5.9145446836664547E-02, -1.7458301875074094E-03}; + constexpr FLT c2[] = {2.3525728171808306E-03, 3.3585505340219701E-02, 4.4733940386002209E-02, -8.0668262921248624E-02, -8.0668262921248748E-02, 4.4733940386002119E-02, 3.3585505340219687E-02, 2.3525728171808311E-03}; + constexpr FLT c3[] = {1.6676293877589678E-03, 8.1606118103203940E-03, -1.0603838868224419E-02, -2.0559571166483725E-02, 2.0559571166484002E-02, 1.0603838868224510E-02, -8.1606118103203749E-03, -1.6676293877589678E-03}; + constexpr FLT c4[] = {6.5470478006265378E-04, 5.7029826102775656E-05, -4.0842122325118182E-03, 3.3746160664395084E-03, 3.3746160664396086E-03, -4.0842122325118321E-03, 5.7029826102778678E-05, 6.5470478006265432E-04}; + constexpr FLT c5[] = {1.2504911757628686E-04, -3.9351755557266000E-04, 2.3739384784447216E-05, 9.6592347103022203E-04, -9.6592347103013649E-04, -2.3739384784439440E-05, 3.9351755557266586E-04, -1.2504911757628702E-04}; + constexpr FLT c6[] = {-6.5665874015798238E-07, -6.1884865695206891E-05, 1.4476791315356577E-04, -8.6782118193344350E-05, -8.6782118193318939E-05, 1.4476791315358196E-04, -6.1884865695214169E-05, -6.5665874015806602E-07}; + constexpr FLT c7[] = {-5.1256159860509675E-06, 
5.3292178505898186E-06, 8.7427989025457230E-06, -2.8404799465047339E-05, 2.8404799465135336E-05, -8.7427989024875505E-06, -5.3292178505782125E-06, 5.1256159860509675E-06}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); } else if (w==9) { - constexpr FLT c0[] = {1.8793738965777031E+03, 2.1220891582018440E+05, 2.5233246441351655E+06, 9.2877384983420707E+06, 1.4015330434461467E+07, 9.2877384983420800E+06, 2.5233246441351655E+06, 2.1220891582018536E+05, 1.8793738965777065E+03}; - constexpr FLT c1[] = {6.6675066501609354E+03, 3.4704155240987014E+05, 2.2890184838322564E+06, 3.8705035445351237E+06, 1.1717532248112299E-10, -3.8705035445351265E+06, -2.2890184838322559E+06, -3.4704155240987102E+05, -6.6675066501609354E+03}; - constexpr FLT c2[] = {9.8412775404612330E+03, 2.3171563090202375E+05, 6.8167589492092282E+05, -2.1140963571671949E+05, -1.4236515118873832E+06, -2.1140963571672430E+05, 6.8167589492092212E+05, 2.3171563090202416E+05, 9.8412775404612275E+03}; - constexpr FLT c3[] = {7.8762358364031061E+03, 7.6500585979636191E+04, 1.2434778984075345E+04, -2.8572091469429957E+05, 1.1900185890455270E-09, 2.8572091469430370E+05, -1.2434778984074723E+04, -7.6500585979636191E+04, -7.8762358364031033E+03}; - constexpr FLT c4[] = {3.6941911906762075E+03, 9.9232929169976032E+03, -3.3472877669901907E+04, -1.4082384858050133E+04, 6.7911966136974472E+04, -1.4082384858045889E+04, -3.3472877669901856E+04, 9.9232929169977433E+03, 3.6941911906762098E+03}; - constexpr FLT c5[] = {9.8900189723050323E+02, -1.2736589324621348E+03, -5.0407308390125609E+03, 9.8914296140178049E+03, 6.1223023135982708E-10, -9.8914296140230235E+03, 5.0407308390128219E+03, 1.2736589324621673E+03, -9.8900189723050403E+02}; - constexpr FLT c6[] = {1.1165868717716108E+02, -5.9057035448559543E+02, 5.5860705835625356E+02, 9.1996097522935008E+02, -2.0290255886368843E+03, 9.1996097522906575E+02, 5.5860705835607132E+02, 
-5.9057035448565603E+02, 1.1165868717715755E+02}; - constexpr FLT c7[] = {-1.3142584300867490E+01, -4.2852762793261455E+01, 1.8188640945803897E+02, -2.1362000457586478E+02, 1.1194928851903786E-10, 2.1362000457739751E+02, -1.8188640945787162E+02, 4.2852762793424958E+01, 1.3142584300868396E+01}; - constexpr FLT c8[] = {-5.8088068374876212E+00, 1.0201832931297655E+01, -3.5220973552653217E-01, -2.6632420897260161E+01, 4.2737607183076172E+01, -2.6632420895005694E+01, -3.5220973526763744E-01, 1.0201832931314263E+01, -5.8088068374874551E+00}; - constexpr FLT c9[] = {-4.0642645973149144E-01, 1.8389772328590479E+00, -3.5549484956004700E+00, 3.2273562224626624E+00, 2.3066481718890602E-10, -3.2273562263634674E+00, 3.5549484956933464E+00, -1.8389772328126097E+00, 4.0642645973247782E-01}; - for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + constexpr FLT c0[] = {1.3409415535124456E-04, 1.5141199617983757E-02, 1.8004032483820079E-01, 6.6268423293859657E-01, 1.0000000000000004E+00, 6.6268423293859746E-01, 1.8004032483820084E-01, 1.5141199617983828E-02, 1.3409415535124450E-04}; + constexpr FLT c1[] = {4.7572953640583401E-04, 2.4761567630011042E-02, 1.6332247709293549E-01, 2.7616213278983226E-01, -4.2425842671825223E-17, -2.7616213278983237E-01, -1.6332247709293549E-01, -2.4761567630011111E-02, -4.7572953640583401E-04}; + constexpr FLT c2[] = {7.0217948741779855E-04, 1.6533012331430421E-02, 4.8637875368588490E-02, -1.5084170630533007E-02, -1.0157816246606997E-01, -1.5084170630533338E-02, 4.8637875368588449E-02, 1.6533012331430445E-02, 7.0217948741779833E-04}; + constexpr FLT c3[] = {5.6197289626769645E-04, 5.4583505067803007E-03, 8.8722695781044485E-04, -2.0386313118366230E-02, 1.4346537772579219E-16, 2.0386313118366597E-02, -8.8722695781040203E-04, -5.4583505067802999E-03, -5.6197289626769645E-04}; + constexpr FLT c4[] = {2.6358216867957524E-04, 7.0803132065997147E-04, 
-2.3883045659485441E-03, -1.0047843626593360E-03, 4.8455486978739078E-03, -1.0047843626590051E-03, -2.3883045659485362E-03, 7.0803132065996898E-04, 2.6358216867957530E-04}; + constexpr FLT c5[] = {7.0565721004957831E-05, -9.0876125855045856E-05, -3.5965836571493702E-04, 7.0575785995728897E-04, 5.6006957738110937E-17, -7.0575785995746006E-04, 3.5965836571493702E-04, 9.0876125855046818E-05, -7.0565721004957980E-05}; + constexpr FLT c6[] = {7.9668965137354764E-06, -4.2137454928171943E-05, 3.9856859670063718E-05, 6.5639620808911507E-05, -1.4477186949841611E-04, 6.5639620808762402E-05, 3.9856859670072629E-05, -4.2137454928186349E-05, 7.9668965137352681E-06}; + constexpr FLT c7[] = {-9.3772917893888351E-07, -3.0575635011675480E-06, 1.2977675432514170E-05, -1.5241881422267232E-05, 5.6444540850624641E-17, 1.5241881422464882E-05, -1.2977675432482811E-05, 3.0575635011824812E-06, 9.3772917893893782E-07}; + constexpr FLT c8[] = {-4.1446092652958961E-07, 7.2790527337844100E-07, -2.5130319764268858E-08, -1.9002349621010172E-06, 3.0493470976000790E-06, -1.9002349619116138E-06, -2.5130319761051126E-08, 7.2790527337217009E-07, -4.1446092652952507E-07}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); } else if (w==10) { - constexpr FLT c0[] = {2.8923571298063644E+03, 4.6856831608341972E+05, 7.5304732752870098E+06, 3.7576537584215805E+07, 7.9591606307847947E+07, 7.9591606307847947E+07, 3.7576537584215775E+07, 7.5304732752870088E+06, 4.6856831608341815E+05, 2.8923571298063584E+03}; - constexpr FLT c1[] = {1.0919387804943195E+04, 8.3976685277206486E+05, 7.9494027659552386E+06, 2.1606786285174560E+07, 1.4625897641453253E+07, -1.4625897641453268E+07, -2.1606786285174556E+07, -7.9494027659552386E+06, -8.3976685277206241E+05, -1.0919387804943173E+04}; - constexpr FLT c2[] = {1.7418455635504146E+04, 6.3489952164419868E+05, 3.1358985409389907E+06, 2.2547438801903715E+06, -6.0429762783920690E+06, 
-6.0429762783920504E+06, 2.2547438801903636E+06, 3.1358985409389869E+06, 6.3489952164419682E+05, 1.7418455635504106E+04}; - constexpr FLT c3[] = {1.5396188098732166E+04, 2.5490607173283477E+05, 4.2818880748176732E+05, -9.5435463094349112E+05, -1.2004850139039194E+06, 1.2004850139039543E+06, 9.5435463094349764E+05, -4.2818880748176464E+05, -2.5490607173283392E+05, -1.5396188098732144E+04}; - constexpr FLT c4[] = {8.2616700456447434E+03, 5.2880641964112423E+04, -6.1165055141129313E+04, -2.1590299490710214E+05, 2.1595822052158226E+05, 2.1595822052158433E+05, -2.1590299490713206E+05, -6.1165055141130644E+04, 5.2880641964112234E+04, 8.2616700456447343E+03}; - constexpr FLT c5[] = {2.7267169079066489E+03, 2.4572549134030178E+03, -2.6065821571076271E+04, 1.3919259807562572E+04, 4.6802084705703302E+04, -4.6802084705714791E+04, -1.3919259807544826E+04, 2.6065821571078101E+04, -2.4572549134029523E+03, -2.7267169079066462E+03}; - constexpr FLT c6[] = {5.0402062537834655E+02, -1.3640153425625094E+03, -1.4063198459010243E+03, 7.0858129627832977E+03, -4.8375233777539070E+03, -4.8375233777688618E+03, 7.0858129627894568E+03, -1.4063198459013925E+03, -1.3640153425628407E+03, 5.0402062537833399E+02}; - constexpr FLT c7[] = {2.4199726682552246E+01, -2.8393731159230907E+02, 5.1652001352658374E+02, 7.4578914842690025E+01, -1.1556759026394043E+03, 1.1556759026669868E+03, -7.4578914836335017E+01, -5.1652001352477316E+02, 2.8393731159271266E+02, -2.4199726682540764E+01}; - constexpr FLT c8[] = {-1.0545675122358718E+01, -3.0306758891736707E+00, 7.2305523762002423E+01, -1.3808908570315674E+02, 7.6293213390392353E+01, 7.6293213419941608E+01, -1.3808908572000124E+02, 7.2305523762424571E+01, -3.0306758892308885E+00, -1.0545675122367939E+01}; - constexpr FLT c9[] = {-2.1836930570445361E+00, 5.4992367507340179E+00, -4.5624617242018264E+00, -6.6492709812433128E+00, 2.0339240340948546E+01, -2.0339240355994509E+01, 6.6492709998185751E+00, 4.5624617253163429E+00, -5.4992367508385041E+00, 
2.1836930570532433E+00}; - constexpr FLT c10[] = {-9.1748741454156318E-02, 5.2562451749078731E-01, -1.4144257942386596E+00, 1.8629579002072614E+00, -9.0169873685258095E-01, -9.0169875903814667E-01, 1.8629579050577161E+00, -1.4144257935638165E+00, 5.2562451754351402E-01, -9.1748741461736935E-02}; - for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + constexpr FLT c0[] = {3.3157481538170295E-05, 5.3715860775974443E-03, 8.6328042282845782E-02, 4.3077092326437988E-01, 9.1242439930731112E-01, 9.1242439930731112E-01, 4.3077092326437971E-01, 8.6328042282845754E-02, 5.3715860775974227E-03, 3.3157481538170322E-05}; + constexpr FLT c1[] = {1.2517797191066981E-04, 9.6269418565961412E-03, 9.1130577457178452E-02, 2.4769645835465362E-01, 1.6766875916810517E-01, -1.6766875916810536E-01, -2.4769645835465354E-01, -9.1130577457178424E-02, -9.6269418565961117E-03, -1.2517797191066951E-04}; + constexpr FLT c2[] = {1.9968216068682153E-04, 7.2783782301876591E-03, 3.5949398124193940E-02, 2.5847993600195553E-02, -6.9275634160640490E-02, -6.9275634160640504E-02, 2.5847993600195445E-02, 3.5949398124193913E-02, 7.2783782301876375E-03, 1.9968216068682094E-04}; + constexpr FLT c3[] = {1.7649923565147242E-04, 2.9221990881931090E-03, 4.9086823797165058E-03, -1.0940556313145914E-02, -1.3762152424114656E-02, 1.3762152424114910E-02, 1.0940556313146081E-02, -4.9086823797164919E-03, -2.9221990881930998E-03, -1.7649923565147204E-04}; + constexpr FLT c4[] = {9.4710355505531920E-05, 6.0621452710061727E-04, -7.0118560592788729E-04, -2.4750745659639179E-03, 2.4757076628501668E-03, 2.4757076628502063E-03, -2.4750745659640264E-03, -7.0118560592788274E-04, 6.0621452710061163E-04, 9.4710355505531771E-05}; + constexpr FLT c5[] = {3.1258610702677804E-05, 2.8169545035126350E-05, -2.9881406711974808E-04, 1.5956798534243302E-04, 5.3653099874326161E-04, -5.3653099874339388E-04, -1.5956798534226972E-04, 
2.9881406711975192E-04, -2.8169545035121488E-05, -3.1258610702677743E-05}; + constexpr FLT c6[] = {5.7780052154065432E-06, -1.5636835808661990E-05, -1.6121807313036067E-05, 8.1230533420465018E-05, -5.5456530742754838E-05, -5.5456530742851827E-05, 8.1230533420445272E-05, -1.6121807313045130E-05, -1.5636835808665131E-05, 5.7780052154064593E-06}; + constexpr FLT c7[] = {2.7742147829406768E-07, -3.2550081973304980E-06, 5.9212960378031332E-06, 8.5495977199682674E-07, -1.3248468528032551E-05, 1.3248468528215217E-05, -8.5495977185729702E-07, -5.9212960377964950E-06, 3.2550081973313239E-06, -2.7742147829400097E-07}; + constexpr FLT c8[] = {-1.2089379439825852E-07, -3.4743143855784781E-08, 8.2889801006379481E-07, -1.5830293785226849E-06, 8.7461219388985494E-07, 8.7461219397529632E-07, -1.5830293786451511E-06, 8.2889801008534534E-07, -3.4743143855462353E-08, -1.2089379439833804E-07}; + constexpr FLT c9[] = {-2.5033479260872450E-08, 6.3042298326687954E-08, -5.2303271559903752E-08, -7.6226091757998386E-08, 2.3316553102767969E-07, -2.3316553111902137E-07, 7.6226091879787297E-08, 5.2303271554367896E-08, -6.3042298324957995E-08, 2.5033479260965031E-08}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==11) { - constexpr FLT c0[] = {4.3537972057094375E+03, 9.8872306817881158E+05, 2.0938056062983297E+07, 1.3701428307175839E+08, 3.8828289972017384E+08, 5.4292197128519225E+08, 3.8828289972017366E+08, 1.3701428307175839E+08, 2.0938056062983308E+07, 9.8872306817881158E+05, 4.3537972057093921E+03}; - constexpr FLT c1[] = {1.7371472778611500E+04, 1.9155790709433779E+06, 2.4914432724618737E+07, 9.7792160665338382E+07, 1.3126779387874995E+08, -1.1645321713027108E-08, -1.3126779387875001E+08, -9.7792160665338382E+07, -2.4914432724618725E+07, -1.9155790709433777E+06, -1.7371472778611380E+04}; - constexpr FLT c2[] = {2.9650558537745463E+04, 1.6014973065836846E+06, 
1.1867448782239098E+07, 2.0812212822540630E+07, -1.1749875870571045E+07, -4.5121922350041404E+07, -1.1749875870570999E+07, 2.0812212822540656E+07, 1.1867448782239093E+07, 1.6014973065836844E+06, 2.9650558537745292E+04}; - constexpr FLT c3[] = {2.8505604980264405E+04, 7.4166660874053370E+05, 2.5711466441825363E+06, -1.2146931938153724E+06, -8.3931576510115806E+06, 5.8947555067017928E-08, 8.3931576510117110E+06, 1.2146931938154269E+06, -2.5711466441825293E+06, -7.4166660874053300E+05, -2.8505604980264299E+04}; - constexpr FLT c4[] = {1.7045632829988484E+04, 1.9785834209758099E+05, 8.6361403553703407E+04, -1.0584472412325807E+06, -1.3367486018954750E+05, 1.7818009619468113E+06, -1.3367486018952320E+05, -1.0584472412325810E+06, 8.6361403553705750E+04, 1.9785834209758116E+05, 1.7045632829988426E+04}; - constexpr FLT c5[] = {6.5462464716912891E+03, 2.5347576368078731E+04, -7.5810878908802741E+04, -8.0774039751698409E+04, 2.5492801112953416E+05, 3.1373949311406158E-08, -2.5492801112952997E+05, 8.0774039751677527E+04, 7.5810878908807950E+04, -2.5347576368078797E+04, -6.5462464716912691E+03}; - constexpr FLT c6[] = {1.5684149291082226E+03, -1.0302687059850266E+03, -1.3446845770824604E+04, 2.0814393480318489E+04, 1.4366994276506950E+04, -4.4581342385966971E+04, 1.4366994276487216E+04, 2.0814393480327166E+04, -1.3446845770825106E+04, -1.0302687059851414E+03, 1.5684149291082156E+03}; - constexpr FLT c7[] = {1.9398419323286674E+02, -8.7329293867233980E+02, 2.4796533428845552E+02, 3.2905701326708659E+03, -4.8989871768521243E+03, 2.5910474731743909E-08, 4.8989871768931434E+03, -3.2905701326280059E+03, -2.4796533428623073E+02, 8.7329293867272952E+02, -1.9398419323288715E+02}; - constexpr FLT c8[] = {-4.2288232505094108E+00, -9.9574929618070513E+01, 2.9563077145679659E+02, -1.9453049353627330E+02, -4.0107401575324394E+02, 7.9532514191794951E+02, -4.0107401576649818E+02, -1.9453049352309569E+02, 2.9563077145970482E+02, -9.9574929617658114E+01, -4.2288232504962613E+00}; - constexpr 
FLT c9[] = {-5.3741131162116726E+00, 5.5350606001924518E+00, 1.9153744596147146E+01, -6.3189447496716646E+01, 6.6921287671707859E+01, -1.3450045688823196E-08, -6.6921287609294978E+01, 6.3189447455108059E+01, -1.9153744593546609E+01, -5.5350606002853286E+00, 5.3741131162113103E+00}; - constexpr FLT c10[] = {-7.0359426507051681E-01, 2.2229112760631806E+00, -3.2054079730741187E+00, 8.3392535011476268E-02, 6.8879260445103929E+00, -1.0795498350223303E+01, 6.8879260559828390E+00, 8.3392524213879743E-02, -3.2054079670004838E+00, 2.2229112761686296E+00, -7.0359426507381639E-01}; - constexpr FLT c11[] = {5.2648094862911970E-02, 9.9912561370710071E-02, -4.3913938793989010E-01, 7.9792986880755179E-01, -6.9191820607752896E-01, -3.1086723020887482E-08, 6.9191819251103082E-01, -7.9792986253876474E-01, 4.3913938485313375E-01, -9.9912561580306161E-02, -5.2648094876606648E-02}; - for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + constexpr FLT c0[] = {8.0191950887587638E-06, 1.8211144887695905E-03, 3.8565497751765702E-02, 2.5236459439543663E-01, 7.1517256669690443E-01, 1.0000000000000002E+00, 7.1517256669690443E-01, 2.5236459439543651E-01, 3.8565497751765723E-02, 1.8211144887695927E-03, 8.0191950887586707E-06}; + constexpr FLT c1[] = {3.1996260415636073E-05, 3.5282769389657661E-03, 4.5889527487056492E-02, 1.8012194355267480E-01, 2.4178022040260394E-01, 2.1212921335912587E-17, -2.4178022040260411E-01, -1.8012194355267488E-01, -4.5889527487056492E-02, -3.5282769389657648E-03, -3.1996260415635850E-05}; + constexpr FLT c2[] = {5.4612928019025183E-05, 2.9497743530118290E-03, 2.1858479505161201E-02, 3.8333708936616528E-02, -2.1641923687039297E-02, -8.3109405654057292E-02, -2.1641923687039287E-02, 3.8333708936616487E-02, 2.1858479505161187E-02, 2.9497743530118290E-03, 5.4612928019024885E-05}; + constexpr FLT c3[] = {5.2504054888010150E-05, 1.3660648269306127E-03, 
4.7357572177382694E-03, -2.2373255422688926E-03, -1.5459233729560824E-02, -3.0584997651941540E-18, 1.5459233729561050E-02, 2.2373255422689746E-03, -4.7357572177382599E-03, -1.3660648269306129E-03, -5.2504054888009953E-05}; + constexpr FLT c4[] = {3.1396100602888584E-05, 3.6443237253636144E-04, 1.5906780001786821E-04, -1.9495384184342716E-03, -2.4621376046556434E-04, 3.2818730060399505E-03, -2.4621376046541547E-04, -1.9495384184342974E-03, 1.5906780001787157E-04, 3.6443237253636144E-04, 3.1396100602888483E-05}; + constexpr FLT c5[] = {1.2057435171015750E-05, 4.6687328398363315E-05, -1.3963494372747466E-04, -1.4877651674418741E-04, 4.6954815721697059E-04, 7.1576260535837041E-17, -4.6954815721696283E-04, 1.4877651674414852E-04, 1.3963494372747659E-04, -4.6687328398363071E-05, -1.2057435171015728E-05}; + constexpr FLT c6[] = {2.8888404081262488E-06, -1.8976367884800935E-06, -2.4767547607257735E-05, 3.8337725458133611E-05, 2.6462355617055980E-05, -8.2113719362939881E-05, 2.6462355617066876E-05, 3.8337725458138978E-05, -2.4767547607262269E-05, -1.8976367884805327E-06, 2.8888404081262340E-06}; + constexpr FLT c7[] = {3.5729663467786725E-07, -1.6085054296206689E-06, 4.5672370507959851E-07, 6.0608527683273524E-06, -9.0233724844644286E-06, -4.5070818825954386E-17, 9.0233724845159214E-06, -6.0608527682667218E-06, -4.5672370507254818E-07, 1.6085054296207723E-06, -3.5729663467788907E-07}; + constexpr FLT c8[] = {-7.7890073973236871E-09, -1.8340559948709468E-07, 5.4451797328971916E-07, -3.5830285713854766E-07, -7.3873233537913819E-07, 1.4648976903075259E-06, -7.3873233536710514E-07, -3.5830285713236262E-07, 5.4451797329704790E-07, -1.8340559948689703E-07, -7.7890073973081013E-09}; + constexpr FLT c9[] = {-9.8984999695252047E-09, 1.0194946774280524E-08, 3.5279000677512062E-08, -1.1638771469313311E-07, 1.2326133617211816E-07, -2.5669371006274292E-17, -1.2326133615551060E-07, 1.1638771463500659E-07, -3.5279000676820083E-08, -1.0194946774410270E-08, 9.8984999695130418E-09}; + for 
(int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==12) { - constexpr FLT c0[] = {6.4299692685485479E+03, 2.0077596413122746E+06, 5.4904521978991687E+07, 4.5946106674819386E+08, 1.6835469840840111E+09, 3.1308386544851584E+09, 3.1308386544851594E+09, 1.6835469840840116E+09, 4.5946106674819499E+08, 5.4904521978991836E+07, 2.0077596413122742E+06, 6.4299692685634491E+03}; - constexpr FLT c1[] = {2.6965848540274084E+04, 4.1625245902732192E+06, 7.2097002594596982E+07, 3.8505085985474664E+08, 7.9479013671674263E+08, 4.7870231281824070E+08, -4.7870231281824070E+08, -7.9479013671674287E+08, -3.8505085985474682E+08, -7.2097002594597101E+07, -4.1625245902732182E+06, -2.6965848540258085E+04}; - constexpr FLT c2[] = {4.8869694409905118E+04, 3.7863371066322499E+06, 3.9530526716552719E+07, 1.1475134266581047E+08, 4.6311261797931008E+07, -2.0442837194260687E+08, -2.0442837194260764E+08, 4.6311261797930703E+07, 1.1475134266581020E+08, 3.9530526716552772E+07, 3.7863371066322499E+06, 4.8869694409920470E+04}; - constexpr FLT c3[] = {5.0530564260114013E+04, 1.9615784087727305E+06, 1.1044597342441026E+07, 7.9812418612436997E+06, -3.4042228324588403E+07, -3.3301805987927672E+07, 3.3301805987928241E+07, 3.4042228324588865E+07, -7.9812418612435153E+06, -1.1044597342440989E+07, -1.9615784087727298E+06, -5.0530564260099913E+04}; - constexpr FLT c4[] = {3.3081876469965486E+04, 6.2011956881368393E+05, 1.3086001239863783E+06, -3.1165484297367223E+06, -5.1982996003441429E+06, 6.3530947749620415E+06, 6.3530947749622557E+06, -5.1982996003440823E+06, -3.1165484297365877E+06, 1.3086001239863841E+06, 6.2011956881368428E+05, 3.3081876469981347E+04}; - constexpr FLT c5[] = {1.4308966168506786E+04, 1.1375573205951968E+05, -1.0318195403423737E+05, -6.6892418721464148E+05, 5.9223570255464804E+05, 1.1093685152670993E+06, -1.1093685152665814E+06, -5.9223570255454781E+05, 6.6892418721485860E+05, 
1.0318195403423111E+05, -1.1375573205951942E+05, -1.4308966168492359E+04}; - constexpr FLT c6[] = {4.0848961919701046E+03, 7.5033277163530902E+03, -5.2578904182708357E+04, 6.3431596330007251E+03, 1.5984798504282974E+05, -1.2521363434086266E+05, -1.2521363434064612E+05, 1.5984798504277965E+05, 6.3431596327688303E+03, -5.2578904182719976E+04, 7.5033277163531166E+03, 4.0848961919843532E+03}; - constexpr FLT c7[] = {7.1658797373677851E+02, -1.5499947984091114E+03, -4.5490740453145772E+03, 1.4520122796449663E+04, -3.7896465827621914E+03, -2.3597107892496744E+04, 2.3597107892730306E+04, 3.7896465829102508E+03, -1.4520122796250829E+04, 4.5490740453377412E+03, 1.5499947984094479E+03, -7.1658797372277252E+02}; - constexpr FLT c8[] = {5.2022749592536726E+01, -4.0624258132612465E+02, 5.2256582979411519E+02, 9.3282469962228390E+02, -2.8710622268636553E+03, 1.7594166900407929E+03, 1.7594166904608542E+03, -2.8710622266536416E+03, 9.3282469976057041E+02, 5.2256582978430436E+02, -4.0624258132566132E+02, 5.2022749606076808E+01}; - constexpr FLT c9[] = {-7.0341875498933257E+00, -2.3043166228613529E+01, 1.2279331781902621E+02, -1.6714687552668008E+02, -4.4746498567249184E+01, 3.6060905998808425E+02, -3.6060905975626497E+02, 4.4746498638578188E+01, 1.6714687551479193E+02, -1.2279331779450688E+02, 2.3043166229077912E+01, 7.0341875614883520E+00}; - constexpr FLT c10[] = {-2.1556100132578342E+00, 4.1361104015055048E+00, 1.8107701824759481E+00, -2.1223400283067541E+01, 3.5820961921268712E+01, -1.8782945757357222E+01, -1.8782945295761856E+01, 3.5820961970532480E+01, -2.1223400227730256E+01, 1.8107701446846367E+00, 4.1361104022646886E+00, -2.1556100021360516E+00}; - constexpr FLT c11[] = {-1.1440899376747989E-01, 7.0567641591059616E-01, -1.4530217944402339E+00, 1.0571984630250064E+00, 1.4389000408734942E+00, -4.2241734506571262E+00, 4.2241732732256922E+00, -1.4389001658681779E+00, -1.0571984849752754E+00, 1.4530218273656557E+00, -7.0567641625357191E-01, 1.1440900438178589E-01}; - constexpr 
FLT c12[] = {-1.4486009664532199E-02, 2.9387825785133236E-03, -1.0265970208873806E-01, 2.6748270027876714E-01, -3.3606433030575705E-01, 1.5850134054436241E-01, 1.5850148084990595E-01, -3.3606430399846576E-01, 2.6748282743067825E-01, -1.0265974511212309E-01, 2.9387825100049524E-03, -1.4486000362352570E-02}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + constexpr FLT c0[] = {1.9028495068410023E-06, 5.9416527261081913E-04, 1.6248140264385581E-02, 1.3597036436097915E-01, 4.9821957378204840E-01, 9.2652305802242962E-01, 9.2652305802242962E-01, 4.9821957378204840E-01, 1.3597036436097937E-01, 1.6248140264385626E-02, 5.9416527261081924E-04, 1.9028495068454171E-06}; + constexpr FLT c1[] = {7.9801239249145923E-06, 1.2318344820958854E-03, 2.1335987794357199E-02, 1.1394981969310448E-01, 2.3520579283187484E-01, 1.4166451219687695E-01, -1.4166451219687687E-01, -2.3520579283187476E-01, -1.1394981969310460E-01, -2.1335987794357230E-02, -1.2318344820958847E-03, -7.9801239249098540E-06}; + constexpr FLT c2[] = {1.4462226804444730E-05, 1.1205076408888257E-03, 1.1698445222077612E-02, 3.3958877046121660E-02, 1.3705098421608795E-02, -6.0497400607811481E-02, -6.0497400607811579E-02, 1.3705098421608806E-02, 3.3958877046121591E-02, 1.1698445222077622E-02, 1.1205076408888255E-03, 1.4462226804449267E-05}; + constexpr FLT c3[] = {1.4953735432776090E-05, 5.8049865432805142E-04, 3.2684769908807722E-03, 2.3619245295514353E-03, -1.0074268581043095E-02, -9.8551520939611746E-03, 9.8551520939615059E-03, 1.0074268581043251E-02, -2.3619245295513252E-03, -3.2684769908807648E-03, -5.8049865432805098E-04, -1.4953735432771914E-05}; + constexpr FLT c4[] = {9.7900673700200676E-06, 1.8351475200221906E-04, 3.8725987583789238E-04, -9.2229408802588448E-04, -1.5383560041742387E-03, 1.8800996948122926E-03, 1.8800996948123033E-03, -1.5383560041742409E-03, 
-9.2229408802591614E-04, 3.8725987583789064E-04, 1.8351475200221903E-04, 9.7900673700247601E-06}; + constexpr FLT c5[] = {4.2345162286123928E-06, 3.3664241555334181E-05, -3.0535096226552352E-05, -1.9795772057290591E-04, 1.7526295499606013E-04, 3.2830037656743561E-04, -3.2830037656734232E-04, -1.7526295499599014E-04, 1.9795772057292925E-04, 3.0535096226555273E-05, -3.3664241555334181E-05, -4.2345162286081255E-06}; + constexpr FLT c6[] = {1.2088615636792351E-06, 2.2204932634073669E-06, -1.5559909809157569E-05, 1.8771595438708362E-06, 4.7304527720902187E-05, -3.7055029721502823E-05, -3.7055029721506354E-05, 4.7304527720948991E-05, 1.8771595438366184E-06, -1.5559909809165219E-05, 2.2204932634074313E-06, 1.2088615636834544E-06}; + constexpr FLT c7[] = {2.1206307767331379E-07, -4.5869687934383747E-07, -1.3462277877507893E-06, 4.2970047520348418E-06, -1.1214870287581008E-06, -6.9831974682071699E-06, 6.9831974683366982E-06, 1.1214870288087690E-06, -4.2970047519748465E-06, 1.3462277877599186E-06, 4.5869687934394192E-07, -2.1206307766917122E-07}; + constexpr FLT c8[] = {1.5395324498807062E-08, -1.2022118042093087E-07, 1.5464523856613661E-07, 2.7605497716337475E-07, -8.4964626033234966E-07, 5.2067203458077506E-07, 5.2067203461734952E-07, -8.4964626032018743E-07, 2.7605497716040193E-07, 1.5464523856098652E-07, -1.2022118042095769E-07, 1.5395324502815322E-08}; + constexpr FLT c9[] = {-2.0816585198648028E-09, -6.8192670389370156E-09, 3.6338774649049193E-08, -4.9464520974759579E-08, -1.3242031035521981E-08, 1.0671664854533778E-07, -1.0671664854533778E-07, 1.3242031024450263E-08, 4.9464520977527511E-08, -3.6338774639015446E-08, 6.8192670391856967E-09, 2.0816585232951501E-09}; + constexpr FLT c10[] = {-6.3791929313390708E-10, 1.2240176132927394E-09, 5.3586930472778203E-10, -6.2807355748408205E-09, 1.0600657362033408E-08, -5.5585207892891946E-09, -5.5585208232281016E-09, 1.0600657414513137E-08, -6.2807355547288652E-09, 5.3586929184356377E-10, 1.2240176133909372E-09, 
-6.3791928984134277E-10}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==13) { - constexpr FLT c0[] = {9.3397060605267925E+03, 3.9447202186643188E+06, 1.3701428307175836E+08, 1.4375660883001420E+09, 6.6384519128895750E+09, 1.5848048271166540E+10, 2.1031560281976685E+10, 1.5848048271166515E+10, 6.6384519128895721E+09, 1.4375660883001390E+09, 1.3701428307175830E+08, 3.9447202186642904E+06, 9.3397060605267870E+03}; - constexpr FLT c1[] = {4.0984512931817779E+04, 8.6828943763566837E+06, 1.9558432133067667E+08, 1.3674961320373521E+09, 3.9251291128182445E+09, 4.5116631434426517E+09, -5.2784645410468957E-07, -4.5116631434426460E+09, -3.9251291128182430E+09, -1.3674961320373495E+09, -1.9558432133067659E+08, -8.6828943763566315E+06, -4.0984512931817771E+04}; - constexpr FLT c2[] = {7.8379538318778941E+04, 8.4928073133582622E+06, 1.1992091153966446E+08, 5.0561697705436689E+08, 6.1845897311594033E+08, -5.1306326495404607E+08, -1.4790096327029381E+09, -5.1306326495404249E+08, 6.1845897311593974E+08, 5.0561697705436635E+08, 1.1992091153966436E+08, 8.4928073133582175E+06, 7.8379538318778941E+04}; - constexpr FLT c3[] = {8.6417670227040027E+04, 4.8250267333349725E+06, 3.9836803808039062E+07, 7.5026052902191281E+07, -7.7565422849559024E+07, -2.5393835488011667E+08, 3.3249826368607219E-06, 2.5393835488012213E+08, 7.7565422849558040E+07, -7.5026052902191922E+07, -3.9836803808038987E+07, -4.8250267333349492E+06, -8.6417670227040042E+04}; - constexpr FLT c4[] = {6.1161604972829395E+04, 1.7331203720075563E+06, 7.0216196997559210E+06, -3.6027138646115125E+06, -3.1775875626363419E+07, 1.6544480876799976E+06, 4.9816566960117713E+07, 1.6544480876825110E+06, -3.1775875626362957E+07, -3.6027138646109658E+06, 7.0216196997559462E+06, 1.7331203720075507E+06, 6.1161604972829424E+04}; - constexpr FLT c5[] = {2.9177164557155927E+04, 3.9318079134661297E+05, 
3.1307448297762702E+05, -2.7571366584958737E+06, -9.8421840747392213E+05, 6.8469173866723683E+06, 2.8271164666996988E-07, -6.8469173866687613E+06, 9.8421840747752984E+05, 2.7571366584952055E+06, -3.1307448297760193E+05, -3.9318079134661169E+05, -2.9177164557155942E+04}; - constexpr FLT c6[] = {9.5097815505886592E+03, 4.8799940773717601E+04, -1.2734023162442955E+05, -2.5472337176560360E+05, 6.3596049196317361E+05, 2.2361868201724227E+05, -1.0716559939672153E+06, 2.2361868202200226E+05, 6.3596049196156661E+05, -2.5472337176510989E+05, -1.2734023162441404E+05, 4.8799940773715760E+04, 9.5097815505886429E+03}; - constexpr FLT c7[] = {2.0601715730545525E+03, 1.9365931141588459E+02, -2.5304303117500138E+04, 2.9151392447016315E+04, 5.9055020355996137E+04, -1.1784846181768291E+05, 2.6154044742765007E-06, 1.1784846181457305E+05, -5.9055020356659290E+04, -2.9151392447180453E+04, 2.5304303117533978E+04, -1.9365931141453160E+02, -2.0601715730545707E+03}; - constexpr FLT c8[] = {2.5975061893406377E+02, -1.0025387650570891E+03, -6.8642481197673135E+02, 6.7515314203707721E+03, -7.0772939651788483E+03, -6.5444514138990871E+03, 1.6566898963252905E+04, -6.5444514157945678E+03, -7.0772939632859488E+03, 6.7515314204902643E+03, -6.8642481194565551E+02, -1.0025387650535661E+03, 2.5975061893407650E+02}; - constexpr FLT c9[] = {5.8705282128692158E+00, -1.4424362302794552E+02, 3.3390627212323119E+02, 4.8151337259952918E+01, -1.1431733956368030E+03, 1.4557114776348812E+03, -3.3159944254032091E-07, -1.4557114806782522E+03, 1.1431733967780669E+03, -4.8151337378834590E+01, -3.3390627213511937E+02, 1.4424362302320881E+02, -5.8705282128605081E+00}; - constexpr FLT c10[] = {-4.0954969508851224E+00, -1.2634947171672739E+00, 3.8134139827368251E+01, -8.4115524684139231E+01, 4.2766848660349709E+01, 1.0573434367831015E+02, -1.9636661091449494E+02, 1.0573435467021281E+02, 4.2766847947710779E+01, -8.4115525105243464E+01, 3.8134139870558698E+01, -1.2634947126121756E+00, -4.0954969508837991E+00}; - 
constexpr FLT c11[] = {-6.2702735485690120E-01, 1.8595467760284645E+00, -1.3027978720941771E+00, -4.9265267037365117E+00, 1.3906831814366365E+01, -1.3753763493382712E+01, 2.6871064791607931E-07, 1.3753755542502716E+01, -1.3906831747296087E+01, 4.9265273573671839E+00, 1.3027978458757612E+00, -1.8595467797630605E+00, 6.2702735484380401E-01}; - constexpr FLT c12[] = {-4.8290636698016143E-02, 1.7531876457248552E-01, -5.0041296501579524E-01, 6.3665129689096389E-01, -1.2477021972354120E-02, -1.2061605995627183E+00, 1.8595304429529254E+00, -1.2061634758265700E+00, -1.2475794298747987E-02, 6.3665098120347430E-01, -5.0041293542010268E-01, 1.7531876909405444E-01, -4.8290636687311379E-02}; - constexpr FLT c13[] = {2.2894665623763296E-02, -7.1358251863425162E-03, -1.4950753078549017E-02, 7.0611554068321924E-02, -1.2311301880976686E-01, 1.0342486048127918E-01, -6.8988570158793749E-07, -1.0342802294420825E-01, 1.2311280070887519E-01, -7.0611922113576600E-02, 1.4950741151156504E-02, 7.1358201810974436E-03, -2.2894665619603353E-02}; - for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + constexpr FLT c0[] = {4.4408051211162946E-07, 1.8756193861873427E-04, 6.5146989208011716E-03, 6.8352802598867876E-02, 3.1564238810082484E-01, 7.5353649746793960E-01, 9.9999999999999956E-01, 7.5353649746793838E-01, 3.1564238810082484E-01, 6.8352802598867710E-02, 6.5146989208011707E-03, 1.8756193861873272E-04, 4.4408051211162761E-07}; + constexpr FLT c1[] = {1.9487148068106057E-06, 4.1285069961250701E-04, 9.2995630713278762E-03, 6.5021145064983563E-02, 1.8663042875530009E-01, 2.1451870821533808E-01, 1.8840858949353919E-32, -2.1451870821533794E-01, -1.8663042875529998E-01, -6.5021145064983438E-02, -9.2995630713278762E-03, -4.1285069961250425E-04, -1.9487148068106044E-06}; + constexpr FLT c2[] = {3.7267581324409626E-06, 4.0381251792508734E-04, 
5.7019503038218408E-03, 2.4040868593456825E-02, 2.9406233528281710E-02, -2.4394921635639378E-02, -7.0323343245740924E-02, -2.4394921635639052E-02, 2.9406233528281724E-02, 2.4040868593456791E-02, 5.7019503038218382E-03, 4.0381251792508501E-04, 3.7267581324409626E-06}; + constexpr FLT c3[] = {4.1089519307370168E-06, 2.2941839162878727E-04, 1.8941440042457443E-03, 3.5673079836347822E-03, -3.6880489041048953E-03, -1.2074156718545214E-02, 7.1013810712957114E-17, 1.2074156718545436E-02, 3.6880489041048944E-03, -3.5673079836347674E-03, -1.8941440042457413E-03, -2.2941839162878624E-04, -4.1089519307370151E-06}; + constexpr FLT c4[] = {2.9080869014384424E-06, 8.2405696428180906E-05, 3.3386109283452779E-04, -1.7130036080580219E-04, -1.5108662980936900E-03, 7.8665018928679242E-05, 2.3686576883603073E-03, 7.8665018928764622E-05, -1.5108662980936485E-03, -1.7130036080580737E-04, 3.3386109283452861E-04, 8.2405696428180703E-05, 2.9080869014384429E-06}; + constexpr FLT c5[] = {1.3873038503072801E-06, 1.8694798962849948E-05, 1.4885937076477316E-05, -1.3109520271106624E-04, -4.6797213058790025E-05, 3.2555441892430825E-04, 6.5502537691746230E-17, -3.2555441892416048E-04, 4.6797213058875582E-05, 1.3109520271106819E-04, -1.4885937076477316E-05, -1.8694798962849962E-05, -1.3873038503072801E-06}; + constexpr FLT c6[] = {4.5216719173889445E-07, 2.3203195635245624E-06, -6.0547210914038460E-06, -1.2111482379340961E-05, 3.0238388566383385E-05, 1.0632529352081665E-05, -5.0954659549722746E-05, 1.0632529352250802E-05, 3.0238388566313227E-05, -1.2111482379347288E-05, -6.0547210914040671E-06, 2.3203195635247352E-06, 4.5216719173889350E-07}; + constexpr FLT c7[] = {9.7956192761412821E-08, 9.2080334896449358E-09, -1.2031586234326618E-06, 1.3860784486076025E-06, 2.8079238803293383E-06, -5.6034103145907796E-06, 1.6113788341939994E-17, 5.6034103146040687E-06, -2.8079238803054550E-06, -1.3860784485997179E-06, 1.2031586234342167E-06, -9.2080334898128650E-09, -9.7956192761411458E-08}; + constexpr FLT 
c8[] = {1.2350515865275843E-08, -4.7668301905167552E-08, -3.2637845350597966E-08, 3.2101904613347501E-07, -3.3650826994957826E-07, -3.1117289066304045E-07, 7.8771611535813792E-07, -3.1117289069990237E-07, -3.3650826984246136E-07, 3.2101904612282309E-07, -3.2637845349600439E-08, -4.7668301904853071E-08, 1.2350515865276535E-08}; + constexpr FLT c9[] = {2.7912946705592266E-10, -6.8584366111657433E-09, 1.5876438439662156E-08, 2.2894800381734934E-09, -5.4355139631893104E-08, 6.9215572156100812E-08, 1.6320619156148685E-17, -6.9215572241906639E-08, 5.4355139637428967E-08, -2.2894800215659153E-09, -1.5876438439575659E-08, 6.8584366109657170E-09, -2.7912946705524691E-10}; + constexpr FLT c10[] = {-1.9473100882503891E-10, -6.0076128424585684E-11, 1.8131864354130518E-09, -3.9994904462490394E-09, 2.0334605597831887E-09, 5.0274131974512103E-09, -9.3367591026663196E-09, 5.0274136044049357E-09, 2.0334605333861501E-09, -3.9994904745315308E-09, 1.8131864358844393E-09, -6.0076128154532669E-11, -1.9473100882561411E-10}; + constexpr FLT c11[] = {-2.9813639427701670E-11, 8.8416967305832406E-11, -6.1944900155883343E-11, -2.3424446318938161E-10, 6.6123632509207570E-10, -6.5395825305270265E-10, -7.6394712006965382E-17, 6.5395802534269801E-10, -6.6123633886256970E-10, 2.3424448263843040E-10, 6.1944899055662456E-11, -8.8416967554269098E-11, 2.9813639428048382E-11}; + for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); } else if (w==14) { - constexpr FLT c0[] = {1.3368785683552924E+04, 7.5304732752870303E+06, 3.2765764524435025E+08, 4.2418096936485295E+09, 2.4197690538177547E+10, 7.2227640697189728E+10, 1.2261475327356721E+11, 1.2261475327356729E+11, 7.2227640697189728E+10, 2.4197690538177608E+10, 4.2418096936485305E+09, 3.2765764524435204E+08, 7.5304732752870284E+06, 1.3368785683578022E+04}; - constexpr FLT c1[] = {6.1154444023081698E+04, 1.7488686085101545E+07, 
5.0279014009863281E+08, 4.4777867842655859E+09, 1.6916819861812075E+10, 2.8971884004562843E+10, 1.6054555293734529E+10, -1.6054555293734520E+10, -2.8971884004562851E+10, -1.6916819861812094E+10, -4.4777867842655849E+09, -5.0279014009863436E+08, -1.7488686085101552E+07, -6.1154444023056109E+04}; - constexpr FLT c2[] = {1.2279790808348054E+05, 1.8230319600271538E+07, 3.3815815633684015E+08, 1.9369899011251259E+09, 3.9743454154781294E+09, 7.4954544638351953E+08, -7.0173920607394953E+09, -7.0173920607394981E+09, 7.4954544638350523E+08, 3.9743454154781094E+09, 1.9369899011251252E+09, 3.3815815633684099E+08, 1.8230319600271549E+07, 1.2279790808350702E+05}; - constexpr FLT c3[] = {1.4339321200624772E+05, 1.1200899688172197E+07, 1.2799140125169736E+08, 4.0176966726270700E+08, 7.9146174555817381E+07, -1.1719748245183482E+09, -9.6919138198233318E+08, 9.6919138198235631E+08, 1.1719748245183690E+09, -7.9146174555820629E+07, -4.0176966726270568E+08, -1.2799140125169775E+08, -1.1200899688172201E+07, -1.4339321200622563E+05}; - constexpr FLT c4[] = {1.0866548538632697E+05, 4.4565213401510660E+06, 2.8354150929531515E+07, 2.2805067924010411E+07, -1.2058223609888455E+08, -1.2775415620367479E+08, 1.9261201640091833E+08, 1.9261201640092278E+08, -1.2775415620368402E+08, -1.2058223609887798E+08, 2.2805067924010262E+07, 2.8354150929531977E+07, 4.4565213401510660E+06, 1.0866548538635395E+05}; - constexpr FLT c5[] = {5.6346565047794371E+04, 1.1743908345502394E+06, 3.0601086667308519E+06, -7.2274020134796854E+06, -1.6220595157138506E+07, 2.0773587344464455E+07, 2.8183198298702076E+07, -2.8183198298697799E+07, -2.0773587344463386E+07, 1.6220595157145990E+07, 7.2274020134800859E+06, -3.0601086667311694E+06, -1.1743908345502326E+06, -5.6346565047771030E+04}; - constexpr FLT c6[] = {2.0435142564639620E+04, 1.9450977300079435E+05, -1.1234667576916210E+05, -1.5205767549239143E+06, 1.0515640561116433E+06, 3.7458351782459249E+06, -3.3794074240140119E+06, -3.3794074240169711E+06, 
3.7458351782412329E+06, 1.0515640561062016E+06, -1.5205767549244103E+06, -1.1234667576906871E+05, 1.9450977300078108E+05, 2.0435142564663318E+04}; - constexpr FLT c7[] = {5.1491366053560578E+03, 1.4735748500446980E+04, -8.1689482343558659E+04, -3.5176894225535718E+04, 3.7034248411029513E+05, -1.9109669530087037E+05, -5.2637978465954703E+05, 5.2637978466513811E+05, 1.9109669530731969E+05, -3.7034248412243859E+05, 3.5176894226134398E+04, 8.1689482343736949E+04, -1.4735748500440675E+04, -5.1491366053330503E+03}; - constexpr FLT c8[] = {8.5138795113642539E+02, -1.2978618911724870E+03, -8.7500873646799319E+03, 2.1319159614070901E+04, 7.6586611596445446E+03, -6.2424139814276627E+04, 4.2620771484048986E+04, 4.2620771487400976E+04, -6.2424139811762492E+04, 7.6586611726886877E+03, 2.1319159614126653E+04, -8.7500873648028410E+03, -1.2978618911666397E+03, 8.5138795115875746E+02}; - constexpr FLT c9[] = {7.2176142041601707E+01, -4.5543406154804239E+02, 2.8301959889246939E+02, 2.1994171513294418E+03, -4.5082500681007541E+03, 4.7658016701186381E+02, 7.1044827179414842E+03, -7.1044827207946446E+03, -4.7658016510975699E+02, 4.5082500692420190E+03, -2.1994171509014677E+03, -2.8301959872009093E+02, 4.5543406154544186E+02, -7.2176142022434362E+01}; - constexpr FLT c10[] = {-3.1135380162987940E+00, -3.8554406978579038E+01, 1.4396028115898400E+02, -1.1260050343554748E+02, -3.0073664795307559E+02, 7.2079162583931463E+02, -4.1195307853504261E+02, -4.1195308389061950E+02, 7.2079161951195317E+02, -3.0073665201295637E+02, -1.1260050330597517E+02, 1.4396028109959775E+02, -3.8554406977567140E+01, -3.1135379980017595E+00}; - constexpr FLT c11[] = {-1.6022934776926798E+00, 1.8678197421256739E+00, 8.3368944138930399E+00, -3.0791579027234270E+01, 3.4749714150762280E+01, 1.2322523792409507E+01, -7.3924012166427417E+01, 7.3924001493712765E+01, -1.2322523909478123E+01, -3.4749718994457659E+01, 3.0791578402870758E+01, -8.3368943163363198E+00, -1.8678197396867300E+00, 1.6022934951962213E+00}; - 
constexpr FLT c12[] = {-1.9362061844377096E-01, 6.3024467546449237E-01, -9.3262282246103156E-01, -4.8908745811188170E-01, 4.0479355563504544E+00, -6.2829791472071852E+00, 3.1767781035894589E+00, 3.1767769811448687E+00, -6.2829724125407163E+00, 4.0479411685726534E+00, -4.8908752826470542E-01, -9.3262301538118120E-01, 6.3024467436836862E-01, -1.9362060312354304E-01}; - constexpr FLT c13[] = {1.8785913715361053E-02, 3.1605272623671174E-02, -1.3655798799707175E-01, 2.5016548497515428E-01, -1.6654380378010236E-01, -2.1682631004979175E-01, 6.1785823408636587E-01, -6.1786412281044067E-01, 2.1682412904087514E-01, 1.6654140467029407E-01, -2.5016543044993139E-01, 1.3655803570664179E-01, -3.1605272197692873E-02, -1.8785905270673971E-02}; - constexpr FLT c14[] = {-1.2896545121493665E-02, -3.7106960851979211E-03, 5.8859140039070395E-04, 1.3987190631712249E-02, -3.5710919113872190E-02, 4.3405397573933885E-02, -2.0030939379906375E-02, -2.0032731865340953E-02, 4.3401439168598052E-02, -3.5712796955756618E-02, 1.3987489379284932E-02, 5.8862874383716927E-04, -3.7106965853333437E-03, -1.2896537371347905E-02}; - for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); + constexpr FLT c0[] = {1.0213002307223062E-07, 5.7528591418445639E-05, 2.5031206020280088E-03, 3.2405046511689233E-02, 1.8485678142025513E-01, 5.5177865704975304E-01, 9.3670793123951734E-01, 9.3670793123951712E-01, 5.5177865704975315E-01, 1.8485678142025547E-01, 3.2405046511689239E-02, 2.5031206020280179E-03, 5.7528591418445801E-05, 1.0213002307242253E-07}; + constexpr FLT c1[] = {4.6718564624239767E-07, 1.3360375098030156E-04, 3.8410346178215306E-03, 3.4207779106833425E-02, 1.2923501383683489E-01, 2.2132894130184291E-01, 1.2264779624530273E-01, -1.2264779624530257E-01, -2.2132894130184308E-01, -1.2923501383683503E-01, -3.4207779106833425E-02, 
-3.8410346178215393E-03, -1.3360375098030178E-04, -4.6718564624220264E-07}; + constexpr FLT c2[] = {9.3810713124204527E-07, 1.3926941499858519E-04, 2.5833386162539013E-03, 1.4797516242328850E-02, 3.0361769467151970E-02, 5.7261067343619262E-03, -5.3608938764866873E-02, -5.3608938764866894E-02, 5.7261067343618603E-03, 3.0361769467151870E-02, 1.4797516242328836E-02, 2.5833386162539061E-03, 1.3926941499858543E-04, 9.3810713124224814E-07}; + constexpr FLT c3[] = {1.0954436997682021E-06, 8.5568590196649221E-05, 9.7778250562911601E-04, 3.0692948752812804E-03, 6.0463237460738756E-04, -8.9532302111318181E-03, -7.4040784665309846E-03, 7.4040784665312838E-03, 8.9532302111319968E-03, -6.0463237460737487E-04, -3.0692948752812708E-03, -9.7778250562911818E-04, -8.5568590196649329E-05, -1.0954436997680333E-06}; + constexpr FLT c4[] = {8.3014334976692641E-07, 3.4045323043173900E-05, 2.1660980714121239E-04, 1.7421792587401689E-04, -9.2118064021561887E-04, -9.7597008655075522E-04, 1.4714477548413631E-03, 1.4714477548414121E-03, -9.7597008655073809E-04, -9.2118064021559762E-04, 1.7421792587402266E-04, 2.1660980714121363E-04, 3.4045323043173968E-05, 8.3014334976713224E-07}; + constexpr FLT c5[] = {4.3045614796951587E-07, 8.9716871724550274E-06, 2.3377513570381849E-05, -5.5213296993546423E-05, -1.2391624765752083E-04, 1.5869855385555775E-04, 2.1530382494154427E-04, -2.1530382494144317E-04, -1.5869855385557331E-04, 1.2391624765755973E-04, 5.5213296993542533E-05, -2.3377513570381968E-05, -8.9716871724550325E-06, -4.3045614796933747E-07}; + constexpr FLT c6[] = {1.5611302559652642E-07, 1.4859455506706785E-06, -8.5826557923722616E-07, -1.1616353402592630E-05, 8.0333594878995593E-06, 2.8616079443375728E-05, -2.5816776957707699E-05, -2.5816776957707652E-05, 2.8616079443268301E-05, 8.0333594878977314E-06, -1.1616353402591744E-05, -8.5826557923811989E-07, 1.4859455506706314E-06, 1.5611302559670737E-07}; + constexpr FLT c7[] = {3.9336515129721532E-08, 1.1257285216182540E-07, 
-6.2406181937560562E-07, -2.6873173855233150E-07, 2.8292088258393860E-06, -1.4598715516905790E-06, -4.0212462690723253E-06, 4.0212462691823422E-06, 1.4598715517761175E-06, -2.8292088259133913E-06, 2.6873173855647969E-07, 6.2406181937648769E-07, -1.1257285216174059E-07, -3.9336515129545720E-08}; + constexpr FLT c8[] = {6.5041263396088790E-09, -9.9149367808853263E-09, -6.6845758889620994E-08, 1.6286641992901855E-07, 5.8507874943424797E-08, -4.7688540978638226E-07, 3.2559878511421460E-07, 3.2559878519979701E-07, -4.7688540972525423E-07, 5.8507875026096430E-08, 1.6286641993325022E-07, -6.6845758889870313E-08, -9.9149367809131923E-09, 6.5041263397795280E-09}; + constexpr FLT c9[] = {5.5138523621090170E-10, -3.4792607432658830E-09, 2.1621109687111844E-09, 1.6802313210571416E-08, -3.4440501484206901E-08, 3.6408051867813727E-09, 5.4274262350067578E-08, -5.4274262322388281E-08, -3.6408052006210212E-09, 3.4440501481438969E-08, -1.6802313213339344E-08, -2.1621109679759532E-09, 3.4792607432902108E-09, -5.5138523606396516E-10}; + constexpr FLT c10[] = {-2.3785683828448576E-11, -2.9453404124114860E-10, 1.0997757897423152E-09, -8.6020468987368310E-10, -2.2974592934948612E-09, 5.5064437603692059E-09, -3.1470905819229834E-09, -3.1470905272434506E-09, 5.5064436867561607E-09, -2.2974592840673907E-09, -8.6020468484567061E-10, 1.0997757884067548E-09, -2.9453404129270796E-10, -2.3785683688822786E-11}; + constexpr FLT c11[] = {-1.2240623323339709E-11, 1.4269095096874458E-11, 6.3689195980296716E-11, -2.3523039255622989E-10, 2.6546832331592691E-10, 9.4137182189250380E-11, -5.6473803777133577E-10, 5.6473799518218520E-10, -9.4137157913436917E-11, -2.6546835890448598E-10, 2.3523039312408576E-10, -6.3689194329967738E-11, -1.4269094997055950E-11, 1.2240623457297303E-11}; + constexpr FLT c12[] = {-1.4791529085565623E-12, 4.8147158180813514E-12, -7.1247159181258048E-12, -3.7363568005007135E-12, 3.0923958877552072E-11, -4.7998366007614543E-11, 2.4268802632733111E-11, 2.4268880217882715E-11, 
-4.7998325173324774E-11, 3.0923998690985708E-11, -3.7363589698227313E-12, -7.1247171622956968E-12, 4.8147157313484649E-12, -1.4791527915262285E-12}; + for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); } else if (w==15) { - constexpr FLT c0[] = {1.8887777774374495E+04, 1.4015330434461441E+07, 7.5498683300180113E+08, 1.1900937739619959E+10, 8.2530965279375427E+10, 3.0178246269069617E+11, 6.3775691457119177E+11, 8.1471473119305627E+11, 6.3775691457119177E+11, 3.0178246269069659E+11, 8.2530965279375626E+10, 1.1900937739619970E+10, 7.5498683300180113E+08, 1.4015330434461441E+07, 1.8887777774374499E+04}; - constexpr FLT c1[] = {8.9780907163796350E+04, 3.4167636285297170E+07, 1.2346880033823483E+09, 1.3719272724135921E+10, 6.5858241494816727E+10, 1.5266999939989542E+11, 1.5687794513790732E+11, 8.2054309331652521E-05, -1.5687794513790729E+11, -1.5266999939989551E+11, -6.5858241494816811E+10, -1.3719272724135935E+10, -1.2346880033823485E+09, -3.4167636285297155E+07, -8.9780907163796262E+04}; - constexpr FLT c2[] = {1.8850321233130724E+05, 3.7693640983013548E+07, 8.9846818051570022E+08, 6.7094088040439663E+09, 1.9743296615199219E+10, 1.8072727219391186E+10, -2.0634615374559433E+10, -4.9654335197177406E+10, -2.0634615374559402E+10, 1.8072727219391071E+10, 1.9743296615199223E+10, 6.7094088040439653E+09, 8.9846818051569998E+08, 3.7693640983013526E+07, 1.8850321233130703E+05}; - constexpr FLT c3[] = {2.3185006533495741E+05, 2.4789475362741619E+07, 3.7751696829092431E+08, 1.7167916788178215E+09, 1.9832401267745426E+09, -3.4881359830883756E+09, -7.8785602379628572E+09, 9.7140016072625200E-05, 7.8785602379629736E+09, 3.4881359830884337E+09, -1.9832401267745149E+09, -1.7167916788178086E+09, -3.7751696829092413E+08, -2.4789475362741601E+07, -2.3185006533495741E+05}; - constexpr FLT c4[] = {1.8672970114818294E+05, 1.0741068109706741E+07, 
9.8017949708492860E+07, 2.0291084954252207E+08, -2.7857869294215119E+08, -9.4112677968749356E+08, 1.7886520649348873E+08, 1.4579673547892964E+09, 1.7886520649342585E+08, -9.4112677968752539E+08, -2.7857869294214994E+08, 2.0291084954251558E+08, 9.8017949708492786E+07, 1.0741068109706741E+07, 1.8672970114818294E+05}; - constexpr FLT c5[] = {1.0411891611891470E+05, 3.1771463075269503E+06, 1.4880104152842240E+07, -6.8136965447559115E+06, -8.7072998215433106E+07, 1.8024116531034056E+06, 1.9067730799617344E+08, 4.2457739417067258E-05, -1.9067730799613068E+08, -1.8024116529409259E+06, 8.7072998215441659E+07, 6.8136965447553769E+06, -1.4880104152842039E+07, -3.1771463075269512E+06, -1.0411891611891471E+05}; - constexpr FLT c6[] = {4.1300641422694804E+04, 6.3217168592498475E+05, 7.7343707634861500E+05, -5.4575962381464886E+06, -3.7387211063140454E+06, 1.8451583614096310E+07, 3.0480804947991944E+06, -2.7500445095909819E+07, 3.0480804948348333E+06, 1.8451583614054784E+07, -3.7387211062913244E+06, -5.4575962381459959E+06, 7.7343707634824759E+05, 6.3217168592497776E+05, 4.1300641422694753E+04}; - constexpr FLT c7[] = {1.1710443348523793E+04, 7.5405449195728594E+04, -1.6634736996463325E+05, -5.6069290801800112E+05, 1.1540571564075467E+06, 1.0209821661192341E+06, -2.9641921942296810E+06, 3.3808352628184138E-05, 2.9641921942798980E+06, -1.0209821662794619E+06, -1.1540571563939669E+06, 5.6069290802062431E+05, 1.6634736996474760E+05, -7.5405449195719484E+04, -1.1710443348523821E+04}; - constexpr FLT c8[] = {2.3142324239350878E+03, 2.1710560541685127E+03, -3.6929625713073510E+04, 2.6143898219454975E+04, 1.4046980089280056E+05, -2.1033190113776314E+05, -1.1132269821056565E+05, 3.7491447377567255E+05, -1.1132269820392072E+05, -2.1033190119832297E+05, 1.4046980086087715E+05, 2.6143898218932318E+04, -3.6929625712961781E+04, 2.1710560541720374E+03, 2.3142324239350669E+03}; - constexpr FLT c9[] = {2.8879718294280184E+02, -9.2801372612475961E+02, -1.9817144426574330E+03, 
9.9004179204792053E+03, -5.7928269087620147E+03, -2.1083466263505023E+04, 3.3285501948595454E+04, -2.7485328636422507E-05, -3.3285501965333991E+04, 2.1083466366979632E+04, 5.7928269521300508E+03, -9.9004179216204702E+03, 1.9817144428595318E+03, 9.2801372612847467E+02, -2.8879718294283089E+02}; - constexpr FLT c10[] = {1.3121871131812668E+01, -1.5978845116799533E+02, 2.7429718922951372E+02, 4.4598059414156506E+02, -1.8917609553066516E+03, 1.5303002688244715E+03, 1.7542368497545090E+03, -3.9411530602516441E+03, 1.7542369316431223E+03, 1.5303002442924305E+03, -1.8917609584163495E+03, 4.4598059457347478E+02, 2.7429718902435877E+02, -1.5978845117002061E+02, 1.3121871131803672E+01}; - constexpr FLT c11[] = {-2.4286151057240977E+00, -6.7839829107457454E+00, 4.6999223071396322E+01, -7.4896070961958642E+01, -3.2010113081168477E+01, 2.5022928265034139E+02, -2.8786059319143976E+02, -7.6634590881515742E-06, 2.8786055354435149E+02, -2.5022938574837804E+02, 3.2010133958326769E+01, 7.4896073537458122E+01, -4.6999222973839679E+01, 6.7839829144042234E+00, 2.4286151057002718E+00}; - constexpr FLT c12[] = {-5.4810555663540994E-01, 1.1436870829533889E+00, 8.2471503038810468E-01, -8.5602133190676231E+00, 1.5631626747736027E+01, -6.4979530690388971E+00, -1.8737705444912390E+01, 3.3283700586432069E+01, -1.8737671771580779E+01, -6.4980608237023150E+00, 1.5631576518348636E+01, -8.5602150728872868E+00, 8.2471496023535673E-01, 1.1436870829534245E+00, -5.4810555666110816E-01}; - constexpr FLT c13[] = {-1.4554612894071435E-02, 1.7022157798828938E-01, -3.7563883252838998E-01, 2.0131137597017346E-01, 8.3554102633770899E-01, -2.1191293316246047E+00, 1.9960663397068628E+00, -2.3728355667610635E-05, -1.9960994910423950E+00, 2.1191258420103383E+00, -8.3552532307350946E-01, -2.0131366602953590E-01, 3.7563888705361287E-01, -1.7022157564540871E-01, 1.4554612874103701E-02}; - constexpr FLT c14[] = {-1.2348455954758902E-02, 2.6143546776172359E-03, -2.9252135300577905E-02, 7.5391681327619392E-02, 
-8.7984403647335341E-02, 1.3344627281489669E-03, 1.5252941418184685E-01, -2.3235937480302737E-01, 1.5257226311939021E-01, 1.3278049251030887E-03, -8.7990378598784807E-02, 7.5392790961460260E-02, -2.9252188648358976E-02, 2.6143533439228375E-03, -1.2348455958015002E-02}; - constexpr FLT c15[] = {1.4214685601398354E-02, -1.2364336624800189E-03, 1.2892619016815934E-03, 1.6178062163508013E-03, -8.2136742192079667E-03, 1.3906385413195475E-02, -1.1450713230272313E-02, -3.7721726447119798E-06, 1.1423376007684534E-02, -1.3922509066323734E-02, 8.2263143670307064E-03, -1.6156663488059737E-03, -1.2892038432598459E-03, 1.2364357359950825E-03, -1.4214685605448193E-02}; - for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); + constexpr FLT c0[] = {2.3183302143948793E-08, 1.7202745817468655E-05, 9.2668857465754784E-04, 1.4607490553401936E-02, 1.0130044556641116E-01, 3.7041488405244677E-01, 7.8279781886019206E-01, 1.0000000000000018E+00, 7.8279781886019228E-01, 3.7041488405244727E-01, 1.0130044556641139E-01, 1.4607490553401959E-02, 9.2668857465754882E-04, 1.7202745817468652E-05, 2.3183302143948763E-08}; + constexpr FLT c1[] = {1.1019919454791572E-07, 4.1938159428224126E-05, 1.5154850601194973E-03, 1.6839357628952684E-02, 8.0835952724673255E-02, 1.8739074372244105E-01, 1.9255567517255739E-01, -9.4204294746769593E-32, -1.9255567517255723E-01, -1.8739074372244108E-01, -8.0835952724673352E-02, -1.6839357628952709E-02, -1.5154850601194973E-03, -4.1938159428224126E-05, -1.1019919454791572E-07}; + constexpr FLT c2[] = {2.3137327105312791E-07, 4.6266060425611204E-05, 1.1028009511991974E-03, 8.2352859806754802E-03, 2.4233386066663413E-02, 2.2182889945939449E-02, -2.5327411650384993E-02, -6.0946897479642256E-02, -2.5327411650385129E-02, 2.2182889945939359E-02, 2.4233386066663424E-02, 8.2352859806754854E-03, 
1.1028009511991970E-03, 4.6266060425611204E-05, 2.3137327105312783E-07}; + constexpr FLT c3[] = {2.8457821671573274E-07, 3.0427184404092299E-05, 4.6337319534911844E-04, 2.1072304367244932E-03, 2.4342755210407531E-03, -4.2814200474568563E-03, -9.6703299158782657E-03, 1.8176153030403361E-16, 9.6703299158783507E-03, 4.2814200474569379E-03, -2.4342755210407076E-03, -2.1072304367244859E-03, -4.6337319534911817E-04, -3.0427184404092296E-05, -2.8457821671573279E-07}; + constexpr FLT c4[] = {2.2919642176438702E-07, 1.3183839322480003E-05, 1.2030953406839325E-04, 2.4905754342428421E-04, -3.4193403196993951E-04, -1.1551611179404738E-03, 2.1954335627567210E-04, 1.7895433812201793E-03, 2.1954335627571010E-04, -1.1551611179404326E-03, -3.4193403196995387E-04, 2.4905754342428610E-04, 1.2030953406839360E-04, 1.3183839322480008E-05, 2.2919642176438720E-07}; + constexpr FLT c5[] = {1.2779800356186583E-07, 3.8997040140349313E-06, 1.8264189394307498E-05, -8.3632912035128204E-06, -1.0687544349164653E-04, 2.2123224044726536E-06, 2.3404180714514772E-04, 6.5064979845545577E-17, -2.3404180714503106E-04, -2.2123224042782134E-06, 1.0687544349166598E-04, 8.3632912035006689E-06, -1.8264189394307559E-05, -3.8997040140349338E-06, -1.2779800356186589E-07}; + constexpr FLT c6[] = {5.0693377499403691E-08, 7.7594237801400426E-07, 9.4933483676717755E-07, -6.6987818302423087E-06, -4.5889941143373546E-06, 2.2647907184667538E-05, 3.7412856035449417E-06, -3.3754692339426772E-05, 3.7412856034892404E-06, 2.2647907184654951E-05, -4.5889941143014083E-06, -6.6987818302351157E-06, 9.4933483676684456E-07, 7.7594237801399991E-07, 5.0693377499403691E-08}; + constexpr FLT c7[] = {1.4373673262756881E-08, 9.2554419735729795E-08, -2.0417866965615742E-07, -6.8820764686271727E-07, 1.4165168644096691E-06, 1.2531774951198972E-06, -3.6383191328570317E-06, 5.9333697238861927E-17, 3.6383191329076855E-06, -1.2531774952992520E-06, -1.4165168643945163E-06, 6.8820764685908223E-07, 2.0417866965620961E-07, 
-9.2554419735731158E-08, -1.4373673262756913E-08}; + constexpr FLT c8[] = {2.8405432421064598E-09, 2.6648052024128211E-09, -4.5328290134778586E-08, 3.2089634828694367E-08, 1.7241593348808383E-07, -2.5816631656161770E-07, -1.3664009513726493E-07, 4.6017883216168089E-07, -1.3664009510064915E-07, -2.5816631656773852E-07, 1.7241593343152281E-07, 3.2089634835965337E-08, -4.5328290134523662E-08, 2.6648052024185691E-09, 2.8405432421065198E-09}; + constexpr FLT c9[] = {3.5447644664522991E-10, -1.1390658479562114E-09, -2.4324028601311552E-09, 1.2152005527725076E-08, -7.1102518341828894E-09, -2.5878341862165437E-08, 4.0855407178225425E-08, -6.7229636689436406E-18, -4.0855407139474409E-08, 2.5878341989490202E-08, 7.1102518840056246E-09, -1.2152005535163887E-08, 2.4324028601311552E-09, 1.1390658479600971E-09, -3.5447644664517713E-10}; + constexpr FLT c10[] = {1.6106092880607926E-11, -1.9612809866225313E-10, 3.3667881388500915E-10, 5.4740705815843633E-10, -2.3219918220819429E-09, 1.8783264389538617E-09, 2.1531915835821252E-09, -4.8374637778167195E-09, 2.1531915732119103E-09, 1.8783264455530896E-09, -2.3219918255386980E-09, 5.4740706350069505E-10, 3.3667881394392907E-10, -1.9612809866164026E-10, 1.6106092880601619E-11}; + constexpr FLT c11[] = {-2.9809392328002639E-12, -8.3268200084267327E-12, 5.7687950483526562E-11, -9.1929198156856840E-11, -3.9289938224686938E-11, 3.0713724621937891E-10, -3.5332675603861928E-10, -4.7176615708722248E-17, 3.5332675632254561E-10, -3.0713734445835836E-10, 3.9289964949381516E-11, 9.1929194004414145E-11, -5.7687950660981567E-11, 8.3268199995541140E-12, 2.9809392327699276E-12}; + constexpr FLT c12[] = {-6.7275763613050405E-13, 1.4037883809519618E-12, 1.0122748224833392E-12, -1.0507010409950668E-11, 1.9186635811522471E-11, -7.9758147674463026E-12, -2.2999207389706864E-11, 4.0853090072343795E-11, -2.2999199222849929E-11, -7.9758923525966314E-12, 1.9186574560087790E-11, -1.0507007219772089E-11, 1.0122747905815843E-12, 1.4037883779612130E-12, 
-6.7275763610714771E-13}; + for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); } else if (w==16) { - constexpr FLT c0[] = {2.6374086784014766E+04, 2.5501413681212697E+07, 1.6835469840840111E+09, 3.1953580806547901E+10, 2.6584910126662793E+11, 1.1715858191494631E+12, 3.0181658330343154E+12, 4.7888775408612793E+12, 4.7888775408612793E+12, 3.0181658330343149E+12, 1.1715858191494631E+12, 2.6584910126662802E+11, 3.1953580806547905E+10, 1.6835469840840123E+09, 2.5501413681212693E+07, 2.6374086784014838E+04}; - constexpr FLT c1[] = {1.2991568388123445E+05, 6.4986154651133671E+07, 2.9142305012947264E+09, 3.9748054433728172E+10, 2.3649443248440253E+11, 7.0471088240421252E+11, 1.0533888905987035E+12, 5.4832304482297614E+11, -5.4832304482297620E+11, -1.0533888905987037E+12, -7.0471088240421265E+11, -2.3649443248440253E+11, -3.9748054433728172E+10, -2.9142305012947268E+09, -6.4986154651133649E+07, -1.2991568388123452E+05}; - constexpr FLT c2[] = {2.8421223836872837E+05, 7.5448503558118597E+07, 2.2710828032883873E+09, 2.1491603403163834E+10, 8.4299374042308197E+10, 1.3384457365769531E+11, 1.8630012765538406E+09, -2.4384536789321063E+11, -2.4384536789321036E+11, 1.8630012765533686E+09, 1.3384457365769537E+11, 8.4299374042308105E+10, 2.1491603403163818E+10, 2.2710828032883859E+09, 7.5448503558118537E+07, 2.8421223836872837E+05}; - constexpr FLT c3[] = {3.6653021243297530E+05, 5.2693428548387125E+07, 1.0410094433021290E+09, 6.3986267576853638E+09, 1.3313926739756351E+10, -2.7909761561126175E+09, -3.9911638977027939E+10, -2.9236947704012280E+10, 2.9236947704013081E+10, 3.9911638977028137E+10, 2.7909761561130028E+09, -1.3313926739756271E+10, -6.3986267576853542E+09, -1.0410094433021282E+09, -5.2693428548387118E+07, -3.6653021243297530E+05}; - constexpr FLT c4[] = {3.1185660915838124E+05, 2.4564274645530283E+07, 3.0509279143241888E+08, 
1.0432225146182600E+09, 6.4966284440289930E+07, -4.2483903608015141E+09, -3.1778261722520151E+09, 5.9880587942837610E+09, 5.9880587942838221E+09, -3.1778261722524805E+09, -4.2483903608015366E+09, 6.4966284440239742E+07, 1.0432225146182716E+09, 3.0509279143241870E+08, 2.4564274645530298E+07, 3.1185660915838124E+05}; - constexpr FLT c5[] = {1.8544733523229556E+05, 7.9824949938292857E+06, 5.6880943382648587E+07, 5.4097201999261037E+07, -3.0776449202831459E+08, -3.7659931821870732E+08, 6.8797698944740057E+08, 7.5429896889854825E+08, -7.5429896889813769E+08, -6.8797698944685316E+08, 3.7659931821880990E+08, 3.0776449202837443E+08, -5.4097201999261037E+07, -5.6880943382648058E+07, -7.9824949938292904E+06, -1.8544733523229562E+05}; - constexpr FLT c6[] = {7.9472339236673346E+04, 1.8159676553648554E+06, 5.7259818806757703E+06, -1.2786136236414703E+07, -3.8677490873126298E+07, 4.7651450515746824E+07, 9.0723760109486386E+07, -9.4532949239712372E+07, -9.4532949239553988E+07, 9.0723760109301269E+07, 4.7651450515691362E+07, -3.8677490873146154E+07, -1.2786136236417659E+07, 5.7259818806749191E+06, 1.8159676553648303E+06, 7.9472339236673288E+04}; - constexpr FLT c7[] = {2.4831718998299966E+04, 2.7536301841718081E+05, -5.1045953355375612E+04, -2.6996387880195463E+06, 1.1656554632389303E+06, 9.1521923450131379E+06, -6.8198180924866442E+06, -1.2555197000819867E+07, 1.2555197001241650E+07, 6.8198180927697066E+06, -9.1521923448700085E+06, -1.1656554631878142E+06, 2.6996387880213680E+06, 5.1045953356119258E+04, -2.7536301841717307E+05, -2.4831718998299926E+04}; - constexpr FLT c8[] = {5.6060763597396308E+03, 2.2154740880106889E+04, -1.0243462874801211E+05, -1.1802198892514131E+05, 6.4061699367996352E+05, -1.1166716767206143E+05, -1.4153578101430011E+06, 1.0790712966724981E+06, 1.0790712967259965E+06, -1.4153578105201155E+06, -1.1166716749694763E+05, 6.4061699367337034E+05, -1.1802198891465126E+05, -1.0243462874806672E+05, 2.2154740880108289E+04, 5.6060763597395980E+03}; - constexpr FLT 
c9[] = {8.7271993222052015E+02, -7.0074676858636565E+02, -1.2528372958260919E+04, 2.3643101058174649E+04, 3.1699060176870429E+04, -1.1270133590467999E+05, 3.6872846694334214E+04, 1.5168911740364679E+05, -1.5168911743408049E+05, -3.6872846682160729E+04, 1.1270133589250650E+05, -3.1699060125133125E+04, -2.3643101053990013E+04, 1.2528372958926657E+04, 7.0074676859379576E+02, -8.7271993222046206E+02}; - constexpr FLT c10[] = {7.8842259458809167E+01, -4.2070880912368045E+02, -1.0535142084668550E+02, 3.3375056840527291E+03, -4.9426353391946941E+03, -3.6567309106352213E+03, 1.5199085303756190E+04, -9.4972223386509122E+03, -9.4972222612539845E+03, 1.5199085250589107E+04, -3.6567308608802218E+03, -4.9426353295200679E+03, 3.3375056868169195E+03, -1.0535142136497778E+02, -4.2070880912233122E+02, 7.8842259458809863E+01}; - constexpr FLT c11[] = {8.9833076822322541E-02, -4.4163371176090656E+01, 1.2880771155499514E+02, 2.8722193371824223E+00, -5.7164633743445722E+02, 9.0417612969072786E+02, 1.1220387898916500E+00, -1.4190926236781661E+03, 1.4190921497862169E+03, -1.1219395160922474E+00, -9.0417626783116691E+02, 5.7164631339646269E+02, -2.8722233955477368E+00, -1.2880771178913139E+02, 4.4163371168774162E+01, -8.9833076836661779E-02}; - constexpr FLT c12[] = {-1.0900468357478950E+00, -1.1264666525354303E-01, 1.1810668147959248E+01, -3.0289105313513339E+01, 1.5494580774353590E+01, 6.0129886123389447E+01, -1.2330199171381130E+02, 6.7114507519752891E+01, 6.7114417724195803E+01, -1.2330220722314033E+02, 6.0129944490502041E+01, 1.5494578529464169E+01, -3.0289104892597450E+01, 1.1810668147959559E+01, -1.1264666963803399E-01, -1.0900468357479236E+00}; - constexpr FLT c13[] = {-1.1763610120003680E-01, 4.2939195911805172E-01, -2.7950209959937194E-01, -1.7354549670508441E+00, 5.1182015415147619E+00, -5.0538827161604676E+00, -2.1270036462171213E+00, 1.0709458682620088E+01, -1.0709612225647817E+01, 2.1267942693611270E+00, 5.0538338615607357E+00, -5.1181806038291624E+00, 
1.7354571480597607E+00, 2.7950229043765212E-01, -4.2939195443229039E-01, 1.1763610122666045E-01}; - constexpr FLT c14[] = {-1.8020499668410097E-02, 3.6694580839244442E-02, -1.1331134794057113E-01, 1.3971228975695787E-01, 8.1734604430561311E-02, -5.4464516301492671E-01, 7.9646109231150031E-01, -3.9024149191964747E-01, -3.9020325223035940E-01, 7.9644613359376126E-01, -5.4458780348100966E-01, 8.1735287282159258E-02, 1.3971280189565236E-01, -1.1331156133169454E-01, 3.6694584840328316E-02, -1.8020499652780946E-02}; - constexpr FLT c15[] = {1.4589783473923206E-02, -7.8885429103313365E-04, -4.4856766056362643E-03, 1.8116483572926646E-02, -3.0574294775135746E-02, 1.8967420978453962E-02, 2.4666137072064612E-02, -6.8017929307730221E-02, 6.7615302446897660E-02, -2.4691085605299815E-02, -1.9038882601578176E-02, 3.0552398456072709E-02, -1.8118938614760938E-02, 4.4854443719491892E-03, 7.8884755210919307E-04, -1.4589783498222219E-02}; - constexpr FLT c16[] = {-1.0467998078291846E-02, -3.2140608463710125E-04, 5.2959666930518063E-04, -1.5769844275261027E-04, -1.4331371817542763E-03, 3.7100687637655694E-03, -3.8742310984482158E-03, 1.6810223071268796E-03, 1.6547563335702548E-03, -3.9924279794162345E-03, 3.6969357769948610E-03, -1.4380620517984166E-03, -1.5934006609813836E-04, 5.2953895598459668E-04, -3.2140848935911386E-04, -1.0467998075160606E-02}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); + constexpr FLT c0[] = {5.2012152104084075E-09, 5.0291159580938685E-06, 3.3201112337137914E-04, 6.3015433246683345E-03, 5.2427915343763419E-02, 2.3104762006593382E-01, 5.9521037322997228E-01, 9.4441119081353919E-01, 9.4441119081353897E-01, 5.9521037322997228E-01, 2.3104762006593382E-01, 5.2427915343763426E-02, 6.3015433246683362E-03, 3.3201112337137925E-04, 5.0291159580938685E-06, 
5.2012152104083968E-09}; + constexpr FLT c1[] = {2.5620581163903698E-08, 1.2815874111792785E-05, 5.7471335914300648E-04, 7.8386860177525539E-03, 4.6638901641906975E-02, 1.3897554029141568E-01, 2.0773808644544139E-01, 1.0813440420918323E-01, -1.0813440420918335E-01, -2.0773808644544151E-01, -1.3897554029141571E-01, -4.6638901641906962E-02, -7.8386860177525539E-03, -5.7471335914300648E-04, -1.2815874111792780E-05, -2.5620581163903678E-08}; + constexpr FLT c2[] = {5.6049296769722407E-08, 1.4879146623074265E-05, 4.4787865139353408E-04, 4.2383440773521713E-03, 1.6624620601556200E-02, 2.6395394769117682E-02, 3.6740117889108559E-04, -4.8088574473126838E-02, -4.8088574473126817E-02, 3.6740117889110039E-04, 2.6395394769117647E-02, 1.6624620601556183E-02, 4.2383440773521705E-03, 4.4787865139353381E-04, 1.4879146623074262E-05, 5.6049296769722367E-08}; + constexpr FLT c3[] = {7.2283166867263369E-08, 1.0391634193778174E-05, 2.0529674430143886E-04, 1.2618687081127949E-03, 2.6256301814801142E-03, -5.5040645592548403E-04, -7.8709464111364428E-03, -5.7657980103485666E-03, 5.7657980103488684E-03, 7.8709464111365764E-03, 5.5040645592556046E-04, -2.6256301814800891E-03, -1.2618687081127923E-03, -2.0529674430143870E-04, -1.0391634193778174E-05, -7.2283166867263382E-08}; + constexpr FLT c4[] = {6.1501023800531295E-08, 4.8443034242391149E-06, 6.0167136036954489E-05, 2.0573318254801955E-04, 1.2811955521425743E-05, -8.3782209201439741E-04, -6.2669687707126603E-04, 1.1809008871739588E-03, 1.1809008871740102E-03, -6.2669687707129801E-04, -8.3782209201439957E-04, 1.2811955521424802E-05, 2.0573318254801969E-04, 6.0167136036954442E-05, 4.8443034242391132E-06, 6.1501023800531308E-08}; + constexpr FLT c5[] = {3.6571939291734573E-08, 1.5742222553115388E-06, 1.1217451065775747E-05, 1.0668471374318139E-05, -6.0694020243058218E-05, -7.4268888177597524E-05, 1.3567546096387106E-04, 1.4875477215044619E-04, -1.4875477215041898E-04, -1.3567546096383994E-04, 7.4268888177628640E-05, 6.0694020243062108E-05, 
-1.0668471374318139E-05, -1.1217451065775808E-05, -1.5742222553115373E-06, -3.6571939291734560E-08}; + constexpr FLT c6[] = {1.5672684443241293E-08, 3.5812571134853537E-07, 1.1292168823203332E-06, -2.5215449854185100E-06, -7.6275609266365118E-06, 9.3973092319789718E-06, 1.7891569285072030E-05, -1.8642776809419116E-05, -1.8642776809435267E-05, 1.7891569285119396E-05, 9.3973092319861496E-06, -7.6275609266374249E-06, -2.5215449854180577E-06, 1.1292168823202796E-06, 3.5812571134853394E-07, 1.5672684443241266E-08}; + constexpr FLT c7[] = {4.8970459380161511E-09, 5.4304148291621772E-08, -1.0066736763205116E-08, -5.3239387743771190E-07, 2.2987809872388434E-07, 1.8048974519458305E-06, -1.3449315565530231E-06, -2.4760016203656832E-06, 2.4760016205558345E-06, 1.3449315566530894E-06, -1.8048974519264694E-06, -2.2987809871496018E-07, 5.3239387743957950E-07, 1.0066736763205477E-08, -5.4304148291620039E-08, -4.8970459380161527E-09}; + constexpr FLT c8[] = {1.1055703983904693E-09, 4.3691209554215673E-09, -2.0201061499499309E-08, -2.3275033898522544E-08, 1.2633562932172848E-07, -2.2021804055583841E-08, -2.7912172397333448E-07, 2.1280289571270167E-07, 2.1280289561471954E-07, -2.7912172398563377E-07, -2.2021804043311624E-08, 1.2633562932175524E-07, -2.3275033897953490E-08, -2.0201061499405642E-08, 4.3691209554208717E-09, 1.1055703983904937E-09}; + constexpr FLT c9[] = {1.7210848751142109E-10, -1.3819378018358974E-10, -2.4707116696395418E-09, 4.6626394240840718E-09, 6.2513494821407377E-09, -2.2225751663756647E-08, 7.2716681831167356E-09, 2.9914504875425248E-08, -2.9914504880961111E-08, -7.2716681858846656E-09, 2.2225751666524578E-08, -6.2513494807567727E-09, -4.6626394246030589E-09, 2.4707116695638564E-09, 1.3819378018734865E-10, -1.7210848751139469E-10}; + constexpr FLT c10[] = {1.5548426850891040E-11, -8.2967690037353030E-11, -2.0776280196441915E-11, 6.5818716237227360E-10, -9.7473365318544434E-10, -7.2114132190269774E-10, 2.9974008768194548E-09, -1.8729406654385533E-09, 
-1.8729407980520035E-09, 2.9974009543459026E-09, -7.2114130179071973E-10, -9.7473365601368880E-10, 6.5818716417921449E-10, -2.0776280166982969E-11, -8.2967690036279040E-11, 1.5548426850876794E-11}; + constexpr FLT c11[] = {1.7715918253734007E-14, -8.7094275492396390E-12, 2.5402078548167017E-11, 5.6643084712743339E-13, -1.1273398069226705E-10, 1.7831197627554656E-10, 2.2124056737037060E-13, -2.7985821416111004E-10, 2.7985826569398559E-10, -2.2122821651802181E-13, -1.7831199885666961E-10, 1.1273397622040666E-10, -5.6643203607501166E-13, -2.5402078628021660E-11, 8.7094275492396907E-12, -1.7715918256992908E-14}; + constexpr FLT c12[] = {-2.1496737418348056E-13, -2.2214973543773537E-14, 2.3291735079229971E-12, -5.9732922869516132E-12, 3.0556730493177866E-12, 1.1858129781605648E-11, -2.4316397039401376E-11, 1.3235569405286772E-11, 1.3235463236132106E-11, -2.4316413373117597E-11, 1.1858131823320733E-11, 3.0556730493176707E-12, -5.9732919041302971E-12, 2.3291735916652542E-12, -2.2214974665309464E-14, -2.1496737416109420E-13}; + constexpr FLT c13[] = {-2.3198933254093550E-14, 8.4680085604099498E-14, -5.5120431569756550E-14, -3.4224865085091971E-13, 1.0093479536840142E-12, -9.9670676529397927E-13, -4.1953479545762892E-13, 2.1120282165025634E-12, -2.1120647150379602E-12, 4.1949829692223215E-13, 9.9668454879417257E-13, -1.0093487471304360E-12, 3.4224795658530073E-13, 5.5120400575755698E-14, -8.4680084102827573E-14, 2.3198933260903755E-14}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc deleted file mode 100644 index e2fa229b7..000000000 --- a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc +++ 
/dev/null @@ -1,171 +0,0 @@ -// Code generated by gen_all_horner_C_code.m in finufft/devel -// Authors: Alex Barnett & Ludvig af Klinteberg. -// (C) The Simons Foundation, Inc. - if (w==2) { - constexpr FLT c0[] = {6.1209111871385702E-01, 6.1209111871385702E-01}; - constexpr FLT c1[] = {6.4742429432896431E-01, -6.4742429432896442E-01}; - constexpr FLT c2[] = {-9.0411309581634847E-02, -9.0411309581634750E-02}; - constexpr FLT c3[] = {-1.9075708590566751E-01, 1.9075708590566753E-01}; - for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); - } else if (w==3) { - constexpr FLT c0[] = {2.4728112933307078E-01, 1.0000000000000044E+00, 2.4728112935494964E-01}; - constexpr FLT c1[] = {4.0470611346184543E-01, 2.1212921335912390E-17, -4.0470611343822160E-01}; - constexpr FLT c2[] = {1.4864411342268655E-01, -3.0473448739822773E-01, 1.4864411344492173E-01}; - constexpr FLT c3[] = {-4.4469294619149627E-02, 1.3598904496642886E-16, 4.4469294640111616E-02}; - constexpr FLT c4[] = {-2.9270010751775037E-02, 3.7966707032750659E-02, -2.9270010728701147E-02}; - for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); - } else if (w==4) { - constexpr FLT c0[] = {8.4048892491849839E-02, 7.9275732207620875E-01, 7.9275732207620908E-01, 8.4048892491849811E-02}; - constexpr FLT c1[] = {1.7431588385887239E-01, 3.7425489538028417E-01, -3.7425489538028422E-01, -1.7431588385887242E-01}; - constexpr FLT c2[] = {1.1425598262146337E-01, -1.1126112046907141E-01, -1.1126112046907137E-01, 1.1425598262146335E-01}; - constexpr FLT c3[] = {1.5677587697716072E-02, -6.7022293289915616E-02, 6.7022293289915727E-02, -1.5677587697716041E-02}; - constexpr FLT c4[] = {-1.0401300825285629E-02, 6.3725646657139309E-03, 6.3725646657139005E-03, -1.0401300825285625E-02}; - constexpr FLT c5[] = {-3.0464394190490617E-03, 5.3247889205097435E-03, -5.3247889205097279E-03, 3.0464394190490305E-03}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + 
z*(c4[i] + z*(c5[i]))))); - } else if (w==5) { - constexpr FLT c0[] = {2.5811126752233307E-02, 4.6616226852477344E-01, 1.0000000000000007E+00, 4.6616226852477305E-01, 2.5811126752233318E-02}; - constexpr FLT c1[] = {6.2936773057387055E-02, 3.7198919402374020E-01, 2.1212921335912559E-17, -3.7198919402374009E-01, -6.2936773057387055E-02}; - constexpr FLT c2[] = {5.4855980576944567E-02, 3.7709308632020676E-02, -1.8284069243892637E-01, 3.7709308632020731E-02, 5.4855980576944567E-02}; - constexpr FLT c3[] = {1.8780973157032140E-02, -3.8322611720715660E-02, 1.4047484462204681E-16, 3.8322611720715834E-02, -1.8780973157032116E-02}; - constexpr FLT c4[] = {-2.3306908700105430E-05, -8.3858973028989436E-03, 1.4886952481383787E-02, -8.3858973028988499E-03, -2.3306908700106227E-05}; - constexpr FLT c5[] = {-1.5212353034889806E-03, 1.7151925122365422E-03, 1.0734071182258885E-16, -1.7151925122365888E-03, 1.5212353034889806E-03}; - for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==6) { - constexpr FLT c0[] = {7.3992041846532818E-03, 2.2998056434514028E-01, 8.5775196559356059E-01, 8.5775196559356115E-01, 2.2998056434514028E-01, 7.3992041847816166E-03}; - constexpr FLT c1[] = {2.0397684222696250E-02, 2.4277466601214742E-01, 2.6509440217151281E-01, -2.6509440217151231E-01, -2.4277466601214739E-01, -2.0397684222557694E-02}; - constexpr FLT c2[] = {2.1435449512033435E-02, 7.4190333865239946E-02, -9.5369600014193256E-02, -9.5369600014193381E-02, 7.4190333865239905E-02, 2.1435449512163876E-02}; - constexpr FLT c3[] = {1.0463664645794037E-02, -5.8671703446042224E-03, -3.4019677093840447E-02, 3.4019677093840760E-02, 5.8671703446042771E-03, -1.0463664645671082E-02}; - constexpr FLT c4[] = {1.9378826192716972E-03, -6.8365127179467735E-03, 4.7406536657957962E-03, 4.7406536657958473E-03, -6.8365127179467848E-03, 1.9378826194070377E-03}; - constexpr FLT c5[] = {-2.6471424081647417E-04, -5.6150758897069279E-04, 
2.0099203466671291E-03, -2.0099203466670359E-03, 5.6150758897070829E-04, 2.6471424094083520E-04}; - constexpr FLT c6[] = {-1.6161497824910217E-04, 2.5924418389355766E-04, -1.3917099193215483E-04, -1.3917099193211840E-04, 2.5924418389357192E-04, -1.6161497812639921E-04}; - for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); - } else if (w==7) { - constexpr FLT c0[] = {2.0163149398992283E-03, 1.0071602557045130E-01, 5.8653557849806126E-01, 1.0000000000000002E+00, 5.8653557849806159E-01, 1.0071602557045131E-01, 2.0163149399332597E-03}; - constexpr FLT c1[] = {6.1353661835569211E-03, 1.2822551681002711E-01, 3.1973557271594344E-01, -2.1212921335912596E-17, -3.1973557271594366E-01, -1.2822551681002711E-01, -6.1353661835202118E-03}; - constexpr FLT c2[] = {7.4065234100227761E-03, 5.7825030729344404E-02, 1.0889852837592919E-04, -1.3060049459923276E-01, 1.0889852837575314E-04, 5.7825030729344355E-02, 7.4065234100573725E-03}; - constexpr FLT c3[] = {4.4924606632387705E-03, 7.2245566707421303E-03, -2.7743312484355583E-02, 1.0559644416237177E-16, 2.7743312484355832E-02, -7.2245566707420826E-03, -4.4924606632061881E-03}; - constexpr FLT c4[] = {1.3572774007773842E-03, -2.3954706749181320E-03, -2.9058644824981098E-03, 7.8619155407045772E-03, -2.9058644824980807E-03, -2.3954706749181507E-03, 1.3572774008132615E-03}; - constexpr FLT c5[] = {1.1260116639581618E-04, -7.8814564904709067E-04, 1.1036556706849172E-03, -3.0492924261508591E-17, -1.1036556706849482E-03, 7.8814564904710227E-04, -1.1260116636284763E-04}; - constexpr FLT c6[] = {-4.7399003259805808E-05, 2.0950491943152726E-06, 1.7484854214667859E-04, -2.9104069274769336E-04, 1.7484854214659272E-04, 2.0950491943114936E-06, -4.7399003227280901E-05}; - constexpr FLT c7[] = {-1.2555096177146811E-05, 2.7293834771974277E-05, -2.6660039700396876E-05, 5.1878356274645480E-17, 2.6660039700612832E-05, -2.7293834771939816E-05, 1.2555096209061404E-05}; - for (int i=0; i<7; i++) 
ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==8) { - constexpr FLT c0[] = {5.2827275612461462E-04, 4.0402734444109238E-02, 3.4389230803369686E-01, 8.9161099745784866E-01, 8.9161099745784866E-01, 3.4389230803369708E-01, 4.0402734444109252E-02, 5.2827275612461408E-04}; - constexpr FLT c1[] = {1.7458301875074096E-03, 5.9145446836664541E-02, 2.5435204236257858E-01, 2.0538938722823222E-01, -2.0538938722823233E-01, -2.5435204236257858E-01, -5.9145446836664547E-02, -1.7458301875074094E-03}; - constexpr FLT c2[] = {2.3525728171808306E-03, 3.3585505340219701E-02, 4.4733940386002209E-02, -8.0668262921248624E-02, -8.0668262921248748E-02, 4.4733940386002119E-02, 3.3585505340219687E-02, 2.3525728171808311E-03}; - constexpr FLT c3[] = {1.6676293877589678E-03, 8.1606118103203940E-03, -1.0603838868224419E-02, -2.0559571166483725E-02, 2.0559571166484002E-02, 1.0603838868224510E-02, -8.1606118103203749E-03, -1.6676293877589678E-03}; - constexpr FLT c4[] = {6.5470478006265378E-04, 5.7029826102775656E-05, -4.0842122325118182E-03, 3.3746160664395084E-03, 3.3746160664396086E-03, -4.0842122325118321E-03, 5.7029826102778678E-05, 6.5470478006265432E-04}; - constexpr FLT c5[] = {1.2504911757628686E-04, -3.9351755557266000E-04, 2.3739384784447216E-05, 9.6592347103022203E-04, -9.6592347103013649E-04, -2.3739384784439440E-05, 3.9351755557266586E-04, -1.2504911757628702E-04}; - constexpr FLT c6[] = {-6.5665874015798238E-07, -6.1884865695206891E-05, 1.4476791315356577E-04, -8.6782118193344350E-05, -8.6782118193318939E-05, 1.4476791315358196E-04, -6.1884865695214169E-05, -6.5665874015806602E-07}; - constexpr FLT c7[] = {-5.1256159860509675E-06, 5.3292178505898186E-06, 8.7427989025457230E-06, -2.8404799465047339E-05, 2.8404799465135336E-05, -8.7427989024875505E-06, -5.3292178505782125E-06, 5.1256159860509675E-06}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + 
z*(c7[i]))))))); - } else if (w==9) { - constexpr FLT c0[] = {1.3409415535124456E-04, 1.5141199617983757E-02, 1.8004032483820079E-01, 6.6268423293859657E-01, 1.0000000000000004E+00, 6.6268423293859746E-01, 1.8004032483820084E-01, 1.5141199617983828E-02, 1.3409415535124450E-04}; - constexpr FLT c1[] = {4.7572953640583401E-04, 2.4761567630011042E-02, 1.6332247709293549E-01, 2.7616213278983226E-01, -4.2425842671825223E-17, -2.7616213278983237E-01, -1.6332247709293549E-01, -2.4761567630011111E-02, -4.7572953640583401E-04}; - constexpr FLT c2[] = {7.0217948741779855E-04, 1.6533012331430421E-02, 4.8637875368588490E-02, -1.5084170630533007E-02, -1.0157816246606997E-01, -1.5084170630533338E-02, 4.8637875368588449E-02, 1.6533012331430445E-02, 7.0217948741779833E-04}; - constexpr FLT c3[] = {5.6197289626769645E-04, 5.4583505067803007E-03, 8.8722695781044485E-04, -2.0386313118366230E-02, 1.4346537772579219E-16, 2.0386313118366597E-02, -8.8722695781040203E-04, -5.4583505067802999E-03, -5.6197289626769645E-04}; - constexpr FLT c4[] = {2.6358216867957524E-04, 7.0803132065997147E-04, -2.3883045659485441E-03, -1.0047843626593360E-03, 4.8455486978739078E-03, -1.0047843626590051E-03, -2.3883045659485362E-03, 7.0803132065996898E-04, 2.6358216867957530E-04}; - constexpr FLT c5[] = {7.0565721004957831E-05, -9.0876125855045856E-05, -3.5965836571493702E-04, 7.0575785995728897E-04, 5.6006957738110937E-17, -7.0575785995746006E-04, 3.5965836571493702E-04, 9.0876125855046818E-05, -7.0565721004957980E-05}; - constexpr FLT c6[] = {7.9668965137354764E-06, -4.2137454928171943E-05, 3.9856859670063718E-05, 6.5639620808911507E-05, -1.4477186949841611E-04, 6.5639620808762402E-05, 3.9856859670072629E-05, -4.2137454928186349E-05, 7.9668965137352681E-06}; - constexpr FLT c7[] = {-9.3772917893888351E-07, -3.0575635011675480E-06, 1.2977675432514170E-05, -1.5241881422267232E-05, 5.6444540850624641E-17, 1.5241881422464882E-05, -1.2977675432482811E-05, 3.0575635011824812E-06, 9.3772917893893782E-07}; - 
constexpr FLT c8[] = {-4.1446092652958961E-07, 7.2790527337844100E-07, -2.5130319764268858E-08, -1.9002349621010172E-06, 3.0493470976000790E-06, -1.9002349619116138E-06, -2.5130319761051126E-08, 7.2790527337217009E-07, -4.1446092652952507E-07}; - for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); - } else if (w==10) { - constexpr FLT c0[] = {3.3157481538170295E-05, 5.3715860775974443E-03, 8.6328042282845782E-02, 4.3077092326437988E-01, 9.1242439930731112E-01, 9.1242439930731112E-01, 4.3077092326437971E-01, 8.6328042282845754E-02, 5.3715860775974227E-03, 3.3157481538170322E-05}; - constexpr FLT c1[] = {1.2517797191066981E-04, 9.6269418565961412E-03, 9.1130577457178452E-02, 2.4769645835465362E-01, 1.6766875916810517E-01, -1.6766875916810536E-01, -2.4769645835465354E-01, -9.1130577457178424E-02, -9.6269418565961117E-03, -1.2517797191066951E-04}; - constexpr FLT c2[] = {1.9968216068682153E-04, 7.2783782301876591E-03, 3.5949398124193940E-02, 2.5847993600195553E-02, -6.9275634160640490E-02, -6.9275634160640504E-02, 2.5847993600195445E-02, 3.5949398124193913E-02, 7.2783782301876375E-03, 1.9968216068682094E-04}; - constexpr FLT c3[] = {1.7649923565147242E-04, 2.9221990881931090E-03, 4.9086823797165058E-03, -1.0940556313145914E-02, -1.3762152424114656E-02, 1.3762152424114910E-02, 1.0940556313146081E-02, -4.9086823797164919E-03, -2.9221990881930998E-03, -1.7649923565147204E-04}; - constexpr FLT c4[] = {9.4710355505531920E-05, 6.0621452710061727E-04, -7.0118560592788729E-04, -2.4750745659639179E-03, 2.4757076628501668E-03, 2.4757076628502063E-03, -2.4750745659640264E-03, -7.0118560592788274E-04, 6.0621452710061163E-04, 9.4710355505531771E-05}; - constexpr FLT c5[] = {3.1258610702677804E-05, 2.8169545035126350E-05, -2.9881406711974808E-04, 1.5956798534243302E-04, 5.3653099874326161E-04, -5.3653099874339388E-04, -1.5956798534226972E-04, 2.9881406711975192E-04, -2.8169545035121488E-05, 
-3.1258610702677743E-05}; - constexpr FLT c6[] = {5.7780052154065432E-06, -1.5636835808661990E-05, -1.6121807313036067E-05, 8.1230533420465018E-05, -5.5456530742754838E-05, -5.5456530742851827E-05, 8.1230533420445272E-05, -1.6121807313045130E-05, -1.5636835808665131E-05, 5.7780052154064593E-06}; - constexpr FLT c7[] = {2.7742147829406768E-07, -3.2550081973304980E-06, 5.9212960378031332E-06, 8.5495977199682674E-07, -1.3248468528032551E-05, 1.3248468528215217E-05, -8.5495977185729702E-07, -5.9212960377964950E-06, 3.2550081973313239E-06, -2.7742147829400097E-07}; - constexpr FLT c8[] = {-1.2089379439825852E-07, -3.4743143855784781E-08, 8.2889801006379481E-07, -1.5830293785226849E-06, 8.7461219388985494E-07, 8.7461219397529632E-07, -1.5830293786451511E-06, 8.2889801008534534E-07, -3.4743143855462353E-08, -1.2089379439833804E-07}; - constexpr FLT c9[] = {-2.5033479260872450E-08, 6.3042298326687954E-08, -5.2303271559903752E-08, -7.6226091757998386E-08, 2.3316553102767969E-07, -2.3316553111902137E-07, 7.6226091879787297E-08, 5.2303271554367896E-08, -6.3042298324957995E-08, 2.5033479260965031E-08}; - for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==11) { - constexpr FLT c0[] = {8.0191950887587638E-06, 1.8211144887695905E-03, 3.8565497751765702E-02, 2.5236459439543663E-01, 7.1517256669690443E-01, 1.0000000000000002E+00, 7.1517256669690443E-01, 2.5236459439543651E-01, 3.8565497751765723E-02, 1.8211144887695927E-03, 8.0191950887586707E-06}; - constexpr FLT c1[] = {3.1996260415636073E-05, 3.5282769389657661E-03, 4.5889527487056492E-02, 1.8012194355267480E-01, 2.4178022040260394E-01, 2.1212921335912587E-17, -2.4178022040260411E-01, -1.8012194355267488E-01, -4.5889527487056492E-02, -3.5282769389657648E-03, -3.1996260415635850E-05}; - constexpr FLT c2[] = {5.4612928019025183E-05, 2.9497743530118290E-03, 2.1858479505161201E-02, 3.8333708936616528E-02, 
-2.1641923687039297E-02, -8.3109405654057292E-02, -2.1641923687039287E-02, 3.8333708936616487E-02, 2.1858479505161187E-02, 2.9497743530118290E-03, 5.4612928019024885E-05}; - constexpr FLT c3[] = {5.2504054888010150E-05, 1.3660648269306127E-03, 4.7357572177382694E-03, -2.2373255422688926E-03, -1.5459233729560824E-02, -3.0584997651941540E-18, 1.5459233729561050E-02, 2.2373255422689746E-03, -4.7357572177382599E-03, -1.3660648269306129E-03, -5.2504054888009953E-05}; - constexpr FLT c4[] = {3.1396100602888584E-05, 3.6443237253636144E-04, 1.5906780001786821E-04, -1.9495384184342716E-03, -2.4621376046556434E-04, 3.2818730060399505E-03, -2.4621376046541547E-04, -1.9495384184342974E-03, 1.5906780001787157E-04, 3.6443237253636144E-04, 3.1396100602888483E-05}; - constexpr FLT c5[] = {1.2057435171015750E-05, 4.6687328398363315E-05, -1.3963494372747466E-04, -1.4877651674418741E-04, 4.6954815721697059E-04, 7.1576260535837041E-17, -4.6954815721696283E-04, 1.4877651674414852E-04, 1.3963494372747659E-04, -4.6687328398363071E-05, -1.2057435171015728E-05}; - constexpr FLT c6[] = {2.8888404081262488E-06, -1.8976367884800935E-06, -2.4767547607257735E-05, 3.8337725458133611E-05, 2.6462355617055980E-05, -8.2113719362939881E-05, 2.6462355617066876E-05, 3.8337725458138978E-05, -2.4767547607262269E-05, -1.8976367884805327E-06, 2.8888404081262340E-06}; - constexpr FLT c7[] = {3.5729663467786725E-07, -1.6085054296206689E-06, 4.5672370507959851E-07, 6.0608527683273524E-06, -9.0233724844644286E-06, -4.5070818825954386E-17, 9.0233724845159214E-06, -6.0608527682667218E-06, -4.5672370507254818E-07, 1.6085054296207723E-06, -3.5729663467788907E-07}; - constexpr FLT c8[] = {-7.7890073973236871E-09, -1.8340559948709468E-07, 5.4451797328971916E-07, -3.5830285713854766E-07, -7.3873233537913819E-07, 1.4648976903075259E-06, -7.3873233536710514E-07, -3.5830285713236262E-07, 5.4451797329704790E-07, -1.8340559948689703E-07, -7.7890073973081013E-09}; - constexpr FLT c9[] = {-9.8984999695252047E-09, 
1.0194946774280524E-08, 3.5279000677512062E-08, -1.1638771469313311E-07, 1.2326133617211816E-07, -2.5669371006274292E-17, -1.2326133615551060E-07, 1.1638771463500659E-07, -3.5279000676820083E-08, -1.0194946774410270E-08, 9.8984999695130418E-09}; - for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==12) { - constexpr FLT c0[] = {1.9028495068410023E-06, 5.9416527261081913E-04, 1.6248140264385581E-02, 1.3597036436097915E-01, 4.9821957378204840E-01, 9.2652305802242962E-01, 9.2652305802242962E-01, 4.9821957378204840E-01, 1.3597036436097937E-01, 1.6248140264385626E-02, 5.9416527261081924E-04, 1.9028495068454171E-06}; - constexpr FLT c1[] = {7.9801239249145923E-06, 1.2318344820958854E-03, 2.1335987794357199E-02, 1.1394981969310448E-01, 2.3520579283187484E-01, 1.4166451219687695E-01, -1.4166451219687687E-01, -2.3520579283187476E-01, -1.1394981969310460E-01, -2.1335987794357230E-02, -1.2318344820958847E-03, -7.9801239249098540E-06}; - constexpr FLT c2[] = {1.4462226804444730E-05, 1.1205076408888257E-03, 1.1698445222077612E-02, 3.3958877046121660E-02, 1.3705098421608795E-02, -6.0497400607811481E-02, -6.0497400607811579E-02, 1.3705098421608806E-02, 3.3958877046121591E-02, 1.1698445222077622E-02, 1.1205076408888255E-03, 1.4462226804449267E-05}; - constexpr FLT c3[] = {1.4953735432776090E-05, 5.8049865432805142E-04, 3.2684769908807722E-03, 2.3619245295514353E-03, -1.0074268581043095E-02, -9.8551520939611746E-03, 9.8551520939615059E-03, 1.0074268581043251E-02, -2.3619245295513252E-03, -3.2684769908807648E-03, -5.8049865432805098E-04, -1.4953735432771914E-05}; - constexpr FLT c4[] = {9.7900673700200676E-06, 1.8351475200221906E-04, 3.8725987583789238E-04, -9.2229408802588448E-04, -1.5383560041742387E-03, 1.8800996948122926E-03, 1.8800996948123033E-03, -1.5383560041742409E-03, -9.2229408802591614E-04, 3.8725987583789064E-04, 1.8351475200221903E-04, 
9.7900673700247601E-06}; - constexpr FLT c5[] = {4.2345162286123928E-06, 3.3664241555334181E-05, -3.0535096226552352E-05, -1.9795772057290591E-04, 1.7526295499606013E-04, 3.2830037656743561E-04, -3.2830037656734232E-04, -1.7526295499599014E-04, 1.9795772057292925E-04, 3.0535096226555273E-05, -3.3664241555334181E-05, -4.2345162286081255E-06}; - constexpr FLT c6[] = {1.2088615636792351E-06, 2.2204932634073669E-06, -1.5559909809157569E-05, 1.8771595438708362E-06, 4.7304527720902187E-05, -3.7055029721502823E-05, -3.7055029721506354E-05, 4.7304527720948991E-05, 1.8771595438366184E-06, -1.5559909809165219E-05, 2.2204932634074313E-06, 1.2088615636834544E-06}; - constexpr FLT c7[] = {2.1206307767331379E-07, -4.5869687934383747E-07, -1.3462277877507893E-06, 4.2970047520348418E-06, -1.1214870287581008E-06, -6.9831974682071699E-06, 6.9831974683366982E-06, 1.1214870288087690E-06, -4.2970047519748465E-06, 1.3462277877599186E-06, 4.5869687934394192E-07, -2.1206307766917122E-07}; - constexpr FLT c8[] = {1.5395324498807062E-08, -1.2022118042093087E-07, 1.5464523856613661E-07, 2.7605497716337475E-07, -8.4964626033234966E-07, 5.2067203458077506E-07, 5.2067203461734952E-07, -8.4964626032018743E-07, 2.7605497716040193E-07, 1.5464523856098652E-07, -1.2022118042095769E-07, 1.5395324502815322E-08}; - constexpr FLT c9[] = {-2.0816585198648028E-09, -6.8192670389370156E-09, 3.6338774649049193E-08, -4.9464520974759579E-08, -1.3242031035521981E-08, 1.0671664854533778E-07, -1.0671664854533778E-07, 1.3242031024450263E-08, 4.9464520977527511E-08, -3.6338774639015446E-08, 6.8192670391856967E-09, 2.0816585232951501E-09}; - constexpr FLT c10[] = {-6.3791929313390708E-10, 1.2240176132927394E-09, 5.3586930472778203E-10, -6.2807355748408205E-09, 1.0600657362033408E-08, -5.5585207892891946E-09, -5.5585208232281016E-09, 1.0600657414513137E-08, -6.2807355547288652E-09, 5.3586929184356377E-10, 1.2240176133909372E-09, -6.3791928984134277E-10}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] 
+ z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==13) { - constexpr FLT c0[] = {4.4408051211162946E-07, 1.8756193861873427E-04, 6.5146989208011716E-03, 6.8352802598867876E-02, 3.1564238810082484E-01, 7.5353649746793960E-01, 9.9999999999999956E-01, 7.5353649746793838E-01, 3.1564238810082484E-01, 6.8352802598867710E-02, 6.5146989208011707E-03, 1.8756193861873272E-04, 4.4408051211162761E-07}; - constexpr FLT c1[] = {1.9487148068106057E-06, 4.1285069961250701E-04, 9.2995630713278762E-03, 6.5021145064983563E-02, 1.8663042875530009E-01, 2.1451870821533808E-01, 1.8840858949353919E-32, -2.1451870821533794E-01, -1.8663042875529998E-01, -6.5021145064983438E-02, -9.2995630713278762E-03, -4.1285069961250425E-04, -1.9487148068106044E-06}; - constexpr FLT c2[] = {3.7267581324409626E-06, 4.0381251792508734E-04, 5.7019503038218408E-03, 2.4040868593456825E-02, 2.9406233528281710E-02, -2.4394921635639378E-02, -7.0323343245740924E-02, -2.4394921635639052E-02, 2.9406233528281724E-02, 2.4040868593456791E-02, 5.7019503038218382E-03, 4.0381251792508501E-04, 3.7267581324409626E-06}; - constexpr FLT c3[] = {4.1089519307370168E-06, 2.2941839162878727E-04, 1.8941440042457443E-03, 3.5673079836347822E-03, -3.6880489041048953E-03, -1.2074156718545214E-02, 7.1013810712957114E-17, 1.2074156718545436E-02, 3.6880489041048944E-03, -3.5673079836347674E-03, -1.8941440042457413E-03, -2.2941839162878624E-04, -4.1089519307370151E-06}; - constexpr FLT c4[] = {2.9080869014384424E-06, 8.2405696428180906E-05, 3.3386109283452779E-04, -1.7130036080580219E-04, -1.5108662980936900E-03, 7.8665018928679242E-05, 2.3686576883603073E-03, 7.8665018928764622E-05, -1.5108662980936485E-03, -1.7130036080580737E-04, 3.3386109283452861E-04, 8.2405696428180703E-05, 2.9080869014384429E-06}; - constexpr FLT c5[] = {1.3873038503072801E-06, 1.8694798962849948E-05, 1.4885937076477316E-05, -1.3109520271106624E-04, -4.6797213058790025E-05, 3.2555441892430825E-04, 
6.5502537691746230E-17, -3.2555441892416048E-04, 4.6797213058875582E-05, 1.3109520271106819E-04, -1.4885937076477316E-05, -1.8694798962849962E-05, -1.3873038503072801E-06}; - constexpr FLT c6[] = {4.5216719173889445E-07, 2.3203195635245624E-06, -6.0547210914038460E-06, -1.2111482379340961E-05, 3.0238388566383385E-05, 1.0632529352081665E-05, -5.0954659549722746E-05, 1.0632529352250802E-05, 3.0238388566313227E-05, -1.2111482379347288E-05, -6.0547210914040671E-06, 2.3203195635247352E-06, 4.5216719173889350E-07}; - constexpr FLT c7[] = {9.7956192761412821E-08, 9.2080334896449358E-09, -1.2031586234326618E-06, 1.3860784486076025E-06, 2.8079238803293383E-06, -5.6034103145907796E-06, 1.6113788341939994E-17, 5.6034103146040687E-06, -2.8079238803054550E-06, -1.3860784485997179E-06, 1.2031586234342167E-06, -9.2080334898128650E-09, -9.7956192761411458E-08}; - constexpr FLT c8[] = {1.2350515865275843E-08, -4.7668301905167552E-08, -3.2637845350597966E-08, 3.2101904613347501E-07, -3.3650826994957826E-07, -3.1117289066304045E-07, 7.8771611535813792E-07, -3.1117289069990237E-07, -3.3650826984246136E-07, 3.2101904612282309E-07, -3.2637845349600439E-08, -4.7668301904853071E-08, 1.2350515865276535E-08}; - constexpr FLT c9[] = {2.7912946705592266E-10, -6.8584366111657433E-09, 1.5876438439662156E-08, 2.2894800381734934E-09, -5.4355139631893104E-08, 6.9215572156100812E-08, 1.6320619156148685E-17, -6.9215572241906639E-08, 5.4355139637428967E-08, -2.2894800215659153E-09, -1.5876438439575659E-08, 6.8584366109657170E-09, -2.7912946705524691E-10}; - constexpr FLT c10[] = {-1.9473100882503891E-10, -6.0076128424585684E-11, 1.8131864354130518E-09, -3.9994904462490394E-09, 2.0334605597831887E-09, 5.0274131974512103E-09, -9.3367591026663196E-09, 5.0274136044049357E-09, 2.0334605333861501E-09, -3.9994904745315308E-09, 1.8131864358844393E-09, -6.0076128154532669E-11, -1.9473100882561411E-10}; - constexpr FLT c11[] = {-2.9813639427701670E-11, 8.8416967305832406E-11, -6.1944900155883343E-11, 
-2.3424446318938161E-10, 6.6123632509207570E-10, -6.5395825305270265E-10, -7.6394712006965382E-17, 6.5395802534269801E-10, -6.6123633886256970E-10, 2.3424448263843040E-10, 6.1944899055662456E-11, -8.8416967554269098E-11, 2.9813639428048382E-11}; - for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); - } else if (w==14) { - constexpr FLT c0[] = {1.0213002307223062E-07, 5.7528591418445639E-05, 2.5031206020280088E-03, 3.2405046511689233E-02, 1.8485678142025513E-01, 5.5177865704975304E-01, 9.3670793123951734E-01, 9.3670793123951712E-01, 5.5177865704975315E-01, 1.8485678142025547E-01, 3.2405046511689239E-02, 2.5031206020280179E-03, 5.7528591418445801E-05, 1.0213002307242253E-07}; - constexpr FLT c1[] = {4.6718564624239767E-07, 1.3360375098030156E-04, 3.8410346178215306E-03, 3.4207779106833425E-02, 1.2923501383683489E-01, 2.2132894130184291E-01, 1.2264779624530273E-01, -1.2264779624530257E-01, -2.2132894130184308E-01, -1.2923501383683503E-01, -3.4207779106833425E-02, -3.8410346178215393E-03, -1.3360375098030178E-04, -4.6718564624220264E-07}; - constexpr FLT c2[] = {9.3810713124204527E-07, 1.3926941499858519E-04, 2.5833386162539013E-03, 1.4797516242328850E-02, 3.0361769467151970E-02, 5.7261067343619262E-03, -5.3608938764866873E-02, -5.3608938764866894E-02, 5.7261067343618603E-03, 3.0361769467151870E-02, 1.4797516242328836E-02, 2.5833386162539061E-03, 1.3926941499858543E-04, 9.3810713124224814E-07}; - constexpr FLT c3[] = {1.0954436997682021E-06, 8.5568590196649221E-05, 9.7778250562911601E-04, 3.0692948752812804E-03, 6.0463237460738756E-04, -8.9532302111318181E-03, -7.4040784665309846E-03, 7.4040784665312838E-03, 8.9532302111319968E-03, -6.0463237460737487E-04, -3.0692948752812708E-03, -9.7778250562911818E-04, -8.5568590196649329E-05, -1.0954436997680333E-06}; - constexpr FLT c4[] = {8.3014334976692641E-07, 3.4045323043173900E-05, 
2.1660980714121239E-04, 1.7421792587401689E-04, -9.2118064021561887E-04, -9.7597008655075522E-04, 1.4714477548413631E-03, 1.4714477548414121E-03, -9.7597008655073809E-04, -9.2118064021559762E-04, 1.7421792587402266E-04, 2.1660980714121363E-04, 3.4045323043173968E-05, 8.3014334976713224E-07}; - constexpr FLT c5[] = {4.3045614796951587E-07, 8.9716871724550274E-06, 2.3377513570381849E-05, -5.5213296993546423E-05, -1.2391624765752083E-04, 1.5869855385555775E-04, 2.1530382494154427E-04, -2.1530382494144317E-04, -1.5869855385557331E-04, 1.2391624765755973E-04, 5.5213296993542533E-05, -2.3377513570381968E-05, -8.9716871724550325E-06, -4.3045614796933747E-07}; - constexpr FLT c6[] = {1.5611302559652642E-07, 1.4859455506706785E-06, -8.5826557923722616E-07, -1.1616353402592630E-05, 8.0333594878995593E-06, 2.8616079443375728E-05, -2.5816776957707699E-05, -2.5816776957707652E-05, 2.8616079443268301E-05, 8.0333594878977314E-06, -1.1616353402591744E-05, -8.5826557923811989E-07, 1.4859455506706314E-06, 1.5611302559670737E-07}; - constexpr FLT c7[] = {3.9336515129721532E-08, 1.1257285216182540E-07, -6.2406181937560562E-07, -2.6873173855233150E-07, 2.8292088258393860E-06, -1.4598715516905790E-06, -4.0212462690723253E-06, 4.0212462691823422E-06, 1.4598715517761175E-06, -2.8292088259133913E-06, 2.6873173855647969E-07, 6.2406181937648769E-07, -1.1257285216174059E-07, -3.9336515129545720E-08}; - constexpr FLT c8[] = {6.5041263396088790E-09, -9.9149367808853263E-09, -6.6845758889620994E-08, 1.6286641992901855E-07, 5.8507874943424797E-08, -4.7688540978638226E-07, 3.2559878511421460E-07, 3.2559878519979701E-07, -4.7688540972525423E-07, 5.8507875026096430E-08, 1.6286641993325022E-07, -6.6845758889870313E-08, -9.9149367809131923E-09, 6.5041263397795280E-09}; - constexpr FLT c9[] = {5.5138523621090170E-10, -3.4792607432658830E-09, 2.1621109687111844E-09, 1.6802313210571416E-08, -3.4440501484206901E-08, 3.6408051867813727E-09, 5.4274262350067578E-08, -5.4274262322388281E-08, 
-3.6408052006210212E-09, 3.4440501481438969E-08, -1.6802313213339344E-08, -2.1621109679759532E-09, 3.4792607432902108E-09, -5.5138523606396516E-10}; - constexpr FLT c10[] = {-2.3785683828448576E-11, -2.9453404124114860E-10, 1.0997757897423152E-09, -8.6020468987368310E-10, -2.2974592934948612E-09, 5.5064437603692059E-09, -3.1470905819229834E-09, -3.1470905272434506E-09, 5.5064436867561607E-09, -2.2974592840673907E-09, -8.6020468484567061E-10, 1.0997757884067548E-09, -2.9453404129270796E-10, -2.3785683688822786E-11}; - constexpr FLT c11[] = {-1.2240623323339709E-11, 1.4269095096874458E-11, 6.3689195980296716E-11, -2.3523039255622989E-10, 2.6546832331592691E-10, 9.4137182189250380E-11, -5.6473803777133577E-10, 5.6473799518218520E-10, -9.4137157913436917E-11, -2.6546835890448598E-10, 2.3523039312408576E-10, -6.3689194329967738E-11, -1.4269094997055950E-11, 1.2240623457297303E-11}; - constexpr FLT c12[] = {-1.4791529085565623E-12, 4.8147158180813514E-12, -7.1247159181258048E-12, -3.7363568005007135E-12, 3.0923958877552072E-11, -4.7998366007614543E-11, 2.4268802632733111E-11, 2.4268880217882715E-11, -4.7998325173324774E-11, 3.0923998690985708E-11, -3.7363589698227313E-12, -7.1247171622956968E-12, 4.8147157313484649E-12, -1.4791527915262285E-12}; - for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==15) { - constexpr FLT c0[] = {2.3183302143948793E-08, 1.7202745817468655E-05, 9.2668857465754784E-04, 1.4607490553401936E-02, 1.0130044556641116E-01, 3.7041488405244677E-01, 7.8279781886019206E-01, 1.0000000000000018E+00, 7.8279781886019228E-01, 3.7041488405244727E-01, 1.0130044556641139E-01, 1.4607490553401959E-02, 9.2668857465754882E-04, 1.7202745817468652E-05, 2.3183302143948763E-08}; - constexpr FLT c1[] = {1.1019919454791572E-07, 4.1938159428224126E-05, 1.5154850601194973E-03, 1.6839357628952684E-02, 
8.0835952724673255E-02, 1.8739074372244105E-01, 1.9255567517255739E-01, -9.4204294746769593E-32, -1.9255567517255723E-01, -1.8739074372244108E-01, -8.0835952724673352E-02, -1.6839357628952709E-02, -1.5154850601194973E-03, -4.1938159428224126E-05, -1.1019919454791572E-07}; - constexpr FLT c2[] = {2.3137327105312791E-07, 4.6266060425611204E-05, 1.1028009511991974E-03, 8.2352859806754802E-03, 2.4233386066663413E-02, 2.2182889945939449E-02, -2.5327411650384993E-02, -6.0946897479642256E-02, -2.5327411650385129E-02, 2.2182889945939359E-02, 2.4233386066663424E-02, 8.2352859806754854E-03, 1.1028009511991970E-03, 4.6266060425611204E-05, 2.3137327105312783E-07}; - constexpr FLT c3[] = {2.8457821671573274E-07, 3.0427184404092299E-05, 4.6337319534911844E-04, 2.1072304367244932E-03, 2.4342755210407531E-03, -4.2814200474568563E-03, -9.6703299158782657E-03, 1.8176153030403361E-16, 9.6703299158783507E-03, 4.2814200474569379E-03, -2.4342755210407076E-03, -2.1072304367244859E-03, -4.6337319534911817E-04, -3.0427184404092296E-05, -2.8457821671573279E-07}; - constexpr FLT c4[] = {2.2919642176438702E-07, 1.3183839322480003E-05, 1.2030953406839325E-04, 2.4905754342428421E-04, -3.4193403196993951E-04, -1.1551611179404738E-03, 2.1954335627567210E-04, 1.7895433812201793E-03, 2.1954335627571010E-04, -1.1551611179404326E-03, -3.4193403196995387E-04, 2.4905754342428610E-04, 1.2030953406839360E-04, 1.3183839322480008E-05, 2.2919642176438720E-07}; - constexpr FLT c5[] = {1.2779800356186583E-07, 3.8997040140349313E-06, 1.8264189394307498E-05, -8.3632912035128204E-06, -1.0687544349164653E-04, 2.2123224044726536E-06, 2.3404180714514772E-04, 6.5064979845545577E-17, -2.3404180714503106E-04, -2.2123224042782134E-06, 1.0687544349166598E-04, 8.3632912035006689E-06, -1.8264189394307559E-05, -3.8997040140349338E-06, -1.2779800356186589E-07}; - constexpr FLT c6[] = {5.0693377499403691E-08, 7.7594237801400426E-07, 9.4933483676717755E-07, -6.6987818302423087E-06, -4.5889941143373546E-06, 
2.2647907184667538E-05, 3.7412856035449417E-06, -3.3754692339426772E-05, 3.7412856034892404E-06, 2.2647907184654951E-05, -4.5889941143014083E-06, -6.6987818302351157E-06, 9.4933483676684456E-07, 7.7594237801399991E-07, 5.0693377499403691E-08}; - constexpr FLT c7[] = {1.4373673262756881E-08, 9.2554419735729795E-08, -2.0417866965615742E-07, -6.8820764686271727E-07, 1.4165168644096691E-06, 1.2531774951198972E-06, -3.6383191328570317E-06, 5.9333697238861927E-17, 3.6383191329076855E-06, -1.2531774952992520E-06, -1.4165168643945163E-06, 6.8820764685908223E-07, 2.0417866965620961E-07, -9.2554419735731158E-08, -1.4373673262756913E-08}; - constexpr FLT c8[] = {2.8405432421064598E-09, 2.6648052024128211E-09, -4.5328290134778586E-08, 3.2089634828694367E-08, 1.7241593348808383E-07, -2.5816631656161770E-07, -1.3664009513726493E-07, 4.6017883216168089E-07, -1.3664009510064915E-07, -2.5816631656773852E-07, 1.7241593343152281E-07, 3.2089634835965337E-08, -4.5328290134523662E-08, 2.6648052024185691E-09, 2.8405432421065198E-09}; - constexpr FLT c9[] = {3.5447644664522991E-10, -1.1390658479562114E-09, -2.4324028601311552E-09, 1.2152005527725076E-08, -7.1102518341828894E-09, -2.5878341862165437E-08, 4.0855407178225425E-08, -6.7229636689436406E-18, -4.0855407139474409E-08, 2.5878341989490202E-08, 7.1102518840056246E-09, -1.2152005535163887E-08, 2.4324028601311552E-09, 1.1390658479600971E-09, -3.5447644664517713E-10}; - constexpr FLT c10[] = {1.6106092880607926E-11, -1.9612809866225313E-10, 3.3667881388500915E-10, 5.4740705815843633E-10, -2.3219918220819429E-09, 1.8783264389538617E-09, 2.1531915835821252E-09, -4.8374637778167195E-09, 2.1531915732119103E-09, 1.8783264455530896E-09, -2.3219918255386980E-09, 5.4740706350069505E-10, 3.3667881394392907E-10, -1.9612809866164026E-10, 1.6106092880601619E-11}; - constexpr FLT c11[] = {-2.9809392328002639E-12, -8.3268200084267327E-12, 5.7687950483526562E-11, -9.1929198156856840E-11, -3.9289938224686938E-11, 3.0713724621937891E-10, 
-3.5332675603861928E-10, -4.7176615708722248E-17, 3.5332675632254561E-10, -3.0713734445835836E-10, 3.9289964949381516E-11, 9.1929194004414145E-11, -5.7687950660981567E-11, 8.3268199995541140E-12, 2.9809392327699276E-12}; - constexpr FLT c12[] = {-6.7275763613050405E-13, 1.4037883809519618E-12, 1.0122748224833392E-12, -1.0507010409950668E-11, 1.9186635811522471E-11, -7.9758147674463026E-12, -2.2999207389706864E-11, 4.0853090072343795E-11, -2.2999199222849929E-11, -7.9758923525966314E-12, 1.9186574560087790E-11, -1.0507007219772089E-11, 1.0122747905815843E-12, 1.4037883779612130E-12, -6.7275763610714771E-13}; - for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==16) { - constexpr FLT c0[] = {5.2012152104084075E-09, 5.0291159580938685E-06, 3.3201112337137914E-04, 6.3015433246683345E-03, 5.2427915343763419E-02, 2.3104762006593382E-01, 5.9521037322997228E-01, 9.4441119081353919E-01, 9.4441119081353897E-01, 5.9521037322997228E-01, 2.3104762006593382E-01, 5.2427915343763426E-02, 6.3015433246683362E-03, 3.3201112337137925E-04, 5.0291159580938685E-06, 5.2012152104083968E-09}; - constexpr FLT c1[] = {2.5620581163903698E-08, 1.2815874111792785E-05, 5.7471335914300648E-04, 7.8386860177525539E-03, 4.6638901641906975E-02, 1.3897554029141568E-01, 2.0773808644544139E-01, 1.0813440420918323E-01, -1.0813440420918335E-01, -2.0773808644544151E-01, -1.3897554029141571E-01, -4.6638901641906962E-02, -7.8386860177525539E-03, -5.7471335914300648E-04, -1.2815874111792780E-05, -2.5620581163903678E-08}; - constexpr FLT c2[] = {5.6049296769722407E-08, 1.4879146623074265E-05, 4.4787865139353408E-04, 4.2383440773521713E-03, 1.6624620601556200E-02, 2.6395394769117682E-02, 3.6740117889108559E-04, -4.8088574473126838E-02, -4.8088574473126817E-02, 3.6740117889110039E-04, 2.6395394769117647E-02, 1.6624620601556183E-02, 4.2383440773521705E-03, 
4.4787865139353381E-04, 1.4879146623074262E-05, 5.6049296769722367E-08}; - constexpr FLT c3[] = {7.2283166867263369E-08, 1.0391634193778174E-05, 2.0529674430143886E-04, 1.2618687081127949E-03, 2.6256301814801142E-03, -5.5040645592548403E-04, -7.8709464111364428E-03, -5.7657980103485666E-03, 5.7657980103488684E-03, 7.8709464111365764E-03, 5.5040645592556046E-04, -2.6256301814800891E-03, -1.2618687081127923E-03, -2.0529674430143870E-04, -1.0391634193778174E-05, -7.2283166867263382E-08}; - constexpr FLT c4[] = {6.1501023800531295E-08, 4.8443034242391149E-06, 6.0167136036954489E-05, 2.0573318254801955E-04, 1.2811955521425743E-05, -8.3782209201439741E-04, -6.2669687707126603E-04, 1.1809008871739588E-03, 1.1809008871740102E-03, -6.2669687707129801E-04, -8.3782209201439957E-04, 1.2811955521424802E-05, 2.0573318254801969E-04, 6.0167136036954442E-05, 4.8443034242391132E-06, 6.1501023800531308E-08}; - constexpr FLT c5[] = {3.6571939291734573E-08, 1.5742222553115388E-06, 1.1217451065775747E-05, 1.0668471374318139E-05, -6.0694020243058218E-05, -7.4268888177597524E-05, 1.3567546096387106E-04, 1.4875477215044619E-04, -1.4875477215041898E-04, -1.3567546096383994E-04, 7.4268888177628640E-05, 6.0694020243062108E-05, -1.0668471374318139E-05, -1.1217451065775808E-05, -1.5742222553115373E-06, -3.6571939291734560E-08}; - constexpr FLT c6[] = {1.5672684443241293E-08, 3.5812571134853537E-07, 1.1292168823203332E-06, -2.5215449854185100E-06, -7.6275609266365118E-06, 9.3973092319789718E-06, 1.7891569285072030E-05, -1.8642776809419116E-05, -1.8642776809435267E-05, 1.7891569285119396E-05, 9.3973092319861496E-06, -7.6275609266374249E-06, -2.5215449854180577E-06, 1.1292168823202796E-06, 3.5812571134853394E-07, 1.5672684443241266E-08}; - constexpr FLT c7[] = {4.8970459380161511E-09, 5.4304148291621772E-08, -1.0066736763205116E-08, -5.3239387743771190E-07, 2.2987809872388434E-07, 1.8048974519458305E-06, -1.3449315565530231E-06, -2.4760016203656832E-06, 2.4760016205558345E-06, 
1.3449315566530894E-06, -1.8048974519264694E-06, -2.2987809871496018E-07, 5.3239387743957950E-07, 1.0066736763205477E-08, -5.4304148291620039E-08, -4.8970459380161527E-09}; - constexpr FLT c8[] = {1.1055703983904693E-09, 4.3691209554215673E-09, -2.0201061499499309E-08, -2.3275033898522544E-08, 1.2633562932172848E-07, -2.2021804055583841E-08, -2.7912172397333448E-07, 2.1280289571270167E-07, 2.1280289561471954E-07, -2.7912172398563377E-07, -2.2021804043311624E-08, 1.2633562932175524E-07, -2.3275033897953490E-08, -2.0201061499405642E-08, 4.3691209554208717E-09, 1.1055703983904937E-09}; - constexpr FLT c9[] = {1.7210848751142109E-10, -1.3819378018358974E-10, -2.4707116696395418E-09, 4.6626394240840718E-09, 6.2513494821407377E-09, -2.2225751663756647E-08, 7.2716681831167356E-09, 2.9914504875425248E-08, -2.9914504880961111E-08, -7.2716681858846656E-09, 2.2225751666524578E-08, -6.2513494807567727E-09, -4.6626394246030589E-09, 2.4707116695638564E-09, 1.3819378018734865E-10, -1.7210848751139469E-10}; - constexpr FLT c10[] = {1.5548426850891040E-11, -8.2967690037353030E-11, -2.0776280196441915E-11, 6.5818716237227360E-10, -9.7473365318544434E-10, -7.2114132190269774E-10, 2.9974008768194548E-09, -1.8729406654385533E-09, -1.8729407980520035E-09, 2.9974009543459026E-09, -7.2114130179071973E-10, -9.7473365601368880E-10, 6.5818716417921449E-10, -2.0776280166982969E-11, -8.2967690036279040E-11, 1.5548426850876794E-11}; - constexpr FLT c11[] = {1.7715918253734007E-14, -8.7094275492396390E-12, 2.5402078548167017E-11, 5.6643084712743339E-13, -1.1273398069226705E-10, 1.7831197627554656E-10, 2.2124056737037060E-13, -2.7985821416111004E-10, 2.7985826569398559E-10, -2.2122821651802181E-13, -1.7831199885666961E-10, 1.1273397622040666E-10, -5.6643203607501166E-13, -2.5402078628021660E-11, 8.7094275492396907E-12, -1.7715918256992908E-14}; - constexpr FLT c12[] = {-2.1496737418348056E-13, -2.2214973543773537E-14, 2.3291735079229971E-12, -5.9732922869516132E-12, 3.0556730493177866E-12, 
1.1858129781605648E-11, -2.4316397039401376E-11, 1.3235569405286772E-11, 1.3235463236132106E-11, -2.4316413373117597E-11, 1.1858131823320733E-11, 3.0556730493176707E-12, -5.9732919041302971E-12, 2.3291735916652542E-12, -2.2214974665309464E-14, -2.1496737416109420E-13}; - constexpr FLT c13[] = {-2.3198933254093550E-14, 8.4680085604099498E-14, -5.5120431569756550E-14, -3.4224865085091971E-13, 1.0093479536840142E-12, -9.9670676529397927E-13, -4.1953479545762892E-13, 2.1120282165025634E-12, -2.1120647150379602E-12, 4.1949829692223215E-13, 9.9668454879417257E-13, -1.0093487471304360E-12, 3.4224795658530073E-13, 5.5120400575755698E-14, -8.4680084102827573E-14, 2.3198933260903755E-14}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); - } else - printf("width not implemented!\n"); diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 7fd098925..d5009de41 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -56,7 +56,7 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) // if spreading/FT careful, shouldn't need this if, but causes no speed hit return 0.0; else - return exp(T(opts.ES_beta) * sqrt(T(1.0) - T(opts.ES_c) * x * x)); + return exp((T)opts.ES_beta * (sqrt((T)1.0 - (T)opts.ES_c * x * x) - (T)1.0)); } template @@ -71,7 +71,9 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ { - return abs(x) < ns / 2.0 ? exp(es_beta * (sqrt(1.0 - es_c * x * x))) : 0.0; + return abs(x) < ns / T(2.0) + ? 
exp((T)es_beta * (sqrt((T)1.0 - (T)es_c * x * x) - (T)1.0)) + : 0.0; } template @@ -82,23 +84,17 @@ static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, cons This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ { -#ifdef __CUDA_ARCH__ - __builtin_assume(w >= 2); - if constexpr (std::is_same_v) { - __builtin_assume(w <= 7); - } - if constexpr (std::is_same_v) { - __builtin_assume(w <= 16); - } -#endif const auto z = fma(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1] // T z = 2 * x + w - 1.0; // insert the auto-generated code which expects z, w args, writes to ker... if (upsampfac == 2.0) { // floating point equality is fine here - using FLT = T; - using CUFINUFFT_FLT = T; + using FLT = T; #include "cufinufft/contrib/ker_horner_allw_loop.inc" } + if (upsampfac == 1.25) { // floating point equality is fine here + using FLT = T; +#include "cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc" + } } template From ae783da138028538738616332675c0da73b5bb1c Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 24 Jul 2024 17:31:22 -0400 Subject: [PATCH 21/68] picked good defaults for method --- CMakeLists.txt | 4 ++-- examples/CMakeLists.txt | 4 ---- include/cufinufft/impl.h | 29 ++++++++++++++--------------- perftest/cuda/bench.py | 10 ++++++---- src/cuda/3d/spread3d_wrapper.cu | 1 + src/cuda/common.cu | 15 ++++++--------- 6 files changed, 29 insertions(+), 34 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 93a34f2af..3c9b84f3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -271,7 +271,7 @@ if (FINUFFT_USE_CUDA) enable_language(CUDA) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) - if (BUILD_TESTING OR FINUFFT_BUILD_TESTS) + if (FINUFFT_BUILD_TESTS) add_subdirectory(perftest/cuda) add_subdirectory(test/cuda) endif () @@ -280,7 +280,7 @@ if (FINUFFT_USE_CUDA) endif () # Add tests defined in their 
own directory -if (FINUFFT_USE_CPU AND (BUILD_TESTING OR FINUFFT_BUILD_TESTS)) +if (FINUFFT_USE_CPU AND FINUFFT_BUILD_TESTS) add_subdirectory(test) add_subdirectory(perftest) endif () diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index af6f067bc..8b5afa4f5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,7 +21,3 @@ if(FINUFFT_USE_OPENMP) enable_asan(${EXAMPLE}) endforeach() endif() - -if (FINUFFT_USE_CUDA) - add_subdirectory(cuda) -endif() diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 4a1c6ae31..7d63df51e 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -144,24 +144,23 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran * For type 2, we always default to method 1 (GM). */ // query the device for the amount of shared memory available - int shared_mem_per_block{}; - cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, - device_id); - RETURN_IF_CUDA_ERROR - // compute the amount of shared memory required for the method - const auto shared_mem_required = - shared_memory_required(dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, - d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); - printf("Shared memory available: %d KB, required: %d KB\n", shared_mem_per_block, - shared_mem_required); - if ((shared_mem_required > shared_mem_per_block)) { + if (dim == 3 && std::is_same_v) { d_plan->opts.gpu_method = 1; - printf("choosing method 1\n"); } else { - d_plan->opts.gpu_method = 2; - printf("choosing method 2\n"); + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + RETURN_IF_CUDA_ERROR + // compute the amount of shared memory required for the method + const auto shared_mem_required = shared_memory_required( + dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + if 
((shared_mem_required > shared_mem_per_block)) { + d_plan->opts.gpu_method = 1; + } else { + d_plan->opts.gpu_method = 2; + } } - printf("using method %d\n", d_plan->opts.gpu_method); } int fftsign = (iflag >= 0) ? 1 : -1; diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 8a9e757a3..aa21acd52 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -37,7 +37,7 @@ def build_args(args): # example command to run: # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments -args = {"--prec": "f", +args = {"--prec": "d", "--n_runs": "5", "--method": "0", "--sort": "1", @@ -71,8 +71,10 @@ def build_args(args): if stderr != '': print(stderr) exit(0) -for i in range(1, 7): - args["--tol"] = "1E-" + str(i) +max_range = 8 if args["--prec"] == "d" else 7 + +for i in range(1, max_range): + args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) print("Running with tol = 1E-" + str(i)) for method in ['2', '1']: args["--method"] = method @@ -180,4 +182,4 @@ def build_args(args): plt.savefig("bench.png") plt.savefig("bench.svg") plt.savefig("bench.pdf") -plt.show() \ No newline at end of file +plt.show() diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index bf78ed905..4fb2b073d 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -280,6 +280,7 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; + blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; ghost_bin_pts_index<<>>( diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 64c5639dc..ea54a4c77 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -256,11 +256,15 @@ void cufinufft_setup_binsize(int type, 
int ns, int dim, cufinufft_opts *opts) { if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); } + // use half of the available shared memory if double precision + if constexpr (std::is_same_v) { + shared_mem_per_block /= 2; + } const int bin_size = shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; - // find the power of 2 that is less than bin_size - // this makes the bin_size use the maximum shared memory available + opts->gpu_binsizex = bin_size; + opts->gpu_binsizex = 1024; const auto shared_mem_required = shared_memory_required( dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); // printf("binsizex: %d, shared_mem_required %ld (bytes)\n", @@ -310,13 +314,6 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { opts->gpu_binsizex = 16; opts->gpu_binsizey = 16; opts->gpu_binsizez = 2; - // const auto shared_mem_required = shared_memory_required( - // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, - // opts->gpu_binsizez); - // printf( - // "binsizex: %d, binsizey: %d, binsizez: %d shared_mem_required %ld - // (bytes)\n", opts->gpu_binsizex, opts->gpu_binsizey, - // opts->gpu_binsizez, shared_mem_required); } } break; case 4: { From d29fcf517d930ba9a99a24b7d245f01461d6b1d6 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 24 Jul 2024 19:12:35 -0400 Subject: [PATCH 22/68] update configuration --- perftest/cuda/bench.py | 4 ++-- src/cuda/common.cu | 8 +------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index aa21acd52..118c04d3b 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -44,7 +44,7 @@ def build_args(args): # "--N1": "16777216", "--N1": "256", "--N2": "256", - "--N3": "256", + # "--N3": "256", "--kerevalmethod": "1", "--M": "1E8", "--tol": "1E-6"} @@ -71,7 +71,7 @@ def build_args(args): if stderr != '': print(stderr) exit(0) -max_range = 8 if 
args["--prec"] == "d" else 7 +max_range = 16 if args["--prec"] == "d" else 7 for i in range(1, max_range): args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) diff --git a/src/cuda/common.cu b/src/cuda/common.cu index ea54a4c77..8499aea8a 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -263,13 +263,7 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { const int bin_size = shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; - opts->gpu_binsizex = bin_size; - opts->gpu_binsizex = 1024; - const auto shared_mem_required = shared_memory_required( - dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); - // printf("binsizex: %d, shared_mem_required %ld (bytes)\n", - // opts->gpu_binsizex, - // shared_mem_required); + opts->gpu_binsizex = bin_size; } opts->gpu_binsizey = 1; opts->gpu_binsizez = 1; From 73f937b0afcf9a66313bf2607d3086504d5061e7 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 25 Jul 2024 10:22:59 -0400 Subject: [PATCH 23/68] upated build system --- CMakeLists.txt | 17 ++-- perftest/cuda/CMakeLists.txt | 6 ++ perftest/cuda/bench.py | 185 +++++++++++++++++++++++++++++++++++ perftest/cuda/bench.sh | 13 +++ src/cuda/CMakeLists.txt | 39 ++++++-- 5 files changed, 244 insertions(+), 16 deletions(-) create mode 100644 perftest/cuda/bench.py create mode 100644 perftest/cuda/bench.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ca851dfe..3c9b84f3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.23) project(FINUFFT VERSION 2.2.0 LANGUAGES C CXX) @@ -46,7 +46,7 @@ if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS AND NOT DEFINED FINUFFT_ARC endif () set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. 
OpenMP, Threads etc.)") set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library") - +set(FINUFFT_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)") # All options go here # sphinx tag (don't remove): @cmake_opts_start option(FINUFFT_BUILD_EXAMPLES "Whether to build the FINUFFT examples" OFF) @@ -271,25 +271,26 @@ if (FINUFFT_USE_CUDA) enable_language(CUDA) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) - if (BUILD_TESTING AND FINUFFT_BUILD_TESTS) + if (FINUFFT_BUILD_TESTS) add_subdirectory(perftest/cuda) + add_subdirectory(test/cuda) endif () list(APPEND INSTALL_TARGETS cufinufft) endif () # Add tests defined in their own directory -if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CPU) +if (FINUFFT_USE_CPU AND FINUFFT_BUILD_TESTS) add_subdirectory(test) add_subdirectory(perftest) endif () -if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CUDA) - add_subdirectory(test/cuda) +if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CPU) + add_subdirectory(examples) endif () -if (FINUFFT_BUILD_EXAMPLES) - add_subdirectory(examples) +if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_GPU) + add_subdirectory(examples/cuda) endif () if (FINUFFT_BUILD_FORTRAN) diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index 9d817d5f6..8f8a8a20b 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -1,3 +1,9 @@ add_executable(cuperftest cuperftest.cu) target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cuperftest PUBLIC cufinufft) +set_target_properties(cuperftest PROPERTIES + LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} +) + +#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py new file mode 100644 index 000000000..1d52ff884 --- /dev/null +++ b/perftest/cuda/bench.py @@ -0,0 +1,185 @@ +import 
matplotlib.pyplot as plt +import os +import subprocess +import pandas as pd +import numpy as np +import io +cwd = os.getcwd() + + +# function that runs a command line command and returns the output +# it also takes a list of arguments to pass to the command +def run_command(command, args): + # convert command and args to a string + try: + cmd = [command] + args + print("Running command:", ' '.join(cmd)) + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + return result.stdout, result.stderr + except subprocess.CalledProcessError as e: + print('stdout output:\n', e.stdout) + print('stderr output:\n', e.stderr) + print("Error executing command:", e) + + +# function that builds a string from a dictionary of arguments + +def build_args(args): + args_list = [] + for key, value in args.items(): + args_list.append(key) + args_list.append(value) + return args_list + + +# function + +# example command to run: +# nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 +# example arguments +args = {"--prec": "f", + "--n_runs": "5", + "--method": "0", + "--sort": "1", + "--N1": "16777216", + # "--N1": "256", + # "--N2": "256", + # "--N3": "256", + "--kerevalmethod": "1", + "--M": "1E8", + "--tol": "1E-6"} +# iterate over tol from 1E-6 to 1E-1 +data = { + 'method': [], + 'throughput': [], + 'tolerance': [], + # 'setpts': [], + 'exec': [], +} +warmup = {"--prec": "f", + "--n_runs": "1", + "--method": "0", + "--N1": "256", + "--N2": "256", + # "--N3": "256", + "--M": "256", + "--tol": "1E-1"} +cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(warmup) +print("Warmup") +stdout, stderr = run_command("nsys", cmd) +print("Benchmarking") +if stderr != '': + print(stderr) + exit(0) +max_range = 16 if args["--prec"] == "d" else 7 + +for i in range(1, max_range): + args["--tol"] = "1E-" + ("0" if i < 10 else "") 
+ str(i) + print("Running with tol = 1E-" + str(i)) + for method in ['2', '1']: + args["--method"] = method + if method == '0': + data['method'].append('auto') + elif method == '1': + data['method'].append('GM') + elif method == '2': + data['method'].append('SM') + elif method == '4': + data['method'].append('BLOCK') + print("Method " + data['method'][-1]) + cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) + stdout, stderr = run_command("nsys", cmd) + if stderr != '': + print(stderr) + exit(0) + # skip all lines starting with # in stdout + conf = [x for x in stdout.splitlines() if x.startswith("#")] + print('\n'.join(conf)) + stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] + if stdout[0].startswith("bin"): + print(stdout[0]) + stdout = stdout[1:] + + stdout = '\n'.join(stdout) + # convert stdout to a dataframe from csv string + dt = pd.read_csv(io.StringIO(stdout), sep=',') + setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value + exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value + print(f'setpts pts/s: {setpts}') + print(f'exec pts/s: {exec}') + cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", + "--format=csv", "--output", "cuperftest"] + stdout, _ = run_command("nsys", cmd) + # remove format from cmd + cmd = cmd[:-3] + # print(run_command("nsys", cmd)) + # print(csv) + dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") + # print(dt) + # sum the "Total Time" column of the ones that contain "fft" in name + # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]) + total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum() + print(f'total_fft: {total_fft}') + # drop all the rows with 
spread not in "Name" + dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] + # print(dt) + # exit(0) + # sort dt by column "Time (%)" + total_spread = dt['Duration (ns)'].sum() - total_fft + print(f'total_spread: {total_spread}') + if total_fft > total_spread: + print("Warning: total_fft > total_spread") + # exit(0) + # pt/s + throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread + print(f'throughput: {throughput}') + data['throughput'].append(throughput) + data['tolerance'].append(args['--tol']) + # data['setpts'].append(setpts) + data['exec'].append(exec) + + +df = pd.DataFrame(data) +# Pivot the DataFrame +pivot_df = df.pivot(index='tolerance', columns='method') +# print(pivot_df) +# scale the throughput SM by GM +# pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] +# pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM'] +# scale setpts SM by GM +# pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] +# pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM'] +# remove the GM column +# pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) +pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) +pivot_df.drop(('exec', 'SM'), axis=1, inplace=True) +print(pivot_df) +# Plot +pivot_df.plot(kind='bar', figsize=(10, 7)) +# Find the minimum throughput value +min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min()) +max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max()) +print(min_val, max_val) +plt.ylim(min_val * .90, max_val * 1.1) +# plt.ylim(.8, 1.2) + +# Calculate the smallest power of 10 +# min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) + +# Adjust the plot's y-axis limits +# plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009) # Adding 10% for upper margin + +# plot an horizontal line at 1 with label "GM" +# plt.axhline(y=1, color='k', linestyle='--', label='GM') +plt.xlabel('Tolerance') +plt.ylabel('Throughput') 
+plt.title('Throughput by Tolerance and Method') +plt.legend(title='Method') +plt.tight_layout() +plt.show() +plt.xlabel("Tolerance") +plt.ylabel("Points/s") +plt.savefig("bench.png") +plt.savefig("bench.svg") +plt.savefig("bench.pdf") +plt.show() diff --git a/perftest/cuda/bench.sh b/perftest/cuda/bench.sh new file mode 100644 index 000000000..9832e1088 --- /dev/null +++ b/perftest/cuda/bench.sh @@ -0,0 +1,13 @@ +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e1 --N3 1e1 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e1 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4 +#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e4 --N2 1e4 --N3 1e4 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e7 --N2 1e7 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e8 --M 2e6 --method 0 --tol 1e-10 diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index c9f13344d..751ccfc6c 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -1,8 +1,3 @@ - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp) set(PRECISION_DEPENDENT_SRC @@ -22,13 
+17,34 @@ set(CUFINUFFT_INCLUDE_DIRS ) set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE) +# flush denormals to zero and enable verbose PTXAS output +set(FINUFFT_CUDA_FLAGS + -ftz=true -fmad=true -restrict -Xptxas=-v --extra-device-vectorization -res-usage + -Wdouble-promotion -lineinfo --extended-lambda --expt-relaxed-constexpr +) + add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC}) target_include_directories(cufinufft_common_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) -set_property(TARGET cufinufft_common_objects PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties( + cufinufft_common_objects PROPERTIES + POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON +) + +target_compile_options(cufinufft_common_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) +target_compile_features(cufinufft_common_objects PRIVATE cxx_std_17) add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC}) target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) -set_property(TARGET cufinufft_objects PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties( + cufinufft_objects PROPERTIES + POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON +) +target_compile_features(cufinufft_objects PRIVATE cxx_std_17) +target_compile_options(cufinufft_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) if (FINUFFT_SHARED_LINKING) add_library(cufinufft SHARED @@ -56,5 +72,12 @@ else () target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static CUDA::nvToolsExt) endif () +target_compile_options(cufinufft PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) file(GLOB CUFINUFFT_PUBLIC_HEADERS "${CMAKE_SOURCE_DIR}/include/cufinufft*.h") -set_target_properties(cufinufft PROPERTIES PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}") +set_target_properties( + cufinufft PROPERTIES + 
PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}" + POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON +) From 07248668e3884d7beae4a60034323aaf1087d6bd Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 25 Jul 2024 15:16:55 -0400 Subject: [PATCH 24/68] fixing jenkins --- include/cufinufft/utils.h | 23 +++++ perftest/cuda/bench.py | 10 +- src/cuda/1d/spreadinterp1d.cuh | 58 ++++++------ src/cuda/2d/spreadinterp2d.cuh | 42 ++++++--- src/cuda/3d/spreadinterp3d.cuh | 161 +++++++++++++++++++-------------- 5 files changed, 185 insertions(+), 109 deletions(-) diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index b0a77aec7..29645f9f9 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -81,6 +81,29 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { return int2{xstart, xend}; } #endif + +// Define a macro to check if NVCC version is >= 11.3 +#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) +#if (__CUDACC_VER_MAJOR__ > 11) || \ + (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600) +#define ALLOCA_SUPPORTED 1 +#else +#define ALLOCA_SUPPORTED 0 +#endif +#else +#define ALLOCA_SUPPORTED 0 +#endif + +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ >= 900 +#define COMPUTE_CAPABILITY_90_OR_HIGHER 1 +#else +#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 +#endif +#else +#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 +#endif + } // namespace utils } // namespace cufinufft diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 118c04d3b..1d52ff884 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -37,13 +37,13 @@ def build_args(args): # example command to run: # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments -args = {"--prec": "d", +args = {"--prec": "f", "--n_runs": "5", 
"--method": "0", "--sort": "1", - # "--N1": "16777216", - "--N1": "256", - "--N2": "256", + "--N1": "16777216", + # "--N1": "256", + # "--N2": "256", # "--N3": "256", "--kerevalmethod": "1", "--M": "1E8", @@ -60,7 +60,7 @@ def build_args(args): "--n_runs": "1", "--method": "0", "--N1": "256", - # "--N2": "256", + "--N2": "256", # "--N3": "256", "--M": "256", "--tol": "1E-1"} diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index b6c511555..c7d84a9b8 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -23,7 +23,12 @@ __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { // dynamic stack allocation to reduce stack usage - auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; +#else + T ker1[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { @@ -37,8 +42,8 @@ __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, eval_kernel_vec(ker1, x1, ns, es_c, es_beta); for (auto xx = xstart; xx <= xend; xx++) { - auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - T kervalue = ker1[xx - xstart]; + auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); + const T kervalue = ker1[xx - xstart]; atomicAdd(&fw[ix].x, cnow.x * kervalue); atomicAdd(&fw[ix].y, cnow.y * kervalue); } @@ -84,11 +89,6 @@ __global__ void calc_inverse_of_global_sort_idx_1d( } } -template -__forceinline__ __device__ cuda_complex mul(const cuda_complex &a, const T b) { - return {a.x * b, a.y * b}; -} - template __global__ void spread_1d_subprob( const T *x, const cuda_complex *c, cuda_complex *fw, int M, uint8_t ns, int nf1, @@ -96,9 +96,8 @@ __global__ void spread_1d_subprob( int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, int maxsubprobsize, int nbinx, int *idxnupts) { extern __shared__ char sharedbuf[]; - alignas(256) auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; + auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; - int ix; const int subpidx = blockIdx.x; const int bidx = subprob_to_bin[subpidx]; const int binsubp_idx = subpidx - subprobstartpts[bidx]; @@ -109,7 +108,12 @@ __global__ void spread_1d_subprob( const int N = bin_size_x + 2 * ns_2; // dynamic stack allocation - auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; +#else + T ker1[MAX_NSPREAD]; +#endif for (int i = threadIdx.x; i < N; i += blockDim.x) { fwshared[i] = {0, 0}; @@ -130,9 +134,10 @@ __global__ void spread_1d_subprob( else eval_kernel_vec(ker1, x1, ns, es_c, es_beta); for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ns_2; + const auto ix = xx + ns_2; if (ix >= (bin_size_x + ns_2) || ix < 0) break; - const auto result = mul(cnow, ker1[xx - xstart]); + const cuda_complex result{cnow.x * ker1[xx - xstart], + cnow.y * ker1[xx - xstart]}; atomicAdd(&fwshared[ix].x, result.x); atomicAdd(&fwshared[ix].y, result.y); } @@ -140,7 +145,7 @@ __global__ void spread_1d_subprob( __syncthreads(); /* write to global memory */ for (int k = threadIdx.x; k < N; k += blockDim.x) { - ix = xoffset - ns_2 + k; 
+ auto ix = xoffset - ns_2 + k; if (ix < (nf1 + ns_2)) { ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); atomicAdd(&fw[ix].x, fwshared[k].x); @@ -155,31 +160,32 @@ template __global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { + // dynamic stack allocation +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; +#else T ker1[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto [xstart, xend] = interval(ns, x_rescaled); - int xstart = ceil(x_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; + cuda_complex cnow{0, 0}; - T x1 = (T)xstart - x_rescaled; + const T x1 = (T)xstart - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - T kervalue1 = ker1[xx - xstart]; + int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); + const T kervalue1 = ker1[xx - xstart]; cnow.x += fw[ix].x * kervalue1; cnow.y += fw[ix].y * kervalue1; } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + c[idxnupts[i]] = cnow; } } diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 62a430ca5..e8a69f303 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -19,9 +19,14 @@ template __global__ void spread_2d_nupts_driven( const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -130,9 +135,14 @@ __global__ void spread_2d_subprob( const auto rounded_ns = ns_2 * 2; const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif for (int i = threadIdx.x; i < N; i += blockDim.x) { fwshared[i] = {0, 0}; @@ -202,9 +212,14 @@ template __global__ void interp_2d_nupts_driven( const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i 
+= blockDim.x * gridDim.x) { @@ -236,8 +251,7 @@ __global__ void interp_2d_nupts_driven( cnow.y += fw[inidx].y * kervalue1 * kervalue2; } } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + c[idxnupts[i]] = cnow; } } @@ -252,9 +266,14 @@ __global__ void interp_2d_subprob( extern __shared__ char sharedbuf[]; cuda_complex *fwshared = (cuda_complex *)sharedbuf; - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif const auto subpidx = blockIdx.x; const auto bidx = subprob_to_bin[subpidx]; @@ -276,12 +295,11 @@ __global__ void interp_2d_subprob( auto ix = xoffset - ns_2 + i; auto iy = yoffset - ns_2 + j; if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - const auto outidx = ix + int(iy * nf1); - const auto sharedidx = i + j * (bin_size_x + rounded_ns); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); + const auto outidx = ix + iy * nf1; + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + fwshared[sharedidx] = fw[outidx]; } } __syncthreads(); diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index dc722ddc3..19eae72a4 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -81,11 +82,16 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, const int *idxnupts) { +#if ALLOCA_SUPPORTED auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; auto *__restrict__ ker3 = ker + ns + ns; - +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -160,10 +166,16 @@ __global__ void spread_3d_subprob( fwshared[i] = {0, 0}; } __syncthreads(); +#if ALLOCA_SUPPORTED auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; auto *__restrict__ ker3 = ker + ns + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif for (int i = threadIdx.x; i < nupts; i += blockDim.x) { const int nuptsidx = idxnupts[ptstart + i]; @@ -309,85 +321,93 @@ __global__ void spread_3d_block_gather( int nobinx, int nobiny, int nobinz, const int *idxnupts) { extern __shared__ char sharedbuf[]; cuda_complex *fwshared = (cuda_complex *)sharedbuf; + const int subpidx = blockIdx.x; + const int obidx = subprob_to_bin[subpidx]; + const int bidx = obidx * binsperobin; - int xstart, ystart, zstart, xend, yend, zend; - int xstartnew, ystartnew, zstartnew, xendnew, yendnew, zendnew; - int subpidx = blockIdx.x; - int 
obidx = subprob_to_bin[subpidx]; - int bidx = obidx * binsperobin; - - int obinsubp_idx = subpidx - subprobstartpts[obidx]; - int ix, iy, iz; - int outidx; - int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - - obinsubp_idx * maxsubprobsize); + const int obinsubp_idx = subpidx - subprobstartpts[obidx]; + const int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; + const int nupts = + min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - + obinsubp_idx * maxsubprobsize); - int xoffset = (obidx % nobinx) * obin_size_x; - int yoffset = (obidx / nobinx) % nobiny * obin_size_y; - int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z; + const int xoffset = (obidx % nobinx) * obin_size_x; + const int yoffset = (obidx / nobinx) % nobiny * obin_size_y; + const int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z; - int N = obin_size_x * obin_size_y * obin_size_z; + const int N = obin_size_x * obin_size_y * obin_size_z; +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; +#else T ker1[MAX_NSPREAD]; T ker2[MAX_NSPREAD]; T ker3[MAX_NSPREAD]; - +#endif for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; + fwshared[i] = {0, 0}; } + __syncthreads(); - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; for (int i = threadIdx.x; i < nupts; i += blockDim.x) { int nidx = idxnupts[ptstart + i]; int b = nidx / M; int box[3]; - for (int d = 0; d < 3; d++) { - box[d] = b % 3; - if (box[d] == 1) box[d] = -1; - if (box[d] == 2) box[d] = 1; + for (int &d : box) { + d = b % 3; + if (d == 1) d = -1; + if (d == 2) d = 1; b = b / 3; } - int ii = nidx % M; - x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1; - y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2; - z_rescaled 
= fold_rescale(z[ii], nf3) + box[2] * nf3; - cnow = c[ii]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - zend = floor(z_rescaled + ns / 2.0) - zoffset; + const int ii = nidx % M; + const auto x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1; + const auto y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2; + const auto z_rescaled = fold_rescale(z[ii], nf3) + box[2] * nf3; + const auto cnow = c[ii]; + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + auto [zstart, zend] = interval(ns, z_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + const T z1 = T(zstart) - z_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + zstart -= zoffset; + + xend -= xoffset; + yend -= yoffset; + zend -= zoffset; if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); } else { - eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); } - xstartnew = xstart < 0 ? 0 : xstart; - ystartnew = ystart < 0 ? 0 : ystart; - zstartnew = zstart < 0 ? 0 : zstart; - xendnew = xend >= obin_size_x ? obin_size_x - 1 : xend; - yendnew = yend >= obin_size_y ? 
obin_size_y - 1 : yend; - zendnew = zend >= obin_size_z ? obin_size_z - 1 : zend; + const auto xstartnew = xstart < 0 ? 0 : xstart; + const auto ystartnew = ystart < 0 ? 0 : ystart; + const auto zstartnew = zstart < 0 ? 0 : zstart; + const auto xendnew = xend >= obin_size_x ? obin_size_x - 1 : xend; + const auto yendnew = yend >= obin_size_y ? obin_size_y - 1 : yend; + const auto zendnew = zend >= obin_size_z ? obin_size_z - 1 : zend; for (int zz = zstartnew; zz <= zendnew; zz++) { - T kervalue3 = ker3[zz - zstart]; + const T kervalue3 = ker3[zz - zstart]; for (int yy = ystartnew; yy <= yendnew; yy++) { - T kervalue2 = ker2[yy - ystart]; + const T kervalue2 = ker2[yy - ystart]; for (int xx = xstartnew; xx <= xendnew; xx++) { - outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; - T kervalue1 = ker1[xx - xstart]; + const auto outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; + const T kervalue1 = ker1[xx - xstart]; atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); } @@ -401,10 +421,10 @@ __global__ void spread_3d_block_gather( int j = (n / obin_size_x) % obin_size_y; int k = n / (obin_size_x * obin_size_y); - ix = xoffset + i; - iy = yoffset + j; - iz = zoffset + k; - outidx = ix + iy * nf1 + iz * nf1 * nf2; + const auto ix = xoffset + i; + const auto iy = yoffset + j; + const auto iz = zoffset + k; + const auto outidx = ix + iy * nf1 + iz * nf1 * nf2; atomicAdd(&fw[outidx].x, fwshared[n].x); atomicAdd(&fw[outidx].y, fwshared[n].y); } @@ -416,10 +436,16 @@ template __global__ void interp_3d_nupts_driven( const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, int *idxnupts) { - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + 
ns; auto *__restrict__ ker3 = ker + ns + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -461,8 +487,7 @@ __global__ void interp_3d_nupts_driven( } } } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + c[idxnupts[i]] = cnow; } } @@ -478,10 +503,16 @@ __global__ void interp_3d_subprob( extern __shared__ char sharedbuf[]; auto fwshared = (cuda_complex *)sharedbuf; - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; auto *__restrict__ ker3 = ker + ns + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif const auto subpidx = blockIdx.x; const auto bidx = subprob_to_bin[subpidx]; @@ -514,8 +545,7 @@ __global__ void interp_3d_subprob( const auto outidx = ix + iy * nf1 + iz * nf1 * nf2; int sharedidx = i + j * (bin_size_x + rounded_ns) + k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; + fwshared[sharedidx] = fw[outidx]; } } __syncthreads(); @@ -569,8 +599,7 @@ __global__ void interp_3d_subprob( } } } - c[idxnupts[idx]].x = cnow.x; - c[idxnupts[idx]].y = cnow.y; + c[idxnupts[idx]] = cnow; } } From 8cd50fc3eb4946179f865b60997922ff38207152 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 25 Jul 2024 15:18:37 -0400 Subject: [PATCH 25/68] using cuda 11.2 --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index f042f7749..e5e76cf06 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,7 +9,7 @@ pipeline { stage('main') { agent { dockerfile { - filename 'tools/cufinufft/docker/cuda12.0/Dockerfile-x86_64' + filename 'tools/cufinufft/docker/cuda11.2/Dockerfile-x86_64' args 
'--gpus 2' label 'v100' } From 49a9d7eed481953cfa54cbae34b0ab5fb2052237 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 25 Jul 2024 17:15:25 -0400 Subject: [PATCH 26/68] using sm90 atomics --- include/cufinufft/spreadinterp.h | 8 +- include/cufinufft/utils.h | 22 ++++ perftest/cuda/CMakeLists.txt | 5 +- perftest/cuda/bench.py | 172 +++++++++++++++++-------------- src/cuda/1d/spreadinterp1d.cuh | 6 +- src/cuda/2d/spreadinterp2d.cuh | 14 ++- src/cuda/3d/spreadinterp3d.cuh | 19 ++-- src/cuda/CMakeLists.txt | 18 ++-- test/cuda/CMakeLists.txt | 3 + 9 files changed, 151 insertions(+), 116 deletions(-) diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index d5009de41..3866233a4 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -38,11 +38,11 @@ static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T return __fmaf_rn(a, b, c); } else if constexpr (std::is_same_v) { return __fma_rn(a, b, c); - } else { - static_assert(std::is_same_v || std::is_same_v, - "Only float and double are supported."); } -} + static_assert(std::is_same_v || std::is_same_v, + "Only float and double are supported."); + return T{0}; +}; template static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 29645f9f9..f556da8d6 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -12,6 +12,9 @@ #include +#include +#include + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) #else __inline__ __device__ double atomicAdd(double *address, double val) { @@ -104,6 +107,25 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { #define COMPUTE_CAPABILITY_90_OR_HIGHER 0 #endif +template +static __forceinline__ __device__ void atomicAddComplexShared( + cuda_complex *address, cuda_complex res) { + const auto raw_address = reinterpret_cast(address); + 
atomicAdd(raw_address, res.x); + atomicAdd(raw_address + 1, res.y); +} + +template +static __forceinline__ __device__ void atomicAddComplexGlobal( + cuda_complex *address, cuda_complex res) { + if constexpr ( + std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { + atomicAdd(address, res); + } else { + atomicAddComplexShared(address, res); + } +} + } // namespace utils } // namespace cufinufft diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index 8f8a8a20b..04412d4e8 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -4,6 +4,7 @@ target_link_libraries(cuperftest PUBLIC cufinufft) set_target_properties(cuperftest PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) - -#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) +target_compile_features(cuperftest PRIVATE cxx_std_17) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 1d52ff884..a7fa5e6f2 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -71,89 +71,101 @@ def build_args(args): if stderr != '': print(stderr) exit(0) -max_range = 16 if args["--prec"] == "d" else 7 +for precision in ['f', 'd']: + for dim in range(1, 4): + if dim == 1: + args["--N1"] = "16777216" + if dim == 2: + args["--N1"] = "256" + args["--N2"] = "256" + if dim == 3: + args["--N1"] = "256" + args["--N2"] = "256" + args["--N3"] = "256" + args["--prec"] = precision + max_range = 16 if args["--prec"] == "d" else 7 + for i in range(1, max_range): + args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) + print("Running with tol = 1E-" + str(i)) + for method in ['2', '1']: + args["--method"] = method + if method == '0': + data['method'].append('auto') + elif method == '1': + data['method'].append('GM') + elif method == '2': + data['method'].append('SM') + elif method == '4': + data['method'].append('BLOCK') + print("Method " + 
data['method'][-1]) + cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) + stdout, stderr = run_command("nsys", cmd) + if stderr != '': + print(stderr) + exit(0) + # skip all lines starting with # in stdout + conf = [x for x in stdout.splitlines() if x.startswith("#")] + print('\n'.join(conf)) + stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] + if stdout[0].startswith("bin"): + print(stdout[0]) + stdout = stdout[1:] -for i in range(1, max_range): - args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) - print("Running with tol = 1E-" + str(i)) - for method in ['2', '1']: - args["--method"] = method - if method == '0': - data['method'].append('auto') - elif method == '1': - data['method'].append('GM') - elif method == '2': - data['method'].append('SM') - elif method == '4': - data['method'].append('BLOCK') - print("Method " + data['method'][-1]) - cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) - stdout, stderr = run_command("nsys", cmd) - if stderr != '': - print(stderr) - exit(0) - # skip all lines starting with # in stdout - conf = [x for x in stdout.splitlines() if x.startswith("#")] - print('\n'.join(conf)) - stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] - if stdout[0].startswith("bin"): - print(stdout[0]) - stdout = stdout[1:] + stdout = '\n'.join(stdout) + # convert stdout to a dataframe from csv string + dt = pd.read_csv(io.StringIO(stdout), sep=',') + setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value + exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value + print(f'setpts pts/s: {setpts}') + print(f'exec pts/s: {exec}') + cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", 
"cuperftest_profile.nsys-rep", + "--format=csv", "--output", "cuperftest"] + stdout, _ = run_command("nsys", cmd) + # remove format from cmd + cmd = cmd[:-3] + # print(run_command("nsys", cmd)) + # print(csv) + dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") + # print(dt) + # sum the "Total Time" column of the ones that contain "fft" in name + # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]) + total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum() + print(f'total_fft: {total_fft}') + # drop all the rows with spread not in "Name" + dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] + # print(dt) + # exit(0) + # sort dt by column "Time (%)" + total_spread = dt['Duration (ns)'].sum() - total_fft + print(f'total_spread: {total_spread}') + if total_fft > total_spread: + print("Warning: total_fft > total_spread") + # exit(0) + # pt/s + throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread + print(f'throughput: {throughput}') + data['throughput'].append(throughput) + data['tolerance'].append(args['--tol']) + # data['setpts'].append(setpts) + data['exec'].append(exec) - stdout = '\n'.join(stdout) - # convert stdout to a dataframe from csv string - dt = pd.read_csv(io.StringIO(stdout), sep=',') - setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value - exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value - print(f'setpts pts/s: {setpts}') - print(f'exec pts/s: {exec}') - cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", - "--format=csv", "--output", "cuperftest"] - stdout, _ = run_command("nsys", cmd) - # remove format from cmd - cmd = cmd[:-3] - # print(run_command("nsys", cmd)) - # print(csv) - dt = 
pd.read_csv("./cuperftest_cuda_gpu_trace.csv") - # print(dt) - # sum the "Total Time" column of the ones that contain "fft" in name - # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]) - total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum() - print(f'total_fft: {total_fft}') - # drop all the rows with spread not in "Name" - dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] - # print(dt) - # exit(0) - # sort dt by column "Time (%)" - total_spread = dt['Duration (ns)'].sum() - total_fft - print(f'total_spread: {total_spread}') - if total_fft > total_spread: - print("Warning: total_fft > total_spread") - # exit(0) - # pt/s - throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread - print(f'throughput: {throughput}') - data['throughput'].append(throughput) - data['tolerance'].append(args['--tol']) - # data['setpts'].append(setpts) - data['exec'].append(exec) - -df = pd.DataFrame(data) -# Pivot the DataFrame -pivot_df = df.pivot(index='tolerance', columns='method') -# print(pivot_df) -# scale the throughput SM by GM -# pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] -# pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM'] -# scale setpts SM by GM -# pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] -# pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM'] -# remove the GM column -# pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) -pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) -pivot_df.drop(('exec', 'SM'), axis=1, inplace=True) -print(pivot_df) + df = pd.DataFrame(data) + # Pivot the DataFrame + pivot_df = df.pivot(index='tolerance', columns='method') + # print(pivot_df) + # scale the throughput SM by GM + # pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] + # pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM'] + # scale setpts SM by GM + # pivot_df['exec', 'SM'] /= pivot_df['exec', 
'GM'] + # pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM'] + # remove the GM column + # pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) + pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) + pivot_df.drop(('exec', 'SM'), axis=1, inplace=True) + print(pivot_df) +exit(0) # Plot pivot_df.plot(kind='bar', figsize=(10, 7)) # Find the minimum throughput value diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index c7d84a9b8..56493ef73 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -138,8 +138,7 @@ __global__ void spread_1d_subprob( if (ix >= (bin_size_x + ns_2) || ix < 0) break; const cuda_complex result{cnow.x * ker1[xx - xstart], cnow.y * ker1[xx - xstart]}; - atomicAdd(&fwshared[ix].x, result.x); - atomicAdd(&fwshared[ix].y, result.y); + atomicAddComplexShared(fwshared + ix, result); } } __syncthreads(); @@ -148,8 +147,7 @@ __global__ void spread_1d_subprob( auto ix = xoffset - ns_2 + k; if (ix < (nf1 + ns_2)) { ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? 
ix - nf1 : ix); - atomicAdd(&fw[ix].x, fwshared[k].x); - atomicAdd(&fw[ix].y, fwshared[k].y); + atomicAddComplexGlobal(fw + ix, fwshared[k]); } } } diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index e8a69f303..03da3ed8a 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -53,8 +53,9 @@ __global__ void spread_2d_nupts_driven( const auto outidx = ix + iy * nf1; const auto kervalue1 = ker1[xx - xstart]; const auto kervalue2 = ker2[yy - ystart]; - atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2); + const cuda_complex res{cnow.x * kervalue1 * kervalue2, + cnow.y * kervalue1 * kervalue2}; + atomicAddComplexGlobal(fw + outidx, res); } } } @@ -180,10 +181,8 @@ __global__ void spread_2d_subprob( if (ix >= (bin_size_x + rounded_ns) || ix < 0) break; const auto outidx = ix + iy * (bin_size_x + rounded_ns); const auto kervalue = ker1[xx - xstart] * ker2[yy - ystart]; - const auto resx = cnow.x * kervalue; - const auto resy = cnow.y * kervalue; - atomicAdd(&fwshared[outidx].x, resx); - atomicAdd(&fwshared[outidx].y, resy); + const cuda_complex res{cnow.x * kervalue, cnow.y * kervalue}; + atomicAddComplexShared(fwshared + outidx, res); } } } @@ -200,8 +199,7 @@ __global__ void spread_2d_subprob( iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); const auto outidx = ix + iy * nf1; const auto sharedidx = i + j * (bin_size_x + rounded_ns); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); + atomicAddComplexGlobal(fw + outidx, fwshared[sharedidx]); } } } diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index 19eae72a4..59b4661ff 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -127,8 +127,9 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const int outidx = ix + iy * nf1 + iz * nf1 * nf2; const auto ker1val = ker1[xx - xstart]; const auto kervalue = ker1val * ker2val * ker3val; - atomicAdd(&fw[outidx].x, c[idxnupts[i]].x * kervalue); - atomicAdd(&fw[outidx].y, c[idxnupts[i]].y * kervalue); + const cuda_complex res{c[idxnupts[i]].x * kervalue, + c[idxnupts[i]].y * kervalue}; + atomicAddComplexGlobal(fw + outidx, res); } } } @@ -223,10 +224,8 @@ __global__ void spread_3d_subprob( const int outidx = ix + iy * (bin_size_x + rounded_ns) + iz * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); const auto kervalue = ker1[xx - xstart] * kervalue2 * kervalue3; - const auto resx = cnow.x * kervalue; - const auto resy = cnow.y * kervalue; - atomicAdd(&fwshared[outidx].x, resx); - atomicAdd(&fwshared[outidx].y, resy); + const cuda_complex res{cnow.x * kervalue, cnow.y * kervalue}; + atomicAddComplexShared(fwshared + outidx, res); } } } @@ -250,8 +249,7 @@ __global__ void spread_3d_subprob( const int outidx = ix + iy * nf1 + iz * nf1 * nf2; const int sharedidx = i + j * (bin_size_x + rounded_ns) + k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); + atomicAddComplexGlobal(fw + outidx, fwshared[sharedidx]); } } } @@ -408,8 +406,9 @@ __global__ void spread_3d_block_gather( for (int xx = xstartnew; xx <= xendnew; xx++) { const auto outidx = xx + yy * 
obin_size_x + zz * obin_size_y * obin_size_x; const T kervalue1 = ker1[xx - xstart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); + const cuda_complex res{cnow.x * kervalue1 * kervalue2 * kervalue3, + cnow.y * kervalue1 * kervalue2 * kervalue3}; + atomicAddComplexShared(fwshared + outidx, res); } } } diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 751ccfc6c..69eb0597c 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -30,6 +30,8 @@ set_target_properties( POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) target_compile_options(cufinufft_common_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) @@ -42,6 +44,8 @@ set_target_properties( POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) target_compile_features(cufinufft_objects PRIVATE cxx_std_17) target_compile_options(cufinufft_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) @@ -51,20 +55,16 @@ if (FINUFFT_SHARED_LINKING) $ $ ) - set_target_properties( - cufinufft PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - ) else () add_library(cufinufft STATIC $ $ ) - set_target_properties( - cufinufft PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - ) endif () +set_target_properties( + cufinufft PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" +) if (WIN32) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft CUDA::nvToolsExt) @@ -80,4 +80,6 @@ set_target_properties( POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) diff --git a/test/cuda/CMakeLists.txt 
b/test/cuda/CMakeLists.txt index 8d77d9fdc..6555d4f64 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -7,9 +7,12 @@ foreach(srcfile ${test_src}) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(${executable} PUBLIC cufinufft m) + target_compile_features(${executable} PRIVATE cxx_std_17) set_target_properties(${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) message(STATUS "Adding test ${executable}" " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" From 041a536819945eb606771743b6ac6ab4ba95b6a0 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 25 Jul 2024 17:41:35 -0400 Subject: [PATCH 27/68] updated script --- perftest/cuda/bench.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index a7fa5e6f2..d01a67668 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -49,13 +49,7 @@ def build_args(args): "--M": "1E8", "--tol": "1E-6"} # iterate over tol from 1E-6 to 1E-1 -data = { - 'method': [], - 'throughput': [], - 'tolerance': [], - # 'setpts': [], - 'exec': [], -} + warmup = {"--prec": "f", "--n_runs": "1", "--method": "0", @@ -71,7 +65,8 @@ def build_args(args): if stderr != '': print(stderr) exit(0) -for precision in ['f', 'd']: +for precision in ['d']: + print(f"precision: {precision}") for dim in range(1, 4): if dim == 1: args["--N1"] = "16777216" @@ -84,6 +79,16 @@ def build_args(args): args["--N3"] = "256" args["--prec"] = precision max_range = 16 if args["--prec"] == "d" else 7 + if precision == 'd' and dim == 3: + max_range = 6 + print(f"dimensions {dim}") + data = { + 'method': [], + 'throughput': [], + 'tolerance': [], + # 'setpts': [], + 'exec': [], + } for i in range(1, max_range): args["--tol"] = "1E-" + ("0" if i < 10 else "") 
+ str(i) print("Running with tol = 1E-" + str(i)) @@ -116,8 +121,8 @@ def build_args(args): dt = pd.read_csv(io.StringIO(stdout), sep=',') setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value - print(f'setpts pts/s: {setpts}') - print(f'exec pts/s: {exec}') + # print(f'setpts pts/s: {setpts}') + # print(f'exec pts/s: {exec}') cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", "--format=csv", "--output", "cuperftest"] stdout, _ = run_command("nsys", cmd) @@ -130,14 +135,14 @@ def build_args(args): # sum the "Total Time" column of the ones that contain "fft" in name # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]) total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum() - print(f'total_fft: {total_fft}') + # print(f'total_fft: {total_fft}') # drop all the rows with spread not in "Name" dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] # print(dt) # exit(0) # sort dt by column "Time (%)" total_spread = dt['Duration (ns)'].sum() - total_fft - print(f'total_spread: {total_spread}') + # print(f'total_spread: {total_spread}') if total_fft > total_spread: print("Warning: total_fft > total_spread") # exit(0) @@ -148,8 +153,6 @@ def build_args(args): data['tolerance'].append(args['--tol']) # data['setpts'].append(setpts) data['exec'].append(exec) - - df = pd.DataFrame(data) # Pivot the DataFrame pivot_df = df.pivot(index='tolerance', columns='method') From 54683c3c6b2c49fc25a3a3b00c88a39bef2b0263 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 26 Jul 2024 11:36:27 -0400 Subject: [PATCH 28/68] fixed bin sizes --- include/cufinufft/impl.h | 4 ++-- perftest/cuda/bench.py | 4 ++-- src/cuda/common.cu | 7 
++----- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 7d63df51e..3d6e99b35 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -143,10 +143,10 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran * * For type 2, we always default to method 1 (GM). */ - // query the device for the amount of shared memory available - if (dim == 3 && std::is_same_v) { + if (d_plan->type == 2) { d_plan->opts.gpu_method = 1; } else { + // query the device for the amount of shared memory available int shared_mem_per_block{}; cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index d01a67668..c22c2af9f 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -54,7 +54,7 @@ def build_args(args): "--n_runs": "1", "--method": "0", "--N1": "256", - "--N2": "256", + # "--N2": "256", # "--N3": "256", "--M": "256", "--tol": "1E-1"} @@ -67,7 +67,7 @@ def build_args(args): exit(0) for precision in ['d']: print(f"precision: {precision}") - for dim in range(1, 4): + for dim in range(1, 2): if dim == 1: args["--N1"] = "16777216" if dim == 2: diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 8499aea8a..eba170a24 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -256,13 +256,10 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); } - // use half of the available shared memory if double precision - if constexpr (std::is_same_v) { - shared_mem_per_block /= 2; - } + // use 1/6 of the shared memory for the binsize + shared_mem_per_block /= 6; const int bin_size = shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; - opts->gpu_binsizex = bin_size; } opts->gpu_binsizey = 1; From 
dc3a62877cd39b1c0d71778a50a6074df2373c6b Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 26 Jul 2024 12:01:43 -0400 Subject: [PATCH 29/68] using floor in fold_rescale updated changelog --- CHANGELOG | 9 +++++++ include/cufinufft/spreadinterp.h | 44 +++++++++++++++----------------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 000e03b6f..ba024e07f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -38,6 +38,15 @@ V 2.3.0beta (7/21/24) any 32-bit integers to 64-bit when calling cufinufft(f)_setpts. Note that internally, 32-bit integers are still used, so calling cufinufft with more than 2e9 points will fail. This restriction may be lifted in the future. +* cuFINUFFT binsize is now a function of the shared memory available where + possible. +* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort. +* cuFINUFFT using the new normalized Horner coefficients and added support + for 1.25. +* cuFINUFFT new compile flags for extra-vectorization, flushing single + precision denormals to 0 and using fma where possible. 
+* cuFINUFFT using intrinsics in foldrescale and other places to increase + performance V 2.2.0 (12/12/23) diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 3866233a4..0ab7aba9a 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -10,40 +10,38 @@ namespace cufinufft { namespace spreadinterp { template -constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) { - constexpr const auto x2pi = T(0.159154943091895345554011992339482617); - constexpr const auto half = T(0.5); +static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) { + if constexpr (std::is_same_v) { + return __fmaf_rn(a, b, c); + } else if constexpr (std::is_same_v) { + return __fma_rn(a, b, c); + } + static_assert(std::is_same_v || std::is_same_v, + "Only float and double are supported."); + return T{0}; +} + +template +constexpr __forceinline__ __host__ __device__ T fold_rescale(const T x, const V N) { + constexpr auto x2pi = T(0.159154943091895345554011992339482617); + constexpr auto half = T(0.5); #if defined(__CUDA_ARCH__) if constexpr (std::is_same_v) { - auto result = __fmaf_rn(x, x2pi, half); - result = __fsub_rd(result, truncf(result)); - return __fmul_rd(result, static_cast(N)); + const auto result = fma(x, x2pi, half); + return (result - floorf(result)) * static_cast(N); } else if constexpr (std::is_same_v) { - auto result = __fma_rn(x, x2pi, half); - result = __dsub_rd(result, trunc(result)); - return __dmul_rd(result, static_cast(N)); + const auto result = fma(x, x2pi, half); + return (result - floor(result)) * static_cast(N); } else { static_assert(std::is_same_v || std::is_same_v, "Only float and double are supported."); } #else - const auto result = std::fma(x, x2pi, half); - return (result - std::trunc(result)) * static_cast(N); + const auto result = fma(x, x2pi, half); + return (result - std::floor(result)) * static_cast(N); #endif } -template -static __forceinline__ 
__device__ constexpr T fma(const T a, const T b, const T c) { - if constexpr (std::is_same_v) { - return __fmaf_rn(a, b, c); - } else if constexpr (std::is_same_v) { - return __fma_rn(a, b, c); - } - static_assert(std::is_same_v || std::is_same_v, - "Only float and double are supported."); - return T{0}; -}; - template static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) /* ES ("exp sqrt") kernel evaluation at single real argument: From b3237f7e29a75232b03e6ce4bc2d5703fe811cb8 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 26 Jul 2024 12:16:53 -0400 Subject: [PATCH 30/68] fixed a mistake --- include/cufinufft/impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 3d6e99b35..dcf00f31b 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -143,7 +143,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran * * For type 2, we always default to method 1 (GM). */ - if (d_plan->type == 2) { + if (type == 2) { d_plan->opts.gpu_method = 1; } else { // query the device for the amount of shared memory available From db80aad0f21cedccc85d1eca211c4286a18a198e Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 26 Jul 2024 15:44:47 -0400 Subject: [PATCH 31/68] added comments for review --- CHANGELOG | 1 + include/cufinufft/impl.h | 1 + include/cufinufft/spreadinterp.h | 24 +++++++++++++++++------- include/cufinufft/utils.h | 25 ++++++++++++++++++++----- src/cuda/common.cu | 19 +++++++++---------- 5 files changed, 48 insertions(+), 22 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index ba024e07f..d25d7e5d7 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -47,6 +47,7 @@ V 2.3.0beta (7/21/24) precision denormals to 0 and using fma where possible. 
* cuFINUFFT using intrinsics in foldrescale and other places to increase performance +* cuFINUFFT using SM90 float2 vector atomicAdd where supported V 2.2.0 (12/12/23) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index dcf00f31b..c3021a7ff 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -60,6 +60,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran Variables and arrays inside the plan struct are set and allocated. Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21. + Marco Barbone 07/26/24. Using SM when shared memory available is enough. */ int ier; cuDoubleComplex *d_a = nullptr; // fseries temp data diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 0ab7aba9a..2963d381d 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -12,8 +12,10 @@ namespace spreadinterp { template static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) { if constexpr (std::is_same_v) { + // fused multiply-add, round to nearest even return __fmaf_rn(a, b, c); } else if constexpr (std::is_same_v) { + // fused multiply-add, round to nearest even return __fma_rn(a, b, c); } static_assert(std::is_same_v || std::is_same_v, @@ -21,23 +23,31 @@ static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T return T{0}; } -template -constexpr __forceinline__ __host__ __device__ T fold_rescale(const T x, const V N) { +template +constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) { constexpr auto x2pi = T(0.159154943091895345554011992339482617); constexpr auto half = T(0.5); #if defined(__CUDA_ARCH__) if constexpr (std::is_same_v) { - const auto result = fma(x, x2pi, half); - return (result - floorf(result)) * static_cast(N); + // fused multiply-add, round to nearest even + auto result = __fmaf_rn(x, x2pi, half); + // subtract, round down + result = __fsub_rd(result, 
floorf(result)); + // multiply, round down + return __fmul_rd(result, static_cast(N)); } else if constexpr (std::is_same_v) { - const auto result = fma(x, x2pi, half); - return (result - floor(result)) * static_cast(N); + // fused multiply-add, round to nearest even + auto result = __fma_rn(x, x2pi, half); + // subtract, round down + result = __dsub_rd(result, floor(result)); + // multiply, round down + return __dmul_rd(result, static_cast(N)); } else { static_assert(std::is_same_v || std::is_same_v, "Only float and double are supported."); } #else - const auto result = fma(x, x2pi, half); + const auto result = std::fma(x, x2pi, half); return (result - std::floor(result)) * static_cast(N); #endif } diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index f556da8d6..b4db528ae 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -74,11 +74,14 @@ template T infnorm(int n, std::complex *a) { #ifdef __CUDA_ARCH__ __forceinline__ __device__ auto interval(const int ns, const float x) { + // float to int round up and fused multiply-add to round up const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); + // float to int round down and fused multiply-add to round down + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); return int2{xstart, xend}; } __forceinline__ __device__ auto interval(const int ns, const double x) { + // same as above const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x)); const auto xend = __double2int_rd(__fma_rd(ns, .5, x)); return int2{xstart, xend}; @@ -107,17 +110,29 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { #define COMPUTE_CAPABILITY_90_OR_HIGHER 0 #endif +/** + * does a complex atomic add on a shared memory address + * it adds the real and imaginary parts separately + * cuda does not support atomic operations + * on complex numbers on shared memory directly + */ + template -static __forceinline__ 
__device__ void atomicAddComplexShared( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *address, + cuda_complex res) { const auto raw_address = reinterpret_cast(address); atomicAdd(raw_address, res.x); atomicAdd(raw_address + 1, res.y); } +/** + * does a complex atomic add on a global memory address + * since cuda 90 atomic operations on complex numbers + * on shared memory are supported so we leverage them + */ template -static __forceinline__ __device__ void atomicAddComplexGlobal( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *address, + cuda_complex res) { if constexpr ( std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { atomicAdd(address, res); diff --git a/src/cuda/common.cu b/src/cuda/common.cu index eba170a24..19b0cbd1a 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -202,8 +202,7 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int bin_size_z) { - // printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns, - // bin_size_x, bin_size_y, bin_size_z); + // Helper to compute the shared memory required for the spreader when using SM int adjusted_ns = bin_size_x + ((ns + 1) / 2) * 2; if (dim == 1) { @@ -221,17 +220,18 @@ std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size return adjusted_ns * sizeof(cuda_complex); } -// Function to find bin_size_x == bin_size_y where bin_size_x * bin_size_y < MemSize -template int find_bin_size(std::size_t MemSize, int dim, int ns) { +// Function to find bin_size_x == bin_size_y +// where bin_size_x * bin_size_y * bin_size_z < mem_size +// TODO: this can be done without a loop by using a direct formula +template int find_bin_size(std::size_t mem_size, int dim, int ns) { int binsize = 1; // Start 
with the smallest possible bin size - while (true) { // Calculate the shared memory required for the current bin_size_x and bin_size_y std::size_t required_memory = shared_memory_required(dim, ns, binsize, binsize, binsize); // Check if the required memory is less than the available memory - if (required_memory > MemSize) { + if (required_memory > mem_size) { // If the condition is met, return the current bin_size_x return binsize - 1; } @@ -243,6 +243,9 @@ template int find_bin_size(std::size_t MemSize, int dim, int ns) { template void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { + // Marco Barbone 07/26/24. Using the shared memory available on the device, to + // determine the optimal binsize for the spreader. + // TODO: This can still be improved some sizes are hardcoded still int shared_mem_per_block{}, device_id{}; switch (dim) { case 1: { @@ -290,10 +293,6 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { } break; } } - // const auto shared_mem_required = shared_memory_required( - // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); - // printf("binsizex: %d, binsizey: %d, shared_mem_required %ld (bytes)\n", - // opts->gpu_binsizex, opts->gpu_binsizey, shared_mem_required); opts->gpu_binsizez = 1; } break; case 3: { From c225fb56eac9b288fd518ea875e4b7ca74ed19ba Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 31 Jul 2024 12:42:52 -0400 Subject: [PATCH 32/68] fixing review comments --- src/cuda/common.cu | 2 + src/ker_horner_allw_loop.inc | 207 ----------------------------------- 2 files changed, 2 insertions(+), 207 deletions(-) delete mode 100644 src/ker_horner_allw_loop.inc diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 19b0cbd1a..6e7064b25 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -260,6 +260,8 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { throw std::runtime_error(cudaGetErrorString(err)); } // use 1/6 
of the shared memory for the binsize + // From experiments on multiple GPUs this gives the best tradeoff. + // It is within 90% of the maximum performance for all GPUs tested. shared_mem_per_block /= 6; const int bin_size = shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; diff --git a/src/ker_horner_allw_loop.inc b/src/ker_horner_allw_loop.inc deleted file mode 100644 index 953c4618b..000000000 --- a/src/ker_horner_allw_loop.inc +++ /dev/null @@ -1,207 +0,0 @@ -// Code generated by gen_all_horner_C_code.m in finufft/devel -// Authors: Alex Barnett & Ludvig af Klinteberg. -// (C) The Simons Foundation, Inc. - if (w==2) { - constexpr FLT c0[] = {4.5147043243215343E+01, 4.5147043243215350E+01}; - constexpr FLT c1[] = {5.7408070938221307E+01, -5.7408070938221300E+01}; - constexpr FLT c2[] = {-1.8395117920046544E+00, -1.8395117920046602E+00}; - constexpr FLT c3[] = {-2.0382426253182064E+01, 2.0382426253182086E+01}; - constexpr FLT c4[] = {-2.0940804433577389E+00, -2.0940804433577398E+00}; - for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); - } else if (w==3) { - constexpr FLT c0[] = {1.5653991189315130E+02, 8.8006872410780375E+02, 1.5653991189967169E+02}; - constexpr FLT c1[] = {3.1653018869611083E+02, 2.7828437114531882E-14, -3.1653018868907077E+02}; - constexpr FLT c2[] = {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117128E+02}; - constexpr FLT c3[] = {-1.5357716116473071E+01, 1.0675641863333163E-13, 1.5357716122720211E+01}; - constexpr FLT c4[] = {-3.7757583061523640E+01, 5.3222970968867450E+01, -3.7757583054647341E+01}; - constexpr FLT c5[] = {-3.9654011076088449E+00, 4.9521033695040343E-14, 3.9654011139270429E+00}; - for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==4) { - constexpr FLT c0[] = {5.4284366850213223E+02, 1.0073871433088407E+04, 1.0073871433088407E+04, 5.4284366850213269E+02}; - constexpr FLT c1[] = 
{1.4650917259256942E+03, 6.1905285583602899E+03, -6.1905285583602899E+03, -1.4650917259256942E+03}; - constexpr FLT c2[] = {1.4186910680718349E+03, -1.3995339862725573E+03, -1.3995339862725571E+03, 1.4186910680718345E+03}; - constexpr FLT c3[] = {5.1133995502497481E+02, -1.4191608683682980E+03, 1.4191608683682985E+03, -5.1133995502497402E+02}; - constexpr FLT c4[] = {-4.8293622641173705E+01, 3.9393732546135901E+01, 3.9393732546136945E+01, -4.8293622641173727E+01}; - constexpr FLT c5[] = {-7.8386867802392203E+01, 1.4918904800408794E+02, -1.4918904800408947E+02, 7.8386867802392203E+01}; - constexpr FLT c6[] = {-1.0039212571700403E+01, 5.0626747735617119E+00, 5.0626747735622777E+00, -1.0039212571700599E+01}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); - } else if (w==5) { - constexpr FLT c0[] = {9.9223677575398506E+02, 3.7794697666613349E+04, 9.8715771010760567E+04, 3.7794697666613327E+04, 9.9223677575398540E+02}; - constexpr FLT c1[] = {3.0430174925083834E+03, 3.7938404259811425E+04, -4.1880997701304513E-12, -3.7938404259811403E+04, -3.0430174925083829E+03}; - constexpr FLT c2[] = {3.6092689177271232E+03, 7.7501368899498630E+03, -2.2704627332475000E+04, 7.7501368899498721E+03, 3.6092689177271213E+03}; - constexpr FLT c3[] = {1.9990077310495410E+03, -3.8875294641277214E+03, 1.6137850891850780E-11, 3.8875294641277346E+03, -1.9990077310495410E+03}; - constexpr FLT c4[] = {4.0071733590403909E+02, -1.5861137916762543E+03, 2.3839858699098786E+03, -1.5861137916762577E+03, 4.0071733590403909E+02}; - constexpr FLT c5[] = {-9.1301168206167233E+01, 1.2316471075215087E+02, 1.9401736511657983E-12, -1.2316471075215495E+02, 9.1301168206166977E+01}; - constexpr FLT c6[] = {-5.5339722671222894E+01, 1.1960590540262304E+02, -1.5249941358312140E+02, 1.1960590540262024E+02, -5.5339722671224088E+01}; - constexpr FLT c7[] = {-3.3762488150349581E+00, 2.2839981873006558E+00, 8.2819625836083788E-12, -2.2839981872910400E+00, 
3.3762488150351579E+00}; - for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==6) { - constexpr FLT c0[] = {2.0553833234911899E+03, 1.5499537739913145E+05, 8.1177907023291232E+05, 8.1177907023291232E+05, 1.5499537739913145E+05, 2.0553833235005700E+03}; - constexpr FLT c1[] = {7.1269776034442684E+03, 2.0581923258843319E+05, 3.1559612614917679E+05, -3.1559612614917639E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}; - constexpr FLT c2[] = {1.0023404568475091E+04, 9.0916650498360206E+04, -1.0095927514054631E+05, -1.0095927514054631E+05, 9.0916650498360163E+04, 1.0023404568484637E+04}; - constexpr FLT c3[] = {7.2536109410387444E+03, 4.8347162752603444E+03, -5.0512736602018485E+04, 5.0512736602018602E+04, -4.8347162752602972E+03, -7.2536109410297577E+03}; - constexpr FLT c4[] = {2.7021878300949775E+03, -7.8773465553972374E+03, 5.2105876478343516E+03, 5.2105876478343944E+03, -7.8773465553972464E+03, 2.7021878301048723E+03}; - constexpr FLT c5[] = {3.2120291706547630E+02, -1.8229189469936912E+03, 3.7928113414428476E+03, -3.7928113414427171E+03, 1.8229189469937239E+03, -3.2120291705638328E+02}; - constexpr FLT c6[] = {-1.2051267090537345E+02, 2.2400507411399769E+02, -1.2506575852547746E+02, -1.2506575852531816E+02, 2.2400507411399730E+02, -1.2051267089640162E+02}; - constexpr FLT c7[] = {-4.5977202613346755E+01, 1.1536880606857032E+02, -1.7819720186492938E+02, 1.7819720186504426E+02, -1.1536880606851560E+02, 4.5977202622148354E+01}; - constexpr FLT c8[] = {-1.5631081288822022E+00, 7.1037430590520445E-01, -6.9838401262032682E-02, -6.9838401199524530E-02, 7.1037430591562767E-01, -1.5631081203751171E+00}; - for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); - } else if (w==7) { - constexpr FLT c0[] = {3.9948351830487582E+03, 5.4715865608590841E+05, 5.0196413492771825E+06, 9.8206709220713321E+06, 
5.0196413492771871E+06, 5.4715865608590853E+05, 3.9948351830642619E+03}; - constexpr FLT c1[] = {1.5290160332974698E+04, 8.7628248584320419E+05, 3.4421061790934466E+06, 6.5103105025927563E-10, -3.4421061790934466E+06, -8.7628248584320443E+05, -1.5290160332958061E+04}; - constexpr FLT c2[] = {2.4458227486779258E+04, 5.3904618484139442E+05, 2.4315566181017563E+05, -1.6133959371974308E+06, 2.4315566181017424E+05, 5.3904618484139396E+05, 2.4458227486795091E+04}; - constexpr FLT c3[] = {2.1166189345881652E+04, 1.3382732160223150E+05, -3.3113450969689601E+05, 2.5683270626620309E-10, 3.3113450969689793E+05, -1.3382732160223130E+05, -2.1166189345866896E+04}; - constexpr FLT c4[] = {1.0542795672344870E+04, -7.0739172265096349E+03, -6.5563293056048627E+04, 1.2429734005960199E+05, -6.5563293056048671E+04, -7.0739172265096395E+03, 1.0542795672361222E+04}; - constexpr FLT c5[] = {2.7903491906228451E+03, -1.0975382873972989E+04, 1.3656979541145318E+04, 4.9801640867456605E-10, -1.3656979541144143E+04, 1.0975382873973054E+04, -2.7903491906078325E+03}; - constexpr FLT c6[] = {1.6069721418054232E+02, -1.5518707872249406E+03, 4.3634273936649897E+03, -5.9891976420600004E+03, 4.3634273936636964E+03, -1.5518707872250636E+03, 1.6069721419532380E+02}; - constexpr FLT c7[] = {-1.2289277373866669E+02, 2.8583630927761948E+02, -2.8318194617245649E+02, -3.5832266061541795E-11, 2.8318194617438041E+02, -2.8583630927744588E+02, 1.2289277375319726E+02}; - constexpr FLT c8[] = {-3.2270164914244575E+01, 9.1892112257588494E+01, -1.6710678096380749E+02, 2.0317049305436126E+02, -1.6710678096299210E+02, 9.1892112257580479E+01, -3.2270164900216493E+01}; - constexpr FLT c9[] = {-1.4761409684320093E-01, -9.1862771282699351E-01, 1.2845147740384601E+00, -5.0335941641611417E-10, -1.2845147731561353E+00, 9.1862771293147938E-01, 1.4761410890830065E-01}; - for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else 
if (w==8) { - constexpr FLT c0[] = {7.3898000697448142E+03, 1.7297637497600052E+06, 2.5578341605285820E+07, 8.4789650417103425E+07, 8.4789650417103410E+07, 2.5578341605285831E+07, 1.7297637497600054E+06, 7.3898000697448097E+03}; - constexpr FLT c1[] = {3.0719636811267621E+04, 3.1853145713323932E+06, 2.3797981861403704E+07, 2.4569731244678468E+07, -2.4569731244678475E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267595E+04}; - constexpr FLT c2[] = {5.4488498478251728E+04, 2.4101183255475122E+06, 6.4554051283428418E+06, -8.9200440393090658E+06, -8.9200440393090583E+06, 6.4554051283428296E+06, 2.4101183255475126E+06, 5.4488498478251728E+04}; - constexpr FLT c3[] = {5.3926359802542138E+04, 9.0469037926849385E+05, -6.0897036277695757E+05, -3.0743852105799988E+06, 3.0743852105800197E+06, 6.0897036277696723E+05, -9.0469037926849280E+05, -5.3926359802542152E+04}; - constexpr FLT c4[] = {3.2444118016247576E+04, 1.3079802224392162E+05, -5.8652889370128687E+05, 4.2333306008153327E+05, 4.2333306008153543E+05, -5.8652889370128710E+05, 1.3079802224392179E+05, 3.2444118016247601E+04}; - constexpr FLT c5[] = {1.1864306345505300E+04, -2.2700360645707835E+04, -5.0713607251411129E+04, 1.8308704458211461E+05, -1.8308704458211147E+05, 5.0713607251410089E+04, 2.2700360645707704E+04, -1.1864306345505296E+04}; - constexpr FLT c6[] = {2.2812256770903396E+03, -1.1569135767377908E+04, 2.0942387020802456E+04, -1.1661592834947036E+04, -1.1661592834946512E+04, 2.0942387020804370E+04, -1.1569135767377549E+04, 2.2812256770903291E+03}; - constexpr FLT c7[] = {8.5503535636977634E+00, -9.7513976461196773E+02, 3.8242995179186414E+03, -6.9201295567263214E+03, 6.9201295567309990E+03, -3.8242995179140653E+03, 9.7513976461263269E+02, -8.5503535636935535E+00}; - constexpr FLT c8[] = {-1.0230637348345098E+02, 2.8246898554249236E+02, -3.8638201738252542E+02, 1.9106407992706994E+02, 1.9106407993520349E+02, -3.8638201738414602E+02, 2.8246898554297724E+02, -1.0230637348344338E+02}; - 
constexpr FLT c9[] = {-1.9200143062942033E+01, 6.1692257626381128E+01, -1.2981109187954436E+02, 1.8681284209765820E+02, -1.8681284209914423E+02, 1.2981109187880136E+02, -6.1692257626381128E+01, 1.9200143062947838E+01}; - constexpr FLT c10[] = {3.7894993761363543E-01, -1.7334408835887836E+00, 2.5271184092462979E+00, -1.2600963912775105E+00, -1.2600963880718390E+00, 2.5271184126204269E+00, -1.7334408829982433E+00, 3.7894993761427903E-01}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==9) { - constexpr FLT c0[] = {1.3136365370186153E+04, 5.0196413492771843E+06, 1.1303327711722577E+08, 5.8225443924996734E+08, 9.7700272582690716E+08, 5.8225443924996805E+08, 1.1303327711722578E+08, 5.0196413492772263E+06, 1.3136365370186144E+04}; - constexpr FLT c1[] = {5.8623313038274369E+04, 1.0326318537280345E+07, 1.2898448324824868E+08, 3.0522863709830379E+08, 7.2435840302079811E-08, -3.0522863709830397E+08, -1.2898448324824865E+08, -1.0326318537280394E+07, -5.8623313038274347E+04}; - constexpr FLT c2[] = {1.1335001341875960E+05, 9.0726133144784812E+06, 5.3501544534038134E+07, -2.6789524644140172E+05, -1.2483923718899371E+08, -2.6789524644173466E+05, 5.3501544534038089E+07, 9.0726133144785147E+06, 1.1335001341875963E+05}; - constexpr FLT c3[] = {1.2489113703229754E+05, 4.3035547171861976E+06, 6.3021978510599164E+06, -2.6014941986658975E+07, 5.3074599277157087E-08, 2.6014941986659400E+07, -6.3021978510598680E+06, -4.3035547171862088E+06, -1.2489113703229751E+05}; - constexpr FLT c4[] = {8.6425493435991244E+04, 1.0891182836653311E+06, -2.0713033564200432E+06, -2.8994941183505901E+06, 7.5905338661206560E+06, -2.8994941183505324E+06, -2.0713033564200350E+06, 1.0891182836653385E+06, 8.6425493435991288E+04}; - constexpr FLT c5[] = {3.8657354724013800E+04, 7.9936390113329253E+04, -7.0458265546791849E+05, 1.0151095605715540E+06, 7.5990350518026299E-08, 
-1.0151095605718379E+06, 7.0458265546793933E+05, -7.9936390113333939E+04, -3.8657354724013821E+04}; - constexpr FLT c6[] = {1.0779131453134645E+04, -3.3466718311300116E+04, -1.3245366618985940E+04, 1.8238470515354761E+05, -2.9285656292981049E+05, 1.8238470515352563E+05, -1.3245366618989963E+04, -3.3466718311299133E+04, 1.0779131453134627E+04}; - constexpr FLT c7[] = {1.4992527030548656E+03, -9.7024371533879767E+03, 2.3216330734078529E+04, -2.3465262819038293E+04, -4.5678067266366728E-08, 2.3465262819229152E+04, -2.3216330734050898E+04, 9.7024371533899721E+03, -1.4992527030548690E+03}; - constexpr FLT c8[] = {-7.9857427421152821E+01, -4.0585588534976301E+02, 2.6054813773370911E+03, -6.1806593581469824E+03, 8.0679596873459095E+03, -6.1806593581737125E+03, 2.6054813773390433E+03, -4.0585588535087578E+02, -7.9857427421118601E+01}; - constexpr FLT c9[] = {-7.1572272057928345E+01, 2.2785637019390455E+02, -3.9109820766111051E+02, 3.3597424707310040E+02, -1.3908671051550088E-08, -3.3597424727519922E+02, 3.9109820767448468E+02, -2.2785637019111829E+02, 7.1572272057948652E+01}; - constexpr FLT c10[] = {-9.8886360697883688E+00, 3.5359026950204516E+01, -8.5251867695464611E+01, 1.4285748013461193E+02, -1.6935269664190733E+02, 1.4285748014610570E+02, -8.5251867686017064E+01, 3.5359026947336602E+01, -9.8886360697963340E+00}; - for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==10) { - constexpr FLT c0[] = {2.2594586605749224E+04, 1.3595989066786611E+07, 4.4723032442444932E+08, 3.3781755837397552E+09, 8.6836783895849857E+09, 8.6836783895849838E+09, 3.3781755837397523E+09, 4.4723032442444944E+08, 1.3595989066786496E+07, 2.2594586605749344E+04}; - constexpr FLT c1[] = {1.0729981697645644E+05, 3.0651490267742995E+07, 5.9387966085130477E+08, 2.4434902657508349E+09, 2.0073077861288934E+09, -2.0073077861288950E+09, -2.4434902657508330E+09, 
-5.9387966085130477E+08, -3.0651490267742828E+07, -1.0729981697645634E+05}; - constexpr FLT c2[] = {2.2340399734184612E+05, 3.0258214643190462E+07, 3.1512411458738238E+08, 4.3618276932319850E+08, -7.8178848450497270E+08, -7.8178848450497031E+08, 4.3618276932319820E+08, 3.1512411458738214E+08, 3.0258214643190324E+07, 2.2340399734184553E+05}; - constexpr FLT c3[] = {2.6917433004353492E+05, 1.6875651476661246E+07, 7.4664745481963649E+07, -9.5882157211117968E+07, -2.0622994435532477E+08, 2.0622994435532823E+08, 9.5882157211118430E+07, -7.4664745481963366E+07, -1.6875651476661157E+07, -2.6917433004353428E+05}; - constexpr FLT c4[] = {2.0818422772177897E+05, 5.6084730690362593E+06, 1.4435118192351859E+06, -4.0063869969544269E+07, 3.2803674392747816E+07, 3.2803674392746560E+07, -4.0063869969546124E+07, 1.4435118192352206E+06, 5.6084730690362155E+06, 2.0818422772177868E+05}; - constexpr FLT c5[] = {1.0781139496011086E+05, 9.9202615851199278E+05, -3.3266265543961083E+06, -4.8557049011452327E+05, 1.0176155522772400E+07, -1.0176155522773268E+07, 4.8557049011599307E+05, 3.3266265543962419E+06, -9.9202615851196356E+05, -1.0781139496011072E+05}; - constexpr FLT c6[] = {3.7380102688153638E+04, 1.2716675000361241E+04, -6.2163527451762755E+05, 1.4157962667184302E+06, -8.4419693137719855E+05, -8.4419693137682532E+05, 1.4157962667184921E+06, -6.2163527451772091E+05, 1.2716675000342160E+04, 3.7380102688153478E+04}; - constexpr FLT c7[] = {8.1238936393894573E+03, -3.4872365530440075E+04, 2.3913680325287874E+04, 1.2428850301835715E+05, -3.2158255329711520E+05, 3.2158255329964001E+05, -1.2428850301842803E+05, -2.3913680325138281E+04, 3.4872365530466821E+04, -8.1238936393894610E+03}; - constexpr FLT c8[] = {7.8515926628982811E+02, -6.6607899119346384E+03, 2.0167398338412942E+04, -2.8951401344643764E+04, 1.4622828141516249E+04, 1.4622828142773422E+04, -2.8951401346273171E+04, 2.0167398338466974E+04, -6.6607899119428766E+03, 7.8515926628979298E+02}; - constexpr FLT c9[] = 
{-1.0147176570538747E+02, -3.5304284178326540E+01, 1.3576976855470537E+03, -4.3921059355373945E+03, 7.3232085265656797E+03, -7.3232085282537992E+03, 4.3921059362506849E+03, -1.3576976853984515E+03, 3.5304284186128150E+01, 1.0147176570552679E+02}; - constexpr FLT c10[] = {-4.3161545259359876E+01, 1.5498490982726668E+02, -3.1771250761814974E+02, 3.7215448796966825E+02, -1.7181762811175784E+02, -1.7181762918070896E+02, 3.7215448823960344E+02, -3.1771250765054128E+02, 1.5498490982861634E+02, -4.3161545259484186E+01}; - constexpr FLT c11[] = {-4.2916172038642904E+00, 1.7402146073587435E+01, -4.7947588063038118E+01, 9.2697697961204668E+01, -1.2821427624698006E+02, 1.2821427667135228E+02, -9.2697698383138089E+01, 4.7947588092305367E+01, -1.7402146072063207E+01, 4.2916172038214455E+00}; - for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); - } else if (w==11) { - constexpr FLT c0[] = {3.7794653219809712E+04, 3.4782300224660814E+07, 1.6188020733727572E+09, 1.7196758809615025E+10, 6.3754384857724686E+10, 9.7196447559193588E+10, 6.3754384857724686E+10, 1.7196758809615013E+10, 1.6188020733727574E+09, 3.4782300224660836E+07, 3.7794653219808912E+04}; - constexpr FLT c1[] = {1.8969206922085886E+05, 8.4769319065313712E+07, 2.4230555767723413E+09, 1.5439732722639107E+10, 2.7112836839612331E+10, 7.5382856415600940E-06, -2.7112836839612324E+10, -1.5439732722639109E+10, -2.4230555767723413E+09, -8.4769319065313712E+07, -1.8969206922085691E+05}; - constexpr FLT c2[] = {4.2138380313901440E+05, 9.2050522922791913E+07, 1.5259983101266618E+09, 4.7070559561237240E+09, -1.2448027572952247E+09, -1.0161446790279316E+10, -1.2448027572952359E+09, 4.7070559561237249E+09, 1.5259983101266608E+09, 9.2050522922791883E+07, 4.2138380313901132E+05}; - constexpr FLT c3[] = {5.4814313598122029E+05, 5.8085130777589604E+07, 4.9484006166551131E+08, 1.6222124676641059E+08, 
-2.0440440381345210E+09, 1.6029666825264191E-05, 2.0440440381345406E+09, -1.6222124676640612E+08, -4.9484006166551065E+08, -5.8085130777589574E+07, -5.4814313598121749E+05}; - constexpr FLT c4[] = {4.6495183529254969E+05, 2.3067199578027170E+07, 6.9832590192482471E+07, -2.2024799260683393E+08, -1.2820270942588173E+08, 5.1017181199129957E+08, -1.2820270942587103E+08, -2.2024799260683718E+08, 6.9832590192482680E+07, 2.3067199578027181E+07, 4.6495183529254753E+05}; - constexpr FLT c5[] = {2.7021781043532956E+05, 5.6764510325100170E+06, -5.5650761736746123E+06, -3.9907385617899098E+07, 7.2453390663685441E+07, 1.3807321808330796E-06, -7.2453390663686499E+07, 3.9907385617896959E+07, 5.5650761736744791E+06, -5.6764510325100273E+06, -2.7021781043532840E+05}; - constexpr FLT c6[] = {1.0933249308680632E+05, 6.9586821127988759E+05, -3.6860240321936086E+06, 2.7428169457744057E+06, 8.3392008440658972E+06, -1.6402201025049815E+07, 8.3392008440622678E+06, 2.7428169457778567E+06, -3.6860240321934861E+06, 6.9586821127989655E+05, 1.0933249308680571E+05}; - constexpr FLT c7[] = {3.0203516161820731E+04, -3.6879059542738614E+04, -4.1141031216769724E+05, 1.4111389975281695E+06, -1.5914376635274226E+06, 6.7631682826831895E-06, 1.5914376635404355E+06, -1.4111389975219201E+06, 4.1141031216798135E+05, 3.6879059542753101E+04, -3.0203516161820640E+04}; - constexpr FLT c8[] = {5.1670143574923986E+03, -2.8613147115359603E+04, 4.3560195427027051E+04, 4.8438679581734432E+04, -2.5856630639957223E+05, 3.7994883866286115E+05, -2.5856630639708077E+05, 4.8438679579228658E+04, 4.3560195427174098E+04, -2.8613147115353891E+04, 5.1670143574923814E+03}; - constexpr FLT c9[] = {3.0888018539742438E+02, -3.7949446187486474E+03, 1.4313303205130735E+04, -2.6681600236165083E+04, 2.3856005159699442E+04, -1.9072153968212169E-06, -2.3856005160079862E+04, 2.6681600234262976E+04, -1.4313303204940523E+04, 3.7949446187568205E+03, -3.0888018539723868E+02}; - constexpr FLT c10[] = {-8.3747489794178762E+01, 
1.1948077481430271E+02, 4.8528498043145930E+02, -2.5024391100070475E+03, 5.3511195380863319E+03, -6.7655484103934950E+03, 5.3511195323636521E+03, -2.5024391101798296E+03, 4.8528498086337265E+02, 1.1948077483184566E+02, -8.3747489794339316E+01}; - constexpr FLT c11[] = {-2.2640047135393669E+01, 9.0840898559070766E+01, -2.1597187557069051E+02, 3.1511228970473707E+02, -2.4856618213020064E+02, -2.0962600056762836E-06, 2.4856618232531096E+02, -3.1511228707801843E+02, 2.1597187541459934E+02, -9.0840898577362736E+01, 2.2640047135479467E+01}; - constexpr FLT c12[] = {-1.6306382885603201E+00, 7.3325946574893264E+00, -2.3241017691629008E+01, 5.1715493346619120E+01, -8.2673008978082819E+01, 9.6489716906321945E+01, -8.2673008978083388E+01, 5.1715493276466965E+01, -2.3241017744243891E+01, 7.3325946602297218E+00, -1.6306382886202573E+00}; - for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==12) { - constexpr FLT c0[] = {6.1722991679853279E+04, 8.4789650417103827E+07, 5.4431675199498749E+09, 7.8788892335272308E+10, 4.0355760945670074E+11, 8.8071481911347998E+11, 8.8071481911348035E+11, 4.0355760945670081E+11, 7.8788892335272507E+10, 5.4431675199498901E+09, 8.4789650417103752E+07, 6.1722991679871782E+04}; - constexpr FLT c1[] = {3.2561466099406185E+05, 2.2112758120210630E+08, 8.9911609880089836E+09, 8.3059508064200958E+10, 2.3965569143469873E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201111E+10, -8.9911609880090008E+09, -2.2112758120210621E+08, -3.2561466099404270E+05}; - constexpr FLT c2[] = {7.6621098001581512E+05, 2.6026568260310283E+08, 6.4524338253008652E+09, 3.3729904113826836E+10, 2.8555202212474079E+10, -6.8998572040731476E+10, -6.8998572040731461E+10, 2.8555202212474102E+10, 3.3729904113826820E+10, 6.4524338253008747E+09, 2.6026568260310283E+08, 7.6621098001583782E+05}; - 
constexpr FLT c3[] = {1.0657807616803222E+06, 1.8144472126891005E+08, 2.5524827004349880E+09, 5.2112383911371851E+09, -1.0268350564014641E+10, -1.4763245309081160E+10, 1.4763245309081381E+10, 1.0268350564014679E+10, -5.2112383911371050E+09, -2.5524827004349866E+09, -1.8144472126890993E+08, -1.0657807616803094E+06}; - constexpr FLT c4[] = {9.7829638830158766E+05, 8.2222351241520002E+07, 5.5676911894064677E+08, -4.8739037675425845E+08, -2.7153428193078089E+09, 2.5627633609246616E+09, 2.5627633609247270E+09, -2.7153428193078089E+09, -4.8739037675429344E+08, 5.5676911894064772E+08, 8.2222351241519988E+07, 9.7829638830161223E+05}; - constexpr FLT c5[] = {6.2536876825113979E+05, 2.4702814073680259E+07, 4.1488431554846764E+07, -2.9274790542417943E+08, 1.0742154109192364E+08, 6.2185168968026125E+08, -6.2185168968025279E+08, -1.0742154109186378E+08, 2.9274790542422217E+08, -4.1488431554844894E+07, -2.4702814073680248E+07, -6.2536876825112430E+05}; - constexpr FLT c6[] = {2.8527714307528501E+05, 4.6266378435690925E+06, -1.0665598090789001E+07, -2.6048960239884529E+07, 9.1597254427304730E+07, -5.9794495983325504E+07, -5.9794495983230442E+07, 9.1597254427350238E+07, -2.6048960239922173E+07, -1.0665598090794679E+07, 4.6266378435690831E+06, 2.8527714307530370E+05}; - constexpr FLT c7[] = {9.2873647411234633E+04, 3.6630046787437343E+05, -3.1271047224703613E+06, 4.8612412939389814E+06, 3.3820440907783178E+06, -1.6880127953644276E+07, 1.6880127953794900E+07, -3.3820440907782884E+06, -4.8612412938910574E+06, 3.1271047224760642E+06, -3.6630046787425788E+05, -9.2873647411217215E+04}; - constexpr FLT c8[] = {2.0817947751046311E+04, -5.5660303410283603E+04, -1.9519783923352187E+05, 1.0804817251249440E+06, -1.8264985852847320E+06, 9.7602844964054180E+05, 9.7602844964026869E+05, -1.8264985852578641E+06, 1.0804817251242315E+06, -1.9519783923298802E+05, -5.5660303410281354E+04, 2.0817947751063894E+04}; - constexpr FLT c9[] = {2.7986023314783351E+03, -1.9404411093657811E+04, 
4.3922625001185028E+04, -7.6450317330166517E+03, -1.5273911976404343E+05, 3.3223441450907954E+05, -3.3223441450755787E+05, 1.5273911981578072E+05, 7.6450317512768770E+03, -4.3922624998712294E+04, 1.9404411093676386E+04, -2.7986023314643107E+03}; - constexpr FLT c10[] = {6.7849020474217255E+01, -1.7921351307610907E+03, 8.4980694701237535E+03, -1.9742624848712727E+04, 2.4620674811515193E+04, -1.1676544936917096E+04, -1.1676544845699163E+04, 2.4620674862652242E+04, -1.9742624819688928E+04, 8.4980694644226842E+03, -1.7921351307503089E+03, 6.7849020488654887E+01}; - constexpr FLT c11[] = {-5.4577020998540995E+01, 1.3637112871144197E+02, 4.5513617165591533E+01, -1.1174001347694452E+03, 3.2018768920645603E+03, -5.0580352089258022E+03, 5.0580351705274497E+03, -3.2018769484133886E+03, 1.1174001005075061E+03, -4.5513609907370189E+01, -1.3637112869192950E+02, 5.4577021011650153E+01}; - constexpr FLT c12[] = {-1.0538365872663764E+01, 4.6577222493036992E+01, -1.2606964247581806E+02, 2.1881090265912360E+02, -2.3273404104747246E+02, 1.0274271612440927E+02, 1.0274271612440242E+02, -2.3273400063947102E+02, 2.1881092482740195E+02, -1.2606964693052080E+02, 4.6577222495229805E+01, -1.0538365860486415E+01}; - constexpr FLT c13[] = {-4.6087004138254672E-01, 2.5969759057927089E+00, -9.6946928123584506E+00, 2.4990051638288470E+01, -4.6013914134428035E+01, 6.2056955095902744E+01, -6.2056967309552682E+01, 4.6013924603270830E+01, -2.4990037679831403E+01, 9.6946951024178141E+00, -2.5969758989770559E+00, 4.6087004739949022E-01}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); - } else if (w==13) { - constexpr FLT c0[] = {9.8715725867495858E+04, 1.9828875496808127E+08, 1.7196758809615005E+10, 3.3083776881353601E+11, 2.2668873993375454E+12, 6.7734720591167598E+12, 9.6695220682534863E+12, 6.7734720591167490E+12, 2.2668873993375454E+12, 
3.3083776881353540E+11, 1.7196758809615013E+10, 1.9828875496807912E+08, 9.8715725867495596E+04}; - constexpr FLT c1[] = {5.4491110456935561E+05, 5.4903670125539398E+08, 3.0879465445278194E+10, 3.9588436413399976E+11, 1.6860562536749780E+12, 2.4256447893117891E+12, 5.2271652473787576E-04, -2.4256447893117861E+12, -1.6860562536749771E+12, -3.9588436413399896E+11, -3.0879465445278202E+10, -5.4903670125538874E+08, -5.4491110456935479E+05}; - constexpr FLT c2[] = {1.3504711883426080E+06, 6.9286979077463174E+08, 2.4618123595484570E+10, 1.9493985627722617E+11, 3.9422703517046405E+11, -1.8678883613919846E+11, -8.5538079834550037E+11, -1.8678883613919666E+11, 3.9422703517046375E+11, 1.9493985627722595E+11, 2.4618123595484570E+10, 6.9286979077462602E+08, 1.3504711883426073E+06}; - constexpr FLT c3[] = {1.9937206140846505E+06, 5.2512029493766004E+08, 1.1253303793811764E+10, 4.6205527735932259E+10, -1.1607472377982828E+10, -1.6305241755642276E+11, 1.6137900538478137E-04, 1.6305241755642496E+11, 1.1607472377982767E+10, -4.6205527735932159E+10, -1.1253303793811754E+10, -5.2512029493765628E+08, -1.9937206140846501E+06}; - constexpr FLT c4[] = {1.9607419630386413E+06, 2.6425362558103913E+08, 3.1171259341747184E+09, 2.9839860297840395E+09, -1.9585031917561905E+10, -5.0666917387060509E+09, 3.6568794485482040E+10, -5.0666917387052479E+09, -1.9585031917561382E+10, 2.9839860297839293E+09, 3.1171259341747251E+09, 2.6425362558103746E+08, 1.9607419630386424E+06}; - constexpr FLT c5[] = {1.3593773865640303E+06, 9.1556445104158297E+07, 4.7074012944133645E+08, -1.1192579335656993E+09, -2.1090780087868536E+09, 5.2270306737954664E+09, 5.5914317801530834E-04, -5.2270306737946453E+09, 2.1090780087878797E+09, 1.1192579335657849E+09, -4.7074012944133860E+08, -9.1556445104157880E+07, -1.3593773865640303E+06}; - constexpr FLT c6[] = {6.8417206432039291E+05, 2.1561705510027312E+07, 7.5785249893027432E+06, -2.7456096030220407E+08, 3.4589095671070045E+08, 4.0256106808935356E+08, 
-1.0074306926604354E+09, 4.0256106809054130E+08, 3.4589095671009880E+08, -2.7456096030236250E+08, 7.5785249893008731E+06, 2.1561705510027334E+07, 6.8417206432039256E+05}; - constexpr FLT c7[] = {2.5248269397037590E+05, 3.0985559672617475E+06, -1.1816517087615140E+07, -8.2958498769974122E+06, 8.0546642347458601E+07, -1.0594657799513456E+08, 2.0249720264016184E-04, 1.0594657799514198E+08, -8.0546642347324282E+07, 8.2958498771580132E+06, 1.1816517087620620E+07, -3.0985559672620827E+06, -2.5248269397037590E+05}; - constexpr FLT c8[] = {6.7530100970876185E+04, 1.2373362326675311E+05, -2.1245597183288219E+06, 5.1047323238642653E+06, -1.4139444406972022E+06, -1.1818267556148527E+07, 2.0121548578311723E+07, -1.1818267556689126E+07, -1.4139444399964837E+06, 5.1047323237335468E+06, -2.1245597183262822E+06, 1.2373362326715943E+05, 6.7530100970876825E+04}; - constexpr FLT c9[] = {1.2421368748960511E+04, -5.0576243646858849E+04, -4.8878193436522284E+04, 6.5307896871419600E+05, -1.5497610128521242E+06, 1.5137725913425679E+06, 9.4288709689637382E-06, -1.5137725926086102E+06, 1.5497610130712469E+06, -6.5307896859246108E+05, 4.8878193441087336E+04, 5.0576243646517250E+04, -1.2421368748960882E+04}; - constexpr FLT c10[] = {1.2904654687548632E+03, -1.1169946054771519E+04, 3.3275109715936509E+04, -3.1765222282529230E+04, -5.9810982046625119E+04, 2.2355863065128919E+05, -3.1083591717381903E+05, 2.2355863453495159E+05, -5.9810982317515191E+04, -3.1765222420737289E+04, 3.3275109716627514E+04, -1.1169946054393644E+04, 1.2904654687550840E+03}; - constexpr FLT c11[] = {-1.9043622268214964E+01, -6.8296542209517031E+02, 4.2702512258593224E+03, -1.2165497344048174E+04, 1.9423733117203814E+04, -1.6010024763745962E+04, 3.4546242756821764E-04, 1.6010021562009399E+04, -1.9423732921465795E+04, 1.2165497485154361E+04, -4.2702512258593424E+03, 6.8296542155861471E+02, 1.9043622268233225E+01}; - constexpr FLT c12[] = {-3.0093984466084923E+01, 9.8972865759901183E+01, -9.7437038386122609E+01, 
-3.5079929976821143E+02, 1.5699249129925884E+03, -3.1287450613413444E+03, 3.8692192717886201E+03, -3.1287461388880197E+03, 1.5699252721748373E+03, -3.5079941874733129E+02, -9.7437038807041006E+01, 9.8972866294818274E+01, -3.0093984465708520E+01}; - constexpr FLT c13[] = {-4.3050286012574066E+00, 2.1108975856232256E+01, -6.4297196943170974E+01, 1.2922884719917388E+02, -1.6991815434264092E+02, 1.2654996803592717E+02, -1.3650372630766216E-04, -1.2655097304483594E+02, 1.6991801475807023E+02, -1.2922895886683040E+02, 6.4297199778482565E+01, -2.1108976173160116E+01, 4.3050286010444170E+00}; - constexpr FLT c14[] = {-1.0957333734356203E-01, 7.2949328697697935E-01, -3.4300803257592030E+00, 1.0470037850609911E+01, -2.2292132783546631E+01, 3.4570970759468082E+01, -3.9923502981338281E+01, 3.4573363471454584E+01, -2.2292171023236033E+01, 1.0470076090299283E+01, -3.4300793014818574E+00, 7.2949361239845723E-01, -1.0957333723937021E-01}; - for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); - } else if (w==14) { - constexpr FLT c0[] = {1.5499533202966311E+05, 4.4723032442444772E+08, 5.1495083701694801E+10, 1.2904576022918081E+12, 1.1534950432785514E+13, 4.5650102198520523E+13, 8.8830582190032719E+13, 8.8830582190032734E+13, 4.5650102198520523E+13, 1.1534950432785541E+13, 1.2904576022918088E+12, 5.1495083701695160E+10, 4.4723032442444867E+08, 1.5499533202970124E+05}; - constexpr FLT c1[] = {8.9188339002980455E+05, 1.3065352538728638E+09, 9.9400185225815598E+10, 1.7136059013402412E+12, 1.0144146621675834E+13, 2.3034036018490723E+13, 1.4630967270448885E+13, -1.4630967270448867E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402415E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979419E+05}; - constexpr FLT c2[] = {2.3170473769379673E+06, 1.7532505043698251E+09, 8.6523535958354309E+10, 
9.7455289065487476E+11, 3.2977972139362329E+12, 1.7874626001697834E+12, -6.1480918082633936E+12, -6.1480918082634014E+12, 1.7874626001697737E+12, 3.2977972139362251E+12, 9.7455289065487329E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; - constexpr FLT c3[] = {3.6089249230396431E+06, 1.4278058213962200E+09, 4.4296625537022446E+10, 2.9466624630419830E+11, 3.1903621584503467E+11, -9.8834691411254578E+11, -1.1072264714919094E+12, 1.1072264714919380E+12, 9.8834691411255481E+11, -3.1903621584503326E+11, -2.9466624630419788E+11, -4.4296625537022636E+10, -1.4278058213962224E+09, -3.6089249230396668E+06}; - constexpr FLT c4[] = {3.7733555140851745E+06, 7.8376718099107444E+08, 1.4443117772349586E+10, 4.3197433307418678E+10, -7.6585042240583893E+10, -1.8569640140762125E+11, 2.0385335192658521E+11, 2.0385335192658505E+11, -1.8569640140762244E+11, -7.6585042240577591E+10, 4.3197433307418831E+10, 1.4443117772349697E+10, 7.8376718099107611E+08, 3.7733555140852574E+06}; - constexpr FLT c5[] = {2.8079157920112340E+06, 3.0340753492383713E+08, 2.9498136661747241E+09, -6.2820200387946582E+08, -2.2372008390623741E+10, 1.5217518660587065E+10, 4.0682590266890762E+10, -4.0682590266874344E+10, -1.5217518660581593E+10, 2.2372008390624836E+10, 6.2820200387926054E+08, -2.9498136661747794E+09, -3.0340753492383808E+08, -2.8079157920112382E+06}; - constexpr FLT c6[] = {1.5361613559533129E+06, 8.3513615594416931E+07, 3.0077547202709264E+08, -1.3749596754065564E+09, -6.6733027297578251E+08, 5.9590333632812872E+09, -4.3025685566868906E+09, -4.3025685566947279E+09, 5.9590333632843285E+09, -6.6733027297604084E+08, -1.3749596754066198E+09, 3.0077547202708143E+08, 8.3513615594416305E+07, 1.5361613559533581E+06}; - constexpr FLT c7[] = {6.2759409419593017E+05, 1.5741723594963871E+07, -1.5632610223386128E+07, -1.9294824907063219E+08, 4.4643806532504034E+08, 1.5178998384579189E+07, -9.6771139891231704E+08, 9.6771139892423606E+08, -1.5178998381071322E+07, 
-4.4643806533015347E+08, 1.9294824907069016E+08, 1.5632610223408137E+07, -1.5741723594963046E+07, -6.2759409419590794E+05}; - constexpr FLT c8[] = {1.9151404903933618E+05, 1.7156606891565623E+06, -9.7733523156695794E+06, 4.2982266232611798E+06, 5.1660907884888940E+07, -1.1279400211171694E+08, 6.4701089576848499E+07, 6.4701089570801638E+07, -1.1279400210612530E+08, 5.1660907893511616E+07, 4.2982266235306170E+06, -9.7733523156822342E+06, 1.7156606891565854E+06, 1.9151404903936735E+05}; - constexpr FLT c9[] = {4.2715272622844263E+04, -2.2565910611002505E+03, -1.1769776156928577E+06, 4.0078399906352242E+06, -3.8951858073074366E+06, -5.0944610789569877E+06, 1.6765992441849992E+07, -1.6765992434448514E+07, 5.0944610797360903E+06, 3.8951858063335577E+06, -4.0078399906595708E+06, 1.1769776157202481E+06, 2.2565910608803192E+03, -4.2715272622819932E+04}; - constexpr FLT c10[] = {6.4806786522801558E+03, -3.5474227032715331E+04, 1.8237100734263218E+04, 3.0934714642964909E+05, -1.0394703930801603E+06, 1.4743920316337310E+06, -7.3356881642929500E+05, -7.3356882324020052E+05, 1.4743920364765557E+06, -1.0394703915764539E+06, 3.0934714676135289E+05, 1.8237100683125096E+04, -3.5474227032952876E+04, 6.4806786523017845E+03}; - constexpr FLT c11[] = {4.9913632908494827E+02, -5.5416668522806276E+03, 2.0614058722611946E+04, -3.2285139157855901E+04, -5.3099566255893524E+03, 1.1559000150525174E+05, -2.2569743273246771E+05, 2.2569743457059452E+05, -1.1559000428242185E+05, 5.3099542679931265E+03, 3.2285138893125553E+04, -2.0614058670789782E+04, 5.5416668532562171E+03, -4.9913632906264002E+02}; - constexpr FLT c12[] = {-3.3076333188696488E+01, -1.8970588558436827E+02, 1.8160423493169353E+03, -6.3715703265863249E+03, 1.2525624646166696E+04, -1.4199807314837786E+04, 6.4441944019082612E+03, 6.4441857815347785E+03, -1.4199805590763088E+04, 1.2525627375951648E+04, -6.3715703355659844E+03, 1.8160422864600705E+03, -1.8970588672434647E+02, -3.3076333168693779E+01}; - constexpr FLT c13[] = 
{-1.4394533628062636E+01, 5.7000699174526638E+01, -1.0101142144442984E+02, -3.2954074617159108E+01, 6.1417869930814436E+02, -1.6177306801656998E+03, 2.4593354137960296E+03, -2.4593361954696252E+03, 1.6177288934831954E+03, -6.1417959264939657E+02, 3.2954074617159108E+01, 1.0101142929606195E+02, -5.7000698932570963E+01, 1.4394533639244566E+01}; - constexpr FLT c14[] = {-1.5925952284527973E+00, 8.5113930275160214E+00, -2.8993510636695618E+01, 6.6373557362227814E+01, -1.0329536491693236E+02, 1.0280181071020283E+02, -4.3891122033571499E+01, -4.3893656778687756E+01, 1.0280325289276884E+02, -1.0329444716438918E+02, 6.6373666618482872E+01, -2.8993528390837142E+01, 8.5113926647511526E+00, -1.5925952190335899E+00}; - constexpr FLT c15[] = {1.5984868634272537E-02, 1.2876168577716327E-01, -9.8358742969178536E-01, 3.7710928871122080E+00, -9.4315137784350505E+00, 1.6840408563519507E+01, -2.2308532530501328E+01, 2.2310146222863779E+01, -1.6843058416240989E+01, 9.4311230950209399E+00, -3.7712287769953385E+00, 9.8360653920659347E-01, -1.2876103884046056E-01, -1.5984859595043394E-02}; - for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); - } else if (w==15) { - constexpr FLT c0[] = {2.3939707792242090E+05, 9.7700272582690299E+08, 1.4715933396485275E+11, 4.7242424833337236E+12, 5.3987426629953617E+13, 2.7580474290566103E+14, 7.0693378336533425E+14, 9.6196578554477850E+14, 7.0693378336533425E+14, 2.7580474290566153E+14, 5.3987426629953828E+13, 4.7242424833337285E+12, 1.4715933396485275E+11, 9.7700272582690418E+08, 2.3939707792242119E+05}; - constexpr FLT c1[] = {1.4314487885226035E+06, 2.9961416925358462E+09, 3.0273361232748425E+11, 6.8507333793903604E+12, 5.4192702756911016E+13, 1.7551587948105316E+14, 2.1874615668430153E+14, 5.4722295550654096E-02, -2.1874615668430156E+14, -1.7551587948105334E+14, 
-5.4192702756911172E+13, -6.8507333793903730E+12, -3.0273361232748438E+11, -2.9961416925358448E+09, -1.4314487885226023E+06}; - constexpr FLT c2[] = {3.8829497354762922E+06, 4.2473082696966453E+09, 2.8414312556015533E+11, 4.3688281331121431E+12, 2.1823119508000547E+13, 3.2228098609392133E+13, -2.1833085454691801E+13, -7.3750710225100750E+13, -2.1833085454691875E+13, 3.2228098609392070E+13, 2.1823119508000590E+13, 4.3688281331121470E+12, 2.8414312556015527E+11, 4.2473082696966438E+09, 3.8829497354762908E+06}; - constexpr FLT c3[] = {6.3495763451755792E+06, 3.6841035003733959E+09, 1.5965774278321054E+11, 1.5630338683778213E+12, 3.8749058615819409E+12, -2.7319740087722651E+12, -1.3233342822865350E+13, 1.2682483963161023E-01, 1.3233342822865453E+13, 2.7319740087724204E+12, -3.8749058615819307E+12, -1.5630338683778201E+12, -1.5965774278321042E+11, -3.6841035003733950E+09, -6.3495763451755783E+06}; - constexpr FLT c4[] = {7.0146619045520453E+06, 2.1782897863065763E+09, 5.8897780310148117E+10, 3.1953009601770477E+11, 4.0651527030195397E+08, -1.6379148273275671E+12, -1.1568753137013023E+11, 2.7451653250461045E+12, -1.1568753137006947E+11, -1.6379148273276748E+12, 4.0651527030228132E+08, 3.1953009601770502E+11, 5.8897780310148155E+10, 2.1782897863065772E+09, 7.0146619045520453E+06}; - constexpr FLT c5[] = {5.5580012413990172E+06, 9.2345162185944211E+08, 1.4522950934020031E+10, 2.7025952371212032E+10, -1.2304576967641461E+11, -1.0116752717201025E+11, 3.8517418245450385E+11, 1.3143739157465117E-02, -3.8517418245443384E+11, 1.0116752717219414E+11, 1.2304576967643431E+11, -2.7025952371216137E+10, -1.4522950934020092E+10, -9.2345162185944176E+08, -5.5580012413990181E+06}; - constexpr FLT c6[] = {3.2693972344231815E+06, 2.8610260147425276E+08, 2.2348528403751349E+09, -3.4574515574230409E+09, -1.7480626463581440E+10, 3.1608597465590984E+10, 1.9879262560063576E+10, -6.6148013553869423E+10, 1.9879262560078850E+10, 3.1608597465530212E+10, -1.7480626463573368E+10, 
-3.4574515574202504E+09, 2.2348528403750744E+09, 2.8610260147425228E+08, 3.2693972344231787E+06}; - constexpr FLT c7[] = {1.4553539959296281E+06, 6.4136842048384696E+07, 1.3622336582072574E+08, -1.2131510424637468E+09, 6.4322366984755766E+08, 4.5078753872548027E+09, -7.1689413747004452E+09, 3.2111361580040181E-03, 7.1689413747369127E+09, -4.5078753874649162E+09, -6.4322366984639454E+08, 1.2131510424612916E+09, -1.3622336582064471E+08, -6.4136842048384838E+07, -1.4553539959296265E+06}; - constexpr FLT c8[] = {4.9358776531681791E+05, 9.7772970960583091E+06, -2.3511574237971250E+07, -1.0142613816625430E+08, 3.9421144217985487E+08, -2.8449115594571364E+08, -5.7549243248595941E+08, 1.1608781630719392E+09, -5.7549243238966489E+08, -2.8449115596289498E+08, 3.9421144214631909E+08, -1.0142613816300942E+08, -2.3511574237913735E+07, 9.7772970960591603E+06, 4.9358776531681628E+05}; - constexpr FLT c9[] = {1.2660319987326709E+05, 7.7519511328105081E+05, -6.5244610661542164E+06, 9.0878257490973976E+06, 2.3116605621149909E+07, -8.7079594477661625E+07, 9.5542733670714021E+07, -3.4623017322338634E-02, -9.5542733658248380E+07, 8.7079594589852452E+07, -2.3116605559600774E+07, -9.0878257518242579E+06, 6.5244610661450867E+06, -7.7519511328086059E+05, -1.2660319987326671E+05}; - constexpr FLT c10[] = {2.3793325531461589E+04, -4.2305332802771904E+04, -5.2884156975031609E+05, 2.5307340145554747E+06, -4.0404175204335153E+06, -1.7519988538994591E+05, 1.0146438798034744E+07, -1.5828545528861172E+07, 1.0146438794496680E+07, -1.7520001842407117E+05, -4.0404175643064296E+06, 2.5307340160591919E+06, -5.2884156977243477E+05, -4.2305332802771285E+04, 2.3793325531458995E+04}; - constexpr FLT c11[] = {2.9741655196857741E+03, -2.0687056403629973E+04, 3.3295507834673197E+04, 1.0661145690364030E+05, -5.6644238449031080E+05, 1.0874811673184116E+06, -9.6561276275880623E+05, -7.6207036577648435E-02, 9.6561275636531680E+05, -1.0874812580259521E+06, 5.6644242612787138E+05, -1.0661145858193116E+05, 
-3.3295507822185595E+04, 2.0687056403005630E+04, -2.9741655196852739E+03}; - constexpr FLT c12[] = {1.5389176594840404E+02, -2.3864418517811582E+03, 1.0846266965476148E+04, -2.2940053899336592E+04, 1.4780105833703366E+04, 4.2663634529139046E+04, -1.3047650082135458E+05, 1.7468394417865420E+05, -1.3047642955960588E+05, 4.2663569014305380E+04, 1.4780038020101238E+04, -2.2940052498526344E+04, 1.0846266965476338E+04, -2.3864418513602504E+03, 1.5389176594853458E+02}; - constexpr FLT c13[] = {-2.3857631312306911E+01, -1.9651606200276817E+01, 6.4183084244784663E+02, -2.8648428291977302E+03, 6.8249248253356263E+03, -9.7944434082514545E+03, 7.6177566999585488E+03, -4.8285923071218206E-02, -7.6177709934185850E+03, 9.7944219680614005E+03, -6.8249060651693289E+03, 2.8648407633460843E+03, -6.4183085466149657E+02, 1.9651606115081155E+01, 2.3857631312306911E+01}; - constexpr FLT c14[] = {-6.1348505726741482E+00, 2.7872916302350376E+01, -6.5819898558168433E+01, 5.1367134246654771E+01, 1.7214275703496423E+02, -6.9657243183240860E+02, 1.3192259272931558E+03, -1.6054145588281010E+03, 1.3192138654025996E+03, -6.9662907027505264E+02, 1.7212038135392731E+02, 5.1368095701697484E+01, -6.5819904020980715E+01, 2.7872916473063263E+01, -6.1348505738411490E+00}; - constexpr FLT c15[] = {-4.9671584422774523E-01, 3.0617550953446120E+00, -1.1650665638577927E+01, 3.0081331929557447E+01, -5.4030564936801589E+01, 6.6075844179663960E+01, -4.7176211285519123E+01, -3.4313439732287163E-02, 4.7173085818207042E+01, -6.6061100127341888E+01, 5.4056655794367416E+01, -3.0081722612971500E+01, 1.1650665638577902E+01, -3.0617553939307713E+00, 4.9671584448693240E-01}; - constexpr FLT c16[] = {4.3460783761337983E-03, -1.3199934226522787E-02, -1.9412503880258877E-01, 1.1325756464362078E+00, -3.4439944517155450E+00, 7.1653575841078521E+00, -1.1108195405465501E+01, 1.2348789868125033E+01, -1.1088023137785596E+01, 7.0939141360622937E+00, -3.4847592426682690E+00, 1.1324705825441117E+00, -1.9413837699275374E-01, 
-1.3199908576142469E-02, 4.3460782759542488E-03}; - for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); - } else if (w==16) { - constexpr FLT c0[] = {3.6434551345571154E+05, 2.0744705928579516E+09, 4.0355760945670056E+11, 1.6364575388763043E+13, 2.3514830376056566E+14, 1.5192201717462540E+15, 4.9956173084674150E+15, 8.9287666945127440E+15, 8.9287666945127440E+15, 4.9956173084674160E+15, 1.5192201717462542E+15, 2.3514830376056566E+14, 1.6364575388763049E+13, 4.0355760945670068E+11, 2.0744705928579512E+09, 3.6434551345570991E+05}; - constexpr FLT c1[] = {2.2576246485480345E+06, 6.6499571180086479E+09, 8.7873753526056311E+11, 2.5606844387131062E+13, 2.6313738449330162E+14, 1.1495095100701470E+15, 2.1932582707747572E+15, 1.2860244365132608E+15, -1.2860244365132600E+15, -2.1932582707747580E+15, -1.1495095100701462E+15, -2.6313738449330162E+14, -2.5606844387131066E+13, -8.7873753526056299E+11, -6.6499571180086479E+09, -2.2576246485480345E+06}; - constexpr FLT c2[] = {6.3730995546265058E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, 1.7953384130753688E+13, 1.2398425545001667E+14, 3.0749346493041262E+14, 1.0259777520247212E+14, -5.5291976457534244E+14, -5.5291976457534294E+14, 1.0259777520247097E+14, 3.0749346493041212E+14, 1.2398425545001659E+14, 1.7953384130753672E+13, 8.8097248605448987E+11, 9.9060026035198078E+09, 6.3730995546265077E+06}; - constexpr FLT c3[] = {1.0896915393078227E+07, 9.0890343524593887E+09, 5.3565169504010052E+11, 7.3004206720038770E+12, 2.9692333044160145E+13, 1.6051737468109752E+13, -9.1273329108089609E+13, -8.5999306918501562E+13, 8.5999306918502812E+13, 9.1273329108090391E+13, -1.6051737468109348E+13, -2.9692333044160059E+13, -7.3004206720038691E+12, -5.3565169504010046E+11, -9.0890343524593925E+09, -1.0896915393078225E+07}; - constexpr FLT c4[] = 
{1.2655725616100591E+07, 5.7342804054544220E+09, 2.1822836608899585E+11, 1.8300700858999712E+12, 2.7770431049857900E+12, -8.5034969223848574E+12, -1.2846668467422469E+13, 1.6519076896573322E+13, 1.6519076896573414E+13, -1.2846668467422033E+13, -8.5034969223850078E+12, 2.7770431049858350E+12, 1.8300700858999753E+12, 2.1822836608899594E+11, 5.7342804054544239E+09, 1.2655725616100593E+07}; - constexpr FLT c5[] = {1.0609303958036318E+07, 2.6255609052371716E+09, 6.1673589426039268E+10, 2.6044432099085120E+11, -3.5431628074578119E+11, -1.6077602129631777E+12, 1.5534405614726155E+12, 2.8019935380863682E+12, -2.8019935380852476E+12, -1.5534405614728257E+12, 1.6077602129636682E+12, 3.5431628074579871E+11, -2.6044432099085229E+11, -6.1673589426039368E+10, -2.6255609052371745E+09, -1.0609303958036322E+07}; - constexpr FLT c6[] = {6.6544809363384582E+06, 8.9490403680928528E+08, 1.1882638725190987E+10, 8.1552898137820768E+09, -1.2575562817884897E+11, 2.7074695075942204E+10, 3.9453789461929230E+11, -3.1679644857371918E+11, -3.1679644857384814E+11, 3.9453789461920764E+11, 2.7074695075779831E+10, -1.2575562817882477E+11, 8.1552898137801113E+09, 1.1882638725190844E+10, 8.9490403680928373E+08, 6.6544809363384526E+06}; - constexpr FLT c7[] = {3.1906872142825029E+06, 2.2785946180651915E+08, 1.3744578972811413E+09, -4.3997172592843504E+09, -9.2011130753862667E+09, 3.4690551711764793E+10, -9.4227043392778511E+09, -5.9308465069355759E+10, 5.9308465069781982E+10, 9.4227043396369877E+09, -3.4690551711565643E+10, 9.2011130754329739E+09, 4.3997172592904301E+09, -1.3744578972811375E+09, -2.2785946180652067E+08, -3.1906872142825001E+06}; - constexpr FLT c8[] = {1.1821527096621764E+06, 4.2281234059839748E+07, 2.8723226058752719E+07, -8.3553955857505906E+08, 1.2447304828865275E+09, 2.1955280942222519E+09, -7.0514195727878428E+09, 4.3745141232918625E+09, 4.3745141237316084E+09, -7.0514195722924280E+09, 2.1955280943332024E+09, 1.2447304828901291E+09, -8.3553955857124400E+08, 
2.8723226058927339E+07, 4.2281234059842363E+07, 1.1821527096621776E+06}; - constexpr FLT c9[] = {3.3854610744279926E+05, 5.2176984975088174E+06, -2.0677283565109752E+07, -3.5831818967739724E+07, 2.6599346107970935E+08, -3.7992777963644773E+08, -1.3426914477301279E+08, 9.1752051236703849E+08, -9.1752051203046608E+08, 1.3426914449876857E+08, 3.7992777988576066E+08, -2.6599346104854524E+08, 3.5831818969687484E+07, 2.0677283565073233E+07, -5.2176984975085324E+06, -3.3854610744279926E+05}; - constexpr FLT c10[] = {7.3893334077310792E+04, 2.6983804209766653E+05, -3.6415998560216571E+06, 8.4025485866871737E+06, 4.9278860835956605E+06, -5.1437033778820507E+07, 8.7603898248918146E+07, -4.6199497914231867E+07, -4.6199497948197275E+07, 8.7603898697554156E+07, -5.1437033767498761E+07, 4.9278861543586710E+06, 8.4025485891638417E+06, -3.6415998559774463E+06, 2.6983804209732520E+05, 7.3893334077308697E+04}; - constexpr FLT c11[] = {1.1778892113376965E+04, -4.0077190108567142E+04, -1.8372552169915423E+05, 1.3262878389569877E+06, -2.9738540196046322E+06, 1.9493506557541618E+06, 4.1881949490808225E+06, -1.1066749801915919E+07, 1.1066748877418302E+07, -4.1881948928182255E+06, -1.9493507634843190E+06, 2.9738539997848324E+06, -1.3262878392766670E+06, 1.8372552166918706E+05, 4.0077190106849979E+04, -1.1778892113376709E+04}; - constexpr FLT c12[] = {1.2019749667900676E+03, -1.0378455845063749E+04, 2.6333352662141660E+04, 1.7117059675298591E+04, -2.5133289742429825E+05, 6.4713895872015413E+05, -8.1634975674778735E+05, 3.8623909535608569E+05, 3.8623887467451266E+05, -8.1634966479713970E+05, 6.4713897711029404E+05, -2.5133289282677229E+05, 1.7117063267120848E+04, 2.6333352680101594E+04, -1.0378455843660833E+04, 1.2019749667921026E+03}; - constexpr FLT c13[] = {3.1189837631121321E+01, -8.9083493701244504E+02, 4.9454293991649774E+03, -1.3124692742151998E+04, 1.5834795298841136E+04, 6.9608292767098355E+03, -5.9790200829217545E+04, 1.0841735230501879E+05, -1.0841732371809872E+05, 
5.9789914960016831E+04, -6.9607435159496199E+03, -1.5834797085523640E+04, 1.3124692295481371E+04, -4.9454294410403490E+03, 8.9083493766674769E+02, -3.1189837632399257E+01}; - constexpr FLT c14[] = {-1.2975319072478742E+01, 1.8283699094028595E+01, 1.7684019694555272E+02, -1.1059902320249000E+03, 3.1998244780238201E+03, -5.5987981589200417E+03, 5.9247600879368474E+03, -2.5988290685215188E+03, -2.5988178806809206E+03, 5.9249852432272892E+03, -5.5987701893187350E+03, 3.1998552445852642E+03, -1.1059895327848767E+03, 1.7684022972243278E+02, 1.8283699179384410E+01, -1.2975319072812146E+01}; - constexpr FLT c15[] = {-2.3155118729306223E+00, 1.1938503369059017E+01, -3.4150537494399323E+01, 4.8897188710734866E+01, 1.5839596560322873E+01, -2.4289147960969117E+02, 6.0143231605823757E+02, -8.8772403477020873E+02, 8.8712611928432557E+02, -6.0139861536721287E+02, 2.4281211991792659E+02, -1.5853729108169823E+01, -4.8898479664625256E+01, 3.4150529001281690E+01, -1.1938504563403686E+01, 2.3155118727038264E+00}; - constexpr FLT c16[] = {-1.5401723836370515E-01, 9.8067787978090881E-01, -4.1900810719931050E+00, 1.2149798852514468E+01, -2.4780790340446881E+01, 3.6014221907804398E+01, -3.4588714991383583E+01, 1.3071629460227753E+01, 1.2883354961750646E+01, -3.4615611348253751E+01, 3.5973877372428277E+01, -2.4777428295844171E+01, 1.2151059619254390E+01, -4.1901237542037384E+00, 9.8067813628521039E-01, -1.5401723766235165E-01}; - constexpr FLT c17[] = {1.1808834947531816E-02, -2.5444032491006262E-02, -1.4707353726716647E-04, 2.5840423001794482E-01, -1.0910598687678679E+00, 2.6514321899473572E+00, -4.5034457705829842E+00, 6.8479728528821520E+00, -6.8634402190500978E+00, 4.4285511554539836E+00, -2.6424773990080204E+00, 1.0878035811535636E+00, -2.5882398584322625E-01, 1.3196868749378181E-04, 2.5444131865017927E-02, -1.1808835384234016E-02}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + 
z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); - } else - printf("width not implemented!\n"); From 74ccd71834634179d26cd7224788c4994015062d Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 31 Jul 2024 18:32:51 -0400 Subject: [PATCH 33/68] fixed cmake --- perftest/cuda/CMakeLists.txt | 9 +++++++-- src/cuda/CMakeLists.txt | 35 ++++++++++++++++++++++------------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index ba3bde04a..ec95760fb 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -1,5 +1,10 @@ add_executable(cuperftest cuperftest.cu) target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cuperftest PUBLIC cufinufft) -# file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION -# ${CMAKE_CURRENT_BINARY_DIR}) +target_compile_features(cuperftest PRIVATE cxx_std_17) +set_target_properties( + cuperftest + PROPERTIES LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 77b86ae77..2b91f91d7 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -1,7 +1,3 @@ -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp) @@ -47,8 +43,14 @@ target_include_directories(cufinufft_common_objects set_target_properties( cufinufft_common_objects PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) - + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) +target_compile_features(cufinufft_common_objects PRIVATE cxx_std_17) 
+target_compile_options( + cufinufft_common_objects + PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) target_compile_options( cufinufft_common_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) @@ -58,24 +60,31 @@ target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) set_target_properties( cufinufft_objects PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) +target_compile_features(cufinufft_objects PRIVATE cxx_std_17) target_compile_options( cufinufft_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) if(FINUFFT_SHARED_LINKING) add_library(cufinufft SHARED $ $) - set_target_properties( - cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) else() add_library(cufinufft STATIC $ $) - set_target_properties( - cufinufft PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) endif() +set_target_properties( + cufinufft + PROPERTIES CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) + if(WIN32) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft CUDA::nvToolsExt) From ee28d05c15be75fa6c377e17ecf51ef94e19f902 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 1 Aug 2024 10:45:35 -0400 Subject: [PATCH 34/68] Gcc-9 fixes; Ker size fixed too --- devel/CMakeLists.txt | 24 +++++++++++++----------- examples/CMakeLists.txt | 3 +++ examples/cuda/CMakeLists.txt | 2 +- src/cuda/1d/spreadinterp1d.cuh | 6 +++--- src/cuda/2d/spreadinterp2d.cuh | 8 ++++---- src/cuda/CMakeLists.txt | 1 + test/cuda/CMakeLists.txt | 8 ++++++-- 7 files changed, 31 insertions(+), 21 deletions(-) diff --git 
a/devel/CMakeLists.txt b/devel/CMakeLists.txt index 9a376408e..45b9a5989 100644 --- a/devel/CMakeLists.txt +++ b/devel/CMakeLists.txt @@ -2,23 +2,25 @@ project(finufft_devel) # Set the minimum required version of CMake cmake_minimum_required(VERSION 3.5) - # include cpm cmake, downloading it -CPMAddPackage( - NAME benchmark - GITHUB_REPOSITORY google/benchmark - VERSION 1.8.3 - OPTIONS "BENCHMARK_ENABLE_TESTING OFF" - -) +cpmaddpackage( + NAME + benchmark + GITHUB_REPOSITORY + google/benchmark + VERSION + 1.8.3 + OPTIONS + "BENCHMARK_ENABLE_TESTING OFF") -if (benchmark_ADDED) - # patch benchmark target - set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) +if(benchmark_ADDED) + # patch benchmark target + set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) endif() add_executable(foldrescale foldrescale.cpp) target_link_libraries(foldrescale finufft benchmark xsimd) add_executable(padding padding.cpp) +target_compile_features(padding PRIVATE cxx_std_17) target_link_libraries(padding finufft xsimd) target_compile_options(padding PRIVATE -march=native) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 35ac5662c..27b193cd5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -11,6 +11,7 @@ set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf) foreach(EXAMPLE ${EXAMPLES}) add_executable(${EXAMPLE} ${EXAMPLE}.cpp) + target_compile_features(${EXAMPLE} PRIVATE cxx_std_17) target_link_libraries(${EXAMPLE} PRIVATE finufft) enable_asan(${EXAMPLE}) endforeach() @@ -18,6 +19,7 @@ endforeach() foreach(EXAMPLE ${EXAMPLES_C}) add_executable(${EXAMPLE} ${EXAMPLE}.c) target_link_libraries(${EXAMPLE} PRIVATE finufft) + target_compile_features(${EXAMPLE} PRIVATE cxx_std_17) enable_asan(${EXAMPLE}) endforeach() @@ -25,6 +27,7 @@ if(FINUFFT_USE_OPENMP) foreach(EXAMPLE ${EXAMPLES_OPENMP}) add_executable(${EXAMPLE} ${EXAMPLE}.cpp) target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX) + target_compile_features(${EXAMPLE} PRIVATE 
cxx_std_17) enable_asan(${EXAMPLE}) endforeach() endif() diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt index 0c9dba361..b9742a865 100644 --- a/examples/cuda/CMakeLists.txt +++ b/examples/cuda/CMakeLists.txt @@ -1,4 +1,3 @@ - file(GLOB example_src "*.cpp") foreach(srcfile ${example_src}) @@ -7,4 +6,5 @@ foreach(srcfile ${example_src}) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(${executable} cufinufft) + target_compile_features(${executable} PRIVATE cxx_std_17) endforeach() diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 56493ef73..72c776c06 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -24,7 +24,7 @@ __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, T es_beta, T sigma, const int *idxnupts) { // dynamic stack allocation to reduce stack usage #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns); auto *__restrict__ ker1 = ker; #else T ker1[MAX_NSPREAD]; @@ -109,7 +109,7 @@ __global__ void spread_1d_subprob( // dynamic stack allocation #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns); auto *__restrict__ ker1 = ker; #else T ker1[MAX_NSPREAD]; @@ -160,7 +160,7 @@ __global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, T es_c, T es_beta, T sigma, const int *idxnupts) { // dynamic stack allocation #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns); auto *__restrict__ ker1 = ker; #else T ker1[MAX_NSPREAD]; diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 03da3ed8a..53a243e7e 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -20,7 +20,7 @@ __global__ void spread_2d_nupts_driven( const T *x, const T *y, const 
cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns * 2); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; #else @@ -137,7 +137,7 @@ __global__ void spread_2d_subprob( const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns * 2); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; #else @@ -211,7 +211,7 @@ __global__ void interp_2d_nupts_driven( const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns * 2); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; #else @@ -265,7 +265,7 @@ __global__ void interp_2d_subprob( cuda_complex *fwshared = (cuda_complex *)sharedbuf; #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns * 2); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; #else diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 2b91f91d7..ae9431c31 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -84,6 +84,7 @@ set_target_properties( CUDA_STANDARD_REQUIRED ON ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) +target_compile_features(cufinufft PRIVATE cxx_std_17) if(WIN32) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index a74dcdd79..6d93d3f15 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -7,8 +7,12 @@ foreach(srcfile ${test_src}) 
target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(${executable} PUBLIC cufinufft m) set_target_properties( - ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES - ${FINUFFT_CUDA_ARCHITECTURES}) + ${executable} + PROPERTIES LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) + target_compile_features(${executable} PRIVATE cxx_std_17) message(STATUS "Adding test ${executable}" " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}") From 466ddffe166a505fd37972f39f6555c8d580ffa0 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 1 Aug 2024 16:38:43 -0400 Subject: [PATCH 35/68] windows compatibility tweak; unit testing the 1.25 upsampfact --- include/cufinufft/utils.h | 13 ++++-- src/cuda/spreadinterp.cpp | 2 +- test/cuda/CMakeLists.txt | 75 ++++++++++++++++--------------- test/cuda/cufinufft1d_test.cu | 30 +++++++------ test/cuda/cufinufft2d_test.cu | 37 ++++++++------- test/cuda/cufinufft2dmany_test.cu | 13 +++--- test/cuda/cufinufft3d_test.cu | 38 ++++++++-------- 7 files changed, 113 insertions(+), 95 deletions(-) diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index b4db528ae..4bfaa801d 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -92,7 +92,12 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { #if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) #if (__CUDACC_VER_MAJOR__ > 11) || \ (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600) + #define ALLOCA_SUPPORTED 1 +// windows compatibility +#if __has_include() +#include +#endif #else #define ALLOCA_SUPPORTED 0 #endif @@ -118,8 +123,8 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { */ template -static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *address, - cuda_complex res) { 
+static __forceinline__ __device__ void atomicAddComplexShared( + cuda_complex *address, cuda_complex res) { const auto raw_address = reinterpret_cast(address); atomicAdd(raw_address, res.x); atomicAdd(raw_address + 1, res.y); @@ -131,8 +136,8 @@ static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *a * on shared memory are supported so we leverage them */ template -static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *address, - cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexGlobal( + cuda_complex *address, cuda_complex res) { if constexpr ( std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { atomicAdd(address, res); diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index b01d1c98f..98b5382bc 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -22,7 +22,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet // Must call before any kernel evals done. 
// Returns: 0 success, 1, warning, >1 failure (see error codes in utils.h) { - if (upsampfac != 2.0) { // nonstandard sigma + if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma if (kerevalmeth == 1) { fprintf(stderr, "[%s] nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n", diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 6d93d3f15..04ae83e75 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -7,76 +7,77 @@ foreach(srcfile ${test_src}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(${executable} PUBLIC cufinufft m) set_target_properties( - ${executable} - PROPERTIES LINKER_LANGUAGE CUDA - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON) - target_compile_features(${executable} PRIVATE cxx_std_17) + ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES + ${FINUFFT_CUDA_ARCHITECTURES}) message(STATUS "Adding test ${executable}" " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}") endforeach() -function(add_tests PREC REQ_TOL CHECK_TOL) - add_test(NAME cufinufft1d1_test_GM_${PREC} - COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}) +function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) + add_test(NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) - add_test(NAME cufinufft1d1_test_SM_${PREC} - COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft1d1_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) - add_test(NAME cufinufft1d2_test_GM_${PREC} - COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) - 
add_test(NAME cufinufft2d1_test_GM_${PREC} + add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) - add_test(NAME cufinufft2d1_test_SM_${PREC} + add_test(NAME cufinufft2d1_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) - add_test(NAME cufinufft2d1many_test_GM_${PREC} + add_test(NAME cufinufft2d1many_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL} - ${CHECK_TOL} ${PREC}) + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test(NAME cufinufft2d1many_test_SM_${PREC} + add_test(NAME cufinufft2d1many_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft2dmany_test 2 1 1e2 2e2 5 0 2e4 ${REQ_TOL} - ${CHECK_TOL} ${PREC}) + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test(NAME cufinufft2d2many_test_GM_${PREC} + add_test(NAME cufinufft2d2many_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2dmany_test 1 2 1e2 2e2 5 0 2e4 ${REQ_TOL} - ${CHECK_TOL} ${PREC}) + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test(NAME cufinufft2d2many_test_SM_${PREC} + add_test(NAME cufinufft2d2many_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL} - ${CHECK_TOL} ${PREC}) + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test(NAME cufinufft3d1_test_GM_${PREC} + add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) if(${PREC} STREQUAL "float") - add_test(NAME cufinufft3d1_test_SM_${PREC} + add_test(NAME cufinufft3d1_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 2 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) - add_test(NAME cufinufft3d1_test_block_${PREC} + add_test(NAME cufinufft3d1_test_block_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 4 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) - add_test(NAME cufinufft3d2_test_SM_${PREC} + add_test(NAME 
cufinufft3d2_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) endif() - add_test(NAME cufinufft3d2_test_GM_${PREC} + add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) endfunction() -add_tests(float 1e-5 2e-4) -add_tests(double 1e-12 1e-11) +add_tests(float 1e-5 2e-4 2.0) +add_tests(double 1e-12 1e-11 2.0) +add_tests(float 1e-5 2e-4 1.25) +add_tests(double 1e-8 1e-7 1.25) add_test(NAME cufinufft_public_api COMMAND public_api_test) add_test(NAME cufinufft_makeplan COMMAND test_makeplan) diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu index 05b62025e..dbd6260ac 100644 --- a/test/cuda/cufinufft1d_test.cu +++ b/test/cuda/cufinufft1d_test.cu @@ -17,7 +17,8 @@ using cufinufft::utils::infnorm; template -int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) { +int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, + double upsampfac) { std::cout << std::scientific << std::setprecision(3); int ier; @@ -88,6 +89,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) opts.gpu_method = method; opts.gpu_maxbatchsize = 1; + opts.upsampfac = upsampfac; int nmodes[3] = {N1, 1, 1}; int ntransf = 1; @@ -178,7 +180,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) } int main(int argc, char *argv[]) { - if (argc != 8) { + if (argc != 9) { fprintf(stderr, "Usage: cufinufft1d_test method type N1 M tol checktol prec\n" "Arguments:\n" " method: One of\n" @@ -188,21 +190,23 @@ int main(int argc, char *argv[]) { " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " precision: f or d\n"); + " precision: f or d\n" + " upsampfac: upsampling factor\n"); return 1; } - const int method = atoi(argv[1]); - const int type = 
atoi(argv[2]); - const int N1 = atof(argv[3]); - const int M = atof(argv[4]); - const double tol = atof(argv[5]); - const double checktol = atof(argv[6]); - const int iflag = 1; - const char prec = argv[7][0]; + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int M = atof(argv[4]); + const double tol = atof(argv[5]); + const double checktol = atof(argv[6]); + const int iflag = 1; + const char prec = argv[7][0]; + const double upsampfac = atof(argv[8]); if (prec == 'f') - return run_test(method, type, N1, M, tol, checktol, iflag); + return run_test(method, type, N1, M, tol, checktol, iflag, upsampfac); else if (prec == 'd') - return run_test(method, type, N1, M, tol, checktol, iflag); + return run_test(method, type, N1, M, tol, checktol, iflag, upsampfac); else return -1; } diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu index 4157f6230..f3b767f2e 100644 --- a/test/cuda/cufinufft2d_test.cu +++ b/test/cuda/cufinufft2d_test.cu @@ -18,7 +18,8 @@ using cufinufft::utils::infnorm; template -int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag) { +int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag, + double upsampfac) { std::cout << std::scientific << std::setprecision(3); thrust::host_vector x(M), y(M); @@ -88,9 +89,9 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int opts.gpu_method = method; opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, N2, 1}; - int ntransf = 1; + opts.upsampfac = upsampfac; + int nmodes[3] = {N1, N2, 1}; + int ntransf = 1; cudaEventRecord(start); int ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); @@ -178,7 +179,7 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int } int main(int argc, char *argv[]) { - if (argc != 9) { + if (argc != 10) { fprintf(stderr, "Usage: cufinufft2d1_test method N1 
N2 M tol checktol\n" "Arguments:\n" " method: One of\n" @@ -189,23 +190,25 @@ int main(int argc, char *argv[]) { " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); + " prec: 'f' or 'd' (float/double)\n" + " upsampfac: upsampling factor\n"); return 1; } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int M = atof(argv[5]); - const double tol = atof(argv[6]); - const double checktol = atof(argv[7]); - const char prec = argv[8][0]; - const int iflag = 1; + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int M = atof(argv[5]); + const double tol = atof(argv[6]); + const double checktol = atof(argv[7]); + const char prec = argv[8][0]; + const double upsampfac = atof(argv[9]); + const int iflag = 1; if (prec == 'f') - return run_test(method, type, N1, N2, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, M, tol, checktol, iflag, upsampfac); else if (prec == 'd') - return run_test(method, type, N1, N2, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, M, tol, checktol, iflag, upsampfac); else return -1; } diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu index b4f3529e1..4afcd97dd 100644 --- a/test/cuda/cufinufft2dmany_test.cu +++ b/test/cuda/cufinufft2dmany_test.cu @@ -19,7 +19,7 @@ using cufinufft::utils::infnorm; template int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize, int M, - T tol, T checktol, int iflag) { + T tol, T checktol, int iflag, double upsampfac) { std::cout << std::scientific << std::setprecision(3); int ier; @@ -93,6 +93,7 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize opts.gpu_method = method; opts.gpu_maxbatchsize = maxbatchsize; + 
opts.upsampfac = upsampfac; int nmodes[3] = {N1, N2, 1}; cudaEventRecord(start); @@ -184,7 +185,7 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize } int main(int argc, char *argv[]) { - if (argc != 11) { + if (argc != 12) { fprintf(stderr, "Usage: cufinufft2d1many_test method type N1 N2 ntransf maxbatchsize M " "tol checktol prec\n" @@ -199,7 +200,8 @@ int main(int argc, char *argv[]) { " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); + " prec: 'f' or 'd' (float/double)\n" + " upsampfac: upsampling factor\n"); return 1; } const int method = atoi(argv[1]); @@ -212,14 +214,15 @@ int main(int argc, char *argv[]) { const double tol = atof(argv[8]); const double checktol = atof(argv[9]); const char prec = argv[10][0]; + const double upsampfac = atof(argv[11]); const int iflag = 1; if (prec == 'f') return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, - iflag); + iflag, upsampfac); else if (prec == 'd') return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, - iflag); + iflag, upsampfac); else return -1; } diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu index 933dda36d..67818c2b2 100644 --- a/test/cuda/cufinufft3d_test.cu +++ b/test/cuda/cufinufft3d_test.cu @@ -19,7 +19,7 @@ using cufinufft::utils::infnorm; template int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T checktol, - int iflag) { + int iflag, double upsampfac) { std::cout << std::scientific << std::setprecision(3); int ier; @@ -94,9 +94,9 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check opts.gpu_method = method; opts.gpu_kerevalmeth = 1; opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, N2, N3}; - int ntransf = 1; + opts.upsampfac = upsampfac; + int nmodes[3] = {N1, N2, N3}; + int ntransf = 1; cudaEventRecord(start); ier = 
cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); @@ -190,7 +190,7 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check } int main(int argc, char *argv[]) { - if (argc < 10) { + if (argc != 11) { fprintf(stderr, "Usage: cufinufft3d1_test method type N1 N2 N3 M tol checktol prec\n" "Arguments:\n" @@ -203,24 +203,26 @@ int main(int argc, char *argv[]) { " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); + " prec: 'f' or 'd' (float/double)\n" + " upsamplefac: upsampling factor\n"); return 1; } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int N3 = atof(argv[5]); - const int M = atof(argv[6]); - const double tol = atof(argv[7]); - const double checktol = atof(argv[8]); - const char prec = argv[9][0]; - const int iflag = 1; + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int N3 = atof(argv[5]); + const int M = atof(argv[6]); + const double tol = atof(argv[7]); + const double checktol = atof(argv[8]); + const char prec = argv[9][0]; + const double upsampfac = atof(argv[10]); + const int iflag = 1; if (prec == 'f') - return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag, upsampfac); else if (prec == 'd') - return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag, upsampfac); else return -1; } From fb48ff8d668905bed97eb917e8e18ba4ffd74e4e Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 1 Aug 2024 16:47:12 -0400 Subject: [PATCH 36/68] added forgotten c++17 flag --- test/cuda/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 04ae83e75..d9c5d312b 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -5,7 +5,11 @@ foreach(srcfile ${test_src}) get_filename_component(executable ${executable} NAME) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) - target_link_libraries(${executable} PUBLIC cufinufft m) + find_library(MathLib m) + if(MathLib) + target_link_libraries(${executable} PUBLIC cufinufft ${MathLib}) + endif() + target_compile_features(${executable} PUBLIC cxx_std_17) set_target_properties( ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) From b64f68ef127470b73b44b2e154f645b70bfc8e0e Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 8 Aug 2024 15:24:48 -0400 Subject: [PATCH 37/68] Preliminary type 3 commit. Incomplete setpts but greatly simplifies the fseries computation --- include/cufinufft/common.h | 19 +- include/cufinufft/contrib/helper_math.h | 168 ++++++++ include/cufinufft/defs.h | 9 +- include/cufinufft/impl.h | 526 +++++++++++++++++------- include/cufinufft/types.h | 50 ++- include/cufinufft/utils.h | 85 +++- include/cufinufft_opts.h | 2 + src/cuda/CMakeLists.txt | 18 +- src/cuda/common.cu | 190 +++++---- src/cuda/cufinufft.cu | 1 + src/cuda/spreadinterp.cpp | 6 +- src/finufft.cpp | 4 +- test/cuda/CMakeLists.txt | 5 + test/cuda/cufinufft_math_test.cu | 129 ++++++ test/cuda/fseries_kernel_test.cu | 72 ++-- 15 files changed, 959 insertions(+), 325 deletions(-) create mode 100644 include/cufinufft/contrib/helper_math.h create mode 100644 test/cuda/cufinufft_math_test.cu diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index efa7eb7b1..18478f49f 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -12,22 +12,23 @@ namespace cufinufft { namespace common { template -__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, - 
cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2, - T *fwkerhalf3, int ns); +__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, + T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, + int ns); template -int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, - cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2, - T *d_fwkerhalf3, int ns, cudaStream_t stream); +int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, + T *d_fwkerhalf1, T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, + cudaStream_t stream); template int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts); void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT b); +// template +// void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts +// opts); template -void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts); -template -void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex *a, +void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, finufft_spread_opts opts); template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, diff --git a/include/cufinufft/contrib/helper_math.h b/include/cufinufft/contrib/helper_math.h new file mode 100644 index 000000000..cc1ff8411 --- /dev/null +++ b/include/cufinufft/contrib/helper_math.h @@ -0,0 +1,168 @@ +#ifndef FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H +#define FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H + +#include + +// Addition for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ cuDoubleComplex operator+( + const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept { + return cuCadd(a, b); +} + +// Subtraction for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ cuDoubleComplex operator-( + 
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept { + return cuCsub(a, b); +} + +// Multiplication for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ cuDoubleComplex operator*( + const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept { + return cuCmul(a, b); +} + +// Division for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ cuDoubleComplex operator/( + const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept { + return cuCdiv(a, b); +} + +// Equality for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ bool operator==(const cuDoubleComplex &a, + const cuDoubleComplex &b) noexcept { + return cuCreal(a) == cuCreal(b) && cuCimag(a) == cuCimag(b); +} + +// Inequality for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ bool operator!=(const cuDoubleComplex &a, + const cuDoubleComplex &b) noexcept { + return !(a == b); +} + +// Addition for cuDoubleComplex (double) with double +__host__ __device__ __forceinline__ cuDoubleComplex operator+(const cuDoubleComplex &a, + double b) noexcept { + return make_cuDoubleComplex(cuCreal(a) + b, cuCimag(a)); +} + +__host__ __device__ __forceinline__ cuDoubleComplex operator+( + double a, const cuDoubleComplex &b) noexcept { + return make_cuDoubleComplex(a + cuCreal(b), cuCimag(b)); +} + +// Subtraction for cuDoubleComplex (double) with double +__host__ __device__ __forceinline__ cuDoubleComplex operator-(const cuDoubleComplex &a, + double b) noexcept { + return make_cuDoubleComplex(cuCreal(a) - b, cuCimag(a)); +} + +__host__ __device__ __forceinline__ cuDoubleComplex operator-( + double a, const cuDoubleComplex &b) noexcept { + return make_cuDoubleComplex(a - cuCreal(b), -cuCimag(b)); +} + +// Multiplication for cuDoubleComplex (double) with double +__host__ __device__ __forceinline__ cuDoubleComplex operator*(const cuDoubleComplex &a, + 
double b) noexcept { + return make_cuDoubleComplex(cuCreal(a) * b, cuCimag(a) * b); +} + +__host__ __device__ __forceinline__ cuDoubleComplex operator*( + double a, const cuDoubleComplex &b) noexcept { + return make_cuDoubleComplex(a * cuCreal(b), a * cuCimag(b)); +} + +// Division for cuDoubleComplex (double) with double +__host__ __device__ __forceinline__ cuDoubleComplex operator/(const cuDoubleComplex &a, + double b) noexcept { + return make_cuDoubleComplex(cuCreal(a) / b, cuCimag(a) / b); +} + +__host__ __device__ __forceinline__ cuDoubleComplex operator/( + double a, const cuDoubleComplex &b) noexcept { + double denom = cuCreal(b) * cuCreal(b) + cuCimag(b) * cuCimag(b); + return make_cuDoubleComplex((a * cuCreal(b)) / denom, (-a * cuCimag(b)) / denom); +} + +// Addition for cuFloatComplex (float) with cuFloatComplex (float) +__host__ __device__ __forceinline__ cuFloatComplex operator+( + const cuFloatComplex &a, const cuFloatComplex &b) noexcept { + return cuCaddf(a, b); +} + +// Subtraction for cuFloatComplex (float) with cuFloatComplex (float) +__host__ __device__ __forceinline__ cuFloatComplex operator-( + const cuFloatComplex &a, const cuFloatComplex &b) noexcept { + return cuCsubf(a, b); +} + +// Multiplication for cuFloatComplex (float) with cuFloatComplex (float) +__host__ __device__ __forceinline__ cuFloatComplex operator*( + const cuFloatComplex &a, const cuFloatComplex &b) noexcept { + return cuCmulf(a, b); +} + +// Division for cuFloatComplex (float) with cuFloatComplex (float) +__host__ __device__ __forceinline__ cuFloatComplex operator/( + const cuFloatComplex &a, const cuFloatComplex &b) noexcept { + return cuCdivf(a, b); +} + +// Equality for cuFloatComplex (float) with cuFloatComplex (float) +__host__ __device__ __forceinline__ bool operator==(const cuFloatComplex &a, + const cuFloatComplex &b) noexcept { + return cuCrealf(a) == cuCrealf(b) && cuCimagf(a) == cuCimagf(b); +} + +// Inequality for cuFloatComplex (float) with cuFloatComplex 
(float) +__host__ __device__ __forceinline__ bool operator!=(const cuFloatComplex &a, + const cuFloatComplex &b) noexcept { + return !(a == b); +} + +// Addition for cuFloatComplex (float) with float +__host__ __device__ __forceinline__ cuFloatComplex operator+(const cuFloatComplex &a, + float b) noexcept { + return make_cuFloatComplex(cuCrealf(a) + b, cuCimagf(a)); +} + +__host__ __device__ __forceinline__ cuFloatComplex operator+( + float a, const cuFloatComplex &b) noexcept { + return make_cuFloatComplex(a + cuCrealf(b), cuCimagf(b)); +} + +// Subtraction for cuFloatComplex (float) with float +__host__ __device__ __forceinline__ cuFloatComplex operator-(const cuFloatComplex &a, + float b) noexcept { + return make_cuFloatComplex(cuCrealf(a) - b, cuCimagf(a)); +} + +__host__ __device__ __forceinline__ cuFloatComplex operator-( + float a, const cuFloatComplex &b) noexcept { + return make_cuFloatComplex(a - cuCrealf(b), -cuCimagf(b)); +} + +// Multiplication for cuFloatComplex (float) with float +__host__ __device__ __forceinline__ cuFloatComplex operator*(const cuFloatComplex &a, + float b) noexcept { + return make_cuFloatComplex(cuCrealf(a) * b, cuCimagf(a) * b); +} + +__host__ __device__ __forceinline__ cuFloatComplex operator*( + float a, const cuFloatComplex &b) noexcept { + return make_cuFloatComplex(a * cuCrealf(b), a * cuCimagf(b)); +} + +// Division for cuFloatComplex (float) with float +__host__ __device__ __forceinline__ cuFloatComplex operator/(const cuFloatComplex &a, + float b) noexcept { + return make_cuFloatComplex(cuCrealf(a) / b, cuCimagf(a) / b); +} + +__host__ __device__ __forceinline__ cuFloatComplex operator/( + float a, const cuFloatComplex &b) noexcept { + float denom = cuCrealf(b) * cuCrealf(b) + cuCimagf(b) * cuCimagf(b); + return make_cuFloatComplex((a * cuCrealf(b)) / denom, (-a * cuCimagf(b)) / denom); +} + +#endif // FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H diff --git a/include/cufinufft/defs.h b/include/cufinufft/defs.h index 
6b2a075ea..8a677d21b 100644 --- a/include/cufinufft/defs.h +++ b/include/cufinufft/defs.h @@ -1,15 +1,18 @@ #ifndef CUFINUFFT_DEFS_H #define CUFINUFFT_DEFS_H +#include #include - // constants needed within common // upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for // common -#define MAX_NSPREAD 16 +#define MAX_NSPREAD 16 // max number of positive quadr nodes -#define MAX_NQUAD 100 +#define MAX_NQUAD 100 + +// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 +#define ARRAYWIDCEN_GROWFRAC 0.1 // FIXME: If cufft ever takes N > INT_MAX... constexpr int32_t MAX_NF = std::numeric_limits::max(); diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 3a9fd6877..85f2d6e0e 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -62,11 +63,12 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21. Marco Barbone 07/26/24. Using SM when shared memory available is enough. */ + using namespace cufinufft::common; int ier; - cuDoubleComplex *d_a = nullptr; // fseries temp data - T *d_f = nullptr; // fseries temp data + T *d_a = nullptr; // fseries temp data + T *d_f = nullptr; // fseries temp data - if (type < 1 || type > 2) { + if (type < 1 || type > 3) { fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type); return FINUFFT_ERR_TYPE_NOTVALID; } @@ -76,21 +78,33 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran return FINUFFT_ERR_NTRANS_NOTVALID; } - // Mult-GPU support: set the CUDA Device ID: - const int device_id = opts == nullptr ? 0 : opts->gpu_device_id; - cufinufft::utils::WithCudaDevice device_swapper(device_id); - /* allocate the plan structure, assign address to user pointer. 
*/ auto *d_plan = new cufinufft_plan_t; *d_plan_ptr = d_plan; // Zero out your struct, (sets all pointers to NULL) memset(d_plan, 0, sizeof(*d_plan)); + // set nf1, nf2, nf3 to 1 for type 3, type 1, type 2 will overwrite this + d_plan->nf1 = 1; + d_plan->nf2 = 1; + d_plan->nf3 = 1; /* If a user has not supplied their own options, assign defaults for them. */ if (opts == nullptr) { // use default opts cufinufft_default_opts(&(d_plan->opts)); } else { // or read from what's passed in d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect } + d_plan->dim = dim; + d_plan->opts.gpu_maxbatchsize = std::max(d_plan->opts.gpu_maxbatchsize, 1); + if (type != 3) { + d_plan->ms = nmodes[0]; + d_plan->mt = nmodes[1]; + d_plan->mu = nmodes[2]; + } + const auto stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; + + // Mult-GPU support: set the CUDA Device ID: + const int device_id = d_plan->opts.gpu_device_id; + const cufinufft::utils::WithCudaDevice FromID{device_id}; // cudaMallocAsync isn't supported for all devices, regardless of cuda version. 
Check // for support @@ -104,162 +118,180 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran warned = true; } - auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; - using namespace cufinufft::common; - /* Setup Spreader */ + // simple check to use upsampfac=1.25 if tol is big + // FIXME: since cufft is really fast we should use 1.25 only if we run out of vram + if (d_plan->opts.upsampfac == 0.0) { // indicates auto-choose + d_plan->opts.upsampfac = 2.0; // default, and need for tol small + if (tol >= (T)1E-9) { // the tol sigma=5/4 can reach + d_plan->opts.upsampfac = 1.25; + } + if (d_plan->opts.debug) { + printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac); + } + } - // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK + /* Setup Spreader */ if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { + // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK delete *d_plan_ptr; *d_plan_ptr = nullptr; return ier; } - d_plan->dim = dim; - d_plan->ms = nmodes[0]; - d_plan->mt = nmodes[1]; - d_plan->mu = nmodes[2]; - - cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); - RETURN_IF_CUDA_ERROR - - CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; - set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, - d_plan->opts.gpu_obinsizex); - if (dim > 1) - set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2, - d_plan->opts.gpu_obinsizey); - if (dim > 2) - set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, - d_plan->opts.gpu_obinsizez); - - // dynamically request the maximum amount of shared memory available - // for the spreader - - /* Automatically set GPU method. */ - if (d_plan->opts.gpu_method == 0) { - /* For type 1, we default to method 2 (SM) since this is generally faster - * if there is enough shared memory available. Otherwise, we default to GM. - * - * For type 2, we always default to method 1 (GM). 
- */ - if (type == 2) { - d_plan->opts.gpu_method = 1; - } else { - // query the device for the amount of shared memory available + d_plan->type = type; + d_plan->spopts.spread_direction = d_plan->type; + + if (type == 1 || type == 2) { + cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); + + if (d_plan->opts.debug) { + printf("[cufinufft] bin size x: %d", d_plan->opts.gpu_binsizex); + if (dim > 1) printf(" bin size y: %d", d_plan->opts.gpu_binsizey); + if (dim > 2) printf(" bin size z: %d", d_plan->opts.gpu_binsizez); + printf("\n"); + // shared memory required for the spreader vs available shared memory int shared_mem_per_block{}; cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); - RETURN_IF_CUDA_ERROR - // compute the amount of shared memory required for the method - const auto shared_mem_required = shared_memory_required( + const auto mem_required = shared_memory_required( dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); - if ((shared_mem_required > shared_mem_per_block)) { + printf("[cufinufft] shared memory required for the spreader: %d\n", mem_required); + } + + RETURN_IF_CUDA_ERROR + + CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; + set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, + d_plan->opts.gpu_obinsizex); + if (dim > 1) + set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2, + d_plan->opts.gpu_obinsizey); + if (dim > 2) + set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, + d_plan->opts.gpu_obinsizez); + + // dynamically request the maximum amount of shared memory available + // for the spreader + + /* Automatically set GPU method. */ + if (d_plan->opts.gpu_method == 0) { + /* For type 1, we default to method 2 (SM) since this is generally faster + * if there is enough shared memory available. Otherwise, we default to GM. + * + * For type 2, we always default to method 1 (GM). 
+ */ + if (type == 2) { d_plan->opts.gpu_method = 1; } else { - d_plan->opts.gpu_method = 2; + // query the device for the amount of shared memory available + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + RETURN_IF_CUDA_ERROR + // compute the amount of shared memory required for the method + const auto shared_mem_required = shared_memory_required( + dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + if ((shared_mem_required > shared_mem_per_block)) { + d_plan->opts.gpu_method = 1; + } else { + d_plan->opts.gpu_method = 2; + } } } - } - - int fftsign = (iflag >= 0) ? 1 : -1; - - d_plan->nf1 = nf1; - d_plan->nf2 = nf2; - d_plan->nf3 = nf3; - d_plan->iflag = fftsign; - d_plan->ntransf = ntransf; - int maxbatchsize = opts ? opts->gpu_maxbatchsize : 0; - if (maxbatchsize == 0) // implies: use a heuristic. - maxbatchsize = std::min(ntransf, 8); // heuristic from test codes - d_plan->maxbatchsize = maxbatchsize; - d_plan->type = type; - - if (d_plan->type == 1) d_plan->spopts.spread_direction = 1; - if (d_plan->type == 2) d_plan->spopts.spread_direction = 2; - - using namespace cufinufft::memtransfer; - switch (d_plan->dim) { - case 1: { - if ((ier = allocgpumem1d_plan(d_plan))) goto finalize; - } break; - case 2: { - if ((ier = allocgpumem2d_plan(d_plan))) goto finalize; - } break; - case 3: { - if ((ier = allocgpumem3d_plan(d_plan))) goto finalize; - } break; - } - - cufftHandle fftplan; - cufftResult_t cufft_status; - switch (d_plan->dim) { - case 1: { - int n[] = {(int)nf1}; - int inembed[] = {(int)nf1}; - - cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, - inembed[0], cufft_type(), maxbatchsize); - } break; - case 2: { - int n[] = {(int)nf2, (int)nf1}; - int inembed[] = {(int)nf2, (int)nf1}; - - cufft_status = - cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 
1, - inembed[0] * inembed[1], cufft_type(), maxbatchsize); - } break; - case 3: { - int n[] = {(int)nf3, (int)nf2, (int)nf1}; - int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; - - cufft_status = cufftPlanMany( - &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, - inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize); - } break; - } - - if (cufft_status != CUFFT_SUCCESS) { - fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, - cufftGetErrorString(cufft_status)); - ier = FINUFFT_ERR_CUDA_FAILURE; - goto finalize; - } - cufftSetStream(fftplan, stream); - d_plan->fftplan = fftplan; - { - std::complex *a = d_plan->fseries_precomp_a; - T *f = d_plan->fseries_precomp_f; + int fftsign = (iflag >= 0) ? 1 : -1; + + d_plan->nf1 = nf1; + d_plan->nf2 = nf2; + d_plan->nf3 = nf3; + d_plan->iflag = fftsign; + d_plan->ntransf = ntransf; + int maxbatchsize = opts ? opts->gpu_maxbatchsize : 0; + if (maxbatchsize == 0) // implies: use a heuristic. + maxbatchsize = std::min(ntransf, 8); // heuristic from test codes + d_plan->maxbatchsize = maxbatchsize; + + using namespace cufinufft::memtransfer; + switch (d_plan->dim) { + case 1: { + if ((ier = allocgpumem1d_plan(d_plan))) goto finalize; + } break; + case 2: { + if ((ier = allocgpumem2d_plan(d_plan))) goto finalize; + } break; + case 3: { + if ((ier = allocgpumem3d_plan(d_plan))) goto finalize; + } break; + } - onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts); - if (dim > 1) - onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts); - if (dim > 2) - onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, - d_plan->spopts); + cufftHandle fftplan; + cufftResult_t cufft_status; + switch (d_plan->dim) { + case 1: { + int n[] = {(int)nf1}; + int inembed[] = {(int)nf1}; + + cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, + inembed[0], cufft_type(), maxbatchsize); + } break; + case 2: { + int n[] = {(int)nf2, 
(int)nf1}; + int inembed[] = {(int)nf2, (int)nf1}; + + cufft_status = + cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, + inembed[0] * inembed[1], cufft_type(), maxbatchsize); + } break; + case 3: { + int n[] = {(int)nf3, (int)nf2, (int)nf1}; + int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; + + cufft_status = cufftPlanMany( + &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, + inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize); + } break; + } - if ((ier = checkCudaErrors( - cudaMallocWrapper(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream, - d_plan->supports_pools)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocWrapper(&d_f, dim * MAX_NQUAD * sizeof(T), - stream, d_plan->supports_pools)))) - goto finalize; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), - cudaMemcpyHostToDevice, stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMemcpyAsync(d_f, f, dim * MAX_NQUAD * sizeof(T), - cudaMemcpyHostToDevice, stream)))) - goto finalize; - if ((ier = cufserieskernelcompute( - d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2, - d_plan->fwkerhalf3, d_plan->spopts.nspread, stream))) + if (cufft_status != CUFFT_SUCCESS) { + fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, + cufftGetErrorString(cufft_status)); + ier = FINUFFT_ERR_CUDA_FAILURE; goto finalize; + } + cufftSetStream(fftplan, stream); + + d_plan->fftplan = fftplan; + { + T *a = d_plan->fseries_precomp_a; + T *f = d_plan->fseries_precomp_f; + + onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts); + if (dim > 1) + onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts); + if (dim > 2) + onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, + d_plan->spopts); + + if ((ier = checkCudaErrors(cudaMallocWrapper(&d_a, dim * MAX_NQUAD * sizeof(T), + stream, d_plan->supports_pools)))) + 
goto finalize; + if ((ier = checkCudaErrors(cudaMallocWrapper(&d_f, dim * MAX_NQUAD * sizeof(T), + stream, d_plan->supports_pools)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(T), + cudaMemcpyHostToDevice, stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMemcpyAsync(d_f, f, dim * MAX_NQUAD * sizeof(T), + cudaMemcpyHostToDevice, stream)))) + goto finalize; + if ((ier = cufserieskernelcompute( + d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, + d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->spopts.nspread, stream))) + goto finalize; + } } - finalize: cudaFreeWrapper(d_a, stream, d_plan->supports_pools); cudaFreeWrapper(d_f, stream, d_plan->supports_pools); @@ -273,8 +305,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } template -int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, - cufinufft_plan_t *d_plan) +int cufinufft_setpts_12_impl(int M, T *d_kx, T *d_ky, T *d_kz, + cufinufft_plan_t *d_plan) /* "setNUpts" stage (in single or double precision). @@ -312,7 +344,7 @@ Notes: the type T means either single or double, matching the Melody Shih 07/25/19; Barnett 2/16/21 moved out docs. 
*/ { - cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + const cufinufft::utils::WithCudaDevice FromID(d_plan->opts.gpu_device_id); int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; @@ -381,6 +413,210 @@ Notes: the type T means either single or double, matching the return ier; } +template +int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, + cufinufft_plan_t *d_plan) { + // type 1 and type 2 setpts + if (d_plan->type == 1 || d_plan->type == 2) { + return cufinufft_setpts_12_impl(M, d_kx, d_ky, d_kz, d_plan); + } + // type 3 setpts + if (d_plan->type != 3) { + fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__, + d_plan->type); + return FINUFFT_ERR_TYPE_NOTVALID; + } + if (N < 0) { + fprintf(stderr, "[cufinufft] Invalid N (%d): cannot be negative.\n", N); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + if (N > MAX_NF) { + fprintf(stderr, "[cufinufft] Invalid N (%d): cannot be greater than %d.\n", N, + MAX_NF); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + const auto stream = d_plan->stream; + d_plan->nk = N; + d_plan->d_s = d_s; + d_plan->d_t = d_t; + d_plan->d_u = d_u; + // no need to set the params to zero, as they are already zeroed out in the plan + // memset(d_plan->type3_params, 0, sizeof(d_plan->type3_params)); + using namespace cufinufft::utils; + if (d_plan->dim > 0) { + const auto [x1, c1] = arraywidcen(M, d_plan->kx, stream); + d_plan->type3_params.X1 = x1; + d_plan->type3_params.C1 = c1; + const auto [S1, D1] = arraywidcen(N, d_s, stream); + const auto [nf1, h1, gam1] = set_nhg_type3(S1, x1, d_plan->opts, d_plan->spopts); + d_plan->nf1 = nf1; + d_plan->type3_params.D1 = D1; + d_plan->type3_params.h1 = h1; + d_plan->type3_params.gam1 = gam1; + } + if (d_plan->dim > 1) { + const auto [x2, c2] = arraywidcen(M, d_plan->ky, stream); + d_plan->type3_params.X2 = x2; + d_plan->type3_params.C2 = c2; + const auto [S2, D2] = arraywidcen(N, d_t, stream); + const auto [nf2, h2, 
gam2] = set_nhg_type3(S2, x2, d_plan->opts, d_plan->spopts); + d_plan->nf2 = nf2; + d_plan->type3_params.D2 = D2; + d_plan->type3_params.h2 = h2; + d_plan->type3_params.gam2 = gam2; + } + if (d_plan->dim > 2) { + const auto [x3, c3] = arraywidcen(M, d_plan->kz, stream); + d_plan->type3_params.X3 = x3; + d_plan->type3_params.C3 = c3; + const auto [S3, D3] = arraywidcen(N, d_u, stream); + const auto [nf3, h3, gam3] = set_nhg_type3(S3, x3, d_plan->opts, d_plan->spopts); + d_plan->nf3 = nf3; + d_plan->type3_params.D3 = D3; + d_plan->type3_params.h3 = h3; + d_plan->type3_params.gam3 = gam3; + } + if (d_plan->opts.debug) { + printf("[%s]", __func__); + printf("\tM=%lld N=%lld\n", M, N); + printf("\tX1=%.3g C1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", d_plan->type3_params.X1, + d_plan->type3_params.C1, d_plan->type3_params.D1, d_plan->type3_params.gam1, + d_plan->nf1); + if (d_plan->dim > 1) { + printf("\tX2=%.3g C2=%.3g D2=%.3g gam2=%g nf2=%lld\n", d_plan->type3_params.X2, + d_plan->type3_params.C2, d_plan->type3_params.D2, d_plan->type3_params.gam2, + d_plan->nf2); + } + if (d_plan->dim > 2) { + printf("\tX3=%.3g C3=%.3g D3=%.3g gam3=%g nf3=%lld\n", d_plan->type3_params.X3, + d_plan->type3_params.C3, d_plan->type3_params.D3, d_plan->type3_params.gam3, + d_plan->nf3); + } + } + d_plan->nf = d_plan->nf1 * d_plan->nf2 * d_plan->nf3; + // FIXME: MAX_NF might be too small... 
+ if (d_plan->nf * d_plan->opts.gpu_maxbatchsize > MAX_NF) { + fprintf(stderr, + "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + return FINUFFT_ERR_MAXNALLOC; + } + const auto checked_free = [stream, pool = d_plan->supports_pools](auto x) constexpr { + if (!x) return cudaFreeWrapper(x, stream, pool); + return cudaSuccess; + }; + const auto checked_realloc = [checked_free, pool = d_plan->supports_pools, stream]( + auto &x, const auto size) constexpr { + if (auto ier = checked_free(x); ier != cudaSuccess) return ier; + return cudaMallocWrapper(&x, size, stream, pool); + }; + + if (checked_realloc(d_plan->kx, sizeof(T) * M) != cudaSuccess) goto finalize; + if (checked_realloc(d_plan->d_s, sizeof(T) * N) != cudaSuccess) goto finalize; + if (d_plan->dim > 1) { + if (checked_realloc(d_plan->ky, sizeof(T) * M) != cudaSuccess) goto finalize; + if (checked_realloc(d_plan->d_t, sizeof(T) * N) != cudaSuccess) goto finalize; + } + if (d_plan->dim > 1) { + if (checked_realloc(d_plan->kz, sizeof(T) * M) != cudaSuccess) goto finalize; + if (checked_realloc(d_plan->d_u, sizeof(T) * N) != cudaSuccess) goto finalize; + } + if (checked_realloc(d_plan->prephase, sizeof(cuda_complex) * M) != cudaSuccess) + goto finalize; + cudaStreamSynchronize(stream); + + // NOTE: init-captures are not allowed for extended __host__ __device__ lambdas + + if (d_plan->dim > 0) { + // TODO: merging the tree calls to GPU into one as in the version below might + // might be more readable and faster + + const auto ig1 = T(1) / d_plan->type3_params.gam1; + const auto C1 = -d_plan->type3_params.C1; + thrust::transform( + thrust::cuda::par.on(stream), d_kx, d_kx + M, d_plan->kx, + [ig1, C1] __host__ __device__(const T x) -> T { return (x + C1) * ig1; }); + } + if (d_plan->dim > 1) { + const auto ig2 = T(1) / d_plan->type3_params.gam2; + const auto C2 = -d_plan->type3_params.C2; + thrust::transform( + thrust::cuda::par.on(stream), d_ky, d_ky + M, d_plan->ky, + [ig2, C2] 
__host__ __device__(const T x) -> T { return (x + C2) * ig2; }); + } + if (d_plan->dim > 2) { + const auto ig3 = T(1) / d_plan->type3_params.gam3; + const auto C3 = -d_plan->type3_params.C3; + thrust::transform( + thrust::cuda::par.on(stream), d_kz, d_ky + M, d_plan->ky, + [ig3, C3] __host__ __device__(const T x) -> T { return (x + C3) * ig3; }); + } + + if (d_plan->type3_params.D1 != 0 || d_plan->type3_params.D2 != 0 || + d_plan->type3_params.D3 != 0) { + // if ky is null, use kx for ky and kz + // this is not the most efficient implementation, but it is the most compact + const auto iterator = thrust::make_zip_iterator( + thrust::make_tuple(d_plan->kx, + // to avoid out of bounds access, use kx if ky is null + d_plan->ky ? d_plan->ky : d_plan->kx, + // same idea as above + d_plan->kz ? d_plan->kz : d_plan->kx)); + const auto D1 = d_plan->type3_params.D1; + const auto D2 = d_plan->type3_params.D2; // this should be 0 if dim < 2 + const auto D3 = d_plan->type3_params.D3; // this should be 0 if dim < 3 + const auto imasign = + d_plan->iflag >= 0 ? 
cuda_complex{0, 1} : cuda_complex{0, -1}; + thrust::transform(iterator, iterator + M, d_plan->prephase, + [D1, D2, D3, imasign] __host__ __device__( + const thrust::tuple &tuple) -> cuda_complex { + const auto x = thrust::get<0>(tuple); + const auto y = thrust::get<1>(tuple); + const auto z = thrust::get<2>(tuple); + // no branching because D2 and D3 are 0 if dim < 2 and dim < 3 + // this is generally faster on GPU + const auto phase = D1 * x + D2 * y + D3 * z; + // TODO: nvcc should have the sincos function + // check the cos + i*sin + // ref: https://en.wikipedia.org/wiki/Cis_(mathematics) + return sin(phase) * imasign + cos(phase); + }); + } else { + thrust::fill(d_plan->prephase, d_plan->prephase + M, cuda_complex{1, 0}); + } + if (d_plan->dim > 0) { + const auto scale = d_plan->type3_params.h1 * d_plan->type3_params.gam1; + const auto D1 = -d_plan->type3_params.D1; + thrust::transform( + d_s, d_s + N, d_plan->d_s, + [scale, D1] __host__ __device__(const T s) -> T { return scale * (s + D1); }); + } + if (d_plan->dim > 1) { + const auto scale = d_plan->type3_params.h2 * d_plan->type3_params.gam2; + const auto D2 = -d_plan->type3_params.D2; + thrust::transform( + d_t, d_t + N, d_plan->d_t, + [scale, D2] __host__ __device__(const T t) -> T { return scale * (t + D2); }); + } + if (d_plan->dim > 2) { + const auto scale = d_plan->type3_params.h3 * d_plan->type3_params.gam3; + const auto D3 = -d_plan->type3_params.D3; + thrust::transform( + d_u, d_u + N, d_plan->d_u, + [scale, D3] __host__ __device__(const T u) -> T { return scale * (u + D3); }); + } + return 0; +finalize: + checked_free(d_plan->kx); + checked_free(d_plan->d_s); + checked_free(d_plan->ky); + checked_free(d_plan->d_t); + checked_free(d_plan->kz); + checked_free(d_plan->d_u); + checked_free(d_plan->prephase); + return FINUFFT_ERR_CUDA_FAILURE; +} + template int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) diff --git a/include/cufinufft/types.h 
b/include/cufinufft/types.h index 16046c8ef..030cd268c 100644 --- a/include/cufinufft/types.h +++ b/include/cufinufft/types.h @@ -8,20 +8,27 @@ #include #include -#include +#include #define CUFINUFFT_BIGINT int -// Ugly trick to map a template to a fixed type, here cuda_complex -template struct cuda_complex_impl; -template<> struct cuda_complex_impl { - using type = cuFloatComplex; +// Marco Barbone 8/5/2924, replaced the ugly trick with std::conditional +// to define cuda_complex +// TODO: migrate to cuda/std/complex and remove this +// Issue: cufft seems not to support cuda::std::complex +// A reinterpret_cast should be enough +template +using cuda_complex = typename std::conditional< + std::is_same::value, cuFloatComplex, + typename std::conditional::value, cuDoubleComplex, + void>::type>::type; +namespace { +template struct cufinuftt_type3_params_t { + T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale + T X2, C2, D2, h2, gam2; // y + T X3, C3, D3, h3, gam3; // z }; -template<> struct cuda_complex_impl { - using type = cuDoubleComplex; -}; - -template using cuda_complex = typename cuda_complex_impl::type; +} // namespace template struct cufinufft_plan_t { cufinufft_opts opts; @@ -46,6 +53,8 @@ template struct cufinufft_plan_t { T *fwkerhalf2; T *fwkerhalf3; + // for type 1,2 it is a pointer to kx, ky, kz (no new allocs), for type 3 it + // for t3: allocated as "primed" (scaled) src pts x'_j, etc T *kx; T *ky; T *kz; @@ -53,6 +62,19 @@ template struct cufinufft_plan_t { cuda_complex *fw; cuda_complex *fk; + // Type 3 specific + cufinuftt_type3_params_t type3_params; + int nk; // number of NU freq pts (type 3 only) + CUFINUFFT_BIGINT nf; + T *d_s; + T *d_t; + T *d_u; + + // new allocs. 
FIXME: convert to device vectors to use resize + cuda_complex *prephase; // pre-phase, for all input NU pts + cuda_complex *deconv; // reciprocal of kernel FT, phase, all output NU pts + cuda_complex *CpBatch; // working array of prephased strengths + // Arrays that used in subprob method int *idxnupts; // length: #nupts, index of the nupts in the bin-sorted order int *sortidx; // length: #nupts, order inside the bin the nupt belongs to @@ -67,17 +89,17 @@ template struct cufinufft_plan_t { int *subprob_to_nupts; // Temporary variables to do fseries precomputation - std::complex fseries_precomp_a[3 * MAX_NQUAD]; + T fseries_precomp_a[3 * MAX_NQUAD]; T fseries_precomp_f[3 * MAX_NQUAD]; cufftHandle fftplan; cudaStream_t stream; }; -template static cufftType_t cufft_type(); -template<> inline cufftType_t cufft_type() { return CUFFT_C2C; } +template constexpr static inline cufftType_t cufft_type(); +template<> constexpr inline cufftType_t cufft_type() { return CUFFT_C2C; } -template<> inline cufftType_t cufft_type() { return CUFFT_Z2Z; } +template<> constexpr inline cufftType_t cufft_type() { return CUFFT_Z2Z; } static inline cufftResult cufft_ex(cufftHandle plan, cufftComplex *idata, cufftComplex *odata, int direction) { diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 4bfaa801d..9f549e99b 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -15,6 +15,8 @@ #include #include +#include + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) #else __inline__ __device__ double atomicAdd(double *address, double val) { @@ -38,15 +40,20 @@ namespace cufinufft { namespace utils { class WithCudaDevice { public: - WithCudaDevice(int device) { - cudaGetDevice(&orig_device_); + explicit WithCudaDevice(const int device) : orig_device_{get_orig_device()} { cudaSetDevice(device); } ~WithCudaDevice() { cudaSetDevice(orig_device_); } private: - int orig_device_; + const int orig_device_; + + static int get_orig_device() 
noexcept { + int device{}; + cudaGetDevice(&device); + return device; + } }; // jfm timer class @@ -123,8 +130,8 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { */ template -static __forceinline__ __device__ void atomicAddComplexShared( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *address, + cuda_complex res) { const auto raw_address = reinterpret_cast(address); atomicAdd(raw_address, res.x); atomicAdd(raw_address + 1, res.y); @@ -136,8 +143,8 @@ static __forceinline__ __device__ void atomicAddComplexShared( * on shared memory are supported so we leverage them */ template -static __forceinline__ __device__ void atomicAddComplexGlobal( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *address, + cuda_complex res) { if constexpr ( std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { atomicAdd(address, res); @@ -146,6 +153,70 @@ static __forceinline__ __device__ void atomicAddComplexGlobal( } } +template auto arrayrange(int n, T *a, cudaStream_t stream) { + const auto [d_min, d_max] = + thrust::minmax_element(thrust::cuda::par.on(stream), a, a + n); + return std::make_tuple(*d_min, *d_max); +} + +// Writes out w = half-width and c = center of an interval enclosing all a[n]'s +// Only chooses a nonzero center if this increases w by less than fraction +// ARRAYWIDCEN_GROWFRAC defined in defs.h. +// This prevents rephasings which don't grow nf by much. 6/8/17 +// If n==0, w and c are not finite. 
+template auto arraywidcen(int n, T *a, cudaStream_t stream) { + const auto [lo, hi] = arrayrange(n, a, stream); + auto w = (hi - lo) / 2; + auto c = (hi + lo) / 2; + if (std::abs(c) < ARRAYWIDCEN_GROWFRAC * (w)) { + w += std::abs(c); + c = 0.0; + } + return std::make_tuple(w, c); +} + +template +auto set_nhg_type3(T S, T X, const cufinufft_opts &opts, + const finufft_spread_opts &spopts) +/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), + for type 3 only. + Inputs: + X and S are the xj and sk interval half-widths respectively. + opts and spopts are the NUFFT and spreader opts strucs, respectively. + Outputs: + nf is the size of upsampled grid for a given single dimension. + h is the grid spacing = 2pi/nf + gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). + Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17 + New logic 6/12/17 +*/ +{ + int nss = spopts.nspread + 1; // since ns may be odd + T Xsafe = X, Ssafe = S; // may be tweaked locally + if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 + if (S == 0.0) { + Xsafe = 1.0; + Ssafe = 1.0; + } else + Xsafe = max(Xsafe, 1 / S); + else + Ssafe = max(Ssafe, 1 / X); + // use the safe X and S... + T nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / M_PI + nss; + if (!isfinite(nfd)) nfd = 0.0; // use FLT to catch inf + auto nf = (int)nfd; + // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); + // catch too small nf, and nan or +-inf, otherwise spread fails... 
+ if (nf < 2 * spopts.nspread) nf = 2 * spopts.nspread; + if (nf < MAX_NF) // otherwise will fail anyway + nf = utils::next235beven(nf, 1); // expensive at huge nf + // Note: b is 1 because type 3 uses a type 2 plan, so it should not need the extra + // condition that seems to be used by Block Gather as type 2 are only GM-sort + auto h = 2 * T(M_PI) / nf; // upsampled grid spacing + auto gam = (T)nf / (2.0 * opts.upsampfac * Ssafe); // x scale fac to x' + return std::make_tuple(nf, h, gam); +} + } // namespace utils } // namespace cufinufft diff --git a/include/cufinufft_opts.h b/include/cufinufft_opts.h index c9898f3b7..743b3cf5c 100644 --- a/include/cufinufft_opts.h +++ b/include/cufinufft_opts.h @@ -29,6 +29,8 @@ typedef struct cufinufft_opts { // see cufinufft_default_opts() for defaults int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order // 1 FFT-style mode order + + int debug; // 0: no debug, 1: debug } cufinufft_opts; #endif diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index ae9431c31..a3743592e 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -1,6 +1,7 @@ set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp) + set(PRECISION_DEPENDENT_SRC spreadinterp.cpp 1d/cufinufft1d.cu @@ -23,19 +24,22 @@ set(CUFINUFFT_INCLUDE_DIRS $ $ $) + set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE) # flush denormals to zero and enable verbose PTXAS output set(FINUFFT_CUDA_FLAGS + $<$: + --extended-lambda -ftz=true -fmad=true -restrict --extra-device-vectorization $<$:-G -maxrregcount - 32>) + 32>>) add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC}) target_include_directories(cufinufft_common_objects @@ -48,12 +52,7 @@ set_target_properties( CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON) target_compile_features(cufinufft_common_objects PRIVATE cxx_std_17) -target_compile_options( - cufinufft_common_objects - PRIVATE 
$<$:${FINUFFT_CUDA_FLAGS}>) -target_compile_options( - cufinufft_common_objects - PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) +target_compile_options(cufinufft_common_objects PRIVATE ${FINUFFT_CUDA_FLAGS}) add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC}) target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) @@ -65,8 +64,7 @@ set_target_properties( CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON) target_compile_features(cufinufft_objects PRIVATE cxx_std_17) -target_compile_options( - cufinufft_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) +target_compile_options(cufinufft_objects PRIVATE ${FINUFFT_CUDA_FLAGS}) if(FINUFFT_SHARED_LINKING) add_library(cufinufft SHARED $ @@ -85,7 +83,7 @@ set_target_properties( ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) target_compile_features(cufinufft PRIVATE cxx_std_17) - +target_compile_options(cufinufft PUBLIC ${FINUFFT_CUDA_FLAGS}) if(WIN32) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft CUDA::nvToolsExt) diff --git a/src/cuda/common.cu b/src/cuda/common.cu index b19986520..f5c5a37fd 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -26,14 +26,14 @@ using std::max; // a , f are intermediate results from function onedim_fseries_kernel_precomp() // (see cufinufft/contrib/common.cpp for description) template -__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, - cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2, - T *fwkerhalf3, int ns) { +__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, + T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, + int ns) { T J2 = ns / 2.0; int q = (int)(2 + 3.0 * J2); int nf; - cuDoubleComplex *at = a + threadIdx.y * MAX_NQUAD; - T *ft = f + threadIdx.y * MAX_NQUAD; + T *at = a + threadIdx.y * MAX_NQUAD; + T *ft = f + threadIdx.y * MAX_NQUAD; T *oarr; if (threadIdx.y == 0) { oarr = fwkerhalf1; @@ -48,19 +48,18 @@ __global__ void fseries_kernel_compute(int 
nf1, int nf2, int nf3, T *f, for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nf / 2 + 1; i += blockDim.x * gridDim.x) { - int brk = 0.5 + i; - T x = 0.0; + T x = 0.0; for (int n = 0; n < q; n++) { - x += ft[n] * 2 * (pow(cabs(at[n]), brk) * cos(brk * carg(at[n]))); + x += ft[n] * 2 * cos(i * at[n]); } oarr[i] = x; } } template -int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, - cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2, - T *d_fwkerhalf3, int ns, cudaStream_t stream) +int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, + T *d_fwkerhalf1, T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, + cudaStream_t stream) /* wrapper for approximation of Fourier series of real symmetric spreading kernel. @@ -104,38 +103,38 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts } } -template -void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) -/* - Approximates exact Fourier series coeffs of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Uses phase winding for cheap eval on the regular freq - grid. Note that this is also the Fourier transform of the non-periodized - kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an - overall prefactor of 1/h, which is needed anyway for the correction, and - arises because the quadrature weights are scaled for grid units not x units. - - Inputs: - nf - size of 1d uniform spread grid, must be even. - opts - spreading opts object, needed to eval kernel (must be already set up) - - Outputs: - fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, - divided by h = 2pi/n. - (should be allocated for at least nf/2+1 Ts) - - Compare onedim_dct_kernel which has same interface, but computes DFT of - sampled kernel, not quite the same object. - - Barnett 2/7/17. 
openmp (since slow vs fftw in 1D large-N case) 3/3/18 - Melody 2/20/22 separate into precomp & comp functions defined below. - */ -{ - T f[MAX_NQUAD]; - std::complex a[MAX_NQUAD]; - onedim_fseries_kernel_precomp(nf, f, a, opts); - onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts); -} +// template +// void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) +///* +// Approximates exact Fourier series coeffs of cnufftspread's real symmetric +// kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting +// narrowness of kernel. Uses phase winding for cheap eval on the regular freq +// grid. Note that this is also the Fourier transform of the non-periodized +// kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an +// overall prefactor of 1/h, which is needed anyway for the correction, and +// arises because the quadrature weights are scaled for grid units not x units. +// +// Inputs: +// nf - size of 1d uniform spread grid, must be even. +// opts - spreading opts object, needed to eval kernel (must be already set up) +// +// Outputs: +// fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, +// divided by h = 2pi/n. +// (should be allocated for at least nf/2+1 Ts) +// +// Compare onedim_dct_kernel which has same interface, but computes DFT of +// sampled kernel, not quite the same object. +// +// Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18 +// Melody 2/20/22 separate into precomp & comp functions defined below. 
+// */ +//{ +// T f[MAX_NQUAD]; +// T a[MAX_NQUAD]; +// onedim_fseries_kernel_precomp(nf, f, a, opts); +//// onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts); +//} /* Precomputation of approximations of exact Fourier series coeffs of cnufftspread's @@ -151,7 +150,7 @@ void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opt (a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below) */ template -void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex *a, +void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, finufft_spread_opts opts) { T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support // # quadr nodes in z (from 0 to J/2; reflections will be added)... @@ -164,40 +163,39 @@ void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex(0.0, 1.0) * (T)(nf / 2 - z[n]) / - (T)nf); // phase winding rates + a[n] = ((T)(2.0 * M_PI) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates } } -template -void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, - T *fwkerhalf, finufft_spread_opts opts) { - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - int q = (int)(2 + 3.0 * J2); // not sure why so large? 
cannot exceed MAX_NQUAD - CUFINUFFT_BIGINT nout = nf / 2 + 1; // how many values we're writing to - int nt = std::min(nout, MY_OMP_GET_MAX_THREADS()); // how many chunks - std::vector brk(nt + 1); // start indices for each thread - for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads - brk[t] = (CUFINUFFT_BIGINT)(0.5 + nout * t / (double)nt); -#pragma omp parallel - { - int t = MY_OMP_GET_THREAD_NUM(); - if (t < nt) { // could be nt < actual # threads - std::complex aj[MAX_NQUAD]; // phase rotator for this thread - for (int n = 0; n < q; ++n) - aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk - for (CUFINUFFT_BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output - // array - T x = 0.0; // accumulator for answer at this j - for (int n = 0; n < q; ++n) { - x += f[n] * 2 * real(aj[n]); // include the negative freq - aj[n] *= a[n]; // wind the phases - } - fwkerhalf[j] = x; - } - } - } -} +// template +// void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, +// T *fwkerhalf, finufft_spread_opts opts) { +// T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support +// int q = (int)(2 + 3.0 * J2); // not sure why so large? 
cannot exceed MAX_NQUAD +// CUFINUFFT_BIGINT nout = nf / 2 + 1; // how many values we're writing to +// int nt = std::min(nout, MY_OMP_GET_MAX_THREADS()); // how many chunks +// std::vector brk(nt + 1); // start indices for each thread +// for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads +// brk[t] = (CUFINUFFT_BIGINT)(0.5 + nout * t / (double)nt); +// #pragma omp parallel +// { +// int t = MY_OMP_GET_THREAD_NUM(); +// if (t < nt) { // could be nt < actual # threads +// std::complex aj[MAX_NQUAD]; // phase rotator for this thread +// for (int n = 0; n < q; ++n) +// aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk +// for (CUFINUFFT_BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output +// // array +// T x = 0.0; // accumulator for answer at this j +// for (int n = 0; n < q; ++n) { +// x += f[n] * 2 * real(aj[n]); // include the negative freq +// aj[n] *= a[n]; // wind the phases +// } +// fwkerhalf[j] = x; +// } +// } +// } +// } template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, @@ -312,34 +310,32 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { } } -template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, - std::complex *a, float *fwkerhalf, - finufft_spread_opts opts); -template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, double *f, - std::complex *a, double *fwkerhalf, - finufft_spread_opts opts); +// template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, +// std::complex *a, float *fwkerhalf, +// finufft_spread_opts opts); +// template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, double *f, +// std::complex *a, double *fwkerhalf, +// finufft_spread_opts opts); template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps, cufinufft_opts opts); template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps, cufinufft_opts opts); -template void 
onedim_fseries_kernel_precomp( - CUFINUFFT_BIGINT nf, float *f, std::complex *a, finufft_spread_opts opts); -template void onedim_fseries_kernel_precomp( - CUFINUFFT_BIGINT nf, double *f, std::complex *a, finufft_spread_opts opts); +template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, float *f, float *a, + finufft_spread_opts opts); +template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, double *f, double *a, + finufft_spread_opts opts); template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f, - cuDoubleComplex *d_a, float *d_fwkerhalf1, - float *d_fwkerhalf2, float *d_fwkerhalf3, int ns, - cudaStream_t stream); -template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, double *d_f, - cuDoubleComplex *d_a, double *d_fwkerhalf1, - double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, - cudaStream_t stream); - -template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, - finufft_spread_opts opts); -template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, - finufft_spread_opts opts); + float *d_a, float *d_fwkerhalf1, float *d_fwkerhalf2, + float *d_fwkerhalf3, int ns, cudaStream_t stream); +template int cufserieskernelcompute( + int dim, int nf1, int nf2, int nf3, double *d_f, double *d_a, double *d_fwkerhalf1, + double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, cudaStream_t stream); + +// template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, +// finufft_spread_opts opts); +// template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, +// finufft_spread_opts opts); template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int bin_size_z); diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index c00bf8eba..a1904e52c 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -121,6 +121,7 @@ void cufinufft_default_opts(cufinufft_opts *opts) opts->gpu_binsizey = 0; opts->gpu_binsizez = 
0; opts->gpu_maxbatchsize = 0; + opts->debug = 0; opts->gpu_stream = cudaStreamDefault; // sphinx tag (don't remove): @gpu_defopts_end } diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index 98b5382bc..646ffa434 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -1,10 +1,7 @@ #include #include -#include -#include #include -#include #include #include @@ -44,8 +41,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet opts.upsampfac = upsampfac; // as in FINUFFT v2.0, allow too-small-eps by truncating to eps_mach... - int ier = 0; - + int ier = 0; constexpr T EPSILON = std::numeric_limits::epsilon(); if (eps < EPSILON) { fprintf(stderr, "setup_spreader: warning, increasing tol=%.3g to eps_mach=%.3g.\n", diff --git a/src/finufft.cpp b/src/finufft.cpp index b71acce87..ed917514d 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -726,6 +726,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, fprintf(stderr, "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", __func__); + // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 return FINUFFT_ERR_MAXNALLOC; } @@ -760,7 +761,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, // set it just for our one plan and then revert to the user value. 
// Unfortunately fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and // there isn't a convenient mechanism to probe the version - FFTW_PLAN_TH(nthr_fft); + // there is fftw_version which returns a string, but that's not compile time p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, (FFTW_CPX *)p->fwBatch, NULL, 1, p->nf, (FFTW_CPX *)p->fwBatch, NULL, 1, p->nf, p->fftSign, p->opts.fftw); @@ -916,6 +917,7 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... + // FIXME: should use realloc if (p->X) free(p->X); if (p->Sp) free(p->Sp); p->X = (FLT *)malloc(sizeof(FLT) * nj); diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index d9c5d312b..c7323f57b 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -82,6 +82,11 @@ add_tests(float 1e-5 2e-4 2.0) add_tests(double 1e-12 1e-11 2.0) add_tests(float 1e-5 2e-4 1.25) add_tests(double 1e-8 1e-7 1.25) +add_tests(float 1e-5 2e-4 0) +add_tests(double 1e-12 1e-11 0) +add_tests(float 1e-5 2e-4 0.0) +add_tests(double 1e-8 1e-7 0.0) add_test(NAME cufinufft_public_api COMMAND public_api_test) add_test(NAME cufinufft_makeplan COMMAND test_makeplan) +add_test(NAME cufinufft_math_test COMMAND cufinufft_math_test) diff --git a/test/cuda/cufinufft_math_test.cu b/test/cuda/cufinufft_math_test.cu new file mode 100644 index 000000000..5a80b95d8 --- /dev/null +++ b/test/cuda/cufinufft_math_test.cu @@ -0,0 +1,129 @@ +#include +#include +#include +#include + +// Include the custom operators for cuComplex +#include +#include + +// Helper function to create cuComplex +template cuda_complex make_cuda_complex(T real, T imag) { + return cuda_complex{real, imag}; +} + +// Helper function to compare cuComplex with std::complex using 1 - ratio as error +template +bool compareComplex(const cuda_complex &a, const std::complex &b, 
+ const std::string &operation, + T epsilon = std::numeric_limits::epsilon()) { + T real_error = 1 - a.x / b.real(); + T imag_error = 1 - a.y / b.imag(); + if (real_error >= epsilon || imag_error >= epsilon) { + std::cout << "Comparison failed in operation: " << operation << "\n"; + std::cout << "cuComplex: (" << a.x << ", " << a.y << ")\n"; + std::cout << "std::complex: (" << b.real() << ", " << b.imag() << ")\n"; + std::cout << "Real error: " << real_error << "\n"; + std::cout << "Imag error: " << imag_error << "\n"; + } + return real_error < epsilon && imag_error < epsilon; +} + +template int testRandomOperations() { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(-100.0, 100.0); + + for (int i = 0; i < 1000; ++i) { + T real1 = dis(gen); + T imag1 = dis(gen); + T real2 = dis(gen); + T imag2 = dis(gen); + T scalar = dis(gen); + + cuda_complex a = make_cuda_complex(real1, imag1); + cuda_complex b = make_cuda_complex(real2, imag2); + std::complex std_a(real1, imag1); + std::complex std_b(real2, imag2); + + // Test addition + cuda_complex result_add = a + b; + std::complex expected_add = std_a + std_b; + if (!compareComplex(result_add, expected_add, + "add complex<" + std::string(typeid(T).name()) + "> complex<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test subtraction + cuda_complex result_sub = a - b; + std::complex expected_sub = std_a - std_b; + if (!compareComplex(result_sub, expected_sub, + "sub complex<" + std::string(typeid(T).name()) + "> complex<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test multiplication + cuda_complex result_mul = a * b; + std::complex expected_mul = std_a * std_b; + if (!compareComplex(result_mul, expected_mul, + "mul complex<" + std::string(typeid(T).name()) + "> complex<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test division + // Avoid division by small numbers as the implementation is slightly different + // Maybe there is a better 
way to test it + if (real2 < 1.0 || imag2 < 1.0) { // Avoid division by zero + cuda_complex result_div = a / b; + std::complex expected_div = std_a / std_b; + if (!compareComplex(result_div, expected_div, + "div complex<" + std::string(typeid(T).name()) + "> complex<" + + std::string(typeid(T).name()) + ">", + std::numeric_limits::epsilon() * 1000)) + return 1; + } + + // Test addition with scalar + cuda_complex result_add_scalar = a + scalar; + std::complex expected_add_scalar = std_a + scalar; + if (!compareComplex(result_add_scalar, expected_add_scalar, + "add complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test subtraction with scalar + cuda_complex result_sub_scalar = a - scalar; + std::complex expected_sub_scalar = std_a - scalar; + if (!compareComplex(result_sub_scalar, expected_sub_scalar, + "sub complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test multiplication with scalar + cuda_complex result_mul_scalar = a * scalar; + std::complex expected_mul_scalar = std_a * scalar; + if (!compareComplex(result_mul_scalar, expected_mul_scalar, + "mul complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test division with scalar + if (scalar != 0.0) { // Avoid division by zero + cuda_complex result_div_scalar = a / scalar; + std::complex expected_div_scalar = std_a / scalar; + if (!compareComplex(result_div_scalar, expected_div_scalar, + "div complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) + return 1; + } + } + return 0; +} + +int main() { + if (testRandomOperations()) return 1; + if (testRandomOperations()) return 1; + + std::cout << "All tests passed!" 
<< std::endl; + return 0; +} diff --git a/test/cuda/fseries_kernel_test.cu b/test/cuda/fseries_kernel_test.cu index 7f18ee21c..0e1766e9e 100644 --- a/test/cuda/fseries_kernel_test.cu +++ b/test/cuda/fseries_kernel_test.cu @@ -34,38 +34,42 @@ template int run_test(int nf1, int dim, T eps, int gpu, int nf2, int CNTime timer; if (!gpu) { - timer.start(); - fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); - if (dim > 1) fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); - if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); - - onedim_fseries_kernel(nf1, fwkerhalf1, opts); - if (dim > 1) onedim_fseries_kernel(nf2, fwkerhalf2, opts); - if (dim > 2) onedim_fseries_kernel(nf3, fwkerhalf3, opts); - cputime = timer.elapsedsec(); - cudaEventRecord(start); - { - checkCudaErrors(cudaMemcpy(d_fwkerhalf1, fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), - cudaMemcpyHostToDevice)); - if (dim > 1) - checkCudaErrors(cudaMemcpy(d_fwkerhalf2, fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), - cudaMemcpyHostToDevice)); - if (dim > 2) - checkCudaErrors(cudaMemcpy(d_fwkerhalf3, fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), - cudaMemcpyHostToDevice)); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - gputime = milliseconds; - printf("[time ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", dim, nf1, opts.nspread, - gputime + cputime * 1000); - free(fwkerhalf1); - if (dim > 1) free(fwkerhalf2); - if (dim > 2) free(fwkerhalf3); + // timer.start(); + // fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); + // if (dim > 1) fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); + // if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); + // + // onedim_fseries_kernel(nf1, fwkerhalf1, opts); + // if (dim > 1) onedim_fseries_kernel(nf2, fwkerhalf2, opts); + // if (dim > 2) onedim_fseries_kernel(nf3, fwkerhalf3, opts); + // cputime = timer.elapsedsec(); + // cudaEventRecord(start); + // { + // checkCudaErrors(cudaMemcpy(d_fwkerhalf1, 
fwkerhalf1, sizeof(T) * (nf1 / 2 + + // 1), + // cudaMemcpyHostToDevice)); + // if (dim > 1) + // checkCudaErrors(cudaMemcpy(d_fwkerhalf2, fwkerhalf2, sizeof(T) * (nf2 / 2 + + // 1), + // cudaMemcpyHostToDevice)); + // if (dim > 2) + // checkCudaErrors(cudaMemcpy(d_fwkerhalf3, fwkerhalf3, sizeof(T) * (nf3 / 2 + + // 1), + // cudaMemcpyHostToDevice)); + // } + // cudaEventRecord(stop); + // cudaEventSynchronize(stop); + // cudaEventElapsedTime(&milliseconds, start, stop); + // gputime = milliseconds; + // printf("[time ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", dim, nf1, + // opts.nspread, + // gputime + cputime * 1000); + // free(fwkerhalf1); + // if (dim > 1) free(fwkerhalf2); + // if (dim > 2) free(fwkerhalf3); } else { timer.start(); - std::complex a[dim * MAX_NQUAD]; + T a[dim * MAX_NQUAD]; T f[dim * MAX_NQUAD]; onedim_fseries_kernel_precomp(nf1, f, a, opts); if (dim > 1) onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, opts); @@ -73,14 +77,14 @@ template int run_test(int nf1, int dim, T eps, int gpu, int nf2, int onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, opts); cputime = timer.elapsedsec(); - cuDoubleComplex *d_a; + T *d_a; T *d_f; cudaEventRecord(start); { - checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex))); + checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(T))); checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T))); - checkCudaErrors(cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), - cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice)); checkCudaErrors( cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice)); ier = From 9d44993b57620f64fe17ddb3c43cb4f8390c34eb Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 13 Aug 2024 12:55:37 -0400 Subject: [PATCH 38/68] testing --- CMakeLists.txt | 2 +- examples/CMakeLists.txt | 11 ++ include/cufinufft/common.h | 13 +- 
include/cufinufft/impl.h | 234 ++++++++++++++++++-------- include/cufinufft/types.h | 12 +- include/cufinufft/utils.h | 28 ++-- src/cuda/common.cu | 88 +++++++++- src/cuda/cufinufft.cu | 2 +- src/cuda/memtransfer_wrapper.cu | 3 +- test/cuda/CMakeLists.txt | 26 ++- test/cuda/cufinufft1d_test.cu | 2 +- test/cuda/cufinufft_makeplan_impl.cu | 160 ++++++++++++++++++ test/cuda/cufinufft_setpts.cu | 242 +++++++++++++++++++++++++++ test/cuda/cufinufft_type3_test.cu | 13 ++ test/cuda/fseries_kernel_test.cu | 8 +- 15 files changed, 730 insertions(+), 114 deletions(-) create mode 100644 test/cuda/cufinufft_makeplan_impl.cu create mode 100644 test/cuda/cufinufft_setpts.cu create mode 100644 test/cuda/cufinufft_type3_test.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index d16783eec..61c408cbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ include(CMakeDependentOption) option(FINUFFT_BUILD_FORTRAN "Whether to build the FINUFFT Fortran examples" OFF) option(FINUFFT_BUILD_MATLAB "Whether to build the FINUFFT Matlab interface" OFF) option(FINUFFT_BUILD_PYTHON "Whether the Python wrapper should be built." OFF) -option(FINUFFT_ENABLE_SANITIZERS "Whether to enable sanitizers, only effective for Debug configuration." ON) +option(FINUFFT_ENABLE_SANITIZERS "Whether to enable sanitizers, only effective for Debug configuration." OFF) option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disabled, the finufft library will be single threaded. This does not affect the choice of FFTW library." ON) option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON) option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). 
This is completely independent of the main FINUFFT library" OFF) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 27b193cd5..ee8b0bdd4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -9,11 +9,16 @@ set(EXAMPLES set(EXAMPLES_OPENMP threadsafe1d1 threadsafe2d2f) set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf) +find_library(MATH_LIBRARY m) + foreach(EXAMPLE ${EXAMPLES}) add_executable(${EXAMPLE} ${EXAMPLE}.cpp) target_compile_features(${EXAMPLE} PRIVATE cxx_std_17) target_link_libraries(${EXAMPLE} PRIVATE finufft) enable_asan(${EXAMPLE}) + if(MATH_LIBRARY) + target_link_libraries(${EXAMPLE} PRIVATE ${MATH_LIBRARY}) + endif() endforeach() foreach(EXAMPLE ${EXAMPLES_C}) @@ -21,6 +26,9 @@ foreach(EXAMPLE ${EXAMPLES_C}) target_link_libraries(${EXAMPLE} PRIVATE finufft) target_compile_features(${EXAMPLE} PRIVATE cxx_std_17) enable_asan(${EXAMPLE}) + if(MATH_LIBRARY) + target_link_libraries(${EXAMPLE} PRIVATE ${MATH_LIBRARY}) + endif() endforeach() if(FINUFFT_USE_OPENMP) @@ -29,5 +37,8 @@ if(FINUFFT_USE_OPENMP) target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX) target_compile_features(${EXAMPLE} PRIVATE cxx_std_17) enable_asan(${EXAMPLE}) + if(MATH_LIBRARY) + target_link_libraries(${EXAMPLE} PRIVATE ${MATH_LIBRARY}) + endif() endforeach() endif() diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index 18478f49f..6747cdb87 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -7,7 +7,8 @@ #include #include -#include +#include +#include namespace cufinufft { namespace common { @@ -16,10 +17,18 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int ns); template +__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T *kx, + T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2, + T *fwkerhalf3, int ns); +template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, 
T *d_fwkerhalf1, T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, cudaStream_t stream); template +int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, T *d_kx, + T *d_ky, T *d_kz, T *d_fwkerhalf1, T *d_fwkerhalf2, + T *d_fwkerhalf3, int ns, cudaStream_t stream); +template int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts); void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, @@ -27,7 +36,7 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts // template // void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts // opts); -template +template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, finufft_spread_opts opts); template diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 85f2d6e0e..750ca1fcd 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -15,6 +15,7 @@ #include #include +#include // 1d template @@ -65,9 +66,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran */ using namespace cufinufft::common; int ier; - T *d_a = nullptr; // fseries temp data - T *d_f = nullptr; // fseries temp data - if (type < 1 || type > 3) { fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type); return FINUFFT_ERR_TYPE_NOTVALID; @@ -95,11 +93,17 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } d_plan->dim = dim; d_plan->opts.gpu_maxbatchsize = std::max(d_plan->opts.gpu_maxbatchsize, 1); + if (type != 3) { d_plan->ms = nmodes[0]; d_plan->mt = nmodes[1]; d_plan->mu = nmodes[2]; } + + int fftsign = (iflag >= 0) ? 
1 : -1; + d_plan->iflag = fftsign; + d_plan->ntransf = ntransf; + const auto stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; // Mult-GPU support: set the CUDA Device ID: @@ -122,7 +126,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran // FIXME: since cufft is really fast we should use 1.25 only if we run out of vram if (d_plan->opts.upsampfac == 0.0) { // indicates auto-choose d_plan->opts.upsampfac = 2.0; // default, and need for tol small - if (tol >= (T)1E-9) { // the tol sigma=5/4 can reach + if (tol >= (T)1E-9 && type == 3) { // the tol sigma=5/4 can reach d_plan->opts.upsampfac = 1.25; } if (d_plan->opts.debug) { @@ -133,9 +137,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran /* Setup Spreader */ if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK - delete *d_plan_ptr; - *d_plan_ptr = nullptr; - return ier; + goto finalize; } d_plan->type = type; @@ -143,7 +145,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran if (type == 1 || type == 2) { cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); - + if (ier = cudaGetLastError(), ier != cudaSuccess) { + goto finalize; + } if (d_plan->opts.debug) { printf("[cufinufft] bin size x: %d", d_plan->opts.gpu_binsizex); if (dim > 1) printf(" bin size y: %d", d_plan->opts.gpu_binsizey); @@ -158,9 +162,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); printf("[cufinufft] shared memory required for the spreader: %d\n", mem_required); } - - RETURN_IF_CUDA_ERROR - CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, d_plan->opts.gpu_obinsizex); @@ -188,7 +189,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran int 
shared_mem_per_block{}; cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); - RETURN_IF_CUDA_ERROR // compute the amount of shared memory required for the method const auto shared_mem_required = shared_memory_required( dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, @@ -201,13 +201,14 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } } - int fftsign = (iflag >= 0) ? 1 : -1; + if ((ier = cudaGetLastError())) { + goto finalize; + } + + d_plan->nf1 = nf1; + d_plan->nf2 = nf2; + d_plan->nf3 = nf3; - d_plan->nf1 = nf1; - d_plan->nf2 = nf2; - d_plan->nf3 = nf3; - d_plan->iflag = fftsign; - d_plan->ntransf = ntransf; int maxbatchsize = opts ? opts->gpu_maxbatchsize : 0; if (maxbatchsize == 0) // implies: use a heuristic. maxbatchsize = std::min(ntransf, 8); // heuristic from test codes @@ -263,44 +264,38 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran cufftSetStream(fftplan, stream); d_plan->fftplan = fftplan; - { - T *a = d_plan->fseries_precomp_a; - T *f = d_plan->fseries_precomp_f; - - onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts); - if (dim > 1) - onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts); - if (dim > 2) - onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, - d_plan->spopts); - - if ((ier = checkCudaErrors(cudaMallocWrapper(&d_a, dim * MAX_NQUAD * sizeof(T), - stream, d_plan->supports_pools)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocWrapper(&d_f, dim * MAX_NQUAD * sizeof(T), - stream, d_plan->supports_pools)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(T), - cudaMemcpyHostToDevice, stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMemcpyAsync(d_f, f, dim * MAX_NQUAD * sizeof(T), - cudaMemcpyHostToDevice, stream)))) - goto finalize; - if ((ier = cufserieskernelcompute( - d_plan->dim, nf1, nf2, 
nf3, d_f, d_a, d_plan->fwkerhalf1, - d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->spopts.nspread, stream))) - goto finalize; - } + + T fseries_precomp_a[3 * MAX_NQUAD]; + T fseries_precomp_f[3 * MAX_NQUAD]; + thrust::device_vector d_fseries_precomp_a(3 * MAX_NQUAD); + thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); + onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, + fseries_precomp_a, d_plan->spopts); + if (d_plan->dim > 1) + onedim_fseries_kernel_precomp(d_plan->nf2, fseries_precomp_f + MAX_NQUAD, + fseries_precomp_a + MAX_NQUAD, + d_plan->spopts); + if (d_plan->dim > 2) + onedim_fseries_kernel_precomp( + d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD, + fseries_precomp_a + 2 * MAX_NQUAD, d_plan->spopts); + // copy the precomputed data to the device using thrust + thrust::copy(fseries_precomp_a, fseries_precomp_a + 3 * MAX_NQUAD, + d_fseries_precomp_a.begin()); + thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD, + d_fseries_precomp_f.begin()); + if ((ier = cufserieskernelcompute( + d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3, + d_fseries_precomp_f.data().get(), d_fseries_precomp_a.data().get(), + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, + d_plan->spopts.nspread, stream))) + goto finalize; } finalize: - cudaFreeWrapper(d_a, stream, d_plan->supports_pools); - cudaFreeWrapper(d_f, stream, d_plan->supports_pools); - if (ier > 1) { delete *d_plan_ptr; *d_plan_ptr = nullptr; } - return ier; } @@ -444,34 +439,37 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ // memset(d_plan->type3_params, 0, sizeof(d_plan->type3_params)); using namespace cufinufft::utils; if (d_plan->dim > 0) { - const auto [x1, c1] = arraywidcen(M, d_plan->kx, stream); + const auto [x1, c1] = arraywidcen(M, d_kx, stream); d_plan->type3_params.X1 = x1; d_plan->type3_params.C1 = c1; const auto [S1, D1] = arraywidcen(N, d_s, stream); const auto [nf1, h1, gam1] = set_nhg_type3(S1, x1, d_plan->opts, 
d_plan->spopts); d_plan->nf1 = nf1; + d_plan->type3_params.S1 = S1; d_plan->type3_params.D1 = D1; d_plan->type3_params.h1 = h1; d_plan->type3_params.gam1 = gam1; } if (d_plan->dim > 1) { - const auto [x2, c2] = arraywidcen(M, d_plan->ky, stream); + const auto [x2, c2] = arraywidcen(M, d_ky, stream); d_plan->type3_params.X2 = x2; d_plan->type3_params.C2 = c2; const auto [S2, D2] = arraywidcen(N, d_t, stream); const auto [nf2, h2, gam2] = set_nhg_type3(S2, x2, d_plan->opts, d_plan->spopts); d_plan->nf2 = nf2; + d_plan->type3_params.S2 = S2; d_plan->type3_params.D2 = D2; d_plan->type3_params.h2 = h2; d_plan->type3_params.gam2 = gam2; } if (d_plan->dim > 2) { - const auto [x3, c3] = arraywidcen(M, d_plan->kz, stream); + const auto [x3, c3] = arraywidcen(M, d_kz, stream); d_plan->type3_params.X3 = x3; d_plan->type3_params.C3 = c3; const auto [S3, D3] = arraywidcen(N, d_u, stream); const auto [nf3, h3, gam3] = set_nhg_type3(S3, x3, d_plan->opts, d_plan->spopts); d_plan->nf3 = nf3; + d_plan->type3_params.S3 = S3; d_plan->type3_params.D3 = D3; d_plan->type3_params.h3 = h3; d_plan->type3_params.gam3 = gam3; @@ -479,18 +477,18 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ if (d_plan->opts.debug) { printf("[%s]", __func__); printf("\tM=%lld N=%lld\n", M, N); - printf("\tX1=%.3g C1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", d_plan->type3_params.X1, - d_plan->type3_params.C1, d_plan->type3_params.D1, d_plan->type3_params.gam1, - d_plan->nf1); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", + d_plan->type3_params.X1, d_plan->type3_params.C1, d_plan->type3_params.S1, + d_plan->type3_params.D1, d_plan->type3_params.gam1, d_plan->nf1); if (d_plan->dim > 1) { - printf("\tX2=%.3g C2=%.3g D2=%.3g gam2=%g nf2=%lld\n", d_plan->type3_params.X2, - d_plan->type3_params.C2, d_plan->type3_params.D2, d_plan->type3_params.gam2, - d_plan->nf2); + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld\n", + d_plan->type3_params.X2, 
d_plan->type3_params.C2, d_plan->type3_params.S2, + d_plan->type3_params.D2, d_plan->type3_params.gam2, d_plan->nf2); } if (d_plan->dim > 2) { - printf("\tX3=%.3g C3=%.3g D3=%.3g gam3=%g nf3=%lld\n", d_plan->type3_params.X3, - d_plan->type3_params.C3, d_plan->type3_params.D3, d_plan->type3_params.gam3, - d_plan->nf3); + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld\n", + d_plan->type3_params.X3, d_plan->type3_params.C3, d_plan->type3_params.S3, + d_plan->type3_params.D3, d_plan->type3_params.gam3, d_plan->nf3); } } d_plan->nf = d_plan->nf1 * d_plan->nf2 * d_plan->nf3; @@ -523,14 +521,17 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ } if (checked_realloc(d_plan->prephase, sizeof(cuda_complex) * M) != cudaSuccess) goto finalize; - cudaStreamSynchronize(stream); + if (checked_realloc(d_plan->deconv, sizeof(cuda_complex) * N) != cudaSuccess) + goto finalize; + + // should not be needed + // cudaStreamSynchronize(stream); // NOTE: init-captures are not allowed for extended __host__ __device__ lambdas if (d_plan->dim > 0) { // TODO: merging the tree calls to GPU into one as in the version below might // might be more readable and faster - const auto ig1 = T(1) / d_plan->type3_params.gam1; const auto C1 = -d_plan->type3_params.C1; thrust::transform( @@ -548,10 +549,9 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ const auto ig3 = T(1) / d_plan->type3_params.gam3; const auto C3 = -d_plan->type3_params.C3; thrust::transform( - thrust::cuda::par.on(stream), d_kz, d_ky + M, d_plan->ky, + thrust::cuda::par.on(stream), d_kz, d_kz + M, d_plan->kz, [ig3, C3] __host__ __device__(const T x) -> T { return (x + C3) * ig3; }); } - if (d_plan->type3_params.D1 != 0 || d_plan->type3_params.D2 != 0 || d_plan->type3_params.D3 != 0) { // if ky is null, use kx for ky and kz @@ -567,7 +567,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ const auto D3 = 
d_plan->type3_params.D3; // this should be 0 if dim < 3 const auto imasign = d_plan->iflag >= 0 ? cuda_complex{0, 1} : cuda_complex{0, -1}; - thrust::transform(iterator, iterator + M, d_plan->prephase, + thrust::transform(thrust::cuda::par.on(stream), iterator, iterator + M, + d_plan->prephase, [D1, D2, D3, imasign] __host__ __device__( const thrust::tuple &tuple) -> cuda_complex { const auto x = thrust::get<0>(tuple); @@ -582,29 +583,119 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ return sin(phase) * imasign + cos(phase); }); } else { - thrust::fill(d_plan->prephase, d_plan->prephase + M, cuda_complex{1, 0}); + thrust::fill(thrust::cuda::par.on(stream), d_plan->prephase, d_plan->prephase + M, + cuda_complex{1, 0}); } if (d_plan->dim > 0) { const auto scale = d_plan->type3_params.h1 * d_plan->type3_params.gam1; const auto D1 = -d_plan->type3_params.D1; thrust::transform( - d_s, d_s + N, d_plan->d_s, + thrust::cuda::par.on(stream), d_s, d_s + N, d_plan->d_s, [scale, D1] __host__ __device__(const T s) -> T { return scale * (s + D1); }); } if (d_plan->dim > 1) { const auto scale = d_plan->type3_params.h2 * d_plan->type3_params.gam2; const auto D2 = -d_plan->type3_params.D2; thrust::transform( - d_t, d_t + N, d_plan->d_t, + thrust::cuda::par.on(stream), d_t, d_t + N, d_plan->d_t, [scale, D2] __host__ __device__(const T t) -> T { return scale * (t + D2); }); } if (d_plan->dim > 2) { const auto scale = d_plan->type3_params.h3 * d_plan->type3_params.gam3; const auto D3 = -d_plan->type3_params.D3; thrust::transform( - d_u, d_u + N, d_plan->d_u, + thrust::cuda::par.on(stream), d_u, d_u + N, d_plan->d_u, [scale, D3] __host__ __device__(const T u) -> T { return scale * (u + D3); }); } + { + using namespace cufinufft::common; + + std::array fseries_precomp_a{}; + std::array fseries_precomp_f{}; + thrust::device_vector d_fseries_precomp_a(3 * MAX_NQUAD); + thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); + thrust::device_vector 
phi_hat1{}, phi_hat2{}, phi_hat3{}; + if (d_plan->dim > 0) { + phi_hat1.resize(N); + } + if (d_plan->dim > 1) { + phi_hat2.resize(N); + } + if (d_plan->dim > 2) { + phi_hat3.resize(N); + } + onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f.data(), + fseries_precomp_a.data(), d_plan->spopts); + if (d_plan->dim > 1) { + onedim_fseries_kernel_precomp( + d_plan->nf2, fseries_precomp_f.data() + MAX_NQUAD, + fseries_precomp_a.data() + MAX_NQUAD, d_plan->spopts); + } + if (d_plan->dim > 2) { + onedim_fseries_kernel_precomp( + d_plan->nf3, fseries_precomp_f.data() + 2 * MAX_NQUAD, + fseries_precomp_a.data() + 2 * MAX_NQUAD, d_plan->spopts); + } + // copy the precomputed data to the device using thrust + thrust::copy(fseries_precomp_a.begin(), fseries_precomp_a.end(), + d_fseries_precomp_a.begin()); + thrust::copy(fseries_precomp_f.begin(), fseries_precomp_f.end(), + d_fseries_precomp_f.begin()); + // sync the stream before calling the kernel might be needed + if (cufserieskernelcompute(d_plan->dim, N, N, N, d_fseries_precomp_f.data().get(), + d_fseries_precomp_a.data().get(), d_plan->d_s, d_plan->d_t, + d_plan->d_u, phi_hat1.data().get(), phi_hat2.data().get(), + phi_hat3.data().get(), d_plan->spopts.nspread, stream)) + goto finalize; + const auto is_c_finite = std::isfinite(d_plan->type3_params.C1) && + std::isfinite(d_plan->type3_params.C2) && + std::isfinite(d_plan->type3_params.C3); + const auto is_c_nonzero = d_plan->type3_params.C1 != 0 || + d_plan->type3_params.C2 != 0 || + d_plan->type3_params.C3 != 0; + { + const auto dim = d_plan->dim; + const auto phi_hat_iterator = thrust::make_zip_iterator(thrust::make_tuple( + phi_hat1.begin(), dim > 1 ? phi_hat2.begin() : phi_hat1.begin(), + dim > 2 ? 
phi_hat3.begin() : phi_hat1.begin())); + thrust::transform(thrust::cuda::par.on(stream), phi_hat_iterator, + phi_hat_iterator + N, d_plan->deconv, + [dim] __host__ __device__( + const thrust::tuple &tuple) -> cuda_complex { + auto phiHat = thrust::get<0>(tuple); + phiHat *= (dim > 1) ? thrust::get<1>(tuple) : 1; + phiHat *= (dim > 2) ? thrust::get<2>(tuple) : 1; + return cuda_complex{1 / phiHat, 0}; + }); + } + if (is_c_finite && is_c_nonzero) { + const auto dim = d_plan->dim; + const auto c1 = d_plan->type3_params.C1; + const auto c2 = d_plan->type3_params.C2; + const auto c3 = d_plan->type3_params.C3; + const auto d1 = -d_plan->type3_params.D1; + const auto d2 = -d_plan->type3_params.D2; + const auto d3 = -d_plan->type3_params.D3; + const auto imasign = + d_plan->iflag >= 0 ? cuda_complex{0, 1} : cuda_complex{0, -1}; + // passing d_s three times if dim == 1 because d_t and d_u are not allocated + // passing d_s and d_t if dim == 2 because d_u is not allocated + const auto phase_iterator = thrust::make_zip_iterator( + thrust::make_tuple(d_plan->d_s, dim > 1 ? d_plan->d_t : d_plan->d_s, + dim > 2 ? 
d_plan->d_u : d_plan->d_s)); + thrust::transform(thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, + d_plan->deconv, d_plan->deconv, + [c1, c2, c3, d1, d2, d3, imasign] __host__ __device__( + const thrust::tuple tuple, + cuda_complex deconv) -> cuda_complex { + // d2 and d3 are 0 if dim < 2 and dim < 3 + const auto phase = c1 * (thrust::get<0>(tuple) + d1) + + c2 * (thrust::get<1>(tuple) + d2) + + c3 * (thrust::get<2>(tuple) + d3); + return deconv * (std::sin(phase) * imasign + std::cos(phase)); + }); + } + } return 0; finalize: checked_free(d_plan->kx); @@ -614,6 +705,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ checked_free(d_plan->kz); checked_free(d_plan->d_u); checked_free(d_plan->prephase); + checked_free(d_plan->deconv); return FINUFFT_ERR_CUDA_FAILURE; } diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h index 030cd268c..9d7d27191 100644 --- a/include/cufinufft/types.h +++ b/include/cufinufft/types.h @@ -24,9 +24,9 @@ using cuda_complex = typename std::conditional< void>::type>::type; namespace { template struct cufinuftt_type3_params_t { - T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale - T X2, C2, D2, h2, gam2; // y - T X3, C3, D3, h3, gam3; // z + T X1, C1, S1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale + T X2, C2, S2, D2, h2, gam2; // y + T X3, C3, S3, D3, h3, gam3; // z }; } // namespace @@ -88,12 +88,10 @@ template struct cufinufft_plan_t { int *numnupts; int *subprob_to_nupts; - // Temporary variables to do fseries precomputation - T fseries_precomp_a[3 * MAX_NQUAD]; - T fseries_precomp_f[3 * MAX_NQUAD]; - cufftHandle fftplan; cudaStream_t stream; + + using real_t = T; }; template constexpr static inline cufftType_t cufft_type(); diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 9f549e99b..cbd7e59c6 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -130,8 +130,8 @@ __forceinline__ 
__device__ auto interval(const int ns, const double x) { */ template -static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *address, - cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexShared( + cuda_complex *address, cuda_complex res) { const auto raw_address = reinterpret_cast(address); atomicAdd(raw_address, res.x); atomicAdd(raw_address + 1, res.y); @@ -143,8 +143,8 @@ static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *a * on shared memory are supported so we leverage them */ template -static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *address, - cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexGlobal( + cuda_complex *address, cuda_complex res) { if constexpr ( std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { atomicAdd(address, res); @@ -154,9 +154,15 @@ static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *a } template auto arrayrange(int n, T *a, cudaStream_t stream) { - const auto [d_min, d_max] = - thrust::minmax_element(thrust::cuda::par.on(stream), a, a + n); - return std::make_tuple(*d_min, *d_max); + const auto d_min_max = thrust::minmax_element(thrust::cuda::par.on(stream), a, a + n); + + // copy d_min and d_max to host + T min{}, max{}; + checkCudaErrors(cudaMemcpy(&min, thrust::raw_pointer_cast(d_min_max.first), sizeof(T), + cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&max, thrust::raw_pointer_cast(d_min_max.second), sizeof(T), + cudaMemcpyDeviceToHost)); + return std::make_tuple(min, max); } // Writes out w = half-width and c = center of an interval enclosing all a[n]'s @@ -168,7 +174,7 @@ template auto arraywidcen(int n, T *a, cudaStream_t stream) { const auto [lo, hi] = arrayrange(n, a, stream); auto w = (hi - lo) / 2; auto c = (hi + lo) / 2; - if (std::abs(c) < ARRAYWIDCEN_GROWFRAC * (w)) { + if (std::abs(c) < ARRAYWIDCEN_GROWFRAC * w) { w += std::abs(c); c = 0.0; } @@ -198,9 
+204,9 @@ auto set_nhg_type3(T S, T X, const cufinufft_opts &opts, Xsafe = 1.0; Ssafe = 1.0; } else - Xsafe = max(Xsafe, 1 / S); + Xsafe = max(Xsafe, T(1) / S); else - Ssafe = max(Ssafe, 1 / X); + Ssafe = max(Ssafe, T(1) / X); // use the safe X and S... T nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / M_PI + nss; if (!isfinite(nfd)) nfd = 0.0; // use FLT to catch inf @@ -213,7 +219,7 @@ auto set_nhg_type3(T S, T X, const cufinufft_opts &opts, // Note: b is 1 because type 3 uses a type 2 plan, so it should not need the extra // condition that seems to be used by Block Gather as type 2 are only GM-sort auto h = 2 * T(M_PI) / nf; // upsampled grid spacing - auto gam = (T)nf / (2.0 * opts.upsampfac * Ssafe); // x scale fac to x' + auto gam = T(nf) / (2.0 * opts.upsampfac * Ssafe); // x scale fac to x' return std::make_tuple(nf, h, gam); } diff --git a/src/cuda/common.cu b/src/cuda/common.cu index f5c5a37fd..453df96b0 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -50,7 +50,40 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, i += blockDim.x * gridDim.x) { T x = 0.0; for (int n = 0; n < q; n++) { - x += ft[n] * 2 * cos(i * at[n]); + x += ft[n] * T(2) * cos(T(i) * at[n]); + } + oarr[i] = x; + } +} + +template +__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T *kx, + T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2, + T *fwkerhalf3, int ns) { + T J2 = ns / 2.0; + int q = (int)(2 + 3.0 * J2); + int nf; + T *at = a + threadIdx.y * MAX_NQUAD; + T *ft = f + threadIdx.y * MAX_NQUAD; + T *oarr, *k; + if (threadIdx.y == 0) { + k = kx; + oarr = fwkerhalf1; + nf = nf1; + } else if (threadIdx.y == 1) { + k = ky; + oarr = fwkerhalf2; + nf = nf2; + } else { + k = kz; + oarr = fwkerhalf3; + nf = nf3; + } + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nf; + i += blockDim.x * gridDim.x) { + T x = 0.0; + for (int n = 0; n < q; n++) { + x += ft[n] * T(2) * cos(k[i] * at[n]); } oarr[i] = x; } @@ -79,6 +112,30 @@ 
int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, return 0; } +template +int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, T *d_kx, + T *d_ky, T *d_kz, T *d_fwkerhalf1, T *d_fwkerhalf2, + T *d_fwkerhalf3, int ns, cudaStream_t stream) +/* + wrapper for approximation of Fourier series of real symmetric spreading + kernel. + +Melody Shih 2/20/22 +*/ +{ + int nout = max(max(nf1 / 2 + 1, nf2 / 2 + 1), nf3 / 2 + 1); + + dim3 threadsPerBlock(16, dim); + dim3 numBlocks((nout + 16 - 1) / 16, 1); + + fseries_kernel_compute<<>>( + nf1, nf2, nf3, d_f, d_a, d_kx, d_ky, d_kz, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, + ns); + RETURN_IF_CUDA_ERROR + + return 0; +} + template int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts) // Set up the spreader parameters given eps, and pass across various nufft @@ -149,7 +206,7 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts f - funciton values at quadrature nodes multiplied with quadrature weights (a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below) */ -template +template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, finufft_spread_opts opts) { T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support @@ -163,7 +220,11 @@ void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n z[n] *= J2; // rescale nodes f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei - a[n] = ((T)(2.0 * M_PI) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates + if constexpr (phase_winding) { + a[n] = ((T)(2.0 * M_PI) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates + } else { + a[n] = z[n]; + } } } @@ -321,17 +382,28 @@ template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps, cufinufft_opts opts); template int setup_spreader_for_nufft(finufft_spread_opts &spopts, 
double eps, cufinufft_opts opts); -template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, float *f, float *a, - finufft_spread_opts opts); -template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, double *f, double *a, - finufft_spread_opts opts); +template void onedim_fseries_kernel_precomp( + CUFINUFFT_BIGINT nf, float *f, float *a, finufft_spread_opts opts); +template void onedim_fseries_kernel_precomp( + CUFINUFFT_BIGINT nf, double *f, double *a, finufft_spread_opts opts); +template void onedim_fseries_kernel_precomp( + CUFINUFFT_BIGINT nf, float *f, float *a, finufft_spread_opts opts); +template void onedim_fseries_kernel_precomp( + CUFINUFFT_BIGINT nf, double *f, double *a, finufft_spread_opts opts); template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f, float *d_a, float *d_fwkerhalf1, float *d_fwkerhalf2, float *d_fwkerhalf3, int ns, cudaStream_t stream); template int cufserieskernelcompute( int dim, int nf1, int nf2, int nf3, double *d_f, double *d_a, double *d_fwkerhalf1, double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, cudaStream_t stream); - +template int cufserieskernelcompute( + int dim, int nf1, int nf2, int nf3, float *d_f, float *d_a, float *d_kx, float *d_ky, + float *d_kz, float *d_fwkerhalf1, float *d_fwkerhalf2, float *d_fwkerhalf3, int ns, + cudaStream_t stream); +template int cufserieskernelcompute( + int dim, int nf1, int nf2, int nf3, double *d_f, double *d_a, double *d_kx, + double *d_ky, double *d_kz, double *d_fwkerhalf1, double *d_fwkerhalf2, + double *d_fwkerhalf3, int ns, cudaStream_t stream); // template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, // finufft_spread_opts opts); // template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index a1904e52c..534fa5358 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -112,7 +112,7 @@ void cufinufft_default_opts(cufinufft_opts 
*opts) opts->gpu_method = 0; opts->gpu_sort = 1; opts->gpu_kerevalmeth = 1; - opts->upsampfac = 2.0; + opts->upsampfac = 0; opts->gpu_maxsubprobsize = 1024; opts->gpu_obinsizex = 0; opts->gpu_obinsizey = 0; diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index 8293c0145..6f3f31abd 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -423,7 +423,8 @@ void freegpumemory(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; + // passing the stream by reference was causing a segfault + const auto stream = d_plan->stream; CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index c7323f57b..bb2d8444a 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -7,12 +7,16 @@ foreach(srcfile ${test_src}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) find_library(MathLib m) if(MathLib) - target_link_libraries(${executable} PUBLIC cufinufft ${MathLib}) + target_link_libraries(${executable} PUBLIC ${MathLib}) endif() + target_link_libraries(${executable} PUBLIC cufinufft finufft) target_compile_features(${executable} PUBLIC cxx_std_17) set_target_properties( ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) + target_compile_options( + ${executable} PUBLIC $<$:$<$:-G + -maxrregcount 32>>) message(STATUS "Adding test ${executable}" " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}") @@ -78,15 +82,21 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) ${PREC} ${UPSAMP}) endfunction() +add_test(NAME cufinufft_public_api COMMAND public_api_test) +add_test(NAME cufinufft_makeplan COMMAND test_makeplan) +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + add_test(NAME 
cufinufft_makeplan_impl COMMAND cufinufft_makeplan_impl) +endif() +add_test(NAME cufinufft_setpts COMMAND cufinufft_setpts) +add_test(NAME cufinufft_math_test COMMAND cufinufft_math_test) + add_tests(float 1e-5 2e-4 2.0) add_tests(double 1e-12 1e-11 2.0) add_tests(float 1e-5 2e-4 1.25) add_tests(double 1e-8 1e-7 1.25) -add_tests(float 1e-5 2e-4 0) -add_tests(double 1e-12 1e-11 0) -add_tests(float 1e-5 2e-4 0.0) -add_tests(double 1e-8 1e-7 0.0) +add_tests(float 1e-5 2e-4 0.f) +add_tests(double 1e-12 1e-11 0.f) +add_tests(float 1e-5 2e-4 0.) +add_tests(double 1e-8 1e-7 0.) -add_test(NAME cufinufft_public_api COMMAND public_api_test) -add_test(NAME cufinufft_makeplan COMMAND test_makeplan) -add_test(NAME cufinufft_math_test COMMAND cufinufft_math_test) +# add_test(NAME cufinufft_type3_test COMMAND cufinufft_type3_test) diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu index dbd6260ac..38313786a 100644 --- a/test/cuda/cufinufft1d_test.cu +++ b/test/cuda/cufinufft1d_test.cu @@ -1,12 +1,12 @@ #include #include -#include #include #include #include #include +#include #include #include diff --git a/test/cuda/cufinufft_makeplan_impl.cu b/test/cuda/cufinufft_makeplan_impl.cu new file mode 100644 index 000000000..31fd695db --- /dev/null +++ b/test/cuda/cufinufft_makeplan_impl.cu @@ -0,0 +1,160 @@ +#ifdef NDEBUG +#undef NDEBUG +#include +#define NDEBUG +#else +#include +#endif + +#include +#include +#include + +#include + +int main() { + // defaults. 
tests should shadow them to override + const int iflag = 1; + const float tol = 1e-5; + const int ntransf = 1; + const int dim = 3; + int N[3] = {10, 20, 15}; + const auto cpu_planer = [iflag, tol, ntransf, dim, N](const auto type) { + int64_t Nl[3] = {int64_t(N[0]), int64_t(N[1]), int64_t(N[2])}; + finufft_plan_s *plan{nullptr}; + assert(finufft_makeplan(type, dim, Nl, iflag, ntransf, tol, &plan, nullptr) == 0); + return plan; + }; + const auto test_type1 = [iflag, tol, ntransf, dim, N, cpu_planer](auto *plan) { + // plan is a pointer to a type that contains real_t + using T = typename std::remove_pointer::type::real_t; + const int type = 1; + assert(cufinufft_makeplan_impl(type, dim, (int *)N, iflag, ntransf, T(tol), &plan, + nullptr) == 0); + const auto cpu_plan = cpu_planer(type); + cudaDeviceSynchronize(); + assert(plan->ms == N[0]); + assert(plan->mt == N[1]); + assert(plan->mu == N[2]); + assert(plan->nf1 >= N[0]); + assert(plan->nf2 >= N[1]); + assert(plan->nf3 >= N[2]); + assert(plan->fftplan != 0); + assert(plan->fwkerhalf1 != nullptr); + assert(plan->fwkerhalf2 != nullptr); + assert(plan->fwkerhalf3 != nullptr); + assert(plan->spopts.spread_direction == type); + assert(plan->type == type); + assert(plan->nf1 == cpu_plan->nf1); + assert(plan->nf2 == cpu_plan->nf2); + assert(plan->nf3 == cpu_plan->nf3); + int nf[] = {plan->nf1, plan->nf2, plan->nf3}; + T *fwkerhalf[] = {plan->fwkerhalf1, plan->fwkerhalf2, plan->fwkerhalf3}; + T *phiHat[] = {cpu_plan->phiHat1, cpu_plan->phiHat2, cpu_plan->phiHat3}; + for (int idx = 0; idx < dim; ++idx) { + const auto size = (nf[idx] / 2 + 1); + std::vector fwkerhalf_host(size, -1); + const auto ier = cudaMemcpy(fwkerhalf_host.data(), fwkerhalf[idx], size * sizeof(T), + cudaMemcpyDeviceToHost); + if (ier != cudaSuccess) { + std::cerr << "Error: " << cudaGetErrorString(ier) << std::endl; + } + assert(ier == cudaSuccess); + for (int i = 0; i < size; i++) { + assert(abs(1 - fwkerhalf_host[i] / phiHat[idx][i]) < tol); + } + } + 
assert(cufinufft_destroy_impl(plan) == 0); + assert(finufft_destroy(cpu_plan) == 0); + plan = nullptr; + }; + auto test_type2 = [iflag, tol, ntransf, dim, N, cpu_planer](auto plan) { + // plan is a pointer to a type that contains real_t + using T = typename std::remove_pointer::type::real_t; + const int type = 2; + assert(cufinufft_makeplan_impl(type, dim, (int *)N, iflag, ntransf, T(tol), &plan, + nullptr) == 0); + const auto cpu_plan = cpu_planer(type); + cudaDeviceSynchronize(); + assert(plan->ms == N[0]); + assert(plan->mt == N[1]); + assert(plan->mu == N[2]); + assert(plan->nf1 >= N[0]); + assert(plan->nf2 >= N[1]); + assert(plan->nf3 >= N[2]); + assert(plan->fftplan != 0); + assert(plan->fwkerhalf1 != nullptr); + assert(plan->fwkerhalf2 != nullptr); + assert(plan->fwkerhalf3 != nullptr); + assert(plan->spopts.spread_direction == type); + assert(plan->type == type); + assert(plan->opts.gpu_method == 1); + assert(plan->nf1 == cpu_plan->nf1); + assert(plan->nf2 == cpu_plan->nf2); + assert(plan->nf3 == cpu_plan->nf3); + assert(plan->spopts.nspread == cpu_plan->spopts.nspread); + int nf[] = {plan->nf1, plan->nf2, plan->nf3}; + T *fwkerhalf[] = {plan->fwkerhalf1, plan->fwkerhalf2, plan->fwkerhalf3}; + T *phiHat[] = {cpu_plan->phiHat1, cpu_plan->phiHat2, cpu_plan->phiHat3}; + for (int idx = 0; idx < dim; ++idx) { + const auto size = (nf[idx] / 2 + 1); + std::vector fwkerhalf_host(size, -1); + const auto ier = cudaMemcpy(fwkerhalf_host.data(), fwkerhalf[idx], size * sizeof(T), + cudaMemcpyDeviceToHost); + if (ier != cudaSuccess) { + std::cerr << "Error: " << cudaGetErrorString(ier) << std::endl; + } + assert(ier == cudaSuccess); + cudaDeviceSynchronize(); + for (int i = 0; i < size; i++) { + assert(abs(1 - fwkerhalf_host[i] / phiHat[idx][i]) < tol); + } + } + assert(cufinufft_destroy_impl(plan) == 0); + cudaDeviceSynchronize(); + assert(finufft_destroy(cpu_plan) == 0); + plan = nullptr; + }; + auto test_type3 = [iflag, tol, ntransf, dim, N, cpu_planer](auto plan) { + 
// plan is a pointer to a type that contains real_t + using T = typename std::remove_pointer::type::real_t; + const int type = 3; + assert(cufinufft_makeplan_impl(type, dim, (int *)N, iflag, ntransf, T(tol), &plan, + nullptr) == 0); + cudaDeviceSynchronize(); + assert(plan->ms == 0); + assert(plan->mt == 0); + assert(plan->mu == 0); + assert(plan->nf1 == 1); + assert(plan->nf2 == 1); + assert(plan->nf3 == 1); + assert(plan->fftplan == 0); + assert(plan->fwkerhalf1 == nullptr); + assert(plan->fwkerhalf2 == nullptr); + assert(plan->fwkerhalf3 == nullptr); + assert(plan->spopts.spread_direction == type); + assert(plan->type == type); + assert(plan->opts.gpu_method == 0); + assert(plan->opts.upsampfac == 1.25); + assert(cufinufft_destroy_impl(plan) == 0); + plan = nullptr; + cudaDeviceSynchronize(); + }; + // testing correctness of the plan creation + // cufinufft_plan_t *single_plan{nullptr}; + cufinufft_plan_t *double_plan{nullptr}; + test_type1(double_plan); + test_type2(double_plan); + test_type3(double_plan); + return 0; +} + +#ifdef __clang__ +#pragma clang diagnostic pop +#elif defined(__GNUC__) || defined(__GNUG__) +#pragma GCC diagnostic pop +#elif defined(__NVCC__) +#pragma diag_default 177 - D +#elif defined(_MSC_VER) +#pragma warning(pop) +#endif diff --git a/test/cuda/cufinufft_setpts.cu b/test/cuda/cufinufft_setpts.cu new file mode 100644 index 000000000..cfe420474 --- /dev/null +++ b/test/cuda/cufinufft_setpts.cu @@ -0,0 +1,242 @@ +#ifdef NDEBUG +#undef NDEBUG +#include +#define NDEBUG +#else +#include +#endif + +#include + +#include +#include +#include + +#include + +#include +#include +#include + +// for now, once finufft is demacroized we can test float +using T = double; + +template bool equal(V *d_vec, T *cpu, const std::size_t size) { + // copy d_vec to cpu + thrust::host_vector h_vec(size); + // this implicitly converts cuda_complex to std::complex... 
which is fine, but it may + // cause issues use it with case + assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == + cudaSuccess); + for (std::size_t i = 0; i < size; ++i) { + if (h_vec[i] != cpu[i]) { + std::cout << " gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] + << std::endl; + return false; + } + } + return true; +} + +template +T relerrtwonorm(std::complex *a, std::complex *b, const std::size_t n) { + T err = 0.0, nrm = 0.0; + for (std::size_t m = 0; m < n; ++m) { + // std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; + nrm += std::real(std::conj(a[m]) * a[m]); + const auto diff = a[m] - b[m]; + err += std::real(std::conj(diff) * diff); + } + return std::sqrt(err / nrm); +} + +template +auto almost_equal(V *d_vec, + T *cpu, + const std::size_t size, + const contained tol = std::numeric_limits::epsilon()) { + // copy d_vec to cpu + std::vector h_vec(size); + // this implicitly converts cuda_complex to std::complex... which is fine, but it may + // cause issues use it with case + assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == + cudaSuccess); + // compare the l2 norm of the difference between the two vectors + if (relerrtwonorm(h_vec.data(), cpu, size) < tol) { + return true; + } + // std::cout << "relerrtwonorm: " << relerrtwonorm(h_vec.data(), cpu, size) << + // std::endl; + return false; +} + +int main() { + // defaults. 
tests should shadow them to override + cufinufft_opts opts; + cufinufft_default_opts(&opts); + opts.debug = 2; + finufft_opts fin_opts; + finufft_default_opts(&fin_opts); + fin_opts.debug = 2; + const int iflag = 1; + const float tol = 1e-5; + const int ntransf = 1; + const int dim = 3; + int n_modes[3] = {10, 20, 15}; + const int N = n_modes[0] * n_modes[1] * n_modes[2]; + const int M = 100; + + thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), s(N * ntransf), + t(N * ntransf), u(N * ntransf); + thrust::host_vector> c(M * ntransf), fk(N * ntransf); + + thrust::device_vector d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{}; + thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); + + std::default_random_engine eng(42); + std::uniform_real_distribution dist11(-1, 1); + auto rand_util_11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int64_t i = 0; i < M; i++) { + x[i] = M_PI * rand_util_11(); // x in [-pi,pi) + y[i] = M_PI * rand_util_11(); + z[i] = M_PI * rand_util_11(); + } + for (int64_t i = 0; i < N; i++) { + s[i] = M_PI * rand_util_11(); + t[i] = M_PI * rand_util_11(); + u[i] = M_PI * rand_util_11(); + } + + for (int64_t i = M; i < M * ntransf; ++i) { + int64_t j = i % M; + x[i] = x[j]; + y[i] = y[j]; + z[i] = z[j]; + } + for (int64_t i = M; i < N * ntransf; ++i) { + int64_t j = i % N; + s[i] = s[j]; + t[i] = t[j]; + u[i] = u[j]; + } + // copy x, y, z, s, t, u to device d_x, d_y, d_z, d_s, d_t, d_u + d_x = x; + d_y = y; + d_z = z; + d_s = s; + d_t = t; + d_u = u; + cudaDeviceSynchronize(); + + const auto cpu_planer = + [iflag, tol, ntransf, dim, n_modes, M, N, &x, &y, &z, &s, &t, &u, &fin_opts]( + const auto type) { + int64_t Nl[3] = {int64_t(n_modes[0]), int64_t(n_modes[1]), int64_t(n_modes[2])}; + finufft_plan_s *plan{nullptr}; + assert( + finufft_makeplan(type, dim, Nl, iflag, ntransf, tol, &plan, &fin_opts) == 0); + assert(finufft_setpts(plan, M, x.data(), y.data(), z.data(), N, s.data(), + t.data(), u.data()) == 
0); + return plan; + }; + const auto test_type1 = [iflag, tol, ntransf, dim, n_modes, cpu_planer, &opts]( + auto *plan) { + // plan is a pointer to a type that contains real_t + using T = typename std::remove_pointer::type::real_t; + const int type = 1; + assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), + &plan, &opts) == 0); + const auto cpu_plan = cpu_planer(type); + cudaDeviceSynchronize(); + assert(cufinufft_destroy_impl(plan) == 0); + assert(finufft_destroy(cpu_plan) == 0); + plan = nullptr; + }; + auto test_type2 = [iflag, tol, ntransf, dim, n_modes, cpu_planer, &opts](auto plan) { + // plan is a pointer to a type that contains real_t + using T = typename std::remove_pointer::type::real_t; + const int type = 2; + assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), + &plan, &opts) == 0); + const auto cpu_plan = cpu_planer(type); + cudaDeviceSynchronize(); + assert(cufinufft_destroy_impl(plan) == 0); + cudaDeviceSynchronize(); + assert(finufft_destroy(cpu_plan) == 0); + plan = nullptr; + }; + auto test_type3 = [iflag, + tol, + ntransf, + dim, + n_modes, + cpu_planer, + M, + N, + &d_x, + &d_y, + &d_z, + &d_s, + &d_t, + &d_u, + &opts](auto plan) { + // plan is a pointer to a type that contains real_t + using T = typename std::remove_pointer::type::real_t; + const int type = 3; + const auto cpu_plan = cpu_planer(type); + assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), + &plan, &opts) == 0); + assert(cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), + d_z.data().get(), N, d_s.data().get(), + d_t.data().get(), d_u.data().get(), plan) == 0); + cudaDeviceSynchronize(); + assert(plan->type3_params.X1 == cpu_plan->t3P.X1); + assert(plan->type3_params.X2 == cpu_plan->t3P.X2); + assert(plan->type3_params.X3 == cpu_plan->t3P.X3); + assert(plan->type3_params.C1 == cpu_plan->t3P.C1); + assert(plan->type3_params.C2 == cpu_plan->t3P.C2); + assert(plan->type3_params.C3 == 
cpu_plan->t3P.C3); + assert(plan->type3_params.D1 == cpu_plan->t3P.D1); + assert(plan->type3_params.D2 == cpu_plan->t3P.D2); + assert(plan->type3_params.D3 == cpu_plan->t3P.D3); + assert(plan->type3_params.gam1 == cpu_plan->t3P.gam1); + assert(plan->type3_params.gam2 == cpu_plan->t3P.gam2); + assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); + assert(plan->nf1 == cpu_plan->nf1); + assert(plan->nf2 == cpu_plan->nf2); + assert(plan->nf2 == cpu_plan->nf2); + assert(equal(plan->kx, cpu_plan->X, M)); + assert(equal(plan->ky, cpu_plan->Y, M)); + assert(equal(plan->kz, cpu_plan->Z, M)); + assert(equal(plan->prephase, cpu_plan->prephase, M)); + assert(equal(plan->d_s, cpu_plan->Sp, N)); + assert(equal(plan->d_t, cpu_plan->Tp, N)); + assert(equal(plan->d_u, cpu_plan->Up, N)); + assert(almost_equal(plan->deconv, cpu_plan->deconv, N, tol * T(1e-2))); + assert(cufinufft_destroy_impl(plan) == 0); + assert(finufft_destroy(cpu_plan) == 0); + plan = nullptr; + cudaDeviceSynchronize(); + }; + // testing correctness of the plan creation + // cufinufft_plan_t *single_plan{nullptr}; + cufinufft_plan_t *double_plan{nullptr}; + // test_type1(double_plan); + // test_type2(double_plan); + test_type3(double_plan); + return 0; +} + +#ifdef __clang__ +#pragma clang diagnostic pop +#elif defined(__GNUC__) || defined(__GNUG__) +#pragma GCC diagnostic pop +#elif defined(__NVCC__) +#pragma diag_default 177 - D +#elif defined(_MSC_VER) +#pragma warning(pop) +#endif diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu new file mode 100644 index 000000000..4ea69d786 --- /dev/null +++ b/test/cuda/cufinufft_type3_test.cu @@ -0,0 +1,13 @@ +#ifdef NDEBUG +#undef NDEBUG +#include +#define NDEBUG +#else +#include +#endif + +#include +#include +#include + +int main() { return 0; } diff --git a/test/cuda/fseries_kernel_test.cu b/test/cuda/fseries_kernel_test.cu index 0e1766e9e..0ab233ace 100644 --- a/test/cuda/fseries_kernel_test.cu +++ b/test/cuda/fseries_kernel_test.cu 
@@ -71,10 +71,12 @@ template int run_test(int nf1, int dim, T eps, int gpu, int nf2, int timer.start(); T a[dim * MAX_NQUAD]; T f[dim * MAX_NQUAD]; - onedim_fseries_kernel_precomp(nf1, f, a, opts); - if (dim > 1) onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, opts); + onedim_fseries_kernel_precomp(nf1, f, a, opts); + if (dim > 1) + onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, opts); if (dim > 2) - onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, opts); + onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, + opts); cputime = timer.elapsedsec(); T *d_a; From 074dda53766755d4ec0bca27533e402911fce63e Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 14 Aug 2024 10:49:26 -0400 Subject: [PATCH 39/68] Adding prephase and deconv with tests --- include/cufinufft/impl.h | 132 +++++++++++++++++----------------- src/finufft.cpp | 1 - test/cuda/cufinufft_setpts.cu | 39 ++++++---- 3 files changed, 94 insertions(+), 78 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 750ca1fcd..59c38f9cf 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -435,6 +435,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ d_plan->d_s = d_s; d_plan->d_t = d_t; d_plan->d_u = d_u; + const auto dim = d_plan->dim; // no need to set the params to zero, as they are already zeroed out in the plan // memset(d_plan->type3_params, 0, sizeof(d_plan->type3_params)); using namespace cufinufft::utils; @@ -499,6 +500,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ __func__); return FINUFFT_ERR_MAXNALLOC; } + + // A macro might be better as it has access to __line__ and __func__ const auto checked_free = [stream, pool = d_plan->supports_pools](auto x) constexpr { if (!x) return cudaFreeWrapper(x, stream, pool); return cudaSuccess; @@ -556,32 +559,32 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T 
*d_kz, int N, T *d_s, T *d_ d_plan->type3_params.D3 != 0) { // if ky is null, use kx for ky and kz // this is not the most efficient implementation, but it is the most compact - const auto iterator = thrust::make_zip_iterator( - thrust::make_tuple(d_plan->kx, - // to avoid out of bounds access, use kx if ky is null - d_plan->ky ? d_plan->ky : d_plan->kx, - // same idea as above - d_plan->kz ? d_plan->kz : d_plan->kx)); - const auto D1 = d_plan->type3_params.D1; - const auto D2 = d_plan->type3_params.D2; // this should be 0 if dim < 2 - const auto D3 = d_plan->type3_params.D3; // this should be 0 if dim < 3 - const auto imasign = - d_plan->iflag >= 0 ? cuda_complex{0, 1} : cuda_complex{0, -1}; - thrust::transform(thrust::cuda::par.on(stream), iterator, iterator + M, - d_plan->prephase, - [D1, D2, D3, imasign] __host__ __device__( - const thrust::tuple &tuple) -> cuda_complex { - const auto x = thrust::get<0>(tuple); - const auto y = thrust::get<1>(tuple); - const auto z = thrust::get<2>(tuple); - // no branching because D2 and D3 are 0 if dim < 2 and dim < 3 - // this is generally faster on GPU - const auto phase = D1 * x + D2 * y + D3 * z; - // TODO: nvcc should have the sincos function - // check the cos + i*sin - // ref: https://en.wikipedia.org/wiki/Cis_(mathematics) - return sin(phase) * imasign + cos(phase); - }); + const auto iterator = + thrust::make_zip_iterator(thrust::make_tuple(d_kx, + // to avoid out of bounds access, use + // kx if ky is null + (d_plan->dim > 1) ? d_ky : d_kx, + // same idea as above + (d_plan->dim > 1) ? d_kz : d_kx)); + const auto D1 = d_plan->type3_params.D1; + const auto D2 = d_plan->type3_params.D2; // this should be 0 if dim < 2 + const auto D3 = d_plan->type3_params.D3; // this should be 0 if dim < 3 + const auto imasign = d_plan->iflag >= 0 ? 
T(1) : T(-1); + thrust::transform( + thrust::cuda::par.on(stream), iterator, iterator + M, d_plan->prephase, + [D1, D2, D3, imasign] __host__ __device__( + const thrust::tuple &tuple) -> cuda_complex { + const auto x = thrust::get<0>(tuple); + const auto y = thrust::get<1>(tuple); + const auto z = thrust::get<2>(tuple); + // no branching because D2 and D3 are 0 if dim < 2 and dim < 3 + // this is generally faster on GPU + const auto phase = D1 * x + D2 * y + D3 * z; + // TODO: nvcc should have the sincos function + // check the cos + i*sin + // ref: https://en.wikipedia.org/wiki/Cis_(mathematics) + return cuda_complex{std::cos(phase), std::sin(phase) * imasign}; + }); } else { thrust::fill(thrust::cuda::par.on(stream), d_plan->prephase, d_plan->prephase + M, cuda_complex{1, 0}); @@ -607,7 +610,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ thrust::cuda::par.on(stream), d_u, d_u + N, d_plan->d_u, [scale, D3] __host__ __device__(const T u) -> T { return scale * (u + D3); }); } - { + { // here we declare phi_hat1, phi_hat2, and phi_hat3 + // and the precomputed data for the fseries kernel using namespace cufinufft::common; std::array fseries_precomp_a{}; @@ -647,54 +651,54 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ d_plan->d_u, phi_hat1.data().get(), phi_hat2.data().get(), phi_hat3.data().get(), d_plan->spopts.nspread, stream)) goto finalize; + const auto is_c_finite = std::isfinite(d_plan->type3_params.C1) && std::isfinite(d_plan->type3_params.C2) && std::isfinite(d_plan->type3_params.C3); const auto is_c_nonzero = d_plan->type3_params.C1 != 0 || d_plan->type3_params.C2 != 0 || d_plan->type3_params.C3 != 0; - { - const auto dim = d_plan->dim; - const auto phi_hat_iterator = thrust::make_zip_iterator(thrust::make_tuple( - phi_hat1.begin(), dim > 1 ? phi_hat2.begin() : phi_hat1.begin(), - dim > 2 ? 
phi_hat3.begin() : phi_hat1.begin())); - thrust::transform(thrust::cuda::par.on(stream), phi_hat_iterator, - phi_hat_iterator + N, d_plan->deconv, - [dim] __host__ __device__( - const thrust::tuple &tuple) -> cuda_complex { - auto phiHat = thrust::get<0>(tuple); - phiHat *= (dim > 1) ? thrust::get<1>(tuple) : 1; - phiHat *= (dim > 2) ? thrust::get<2>(tuple) : 1; - return cuda_complex{1 / phiHat, 0}; - }); - } + + const auto phi_hat_iterator = thrust::make_zip_iterator(thrust::make_tuple( + phi_hat1.begin(), dim > 1 ? phi_hat2.begin() : phi_hat1.begin(), + dim > 2 ? phi_hat3.begin() : phi_hat1.begin())); + thrust::transform(thrust::cuda::par.on(stream), phi_hat_iterator, + phi_hat_iterator + N, d_plan->deconv, + [dim] __host__ __device__( + const thrust::tuple &tuple) -> cuda_complex { + auto phiHat = thrust::get<0>(tuple); + phiHat *= (dim > 1) ? thrust::get<1>(tuple) : T(1); + phiHat *= (dim > 2) ? thrust::get<2>(tuple) : T(1); + return cuda_complex{T(1) / phiHat, T(0)}; + }); + if (is_c_finite && is_c_nonzero) { - const auto dim = d_plan->dim; - const auto c1 = d_plan->type3_params.C1; - const auto c2 = d_plan->type3_params.C2; - const auto c3 = d_plan->type3_params.C3; - const auto d1 = -d_plan->type3_params.D1; - const auto d2 = -d_plan->type3_params.D2; - const auto d3 = -d_plan->type3_params.D3; - const auto imasign = - d_plan->iflag >= 0 ? cuda_complex{0, 1} : cuda_complex{0, -1}; + const auto c1 = d_plan->type3_params.C1; + const auto c2 = d_plan->type3_params.C2; + const auto c3 = d_plan->type3_params.C3; + const auto d1 = -d_plan->type3_params.D1; + const auto d2 = -d_plan->type3_params.D2; + const auto d3 = -d_plan->type3_params.D3; + const auto imasign = d_plan->iflag >= 0 ? T(1) : T(-1); // passing d_s three times if dim == 1 because d_t and d_u are not allocated // passing d_s and d_t if dim == 2 because d_u is not allocated const auto phase_iterator = thrust::make_zip_iterator( - thrust::make_tuple(d_plan->d_s, dim > 1 ? 
d_plan->d_t : d_plan->d_s, - dim > 2 ? d_plan->d_u : d_plan->d_s)); - thrust::transform(thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, - d_plan->deconv, d_plan->deconv, - [c1, c2, c3, d1, d2, d3, imasign] __host__ __device__( - const thrust::tuple tuple, - cuda_complex deconv) -> cuda_complex { - // d2 and d3 are 0 if dim < 2 and dim < 3 - const auto phase = c1 * (thrust::get<0>(tuple) + d1) + - c2 * (thrust::get<1>(tuple) + d2) + - c3 * (thrust::get<2>(tuple) + d3); - return deconv * (std::sin(phase) * imasign + std::cos(phase)); - }); + thrust::make_tuple(d_s, dim > 1 ? d_t : d_s, dim > 2 ? d_u : d_s)); + thrust::transform( + thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, + d_plan->deconv, d_plan->deconv, + [c1, c2, c3, d1, d2, d3, imasign] __host__ __device__( + const thrust::tuple tuple, cuda_complex deconv) + -> cuda_complex { + // d2 and d3 are 0 if dim < 2 and dim < 3 + const auto phase = c1 * (thrust::get<0>(tuple) + d1) + + c2 * (thrust::get<1>(tuple) + d2) + + c3 * (thrust::get<2>(tuple) + d3); + return cuda_complex{std::cos(phase), imasign * std::sin(phase)} * deconv; + }); } + // exiting the block frees the memory allocated for phi_hat1, phi_hat2, and phi_hat3 + // and the precomputed data for the fseries kernel } return 0; finalize: diff --git a/src/finufft.cpp b/src/finufft.cpp index ed917514d..2fb5d0a71 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -755,7 +755,6 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, // idist, ot, onembed, ostride, odist, sign, flags { std::lock_guard lock(fftw_lock); - // FFTW_PLAN_TH sets all future fftw_plan calls to use nthr_fft threads. // FIXME: Since this might override what the user wants for fftw, we'd like to // set it just for our one plan and then revert to the user value. 
diff --git a/test/cuda/cufinufft_setpts.cu b/test/cuda/cufinufft_setpts.cu index cfe420474..b071275c0 100644 --- a/test/cuda/cufinufft_setpts.cu +++ b/test/cuda/cufinufft_setpts.cu @@ -38,9 +38,22 @@ template bool equal(V *d_vec, T *cpu, const std::size_t return true; } +template +T infnorm(std::complex *a, std::complex *b, const std::size_t n) { + T err{0}, max_element{0}; + for (std::size_t m = 0; m < n; ++m) { + // std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; + err = std::max(err, std::abs(a[m] - b[m])); + max_element = std::max(std::max(std::abs(a[m]), std::abs(b[m])), max_element); + } + return err / max_element; +} +// max error divide by max element +// max ( abs(a-b)) / max(abs(a)) +// 10*(machine precision) template T relerrtwonorm(std::complex *a, std::complex *b, const std::size_t n) { - T err = 0.0, nrm = 0.0; + T err{0}, nrm{0}; for (std::size_t m = 0; m < n; ++m) { // std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; nrm += std::real(std::conj(a[m]) * a[m]); @@ -61,12 +74,11 @@ auto almost_equal(V *d_vec, // cause issues use it with case assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess); + std::cout << "infnorm: " << infnorm(h_vec.data(), cpu, size) << std::endl; // compare the l2 norm of the difference between the two vectors - if (relerrtwonorm(h_vec.data(), cpu, size) < tol) { + if (infnorm(h_vec.data(), cpu, size) < tol) { return true; } - // std::cout << "relerrtwonorm: " << relerrtwonorm(h_vec.data(), cpu, size) << - // std::endl; return false; } @@ -79,7 +91,7 @@ int main() { finufft_default_opts(&fin_opts); fin_opts.debug = 2; const int iflag = 1; - const float tol = 1e-5; + const float tol = 1e-9; const int ntransf = 1; const int dim = 3; int n_modes[3] = {10, 20, 15}; @@ -101,14 +113,14 @@ int main() { // Making data for (int64_t i = 0; i < M; i++) { - x[i] = M_PI * rand_util_11(); // x in [-pi,pi) - y[i] = M_PI * rand_util_11(); - 
z[i] = M_PI * rand_util_11(); + x[i] = M_PI * rand_util_11() + 4; // x in [-pi,pi) + y[i] = M_PI * rand_util_11() + 4; + z[i] = M_PI * rand_util_11() + 4; } for (int64_t i = 0; i < N; i++) { - s[i] = M_PI * rand_util_11(); - t[i] = M_PI * rand_util_11(); - u[i] = M_PI * rand_util_11(); + s[i] = M_PI * rand_util_11() + 8; // shifted so D1 is 8 + t[i] = M_PI * rand_util_11() + 8; // shifted so D2 is 8 + u[i] = M_PI * rand_util_11() + 8; // shifted so D3 is 8 } for (int64_t i = M; i < M * ntransf; ++i) { @@ -208,14 +220,15 @@ int main() { assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); assert(plan->nf1 == cpu_plan->nf1); assert(plan->nf2 == cpu_plan->nf2); - assert(plan->nf2 == cpu_plan->nf2); + assert(plan->nf3 == cpu_plan->nf3); assert(equal(plan->kx, cpu_plan->X, M)); assert(equal(plan->ky, cpu_plan->Y, M)); assert(equal(plan->kz, cpu_plan->Z, M)); - assert(equal(plan->prephase, cpu_plan->prephase, M)); assert(equal(plan->d_s, cpu_plan->Sp, N)); assert(equal(plan->d_t, cpu_plan->Tp, N)); assert(equal(plan->d_u, cpu_plan->Up, N)); + // NOTE:seems with infnorm we are getting at most 11 digits of precision + assert(almost_equal(plan->prephase, cpu_plan->prephase, M, tol * T(1e-2))); assert(almost_equal(plan->deconv, cpu_plan->deconv, N, tol * T(1e-2))); assert(cufinufft_destroy_impl(plan) == 0); assert(finufft_destroy(cpu_plan) == 0); From 332b5b7ce43e889ab4570f2b9e2eb90bb0ef81f6 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 15 Aug 2024 14:08:05 -0400 Subject: [PATCH 40/68] first 3D working version --- CMakeLists.txt | 1 + include/cufinufft/impl.h | 129 +++++++++---- include/cufinufft/types.h | 26 +-- include/cufinufft/utils.h | 16 +- src/cuda/3d/cufinufft3d.cu | 60 ++++++ test/cuda/CMakeLists.txt | 4 +- test/cuda/cufinufft3d_test.cu | 40 +++- test/cuda/cufinufft_makeplan_impl.cu | 10 - test/cuda/cufinufft_type3_test.cu | 273 ++++++++++++++++++++++++++- 9 files changed, 483 insertions(+), 76 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt 
index 61c408cbf..59be4b617 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,7 @@ project( VERSION 2.3.0 LANGUAGES C CXX) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # windows MSVC runtime flags policy cmake_policy(SET CMP0091 NEW) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 59c38f9cf..5ae71b365 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -40,6 +40,9 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, template int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); +template +int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); template int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntransf, T tol, @@ -85,6 +88,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->nf1 = 1; d_plan->nf2 = 1; d_plan->nf3 = 1; + d_plan->tol = tol; /* If a user has not supplied their own options, assign defaults for them. */ if (opts == nullptr) { // use default opts cufinufft_default_opts(&(d_plan->opts)); @@ -98,12 +102,21 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->ms = nmodes[0]; d_plan->mt = nmodes[1]; d_plan->mu = nmodes[2]; + } else { + d_plan->opts.gpu_method = 1; + d_plan->opts.gpu_spreadinterponly = 1; } int fftsign = (iflag >= 0) ? 1 : -1; d_plan->iflag = fftsign; d_plan->ntransf = ntransf; + int maxbatchsize = (opts != nullptr) ? opts->gpu_maxbatchsize : 0; + // TODO: check if this is the right heuristic + if (maxbatchsize == 0) // implies: use a heuristic. 
+ maxbatchsize = std::min(ntransf, 8); // heuristic from test codes + d_plan->maxbatchsize = maxbatchsize; + const auto stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; // Mult-GPU support: set the CUDA Device ID: @@ -143,25 +156,26 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->type = type; d_plan->spopts.spread_direction = d_plan->type; + cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); + if (ier = cudaGetLastError(), ier != cudaSuccess) { + goto finalize; + } + if (d_plan->opts.debug) { + printf("[cufinufft] bin size x: %d", d_plan->opts.gpu_binsizex); + if (dim > 1) printf(" bin size y: %d", d_plan->opts.gpu_binsizey); + if (dim > 2) printf(" bin size z: %d", d_plan->opts.gpu_binsizez); + printf("\n"); + // shared memory required for the spreader vs available shared memory + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, + device_id); + const auto mem_required = + shared_memory_required(dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + printf("[cufinufft] shared memory required for the spreader: %d\n", mem_required); + } + if (type == 1 || type == 2) { - cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); - if (ier = cudaGetLastError(), ier != cudaSuccess) { - goto finalize; - } - if (d_plan->opts.debug) { - printf("[cufinufft] bin size x: %d", d_plan->opts.gpu_binsizex); - if (dim > 1) printf(" bin size y: %d", d_plan->opts.gpu_binsizey); - if (dim > 2) printf(" bin size z: %d", d_plan->opts.gpu_binsizez); - printf("\n"); - // shared memory required for the spreader vs available shared memory - int shared_mem_per_block{}; - cudaDeviceGetAttribute(&shared_mem_per_block, - cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); - const auto mem_required = shared_memory_required( - dim, d_plan->spopts.nspread, 
d_plan->opts.gpu_binsizex, - d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); - printf("[cufinufft] shared memory required for the spreader: %d\n", mem_required); - } CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, d_plan->opts.gpu_obinsizex); @@ -209,11 +223,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->nf2 = nf2; d_plan->nf3 = nf3; - int maxbatchsize = opts ? opts->gpu_maxbatchsize : 0; - if (maxbatchsize == 0) // implies: use a heuristic. - maxbatchsize = std::min(ntransf, 8); // heuristic from test codes - d_plan->maxbatchsize = maxbatchsize; - using namespace cufinufft::memtransfer; switch (d_plan->dim) { case 1: { @@ -431,7 +440,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ return FINUFFT_ERR_NUM_NU_PTS_INVALID; } const auto stream = d_plan->stream; - d_plan->nk = N; + d_plan->N = N; d_plan->d_s = d_s; d_plan->d_t = d_t; d_plan->d_u = d_u; @@ -493,6 +502,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ } } d_plan->nf = d_plan->nf1 * d_plan->nf2 * d_plan->nf3; + // FIXME: MAX_NF might be too small... 
if (d_plan->nf * d_plan->opts.gpu_maxbatchsize > MAX_NF) { fprintf(stderr, @@ -511,7 +521,13 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ if (auto ier = checked_free(x); ier != cudaSuccess) return ier; return cudaMallocWrapper(&x, size, stream, pool); }; - + // FIXME: check the size of the allocs for the batch interface + if (checked_realloc(d_plan->fw, sizeof(cuda_complex) * d_plan->nf * + d_plan->maxbatchsize) != cudaSuccess) + goto finalize; + if (checked_realloc(d_plan->c_batch, + sizeof(cuda_complex) * M * d_plan->maxbatchsize) != cudaSuccess) + goto finalize; if (checked_realloc(d_plan->kx, sizeof(T) * M) != cudaSuccess) goto finalize; if (checked_realloc(d_plan->d_s, sizeof(T) * N) != cudaSuccess) goto finalize; if (d_plan->dim > 1) { @@ -652,12 +668,11 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ phi_hat3.data().get(), d_plan->spopts.nspread, stream)) goto finalize; - const auto is_c_finite = std::isfinite(d_plan->type3_params.C1) && - std::isfinite(d_plan->type3_params.C2) && + const auto is_c_finite = std::isfinite(d_plan->type3_params.C1) & + std::isfinite(d_plan->type3_params.C2) & std::isfinite(d_plan->type3_params.C3); - const auto is_c_nonzero = d_plan->type3_params.C1 != 0 || - d_plan->type3_params.C2 != 0 || - d_plan->type3_params.C3 != 0; + const auto is_c_nonzero = d_plan->type3_params.C1 != 0 | + d_plan->type3_params.C2 != 0 | d_plan->type3_params.C3 != 0; const auto phi_hat_iterator = thrust::make_zip_iterator(thrust::make_tuple( phi_hat1.begin(), dim > 1 ? 
phi_hat2.begin() : phi_hat1.begin(), @@ -688,8 +703,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, d_plan->deconv, d_plan->deconv, [c1, c2, c3, d1, d2, d3, imasign] __host__ __device__( - const thrust::tuple tuple, cuda_complex deconv) - -> cuda_complex { + const thrust::tuple tuple, + cuda_complex deconv) -> cuda_complex { // d2 and d3 are 0 if dim < 2 and dim < 3 const auto phase = c1 * (thrust::get<0>(tuple) + d1) + c2 * (thrust::get<1>(tuple) + d2) + @@ -698,9 +713,47 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ }); } // exiting the block frees the memory allocated for phi_hat1, phi_hat2, and phi_hat3 - // and the precomputed data for the fseries kernel + // and the precomputed data for the fseries kernel + // since GPU memory is expensive, we should free it as soon as possible + } + + using namespace cufinufft::memtransfer; + switch (d_plan->dim) { + case 1: { + if ((allocgpumem1d_plan(d_plan))) goto finalize; + } break; + case 2: { + if ((allocgpumem2d_plan(d_plan))) goto finalize; + } break; + case 3: { + if ((allocgpumem3d_plan(d_plan))) goto finalize; + } break; + } + if (cufinufft_setpts_12_impl(M, d_plan->kx, d_plan->ky, d_plan->kz, d_plan)) { + fprintf(stderr, "[%s] cufinufft_setpts_12_impl failed\n", __func__); + goto finalize; + } + { + int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3}; + cufinufft_opts t2opts = d_plan->opts; + t2opts.modeord = 0; + t2opts.debug = std::max(0, t2opts.debug - 1); + t2opts.gpu_spreadinterponly = 0; + // Safe to ignore the return value here? 
+ if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan); + // check that maxbatchsize is correct + if (cufinufft_makeplan_impl(2, dim, t2modes, d_plan->iflag, d_plan->maxbatchsize, + d_plan->tol, &d_plan->t2_plan, &t2opts)) { + fprintf(stderr, "[%s] inner t2 plan cufinufft_makeplan failed\n", __func__); + goto finalize; + } + if (cufinufft_setpts_12_impl(N, d_plan->d_s, d_plan->d_t, d_plan->d_u, + d_plan->t2_plan)) { + fprintf(stderr, "[%s] inner t2 plan cufinufft_setpts_12 failed\n", __func__); + goto finalize; + } + return 0; } - return 0; finalize: checked_free(d_plan->kx); checked_free(d_plan->d_s); @@ -710,6 +763,9 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ checked_free(d_plan->d_u); checked_free(d_plan->prephase); checked_free(d_plan->deconv); + checked_free(d_plan->fw_batch); + checked_free(d_plan->c_batch); + cufinufft_destroy_impl(d_plan->t2_plan); return FINUFFT_ERR_CUDA_FAILURE; } @@ -761,10 +817,7 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, case 3: { if (type == 1) ier = cufinufft3d1_exec(d_c, d_fk, d_plan); if (type == 2) ier = cufinufft3d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } + if (type == 3) ier = cufinufft3d3_exec(d_c, d_fk, d_plan); } break; } diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h index 9d7d27191..2920e7ae7 100644 --- a/include/cufinufft/types.h +++ b/include/cufinufft/types.h @@ -22,13 +22,6 @@ using cuda_complex = typename std::conditional< std::is_same::value, cuFloatComplex, typename std::conditional::value, cuDoubleComplex, void>::type>::type; -namespace { -template struct cufinuftt_type3_params_t { - T X1, C1, S1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale - T X2, C2, S2, D2, h2, gam2; // y - T X3, C3, S3, D3, h3, gam3; // z -}; -} // namespace template struct cufinufft_plan_t { cufinufft_opts opts; @@ -44,7 +37,8 @@ 
template struct cufinufft_plan_t { CUFINUFFT_BIGINT mt; CUFINUFFT_BIGINT mu; int ntransf; - int maxbatchsize; + int maxbatchsize; // TODO: this might be called batchsize non maxbatchsize (double + // check) int iflag; int supports_pools; @@ -58,18 +52,28 @@ template struct cufinufft_plan_t { T *kx; T *ky; T *kz; + cuda_complex *c_batch; + cuda_complex *fw_batch; + + // no allocs here cuda_complex *c; cuda_complex *fw; cuda_complex *fk; // Type 3 specific - cufinuftt_type3_params_t type3_params; - int nk; // number of NU freq pts (type 3 only) + struct { + T X1, C1, S1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale + T X2, C2, S2, D2, h2, gam2; // y + T X3, C3, S3, D3, h3, gam3; // z + } type3_params; + int N; // number of NU freq pts (type 3 only) CUFINUFFT_BIGINT nf; T *d_s; T *d_t; T *d_u; - + T tol; + // inner type 2 plan for type 3 + cufinufft_plan_t *t2_plan; // new allocs. FIXME: convert to device vectors to use resize cuda_complex *prephase; // pre-phase, for all input NU pts cuda_complex *deconv; // reciprocal of kernel FT, phase, all output NU pts diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index cbd7e59c6..3b4b8b524 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -112,6 +112,8 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { #define ALLOCA_SUPPORTED 0 #endif +#undef ALLOCA_SUPPORTED + #if defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 900 #define COMPUTE_CAPABILITY_90_OR_HIGHER 1 @@ -130,8 +132,8 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { */ template -static __forceinline__ __device__ void atomicAddComplexShared( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *address, + cuda_complex res) { const auto raw_address = reinterpret_cast(address); atomicAdd(raw_address, res.x); atomicAdd(raw_address + 1, res.y); @@ -143,8 +145,8 @@ static __forceinline__ 
__device__ void atomicAddComplexShared( * on shared memory are supported so we leverage them */ template -static __forceinline__ __device__ void atomicAddComplexGlobal( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *address, + cuda_complex res) { if constexpr ( std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { atomicAdd(address, res); @@ -204,12 +206,12 @@ auto set_nhg_type3(T S, T X, const cufinufft_opts &opts, Xsafe = 1.0; Ssafe = 1.0; } else - Xsafe = max(Xsafe, T(1) / S); + Xsafe = std::max(Xsafe, T(1) / S); else - Ssafe = max(Ssafe, T(1) / X); + Ssafe = std::max(Ssafe, T(1) / X); // use the safe X and S... T nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / M_PI + nss; - if (!isfinite(nfd)) nfd = 0.0; // use FLT to catch inf + if (!std::isfinite(nfd)) nfd = 0.0; // use FLT to catch inf auto nf = (int)nfd; // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); // catch too small nf, and nan or +-inf, otherwise spread fails... diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 5977e6d5f..2baa78e59 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -3,11 +3,14 @@ #include #include +#include #include #include #include +#include + using namespace cufinufft::deconvolve; using namespace cufinufft::spreadinterp; using std::min; @@ -113,6 +116,58 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, return 0; } +// TODO: in case data is centered, we could save GPU memory +template +int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) { + /* + 3D Type-3 NUFFT + + This function is called in "exec" stage (See ../cufinufft.cu). 
+ It includes (copied from doc in finufft library) + Step 0: pre-phase the input strengths + Step 1: spread data + Step 2: Type 3 NUFFT + Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff + + Marco Barbone 08/14/2024 + */ + int ier; + cuda_complex *d_cstart; + cuda_complex *d_fkstart; + cuda_complex *d_cbatch_start; + const auto stream = d_plan->stream; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N; + d_cbatch_start = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M; + d_plan->c = d_cbatch_start; + d_plan->fk = d_plan->fw; + // NOTE: fw might need to be set to 0 + // Step 0: pre-phase the input strengths + for (int i = 0; i < blksize; i++) { + thrust::transform(thrust::cuda::par.on(stream), d_plan->prephase, + d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M, + d_plan->c_batch + i * d_plan->M, + thrust::multiplies>()); + } + // Step 1: Spread + if ((ier = cuspread3d(d_plan, blksize))) return ier; + // Step 2: Type 3 NUFFT + d_plan->t2_plan->ntransf = blksize; + if ((ier = cufinufft3d2_exec(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier; + // Step 3: deconvolve + for (int i = 0; i < blksize; i++) { + thrust::transform(thrust::cuda::par.on(stream), d_plan->deconv, + d_plan->deconv + d_plan->N, d_fkstart + i * d_plan->N, + d_fkstart + i * d_plan->N, thrust::multiplies>()); + } + } + + return 0; +} + template int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); template int cufinufft3d1_exec(cuda_complex *d_c, @@ -124,3 +179,8 @@ template int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); + +template int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); +template int 
cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index bb2d8444a..af342944c 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -14,9 +14,7 @@ foreach(srcfile ${test_src}) set_target_properties( ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) - target_compile_options( - ${executable} PUBLIC $<$:$<$:-G - -maxrregcount 32>>) + target_compile_options(${executable} PUBLIC ${FINUFFT_CUDA_FLAGS}) message(STATUS "Adding test ${executable}" " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}") diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu index 67818c2b2..23f29f1a1 100644 --- a/test/cuda/cufinufft3d_test.cu +++ b/test/cuda/cufinufft3d_test.cu @@ -23,10 +23,10 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check std::cout << std::scientific << std::setprecision(3); int ier; - thrust::host_vector x(M), y(M), z(M); + thrust::host_vector x(M), y(M), z(M), s{}, t{}, u{}; thrust::host_vector> c(M), fk(N1 * N2 * N3); - thrust::device_vector d_x(M), d_y(M), d_z(M); + thrust::device_vector d_x(M), d_y(M), d_z(M), d_s{}, d_t{}, d_u{}; thrust::device_vector> d_c(M), d_fk(N1 * N2 * N3); std::default_random_engine eng(1); @@ -51,6 +51,22 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check fk[i].real(randm11()); fk[i].imag(randm11()); } + } else if (type == 3) { + for (int i = 0; i < M; i++) { + c[i].real(randm11()); + c[i].imag(randm11()); + } + s.resize(N1 * N2 * N3); + t.resize(N1 * N2 * N3); + u.resize(N1 * N2 * N3); + for (int i = 0; i < N1 * N2 * N3; i++) { + s[i] = M_PI * randm11(); + t[i] = M_PI * randm11(); + u[i] = M_PI * randm11(); + } + d_s = s; + d_t = t; + d_u = u; } else { std::cerr << "Invalid type " << type << " supplied\n"; return 1; @@ -64,6 +80,8 @@ int run_test(int 
method, int type, int N1, int N2, int N3, int M, T tol, T check d_c = c; else if (type == 2) d_fk = fk; + else if (type == 3) + d_c = c; cudaEvent_t start, stop; float milliseconds = 0; @@ -112,7 +130,8 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check cudaEventRecord(start); ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), - 0, nullptr, nullptr, nullptr, dplan); + N1 * N2 * N3, d_s.data().get(), d_t.data().get(), + d_u.data().get(), dplan); if (ier != 0) { printf("err: cufinufft_setpts\n"); return ier; @@ -149,6 +168,8 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check fk = d_fk; else if (type == 2) c = d_c; + else if (type == 3) + fk = d_fk; printf("[Method %d] %d NU pts to %d U pts in %.3g s:\t%.3g NU pts/s\n", opts.gpu_method, M, N1 * N2 * N3, totaltime / 1000, M / totaltime * 1000); @@ -184,6 +205,17 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); printf("[gpu ] one targ: rel err in c[%ld] is %.3g\n", (int64_t)jt, rel_error); + } else if (type == 3) { + + int jt = (N1 * N2 * N3) / 2; // check arbitrary choice of one targ pt + thrust::complex J = thrust::complex(0, iflag); + thrust::complex Ft = thrust::complex(0, 0); + + for (int j = 0; j < M; ++j) { + Ft += c[j] * exp(J * (x[j] * s[jt] + y[j] * t[jt] + z[j] * u[jt])); + } + rel_error = abs(Ft - fk[jt]) / infnorm(N1 * N2 * N3, (std::complex *)fk.data()); + printf("[gpu ] one mode: rel err in F[%d] is %.3g\n", jt, rel_error); } return std::isnan(rel_error) || rel_error > checktol; @@ -198,7 +230,7 @@ int main(int argc, char *argv[]) { " 1: nupts driven,\n" " 2: sub-problem, or\n" " 4: block gather.\n" - " type: Type of transform (1, 2)" + " type: Type of transform (1, 2, 3)" " N1, N2, N3: The size of the 3D array\n" " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" diff --git 
a/test/cuda/cufinufft_makeplan_impl.cu b/test/cuda/cufinufft_makeplan_impl.cu index 31fd695db..5f8d26666 100644 --- a/test/cuda/cufinufft_makeplan_impl.cu +++ b/test/cuda/cufinufft_makeplan_impl.cu @@ -148,13 +148,3 @@ int main() { test_type3(double_plan); return 0; } - -#ifdef __clang__ -#pragma clang diagnostic pop -#elif defined(__GNUC__) || defined(__GNUG__) -#pragma GCC diagnostic pop -#elif defined(__NVCC__) -#pragma diag_default 177 - D -#elif defined(_MSC_VER) -#pragma warning(pop) -#endif diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index 4ea69d786..3dc8900d8 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -1,13 +1,280 @@ #ifdef NDEBUG #undef NDEBUG -#include +#include #define NDEBUG #else -#include +#include #endif +#include + #include #include #include -int main() { return 0; } +#include + +#include +#include +#include + +// for now, once finufft is demacroized we can test float +using T = double; + +template bool equal(V *d_vec, T *cpu, const std::size_t size) { + // copy d_vec to cpu + thrust::host_vector h_vec(size); + // this implicitly converts cuda_complex to std::complex... 
which is fine, but it may + // cause issues use it with case + assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == + cudaSuccess); + for (std::size_t i = 0; i < size; ++i) { + if (h_vec[i] != cpu[i]) { + std::cout << " gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] + << std::endl; + return false; + } + } + return true; +} + +template +T infnorm(std::complex *a, std::complex *b, const std::size_t n) { + T err{0}, max_element{0}; + for (std::size_t m = 0; m < n; ++m) { + // std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; + err = std::max(err, std::abs(a[m] - b[m])); + max_element = std::max(std::max(std::abs(a[m]), std::abs(b[m])), max_element); + } + return err / max_element; +} +// max error divide by max element +// max ( abs(a-b)) / max(abs(a)) +// 10*(machine precision) +template +T relerrtwonorm(std::complex *a, std::complex *b, const std::size_t n) { + T err{0}, nrm{0}; + for (std::size_t m = 0; m < n; ++m) { + // std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; + nrm += std::real(std::conj(a[m]) * a[m]); + const auto diff = a[m] - b[m]; + err += std::real(std::conj(diff) * diff); + } + return std::sqrt(err / nrm); +} + +template +auto almost_equal(V *d_vec, + T *cpu, + const std::size_t size, + const contained tol = std::numeric_limits::epsilon()) { + // copy d_vec to cpu + std::vector h_vec(size); + // this implicitly converts cuda_complex to std::complex... 
which is fine, but it may + // cause issues use it with case + assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == + cudaSuccess); + // print h_vec and cpu + // for (std::size_t i = 0; i < size; ++i) { + // std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] + // << '\n'; + // } + std::cout << "infnorm: " << infnorm(h_vec.data(), cpu, size) << std::endl; + // compare the l2 norm of the difference between the two vectors + if (infnorm(h_vec.data(), cpu, size) < tol) { + return true; + } + return false; +} + +int main() { + // defaults. tests should shadow them to override + cufinufft_opts opts; + cufinufft_default_opts(&opts); + opts.debug = 2; + // opts.gpu_sort = 0; + finufft_opts fin_opts; + finufft_default_opts(&fin_opts); + fin_opts.debug = 2; + const int iflag = 1; + const double tol = 1e-8; + const int ntransf = 1; + const int dim = 3; + // int n_modes[3] = {5, 6, 4}; + const int N = 1023; + const int M = 10000; + const double bandwidth = 50.0; + + thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), s(N * ntransf), + t(N * ntransf), u(N * ntransf); + thrust::host_vector> c(M * ntransf), fk(N * ntransf); + + thrust::device_vector d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{}; + thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); + + std::default_random_engine eng(42); + std::uniform_real_distribution dist11(-1, 1); + auto rand_util_11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int64_t i = 0; i < M; i++) { + x[i] = M_PI * rand_util_11() + 4; // x in [-pi,pi) + y[i] = M_PI * rand_util_11() + 4; + z[i] = M_PI * rand_util_11() + 4; + } + for (int64_t i = 0; i < N; i++) { + s[i] = M_PI * rand_util_11() * bandwidth + 8; // shifted so D1 is 8 + t[i] = M_PI * rand_util_11() * bandwidth + 8; // shifted so D2 is 8 + u[i] = M_PI * rand_util_11() * bandwidth + 8; // shifted so D3 is 8 + } + + const double deconv_tol = std::numeric_limits::epsilon() * bandwidth * 
10000; + + for (int64_t i = M; i < M * ntransf; ++i) { + int64_t j = i % M; + x[i] = x[j]; + y[i] = y[j]; + z[i] = z[j]; + } + for (int64_t i = M; i < N * ntransf; ++i) { + int64_t j = i % N; + s[i] = s[j]; + t[i] = t[j]; + u[i] = u[j]; + } + + // fill them all + + for (int i = 0; i < N * ntransf; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); + } + // copy x, y, z, s, t, u to device d_x, d_y, d_z, d_s, d_t, d_u + d_x = x; + d_y = y; + d_z = z; + d_s = s; + d_t = t; + d_u = u; + cudaDeviceSynchronize(); + + const auto cpu_planer = + [iflag, tol, ntransf, dim, M, N, &x, &y, &z, &s, &t, &u, &fin_opts]( + const auto type) { + finufft_plan_s *plan{nullptr}; + assert(finufft_makeplan( + type, dim, nullptr, iflag, ntransf, tol, &plan, &fin_opts) == 0); + assert(finufft_setpts(plan, M, x.data(), y.data(), z.data(), N, s.data(), + t.data(), u.data()) == 0); + return plan; + }; + const auto test_type1 = [iflag, tol, ntransf, dim, cpu_planer, &opts](auto *plan) { + // plan is a pointer to a type that contains real_t + using T = typename std::remove_pointer::type::real_t; + const int type = 1; + + assert(cufinufft_makeplan_impl(type, dim, nullptr, iflag, ntransf, T(tol), &plan, + &opts) == 0); + const auto cpu_plan = cpu_planer(type); + cudaDeviceSynchronize(); + assert(cufinufft_destroy_impl(plan) == 0); + assert(finufft_destroy(cpu_plan) == 0); + plan = nullptr; + }; + auto test_type2 = [iflag, tol, ntransf, dim, cpu_planer, &opts](auto plan) { + // plan is a pointer to a type that contains real_t + using T = typename std::remove_pointer::type::real_t; + const int type = 2; + assert(cufinufft_makeplan_impl(type, dim, nullptr, iflag, ntransf, T(tol), &plan, + &opts) == 0); + const auto cpu_plan = cpu_planer(type); + cudaDeviceSynchronize(); + assert(cufinufft_destroy_impl(plan) == 0); + cudaDeviceSynchronize(); + assert(finufft_destroy(cpu_plan) == 0); + plan = nullptr; + }; + auto test_type3 = [iflag, + tol, + ntransf, + dim, + cpu_planer, + deconv_tol, + M, + N, 
+ &d_x, + &d_y, + &d_z, + &d_s, + &d_t, + &d_u, + &c, + &d_c, + &fk, + &d_fk, + &opts](auto plan) { + // plan is a pointer to a type that contains real_t + using T = typename std::remove_pointer::type::real_t; + const int type = 3; + const auto cpu_plan = cpu_planer(type); + assert(cufinufft_makeplan_impl(type, dim, nullptr, iflag, ntransf, T(tol), &plan, + &opts) == 0); + assert(cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), + d_z.data().get(), N, d_s.data().get(), + d_t.data().get(), d_u.data().get(), plan) == 0); + cudaDeviceSynchronize(); + assert(plan->type3_params.X1 == cpu_plan->t3P.X1); + assert(plan->type3_params.X2 == cpu_plan->t3P.X2); + assert(plan->type3_params.X3 == cpu_plan->t3P.X3); + assert(plan->type3_params.C1 == cpu_plan->t3P.C1); + assert(plan->type3_params.C2 == cpu_plan->t3P.C2); + assert(plan->type3_params.C3 == cpu_plan->t3P.C3); + assert(plan->type3_params.D1 == cpu_plan->t3P.D1); + assert(plan->type3_params.D2 == cpu_plan->t3P.D2); + assert(plan->type3_params.D3 == cpu_plan->t3P.D3); + assert(plan->type3_params.gam1 == cpu_plan->t3P.gam1); + assert(plan->type3_params.gam2 == cpu_plan->t3P.gam2); + assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); + assert(plan->nf1 == cpu_plan->nf1); + assert(plan->nf2 == cpu_plan->nf2); + assert(plan->nf3 == cpu_plan->nf3); + assert(equal(plan->kx, cpu_plan->X, M)); + assert(equal(plan->ky, cpu_plan->Y, M)); + assert(equal(plan->kz, cpu_plan->Z, M)); + assert(equal(plan->d_s, cpu_plan->Sp, N)); + assert(equal(plan->d_t, cpu_plan->Tp, N)); + assert(equal(plan->d_u, cpu_plan->Up, N)); + // NOTE:seems with infnorm we are getting at most 11 digits of precision + std::cout << "prephase :\n"; + assert(almost_equal(plan->prephase, cpu_plan->prephase, M, 1e-10)); + std::cout << "deconv :\n"; + assert(almost_equal(plan->deconv, cpu_plan->deconv, N, deconv_tol)); + + for (int i = 0; i < M; i++) { + c[i].real(randm11()); + c[i].imag(randm11()); + } + d_c = c; + for (int i = 0; i < N; i++) { + 
fk[i] = {-100, -100}; + } + d_fk = fk; + cufinufft_execute_impl( + (cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), plan); + finufft_execute(cpu_plan, (std::complex *)c.data(), (std::complex *)fk.data()); + assert(almost_equal(d_fk.data().get(), fk.data(), N, tol)); + assert(cufinufft_destroy_impl(plan) == 0); + assert(finufft_destroy(cpu_plan) == 0); + plan = nullptr; + cudaDeviceSynchronize(); + }; + // testing correctness of the plan creation + // cufinufft_plan_t *single_plan{nullptr}; + cufinufft_plan_t *double_plan{nullptr}; + // test_type1(double_plan); + // test_type2(double_plan); + test_type3(double_plan); + return 0; +} From 53a7c63b2f71fab70ca03c14fdb6890f85233cad Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 15 Aug 2024 18:25:56 -0400 Subject: [PATCH 41/68] First working version, Horner breaks --- include/cufinufft/impl.h | 38 +++++++++++++++++---------- src/cuda/common.cu | 11 ++++---- test/cuda/cufinufft_type3_test.cu | 43 ++++++++++++++++--------------- 3 files changed, 52 insertions(+), 40 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 5ae71b365..ece94c53f 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -156,6 +156,12 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->type = type; d_plan->spopts.spread_direction = d_plan->type; + if (d_plan->opts.debug) { + // print the spreader options + printf("[cufinufft] spreader options:\n"); + printf("[cufinufft] nspread: %d\n", d_plan->spopts.nspread); + } + cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); if (ier = cudaGetLastError(), ier != cudaSuccess) { goto finalize; @@ -674,18 +680,22 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ const auto is_c_nonzero = d_plan->type3_params.C1 != 0 | d_plan->type3_params.C2 != 0 | d_plan->type3_params.C3 != 0; - const auto phi_hat_iterator = 
thrust::make_zip_iterator(thrust::make_tuple( - phi_hat1.begin(), dim > 1 ? phi_hat2.begin() : phi_hat1.begin(), - dim > 2 ? phi_hat3.begin() : phi_hat1.begin())); - thrust::transform(thrust::cuda::par.on(stream), phi_hat_iterator, - phi_hat_iterator + N, d_plan->deconv, - [dim] __host__ __device__( - const thrust::tuple &tuple) -> cuda_complex { - auto phiHat = thrust::get<0>(tuple); - phiHat *= (dim > 1) ? thrust::get<1>(tuple) : T(1); - phiHat *= (dim > 2) ? thrust::get<2>(tuple) : T(1); - return cuda_complex{T(1) / phiHat, T(0)}; - }); + const auto phi_hat_iterator = thrust::make_zip_iterator( + thrust::make_tuple(phi_hat1.begin(), + // to avoid out of bounds access, use phi_hat1 if dim < 2 + dim > 1 ? phi_hat2.begin() : phi_hat1.begin(), + // to avoid out of bounds access, use phi_hat1 if dim < 3 + dim > 2 ? phi_hat3.begin() : phi_hat1.begin())); + thrust::transform( + thrust::cuda::par.on(stream), phi_hat_iterator, phi_hat_iterator + N, + d_plan->deconv, + [dim] __host__ __device__(const thrust::tuple tuple) -> cuda_complex { + auto phiHat = thrust::get<0>(tuple); + // in case dim < 2 or dim < 3, multiply by 1 + phiHat *= (dim > 1) ? thrust::get<1>(tuple) : T(1); + phiHat *= (dim > 2) ? 
thrust::get<2>(tuple) : T(1); + return {T(1) / phiHat, T(0)}; + }); if (is_c_finite && is_c_nonzero) { const auto c1 = d_plan->type3_params.C1; @@ -703,8 +713,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, d_plan->deconv, d_plan->deconv, [c1, c2, c3, d1, d2, d3, imasign] __host__ __device__( - const thrust::tuple tuple, - cuda_complex deconv) -> cuda_complex { + const thrust::tuple tuple, cuda_complex deconv) + -> cuda_complex { // d2 and d3 are 0 if dim < 2 and dim < 3 const auto phase = c1 * (thrust::get<0>(tuple) + d1) + c2 * (thrust::get<1>(tuple) + d2) + diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 453df96b0..a46a94048 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -50,7 +50,7 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, i += blockDim.x * gridDim.x) { T x = 0.0; for (int n = 0; n < q; n++) { - x += ft[n] * T(2) * cos(T(i) * at[n]); + x += ft[n] * T(2) * std::cos(T(i) * at[n]); } oarr[i] = x; } @@ -61,7 +61,7 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int ns) { T J2 = ns / 2.0; - int q = (int)(2 + 3.0 * J2); + int q = (int)(2 + 2.0 * J2); int nf; T *at = a + threadIdx.y * MAX_NQUAD; T *ft = f + threadIdx.y * MAX_NQUAD; @@ -83,7 +83,7 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T i += blockDim.x * gridDim.x) { T x = 0.0; for (int n = 0; n < q; n++) { - x += ft[n] * T(2) * cos(k[i] * at[n]); + x += ft[n] * T(2) * std::cos(k[i] * at[n]); } oarr[i] = x; } @@ -123,7 +123,7 @@ int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, T Melody Shih 2/20/22 */ { - int nout = max(max(nf1 / 2 + 1, nf2 / 2 + 1), nf3 / 2 + 1); + int nout = max(max(nf1, nf2), nf3); dim3 threadsPerBlock(16, dim); dim3 numBlocks((nout + 16 - 1) / 16, 1); @@ -211,7 +211,8 @@ 
void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, finufft_spread_opts opts) { T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD + int q = (int)(2 + (phase_winding ? 3.0 : 2.0) * J2); // not sure why so large? cannot + // exceed MAX_NQUAD double z[2 * MAX_NQUAD]; double w[2 * MAX_NQUAD]; diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index 3dc8900d8..f67f45df3 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -75,13 +75,13 @@ auto almost_equal(V *d_vec, assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess); // print h_vec and cpu - // for (std::size_t i = 0; i < size; ++i) { - // std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] - // << '\n'; - // } - std::cout << "infnorm: " << infnorm(h_vec.data(), cpu, size) << std::endl; + // for (std::size_t i = 0; i < size; ++i) { + // std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] + // << '\n'; + // } + std::cout << "relerrtwonorm: " << infnorm(h_vec.data(), cpu, size) << std::endl; // compare the l2 norm of the difference between the two vectors - if (infnorm(h_vec.data(), cpu, size) < tol) { + if (relerrtwonorm(h_vec.data(), cpu, size) < tol) { return true; } return false; @@ -95,15 +95,15 @@ int main() { // opts.gpu_sort = 0; finufft_opts fin_opts; finufft_default_opts(&fin_opts); - fin_opts.debug = 2; - const int iflag = 1; - const double tol = 1e-8; - const int ntransf = 1; - const int dim = 3; - // int n_modes[3] = {5, 6, 4}; - const int N = 1023; - const int M = 10000; - const double bandwidth = 50.0; + fin_opts.debug = 2; + fin_opts.spread_kerevalmeth = 1; + const int iflag = 1; + const int ntransf = 1; + const int dim = 3; + const double tol = 1e-9; + const 
int N = 1023; + const int M = 1000; + const double bandwidth = 50.0; thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), s(N * ntransf), t(N * ntransf), u(N * ntransf); @@ -130,7 +130,7 @@ int main() { u[i] = M_PI * rand_util_11() * bandwidth + 8; // shifted so D3 is 8 } - const double deconv_tol = std::numeric_limits::epsilon() * bandwidth * 10000; + const double deconv_tol = std::numeric_limits::epsilon() * bandwidth * 100; for (int64_t i = M; i < M * ntransf; ++i) { int64_t j = i % M; @@ -147,10 +147,10 @@ int main() { // fill them all - for (int i = 0; i < N * ntransf; i++) { - fk[i].real(randm11()); - fk[i].imag(randm11()); - } + // for (int i = 0; i < N * ntransf; i++) { + // fk[i].real(randm11()); + // fk[i].imag(randm11()); + // } // copy x, y, z, s, t, u to device d_x, d_y, d_z, d_s, d_t, d_u d_x = x; d_y = y; @@ -248,7 +248,8 @@ int main() { assert(equal(plan->d_u, cpu_plan->Up, N)); // NOTE:seems with infnorm we are getting at most 11 digits of precision std::cout << "prephase :\n"; - assert(almost_equal(plan->prephase, cpu_plan->prephase, M, 1e-10)); + assert(almost_equal( + plan->prephase, cpu_plan->prephase, M, std::numeric_limits::epsilon() * 100)); std::cout << "deconv :\n"; assert(almost_equal(plan->deconv, cpu_plan->deconv, N, deconv_tol)); From 9f517e3a97cec1fd9c04c9d6656d32e02bc15e3f Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 16 Aug 2024 17:10:40 -0400 Subject: [PATCH 42/68] Type 3 working --- CMakeLists.txt | 2 +- include/cufinufft/impl.h | 6 +-- src/cuda/3d/cufinufft3d.cu | 17 +++++---- test/cuda/cufinufft_setpts.cu | 16 ++++---- test/cuda/cufinufft_type3_test.cu | 61 ++++++++++++++++++------------- 5 files changed, 58 insertions(+), 44 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 59be4b617..957fa14db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,7 +33,7 @@ set(FINUFFT_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA architectures to buil # if FINUFFT_USE_CPU is OFF, the following options 
are ignored set(FINUFFT_ARCH_FLAGS "native" CACHE STRING "Compiler flags for specifying target architecture, defaults to -march=native") # sphinx tag (don't remove): @cmake_opts_end -cmake_dependent_option(FINUFFT_ENABLE_INSTALL "Disable installation in the case of python builds" OFF "FINUFFT_BUILD_PYTHON" OFF) +cmake_dependent_option(FINUFFT_ENABLE_INSTALL "Disable installation in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF) cmake_dependent_option(FINUFFT_STATIC_LINKING "Disable static libraries in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF) cmake_dependent_option(FINUFFT_SHARED_LINKING "Shared should be the opposite of static linking" ON "NOT FINUFFT_STATIC_LINKING" OFF) # cmake-format: on diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index ece94c53f..c936199f9 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -713,8 +713,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, d_plan->deconv, d_plan->deconv, [c1, c2, c3, d1, d2, d3, imasign] __host__ __device__( - const thrust::tuple tuple, cuda_complex deconv) - -> cuda_complex { + const thrust::tuple tuple, + cuda_complex deconv) -> cuda_complex { // d2 and d3 are 0 if dim < 2 and dim < 3 const auto phase = c1 * (thrust::get<0>(tuple) + d1) + c2 * (thrust::get<1>(tuple) + d2) + @@ -747,7 +747,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3}; cufinufft_opts t2opts = d_plan->opts; t2opts.modeord = 0; - t2opts.debug = std::max(0, t2opts.debug - 1); + t2opts.debug = std::max(0, t2opts.debug); t2opts.gpu_spreadinterponly = 0; // Safe to ignore the return value here? 
if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan); diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 2baa78e59..e7123084b 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -135,36 +135,39 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, int ier; cuda_complex *d_cstart; cuda_complex *d_fkstart; - cuda_complex *d_cbatch_start; const auto stream = d_plan->stream; for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N; - d_cbatch_start = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M; - d_plan->c = d_cbatch_start; - d_plan->fk = d_plan->fw; + // setting input for spreader + d_plan->c = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M; + // setting output for spreader + d_plan->fk = d_plan->fw; // NOTE: fw might need to be set to 0 // Step 0: pre-phase the input strengths for (int i = 0; i < blksize; i++) { thrust::transform(thrust::cuda::par.on(stream), d_plan->prephase, d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M, - d_plan->c_batch + i * d_plan->M, - thrust::multiplies>()); + d_plan->c + i * d_plan->M, thrust::multiplies>()); } // Step 1: Spread if ((ier = cuspread3d(d_plan, blksize))) return ier; + // now d_plan->fk = d_plan->fw contains the spread values // Step 2: Type 3 NUFFT + // type 2 goes from fk to c + // saving the results directly in the user output array d_fk + // it needs to do blksize transforms d_plan->t2_plan->ntransf = blksize; if ((ier = cufinufft3d2_exec(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier; // Step 3: deconvolve + // now we need to d_fk = d_fk*d_plan->deconv for (int i = 0; i < blksize; i++) { thrust::transform(thrust::cuda::par.on(stream), d_plan->deconv, d_plan->deconv + d_plan->N, d_fkstart + i * d_plan->N, d_fkstart + i * 
d_plan->N, thrust::multiplies>()); } } - return 0; } diff --git a/test/cuda/cufinufft_setpts.cu b/test/cuda/cufinufft_setpts.cu index b071275c0..efe0105b7 100644 --- a/test/cuda/cufinufft_setpts.cu +++ b/test/cuda/cufinufft_setpts.cu @@ -19,7 +19,7 @@ #include // for now, once finufft is demacroized we can test float -using T = double; +using test_t = double; template bool equal(V *d_vec, T *cpu, const std::size_t size) { // copy d_vec to cpu @@ -98,15 +98,15 @@ int main() { const int N = n_modes[0] * n_modes[1] * n_modes[2]; const int M = 100; - thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), s(N * ntransf), - t(N * ntransf), u(N * ntransf); - thrust::host_vector> c(M * ntransf), fk(N * ntransf); + thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), + s(N * ntransf), t(N * ntransf), u(N * ntransf); + thrust::host_vector> c(M * ntransf), fk(N * ntransf); - thrust::device_vector d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{}; - thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); + thrust::device_vector d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{}; + thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); std::default_random_engine eng(42); - std::uniform_real_distribution dist11(-1, 1); + std::uniform_real_distribution dist11(-1, 1); auto rand_util_11 = [&eng, &dist11]() { return dist11(eng); }; @@ -237,7 +237,7 @@ int main() { }; // testing correctness of the plan creation // cufinufft_plan_t *single_plan{nullptr}; - cufinufft_plan_t *double_plan{nullptr}; + cufinufft_plan_t *double_plan{nullptr}; // test_type1(double_plan); // test_type2(double_plan); test_type3(double_plan); diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index f67f45df3..b025d29bd 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -18,9 +18,6 @@ #include #include -// for now, once finufft is demacroized we can test float -using T = double; - template bool equal(V *d_vec, T *cpu, const 
std::size_t size) { // copy d_vec to cpu thrust::host_vector h_vec(size); @@ -75,10 +72,10 @@ auto almost_equal(V *d_vec, assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess); // print h_vec and cpu - // for (std::size_t i = 0; i < size; ++i) { - // std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] - // << '\n'; - // } + for (std::size_t i = 0; i < size; ++i) { + std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] + << '\n'; + } std::cout << "relerrtwonorm: " << infnorm(h_vec.data(), cpu, size) << std::endl; // compare the l2 norm of the difference between the two vectors if (relerrtwonorm(h_vec.data(), cpu, size) < tol) { @@ -88,32 +85,39 @@ auto almost_equal(V *d_vec, } int main() { + // for now, once finufft is demacroized we can test float + using test_t = double; + // defaults. tests should shadow them to override cufinufft_opts opts; cufinufft_default_opts(&opts); - opts.debug = 2; + opts.debug = 2; + opts.upsampfac = 1.25; + opts.gpu_kerevalmeth = 1; // opts.gpu_sort = 0; finufft_opts fin_opts; finufft_default_opts(&fin_opts); fin_opts.debug = 2; fin_opts.spread_kerevalmeth = 1; + fin_opts.upsampfac = 1.25; const int iflag = 1; const int ntransf = 1; const int dim = 3; const double tol = 1e-9; - const int N = 1023; + const int n_modes[] = {10, 5, 3}; + const int N = n_modes[0] * n_modes[1] * n_modes[2]; const int M = 1000; const double bandwidth = 50.0; - thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), s(N * ntransf), - t(N * ntransf), u(N * ntransf); - thrust::host_vector> c(M * ntransf), fk(N * ntransf); + thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), + s(N * ntransf), t(N * ntransf), u(N * ntransf); + thrust::host_vector> c(M * ntransf), fk(N * ntransf); - thrust::device_vector d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{}; - thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); + thrust::device_vector d_x{}, 
d_y{}, d_z{}, d_s{}, d_t{}, d_u{}; + thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); std::default_random_engine eng(42); - std::uniform_real_distribution dist11(-1, 1); + std::uniform_real_distribution dist11(-1, 1); auto rand_util_11 = [&eng, &dist11]() { return dist11(eng); }; @@ -161,11 +165,12 @@ int main() { cudaDeviceSynchronize(); const auto cpu_planer = - [iflag, tol, ntransf, dim, M, N, &x, &y, &z, &s, &t, &u, &fin_opts]( + [iflag, tol, ntransf, dim, M, N, n_modes, &x, &y, &z, &s, &t, &u, &fin_opts]( const auto type) { finufft_plan_s *plan{nullptr}; - assert(finufft_makeplan( - type, dim, nullptr, iflag, ntransf, tol, &plan, &fin_opts) == 0); + std::int64_t nl[] = {n_modes[0], n_modes[1], n_modes[2]}; + assert( + finufft_makeplan(type, dim, nl, iflag, ntransf, tol, &plan, &fin_opts) == 0); assert(finufft_setpts(plan, M, x.data(), y.data(), z.data(), N, s.data(), t.data(), u.data()) == 0); return plan; @@ -204,6 +209,7 @@ int main() { deconv_tol, M, N, + n_modes, &d_x, &d_y, &d_z, @@ -219,8 +225,8 @@ int main() { using T = typename std::remove_pointer::type::real_t; const int type = 3; const auto cpu_plan = cpu_planer(type); - assert(cufinufft_makeplan_impl(type, dim, nullptr, iflag, ntransf, T(tol), &plan, - &opts) == 0); + assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), + &plan, &opts) == 0); assert(cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), N, d_s.data().get(), d_t.data().get(), d_u.data().get(), plan) == 0); @@ -245,6 +251,11 @@ int main() { assert(equal(plan->kz, cpu_plan->Z, M)); assert(equal(plan->d_s, cpu_plan->Sp, N)); assert(equal(plan->d_t, cpu_plan->Tp, N)); + assert(plan->spopts.nspread == cpu_plan->spopts.nspread); + assert(plan->spopts.upsampfac == cpu_plan->spopts.upsampfac); + assert(plan->spopts.ES_beta == cpu_plan->spopts.ES_beta); + assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth); + assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); 
assert(equal(plan->d_u, cpu_plan->Up, N)); // NOTE:seems with infnorm we are getting at most 11 digits of precision std::cout << "prephase :\n"; @@ -258,10 +269,10 @@ int main() { c[i].imag(randm11()); } d_c = c; - for (int i = 0; i < N; i++) { - fk[i] = {-100, -100}; - } - d_fk = fk; + // for (int i = 0; i < N; i++) { + // fk[i] = {randm11(), randm11()}; + // } + // d_fk = fk; cufinufft_execute_impl( (cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), plan); finufft_execute(cpu_plan, (std::complex *)c.data(), (std::complex *)fk.data()); @@ -273,7 +284,7 @@ int main() { }; // testing correctness of the plan creation // cufinufft_plan_t *single_plan{nullptr}; - cufinufft_plan_t *double_plan{nullptr}; + cufinufft_plan_t *double_plan{nullptr}; // test_type1(double_plan); // test_type2(double_plan); test_type3(double_plan); From 096cf1ee368c1ef67f63292e36b05d021d60bfc4 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 20 Aug 2024 13:00:14 -0400 Subject: [PATCH 43/68] added 1D&2d type 3 --- include/cufinufft/impl.h | 17 ++++----- src/cuda/1d/cufinufft1d.cu | 62 ++++++++++++++++++++++++++++++ src/cuda/2d/cufinufft2d.cu | 63 +++++++++++++++++++++++++++++++ test/cuda/CMakeLists.txt | 12 ++++++ test/cuda/cufinufft1d_test.cu | 41 ++++++++++++++++---- test/cuda/cufinufft2d_test.cu | 37 +++++++++++++++--- test/cuda/cufinufft_type3_test.cu | 38 +++++++++---------- 7 files changed, 230 insertions(+), 40 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index c936199f9..89a58faa6 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -24,7 +24,9 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, template int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); - +template +int cufinufft1d3_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); // 2d template int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, @@ -33,6 +35,9 @@ template int 
cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); +template +int cufinufft2d3_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); // 3d template int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, @@ -811,18 +816,12 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, case 1: { if (type == 1) ier = cufinufft1d1_exec(d_c, d_fk, d_plan); if (type == 2) ier = cufinufft1d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } + if (type == 3) ier = cufinufft1d3_exec(d_c, d_fk, d_plan); } break; case 2: { if (type == 1) ier = cufinufft2d1_exec(d_c, d_fk, d_plan); if (type == 2) ier = cufinufft2d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } + if (type == 3) ier = cufinufft2d3_exec(d_c, d_fk, d_plan); } break; case 3: { if (type == 1) ier = cufinufft3d1_exec(d_c, d_fk, d_plan); diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index a17b6f044..d94dc3cea 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -1,6 +1,9 @@ #include #include +#include + #include +#include #include #include @@ -116,6 +119,60 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, return 0; } +template +int cufinufft1d3_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) { + /* + 3D Type-3 NUFFT + + This function is called in "exec" stage (See ../cufinufft.cu). 
+ It includes (copied from doc in finufft library) + Step 0: pre-phase the input strengths + Step 1: spread data + Step 2: Type 3 NUFFT + Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff + + Marco Barbone 08/14/2024 + */ + int ier; + cuda_complex *d_cstart; + cuda_complex *d_fkstart; + const auto stream = d_plan->stream; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N; + // setting input for spreader + d_plan->c = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M; + // setting output for spreader + d_plan->fk = d_plan->fw; + // NOTE: fw might need to be set to 0 + // Step 0: pre-phase the input strengths + for (int i = 0; i < blksize; i++) { + thrust::transform(thrust::cuda::par.on(stream), d_plan->prephase, + d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M, + d_plan->c + i * d_plan->M, thrust::multiplies>()); + } + // Step 1: Spread + if ((ier = cuspread1d(d_plan, blksize))) return ier; + // now d_plan->fk = d_plan->fw contains the spread values + // Step 2: Type 3 NUFFT + // type 2 goes from fk to c + // saving the results directly in the user output array d_fk + // it needs to do blksize transforms + d_plan->t2_plan->ntransf = blksize; + if ((ier = cufinufft1d2_exec(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier; + // Step 3: deconvolve + // now we need to d_fk = d_fk*d_plan->deconv + for (int i = 0; i < blksize; i++) { + thrust::transform(thrust::cuda::par.on(stream), d_plan->deconv, + d_plan->deconv + d_plan->N, d_fkstart + i * d_plan->N, + d_fkstart + i * d_plan->N, thrust::multiplies>()); + } + } + return 0; +} + template int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); template int cufinufft1d1_exec(cuda_complex *d_c, @@ -126,3 +183,8 @@ template int 
cufinufft1d2_exec(cuda_complex *d_c, cuda_complex(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); +template int cufinufft1d3_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); +template int cufinufft1d3_exec(cuda_complex *d_c, + cuda_complex *d_fk, + cufinufft_plan_t *d_plan); diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index f7f7b1559..9bcd8d370 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -2,7 +2,11 @@ #include #include #include + +#include + #include +#include #include #include @@ -115,6 +119,60 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, return 0; } +template +int cufinufft2d3_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) { + /* + 3D Type-3 NUFFT + + This function is called in "exec" stage (See ../cufinufft.cu). + It includes (copied from doc in finufft library) + Step 0: pre-phase the input strengths + Step 1: spread data + Step 2: Type 3 NUFFT + Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff + + Marco Barbone 08/14/2024 + */ + int ier; + cuda_complex *d_cstart; + cuda_complex *d_fkstart; + const auto stream = d_plan->stream; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N; + // setting input for spreader + d_plan->c = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M; + // setting output for spreader + d_plan->fk = d_plan->fw; + // NOTE: fw might need to be set to 0 + // Step 0: pre-phase the input strengths + for (int i = 0; i < blksize; i++) { + thrust::transform(thrust::cuda::par.on(stream), d_plan->prephase, + d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M, + d_plan->c + i * d_plan->M, thrust::multiplies>()); + } + // Step 1: Spread + if ((ier = 
cuspread2d(d_plan, blksize))) return ier; + // now d_plan->fk = d_plan->fw contains the spread values + // Step 2: Type 3 NUFFT + // type 2 goes from fk to c + // saving the results directly in the user output array d_fk + // it needs to do blksize transforms + d_plan->t2_plan->ntransf = blksize; + if ((ier = cufinufft2d2_exec(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier; + // Step 3: deconvolve + // now we need to d_fk = d_fk*d_plan->deconv + for (int i = 0; i < blksize; i++) { + thrust::transform(thrust::cuda::par.on(stream), d_plan->deconv, + d_plan->deconv + d_plan->N, d_fkstart + i * d_plan->N, + d_fkstart + i * d_plan->N, thrust::multiplies>()); + } + } + return 0; +} + template int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); template int cufinufft2d1_exec(cuda_complex *d_c, @@ -125,3 +183,8 @@ template int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); +template int cufinufft2d3_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); +template int cufinufft2d3_exec(cuda_complex *d_c, + cuda_complex *d_fk, + cufinufft_plan_t *d_plan); diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index af342944c..c0bc9750f 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -41,6 +41,14 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d2_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + + add_test(NAME cufinufft2d3_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d1many_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -78,6 +86,10 @@ function(add_tests PREC REQ_TOL 
CHECK_TOL UPSAMP) add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + + add_test(NAME cufinufft3d3_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 1 3 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) endfunction() add_test(NAME cufinufft_public_api COMMAND public_api_test) diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu index 38313786a..0a23e5625 100644 --- a/test/cuda/cufinufft1d_test.cu +++ b/test/cuda/cufinufft1d_test.cu @@ -22,11 +22,11 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, std::cout << std::scientific << std::setprecision(3); int ier; - thrust::host_vector x(M); + thrust::host_vector x(M), s{}; thrust::host_vector> c(M); thrust::host_vector> fk(N1); - thrust::device_vector d_x(M); + thrust::device_vector d_x(M), d_s{}; thrust::device_vector> d_c(M); thrust::device_vector> d_fk(N1); @@ -40,6 +40,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, for (int i = 0; i < M; i++) { x[i] = M_PI * randm11(); // x in [-pi,pi) } + if (type == 1) { for (int i = 0; i < M; i++) { c[i].real(randm11()); @@ -50,6 +51,16 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, fk[i].real(randm11()); fk[i].imag(randm11()); } + } else if (type == 3) { + for (int i = 0; i < M; i++) { + c[i].real(randm11()); + c[i].imag(randm11()); + } + s.resize(N1); + for (int i = 0; i < N1; i++) { + s[i] = M_PI * randm11(); + } + d_s = s; } else { std::cerr << "Invalid type " << type << " supplied\n"; return 1; @@ -60,6 +71,8 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, d_c = c; else if (type == 2) d_fk = fk; + else if (type == 3) + d_c = c; cudaEvent_t start, stop; float milliseconds = 0; @@ -107,8 +120,8 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, printf("[time ] cufinufft plan:\t\t %.3g s\n", 
milliseconds / 1000); cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), NULL, NULL, 0, NULL, NULL, NULL, - dplan); + ier = cufinufft_setpts_impl(M, d_x.data().get(), NULL, NULL, 0, d_s.data().get(), + NULL, NULL, dplan); if (ier != 0) { printf("err: cufinufft_setpts\n"); @@ -153,9 +166,15 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, opts.gpu_method, N1, M, totaltime / 1000, M / totaltime * 1000); printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); + if (type == 1) + fk = d_fk; + else if (type == 2) + c = d_c; + else if (type == 3) + fk = d_fk; + T rel_error = std::numeric_limits::max(); if (type == 1) { - fk = d_fk; int nt1 = 0.37 * N1; // choose some mode index to check thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); for (int j = 0; j < M; ++j) Ft += c[j] * exp(J * (nt1 * x[j])); // crude direct @@ -164,8 +183,6 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); printf("[gpu ] one mode: rel err in F[%d] is %.3g\n", nt1, rel_error); } else if (type == 2) { - c = d_c; - int jt = M / 2; // check arbitrary choice of one targ pt thrust::complex J = thrust::complex(0, iflag); thrust::complex ct = thrust::complex(0, 0); @@ -174,6 +191,16 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, ct += fk[m++] * exp(J * (m1 * x[jt])); // crude direct rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); printf("[gpu ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error); + } else if (type == 3) { + int jt = (N1) / 2; // check arbitrary choice of one targ pt + thrust::complex J = thrust::complex(0, iflag); + thrust::complex Ft = thrust::complex(0, 0); + + for (int j = 0; j < M; ++j) { + Ft += c[j] * exp(J * (x[j] * s[jt])); + } + rel_error = abs(Ft - fk[jt]) / infnorm(N1, (std::complex *)fk.data()); + printf("[gpu ] 
one mode: rel err in F[%d] is %.3g\n", jt, rel_error); } return std::isnan(rel_error) || rel_error > checktol; diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu index f3b767f2e..ba99e9acf 100644 --- a/test/cuda/cufinufft2d_test.cu +++ b/test/cuda/cufinufft2d_test.cu @@ -22,10 +22,10 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int double upsampfac) { std::cout << std::scientific << std::setprecision(3); - thrust::host_vector x(M), y(M); + thrust::host_vector x(M), y(M), s{}, t{}; thrust::host_vector> c(M), fk(N1 * N2); - thrust::device_vector d_x(M), d_y(M); + thrust::device_vector d_x(M), d_y(M), d_s{}, d_t{}; thrust::device_vector> d_c(M), d_fk(N1 * N2); std::default_random_engine eng(1); @@ -49,6 +49,19 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int fk[i].real(randm11()); fk[i].imag(randm11()); } + } else if (type == 3) { + for (int i = 0; i < M; i++) { + c[i].real(randm11()); + c[i].imag(randm11()); + } + s.resize(N1 * N2); + t.resize(N1 * N2); + for (int i = 0; i < N1 * N2; i++) { + s[i] = M_PI * randm11(); + t[i] = M_PI * randm11(); + } + d_s = s; + d_t = t; } else { std::cerr << "Invalid type " << type << " supplied\n"; return 1; @@ -60,6 +73,8 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int d_c = c; else if (type == 2) d_fk = fk; + else if (type == 3) + d_c = c; cudaEvent_t start, stop; float milliseconds = 0; @@ -107,7 +122,7 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int cudaEventRecord(start); ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), nullptr, 0, - nullptr, nullptr, nullptr, dplan); + d_s.data().get(), d_t.data().get(), nullptr, dplan); if (ier != 0) { printf("err: cufinufft_setpts\n"); return ier; @@ -144,6 +159,8 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int fk = d_fk; else if (type == 2) c = d_c; + else if (type == 3) + fk 
= d_fk; printf("[Method %d] %d NU pts to %d U pts in %.3g s: %.3g NU pts/s\n", opts.gpu_method, M, N1 * N2, totaltime / 1000, M / totaltime * 1000); @@ -173,8 +190,18 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); printf("[gpu ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error); - } + } else if (type == 3) { + + int jt = (N1 * N2) / 2; // check arbitrary choice of one targ pt + thrust::complex J = thrust::complex(0, iflag); + thrust::complex Ft = thrust::complex(0, 0); + for (int j = 0; j < M; ++j) { + Ft += c[j] * exp(J * (x[j] * s[jt] + y[j] * t[jt])); + } + rel_error = abs(Ft - fk[jt]) / infnorm(N1 * N2, (std::complex *)fk.data()); + printf("[gpu ] one mode: rel err in F[%d] is %.3g\n", jt, rel_error); + } return std::isnan(rel_error) || rel_error > checktol; } @@ -185,7 +212,7 @@ int main(int argc, char *argv[]) { " method: One of\n" " 1: nupts driven,\n" " 2: sub-problem, or\n" - " type: Type of transform (1, 2)" + " type: Type of transform (1, 2, 3)" " N1, N2: The size of the 2D array\n" " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index b025d29bd..3547f3041 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -72,10 +72,10 @@ auto almost_equal(V *d_vec, assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess); // print h_vec and cpu - for (std::size_t i = 0; i < size; ++i) { - std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] - << '\n'; - } + // for (std::size_t i = 0; i < size; ++i) { + // std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] + // << '\n'; + // } std::cout << "relerrtwonorm: " << infnorm(h_vec.data(), cpu, size) << std::endl; // compare the l2 norm of the difference between the two vectors if 
(relerrtwonorm(h_vec.data(), cpu, size) < tol) { @@ -102,7 +102,7 @@ int main() { fin_opts.upsampfac = 1.25; const int iflag = 1; const int ntransf = 1; - const int dim = 3; + const int dim = 1; const double tol = 1e-9; const int n_modes[] = {10, 5, 3}; const int N = n_modes[0] * n_modes[1] * n_modes[2]; @@ -232,31 +232,31 @@ int main() { d_t.data().get(), d_u.data().get(), plan) == 0); cudaDeviceSynchronize(); assert(plan->type3_params.X1 == cpu_plan->t3P.X1); - assert(plan->type3_params.X2 == cpu_plan->t3P.X2); - assert(plan->type3_params.X3 == cpu_plan->t3P.X3); + // assert(plan->type3_params.X2 == cpu_plan->t3P.X2); + // assert(plan->type3_params.X3 == cpu_plan->t3P.X3); assert(plan->type3_params.C1 == cpu_plan->t3P.C1); - assert(plan->type3_params.C2 == cpu_plan->t3P.C2); - assert(plan->type3_params.C3 == cpu_plan->t3P.C3); + // assert(plan->type3_params.C2 == cpu_plan->t3P.C2); + // assert(plan->type3_params.C3 == cpu_plan->t3P.C3); assert(plan->type3_params.D1 == cpu_plan->t3P.D1); - assert(plan->type3_params.D2 == cpu_plan->t3P.D2); - assert(plan->type3_params.D3 == cpu_plan->t3P.D3); + // assert(plan->type3_params.D2 == cpu_plan->t3P.D2); + // assert(plan->type3_params.D3 == cpu_plan->t3P.D3); assert(plan->type3_params.gam1 == cpu_plan->t3P.gam1); - assert(plan->type3_params.gam2 == cpu_plan->t3P.gam2); - assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); + // assert(plan->type3_params.gam2 == cpu_plan->t3P.gam2); + // assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); assert(plan->nf1 == cpu_plan->nf1); - assert(plan->nf2 == cpu_plan->nf2); - assert(plan->nf3 == cpu_plan->nf3); + // assert(plan->nf2 == cpu_plan->nf2); + // assert(plan->nf3 == cpu_plan->nf3); assert(equal(plan->kx, cpu_plan->X, M)); - assert(equal(plan->ky, cpu_plan->Y, M)); - assert(equal(plan->kz, cpu_plan->Z, M)); + // assert(equal(plan->ky, cpu_plan->Y, M)); + // assert(equal(plan->kz, cpu_plan->Z, M)); assert(equal(plan->d_s, cpu_plan->Sp, N)); - assert(equal(plan->d_t, 
cpu_plan->Tp, N)); + // assert(equal(plan->d_t, cpu_plan->Tp, N)); + // assert(equal(plan->d_u, cpu_plan->Up, N)); assert(plan->spopts.nspread == cpu_plan->spopts.nspread); assert(plan->spopts.upsampfac == cpu_plan->spopts.upsampfac); assert(plan->spopts.ES_beta == cpu_plan->spopts.ES_beta); assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth); assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); - assert(equal(plan->d_u, cpu_plan->Up, N)); // NOTE:seems with infnorm we are getting at most 11 digits of precision std::cout << "prephase :\n"; assert(almost_equal( From 3cfe4065ab1019cb4a925b3d9dba46c33c22eb1f Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 20 Aug 2024 16:32:51 -0400 Subject: [PATCH 44/68] fixed tests for type3 --- include/cufinufft/impl.h | 5 +++-- test/cuda/CMakeLists.txt | 4 +--- test/cuda/cufinufft1d_test.cu | 2 +- test/cuda/cufinufft2d_test.cu | 2 +- test/cuda/cufinufft_makeplan_impl.cu | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 89a58faa6..4ed5f0a48 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -592,7 +592,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ // kx if ky is null (d_plan->dim > 1) ? d_ky : d_kx, // same idea as above - (d_plan->dim > 1) ? d_kz : d_kx)); + (d_plan->dim > 2) ? d_kz : d_kx)); const auto D1 = d_plan->type3_params.D1; const auto D2 = d_plan->type3_params.D2; // this should be 0 if dim < 2 const auto D3 = d_plan->type3_params.D3; // this should be 0 if dim < 3 @@ -845,11 +845,12 @@ int cufinufft_destroy_impl(cufinufft_plan_t *d_plan) Also see ../docs/cppdoc.md for main user-facing documentation. */ { - cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); // Can't destroy a null pointer. 
if (!d_plan) return FINUFFT_ERR_PLAN_NOTVALID; + cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + using namespace cufinufft::memtransfer; freegpumemory(d_plan); diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index c0bc9750f..1660b4483 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -94,9 +94,7 @@ endfunction() add_test(NAME cufinufft_public_api COMMAND public_api_test) add_test(NAME cufinufft_makeplan COMMAND test_makeplan) -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - add_test(NAME cufinufft_makeplan_impl COMMAND cufinufft_makeplan_impl) -endif() +add_test(NAME cufinufft_makeplan_impl COMMAND cufinufft_makeplan_impl) add_test(NAME cufinufft_setpts COMMAND cufinufft_setpts) add_test(NAME cufinufft_math_test COMMAND cufinufft_math_test) diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu index 0a23e5625..ceef0881f 100644 --- a/test/cuda/cufinufft1d_test.cu +++ b/test/cuda/cufinufft1d_test.cu @@ -120,7 +120,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), NULL, NULL, 0, d_s.data().get(), + ier = cufinufft_setpts_impl(M, d_x.data().get(), NULL, NULL, N1, d_s.data().get(), NULL, NULL, dplan); if (ier != 0) { diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu index ba99e9acf..39524ae78 100644 --- a/test/cuda/cufinufft2d_test.cu +++ b/test/cuda/cufinufft2d_test.cu @@ -121,7 +121,7 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), nullptr, 0, + ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), nullptr, N1 * N2, d_s.data().get(), d_t.data().get(), nullptr, dplan); if (ier != 
0) { printf("err: cufinufft_setpts\n"); diff --git a/test/cuda/cufinufft_makeplan_impl.cu b/test/cuda/cufinufft_makeplan_impl.cu index 5f8d26666..b5fa039ad 100644 --- a/test/cuda/cufinufft_makeplan_impl.cu +++ b/test/cuda/cufinufft_makeplan_impl.cu @@ -134,7 +134,7 @@ int main() { assert(plan->fwkerhalf3 == nullptr); assert(plan->spopts.spread_direction == type); assert(plan->type == type); - assert(plan->opts.gpu_method == 0); + // assert(plan->opts.gpu_method == 0); assert(plan->opts.upsampfac == 1.25); assert(cufinufft_destroy_impl(plan) == 0); plan = nullptr; From 1842f6842883fd2f7e3528650c7ee527171d1929 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 21 Aug 2024 12:44:41 -0400 Subject: [PATCH 45/68] fixed possible memory leaks --- include/cufinufft/impl.h | 20 ++--- src/cuda/memtransfer_wrapper.cu | 16 ++++ test/cuda/cufinufft_type3_test.cu | 143 ++++++++++++++++++++++-------- 3 files changed, 132 insertions(+), 47 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 4ed5f0a48..0959fec10 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -497,7 +497,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ } if (d_plan->opts.debug) { printf("[%s]", __func__); - printf("\tM=%lld N=%lld\n", M, N); + printf("\tM=%d N=%d\n", M, N); printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", d_plan->type3_params.X1, d_plan->type3_params.C1, d_plan->type3_params.S1, d_plan->type3_params.D1, d_plan->type3_params.gam1, d_plan->nf1); @@ -767,19 +767,15 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ fprintf(stderr, "[%s] inner t2 plan cufinufft_setpts_12 failed\n", __func__); goto finalize; } + if (d_plan->t2_plan->spopts.spread_direction != 2) { + fprintf(stderr, "[%s] inner t2 plan cufinufft_setpts_12 wrong direction\n", + __func__); + goto finalize; + } return 0; } finalize: - checked_free(d_plan->kx); - checked_free(d_plan->d_s); - 
checked_free(d_plan->ky); - checked_free(d_plan->d_t); - checked_free(d_plan->kz); - checked_free(d_plan->d_u); - checked_free(d_plan->prephase); - checked_free(d_plan->deconv); - checked_free(d_plan->fw_batch); - checked_free(d_plan->c_batch); + cufinufft_destroy_impl(d_plan); cufinufft_destroy_impl(d_plan->t2_plan); return FINUFFT_ERR_CUDA_FAILURE; } @@ -856,6 +852,8 @@ int cufinufft_destroy_impl(cufinufft_plan_t *d_plan) if (d_plan->fftplan) cufftDestroy(d_plan->fftplan); + if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan); + /* free/destruct the plan */ delete d_plan; diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index 6f3f31abd..4584bd8ff 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -422,6 +422,7 @@ void freegpumemory(cufinufft_plan_t *d_plan) Melody Shih 11/21/21 */ { + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); // passing the stream by reference was causing a segfault const auto stream = d_plan->stream; @@ -441,6 +442,21 @@ void freegpumemory(cufinufft_plan_t *d_plan) CUDA_FREE_AND_NULL(d_plan->numnupts, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools); + + if (d_plan->type != 3) { + return; + } + + CUDA_FREE_AND_NULL(d_plan->kx, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->d_s, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->ky, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->d_t, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->kz, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->d_u, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->prephase, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->deconv, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fw_batch, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->c_batch, stream, d_plan->supports_pools); } template int 
allocgpumem1d_plan(cufinufft_plan_t *d_plan); diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index 3547f3041..08797c7b2 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -102,7 +102,7 @@ int main() { fin_opts.upsampfac = 1.25; const int iflag = 1; const int ntransf = 1; - const int dim = 1; + const int dim = 3; const double tol = 1e-9; const int n_modes[] = {10, 5, 3}; const int N = n_modes[0] * n_modes[1] * n_modes[2]; @@ -149,12 +149,6 @@ int main() { u[i] = u[j]; } - // fill them all - - // for (int i = 0; i < N * ntransf; i++) { - // fk[i].real(randm11()); - // fk[i].imag(randm11()); - // } // copy x, y, z, s, t, u to device d_x, d_y, d_z, d_s, d_t, d_u d_x = x; d_y = y; @@ -175,32 +169,109 @@ int main() { t.data(), u.data()) == 0); return plan; }; - const auto test_type1 = [iflag, tol, ntransf, dim, cpu_planer, &opts](auto *plan) { - // plan is a pointer to a type that contains real_t - using T = typename std::remove_pointer::type::real_t; - const int type = 1; - assert(cufinufft_makeplan_impl(type, dim, nullptr, iflag, ntransf, T(tol), &plan, - &opts) == 0); + const auto test_type1 = [iflag, + tol, + ntransf, + dim, + cpu_planer, + M, + N, + n_modes, + &d_x, + &d_y, + &d_z, + &c, + &d_c, + &fk, + &d_fk, + &opts](auto plan) { + // plan is a pointer to a type that contains real_t + using T = typename std::remove_pointer::type::real_t; + const int type = 1; const auto cpu_plan = cpu_planer(type); + assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), + &plan, &opts) == 0); + assert( + cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), + 0, nullptr, nullptr, nullptr, plan) == 0); cudaDeviceSynchronize(); + assert(plan->nf1 == cpu_plan->nf1); + assert(plan->nf2 == cpu_plan->nf2); + assert(plan->nf3 == cpu_plan->nf3); + assert(plan->spopts.nspread == cpu_plan->spopts.nspread); + assert(plan->spopts.upsampfac == 
cpu_plan->spopts.upsampfac); + assert(plan->spopts.ES_beta == cpu_plan->spopts.ES_beta); + assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth); + assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); + + for (int i = 0; i < M; i++) { + c[i].real(randm11()); + c[i].imag(randm11()); + } + d_c = c; + cufinufft_execute_impl( + (cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), plan); + finufft_execute(cpu_plan, (std::complex *)c.data(), (std::complex *)fk.data()); + std::cout << "type " << type << ": "; + assert(almost_equal(d_fk.data().get(), fk.data(), N, tol)); assert(cufinufft_destroy_impl(plan) == 0); assert(finufft_destroy(cpu_plan) == 0); + cudaDeviceSynchronize(); plan = nullptr; }; - auto test_type2 = [iflag, tol, ntransf, dim, cpu_planer, &opts](auto plan) { + + const auto test_type2 = [iflag, + tol, + ntransf, + dim, + cpu_planer, + M, + N, + n_modes, + &d_x, + &d_y, + &d_z, + &c, + &d_c, + &fk, + &d_fk, + &opts](auto plan) { // plan is a pointer to a type that contains real_t - using T = typename std::remove_pointer::type::real_t; - const int type = 2; - assert(cufinufft_makeplan_impl(type, dim, nullptr, iflag, ntransf, T(tol), &plan, - &opts) == 0); + using T = typename std::remove_pointer::type::real_t; + const int type = 2; const auto cpu_plan = cpu_planer(type); + assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), + &plan, &opts) == 0); + assert( + cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), + 0, nullptr, nullptr, nullptr, plan) == 0); cudaDeviceSynchronize(); + assert(plan->nf1 == cpu_plan->nf1); + assert(plan->nf2 == cpu_plan->nf2); + assert(plan->nf3 == cpu_plan->nf3); + assert(plan->spopts.nspread == cpu_plan->spopts.nspread); + assert(plan->spopts.upsampfac == cpu_plan->spopts.upsampfac); + assert(plan->spopts.ES_beta == cpu_plan->spopts.ES_beta); + assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth); + assert(plan->spopts.ES_c == 
cpu_plan->spopts.ES_c); + + for (int i = 0; i < N; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); + } + d_fk = fk; + cufinufft_execute_impl( + (cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), plan); + finufft_execute(cpu_plan, (std::complex *)c.data(), (std::complex *)fk.data()); + std::cout << "type " << type << ": "; + assert(almost_equal(d_c.data().get(), c.data(), M, tol)); assert(cufinufft_destroy_impl(plan) == 0); - cudaDeviceSynchronize(); assert(finufft_destroy(cpu_plan) == 0); + cudaDeviceSynchronize(); plan = nullptr; }; + auto test_type3 = [iflag, tol, ntransf, @@ -232,32 +303,31 @@ int main() { d_t.data().get(), d_u.data().get(), plan) == 0); cudaDeviceSynchronize(); assert(plan->type3_params.X1 == cpu_plan->t3P.X1); - // assert(plan->type3_params.X2 == cpu_plan->t3P.X2); - // assert(plan->type3_params.X3 == cpu_plan->t3P.X3); + assert(plan->type3_params.X2 == cpu_plan->t3P.X2); + assert(plan->type3_params.X3 == cpu_plan->t3P.X3); assert(plan->type3_params.C1 == cpu_plan->t3P.C1); - // assert(plan->type3_params.C2 == cpu_plan->t3P.C2); - // assert(plan->type3_params.C3 == cpu_plan->t3P.C3); + assert(plan->type3_params.C2 == cpu_plan->t3P.C2); + assert(plan->type3_params.C3 == cpu_plan->t3P.C3); assert(plan->type3_params.D1 == cpu_plan->t3P.D1); - // assert(plan->type3_params.D2 == cpu_plan->t3P.D2); - // assert(plan->type3_params.D3 == cpu_plan->t3P.D3); + assert(plan->type3_params.D2 == cpu_plan->t3P.D2); + assert(plan->type3_params.D3 == cpu_plan->t3P.D3); assert(plan->type3_params.gam1 == cpu_plan->t3P.gam1); - // assert(plan->type3_params.gam2 == cpu_plan->t3P.gam2); - // assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); + assert(plan->type3_params.gam2 == cpu_plan->t3P.gam2); + assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); assert(plan->nf1 == cpu_plan->nf1); - // assert(plan->nf2 == cpu_plan->nf2); - // assert(plan->nf3 == cpu_plan->nf3); + assert(plan->nf2 == cpu_plan->nf2); + assert(plan->nf3 == 
cpu_plan->nf3); assert(equal(plan->kx, cpu_plan->X, M)); - // assert(equal(plan->ky, cpu_plan->Y, M)); - // assert(equal(plan->kz, cpu_plan->Z, M)); + assert(equal(plan->ky, cpu_plan->Y, M)); + assert(equal(plan->kz, cpu_plan->Z, M)); assert(equal(plan->d_s, cpu_plan->Sp, N)); - // assert(equal(plan->d_t, cpu_plan->Tp, N)); - // assert(equal(plan->d_u, cpu_plan->Up, N)); + assert(equal(plan->d_t, cpu_plan->Tp, N)); + assert(equal(plan->d_u, cpu_plan->Up, N)); assert(plan->spopts.nspread == cpu_plan->spopts.nspread); assert(plan->spopts.upsampfac == cpu_plan->spopts.upsampfac); assert(plan->spopts.ES_beta == cpu_plan->spopts.ES_beta); assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth); assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); - // NOTE:seems with infnorm we are getting at most 11 digits of precision std::cout << "prephase :\n"; assert(almost_equal( plan->prephase, cpu_plan->prephase, M, std::numeric_limits::epsilon() * 100)); @@ -273,6 +343,7 @@ int main() { // fk[i] = {randm11(), randm11()}; // } // d_fk = fk; + cudaDeviceSynchronize(); cufinufft_execute_impl( (cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), plan); finufft_execute(cpu_plan, (std::complex *)c.data(), (std::complex *)fk.data()); @@ -285,8 +356,8 @@ int main() { // testing correctness of the plan creation // cufinufft_plan_t *single_plan{nullptr}; cufinufft_plan_t *double_plan{nullptr}; - // test_type1(double_plan); - // test_type2(double_plan); + test_type1(double_plan); + test_type2(double_plan); test_type3(double_plan); return 0; } From c13a6a9a5a780588a03a3cf45e444236257759fc Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 21 Aug 2024 19:59:52 -0400 Subject: [PATCH 46/68] minor changes, mainly for debug --- include/cufinufft/impl.h | 70 +++++++------- include/cufinufft/spreadinterp.h | 6 +- include/cufinufft/utils.h | 13 +-- src/cuda/3d/cufinufft3d.cu | 4 + src/cuda/common.cu | 5 +- src/finufft.cpp | 24 +++-- test/cuda/cufinufft_type3_test.cu 
| 151 ++++++++++++------------------ 7 files changed, 129 insertions(+), 144 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 0959fec10..2a421918a 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -108,8 +108,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->mt = nmodes[1]; d_plan->mu = nmodes[2]; } else { - d_plan->opts.gpu_method = 1; d_plan->opts.gpu_spreadinterponly = 1; + d_plan->opts.gpu_method = 1; } int fftsign = (iflag >= 0) ? 1 : -1; @@ -313,7 +313,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } finalize: if (ier > 1) { - delete *d_plan_ptr; + cufinufft_destroy_impl(*d_plan_ptr); *d_plan_ptr = nullptr; } return ier; @@ -498,18 +498,21 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ if (d_plan->opts.debug) { printf("[%s]", __func__); printf("\tM=%d N=%d\n", M, N); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", d_plan->type3_params.X1, d_plan->type3_params.C1, d_plan->type3_params.S1, - d_plan->type3_params.D1, d_plan->type3_params.gam1, d_plan->nf1); + d_plan->type3_params.D1, d_plan->type3_params.gam1, d_plan->nf1, + d_plan->type3_params.h1); if (d_plan->dim > 1) { - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld\n", + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", d_plan->type3_params.X2, d_plan->type3_params.C2, d_plan->type3_params.S2, - d_plan->type3_params.D2, d_plan->type3_params.gam2, d_plan->nf2); + d_plan->type3_params.D2, d_plan->type3_params.gam2, d_plan->nf2, + d_plan->type3_params.h2); } if (d_plan->dim > 2) { - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld\n", + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", d_plan->type3_params.X3, d_plan->type3_params.C3, d_plan->type3_params.S3, - 
d_plan->type3_params.D3, d_plan->type3_params.gam3, d_plan->nf3); + d_plan->type3_params.D3, d_plan->type3_params.gam3, d_plan->nf3, + d_plan->type3_params.h3); } } d_plan->nf = d_plan->nf1 * d_plan->nf2 * d_plan->nf3; @@ -616,26 +619,30 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ thrust::fill(thrust::cuda::par.on(stream), d_plan->prephase, d_plan->prephase + M, cuda_complex{1, 0}); } - if (d_plan->dim > 0) { - const auto scale = d_plan->type3_params.h1 * d_plan->type3_params.gam1; - const auto D1 = -d_plan->type3_params.D1; - thrust::transform( - thrust::cuda::par.on(stream), d_s, d_s + N, d_plan->d_s, - [scale, D1] __host__ __device__(const T s) -> T { return scale * (s + D1); }); - } - if (d_plan->dim > 1) { - const auto scale = d_plan->type3_params.h2 * d_plan->type3_params.gam2; - const auto D2 = -d_plan->type3_params.D2; - thrust::transform( - thrust::cuda::par.on(stream), d_t, d_t + N, d_plan->d_t, - [scale, D2] __host__ __device__(const T t) -> T { return scale * (t + D2); }); - } - if (d_plan->dim > 2) { - const auto scale = d_plan->type3_params.h3 * d_plan->type3_params.gam3; - const auto D3 = -d_plan->type3_params.D3; - thrust::transform( - thrust::cuda::par.on(stream), d_u, d_u + N, d_plan->d_u, - [scale, D3] __host__ __device__(const T u) -> T { return scale * (u + D3); }); + { + const auto source_iterator = thrust::make_zip_iterator( + thrust::make_tuple(d_s, dim > 1 ? d_t : d_s, dim > 2 ? d_u : d_s)); + const auto target_iterator = thrust::make_zip_iterator( + thrust::make_tuple(d_plan->d_s, dim > 1 ? d_plan->d_t : d_plan->d_s, + dim > 2 ? 
d_plan->d_u : d_plan->d_s)); + const auto scale1 = d_plan->type3_params.h1 * d_plan->type3_params.gam1; + const auto D1 = d_plan->type3_params.D1; + const auto scale2 = d_plan->type3_params.h2 * d_plan->type3_params.gam2; + const auto D2 = d_plan->type3_params.D2; + const auto scale3 = d_plan->type3_params.h3 * d_plan->type3_params.gam3; + const auto D3 = d_plan->type3_params.D3; + thrust::transform(thrust::cuda::par.on(stream), source_iterator, source_iterator + N, + target_iterator, + [scale1, D1, scale2, D2, scale3, D3] __host__ __device__( + const thrust::tuple tuple) -> thrust::tuple { + auto s = thrust::get<0>(tuple); + auto t = thrust::get<1>(tuple); + auto u = thrust::get<2>(tuple); + s = scale1 * (s - D1); + t = scale2 * (t - D2); + u = scale3 * (u - D3); + return {s, t, u}; + }); } { // here we declare phi_hat1, phi_hat2, and phi_hat3 // and the precomputed data for the fseries kernel @@ -718,8 +725,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, d_plan->deconv, d_plan->deconv, [c1, c2, c3, d1, d2, d3, imasign] __host__ __device__( - const thrust::tuple tuple, - cuda_complex deconv) -> cuda_complex { + const thrust::tuple tuple, cuda_complex deconv) + -> cuda_complex { // d2 and d3 are 0 if dim < 2 and dim < 3 const auto phase = c1 * (thrust::get<0>(tuple) + d1) + c2 * (thrust::get<1>(tuple) + d2) + @@ -751,9 +758,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ { int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3}; cufinufft_opts t2opts = d_plan->opts; - t2opts.modeord = 0; - t2opts.debug = std::max(0, t2opts.debug); t2opts.gpu_spreadinterponly = 0; + // Safe to ignore the return value here? 
if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan); // check that maxbatchsize is correct diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 2963d381d..fefb38f74 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -85,15 +85,15 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int } template -static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w, - const double upsampfac) +static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w, + const double upsampfac) /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ { const auto z = fma(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1] - // T z = 2 * x + w - 1.0; + // const T z = 2 * x + w - 1.0; // insert the auto-generated code which expects z, w args, writes to ker... 
if (upsampfac == 2.0) { // floating point equality is fine here using FLT = T; diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 3b4b8b524..665b1d099 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -99,17 +99,12 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { #if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) #if (__CUDACC_VER_MAJOR__ > 11) || \ (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600) - #define ALLOCA_SUPPORTED 1 // windows compatibility #if __has_include() #include #endif -#else -#define ALLOCA_SUPPORTED 0 #endif -#else -#define ALLOCA_SUPPORTED 0 #endif #undef ALLOCA_SUPPORTED @@ -132,8 +127,8 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { */ template -static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *address, - cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexShared( + cuda_complex *address, cuda_complex res) { const auto raw_address = reinterpret_cast(address); atomicAdd(raw_address, res.x); atomicAdd(raw_address + 1, res.y); @@ -145,8 +140,8 @@ static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *a * on shared memory are supported so we leverage them */ template -static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *address, - cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexGlobal( + cuda_complex *address, cuda_complex res) { if constexpr ( std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { atomicAdd(address, res); diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index e7123084b..6b79f8379 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -151,6 +151,10 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M, d_plan->c + i * d_plan->M, 
thrust::multiplies>()); } + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_plan->fk, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), + stream)))) + return ier; // Step 1: Spread if ((ier = cuspread3d(d_plan, blksize))) return ier; // now d_plan->fk = d_plan->fw contains the spread values diff --git a/src/cuda/common.cu b/src/cuda/common.cu index a46a94048..03489d949 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -215,7 +215,6 @@ void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, // exceed MAX_NQUAD double z[2 * MAX_NQUAD]; double w[2 * MAX_NQUAD]; - finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used, // eg on (0,1) for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n @@ -224,7 +223,9 @@ void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, if constexpr (phase_winding) { a[n] = ((T)(2.0 * M_PI) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates } else { - a[n] = z[n]; + a[n] = T(z[n]); + // printf("[cufinufft] f[%d] = %.16g\n",n,f[n]); + // printf("[cufinufft] z[%d] = %.16g\n",n,z[n]); } } } diff --git a/src/finufft.cpp b/src/finufft.cpp index 2fb5d0a71..cf9dfbe61 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -270,7 +270,8 @@ void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts for (int n = 0; n < q; ++n) { z[n] *= (FLT)J2; // quadr nodes for [0,J/2] f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // w/ quadr weights - // printf("f[%d] = %.3g\n",n,f[n]); + // printf("[finufft] f[%d] = %.16g\n",n,f[n]); + // printf("[finufft] z[%d] = %.16g\n",n,z[n]); } #pragma omp parallel for num_threads(opts.nthreads) for (BIGINT j = 0; j < nk; ++j) { // loop along output array @@ -877,14 +878,14 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT if (p->opts.debug) { // report on choices of shifts, centers, etc... 
printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", p->t3P.X1, - p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", p->t3P.X1, + p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1, p->t3P.h1); if (d > 1) - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld\n", p->t3P.X2, - p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2); + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", p->t3P.X2, + p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2, p->t3P.h2); if (d > 2) - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld\n", p->t3P.X3, - p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3); + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", p->t3P.X3, + p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3, p->t3P.h3); } p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points if (p->nf * p->batchSize > MAX_NF) { @@ -974,7 +975,14 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < // pi/R } - + // #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + // for (BIGINT k = 0; k < nk; ++k) { + // p->Sp[k] = s[k]; + // if (d > 1) + // p->Tp[k] =t[k]; + // if (d > 2) + // p->Up[k] = u[k]; + // } // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... 
// (exploits that FT separates because kernel is prod of 1D funcs) if (p->deconv) free(p->deconv); diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index 08797c7b2..161531d28 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -55,27 +55,36 @@ T relerrtwonorm(std::complex *a, std::complex *b, const std::size_t n) { // std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; nrm += std::real(std::conj(a[m]) * a[m]); const auto diff = a[m] - b[m]; - err += std::real(std::conj(diff) * diff); + auto this_err = std::real(std::conj(diff) * diff); + if (this_err > 1e-9) { + std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; + std::cout << "diff: " << diff << " this_err: " << this_err << std::endl; + } + err += this_err; } return std::sqrt(err / nrm); } template -auto almost_equal(V *d_vec, - T *cpu, - const std::size_t size, - const contained tol = std::numeric_limits::epsilon()) { +auto almost_equal(V *d_vec, T *cpu, const std::size_t size, + const contained tol = std::numeric_limits::epsilon(), + bool print = false) { // copy d_vec to cpu std::vector h_vec(size); // this implicitly converts cuda_complex to std::complex... 
which is fine, but it may // cause issues use it with case assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess); + cudaDeviceSynchronize(); // print h_vec and cpu - // for (std::size_t i = 0; i < size; ++i) { - // std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] - // << '\n'; - // } + if (print) { + std::cout << std::setprecision(15); + for (std::size_t i = 0; i < size; ++i) { + std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] + << '\n'; + } + std::cout << std::setprecision(6); + } std::cout << "relerrtwonorm: " << infnorm(h_vec.data(), cpu, size) << std::endl; // compare the l2 norm of the difference between the two vectors if (relerrtwonorm(h_vec.data(), cpu, size) < tol) { @@ -92,22 +101,21 @@ int main() { cufinufft_opts opts; cufinufft_default_opts(&opts); opts.debug = 2; - opts.upsampfac = 1.25; + opts.upsampfac = 2.0; opts.gpu_kerevalmeth = 1; - // opts.gpu_sort = 0; finufft_opts fin_opts; finufft_default_opts(&fin_opts); - fin_opts.debug = 2; - fin_opts.spread_kerevalmeth = 1; - fin_opts.upsampfac = 1.25; + fin_opts.debug = opts.debug; + fin_opts.spread_kerevalmeth = opts.gpu_kerevalmeth; + fin_opts.upsampfac = opts.upsampfac; const int iflag = 1; const int ntransf = 1; const int dim = 3; - const double tol = 1e-9; + const double tol = 1e-12; const int n_modes[] = {10, 5, 3}; const int N = n_modes[0] * n_modes[1] * n_modes[2]; const int M = 1000; - const double bandwidth = 50.0; + const double bandwidth = 1.0; thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), s(N * ntransf), t(N * ntransf), u(N * ntransf); @@ -142,7 +150,7 @@ int main() { y[i] = y[j]; z[i] = z[j]; } - for (int64_t i = M; i < N * ntransf; ++i) { + for (int64_t i = N; i < N * ntransf; ++i) { int64_t j = i % N; s[i] = s[j]; t[i] = t[j]; @@ -158,40 +166,25 @@ int main() { d_u = u; cudaDeviceSynchronize(); - const auto cpu_planer = - [iflag, tol, ntransf, dim, M, N, 
n_modes, &x, &y, &z, &s, &t, &u, &fin_opts]( - const auto type) { - finufft_plan_s *plan{nullptr}; - std::int64_t nl[] = {n_modes[0], n_modes[1], n_modes[2]}; - assert( - finufft_makeplan(type, dim, nl, iflag, ntransf, tol, &plan, &fin_opts) == 0); - assert(finufft_setpts(plan, M, x.data(), y.data(), z.data(), N, s.data(), - t.data(), u.data()) == 0); - return plan; - }; + const auto cpu_planer = [iflag, tol, ntransf, dim, M, N, n_modes, &x, &y, &z, &s, &t, + &u, &fin_opts](const auto type) { + finufft_plan_s *plan{nullptr}; + std::int64_t nl[] = {n_modes[0], n_modes[1], n_modes[2]}; + assert(finufft_makeplan(type, dim, nl, iflag, ntransf, tol, &plan, &fin_opts) == 0); + assert(finufft_setpts(plan, M, x.data(), y.data(), z.data(), N, s.data(), t.data(), + u.data()) == 0); + return plan; + }; - const auto test_type1 = [iflag, - tol, - ntransf, - dim, - cpu_planer, - M, - N, - n_modes, - &d_x, - &d_y, - &d_z, - &c, - &d_c, - &fk, - &d_fk, - &opts](auto plan) { + const auto test_type1 = [iflag, tol, ntransf, dim, cpu_planer, M, N, n_modes, &d_x, + &d_y, &d_z, &c, &d_c, &fk, &d_fk, &opts](auto plan) { // plan is a pointer to a type that contains real_t using T = typename std::remove_pointer::type::real_t; const int type = 1; const auto cpu_plan = cpu_planer(type); assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), &plan, &opts) == 0); + cudaDeviceSynchronize(); assert( cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), 0, nullptr, nullptr, nullptr, plan) == 0); @@ -210,8 +203,9 @@ int main() { c[i].imag(randm11()); } d_c = c; - cufinufft_execute_impl( - (cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), plan); + cudaDeviceSynchronize(); + cufinufft_execute_impl((cuda_complex *)d_c.data().get(), + (cuda_complex *)d_fk.data().get(), plan); finufft_execute(cpu_plan, (std::complex *)c.data(), (std::complex *)fk.data()); std::cout << "type " << type << ": "; assert(almost_equal(d_fk.data().get(), 
fk.data(), N, tol)); @@ -221,28 +215,15 @@ int main() { plan = nullptr; }; - const auto test_type2 = [iflag, - tol, - ntransf, - dim, - cpu_planer, - M, - N, - n_modes, - &d_x, - &d_y, - &d_z, - &c, - &d_c, - &fk, - &d_fk, - &opts](auto plan) { + const auto test_type2 = [iflag, tol, ntransf, dim, cpu_planer, M, N, n_modes, &d_x, + &d_y, &d_z, &c, &d_c, &fk, &d_fk, &opts](auto plan) { // plan is a pointer to a type that contains real_t using T = typename std::remove_pointer::type::real_t; const int type = 2; const auto cpu_plan = cpu_planer(type); assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), &plan, &opts) == 0); + cudaDeviceSynchronize(); assert( cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), 0, nullptr, nullptr, nullptr, plan) == 0); @@ -261,9 +242,11 @@ int main() { fk[i].imag(randm11()); } d_fk = fk; - cufinufft_execute_impl( - (cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), plan); - finufft_execute(cpu_plan, (std::complex *)c.data(), (std::complex *)fk.data()); + cudaDeviceSynchronize(); + cufinufft_execute_impl((cuda_complex *)d_c.data().get(), + (cuda_complex *)d_fk.data().get(), plan); + finufft_execute(cpu_plan, c.data(), fk.data()); + cudaDeviceSynchronize(); std::cout << "type " << type << ": "; assert(almost_equal(d_c.data().get(), c.data(), M, tol)); assert(cufinufft_destroy_impl(plan) == 0); @@ -272,25 +255,8 @@ int main() { plan = nullptr; }; - auto test_type3 = [iflag, - tol, - ntransf, - dim, - cpu_planer, - deconv_tol, - M, - N, - n_modes, - &d_x, - &d_y, - &d_z, - &d_s, - &d_t, - &d_u, - &c, - &d_c, - &fk, - &d_fk, + auto test_type3 = [iflag, tol, ntransf, dim, cpu_planer, deconv_tol, M, N, n_modes, + &d_x, &d_y, &d_z, &d_s, &d_t, &d_u, &c, &d_c, &fk, &d_fk, &opts](auto plan) { // plan is a pointer to a type that contains real_t using T = typename std::remove_pointer::type::real_t; @@ -298,6 +264,7 @@ int main() { const auto cpu_plan = cpu_planer(type); 
assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), &plan, &opts) == 0); + cudaDeviceSynchronize(); assert(cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), N, d_s.data().get(), d_t.data().get(), d_u.data().get(), plan) == 0); @@ -314,6 +281,9 @@ int main() { assert(plan->type3_params.gam1 == cpu_plan->t3P.gam1); assert(plan->type3_params.gam2 == cpu_plan->t3P.gam2); assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); + assert(plan->type3_params.h1 == cpu_plan->t3P.h1); + assert(plan->type3_params.h2 == cpu_plan->t3P.h2); + assert(plan->type3_params.h3 == cpu_plan->t3P.h3); assert(plan->nf1 == cpu_plan->nf1); assert(plan->nf2 == cpu_plan->nf2); assert(plan->nf3 == cpu_plan->nf3); @@ -329,8 +299,8 @@ int main() { assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth); assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); std::cout << "prephase :\n"; - assert(almost_equal( - plan->prephase, cpu_plan->prephase, M, std::numeric_limits::epsilon() * 100)); + assert(almost_equal(plan->prephase, cpu_plan->prephase, M, + std::numeric_limits::epsilon() * 100)); std::cout << "deconv :\n"; assert(almost_equal(plan->deconv, cpu_plan->deconv, N, deconv_tol)); @@ -344,10 +314,11 @@ int main() { // } // d_fk = fk; cudaDeviceSynchronize(); - cufinufft_execute_impl( - (cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), plan); - finufft_execute(cpu_plan, (std::complex *)c.data(), (std::complex *)fk.data()); - assert(almost_equal(d_fk.data().get(), fk.data(), N, tol)); + cufinufft_execute_impl((cuda_complex *)d_c.data().get(), + (cuda_complex *)d_fk.data().get(), plan); + finufft_execute(cpu_plan, c.data(), fk.data()); + cudaDeviceSynchronize(); + assert(almost_equal(d_fk.data().get(), fk.data(), N, tol, false)); assert(cufinufft_destroy_impl(plan) == 0); assert(finufft_destroy(cpu_plan) == 0); plan = nullptr; From f0a0fa4d33e16325dff28ffa4b200502314e1b20 Mon Sep 17 00:00:00 2001 From: Marco 
Barbone Date: Thu, 22 Aug 2024 18:13:28 -0400 Subject: [PATCH 47/68] small fixes --- include/cufinufft/impl.h | 113 ++++++++++++++++-------------- include/finufft_errors.h | 1 + src/cuda/3d/cufinufft3d.cu | 23 ++++-- src/cuda/common.cu | 1 + src/finufft.cpp | 14 ++-- test/cuda/CMakeLists.txt | 1 + test/cuda/cufinufft_type3_test.cu | 68 +++++++++--------- 7 files changed, 129 insertions(+), 92 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 2a421918a..24f6f1c7d 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -7,7 +7,6 @@ #include #include -#include #include #include #include @@ -452,10 +451,25 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ } const auto stream = d_plan->stream; d_plan->N = N; - d_plan->d_s = d_s; - d_plan->d_t = d_t; - d_plan->d_u = d_u; - const auto dim = d_plan->dim; + if (d_plan->dim > 0 && d_s == nullptr) { + fprintf(stderr, "[%s] Error: d_s is nullptr but dim > 0.\n", __func__); + return FINUFFT_ERR_INVALID_ARGUMENT; + } + d_plan->d_s = d_plan->dim > 0 ? d_s : nullptr; + + if (d_plan->dim > 1 && d_t == nullptr) { + fprintf(stderr, "[%s] Error: d_t is nullptr but dim > 1.\n", __func__); + return FINUFFT_ERR_INVALID_ARGUMENT; + } + d_plan->d_t = d_plan->dim > 1 ? d_t : nullptr; + + if (d_plan->dim > 2 && d_u == nullptr) { + fprintf(stderr, "[%s] Error: d_u is nullptr but dim > 2.\n", __func__); + return FINUFFT_ERR_INVALID_ARGUMENT; + } + d_plan->d_u = d_plan->dim > 2 ? d_u : nullptr; + + const auto dim = d_plan->dim; // no need to set the params to zero, as they are already zeroed out in the plan // memset(d_plan->type3_params, 0, sizeof(d_plan->type3_params)); using namespace cufinufft::utils; @@ -596,13 +610,13 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ (d_plan->dim > 1) ? d_ky : d_kx, // same idea as above (d_plan->dim > 2) ? 
d_kz : d_kx)); - const auto D1 = d_plan->type3_params.D1; - const auto D2 = d_plan->type3_params.D2; // this should be 0 if dim < 2 - const auto D3 = d_plan->type3_params.D3; // this should be 0 if dim < 3 - const auto imasign = d_plan->iflag >= 0 ? T(1) : T(-1); + const auto D1 = d_plan->type3_params.D1; + const auto D2 = d_plan->type3_params.D2; // this should be 0 if dim < 2 + const auto D3 = d_plan->type3_params.D3; // this should be 0 if dim < 3 + const auto realsign = d_plan->iflag >= 0 ? T(1) : T(-1); thrust::transform( thrust::cuda::par.on(stream), iterator, iterator + M, d_plan->prephase, - [D1, D2, D3, imasign] __host__ __device__( + [D1, D2, D3, realsign] __host__ __device__( const thrust::tuple &tuple) -> cuda_complex { const auto x = thrust::get<0>(tuple); const auto y = thrust::get<1>(tuple); @@ -613,36 +627,33 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ // TODO: nvcc should have the sincos function // check the cos + i*sin // ref: https://en.wikipedia.org/wiki/Cis_(mathematics) - return cuda_complex{std::cos(phase), std::sin(phase) * imasign}; + return cuda_complex{std::cos(phase), std::sin(phase) * realsign}; }); } else { thrust::fill(thrust::cuda::par.on(stream), d_plan->prephase, d_plan->prephase + M, cuda_complex{1, 0}); } - { - const auto source_iterator = thrust::make_zip_iterator( - thrust::make_tuple(d_s, dim > 1 ? d_t : d_s, dim > 2 ? d_u : d_s)); - const auto target_iterator = thrust::make_zip_iterator( - thrust::make_tuple(d_plan->d_s, dim > 1 ? d_plan->d_t : d_plan->d_s, - dim > 2 ? 
d_plan->d_u : d_plan->d_s)); - const auto scale1 = d_plan->type3_params.h1 * d_plan->type3_params.gam1; - const auto D1 = d_plan->type3_params.D1; - const auto scale2 = d_plan->type3_params.h2 * d_plan->type3_params.gam2; - const auto D2 = d_plan->type3_params.D2; - const auto scale3 = d_plan->type3_params.h3 * d_plan->type3_params.gam3; - const auto D3 = d_plan->type3_params.D3; - thrust::transform(thrust::cuda::par.on(stream), source_iterator, source_iterator + N, - target_iterator, - [scale1, D1, scale2, D2, scale3, D3] __host__ __device__( - const thrust::tuple tuple) -> thrust::tuple { - auto s = thrust::get<0>(tuple); - auto t = thrust::get<1>(tuple); - auto u = thrust::get<2>(tuple); - s = scale1 * (s - D1); - t = scale2 * (t - D2); - u = scale3 * (u - D3); - return {s, t, u}; - }); + + if (d_plan->dim > 0) { + const auto scale = d_plan->type3_params.h1 * d_plan->type3_params.gam1; + const auto D1 = -d_plan->type3_params.D1; + thrust::transform( + thrust::cuda::par.on(stream), d_s, d_s + N, d_plan->d_s, + [scale, D1] __host__ __device__(const T s) -> T { return scale * (s + D1); }); + } + if (d_plan->dim > 1) { + const auto scale = d_plan->type3_params.h2 * d_plan->type3_params.gam2; + const auto D2 = -d_plan->type3_params.D2; + thrust::transform( + thrust::cuda::par.on(stream), d_t, d_t + N, d_plan->d_t, + [scale, D2] __host__ __device__(const T t) -> T { return scale * (t + D2); }); + } + if (d_plan->dim > 2) { + const auto scale = d_plan->type3_params.h3 * d_plan->type3_params.gam3; + const auto D3 = -d_plan->type3_params.D3; + thrust::transform( + thrust::cuda::par.on(stream), d_u, d_u + N, d_plan->d_u, + [scale, D3] __host__ __device__(const T u) -> T { return scale * (u + D3); }); } { // here we declare phi_hat1, phi_hat2, and phi_hat3 // and the precomputed data for the fseries kernel @@ -686,11 +697,12 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ phi_hat3.data().get(), d_plan->spopts.nspread, stream)) goto 
finalize; - const auto is_c_finite = std::isfinite(d_plan->type3_params.C1) & - std::isfinite(d_plan->type3_params.C2) & + const auto is_c_finite = std::isfinite(d_plan->type3_params.C1) && + std::isfinite(d_plan->type3_params.C2) && std::isfinite(d_plan->type3_params.C3); - const auto is_c_nonzero = d_plan->type3_params.C1 != 0 | - d_plan->type3_params.C2 != 0 | d_plan->type3_params.C3 != 0; + const auto is_c_nonzero = d_plan->type3_params.C1 != 0 || + d_plan->type3_params.C2 != 0 || + d_plan->type3_params.C3 != 0; const auto phi_hat_iterator = thrust::make_zip_iterator( thrust::make_tuple(phi_hat1.begin(), @@ -710,13 +722,13 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ }); if (is_c_finite && is_c_nonzero) { - const auto c1 = d_plan->type3_params.C1; - const auto c2 = d_plan->type3_params.C2; - const auto c3 = d_plan->type3_params.C3; - const auto d1 = -d_plan->type3_params.D1; - const auto d2 = -d_plan->type3_params.D2; - const auto d3 = -d_plan->type3_params.D3; - const auto imasign = d_plan->iflag >= 0 ? T(1) : T(-1); + const auto c1 = d_plan->type3_params.C1; + const auto c2 = d_plan->type3_params.C2; + const auto c3 = d_plan->type3_params.C3; + const auto d1 = -d_plan->type3_params.D1; + const auto d2 = -d_plan->type3_params.D2; + const auto d3 = -d_plan->type3_params.D3; + const auto realsign = d_plan->iflag >= 0 ? 
T(1) : T(-1); // passing d_s three times if dim == 1 because d_t and d_u are not allocated // passing d_s and d_t if dim == 2 because d_u is not allocated const auto phase_iterator = thrust::make_zip_iterator( @@ -724,14 +736,14 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ thrust::transform( thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, d_plan->deconv, d_plan->deconv, - [c1, c2, c3, d1, d2, d3, imasign] __host__ __device__( - const thrust::tuple tuple, cuda_complex deconv) - -> cuda_complex { + [c1, c2, c3, d1, d2, d3, realsign] __host__ __device__( + const thrust::tuple tuple, + cuda_complex deconv) -> cuda_complex { // d2 and d3 are 0 if dim < 2 and dim < 3 const auto phase = c1 * (thrust::get<0>(tuple) + d1) + c2 * (thrust::get<1>(tuple) + d2) + c3 * (thrust::get<2>(tuple) + d3); - return cuda_complex{std::cos(phase), imasign * std::sin(phase)} * deconv; + return cuda_complex{std::cos(phase), realsign * std::sin(phase)} * deconv; }); } // exiting the block frees the memory allocated for phi_hat1, phi_hat2, and phi_hat3 @@ -782,7 +794,6 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ } finalize: cufinufft_destroy_impl(d_plan); - cufinufft_destroy_impl(d_plan->t2_plan); return FINUFFT_ERR_CUDA_FAILURE; } diff --git a/include/finufft_errors.h b/include/finufft_errors.h index 2feaa131a..b08119820 100644 --- a/include/finufft_errors.h +++ b/include/finufft_errors.h @@ -23,4 +23,5 @@ #define FINUFFT_ERR_BINSIZE_NOTVALID 18 #define FINUFFT_ERR_INSUFFICIENT_SHMEM 19 #define FINUFFT_ERR_NUM_NU_PTS_INVALID 20 +#define FINUFFT_ERR_INVALID_ARGUMENT 21 #endif diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 6b79f8379..19654d756 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -136,6 +136,7 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, cuda_complex *d_cstart; cuda_complex *d_fkstart; const auto stream = 
d_plan->stream; + printf("[cufinufft] d_plan->ntransf = %d\n", d_plan->ntransf); for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; @@ -145,24 +146,36 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, // setting output for spreader d_plan->fk = d_plan->fw; // NOTE: fw might need to be set to 0 + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_plan->fk, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), + stream)))) + return ier; // Step 0: pre-phase the input strengths for (int i = 0; i < blksize; i++) { thrust::transform(thrust::cuda::par.on(stream), d_plan->prephase, d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M, d_plan->c + i * d_plan->M, thrust::multiplies>()); } - if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fk, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), - stream)))) - return ier; + // use thrust to print d_plan->c + thrust::for_each( + thrust::cuda::par.on(stream), d_plan->c, d_plan->c + blksize * d_plan->M, + [] __host__ __device__(cuda_complex & x) { + printf("[cufinufft] d_plan->cBatch = %0.16g | %0.16g\n", x.x, x.y); + }); // Step 1: Spread if ((ier = cuspread3d(d_plan, blksize))) return ier; // now d_plan->fk = d_plan->fw contains the spread values + thrust::for_each(thrust::cuda::par.on(stream), d_plan->fw + d_plan->nf1 * d_plan->nf2, + d_plan->fw + d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * 5, + [] __host__ __device__(cuda_complex & x) { + if (x.x != 0 || x.y != 0) + printf("[cufinufft] d_plan->fw = %0.16g | %0.16g\n", x.x, x.y); + }); // Step 2: Type 3 NUFFT // type 2 goes from fk to c // saving the results directly in the user output array d_fk // it needs to do blksize transforms - d_plan->t2_plan->ntransf = blksize; + // d_plan->t2_plan->ntransf = blksize; if ((ier = cufinufft3d2_exec(d_fkstart, d_plan->fw, d_plan->t2_plan))) 
return ier; // Step 3: deconvolve // now we need to d_fk = d_fk*d_plan->deconv diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 03489d949..4181430d9 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -50,6 +50,7 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, i += blockDim.x * gridDim.x) { T x = 0.0; for (int n = 0; n < q; n++) { + // in type 1/2 2*PI/nf -> k[i] x += ft[n] * T(2) * std::cos(T(i) * at[n]); } oarr[i] = x; diff --git a/src/finufft.cpp b/src/finufft.cpp index cf9dfbe61..b06baaff5 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -1170,8 +1170,11 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { #pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? for (int i = 0; i < thisBatchSize; i++) { BIGINT ioff = i * p->nj; - for (BIGINT j = 0; j < p->nj; ++j) + for (BIGINT j = 0; j < p->nj; ++j) { p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; + printf("[finufft] p->CpBatch[%ld] = %.16g | %.16gi\n", j, real(p->CpBatch[j]), + imag(p->CpBatch[j])); // debug + } } t_pre += timer.elapsedsec(); @@ -1181,9 +1184,12 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed t_spr += timer.elapsedsec(); - // for (int j=0;jnf1;++j) - // printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]); // - // debug + for (int j = p->nf1 * p->nf2; j < p->nf1 * p->nf2 * 5; ++j) { + if (p->fwBatch[j].real() != 0.0 || p->fwBatch[j].imag() != 0.0) + printf("[finufft] fw[%d]=%.16g+%.16gi\n", j, p->fwBatch[j].real(), + p->fwBatch[j].imag()); // + // debug + } // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... 
timer.restart(); diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 1660b4483..7ec71b9ed 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -18,6 +18,7 @@ foreach(srcfile ${test_src}) message(STATUS "Adding test ${executable}" " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}") + enable_asan(${executable}) endforeach() function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index 161531d28..96631687c 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -85,7 +85,7 @@ auto almost_equal(V *d_vec, T *cpu, const std::size_t size, } std::cout << std::setprecision(6); } - std::cout << "relerrtwonorm: " << infnorm(h_vec.data(), cpu, size) << std::endl; + std::cout << "relerrtwonorm: " << relerrtwonorm(h_vec.data(), cpu, size) << std::endl; // compare the l2 norm of the difference between the two vectors if (relerrtwonorm(h_vec.data(), cpu, size) < tol) { return true; @@ -101,27 +101,31 @@ int main() { cufinufft_opts opts; cufinufft_default_opts(&opts); opts.debug = 2; - opts.upsampfac = 2.0; + opts.upsampfac = 2.00; opts.gpu_kerevalmeth = 1; + opts.gpu_method = 1; + opts.gpu_sort = 0; finufft_opts fin_opts; finufft_default_opts(&fin_opts); fin_opts.debug = opts.debug; fin_opts.spread_kerevalmeth = opts.gpu_kerevalmeth; fin_opts.upsampfac = opts.upsampfac; + fin_opts.spread_sort = opts.gpu_sort; const int iflag = 1; const int ntransf = 1; const int dim = 3; - const double tol = 1e-12; - const int n_modes[] = {10, 5, 3}; - const int N = n_modes[0] * n_modes[1] * n_modes[2]; - const int M = 1000; - const double bandwidth = 1.0; + const double tol = 1e-13; + const int n_modes[] = {5, 4, 3}; + const int N = n_modes[0] * (dim > 1 ? n_modes[1] : 1) * (dim > 2 ? 
n_modes[2] : 1); + const int M = 13; + const double bandwidth = 1.0; thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), s(N * ntransf), t(N * ntransf), u(N * ntransf); thrust::host_vector> c(M * ntransf), fk(N * ntransf); - thrust::device_vector d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{}; + thrust::device_vector d_x(M * ntransf), d_y(M * ntransf), d_z(M * ntransf), + d_s(N * ntransf), d_t(N * ntransf), d_u(N * ntransf); thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); std::default_random_engine eng(42); @@ -132,17 +136,17 @@ int main() { // Making data for (int64_t i = 0; i < M; i++) { - x[i] = M_PI * rand_util_11() + 4; // x in [-pi,pi) - y[i] = M_PI * rand_util_11() + 4; - z[i] = M_PI * rand_util_11() + 4; + x[i] = M_PI * rand_util_11(); // x in [-pi,pi) + y[i] = M_PI * rand_util_11(); + z[i] = M_PI * rand_util_11(); } for (int64_t i = 0; i < N; i++) { - s[i] = M_PI * rand_util_11() * bandwidth + 8; // shifted so D1 is 8 - t[i] = M_PI * rand_util_11() * bandwidth + 8; // shifted so D2 is 8 - u[i] = M_PI * rand_util_11() * bandwidth + 8; // shifted so D3 is 8 + s[i] = M_PI * rand_util_11() * bandwidth; // shifted so D1 is 8 + t[i] = M_PI * rand_util_11() * bandwidth; // shifted so D2 is 8 + u[i] = M_PI * rand_util_11() * bandwidth; // shifted so D3 is 8 } - const double deconv_tol = std::numeric_limits::epsilon() * bandwidth * 100; + const double deconv_tol = std::numeric_limits::epsilon() * bandwidth * 1000; for (int64_t i = M; i < M * ntransf; ++i) { int64_t j = i % M; @@ -208,7 +212,7 @@ int main() { (cuda_complex *)d_fk.data().get(), plan); finufft_execute(cpu_plan, (std::complex *)c.data(), (std::complex *)fk.data()); std::cout << "type " << type << ": "; - assert(almost_equal(d_fk.data().get(), fk.data(), N, tol)); + assert(almost_equal(d_fk.data().get(), fk.data(), N, tol * 10)); assert(cufinufft_destroy_impl(plan) == 0); assert(finufft_destroy(cpu_plan) == 0); cudaDeviceSynchronize(); @@ -270,29 +274,29 @@ int main() { 
d_t.data().get(), d_u.data().get(), plan) == 0); cudaDeviceSynchronize(); assert(plan->type3_params.X1 == cpu_plan->t3P.X1); - assert(plan->type3_params.X2 == cpu_plan->t3P.X2); - assert(plan->type3_params.X3 == cpu_plan->t3P.X3); + if (dim > 1) assert(plan->type3_params.X2 == cpu_plan->t3P.X2); + if (dim > 2) assert(plan->type3_params.X3 == cpu_plan->t3P.X3); assert(plan->type3_params.C1 == cpu_plan->t3P.C1); - assert(plan->type3_params.C2 == cpu_plan->t3P.C2); - assert(plan->type3_params.C3 == cpu_plan->t3P.C3); + if (dim > 1) assert(plan->type3_params.C2 == cpu_plan->t3P.C2); + if (dim > 2) assert(plan->type3_params.C3 == cpu_plan->t3P.C3); assert(plan->type3_params.D1 == cpu_plan->t3P.D1); - assert(plan->type3_params.D2 == cpu_plan->t3P.D2); - assert(plan->type3_params.D3 == cpu_plan->t3P.D3); + if (dim > 1) assert(plan->type3_params.D2 == cpu_plan->t3P.D2); + if (dim > 2) assert(plan->type3_params.D3 == cpu_plan->t3P.D3); assert(plan->type3_params.gam1 == cpu_plan->t3P.gam1); - assert(plan->type3_params.gam2 == cpu_plan->t3P.gam2); - assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); + if (dim > 1) assert(plan->type3_params.gam2 == cpu_plan->t3P.gam2); + if (dim > 2) assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); assert(plan->type3_params.h1 == cpu_plan->t3P.h1); - assert(plan->type3_params.h2 == cpu_plan->t3P.h2); - assert(plan->type3_params.h3 == cpu_plan->t3P.h3); + if (dim > 1) assert(plan->type3_params.h2 == cpu_plan->t3P.h2); + if (dim > 2) assert(plan->type3_params.h3 == cpu_plan->t3P.h3); assert(plan->nf1 == cpu_plan->nf1); - assert(plan->nf2 == cpu_plan->nf2); - assert(plan->nf3 == cpu_plan->nf3); + if (dim > 1) assert(plan->nf2 == cpu_plan->nf2); + if (dim > 2) assert(plan->nf3 == cpu_plan->nf3); assert(equal(plan->kx, cpu_plan->X, M)); - assert(equal(plan->ky, cpu_plan->Y, M)); - assert(equal(plan->kz, cpu_plan->Z, M)); + if (dim > 1) assert(equal(plan->ky, cpu_plan->Y, M)); + if (dim > 2) assert(equal(plan->kz, cpu_plan->Z, M)); 
assert(equal(plan->d_s, cpu_plan->Sp, N)); - assert(equal(plan->d_t, cpu_plan->Tp, N)); - assert(equal(plan->d_u, cpu_plan->Up, N)); + if (dim > 1) assert(equal(plan->d_t, cpu_plan->Tp, N)); + if (dim > 2) assert(equal(plan->d_u, cpu_plan->Up, N)); assert(plan->spopts.nspread == cpu_plan->spopts.nspread); assert(plan->spopts.upsampfac == cpu_plan->spopts.upsampfac); assert(plan->spopts.ES_beta == cpu_plan->spopts.ES_beta); From 066906e6c4a3925a6cd077b572a37ab28fef2d17 Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Thu, 22 Aug 2024 22:00:33 -0400 Subject: [PATCH 48/68] adding debug prints --- CMakeLists.txt | 10 +--- include/cufinufft/impl.h | 21 ++++---- include/cufinufft/utils.h | 80 +++++++++++++++---------------- src/cuda/1d/cufinufft1d.cu | 4 ++ src/cuda/2d/cufinufft2d.cu | 4 ++ src/cuda/3d/cufinufft3d.cu | 38 ++++++++------- src/cuda/memtransfer_wrapper.cu | 30 +++++++----- src/finufft.cpp | 17 +++---- test/cuda/cufinufft_type3_test.cu | 27 +++++------ 9 files changed, 119 insertions(+), 112 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 957fa14db..3f846880a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,8 @@ cmake_policy(SET CMP0091 NEW) include(CMakeDependentOption) +set(CMAKE_CUDA_ARCHITECTURES native) + # cmake-format: off # All options go here sphinx tag (don't remove): @cmake_opts_start option(FINUFFT_BUILD_FORTRAN "Whether to build the FINUFFT Fortran examples" OFF) @@ -281,14 +283,6 @@ if(FINUFFT_USE_CPU) endif() if(FINUFFT_USE_CUDA) - if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - message( - "FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to 'native'" - ) - message( - "See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply." 
- ) - endif() enable_language(CUDA) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 24f6f1c7d..12379d40c 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -108,7 +108,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->mu = nmodes[2]; } else { d_plan->opts.gpu_spreadinterponly = 1; - d_plan->opts.gpu_method = 1; } int fftsign = (iflag >= 0) ? 1 : -1; @@ -663,7 +662,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ std::array fseries_precomp_f{}; thrust::device_vector d_fseries_precomp_a(3 * MAX_NQUAD); thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); - thrust::device_vector phi_hat1{}, phi_hat2{}, phi_hat3{}; + thrust::device_vector phi_hat1, phi_hat2, phi_hat3; if (d_plan->dim > 0) { phi_hat1.resize(N); } @@ -673,17 +672,17 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ if (d_plan->dim > 2) { phi_hat3.resize(N); } - onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f.data(), + onedim_fseries_kernel_precomp(0, fseries_precomp_f.data(), fseries_precomp_a.data(), d_plan->spopts); if (d_plan->dim > 1) { - onedim_fseries_kernel_precomp( - d_plan->nf2, fseries_precomp_f.data() + MAX_NQUAD, - fseries_precomp_a.data() + MAX_NQUAD, d_plan->spopts); + onedim_fseries_kernel_precomp(0, fseries_precomp_f.data() + MAX_NQUAD, + fseries_precomp_a.data() + MAX_NQUAD, + d_plan->spopts); } if (d_plan->dim > 2) { - onedim_fseries_kernel_precomp( - d_plan->nf3, fseries_precomp_f.data() + 2 * MAX_NQUAD, - fseries_precomp_a.data() + 2 * MAX_NQUAD, d_plan->spopts); + onedim_fseries_kernel_precomp(0, fseries_precomp_f.data() + 2 * MAX_NQUAD, + fseries_precomp_a.data() + 2 * MAX_NQUAD, + d_plan->spopts); } // copy the precomputed data to the device using thrust thrust::copy(fseries_precomp_a.begin(), fseries_precomp_a.end(), @@ -771,7 +770,7 @@ int 
cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3}; cufinufft_opts t2opts = d_plan->opts; t2opts.gpu_spreadinterponly = 0; - + t2opts.gpu_method = 1; // Safe to ignore the return value here? if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan); // check that maxbatchsize is correct @@ -790,8 +789,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ __func__); goto finalize; } - return 0; } + return 0; finalize: cufinufft_destroy_impl(d_plan); return FINUFFT_ERR_CUDA_FAILURE; diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 665b1d099..377ce29e6 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -36,6 +36,46 @@ __inline__ __device__ double atomicAdd(double *address, double val) { } #endif +#ifdef __CUDA_ARCH__ +__forceinline__ __device__ auto interval(const int ns, const float x) { + // float to int round up and fused multiply-add to round up + const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); + // float to int round down and fused multiply-add to round down + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); + return int2{xstart, xend}; +} +__forceinline__ __device__ auto interval(const int ns, const double x) { + // same as above + const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x)); + const auto xend = __double2int_rd(__fma_rd(ns, .5, x)); + return int2{xstart, xend}; +} +#endif + +// Define a macro to check if NVCC version is >= 11.3 +#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) +#if (__CUDACC_VER_MAJOR__ > 11) || \ + (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600) +#define ALLOCA_SUPPORTED 1 +// windows compatibility +#if __has_include() +#include +#endif +#endif +#endif + +#undef ALLOCA_SUPPORTED + +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ >= 900 +#define COMPUTE_CAPABILITY_90_OR_HIGHER 1 +#else +#define 
COMPUTE_CAPABILITY_90_OR_HIGHER 0 +#endif +#else +#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 +#endif + namespace cufinufft { namespace utils { class WithCudaDevice { @@ -79,46 +119,6 @@ template T infnorm(int n, std::complex *a) { return sqrt(nrm); } -#ifdef __CUDA_ARCH__ -__forceinline__ __device__ auto interval(const int ns, const float x) { - // float to int round up and fused multiply-add to round up - const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); - // float to int round down and fused multiply-add to round down - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); - return int2{xstart, xend}; -} -__forceinline__ __device__ auto interval(const int ns, const double x) { - // same as above - const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x)); - const auto xend = __double2int_rd(__fma_rd(ns, .5, x)); - return int2{xstart, xend}; -} -#endif - -// Define a macro to check if NVCC version is >= 11.3 -#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) -#if (__CUDACC_VER_MAJOR__ > 11) || \ - (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600) -#define ALLOCA_SUPPORTED 1 -// windows compatibility -#if __has_include() -#include -#endif -#endif -#endif - -#undef ALLOCA_SUPPORTED - -#if defined(__CUDA_ARCH__) -#if __CUDA_ARCH__ >= 900 -#define COMPUTE_CAPABILITY_90_OR_HIGHER 1 -#else -#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 -#endif -#else -#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 -#endif - /** * does a complex atomic add on a shared memory address * it adds the real and imaginary parts separately diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index d94dc3cea..ed3c038e5 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -146,6 +146,10 @@ int cufinufft1d3_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M; // setting output for spreader d_plan->fk = d_plan->fw; + if ((ier = 
checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), + stream)))) + return ier; // NOTE: fw might need to be set to 0 // Step 0: pre-phase the input strengths for (int i = 0; i < blksize; i++) { diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index 9bcd8d370..91da10e29 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -146,6 +146,10 @@ int cufinufft2d3_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M; // setting output for spreader d_plan->fk = d_plan->fw; + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), + stream)))) + return ier; // NOTE: fw might need to be set to 0 // Step 0: pre-phase the input strengths for (int i = 0; i < blksize; i++) { diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 19654d756..3e78fc08c 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -43,11 +43,9 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - if ((ier = checkCudaErrors( - cudaMemsetAsync(d_plan->fw, 0, - d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * - d_plan->nf3 * sizeof(cuda_complex), - stream)))) + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), + stream)))) return ier; // Step 1: Spread @@ -147,7 +145,7 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->fk = d_plan->fw; // NOTE: fw might need to be set to 0 if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fk, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), + d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), stream)))) return ier; // Step 0: pre-phase the input strengths @@ -157,25 +155,29 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, 
d_plan->c + i * d_plan->M, thrust::multiplies>()); } // use thrust to print d_plan->c - thrust::for_each( - thrust::cuda::par.on(stream), d_plan->c, d_plan->c + blksize * d_plan->M, - [] __host__ __device__(cuda_complex & x) { - printf("[cufinufft] d_plan->cBatch = %0.16g | %0.16g\n", x.x, x.y); - }); + // thrust::for_each( + // thrust::cuda::par.on(stream), d_plan->c, d_plan->c + blksize * d_plan->M, + // [] __host__ __device__(cuda_complex & x) { + // printf("[cufinufft] d_plan->cBatch = %0.16g | %0.16g\n", x.x, x.y); + // }); // Step 1: Spread + if ((ier = cuspread3d(d_plan, blksize))) return ier; // now d_plan->fk = d_plan->fw contains the spread values - thrust::for_each(thrust::cuda::par.on(stream), d_plan->fw + d_plan->nf1 * d_plan->nf2, - d_plan->fw + d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * 5, - [] __host__ __device__(cuda_complex & x) { - if (x.x != 0 || x.y != 0) - printf("[cufinufft] d_plan->fw = %0.16g | %0.16g\n", x.x, x.y); - }); + // thrust::for_each(thrust::cuda::par.on(stream), d_plan->fw + d_plan->nf1 * + // d_plan->nf2, + // d_plan->fw + d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * + // 5, + // [] __host__ __device__(cuda_complex & x) { + // if (x.x != 0 || x.y != 0) + // printf("[cufinufft] d_plan->fw = %0.16g | %0.16g\n", x.x, + // x.y); + // }); // Step 2: Type 3 NUFFT // type 2 goes from fk to c // saving the results directly in the user output array d_fk // it needs to do blksize transforms - // d_plan->t2_plan->ntransf = blksize; + d_plan->t2_plan->ntransf = blksize; if ((ier = cufinufft3d2_exec(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier; // Step 3: deconvolve // now we need to d_fk = d_fk*d_plan->deconv diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index 4584bd8ff..2d30da22c 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -20,9 +20,9 @@ int allocgpumem1d_plan(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice 
device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; + const auto stream = d_plan->stream; - int ier; + int ier{0}; int nf1 = d_plan->nf1; int maxbatchsize = d_plan->maxbatchsize; @@ -90,8 +90,8 @@ int allocgpumem1d_nupts(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; + const auto stream = d_plan->stream; + int ier{0}; int M = d_plan->M; CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools); @@ -135,8 +135,8 @@ int allocgpumem2d_plan(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; + const auto stream = d_plan->stream; + int ier{0}; int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; @@ -213,8 +213,8 @@ int allocgpumem2d_nupts(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; + const auto stream = d_plan->stream; + int ier{0}; const int M = d_plan->M; @@ -258,8 +258,8 @@ int allocgpumem3d_plan(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; + const auto stream = d_plan->stream; + int ier{0}; int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; @@ -360,7 +360,11 @@ int allocgpumem3d_plan(cufinufft_plan_t *d_plan) } finalize: - if (ier) freegpumemory(d_plan); + if (ier) { + std::cerr << "[allocgpumem3d_plan] error:" + << cudaGetErrorString(static_cast(ier)) << std::endl; + freegpumemory(d_plan); + } return ier; } @@ -374,8 +378,8 @@ int allocgpumem3d_nupts(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; + const auto stream = d_plan->stream; + int ier{0}; int M = d_plan->M; CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools); diff --git a/src/finufft.cpp b/src/finufft.cpp 
index b06baaff5..2ee61f1be 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -1172,8 +1172,9 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { BIGINT ioff = i * p->nj; for (BIGINT j = 0; j < p->nj; ++j) { p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; - printf("[finufft] p->CpBatch[%ld] = %.16g | %.16gi\n", j, real(p->CpBatch[j]), - imag(p->CpBatch[j])); // debug + // printf("[finufft] ??p->CpBatch[%ld] = %.16g | %.16gi\n", j, + // real(p->CpBatch[j]), + // imag(p->CpBatch[j])); // debug } } t_pre += timer.elapsedsec(); @@ -1184,12 +1185,12 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed t_spr += timer.elapsedsec(); - for (int j = p->nf1 * p->nf2; j < p->nf1 * p->nf2 * 5; ++j) { - if (p->fwBatch[j].real() != 0.0 || p->fwBatch[j].imag() != 0.0) - printf("[finufft] fw[%d]=%.16g+%.16gi\n", j, p->fwBatch[j].real(), - p->fwBatch[j].imag()); // - // debug - } + // for (int j = p->nf1 * p->nf2; j < p->nf1 * p->nf2 * 5; ++j) { + // if (p->fwBatch[j].real() != 0.0 || p->fwBatch[j].imag() != 0.0) + // printf("[finufft] fw[%d]=%.16g+%.16gi\n", j, p->fwBatch[j].real(), + // p->fwBatch[j].imag()); // + // // debug + // } // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... 
timer.restart(); diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index 96631687c..a9b4d03ff 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -56,7 +56,7 @@ T relerrtwonorm(std::complex *a, std::complex *b, const std::size_t n) { nrm += std::real(std::conj(a[m]) * a[m]); const auto diff = a[m] - b[m]; auto this_err = std::real(std::conj(diff) * diff); - if (this_err > 1e-9) { + if (this_err > 1e-12) { std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; std::cout << "diff: " << diff << " this_err: " << this_err << std::endl; } @@ -85,12 +85,11 @@ auto almost_equal(V *d_vec, T *cpu, const std::size_t size, } std::cout << std::setprecision(6); } - std::cout << "relerrtwonorm: " << relerrtwonorm(h_vec.data(), cpu, size) << std::endl; + const auto error = relerrtwonorm(h_vec.data(), cpu, size); + std::cout << "relerrtwonorm: " << error << std::endl; + ; // compare the l2 norm of the difference between the two vectors - if (relerrtwonorm(h_vec.data(), cpu, size) < tol) { - return true; - } - return false; + return (error < tol); } int main() { @@ -102,9 +101,9 @@ int main() { cufinufft_default_opts(&opts); opts.debug = 2; opts.upsampfac = 2.00; - opts.gpu_kerevalmeth = 1; + opts.gpu_kerevalmeth = 0; opts.gpu_method = 1; - opts.gpu_sort = 0; + opts.gpu_sort = 1; finufft_opts fin_opts; finufft_default_opts(&fin_opts); fin_opts.debug = opts.debug; @@ -114,10 +113,10 @@ int main() { const int iflag = 1; const int ntransf = 1; const int dim = 3; - const double tol = 1e-13; - const int n_modes[] = {5, 4, 3}; + const double tol = 1e-15; + const int n_modes[] = {5, 4, 2}; const int N = n_modes[0] * (dim > 1 ? n_modes[1] : 1) * (dim > 2 ? 
n_modes[2] : 1); - const int M = 13; + const int M = 15; const double bandwidth = 1.0; thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), @@ -322,7 +321,7 @@ int main() { (cuda_complex *)d_fk.data().get(), plan); finufft_execute(cpu_plan, c.data(), fk.data()); cudaDeviceSynchronize(); - assert(almost_equal(d_fk.data().get(), fk.data(), N, tol, false)); + assert(almost_equal(d_fk.data().get(), fk.data(), N, tol * 10, false)); assert(cufinufft_destroy_impl(plan) == 0); assert(finufft_destroy(cpu_plan) == 0); plan = nullptr; @@ -331,8 +330,8 @@ int main() { // testing correctness of the plan creation // cufinufft_plan_t *single_plan{nullptr}; cufinufft_plan_t *double_plan{nullptr}; - test_type1(double_plan); - test_type2(double_plan); + // test_type1(double_plan); + // test_type2(double_plan); test_type3(double_plan); return 0; } From 6da956bc3a5bd5e4039522d9c422773b303509dc Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 23 Aug 2024 10:46:33 -0400 Subject: [PATCH 49/68] testing inner plan2 & using cudamemcpyasync --- include/cufinufft/utils.h | 9 +++--- test/cuda/cufinufft_makeplan_impl.cu | 4 --- test/cuda/cufinufft_type3_test.cu | 47 ++++++++++++++++++++++++---- 3 files changed, 45 insertions(+), 15 deletions(-) diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 377ce29e6..0c69529c7 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -3,7 +3,6 @@ // octave (mkoctfile) needs this otherwise it doesn't know what int64_t is! 
#include -#include #include #include @@ -155,10 +154,10 @@ template auto arrayrange(int n, T *a, cudaStream_t stream) { // copy d_min and d_max to host T min{}, max{}; - checkCudaErrors(cudaMemcpy(&min, thrust::raw_pointer_cast(d_min_max.first), sizeof(T), - cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(&max, thrust::raw_pointer_cast(d_min_max.second), sizeof(T), - cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpyAsync(&min, thrust::raw_pointer_cast(d_min_max.first), + sizeof(T), cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaMemcpyAsync(&max, thrust::raw_pointer_cast(d_min_max.second), + sizeof(T), cudaMemcpyDeviceToHost, stream)); return std::make_tuple(min, max); } diff --git a/test/cuda/cufinufft_makeplan_impl.cu b/test/cuda/cufinufft_makeplan_impl.cu index b5fa039ad..53ee26565 100644 --- a/test/cuda/cufinufft_makeplan_impl.cu +++ b/test/cuda/cufinufft_makeplan_impl.cu @@ -56,9 +56,6 @@ int main() { std::vector fwkerhalf_host(size, -1); const auto ier = cudaMemcpy(fwkerhalf_host.data(), fwkerhalf[idx], size * sizeof(T), cudaMemcpyDeviceToHost); - if (ier != cudaSuccess) { - std::cerr << "Error: " << cudaGetErrorString(ier) << std::endl; - } assert(ier == cudaSuccess); for (int i = 0; i < size; i++) { assert(abs(1 - fwkerhalf_host[i] / phiHat[idx][i]) < tol); @@ -134,7 +131,6 @@ int main() { assert(plan->fwkerhalf3 == nullptr); assert(plan->spopts.spread_direction == type); assert(plan->type == type); - // assert(plan->opts.gpu_method == 0); assert(plan->opts.upsampfac == 1.25); assert(cufinufft_destroy_impl(plan) == 0); plan = nullptr; diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index a9b4d03ff..abe0e586c 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -87,7 +87,6 @@ auto almost_equal(V *d_vec, T *cpu, const std::size_t size, } const auto error = relerrtwonorm(h_vec.data(), cpu, size); std::cout << "relerrtwonorm: " << error << std::endl; - ; // compare the 
l2 norm of the difference between the two vectors return (error < tol); } @@ -103,7 +102,7 @@ int main() { opts.upsampfac = 2.00; opts.gpu_kerevalmeth = 0; opts.gpu_method = 1; - opts.gpu_sort = 1; + opts.gpu_sort = 0; finufft_opts fin_opts; finufft_default_opts(&fin_opts); fin_opts.debug = opts.debug; @@ -113,7 +112,7 @@ int main() { const int iflag = 1; const int ntransf = 1; const int dim = 3; - const double tol = 1e-15; + const double tol = 1e-13; const int n_modes[] = {5, 4, 2}; const int N = n_modes[0] * (dim > 1 ? n_modes[1] : 1) * (dim > 2 ? n_modes[2] : 1); const int M = 15; @@ -307,6 +306,43 @@ int main() { std::cout << "deconv :\n"; assert(almost_equal(plan->deconv, cpu_plan->deconv, N, deconv_tol)); + assert(plan->t2_plan->nf1 == cpu_plan->innerT2plan->nf1); + if (dim > 1) assert(plan->t2_plan->nf2 == cpu_plan->innerT2plan->nf2); + if (dim > 2) assert(plan->t2_plan->nf3 == cpu_plan->innerT2plan->nf3); + assert(plan->t2_plan->spopts.nspread == cpu_plan->innerT2plan->spopts.nspread); + assert(plan->t2_plan->spopts.upsampfac == cpu_plan->innerT2plan->spopts.upsampfac); + assert(plan->t2_plan->spopts.ES_beta == cpu_plan->innerT2plan->spopts.ES_beta); + assert( + plan->t2_plan->spopts.ES_halfwidth == cpu_plan->innerT2plan->spopts.ES_halfwidth); + assert(plan->t2_plan->spopts.ES_c == cpu_plan->innerT2plan->spopts.ES_c); + + int nf[] = {plan->t2_plan->nf1, plan->t2_plan->nf2, plan->t2_plan->nf3}; + T *fwkerhalf[] = {plan->t2_plan->fwkerhalf1, plan->t2_plan->fwkerhalf2, + plan->t2_plan->fwkerhalf3}; + T *phiHat[] = {cpu_plan->innerT2plan->phiHat1, cpu_plan->innerT2plan->phiHat2, + cpu_plan->innerT2plan->phiHat3}; + for (int idx = 0; idx < dim; ++idx) { + std::cout << "nf[" << idx << "]: " << nf[idx] << std::endl; + const auto size = (nf[idx] / 2 + 1); + std::vector fwkerhalf_host(size, -1); + const auto ier = cudaMemcpy(fwkerhalf_host.data(), fwkerhalf[idx], size * sizeof(T), + cudaMemcpyDeviceToHost); + if (ier != cudaSuccess) { + std::cerr << "Error: " << 
cudaGetErrorString(ier) << std::endl; + } + assert(ier == cudaSuccess); + cudaDeviceSynchronize(); + for (int i = 0; i < size; i++) { + const auto error = abs(1 - fwkerhalf_host[i] / phiHat[idx][i]); + if (error > tol) { + std::cout << "fwkerhalf[" << idx << "][" << i << "]: " << fwkerhalf_host[i] + << " phiHat[" << idx << "][" << i << "]: " << phiHat[idx][i] + << std::endl; + std::cout << "error: " << error << std::endl; + } + assert(error < tol * 100); + } + } for (int i = 0; i < M; i++) { c[i].real(randm11()); c[i].imag(randm11()); @@ -316,7 +352,6 @@ int main() { // fk[i] = {randm11(), randm11()}; // } // d_fk = fk; - cudaDeviceSynchronize(); cufinufft_execute_impl((cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), plan); finufft_execute(cpu_plan, c.data(), fk.data()); @@ -330,8 +365,8 @@ int main() { // testing correctness of the plan creation // cufinufft_plan_t *single_plan{nullptr}; cufinufft_plan_t *double_plan{nullptr}; - // test_type1(double_plan); - // test_type2(double_plan); + test_type1(double_plan); + test_type2(double_plan); test_type3(double_plan); return 0; } From 6098edccac61659c732e7f7c60ebf1e83e26258b Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Mon, 26 Aug 2024 19:21:36 -0400 Subject: [PATCH 50/68] testing the intter type 2 completely --- CMakeLists.txt | 4 +-- include/cufinufft/impl.h | 16 +++++++---- src/cuda/2d/spreadinterp2d.cuh | 4 +-- src/cuda/3d/cufinufft3d.cu | 29 +++++++++---------- src/cuda/3d/interp3d_wrapper.cu | 2 +- src/cuda/3d/spreadinterp3d.cuh | 10 +++---- src/finufft.cpp | 30 ++++++++++++++++++- test/cuda/cufinufft_type3_test.cu | 48 ++++++++++++++++++++----------- 8 files changed, 95 insertions(+), 48 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f846880a..929289b12 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,8 +125,8 @@ set(FINUFFT_PRECISION_DEPENDENT_SOURCES # set linker flags for sanitizer set(FINUFFT_SANITIZER_FLAGS) if(FINUFFT_ENABLE_SANITIZERS) - 
set(FINUFFT_SANITIZER_FLAGS -fsanitize=address -fsanitize=undefined - -fsanitize=bounds-strict /fsanitize=address /RTC1) + set(FINUFFT_SANITIZER_FLAGS -fsanitize=undefined -fsanitize=bounds-strict + /fsanitize=address /RTC1) filter_supported_compiler_flags(FINUFFT_SANITIZER_FLAGS FINUFFT_SANITIZER_FLAGS) set(FINUFFT_SANITIZER_FLAGS diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 12379d40c..4f76e45c8 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -106,6 +106,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->ms = nmodes[0]; d_plan->mt = nmodes[1]; d_plan->mu = nmodes[2]; + printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu); } else { d_plan->opts.gpu_spreadinterponly = 1; } @@ -231,6 +232,11 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->nf1 = nf1; d_plan->nf2 = nf2; d_plan->nf3 = nf3; + d_plan->nf = nf1 * nf2 * nf3; + if (d_plan->opts.debug) { + printf("[cufinufft] (nf1,nf2,nf3) = (%d, %d, %d)\n", d_plan->nf1, d_plan->nf2, + d_plan->nf3); + } using namespace cufinufft::memtransfer; switch (d_plan->dim) { @@ -423,6 +429,9 @@ Notes: the type T means either single or double, matching the } break; } + if (d_plan->opts.debug) { + printf("[cufinufft] plan->M=%d\n", M); + } return ier; } @@ -561,7 +570,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ if (checked_realloc(d_plan->ky, sizeof(T) * M) != cudaSuccess) goto finalize; if (checked_realloc(d_plan->d_t, sizeof(T) * N) != cudaSuccess) goto finalize; } - if (d_plan->dim > 1) { + if (d_plan->dim > 2) { if (checked_realloc(d_plan->kz, sizeof(T) * M) != cudaSuccess) goto finalize; if (checked_realloc(d_plan->d_u, sizeof(T) * N) != cudaSuccess) goto finalize; } @@ -570,14 +579,9 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ if (checked_realloc(d_plan->deconv, sizeof(cuda_complex) * N) 
!= cudaSuccess) goto finalize; - // should not be needed - // cudaStreamSynchronize(stream); - // NOTE: init-captures are not allowed for extended __host__ __device__ lambdas if (d_plan->dim > 0) { - // TODO: merging the tree calls to GPU into one as in the version below might - // might be more readable and faster const auto ig1 = T(1) / d_plan->type3_params.gam1; const auto C1 = -d_plan->type3_params.C1; thrust::transform( diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 53a243e7e..805e921aa 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -47,9 +47,9 @@ __global__ void spread_2d_nupts_driven( } for (auto yy = ystart; yy <= yend; yy++) { + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); for (auto xx = xstart; xx <= xend; xx++) { const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); const auto outidx = ix + iy * nf1; const auto kervalue1 = ker1[xx - xstart]; const auto kervalue2 = ker2[yy - ystart]; @@ -240,9 +240,9 @@ __global__ void interp_2d_nupts_driven( cuda_complex cnow{0, 0}; for (int yy = ystart; yy <= yend; yy++) { const T kervalue2 = ker2[yy - ystart]; + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); for (int xx = xstart; xx <= xend; xx++) { const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); const auto inidx = ix + iy * nf1; const auto kervalue1 = ker1[xx - xstart]; cnow.x += fw[inidx].x * kervalue1 * kervalue2; diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 3e78fc08c..0721d0300 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -154,31 +154,30 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M, d_plan->c + i * d_plan->M, thrust::multiplies>()); } - // use thrust to print d_plan->c - // thrust::for_each( - // thrust::cuda::par.on(stream), d_plan->c, d_plan->c + blksize * d_plan->M, - // [] __host__ __device__(cuda_complex & x) { - // printf("[cufinufft] d_plan->cBatch = %0.16g | %0.16g\n", x.x, x.y); - // }); + // Step 1: Spread if ((ier = cuspread3d(d_plan, blksize))) return ier; // now d_plan->fk = d_plan->fw contains the spread values - // thrust::for_each(thrust::cuda::par.on(stream), d_plan->fw + d_plan->nf1 * - // d_plan->nf2, - // d_plan->fw + d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * - // 5, - // [] __host__ __device__(cuda_complex & x) { - // if (x.x != 0 || x.y != 0) - // printf("[cufinufft] d_plan->fw = %0.16g | %0.16g\n", x.x, - // x.y); - // }); + // Step 2: Type 3 NUFFT + // type 2 goes from fk to c // saving the results directly in the user output array d_fk // it needs to do blksize transforms d_plan->t2_plan->ntransf = blksize; if ((ier = cufinufft3d2_exec(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier; + // print d_fk using thrust on the GPU + // create a host vector to store the results + // copy d_fk to host + // print the results + // std::vector> h_fk(d_plan->N); + // cudaMemcpyAsync(h_fk.data(), d_fkstart, d_plan->N * sizeof(cuda_complex), + // cudaMemcpyDeviceToHost, stream); + // for (int i = 0; i < d_plan->N; i++) { + // printf("[cufinufft] d_fk = %.16g %.16g\n", h_fk[i].x, h_fk[i].y); + // } + // Step 3: deconvolve // now we need to d_fk = d_fk*d_plan->deconv for (int i = 0; i 
< blksize; i++) { diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu index 91379d3ae..2aaa4914e 100644 --- a/src/cuda/3d/interp3d_wrapper.cu +++ b/src/cuda/3d/interp3d_wrapper.cu @@ -50,7 +50,7 @@ int cuinterp3d(cufinufft_plan_t *d_plan, int blksize) template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; + const auto stream = d_plan->stream; dim3 threadsPerBlock; dim3 blocks; diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index 59b4661ff..b7d5f26ab 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -455,9 +455,9 @@ __global__ void interp_3d_nupts_driven( const auto [ystart, yend] = interval(ns, y_rescaled); const auto [zstart, zend] = interval(ns, z_rescaled); - const auto x1 = T(xstart) - x_rescaled; - const auto y1 = T(ystart) - y_rescaled; - const auto z1 = T(zstart) - z_rescaled; + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + const T z1 = T(zstart) - z_rescaled; cuda_complex cnow{0, 0}; @@ -478,8 +478,8 @@ __global__ void interp_3d_nupts_driven( const auto kervalue2 = ker2[yy - ystart]; int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); for (int xx = xstart; xx <= xend; xx++) { - const int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - const int inidx = ix + iy * nf1 + iz * nf2 * nf1; + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); + const auto inidx = ix + iy * nf1 + iz * nf2 * nf1; const auto kervalue1 = ker1[xx - xstart]; cnow.x += fw[inidx].x * kervalue1 * kervalue2 * kervalue3; cnow.y += fw[inidx].y * kervalue1 * kervalue2 * kervalue3; diff --git a/src/finufft.cpp b/src/finufft.cpp index 2ee61f1be..b3bd0efda 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -975,6 +975,30 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < // pi/R } + // // print Sp, Tp, Up + // for (BIGINT k = 0; k < nk; ++k) { + // printf("Sp[%lld] = %.16g\n", (long long)k, p->Sp[k]); + // } + // for (BIGINT k = 0; k < nk; ++k) { + // printf("Tp[%lld] = %.16g\n", (long long)k, p->Tp[k]); + // } + // for (BIGINT k = 0; k < nk; ++k) { + // printf("Up[%lld] = %.16g\n", (long long)k, p->Up[k]); + // } + // // print min, max of Sp, Tp, Up + // FLT minSp = p->Sp[0], maxSp = p->Sp[0]; + // FLT minTp = p->Tp[0], maxTp = p->Tp[0]; + // FLT minUp = p->Up[0], maxUp = p->Up[0]; + // for (BIGINT k = 0; k < nk; ++k) { + // if (p->Sp[k] < minSp) minSp = p->Sp[k]; + // if (p->Sp[k] > maxSp) maxSp = p->Sp[k]; + // if (p->Tp[k] < minTp) minTp = p->Tp[k]; + // if (p->Tp[k] > maxTp) maxTp = p->Tp[k]; + // if (p->Up[k] < minUp) minUp = p->Up[k]; + // if (p->Up[k] > maxUp) maxUp = p->Up[k]; + // } + // printf("minSp = %.16g, maxSp = %.16g\n", minSp, maxSp); + // #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) // for (BIGINT k = 0; k < nk; ++k) { // p->Sp[k] = s[k]; @@ -1200,7 +1224,11 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { still the same size, as Andrea explained; just wastes a few flops) */ FINUFFT_EXECUTE(p->innerT2plan, fkb, p->fwBatch); t_t2 += timer.elapsedsec(); - + // for (int j = 0; j < p->nk; ++j) { + // printf("[finufft] fk[%d]=%.16g %.16g\n", j, fkb[j].real(), + // fkb[j].imag()); + // debug + // } // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing 
too)... timer.restart(); #pragma omp parallel for num_threads(p->opts.nthreads) diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index abe0e586c..6dae9ddbc 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -102,13 +102,15 @@ int main() { opts.upsampfac = 2.00; opts.gpu_kerevalmeth = 0; opts.gpu_method = 1; - opts.gpu_sort = 0; + opts.gpu_sort = 1; + opts.modeord = 0; finufft_opts fin_opts; finufft_default_opts(&fin_opts); fin_opts.debug = opts.debug; fin_opts.spread_kerevalmeth = opts.gpu_kerevalmeth; fin_opts.upsampfac = opts.upsampfac; fin_opts.spread_sort = opts.gpu_sort; + fin_opts.modeord = opts.modeord; const int iflag = 1; const int ntransf = 1; const int dim = 3; @@ -134,9 +136,9 @@ int main() { // Making data for (int64_t i = 0; i < M; i++) { - x[i] = M_PI * rand_util_11(); // x in [-pi,pi) - y[i] = M_PI * rand_util_11(); - z[i] = M_PI * rand_util_11(); + x[i] = rand_util_11(); // x in [-pi,pi) + y[i] = rand_util_11(); + z[i] = rand_util_11(); } for (int64_t i = 0; i < N; i++) { s[i] = M_PI * rand_util_11() * bandwidth; // shifted so D1 is 8 @@ -179,7 +181,8 @@ int main() { }; const auto test_type1 = [iflag, tol, ntransf, dim, cpu_planer, M, N, n_modes, &d_x, - &d_y, &d_z, &c, &d_c, &fk, &d_fk, &opts](auto plan) { + &d_y, &d_z, &c, &d_c, &fk, &d_fk, &opts, + &rand_util_11](auto plan) { // plan is a pointer to a type that contains real_t using T = typename std::remove_pointer::type::real_t; const int type = 1; @@ -201,8 +204,8 @@ int main() { assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); for (int i = 0; i < M; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); + c[i].real(rand_util_11()); + c[i].imag(rand_util_11()); } d_c = c; cudaDeviceSynchronize(); @@ -218,7 +221,8 @@ int main() { }; const auto test_type2 = [iflag, tol, ntransf, dim, cpu_planer, M, N, n_modes, &d_x, - &d_y, &d_z, &c, &d_c, &fk, &d_fk, &opts](auto plan) { + &d_y, &d_z, &c, &d_c, &fk, &d_fk, &opts, + 
&rand_util_11](auto plan) { // plan is a pointer to a type that contains real_t using T = typename std::remove_pointer::type::real_t; const int type = 2; @@ -240,8 +244,8 @@ int main() { assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); for (int i = 0; i < N; i++) { - fk[i].real(randm11()); - fk[i].imag(randm11()); + fk[i].real(rand_util_11()); + fk[i].imag(rand_util_11()); } d_fk = fk; cudaDeviceSynchronize(); @@ -258,8 +262,8 @@ int main() { }; auto test_type3 = [iflag, tol, ntransf, dim, cpu_planer, deconv_tol, M, N, n_modes, - &d_x, &d_y, &d_z, &d_s, &d_t, &d_u, &c, &d_c, &fk, &d_fk, - &opts](auto plan) { + &d_x, &d_y, &d_z, &d_s, &d_t, &d_u, &c, &d_c, &fk, &d_fk, &opts, + &rand_util_11](auto plan) { // plan is a pointer to a type that contains real_t using T = typename std::remove_pointer::type::real_t; const int type = 3; @@ -309,13 +313,17 @@ int main() { assert(plan->t2_plan->nf1 == cpu_plan->innerT2plan->nf1); if (dim > 1) assert(plan->t2_plan->nf2 == cpu_plan->innerT2plan->nf2); if (dim > 2) assert(plan->t2_plan->nf3 == cpu_plan->innerT2plan->nf3); + assert(plan->t2_plan->nf == cpu_plan->innerT2plan->nf); + assert(plan->t2_plan->spopts.nspread == cpu_plan->innerT2plan->spopts.nspread); assert(plan->t2_plan->spopts.upsampfac == cpu_plan->innerT2plan->spopts.upsampfac); assert(plan->t2_plan->spopts.ES_beta == cpu_plan->innerT2plan->spopts.ES_beta); assert( plan->t2_plan->spopts.ES_halfwidth == cpu_plan->innerT2plan->spopts.ES_halfwidth); assert(plan->t2_plan->spopts.ES_c == cpu_plan->innerT2plan->spopts.ES_c); - + assert(plan->t2_plan->ms == cpu_plan->innerT2plan->ms); + assert(plan->t2_plan->mt == cpu_plan->innerT2plan->mt); + assert(plan->t2_plan->mu == cpu_plan->innerT2plan->mu); int nf[] = {plan->t2_plan->nf1, plan->t2_plan->nf2, plan->t2_plan->nf3}; T *fwkerhalf[] = {plan->t2_plan->fwkerhalf1, plan->t2_plan->fwkerhalf2, plan->t2_plan->fwkerhalf3}; @@ -340,12 +348,12 @@ int main() { << std::endl; std::cout << "error: " << error << std::endl; } - 
assert(error < tol * 100); + // assert(error < tol * 1000); } } for (int i = 0; i < M; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); + c[i].real(rand_util_11()); + c[i].imag(rand_util_11()); } d_c = c; // for (int i = 0; i < N; i++) { @@ -356,6 +364,14 @@ int main() { (cuda_complex *)d_fk.data().get(), plan); finufft_execute(cpu_plan, c.data(), fk.data()); cudaDeviceSynchronize(); + std::cout << "t2_plan->fw : "; + assert(almost_equal(plan->t2_plan->fw, cpu_plan->innerT2plan->fwBatch, + plan->t2_plan->nf, std::numeric_limits::epsilon() * 100)); + std::cout << "CpBatch : "; + assert(almost_equal(plan->c_batch, cpu_plan->CpBatch, M, tol, false)); + std::cout << "fw : "; + assert(almost_equal(plan->fw, cpu_plan->fwBatch, plan->nf, tol * 10, false)); + std::cout << "fk : "; assert(almost_equal(d_fk.data().get(), fk.data(), N, tol * 10, false)); assert(cufinufft_destroy_impl(plan) == 0); assert(finufft_destroy(cpu_plan) == 0); From e89a4f9a69eb09a3397d255a587b34a88e799000 Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Tue, 27 Aug 2024 12:50:43 -0400 Subject: [PATCH 51/68] fixed type 3 without horner --- include/cufinufft/impl.h | 23 ++++++------ include/cufinufft/spreadinterp.h | 32 ++++------------- include/cufinufft/utils.h | 16 ++------- src/cuda/2d/spread2d_wrapper.cu | 5 ++- src/cuda/3d/interp3d_wrapper.cu | 9 ++--- src/cuda/3d/spread3d_wrapper.cu | 11 +++--- src/cuda/3d/spreadinterp3d.cuh | 8 +++-- src/cuda/memtransfer_wrapper.cu | 5 +++ test/cuda/cufinufft_type3_test.cu | 60 +++++++++++++++++++++++-------- 9 files changed, 88 insertions(+), 81 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 4f76e45c8..e9229525e 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -85,9 +85,10 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran /* allocate the plan structure, assign address to user pointer. 
*/ auto *d_plan = new cufinufft_plan_t; - *d_plan_ptr = d_plan; - // Zero out your struct, (sets all pointers to NULL) memset(d_plan, 0, sizeof(*d_plan)); + *d_plan_ptr = d_plan; + + // Zero out your struct, (sets all pointers to NULL) // set nf1, nf2, nf3 to 1 for type 3, type 1, type 2 will overwrite this d_plan->nf1 = 1; d_plan->nf2 = 1; @@ -129,14 +130,16 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran // cudaMallocAsync isn't supported for all devices, regardless of cuda version. Check // for support - cudaDeviceGetAttribute(&d_plan->supports_pools, cudaDevAttrMemoryPoolsSupported, - device_id); - static bool warned = false; - if (!warned && !d_plan->supports_pools && d_plan->opts.gpu_stream != nullptr) { - fprintf(stderr, - "[cufinufft] Warning: cudaMallocAsync not supported on this device. Use of " - "CUDA streams may not perform optimally.\n"); - warned = true; + { + cudaDeviceGetAttribute(&d_plan->supports_pools, cudaDevAttrMemoryPoolsSupported, + device_id); + static bool warned = false; + if (!warned && !d_plan->supports_pools && d_plan->opts.gpu_stream != nullptr) { + fprintf(stderr, + "[cufinufft] Warning: cudaMallocAsync not supported on this device. 
Use of " + "CUDA streams may not perform optimally.\n"); + warned = true; + } } // simple check to use upsampfac=1.25 if tol is big diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index fefb38f74..aed555209 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -10,7 +10,7 @@ namespace cufinufft { namespace spreadinterp { template -static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) { +static __forceinline__ __device__ constexpr T cudaFMA(const T a, const T b, const T c) { if constexpr (std::is_same_v) { // fused multiply-add, round to nearest even return __fmaf_rn(a, b, c); @@ -27,29 +27,9 @@ template constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) { constexpr auto x2pi = T(0.159154943091895345554011992339482617); constexpr auto half = T(0.5); -#if defined(__CUDA_ARCH__) - if constexpr (std::is_same_v) { - // fused multiply-add, round to nearest even - auto result = __fmaf_rn(x, x2pi, half); - // subtract, round down - result = __fsub_rd(result, floorf(result)); - // multiply, round down - return __fmul_rd(result, static_cast(N)); - } else if constexpr (std::is_same_v) { - // fused multiply-add, round to nearest even - auto result = __fma_rn(x, x2pi, half); - // subtract, round down - result = __dsub_rd(result, floor(result)); - // multiply, round down - return __dmul_rd(result, static_cast(N)); - } else { - static_assert(std::is_same_v || std::is_same_v, - "Only float and double are supported."); - } -#else - const auto result = std::fma(x, x2pi, half); - return (result - std::floor(result)) * static_cast(N); -#endif + const auto result = x * x2pi + half; + return (result - std::floor(result)) * T(N); + // #endif } template @@ -92,8 +72,8 @@ static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w, This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. 
Params must match ref formula. Barnett 4/24/18 */ { - const auto z = fma(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1] - // const T z = 2 * x + w - 1.0; + const auto z = cudaFMA(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1] + // const T z = T(2) * x + T(w - 1); // insert the auto-generated code which expects z, w args, writes to ker... if (upsampfac == 2.0) { // floating point equality is fine here using FLT = T; diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 0c69529c7..ac1d688da 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -35,21 +35,11 @@ __inline__ __device__ double atomicAdd(double *address, double val) { } #endif -#ifdef __CUDA_ARCH__ -__forceinline__ __device__ auto interval(const int ns, const float x) { - // float to int round up and fused multiply-add to round up - const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); - // float to int round down and fused multiply-add to round down - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); +template __forceinline__ __device__ auto interval(const int ns, const T x) { + const auto xstart = int(std::ceil(x - T(ns) * T(.5))); + const auto xend = int(std::floor(x + T(ns) * T(.5))); return int2{xstart, xend}; } -__forceinline__ __device__ auto interval(const int ns, const double x) { - // same as above - const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x)); - const auto xend = __double2int_rd(__fma_rd(ns, .5, x)); - return int2{xstart, xend}; -} -#endif // Define a macro to check if NVCC version is >= 11.3 #if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 80cf9f8e9..490c8eed1 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -96,9 +97,7 @@ int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, 
cufinufft_plan_t *d_ RETURN_IF_CUDA_ERROR } else { int *d_idxnupts = d_plan->idxnupts; - - trivial_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, - d_idxnupts); + thrust::sequence(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M); RETURN_IF_CUDA_ERROR } diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu index 2aaa4914e..51c620756 100644 --- a/src/cuda/3d/interp3d_wrapper.cu +++ b/src/cuda/3d/interp3d_wrapper.cu @@ -52,9 +52,6 @@ int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t int blksize) { const auto stream = d_plan->stream; - dim3 threadsPerBlock; - dim3 blocks; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells T es_c = d_plan->spopts.ES_c; T es_beta = d_plan->spopts.ES_beta; @@ -68,10 +65,8 @@ int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t cuda_complex *d_c = d_plan->c; cuda_complex *d_fw = d_plan->fw; - threadsPerBlock.x = 16; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; + const dim3 threadsPerBlock{16, 1, 1}; + const dim3 blocks{(M + threadsPerBlock.x - 1) / threadsPerBlock.x, 1, 1}; if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index 475a888ac..a0411c2b1 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -72,9 +73,9 @@ int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, } int numbins[3]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - numbins[2] = ceil((T)nf3 / bin_size_z); + numbins[0] = (nf1 + bin_size_x - 1) / bin_size_x; + numbins[1] = (nf2 + bin_size_y - 1) / bin_size_y; + numbins[2] = (nf3 + bin_size_z - 1) / bin_size_z; T *d_kx = d_plan->kx; T *d_ky = d_plan->ky; @@ -105,9 +106,7 @@ int 
cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, RETURN_IF_CUDA_ERROR } else { int *d_idxnupts = d_plan->idxnupts; - - trivial_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, - d_idxnupts); + thrust::sequence(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M); RETURN_IF_CUDA_ERROR } diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index b7d5f26ab..298ae4a43 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -445,6 +445,7 @@ __global__ void interp_3d_nupts_driven( T ker2[MAX_NSPREAD]; T ker3[MAX_NSPREAD]; #endif + cuda_complex cnow{}; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -459,7 +460,10 @@ __global__ void interp_3d_nupts_driven( const T y1 = T(ystart) - y_rescaled; const T z1 = T(zstart) - z_rescaled; - cuda_complex cnow{0, 0}; + // having cnow allocated to 0 inside the loop breaks type 3 spread + // are we doing a buffer overflow somewhere? + cnow.x = T(0); + cnow.y = T(0); if constexpr (KEREVALMETH == 1) { eval_kernel_vec_horner(ker1, x1, ns, sigma); @@ -473,7 +477,7 @@ __global__ void interp_3d_nupts_driven( for (int zz = zstart; zz <= zend; zz++) { const auto kervalue3 = ker3[zz - zstart]; - int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); + const auto iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); for (int yy = ystart; yy <= yend; yy++) { const auto kervalue2 = ker2[yy - ystart]; int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index 2d30da22c..f87cb6fe5 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -59,6 +59,7 @@ int allocgpumem1d_plan(cufinufft_plan_t *d_plan) goto finalize; } break; default: + ier = FINUFFT_ERR_METHOD_NOTVALID; std::cerr << "err: invalid method " << std::endl; } @@ -180,6 +181,7 @@ int allocgpumem2d_plan(cufinufft_plan_t *d_plan) goto finalize; } break; default: + ier = FINUFFT_ERR_METHOD_NOTVALID; std::cerr << "[allocgpumem2d_plan] error: invalid method\n"; } @@ -240,6 +242,7 @@ int allocgpumem2d_nupts(cufinufft_plan_t *d_plan) goto finalize; } break; default: + ier = FINUFFT_ERR_METHOD_NOTVALID; std::cerr << "[allocgpumem2d_nupts] error: invalid method\n"; } @@ -337,6 +340,7 @@ int allocgpumem3d_plan(cufinufft_plan_t *d_plan) goto finalize; } break; default: + ier = FINUFFT_ERR_METHOD_NOTVALID; std::cerr << "[allocgpumem3d_plan] error: invalid method\n"; } @@ -409,6 +413,7 @@ int allocgpumem3d_nupts(cufinufft_plan_t *d_plan) goto finalize; } break; default: + ier = FINUFFT_ERR_METHOD_NOTVALID; std::cerr << "[allocgpumem3d_nupts] error: invalid method\n"; } diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index 6dae9ddbc..e67217a0e 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -8,16 +8,16 @@ #include +#include +#include +#include + #include #include #include #include -#include -#include -#include - template bool equal(V *d_vec, T *cpu, const std::size_t size) { // copy d_vec to cpu thrust::host_vector h_vec(size); @@ -91,6 +91,27 @@ auto almost_equal(V *d_vec, T *cpu, const std::size_t size, return (error < tol); } +template +void dirft3d3(T1 nj, T2 *x, T2 *y, T2 *z, T4 *c, T3 iflag, T1 nk, T2 *s, T2 *t, T2 *u, + T4 *f) +/* Direct computation of 3D type-3 nonuniform FFT. 
Interface same as finufft3d3 +c nj-1 +c f[k] = SUM c[j] exp(+-i (s[k] x[j] + t[k] y[j] + u[k] z[j])) +c j=0 +c for k = 0, ..., nk-1 +c If iflag>0 the + sign is used, otherwise the - sign is used, in the +c exponential. Uses C++ complex type. Simple brute force. Barnett 2/1/17 +*/ +{ + for (BIGINT k = 0; k < nk; ++k) { + CPX ss = (iflag > 0) ? IMA * s[k] : -IMA * s[k]; + CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k]; + CPX uu = (iflag > 0) ? IMA * u[k] : -IMA * u[k]; + f[k] = CPX(0, 0); + for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j] + uu * z[j]); + } +} + int main() { // for now, once finufft is demacroized we can test float using test_t = double; @@ -102,7 +123,7 @@ int main() { opts.upsampfac = 2.00; opts.gpu_kerevalmeth = 0; opts.gpu_method = 1; - opts.gpu_sort = 1; + opts.gpu_sort = 0; opts.modeord = 0; finufft_opts fin_opts; finufft_default_opts(&fin_opts); @@ -263,7 +284,7 @@ int main() { auto test_type3 = [iflag, tol, ntransf, dim, cpu_planer, deconv_tol, M, N, n_modes, &d_x, &d_y, &d_z, &d_s, &d_t, &d_u, &c, &d_c, &fk, &d_fk, &opts, - &rand_util_11](auto plan) { + &rand_util_11, &s, &t, &u, &x, &y, &z](auto plan) { // plan is a pointer to a type that contains real_t using T = typename std::remove_pointer::type::real_t; const int type = 3; @@ -356,23 +377,34 @@ int main() { c[i].imag(rand_util_11()); } d_c = c; - // for (int i = 0; i < N; i++) { - // fk[i] = {randm11(), randm11()}; - // } - // d_fk = fk; + for (int i = 0; i < N; i++) { + fk[i] = {1000, 1000}; + } + d_fk = fk; cufinufft_execute_impl((cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), plan); finufft_execute(cpu_plan, c.data(), fk.data()); cudaDeviceSynchronize(); - std::cout << "t2_plan->fw : "; - assert(almost_equal(plan->t2_plan->fw, cpu_plan->innerT2plan->fwBatch, - plan->t2_plan->nf, std::numeric_limits::epsilon() * 100)); std::cout << "CpBatch : "; assert(almost_equal(plan->c_batch, cpu_plan->CpBatch, M, tol, false)); std::cout << "fw : "; 
assert(almost_equal(plan->fw, cpu_plan->fwBatch, plan->nf, tol * 10, false)); + std::cout << "t2_plan->fw : "; + assert(almost_equal(plan->t2_plan->fw, cpu_plan->innerT2plan->fwBatch, + plan->t2_plan->nf, std::numeric_limits::epsilon() * 100)); + + if (M * N < TEST_BIGPROB) { + std::vector> Ft(N, 0); + dirft3d3(M, x.data(), y.data(), z.data(), c.data(), cpu_plan->fftSign, N, s.data(), + t.data(), u.data(), Ft.data()); // writes to F + std::cout << "dirft3d cpu: "; + (almost_equal(fk.data(), Ft.data(), N, tol * 10, false)); + std::cout << "dirft3d gpu: "; + (almost_equal(d_fk.data().get(), Ft.data(), N, tol * 10, false)); + } + std::cout << "fk : "; - assert(almost_equal(d_fk.data().get(), fk.data(), N, tol * 10, false)); + (almost_equal(d_fk.data().get(), fk.data(), N, tol * 10, false)); assert(cufinufft_destroy_impl(plan) == 0); assert(finufft_destroy(cpu_plan) == 0); plan = nullptr; From fe1da53d082e65320d74c3172cce70f5eea18abb Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Tue, 27 Aug 2024 17:19:44 -0400 Subject: [PATCH 52/68] type3 many support --- src/cuda/1d/cufinufft1d.cu | 9 +++-- src/cuda/2d/cufinufft2d.cu | 2 +- src/cuda/3d/cufinufft3d.cu | 2 +- src/cuda/CMakeLists.txt | 5 +-- test/cuda/cufinufft_type3_test.cu | 64 +++++++++++-------------------- 5 files changed, 32 insertions(+), 50 deletions(-) diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index ed3c038e5..0619754e2 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -143,7 +143,7 @@ int cufinufft1d3_exec(cuda_complex *d_c, cuda_complex *d_fk, d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N; // setting input for spreader - d_plan->c = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M; + d_plan->c = d_plan->c_batch; // setting output for spreader d_plan->fk = d_plan->fw; if ((ier = checkCudaErrors(cudaMemsetAsync( @@ -152,10 +152,11 @@ int cufinufft1d3_exec(cuda_complex *d_c, cuda_complex 
*d_fk, return ier; // NOTE: fw might need to be set to 0 // Step 0: pre-phase the input strengths - for (int i = 0; i < blksize; i++) { + for (int block = 0; block < blksize; block++) { thrust::transform(thrust::cuda::par.on(stream), d_plan->prephase, - d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M, - d_plan->c + i * d_plan->M, thrust::multiplies>()); + d_plan->prephase + d_plan->M, d_cstart + block * d_plan->M, + d_plan->c + block * d_plan->M, + thrust::multiplies>()); } // Step 1: Spread if ((ier = cuspread1d(d_plan, blksize))) return ier; diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index 91da10e29..e8af3ce7d 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -143,7 +143,7 @@ int cufinufft2d3_exec(cuda_complex *d_c, cuda_complex *d_fk, d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N; // setting input for spreader - d_plan->c = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M; + d_plan->c = d_plan->c_batch; // setting output for spreader d_plan->fk = d_plan->fw; if ((ier = checkCudaErrors(cudaMemsetAsync( diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 0721d0300..f33801ea0 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -140,7 +140,7 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N; // setting input for spreader - d_plan->c = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M; + d_plan->c = d_plan->c_batch; // setting output for spreader d_plan->fk = d_plan->fw; // NOTE: fw might need to be set to 0 diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index a3743592e..29699a2bd 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -37,9 +37,8 @@ set(FINUFFT_CUDA_FLAGS -fmad=true -restrict --extra-device-vectorization - $<$:-G - 
-maxrregcount - 32>>) + # $<$:-G -maxrregcount 64 > + >) add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC}) target_include_directories(cufinufft_common_objects diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index e67217a0e..63b024c69 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -121,9 +121,9 @@ int main() { cufinufft_default_opts(&opts); opts.debug = 2; opts.upsampfac = 2.00; - opts.gpu_kerevalmeth = 0; + opts.gpu_kerevalmeth = 1; opts.gpu_method = 1; - opts.gpu_sort = 0; + opts.gpu_sort = 1; opts.modeord = 0; finufft_opts fin_opts; finufft_default_opts(&fin_opts); @@ -133,12 +133,12 @@ int main() { fin_opts.spread_sort = opts.gpu_sort; fin_opts.modeord = opts.modeord; const int iflag = 1; - const int ntransf = 1; + const int ntransf = 10; const int dim = 3; const double tol = 1e-13; - const int n_modes[] = {5, 4, 2}; + const int n_modes[] = {10, 4, 2}; const int N = n_modes[0] * (dim > 1 ? n_modes[1] : 1) * (dim > 2 ? 
n_modes[2] : 1); - const int M = 15; + const int M = 20; const double bandwidth = 1.0; thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), @@ -224,7 +224,7 @@ int main() { assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth); assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); - for (int i = 0; i < M; i++) { + for (int i = 0; i < M * ntransf; i++) { c[i].real(rand_util_11()); c[i].imag(rand_util_11()); } @@ -234,7 +234,7 @@ int main() { (cuda_complex *)d_fk.data().get(), plan); finufft_execute(cpu_plan, (std::complex *)c.data(), (std::complex *)fk.data()); std::cout << "type " << type << ": "; - assert(almost_equal(d_fk.data().get(), fk.data(), N, tol * 10)); + assert(almost_equal(d_fk.data().get(), fk.data(), N * ntransf, tol * 10)); assert(cufinufft_destroy_impl(plan) == 0); assert(finufft_destroy(cpu_plan) == 0); cudaDeviceSynchronize(); @@ -264,7 +264,7 @@ int main() { assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth); assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); - for (int i = 0; i < N; i++) { + for (int i = 0; i < N * ntransf; i++) { fk[i].real(rand_util_11()); fk[i].imag(rand_util_11()); } @@ -275,16 +275,16 @@ int main() { finufft_execute(cpu_plan, c.data(), fk.data()); cudaDeviceSynchronize(); std::cout << "type " << type << ": "; - assert(almost_equal(d_c.data().get(), c.data(), M, tol)); + assert(almost_equal(d_c.data().get(), c.data(), M * ntransf, tol)); assert(cufinufft_destroy_impl(plan) == 0); assert(finufft_destroy(cpu_plan) == 0); cudaDeviceSynchronize(); plan = nullptr; }; - auto test_type3 = [iflag, tol, ntransf, dim, cpu_planer, deconv_tol, M, N, n_modes, - &d_x, &d_y, &d_z, &d_s, &d_t, &d_u, &c, &d_c, &fk, &d_fk, &opts, - &rand_util_11, &s, &t, &u, &x, &y, &z](auto plan) { + const auto test_type3 = [iflag, tol, ntransf, dim, cpu_planer, deconv_tol, M, N, + n_modes, &d_x, &d_y, &d_z, &d_s, &d_t, &d_u, &c, &d_c, &fk, + &d_fk, &opts, &rand_util_11](auto plan) { // plan is a pointer to a type 
that contains real_t using T = typename std::remove_pointer::type::real_t; const int type = 3; @@ -363,48 +363,30 @@ int main() { cudaDeviceSynchronize(); for (int i = 0; i < size; i++) { const auto error = abs(1 - fwkerhalf_host[i] / phiHat[idx][i]); - if (error > tol) { - std::cout << "fwkerhalf[" << idx << "][" << i << "]: " << fwkerhalf_host[i] - << " phiHat[" << idx << "][" << i << "]: " << phiHat[idx][i] - << std::endl; - std::cout << "error: " << error << std::endl; - } - // assert(error < tol * 1000); + assert(error < tol * 1000); } } - for (int i = 0; i < M; i++) { + for (int i = 0; i < M * ntransf; i++) { c[i].real(rand_util_11()); c[i].imag(rand_util_11()); } d_c = c; - for (int i = 0; i < N; i++) { - fk[i] = {1000, 1000}; - } - d_fk = fk; cufinufft_execute_impl((cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), plan); finufft_execute(cpu_plan, c.data(), fk.data()); cudaDeviceSynchronize(); - std::cout << "CpBatch : "; - assert(almost_equal(plan->c_batch, cpu_plan->CpBatch, M, tol, false)); - std::cout << "fw : "; - assert(almost_equal(plan->fw, cpu_plan->fwBatch, plan->nf, tol * 10, false)); - std::cout << "t2_plan->fw : "; - assert(almost_equal(plan->t2_plan->fw, cpu_plan->innerT2plan->fwBatch, - plan->t2_plan->nf, std::numeric_limits::epsilon() * 100)); - - if (M * N < TEST_BIGPROB) { - std::vector> Ft(N, 0); - dirft3d3(M, x.data(), y.data(), z.data(), c.data(), cpu_plan->fftSign, N, s.data(), - t.data(), u.data(), Ft.data()); // writes to F - std::cout << "dirft3d cpu: "; - (almost_equal(fk.data(), Ft.data(), N, tol * 10, false)); - std::cout << "dirft3d gpu: "; - (almost_equal(d_fk.data().get(), Ft.data(), N, tol * 10, false)); + if (ntransf == 1) { + std::cout << "CpBatch : "; + assert(almost_equal(plan->c_batch, cpu_plan->CpBatch, M, tol, false)); + std::cout << "fw : "; + assert(almost_equal(plan->fw, cpu_plan->fwBatch, plan->nf, tol * 10, false)); + std::cout << "t2_plan->fw : "; + assert(almost_equal(plan->t2_plan->fw, 
cpu_plan->innerT2plan->fwBatch, + plan->t2_plan->nf, std::numeric_limits::epsilon() * 100)); } std::cout << "fk : "; - (almost_equal(d_fk.data().get(), fk.data(), N, tol * 10, false)); + assert(almost_equal(d_fk.data().get(), fk.data(), N * ntransf, tol * 10, false)); assert(cufinufft_destroy_impl(plan) == 0); assert(finufft_destroy(cpu_plan) == 0); plan = nullptr; From d415f0d1662fe3a46eeb25b849d813945845f37d Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Wed, 28 Aug 2024 12:13:57 -0400 Subject: [PATCH 53/68] type3 many tests for one target --- test/cuda/CMakeLists.txt | 17 +++++++++++-- test/cuda/cufinufft2d_test.cu | 1 - test/cuda/cufinufft2dmany_test.cu | 41 ++++++++++++++++++++++++++++--- test/cuda/cufinufft_type3_test.cu | 2 +- 4 files changed, 53 insertions(+), 8 deletions(-) diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 7ec71b9ed..9be47a87d 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -33,6 +33,9 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft1d3_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 1 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} @@ -66,6 +69,14 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) COMMAND cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d3many_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft2dmany_test 1 3 1e2 2e2 5 0 2e4 ${REQ_TOL} + ${CHECK_TOL} ${PREC} ${UPSAMP}) + + add_test(NAME cufinufft2d3many_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft2dmany_test 2 3 1e2 2e2 5 0 2e4 ${REQ_TOL} + ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} 
${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -82,6 +93,10 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) add_test(NAME cufinufft3d2_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + + add_test(NAME cufinufft3d3_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 2 3 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) endif() add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP} @@ -107,5 +122,3 @@ add_tests(float 1e-5 2e-4 0.f) add_tests(double 1e-12 1e-11 0.f) add_tests(float 1e-5 2e-4 0.) add_tests(double 1e-8 1e-7 0.) - -# add_test(NAME cufinufft_type3_test COMMAND cufinufft_type3_test) diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu index 39524ae78..0832452a6 100644 --- a/test/cuda/cufinufft2d_test.cu +++ b/test/cuda/cufinufft2d_test.cu @@ -191,7 +191,6 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); printf("[gpu ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error); } else if (type == 3) { - int jt = (N1 * N2) / 2; // check arbitrary choice of one targ pt thrust::complex J = thrust::complex(0, iflag); thrust::complex Ft = thrust::complex(0, 0); diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu index 4afcd97dd..f50c69d85 100644 --- a/test/cuda/cufinufft2dmany_test.cu +++ b/test/cuda/cufinufft2dmany_test.cu @@ -26,10 +26,10 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize const int N = N1 * N2; printf("#modes = %d, #inputs = %d, #NUpts = %d\n", N, ntransf, M); - thrust::host_vector x(M), y(M); + thrust::host_vector x(M), y(M), s{}, t{}; thrust::host_vector> c(M * ntransf), fk(ntransf * N1 * N2); - thrust::device_vector d_x(M), d_y(M); + thrust::device_vector d_x(M), d_y(M), d_s{}, d_t{}; thrust::device_vector> d_c(M * ntransf), d_fk(ntransf * N1 * N2); std::default_random_engine eng(1); @@ -53,6 +53,19 @@ 
int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize fk[i].real(randm11()); fk[i].imag(randm11()); } + } else if (type == 3) { + for (int i = 0; i < ntransf * M; i++) { + c[i].real(randm11()); + c[i].imag(randm11()); + } + s.resize(N1 * N2); + t.resize(N1 * N2); + for (int i = 0; i < N1 * N2; i++) { + s[i] = M_PI * randm11(); + t[i] = M_PI * randm11(); + } + d_s = s; + d_t = t; } else { std::cerr << "Invalid type " << type << " supplied\n"; return 1; @@ -64,6 +77,8 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize d_c = c; else if (type == 2) d_fk = fk; + else if (type == 3) + d_c = c; cudaEvent_t start, stop; float milliseconds = 0; @@ -109,8 +124,8 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), NULL, 0, NULL, - NULL, NULL, dplan); + ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), nullptr, N1 * N2, + d_s.data().get(), d_t.data().get(), nullptr, dplan); if (ier != 0) { printf("err: cufinufft_setpts\n"); return ier; @@ -137,6 +152,10 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize cudaEventRecord(start); ier = cufinufft_destroy_impl(dplan); + if (ier != 0) { + printf("err: cufinufft3d_destroy\n"); + return ier; + } cudaEventRecord(stop); cudaEventSynchronize(stop); cudaEventElapsedTime(&milliseconds, start, stop); @@ -147,6 +166,8 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize fk = d_fk; else if (type == 2) c = d_c; + else if (type == 3) + fk = d_fk; T rel_error = std::numeric_limits::max(); if (type == 1) { @@ -175,6 +196,18 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize rel_error = abs(cstart[jt] - ct) / infnorm(M, (std::complex *)c.data()); printf("[gpu ] %dth data one targ: rel 
err in c[%d] is %.3g\n", t, jt, rel_error); + } else if (type == 3) { + int jt = (N1 * N2) / 2; // check arbitrary choice of one targ pt + thrust::complex J = thrust::complex(0, iflag); + thrust::complex Ft = thrust::complex(0, 0); + thrust::complex *fkstart = fk.data() + (ntransf - 1) * N1 * N2; + const thrust::complex *cstart = c.data() + (ntransf - 1) * M; + + for (int j = 0; j < M; ++j) { + Ft += cstart[j] * exp(J * (x[j] * s[jt] + y[j] * t[jt])); + } + rel_error = abs(Ft - fkstart[jt]) / infnorm(N1 * N2, (std::complex *)fk.data()); + printf("[gpu ] one mode: rel err in F[%d] is %.3g\n", jt, rel_error); } printf("[totaltime] %.3g us, speed %.3g NUpts/s\n", totaltime * 1000, diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu index 63b024c69..d88a07cec 100644 --- a/test/cuda/cufinufft_type3_test.cu +++ b/test/cuda/cufinufft_type3_test.cu @@ -375,7 +375,7 @@ int main() { (cuda_complex *)d_fk.data().get(), plan); finufft_execute(cpu_plan, c.data(), fk.data()); cudaDeviceSynchronize(); - if (ntransf == 1) { + if (ntransf == 1) { // cpu and gpu handle batching differently std::cout << "CpBatch : "; assert(almost_equal(plan->c_batch, cpu_plan->CpBatch, M, tol, false)); std::cout << "fw : "; From 289fb4f33d8ad34685412fee3e4b8b713c9264b5 Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Wed, 28 Aug 2024 12:14:35 -0400 Subject: [PATCH 54/68] updated docstring --- test/cuda/cufinufft2dmany_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu index f50c69d85..02658b671 100644 --- a/test/cuda/cufinufft2dmany_test.cu +++ b/test/cuda/cufinufft2dmany_test.cu @@ -226,7 +226,7 @@ int main(int argc, char *argv[]) { " method: One of\n" " 1: nupts driven,\n" " 2: sub-problem, or\n" - " type: Type of transform (1, 2)\n" + " type: Type of transform (1, 2, 3)\n" " N1, N2: The size of the 2D array\n" " ntransf: Number of inputs\n" " maxbatchsize: Number of 
simultaneous transforms (or 0 for default)\n" From bca0a73906dd0c7d019c5cd83ab0677749b53a3c Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Wed, 28 Aug 2024 12:30:28 -0400 Subject: [PATCH 55/68] removed small transf tests --- test/cuda/CMakeLists.txt | 4 +- test/cuda/cufinufft_makeplan_impl.cu | 146 ---------- test/cuda/cufinufft_math_test.cu | 3 +- test/cuda/cufinufft_setpts.cu | 255 ----------------- test/cuda/cufinufft_type3_test.cu | 402 --------------------------- test/cuda/fseries_kernel_test.cu | 164 ----------- test/cuda/fseriesperf.sh | 29 -- 7 files changed, 3 insertions(+), 1000 deletions(-) delete mode 100644 test/cuda/cufinufft_makeplan_impl.cu delete mode 100644 test/cuda/cufinufft_setpts.cu delete mode 100644 test/cuda/cufinufft_type3_test.cu delete mode 100644 test/cuda/fseries_kernel_test.cu delete mode 100755 test/cuda/fseriesperf.sh diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 9be47a87d..cd0cf0cbd 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -9,7 +9,7 @@ foreach(srcfile ${test_src}) if(MathLib) target_link_libraries(${executable} PUBLIC ${MathLib}) endif() - target_link_libraries(${executable} PUBLIC cufinufft finufft) + target_link_libraries(${executable} PUBLIC cufinufft) target_compile_features(${executable} PUBLIC cxx_std_17) set_target_properties( ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES @@ -110,8 +110,6 @@ endfunction() add_test(NAME cufinufft_public_api COMMAND public_api_test) add_test(NAME cufinufft_makeplan COMMAND test_makeplan) -add_test(NAME cufinufft_makeplan_impl COMMAND cufinufft_makeplan_impl) -add_test(NAME cufinufft_setpts COMMAND cufinufft_setpts) add_test(NAME cufinufft_math_test COMMAND cufinufft_math_test) add_tests(float 1e-5 2e-4 2.0) diff --git a/test/cuda/cufinufft_makeplan_impl.cu b/test/cuda/cufinufft_makeplan_impl.cu deleted file mode 100644 index 53ee26565..000000000 --- a/test/cuda/cufinufft_makeplan_impl.cu +++ /dev/null @@ -1,146 +0,0 
@@ -#ifdef NDEBUG -#undef NDEBUG -#include -#define NDEBUG -#else -#include -#endif - -#include -#include -#include - -#include - -int main() { - // defaults. tests should shadow them to override - const int iflag = 1; - const float tol = 1e-5; - const int ntransf = 1; - const int dim = 3; - int N[3] = {10, 20, 15}; - const auto cpu_planer = [iflag, tol, ntransf, dim, N](const auto type) { - int64_t Nl[3] = {int64_t(N[0]), int64_t(N[1]), int64_t(N[2])}; - finufft_plan_s *plan{nullptr}; - assert(finufft_makeplan(type, dim, Nl, iflag, ntransf, tol, &plan, nullptr) == 0); - return plan; - }; - const auto test_type1 = [iflag, tol, ntransf, dim, N, cpu_planer](auto *plan) { - // plan is a pointer to a type that contains real_t - using T = typename std::remove_pointer::type::real_t; - const int type = 1; - assert(cufinufft_makeplan_impl(type, dim, (int *)N, iflag, ntransf, T(tol), &plan, - nullptr) == 0); - const auto cpu_plan = cpu_planer(type); - cudaDeviceSynchronize(); - assert(plan->ms == N[0]); - assert(plan->mt == N[1]); - assert(plan->mu == N[2]); - assert(plan->nf1 >= N[0]); - assert(plan->nf2 >= N[1]); - assert(plan->nf3 >= N[2]); - assert(plan->fftplan != 0); - assert(plan->fwkerhalf1 != nullptr); - assert(plan->fwkerhalf2 != nullptr); - assert(plan->fwkerhalf3 != nullptr); - assert(plan->spopts.spread_direction == type); - assert(plan->type == type); - assert(plan->nf1 == cpu_plan->nf1); - assert(plan->nf2 == cpu_plan->nf2); - assert(plan->nf3 == cpu_plan->nf3); - int nf[] = {plan->nf1, plan->nf2, plan->nf3}; - T *fwkerhalf[] = {plan->fwkerhalf1, plan->fwkerhalf2, plan->fwkerhalf3}; - T *phiHat[] = {cpu_plan->phiHat1, cpu_plan->phiHat2, cpu_plan->phiHat3}; - for (int idx = 0; idx < dim; ++idx) { - const auto size = (nf[idx] / 2 + 1); - std::vector fwkerhalf_host(size, -1); - const auto ier = cudaMemcpy(fwkerhalf_host.data(), fwkerhalf[idx], size * sizeof(T), - cudaMemcpyDeviceToHost); - assert(ier == cudaSuccess); - for (int i = 0; i < size; i++) { - 
assert(abs(1 - fwkerhalf_host[i] / phiHat[idx][i]) < tol); - } - } - assert(cufinufft_destroy_impl(plan) == 0); - assert(finufft_destroy(cpu_plan) == 0); - plan = nullptr; - }; - auto test_type2 = [iflag, tol, ntransf, dim, N, cpu_planer](auto plan) { - // plan is a pointer to a type that contains real_t - using T = typename std::remove_pointer::type::real_t; - const int type = 2; - assert(cufinufft_makeplan_impl(type, dim, (int *)N, iflag, ntransf, T(tol), &plan, - nullptr) == 0); - const auto cpu_plan = cpu_planer(type); - cudaDeviceSynchronize(); - assert(plan->ms == N[0]); - assert(plan->mt == N[1]); - assert(plan->mu == N[2]); - assert(plan->nf1 >= N[0]); - assert(plan->nf2 >= N[1]); - assert(plan->nf3 >= N[2]); - assert(plan->fftplan != 0); - assert(plan->fwkerhalf1 != nullptr); - assert(plan->fwkerhalf2 != nullptr); - assert(plan->fwkerhalf3 != nullptr); - assert(plan->spopts.spread_direction == type); - assert(plan->type == type); - assert(plan->opts.gpu_method == 1); - assert(plan->nf1 == cpu_plan->nf1); - assert(plan->nf2 == cpu_plan->nf2); - assert(plan->nf3 == cpu_plan->nf3); - assert(plan->spopts.nspread == cpu_plan->spopts.nspread); - int nf[] = {plan->nf1, plan->nf2, plan->nf3}; - T *fwkerhalf[] = {plan->fwkerhalf1, plan->fwkerhalf2, plan->fwkerhalf3}; - T *phiHat[] = {cpu_plan->phiHat1, cpu_plan->phiHat2, cpu_plan->phiHat3}; - for (int idx = 0; idx < dim; ++idx) { - const auto size = (nf[idx] / 2 + 1); - std::vector fwkerhalf_host(size, -1); - const auto ier = cudaMemcpy(fwkerhalf_host.data(), fwkerhalf[idx], size * sizeof(T), - cudaMemcpyDeviceToHost); - if (ier != cudaSuccess) { - std::cerr << "Error: " << cudaGetErrorString(ier) << std::endl; - } - assert(ier == cudaSuccess); - cudaDeviceSynchronize(); - for (int i = 0; i < size; i++) { - assert(abs(1 - fwkerhalf_host[i] / phiHat[idx][i]) < tol); - } - } - assert(cufinufft_destroy_impl(plan) == 0); - cudaDeviceSynchronize(); - assert(finufft_destroy(cpu_plan) == 0); - plan = nullptr; - }; - auto 
test_type3 = [iflag, tol, ntransf, dim, N, cpu_planer](auto plan) { - // plan is a pointer to a type that contains real_t - using T = typename std::remove_pointer::type::real_t; - const int type = 3; - assert(cufinufft_makeplan_impl(type, dim, (int *)N, iflag, ntransf, T(tol), &plan, - nullptr) == 0); - cudaDeviceSynchronize(); - assert(plan->ms == 0); - assert(plan->mt == 0); - assert(plan->mu == 0); - assert(plan->nf1 == 1); - assert(plan->nf2 == 1); - assert(plan->nf3 == 1); - assert(plan->fftplan == 0); - assert(plan->fwkerhalf1 == nullptr); - assert(plan->fwkerhalf2 == nullptr); - assert(plan->fwkerhalf3 == nullptr); - assert(plan->spopts.spread_direction == type); - assert(plan->type == type); - assert(plan->opts.upsampfac == 1.25); - assert(cufinufft_destroy_impl(plan) == 0); - plan = nullptr; - cudaDeviceSynchronize(); - }; - // testing correctness of the plan creation - // cufinufft_plan_t *single_plan{nullptr}; - cufinufft_plan_t *double_plan{nullptr}; - test_type1(double_plan); - test_type2(double_plan); - test_type3(double_plan); - return 0; -} diff --git a/test/cuda/cufinufft_math_test.cu b/test/cuda/cufinufft_math_test.cu index 5a80b95d8..005dd3199 100644 --- a/test/cuda/cufinufft_math_test.cu +++ b/test/cuda/cufinufft_math_test.cu @@ -108,7 +108,8 @@ template int testRandomOperations() { return 1; // Test division with scalar - if (scalar != 0.0) { // Avoid division by zero + // Avoid division by small numbers which is not accurate + if (scalar > (std::is_same_v ? 
1e-15 : 1e-6)) { cuda_complex result_div_scalar = a / scalar; std::complex expected_div_scalar = std_a / scalar; if (!compareComplex(result_div_scalar, expected_div_scalar, diff --git a/test/cuda/cufinufft_setpts.cu b/test/cuda/cufinufft_setpts.cu deleted file mode 100644 index efe0105b7..000000000 --- a/test/cuda/cufinufft_setpts.cu +++ /dev/null @@ -1,255 +0,0 @@ -#ifdef NDEBUG -#undef NDEBUG -#include -#define NDEBUG -#else -#include -#endif - -#include - -#include -#include -#include - -#include - -#include -#include -#include - -// for now, once finufft is demacroized we can test float -using test_t = double; - -template bool equal(V *d_vec, T *cpu, const std::size_t size) { - // copy d_vec to cpu - thrust::host_vector h_vec(size); - // this implicitly converts cuda_complex to std::complex... which is fine, but it may - // cause issues use it with case - assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == - cudaSuccess); - for (std::size_t i = 0; i < size; ++i) { - if (h_vec[i] != cpu[i]) { - std::cout << " gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] - << std::endl; - return false; - } - } - return true; -} - -template -T infnorm(std::complex *a, std::complex *b, const std::size_t n) { - T err{0}, max_element{0}; - for (std::size_t m = 0; m < n; ++m) { - // std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; - err = std::max(err, std::abs(a[m] - b[m])); - max_element = std::max(std::max(std::abs(a[m]), std::abs(b[m])), max_element); - } - return err / max_element; -} -// max error divide by max element -// max ( abs(a-b)) / max(abs(a)) -// 10*(machine precision) -template -T relerrtwonorm(std::complex *a, std::complex *b, const std::size_t n) { - T err{0}, nrm{0}; - for (std::size_t m = 0; m < n; ++m) { - // std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; - nrm += std::real(std::conj(a[m]) * a[m]); - const auto diff = a[m] - b[m]; - err += 
std::real(std::conj(diff) * diff); - } - return std::sqrt(err / nrm); -} - -template -auto almost_equal(V *d_vec, - T *cpu, - const std::size_t size, - const contained tol = std::numeric_limits::epsilon()) { - // copy d_vec to cpu - std::vector h_vec(size); - // this implicitly converts cuda_complex to std::complex... which is fine, but it may - // cause issues use it with case - assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == - cudaSuccess); - std::cout << "infnorm: " << infnorm(h_vec.data(), cpu, size) << std::endl; - // compare the l2 norm of the difference between the two vectors - if (infnorm(h_vec.data(), cpu, size) < tol) { - return true; - } - return false; -} - -int main() { - // defaults. tests should shadow them to override - cufinufft_opts opts; - cufinufft_default_opts(&opts); - opts.debug = 2; - finufft_opts fin_opts; - finufft_default_opts(&fin_opts); - fin_opts.debug = 2; - const int iflag = 1; - const float tol = 1e-9; - const int ntransf = 1; - const int dim = 3; - int n_modes[3] = {10, 20, 15}; - const int N = n_modes[0] * n_modes[1] * n_modes[2]; - const int M = 100; - - thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), - s(N * ntransf), t(N * ntransf), u(N * ntransf); - thrust::host_vector> c(M * ntransf), fk(N * ntransf); - - thrust::device_vector d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{}; - thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); - - std::default_random_engine eng(42); - std::uniform_real_distribution dist11(-1, 1); - auto rand_util_11 = [&eng, &dist11]() { - return dist11(eng); - }; - - // Making data - for (int64_t i = 0; i < M; i++) { - x[i] = M_PI * rand_util_11() + 4; // x in [-pi,pi) - y[i] = M_PI * rand_util_11() + 4; - z[i] = M_PI * rand_util_11() + 4; - } - for (int64_t i = 0; i < N; i++) { - s[i] = M_PI * rand_util_11() + 8; // shifted so D1 is 8 - t[i] = M_PI * rand_util_11() + 8; // shifted so D2 is 8 - u[i] = M_PI * rand_util_11() + 8; // shifted so D3 is 8 
- } - - for (int64_t i = M; i < M * ntransf; ++i) { - int64_t j = i % M; - x[i] = x[j]; - y[i] = y[j]; - z[i] = z[j]; - } - for (int64_t i = M; i < N * ntransf; ++i) { - int64_t j = i % N; - s[i] = s[j]; - t[i] = t[j]; - u[i] = u[j]; - } - // copy x, y, z, s, t, u to device d_x, d_y, d_z, d_s, d_t, d_u - d_x = x; - d_y = y; - d_z = z; - d_s = s; - d_t = t; - d_u = u; - cudaDeviceSynchronize(); - - const auto cpu_planer = - [iflag, tol, ntransf, dim, n_modes, M, N, &x, &y, &z, &s, &t, &u, &fin_opts]( - const auto type) { - int64_t Nl[3] = {int64_t(n_modes[0]), int64_t(n_modes[1]), int64_t(n_modes[2])}; - finufft_plan_s *plan{nullptr}; - assert( - finufft_makeplan(type, dim, Nl, iflag, ntransf, tol, &plan, &fin_opts) == 0); - assert(finufft_setpts(plan, M, x.data(), y.data(), z.data(), N, s.data(), - t.data(), u.data()) == 0); - return plan; - }; - const auto test_type1 = [iflag, tol, ntransf, dim, n_modes, cpu_planer, &opts]( - auto *plan) { - // plan is a pointer to a type that contains real_t - using T = typename std::remove_pointer::type::real_t; - const int type = 1; - assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), - &plan, &opts) == 0); - const auto cpu_plan = cpu_planer(type); - cudaDeviceSynchronize(); - assert(cufinufft_destroy_impl(plan) == 0); - assert(finufft_destroy(cpu_plan) == 0); - plan = nullptr; - }; - auto test_type2 = [iflag, tol, ntransf, dim, n_modes, cpu_planer, &opts](auto plan) { - // plan is a pointer to a type that contains real_t - using T = typename std::remove_pointer::type::real_t; - const int type = 2; - assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), - &plan, &opts) == 0); - const auto cpu_plan = cpu_planer(type); - cudaDeviceSynchronize(); - assert(cufinufft_destroy_impl(plan) == 0); - cudaDeviceSynchronize(); - assert(finufft_destroy(cpu_plan) == 0); - plan = nullptr; - }; - auto test_type3 = [iflag, - tol, - ntransf, - dim, - n_modes, - cpu_planer, - M, - N, - 
&d_x, - &d_y, - &d_z, - &d_s, - &d_t, - &d_u, - &opts](auto plan) { - // plan is a pointer to a type that contains real_t - using T = typename std::remove_pointer::type::real_t; - const int type = 3; - const auto cpu_plan = cpu_planer(type); - assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), - &plan, &opts) == 0); - assert(cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), - d_z.data().get(), N, d_s.data().get(), - d_t.data().get(), d_u.data().get(), plan) == 0); - cudaDeviceSynchronize(); - assert(plan->type3_params.X1 == cpu_plan->t3P.X1); - assert(plan->type3_params.X2 == cpu_plan->t3P.X2); - assert(plan->type3_params.X3 == cpu_plan->t3P.X3); - assert(plan->type3_params.C1 == cpu_plan->t3P.C1); - assert(plan->type3_params.C2 == cpu_plan->t3P.C2); - assert(plan->type3_params.C3 == cpu_plan->t3P.C3); - assert(plan->type3_params.D1 == cpu_plan->t3P.D1); - assert(plan->type3_params.D2 == cpu_plan->t3P.D2); - assert(plan->type3_params.D3 == cpu_plan->t3P.D3); - assert(plan->type3_params.gam1 == cpu_plan->t3P.gam1); - assert(plan->type3_params.gam2 == cpu_plan->t3P.gam2); - assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); - assert(plan->nf1 == cpu_plan->nf1); - assert(plan->nf2 == cpu_plan->nf2); - assert(plan->nf3 == cpu_plan->nf3); - assert(equal(plan->kx, cpu_plan->X, M)); - assert(equal(plan->ky, cpu_plan->Y, M)); - assert(equal(plan->kz, cpu_plan->Z, M)); - assert(equal(plan->d_s, cpu_plan->Sp, N)); - assert(equal(plan->d_t, cpu_plan->Tp, N)); - assert(equal(plan->d_u, cpu_plan->Up, N)); - // NOTE:seems with infnorm we are getting at most 11 digits of precision - assert(almost_equal(plan->prephase, cpu_plan->prephase, M, tol * T(1e-2))); - assert(almost_equal(plan->deconv, cpu_plan->deconv, N, tol * T(1e-2))); - assert(cufinufft_destroy_impl(plan) == 0); - assert(finufft_destroy(cpu_plan) == 0); - plan = nullptr; - cudaDeviceSynchronize(); - }; - // testing correctness of the plan creation - // cufinufft_plan_t 
*single_plan{nullptr}; - cufinufft_plan_t *double_plan{nullptr}; - // test_type1(double_plan); - // test_type2(double_plan); - test_type3(double_plan); - return 0; -} - -#ifdef __clang__ -#pragma clang diagnostic pop -#elif defined(__GNUC__) || defined(__GNUG__) -#pragma GCC diagnostic pop -#elif defined(__NVCC__) -#pragma diag_default 177 - D -#elif defined(_MSC_VER) -#pragma warning(pop) -#endif diff --git a/test/cuda/cufinufft_type3_test.cu b/test/cuda/cufinufft_type3_test.cu deleted file mode 100644 index d88a07cec..000000000 --- a/test/cuda/cufinufft_type3_test.cu +++ /dev/null @@ -1,402 +0,0 @@ -#ifdef NDEBUG -#undef NDEBUG -#include -#define NDEBUG -#else -#include -#endif - -#include - -#include -#include -#include - -#include -#include -#include - -#include - -template bool equal(V *d_vec, T *cpu, const std::size_t size) { - // copy d_vec to cpu - thrust::host_vector h_vec(size); - // this implicitly converts cuda_complex to std::complex... which is fine, but it may - // cause issues use it with case - assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == - cudaSuccess); - for (std::size_t i = 0; i < size; ++i) { - if (h_vec[i] != cpu[i]) { - std::cout << " gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] - << std::endl; - return false; - } - } - return true; -} - -template -T infnorm(std::complex *a, std::complex *b, const std::size_t n) { - T err{0}, max_element{0}; - for (std::size_t m = 0; m < n; ++m) { - // std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; - err = std::max(err, std::abs(a[m] - b[m])); - max_element = std::max(std::max(std::abs(a[m]), std::abs(b[m])), max_element); - } - return err / max_element; -} -// max error divide by max element -// max ( abs(a-b)) / max(abs(a)) -// 10*(machine precision) -template -T relerrtwonorm(std::complex *a, std::complex *b, const std::size_t n) { - T err{0}, nrm{0}; - for (std::size_t m = 0; m < n; ++m) { - // std::cout << "a[" 
<< m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; - nrm += std::real(std::conj(a[m]) * a[m]); - const auto diff = a[m] - b[m]; - auto this_err = std::real(std::conj(diff) * diff); - if (this_err > 1e-12) { - std::cout << "a[" << m << "]: " << a[m] << " b[" << m << "]: " << b[m] << "\n"; - std::cout << "diff: " << diff << " this_err: " << this_err << std::endl; - } - err += this_err; - } - return std::sqrt(err / nrm); -} - -template -auto almost_equal(V *d_vec, T *cpu, const std::size_t size, - const contained tol = std::numeric_limits::epsilon(), - bool print = false) { - // copy d_vec to cpu - std::vector h_vec(size); - // this implicitly converts cuda_complex to std::complex... which is fine, but it may - // cause issues use it with case - assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) == - cudaSuccess); - cudaDeviceSynchronize(); - // print h_vec and cpu - if (print) { - std::cout << std::setprecision(15); - for (std::size_t i = 0; i < size; ++i) { - std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i] - << '\n'; - } - std::cout << std::setprecision(6); - } - const auto error = relerrtwonorm(h_vec.data(), cpu, size); - std::cout << "relerrtwonorm: " << error << std::endl; - // compare the l2 norm of the difference between the two vectors - return (error < tol); -} - -template -void dirft3d3(T1 nj, T2 *x, T2 *y, T2 *z, T4 *c, T3 iflag, T1 nk, T2 *s, T2 *t, T2 *u, - T4 *f) -/* Direct computation of 3D type-3 nonuniform FFT. Interface same as finufft3d3 -c nj-1 -c f[k] = SUM c[j] exp(+-i (s[k] x[j] + t[k] y[j] + u[k] z[j])) -c j=0 -c for k = 0, ..., nk-1 -c If iflag>0 the + sign is used, otherwise the - sign is used, in the -c exponential. Uses C++ complex type. Simple brute force. Barnett 2/1/17 -*/ -{ - for (BIGINT k = 0; k < nk; ++k) { - CPX ss = (iflag > 0) ? IMA * s[k] : -IMA * s[k]; - CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k]; - CPX uu = (iflag > 0) ? 
IMA * u[k] : -IMA * u[k]; - f[k] = CPX(0, 0); - for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j] + uu * z[j]); - } -} - -int main() { - // for now, once finufft is demacroized we can test float - using test_t = double; - - // defaults. tests should shadow them to override - cufinufft_opts opts; - cufinufft_default_opts(&opts); - opts.debug = 2; - opts.upsampfac = 2.00; - opts.gpu_kerevalmeth = 1; - opts.gpu_method = 1; - opts.gpu_sort = 1; - opts.modeord = 0; - finufft_opts fin_opts; - finufft_default_opts(&fin_opts); - fin_opts.debug = opts.debug; - fin_opts.spread_kerevalmeth = opts.gpu_kerevalmeth; - fin_opts.upsampfac = opts.upsampfac; - fin_opts.spread_sort = opts.gpu_sort; - fin_opts.modeord = opts.modeord; - const int iflag = 1; - const int ntransf = 10; - const int dim = 3; - const double tol = 1e-13; - const int n_modes[] = {10, 4, 2}; - const int N = n_modes[0] * (dim > 1 ? n_modes[1] : 1) * (dim > 2 ? n_modes[2] : 1); - const int M = 20; - const double bandwidth = 1.0; - - thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf), - s(N * ntransf), t(N * ntransf), u(N * ntransf); - thrust::host_vector> c(M * ntransf), fk(N * ntransf); - - thrust::device_vector d_x(M * ntransf), d_y(M * ntransf), d_z(M * ntransf), - d_s(N * ntransf), d_t(N * ntransf), d_u(N * ntransf); - thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); - - std::default_random_engine eng(42); - std::uniform_real_distribution dist11(-1, 1); - auto rand_util_11 = [&eng, &dist11]() { - return dist11(eng); - }; - - // Making data - for (int64_t i = 0; i < M; i++) { - x[i] = rand_util_11(); // x in [-pi,pi) - y[i] = rand_util_11(); - z[i] = rand_util_11(); - } - for (int64_t i = 0; i < N; i++) { - s[i] = M_PI * rand_util_11() * bandwidth; // shifted so D1 is 8 - t[i] = M_PI * rand_util_11() * bandwidth; // shifted so D2 is 8 - u[i] = M_PI * rand_util_11() * bandwidth; // shifted so D3 is 8 - } - - const double deconv_tol = 
std::numeric_limits::epsilon() * bandwidth * 1000; - - for (int64_t i = M; i < M * ntransf; ++i) { - int64_t j = i % M; - x[i] = x[j]; - y[i] = y[j]; - z[i] = z[j]; - } - for (int64_t i = N; i < N * ntransf; ++i) { - int64_t j = i % N; - s[i] = s[j]; - t[i] = t[j]; - u[i] = u[j]; - } - - // copy x, y, z, s, t, u to device d_x, d_y, d_z, d_s, d_t, d_u - d_x = x; - d_y = y; - d_z = z; - d_s = s; - d_t = t; - d_u = u; - cudaDeviceSynchronize(); - - const auto cpu_planer = [iflag, tol, ntransf, dim, M, N, n_modes, &x, &y, &z, &s, &t, - &u, &fin_opts](const auto type) { - finufft_plan_s *plan{nullptr}; - std::int64_t nl[] = {n_modes[0], n_modes[1], n_modes[2]}; - assert(finufft_makeplan(type, dim, nl, iflag, ntransf, tol, &plan, &fin_opts) == 0); - assert(finufft_setpts(plan, M, x.data(), y.data(), z.data(), N, s.data(), t.data(), - u.data()) == 0); - return plan; - }; - - const auto test_type1 = [iflag, tol, ntransf, dim, cpu_planer, M, N, n_modes, &d_x, - &d_y, &d_z, &c, &d_c, &fk, &d_fk, &opts, - &rand_util_11](auto plan) { - // plan is a pointer to a type that contains real_t - using T = typename std::remove_pointer::type::real_t; - const int type = 1; - const auto cpu_plan = cpu_planer(type); - assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), - &plan, &opts) == 0); - cudaDeviceSynchronize(); - assert( - cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), - 0, nullptr, nullptr, nullptr, plan) == 0); - cudaDeviceSynchronize(); - assert(plan->nf1 == cpu_plan->nf1); - assert(plan->nf2 == cpu_plan->nf2); - assert(plan->nf3 == cpu_plan->nf3); - assert(plan->spopts.nspread == cpu_plan->spopts.nspread); - assert(plan->spopts.upsampfac == cpu_plan->spopts.upsampfac); - assert(plan->spopts.ES_beta == cpu_plan->spopts.ES_beta); - assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth); - assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); - - for (int i = 0; i < M * ntransf; i++) { - 
c[i].real(rand_util_11()); - c[i].imag(rand_util_11()); - } - d_c = c; - cudaDeviceSynchronize(); - cufinufft_execute_impl((cuda_complex *)d_c.data().get(), - (cuda_complex *)d_fk.data().get(), plan); - finufft_execute(cpu_plan, (std::complex *)c.data(), (std::complex *)fk.data()); - std::cout << "type " << type << ": "; - assert(almost_equal(d_fk.data().get(), fk.data(), N * ntransf, tol * 10)); - assert(cufinufft_destroy_impl(plan) == 0); - assert(finufft_destroy(cpu_plan) == 0); - cudaDeviceSynchronize(); - plan = nullptr; - }; - - const auto test_type2 = [iflag, tol, ntransf, dim, cpu_planer, M, N, n_modes, &d_x, - &d_y, &d_z, &c, &d_c, &fk, &d_fk, &opts, - &rand_util_11](auto plan) { - // plan is a pointer to a type that contains real_t - using T = typename std::remove_pointer::type::real_t; - const int type = 2; - const auto cpu_plan = cpu_planer(type); - assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), - &plan, &opts) == 0); - cudaDeviceSynchronize(); - assert( - cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), - 0, nullptr, nullptr, nullptr, plan) == 0); - cudaDeviceSynchronize(); - assert(plan->nf1 == cpu_plan->nf1); - assert(plan->nf2 == cpu_plan->nf2); - assert(plan->nf3 == cpu_plan->nf3); - assert(plan->spopts.nspread == cpu_plan->spopts.nspread); - assert(plan->spopts.upsampfac == cpu_plan->spopts.upsampfac); - assert(plan->spopts.ES_beta == cpu_plan->spopts.ES_beta); - assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth); - assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); - - for (int i = 0; i < N * ntransf; i++) { - fk[i].real(rand_util_11()); - fk[i].imag(rand_util_11()); - } - d_fk = fk; - cudaDeviceSynchronize(); - cufinufft_execute_impl((cuda_complex *)d_c.data().get(), - (cuda_complex *)d_fk.data().get(), plan); - finufft_execute(cpu_plan, c.data(), fk.data()); - cudaDeviceSynchronize(); - std::cout << "type " << type << ": "; - assert(almost_equal(d_c.data().get(), 
c.data(), M * ntransf, tol)); - assert(cufinufft_destroy_impl(plan) == 0); - assert(finufft_destroy(cpu_plan) == 0); - cudaDeviceSynchronize(); - plan = nullptr; - }; - - const auto test_type3 = [iflag, tol, ntransf, dim, cpu_planer, deconv_tol, M, N, - n_modes, &d_x, &d_y, &d_z, &d_s, &d_t, &d_u, &c, &d_c, &fk, - &d_fk, &opts, &rand_util_11](auto plan) { - // plan is a pointer to a type that contains real_t - using T = typename std::remove_pointer::type::real_t; - const int type = 3; - const auto cpu_plan = cpu_planer(type); - assert(cufinufft_makeplan_impl(type, dim, (int *)n_modes, iflag, ntransf, T(tol), - &plan, &opts) == 0); - cudaDeviceSynchronize(); - assert(cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), - d_z.data().get(), N, d_s.data().get(), - d_t.data().get(), d_u.data().get(), plan) == 0); - cudaDeviceSynchronize(); - assert(plan->type3_params.X1 == cpu_plan->t3P.X1); - if (dim > 1) assert(plan->type3_params.X2 == cpu_plan->t3P.X2); - if (dim > 2) assert(plan->type3_params.X3 == cpu_plan->t3P.X3); - assert(plan->type3_params.C1 == cpu_plan->t3P.C1); - if (dim > 1) assert(plan->type3_params.C2 == cpu_plan->t3P.C2); - if (dim > 2) assert(plan->type3_params.C3 == cpu_plan->t3P.C3); - assert(plan->type3_params.D1 == cpu_plan->t3P.D1); - if (dim > 1) assert(plan->type3_params.D2 == cpu_plan->t3P.D2); - if (dim > 2) assert(plan->type3_params.D3 == cpu_plan->t3P.D3); - assert(plan->type3_params.gam1 == cpu_plan->t3P.gam1); - if (dim > 1) assert(plan->type3_params.gam2 == cpu_plan->t3P.gam2); - if (dim > 2) assert(plan->type3_params.gam3 == cpu_plan->t3P.gam3); - assert(plan->type3_params.h1 == cpu_plan->t3P.h1); - if (dim > 1) assert(plan->type3_params.h2 == cpu_plan->t3P.h2); - if (dim > 2) assert(plan->type3_params.h3 == cpu_plan->t3P.h3); - assert(plan->nf1 == cpu_plan->nf1); - if (dim > 1) assert(plan->nf2 == cpu_plan->nf2); - if (dim > 2) assert(plan->nf3 == cpu_plan->nf3); - assert(equal(plan->kx, cpu_plan->X, M)); - if (dim > 1) 
assert(equal(plan->ky, cpu_plan->Y, M)); - if (dim > 2) assert(equal(plan->kz, cpu_plan->Z, M)); - assert(equal(plan->d_s, cpu_plan->Sp, N)); - if (dim > 1) assert(equal(plan->d_t, cpu_plan->Tp, N)); - if (dim > 2) assert(equal(plan->d_u, cpu_plan->Up, N)); - assert(plan->spopts.nspread == cpu_plan->spopts.nspread); - assert(plan->spopts.upsampfac == cpu_plan->spopts.upsampfac); - assert(plan->spopts.ES_beta == cpu_plan->spopts.ES_beta); - assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth); - assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c); - std::cout << "prephase :\n"; - assert(almost_equal(plan->prephase, cpu_plan->prephase, M, - std::numeric_limits::epsilon() * 100)); - std::cout << "deconv :\n"; - assert(almost_equal(plan->deconv, cpu_plan->deconv, N, deconv_tol)); - - assert(plan->t2_plan->nf1 == cpu_plan->innerT2plan->nf1); - if (dim > 1) assert(plan->t2_plan->nf2 == cpu_plan->innerT2plan->nf2); - if (dim > 2) assert(plan->t2_plan->nf3 == cpu_plan->innerT2plan->nf3); - assert(plan->t2_plan->nf == cpu_plan->innerT2plan->nf); - - assert(plan->t2_plan->spopts.nspread == cpu_plan->innerT2plan->spopts.nspread); - assert(plan->t2_plan->spopts.upsampfac == cpu_plan->innerT2plan->spopts.upsampfac); - assert(plan->t2_plan->spopts.ES_beta == cpu_plan->innerT2plan->spopts.ES_beta); - assert( - plan->t2_plan->spopts.ES_halfwidth == cpu_plan->innerT2plan->spopts.ES_halfwidth); - assert(plan->t2_plan->spopts.ES_c == cpu_plan->innerT2plan->spopts.ES_c); - assert(plan->t2_plan->ms == cpu_plan->innerT2plan->ms); - assert(plan->t2_plan->mt == cpu_plan->innerT2plan->mt); - assert(plan->t2_plan->mu == cpu_plan->innerT2plan->mu); - int nf[] = {plan->t2_plan->nf1, plan->t2_plan->nf2, plan->t2_plan->nf3}; - T *fwkerhalf[] = {plan->t2_plan->fwkerhalf1, plan->t2_plan->fwkerhalf2, - plan->t2_plan->fwkerhalf3}; - T *phiHat[] = {cpu_plan->innerT2plan->phiHat1, cpu_plan->innerT2plan->phiHat2, - cpu_plan->innerT2plan->phiHat3}; - for (int idx = 0; idx < dim; ++idx) { - 
std::cout << "nf[" << idx << "]: " << nf[idx] << std::endl; - const auto size = (nf[idx] / 2 + 1); - std::vector fwkerhalf_host(size, -1); - const auto ier = cudaMemcpy(fwkerhalf_host.data(), fwkerhalf[idx], size * sizeof(T), - cudaMemcpyDeviceToHost); - if (ier != cudaSuccess) { - std::cerr << "Error: " << cudaGetErrorString(ier) << std::endl; - } - assert(ier == cudaSuccess); - cudaDeviceSynchronize(); - for (int i = 0; i < size; i++) { - const auto error = abs(1 - fwkerhalf_host[i] / phiHat[idx][i]); - assert(error < tol * 1000); - } - } - for (int i = 0; i < M * ntransf; i++) { - c[i].real(rand_util_11()); - c[i].imag(rand_util_11()); - } - d_c = c; - cufinufft_execute_impl((cuda_complex *)d_c.data().get(), - (cuda_complex *)d_fk.data().get(), plan); - finufft_execute(cpu_plan, c.data(), fk.data()); - cudaDeviceSynchronize(); - if (ntransf == 1) { // cpu and gpu handle batching differently - std::cout << "CpBatch : "; - assert(almost_equal(plan->c_batch, cpu_plan->CpBatch, M, tol, false)); - std::cout << "fw : "; - assert(almost_equal(plan->fw, cpu_plan->fwBatch, plan->nf, tol * 10, false)); - std::cout << "t2_plan->fw : "; - assert(almost_equal(plan->t2_plan->fw, cpu_plan->innerT2plan->fwBatch, - plan->t2_plan->nf, std::numeric_limits::epsilon() * 100)); - } - - std::cout << "fk : "; - assert(almost_equal(d_fk.data().get(), fk.data(), N * ntransf, tol * 10, false)); - assert(cufinufft_destroy_impl(plan) == 0); - assert(finufft_destroy(cpu_plan) == 0); - plan = nullptr; - cudaDeviceSynchronize(); - }; - // testing correctness of the plan creation - // cufinufft_plan_t *single_plan{nullptr}; - cufinufft_plan_t *double_plan{nullptr}; - test_type1(double_plan); - test_type2(double_plan); - test_type3(double_plan); - return 0; -} diff --git a/test/cuda/fseries_kernel_test.cu b/test/cuda/fseries_kernel_test.cu deleted file mode 100644 index 0ab233ace..000000000 --- a/test/cuda/fseries_kernel_test.cu +++ /dev/null @@ -1,164 +0,0 @@ -#include -#include -#include 
-#include -#include - -#include -#include -#include -#include - -using namespace cufinufft::common; -using namespace cufinufft::spreadinterp; -using namespace cufinufft::utils; - -template int run_test(int nf1, int dim, T eps, int gpu, int nf2, int nf3) { - - finufft_spread_opts opts; - T *fwkerhalf1, *fwkerhalf2, *fwkerhalf3; - T *d_fwkerhalf1, *d_fwkerhalf2, *d_fwkerhalf3; - checkCudaErrors(cudaMalloc(&d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1))); - if (dim > 1) checkCudaErrors(cudaMalloc(&d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1))); - if (dim > 2) checkCudaErrors(cudaMalloc(&d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1))); - - int ier = setup_spreader(opts, (T)eps, (T)2.0, 0); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - float milliseconds = 0; - float gputime = 0; - float cputime = 0; - - CNTime timer; - if (!gpu) { - // timer.start(); - // fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); - // if (dim > 1) fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); - // if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); - // - // onedim_fseries_kernel(nf1, fwkerhalf1, opts); - // if (dim > 1) onedim_fseries_kernel(nf2, fwkerhalf2, opts); - // if (dim > 2) onedim_fseries_kernel(nf3, fwkerhalf3, opts); - // cputime = timer.elapsedsec(); - // cudaEventRecord(start); - // { - // checkCudaErrors(cudaMemcpy(d_fwkerhalf1, fwkerhalf1, sizeof(T) * (nf1 / 2 + - // 1), - // cudaMemcpyHostToDevice)); - // if (dim > 1) - // checkCudaErrors(cudaMemcpy(d_fwkerhalf2, fwkerhalf2, sizeof(T) * (nf2 / 2 + - // 1), - // cudaMemcpyHostToDevice)); - // if (dim > 2) - // checkCudaErrors(cudaMemcpy(d_fwkerhalf3, fwkerhalf3, sizeof(T) * (nf3 / 2 + - // 1), - // cudaMemcpyHostToDevice)); - // } - // cudaEventRecord(stop); - // cudaEventSynchronize(stop); - // cudaEventElapsedTime(&milliseconds, start, stop); - // gputime = milliseconds; - // printf("[time ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", dim, nf1, - // opts.nspread, - // gputime + 
cputime * 1000); - // free(fwkerhalf1); - // if (dim > 1) free(fwkerhalf2); - // if (dim > 2) free(fwkerhalf3); - } else { - timer.start(); - T a[dim * MAX_NQUAD]; - T f[dim * MAX_NQUAD]; - onedim_fseries_kernel_precomp(nf1, f, a, opts); - if (dim > 1) - onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, opts); - if (dim > 2) - onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, - opts); - cputime = timer.elapsedsec(); - - T *d_a; - T *d_f; - cudaEventRecord(start); - { - checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(T))); - checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T))); - checkCudaErrors( - cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice)); - ier = - cufserieskernelcompute(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, - d_fwkerhalf3, opts.nspread, cudaStreamDefault); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - gputime = milliseconds; - printf("[time ] dim=%d, nf1=%8d, ns=%2d, GPU: %6.2f ms\n", dim, nf1, opts.nspread, - gputime + cputime * 1000); - cudaFree(d_a); - cudaFree(d_f); - } - - fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); - if (dim > 1) fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); - if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); - - checkCudaErrors(cudaMemcpy(fwkerhalf1, d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), - cudaMemcpyDeviceToHost)); - if (dim > 1) - checkCudaErrors(cudaMemcpy(fwkerhalf2, d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), - cudaMemcpyDeviceToHost)); - if (dim > 2) - checkCudaErrors(cudaMemcpy(fwkerhalf3, d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), - cudaMemcpyDeviceToHost)); - for (int i = 0; i < nf1 / 2 + 1; i++) printf("%10.8e ", fwkerhalf1[i]); - printf("\n"); - if (dim > 1) - for (int i = 0; i < nf2 / 2 + 1; i++) printf("%10.8e ", fwkerhalf2[i]); - 
printf("\n"); - if (dim > 2) - for (int i = 0; i < nf3 / 2 + 1; i++) printf("%10.8e ", fwkerhalf3[i]); - printf("\n"); - - return 0; -} - -int main(int argc, char *argv[]) { - if (argc < 3) { - fprintf(stderr, - "Usage: onedim_fseries_kernel_test prec nf1 [dim [tol [gpuversion [nf2 " - "[nf3]]]]]\n" - "Arguments:\n" - " prec: 'f' or 'd' (float/double)\n" - " nf1: The size of the upsampled fine grid size in x.\n" - " dim: Dimension of the nuFFT.\n" - " tol: NUFFT tolerance (default 1e-6).\n" - " gpuversion: Use gpu version or not (default True).\n" - " nf2: The size of the upsampled fine grid size in y. (default nf1)\n" - " nf3: The size of the upsampled fine grid size in z. (default nf3)\n"); - return 1; - } - char prec = argv[1][0]; - int nf1 = std::atof(argv[2]); - int dim = 1; - double eps = 1e-6; - int gpu = 1; - int nf2 = nf1; - int nf3 = nf1; - if (argc > 3) dim = std::atoi(argv[3]); - if (argc > 4) eps = std::atof(argv[4]); - if (argc > 5) gpu = std::atoi(argv[5]); - if (argc > 6) nf2 = std::atoi(argv[6]); - if (argc > 7) nf3 = std::atoi(argv[7]); - - if (prec == 'f') - return run_test(nf1, dim, eps, gpu, nf2, nf3); - else if (prec == 'd') - return run_test(nf1, dim, eps, gpu, nf2, nf3); - else - return -1; -} diff --git a/test/cuda/fseriesperf.sh b/test/cuda/fseriesperf.sh deleted file mode 100755 index 36af42276..000000000 --- a/test/cuda/fseriesperf.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# basic perf test of compute fseries for 1d, single/double -# Melody 02/20/22 - -BINDIR=./ - -BIN=$BINDIR/fseries_kernel_test -DIM=1 - -echo "Double.............................................." -for N in 1e2 5e2 1e3 2e3 5e3 1e4 5e4 1e5 5e5 -do - for TOL in 1e-8 - do - $BIN $N $DIM $TOL 0 - $BIN $N $DIM $TOL 1 - done -done - -BIN=$BINDIR/fseries_kernel_testf -echo "Single.............................................." 
-for N in 1e2 5e2 1e3 2e3 5e3 1e4 5e4 1e5 5e5 -do - for TOL in 1e-6 - do - $BIN $N $DIM $TOL 0 - $BIN $N $DIM $TOL 1 - done -done From 71ad4647ed497f4167f4aa1d9cbdc9bf9011ef51 Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Wed, 28 Aug 2024 12:47:36 -0400 Subject: [PATCH 56/68] added extended lambda flag to tests --- perftest/cuda/CMakeLists.txt | 2 ++ src/cuda/CMakeLists.txt | 7 +++++-- test/cuda/CMakeLists.txt | 6 ++++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index 8b1ad1c9b..92c870cc5 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -2,6 +2,8 @@ add_executable(cuperftest cuperftest.cu) target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cuperftest PUBLIC cufinufft) target_compile_features(cuperftest PRIVATE cxx_std_17) +target_compile_options(cuperftest + PRIVATE $<$:--extended-lambda>) set_target_properties( cuperftest PROPERTIES LINKER_LANGUAGE CUDA diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index dfe5c9b20..0dfed1886 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -37,7 +37,10 @@ set(FINUFFT_CUDA_FLAGS -fmad=true -restrict --extra-device-vectorization - # $<$:-G -maxrregcount 64 > + $<$:-G + -maxrregcount + 64 + > >) add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC}) @@ -81,7 +84,7 @@ set_target_properties( CUDA_STANDARD_REQUIRED ON ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") target_compile_features(cufinufft PRIVATE cxx_std_17) -target_compile_options(cufinufft PUBLIC ${FINUFFT_CUDA_FLAGS}) +target_compile_options(cufinufft PRIVATE ${FINUFFT_CUDA_FLAGS}) if(WIN32) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft CUDA::nvToolsExt) diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 212fd6674..d40e8e66b 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -5,14 +5,16 @@ 
foreach(srcfile ${test_src}) get_filename_component(executable ${executable} NAME) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) + target_compile_options(${executable} + PUBLIC $<$:--extended-lambda>) find_library(MathLib m) if(MathLib) target_link_libraries(${executable} PUBLIC cufinufft ${MathLib}) endif() target_compile_features(${executable} PUBLIC cxx_std_17) set_target_properties( - ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES - "${FINUFFT_CUDA_ARCHITECTURES}") + ${executable} PROPERTIES LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}") message(STATUS "Adding test ${executable}" " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}") From a494518b3027b53d26d67b79fe67efadf3bd770f Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Wed, 28 Aug 2024 13:50:09 -0400 Subject: [PATCH 57/68] CleanUP --- devel/cuda/draft_interfaces_c+py_Jun2023.txt | 4 +- docs/cufinufft_migration.rst | 6 +- docs/opts.rst | 28 +++--- include/cufinufft/impl.h | 37 ++++---- include/cufinufft/precision_independent.h | 4 - include/cufinufft/spreadinterp.h | 28 +++++- include/cufinufft/types.h | 19 ++-- include/cufinufft/utils.h | 2 - matlab/finufft.mw | 8 +- matlab/opts.docbit | 2 +- src/cuda/1d/cufinufft1d.cu | 36 ++++---- src/cuda/2d/cufinufft2d.cu | 30 +++---- src/cuda/3d/cufinufft3d.cu | 30 +++---- src/cuda/common.cu | 95 ++++---------------- src/cuda/deconvolve_wrapper.cu | 6 +- src/cuda/memtransfer_wrapper.cu | 16 ++-- src/cuda/precision_independent.cu | 14 --- src/finufft.cpp | 49 ---------- 18 files changed, 148 insertions(+), 266 deletions(-) diff --git a/devel/cuda/draft_interfaces_c+py_Jun2023.txt b/devel/cuda/draft_interfaces_c+py_Jun2023.txt index 699f22a85..b5620087b 100644 --- a/devel/cuda/draft_interfaces_c+py_Jun2023.txt +++ b/devel/cuda/draft_interfaces_c+py_Jun2023.txt @@ -1,6 +1,6 @@ int finufft_makeplan(int type, 
int dim, int64_t* nmodes, int iflag, int ntr, double eps, finufft_plan* plan, nufft_opts* opts) -int cufinufft_makeplan(int type, int dim, int* nmodes, int iflag, int ntransf, double tol, int maxbatchsize, cufinufft_plan *plan, cufinufft_opts *opts) -// Remove maxbatchsize (-> opts), use int64_t. Rename ntransf to ntr, tol to eps. +int cufinufft_makeplan(int type, int dim, int* nmodes, int iflag, int ntransf, double tol, int batchsize, cufinufft_plan *plan, cufinufft_opts *opts) +// Remove batchsize (-> opts), use int64_t. Rename ntransf to ntr, tol to eps. int finufft_setpts(finufft_plan plan, int64_t m, double* x, double* y, double* z, int64_t n, double* s, double* t, double* z) int cufinufft_setpts(int m, double* x, double* y, double* z, int n, double* s, double* t, double *u, cufinufft_plan plan) diff --git a/docs/cufinufft_migration.rst b/docs/cufinufft_migration.rst index c6c67b7c7..1f530f8be 100644 --- a/docs/cufinufft_migration.rst +++ b/docs/cufinufft_migration.rst @@ -17,7 +17,7 @@ The following function signatures were updated during the API change: .. code-block:: c int cufinufft_makeplan(int type, int dim, int *n_modes, int iflag, - int ntransf, double tol, int maxbatchsize, cufinufft_plan *plan, + int ntransf, double tol, int batchsize, cufinufft_plan *plan, cufinufft_opts *opts); and now has the signature @@ -29,7 +29,7 @@ The following function signatures were updated during the API change: cufinufft_opts *opts); - In other words, the ``n_modes`` argument now takes the type ``int64_t`` to accomodate larger arrays and the ``maxbatchsize`` argument has been removed (and can now be found as part of ``cufinufft_opts``). + In other words, the ``n_modes`` argument now takes the type ``int64_t`` to accomodate larger arrays and the ``batchsize`` argument has been removed (and can now be found as part of ``cufinufft_opts``). The ``tol`` and ``ntransf`` arguments have also been renamed to ``eps`` and ``ntr``, respectively. 
- ``cufinufft_setpts``, which had the signature @@ -45,7 +45,7 @@ The following function signatures were updated during the API change: .. code-block:: c int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, - double *d_y, double *d_z, int N, double *d_s, double *d_t, double *d_u); + double *d_y, double *d_z, int N, double *d_Sp, double *d_Tp, double *d_Up); Aside from name changes, main difference here is that the ``plan`` is now the first argument, not the last. diff --git a/docs/opts.rst b/docs/opts.rst index 8e32829d8..3dc06a437 100644 --- a/docs/opts.rst +++ b/docs/opts.rst @@ -20,7 +20,7 @@ to the simple, vectorized, or guru makeplan routines. Recall how to do this from C++: .. code-block:: C++ - + // (... set up M,x,c,tol,N, and allocate F here...) finufft_opts* opts; finufft_default_opts(opts); @@ -30,7 +30,7 @@ Recall how to do this from C++: This setting produces more timing output to ``stdout``. .. warning:: - + In C/C++ and Fortran, don't forget to call the command which sets default options (``finufft_default_opts`` or ``finufftf_default_opts``) before you start changing them and passing them to FINUFFT. @@ -51,9 +51,9 @@ Here are their default settings (from ``src/finufft.cpp:finufft_default_opts``): .. literalinclude:: ../src/finufft.cpp :start-after: @defopts_start :end-before: @defopts_end - + As for quick advice, the main options you'll want to play with are: - + - ``modeord`` to flip ("fftshift") the Fourier mode ordering - ``debug`` to look at timing output (to determine if your problem is spread/interpolation dominated, vs FFT dominated) - ``nthreads`` to run with a different number of threads than the current maximum available through OpenMP (a large number can sometimes be detrimental, and very small problems can sometimes run faster on 1 thread) @@ -92,7 +92,7 @@ Data handling options .. note:: The index *sets* are the same in the two ``modeord`` choices; their ordering differs only by a cyclic shift. 
The FFT ordering cyclically shifts the CMCL indices $\mbox{floor}(N/2)$ to the left (often called an "fftshift"). **chkbnds**: [DEPRECATED] has no effect. - + Diagnostic options ~~~~~~~~~~~~~~~~~~~~~~~ @@ -100,7 +100,7 @@ Diagnostic options **debug**: Controls the amount of overall debug/timing output to stdout. * ``debug=0`` : silent - + * ``debug=1`` : print some information * ``debug=2`` : prints more information @@ -113,11 +113,11 @@ Diagnostic options * ``spread_debug=2`` : prints lots. This can print thousands of lines since it includes one line per *subproblem*. - + **showwarn**: Whether to print warnings (these go to stderr). - + * ``showwarn=0`` : suppresses such warnings - + * ``showwarn=1`` : prints warnings @@ -173,17 +173,17 @@ for only two settings, as follows. Otherwise, setting it to zero chooses a good **spread_thread**: in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), controls how multithreading is used to spread/interpolate each batch of data. * ``spread_thread=0`` : makes an automatic choice between the below. Recommended. - + * ``spread_thread=1`` : acts on each vector in the batch in sequence, using multithreaded spread/interpolate on that vector. It can be slightly better than ``2`` for large problems. - + * ``spread_thread=2`` : acts on all vectors in a batch (of size chosen typically to be the number of threads) simultaneously, assigning each a thread which performs a single-threaded spread/interpolate. It is much better than ``1`` for all but large problems. (Historical note: this was used by Melody Shih for the original "2dmany" interface in 2018.) .. note:: - + Historical note: A former option ``3`` has been removed. This was like ``2`` except allowing nested OMP parallelism, so multi-threaded spread-interpolate was used for each of the vectors in a batch in parallel. This was used by Andrea Malleo in 2019. 
We have not yet found a case where this beats both ``1`` and ``2``, hence removed it due to complications with changing the OMP nesting state in both old and new OMP versions. - -**maxbatchsize**: in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), set the largest batch size of data vectors. + +**batchsize**: in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), set the largest batch size of data vectors. Here ``0`` makes an automatic choice. If you are unhappy with this, then for small problems it should equal the number of threads, while for large problems it appears that ``1`` often better (since otherwise too much simultaneous RAM movement occurs). Some further work is needed to optimize this parameter. **spread_nthr_atomic**: if non-negative: for numbers of threads up to this value, an OMP critical block for ``add_wrapped_subgrid`` is used in spreading (type 1 transforms). Above this value, instead OMP atomic writes are used, which scale better for large thread numbers. If negative, the heuristic default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``. diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index e9229525e..4580db8d0 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -120,7 +120,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran // TODO: check if this is the right heuristic if (maxbatchsize == 0) // implies: use a heuristic. maxbatchsize = std::min(ntransf, 8); // heuristic from test codes - d_plan->maxbatchsize = maxbatchsize; + d_plan->batchsize = maxbatchsize; const auto stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; @@ -466,19 +466,19 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ fprintf(stderr, "[%s] Error: d_s is nullptr but dim > 0.\n", __func__); return FINUFFT_ERR_INVALID_ARGUMENT; } - d_plan->d_s = d_plan->dim > 0 ? 
d_s : nullptr; + d_plan->d_Sp = d_plan->dim > 0 ? d_s : nullptr; if (d_plan->dim > 1 && d_t == nullptr) { fprintf(stderr, "[%s] Error: d_t is nullptr but dim > 1.\n", __func__); return FINUFFT_ERR_INVALID_ARGUMENT; } - d_plan->d_t = d_plan->dim > 1 ? d_t : nullptr; + d_plan->d_Tp = d_plan->dim > 1 ? d_t : nullptr; if (d_plan->dim > 2 && d_u == nullptr) { fprintf(stderr, "[%s] Error: d_u is nullptr but dim > 2.\n", __func__); return FINUFFT_ERR_INVALID_ARGUMENT; } - d_plan->d_u = d_plan->dim > 2 ? d_u : nullptr; + d_plan->d_Up = d_plan->dim > 2 ? d_u : nullptr; const auto dim = d_plan->dim; // no need to set the params to zero, as they are already zeroed out in the plan @@ -562,20 +562,20 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ }; // FIXME: check the size of the allocs for the batch interface if (checked_realloc(d_plan->fw, sizeof(cuda_complex) * d_plan->nf * - d_plan->maxbatchsize) != cudaSuccess) + d_plan->batchsize) != cudaSuccess) goto finalize; - if (checked_realloc(d_plan->c_batch, - sizeof(cuda_complex) * M * d_plan->maxbatchsize) != cudaSuccess) + if (checked_realloc(d_plan->CpBatch, sizeof(cuda_complex) * M * d_plan->batchsize) != + cudaSuccess) goto finalize; if (checked_realloc(d_plan->kx, sizeof(T) * M) != cudaSuccess) goto finalize; - if (checked_realloc(d_plan->d_s, sizeof(T) * N) != cudaSuccess) goto finalize; + if (checked_realloc(d_plan->d_Sp, sizeof(T) * N) != cudaSuccess) goto finalize; if (d_plan->dim > 1) { if (checked_realloc(d_plan->ky, sizeof(T) * M) != cudaSuccess) goto finalize; - if (checked_realloc(d_plan->d_t, sizeof(T) * N) != cudaSuccess) goto finalize; + if (checked_realloc(d_plan->d_Tp, sizeof(T) * N) != cudaSuccess) goto finalize; } if (d_plan->dim > 2) { if (checked_realloc(d_plan->kz, sizeof(T) * M) != cudaSuccess) goto finalize; - if (checked_realloc(d_plan->d_u, sizeof(T) * N) != cudaSuccess) goto finalize; + if (checked_realloc(d_plan->d_Up, sizeof(T) * N) != cudaSuccess) goto 
finalize; } if (checked_realloc(d_plan->prephase, sizeof(cuda_complex) * M) != cudaSuccess) goto finalize; @@ -644,21 +644,21 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ const auto scale = d_plan->type3_params.h1 * d_plan->type3_params.gam1; const auto D1 = -d_plan->type3_params.D1; thrust::transform( - thrust::cuda::par.on(stream), d_s, d_s + N, d_plan->d_s, + thrust::cuda::par.on(stream), d_s, d_s + N, d_plan->d_Sp, [scale, D1] __host__ __device__(const T s) -> T { return scale * (s + D1); }); } if (d_plan->dim > 1) { const auto scale = d_plan->type3_params.h2 * d_plan->type3_params.gam2; const auto D2 = -d_plan->type3_params.D2; thrust::transform( - thrust::cuda::par.on(stream), d_t, d_t + N, d_plan->d_t, + thrust::cuda::par.on(stream), d_t, d_t + N, d_plan->d_Tp, [scale, D2] __host__ __device__(const T t) -> T { return scale * (t + D2); }); } if (d_plan->dim > 2) { const auto scale = d_plan->type3_params.h3 * d_plan->type3_params.gam3; const auto D3 = -d_plan->type3_params.D3; thrust::transform( - thrust::cuda::par.on(stream), d_u, d_u + N, d_plan->d_u, + thrust::cuda::par.on(stream), d_u, d_u + N, d_plan->d_Up, [scale, D3] __host__ __device__(const T u) -> T { return scale * (u + D3); }); } { // here we declare phi_hat1, phi_hat2, and phi_hat3 @@ -698,9 +698,10 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ d_fseries_precomp_f.begin()); // sync the stream before calling the kernel might be needed if (cufserieskernelcompute(d_plan->dim, N, N, N, d_fseries_precomp_f.data().get(), - d_fseries_precomp_a.data().get(), d_plan->d_s, d_plan->d_t, - d_plan->d_u, phi_hat1.data().get(), phi_hat2.data().get(), - phi_hat3.data().get(), d_plan->spopts.nspread, stream)) + d_fseries_precomp_a.data().get(), d_plan->d_Sp, + d_plan->d_Tp, d_plan->d_Up, phi_hat1.data().get(), + phi_hat2.data().get(), phi_hat3.data().get(), + d_plan->spopts.nspread, stream)) goto finalize; const auto is_c_finite = 
std::isfinite(d_plan->type3_params.C1) && @@ -781,12 +782,12 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ // Safe to ignore the return value here? if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan); // check that maxbatchsize is correct - if (cufinufft_makeplan_impl(2, dim, t2modes, d_plan->iflag, d_plan->maxbatchsize, + if (cufinufft_makeplan_impl(2, dim, t2modes, d_plan->iflag, d_plan->batchsize, d_plan->tol, &d_plan->t2_plan, &t2opts)) { fprintf(stderr, "[%s] inner t2 plan cufinufft_makeplan failed\n", __func__); goto finalize; } - if (cufinufft_setpts_12_impl(N, d_plan->d_s, d_plan->d_t, d_plan->d_u, + if (cufinufft_setpts_12_impl(N, d_plan->d_Sp, d_plan->d_Tp, d_plan->d_Up, d_plan->t2_plan)) { fprintf(stderr, "[%s] inner t2 plan cufinufft_setpts_12 failed\n", __func__); goto finalize; diff --git a/include/cufinufft/precision_independent.h b/include/cufinufft/precision_independent.h index 9fa48a07e..e8ef209c3 100644 --- a/include/cufinufft/precision_independent.h +++ b/include/cufinufft/precision_independent.h @@ -41,8 +41,6 @@ __global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobs __global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins); -__global__ void trivial_global_sort_index_2d(int M, int *index); - /* spreadinterp3d */ __global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins); @@ -57,8 +55,6 @@ __global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsp __global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts, int *d_numsubprob, int numbins); -__global__ void trivial_global_sort_index_3d(int M, int *index); - __global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny, int nobinz, int *binsize); diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 
aed555209..0cb953f63 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -20,16 +20,36 @@ static __forceinline__ __device__ constexpr T cudaFMA(const T a, const T b, cons } static_assert(std::is_same_v || std::is_same_v, "Only float and double are supported."); - return T{0}; + return std::fma(a, b, c); } template constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) { constexpr auto x2pi = T(0.159154943091895345554011992339482617); constexpr auto half = T(0.5); - const auto result = x * x2pi + half; - return (result - std::floor(result)) * T(N); - // #endif +#if defined(__CUDA_ARCH__) + if constexpr (std::is_same_v) { + // fused multiply-add, round to nearest even + auto result = cudaFMA(x, x2pi, half); + // subtract, round down + result = __fsub_rd(result, floorf(result)); + // multiply, round down + return __fmul_rd(result, static_cast(N)); + } else if constexpr (std::is_same_v) { + // fused multiply-add, round to nearest even + auto result = cudaFMA(x, x2pi, half); + // subtract, round down + result = __dsub_rd(result, floor(result)); + // multiply, round down + return __dmul_rd(result, static_cast(N)); + } else { + static_assert(std::is_same_v || std::is_same_v, + "Only float and double are supported."); + } +#else + const auto result = std::fma(x, x2pi, half); + return (result - std::floor(result)) * static_cast(N); +#endif } template diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h index 2920e7ae7..830ed4c1b 100644 --- a/include/cufinufft/types.h +++ b/include/cufinufft/types.h @@ -37,8 +37,7 @@ template struct cufinufft_plan_t { CUFINUFFT_BIGINT mt; CUFINUFFT_BIGINT mu; int ntransf; - int maxbatchsize; // TODO: this might be called batchsize non maxbatchsize (double - // check) + int batchsize; int iflag; int supports_pools; @@ -52,8 +51,8 @@ template struct cufinufft_plan_t { T *kx; T *ky; T *kz; - cuda_complex *c_batch; - cuda_complex *fw_batch; + cuda_complex *CpBatch; // working 
array of prephased strengths + cuda_complex *fwbatch; // no allocs here cuda_complex *c; @@ -68,16 +67,16 @@ template struct cufinufft_plan_t { } type3_params; int N; // number of NU freq pts (type 3 only) CUFINUFFT_BIGINT nf; - T *d_s; - T *d_t; - T *d_u; + T *d_Sp; + T *d_Tp; + T *d_Up; T tol; // inner type 2 plan for type 3 cufinufft_plan_t *t2_plan; - // new allocs. FIXME: convert to device vectors to use resize + // new allocs. + // FIXME: convert to device vectors to use resize cuda_complex *prephase; // pre-phase, for all input NU pts cuda_complex *deconv; // reciprocal of kernel FT, phase, all output NU pts - cuda_complex *CpBatch; // working array of prephased strengths // Arrays that used in subprob method int *idxnupts; // length: #nupts, index of the nupts in the bin-sorted order @@ -94,8 +93,6 @@ template struct cufinufft_plan_t { cufftHandle fftplan; cudaStream_t stream; - - using real_t = T; }; template constexpr static inline cufftType_t cufft_type(); diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index ac1d688da..88eec6cff 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -53,8 +53,6 @@ template __forceinline__ __device__ auto interval(const int ns, cons #endif #endif -#undef ALLOCA_SUPPORTED - #if defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 900 #define COMPUTE_CAPABILITY_90_OR_HIGHER 1 diff --git a/matlab/finufft.mw b/matlab/finufft.mw index 4758157a1..5c77d1d26 100644 --- a/matlab/finufft.mw +++ b/matlab/finufft.mw @@ -90,8 +90,8 @@ $ } $ else if (strcmp(fname[ifield],"spread_thread") == 0) { $ oc->spread_thread = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); $ } -$ else if (strcmp(fname[ifield],"maxbatchsize") == 0) { -$ oc->maxbatchsize = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); +$ else if (strcmp(fname[ifield],"batchsize") == 0) { +$ oc->batchsize = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); $ } $ else if (strcmp(fname[ifield],"nthreads") == 0) { $ 
oc->nthreads = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); @@ -150,7 +150,7 @@ classdef finufft_plan < handle plan.floatprec = opts.floatprec; end end - + n_modes = ones(3,1); % is dummy for type 3 if type==3 if length(n_modes_or_dim)~=1 @@ -179,7 +179,7 @@ classdef finufft_plan < handle plan.n_trans = n_trans; % Note the peculiarity that mwrap only accepts a double for n_trans, even % though it's declared int. It complains, also with int64 for nj, etc :( - + % replace in finufft_opts struct whichever fields are in incoming opts... # copy_finufft_opts(mxArray opts, finufft_opts* o); if strcmp(plan.floatprec,'double') diff --git a/matlab/opts.docbit b/matlab/opts.docbit index 1dbc841a5..c920c2e9c 100644 --- a/matlab/opts.docbit +++ b/matlab/opts.docbit @@ -7,5 +7,5 @@ % opts.fftw: FFTW plan mode, 64=FFTW_ESTIMATE (default), 0=FFTW_MEASURE, etc % opts.upsampfac: sigma. 2.0 (default), or 1.25 (low RAM, smaller FFT) % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc -% opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. +% opts.batchsize: for ntrans>1 only. max blocking size, or 0 for auto. 
% opts.nthreads: number of threads, or 0: use all available (default) diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 0619754e2..8df2932ca 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -37,17 +37,16 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = - std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = std::min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms; + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; // this is needed if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex), + d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex), stream)))) return ier; @@ -91,11 +90,10 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = - std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = std::min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms; d_plan->c = d_cstart; d_plan->fk = d_fkstart; @@ -138,16 
+136,16 @@ int cufinufft1d3_exec(cuda_complex *d_c, cuda_complex *d_fk, cuda_complex *d_cstart; cuda_complex *d_fkstart; const auto stream = d_plan->stream; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->N; // setting input for spreader - d_plan->c = d_plan->c_batch; + d_plan->c = d_plan->CpBatch; // setting output for spreader d_plan->fk = d_plan->fw; if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), + d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex), stream)))) return ier; // NOTE: fw might need to be set to 0 diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index e8af3ce7d..4367c4346 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -38,17 +38,17 @@ int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cuda_complex *d_cstart; auto &stream = d_plan->stream; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt; d_plan->c = d_cstart; d_plan->fk = d_fkstart; // 
this is needed if ((ier = checkCudaErrors(cudaMemsetAsync( d_plan->fw, 0, - d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * sizeof(cuda_complex), + d_plan->batchsize * d_plan->nf1 * d_plan->nf2 * sizeof(cuda_complex), stream)))) return ier; @@ -92,10 +92,10 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt; d_plan->c = d_cstart; d_plan->fk = d_fkstart; @@ -138,16 +138,16 @@ int cufinufft2d3_exec(cuda_complex *d_c, cuda_complex *d_fk, cuda_complex *d_cstart; cuda_complex *d_fkstart; const auto stream = d_plan->stream; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->N; // setting input for spreader - d_plan->c = d_plan->c_batch; + d_plan->c = d_plan->CpBatch; // setting output for spreader d_plan->fk = d_plan->fw; if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), + d_plan->fw, 0, d_plan->batchsize * d_plan->nf * 
sizeof(cuda_complex), stream)))) return ier; // NOTE: fw might need to be set to 0 diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index f33801ea0..6d6a2bc95 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -35,16 +35,16 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt * d_plan->mu; d_plan->c = d_cstart; d_plan->fk = d_fkstart; if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), + d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex), stream)))) return ier; @@ -86,10 +86,10 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt * d_plan->mu; d_plan->c = d_cstart; d_plan->fk = 
d_fkstart; @@ -135,17 +135,17 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, cuda_complex *d_fkstart; const auto stream = d_plan->stream; printf("[cufinufft] d_plan->ntransf = %d\n", d_plan->ntransf); - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->N; // setting input for spreader - d_plan->c = d_plan->c_batch; + d_plan->c = d_plan->CpBatch; // setting output for spreader d_plan->fk = d_plan->fw; // NOTE: fw might need to be set to 0 if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf * sizeof(cuda_complex), + d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex), stream)))) return ier; // Step 0: pre-phase the input strengths diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 4181430d9..89a089467 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -35,6 +35,9 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T *at = a + threadIdx.y * MAX_NQUAD; T *ft = f + threadIdx.y * MAX_NQUAD; T *oarr; + // standard parallelism pattern in cuda. using a 2D grid, this allows to leverage more + // threads as the parallelism is x*y*z + // each thread check the y index to determine which array to use if (threadIdx.y == 0) { oarr = fwkerhalf1; nf = nf1; @@ -67,6 +70,9 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T T *at = a + threadIdx.y * MAX_NQUAD; T *ft = f + threadIdx.y * MAX_NQUAD; T *oarr, *k; + // standard parallelism pattern in cuda. 
using a 2D grid, this allows to leverage more + // threads as the parallelism is x*y*z + // each thread check the y index to determine which array to use if (threadIdx.y == 0) { k = kx; oarr = fwkerhalf1; @@ -118,10 +124,13 @@ int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, T T *d_ky, T *d_kz, T *d_fwkerhalf1, T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, cudaStream_t stream) /* - wrapper for approximation of Fourier series of real symmetric spreading - kernel. + Approximates exact Fourier transform of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), + for a kernel with x measured in grid-spacings. (See previous routine for + FT definition). -Melody Shih 2/20/22 + Marco Barbone 08/28/2024 */ { int nout = max(max(nf1, nf2), nf3); @@ -161,39 +170,6 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts } } -// template -// void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) -///* -// Approximates exact Fourier series coeffs of cnufftspread's real symmetric -// kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting -// narrowness of kernel. Uses phase winding for cheap eval on the regular freq -// grid. Note that this is also the Fourier transform of the non-periodized -// kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an -// overall prefactor of 1/h, which is needed anyway for the correction, and -// arises because the quadrature weights are scaled for grid units not x units. -// -// Inputs: -// nf - size of 1d uniform spread grid, must be even. -// opts - spreading opts object, needed to eval kernel (must be already set up) -// -// Outputs: -// fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, -// divided by h = 2pi/n. 
-// (should be allocated for at least nf/2+1 Ts) -// -// Compare onedim_dct_kernel which has same interface, but computes DFT of -// sampled kernel, not quite the same object. -// -// Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18 -// Melody 2/20/22 separate into precomp & comp functions defined below. -// */ -//{ -// T f[MAX_NQUAD]; -// T a[MAX_NQUAD]; -// onedim_fseries_kernel_precomp(nf, f, a, opts); -//// onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts); -//} - /* Precomputation of approximations of exact Fourier series coeffs of cnufftspread's real symmetric kernel. @@ -201,9 +177,11 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts Inputs: nf - size of 1d uniform spread grid, must be even. opts - spreading opts object, needed to eval kernel (must be already set up) + phase_winding - if true, compute normalization factors for phase winding rates, + otherwise compute scaled quadrature nodes Outputs: - a - phase winding rates + a - normalization factors if phase winding is true, otherwise scaled quadrature nodes f - funciton values at quadrature nodes multiplied with quadrature weights (a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below) */ @@ -225,42 +203,10 @@ void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, a[n] = ((T)(2.0 * M_PI) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates } else { a[n] = T(z[n]); - // printf("[cufinufft] f[%d] = %.16g\n",n,f[n]); - // printf("[cufinufft] z[%d] = %.16g\n",n,z[n]); } } } -// template -// void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, -// T *fwkerhalf, finufft_spread_opts opts) { -// T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support -// int q = (int)(2 + 3.0 * J2); // not sure why so large? 
cannot exceed MAX_NQUAD -// CUFINUFFT_BIGINT nout = nf / 2 + 1; // how many values we're writing to -// int nt = std::min(nout, MY_OMP_GET_MAX_THREADS()); // how many chunks -// std::vector brk(nt + 1); // start indices for each thread -// for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads -// brk[t] = (CUFINUFFT_BIGINT)(0.5 + nout * t / (double)nt); -// #pragma omp parallel -// { -// int t = MY_OMP_GET_THREAD_NUM(); -// if (t < nt) { // could be nt < actual # threads -// std::complex aj[MAX_NQUAD]; // phase rotator for this thread -// for (int n = 0; n < q; ++n) -// aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk -// for (CUFINUFFT_BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output -// // array -// T x = 0.0; // accumulator for answer at this j -// for (int n = 0; n < q; ++n) { -// x += f[n] * 2 * real(aj[n]); // include the negative freq -// aj[n] *= a[n]; // wind the phases -// } -// fwkerhalf[j] = x; -// } -// } -// } -// } - template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int bin_size_z) { @@ -374,13 +320,6 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { } } -// template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, -// std::complex *a, float *fwkerhalf, -// finufft_spread_opts opts); -// template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, double *f, -// std::complex *a, double *fwkerhalf, -// finufft_spread_opts opts); - template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps, cufinufft_opts opts); template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps, @@ -407,10 +346,6 @@ template int cufserieskernelcompute( int dim, int nf1, int nf2, int nf3, double *d_f, double *d_a, double *d_kx, double *d_ky, double *d_kz, double *d_fwkerhalf1, double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, cudaStream_t stream); -// template void onedim_fseries_kernel(CUFINUFFT_BIGINT 
nf, float *fwkerhalf, -// finufft_spread_opts opts); -// template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, -// finufft_spread_opts opts); template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int bin_size_z); diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu index 94eb6b4c8..38a4f0da9 100644 --- a/src/cuda/deconvolve_wrapper.cu +++ b/src/cuda/deconvolve_wrapper.cu @@ -235,7 +235,7 @@ int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize) int ms = d_plan->ms; int nf1 = d_plan->nf1; int nmodes = ms; - int maxbatchsize = d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { @@ -268,7 +268,7 @@ int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize) int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; int nmodes = ms * mt; - int maxbatchsize = d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { @@ -305,7 +305,7 @@ int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize) int nf2 = d_plan->nf2; int nf3 = d_plan->nf3; int nmodes = ms * mt * mu; - int maxbatchsize = d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index a83bc1042..e5308e8bc 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -24,7 +24,7 @@ int allocgpumem1d_plan(cufinufft_plan_t *d_plan) int ier{0}; int nf1 = d_plan->nf1; - int maxbatchsize = d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; switch (d_plan->opts.gpu_method) { case 1: { @@ -141,7 +141,7 @@ int allocgpumem2d_plan(cufinufft_plan_t *d_plan) int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; - int maxbatchsize = 
d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; switch (d_plan->opts.gpu_method) { case 1: { @@ -267,7 +267,7 @@ int allocgpumem3d_plan(cufinufft_plan_t *d_plan) int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; int nf3 = d_plan->nf3; - int maxbatchsize = d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; switch (d_plan->opts.gpu_method) { case 1: { @@ -456,15 +456,15 @@ void freegpumemory(cufinufft_plan_t *d_plan) } CUDA_FREE_AND_NULL(d_plan->kx, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->d_s, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->d_Sp, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->ky, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->d_t, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->d_Tp, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->kz, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->d_u, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->d_Up, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->prephase, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->deconv, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fw_batch, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->c_batch, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwbatch, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->CpBatch, stream, d_plan->supports_pools); } template int allocgpumem1d_plan(cufinufft_plan_t *d_plan); diff --git a/src/cuda/precision_independent.cu b/src/cuda/precision_independent.cu index b2c0c292f..7b199220a 100644 --- a/src/cuda/precision_independent.cu +++ b/src/cuda/precision_independent.cu @@ -71,13 +71,6 @@ __global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstart } } -__global__ void trivial_global_sort_index_2d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; - i += gridDim.x * blockDim.x) { - index[i] = 
i; - } -} - /* spreadinterp3d */ __global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) { @@ -121,13 +114,6 @@ __global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobs } } -__global__ void trivial_global_sort_index_3d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; - i += gridDim.x * blockDim.x) { - index[i] = i; - } -} - __global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny, int nobinz, int *binsize) { int binx = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/src/finufft.cpp b/src/finufft.cpp index b3bd0efda..e1fa2b41d 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -270,8 +270,6 @@ void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts for (int n = 0; n < q; ++n) { z[n] *= (FLT)J2; // quadr nodes for [0,J/2] f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // w/ quadr weights - // printf("[finufft] f[%d] = %.16g\n",n,f[n]); - // printf("[finufft] z[%d] = %.16g\n",n,z[n]); } #pragma omp parallel for num_threads(opts.nthreads) for (BIGINT j = 0; j < nk; ++j) { // loop along output array @@ -975,38 +973,6 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < // pi/R } - // // print Sp, Tp, Up - // for (BIGINT k = 0; k < nk; ++k) { - // printf("Sp[%lld] = %.16g\n", (long long)k, p->Sp[k]); - // } - // for (BIGINT k = 0; k < nk; ++k) { - // printf("Tp[%lld] = %.16g\n", (long long)k, p->Tp[k]); - // } - // for (BIGINT k = 0; k < nk; ++k) { - // printf("Up[%lld] = %.16g\n", (long long)k, p->Up[k]); - // } - // // print min, max of Sp, Tp, Up - // FLT minSp = p->Sp[0], maxSp = p->Sp[0]; - // FLT minTp = p->Tp[0], maxTp = p->Tp[0]; - // FLT minUp = p->Up[0], maxUp = p->Up[0]; - // for (BIGINT k = 0; k < nk; ++k) { - // if (p->Sp[k] < minSp) minSp = p->Sp[k]; - // if 
(p->Sp[k] > maxSp) maxSp = p->Sp[k]; - // if (p->Tp[k] < minTp) minTp = p->Tp[k]; - // if (p->Tp[k] > maxTp) maxTp = p->Tp[k]; - // if (p->Up[k] < minUp) minUp = p->Up[k]; - // if (p->Up[k] > maxUp) maxUp = p->Up[k]; - // } - // printf("minSp = %.16g, maxSp = %.16g\n", minSp, maxSp); - - // #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - // for (BIGINT k = 0; k < nk; ++k) { - // p->Sp[k] = s[k]; - // if (d > 1) - // p->Tp[k] =t[k]; - // if (d > 2) - // p->Up[k] = u[k]; - // } // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... // (exploits that FT separates because kernel is prod of 1D funcs) if (p->deconv) free(p->deconv); @@ -1196,9 +1162,6 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { BIGINT ioff = i * p->nj; for (BIGINT j = 0; j < p->nj; ++j) { p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; - // printf("[finufft] ??p->CpBatch[%ld] = %.16g | %.16gi\n", j, - // real(p->CpBatch[j]), - // imag(p->CpBatch[j])); // debug } } t_pre += timer.elapsedsec(); @@ -1209,13 +1172,6 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed t_spr += timer.elapsedsec(); - // for (int j = p->nf1 * p->nf2; j < p->nf1 * p->nf2 * 5; ++j) { - // if (p->fwBatch[j].real() != 0.0 || p->fwBatch[j].imag() != 0.0) - // printf("[finufft] fw[%d]=%.16g+%.16gi\n", j, p->fwBatch[j].real(), - // p->fwBatch[j].imag()); // - // // debug - // } - // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... 
timer.restart(); // illegal possible shrink of ntrans *after* plan for smaller last batch: @@ -1224,11 +1180,6 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { still the same size, as Andrea explained; just wastes a few flops) */ FINUFFT_EXECUTE(p->innerT2plan, fkb, p->fwBatch); t_t2 += timer.elapsedsec(); - // for (int j = 0; j < p->nk; ++j) { - // printf("[finufft] fk[%d]=%.16g %.16g\n", j, fkb[j].real(), - // fkb[j].imag()); - // debug - // } // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... timer.restart(); #pragma omp parallel for num_threads(p->opts.nthreads) From 5788320c72362a56ce4d129e029e211d9726209e Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Wed, 28 Aug 2024 13:58:35 -0400 Subject: [PATCH 58/68] Updated changelog --- CHANGELOG | 9 +++++++++ matlab/finufft.mw | 4 ++-- matlab/opts.docbit | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 91ab909ed..0cb369456 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,15 @@ List of features / changes made / release notes, in reverse chronological order. If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). +V 2.4.0 (08/28/24) +* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft (PR #517). +* Removed the CPU fseries computation (only used for benchmark no longer needed). 
+* Added complex arithmetic support for cuda_complex type +* Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic +* Minor fixes on the GPU code: + - removed memory leaks in case of errors + - renamed maxbatchsize to batchsize + V 2.3.0-rc1 (8/6/24) * Switched C++ standards from C++14 to C++17, allowing various templating diff --git a/matlab/finufft.mw b/matlab/finufft.mw index 5c77d1d26..73c362e6a 100644 --- a/matlab/finufft.mw +++ b/matlab/finufft.mw @@ -90,8 +90,8 @@ $ } $ else if (strcmp(fname[ifield],"spread_thread") == 0) { $ oc->spread_thread = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); $ } -$ else if (strcmp(fname[ifield],"batchsize") == 0) { -$ oc->batchsize = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); +$ else if (strcmp(fname[ifield],"maxbatchsize") == 0) { +$ oc->maxbatchsize = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); $ } $ else if (strcmp(fname[ifield],"nthreads") == 0) { $ oc->nthreads = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); diff --git a/matlab/opts.docbit b/matlab/opts.docbit index c920c2e9c..1dbc841a5 100644 --- a/matlab/opts.docbit +++ b/matlab/opts.docbit @@ -7,5 +7,5 @@ % opts.fftw: FFTW plan mode, 64=FFTW_ESTIMATE (default), 0=FFTW_MEASURE, etc % opts.upsampfac: sigma. 2.0 (default), or 1.25 (low RAM, smaller FFT) % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc -% opts.batchsize: for ntrans>1 only. max blocking size, or 0 for auto. +% opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. 
% opts.nthreads: number of threads, or 0: use all available (default) From 4c7388e0a13e9f7bf0c33efcb95b60ceb0055906 Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Wed, 28 Aug 2024 14:00:42 -0400 Subject: [PATCH 59/68] fixed printf warning --- include/cufinufft/impl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 4580db8d0..82e6e36a7 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -185,7 +185,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran const auto mem_required = shared_memory_required(dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); - printf("[cufinufft] shared memory required for the spreader: %d\n", mem_required); + printf("[cufinufft] shared memory required for the spreader: %ld\n", mem_required); } if (type == 1 || type == 2) { @@ -523,18 +523,18 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ if (d_plan->opts.debug) { printf("[%s]", __func__); printf("\tM=%d N=%d\n", M, N); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%d h1=%.3g\t\n", d_plan->type3_params.X1, d_plan->type3_params.C1, d_plan->type3_params.S1, d_plan->type3_params.D1, d_plan->type3_params.gam1, d_plan->nf1, d_plan->type3_params.h1); if (d_plan->dim > 1) { - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%d h2=%.3g\n", d_plan->type3_params.X2, d_plan->type3_params.C2, d_plan->type3_params.S2, d_plan->type3_params.D2, d_plan->type3_params.gam2, d_plan->nf2, d_plan->type3_params.h2); } if (d_plan->dim > 2) { - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%d h3=%.3g\n", d_plan->type3_params.X3, 
d_plan->type3_params.C3, d_plan->type3_params.S3, d_plan->type3_params.D3, d_plan->type3_params.gam3, d_plan->nf3, d_plan->type3_params.h3); From 46eb1d46587a054b940a8fa576308d9f5be438a8 Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Wed, 28 Aug 2024 15:08:54 -0400 Subject: [PATCH 60/68] restored fftw behaviour --- src/finufft.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finufft.cpp b/src/finufft.cpp index e1fa2b41d..3a50554a3 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -760,6 +760,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, // Unfortunately fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and // there isn't a convenient mechanism to probe the version // there is fftw_version which returns a string, but that's not compile time + FFTW_PLAN_TH(nthr_fft); p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, (FFTW_CPX *)p->fwBatch, NULL, 1, p->nf, (FFTW_CPX *)p->fwBatch, NULL, 1, p->nf, p->fftSign, p->opts.fftw); From 0ada7a0368262e745f64e4baef6a6765b346b9fb Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Wed, 28 Aug 2024 15:26:13 -0400 Subject: [PATCH 61/68] Added devnotes on the issue --- docs/devnotes.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/devnotes.rst b/docs/devnotes.rst index bf008d85a..d6517f61f 100644 --- a/docs/devnotes.rst +++ b/docs/devnotes.rst @@ -51,6 +51,8 @@ Developer notes * CMake compiling on linux at Flatiron Institute (Rusty cluster): We have had a report that if you want to use LLVM, you need to ``module load llvm/16.0.3`` otherwise the default ``llvm/14.0.6`` does not find ``OpenMP_CXX``. +* Note to the nvcc developer. nvcc with debug symbols causes a stack overflow that is undetected at both compile and runtime. This goes undetected until ns>=10, for ns<10, one can use -G and debug the code with cuda-gdb. The way to avoid is to not use Debug symbols, possibly using ``--generate-line-info`` might work (not tested). 
As a side note, compute-sanitizers do not detect the issue. + * Testing cufinufft (for FI, mostly): .. code-block:: sh From 671e4acc9bc1e95a02790577d778465d0ffca761 Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Wed, 28 Aug 2024 15:39:18 -0400 Subject: [PATCH 62/68] removed sprurious changes --- devel/cuda/draft_interfaces_c+py_Jun2023.txt | 4 ++-- docs/cufinufft_migration.rst | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/devel/cuda/draft_interfaces_c+py_Jun2023.txt b/devel/cuda/draft_interfaces_c+py_Jun2023.txt index b5620087b..699f22a85 100644 --- a/devel/cuda/draft_interfaces_c+py_Jun2023.txt +++ b/devel/cuda/draft_interfaces_c+py_Jun2023.txt @@ -1,6 +1,6 @@ int finufft_makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntr, double eps, finufft_plan* plan, nufft_opts* opts) -int cufinufft_makeplan(int type, int dim, int* nmodes, int iflag, int ntransf, double tol, int batchsize, cufinufft_plan *plan, cufinufft_opts *opts) -// Remove batchsize (-> opts), use int64_t. Rename ntransf to ntr, tol to eps. +int cufinufft_makeplan(int type, int dim, int* nmodes, int iflag, int ntransf, double tol, int maxbatchsize, cufinufft_plan *plan, cufinufft_opts *opts) +// Remove maxbatchsize (-> opts), use int64_t. Rename ntransf to ntr, tol to eps. int finufft_setpts(finufft_plan plan, int64_t m, double* x, double* y, double* z, int64_t n, double* s, double* t, double* z) int cufinufft_setpts(int m, double* x, double* y, double* z, int n, double* s, double* t, double *u, cufinufft_plan plan) diff --git a/docs/cufinufft_migration.rst b/docs/cufinufft_migration.rst index 1f530f8be..c6c67b7c7 100644 --- a/docs/cufinufft_migration.rst +++ b/docs/cufinufft_migration.rst @@ -17,7 +17,7 @@ The following function signatures were updated during the API change: .. 
code-block:: c int cufinufft_makeplan(int type, int dim, int *n_modes, int iflag, - int ntransf, double tol, int batchsize, cufinufft_plan *plan, + int ntransf, double tol, int maxbatchsize, cufinufft_plan *plan, cufinufft_opts *opts); and now has the signature @@ -29,7 +29,7 @@ The following function signatures were updated during the API change: cufinufft_opts *opts); - In other words, the ``n_modes`` argument now takes the type ``int64_t`` to accomodate larger arrays and the ``batchsize`` argument has been removed (and can now be found as part of ``cufinufft_opts``). + In other words, the ``n_modes`` argument now takes the type ``int64_t`` to accomodate larger arrays and the ``maxbatchsize`` argument has been removed (and can now be found as part of ``cufinufft_opts``). The ``tol`` and ``ntransf`` arguments have also been renamed to ``eps`` and ``ntr``, respectively. - ``cufinufft_setpts``, which had the signature @@ -45,7 +45,7 @@ The following function signatures were updated during the API change: .. code-block:: c int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, - double *d_y, double *d_z, int N, double *d_Sp, double *d_Tp, double *d_Up); + double *d_y, double *d_z, int N, double *d_s, double *d_t, double *d_u); Aside from name changes, main difference here is that the ``plan`` is now the first argument, not the last. 
From 7a7cff57c940c4abe305f99fddb8050f8d969978 Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Wed, 28 Aug 2024 15:43:17 -0400 Subject: [PATCH 63/68] Minor cleanup --- src/cuda/3d/cufinufft3d.cu | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 6d6a2bc95..8e1263a9a 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -154,30 +154,15 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M, d_plan->c + i * d_plan->M, thrust::multiplies>()); } - // Step 1: Spread - if ((ier = cuspread3d(d_plan, blksize))) return ier; // now d_plan->fk = d_plan->fw contains the spread values - // Step 2: Type 3 NUFFT - // type 2 goes from fk to c // saving the results directly in the user output array d_fk // it needs to do blksize transforms d_plan->t2_plan->ntransf = blksize; if ((ier = cufinufft3d2_exec(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier; - // print d_fk using thrust on the GPU - // create a host vector to store the results - // copy d_fk to host - // print the results - // std::vector> h_fk(d_plan->N); - // cudaMemcpyAsync(h_fk.data(), d_fkstart, d_plan->N * sizeof(cuda_complex), - // cudaMemcpyDeviceToHost, stream); - // for (int i = 0; i < d_plan->N; i++) { - // printf("[cufinufft] d_fk = %.16g %.16g\n", h_fk[i].x, h_fk[i].y); - // } - // Step 3: deconvolve // now we need to d_fk = d_fk*d_plan->deconv for (int i = 0; i < blksize; i++) { From 9b0da6616021356ba30b0f435740569b5c711e87 Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Thu, 29 Aug 2024 17:09:50 -0400 Subject: [PATCH 64/68] fixed math test --- test/cuda/cufinufft_math_test.cu | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/test/cuda/cufinufft_math_test.cu b/test/cuda/cufinufft_math_test.cu index 005dd3199..27e3325af 100644 --- a/test/cuda/cufinufft_math_test.cu +++ 
b/test/cuda/cufinufft_math_test.cu @@ -14,19 +14,18 @@ template cuda_complex make_cuda_complex(T real, T imag) { // Helper function to compare cuComplex with std::complex using 1 - ratio as error template -bool compareComplex(const cuda_complex &a, const std::complex &b, +bool compareComplex(const cuda_complex a, const std::complex b, const std::string &operation, T epsilon = std::numeric_limits::epsilon()) { - T real_error = 1 - a.x / b.real(); - T imag_error = 1 - a.y / b.imag(); - if (real_error >= epsilon || imag_error >= epsilon) { + const auto std_a = std::complex(a.x, a.y); + const auto err = std::abs(std_a - b) / std::abs(std_a); + if (err > epsilon) { std::cout << "Comparison failed in operation: " << operation << "\n"; std::cout << "cuComplex: (" << a.x << ", " << a.y << ")\n"; std::cout << "std::complex: (" << b.real() << ", " << b.imag() << ")\n"; - std::cout << "Real error: " << real_error << "\n"; - std::cout << "Imag error: " << imag_error << "\n"; + std::cout << "Error: " << err << "\n"; } - return real_error < epsilon && imag_error < epsilon; + return err <= epsilon; } template int testRandomOperations() { @@ -107,16 +106,12 @@ template int testRandomOperations() { std::string(typeid(T).name()) + ">")) return 1; - // Test division with scalar - // Avoid division by small numbers which is not accurate - if (scalar > (std::is_same_v ? 
1e-15 : 1e-6)) { - cuda_complex result_div_scalar = a / scalar; - std::complex expected_div_scalar = std_a / scalar; - if (!compareComplex(result_div_scalar, expected_div_scalar, - "div complex<" + std::string(typeid(T).name()) + "> scalar<" + - std::string(typeid(T).name()) + ">")) - return 1; - } + cuda_complex result_div_scalar = a / scalar; + std::complex expected_div_scalar = std_a / scalar; + if (!compareComplex(result_div_scalar, expected_div_scalar, + "div complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) + return 1; } return 0; } From d3d4d343d3ef4578d325bc1feced9e5b28787895 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 4 Sep 2024 17:18:13 -0400 Subject: [PATCH 65/68] Addressed review comments --- CHANGELOG | 12 +++++----- docs/devnotes.rst | 2 +- docs/opts.rst | 2 +- include/cufinufft/common.h | 1 - include/cufinufft/contrib/helper_math.h | 5 +++++ include/cufinufft/defs.h | 1 - include/cufinufft/impl.h | 28 ++++++++++++++---------- include/cufinufft/spreadinterp.h | 6 ++++- include/cufinufft/types.h | 7 +++++- include/cufinufft/utils.h | 28 ++++++++++-------------- src/cuda/1d/cufinufft1d.cu | 6 ++--- src/cuda/2d/cufinufft2d.cu | 6 ++--- src/cuda/3d/cufinufft3d.cu | 4 ++-- src/cuda/common.cu | 29 +++++++++++++++++-------- test/cuda/CMakeLists.txt | 8 +++++-- test/cuda/cufinufft1d_test.cu | 4 ++-- test/cuda/cufinufft2d_test.cu | 4 ++-- test/cuda/cufinufft3d_test.cu | 6 ++--- test/cuda/cufinufft_math_test.cu | 6 +++-- 19 files changed, 98 insertions(+), 67 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 0cb369456..7ae28c9a3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,12 +3,12 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). V 2.4.0 (08/28/24) * Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft (PR #517). -* Removed the CPU fseries computation (only used for benchmark no longer needed). 
-* Added complex arithmetic support for cuda_complex type -* Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic -* Minor fixes on the GPU code: - - removed memory leaks in case of errors - - renamed maxbatchsize to batchsize + - Removed the CPU fseries computation (only used for benchmark no longer needed). + - Added complex arithmetic support for cuda_complex type + - Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic + - Minor fixes on the GPU code: + a) removed memory leaks in case of errors + b) renamed maxbatchsize to batchsize V 2.3.0-rc1 (8/6/24) diff --git a/docs/devnotes.rst b/docs/devnotes.rst index d6517f61f..a196c5607 100644 --- a/docs/devnotes.rst +++ b/docs/devnotes.rst @@ -51,7 +51,7 @@ Developer notes * CMake compiling on linux at Flatiron Institute (Rusty cluster): We have had a report that if you want to use LLVM, you need to ``module load llvm/16.0.3`` otherwise the default ``llvm/14.0.6`` does not find ``OpenMP_CXX``. -* Note to the nvcc developer. nvcc with debug symbols causes a stack overflow that is undetected at both compile and runtime. This goes undetected until ns>=10, for ns<10, one can use -G and debug the code with cuda-gdb. The way to avoid is to not use Debug symbols, possibly using ``--generate-line-info`` might work (not tested). As a side note, compute-sanitizers do not detect the issue. +* Note to the nvcc developer. nvcc with debug symbols causes a stack overflow that is undetected at both compile and runtime. This goes undetected until ns>=10 and dim=3, for ns<10 or dim < 3, one can use -G and debug the code with cuda-gdb. The way to avoid is to not use Debug symbols, possibly using ``--generate-line-info`` might work (not tested). As a side note, compute-sanitizers do not detect the issue. 
* Testing cufinufft (for FI, mostly): diff --git a/docs/opts.rst b/docs/opts.rst index 3dc06a437..e0fac5dbe 100644 --- a/docs/opts.rst +++ b/docs/opts.rst @@ -183,7 +183,7 @@ for only two settings, as follows. Otherwise, setting it to zero chooses a good Historical note: A former option ``3`` has been removed. This was like ``2`` except allowing nested OMP parallelism, so multi-threaded spread-interpolate was used for each of the vectors in a batch in parallel. This was used by Andrea Malleo in 2019. We have not yet found a case where this beats both ``1`` and ``2``, hence removed it due to complications with changing the OMP nesting state in both old and new OMP versions. -**batchsize**: in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), set the largest batch size of data vectors. +**maxbatchsize**: in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), set the largest batch size of data vectors. Here ``0`` makes an automatic choice. If you are unhappy with this, then for small problems it should equal the number of threads, while for large problems it appears that ``1`` often better (since otherwise too much simultaneous RAM movement occurs). Some further work is needed to optimize this parameter. **spread_nthr_atomic**: if non-negative: for numbers of threads up to this value, an OMP critical block for ``add_wrapped_subgrid`` is used in spreading (type 1 transforms). Above this value, instead OMP atomic writes are used, which scale better for large thread numbers. If negative, the heuristic default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``. 
diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index 6747cdb87..1d508e61d 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -8,7 +8,6 @@ #include #include -#include namespace cufinufft { namespace common { diff --git a/include/cufinufft/contrib/helper_math.h b/include/cufinufft/contrib/helper_math.h index cc1ff8411..119aca6b6 100644 --- a/include/cufinufft/contrib/helper_math.h +++ b/include/cufinufft/contrib/helper_math.h @@ -3,6 +3,11 @@ #include +// This header provides some helper functions for cuComplex types. +// It mainly wraps existing CUDA implementations to provide operator overloads +// e.g. cuAdd, cuSub, cuMul, cuDiv, cuCreal, cuCimag, cuCabs, cuCarg, cuConj are all +// provided by CUDA + // Addition for cuDoubleComplex (double) with cuDoubleComplex (double) __host__ __device__ __forceinline__ cuDoubleComplex operator+( const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept { diff --git a/include/cufinufft/defs.h b/include/cufinufft/defs.h index 8a677d21b..630989a26 100644 --- a/include/cufinufft/defs.h +++ b/include/cufinufft/defs.h @@ -1,7 +1,6 @@ #ifndef CUFINUFFT_DEFS_H #define CUFINUFFT_DEFS_H -#include #include // constants needed within common // upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 82e6e36a7..d85dfa22b 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -116,11 +116,11 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->iflag = fftsign; d_plan->ntransf = ntransf; - int maxbatchsize = (opts != nullptr) ? opts->gpu_maxbatchsize : 0; + int batchsize = (opts != nullptr) ? opts->gpu_maxbatchsize : 0; // TODO: check if this is the right heuristic - if (maxbatchsize == 0) // implies: use a heuristic. 
- maxbatchsize = std::min(ntransf, 8); // heuristic from test codes - d_plan->batchsize = maxbatchsize; + if (batchsize == 0) // implies: use a heuristic. + batchsize = std::min(ntransf, 8); // heuristic from test codes + d_plan->batchsize = batchsize; const auto stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; @@ -262,7 +262,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran int inembed[] = {(int)nf1}; cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, - inembed[0], cufft_type(), maxbatchsize); + inembed[0], cufft_type(), batchsize); } break; case 2: { int n[] = {(int)nf2, (int)nf1}; @@ -270,7 +270,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran cufft_status = cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, - inembed[0] * inembed[1], cufft_type(), maxbatchsize); + inembed[0] * inembed[1], cufft_type(), batchsize); } break; case 3: { int n[] = {(int)nf3, (int)nf2, (int)nf1}; @@ -278,7 +278,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran cufft_status = cufftPlanMany( &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, - inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize); + inembed[0] * inembed[1] * inembed[2], cufft_type(), batchsize); } break; } @@ -292,6 +292,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->fftplan = fftplan; + // compute up to 3 * NQUAD precomputed values on CPU T fseries_precomp_a[3 * MAX_NQUAD]; T fseries_precomp_f[3 * MAX_NQUAD]; thrust::device_vector d_fseries_precomp_a(3 * MAX_NQUAD); @@ -311,6 +312,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_fseries_precomp_a.begin()); thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD, d_fseries_precomp_f.begin()); + // the full fseries is done on the GPU here if ((ier = 
cufserieskernelcompute( d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3, d_fseries_precomp_f.data().get(), d_fseries_precomp_a.data().get(), @@ -446,6 +448,12 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ return cufinufft_setpts_12_impl(M, d_kx, d_ky, d_kz, d_plan); } // type 3 setpts + + // This code follows the same implementation of the CPU code in finufft and uses similar + // variables names where possible. However, the use of GPU routines and paradigms make + // it harder to follow. To understand the code, it is recommended to read the CPU code + // first. + if (d_plan->type != 3) { fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__, d_plan->type); @@ -744,8 +752,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, d_plan->deconv, d_plan->deconv, [c1, c2, c3, d1, d2, d3, realsign] __host__ __device__( - const thrust::tuple tuple, - cuda_complex deconv) -> cuda_complex { + const thrust::tuple tuple, cuda_complex deconv) + -> cuda_complex { // d2 and d3 are 0 if dim < 2 and dim < 3 const auto phase = c1 * (thrust::get<0>(tuple) + d1) + c2 * (thrust::get<1>(tuple) + d2) + @@ -861,8 +869,6 @@ int cufinufft_destroy_impl(cufinufft_plan_t *d_plan) In this stage, we (1) free all the memories that have been allocated on gpu (2) delete the cuFFT plan - - Also see ../docs/cppdoc.md for main user-facing documentation. 
*/ { diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 0cb953f63..9efd094c8 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -23,6 +23,10 @@ static __forceinline__ __device__ constexpr T cudaFMA(const T a, const T b, cons return std::fma(a, b, c); } +/** + * local NU coord fold+rescale macro: does the following affine transform to x: + * (x+PI) mod PI each to [0,N) + */ template constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) { constexpr auto x2pi = T(0.159154943091895345554011992339482617); @@ -92,8 +96,8 @@ static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w, This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ { + // const T z = T(2) * x + T(w - 1); const auto z = cudaFMA(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1] - // const T z = T(2) * x + T(w - 1); // insert the auto-generated code which expects z, w args, writes to ker... 
if (upsampfac == 2.0) { // floating point equality is fine here using FLT = T; diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h index 830ed4c1b..5b2fba790 100644 --- a/include/cufinufft/types.h +++ b/include/cufinufft/types.h @@ -14,6 +14,10 @@ // Marco Barbone 8/5/2924, replaced the ugly trick with std::conditional // to define cuda_complex +// by using std::conditional and std::is_same, we can define cuda_complex +// if T is float, cuda_complex is cuFloatComplex +// if T is double, cuda_complex is cuDoubleComplex +// where cuFloatComplex and cuDoubleComplex are defined in cuComplex.h // TODO: migrate to cuda/std/complex and remove this // Issue: cufft seems not to support cuda::std::complex // A reinterpret_cast should be enough @@ -61,7 +65,8 @@ template struct cufinufft_plan_t { // Type 3 specific struct { - T X1, C1, S1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale + T X1, C1, S1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale, + // s=interval T X2, C2, S2, D2, h2, gam2; // y T X3, C3, S3, D3, h3, gam3; // z } type3_params; diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 88eec6cff..432711aae 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -35,6 +35,10 @@ __inline__ __device__ double atomicAdd(double *address, double val) { } #endif +/** + * It computes the start and end point of the spreading window given the center x and the + * width ns. 
+ */ template __forceinline__ __device__ auto interval(const int ns, const T x) { const auto xstart = int(std::ceil(x - T(ns) * T(.5))); const auto xend = int(std::floor(x + T(ns) * T(.5))); @@ -114,8 +118,8 @@ template T infnorm(int n, std::complex *a) { */ template -static __forceinline__ __device__ void atomicAddComplexShared( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *address, + cuda_complex res) { const auto raw_address = reinterpret_cast(address); atomicAdd(raw_address, res.x); atomicAdd(raw_address + 1, res.y); @@ -127,8 +131,8 @@ static __forceinline__ __device__ void atomicAddComplexShared( * on shared memory are supported so we leverage them */ template -static __forceinline__ __device__ void atomicAddComplexGlobal( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *address, + cuda_complex res) { if constexpr ( std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { atomicAdd(address, res); @@ -168,18 +172,10 @@ template auto arraywidcen(int n, T *a, cudaStream_t stream) { template auto set_nhg_type3(T S, T X, const cufinufft_opts &opts, const finufft_spread_opts &spopts) -/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), - for type 3 only. - Inputs: - X and S are the xj and sk interval half-widths respectively. - opts and spopts are the NUFFT and spreader opts strucs, respectively. - Outputs: - nf is the size of upsampled grid for a given single dimension. - h is the grid spacing = 2pi/nf - gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). - Barnett 2/13/17. Caught inf/nan 3/14/17. 
io int types changed 3/28/17 - New logic 6/12/17 -*/ +/* + * It implements the same function in finufft.cpp + * set_nhg_type3 in finufft.cpp for documentation + */ { int nss = spopts.nspread + 1; // since ns may be odd T Xsafe = X, Ssafe = S; // may be tweaked locally diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 8df2932ca..06389ef75 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -121,13 +121,13 @@ template int cufinufft1d3_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) { /* - 3D Type-3 NUFFT + 1D Type-3 NUFFT This function is called in "exec" stage (See ../cufinufft.cu). It includes (copied from doc in finufft library) Step 0: pre-phase the input strengths Step 1: spread data - Step 2: Type 3 NUFFT + Step 2: Type 2 NUFFT Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff Marco Barbone 08/14/2024 @@ -159,7 +159,7 @@ int cufinufft1d3_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread1d(d_plan, blksize))) return ier; // now d_plan->fk = d_plan->fw contains the spread values - // Step 2: Type 3 NUFFT + // Step 2: Type 2 NUFFT // type 2 goes from fk to c // saving the results directly in the user output array d_fk // it needs to do blksize transforms diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index 4367c4346..8c165edbf 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -123,13 +123,13 @@ template int cufinufft2d3_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) { /* - 3D Type-3 NUFFT + 2D Type-3 NUFFT This function is called in "exec" stage (See ../cufinufft.cu). 
It includes (copied from doc in finufft library) Step 0: pre-phase the input strengths Step 1: spread data - Step 2: Type 3 NUFFT + Step 2: Type 2 NUFFT Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff Marco Barbone 08/14/2024 @@ -160,7 +160,7 @@ int cufinufft2d3_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread2d(d_plan, blksize))) return ier; // now d_plan->fk = d_plan->fw contains the spread values - // Step 2: Type 3 NUFFT + // Step 2: Type 2 NUFFT // type 2 goes from fk to c // saving the results directly in the user output array d_fk // it needs to do blksize transforms diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 8e1263a9a..ce1f37c0e 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -125,7 +125,7 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, It includes (copied from doc in finufft library) Step 0: pre-phase the input strengths Step 1: spread data - Step 2: Type 3 NUFFT + Step 2: Type 2 NUFFT Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff Marco Barbone 08/14/2024 @@ -157,7 +157,7 @@ int cufinufft3d3_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread3d(d_plan, blksize))) return ier; // now d_plan->fk = d_plan->fw contains the spread values - // Step 2: Type 3 NUFFT + // Step 2: Type 2 NUFFT // type 2 goes from fk to c // saving the results directly in the user output array d_fk // it needs to do blksize transforms diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 89a089467..57c1e39dc 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -21,10 +21,13 @@ namespace common { using namespace cufinufft::spreadinterp; using std::max; -/* Kernel for computing approximations of exact Fourier series coeffs of - cnufftspread's real symmetric kernel. 
*/ -// a , f are intermediate results from function onedim_fseries_kernel_precomp() -// (see cufinufft/contrib/common.cpp for description) +/** Kernel for computing approximations of exact Fourier series coeffs of + * cnufftspread's real symmetric kernel. + * a , f are intermediate results from function onedim_fseries_kernel_precomp() + * (see cufinufft/contrib/common.cpp for description) + * this is the equispaced frequency case, used by type 1 & 2, matching + * onedim_fseries_kernel in CPU code + */ template __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, @@ -60,6 +63,13 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, } } +/** Kernel for computing approximations of exact Fourier series coeffs of + * cnufftspread's real symmetric kernel. + * a , f are intermediate results from function onedim_fseries_kernel_precomp() + * (see cufinufft/contrib/common.cpp for description) + * this is the arbitrary frequency case (hence the extra kx, ky, kx arguments), used by + * type 3, matching onedim_nuft_kernel in CPU code + */ template __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T *kx, T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2, @@ -129,6 +139,7 @@ int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, T narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), for a kernel with x measured in grid-spacings. (See previous routine for FT definition). + It implements onedim_nuft_kernel in CPU code. Marco Barbone 08/28/2024 */ @@ -177,13 +188,13 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts Inputs: nf - size of 1d uniform spread grid, must be even. 
opts - spreading opts object, needed to eval kernel (must be already set up) - phase_winding - if true, compute normalization factors for phase winding rates, - otherwise compute scaled quadrature nodes + phase_winding - if true (type 1-2), scaling for the equispaced case else (type 3) + scaling for the general kx,ky,kz case Outputs: - a - normalization factors if phase winding is true, otherwise scaled quadrature nodes - f - funciton values at quadrature nodes multiplied with quadrature weights - (a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below) + a - vector of scaled quadrature nodes; + f - funciton values at quadrature nodes multiplied with quadrature weights (a, f are + provided as the inputs of onedim_fseries_kernel_compute() defined below) */ template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index d40e8e66b..1cadb7569 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -94,7 +94,7 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) ${PREC} ${UPSAMP}) add_test(NAME cufinufft3d3_test_SM_${PREC}_${UPSAMP} - COMMAND cufinufft3d_test 2 3 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + COMMAND cufinufft3d_test 2 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) endif() @@ -103,7 +103,7 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) ${PREC} ${UPSAMP}) add_test(NAME cufinufft3d3_test_GM_${PREC}_${UPSAMP} - COMMAND cufinufft3d_test 1 3 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + COMMAND cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100 ${PREC} ${UPSAMP}) endfunction() @@ -115,6 +115,10 @@ add_tests(float 1e-5 2e-4 2.0) add_tests(double 1e-12 1e-11 2.0) add_tests(float 1e-5 2e-4 1.25) add_tests(double 1e-8 1e-7 1.25) +# the upsamp is appended to the testname, ctest does not allows multiple tests +# to share the same testname hence we use the trick 0. 
and 0.f to differentiate +# the tests and allow them to run in the future we should add the precision to +# the test (f add_tests(float 1e-5 2e-4 0.f) add_tests(double 1e-12 1e-11 0.f) add_tests(float 1e-5 2e-4 0.) diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu index ceef0881f..52d40ca0e 100644 --- a/test/cuda/cufinufft1d_test.cu +++ b/test/cuda/cufinufft1d_test.cu @@ -58,7 +58,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, } s.resize(N1); for (int i = 0; i < N1; i++) { - s[i] = M_PI * randm11(); + s[i] = N1 / 2 * randm11(); } d_s = s; } else { @@ -212,7 +212,7 @@ int main(int argc, char *argv[]) { "Arguments:\n" " method: One of\n" " 1: nupts driven\n" - " type: Type of transform (1, 2)\n" + " type: Type of transform (1, 2, 3)\n" " N1: Number of fourier modes\n" " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu index 0832452a6..549508d26 100644 --- a/test/cuda/cufinufft2d_test.cu +++ b/test/cuda/cufinufft2d_test.cu @@ -57,8 +57,8 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int s.resize(N1 * N2); t.resize(N1 * N2); for (int i = 0; i < N1 * N2; i++) { - s[i] = M_PI * randm11(); - t[i] = M_PI * randm11(); + s[i] = (N1 / 2) * randm11(); + t[i] = (N2 / 2) * randm11(); } d_s = s; d_t = t; diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu index 23f29f1a1..65b0d7a0c 100644 --- a/test/cuda/cufinufft3d_test.cu +++ b/test/cuda/cufinufft3d_test.cu @@ -60,9 +60,9 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check t.resize(N1 * N2 * N3); u.resize(N1 * N2 * N3); for (int i = 0; i < N1 * N2 * N3; i++) { - s[i] = M_PI * randm11(); - t[i] = M_PI * randm11(); - u[i] = M_PI * randm11(); + s[i] = (N1 / 2) * randm11(); + t[i] = (N2 / 2) * randm11(); + u[i] = (N3 / 2) * randm11(); } d_s = s; d_t = t; diff --git 
a/test/cuda/cufinufft_math_test.cu b/test/cuda/cufinufft_math_test.cu index 27e3325af..6da6e5509 100644 --- a/test/cuda/cufinufft_math_test.cu +++ b/test/cuda/cufinufft_math_test.cu @@ -19,13 +19,15 @@ bool compareComplex(const cuda_complex a, const std::complex b, T epsilon = std::numeric_limits::epsilon()) { const auto std_a = std::complex(a.x, a.y); const auto err = std::abs(std_a - b) / std::abs(std_a); - if (err > epsilon) { + // add 10% tolerance to the error + constexpr auto tol = std::is_same_v ? 1e2f : 1e4; + if (err > epsilon * tol) { std::cout << "Comparison failed in operation: " << operation << "\n"; std::cout << "cuComplex: (" << a.x << ", " << a.y << ")\n"; std::cout << "std::complex: (" << b.real() << ", " << b.imag() << ")\n"; std::cout << "Error: " << err << "\n"; } - return err <= epsilon; + return err <= epsilon * tol; } template int testRandomOperations() { From 1355818f58aceb204ae5ba3ea895b5922760a564 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 11 Sep 2024 16:20:57 -0400 Subject: [PATCH 66/68] splitting onedim_f_series in two functions --- include/cufinufft/common.h | 13 ++++++----- include/cufinufft/impl.h | 29 ++++++++++++------------ src/cuda/common.cu | 46 +++++++++++++++++++++++++------------- 3 files changed, 52 insertions(+), 36 deletions(-) diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index 09c940013..5e8a63b56 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -32,12 +32,13 @@ int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT b); -// template -// void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts -// opts); -template -void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, - finufft_spread_opts opts); + +template +void onedim_uniformn_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, 
T *f, T *a, + finufft_spread_opts opts); +template +void onedim_non_uniform_fseries_kernel_precomp(T *f, T *a, finufft_spread_opts opts); + template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, T *fwkerhalf, finufft_spread_opts opts); diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index d85dfa22b..9d4dcfdf5 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -297,14 +297,14 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran T fseries_precomp_f[3 * MAX_NQUAD]; thrust::device_vector d_fseries_precomp_a(3 * MAX_NQUAD); thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); - onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, - fseries_precomp_a, d_plan->spopts); + onedim_uniformn_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, + fseries_precomp_a, d_plan->spopts); if (d_plan->dim > 1) - onedim_fseries_kernel_precomp(d_plan->nf2, fseries_precomp_f + MAX_NQUAD, - fseries_precomp_a + MAX_NQUAD, - d_plan->spopts); + onedim_uniformn_fseries_kernel_precomp( + d_plan->nf2, fseries_precomp_f + MAX_NQUAD, fseries_precomp_a + MAX_NQUAD, + d_plan->spopts); if (d_plan->dim > 2) - onedim_fseries_kernel_precomp( + onedim_uniformn_fseries_kernel_precomp( d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD, fseries_precomp_a + 2 * MAX_NQUAD, d_plan->spopts); // copy the precomputed data to the device using thrust @@ -687,17 +687,18 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ if (d_plan->dim > 2) { phi_hat3.resize(N); } - onedim_fseries_kernel_precomp(0, fseries_precomp_f.data(), - fseries_precomp_a.data(), d_plan->spopts); + onedim_non_uniform_fseries_kernel_precomp( + fseries_precomp_f.data(), fseries_precomp_a.data(), d_plan->spopts); if (d_plan->dim > 1) { - onedim_fseries_kernel_precomp(0, fseries_precomp_f.data() + MAX_NQUAD, - fseries_precomp_a.data() + MAX_NQUAD, - d_plan->spopts); + 
onedim_non_uniform_fseries_kernel_precomp(fseries_precomp_f.data() + MAX_NQUAD, + fseries_precomp_a.data() + MAX_NQUAD, + d_plan->spopts); } if (d_plan->dim > 2) { - onedim_fseries_kernel_precomp(0, fseries_precomp_f.data() + 2 * MAX_NQUAD, - fseries_precomp_a.data() + 2 * MAX_NQUAD, - d_plan->spopts); + onedim_non_uniform_fseries_kernel_precomp( + fseries_precomp_f.data() + 2 * MAX_NQUAD, + fseries_precomp_a.data() + 2 * MAX_NQUAD, + d_plan->spopts); } // copy the precomputed data to the device using thrust thrust::copy(fseries_precomp_a.begin(), fseries_precomp_a.end(), diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 57c1e39dc..5072c06fe 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -196,13 +196,31 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts f - funciton values at quadrature nodes multiplied with quadrature weights (a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below) */ -template -void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, - finufft_spread_opts opts) { + +template +void onedim_uniformn_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, + finufft_spread_opts opts) { + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 3.0 * J2); // not sure why so large? 
cannot + // exceed MAX_NQUAD + double z[2 * MAX_NQUAD]; + double w[2 * MAX_NQUAD]; + finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used, + // eg on (0,1) + for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n + z[n] *= J2; // rescale nodes + f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei + a[n] = ((T)(2.0 * M_PI) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates + } +} + +template +void onedim_non_uniform_fseries_kernel_precomp(T *f, T *a, finufft_spread_opts opts) { T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + (phase_winding ? 3.0 : 2.0) * J2); // not sure why so large? cannot - // exceed MAX_NQUAD + int q = (int)(2 + 2.0 * J2); // not sure why so large? cannot + // exceed MAX_NQUAD double z[2 * MAX_NQUAD]; double w[2 * MAX_NQUAD]; finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used, @@ -210,11 +228,7 @@ void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n z[n] *= J2; // rescale nodes f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei - if constexpr (phase_winding) { - a[n] = ((T)(2.0 * M_PI) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates - } else { - a[n] = T(z[n]); - } + a[n] = T(z[n]); } } @@ -335,14 +349,14 @@ template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps, cufinufft_opts opts); template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps, cufinufft_opts opts); -template void onedim_fseries_kernel_precomp( - CUFINUFFT_BIGINT nf, float *f, float *a, finufft_spread_opts opts); -template void onedim_fseries_kernel_precomp( - CUFINUFFT_BIGINT nf, double *f, double *a, finufft_spread_opts opts); -template void onedim_fseries_kernel_precomp( +template void onedim_uniformn_fseries_kernel_precomp( CUFINUFFT_BIGINT 
nf, float *f, float *a, finufft_spread_opts opts); -template void onedim_fseries_kernel_precomp( +template void onedim_uniformn_fseries_kernel_precomp( CUFINUFFT_BIGINT nf, double *f, double *a, finufft_spread_opts opts); +template void onedim_non_uniform_fseries_kernel_precomp(float *f, float *a, + finufft_spread_opts opts); +template void onedim_non_uniform_fseries_kernel_precomp(double *f, double *a, + finufft_spread_opts opts); template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f, float *d_a, float *d_fwkerhalf1, float *d_fwkerhalf2, float *d_fwkerhalf3, int ns, cudaStream_t stream); From bc64a92b28f692d326c869f04e86f20c2b92ad49 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 12 Sep 2024 12:53:29 -0400 Subject: [PATCH 67/68] GPU flipwind type 1-2; fseries and nuft renaming to match CPU code --- include/cufinufft/common.h | 20 +++---- include/cufinufft/impl.h | 71 ++++++++++++------------ src/cuda/common.cu | 108 ++++++++++++++++++------------------- 3 files changed, 95 insertions(+), 104 deletions(-) diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index 5e8a63b56..5c11e8815 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -16,17 +16,17 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int ns); template -__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T *kx, +__global__ void cu_nuft_kernel_compute(int nf1, int nf2, int nf3, T *f, T *z, T *kx, T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int ns); template -int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, +int fseries_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_phase, T *d_fwkerhalf1, T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, cudaStream_t stream); template -int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, T *d_kx, - T *d_ky, T *d_kz, 
T *d_fwkerhalf1, T *d_fwkerhalf2, - T *d_fwkerhalf3, int ns, cudaStream_t stream); +int nuft_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_z, T *d_kx, + T *d_ky, T *d_kz, T *d_fwkerhalf1, T *d_fwkerhalf2, + T *d_fwkerhalf3, int ns, cudaStream_t stream); template int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts); @@ -34,14 +34,10 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT b); template -void onedim_uniformn_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, - finufft_spread_opts opts); +void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, + finufft_spread_opts opts); template -void onedim_non_uniform_fseries_kernel_precomp(T *f, T *a, finufft_spread_opts opts); - -template -void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, - T *fwkerhalf, finufft_spread_opts opts); +void onedim_nuft_kernel_precomp(T *f, T *zout, finufft_spread_opts opts); template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 9d4dcfdf5..0913a404e 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -293,29 +293,28 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->fftplan = fftplan; // compute up to 3 * NQUAD precomputed values on CPU - T fseries_precomp_a[3 * MAX_NQUAD]; + T fseries_precomp_phase[3 * MAX_NQUAD]; T fseries_precomp_f[3 * MAX_NQUAD]; - thrust::device_vector d_fseries_precomp_a(3 * MAX_NQUAD); + thrust::device_vector d_fseries_precomp_phase(3 * MAX_NQUAD); thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); - onedim_uniformn_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, - fseries_precomp_a, d_plan->spopts); + onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, + fseries_precomp_phase, d_plan->spopts); if (d_plan->dim 
> 1) - onedim_uniformn_fseries_kernel_precomp( - d_plan->nf2, fseries_precomp_f + MAX_NQUAD, fseries_precomp_a + MAX_NQUAD, - d_plan->spopts); + onedim_fseries_kernel_precomp(d_plan->nf2, fseries_precomp_f + MAX_NQUAD, + fseries_precomp_phase + MAX_NQUAD, d_plan->spopts); if (d_plan->dim > 2) - onedim_uniformn_fseries_kernel_precomp( - d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD, - fseries_precomp_a + 2 * MAX_NQUAD, d_plan->spopts); + onedim_fseries_kernel_precomp(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD, + fseries_precomp_phase + 2 * MAX_NQUAD, + d_plan->spopts); // copy the precomputed data to the device using thrust - thrust::copy(fseries_precomp_a, fseries_precomp_a + 3 * MAX_NQUAD, - d_fseries_precomp_a.begin()); + thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD, + d_fseries_precomp_phase.begin()); thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD, d_fseries_precomp_f.begin()); // the full fseries is done on the GPU here - if ((ier = cufserieskernelcompute( + if ((ier = fseries_kernel_compute( d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3, - d_fseries_precomp_f.data().get(), d_fseries_precomp_a.data().get(), + d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(), d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->spopts.nspread, stream))) goto finalize; @@ -673,10 +672,10 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ // and the precomputed data for the fseries kernel using namespace cufinufft::common; - std::array fseries_precomp_a{}; - std::array fseries_precomp_f{}; - thrust::device_vector d_fseries_precomp_a(3 * MAX_NQUAD); - thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); + std::array nuft_precomp_z{}; + std::array nuft_precomp_f{}; + thrust::device_vector d_nuft_precomp_z(3 * MAX_NQUAD); + thrust::device_vector d_nuft_precomp_f(3 * MAX_NQUAD); thrust::device_vector phi_hat1, phi_hat2, phi_hat3; if (d_plan->dim > 0) { 
phi_hat1.resize(N); @@ -687,30 +686,26 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ if (d_plan->dim > 2) { phi_hat3.resize(N); } - onedim_non_uniform_fseries_kernel_precomp( - fseries_precomp_f.data(), fseries_precomp_a.data(), d_plan->spopts); + onedim_nuft_kernel_precomp(nuft_precomp_f.data(), nuft_precomp_z.data(), + d_plan->spopts); if (d_plan->dim > 1) { - onedim_non_uniform_fseries_kernel_precomp(fseries_precomp_f.data() + MAX_NQUAD, - fseries_precomp_a.data() + MAX_NQUAD, - d_plan->spopts); + onedim_nuft_kernel_precomp(nuft_precomp_f.data() + MAX_NQUAD, + nuft_precomp_z.data() + MAX_NQUAD, + d_plan->spopts); } if (d_plan->dim > 2) { - onedim_non_uniform_fseries_kernel_precomp( - fseries_precomp_f.data() + 2 * MAX_NQUAD, - fseries_precomp_a.data() + 2 * MAX_NQUAD, - d_plan->spopts); + onedim_nuft_kernel_precomp(nuft_precomp_f.data() + 2 * MAX_NQUAD, + nuft_precomp_z.data() + 2 * MAX_NQUAD, + d_plan->spopts); } // copy the precomputed data to the device using thrust - thrust::copy(fseries_precomp_a.begin(), fseries_precomp_a.end(), - d_fseries_precomp_a.begin()); - thrust::copy(fseries_precomp_f.begin(), fseries_precomp_f.end(), - d_fseries_precomp_f.begin()); + thrust::copy(nuft_precomp_z.begin(), nuft_precomp_z.end(), d_nuft_precomp_z.begin()); + thrust::copy(nuft_precomp_f.begin(), nuft_precomp_f.end(), d_nuft_precomp_f.begin()); // sync the stream before calling the kernel might be needed - if (cufserieskernelcompute(d_plan->dim, N, N, N, d_fseries_precomp_f.data().get(), - d_fseries_precomp_a.data().get(), d_plan->d_Sp, - d_plan->d_Tp, d_plan->d_Up, phi_hat1.data().get(), - phi_hat2.data().get(), phi_hat3.data().get(), - d_plan->spopts.nspread, stream)) + if (nuft_kernel_compute(d_plan->dim, N, N, N, d_nuft_precomp_f.data().get(), + d_nuft_precomp_z.data().get(), d_plan->d_Sp, d_plan->d_Tp, + d_plan->d_Up, phi_hat1.data().get(), phi_hat2.data().get(), + phi_hat3.data().get(), d_plan->spopts.nspread, stream)) goto 
finalize; const auto is_c_finite = std::isfinite(d_plan->type3_params.C1) && @@ -753,8 +748,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, d_plan->deconv, d_plan->deconv, [c1, c2, c3, d1, d2, d3, realsign] __host__ __device__( - const thrust::tuple tuple, cuda_complex deconv) - -> cuda_complex { + const thrust::tuple tuple, + cuda_complex deconv) -> cuda_complex { // d2 and d3 are 0 if dim < 2 and dim < 3 const auto phase = c1 * (thrust::get<0>(tuple) + d1) + c2 * (thrust::get<1>(tuple) + d2) + diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 5072c06fe..31f0418e2 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -23,20 +23,20 @@ using std::max; /** Kernel for computing approximations of exact Fourier series coeffs of * cnufftspread's real symmetric kernel. - * a , f are intermediate results from function onedim_fseries_kernel_precomp() + * phase, f are intermediate results from function onedim_fseries_kernel_precomp() * (see cufinufft/contrib/common.cpp for description) * this is the equispaced frequency case, used by type 1 & 2, matching * onedim_fseries_kernel in CPU code */ template -__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, - T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, - int ns) { +__global__ void cu_fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *phase, + T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, + int ns) { T J2 = ns / 2.0; int q = (int)(2 + 3.0 * J2); int nf; - T *at = a + threadIdx.y * MAX_NQUAD; - T *ft = f + threadIdx.y * MAX_NQUAD; + T *phaset = phase + threadIdx.y * MAX_NQUAD; + T *ft = f + threadIdx.y * MAX_NQUAD; T *oarr; // standard parallelism pattern in cuda. 
using a 2D grid, this allows to leverage more // threads as the parallelism is x*y*z @@ -57,9 +57,9 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T x = 0.0; for (int n = 0; n < q; n++) { // in type 1/2 2*PI/nf -> k[i] - x += ft[n] * T(2) * std::cos(T(i) * at[n]); + x += ft[n] * T(2) * std::cos(T(i) * phaset[n]); } - oarr[i] = x; + oarr[i] = x * T(i % 2 ? -1 : 1); // signflip for the kernel origin being at PI } } @@ -71,13 +71,13 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, * type 3, matching onedim_nuft_kernel in CPU code */ template -__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T *kx, +__global__ void cu_nuft_kernel_compute(int nf1, int nf2, int nf3, T *f, T *z, T *kx, T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int ns) { T J2 = ns / 2.0; int q = (int)(2 + 2.0 * J2); int nf; - T *at = a + threadIdx.y * MAX_NQUAD; + T *at = z + threadIdx.y * MAX_NQUAD; T *ft = f + threadIdx.y * MAX_NQUAD; T *oarr, *k; // standard parallelism pattern in cuda. 
using a 2D grid, this allows to leverage more @@ -107,7 +107,7 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T } template -int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, +int fseries_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_phase, T *d_fwkerhalf1, T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, cudaStream_t stream) /* @@ -122,24 +122,25 @@ int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, dim3 threadsPerBlock(16, dim); dim3 numBlocks((nout + 16 - 1) / 16, 1); - fseries_kernel_compute<<>>( - nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns); + cu_fseries_kernel_compute<<>>( + nf1, nf2, nf3, d_f, d_phase, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns); RETURN_IF_CUDA_ERROR return 0; } template -int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, T *d_kx, - T *d_ky, T *d_kz, T *d_fwkerhalf1, T *d_fwkerhalf2, - T *d_fwkerhalf3, int ns, cudaStream_t stream) +int nuft_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_z, T *d_kx, + T *d_ky, T *d_kz, T *d_fwkerhalf1, T *d_fwkerhalf2, + T *d_fwkerhalf3, int ns, cudaStream_t stream) /* Approximates exact Fourier transform of cnufftspread's real symmetric kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), for a kernel with x measured in grid-spacings. (See previous routine for FT definition). - It implements onedim_nuft_kernel in CPU code. + It implements onedim_nuft_kernel in CPU code. Except it combines up to three + onedimensional kernel evaluations at once (for efficiency). 
Marco Barbone 08/28/2024 */ @@ -149,8 +150,8 @@ int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, T dim3 threadsPerBlock(16, dim); dim3 numBlocks((nout + 16 - 1) / 16, 1); - fseries_kernel_compute<<>>( - nf1, nf2, nf3, d_f, d_a, d_kx, d_ky, d_kz, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, + cu_nuft_kernel_compute<<>>( + nf1, nf2, nf3, d_f, d_z, d_kx, d_ky, d_kz, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns); RETURN_IF_CUDA_ERROR @@ -192,43 +193,42 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts scaling for the general kx,ky,kz case Outputs: - a - vector of scaled quadrature nodes; - f - funciton values at quadrature nodes multiplied with quadrature weights (a, f are + a - vector of phases to be used for cosines on the GPU; + f - function values at quadrature nodes multiplied with quadrature weights (a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below) */ template -void onedim_uniformn_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a, - finufft_spread_opts opts) { +void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *phase, + finufft_spread_opts opts) { T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 3.0 * J2); // not sure why so large? 
cannot - // exceed MAX_NQUAD + const auto q = (int)(2 + 3.0 * J2); // matches CPU code double z[2 * MAX_NQUAD]; double w[2 * MAX_NQUAD]; finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used, // eg on (0,1) - for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n - z[n] *= J2; // rescale nodes - f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei - a[n] = ((T)(2.0 * M_PI) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates + for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n + z[n] *= J2; // rescale nodes + f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei + phase[n] = T(2.0 * M_PI * z[n] / T(nf)); // phase winding rates } } template -void onedim_non_uniform_fseries_kernel_precomp(T *f, T *a, finufft_spread_opts opts) { +void onedim_nuft_kernel_precomp(T *f, T *z, finufft_spread_opts opts) { + // it implements the first half of onedim_nuft_kernel in CPU code T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 2.0 * J2); // not sure why so large? 
cannot - // exceed MAX_NQUAD - double z[2 * MAX_NQUAD]; - double w[2 * MAX_NQUAD]; - finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used, - // eg on (0,1) + int q = (int)(2 + 2.0 * J2); // matches CPU code + double z_local[2 * MAX_NQUAD]; + double w_local[2 * MAX_NQUAD]; + finufft::quadrature::legendre_compute_glr(2 * q, z_local, w_local); // only half the + // nodes used, eg on + // (0,1) for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n - z[n] *= J2; // rescale nodes - f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei - a[n] = T(z[n]); + z[n] = J2 * T(z_local[n]); // rescale nodes + f[n] = J2 * w_local[n] * evaluate_kernel(z[n], opts); // vals & quadr wei } } @@ -349,25 +349,25 @@ template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps, cufinufft_opts opts); template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps, cufinufft_opts opts); -template void onedim_uniformn_fseries_kernel_precomp( - CUFINUFFT_BIGINT nf, float *f, float *a, finufft_spread_opts opts); -template void onedim_uniformn_fseries_kernel_precomp( - CUFINUFFT_BIGINT nf, double *f, double *a, finufft_spread_opts opts); -template void onedim_non_uniform_fseries_kernel_precomp(float *f, float *a, - finufft_spread_opts opts); -template void onedim_non_uniform_fseries_kernel_precomp(double *f, double *a, - finufft_spread_opts opts); -template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f, +template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, float *f, + float *a, finufft_spread_opts opts); +template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, double *f, + double *a, finufft_spread_opts opts); +template void onedim_nuft_kernel_precomp(float *f, float *a, + finufft_spread_opts opts); +template void onedim_nuft_kernel_precomp(double *f, double *a, + finufft_spread_opts opts); +template int fseries_kernel_compute(int dim, int nf1, int nf2, int nf3, 
float *d_f, float *d_a, float *d_fwkerhalf1, float *d_fwkerhalf2, float *d_fwkerhalf3, int ns, cudaStream_t stream); -template int cufserieskernelcompute( +template int fseries_kernel_compute( int dim, int nf1, int nf2, int nf3, double *d_f, double *d_a, double *d_fwkerhalf1, double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, cudaStream_t stream); -template int cufserieskernelcompute( - int dim, int nf1, int nf2, int nf3, float *d_f, float *d_a, float *d_kx, float *d_ky, - float *d_kz, float *d_fwkerhalf1, float *d_fwkerhalf2, float *d_fwkerhalf3, int ns, - cudaStream_t stream); -template int cufserieskernelcompute( +template int nuft_kernel_compute(int dim, int nf1, int nf2, int nf3, float *d_f, + float *d_a, float *d_kx, float *d_ky, float *d_kz, + float *d_fwkerhalf1, float *d_fwkerhalf2, + float *d_fwkerhalf3, int ns, cudaStream_t stream); +template int nuft_kernel_compute( int dim, int nf1, int nf2, int nf3, double *d_f, double *d_a, double *d_kx, double *d_ky, double *d_kz, double *d_fwkerhalf1, double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, cudaStream_t stream); From 96980d3eb308ab53a68a20a3a0f04068db0e6c3a Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 12 Sep 2024 13:05:54 -0400 Subject: [PATCH 68/68] fixed complex math test --- test/cuda/cufinufft_math_test.cu | 92 ++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/test/cuda/cufinufft_math_test.cu b/test/cuda/cufinufft_math_test.cu index 6da6e5509..1588abb23 100644 --- a/test/cuda/cufinufft_math_test.cu +++ b/test/cuda/cufinufft_math_test.cu @@ -14,26 +14,41 @@ template cuda_complex make_cuda_complex(T real, T imag) { // Helper function to compare cuComplex with std::complex using 1 - ratio as error template -bool compareComplex(const cuda_complex a, const std::complex b, - const std::string &operation, - T epsilon = std::numeric_limits::epsilon()) { +bool compareComplexRel(const cuda_complex a, const std::complex b, + const std::string 
&operation, + T epsilon = std::numeric_limits::epsilon()) { const auto std_a = std::complex(a.x, a.y); const auto err = std::abs(std_a - b) / std::abs(std_a); - // add 10% tolerance to the error - constexpr auto tol = std::is_same_v ? 1e2f : 1e4; - if (err > epsilon * tol) { + const auto tol = epsilon * T(10); // factor to allow for rounding error + if (err > tol) { std::cout << "Comparison failed in operation: " << operation << "\n"; std::cout << "cuComplex: (" << a.x << ", " << a.y << ")\n"; std::cout << "std::complex: (" << b.real() << ", " << b.imag() << ")\n"; - std::cout << "Error: " << err << "\n"; + std::cout << "RelError: " << err << "\n"; } - return err <= epsilon * tol; + return err <= tol; +} + +template +bool compareComplexAbs(const cuda_complex a, const std::complex b, + const std::string &operation, + T epsilon = std::numeric_limits::epsilon()) { + const auto std_a = std::complex(a.x, a.y); + const auto err = std::abs(std_a - b); + const auto tol = epsilon * T(10); // factor to allow for rounding error + if (err > tol) { + std::cout << "Comparison failed in operation: " << operation << "\n"; + std::cout << "cuComplex: (" << a.x << ", " << a.y << ")\n"; + std::cout << "std::complex: (" << b.real() << ", " << b.imag() << ")\n"; + std::cout << "AbsError: " << err << "\n"; + } + return err <= tol; } template int testRandomOperations() { std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution dis(-100.0, 100.0); + std::uniform_real_distribution dis(-1.0, 1.0); for (int i = 0; i < 1000; ++i) { T real1 = dis(gen); @@ -50,69 +65,64 @@ template int testRandomOperations() { // Test addition cuda_complex result_add = a + b; std::complex expected_add = std_a + std_b; - if (!compareComplex(result_add, expected_add, - "add complex<" + std::string(typeid(T).name()) + "> complex<" + - std::string(typeid(T).name()) + ">")) + if (!compareComplexAbs(result_add, expected_add, + "add complex<" + std::string(typeid(T).name()) + "> complex<" + + 
std::string(typeid(T).name()) + ">")) return 1; // Test subtraction cuda_complex result_sub = a - b; std::complex expected_sub = std_a - std_b; - if (!compareComplex(result_sub, expected_sub, - "sub complex<" + std::string(typeid(T).name()) + "> complex<" + - std::string(typeid(T).name()) + ">")) + if (!compareComplexAbs(result_sub, expected_sub, + "sub complex<" + std::string(typeid(T).name()) + "> complex<" + + std::string(typeid(T).name()) + ">")) return 1; // Test multiplication cuda_complex result_mul = a * b; std::complex expected_mul = std_a * std_b; - if (!compareComplex(result_mul, expected_mul, - "mul complex<" + std::string(typeid(T).name()) + "> complex<" + - std::string(typeid(T).name()) + ">")) + if (!compareComplexRel(result_mul, expected_mul, + "mul complex<" + std::string(typeid(T).name()) + "> complex<" + + std::string(typeid(T).name()) + ">")) return 1; // Test division - // Avoid division by small numbers as the implementation is slightly different - // Maybe there is a better way to test it - if (real2 < 1.0 || imag2 < 1.0) { // Avoid division by zero - cuda_complex result_div = a / b; - std::complex expected_div = std_a / std_b; - if (!compareComplex(result_div, expected_div, - "div complex<" + std::string(typeid(T).name()) + "> complex<" + - std::string(typeid(T).name()) + ">", - std::numeric_limits::epsilon() * 1000)) - return 1; - } + cuda_complex result_div = a / b; + std::complex expected_div = std_a / std_b; + if (!compareComplexRel(result_div, expected_div, + "div complex<" + std::string(typeid(T).name()) + "> complex<" + + std::string(typeid(T).name()) + ">")) + return 1; // Test addition with scalar cuda_complex result_add_scalar = a + scalar; std::complex expected_add_scalar = std_a + scalar; - if (!compareComplex(result_add_scalar, expected_add_scalar, - "add complex<" + std::string(typeid(T).name()) + "> scalar<" + - std::string(typeid(T).name()) + ">")) + if (!compareComplexRel(result_add_scalar, expected_add_scalar, + "add 
complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) return 1; // Test subtraction with scalar cuda_complex result_sub_scalar = a - scalar; std::complex expected_sub_scalar = std_a - scalar; - if (!compareComplex(result_sub_scalar, expected_sub_scalar, - "sub complex<" + std::string(typeid(T).name()) + "> scalar<" + - std::string(typeid(T).name()) + ">")) + if (!compareComplexRel(result_sub_scalar, expected_sub_scalar, + "sub complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) return 1; // Test multiplication with scalar cuda_complex result_mul_scalar = a * scalar; std::complex expected_mul_scalar = std_a * scalar; - if (!compareComplex(result_mul_scalar, expected_mul_scalar, - "mul complex<" + std::string(typeid(T).name()) + "> scalar<" + - std::string(typeid(T).name()) + ">")) + if (!compareComplexRel(result_mul_scalar, expected_mul_scalar, + "mul complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) return 1; cuda_complex result_div_scalar = a / scalar; std::complex expected_div_scalar = std_a / scalar; - if (!compareComplex(result_div_scalar, expected_div_scalar, - "div complex<" + std::string(typeid(T).name()) + "> scalar<" + - std::string(typeid(T).name()) + ">")) + if (!compareComplexRel(result_div_scalar, expected_div_scalar, + "div complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) return 1; } return 0;