diff --git a/.github/workflows/cmake_ci.yml b/.github/workflows/cmake_ci.yml index 9b483ecbf..75be3e19f 100644 --- a/.github/workflows/cmake_ci.yml +++ b/.github/workflows/cmake_ci.yml @@ -55,3 +55,33 @@ jobs: working-directory: ./build run: | ctest -C ${{matrix.build_type}} --output-on-failure + + - name: Set up Python + if: matrix.finufft_static_linking + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Build Python wheels + if: matrix.finufft_static_linking + env: + MACOSX_DEPLOYMENT_TARGET: 13 + shell: bash + run: | + python3 -m pip install \ + --verbose \ + -C cmake.define.CMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + -C cmake.define.FINUFFT_ARCH_FLAGS=${{ matrix.arch_flags }} \ + -C cmake.define.FINUFFT_USE_DUCC0=${{ matrix.ducc_fft }} \ + python/finufft + + - name: Install pytest + if: matrix.finufft_static_linking + run: | + python3 -m pip install --upgrade pip + python3 -m pip install pytest + + - name: Test Python package + if: matrix.finufft_static_linking + run: | + python3 -m pytest python/finufft/test diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 012050b15..543634f15 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -74,7 +74,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran using namespace cufinufft::common; int ier; if (type < 1 || type > 3) { - fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type); + fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__, type); return FINUFFT_ERR_TYPE_NOTVALID; } if (ntransf < 1) { @@ -178,7 +178,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); - if (ier = cudaGetLastError(), ier != cudaSuccess) { + if (cudaGetLastError() != cudaSuccess) { + ier = FINUFFT_ERR_CUDA_FAILURE; goto finalize; } if (d_plan->opts.debug) { @@ -196,6 +197,42 @@ int 
cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran printf("[cufinufft] shared memory required for the spreader: %ld\n", mem_required); } + + // dynamically request the maximum amount of shared memory available + // for the spreader + + /* Automatically set GPU method. */ + if (d_plan->opts.gpu_method == 0) { + /* For type 1, we default to method 2 (SM) since this is generally faster + * if there is enough shared memory available. Otherwise, we default to GM. + * Type 3 inherits this behavior since the outer plan here is also a type 1. + * + * For type 2, we always default to method 1 (GM). + */ + if (type == 2) { + d_plan->opts.gpu_method = 1; + } else { + // query the device for the amount of shared memory available + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + // compute the amount of shared memory required for the method + const auto shared_mem_required = shared_memory_required<T>( + dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + if ((shared_mem_required > shared_mem_per_block)) { + d_plan->opts.gpu_method = 1; + } else { + d_plan->opts.gpu_method = 2; + } + } + } + + if (cudaGetLastError() != cudaSuccess) { + ier = FINUFFT_ERR_CUDA_FAILURE; + goto finalize; + } + if (type == 1 || type == 2) { CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, @@ -207,39 +244,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, d_plan->opts.gpu_obinsizez); - // dynamically request the maximum amount of shared memory available - // for the spreader - - /* Automatically set GPU method. */ - if (d_plan->opts.gpu_method == 0) { - /* For type 1, we default to method 2 (SM) since this is generally faster - * if there is enough shared memory available.
Otherwise, we default to GM. - * - * For type 2, we always default to method 1 (GM). - */ - if (type == 2) { - d_plan->opts.gpu_method = 1; - } else { - // query the device for the amount of shared memory available - int shared_mem_per_block{}; - cudaDeviceGetAttribute(&shared_mem_per_block, - cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); - // compute the amount of shared memory required for the method - const auto shared_mem_required = shared_memory_required<T>( - dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, - d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); - if ((shared_mem_required > shared_mem_per_block)) { - d_plan->opts.gpu_method = 1; - } else { - d_plan->opts.gpu_method = 2; - } - } - } - - if ((ier = cudaGetLastError())) { - goto finalize; - } - d_plan->nf1 = nf1; d_plan->nf2 = nf2; d_plan->nf3 = nf3; @@ -795,7 +799,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3}; cufinufft_opts t2opts = d_plan->opts; t2opts.gpu_spreadinterponly = 0; - t2opts.gpu_method = 1; + t2opts.gpu_method = 0; // Safe to ignore the return value here?
if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan); // check that maxbatchsize is correct diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index 534fa5358..fa4990285 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -6,7 +6,13 @@ #include #include -inline bool is_invalid_mode_array(int dim, const int64_t *modes64, int32_t modes32[3]) { +inline bool is_invalid_mode_array(int type, int dim, const int64_t *modes64, + int32_t modes32[3]) { + if (type == 3) { + modes32[0] = modes32[1] = modes32[2] = 1; + return false; + } + int64_t tot_size = 1; for (int i = 0; i < dim; ++i) { if (modes64[i] > std::numeric_limits<int32_t>::max()) return true; @@ -28,7 +34,9 @@ int cufinufftf_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int } int nmodes32[3]; - if (is_invalid_mode_array(dim, nmodes, nmodes32)) return FINUFFT_ERR_NDATA_NOTVALID; + if (is_invalid_mode_array(type, dim, nmodes, nmodes32)) { + return FINUFFT_ERR_NDATA_NOTVALID; + } return cufinufft_makeplan_impl<float>(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t<float> **)d_plan_ptr, opts); @@ -42,7 +50,9 @@ int cufinufft_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int } int nmodes32[3]; - if (is_invalid_mode_array(dim, nmodes, nmodes32)) return FINUFFT_ERR_NDATA_NOTVALID; + if (is_invalid_mode_array(type, dim, nmodes, nmodes32)) { + return FINUFFT_ERR_NDATA_NOTVALID; + } return cufinufft_makeplan_impl<double>(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t<double> **)d_plan_ptr, opts); diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 1cadb7569..d8af7918a 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -21,6 +21,10 @@ foreach(srcfile ${test_src}) endforeach() function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) + add_test(NAME cufinufft1d1_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 0 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) + add_test(NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP} COMMAND
cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -29,13 +33,26 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft1d2_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 0 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) + add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + + add_test(NAME cufinufft1d3_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 0 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) + add_test(NAME cufinufft1d3_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft1d_test 1 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d1_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 0 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -44,10 +61,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d2_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 0 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d2_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d3_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 0 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d3_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -76,6 +101,10 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) COMMAND cufinufft2dmany_test 2 3 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME 
cufinufft3d1_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 0 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -98,10 +127,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) ${PREC} ${UPSAMP}) endif() + add_test(NAME cufinufft3d2_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 0 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft3d3_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 0 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft3d3_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100 ${PREC} ${UPSAMP})