diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst index 97c50b31a..a5f050220 100644 --- a/docs/c_gpu.rst +++ b/docs/c_gpu.rst @@ -311,11 +311,8 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping ove **gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented] -Diagnostic options -~~~~~~~~~~~~~~~~~~ - **gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and kernel FT deconvolution steps and return garbage answers. -Nonzero value is *only* to be used to aid timing tests (although currently there are no timing codes that exploit this option), and will give wrong or undefined answers for the NUFFT transforms! +Nonzero value can be used *only* with `upsampfac=1` and `kerevalmeth=1` for using cufinufft for only spread and interpolate mode. This has applications into estimating the density compensation, which is conventionally used in MRI. (See [MRI-NUFFT](https://mind-inria.github.io/mri-nufft/nufft.html)) Algorithm performance options diff --git a/docs/error.rst b/docs/error.rst index 9d9edc9a4..6e12d2859 100644 --- a/docs/error.rst +++ b/docs/error.rst @@ -29,6 +29,7 @@ has the following meanings (see codes in ``include/finufft_errors.h``): 18 size of bins for subprob/blockgather invalid 19 GPU shmem too small for subprob/blockgather parameters 20 invalid number of nonuniform points: nj or nk negative, or too big (see defs.h) + 23 invalid upsampfac set while using gpu_spreadinterponly mode. When ``ier=1`` (warning only) the transform(s) is/are still completed, at the smallest epsilon achievable, so, with that caveat, the answer should still be usable. diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 0913a404e..0ec67c818 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -107,7 +107,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->ms = nmodes[0]; d_plan->mt = nmodes[1]; d_plan->mu = nmodes[2]; - printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu); + if (d_plan->opts.debug) { + printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu); + } } else { d_plan->opts.gpu_spreadinterponly = 1; } @@ -153,7 +155,13 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac); } } - + if (d_plan->opts.gpu_spreadinterponly) { + // XNOR implementation below with boolean logic. + if ((d_plan->opts.upsampfac !=1) == (type != 3)) { + ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID; + goto finalize; + } + } /* Setup Spreader */ if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK @@ -254,70 +262,74 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } break; } - cufftHandle fftplan; - cufftResult_t cufft_status; - switch (d_plan->dim) { - case 1: { - int n[] = {(int)nf1}; - int inembed[] = {(int)nf1}; - - cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, - inembed[0], cufft_type(), batchsize); - } break; - case 2: { - int n[] = {(int)nf2, (int)nf1}; - int inembed[] = {(int)nf2, (int)nf1}; - - cufft_status = - cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, - inembed[0] * inembed[1], cufft_type(), batchsize); - } break; - case 3: { - int n[] = {(int)nf3, (int)nf2, (int)nf1}; - int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; - - cufft_status = cufftPlanMany( - &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, - inembed[0] * inembed[1] * inembed[2], cufft_type(), batchsize); - } break; - } + // We dont need any cuFFT plans or kernel values if we are only spreading / interpolating + if(!d_plan->opts.gpu_spreadinterponly) { + cufftHandle fftplan; + cufftResult_t cufft_status; + switch (d_plan->dim) { + case 1: { + int n[] = {(int)nf1}; + int inembed[] = {(int)nf1}; + + cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, + inembed[0], cufft_type(), batchsize); + } break; + case 2: { + int n[] = {(int)nf2, (int)nf1}; + int inembed[] = {(int)nf2, (int)nf1}; + + cufft_status = + cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, + inembed[0] * inembed[1], cufft_type(), batchsize); + } break; + case 3: { + int n[] = {(int)nf3, (int)nf2, (int)nf1}; + int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; + + cufft_status = cufftPlanMany( + &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, + inembed[0] * inembed[1] * inembed[2], cufft_type(), batchsize); + } break; + } - if (cufft_status != CUFFT_SUCCESS) { - fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, - cufftGetErrorString(cufft_status)); - ier = FINUFFT_ERR_CUDA_FAILURE; - goto finalize; + if (cufft_status != CUFFT_SUCCESS) { + fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, + cufftGetErrorString(cufft_status)); + ier = FINUFFT_ERR_CUDA_FAILURE; + goto finalize; + } + cufftSetStream(fftplan, stream); + + d_plan->fftplan = fftplan; + + // compute up to 3 * NQUAD precomputed values on CPU + T fseries_precomp_phase[3 * MAX_NQUAD]; + T fseries_precomp_f[3 * MAX_NQUAD]; + thrust::device_vector d_fseries_precomp_phase(3 * MAX_NQUAD); + thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); + onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, + fseries_precomp_phase, d_plan->spopts); + if (d_plan->dim > 1) + onedim_fseries_kernel_precomp(d_plan->nf2, fseries_precomp_f + MAX_NQUAD, + fseries_precomp_phase + MAX_NQUAD, d_plan->spopts); + if (d_plan->dim > 2) + onedim_fseries_kernel_precomp(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD, + fseries_precomp_phase + 2 * MAX_NQUAD, + d_plan->spopts); + // copy the precomputed data to the device using thrust + thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD, + d_fseries_precomp_phase.begin()); + thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD, + d_fseries_precomp_f.begin()); + // the full fseries is done on the GPU here + if ((ier = fseries_kernel_compute( + d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3, + d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(), + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, + d_plan->spopts.nspread, stream))) + goto finalize; } - cufftSetStream(fftplan, stream); - - d_plan->fftplan = fftplan; - - // compute up to 3 * NQUAD precomputed values on CPU - T fseries_precomp_phase[3 * MAX_NQUAD]; - T fseries_precomp_f[3 * MAX_NQUAD]; - thrust::device_vector d_fseries_precomp_phase(3 * MAX_NQUAD); - thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); - onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, - fseries_precomp_phase, d_plan->spopts); - if (d_plan->dim > 1) - onedim_fseries_kernel_precomp(d_plan->nf2, fseries_precomp_f + MAX_NQUAD, - fseries_precomp_phase + MAX_NQUAD, d_plan->spopts); - if (d_plan->dim > 2) - onedim_fseries_kernel_precomp(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD, - fseries_precomp_phase + 2 * MAX_NQUAD, - d_plan->spopts); - // copy the precomputed data to the device using thrust - thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD, - d_fseries_precomp_phase.begin()); - thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD, - d_fseries_precomp_f.begin()); - // the full fseries is done on the GPU here - if ((ier = fseries_kernel_compute( - d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3, - d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(), - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, - d_plan->spopts.nspread, stream))) - goto finalize; + } finalize: if (ier > 1) { diff --git a/include/finufft_errors.h b/include/finufft_errors.h index 10465aa13..86158c219 100644 --- a/include/finufft_errors.h +++ b/include/finufft_errors.h @@ -25,6 +25,7 @@ enum { FINUFFT_ERR_INSUFFICIENT_SHMEM = 19, FINUFFT_ERR_NUM_NU_PTS_INVALID = 20, FINUFFT_ERR_INVALID_ARGUMENT = 21, - FINUFFT_ERR_LOCK_FUNS_INVALID = 22 + FINUFFT_ERR_LOCK_FUNS_INVALID = 22, + FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID = 23, }; #endif diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 06389ef75..d0328236f 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -43,8 +43,10 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms; d_plan->c = d_cstart; d_plan->fk = d_fkstart; - - // this is needed + if (d_plan->opts.gpu_spreadinterponly) + d_plan->fw = d_fkstart; + + // this is needed if ((ier = checkCudaErrors(cudaMemsetAsync( d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex), stream)))) @@ -52,7 +54,11 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread1d(d_plan, blksize))) return ier; + // if spreadonly, skip the rest + if (d_plan->opts.gpu_spreadinterponly) + continue; + // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); @@ -97,20 +103,25 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; - } else { - if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + + // Skip steps 1 and 2 if interponly + if (!d_plan->opts.gpu_spreadinterponly) { + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + } + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; } - - // Step 2: FFT - cufftResult cufft_status = - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle + else + d_plan->fw = d_fkstart; + + // Step 3: Interpolate if ((ier = cuinterp1d(d_plan, blksize))) return ier; } diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index 8c165edbf..916c4c71d 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -44,6 +44,8 @@ int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt; d_plan->c = d_cstart; d_plan->fk = d_fkstart; + if (d_plan->opts.gpu_spreadinterponly) + d_plan->fw = d_fkstart; // this is needed if ((ier = checkCudaErrors(cudaMemsetAsync( @@ -55,6 +57,10 @@ int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread2d(d_plan, blksize))) return ier; + // if spreadonly, skip the rest + if (d_plan->opts.gpu_spreadinterponly) + continue; + // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); @@ -100,19 +106,24 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; - } else { - if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + // Skip steps 1 and 2 if interponly + if (!d_plan->opts.gpu_spreadinterponly) { + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + } + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; } - - // Step 2: FFT - cufftResult cufft_status = - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle + else + d_plan->fw = d_fkstart; + + // Step 3: Interpolate if ((ier = cuinterp2d(d_plan, blksize))) return ier; } diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index ce1f37c0e..9f376297c 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -42,6 +42,8 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; + if (d_plan->opts.gpu_spreadinterponly) + d_plan->fw = d_fkstart; if ((ier = checkCudaErrors(cudaMemsetAsync( d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex), @@ -50,8 +52,12 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread3d(d_plan, blksize))) return ier; - - // Step 2: FFT + + // if spreadonly, skip the rest + if (d_plan->opts.gpu_spreadinterponly) + continue; + + // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; @@ -94,20 +100,24 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; - } else { - if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + // Skip steps 1 and 2 if interponly + if (!d_plan->opts.gpu_spreadinterponly) { + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + } + // Step 2: FFT + RETURN_IF_CUDA_ERROR + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; } - - // Step 2: FFT - RETURN_IF_CUDA_ERROR - cufftResult cufft_status = - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle + else + d_plan->fw = d_fkstart; + + // Step 3: Interpolate if ((ier = cuinterp3d(d_plan, blksize))) return ier; } diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index e5308e8bc..20bbb6355 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -434,12 +434,13 @@ void freegpumemory(cufinufft_plan_t *d_plan) utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); // Fixes a crash whewre the plan itself is deleted before the stream const auto stream = d_plan->stream; - - CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); + // Dont clear fw if spreadinterponly for type 1 and 2 as fw belongs to original program (it is d_fk) + if(!d_plan->opts.gpu_spreadinterponly || d_plan->type == 3) + CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools); - + CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools); diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index 646ffa434..f9c1de2ad 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -26,8 +26,9 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet __func__, upsampfac); return FINUFFT_ERR_HORNER_WRONG_BETA; } - if (upsampfac <= 1.0) { - fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac); + // Upsampfac == 1.0 is valid for spreadinterponly mode. + if (upsampfac < 1.0) { + fprintf(stderr, "[%s] error: upsampfac=%.3g is <1.0\n", __func__, upsampfac); return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; } // calling routine must abort on above errors, since opts is garbage!