Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for gpu_spreadinterponly=1 #564

Merged
merged 19 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions docs/c_gpu.rst
Original file line number Diff line number Diff line change
Expand Up @@ -311,11 +311,8 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping ove

**gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented]

Diagnostic options
~~~~~~~~~~~~~~~~~~

**gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and kernel FT deconvolution steps and return garbage answers.
Nonzero value is *only* to be used to aid timing tests (although currently there are no timing codes that exploit this option), and will give wrong or undefined answers for the NUFFT transforms!
Nonzero value can be used *only* with `upsampfac=1` and `kerevalmeth=1` for using cufinufft for only spread and interpolate mode. This has applications into estimating the density compensation, which is conventionally used in MRI. (See [MRI-NUFFT](https://mind-inria.github.io/mri-nufft/nufft.html))


Algorithm performance options
Expand Down
1 change: 1 addition & 0 deletions docs/error.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ has the following meanings (see codes in ``include/finufft_errors.h``):
18 size of bins for subprob/blockgather invalid
19 GPU shmem too small for subprob/blockgather parameters
20 invalid number of nonuniform points: nj or nk negative, or too big (see defs.h)
23 invalid upsampfac set while using gpu_spreadinterponly mode.

When ``ier=1`` (warning only) the transform(s) is/are still completed, at the smallest epsilon achievable, so, with that caveat, the answer should still be usable.

Expand Down
24 changes: 16 additions & 8 deletions include/cufinufft/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
d_plan->ms = nmodes[0];
d_plan->mt = nmodes[1];
d_plan->mu = nmodes[2];
printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
if (d_plan->opts.debug) {
printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
}
} else {
d_plan->opts.gpu_spreadinterponly = 1;
}
Expand Down Expand Up @@ -153,7 +155,10 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac);
}
}

if (d_plan->opts.gpu_spreadinterponly && d_plan->opts.upsampfac != 1 && d_plan->type != 3) {
ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
goto finalize;
}
/* Setup Spreader */
if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
// can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
Expand Down Expand Up @@ -311,13 +316,16 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
d_fseries_precomp_phase.begin());
thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
d_fseries_precomp_f.begin());
if(!d_plan->opts.gpu_spreadinterponly) {
DiamonDinoia marked this conversation as resolved.
Show resolved Hide resolved
// the full fseries is done on the GPU here
if ((ier = fseries_kernel_compute(
d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
d_plan->spopts.nspread, stream)))
goto finalize;
if ((ier = fseries_kernel_compute(
d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
d_plan->spopts.nspread, stream)))
goto finalize;
}

}
finalize:
if (ier > 1) {
Expand Down
3 changes: 2 additions & 1 deletion include/finufft_errors.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ enum {
FINUFFT_ERR_INSUFFICIENT_SHMEM = 19,
FINUFFT_ERR_NUM_NU_PTS_INVALID = 20,
FINUFFT_ERR_INVALID_ARGUMENT = 21,
FINUFFT_ERR_LOCK_FUNS_INVALID = 22
FINUFFT_ERR_LOCK_FUNS_INVALID = 22,
FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID = 23,
};
#endif
41 changes: 26 additions & 15 deletions src/cuda/1d/cufinufft1d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,22 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms;
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// this is needed
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

// this is needed
if ((ier = checkCudaErrors(cudaMemsetAsync(
d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex<T>),
stream))))
return ier;
DiamonDinoia marked this conversation as resolved.
Show resolved Hide resolved

// Step 1: Spread
if ((ier = cuspread1d<T>(d_plan, blksize))) return ier;
// if spreadonly, skip the rest

if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
Expand Down Expand Up @@ -97,20 +103,25 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;

// Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1
if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) {
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
else
d_plan->fw = d_fkstart;

// Step 3: Interpolate
if ((ier = cuinterp1d<T>(d_plan, blksize))) return ier;
}

Expand Down
35 changes: 23 additions & 12 deletions src/cuda/2d/cufinufft2d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt;
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

// this is needed
if ((ier = checkCudaErrors(cudaMemsetAsync(
Expand All @@ -55,6 +57,10 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
// Step 1: Spread
if ((ier = cuspread2d<T>(d_plan, blksize))) return ier;

// if spreadonly, skip the rest
if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
Expand Down Expand Up @@ -100,19 +106,24 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
// Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1
if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) {
ahbarnett marked this conversation as resolved.
Show resolved Hide resolved
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
else
d_plan->fw = d_fkstart;

// Step 3: Interpolate
if ((ier = cuinterp2d<T>(d_plan, blksize))) return ier;
}

Expand Down
40 changes: 25 additions & 15 deletions src/cuda/3d/cufinufft3d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

d_plan->c = d_cstart;
d_plan->fk = d_fkstart;
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

if ((ier = checkCudaErrors(cudaMemsetAsync(
d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex<T>),
Expand All @@ -50,8 +52,12 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

// Step 1: Spread
if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;

// Step 2: FFT

// if spreadonly, skip the rest
if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
Expand Down Expand Up @@ -94,20 +100,24 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
// Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1
if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) {
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
}
// Step 2: FFT
RETURN_IF_CUDA_ERROR
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
RETURN_IF_CUDA_ERROR
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
else
d_plan->fw = d_fkstart;

// Step 3: Interpolate
if ((ier = cuinterp3d<T>(d_plan, blksize))) return ier;
}

Expand Down
13 changes: 7 additions & 6 deletions src/cuda/memtransfer_wrapper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -434,12 +434,13 @@ void freegpumemory(cufinufft_plan_t<T> *d_plan)
utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
// Fixes a crash whewre the plan itself is deleted before the stream
const auto stream = d_plan->stream;

CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools);

if(!d_plan->opts.gpu_spreadinterponly) {
CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools);
DiamonDinoia marked this conversation as resolved.
Show resolved Hide resolved
CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools);
}

CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools);
Expand Down
5 changes: 3 additions & 2 deletions src/cuda/spreadinterp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet
__func__, upsampfac);
return FINUFFT_ERR_HORNER_WRONG_BETA;
}
if (upsampfac <= 1.0) {
fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac);
// Upsampfac == 1.0 is valid for spreadinterponly mode.
if (upsampfac < 1.0) {
DiamonDinoia marked this conversation as resolved.
Show resolved Hide resolved
fprintf(stderr, "[%s] error: upsampfac=%.3g is <1.0\n", __func__, upsampfac);
return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;
}
// calling routine must abort on above errors, since opts is garbage!
Expand Down