Support for gpu_spreadinterponly=1 #564

Merged (19 commits) on Oct 8, 2024.
5 changes: 1 addition & 4 deletions docs/c_gpu.rst
@@ -311,11 +311,8 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping over

**gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented]

Diagnostic options
~~~~~~~~~~~~~~~~~~

**gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and kernel FT deconvolution steps and return garbage answers.
Nonzero value is *only* to be used to aid timing tests (although currently there are no timing codes that exploit this option), and will give wrong or undefined answers for the NUFFT transforms!
A nonzero value can be used *only* with ``upsampfac=1`` and ``kerevalmeth=1``, and runs cufinufft in spread/interpolate-only mode. One application is estimating the density compensation weights conventionally used in MRI; see `MRI-NUFFT <https://mind-inria.github.io/mri-nufft/nufft.html>`_.
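
For concreteness, here is a minimal usage sketch (not part of this PR) of how a caller might request this mode through the plan interface. The option fields match the names documented above; the plan-call signatures are assumed from recent ``cufinufft.h`` headers and may differ between versions.

```c
#include <cufinufft.h>
#include <stdio.h>

// Sketch only: request spread-only execution for a 1D type-1 call.  With
// gpu_spreadinterponly=1 the output of execute() holds the spread fine-grid
// values (no FFT, no deconvolution), so n_modes acts as the spreading grid.
int main(void) {
  cufinufft_opts opts;
  cufinufft_default_opts(&opts);
  opts.gpu_spreadinterponly = 1;   // skip FFT and kernel-FT deconvolution
  opts.gpu_kerevalmeth      = 1;   // required in this mode (see text above)
  opts.upsampfac            = 1.0; // required for types 1 and 2 in this mode

  int64_t n_modes[3] = {256, 1, 1};  // 1D spreading grid of 256 points
  cufinufft_plan plan;
  int ier = cufinufft_makeplan(1, 1, n_modes, +1, 1, 1e-6, &plan, &opts);
  if (ier > 1) { fprintf(stderr, "makeplan failed, ier=%d\n", ier); return ier; }
  // ... cufinufft_setpts / cufinufft_execute as usual, then clean up ...
  cufinufft_destroy(plan);
  return 0;
}
```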


Algorithm performance options
1 change: 1 addition & 0 deletions docs/error.rst
@@ -29,6 +29,7 @@ has the following meanings (see codes in ``include/finufft_errors.h``):
18 size of bins for subprob/blockgather invalid
19 GPU shmem too small for subprob/blockgather parameters
20 invalid number of nonuniform points: nj or nk negative, or too big (see defs.h)
23 invalid upsampfac set while using gpu_spreadinterponly mode

When ``ier=1`` (warning only) the transform(s) is/are still completed, at the smallest epsilon achievable, so, with that caveat, the answer should still be usable.
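
As a brief illustration of how these codes are meant to be consumed, a hedged sketch follows; only the enum names come from ``include/finufft_errors.h``, while the helper itself is made up for this example.

```c
#include <finufft_errors.h>
#include <stdio.h>

// Interpret a status code returned by a (cu)finufft call: 0 = success,
// 1 = warning (transform still completed at the achievable accuracy),
// >1 = hard error, such as the new code 23 introduced by this PR.
static void report_status(int ier) {
  if (ier == 0) return;
  if (ier == FINUFFT_WARN_EPS_TOO_SMALL) {
    fprintf(stderr, "warning: requested tol too small; results still usable\n");
  } else if (ier == FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID) {
    fprintf(stderr, "error 23: upsampfac incompatible with gpu_spreadinterponly\n");
  } else {
    fprintf(stderr, "finufft error, ier=%d (see the table above)\n", ier);
  }
}
```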

139 changes: 75 additions & 64 deletions include/cufinufft/impl.h
@@ -107,7 +107,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
d_plan->ms = nmodes[0];
d_plan->mt = nmodes[1];
d_plan->mu = nmodes[2];
printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
if (d_plan->opts.debug) {
printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
}
} else {
d_plan->opts.gpu_spreadinterponly = 1;
}
Expand Down Expand Up @@ -153,7 +155,13 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac);
}
}

if (d_plan->opts.gpu_spreadinterponly) {
// Spread/interp-only mode: types 1 and 2 require upsampfac == 1, while type 3
// requires upsampfac != 1; the XNOR below rejects the invalid combinations.
if ((d_plan->opts.upsampfac != 1) == (type != 3)) {
ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
goto finalize;
}
}
/* Setup Spreader */
if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
// can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
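
To make the XNOR check above easier to read, here is a standalone restatement of the rule it enforces (illustrative helper only, not part of cufinufft):

```c
#include <stdbool.h>

// Valid (type, upsampfac) combinations when gpu_spreadinterponly is set:
// types 1 and 2 need upsampfac == 1 (fine grid equals the mode grid), while
// type 3 needs upsampfac != 1.  The plan code rejects the complement of this
// set via the test ((upsampfac != 1) == (type != 3)).
static bool spreadinterponly_upsampfac_valid(int type, double upsampfac) {
  if (type == 3) return upsampfac != 1.0;
  return upsampfac == 1.0;
}
```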
Expand Down Expand Up @@ -254,70 +262,73 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
} break;
}

cufftHandle fftplan;
cufftResult_t cufft_status;
switch (d_plan->dim) {
case 1: {
int n[] = {(int)nf1};
int inembed[] = {(int)nf1};

cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1,
inembed[0], cufft_type<T>(), batchsize);
} break;
case 2: {
int n[] = {(int)nf2, (int)nf1};
int inembed[] = {(int)nf2, (int)nf1};

cufft_status =
cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
inembed[0] * inembed[1], cufft_type<T>(), batchsize);
} break;
case 3: {
int n[] = {(int)nf3, (int)nf2, (int)nf1};
int inembed[] = {(int)nf3, (int)nf2, (int)nf1};

cufft_status = cufftPlanMany(
&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), batchsize);
} break;
}
if(!d_plan->opts.gpu_spreadinterponly) {
cufftHandle fftplan;
cufftResult_t cufft_status;
switch (d_plan->dim) {
case 1: {
int n[] = {(int)nf1};
int inembed[] = {(int)nf1};

cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1,
inembed[0], cufft_type<T>(), batchsize);
} break;
case 2: {
int n[] = {(int)nf2, (int)nf1};
int inembed[] = {(int)nf2, (int)nf1};

cufft_status =
cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
inembed[0] * inembed[1], cufft_type<T>(), batchsize);
} break;
case 3: {
int n[] = {(int)nf3, (int)nf2, (int)nf1};
int inembed[] = {(int)nf3, (int)nf2, (int)nf1};

cufft_status = cufftPlanMany(
&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), batchsize);
} break;
}

if (cufft_status != CUFFT_SUCCESS) {
fprintf(stderr, "[%s] cufft makeplan error: %s", __func__,
cufftGetErrorString(cufft_status));
ier = FINUFFT_ERR_CUDA_FAILURE;
goto finalize;
if (cufft_status != CUFFT_SUCCESS) {
fprintf(stderr, "[%s] cufft makeplan error: %s", __func__,
cufftGetErrorString(cufft_status));
ier = FINUFFT_ERR_CUDA_FAILURE;
goto finalize;
}
cufftSetStream(fftplan, stream);

d_plan->fftplan = fftplan;

// compute up to 3 * NQUAD precomputed values on CPU
T fseries_precomp_phase[3 * MAX_NQUAD];
T fseries_precomp_f[3 * MAX_NQUAD];
thrust::device_vector<T> d_fseries_precomp_phase(3 * MAX_NQUAD);
thrust::device_vector<T> d_fseries_precomp_f(3 * MAX_NQUAD);
onedim_fseries_kernel_precomp<T>(d_plan->nf1, fseries_precomp_f,
fseries_precomp_phase, d_plan->spopts);
if (d_plan->dim > 1)
onedim_fseries_kernel_precomp<T>(d_plan->nf2, fseries_precomp_f + MAX_NQUAD,
fseries_precomp_phase + MAX_NQUAD, d_plan->spopts);
if (d_plan->dim > 2)
onedim_fseries_kernel_precomp<T>(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD,
fseries_precomp_phase + 2 * MAX_NQUAD,
d_plan->spopts);
// copy the precomputed data to the device using thrust
thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD,
d_fseries_precomp_phase.begin());
thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
d_fseries_precomp_f.begin());
// the full fseries is done on the GPU here
if ((ier = fseries_kernel_compute(
d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
d_plan->spopts.nspread, stream)))
goto finalize;
}
cufftSetStream(fftplan, stream);

d_plan->fftplan = fftplan;

// compute up to 3 * NQUAD precomputed values on CPU
T fseries_precomp_phase[3 * MAX_NQUAD];
T fseries_precomp_f[3 * MAX_NQUAD];
thrust::device_vector<T> d_fseries_precomp_phase(3 * MAX_NQUAD);
thrust::device_vector<T> d_fseries_precomp_f(3 * MAX_NQUAD);
onedim_fseries_kernel_precomp<T>(d_plan->nf1, fseries_precomp_f,
fseries_precomp_phase, d_plan->spopts);
if (d_plan->dim > 1)
onedim_fseries_kernel_precomp<T>(d_plan->nf2, fseries_precomp_f + MAX_NQUAD,
fseries_precomp_phase + MAX_NQUAD, d_plan->spopts);
if (d_plan->dim > 2)
onedim_fseries_kernel_precomp<T>(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD,
fseries_precomp_phase + 2 * MAX_NQUAD,
d_plan->spopts);
// copy the precomputed data to the device using thrust
thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD,
d_fseries_precomp_phase.begin());
thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
d_fseries_precomp_f.begin());
// the full fseries is done on the GPU here
if ((ier = fseries_kernel_compute(
d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
d_plan->spopts.nspread, stream)))
goto finalize;

}
finalize:
if (ier > 1) {
3 changes: 2 additions & 1 deletion include/finufft_errors.h
@@ -25,6 +25,7 @@ enum {
FINUFFT_ERR_INSUFFICIENT_SHMEM = 19,
FINUFFT_ERR_NUM_NU_PTS_INVALID = 20,
FINUFFT_ERR_INVALID_ARGUMENT = 21,
FINUFFT_ERR_LOCK_FUNS_INVALID = 22
FINUFFT_ERR_LOCK_FUNS_INVALID = 22,
FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID = 23,
};
#endif
41 changes: 26 additions & 15 deletions src/cuda/1d/cufinufft1d.cu
@@ -43,16 +43,22 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms;
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// in spread-only mode, spread directly into the caller's output array (fk)
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

// zero the fine grid before spreading, since spreading accumulates into it
if ((ier = checkCudaErrors(cudaMemsetAsync(
d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex<T>),
stream))))
return ier;

// Step 1: Spread
if ((ier = cuspread1d<T>(d_plan, blksize))) return ier;
// if spreadonly, skip the rest

if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
@@ -97,20 +103,25 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;

// Skip steps 1 and 2 if interponly
if (!d_plan->opts.gpu_spreadinterponly) {
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
else
d_plan->fw = d_fkstart;

// Step 3: Interpolate
if ((ier = cuinterp1d<T>(d_plan, blksize))) return ier;
}

35 changes: 23 additions & 12 deletions src/cuda/2d/cufinufft2d.cu
@@ -44,6 +44,8 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt;
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

// zero the fine grid before spreading, since spreading accumulates into it
if ((ier = checkCudaErrors(cudaMemsetAsync(
@@ -55,6 +57,10 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
// Step 1: Spread
if ((ier = cuspread2d<T>(d_plan, blksize))) return ier;

// if spreadonly, skip the rest
if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
@@ -100,19 +106,24 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
// Skip steps 1 and 2 if interponly
if (!d_plan->opts.gpu_spreadinterponly) {
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
else
d_plan->fw = d_fkstart;

// Step 3: Interpolate
if ((ier = cuinterp2d<T>(d_plan, blksize))) return ier;
}

40 changes: 25 additions & 15 deletions src/cuda/3d/cufinufft3d.cu
@@ -42,6 +42,8 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

d_plan->c = d_cstart;
d_plan->fk = d_fkstart;
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

if ((ier = checkCudaErrors(cudaMemsetAsync(
d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex<T>),
@@ -50,8 +52,12 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

// Step 1: Spread
if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;

// Step 2: FFT

// if spreadonly, skip the rest
if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
@@ -94,20 +100,24 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
// Skip steps 1 and 2 if interponly
if (!d_plan->opts.gpu_spreadinterponly) {
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
}
// Step 2: FFT
RETURN_IF_CUDA_ERROR
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
RETURN_IF_CUDA_ERROR
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
else
d_plan->fw = d_fkstart;

// Step 3: Interpolate
if ((ier = cuinterp3d<T>(d_plan, blksize))) return ier;
}

7 changes: 4 additions & 3 deletions src/cuda/memtransfer_wrapper.cu
@@ -434,12 +434,13 @@ void freegpumemory(cufinufft_plan_t<T> *d_plan)
utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
// Fixes a crash where the plan itself is deleted before the stream
const auto stream = d_plan->stream;

CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
// Don't free fw in spread/interp-only mode for types 1 and 2, since fw aliases the caller's d_fk array
if(!d_plan->opts.gpu_spreadinterponly || d_plan->type == 3)
CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools);

CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools);