Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for gpu_spreadinterponly=1 #564

Merged
merged 19 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/error.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ has the following meanings (see codes in ``include/finufft_errors.h``):
18 size of bins for subprob/blockgather invalid
19 GPU shmem too small for subprob/blockgather parameters
20 invalid number of nonuniform points: nj or nk negative, or too big (see defs.h)
23 invalid upsampfac set while using gpu_spreadinterponly mode.

When ``ier=1`` (warning only) the transform(s) is/are still completed, at the smallest epsilon achievable, so, with that caveat, the answer should still be usable.

Expand Down
24 changes: 16 additions & 8 deletions include/cufinufft/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
d_plan->ms = nmodes[0];
d_plan->mt = nmodes[1];
d_plan->mu = nmodes[2];
printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
if (d_plan->opts.debug) {
printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
}
} else {
d_plan->opts.gpu_spreadinterponly = 1;
}
Expand Down Expand Up @@ -153,7 +155,10 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac);
}
}

if (d_plan->opts.gpu_spreadinterponly && d_plan->opts.upsampfac != 1) {
ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
goto finalize;
}
/* Setup Spreader */
if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
// can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
Expand Down Expand Up @@ -311,13 +316,16 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
d_fseries_precomp_phase.begin());
thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
d_fseries_precomp_f.begin());
if(!d_plan->opts.gpu_spreadinterponly) {
DiamonDinoia marked this conversation as resolved.
Show resolved Hide resolved
// the full fseries is done on the GPU here
if ((ier = fseries_kernel_compute(
d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
d_plan->spopts.nspread, stream)))
goto finalize;
if ((ier = fseries_kernel_compute(
d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
d_plan->spopts.nspread, stream)))
goto finalize;
}

}
finalize:
if (ier > 1) {
Expand Down
3 changes: 2 additions & 1 deletion include/finufft_errors.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ enum {
FINUFFT_ERR_INSUFFICIENT_SHMEM = 19,
FINUFFT_ERR_NUM_NU_PTS_INVALID = 20,
FINUFFT_ERR_INVALID_ARGUMENT = 21,
FINUFFT_ERR_LOCK_FUNS_INVALID = 22
FINUFFT_ERR_LOCK_FUNS_INVALID = 22,
FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID = 23,
};
#endif
39 changes: 25 additions & 14 deletions src/cuda/1d/cufinufft1d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,22 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms;
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// this is needed
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

// this is needed
if ((ier = checkCudaErrors(cudaMemsetAsync(
d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex<T>),
stream))))
return ier;
DiamonDinoia marked this conversation as resolved.
Show resolved Hide resolved

// Step 1: Spread
if ((ier = cuspread1d<T>(d_plan, blksize))) return ier;
// Step 1.5: if spreadonly, skip the rest
ahbarnett marked this conversation as resolved.
Show resolved Hide resolved

if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
Expand Down Expand Up @@ -97,19 +103,24 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;

// Skip all steps if interponly
ahbarnett marked this conversation as resolved.
Show resolved Hide resolved
if (!d_plan->opts.gpu_spreadinterponly) {
DiamonDinoia marked this conversation as resolved.
Show resolved Hide resolved
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

else
d_plan->fw = d_fkstart;

// Step 3: deconvolve and shuffle
ahbarnett marked this conversation as resolved.
Show resolved Hide resolved
if ((ier = cuinterp1d<T>(d_plan, blksize))) return ier;
}
Expand Down
33 changes: 22 additions & 11 deletions src/cuda/2d/cufinufft2d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt;
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

// this is needed
if ((ier = checkCudaErrors(cudaMemsetAsync(
Expand All @@ -55,6 +57,10 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
// Step 1: Spread
if ((ier = cuspread2d<T>(d_plan, blksize))) return ier;

// Step 1.5: if spreadonly, skip the rest
ahbarnett marked this conversation as resolved.
Show resolved Hide resolved
if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
Expand Down Expand Up @@ -100,18 +106,23 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
// Skip all steps if interponly
ahbarnett marked this conversation as resolved.
Show resolved Hide resolved
if (!d_plan->opts.gpu_spreadinterponly) {
DiamonDinoia marked this conversation as resolved.
Show resolved Hide resolved
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

else
d_plan->fw = d_fkstart;

// Step 3: deconvolve and shuffle
if ((ier = cuinterp2d<T>(d_plan, blksize))) return ier;
}
Expand Down
38 changes: 24 additions & 14 deletions src/cuda/3d/cufinufft3d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

d_plan->c = d_cstart;
d_plan->fk = d_fkstart;
if (d_plan->opts.gpu_spreadinterponly)
d_plan->fw = d_fkstart;

if ((ier = checkCudaErrors(cudaMemsetAsync(
d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex<T>),
Expand All @@ -50,8 +52,12 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,

// Step 1: Spread
if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;

// Step 2: FFT

// Step 1.5: if spreadonly, skip the rest
if (d_plan->opts.gpu_spreadinterponly)
continue;

// Step 2: FFT
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
Expand Down Expand Up @@ -94,19 +100,23 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
// Skip all steps if interponly
if (!d_plan->opts.gpu_spreadinterponly) {
DiamonDinoia marked this conversation as resolved.
Show resolved Hide resolved
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if (d_plan->opts.modeord == 0) {
if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
} else {
if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
}
// Step 2: FFT
RETURN_IF_CUDA_ERROR
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
}

// Step 2: FFT
RETURN_IF_CUDA_ERROR
cufftResult cufft_status =
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;

else
d_plan->fw = d_fkstart;

// Step 3: deconvolve and shuffle
if ((ier = cuinterp3d<T>(d_plan, blksize))) return ier;
}
Expand Down
13 changes: 7 additions & 6 deletions src/cuda/memtransfer_wrapper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -434,12 +434,13 @@ void freegpumemory(cufinufft_plan_t<T> *d_plan)
utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
// Fixes a crash whewre the plan itself is deleted before the stream
const auto stream = d_plan->stream;

CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools);

if(!d_plan->opts.gpu_spreadinterponly) {
CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools);
DiamonDinoia marked this conversation as resolved.
Show resolved Hide resolved
CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools);
}

CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools);
CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools);
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/spreadinterp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet
__func__, upsampfac);
return FINUFFT_ERR_HORNER_WRONG_BETA;
}
if (upsampfac <= 1.0) {
if (upsampfac < 1.0) {
DiamonDinoia marked this conversation as resolved.
Show resolved Hide resolved
fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac);
return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;
}
Expand Down