flatironinstitute · ahbarnett · Oct 8, 2024 · Sep 19, 2024 · Sep 20, 2024 · Sep 20, 2024
diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst
@@ -311,11 +311,8 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping ove
 
 **gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented]
 
-Diagnostic options
-~~~~~~~~~~~~~~~~~~
-
 **gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and kernel FT deconvolution steps and return garbage answers.
-Nonzero value is *only* to be used to aid timing tests (although currently there are no timing codes that exploit this option), and will give wrong or undefined answers for the NUFFT transforms!
+Nonzero value can be used *only* with `upsampfac=1` and `kerevalmeth=1` for using cufinufft for only spread and interpolate mode. This has applications into estimating the density compensation, which is conventionally used in MRI. (See [MRI-NUFFT](https://mind-inria.github.io/mri-nufft/nufft.html))
 
 
 Algorithm performance options

diff --git a/docs/error.rst b/docs/error.rst
@@ -29,6 +29,7 @@ has the following meanings (see codes in ``include/finufft_errors.h``):
   18 size of bins for subprob/blockgather invalid
   19 GPU shmem too small for subprob/blockgather parameters
   20 invalid number of nonuniform points: nj or nk negative, or too big (see defs.h)
+  23 invalid upsampfac set while using gpu_spreadinterponly mode.
 
 When ``ier=1`` (warning only) the transform(s) is/are still completed, at the smallest epsilon achievable, so, with that caveat, the answer should still be usable.
 

diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
@@ -107,7 +107,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     d_plan->ms = nmodes[0];
     d_plan->mt = nmodes[1];
     d_plan->mu = nmodes[2];
-    printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
+    if (d_plan->opts.debug) {
+        printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
+    }
   } else {
     d_plan->opts.gpu_spreadinterponly = 1;
   }
@@ -153,7 +155,10 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
       printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac);
     }
   }
-
+  if (d_plan->opts.gpu_spreadinterponly && d_plan->opts.upsampfac != 1 && d_plan->type != 3) {
+    ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
+    goto finalize;
+  }
   /* Setup Spreader */
   if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
     // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
@@ -311,13 +316,16 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
                  d_fseries_precomp_phase.begin());
     thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
                  d_fseries_precomp_f.begin());
+    if(!d_plan->opts.gpu_spreadinterponly) {
     // the full fseries is done on the GPU here
-    if ((ier = fseries_kernel_compute(
-             d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
-             d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
-             d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
-             d_plan->spopts.nspread, stream)))
-      goto finalize;
+        if ((ier = fseries_kernel_compute(
+                 d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
+                 d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
+                 d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
+                 d_plan->spopts.nspread, stream)))
+          goto finalize;
+    }
+
   }
 finalize:
   if (ier > 1) {

diff --git a/include/finufft_errors.h b/include/finufft_errors.h
@@ -25,6 +25,7 @@ enum {
   FINUFFT_ERR_INSUFFICIENT_SHMEM     = 19,
   FINUFFT_ERR_NUM_NU_PTS_INVALID     = 20,
   FINUFFT_ERR_INVALID_ARGUMENT       = 21,
-  FINUFFT_ERR_LOCK_FUNS_INVALID      = 22
+  FINUFFT_ERR_LOCK_FUNS_INVALID      = 22,
+  FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID  = 23,
 };
 #endif
diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu
@@ -43,16 +43,22 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     d_fkstart   = d_fk + i * d_plan->batchsize * d_plan->ms;
     d_plan->c   = d_cstart;
     d_plan->fk  = d_fkstart;
-
-    // this is needed
+    if (d_plan->opts.gpu_spreadinterponly)
+        d_plan->fw = d_fkstart;
+
+        // this is needed
     if ((ier = checkCudaErrors(cudaMemsetAsync(
              d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex<T>),
              stream))))
       return ier;
 
     // Step 1: Spread
     if ((ier = cuspread1d<T>(d_plan, blksize))) return ier;
+    // if spreadonly, skip the rest
 
+    if (d_plan->opts.gpu_spreadinterponly)
+        continue;
+
     // Step 2: FFT
     cufftResult cufft_status =
         cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
@@ -97,20 +103,25 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     d_plan->c  = d_cstart;
     d_plan->fk = d_fkstart;
-
-    // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-    if (d_plan->opts.modeord == 0) {
-      if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
-    } else {
-      if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;
+
+    // Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1
+    if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) {
+        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+        if (d_plan->opts.modeord == 0) {
+          if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
+        } else {
+          if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;
+        }
+
+        // Step 2: FFT
+        cufftResult cufft_status =
+            cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+        if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
     }
-
-    // Step 2: FFT
-    cufftResult cufft_status =
-        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
-
-    // Step 3: deconvolve and shuffle
+    else
+        d_plan->fw = d_fkstart;
+
+    // Step 3: Interpolate
     if ((ier = cuinterp1d<T>(d_plan, blksize))) return ier;
   }
 

diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu
@@ -44,6 +44,8 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     d_fkstart   = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt;
     d_plan->c   = d_cstart;
     d_plan->fk  = d_fkstart;
+    if (d_plan->opts.gpu_spreadinterponly)
+        d_plan->fw = d_fkstart;
 
     // this is needed
     if ((ier = checkCudaErrors(cudaMemsetAsync(
@@ -55,6 +57,10 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     // Step 1: Spread
     if ((ier = cuspread2d<T>(d_plan, blksize))) return ier;
 
+    // if spreadonly, skip the rest
+    if (d_plan->opts.gpu_spreadinterponly)
+        continue;
+
     // Step 2: FFT
     cufftResult cufft_status =
         cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
@@ -100,19 +106,24 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     d_plan->c  = d_cstart;
     d_plan->fk = d_fkstart;
 
-    // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-    if (d_plan->opts.modeord == 0) {
-      if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
-    } else {
-      if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
+    // Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1
+    if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) {
+        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+        if (d_plan->opts.modeord == 0) {
+          if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
+        } else {
+          if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
+        }
+
+        // Step 2: FFT
+        cufftResult cufft_status =
+            cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+        if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
     }
-
-    // Step 2: FFT
-    cufftResult cufft_status =
-        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
-
-    // Step 3: deconvolve and shuffle
+    else
+        d_plan->fw = d_fkstart;
+
+    // Step 3: Interpolate
     if ((ier = cuinterp2d<T>(d_plan, blksize))) return ier;
   }
 

diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu
@@ -42,6 +42,8 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     d_plan->c  = d_cstart;
     d_plan->fk = d_fkstart;
+    if (d_plan->opts.gpu_spreadinterponly)
+        d_plan->fw = d_fkstart;
 
     if ((ier = checkCudaErrors(cudaMemsetAsync(
              d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex<T>),
@@ -50,8 +52,12 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     // Step 1: Spread
     if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;
-
-    // Step 2: FFT
+
+    // if spreadonly, skip the rest
+    if (d_plan->opts.gpu_spreadinterponly)
+        continue;
+
+        // Step 2: FFT
     cufftResult cufft_status =
         cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
     if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
@@ -94,20 +100,24 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     d_plan->c  = d_cstart;
     d_plan->fk = d_fkstart;
 
-    // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-    if (d_plan->opts.modeord == 0) {
-      if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
-    } else {
-      if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
+    // Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1
+    if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) {
+        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+        if (d_plan->opts.modeord == 0) {
+          if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
+        } else {
+          if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
+        }
+        // Step 2: FFT
+        RETURN_IF_CUDA_ERROR
+        cufftResult cufft_status =
+            cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+        if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
     }
-
-    // Step 2: FFT
-    RETURN_IF_CUDA_ERROR
-    cufftResult cufft_status =
-        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
-
-    // Step 3: deconvolve and shuffle
+    else
+        d_plan->fw = d_fkstart;
+
+    // Step 3: Interpolate
     if ((ier = cuinterp3d<T>(d_plan, blksize))) return ier;
   }
 

diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu
@@ -434,12 +434,13 @@ void freegpumemory(cufinufft_plan_t<T> *d_plan)
   utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
   // Fixes a crash whewre the plan itself is deleted before the stream
   const auto stream = d_plan->stream;
-
-  CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
-  CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools);
-  CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools);
-  CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools);
-
+  if(!d_plan->opts.gpu_spreadinterponly) {
+    CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
+    CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools);
+    CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools);
+    CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools);
+  }
+
   CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools);

diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp
@@ -26,8 +26,9 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet
               __func__, upsampfac);
       return FINUFFT_ERR_HORNER_WRONG_BETA;
     }
-    if (upsampfac <= 1.0) {
-      fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac);
+    // Upsampfac == 1.0 is valid for spreadinterponly mode.
+    if (upsampfac < 1.0) {
+      fprintf(stderr, "[%s] error: upsampfac=%.3g is <1.0\n", __func__, upsampfac);
       return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;
     }
     // calling routine must abort on above errors, since opts is garbage!