diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst
index 97c50b31a..a5f050220 100644
--- a/docs/c_gpu.rst
+++ b/docs/c_gpu.rst
@@ -311,11 +311,8 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping ove
 
 **gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented]
 
-Diagnostic options
-~~~~~~~~~~~~~~~~~~
-
 **gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and kernel FT deconvolution steps and return garbage answers.
-Nonzero value is *only* to be used to aid timing tests (although currently there are no timing codes that exploit this option), and will give wrong or undefined answers for the NUFFT transforms!
+Nonzero value can be used *only* with `upsampfac=1` and `kerevalmeth=1` for using cufinufft for only spread and interpolate mode. This has applications into estimating the density compensation, which is conventionally used in MRI. (See [MRI-NUFFT](https://mind-inria.github.io/mri-nufft/nufft.html))
 
 
 Algorithm performance options
diff --git a/docs/error.rst b/docs/error.rst
index 9d9edc9a4..6e12d2859 100644
--- a/docs/error.rst
+++ b/docs/error.rst
@@ -29,6 +29,7 @@ has the following meanings (see codes in ``include/finufft_errors.h``):
   18 size of bins for subprob/blockgather invalid
   19 GPU shmem too small for subprob/blockgather parameters
   20 invalid number of nonuniform points: nj or nk negative, or too big (see defs.h)
+  23 invalid upsampfac set while using gpu_spreadinterponly mode.
 
 When ``ier=1`` (warning only) the transform(s) is/are still completed, at the smallest epsilon achievable, so, with that caveat, the answer should still be usable.
 
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index 0913a404e..0ec67c818 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -107,7 +107,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     d_plan->ms = nmodes[0];
     d_plan->mt = nmodes[1];
     d_plan->mu = nmodes[2];
-    printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
+    if (d_plan->opts.debug) {
+        printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
+    }
   } else {
     d_plan->opts.gpu_spreadinterponly = 1;
   }
@@ -153,7 +155,13 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
       printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac);
     }
   }
-
+  if (d_plan->opts.gpu_spreadinterponly) {
+    // XNOR implementation below with boolean logic.
+    if ((d_plan->opts.upsampfac !=1) == (type != 3)) {
+        ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
+        goto finalize;
+    }
+  }
   /* Setup Spreader */
   if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
     // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
@@ -254,70 +262,74 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     } break;
     }
 
-    cufftHandle fftplan;
-    cufftResult_t cufft_status;
-    switch (d_plan->dim) {
-    case 1: {
-      int n[]       = {(int)nf1};
-      int inembed[] = {(int)nf1};
-
-      cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1,
-                                   inembed[0], cufft_type<T>(), batchsize);
-    } break;
-    case 2: {
-      int n[]       = {(int)nf2, (int)nf1};
-      int inembed[] = {(int)nf2, (int)nf1};
-
-      cufft_status =
-          cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
-                        inembed[0] * inembed[1], cufft_type<T>(), batchsize);
-    } break;
-    case 3: {
-      int n[]       = {(int)nf3, (int)nf2, (int)nf1};
-      int inembed[] = {(int)nf3, (int)nf2, (int)nf1};
-
-      cufft_status = cufftPlanMany(
-          &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
-          inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), batchsize);
-    } break;
-    }
+    // We dont need any cuFFT plans or kernel values if we are only spreading / interpolating
+    if(!d_plan->opts.gpu_spreadinterponly) {
+        cufftHandle fftplan;
+        cufftResult_t cufft_status;
+        switch (d_plan->dim) {
+        case 1: {
+          int n[]       = {(int)nf1};
+          int inembed[] = {(int)nf1};
+
+          cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1,
+                                       inembed[0], cufft_type<T>(), batchsize);
+        } break;
+        case 2: {
+          int n[]       = {(int)nf2, (int)nf1};
+          int inembed[] = {(int)nf2, (int)nf1};
+
+          cufft_status =
+              cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
+                            inembed[0] * inembed[1], cufft_type<T>(), batchsize);
+        } break;
+        case 3: {
+          int n[]       = {(int)nf3, (int)nf2, (int)nf1};
+          int inembed[] = {(int)nf3, (int)nf2, (int)nf1};
+
+          cufft_status = cufftPlanMany(
+              &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
+              inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), batchsize);
+        } break;
+        }
 
-    if (cufft_status != CUFFT_SUCCESS) {
-      fprintf(stderr, "[%s] cufft makeplan error: %s", __func__,
-              cufftGetErrorString(cufft_status));
-      ier = FINUFFT_ERR_CUDA_FAILURE;
-      goto finalize;
+        if (cufft_status != CUFFT_SUCCESS) {
+          fprintf(stderr, "[%s] cufft makeplan error: %s", __func__,
+                  cufftGetErrorString(cufft_status));
+          ier = FINUFFT_ERR_CUDA_FAILURE;
+          goto finalize;
+        }
+        cufftSetStream(fftplan, stream);
+
+        d_plan->fftplan = fftplan;
+
+        // compute up to 3 * NQUAD precomputed values on CPU
+        T fseries_precomp_phase[3 * MAX_NQUAD];
+        T fseries_precomp_f[3 * MAX_NQUAD];
+        thrust::device_vector<T> d_fseries_precomp_phase(3 * MAX_NQUAD);
+        thrust::device_vector<T> d_fseries_precomp_f(3 * MAX_NQUAD);
+        onedim_fseries_kernel_precomp<T>(d_plan->nf1, fseries_precomp_f,
+                                         fseries_precomp_phase, d_plan->spopts);
+        if (d_plan->dim > 1)
+          onedim_fseries_kernel_precomp<T>(d_plan->nf2, fseries_precomp_f + MAX_NQUAD,
+                                           fseries_precomp_phase + MAX_NQUAD, d_plan->spopts);
+        if (d_plan->dim > 2)
+          onedim_fseries_kernel_precomp<T>(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD,
+                                           fseries_precomp_phase + 2 * MAX_NQUAD,
+                                           d_plan->spopts);
+        // copy the precomputed data to the device using thrust
+        thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD,
+                     d_fseries_precomp_phase.begin());
+        thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
+                     d_fseries_precomp_f.begin());
+        // the full fseries is done on the GPU here
+        if ((ier = fseries_kernel_compute(
+                 d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
+                 d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
+                 d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
+                 d_plan->spopts.nspread, stream)))
+          goto finalize;
     }
-    cufftSetStream(fftplan, stream);
-
-    d_plan->fftplan = fftplan;
-
-    // compute up to 3 * NQUAD precomputed values on CPU
-    T fseries_precomp_phase[3 * MAX_NQUAD];
-    T fseries_precomp_f[3 * MAX_NQUAD];
-    thrust::device_vector<T> d_fseries_precomp_phase(3 * MAX_NQUAD);
-    thrust::device_vector<T> d_fseries_precomp_f(3 * MAX_NQUAD);
-    onedim_fseries_kernel_precomp<T>(d_plan->nf1, fseries_precomp_f,
-                                     fseries_precomp_phase, d_plan->spopts);
-    if (d_plan->dim > 1)
-      onedim_fseries_kernel_precomp<T>(d_plan->nf2, fseries_precomp_f + MAX_NQUAD,
-                                       fseries_precomp_phase + MAX_NQUAD, d_plan->spopts);
-    if (d_plan->dim > 2)
-      onedim_fseries_kernel_precomp<T>(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD,
-                                       fseries_precomp_phase + 2 * MAX_NQUAD,
-                                       d_plan->spopts);
-    // copy the precomputed data to the device using thrust
-    thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD,
-                 d_fseries_precomp_phase.begin());
-    thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
-                 d_fseries_precomp_f.begin());
-    // the full fseries is done on the GPU here
-    if ((ier = fseries_kernel_compute(
-             d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
-             d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
-             d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
-             d_plan->spopts.nspread, stream)))
-      goto finalize;
+    
   }
 finalize:
   if (ier > 1) {
diff --git a/include/finufft_errors.h b/include/finufft_errors.h
index 10465aa13..86158c219 100644
--- a/include/finufft_errors.h
+++ b/include/finufft_errors.h
@@ -25,6 +25,7 @@ enum {
   FINUFFT_ERR_INSUFFICIENT_SHMEM     = 19,
   FINUFFT_ERR_NUM_NU_PTS_INVALID     = 20,
   FINUFFT_ERR_INVALID_ARGUMENT       = 21,
-  FINUFFT_ERR_LOCK_FUNS_INVALID      = 22
+  FINUFFT_ERR_LOCK_FUNS_INVALID      = 22,
+  FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID  = 23,
 };
 #endif
diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu
index 06389ef75..d0328236f 100644
--- a/src/cuda/1d/cufinufft1d.cu
+++ b/src/cuda/1d/cufinufft1d.cu
@@ -43,8 +43,10 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     d_fkstart   = d_fk + i * d_plan->batchsize * d_plan->ms;
     d_plan->c   = d_cstart;
     d_plan->fk  = d_fkstart;
-
-    // this is needed
+    if (d_plan->opts.gpu_spreadinterponly)
+        d_plan->fw = d_fkstart;
+    
+        // this is needed
     if ((ier = checkCudaErrors(cudaMemsetAsync(
              d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex<T>),
              stream))))
@@ -52,7 +54,11 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     // Step 1: Spread
     if ((ier = cuspread1d<T>(d_plan, blksize))) return ier;
+    // if spreadonly, skip the rest
 
+    if (d_plan->opts.gpu_spreadinterponly)
+        continue;
+    
     // Step 2: FFT
     cufftResult cufft_status =
         cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
@@ -97,20 +103,25 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     d_plan->c  = d_cstart;
     d_plan->fk = d_fkstart;
-
-    // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-    if (d_plan->opts.modeord == 0) {
-      if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
-    } else {
-      if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;
+    
+    // Skip steps 1 and 2 if interponly
+    if (!d_plan->opts.gpu_spreadinterponly) {
+        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+        if (d_plan->opts.modeord == 0) {
+          if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
+        } else {
+          if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;
+        }
+
+        // Step 2: FFT
+        cufftResult cufft_status =
+            cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+        if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
     }
-
-    // Step 2: FFT
-    cufftResult cufft_status =
-        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
-
-    // Step 3: deconvolve and shuffle
+    else
+        d_plan->fw = d_fkstart;
+        
+    // Step 3: Interpolate
     if ((ier = cuinterp1d<T>(d_plan, blksize))) return ier;
   }
 
diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu
index 8c165edbf..916c4c71d 100644
--- a/src/cuda/2d/cufinufft2d.cu
+++ b/src/cuda/2d/cufinufft2d.cu
@@ -44,6 +44,8 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     d_fkstart   = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt;
     d_plan->c   = d_cstart;
     d_plan->fk  = d_fkstart;
+    if (d_plan->opts.gpu_spreadinterponly)
+        d_plan->fw = d_fkstart;
 
     // this is needed
     if ((ier = checkCudaErrors(cudaMemsetAsync(
@@ -55,6 +57,10 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     // Step 1: Spread
     if ((ier = cuspread2d<T>(d_plan, blksize))) return ier;
 
+    // if spreadonly, skip the rest
+    if (d_plan->opts.gpu_spreadinterponly)
+        continue;
+    
     // Step 2: FFT
     cufftResult cufft_status =
         cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
@@ -100,19 +106,24 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     d_plan->c  = d_cstart;
     d_plan->fk = d_fkstart;
 
-    // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-    if (d_plan->opts.modeord == 0) {
-      if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
-    } else {
-      if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
+    // Skip steps 1 and 2 if interponly
+    if (!d_plan->opts.gpu_spreadinterponly) {
+        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+        if (d_plan->opts.modeord == 0) {
+          if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
+        } else {
+          if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
+        }
+
+        // Step 2: FFT
+        cufftResult cufft_status =
+            cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+        if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
     }
-
-    // Step 2: FFT
-    cufftResult cufft_status =
-        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
-
-    // Step 3: deconvolve and shuffle
+    else
+        d_plan->fw = d_fkstart;
+    
+    // Step 3: Interpolate
     if ((ier = cuinterp2d<T>(d_plan, blksize))) return ier;
   }
 
diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu
index ce1f37c0e..9f376297c 100644
--- a/src/cuda/3d/cufinufft3d.cu
+++ b/src/cuda/3d/cufinufft3d.cu
@@ -42,6 +42,8 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     d_plan->c  = d_cstart;
     d_plan->fk = d_fkstart;
+    if (d_plan->opts.gpu_spreadinterponly)
+        d_plan->fw = d_fkstart;
 
     if ((ier = checkCudaErrors(cudaMemsetAsync(
              d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex<T>),
@@ -50,8 +52,12 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     // Step 1: Spread
     if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;
-
-    // Step 2: FFT
+    
+    // if spreadonly, skip the rest
+    if (d_plan->opts.gpu_spreadinterponly)
+        continue;
+    
+        // Step 2: FFT
     cufftResult cufft_status =
         cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
     if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
@@ -94,20 +100,24 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     d_plan->c  = d_cstart;
     d_plan->fk = d_fkstart;
 
-    // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-    if (d_plan->opts.modeord == 0) {
-      if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
-    } else {
-      if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
+    // Skip steps 1 and 2 if interponly
+    if (!d_plan->opts.gpu_spreadinterponly) {
+        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+        if (d_plan->opts.modeord == 0) {
+          if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
+        } else {
+          if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
+        }
+        // Step 2: FFT
+        RETURN_IF_CUDA_ERROR
+        cufftResult cufft_status =
+            cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+        if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
     }
-
-    // Step 2: FFT
-    RETURN_IF_CUDA_ERROR
-    cufftResult cufft_status =
-        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
-
-    // Step 3: deconvolve and shuffle
+    else
+        d_plan->fw = d_fkstart;
+        
+    // Step 3: Interpolate
     if ((ier = cuinterp3d<T>(d_plan, blksize))) return ier;
   }
 
diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu
index e5308e8bc..20bbb6355 100644
--- a/src/cuda/memtransfer_wrapper.cu
+++ b/src/cuda/memtransfer_wrapper.cu
@@ -434,12 +434,13 @@ void freegpumemory(cufinufft_plan_t<T> *d_plan)
   utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
   // Fixes a crash whewre the plan itself is deleted before the stream
   const auto stream = d_plan->stream;
-
-  CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
+  // Dont clear fw if spreadinterponly for type 1 and 2 as fw belongs to original program (it is d_fk)
+  if(!d_plan->opts.gpu_spreadinterponly || d_plan->type == 3)
+    CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools);
-
+  
   CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools);
diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp
index 646ffa434..f9c1de2ad 100644
--- a/src/cuda/spreadinterp.cpp
+++ b/src/cuda/spreadinterp.cpp
@@ -26,8 +26,9 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet
               __func__, upsampfac);
       return FINUFFT_ERR_HORNER_WRONG_BETA;
     }
-    if (upsampfac <= 1.0) {
-      fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac);
+    // Upsampfac == 1.0 is valid for spreadinterponly mode.
+    if (upsampfac < 1.0) {
+      fprintf(stderr, "[%s] error: upsampfac=%.3g is <1.0\n", __func__, upsampfac);
       return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;
     }
     // calling routine must abort on above errors, since opts is garbage!