diff --git a/.github/workflows/cmake_ci.yml b/.github/workflows/cmake_ci.yml index 9b483ecbf..75be3e19f 100644 --- a/.github/workflows/cmake_ci.yml +++ b/.github/workflows/cmake_ci.yml @@ -55,3 +55,33 @@ jobs: working-directory: ./build run: | ctest -C ${{matrix.build_type}} --output-on-failure + + - name: Set up Python + if: matrix.finufft_static_linking + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Build Python wheels + if: matrix.finufft_static_linking + env: + MACOSX_DEPLOYMENT_TARGET: 13 + shell: bash + run: | + python3 -m pip install \ + --verbose \ + -C cmake.define.CMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + -C cmake.define.FINUFFT_ARCH_FLAGS=${{ matrix.arch_flags }} \ + -C cmake.define.FINUFFT_USE_DUCC0=${{ matrix.ducc_fft }} \ + python/finufft + + - name: Install pytest + if: matrix.finufft_static_linking + run: | + python3 -m pip install --upgrade pip + python3 -m pip install pytest + + - name: Test Python package + if: matrix.finufft_static_linking + run: | + python3 -m pytest python/finufft/test diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 012050b15..543634f15 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -74,7 +74,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran using namespace cufinufft::common; int ier; if (type < 1 || type > 3) { - fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type); + fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__, type); return FINUFFT_ERR_TYPE_NOTVALID; } if (ntransf < 1) { @@ -178,7 +178,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); - if (ier = cudaGetLastError(), ier != cudaSuccess) { + if (cudaGetLastError() != cudaSuccess) { + ier = FINUFFT_ERR_CUDA_FAILURE; goto finalize; } if (d_plan->opts.debug) { @@ -196,6 +197,42 @@ int 
cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran printf("[cufinufft] shared memory required for the spreader: %ld\n", mem_required); } + + // dynamically request the maximum amount of shared memory available + // for the spreader + + /* Automatically set GPU method. */ + if (d_plan->opts.gpu_method == 0) { + /* For type 1, we default to method 2 (SM) since this is generally faster + * if there is enough shared memory available. Otherwise, we default to GM. + * Type 3 inherits this behavior since the outer plan here is also a type 1. + * + * For type 2, we always default to method 1 (GM). + */ + if (type == 2) { + d_plan->opts.gpu_method = 1; + } else { + // query the device for the amount of shared memory available + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + // compute the amount of shared memory required for the method + const auto shared_mem_required = shared_memory_required<T>( + dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + if ((shared_mem_required > shared_mem_per_block)) { + d_plan->opts.gpu_method = 1; + } else { + d_plan->opts.gpu_method = 2; + } + } + } + + if (cudaGetLastError() != cudaSuccess) { + ier = FINUFFT_ERR_CUDA_FAILURE; + goto finalize; + } + if (type == 1 || type == 2) { CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, @@ -207,39 +244,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, d_plan->opts.gpu_obinsizez); - // dynamically request the maximum amount of shared memory available - // for the spreader - - /* Automatically set GPU method. */ - if (d_plan->opts.gpu_method == 0) { - /* For type 1, we default to method 2 (SM) since this is generally faster - * if there is enough shared memory available.
Otherwise, we default to GM. - * - * For type 2, we always default to method 1 (GM). - */ - if (type == 2) { - d_plan->opts.gpu_method = 1; - } else { - // query the device for the amount of shared memory available - int shared_mem_per_block{}; - cudaDeviceGetAttribute(&shared_mem_per_block, - cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); - // compute the amount of shared memory required for the method - const auto shared_mem_required = shared_memory_required<T>( - dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, - d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); - if ((shared_mem_required > shared_mem_per_block)) { - d_plan->opts.gpu_method = 1; - } else { - d_plan->opts.gpu_method = 2; - } - } - } - - if ((ier = cudaGetLastError())) { - goto finalize; - } - d_plan->nf1 = nf1; d_plan->nf2 = nf2; d_plan->nf3 = nf3; @@ -795,7 +799,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3}; cufinufft_opts t2opts = d_plan->opts; t2opts.gpu_spreadinterponly = 0; - t2opts.gpu_method = 1; + t2opts.gpu_method = 0; // Safe to ignore the return value here?
if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan); // check that maxbatchsize is correct diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index 534fa5358..fa4990285 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -6,7 +6,13 @@ #include #include -inline bool is_invalid_mode_array(int dim, const int64_t *modes64, int32_t modes32[3]) { +inline bool is_invalid_mode_array(int type, int dim, const int64_t *modes64, + int32_t modes32[3]) { + if (type == 3) { + modes32[0] = modes32[1] = modes32[2] = 1; + return false; + } + int64_t tot_size = 1; for (int i = 0; i < dim; ++i) { if (modes64[i] > std::numeric_limits<int32_t>::max()) return true; @@ -28,7 +34,9 @@ int cufinufftf_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int } int nmodes32[3]; - if (is_invalid_mode_array(dim, nmodes, nmodes32)) return FINUFFT_ERR_NDATA_NOTVALID; + if (is_invalid_mode_array(type, dim, nmodes, nmodes32)) { + return FINUFFT_ERR_NDATA_NOTVALID; + } return cufinufft_makeplan_impl<float>(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t<float> **)d_plan_ptr, opts); @@ -42,7 +50,9 @@ int cufinufft_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int } int nmodes32[3]; - if (is_invalid_mode_array(dim, nmodes, nmodes32)) return FINUFFT_ERR_NDATA_NOTVALID; + if (is_invalid_mode_array(type, dim, nmodes, nmodes32)) { + return FINUFFT_ERR_NDATA_NOTVALID; + } return cufinufft_makeplan_impl<double>(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t<double> **)d_plan_ptr, opts); diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 1cadb7569..d8af7918a 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -21,6 +21,10 @@ foreach(srcfile ${test_src}) endforeach() function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) + add_test(NAME cufinufft1d1_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 0 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) + add_test(NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP} COMMAND
cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -29,13 +33,26 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft1d2_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 0 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) + add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + + add_test(NAME cufinufft1d3_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 0 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) + add_test(NAME cufinufft1d3_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft1d_test 1 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d1_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 0 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -44,10 +61,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d2_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 0 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d2_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d3_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 0 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d3_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -76,6 +101,10 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) COMMAND cufinufft2dmany_test 2 3 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME 
cufinufft3d1_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 0 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -98,10 +127,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) ${PREC} ${UPSAMP}) endif() + add_test(NAME cufinufft3d2_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 0 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft3d3_test_auto_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 0 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft3d3_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100 ${PREC} ${UPSAMP})