diff --git a/MAKE/Makefile.Freezer_cuda b/MAKE/Makefile.Freezer_cuda
index c58762bd5..2ef1d5085 100644
--- a/MAKE/Makefile.Freezer_cuda
+++ b/MAKE/Makefile.Freezer_cuda
@@ -35,7 +35,6 @@ VECL=16
 # LDFLAGS flags for linker
 USE_CUDA=1
-CUDABLOCKS=108
 # Tell mpic++ to use nvcc for all compiling
 CMP = OMPI_CXX='nvcc' OMPI_CXXFLAGS='' mpic++
diff --git a/MAKE/Makefile.mahti_cuda b/MAKE/Makefile.mahti_cuda
index 492315516..bc632c696 100644
--- a/MAKE/Makefile.mahti_cuda
+++ b/MAKE/Makefile.mahti_cuda
@@ -33,7 +33,6 @@ VECL=16
 # mpi.h in c++ on Cray
 USE_CUDA=1
-CUDABLOCKS=108
 #-ggdb not available on nvcc
 #-G (device debug) overrides --generate-line-info -line-info but also requires more device-side resources to run
diff --git a/MAKE/Makefile.puhti_gcc_cuda b/MAKE/Makefile.puhti_gcc_cuda
index f5a151e5c..478ccf6c6 100644
--- a/MAKE/Makefile.puhti_gcc_cuda
+++ b/MAKE/Makefile.puhti_gcc_cuda
@@ -45,7 +45,6 @@ VECL=64
 # mpi.h in c++ on Cray
 USE_CUDA=1
-CUDABLOCKS=108
 CMP = mpic++
 LNK = mpic++
 NVCC = nvcc
diff --git a/arch/arch_device_cuda.h b/arch/arch_device_cuda.h
index 03f92519f..5ec72b0fa 100644
--- a/arch/arch_device_cuda.h
+++ b/arch/arch_device_cuda.h
@@ -100,11 +100,6 @@
 #define ARCH_BLOCKSIZE_R 512
 #define ARCH_BLOCKSIZE_R_SMALL 32
-/* GPU blocksize used by Vlasov solvers */
-#ifndef GPUBLOCKS
-# define GPUBLOCKS (108)
-#endif
-
 /* values used by kernels */
 #ifndef GPUTHREADS
 #define GPUTHREADS (32)
diff --git a/arch/arch_device_hip.h b/arch/arch_device_hip.h
index cdb8c3d8d..03ea30d3f 100644
--- a/arch/arch_device_hip.h
+++ b/arch/arch_device_hip.h
@@ -97,11 +97,6 @@
 #define ARCH_BLOCKSIZE_R 512
 #define ARCH_BLOCKSIZE_R_SMALL 64
-/* GPU blocksize used by Vlasov solvers */
-#ifndef GPUBLOCKS
-# define GPUBLOCKS (108)
-#endif
-
 /* values used by kernels */
 #ifndef GPUTHREADS
 #define GPUTHREADS (64)
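Every hunk below repeats one pattern: kernels used to be launched with a grid capped at the GPUBLOCKS define removed above (108, presumably chosen to match the 108 streaming multiprocessors of an A100) and then grid-strided over their work items inside the kernel; they are now launched with one GPU block per work item, and the stride loop collapses to a guarded single iteration. A minimal sketch of the two styles, with hypothetical names (scale_strided, scale_direct, nItems), not code from this patch:

   // Old style: capped grid, each block strides over all items.
   __global__ void scale_strided(float *data, unsigned nItems) {
      for (unsigned i = blockIdx.x; i < nItems; i += gridDim.x) {
         data[i] *= 2.0f;
      }
   }

   // New style: gridDim.x == nItems, one block per item, bounds guard kept.
   __global__ void scale_direct(float *data, unsigned nItems) {
      const unsigned i = blockIdx.x;
      if (i < nItems) {
         data[i] *= 2.0f;
      }
   }

   // Launch syntax is <<<grid, block, dynamic shared memory bytes, stream>>>:
   // scale_strided<<<108, 32, 0, stream>>>(data, nItems);   // before
   // scale_direct<<<nItems, 32, 0, stream>>>(data, nItems); // after, only when nItems > 0

The host-side guards in the hunks below (if (nBlocks > 0) and friends) matter because a zero-sized grid is an invalid launch configuration.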
"<<__FILE__<<":"<<__LINE__<<" Halo extent is not 1, unsupported size."<>> ( + //update_velocity_block_content_lists_kernel<<>> ( update_velocity_block_content_lists_kernel<<>> ( populations[popID].dev_vmesh, populations[popID].dev_blockContainer, diff --git a/spatial_cell_gpu.hpp b/spatial_cell_gpu.hpp index 6e4ffe95e..60a842b1f 100644 --- a/spatial_cell_gpu.hpp +++ b/spatial_cell_gpu.hpp @@ -137,19 +137,17 @@ namespace spatial_cell { vmesh::VelocityBlockContainer *blockContainer, const Real factor ) { - const int gpuBlocks = gridDim.x; const int blocki = blockIdx.x; const int i = threadIdx.x; const int j = threadIdx.y; const int k = threadIdx.z; const uint ti = k*WID2 + j*WID + i; // loop over whole velocity space and scale the values - for (uint blockLID=blocki; blockLIDgetData(blockLID); - // Scale value - data[ti] = data[ti] * factor; - } + const uint blockLID = blocki; + // Pointer to target block data + Realf* data = blockContainer->getData(blockLID); + // Scale value + data[ti] = data[ti] * factor; } /** GPU kernel for adding a particle population to another with a scaling factor This kernel increments existing blocks, creates new ones if the block does @@ -379,11 +377,10 @@ namespace spatial_cell { } // Now loop over whole velocity space and scale the values vmesh::LocalID nBlocks = vmesh->size(); - const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks; gpuStream_t stream = gpu_getStream(); - if (nGpuBlocks > 0) { + if (nBlocks > 0) { dim3 block(WID,WID,WID); - population_scale_kernel<<>> ( + population_scale_kernel<<>> ( nBlocks, dev_vmesh, dev_blockContainer, @@ -410,8 +407,7 @@ namespace spatial_cell { CHK_ERR( gpuStreamSynchronize(stream) ); // Loop over the whole velocity space, and add scaled values with // a kernel. Addition of new blocks is not block-parallel-safe. - const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks; - if (nGpuBlocks > 0) { + if (nBlocks > 0) { dim3 block(WID,WID,WID); // Now serial population_increment_kernel<<<1, block, 0, stream>>> ( @@ -439,13 +435,13 @@ namespace spatial_cell { const fileReal* gpuInitBuffer, const uint nBlocks ) { - const int gpuBlocks = gridDim.x; const int blocki = blockIdx.x; //const int warpSize = blockDim.x*blockDim.y*blockDim.z; const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x; Real* parameters = blockContainer->getParameters(startLID); Realf *cellBlockData = blockContainer->getData(startLID); - for (uint index=blocki; indexdouble if necessary cellBlockData[index*WID3 + ti] = (Realf)gpuInitBuffer[index*WID3 + ti]; // Set block parameters @@ -1229,13 +1225,12 @@ namespace spatial_cell { CHK_ERR( gpuMemcpyAsync(gpuInitBlocks, blocks.data(), nBlocks*sizeof(vmesh::GlobalID), gpuMemcpyHostToDevice, stream) ); - const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks; - if (nGpuBlocks>0) { + if (nBlocks>0) { dim3 block(WID,WID,WID); // Third argument specifies the number of bytes in *shared memory* that is // dynamically allocated per block for this call in addition to the statically allocated memory. 
         CHK_ERR( gpuStreamSynchronize(stream) );
-         spatial_cell::add_blocks_from_buffer_kernel<<<nGpuBlocks, block, 0, stream>>> (
+         spatial_cell::add_blocks_from_buffer_kernel<<<nBlocks, block, 0, stream>>> (
            populations[popID].dev_vmesh,
            populations[popID].dev_blockContainer,
            startLID,
diff --git a/vlasovsolver/gpu_acc_map.cpp b/vlasovsolver/gpu_acc_map.cpp
index 9c2c45071..174062942 100644
--- a/vlasovsolver/gpu_acc_map.cpp
+++ b/vlasovsolver/gpu_acc_map.cpp
@@ -70,7 +70,6 @@ __global__ void __launch_bounds__(VECL,4) reorder_blocks_by_dimension_kernel(
    vmesh::VelocityBlockContainer *blockContainer,
    Vec *gpu_blockDataOrdered,
    uint *gpu_cell_indices_to_id,
-   uint totalColumns,
    vmesh::LocalID *gpu_LIDlist,
    ColumnOffsets* columnData
    ) {
@@ -79,14 +78,13 @@ __global__ void __launch_bounds__(VECL,4) reorder_blocks_by_dimension_kernel(
    // Works column-per-column and adds the necessary one empty block at each end
    const int nThreads = blockDim.x; // should be equal to VECL
    const int ti = threadIdx.x;
-   const int blocki = blockIdx.x;
-   const int gpuBlocks = gridDim.x;
+   const uint iColumn = blockIdx.x;
    Realf *gpu_blockData = blockContainer->getData();
    if (nThreads != VECL) {
      if (ti==0) printf("Warning! VECL not matching thread count for GPU kernel!\n");
   }
-   // Loop over columns in steps of gpuBlocks. Each gpuBlock deals with one column.
-   for (uint iColumn = blocki; iColumn < totalColumns; iColumn += gpuBlocks) {
+   // Each gpuBlock deals with one column.
+   {
      uint inputOffset = columnData->columnBlockOffsets[iColumn];
      uint outputOffset = (inputOffset + 2 * iColumn) * (WID3/VECL);
      uint columnLength = columnData->columnNumBlocks[iColumn];
@@ -125,7 +123,7 @@ __global__ void __launch_bounds__(VECL,4) reorder_blocks_by_dimension_kernel(
            gpu_blockDataOrdered[outputOffset + i_pcolumnv_gpu_b(jk, k, columnLength, columnLength)][ti] = 0.0;
         }
      }
-   } // end loop iColumn
+   } // end iColumn
    // Note: this kernel does not memset gpu_blockData to zero.
    // A separate memsetasync call is required for that.
 }
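Dropping the cap is safe because these direct launches lean on the device's grid-size limit rather than on tuning: on every GPU of compute capability 3.0 or newer the grid x-dimension may be up to 2^31 - 1 blocks, far above any realistic block or column count here. A standalone check, for reference:

   #include <cstdio>
   #include <cuda_runtime.h>

   int main() {
      cudaDeviceProp prop;
      cudaGetDeviceProperties(&prop, 0);
      // maxGridSize[0] is 2147483647 on all recent NVIDIA hardware.
      printf("max grid dims: %d x %d x %d\n",
             prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
      return 0;
   }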
@@ -207,7 +205,6 @@ __global__ void __launch_bounds__(GPUTHREADS,4) evaluate_column_extents_kernel(
    Realv dv,
    uint *bailout_flag
    ) {
-   const uint gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
    const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
    const uint blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
    const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
@@ -215,189 +212,188 @@ __global__ void __launch_bounds__(GPUTHREADS,4) evaluate_column_extents_kernel(
    // Shared within all threads in one block
    __shared__ int isTargetBlock[MAX_BLOCKS_PER_DIM];
    __shared__ int isSourceBlock[MAX_BLOCKS_PER_DIM];
-   for( uint setIndex=blocki; setIndex < gpu_columnData->setColumnOffsets.size(); setIndex += gpuBlocks) {
-      if (setIndex < gpu_columnData->setColumnOffsets.size()) {
-
-         // Clear flags used for this columnSet
-         for(uint tti = 0; tti < MAX_BLOCKS_PER_DIM; tti += warpSize ) {
-            const uint index = tti + ti;
-            if (index < MAX_BLOCKS_PER_DIM) {
-               isTargetBlock[index] = 0;
-               isSourceBlock[index] = 0;
-            }
+   const uint setIndex=blocki;
+   if (setIndex < gpu_columnData->setColumnOffsets.size()) {
+
+      // Clear flags used for this columnSet
+      for(uint tti = 0; tti < MAX_BLOCKS_PER_DIM; tti += warpSize ) {
+         const uint index = tti + ti;
+         if (index < MAX_BLOCKS_PER_DIM) {
+            isTargetBlock[index] = 0;
+            isSourceBlock[index] = 0;
+         }
+      }
+      __syncthreads();
+
+      /*need x,y coordinate of this column set of blocks, take it from first
+        block in first column*/
+      vmesh::LocalID setFirstBlockIndices0,setFirstBlockIndices1,setFirstBlockIndices2;
+      vmesh->getIndices(GIDlist[gpu_columnData->columnBlockOffsets[gpu_columnData->setColumnOffsets[setIndex]]],
+                        setFirstBlockIndices0, setFirstBlockIndices1, setFirstBlockIndices2);
+      swapBlockIndices(setFirstBlockIndices0,setFirstBlockIndices1,setFirstBlockIndices2,dimension);
+      /*compute the maximum starting point of the lagrangian (target) grid
+        (base level) within the 4 corner cells in this
+        block. Needed for computing maximum extent of target column*/
+
+      Realv max_intersectionMin = intersection +
+         (setFirstBlockIndices0 * WID + 0) * intersection_di +
+         (setFirstBlockIndices1 * WID + 0) * intersection_dj;
+      max_intersectionMin = std::max(max_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + 0) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
+      max_intersectionMin = std::max(max_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + 0) * intersection_dj);
+      max_intersectionMin = std::max(max_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
+
+      Realv min_intersectionMin = intersection +
+         (setFirstBlockIndices0 * WID + 0) * intersection_di +
+         (setFirstBlockIndices1 * WID + 0) * intersection_dj;
+      min_intersectionMin = std::min(min_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + 0) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
+      min_intersectionMin = std::min(min_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + 0) * intersection_dj);
+      min_intersectionMin = std::min(min_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
+
+      //now, record which blocks are target blocks
+      for (uint columnIndex = gpu_columnData->setColumnOffsets[setIndex];
+           columnIndex < gpu_columnData->setColumnOffsets[setIndex] + gpu_columnData->setNumColumns[setIndex] ;
+           ++columnIndex) {
+         // Not parallelizing this at this level; not going to be many columns within a set
+
+         // Abort all threads if vector capacity bailout
+         if (bailout_flag[1] ) {
+            return;
+         }
-         }
-         __syncthreads();
-
-         /*need x,y coordinate of this column set of blocks, take it from first
-           block in first column*/
-         vmesh::LocalID setFirstBlockIndices0,setFirstBlockIndices1,setFirstBlockIndices2;
-         vmesh->getIndices(GIDlist[gpu_columnData->columnBlockOffsets[gpu_columnData->setColumnOffsets[setIndex]]],
-                           setFirstBlockIndices0, setFirstBlockIndices1, setFirstBlockIndices2);
-         swapBlockIndices(setFirstBlockIndices0,setFirstBlockIndices1,setFirstBlockIndices2,dimension);
-         /*compute the maximum starting point of the lagrangian (target) grid
-           (base level) within the 4 corner cells in this
-           block. Needed for computig maximum extent of target column*/
-
-         Realv max_intersectionMin = intersection +
-            (setFirstBlockIndices0 * WID + 0) * intersection_di +
-            (setFirstBlockIndices1 * WID + 0) * intersection_dj;
-         max_intersectionMin = std::max(max_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + 0) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
-         max_intersectionMin = std::max(max_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + 0) * intersection_dj);
-         max_intersectionMin = std::max(max_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
-
-         Realv min_intersectionMin = intersection +
-            (setFirstBlockIndices0 * WID + 0) * intersection_di +
-            (setFirstBlockIndices1 * WID + 0) * intersection_dj;
-         min_intersectionMin = std::min(min_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + 0) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
-         min_intersectionMin = std::min(min_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + 0) * intersection_dj);
-         min_intersectionMin = std::min(min_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
-
-         //now, record which blocks are target blocks
-         for (uint columnIndex = gpu_columnData->setColumnOffsets[setIndex];
-              columnIndex < gpu_columnData->setColumnOffsets[setIndex] + gpu_columnData->setNumColumns[setIndex] ;
-              ++columnIndex) {
-            // Not parallelizing this at this level; not going to be many columns within a set
-
-            // Abort all threads if vector capacity bailout
-            if (bailout_flag[1] ) {
-               return;
-            }
+         const vmesh::LocalID n_cblocks = gpu_columnData->columnNumBlocks[columnIndex];
+         vmesh::GlobalID* cblocks = GIDlist + gpu_columnData->columnBlockOffsets[columnIndex]; //column blocks
+         vmesh::LocalID firstBlockIndices0,firstBlockIndices1,firstBlockIndices2;
+         vmesh::LocalID lastBlockIndices0,lastBlockIndices1,lastBlockIndices2;
+         vmesh->getIndices(cblocks[0],
+                           firstBlockIndices0, firstBlockIndices1, firstBlockIndices2);
+         vmesh->getIndices(cblocks[n_cblocks -1],
+                           lastBlockIndices0, lastBlockIndices1, lastBlockIndices2);
+         swapBlockIndices(firstBlockIndices0,firstBlockIndices1,firstBlockIndices2, dimension);
+         swapBlockIndices(lastBlockIndices0,lastBlockIndices1,lastBlockIndices2, dimension);
+
+         /* firstBlockV is in z the minimum velocity value of the lower
+          * edge in source grid.
+          * lastBlockV is in z the maximum velocity value of the upper
+          * edge in source grid. */
+         Realv firstBlockMinV = (WID * firstBlockIndices2) * dv + v_min;
+         Realv lastBlockMaxV = (WID * (lastBlockIndices2 + 1)) * dv + v_min;
+
+         /* gk is now the k value in terms of cells in target
+            grid. This distance between max_intersectionMin (so lagrangian
+            plan, well max value here) and V of source grid, divided by
+            intersection_dk to find out how many grid cells that is*/
+         const int firstBlock_gk = (int)((firstBlockMinV - max_intersectionMin)/intersection_dk);
+         const int lastBlock_gk = (int)((lastBlockMaxV - min_intersectionMin)/intersection_dk);
+
+         int firstBlockIndexK = firstBlock_gk/WID;
+         int lastBlockIndexK = lastBlock_gk/WID;
+
+         // now enforce mesh limits for target column blocks (and check if we are
+         // too close to the velocity space boundaries)
+         firstBlockIndexK = (firstBlockIndexK >= 0) ? firstBlockIndexK : 0;
+         firstBlockIndexK = (firstBlockIndexK < max_v_length ) ? firstBlockIndexK : max_v_length - 1;
+         lastBlockIndexK = (lastBlockIndexK >= 0) ? lastBlockIndexK : 0;
+         lastBlockIndexK = (lastBlockIndexK < max_v_length ) ? lastBlockIndexK : max_v_length - 1;
+         if(firstBlockIndexK < bailout_velocity_space_wall_margin
+            || firstBlockIndexK >= max_v_length - bailout_velocity_space_wall_margin
+            || lastBlockIndexK < bailout_velocity_space_wall_margin
+            || lastBlockIndexK >= max_v_length - bailout_velocity_space_wall_margin
+            ) {
+            // Pass bailout (hitting the wall) flag back to host
+            if (ti==0) {
+               bailout_flag[0] = 1;
+            }
+         }
-            const vmesh::LocalID n_cblocks = gpu_columnData->columnNumBlocks[columnIndex];
-            vmesh::GlobalID* cblocks = GIDlist + gpu_columnData->columnBlockOffsets[columnIndex]; //column blocks
-            vmesh::LocalID firstBlockIndices0,firstBlockIndices1,firstBlockIndices2;
-            vmesh::LocalID lastBlockIndices0,lastBlockIndices1,lastBlockIndices2;
-            vmesh->getIndices(cblocks[0],
-                              firstBlockIndices0, firstBlockIndices1, firstBlockIndices2);
-            vmesh->getIndices(cblocks[n_cblocks -1],
-                              lastBlockIndices0, lastBlockIndices1, lastBlockIndices2);
-            swapBlockIndices(firstBlockIndices0,firstBlockIndices1,firstBlockIndices2, dimension);
-            swapBlockIndices(lastBlockIndices0,lastBlockIndices1,lastBlockIndices2, dimension);
-
-            /* firstBlockV is in z the minimum velocity value of the lower
-             * edge in source grid.
-             * lastBlockV is in z the maximum velocity value of the upper
-             * edge in source grid. */
-            Realv firstBlockMinV = (WID * firstBlockIndices2) * dv + v_min;
-            Realv lastBlockMaxV = (WID * (lastBlockIndices2 + 1)) * dv + v_min;
-
-            /* gk is now the k value in terms of cells in target
-               grid. This distance between max_intersectionMin (so lagrangian
-               plan, well max value here) and V of source grid, divided by
-               intersection_dk to find out how many grid cells that is*/
-            const int firstBlock_gk = (int)((firstBlockMinV - max_intersectionMin)/intersection_dk);
-            const int lastBlock_gk = (int)((lastBlockMaxV - min_intersectionMin)/intersection_dk);
-
-            int firstBlockIndexK = firstBlock_gk/WID;
-            int lastBlockIndexK = lastBlock_gk/WID;
-
-            // now enforce mesh limits for target column blocks (and check if we are
-            // too close to the velocity space boundaries)
-            firstBlockIndexK = (firstBlockIndexK >= 0) ? firstBlockIndexK : 0;
-            firstBlockIndexK = (firstBlockIndexK < max_v_length ) ? firstBlockIndexK : max_v_length - 1;
-            lastBlockIndexK = (lastBlockIndexK >= 0) ? lastBlockIndexK : 0;
-            lastBlockIndexK = (lastBlockIndexK < max_v_length ) ? lastBlockIndexK : max_v_length - 1;
-            if(firstBlockIndexK < bailout_velocity_space_wall_margin
-               || firstBlockIndexK >= max_v_length - bailout_velocity_space_wall_margin
-               || lastBlockIndexK < bailout_velocity_space_wall_margin
-               || lastBlockIndexK >= max_v_length - bailout_velocity_space_wall_margin
-               ) {
-               // Pass bailout (hitting the wall) flag back to host
-               if (ti==0) {
-                  bailout_flag[0] = 1;
-               }
-            }
-
-            //store source blocks
-            for (uint blockK = firstBlockIndices2; blockK <= lastBlockIndices2; blockK +=warpSize){
-               if ((blockK+ti) <= lastBlockIndices2) {
-                  //const int old = atomicAdd(&isSourceBlock[blockK+ti],1);
-                  isSourceBlock[blockK+ti] = 1; // Does not need to be atomic, as long as it's no longer zero
-               }
-            }
-            __syncthreads();
-
-            //store target blocks
-            for (uint blockK = (uint)firstBlockIndexK; blockK <= (uint)lastBlockIndexK; blockK+=warpSize){
-               if ((blockK+ti) <= (uint)lastBlockIndexK) {
-                  isTargetBlock[blockK+ti] = 1; // Does not need to be atomic, as long as it's no longer zero
-                  //const int old = atomicAdd(&isTargetBlock[blockK+ti],1);
-               }
-            }
-            __syncthreads();
-
-            if (ti==0) {
-               // Set columns' transverse coordinates
-               gpu_columns[columnIndex].i = setFirstBlockIndices0;
-               gpu_columns[columnIndex].j = setFirstBlockIndices1;
-               gpu_columns[columnIndex].kBegin = firstBlockIndices2;
-
-               //store also for each column firstBlockIndexK, and lastBlockIndexK
-               gpu_columns[columnIndex].minBlockK = firstBlockIndexK;
-               gpu_columns[columnIndex].maxBlockK = lastBlockIndexK;
-            }
-         } // end loop over columns in set
-         __syncthreads();
+         //store source blocks
+         for (uint blockK = firstBlockIndices2; blockK <= lastBlockIndices2; blockK +=warpSize){
+            if ((blockK+ti) <= lastBlockIndices2) {
+               //const int old = atomicAdd(&isSourceBlock[blockK+ti],1);
+               isSourceBlock[blockK+ti] = 1; // Does not need to be atomic, as long as it's no longer zero
+            }
+         }
+         __syncthreads();
+
+         //store target blocks
+         for (uint blockK = (uint)firstBlockIndexK; blockK <= (uint)lastBlockIndexK; blockK+=warpSize){
+            if ((blockK+ti) <= (uint)lastBlockIndexK) {
+               isTargetBlock[blockK+ti] = 1; // Does not need to be atomic, as long as it's no longer zero
+               //const int old = atomicAdd(&isTargetBlock[blockK+ti],1);
+            }
+         }
+         __syncthreads();
+
+         if (ti==0) {
+            // Set columns' transverse coordinates
+            gpu_columns[columnIndex].i = setFirstBlockIndices0;
+            gpu_columns[columnIndex].j = setFirstBlockIndices1;
+            gpu_columns[columnIndex].kBegin = firstBlockIndices2;
+
+            //store also for each column firstBlockIndexK, and lastBlockIndexK
+            gpu_columns[columnIndex].minBlockK = firstBlockIndexK;
+            gpu_columns[columnIndex].maxBlockK = lastBlockIndexK;
+         }
+      } // end loop over columns in set
+      __syncthreads();
-
-         for (uint blockT = 0; blockT < MAX_BLOCKS_PER_DIM; blockT +=warpSize) {
-            const uint blockK = blockT + ti;
-            if (blockK < MAX_BLOCKS_PER_DIM) {
-               if(isTargetBlock[blockK]!=0) {
-                  const int targetBlock =
-                     setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
-                     setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
-                     blockK * gpu_block_indices_to_id[2];
-                  dev_map_require->set_element(targetBlock,vmesh->getLocalID(targetBlock));
-                  // if(!BlocksRequired->device_push_back(targetBlock)) {
-                  //    bailout_flag[1]=1;
-                  //    return;
-                  // }
+
+      for (uint blockT = 0; blockT < MAX_BLOCKS_PER_DIM; blockT +=warpSize) {
+         const uint blockK = blockT + ti;
+         if (blockK < MAX_BLOCKS_PER_DIM) {
+            if(isTargetBlock[blockK]!=0) {
+               const int targetBlock =
+                  setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
+                  setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
+                  blockK * gpu_block_indices_to_id[2];
+               dev_map_require->set_element(targetBlock,vmesh->getLocalID(targetBlock));
+               // if(!BlocksRequired->device_push_back(targetBlock)) {
+               //    bailout_flag[1]=1;
+               //    return;
+               // }
+            }
+            if(isTargetBlock[blockK]!=0 && isSourceBlock[blockK]==0 ) {
+               const int targetBlock =
+                  setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
+                  setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
+                  blockK * gpu_block_indices_to_id[2];
+               if(!list_with_replace_new->device_push_back(targetBlock)) {
+                  bailout_flag[1]=1; // out of capacity
-               }
-               if(isTargetBlock[blockK]!=0 && isSourceBlock[blockK]==0 ) {
-                  const int targetBlock =
-                     setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
-                     setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
-                     blockK * gpu_block_indices_to_id[2];
-                  if(!list_with_replace_new->device_push_back(targetBlock)) {
-                     bailout_flag[1]=1; // out of capacity
-                  }
-               }
-               if(isTargetBlock[blockK]==0 && isSourceBlock[blockK]!=0 ) {
-                  const int targetBlock =
-                     setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
-                     setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
-                     blockK * gpu_block_indices_to_id[2];
-                  dev_map_remove->set_element(targetBlock,vmesh->getLocalID(targetBlock));
-                  // GPUTODO: could use device_insert to verify insertion, but not worth it
-                  // if(!list_to_replace->device_push_back(
-                  //    Hashinator::hash_pair(targetBlock,vmesh->getLocalID(targetBlock)))) {
-                  //    bailout_flag[2]=1; // out of capacity
-                  //    return;
-                  // }
-               }
-            } // block within MAX_BLOCKS_PER_DIM
-         } // loop over all potential blocks
-      } // if valid setIndez
-   } // for setIndexB
+               }
+            }
+            if(isTargetBlock[blockK]==0 && isSourceBlock[blockK]!=0 ) {
+               const int targetBlock =
+                  setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
+                  setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
+                  blockK * gpu_block_indices_to_id[2];
+               dev_map_remove->set_element(targetBlock,vmesh->getLocalID(targetBlock));
+               // GPUTODO: could use device_insert to verify insertion, but not worth it
+               // if(!list_to_replace->device_push_back(
+               //    Hashinator::hash_pair(targetBlock,vmesh->getLocalID(targetBlock)))) {
+               //    bailout_flag[2]=1; // out of capacity
+               //    return;
+               // }
+            }
+         } // block within MAX_BLOCKS_PER_DIM
+      } // loop over all potential blocks
+   } // if valid setIndex
 }

 __global__ void __launch_bounds__(VECL,4) acceleration_kernel(
@@ -424,7 +420,6 @@ __global__ void __launch_bounds__(VECL,4) acceleration_kernel(
    const uint w_tid = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
    Realf *gpu_blockData = blockContainer->getData();
-   //for (uint column = blocki; column < totalColumns; column += gpuBlocks) {
    const uint column = blocki;
    {
       /* New threading with each warp/wavefront working on one vector */
@@ -538,7 +533,7 @@ __global__ void __launch_bounds__(VECL,4) acceleration_kernel(
          } // for loop over target k-indices of current source block
       } // for-loop over source blocks
    } //for loop over j index
-   } // End loop over columns
+   } // End this column
 } // end semilag acc kernel

 /*
@@ -753,13 +748,11 @@ __host__ bool gpu_acc_map_1d(spatial_cell::SpatialCell* spatial_cell,
    //storeTimer.stop();

    //phiprof::Timer reorderTimer {"Reorder blocks by dimension"};
-   uint gpublocks = host_totalColumns > GPUBLOCKS ? GPUBLOCKS : host_totalColumns;
    // Launch kernels for transposing and ordering velocity space data into columns
-   reorder_blocks_by_dimension_kernel<<<gpublocks, VECL, 0, stream>>> (
+   reorder_blocks_by_dimension_kernel<<<host_totalColumns, VECL, 0, stream>>> (
       dev_blockContainer,
       gpu_blockDataOrdered[cpuThreadID],
       gpu_cell_indices_to_id[cpuThreadID],
-      host_totalColumns,
       LIDlist,
       columnData
    );
@@ -775,7 +768,7 @@ __host__ bool gpu_acc_map_1d(spatial_cell::SpatialCell* spatial_cell,
    map_remove->clear(Hashinator::targets::device,stream,std::pow(2,spatial_cell->vbwncl_sizePower));
    // Hashmap clear includes a stream sync
    //CHK_ERR( gpuStreamSynchronize(stream) );
-   evaluate_column_extents_kernel<<<gpublocks, GPUTHREADS, 0, stream>>> (
+   evaluate_column_extents_kernel<<<host_totalColumnSets, GPUTHREADS, 0, stream>>> (
       dimension,
       dev_vmesh,
       columnData,
diff --git a/vlasovsolver/gpu_acc_sort_blocks.cpp b/vlasovsolver/gpu_acc_sort_blocks.cpp
index 15c6682a7..fd103407d 100644
--- a/vlasovsolver/gpu_acc_sort_blocks.cpp
+++ b/vlasovsolver/gpu_acc_sort_blocks.cpp
@@ -56,16 +56,13 @@ __global__ void __launch_bounds__(GPUTHREADS,4) blocksID_mapped_dim0_kernel(
    vmesh::LocalID *blocksLID_unsorted,
    const uint nBlocks
    ) {
-   const int gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
    const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
    const int blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
    const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
-   for (vmesh::LocalID index=blocki*warpSize; index<nBlocks; index += gpuBlocks*warpSize) {
-      const vmesh::LocalID LID = index+ti;
-      if (LID < nBlocks) {
-         blocksID_mapped[LID] = vmesh->getGlobalID(LID);
-         blocksLID_unsorted[LID]=LID;
-      }
+   const vmesh::LocalID LID = blocki * warpSize + ti;
+   if (LID < nBlocks) {
+      blocksID_mapped[LID] = vmesh->getGlobalID(LID);
+      blocksLID_unsorted[LID]=LID;
    }
 }
@@ -75,22 +72,19 @@ __global__ void __launch_bounds__(GPUTHREADS,4) blocksID_mapped_dim1_kernel(
    vmesh::LocalID *blocksLID_unsorted,
    const uint nBlocks
    ) {
-   const int gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
    const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
    const int blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
    const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
    const vmesh::LocalID D0 = vmesh->getGridLength()[0];
    const vmesh::LocalID D1 = vmesh->getGridLength()[1];
    // const vmesh::LocalID D2 = vmesh->getGridLength()[2];
-   for (vmesh::LocalID index=blocki*warpSize; index<nBlocks; index += gpuBlocks*warpSize) {
-      const vmesh::LocalID LID = index+ti;
-      if (LID < nBlocks) {
-         const vmesh::GlobalID GID = vmesh->getGlobalID(LID);
-         const vmesh::LocalID x_index = GID % D0;
-         const vmesh::LocalID y_index = (GID / D0) % D1;
-         blocksID_mapped[LID] = GID - (x_index + y_index*D0) + y_index + x_index * D1;
-         blocksLID_unsorted[LID]=LID;
-      }
+   const vmesh::LocalID LID = blocki * warpSize + ti;
+   if (LID < nBlocks) {
+      const vmesh::GlobalID GID = vmesh->getGlobalID(LID);
+      const vmesh::LocalID x_index = GID % D0;
+      const vmesh::LocalID y_index = (GID / D0) % D1;
+      blocksID_mapped[LID] = GID - (x_index + y_index*D0) + y_index + x_index * D1;
+      blocksLID_unsorted[LID]=LID;
    }
 }
@@ -100,65 +94,40 @@ __global__ void __launch_bounds__(GPUTHREADS,4) blocksID_mapped_dim2_kernel(
    vmesh::LocalID *blocksLID_unsorted,
    const uint nBlocks
    ) {
-   const int gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
    const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
    const int blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
    const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
    const vmesh::LocalID D0 = vmesh->getGridLength()[0];
    const vmesh::LocalID D1 = vmesh->getGridLength()[1];
    const vmesh::LocalID D2 = vmesh->getGridLength()[2];
-   for (vmesh::LocalID index=blocki*warpSize; index<nBlocks; index += gpuBlocks*warpSize) {
-      const vmesh::LocalID LID = index+ti;
-      if (LID < nBlocks) {
-         const vmesh::GlobalID GID = vmesh->getGlobalID(LID);
-         const vmesh::LocalID x_index = GID % D0;
-         const vmesh::LocalID y_index = (GID / D0) % D1;
-         const vmesh::LocalID z_index = (GID / (D0*D1));
-         blocksID_mapped[LID] = z_index + y_index*D2 + x_index*D1*D2;
-         blocksLID_unsorted[LID]=LID;
-      }
+   const vmesh::LocalID LID = blocki * warpSize + ti;
+   if (LID < nBlocks) {
+      const vmesh::GlobalID GID = vmesh->getGlobalID(LID);
+      const vmesh::LocalID x_index = GID % D0;
+      const vmesh::LocalID y_index = (GID / D0) % D1;
+      const vmesh::LocalID z_index = (GID / (D0*D1));
+      blocksID_mapped[LID] = z_index + y_index*D2 + x_index*D1*D2;
+      blocksLID_unsorted[LID]=LID;
    }
 }

 // LIDs are already in order.
 // Now also order GIDS. (can be ridiculously parallel, minus memory access patterns)
+// also used for scanning columnsets for block counts
 __global__ void __launch_bounds__(GPUTHREADS,4) order_GIDs_kernel(
    const vmesh::VelocityMesh* vmesh,
    vmesh::GlobalID *blocksLID,
    vmesh::GlobalID *blocksGID,
-   const uint nBlocks,
-   ColumnOffsets* columnData // passed just for resetting
-   ) {
-   const int gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
-   const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
-   const int blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
-   const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
-   for (vmesh::LocalID index=blocki*warpSize; index<nBlocks; index += gpuBlocks*warpSize) {
-      const vmesh::LocalID i = index+ti;
-      if (i < nBlocks) {
-         blocksGID[i]=vmesh->getGlobalID(blocksLID[i]);
-      }
-   }
-   if (blockIdx.x == blockIdx.y == blockIdx.z == threadIdx.x == threadIdx.y == threadIdx.z == 0) {
-      columnData->columnBlockOffsets.clear();
-      columnData->columnNumBlocks.clear();
-      columnData->setColumnOffsets.clear();
-      columnData->setNumColumns.clear();
-   }
-}
-
-// Kernel for scanning columnsets for block counts
-__global__ void __launch_bounds__(GPUTHREADS,4) scan_blocks_for_columns_kernel(
-   const vmesh::VelocityMesh* vmesh,
    const uint dimension,
    vmesh::GlobalID *blocksID_mapped_sorted,
    vmesh::LocalID *gpu_columnNBlocks,
-   const uint nBlocks
+   const uint nBlocks,
+   ColumnOffsets* columnData // passed just for resetting
    ) {
-   const int gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
    const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
    const int blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
    const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
+
    vmesh::LocalID DX;
    switch (dimension) {
       case 0:
@@ -173,21 +142,29 @@ __global__ void __launch_bounds__(GPUTHREADS,4) scan_blocks_for_columns_kernel(
       default:
         printf("Incorrect dimension in __FILE__ __LINE__\n");
   }
-   for (vmesh::LocalID LID=blocki*warpSize; LID<nBlocks; LID += gpuBlocks*warpSize) {
-      const vmesh::LocalID i = LID+ti;
-      if (i < nBlocks) {
-         const vmesh::LocalID column_id = blocksID_mapped_sorted[i] / DX;
-         // Increment number of blocks in column
-         const vmesh::LocalID old = atomicAdd(&gpu_columnNBlocks[column_id],1);
-      }
-   }
+   const vmesh::LocalID index = blocki * warpSize + ti;
+   if (index < nBlocks) {
+      blocksGID[index]=vmesh->getGlobalID(blocksLID[index]);
+
+      const vmesh::LocalID column_id = blocksID_mapped_sorted[index] / DX;
+      // Increment number of blocks in column
+      const vmesh::LocalID old = atomicAdd(&gpu_columnNBlocks[column_id],1);
+      // // Evaluate smallest GID in column
+      // old = atomicMin(&columnMinBlock[columnid],GID);
+      // // Evaluate largest GID in column
+      // old = atomicMax(&columnMaxBlock[columnid],GID);
+   }
+   if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
+      columnData->columnBlockOffsets.clear();
+      columnData->columnNumBlocks.clear();
+      columnData->setColumnOffsets.clear();
+      columnData->setNumColumns.clear();
+   }
 }

 /*** Kernel for constructing columns
-   Checks if all blocks in a columnset belong to a single kernel, and
+   Checks if all blocks in a columnset belong to a single column, and
    can quickly jump through the whole column.
    For columnsets containing several columns, it trials blocks by
    scanning warpSize GIDs at a time.
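The hunk above folds the old scan_blocks_for_columns_kernel into order_GIDs_kernel: one thread per sorted block both gathers the GID into sorted order and bumps a per-column counter with an atomic. A self-contained sketch of the fused pattern, with hypothetical names (orderAndCount, countPerColumn), not the kernel's real signature:

   __global__ void orderAndCount(const unsigned *sortedLIDs, const unsigned *gidOfLID,
                                 unsigned *gidsOut, const unsigned *mappedSorted,
                                 unsigned *countPerColumn, unsigned nBlocks, unsigned DX) {
      const unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < nBlocks) {
         gidsOut[i] = gidOfLID[sortedLIDs[i]];                 // gather GIDs into sorted order
         atomicAdd(&countPerColumn[mappedSorted[i] / DX], 1u); // histogram blocks per column
      }
   }

One caution on the launch-size idiom used below: launchBlocks = 1 + ((nBlocks - 1) / maxThreads) is a correct ceiling division only for nBlocks > 0; with unsigned arithmetic, nBlocks == 0 underflows to a huge grid, so callers are assumed to guarantee a non-empty block list.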
@@ -346,14 +323,17 @@ void sortBlocklistByDimension( //const spatial_cell::SpatialCell* spatial_cell,
    gpuStream_t stream
    ) {
-   // Ensure at least one launch block
-   uint nGpuBlocks = (nBlocks/GPUTHREADS) > GPUBLOCKS ? GPUBLOCKS : std::ceil((Real)nBlocks/(Real)GPUTHREADS);
+   // Ensure at least one launch block, ceil int division
+   //const uint maxThreads = WARPSPERBLOCK*GPUTHREADS;
+   // For some reason, a direct increase of launch threads breaks things. Stick with GPUTHREADS for now.
+   const uint maxThreads = GPUTHREADS;
+   const uint launchBlocks = 1 + ((nBlocks - 1) / (maxThreads));

    //phiprof::Timer calcTimer {"calc new dimension id"};
    // Map blocks to new dimensionality
    switch( dimension ) {
       case 0: {
-         blocksID_mapped_dim0_kernel<<<nGpuBlocks, GPUTHREADS, 0, stream>>> (
+         blocksID_mapped_dim0_kernel<<<launchBlocks, maxThreads, 0, stream>>> (
            vmesh,
            blocksID_mapped,
            blocksLID_unsorted,
@@ -362,7 +342,7 @@ void sortBlocklistByDimension( //const spatial_cell::SpatialCell* spatial_cell,
         break;
      }
      case 1: {
-         blocksID_mapped_dim1_kernel<<<nGpuBlocks, GPUTHREADS, 0, stream>>> (
+         blocksID_mapped_dim1_kernel<<<launchBlocks, maxThreads, 0, stream>>> (
            vmesh,
            blocksID_mapped,
            blocksLID_unsorted,
@@ -371,7 +351,7 @@ void sortBlocklistByDimension( //const spatial_cell::SpatialCell* spatial_cell,
         break;
      }
      case 2: {
-         blocksID_mapped_dim2_kernel<<<nGpuBlocks, GPUTHREADS, 0, stream>>> (
+         blocksID_mapped_dim2_kernel<<<launchBlocks, maxThreads, 0, stream>>> (
            vmesh,
            blocksID_mapped,
            blocksLID_unsorted,
@@ -428,31 +408,28 @@ void sortBlocklistByDimension( //const spatial_cell::SpatialCell* spatial_cell,
    sortTimer.stop();

    // Gather GIDs in order
-   //phiprof::Timer reorderTimer {"reorder GIDs"};
-   order_GIDs_kernel<<<nGpuBlocks, GPUTHREADS, 0, stream>>> (
+   //phiprof::Timer reorderTimer {"reorder GIDs"}; // and scan for column block counts
+   order_GIDs_kernel<<<launchBlocks, maxThreads, 0, stream>>> (
       vmesh,
       blocksLID,
       blocksGID,
-      nBlocks,
-      columnData // Pass this just to clear it on device
-   );
-   CHK_ERR( gpuPeekAtLastError() );
-   //reorderTimer.stop();
-
-   //phiprof::Timer scanTimer {"scan for column block counts"};
-   scan_blocks_for_columns_kernel<<<nGpuBlocks, GPUTHREADS, 0, stream>>> (
-      vmesh,
       dimension,
       blocksID_mapped,
       gpu_columnNBlocks,
-      nBlocks
+      nBlocks,
+      columnData // Pass this just to clear it on device
    );
    CHK_ERR( gpuPeekAtLastError() );
-   //scanTimer.stop();
+   //reorderTimer.stop();

    //phiprof::Timer constructTimer {"construct columns"};
    // Construct columns. To ensure order,
    // these are done serially, but still form within a kernel.
+   // Optimizing launch threads for this kernel as well needs a bit more checking
+   // uint columnThreads = maxThreads;
+   // columnThreads = (vmesh->getGridLength()[0]) < columnThreads ? vmesh->getGridLength()[0] : columnThreads;
+   // columnThreads = (vmesh->getGridLength()[1]) < columnThreads ? vmesh->getGridLength()[1] : columnThreads;
+   // columnThreads = (vmesh->getGridLength()[2]) < columnThreads ? vmesh->getGridLength()[2] : columnThreads;
    construct_columns_kernel<<<1, GPUTHREADS, 0, stream>>> (
       vmesh,
       dimension,