diff --git a/MAKE/Makefile.Freezer_cuda b/MAKE/Makefile.Freezer_cuda
index c58762bd5..2ef1d5085 100644
--- a/MAKE/Makefile.Freezer_cuda
+++ b/MAKE/Makefile.Freezer_cuda
@@ -35,7 +35,6 @@ VECL=16
 # LDFLAGS flags for linker
 USE_CUDA=1
-CUDABLOCKS=108
 # Tell mpic++ to use nvcc for all compiling
 CMP = OMPI_CXX='nvcc' OMPI_CXXFLAGS='' mpic++
diff --git a/MAKE/Makefile.mahti_cuda b/MAKE/Makefile.mahti_cuda
index 492315516..bc632c696 100644
--- a/MAKE/Makefile.mahti_cuda
+++ b/MAKE/Makefile.mahti_cuda
@@ -33,7 +33,6 @@ VECL=16
 # mpi.h in c++ on Cray
 USE_CUDA=1
-CUDABLOCKS=108
 #-ggdb not available on nvcc
 #-G (device debug) overrides --generate-line-info -line-info but also requires more device-side resources to run
diff --git a/MAKE/Makefile.puhti_gcc_cuda b/MAKE/Makefile.puhti_gcc_cuda
index f5a151e5c..478ccf6c6 100644
--- a/MAKE/Makefile.puhti_gcc_cuda
+++ b/MAKE/Makefile.puhti_gcc_cuda
@@ -45,7 +45,6 @@ VECL=64
 # mpi.h in c++ on Cray
 USE_CUDA=1
-CUDABLOCKS=108
 CMP = mpic++
 LNK = mpic++
 NVCC = nvcc
diff --git a/arch/arch_device_cuda.h b/arch/arch_device_cuda.h
index 03f92519f..5ec72b0fa 100644
--- a/arch/arch_device_cuda.h
+++ b/arch/arch_device_cuda.h
@@ -100,11 +100,6 @@
 #define ARCH_BLOCKSIZE_R 512
 #define ARCH_BLOCKSIZE_R_SMALL 32
-/* GPU blocksize used by Vlasov solvers */
-#ifndef GPUBLOCKS
-# define GPUBLOCKS (108)
-#endif
-
 /* values used by kernels */
 #ifndef GPUTHREADS
 #define GPUTHREADS (32)
diff --git a/arch/arch_device_hip.h b/arch/arch_device_hip.h
index cdb8c3d8d..03ea30d3f 100644
--- a/arch/arch_device_hip.h
+++ b/arch/arch_device_hip.h
@@ -97,11 +97,6 @@
 #define ARCH_BLOCKSIZE_R 512
 #define ARCH_BLOCKSIZE_R_SMALL 64
-/* GPU blocksize used by Vlasov solvers */
-#ifndef GPUBLOCKS
-# define GPUBLOCKS (108)
-#endif
-
 /* values used by kernels */
 #ifndef GPUTHREADS
 #define GPUTHREADS (64)
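Every hunk below repeats one pattern: kernels used to be launched with a grid capped at the GPUBLOCKS define removed above (108, presumably chosen to match the 108 streaming multiprocessors of an A100) and then grid-strided over their work items inside the kernel; they are now launched with one GPU block per work item, and the stride loop collapses to a guarded single iteration. A minimal sketch of the two styles, with hypothetical names (scale_strided, scale_direct, nItems), not code from this patch:

   // Old style: capped grid, each block strides over all items.
   __global__ void scale_strided(float *data, unsigned nItems) {
      for (unsigned i = blockIdx.x; i < nItems; i += gridDim.x) {
         data[i] *= 2.0f;
      }
   }

   // New style: gridDim.x == nItems, one block per item, bounds guard kept.
   __global__ void scale_direct(float *data, unsigned nItems) {
      const unsigned i = blockIdx.x;
      if (i < nItems) {
         data[i] *= 2.0f;
      }
   }

   // Launch syntax is <<<grid, block, dynamic shared memory bytes, stream>>>:
   // scale_strided<<<108, 32, 0, stream>>>(data, nItems);   // before
   // scale_direct<<<nItems, 32, 0, stream>>>(data, nItems); // after, only when nItems > 0

The host-side guards in the hunks below (if (nBlocks > 0) and friends) matter because a zero-sized grid is an invalid launch configuration.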
"<<__FILE__<<":"<<__LINE__<<" Halo extent is not 1, unsupported size."<>> ( + //update_velocity_block_content_lists_kernel<<>> ( update_velocity_block_content_lists_kernel<<>> ( populations[popID].dev_vmesh, populations[popID].dev_blockContainer, diff --git a/spatial_cell_gpu.hpp b/spatial_cell_gpu.hpp index 6e4ffe95e..60a842b1f 100644 --- a/spatial_cell_gpu.hpp +++ b/spatial_cell_gpu.hpp @@ -137,19 +137,17 @@ namespace spatial_cell { vmesh::VelocityBlockContainer *blockContainer, const Real factor ) { - const int gpuBlocks = gridDim.x; const int blocki = blockIdx.x; const int i = threadIdx.x; const int j = threadIdx.y; const int k = threadIdx.z; const uint ti = k*WID2 + j*WID + i; // loop over whole velocity space and scale the values - for (uint blockLID=blocki; blockLIDgetData(blockLID); - // Scale value - data[ti] = data[ti] * factor; - } + const uint blockLID = blocki; + // Pointer to target block data + Realf* data = blockContainer->getData(blockLID); + // Scale value + data[ti] = data[ti] * factor; } /** GPU kernel for adding a particle population to another with a scaling factor This kernel increments existing blocks, creates new ones if the block does @@ -379,11 +377,10 @@ namespace spatial_cell { } // Now loop over whole velocity space and scale the values vmesh::LocalID nBlocks = vmesh->size(); - const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks; gpuStream_t stream = gpu_getStream(); - if (nGpuBlocks > 0) { + if (nBlocks > 0) { dim3 block(WID,WID,WID); - population_scale_kernel<<>> ( + population_scale_kernel<<>> ( nBlocks, dev_vmesh, dev_blockContainer, @@ -410,8 +407,7 @@ namespace spatial_cell { CHK_ERR( gpuStreamSynchronize(stream) ); // Loop over the whole velocity space, and add scaled values with // a kernel. Addition of new blocks is not block-parallel-safe. - const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks; - if (nGpuBlocks > 0) { + if (nBlocks > 0) { dim3 block(WID,WID,WID); // Now serial population_increment_kernel<<<1, block, 0, stream>>> ( @@ -439,13 +435,13 @@ namespace spatial_cell { const fileReal* gpuInitBuffer, const uint nBlocks ) { - const int gpuBlocks = gridDim.x; const int blocki = blockIdx.x; //const int warpSize = blockDim.x*blockDim.y*blockDim.z; const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x; Real* parameters = blockContainer->getParameters(startLID); Realf *cellBlockData = blockContainer->getData(startLID); - for (uint index=blocki; indexdouble if necessary cellBlockData[index*WID3 + ti] = (Realf)gpuInitBuffer[index*WID3 + ti]; // Set block parameters @@ -1229,13 +1225,12 @@ namespace spatial_cell { CHK_ERR( gpuMemcpyAsync(gpuInitBlocks, blocks.data(), nBlocks*sizeof(vmesh::GlobalID), gpuMemcpyHostToDevice, stream) ); - const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks; - if (nGpuBlocks>0) { + if (nBlocks>0) { dim3 block(WID,WID,WID); // Third argument specifies the number of bytes in *shared memory* that is // dynamically allocated per block for this call in addition to the statically allocated memory. 
         CHK_ERR( gpuStreamSynchronize(stream) );
-         spatial_cell::add_blocks_from_buffer_kernel<<<nGpuBlocks, block, 0, stream>>> (
+         spatial_cell::add_blocks_from_buffer_kernel<<<nBlocks, block, 0, stream>>> (
            populations[popID].dev_vmesh,
            populations[popID].dev_blockContainer,
            startLID,
diff --git a/vlasovsolver/gpu_acc_map.cpp b/vlasovsolver/gpu_acc_map.cpp
index 9c2c45071..174062942 100644
--- a/vlasovsolver/gpu_acc_map.cpp
+++ b/vlasovsolver/gpu_acc_map.cpp
@@ -70,7 +70,6 @@ __global__ void __launch_bounds__(VECL,4) reorder_blocks_by_dimension_kernel(
    vmesh::VelocityBlockContainer *blockContainer,
    Vec *gpu_blockDataOrdered,
    uint *gpu_cell_indices_to_id,
-   uint totalColumns,
    vmesh::LocalID *gpu_LIDlist,
    ColumnOffsets* columnData
    ) {
@@ -79,14 +78,13 @@ __global__ void __launch_bounds__(VECL,4) reorder_blocks_by_dimension_kernel(
    // Works column-per-column and adds the necessary one empty block at each end
    const int nThreads = blockDim.x; // should be equal to VECL
    const int ti = threadIdx.x;
-   const int blocki = blockIdx.x;
-   const int gpuBlocks = gridDim.x;
+   const uint iColumn = blockIdx.x;
    Realf *gpu_blockData = blockContainer->getData();
    if (nThreads != VECL) {
      if (ti==0) printf("Warning! VECL not matching thread count for GPU kernel!\n");
   }
-   // Loop over columns in steps of gpuBlocks. Each gpuBlock deals with one column.
-   for (uint iColumn = blocki; iColumn < totalColumns; iColumn += gpuBlocks) {
+   // Each gpuBlock deals with one column.
+   {
      uint inputOffset = columnData->columnBlockOffsets[iColumn];
      uint outputOffset = (inputOffset + 2 * iColumn) * (WID3/VECL);
      uint columnLength = columnData->columnNumBlocks[iColumn];
@@ -125,7 +123,7 @@ __global__ void __launch_bounds__(VECL,4) reorder_blocks_by_dimension_kernel(
            gpu_blockDataOrdered[outputOffset + i_pcolumnv_gpu_b(jk, k, columnLength, columnLength)][ti] = 0.0;
         }
      }
-   } // end loop iColumn
+   } // end iColumn
    // Note: this kernel does not memset gpu_blockData to zero.
    // A separate memsetasync call is required for that.
 }
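Dropping the cap is safe because these direct launches lean on the device's grid-size limit rather than on tuning: on every GPU of compute capability 3.0 or newer the grid x-dimension may be up to 2^31 - 1 blocks, far above any realistic block or column count here. A standalone check, for reference:

   #include <cstdio>
   #include <cuda_runtime.h>

   int main() {
      cudaDeviceProp prop;
      cudaGetDeviceProperties(&prop, 0);
      // maxGridSize[0] is 2147483647 on all recent NVIDIA hardware.
      printf("max grid dims: %d x %d x %d\n",
             prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
      return 0;
   }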
@@ -207,7 +205,6 @@ __global__ void __launch_bounds__(GPUTHREADS,4) evaluate_column_extents_kernel(
    Realv dv,
    uint *bailout_flag
    ) {
-   const uint gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
    const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
    const uint blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
    const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
@@ -215,189 +212,188 @@ __global__ void __launch_bounds__(GPUTHREADS,4) evaluate_column_extents_kernel(
    // Shared within all threads in one block
    __shared__ int isTargetBlock[MAX_BLOCKS_PER_DIM];
    __shared__ int isSourceBlock[MAX_BLOCKS_PER_DIM];
-   for( uint setIndex=blocki; setIndex < gpu_columnData->setColumnOffsets.size(); setIndex += gpuBlocks) {
-      if (setIndex < gpu_columnData->setColumnOffsets.size()) {
-
-         // Clear flags used for this columnSet
-         for(uint tti = 0; tti < MAX_BLOCKS_PER_DIM; tti += warpSize ) {
-            const uint index = tti + ti;
-            if (index < MAX_BLOCKS_PER_DIM) {
-               isTargetBlock[index] = 0;
-               isSourceBlock[index] = 0;
-            }
+   const uint setIndex=blocki;
+   if (setIndex < gpu_columnData->setColumnOffsets.size()) {
+
+      // Clear flags used for this columnSet
+      for(uint tti = 0; tti < MAX_BLOCKS_PER_DIM; tti += warpSize ) {
+         const uint index = tti + ti;
+         if (index < MAX_BLOCKS_PER_DIM) {
+            isTargetBlock[index] = 0;
+            isSourceBlock[index] = 0;
+         }
+      }
+      __syncthreads();
+
+      /*need x,y coordinate of this column set of blocks, take it from first
+        block in first column*/
+      vmesh::LocalID setFirstBlockIndices0,setFirstBlockIndices1,setFirstBlockIndices2;
+      vmesh->getIndices(GIDlist[gpu_columnData->columnBlockOffsets[gpu_columnData->setColumnOffsets[setIndex]]],
+                        setFirstBlockIndices0, setFirstBlockIndices1, setFirstBlockIndices2);
+      swapBlockIndices(setFirstBlockIndices0,setFirstBlockIndices1,setFirstBlockIndices2,dimension);
+      /*compute the maximum starting point of the lagrangian (target) grid
+        (base level) within the 4 corner cells in this
+        block. Needed for computing maximum extent of target column*/
+
+      Realv max_intersectionMin = intersection +
+         (setFirstBlockIndices0 * WID + 0) * intersection_di +
+         (setFirstBlockIndices1 * WID + 0) * intersection_dj;
+      max_intersectionMin = std::max(max_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + 0) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
+      max_intersectionMin = std::max(max_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + 0) * intersection_dj);
+      max_intersectionMin = std::max(max_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
+
+      Realv min_intersectionMin = intersection +
+         (setFirstBlockIndices0 * WID + 0) * intersection_di +
+         (setFirstBlockIndices1 * WID + 0) * intersection_dj;
+      min_intersectionMin = std::min(min_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + 0) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
+      min_intersectionMin = std::min(min_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + 0) * intersection_dj);
+      min_intersectionMin = std::min(min_intersectionMin,
+                                     intersection +
+                                     (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
+                                     (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
+
+      //now, record which blocks are target blocks
+      for (uint columnIndex = gpu_columnData->setColumnOffsets[setIndex];
+           columnIndex < gpu_columnData->setColumnOffsets[setIndex] + gpu_columnData->setNumColumns[setIndex] ;
+           ++columnIndex) {
+         // Not parallelizing this at this level; not going to be many columns within a set
+
+         // Abort all threads if vector capacity bailout
+         if (bailout_flag[1] ) {
+            return;
+         }
-         }
-         __syncthreads();
-
-         /*need x,y coordinate of this column set of blocks, take it from first
-           block in first column*/
-         vmesh::LocalID setFirstBlockIndices0,setFirstBlockIndices1,setFirstBlockIndices2;
-         vmesh->getIndices(GIDlist[gpu_columnData->columnBlockOffsets[gpu_columnData->setColumnOffsets[setIndex]]],
-                           setFirstBlockIndices0, setFirstBlockIndices1, setFirstBlockIndices2);
-         swapBlockIndices(setFirstBlockIndices0,setFirstBlockIndices1,setFirstBlockIndices2,dimension);
-         /*compute the maximum starting point of the lagrangian (target) grid
-           (base level) within the 4 corner cells in this
-           block. Needed for computig maximum extent of target column*/
-
-         Realv max_intersectionMin = intersection +
-            (setFirstBlockIndices0 * WID + 0) * intersection_di +
-            (setFirstBlockIndices1 * WID + 0) * intersection_dj;
-         max_intersectionMin = std::max(max_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + 0) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
-         max_intersectionMin = std::max(max_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + 0) * intersection_dj);
-         max_intersectionMin = std::max(max_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
-
-         Realv min_intersectionMin = intersection +
-            (setFirstBlockIndices0 * WID + 0) * intersection_di +
-            (setFirstBlockIndices1 * WID + 0) * intersection_dj;
-         min_intersectionMin = std::min(min_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + 0) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
-         min_intersectionMin = std::min(min_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + 0) * intersection_dj);
-         min_intersectionMin = std::min(min_intersectionMin,
-                                        intersection +
-                                        (setFirstBlockIndices0 * WID + WID - 1) * intersection_di +
-                                        (setFirstBlockIndices1 * WID + WID - 1) * intersection_dj);
-
-         //now, record which blocks are target blocks
-         for (uint columnIndex = gpu_columnData->setColumnOffsets[setIndex];
-              columnIndex < gpu_columnData->setColumnOffsets[setIndex] + gpu_columnData->setNumColumns[setIndex] ;
-              ++columnIndex) {
-            // Not parallelizing this at this level; not going to be many columns within a set
-
-            // Abort all threads if vector capacity bailout
-            if (bailout_flag[1] ) {
-               return;
-            }
+         const vmesh::LocalID n_cblocks = gpu_columnData->columnNumBlocks[columnIndex];
+         vmesh::GlobalID* cblocks = GIDlist + gpu_columnData->columnBlockOffsets[columnIndex]; //column blocks
+         vmesh::LocalID firstBlockIndices0,firstBlockIndices1,firstBlockIndices2;
+         vmesh::LocalID lastBlockIndices0,lastBlockIndices1,lastBlockIndices2;
+         vmesh->getIndices(cblocks[0],
+                           firstBlockIndices0, firstBlockIndices1, firstBlockIndices2);
+         vmesh->getIndices(cblocks[n_cblocks -1],
+                           lastBlockIndices0, lastBlockIndices1, lastBlockIndices2);
+         swapBlockIndices(firstBlockIndices0,firstBlockIndices1,firstBlockIndices2, dimension);
+         swapBlockIndices(lastBlockIndices0,lastBlockIndices1,lastBlockIndices2, dimension);
+
+         /* firstBlockV is in z the minimum velocity value of the lower
+          * edge in source grid.
+          * lastBlockV is in z the maximum velocity value of the upper
+          * edge in source grid. */
+         Realv firstBlockMinV = (WID * firstBlockIndices2) * dv + v_min;
+         Realv lastBlockMaxV = (WID * (lastBlockIndices2 + 1)) * dv + v_min;
+
+         /* gk is now the k value in terms of cells in target
+            grid. This distance between max_intersectionMin (so lagrangian
+            plan, well max value here) and V of source grid, divided by
+            intersection_dk to find out how many grid cells that is*/
+         const int firstBlock_gk = (int)((firstBlockMinV - max_intersectionMin)/intersection_dk);
+         const int lastBlock_gk = (int)((lastBlockMaxV - min_intersectionMin)/intersection_dk);
+
+         int firstBlockIndexK = firstBlock_gk/WID;
+         int lastBlockIndexK = lastBlock_gk/WID;
+
+         // now enforce mesh limits for target column blocks (and check if we are
+         // too close to the velocity space boundaries)
+         firstBlockIndexK = (firstBlockIndexK >= 0) ? firstBlockIndexK : 0;
+         firstBlockIndexK = (firstBlockIndexK < max_v_length ) ? firstBlockIndexK : max_v_length - 1;
+         lastBlockIndexK = (lastBlockIndexK >= 0) ? lastBlockIndexK : 0;
+         lastBlockIndexK = (lastBlockIndexK < max_v_length ) ? lastBlockIndexK : max_v_length - 1;
+         if(firstBlockIndexK < bailout_velocity_space_wall_margin
+            || firstBlockIndexK >= max_v_length - bailout_velocity_space_wall_margin
+            || lastBlockIndexK < bailout_velocity_space_wall_margin
+            || lastBlockIndexK >= max_v_length - bailout_velocity_space_wall_margin
+            ) {
+            // Pass bailout (hitting the wall) flag back to host
+            if (ti==0) {
+               bailout_flag[0] = 1;
+            }
+         }
-            const vmesh::LocalID n_cblocks = gpu_columnData->columnNumBlocks[columnIndex];
-            vmesh::GlobalID* cblocks = GIDlist + gpu_columnData->columnBlockOffsets[columnIndex]; //column blocks
-            vmesh::LocalID firstBlockIndices0,firstBlockIndices1,firstBlockIndices2;
-            vmesh::LocalID lastBlockIndices0,lastBlockIndices1,lastBlockIndices2;
-            vmesh->getIndices(cblocks[0],
-                              firstBlockIndices0, firstBlockIndices1, firstBlockIndices2);
-            vmesh->getIndices(cblocks[n_cblocks -1],
-                              lastBlockIndices0, lastBlockIndices1, lastBlockIndices2);
-            swapBlockIndices(firstBlockIndices0,firstBlockIndices1,firstBlockIndices2, dimension);
-            swapBlockIndices(lastBlockIndices0,lastBlockIndices1,lastBlockIndices2, dimension);
-
-            /* firstBlockV is in z the minimum velocity value of the lower
-             * edge in source grid.
-             * lastBlockV is in z the maximum velocity value of the upper
-             * edge in source grid. */
-            Realv firstBlockMinV = (WID * firstBlockIndices2) * dv + v_min;
-            Realv lastBlockMaxV = (WID * (lastBlockIndices2 + 1)) * dv + v_min;
-
-            /* gk is now the k value in terms of cells in target
-               grid. This distance between max_intersectionMin (so lagrangian
-               plan, well max value here) and V of source grid, divided by
-               intersection_dk to find out how many grid cells that is*/
-            const int firstBlock_gk = (int)((firstBlockMinV - max_intersectionMin)/intersection_dk);
-            const int lastBlock_gk = (int)((lastBlockMaxV - min_intersectionMin)/intersection_dk);
-
-            int firstBlockIndexK = firstBlock_gk/WID;
-            int lastBlockIndexK = lastBlock_gk/WID;
-
-            // now enforce mesh limits for target column blocks (and check if we are
-            // too close to the velocity space boundaries)
-            firstBlockIndexK = (firstBlockIndexK >= 0) ? firstBlockIndexK : 0;
-            firstBlockIndexK = (firstBlockIndexK < max_v_length ) ? firstBlockIndexK : max_v_length - 1;
-            lastBlockIndexK = (lastBlockIndexK >= 0) ? lastBlockIndexK : 0;
-            lastBlockIndexK = (lastBlockIndexK < max_v_length ) ? lastBlockIndexK : max_v_length - 1;
-            if(firstBlockIndexK < bailout_velocity_space_wall_margin
-               || firstBlockIndexK >= max_v_length - bailout_velocity_space_wall_margin
-               || lastBlockIndexK < bailout_velocity_space_wall_margin
-               || lastBlockIndexK >= max_v_length - bailout_velocity_space_wall_margin
-               ) {
-               // Pass bailout (hitting the wall) flag back to host
-               if (ti==0) {
-                  bailout_flag[0] = 1;
-               }
-            }
-
-            //store source blocks
-            for (uint blockK = firstBlockIndices2; blockK <= lastBlockIndices2; blockK +=warpSize){
-               if ((blockK+ti) <= lastBlockIndices2) {
-                  //const int old = atomicAdd(&isSourceBlock[blockK+ti],1);
-                  isSourceBlock[blockK+ti] = 1; // Does not need to be atomic, as long as it's no longer zero
-               }
-            }
-            __syncthreads();
-
-            //store target blocks
-            for (uint blockK = (uint)firstBlockIndexK; blockK <= (uint)lastBlockIndexK; blockK+=warpSize){
-               if ((blockK+ti) <= (uint)lastBlockIndexK) {
-                  isTargetBlock[blockK+ti] = 1; // Does not need to be atomic, as long as it's no longer zero
-                  //const int old = atomicAdd(&isTargetBlock[blockK+ti],1);
-               }
-            }
-            __syncthreads();
-
-            if (ti==0) {
-               // Set columns' transverse coordinates
-               gpu_columns[columnIndex].i = setFirstBlockIndices0;
-               gpu_columns[columnIndex].j = setFirstBlockIndices1;
-               gpu_columns[columnIndex].kBegin = firstBlockIndices2;
-
-               //store also for each column firstBlockIndexK, and lastBlockIndexK
-               gpu_columns[columnIndex].minBlockK = firstBlockIndexK;
-               gpu_columns[columnIndex].maxBlockK = lastBlockIndexK;
-            }
-         } // end loop over columns in set
-         __syncthreads();
+         //store source blocks
+         for (uint blockK = firstBlockIndices2; blockK <= lastBlockIndices2; blockK +=warpSize){
+            if ((blockK+ti) <= lastBlockIndices2) {
+               //const int old = atomicAdd(&isSourceBlock[blockK+ti],1);
+               isSourceBlock[blockK+ti] = 1; // Does not need to be atomic, as long as it's no longer zero
+            }
+         }
+         __syncthreads();
+
+         //store target blocks
+         for (uint blockK = (uint)firstBlockIndexK; blockK <= (uint)lastBlockIndexK; blockK+=warpSize){
+            if ((blockK+ti) <= (uint)lastBlockIndexK) {
+               isTargetBlock[blockK+ti] = 1; // Does not need to be atomic, as long as it's no longer zero
+               //const int old = atomicAdd(&isTargetBlock[blockK+ti],1);
+            }
+         }
+         __syncthreads();
+
+         if (ti==0) {
+            // Set columns' transverse coordinates
+            gpu_columns[columnIndex].i = setFirstBlockIndices0;
+            gpu_columns[columnIndex].j = setFirstBlockIndices1;
+            gpu_columns[columnIndex].kBegin = firstBlockIndices2;
+
+            //store also for each column firstBlockIndexK, and lastBlockIndexK
+            gpu_columns[columnIndex].minBlockK = firstBlockIndexK;
+            gpu_columns[columnIndex].maxBlockK = lastBlockIndexK;
+         }
+      } // end loop over columns in set
+      __syncthreads();
-
-         for (uint blockT = 0; blockT < MAX_BLOCKS_PER_DIM; blockT +=warpSize) {
-            const uint blockK = blockT + ti;
-            if (blockK < MAX_BLOCKS_PER_DIM) {
-               if(isTargetBlock[blockK]!=0) {
-                  const int targetBlock =
-                     setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
-                     setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
-                     blockK * gpu_block_indices_to_id[2];
-                  dev_map_require->set_element(targetBlock,vmesh->getLocalID(targetBlock));
-                  // if(!BlocksRequired->device_push_back(targetBlock)) {
-                  //    bailout_flag[1]=1;
-                  //    return;
-                  // }
+
+      for (uint blockT = 0; blockT < MAX_BLOCKS_PER_DIM; blockT +=warpSize) {
+         const uint blockK = blockT + ti;
+         if (blockK < MAX_BLOCKS_PER_DIM) {
+            if(isTargetBlock[blockK]!=0) {
+               const int targetBlock =
+                  setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
+                  setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
+                  blockK * gpu_block_indices_to_id[2];
+               dev_map_require->set_element(targetBlock,vmesh->getLocalID(targetBlock));
+               // if(!BlocksRequired->device_push_back(targetBlock)) {
+               //    bailout_flag[1]=1;
+               //    return;
+               // }
+            }
+            if(isTargetBlock[blockK]!=0 && isSourceBlock[blockK]==0 ) {
+               const int targetBlock =
+                  setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
+                  setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
+                  blockK * gpu_block_indices_to_id[2];
+               if(!list_with_replace_new->device_push_back(targetBlock)) {
+                  bailout_flag[1]=1; // out of capacity
-               }
-               if(isTargetBlock[blockK]!=0 && isSourceBlock[blockK]==0 ) {
-                  const int targetBlock =
-                     setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
-                     setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
-                     blockK * gpu_block_indices_to_id[2];
-                  if(!list_with_replace_new->device_push_back(targetBlock)) {
-                     bailout_flag[1]=1; // out of capacity
-                  }
-               }
-               if(isTargetBlock[blockK]==0 && isSourceBlock[blockK]!=0 ) {
-                  const int targetBlock =
-                     setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
-                     setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
-                     blockK * gpu_block_indices_to_id[2];
-                  dev_map_remove->set_element(targetBlock,vmesh->getLocalID(targetBlock));
-                  // GPUTODO: could use device_insert to verify insertion, but not worth it
-                  // if(!list_to_replace->device_push_back(
-                  //    Hashinator::hash_pair(targetBlock,vmesh->getLocalID(targetBlock)))) {
-                  //    bailout_flag[2]=1; // out of capacity
-                  //    return;
-                  // }
-               }
-            } // block within MAX_BLOCKS_PER_DIM
-         } // loop over all potential blocks
-      } // if valid setIndez
-   } // for setIndexB
+               }
+            }
+            if(isTargetBlock[blockK]==0 && isSourceBlock[blockK]!=0 ) {
+               const int targetBlock =
+                  setFirstBlockIndices0 * gpu_block_indices_to_id[0] +
+                  setFirstBlockIndices1 * gpu_block_indices_to_id[1] +
+                  blockK * gpu_block_indices_to_id[2];
+               dev_map_remove->set_element(targetBlock,vmesh->getLocalID(targetBlock));
+               // GPUTODO: could use device_insert to verify insertion, but not worth it
+               // if(!list_to_replace->device_push_back(
+               //    Hashinator::hash_pair(targetBlock,vmesh->getLocalID(targetBlock)))) {
+               //    bailout_flag[2]=1; // out of capacity
+               //    return;
+               // }
+            }
+         } // block within MAX_BLOCKS_PER_DIM
+      } // loop over all potential blocks
+   } // if valid setIndex
 }

 __global__ void __launch_bounds__(VECL,4) acceleration_kernel(
@@ -424,7 +420,6 @@ __global__ void __launch_bounds__(VECL,4) acceleration_kernel(
    const uint w_tid = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
    Realf *gpu_blockData = blockContainer->getData();
-   //for (uint column = blocki; column < totalColumns; column += gpuBlocks) {
    const uint column = blocki;
    {
       /* New threading with each warp/wavefront working on one vector */
@@ -538,7 +533,7 @@ __global__ void __launch_bounds__(VECL,4) acceleration_kernel(
          } // for loop over target k-indices of current source block
       } // for-loop over source blocks
    } //for loop over j index
-   } // End loop over columns
+   } // End this column
 } // end semilag acc kernel

 /*
@@ -753,13 +748,11 @@ __host__ bool gpu_acc_map_1d(spatial_cell::SpatialCell* spatial_cell,
    //storeTimer.stop();

    //phiprof::Timer reorderTimer {"Reorder blocks by dimension"};
-   uint gpublocks = host_totalColumns > GPUBLOCKS ? GPUBLOCKS : host_totalColumns;
    // Launch kernels for transposing and ordering velocity space data into columns
-   reorder_blocks_by_dimension_kernel<<<gpublocks, VECL, 0, stream>>> (
+   reorder_blocks_by_dimension_kernel<<<host_totalColumns, VECL, 0, stream>>> (
       dev_blockContainer,
       gpu_blockDataOrdered[cpuThreadID],
       gpu_cell_indices_to_id[cpuThreadID],
-      host_totalColumns,
       LIDlist,
       columnData
    );
@@ -775,7 +768,7 @@ __host__ bool gpu_acc_map_1d(spatial_cell::SpatialCell* spatial_cell,
    map_remove->clear(Hashinator::targets::device,stream,std::pow(2,spatial_cell->vbwncl_sizePower));
    // Hashmap clear includes a stream sync
    //CHK_ERR( gpuStreamSynchronize(stream) );
-   evaluate_column_extents_kernel<<<gpublocks, GPUTHREADS, 0, stream>>> (
+   evaluate_column_extents_kernel<<<host_totalColumnSets, GPUTHREADS, 0, stream>>> (
       dimension,
       dev_vmesh,
       columnData,
diff --git a/vlasovsolver/gpu_acc_sort_blocks.cpp b/vlasovsolver/gpu_acc_sort_blocks.cpp
index 15c6682a7..fd103407d 100644
--- a/vlasovsolver/gpu_acc_sort_blocks.cpp
+++ b/vlasovsolver/gpu_acc_sort_blocks.cpp
@@ -56,16 +56,13 @@ __global__ void __launch_bounds__(GPUTHREADS,4) blocksID_mapped_dim0_kernel(
    vmesh::LocalID *blocksLID_unsorted,
    const uint nBlocks
    ) {
-   const int gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
    const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
    const int blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
    const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
-   for (vmesh::LocalID index=blocki*warpSize; index<nBlocks; index += gpuBlocks*warpSize) {
-      const vmesh::LocalID LID = index+ti;
-      if (LID < nBlocks) {
-         blocksID_mapped[LID] = vmesh->getGlobalID(LID);
-         blocksLID_unsorted[LID]=LID;
-      }
+   const vmesh::LocalID LID = blocki * warpSize + ti;
+   if (LID < nBlocks) {
+      blocksID_mapped[LID] = vmesh->getGlobalID(LID);
+      blocksLID_unsorted[LID]=LID;
    }
 }
@@ -75,22 +72,19 @@ __global__ void __launch_bounds__(GPUTHREADS,4) blocksID_mapped_dim1_kernel(
    vmesh::LocalID *blocksLID_unsorted,
    const uint nBlocks
    ) {
-   const int gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
    const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
    const int blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
    const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
    const vmesh::LocalID D0 = vmesh->getGridLength()[0];
    const vmesh::LocalID D1 = vmesh->getGridLength()[1];
    // const vmesh::LocalID D2 = vmesh->getGridLength()[2];
-   for (vmesh::LocalID index=blocki*warpSize; index<nBlocks; index += gpuBlocks*warpSize) {
-      const vmesh::LocalID LID = index+ti;
-      if (LID < nBlocks) {
-         const vmesh::GlobalID GID = vmesh->getGlobalID(LID);
-         const vmesh::LocalID x_index = GID % D0;
-         const vmesh::LocalID y_index = (GID / D0) % D1;
-         blocksID_mapped[LID] = GID - (x_index + y_index*D0) + y_index + x_index * D1;
-         blocksLID_unsorted[LID]=LID;
-      }
+   const vmesh::LocalID LID = blocki * warpSize + ti;
+   if (LID < nBlocks) {
+      const vmesh::GlobalID GID = vmesh->getGlobalID(LID);
+      const vmesh::LocalID x_index = GID % D0;
+      const vmesh::LocalID y_index = (GID / D0) % D1;
+      blocksID_mapped[LID] = GID - (x_index + y_index*D0) + y_index + x_index * D1;
+      blocksLID_unsorted[LID]=LID;
    }
 }
@@ -100,65 +94,40 @@ __global__ void __launch_bounds__(GPUTHREADS,4) blocksID_mapped_dim2_kernel(
    vmesh::LocalID *blocksLID_unsorted,
    const uint nBlocks
    ) {
-   const int gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
    const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
    const int blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
    const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
    const vmesh::LocalID D0 = vmesh->getGridLength()[0];
    const vmesh::LocalID D1 = vmesh->getGridLength()[1];
    const vmesh::LocalID D2 = vmesh->getGridLength()[2];
-   for (vmesh::LocalID index=blocki*warpSize; index<nBlocks; index += gpuBlocks*warpSize) {
-      const vmesh::LocalID LID = index+ti;
-      if (LID < nBlocks) {
-         const vmesh::GlobalID GID = vmesh->getGlobalID(LID);
-         const vmesh::LocalID x_index = GID % D0;
-         const vmesh::LocalID y_index = (GID / D0) % D1;
-         const vmesh::LocalID z_index = (GID / (D0*D1));
-         blocksID_mapped[LID] = z_index + y_index*D2 + x_index*D1*D2;
-         blocksLID_unsorted[LID]=LID;
-      }
+   const vmesh::LocalID LID = blocki * warpSize + ti;
+   if (LID < nBlocks) {
+      const vmesh::GlobalID GID = vmesh->getGlobalID(LID);
+      const vmesh::LocalID x_index = GID % D0;
+      const vmesh::LocalID y_index = (GID / D0) % D1;
+      const vmesh::LocalID z_index = (GID / (D0*D1));
+      blocksID_mapped[LID] = z_index + y_index*D2 + x_index*D1*D2;
+      blocksLID_unsorted[LID]=LID;
    }
 }

 // LIDs are already in order.
 // Now also order GIDS. (can be ridiculously parallel, minus memory access patterns)
+// also used for scanning columnsets for block counts
 __global__ void __launch_bounds__(GPUTHREADS,4) order_GIDs_kernel(
    const vmesh::VelocityMesh* vmesh,
    vmesh::GlobalID *blocksLID,
    vmesh::GlobalID *blocksGID,
-   const uint nBlocks,
-   ColumnOffsets* columnData // passed just for resetting
-   ) {
-   const int gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
-   const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
-   const int blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
-   const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
-   for (vmesh::LocalID index=blocki*warpSize; index<nBlocks; index += gpuBlocks*warpSize) {
-      const vmesh::LocalID i = index+ti;
-      if (i < nBlocks) {
-         blocksGID[i]=vmesh->getGlobalID(blocksLID[i]);
-      }
-   }
-   if (blockIdx.x == blockIdx.y == blockIdx.z == threadIdx.x == threadIdx.y == threadIdx.z == 0) {
-      columnData->columnBlockOffsets.clear();
-      columnData->columnNumBlocks.clear();
-      columnData->setColumnOffsets.clear();
-      columnData->setNumColumns.clear();
-   }
-}
-
-// Kernel for scanning columnsets for block counts
-__global__ void __launch_bounds__(GPUTHREADS,4) scan_blocks_for_columns_kernel(
-   const vmesh::VelocityMesh* vmesh,
    const uint dimension,
    vmesh::GlobalID *blocksID_mapped_sorted,
    vmesh::LocalID *gpu_columnNBlocks,
-   const uint nBlocks
+   const uint nBlocks,
+   ColumnOffsets* columnData // passed just for resetting
    ) {
-   const int gpuBlocks = gridDim.x * gridDim.y * gridDim.z;
    const uint warpSize = blockDim.x * blockDim.y * blockDim.z;
    const int blocki = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x;
    const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
+
    vmesh::LocalID DX;
    switch (dimension) {
       case 0:
@@ -173,21 +142,29 @@ __global__ void __launch_bounds__(GPUTHREADS,4) scan_blocks_for_columns_kernel(
       default:
         printf("Incorrect dimension in __FILE__ __LINE__\n");
   }
-   for (vmesh::LocalID LID=blocki*warpSize; LID<nBlocks; LID += gpuBlocks*warpSize) {
-      const vmesh::LocalID i = LID+ti;
-      if (i < nBlocks) {
-         const vmesh::LocalID column_id = blocksID_mapped_sorted[i] / DX;
-         // Increment number of blocks in column
-         const vmesh::LocalID old = atomicAdd(&gpu_columnNBlocks[column_id],1);
-      }
-   }
+   const vmesh::LocalID index = blocki * warpSize + ti;
+   if (index < nBlocks) {
+      blocksGID[index]=vmesh->getGlobalID(blocksLID[index]);
+
+      const vmesh::LocalID column_id = blocksID_mapped_sorted[index] / DX;
+      // Increment number of blocks in column
+      const vmesh::LocalID old = atomicAdd(&gpu_columnNBlocks[column_id],1);
+      // // Evaluate smallest GID in column
+      // old = atomicMin(&columnMinBlock[columnid],GID);
+      // // Evaluate largest GID in column
+      // old = atomicMax(&columnMaxBlock[columnid],GID);
+   }
+   if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
+      columnData->columnBlockOffsets.clear();
+      columnData->columnNumBlocks.clear();
+      columnData->setColumnOffsets.clear();
+      columnData->setNumColumns.clear();
+   }
 }

 /*** Kernel for constructing columns
-   Checks if all blocks in a columnset belong to a single kernel, and
+   Checks if all blocks in a columnset belong to a single column, and
    can quickly jump through the whole column.
    For columnsets containing several columns, it trials blocks by
    scanning warpSize GIDs at a time.
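The hunk above folds the old scan_blocks_for_columns_kernel into order_GIDs_kernel: one thread per sorted block both gathers the GID into sorted order and bumps a per-column counter with an atomic. A self-contained sketch of the fused pattern, with hypothetical names (orderAndCount, countPerColumn), not the kernel's real signature:

   __global__ void orderAndCount(const unsigned *sortedLIDs, const unsigned *gidOfLID,
                                 unsigned *gidsOut, const unsigned *mappedSorted,
                                 unsigned *countPerColumn, unsigned nBlocks, unsigned DX) {
      const unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < nBlocks) {
         gidsOut[i] = gidOfLID[sortedLIDs[i]];                 // gather GIDs into sorted order
         atomicAdd(&countPerColumn[mappedSorted[i] / DX], 1u); // histogram blocks per column
      }
   }

One caution on the launch-size idiom used below: launchBlocks = 1 + ((nBlocks - 1) / maxThreads) is a correct ceiling division only for nBlocks > 0; with unsigned arithmetic, nBlocks == 0 underflows to a huge grid, so callers are assumed to guarantee a non-empty block list.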
@@ -346,14 +323,17 @@ void sortBlocklistByDimension( //const spatial_cell::SpatialCell* spatial_cell,
    gpuStream_t stream
    ) {
-   // Ensure at least one launch block
-   uint nGpuBlocks = (nBlocks/GPUTHREADS) > GPUBLOCKS ? GPUBLOCKS : std::ceil((Real)nBlocks/(Real)GPUTHREADS);
+   // Ensure at least one launch block, ceil int division
+   //const uint maxThreads = WARPSPERBLOCK*GPUTHREADS;
+   // For some reason, a direct increase of launch threads breaks things. Stick with GPUTHREADS for now.
+   const uint maxThreads = GPUTHREADS;
+   const uint launchBlocks = 1 + ((nBlocks - 1) / (maxThreads));

    //phiprof::Timer calcTimer {"calc new dimension id"};
    // Map blocks to new dimensionality
    switch( dimension ) {
       case 0: {
-         blocksID_mapped_dim0_kernel<<<nGpuBlocks, GPUTHREADS, 0, stream>>> (
+         blocksID_mapped_dim0_kernel<<<launchBlocks, maxThreads, 0, stream>>> (
            vmesh,
            blocksID_mapped,
            blocksLID_unsorted,
@@ -362,7 +342,7 @@ void sortBlocklistByDimension( //const spatial_cell::SpatialCell* spatial_cell,
         break;
      }
      case 1: {
-         blocksID_mapped_dim1_kernel<<<nGpuBlocks, GPUTHREADS, 0, stream>>> (
+         blocksID_mapped_dim1_kernel<<<launchBlocks, maxThreads, 0, stream>>> (
            vmesh,
            blocksID_mapped,
            blocksLID_unsorted,
@@ -371,7 +351,7 @@ void sortBlocklistByDimension( //const spatial_cell::SpatialCell* spatial_cell,
         break;
      }
      case 2: {
-         blocksID_mapped_dim2_kernel<<<nGpuBlocks, GPUTHREADS, 0, stream>>> (
+         blocksID_mapped_dim2_kernel<<<launchBlocks, maxThreads, 0, stream>>> (
            vmesh,
            blocksID_mapped,
            blocksLID_unsorted,
@@ -428,31 +408,28 @@ void sortBlocklistByDimension( //const spatial_cell::SpatialCell* spatial_cell,
    sortTimer.stop();

    // Gather GIDs in order
-   //phiprof::Timer reorderTimer {"reorder GIDs"};
-   order_GIDs_kernel<<<nGpuBlocks, GPUTHREADS, 0, stream>>> (
+   //phiprof::Timer reorderTimer {"reorder GIDs"}; // and scan for column block counts
+   order_GIDs_kernel<<<launchBlocks, maxThreads, 0, stream>>> (
       vmesh,
       blocksLID,
       blocksGID,
-      nBlocks,
-      columnData // Pass this just to clear it on device
-   );
-   CHK_ERR( gpuPeekAtLastError() );
-   //reorderTimer.stop();
-
-   //phiprof::Timer scanTimer {"scan for column block counts"};
-   scan_blocks_for_columns_kernel<<<nGpuBlocks, GPUTHREADS, 0, stream>>> (
-      vmesh,
       dimension,
       blocksID_mapped,
       gpu_columnNBlocks,
-      nBlocks
+      nBlocks,
+      columnData // Pass this just to clear it on device
    );
    CHK_ERR( gpuPeekAtLastError() );
-   //scanTimer.stop();
+   //reorderTimer.stop();

    //phiprof::Timer constructTimer {"construct columns"};
    // Construct columns. To ensure order,
    // these are done serially, but still form within a kernel.
+   // Optimizing launch threads for this kernel as well needs a bit more checking
+   // uint columnThreads = maxThreads;
+   // columnThreads = (vmesh->getGridLength()[0]) < columnThreads ? vmesh->getGridLength()[0] : columnThreads;
+   // columnThreads = (vmesh->getGridLength()[1]) < columnThreads ? vmesh->getGridLength()[1] : columnThreads;
+   // columnThreads = (vmesh->getGridLength()[2]) < columnThreads ? vmesh->getGridLength()[2] : columnThreads;
    construct_columns_kernel<<<1, GPUTHREADS, 0, stream>>> (
       vmesh,
       dimension,