Commit

Purged config references to the GPUBLOCKS / CUDABLOCKS parameter and skipped it in the code. Merged two small kernels in acceleration
markusbattarbee committed Jun 4, 2024
1 parent 236741f commit e700da3
Showing 9 changed files with 250 additions and 299 deletions.
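For orientation before the per-file diffs: prior to this commit each GPU build configured a fixed launch grid (CUDABLOCKS=108 in the Makefiles, exposed to the code as the GPUBLOCKS macro) and launch sites clamped their grid to it; after this commit the grids are sized directly from the amount of work. The following before/after sketch of the host-side pattern is illustrative only — dummy_kernel, nBlocks and the wrapper function are placeholders, not Vlasiator code:

#include <cuda_runtime.h>

__global__ void dummy_kernel(unsigned nBlocks) { (void)nBlocks; /* stand-in for a Vlasov solver kernel */ }

void launch_before_and_after(unsigned nBlocks, dim3 block, cudaStream_t stream) {
   // Before: grid clamped to the compile-time GPUBLOCKS constant (CUDABLOCKS=108),
   // with kernels expected to grid-stride over any remaining work.
   const unsigned GPUBLOCKS_OLD = 108;
   const unsigned nGpuBlocks = nBlocks > GPUBLOCKS_OLD ? GPUBLOCKS_OLD : nBlocks;
   if (nGpuBlocks > 0) {
      dummy_kernel<<<nGpuBlocks, block, 0, stream>>>(nBlocks);
   }

   // After: one GPU block per unit of work; no build-time grid-size knob needed.
   if (nBlocks > 0) {
      dummy_kernel<<<nBlocks, block, 0, stream>>>(nBlocks);
   }
}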
1 change: 0 additions & 1 deletion MAKE/Makefile.Freezer_cuda
@@ -35,7 +35,6 @@ VECL=16
# LDFLAGS flags for linker

USE_CUDA=1
-CUDABLOCKS=108

# Tell mpic++ to use nvcc for all compiling
CMP = OMPI_CXX='nvcc' OMPI_CXXFLAGS='' mpic++
1 change: 0 additions & 1 deletion MAKE/Makefile.mahti_cuda
@@ -33,7 +33,6 @@ VECL=16
# mpi.h in c++ on Cray

USE_CUDA=1
-CUDABLOCKS=108

#-ggdb not available on nvcc
#-G (device debug) overrides --generate-line-info -line-info but also requires more device-side resources to run
1 change: 0 additions & 1 deletion MAKE/Makefile.puhti_gcc_cuda
@@ -45,7 +45,6 @@ VECL=64
# mpi.h in c++ on Cray

USE_CUDA=1
-CUDABLOCKS=108
CMP = mpic++
LNK = mpic++
NVCC = nvcc
5 changes: 0 additions & 5 deletions arch/arch_device_cuda.h
@@ -100,11 +100,6 @@
#define ARCH_BLOCKSIZE_R 512
#define ARCH_BLOCKSIZE_R_SMALL 32

-/* GPU blocksize used by Vlasov solvers */
-#ifndef GPUBLOCKS
-# define GPUBLOCKS (108)
-#endif

/* values used by kernels */
#ifndef GPUTHREADS
#define GPUTHREADS (32)
5 changes: 0 additions & 5 deletions arch/arch_device_hip.h
@@ -97,11 +97,6 @@
#define ARCH_BLOCKSIZE_R 512
#define ARCH_BLOCKSIZE_R_SMALL 64

-/* GPU blocksize used by Vlasov solvers */
-#ifndef GPUBLOCKS
-# define GPUBLOCKS (108)
-#endif

/* values used by kernels */
#ifndef GPUTHREADS
#define GPUTHREADS (64)
3 changes: 1 addition & 2 deletions spatial_cell_gpu.cpp
@@ -906,7 +906,6 @@ namespace spatial_cell {
// Evaluate velocity halo for local content blocks
if (velocity_block_with_content_list_size>0) {
//phiprof::Timer blockHaloTimer {"Block halo kernel"};
-//nGpuBlocks = velocity_block_with_content_list_size > GPUBLOCKS ? GPUBLOCKS : velocity_block_with_content_list_size;
const int addWidthV = getObjectWrapper().particleSpecies[popID].sparseBlockAddWidthV;
if (addWidthV!=1) {
std::cerr<<"Warning! "<<__FILE__<<":"<<__LINE__<<" Halo extent is not 1, unsupported size."<<std::endl;
@@ -1237,7 +1236,7 @@ namespace spatial_cell {

// Third argument specifies the number of bytes in *shared memory* that is
// dynamically allocated per block for this call in addition to the statically allocated memory.
-//update_velocity_block_content_lists_kernel<<<GPUBLOCKS, WID3, WID3*sizeof(int), stream>>> (
+//update_velocity_block_content_lists_kernel<<<launchBlocks, WID3, WID3*sizeof(int), stream>>> (
update_velocity_block_content_lists_kernel<<<launchBlocks, (vlasiBlocksPerWorkUnit * WID3), 0, stream>>> (
populations[popID].dev_vmesh,
populations[popID].dev_blockContainer,
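The comment kept in the hunk above ("Third argument specifies the number of bytes in *shared memory* ...") refers to CUDA's launch-configuration syntax, where the third <<<...>>> argument requests dynamically allocated shared memory per block; the updated launch passes 0, i.e. none. A minimal, self-contained illustration of that mechanism — the kernel and names below are invented for the example, not taken from Vlasiator:

#include <cuda_runtime.h>

// Dynamic shared memory is declared extern and unsized inside the kernel;
// its actual size (in bytes) is supplied by the third launch argument.
__global__ void block_sum(const int* in, int* out, int n) {
   extern __shared__ int scratch[];
   const int tid = threadIdx.x;
   scratch[tid] = (tid < n) ? in[tid] : 0;
   __syncthreads();
   if (tid == 0) {
      int total = 0;
      for (int i = 0; i < blockDim.x; ++i) total += scratch[i];
      *out = total;
   }
}

// Requests blockDim.x * sizeof(int) bytes of dynamic shared memory per block:
//    block_sum<<<1, 64, 64 * sizeof(int), stream>>>(dev_in, dev_out, 64);
// Passing 0 instead (as the updated launch above does) requests no dynamic shared memory.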
29 changes: 12 additions & 17 deletions spatial_cell_gpu.hpp
@@ -137,19 +137,17 @@ namespace spatial_cell {
vmesh::VelocityBlockContainer *blockContainer,
const Real factor
) {
-const int gpuBlocks = gridDim.x;
const int blocki = blockIdx.x;
const int i = threadIdx.x;
const int j = threadIdx.y;
const int k = threadIdx.z;
const uint ti = k*WID2 + j*WID + i;
// loop over whole velocity space and scale the values
-for (uint blockLID=blocki; blockLID<nBlocks; blockLID += gpuBlocks) {
-// Pointer to target block data
-Realf* data = blockContainer->getData(blockLID);
-// Scale value
-data[ti] = data[ti] * factor;
-}
+const uint blockLID = blocki;
+// Pointer to target block data
+Realf* data = blockContainer->getData(blockLID);
+// Scale value
+data[ti] = data[ti] * factor;
}
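The population_scale_kernel hunk above shows the device side of this change: the grid-stride loop over nBlocks is removed and each CUDA block scales exactly one velocity block, relying on the host launching one block per blockLID. A standalone sketch of that before/after pattern, using plain float data rather than Vlasiator's block container:

#include <cuda_runtime.h>

// Before: a fixed-size grid, each block striding over all items.
__global__ void scale_strided(float* data, unsigned nItems, unsigned itemSize, float factor) {
   for (unsigned lid = blockIdx.x; lid < nItems; lid += gridDim.x) {
      data[lid * itemSize + threadIdx.x] *= factor;
   }
}

// After: the host launches exactly nItems blocks, so blockIdx.x is the item index.
__global__ void scale_direct(float* data, unsigned itemSize, float factor) {
   data[blockIdx.x * itemSize + threadIdx.x] *= factor;
}

// scale_strided<<<108,    itemSize, 0, stream>>>(data, nItems, itemSize, factor);  // old
// scale_direct <<<nItems, itemSize, 0, stream>>>(data, itemSize, factor);          // new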
/** GPU kernel for adding a particle population to another with a scaling factor
This kernel increments existing blocks, creates new ones if the block does
@@ -379,11 +377,10 @@
}
// Now loop over whole velocity space and scale the values
vmesh::LocalID nBlocks = vmesh->size();
-const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks;
gpuStream_t stream = gpu_getStream();
-if (nGpuBlocks > 0) {
+if (nBlocks > 0) {
dim3 block(WID,WID,WID);
-population_scale_kernel<<<nGpuBlocks, block, 0, stream>>> (
+population_scale_kernel<<<nBlocks, block, 0, stream>>> (
nBlocks,
dev_vmesh,
dev_blockContainer,
Expand All @@ -410,8 +407,7 @@ namespace spatial_cell {
CHK_ERR( gpuStreamSynchronize(stream) );
// Loop over the whole velocity space, and add scaled values with
// a kernel. Addition of new blocks is not block-parallel-safe.
-const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks;
-if (nGpuBlocks > 0) {
+if (nBlocks > 0) {
dim3 block(WID,WID,WID);
// Now serial
population_increment_kernel<<<1, block, 0, stream>>> (
@@ -439,13 +435,13 @@
const fileReal* gpuInitBuffer,
const uint nBlocks
) {
-const int gpuBlocks = gridDim.x;
const int blocki = blockIdx.x;
//const int warpSize = blockDim.x*blockDim.y*blockDim.z;
const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
Real* parameters = blockContainer->getParameters(startLID);
Realf *cellBlockData = blockContainer->getData(startLID);
-for (uint index=blocki; index<nBlocks; index += gpuBlocks) {
+const uint index = blocki;
+{
// Copy in cell data, perform conversion float<->double if necessary
cellBlockData[index*WID3 + ti] = (Realf)gpuInitBuffer[index*WID3 + ti];
// Set block parameters
@@ -1229,13 +1225,12 @@
CHK_ERR( gpuMemcpyAsync(gpuInitBlocks, blocks.data(),
nBlocks*sizeof(vmesh::GlobalID), gpuMemcpyHostToDevice, stream) );

-const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks;
-if (nGpuBlocks>0) {
+if (nBlocks>0) {
dim3 block(WID,WID,WID);
// Third argument specifies the number of bytes in *shared memory* that is
// dynamically allocated per block for this call in addition to the statically allocated memory.
CHK_ERR( gpuStreamSynchronize(stream) );
-spatial_cell::add_blocks_from_buffer_kernel<<<nGpuBlocks, block, 0, stream>>> (
+spatial_cell::add_blocks_from_buffer_kernel<<<nBlocks, block, 0, stream>>> (
populations[popID].dev_vmesh,
populations[popID].dev_blockContainer,
startLID,
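Earlier in this file's diff, population_increment_kernel is launched with a grid of exactly one block ("Now serial") because, as the comment there notes, adding new velocity blocks is not block-parallel-safe. A hedged sketch of that trade-off — the kernel below is illustrative and does not reproduce the real population_increment_kernel:

#include <cuda_runtime.h>

// When independent CUDA blocks could race while growing a shared structure,
// one safe option is to launch a single block: only its threads cooperate,
// serialising the unsafe part at the cost of block-level parallelism.
__global__ void increment_serial(float* dst, const float* src,
                                 unsigned nItems, unsigned itemSize, float factor) {
   const unsigned ti = threadIdx.x;
   for (unsigned lid = 0; lid < nItems; ++lid) {   // block 0 walks every item
      dst[lid * itemSize + ti] += factor * src[lid * itemSize + ti];
   }
}

// increment_serial<<<1, itemSize, 0, stream>>>(dst, src, nItems, itemSize, factor);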
(Diffs for the 2 remaining changed files are not shown here.)
