Commit

Purged config references to the GPUBLOCKS / CUDABLOCKS parameter and skipped it in the code. Merged two small kernels in acceleration
markusbattarbee committed Jun 4, 2024
1 parent 236741f commit e700da3
Showing 9 changed files with 250 additions and 299 deletions.
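For orientation before the per-file diffs: prior to this commit each GPU build configured a fixed launch grid (CUDABLOCKS=108 in the Makefiles, exposed to the code as the GPUBLOCKS macro) and launch sites clamped their grid to it; after this commit the grids are sized directly from the amount of work. The following before/after sketch of the host-side pattern is illustrative only — dummy_kernel, nBlocks and the wrapper function are placeholders, not Vlasiator code:

#include <cuda_runtime.h>

__global__ void dummy_kernel(unsigned nBlocks) { (void)nBlocks; /* stand-in for a Vlasov solver kernel */ }

void launch_before_and_after(unsigned nBlocks, dim3 block, cudaStream_t stream) {
   // Before: grid clamped to the compile-time GPUBLOCKS constant (CUDABLOCKS=108),
   // with kernels expected to grid-stride over any remaining work.
   const unsigned GPUBLOCKS_OLD = 108;
   const unsigned nGpuBlocks = nBlocks > GPUBLOCKS_OLD ? GPUBLOCKS_OLD : nBlocks;
   if (nGpuBlocks > 0) {
      dummy_kernel<<<nGpuBlocks, block, 0, stream>>>(nBlocks);
   }

   // After: one GPU block per unit of work; no build-time grid-size knob needed.
   if (nBlocks > 0) {
      dummy_kernel<<<nBlocks, block, 0, stream>>>(nBlocks);
   }
}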
1 change: 0 additions & 1 deletion MAKE/Makefile.Freezer_cuda
@@ -35,7 +35,6 @@ VECL=16
# LDFLAGS flags for linker

USE_CUDA=1
-CUDABLOCKS=108

# Tell mpic++ to use nvcc for all compiling
CMP = OMPI_CXX='nvcc' OMPI_CXXFLAGS='' mpic++
1 change: 0 additions & 1 deletion MAKE/Makefile.mahti_cuda
@@ -33,7 +33,6 @@ VECL=16
# mpi.h in c++ on Cray

USE_CUDA=1
-CUDABLOCKS=108

#-ggdb not available on nvcc
#-G (device debug) overrides --generate-line-info -line-info but also requires more device-side resources to run
1 change: 0 additions & 1 deletion MAKE/Makefile.puhti_gcc_cuda
@@ -45,7 +45,6 @@ VECL=64
# mpi.h in c++ on Cray

USE_CUDA=1
-CUDABLOCKS=108
CMP = mpic++
LNK = mpic++
NVCC = nvcc
5 changes: 0 additions & 5 deletions arch/arch_device_cuda.h
@@ -100,11 +100,6 @@
#define ARCH_BLOCKSIZE_R 512
#define ARCH_BLOCKSIZE_R_SMALL 32

-/* GPU blocksize used by Vlasov solvers */
-#ifndef GPUBLOCKS
-# define GPUBLOCKS (108)
-#endif

/* values used by kernels */
#ifndef GPUTHREADS
#define GPUTHREADS (32)
5 changes: 0 additions & 5 deletions arch/arch_device_hip.h
@@ -97,11 +97,6 @@
#define ARCH_BLOCKSIZE_R 512
#define ARCH_BLOCKSIZE_R_SMALL 64

-/* GPU blocksize used by Vlasov solvers */
-#ifndef GPUBLOCKS
-# define GPUBLOCKS (108)
-#endif

/* values used by kernels */
#ifndef GPUTHREADS
#define GPUTHREADS (64)
3 changes: 1 addition & 2 deletions spatial_cell_gpu.cpp
@@ -906,7 +906,6 @@ namespace spatial_cell {
// Evaluate velocity halo for local content blocks
if (velocity_block_with_content_list_size>0) {
//phiprof::Timer blockHaloTimer {"Block halo kernel"};
-//nGpuBlocks = velocity_block_with_content_list_size > GPUBLOCKS ? GPUBLOCKS : velocity_block_with_content_list_size;
const int addWidthV = getObjectWrapper().particleSpecies[popID].sparseBlockAddWidthV;
if (addWidthV!=1) {
std::cerr<<"Warning! "<<__FILE__<<":"<<__LINE__<<" Halo extent is not 1, unsupported size."<<std::endl;
@@ -1237,7 +1236,7 @@ namespace spatial_cell {

// Third argument specifies the number of bytes in *shared memory* that is
// dynamically allocated per block for this call in addition to the statically allocated memory.
-//update_velocity_block_content_lists_kernel<<<GPUBLOCKS, WID3, WID3*sizeof(int), stream>>> (
+//update_velocity_block_content_lists_kernel<<<launchBlocks, WID3, WID3*sizeof(int), stream>>> (
update_velocity_block_content_lists_kernel<<<launchBlocks, (vlasiBlocksPerWorkUnit * WID3), 0, stream>>> (
populations[popID].dev_vmesh,
populations[popID].dev_blockContainer,
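The comment kept in the hunk above ("Third argument specifies the number of bytes in *shared memory* ...") refers to CUDA's launch-configuration syntax, where the third <<<...>>> argument requests dynamically allocated shared memory per block; the updated launch passes 0, i.e. none. A minimal, self-contained illustration of that mechanism — the kernel and names below are invented for the example, not taken from Vlasiator:

#include <cuda_runtime.h>

// Dynamic shared memory is declared extern and unsized inside the kernel;
// its actual size (in bytes) is supplied by the third launch argument.
__global__ void block_sum(const int* in, int* out, int n) {
   extern __shared__ int scratch[];
   const int tid = threadIdx.x;
   scratch[tid] = (tid < n) ? in[tid] : 0;
   __syncthreads();
   if (tid == 0) {
      int total = 0;
      for (int i = 0; i < blockDim.x; ++i) total += scratch[i];
      *out = total;
   }
}

// Requests blockDim.x * sizeof(int) bytes of dynamic shared memory per block:
//    block_sum<<<1, 64, 64 * sizeof(int), stream>>>(dev_in, dev_out, 64);
// Passing 0 instead (as the updated launch above does) requests no dynamic shared memory.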
29 changes: 12 additions & 17 deletions spatial_cell_gpu.hpp
@@ -137,19 +137,17 @@ namespace spatial_cell {
vmesh::VelocityBlockContainer *blockContainer,
const Real factor
) {
-const int gpuBlocks = gridDim.x;
const int blocki = blockIdx.x;
const int i = threadIdx.x;
const int j = threadIdx.y;
const int k = threadIdx.z;
const uint ti = k*WID2 + j*WID + i;
// loop over whole velocity space and scale the values
-for (uint blockLID=blocki; blockLID<nBlocks; blockLID += gpuBlocks) {
-// Pointer to target block data
-Realf* data = blockContainer->getData(blockLID);
-// Scale value
-data[ti] = data[ti] * factor;
-}
+const uint blockLID = blocki;
+// Pointer to target block data
+Realf* data = blockContainer->getData(blockLID);
+// Scale value
+data[ti] = data[ti] * factor;
}
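The population_scale_kernel hunk above shows the device side of this change: the grid-stride loop over nBlocks is removed and each CUDA block scales exactly one velocity block, relying on the host launching one block per blockLID. A standalone sketch of that before/after pattern, using plain float data rather than Vlasiator's block container:

#include <cuda_runtime.h>

// Before: a fixed-size grid, each block striding over all items.
__global__ void scale_strided(float* data, unsigned nItems, unsigned itemSize, float factor) {
   for (unsigned lid = blockIdx.x; lid < nItems; lid += gridDim.x) {
      data[lid * itemSize + threadIdx.x] *= factor;
   }
}

// After: the host launches exactly nItems blocks, so blockIdx.x is the item index.
__global__ void scale_direct(float* data, unsigned itemSize, float factor) {
   data[blockIdx.x * itemSize + threadIdx.x] *= factor;
}

// scale_strided<<<108,    itemSize, 0, stream>>>(data, nItems, itemSize, factor);  // old
// scale_direct <<<nItems, itemSize, 0, stream>>>(data, itemSize, factor);          // new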
/** GPU kernel for adding a particle population to another with a scaling factor
This kernel increments existing blocks, creates new ones if the block does
@@ -379,11 +377,10 @@
}
// Now loop over whole velocity space and scale the values
vmesh::LocalID nBlocks = vmesh->size();
-const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks;
gpuStream_t stream = gpu_getStream();
-if (nGpuBlocks > 0) {
+if (nBlocks > 0) {
dim3 block(WID,WID,WID);
-population_scale_kernel<<<nGpuBlocks, block, 0, stream>>> (
+population_scale_kernel<<<nBlocks, block, 0, stream>>> (
nBlocks,
dev_vmesh,
dev_blockContainer,
Expand All @@ -410,8 +407,7 @@ namespace spatial_cell {
CHK_ERR( gpuStreamSynchronize(stream) );
// Loop over the whole velocity space, and add scaled values with
// a kernel. Addition of new blocks is not block-parallel-safe.
-const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks;
-if (nGpuBlocks > 0) {
+if (nBlocks > 0) {
dim3 block(WID,WID,WID);
// Now serial
population_increment_kernel<<<1, block, 0, stream>>> (
@@ -439,13 +435,13 @@
const fileReal* gpuInitBuffer,
const uint nBlocks
) {
-const int gpuBlocks = gridDim.x;
const int blocki = blockIdx.x;
//const int warpSize = blockDim.x*blockDim.y*blockDim.z;
const uint ti = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x;
Real* parameters = blockContainer->getParameters(startLID);
Realf *cellBlockData = blockContainer->getData(startLID);
-for (uint index=blocki; index<nBlocks; index += gpuBlocks) {
+const uint index = blocki;
+{
// Copy in cell data, perform conversion float<->double if necessary
cellBlockData[index*WID3 + ti] = (Realf)gpuInitBuffer[index*WID3 + ti];
// Set block parameters
@@ -1229,13 +1225,12 @@
CHK_ERR( gpuMemcpyAsync(gpuInitBlocks, blocks.data(),
nBlocks*sizeof(vmesh::GlobalID), gpuMemcpyHostToDevice, stream) );

-const uint nGpuBlocks = nBlocks > GPUBLOCKS ? GPUBLOCKS : nBlocks;
-if (nGpuBlocks>0) {
+if (nBlocks>0) {
dim3 block(WID,WID,WID);
// Third argument specifies the number of bytes in *shared memory* that is
// dynamically allocated per block for this call in addition to the statically allocated memory.
CHK_ERR( gpuStreamSynchronize(stream) );
-spatial_cell::add_blocks_from_buffer_kernel<<<nGpuBlocks, block, 0, stream>>> (
+spatial_cell::add_blocks_from_buffer_kernel<<<nBlocks, block, 0, stream>>> (
populations[popID].dev_vmesh,
populations[popID].dev_blockContainer,
startLID,
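Earlier in this file's diff, population_increment_kernel is launched with a grid of exactly one block ("Now serial") because, as the comment there notes, adding new velocity blocks is not block-parallel-safe. A hedged sketch of that trade-off — the kernel below is illustrative and does not reproduce the real population_increment_kernel:

#include <cuda_runtime.h>

// When independent CUDA blocks could race while growing a shared structure,
// one safe option is to launch a single block: only its threads cooperate,
// serialising the unsafe part at the cost of block-level parallelism.
__global__ void increment_serial(float* dst, const float* src,
                                 unsigned nItems, unsigned itemSize, float factor) {
   const unsigned ti = threadIdx.x;
   for (unsigned lid = 0; lid < nItems; ++lid) {   // block 0 walks every item
      dst[lid * itemSize + ti] += factor * src[lid * itemSize + ti];
   }
}

// increment_serial<<<1, itemSize, 0, stream>>>(dst, src, nItems, itemSize, factor);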
(Diffs for the 2 remaining changed files are not shown here.)
