Refactor grid-stride loop
Move grid-stride loop out of GPU kernels. @ashesh2512 noticed performance
issues with grid-stride loops on AMD GPUs in PelePhysics's large kernels.

Thanks to @AlexanderSinn for the suggestion implemented in this PR.
WeiqunZhang committed Oct 11, 2024
1 parent fcc5bd2 commit 8293699
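The gist of the change, sketched in plain CUDA (illustrative only, not AMReX code; the kernel names and the saxpy-style body are made up): before, one kernel launch covered an arbitrary N by having each thread loop with a grid-wide stride; after, the host issues one or more launches sized so that each thread handles at most one element, guarded by a bounds check.

// Old style: grid-stride loop inside the kernel.
__global__ void saxpy_stride (long n, float a, float const* x, float* y)
{
    long stride = (long)blockDim.x * gridDim.x;
    for (long i = (long)blockDim.x*blockIdx.x + threadIdx.x; i < n; i += stride) {
        y[i] += a * x[i];
    }
}

// New style: no loop in the kernel. The host may split a very large N into
// several launches, each offset by the number of elements already covered.
__global__ void saxpy_flat (long n, long offset, float a, float const* x, float* y)
{
    long i = (long)blockDim.x*blockIdx.x + threadIdx.x + offset;
    if (i < n) { y[i] += a * x[i]; }
}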
Showing 2 changed files with 39 additions and 10 deletions.
26 changes: 26 additions & 0 deletions Src/Base/AMReX_GpuLaunch.H
@@ -21,6 +21,7 @@
#include <AMReX_RandomEngine.H>
#include <AMReX_Algorithm.H>
#include <AMReX_Math.H>
#include <AMReX_Vector.H>
#include <cstddef>
#include <limits>
#include <algorithm>
@@ -176,6 +177,31 @@ namespace Gpu {
{
    return makeExecutionConfig<MT>(box.numPts());
}

template <int MT>
Vector<std::pair<Long,int>> makeNExecutionConfigs (Long N) noexcept
{
    Long numblocks_max = std::numeric_limits<int>::max(); // Max # of blocks in a kernel launch
    Long nmax = Long(MT) * numblocks_max; // Max # of threads in a kernel launch
    auto nlaunches = (N+nmax-1)/nmax; // # of launches needed for N elements
    Vector<std::pair<Long,int>> r(nlaunches);
    for (int i = 0; i < nlaunches; ++i) {
        Long nblocks;
        if (N > nmax) {
            nblocks = numblocks_max;
            N -= nmax; // Elements left for the remaining launches
        } else {
            nblocks = (N+MT-1) / MT;
        }
        r[i].first = nblocks * MT; // Total # of threads in this launch
        r[i].second = int(nblocks); // # of blocks in this launch
    }
    return r;
}

template <int MT>
Vector<std::pair<Long,int>> makeNExecutionConfigs (Box const& box) noexcept
{
    return makeNExecutionConfigs<MT>(box.numPts());
}
#endif

}
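As a concrete check of makeNExecutionConfigs (numbers chosen for illustration, not taken from this diff): with MT = 256 and N = 1,000,000, nmax = 256 * (2^31 - 1) is far larger than N, so a single launch suffices and the one config is { 3907 * 256 = 1,000,192 threads, 3907 blocks }. The 192 surplus threads are masked off by the bounds check in the launch loop of AMReX_GpuLaunchFunctsG.H below.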
23 changes: 13 additions & 10 deletions Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -766,16 +766,19 @@ std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
{
if (amrex::isEmpty(n)) { return; }
const auto ec = Gpu::makeExecutionConfig<MT>(n);
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x;
i < Long(n); i += stride) {
detail::call_f_scalar_handler(f, T(i),
Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
Long ndone = 0;
for (auto const& ec : nec) {
    // ec.second is the number of blocks for this launch; MT is threads per block.
    AMREX_LAUNCH_KERNEL(MT, ec.second, MT, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto i = Long(blockDim.x)*blockIdx.x+threadIdx.x + ndone;
        if (i < Long(n)) {
            detail::call_f_scalar_handler(f, T(i),
                Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
                                        (std::uint64_t)blockDim.x)));
        }
    });
    ndone += ec.first; // ec.first is the total number of threads in this launch.
}
AMREX_GPU_ERROR_CHECK();
}

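For context, callers are unaffected by this refactor: the public ParallelFor interface is unchanged, only the launch strategy behind it differs. A minimal usage sketch (the device pointer p and the count n are hypothetical):

amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
{
    p[i] *= 2.0; // each index is now handled by exactly one GPU thread; no grid-stride loop
});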
