Refactor grid-stride loop
Move grid-stride loop out of GPU kernels. @ashesh2512 noticed performance
issues with grid-stride loops on AMD GPUs in PelePhysics's large kernels.

Thanks to @AlexanderSinn for the suggestion implemented in this PR.
WeiqunZhang committed Oct 11, 2024
1 parent fcc5bd2 commit 8293699
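The gist of the change, sketched in plain CUDA (illustrative only, not AMReX code; the kernel names and the saxpy-style body are made up): before, one kernel launch covered an arbitrary N by having each thread loop with a grid-wide stride; after, the host issues one or more launches sized so that each thread handles at most one element, guarded by a bounds check.

// Old style: grid-stride loop inside the kernel.
__global__ void saxpy_stride (long n, float a, float const* x, float* y)
{
    long stride = (long)blockDim.x * gridDim.x;
    for (long i = (long)blockDim.x*blockIdx.x + threadIdx.x; i < n; i += stride) {
        y[i] += a * x[i];
    }
}

// New style: no loop in the kernel. The host may split a very large N into
// several launches, each offset by the number of elements already covered.
__global__ void saxpy_flat (long n, long offset, float a, float const* x, float* y)
{
    long i = (long)blockDim.x*blockIdx.x + threadIdx.x + offset;
    if (i < n) { y[i] += a * x[i]; }
}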
Showing 2 changed files with 39 additions and 10 deletions.
26 changes: 26 additions & 0 deletions Src/Base/AMReX_GpuLaunch.H
@@ -21,6 +21,7 @@
#include <AMReX_RandomEngine.H>
#include <AMReX_Algorithm.H>
#include <AMReX_Math.H>
#include <AMReX_Vector.H>
#include <cstddef>
#include <limits>
#include <algorithm>
@@ -176,6 +177,31 @@ namespace Gpu {
{
    return makeExecutionConfig<MT>(box.numPts());
}

template <int MT>
Vector<std::pair<Long,int>> makeNExecutionConfigs (Long N) noexcept
{
    Long numblocks_max = std::numeric_limits<int>::max(); // Max # of blocks in a kernel launch
    Long nmax = Long(MT) * numblocks_max; // Max # of threads in a kernel launch
    auto nlaunches = (N+nmax-1)/nmax; // # of launches needed for N elements
    Vector<std::pair<Long,int>> r(nlaunches);
    for (int i = 0; i < nlaunches; ++i) {
        Long nblocks;
        if (N > nmax) {
            nblocks = numblocks_max;
            N -= nmax; // Elements left for the remaining launches
        } else {
            nblocks = (N+MT-1) / MT;
        }
        r[i].first = nblocks * MT; // Total # of threads in this launch
        r[i].second = int(nblocks); // # of blocks in this launch
    }
    return r;
}

template <int MT>
Vector<std::pair<Long,int>> makeNExecutionConfigs (Box const& box) noexcept
{
    return makeNExecutionConfigs<MT>(box.numPts());
}
#endif

}
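As a concrete check of makeNExecutionConfigs (numbers chosen for illustration, not taken from this diff): with MT = 256 and N = 1,000,000, nmax = 256 * (2^31 - 1) is far larger than N, so a single launch suffices and the one config is { 3907 * 256 = 1,000,192 threads, 3907 blocks }. The 192 surplus threads are masked off by the bounds check in the launch loop of AMReX_GpuLaunchFunctsG.H below.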
23 changes: 13 additions & 10 deletions Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -766,16 +766,19 @@ std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
{
if (amrex::isEmpty(n)) { return; }
const auto ec = Gpu::makeExecutionConfig<MT>(n);
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x;
i < Long(n); i += stride) {
detail::call_f_scalar_handler(f, T(i),
Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
Long ndone = 0;
for (auto const& ec : nec) {
    // ec.second is the number of blocks for this launch; MT is threads per block.
    AMREX_LAUNCH_KERNEL(MT, ec.second, MT, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto i = Long(blockDim.x)*blockIdx.x+threadIdx.x + ndone;
        if (i < Long(n)) {
            detail::call_f_scalar_handler(f, T(i),
                Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
                                        (std::uint64_t)blockDim.x)));
        }
    });
    ndone += ec.first; // ec.first is the total number of threads in this launch.
}
AMREX_GPU_ERROR_CHECK();
}

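For context, callers are unaffected by this refactor: the public ParallelFor interface is unchanged, only the launch strategy behind it differs. A minimal usage sketch (the device pointer p and the count n are hypothetical):

amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
{
    p[i] *= 2.0; // each index is now handled by exactly one GPU thread; no grid-stride loop
});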
