Refactor grid-stride loop

Move the grid-stride loops out of the GPU kernels. @ashesh2512 noticed performance
issues with grid-stride loops on AMD GPUs in PelePhysics' large kernels.

Thanks to @AlexanderSinn for the suggestion implemented in this PR.
WeiqunZhang committed Oct 12, 2024
1 parent 62c2a81 commit f2af094
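
For context, here is a minimal CUDA-style sketch of the two launch patterns this commit trades between. The kernel names and the scaling body are illustrative only and are not AMReX code.

// Old pattern: one launch; each thread loops over many elements with a
// stride equal to the total number of threads in the grid.
__global__ void scale_grid_stride (long long n, float* p)
{
    for (long long i = (long long)blockDim.x*blockIdx.x + threadIdx.x;
         i < n; i += (long long)blockDim.x*gridDim.x) {
        p[i] *= 2.0f;
    }
}

// New pattern: enough threads are launched (split across several kernel
// launches if needed) that each thread handles at most one element and
// only performs a bounds check.
__global__ void scale_one_per_thread (long long n, long long offset, float* p)
{
    long long i = (long long)blockDim.x*blockIdx.x + threadIdx.x + offset;
    if (i < n) { p[i] *= 2.0f; }
}

The helper added below (makeNExecutionConfigs) supplies the per-launch block counts, and the host loop advances the offset (ndone in the diff) between launches.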
Showing 2 changed files with 126 additions and 40 deletions.
40 changes: 40 additions & 0 deletions Src/Base/AMReX_GpuLaunch.H
@@ -21,6 +21,7 @@
#include <AMReX_RandomEngine.H>
#include <AMReX_Algorithm.H>
#include <AMReX_Math.H>
#include <AMReX_Vector.H>
#include <cstddef>
#include <limits>
#include <algorithm>
@@ -176,6 +177,45 @@ namespace Gpu {
{
return makeExecutionConfig<MT>(box.numPts());
}

struct ExecConfig
{
Long ntotalthreads;
int nblocks;
};

template <int MT>
Vector<ExecConfig> makeNExecutionConfigs (Long N) noexcept
{
// Max # of blocks in a kernel launch
int numblocks_max = std::numeric_limits<int>::max();
// Max # of threads in a kernel launch
Long nmax = Long(MT) * numblocks_max;
// # of launches needed for N elements without using grid-stride
// loops inside GPU kernels.
auto nlaunches = int((N+nmax-1)/nmax);
Vector<ExecConfig> r(nlaunches);
for (int i = 0; i < nlaunches; ++i) {
int nblocks;
if (N > nmax) {
nblocks = numblocks_max;
N -= nmax;
} else {
nblocks = int((N+MT-1)/MT);
}
// Total # of threads in this launch
r[i].ntotalthreads = Long(nblocks) * MT;
// # of blocks in this launch
r[i].nblocks = nblocks;
}
return r;
}

template <int MT, int dim>
Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const& box) noexcept
{
return makeNExecutionConfigs<MT>(box.numPts());
}
#endif

}
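To make the helper's arithmetic concrete, below is a standalone host-side sketch of the same partitioning. The block cap is passed in as a parameter (the real code uses std::numeric_limits<int>::max()) so that a small input produces several launches; the names split and Config are illustrative, not AMReX.

#include <cstdio>
#include <vector>

struct Config { long long ntotalthreads; int nblocks; };

// Mirrors the splitting logic of makeNExecutionConfigs above, with the
// per-launch block cap passed in so a small example yields several launches.
std::vector<Config> split (long long N, int MT, int numblocks_max)
{
    long long nmax = (long long)MT * numblocks_max; // elements one launch can cover
    int nlaunches = int((N + nmax - 1) / nmax);     // launches needed without grid-stride loops
    std::vector<Config> r(nlaunches);
    for (int i = 0; i < nlaunches; ++i) {
        int nblocks;
        if (N > nmax) {
            nblocks = numblocks_max;
            N -= nmax;
        } else {
            nblocks = int((N + MT - 1) / MT);
        }
        r[i] = Config{(long long)nblocks * MT, nblocks};
    }
    return r;
}

int main ()
{
    // 1,000,000 elements, 256 threads per block, cap of 1024 blocks per
    // launch: each full launch covers 262,144 elements, so we get three
    // launches of 1024 blocks and a final launch of 835 blocks.
    for (auto const& c : split(1000000, 256, 1024)) {
        std::printf("nblocks = %d, ntotalthreads = %lld\n", c.nblocks, c.ntotalthreads);
    }
}

With the real cap, a single launch of 256-thread blocks already covers roughly 5.5e11 elements, so more than one launch is only needed for very large N; the point of the refactor is that each launch can then use a plain bounds check instead of a grid-stride loop inside the kernel.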
126 changes: 86 additions & 40 deletions Src/Base/AMReX_GpuLaunchFunctsG.H
Expand Up @@ -747,35 +747,74 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe
launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
}

template<int MT, typename T, typename L>
template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
void launch (T const& n, L const& f) noexcept
{
static_assert(sizeof(T) >= 2);
if (amrex::isEmpty(n)) { return; }
const auto ec = Gpu::makeExecutionConfig<MT>(n);
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (auto const i : Gpu::Range(n)) {
f(i);
const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
T ndone = 0;
for (auto const& ec : nec) {
T nthis = n - ndone;
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
// This will not overflow, even though nblocks*MT might.
auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
if (tid < nthis) {
f(tid+ndone);
}
});
if (&ec != &nec.back()) {
ndone += T(ec.ntotalthreads);
}
});
}
AMREX_GPU_ERROR_CHECK();
}

template<int MT, int dim, typename L>
void launch (BoxND<dim> const& box, L const& f) noexcept
{
if (box.isEmpty()) { return; }
const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
const BoxIndexerND<dim> indexer(box);
const auto type = box.ixType();
std::uint64_t ndone = 0;
for (auto const& ec : nec) {
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
if (icell < indexer.numPts()) {
auto iv = indexer.intVect(icell);
f(BoxND<dim>(iv,iv,type));
}
});
ndone += ec.ntotalthreads;
}
AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
{
static_assert(sizeof(T) >= 2);
if (amrex::isEmpty(n)) { return; }
const auto ec = Gpu::makeExecutionConfig<MT>(n);
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x;
i < Long(n); i += stride) {
detail::call_f_scalar_handler(f, T(i),
Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
T ndone = 0;
for (auto const& ec : nec) {
T nthis = n - ndone;
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
// This will not overflow, even though nblocks*MT might.
auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
if (tid < nthis) {
detail::call_f_scalar_handler(f, tid+ndone,
Gpu::Handler(amrex::min((std::uint64_t(nthis)-tid+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
ndone += ec.ntotalthreads;
}
AMREX_GPU_ERROR_CHECK();
}

@@ -785,18 +824,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
{
if (amrex::isEmpty(box)) { return; }
const BoxIndexerND<dim> indexer(box);
const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
icell < indexer.numPts(); icell += stride)
{
auto iv = indexer.intVect(icell);
detail::call_f_intvect_handler(f, iv,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
std::uint64_t ndone = 0;
for (auto const& ec : nec) {
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
if (icell < indexer.numPts()) {
auto iv = indexer.intVect(icell);
detail::call_f_intvect_handler(f, iv,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
ndone += ec.ntotalthreads;
}
AMREX_GPU_ERROR_CHECK();
}

@@ -806,17 +848,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
{
if (amrex::isEmpty(box)) { return; }
const BoxIndexerND<dim> indexer(box);
const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
icell < indexer.numPts(); icell += stride) {
auto iv = indexer.intVect(icell);
detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
std::uint64_t ndone = 0;
for (auto const& ec : nec) {
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
if (icell < indexer.numPts()) {
auto iv = indexer.intVect(icell);
detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
ndone += ec.ntotalthreads;
}
AMREX_GPU_ERROR_CHECK();
}

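Caller-facing usage of these overloads does not change with this refactor; only the launch strategy underneath them does. The sketch below shows typical call sites for two of the rewritten ParallelFor overloads; the function and variable names here are illustrative, and the includes are indicative.

#include <AMReX_Array4.H>
#include <AMReX_Box.H>
#include <AMReX_Gpu.H>

void scale (amrex::Box const& bx, amrex::Array4<amrex::Real> const& a, amrex::Real s)
{
    // Box overload: after this commit each GPU thread handles at most one
    // (i,j,k) cell, possibly spread over several launches, instead of
    // grid-striding over cells.
    amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
    {
        a(i,j,k) *= s;
    });

    // Box + ncomp overload: one (i,j,k) per thread, looping over components.
    int ncomp = 3; // illustrative
    amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
    {
        a(i,j,k,n) += s;
    });
}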
