Refactor grid-stride loop

Move the grid-stride loops out of the GPU kernels. @ashesh2512 noticed performance
issues with grid-stride loops on AMD GPUs in PelePhysics' large kernels.

Thanks to @AlexanderSinn for the suggestion implemented in this PR.
WeiqunZhang committed Oct 12, 2024
1 parent 62c2a81 commit f2af094
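
For context, here is a minimal CUDA-style sketch of the two launch patterns this commit trades between. The kernel names and the scaling body are illustrative only and are not AMReX code.

// Old pattern: one launch; each thread loops over many elements with a
// stride equal to the total number of threads in the grid.
__global__ void scale_grid_stride (long long n, float* p)
{
    for (long long i = (long long)blockDim.x*blockIdx.x + threadIdx.x;
         i < n; i += (long long)blockDim.x*gridDim.x) {
        p[i] *= 2.0f;
    }
}

// New pattern: enough threads are launched (split across several kernel
// launches if needed) that each thread handles at most one element and
// only performs a bounds check.
__global__ void scale_one_per_thread (long long n, long long offset, float* p)
{
    long long i = (long long)blockDim.x*blockIdx.x + threadIdx.x + offset;
    if (i < n) { p[i] *= 2.0f; }
}

The helper added below (makeNExecutionConfigs) supplies the per-launch block counts, and the host loop advances the offset (ndone in the diff) between launches.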
Showing 2 changed files with 126 additions and 40 deletions.
40 changes: 40 additions & 0 deletions Src/Base/AMReX_GpuLaunch.H
@@ -21,6 +21,7 @@
#include <AMReX_RandomEngine.H>
#include <AMReX_Algorithm.H>
#include <AMReX_Math.H>
#include <AMReX_Vector.H>
#include <cstddef>
#include <limits>
#include <algorithm>
@@ -176,6 +177,45 @@ namespace Gpu {
{
return makeExecutionConfig<MT>(box.numPts());
}

struct ExecConfig
{
Long ntotalthreads;
int nblocks;
};

template <int MT>
Vector<ExecConfig> makeNExecutionConfigs (Long N) noexcept
{
// Max # of blocks in a kernel launch
int numblocks_max = std::numeric_limits<int>::max();
// Max # of threads in a kernel launch
Long nmax = Long(MT) * numblocks_max;
// # of launches needed for N elements without using grid-stride
// loops inside GPU kernels.
auto nlaunches = int((N+nmax-1)/nmax);
Vector<ExecConfig> r(nlaunches);
for (int i = 0; i < nlaunches; ++i) {
int nblocks;
if (N > nmax) {
nblocks = numblocks_max;
N -= nmax;
} else {
nblocks = int((N+MT-1)/MT);
}
// Total # of threads in this launch
r[i].ntotalthreads = Long(nblocks) * MT;
// # of blocks in this launch
r[i].nblocks = nblocks;
}
return r;
}

template <int MT, int dim>
Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const& box) noexcept
{
return makeNExecutionConfigs<MT>(box.numPts());
}
#endif

}
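To make the helper's arithmetic concrete, below is a standalone host-side sketch of the same partitioning. The block cap is passed in as a parameter (the real code uses std::numeric_limits<int>::max()) so that a small input produces several launches; the names split and Config are illustrative, not AMReX.

#include <cstdio>
#include <vector>

struct Config { long long ntotalthreads; int nblocks; };

// Mirrors the splitting logic of makeNExecutionConfigs above, with the
// per-launch block cap passed in so a small example yields several launches.
std::vector<Config> split (long long N, int MT, int numblocks_max)
{
    long long nmax = (long long)MT * numblocks_max; // elements one launch can cover
    int nlaunches = int((N + nmax - 1) / nmax);     // launches needed without grid-stride loops
    std::vector<Config> r(nlaunches);
    for (int i = 0; i < nlaunches; ++i) {
        int nblocks;
        if (N > nmax) {
            nblocks = numblocks_max;
            N -= nmax;
        } else {
            nblocks = int((N + MT - 1) / MT);
        }
        r[i] = Config{(long long)nblocks * MT, nblocks};
    }
    return r;
}

int main ()
{
    // 1,000,000 elements, 256 threads per block, cap of 1024 blocks per
    // launch: each full launch covers 262,144 elements, so we get three
    // launches of 1024 blocks and a final launch of 835 blocks.
    for (auto const& c : split(1000000, 256, 1024)) {
        std::printf("nblocks = %d, ntotalthreads = %lld\n", c.nblocks, c.ntotalthreads);
    }
}

With the real cap, a single launch of 256-thread blocks already covers roughly 5.5e11 elements, so more than one launch is only needed for very large N; the point of the refactor is that each launch can then use a plain bounds check instead of a grid-stride loop inside the kernel.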
126 changes: 86 additions & 40 deletions Src/Base/AMReX_GpuLaunchFunctsG.H
Expand Up @@ -747,35 +747,74 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe
launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
}

template<int MT, typename T, typename L>
template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
void launch (T const& n, L const& f) noexcept
{
static_assert(sizeof(T) >= 2);
if (amrex::isEmpty(n)) { return; }
const auto ec = Gpu::makeExecutionConfig<MT>(n);
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (auto const i : Gpu::Range(n)) {
f(i);
const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
T ndone = 0;
for (auto const& ec : nec) {
T nthis = n - ndone;
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
// This will not overflow, even though nblocks*MT might.
auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
if (tid < nthis) {
f(tid+ndone);
}
});
if (&ec != &nec.back()) {
ndone += T(ec.ntotalthreads);
}
});
}
AMREX_GPU_ERROR_CHECK();
}

template<int MT, int dim, typename L>
void launch (BoxND<dim> const& box, L const& f) noexcept
{
if (box.isEmpty()) { return; }
const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
const BoxIndexerND<dim> indexer(box);
const auto type = box.ixType();
std::uint64_t ndone = 0;
for (auto const& ec : nec) {
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
if (icell < indexer.numPts()) {
auto iv = indexer.intVect(icell);
f(BoxND<dim>(iv,iv,type));
}
});
ndone += ec.ntotalthreads;
}
AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
{
static_assert(sizeof(T) >= 2);
if (amrex::isEmpty(n)) { return; }
const auto ec = Gpu::makeExecutionConfig<MT>(n);
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x;
i < Long(n); i += stride) {
detail::call_f_scalar_handler(f, T(i),
Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
T ndone = 0;
for (auto const& ec : nec) {
T nthis = n - ndone;
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
// This will not overflow, even though nblocks*MT might.
auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
if (tid < nthis) {
detail::call_f_scalar_handler(f, tid+ndone,
Gpu::Handler(amrex::min((std::uint64_t(nthis)-tid+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
ndone += ec.ntotalthreads;
}
AMREX_GPU_ERROR_CHECK();
}

@@ -785,18 +824,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
{
if (amrex::isEmpty(box)) { return; }
const BoxIndexerND<dim> indexer(box);
const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
icell < indexer.numPts(); icell += stride)
{
auto iv = indexer.intVect(icell);
detail::call_f_intvect_handler(f, iv,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
std::uint64_t ndone = 0;
for (auto const& ec : nec) {
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
if (icell < indexer.numPts()) {
auto iv = indexer.intVect(icell);
detail::call_f_intvect_handler(f, iv,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
ndone += ec.ntotalthreads;
}
AMREX_GPU_ERROR_CHECK();
}

@@ -806,17 +848,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
{
if (amrex::isEmpty(box)) { return; }
const BoxIndexerND<dim> indexer(box);
const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
icell < indexer.numPts(); icell += stride) {
auto iv = indexer.intVect(icell);
detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
std::uint64_t ndone = 0;
for (auto const& ec : nec) {
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
if (icell < indexer.numPts()) {
auto iv = indexer.intVect(icell);
detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
ndone += ec.ntotalthreads;
}
AMREX_GPU_ERROR_CHECK();
}

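Caller-facing usage of these overloads does not change with this refactor; only the launch strategy underneath them does. The sketch below shows typical call sites for two of the rewritten ParallelFor overloads; the function and variable names here are illustrative, and the includes are indicative.

#include <AMReX_Array4.H>
#include <AMReX_Box.H>
#include <AMReX_Gpu.H>

void scale (amrex::Box const& bx, amrex::Array4<amrex::Real> const& a, amrex::Real s)
{
    // Box overload: after this commit each GPU thread handles at most one
    // (i,j,k) cell, possibly spread over several launches, instead of
    // grid-striding over cells.
    amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
    {
        a(i,j,k) *= s;
    });

    // Box + ncomp overload: one (i,j,k) per thread, looping over components.
    int ncomp = 3; // illustrative
    amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
    {
        a(i,j,k,n) += s;
    });
}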
