diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H
index 435a11f342..9962f7a256 100644
--- a/Src/Base/AMReX_GpuLaunch.H
+++ b/Src/Base/AMReX_GpuLaunch.H
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <limits>
 #include
 #include
 #include
@@ -176,6 +177,45 @@ namespace Gpu {
     {
         return makeExecutionConfig(box.numPts());
     }
+
+    struct ExecConfig
+    {
+        Long ntotalthreads;
+        int nblocks;
+    };
+
+    template <int MT>
+    Vector<ExecConfig> makeNExecutionConfigs (Long N) noexcept
+    {
+        // Max # of blocks in a kernel launch
+        int numblocks_max = std::numeric_limits<int>::max();
+        // Max # of threads in a kernel launch
+        Long nmax = Long(MT) * numblocks_max;
+        // # of launches needed for N elements without using grid-stride
+        // loops inside GPU kernels.
+        auto nlaunches = int((N+nmax-1)/nmax);
+        Vector<ExecConfig> r(nlaunches);
+        for (int i = 0; i < nlaunches; ++i) {
+            int nblocks;
+            if (N > nmax) {
+                nblocks = numblocks_max;
+                N -= nmax;
+            } else {
+                nblocks = int((N+MT-1)/MT);
+            }
+            // Total # of threads in this launch
+            r[i].ntotalthreads = Long(nblocks) * MT;
+            // # of blocks in this launch
+            r[i].nblocks = nblocks;
+        }
+        return r;
+    }
+
+    template <int MT, int dim>
+    Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const& box) noexcept
+    {
+        return makeNExecutionConfigs<MT>(box.numPts());
+    }
 
 #endif
 }
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index 7955410f8b..16468696ed 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -747,17 +747,47 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe
     launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
 }
 
-template <int MT, typename T, typename L>
+template <int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
 void launch (T const& n, L const& f) noexcept
 {
     if (amrex::isEmpty(n)) { return; }
-    const auto ec = Gpu::makeExecutionConfig<MT>(n);
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (auto const i : Gpu::Range(n)) {
-            f(i);
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
+    T ndone = 0;
+    for (auto const& ec : nec) {
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            // This will not overflow, even though nblocks*MT might.
+            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
+            if (tid < n-ndone) {
+                f(tid+ndone);
+            }
+        });
+        if (&ec != &nec.back()) {
+            ndone += T(ec.ntotalthreads);
         }
-    });
+    }
+    AMREX_GPU_ERROR_CHECK();
+}
+
+template <int MT, int dim, typename L>
+void launch (BoxND<dim> const& box, L const& f) noexcept
+{
+    if (box.isEmpty()) { return; }
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
+    const BoxIndexerND<dim> indexer(box);
+    const auto type = box.ixType();
+    std::uint64_t ndone = 0;
+    for (auto const& ec : nec) {
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
+            if (icell < indexer.numPts()) {
+                auto iv = indexer.intVect(icell);
+                f(BoxND<dim>(iv,iv,type));
+            }
+        });
+        ndone += ec.ntotalthreads;
+    }
     AMREX_GPU_ERROR_CHECK();
 }
 
@@ -766,16 +796,22 @@ std::enable_if_t<std::is_integral<T>::value>
 ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
 {
     if (amrex::isEmpty(n)) { return; }
-    const auto ec = Gpu::makeExecutionConfig<MT>(n);
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x;
-             i < Long(n); i += stride) {
-            detail::call_f_scalar_handler(f, T(i),
-                                          Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
-                                                                  (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
+    T ndone = 0;
+    for (auto const& ec : nec) {
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            // This will not overflow, even though nblocks*MT might.
+            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
+            if (tid < n-ndone) {
+                tid += ndone;
+                detail::call_f_scalar_handler(f, tid,
+                    Gpu::Handler(amrex::min((std::uint64_t(n)-tid+(std::uint64_t)threadIdx.x),
+                                            (std::uint64_t)blockDim.x)));
+            }
+        });
+        ndone += ec.ntotalthreads;
+    }
     AMREX_GPU_ERROR_CHECK();
 }
 
@@ -785,18 +821,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
     const BoxIndexerND<dim> indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < indexer.numPts(); icell += stride)
-        {
-            auto iv = indexer.intVect(icell);
-            detail::call_f_intvect_handler(f, iv,
-                                           Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                                                                   (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
+    std::uint64_t ndone = 0;
+    for (auto const& ec : nec) {
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
+            if (icell < indexer.numPts()) {
+                auto iv = indexer.intVect(icell);
+                detail::call_f_intvect_handler(f, iv,
+                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
+                                            (std::uint64_t)blockDim.x)));
+            }
+        });
+        ndone += ec.ntotalthreads;
+    }
     AMREX_GPU_ERROR_CHECK();
 }
 
@@ -806,17 +845,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
 {
     if (amrex::isEmpty(box)) { return; }
     const BoxIndexerND<dim> indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < indexer.numPts(); icell += stride) {
-            auto iv = indexer.intVect(icell);
-            detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
-                                                 Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                                                                         (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
+    std::uint64_t ndone = 0;
+    for (auto const& ec : nec) {
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
+            if (icell < indexer.numPts()) {
+                auto iv = indexer.intVect(icell);
+                detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
+                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
+                                            (std::uint64_t)blockDim.x)));
+            }
+        });
+        ndone += ec.ntotalthreads;
+    }
     AMREX_GPU_ERROR_CHECK();
 }
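
Note on the launch-splitting arithmetic above: makeNExecutionConfigs<MT> caps each kernel launch at MT * INT_MAX threads (the maximum block count times the block size) and returns one ExecConfig per launch needed to cover all N elements, which is why the kernels no longer need grid-stride loops. The following standalone sketch mirrors that logic with plain std types so it can be compiled and sanity-checked on the host without AMReX; treating Long as a 64-bit signed integer and substituting std::vector for amrex::Vector are assumptions of the sketch, not part of the patch.

    // Host-only sketch of the launch-splitting logic in this patch.
    #include <cstdint>
    #include <iostream>
    #include <limits>
    #include <vector>

    using Long = std::int64_t; // assumption: AMReX's Long is 64-bit signed

    struct ExecConfig {
        Long ntotalthreads; // total # of threads in this launch
        int  nblocks;       // # of blocks in this launch
    };

    template <int MT>
    std::vector<ExecConfig> makeNExecutionConfigs (Long N)
    {
        int  numblocks_max = std::numeric_limits<int>::max();
        Long nmax = Long(MT) * numblocks_max; // max threads per launch
        auto nlaunches = int((N + nmax - 1) / nmax);
        std::vector<ExecConfig> r(nlaunches);
        for (int i = 0; i < nlaunches; ++i) {
            int nblocks;
            if (N > nmax) {          // full-sized launch; more to follow
                nblocks = numblocks_max;
                N -= nmax;
            } else {                 // final launch covers the remainder
                nblocks = int((N + MT - 1) / MT);
            }
            r[i].ntotalthreads = Long(nblocks) * MT;
            r[i].nblocks = nblocks;
        }
        return r;
    }

    int main ()
    {
        // One element more than a single launch of 256-thread blocks can
        // cover, so two launches result: INT_MAX blocks, then 1 block.
        constexpr int MT = 256;
        Long N = Long(MT) * std::numeric_limits<int>::max() + 1;
        for (auto const& ec : makeNExecutionConfigs<MT>(N)) {
            std::cout << "nblocks = " << ec.nblocks
                      << ", ntotalthreads = " << ec.ntotalthreads << '\n';
        }
    }

Two related design points in the kernels themselves: the thread index is computed as T(MT)*T(blockIdx.x)+T(threadIdx.x) (std::uint64_t(MT)*blockIdx.x+threadIdx.x in the box variants), widening each factor before the multiply; per launch, nblocks*MT may not fit in 32-bit arithmetic, but once n fits in T every in-range index does too. And in the integral launch overload, ndone is only advanced when ec is not the last config: the value is unused after the final launch, and converting that launch's ntotalthreads (a Long rounded up to a multiple of MT) to T there could overflow.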