Skip to content

Commit

Permalink
BaseFab::lockAdd: Faster version of BaseFab::atomicAdd for OpenMP
Browse files Browse the repository at this point in the history
For WarpX's Gordon Bell runs on Fugaku, Stephan Jaure of ATOS optimized
atomicAdd using pthread spin locks. This commit implements Stephan's
approach using OpenMP.

In AMReX::Initialize, we create a number of OMP locks. When BaseFab::lockAdd
is called, we loop over planes in z-direction and try to acquire a lock with
omp_test_lock. If it's successful, we can access the data in that z-plane
without worrying about race conditions. This allows us to use simd
instructions without using omp atomic add. If it's not successful, we will
try a different z-plane. The process stops till all planes are processed.
  • Loading branch information
WeiqunZhang committed Jan 12, 2024
1 parent 656bb64 commit c124af0
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 8 deletions.
6 changes: 5 additions & 1 deletion Src/Base/AMReX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
#endif

#ifdef AMREX_USE_OMP
amrex::OpenMP::init_threads();
amrex::OpenMP::Initialize();

// status output
if (system::verbose > 0) {
Expand Down Expand Up @@ -817,6 +817,10 @@ amrex::Finalize (amrex::AMReX* pamrex)
Gpu::Device::Finalize();
#endif

#ifdef AMREX_USE_OMP
amrex::OpenMP::Finalize();
#endif

#if defined(AMREX_USE_UPCXX)
upcxx::finalize();
#endif
Expand Down
100 changes: 96 additions & 4 deletions Src/Base/AMReX_BaseFab.H
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,8 @@
#include <AMReX_Reduce.H>
#include <AMReX_Gpu.H>
#include <AMReX_Math.H>

#ifdef AMREX_USE_OMP
#include <omp.h>
#endif
#include <AMReX_OpenMP.H>
#include <AMReX_MemPool.H>

#include <cmath>
#include <cstdlib>
Expand Down Expand Up @@ -979,6 +977,19 @@ public:
BaseFab<T>& atomicAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
int srccomp, int destcomp, int numcomp=1) noexcept;

/**
* \brief Atomically add srcbox region of src FAB to destbox region of this FAB.
* The srcbox and destbox must be same size. When OMP is on, this uses OMP locks
* in the implementation and it's usually faster than atomicAdd.
*/
#if defined(AMREX_USE_GPU)
template <RunOn run_on>
#else
template <RunOn run_on=RunOn::Host>
#endif
BaseFab<T>& lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
int srccomp, int destcomp, int numcomp) noexcept;

//! FAB SAXPY (y[i] <- y[i] + a * x[i]), in place.
#if defined(AMREX_USE_GPU)
template <RunOn run_on>
Expand Down Expand Up @@ -3300,6 +3311,87 @@ BaseFab<T>::atomicAdd (const BaseFab<T>& src, const Box& srcbox, const Box& dest
return *this;
}

template <class T>
template <RunOn run_on>
BaseFab<T>&
BaseFab<T>::lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
int srccomp, int destcomp, int numcomp) noexcept
{
#if defined(AMREX_USE_OMP) && (AMREX_SPACEDIM == 3)
# if defined(AMREX_USE_GPU)
if (run_on == RunOn::Host || Gpu::notInLaunchRegion()) {
# endif
// Maybe we want to do this for 2D too
BL_ASSERT(destbox.ok());
BL_ASSERT(src.box().contains(srcbox));
BL_ASSERT(box().contains(destbox));
BL_ASSERT(destbox.sameSize(srcbox));
BL_ASSERT(srccomp >= 0 && srccomp+numcomp <= src.nComp());
BL_ASSERT(destcomp >= 0 && destcomp+numcomp <= nComp());

Array4<T> const& d = this->array();
Array4<T const> const& s = src.const_array();
const auto dlo = amrex::lbound(destbox);
const auto dhi = amrex::ubound(destbox);
const auto slo = amrex::lbound(srcbox);
const auto len = amrex::length(srcbox);
const Dim3 offset{slo.x-dlo.x,slo.y-dlo.y,slo.z-dlo.z};

auto* mask = (bool*) amrex_mempool_alloc(sizeof(bool)*len.z);
for (int kk = 0; kk < len.z; ++kk) {
mask[kk] = false;
}

int kk = 0;
int planes_left = len.z;
while (planes_left > 0) {
auto const k = kk + dlo.z;
int ilock = k % OpenMP::nlocks;
if (ilock < 0) { ilock += OpenMP::nlocks; }
auto* lock = &(OpenMP::omp_locks[ilock]);
if (omp_test_lock(lock)) {
for (int n = 0; n < numcomp; ++n) {
for (int j = dlo.y; j <= dhi.y; ++j) {
auto * pdst = d.ptr(dlo.x,j ,k ,n+destcomp);
auto const* psrc = s.ptr(slo.x,j+offset.y,k+offset.z,n+srccomp);
#pragma omp simd
for (int ii = 0; ii < len.x; ++ii) {
pdst[ii] += psrc[ii];
}
}
}

mask[kk] = true;
--planes_left;
omp_unset_lock(lock);
}

if (planes_left == 0) { break; }

++kk;
for (int ip = 0; ip < len.z; ++ip) {
int new_kk = (kk+ip) % len.z;
if ( ! mask[new_kk] ) {
kk = new_kk;
break;
}
}
}

amrex_mempool_free(mask);

return *this;

# if defined(AMREX_USE_GPU)
} else {
return this->template atomicAdd<run_on>(src, srcbox, destbox, srccomp, destcomp, numcom
}
# endif
#else
return this->template atomicAdd<run_on>(src, srcbox, destbox, srccomp, destcomp, numcomp);
#endif
}

template <class T>
template <RunOn run_on>
BaseFab<T>&
Expand Down
9 changes: 7 additions & 2 deletions Src/Base/AMReX_OpenMP.H
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
#include <AMReX_Config.H>

#ifdef AMREX_USE_OMP
#include <AMReX_Extension.H>
#include <omp.h>
#include <array>

namespace amrex::OpenMP {

Expand All @@ -13,7 +15,11 @@ namespace amrex::OpenMP {
inline int in_parallel () { return omp_in_parallel(); }
inline void set_num_threads (int num) { omp_set_num_threads(num); }

void init_threads ();
void Initialize ();
void Finalize ();

static constexpr int nlocks = 128;
extern AMREX_EXPORT std::array<omp_lock_t,nlocks> omp_locks;
}

#else // AMREX_USE_OMP
Expand All @@ -25,7 +31,6 @@ namespace amrex::OpenMP {
constexpr int get_thread_num () { return 0; }
constexpr int in_parallel () { return false; }
constexpr void set_num_threads (int) { /* nothing */ }
constexpr void init_threads () { /* nothing */ }
}

#endif // AMREX_USE_OMP
Expand Down
32 changes: 31 additions & 1 deletion Src/Base/AMReX_OpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,19 @@ namespace amrex
#ifdef AMREX_USE_OMP
namespace amrex::OpenMP
{
void init_threads ()
std::array<omp_lock_t,nlocks> omp_locks;

namespace {
unsigned int initialized = 0;
}

void Initialize ()
{
if (initialized) {
++initialized;
return;
}

amrex::ParmParse pp("amrex");
std::string omp_threads = "system";
pp.queryAdd("omp_threads", omp_threads);
Expand Down Expand Up @@ -173,6 +184,25 @@ namespace amrex::OpenMP
}
}
}

for (auto& lck : omp_locks) {
omp_init_lock(&lck);
}

++initialized;
}

void Finalize ()
{
if (initialized) {
--initialized;
if (initialized == 0) {
for (auto& lck : omp_locks) {
omp_destroy_lock(&lck);
}
}
}
}

} // namespace amrex::OpenMP
#endif // AMREX_USE_OMP

0 comments on commit c124af0

Please sign in to comment.