BaseFab::lockAdd: Faster version of BaseFab::atomicAdd for OpenMP

For WarpX's Gordon Bell runs on Fugaku, Stephan Jaure of ATOS optimized atomicAdd using pthread spin locks. This commit implements Stephan's approach using OpenMP. In AMReX::Initialize, we create a number of OMP locks. When BaseFab::lockAdd is called, we loop over planes in z-direction and try to acquire a lock with omp_test_lock. If it's successful, we can access the data in that z-plane without worrying about race conditions. This allows us to use simd instructions without using omp atomic add. If it's not successful, we will try a different z-plane. The process stops till all planes are processed.
WeiqunZhang · Jan 12, 2024 · c124af0 · c124af0
1 parent 656bb64
commit c124af0
Show file tree

Hide file tree

Showing 4 changed files with 139 additions and 8 deletions.
diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp
@@ -462,7 +462,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
 #endif
 
 #ifdef AMREX_USE_OMP
- amrex::OpenMP::init_threads();
+ amrex::OpenMP::Initialize();
 
  // status output
  if (system::verbose > 0) {
@@ -817,6 +817,10 @@ amrex::Finalize (amrex::AMReX* pamrex)
  Gpu::Device::Finalize();
 #endif
 
+#ifdef AMREX_USE_OMP
+ amrex::OpenMP::Finalize();
+#endif
+
 #if defined(AMREX_USE_UPCXX)
  upcxx::finalize();
 #endif

diff --git a/Src/Base/AMReX_BaseFab.H b/Src/Base/AMReX_BaseFab.H
@@ -20,10 +20,8 @@
 #include <AMReX_Reduce.H>
 #include <AMReX_Gpu.H>
 #include <AMReX_Math.H>
-
-#ifdef AMREX_USE_OMP
-#include <omp.h>
-#endif
+#include <AMReX_OpenMP.H>
+#include <AMReX_MemPool.H>
 
 #include <cmath>
 #include <cstdlib>
@@ -979,6 +977,19 @@ public:
  BaseFab<T>& atomicAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
  int srccomp, int destcomp, int numcomp=1) noexcept;
 
+ /**
+ * \brief Atomically add srcbox region of src FAB to destbox region of this FAB.
+ * The srcbox and destbox must be same size. When OMP is on, this uses OMP locks
+ * in the implementation and it's usually faster than atomicAdd.
+ */
+#if defined(AMREX_USE_GPU)
+ template <RunOn run_on>
+#else
+ template <RunOn run_on=RunOn::Host>
+#endif
+ BaseFab<T>& lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
+ int srccomp, int destcomp, int numcomp) noexcept;
+
  //! FAB SAXPY (y[i] <- y[i] + a * x[i]), in place.
 #if defined(AMREX_USE_GPU)
  template <RunOn run_on>
@@ -3300,6 +3311,87 @@ BaseFab<T>::atomicAdd (const BaseFab<T>& src, const Box& srcbox, const Box& dest
  return *this;
 }
 
+template <class T>
+template <RunOn run_on>
+BaseFab<T>&
+BaseFab<T>::lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
+ int srccomp, int destcomp, int numcomp) noexcept
+{
+#if defined(AMREX_USE_OMP) && (AMREX_SPACEDIM == 3)
+# if defined(AMREX_USE_GPU)
+ if (run_on == RunOn::Host || Gpu::notInLaunchRegion()) {
+# endif
+ // Maybe we want to do this for 2D too
+ BL_ASSERT(destbox.ok());
+ BL_ASSERT(src.box().contains(srcbox));
+ BL_ASSERT(box().contains(destbox));
+ BL_ASSERT(destbox.sameSize(srcbox));
+ BL_ASSERT(srccomp >= 0 && srccomp+numcomp <= src.nComp());
+ BL_ASSERT(destcomp >= 0 && destcomp+numcomp <= nComp());
+
+ Array4<T> const& d = this->array();
+ Array4<T const> const& s = src.const_array();
+ const auto dlo = amrex::lbound(destbox);
+ const auto dhi = amrex::ubound(destbox);
+ const auto slo = amrex::lbound(srcbox);
+ const auto len = amrex::length(srcbox);
+ const Dim3 offset{slo.x-dlo.x,slo.y-dlo.y,slo.z-dlo.z};
+
+ auto* mask = (bool*) amrex_mempool_alloc(sizeof(bool)*len.z);
+ for (int kk = 0; kk < len.z; ++kk) {
+ mask[kk] = false;
+ }
+
+ int kk = 0;
+ int planes_left = len.z;
+ while (planes_left > 0) {
+ auto const k = kk + dlo.z;
+ int ilock = k % OpenMP::nlocks;
+ if (ilock < 0) { ilock += OpenMP::nlocks; }
+ auto* lock = &(OpenMP::omp_locks[ilock]);
+ if (omp_test_lock(lock)) {
+ for (int n = 0; n < numcomp; ++n) {
+ for (int j = dlo.y; j <= dhi.y; ++j) {
+ auto * pdst = d.ptr(dlo.x,j ,k ,n+destcomp);
+ auto const* psrc = s.ptr(slo.x,j+offset.y,k+offset.z,n+srccomp);
+#pragma omp simd
+ for (int ii = 0; ii < len.x; ++ii) {
+ pdst[ii] += psrc[ii];
+ }
+ }
+ }
+
+ mask[kk] = true;
+ --planes_left;
+ omp_unset_lock(lock);
+ }
+
+ if (planes_left == 0) { break; }
+
+ ++kk;
+ for (int ip = 0; ip < len.z; ++ip) {
+ int new_kk = (kk+ip) % len.z;
+ if ( ! mask[new_kk] ) {
+ kk = new_kk;
+ break;
+ }
+ }
+ }
+
+ amrex_mempool_free(mask);
+
+ return *this;
+
+# if defined(AMREX_USE_GPU)
+ } else {
+ return this->template atomicAdd<run_on>(src, srcbox, destbox, srccomp, destcomp, numcom
+ }
+# endif
+#else
+ return this->template atomicAdd<run_on>(src, srcbox, destbox, srccomp, destcomp, numcomp);
+#endif
+}
+
 template <class T>
 template <RunOn run_on>
 BaseFab<T>&

diff --git a/Src/Base/AMReX_OpenMP.H b/Src/Base/AMReX_OpenMP.H
@@ -3,7 +3,9 @@
 #include <AMReX_Config.H>
 
 #ifdef AMREX_USE_OMP
+#include <AMReX_Extension.H>
 #include <omp.h>
+#include <array>
 
 namespace amrex::OpenMP {
 
@@ -13,7 +15,11 @@ namespace amrex::OpenMP {
  inline int in_parallel () { return omp_in_parallel(); }
  inline void set_num_threads (int num) { omp_set_num_threads(num); }
 
- void init_threads ();
+ void Initialize ();
+ void Finalize ();
+
+ static constexpr int nlocks = 128;
+ extern AMREX_EXPORT std::array<omp_lock_t,nlocks> omp_locks;
 }
 
 #else // AMREX_USE_OMP
@@ -25,7 +31,6 @@ namespace amrex::OpenMP {
  constexpr int get_thread_num () { return 0; }
  constexpr int in_parallel () { return false; }
  constexpr void set_num_threads (int) { /* nothing */ }
- constexpr void init_threads () { /* nothing */ }
 }
 
 #endif // AMREX_USE_OMP

diff --git a/Src/Base/AMReX_OpenMP.cpp b/Src/Base/AMReX_OpenMP.cpp
@@ -135,8 +135,19 @@ namespace amrex
 #ifdef AMREX_USE_OMP
 namespace amrex::OpenMP
 {
- void init_threads ()
+ std::array<omp_lock_t,nlocks> omp_locks;
+
+ namespace {
+ unsigned int initialized = 0;
+ }
+
+ void Initialize ()
  {
+ if (initialized) {
+ ++initialized;
+ return;
+ }
+
  amrex::ParmParse pp("amrex");
  std::string omp_threads = "system";
  pp.queryAdd("omp_threads", omp_threads);
@@ -173,6 +184,25 @@ namespace amrex::OpenMP
  }
  }
  }
+
+ for (auto& lck : omp_locks) {
+ omp_init_lock(&lck);
+ }
+
+ ++initialized;
  }
+
+ void Finalize ()
+ {
+ if (initialized) {
+ --initialized;
+ if (initialized == 0) {
+ for (auto& lck : omp_locks) {
+ omp_destroy_lock(&lck);
+ }
+ }
+ }
+ }
+
 } // namespace amrex::OpenMP
 #endif // AMREX_USE_OMP