Commit

Merge branch 'AMReX-Codes:development' into development
ruohai0925 authored Jan 26, 2024
2 parents cbb9f6a + d5cc579 commit a471816
Showing 14 changed files with 590 additions and 70 deletions.
2 changes: 1 addition & 1 deletion Docs/sphinx_documentation/source/EB.rst
@@ -48,7 +48,7 @@ Here is a simple example of initialize the database for an embedded sphere.
     EB2::Build(shop, geom, 0, 0);

 Alternatively, the EB information can be initialized from an STL file
-specified by a :cpp:`ParmParse` parameter ``eb2.stl_file``. The
+specified by a :cpp:`ParmParse` parameter ``eb2.stl_file``. (This also requires setting ``eb2.geom_type = stl``.) The
 initialization is done by calling

 .. highlight:: c++
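The paragraph above amounts to two runtime inputs. A minimal, hedged sketch (not part of this commit; the file name is a placeholder) of those settings and of how they could be read back through ParmParse:

    // Hedged sketch: inputs-file settings described above, shown as comments,
    // plus the ParmParse queries that would read them. "sphere.stl" is a
    // placeholder path, not something taken from the commit.
    //
    //   eb2.geom_type = stl
    //   eb2.stl_file  = sphere.stl
    //
    #include <AMReX_ParmParse.H>
    #include <string>

    void check_stl_inputs ()
    {
        amrex::ParmParse pp("eb2");
        std::string geom_type, stl_file;
        pp.query("geom_type", geom_type);   // expected to be "stl"
        pp.query("stl_file",  stl_file);    // path to the STL geometry
    }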
60 changes: 37 additions & 23 deletions Src/Base/AMReX_BaseFab.H
@@ -3330,15 +3330,25 @@ BaseFab<T>::lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbo

     Array4<T> const& d = this->array();
     Array4<T const> const& s = src.const_array();
-    auto const& dlo = destbox.smallEnd();
-#if (AMREX_SPACEDIM == 3)
-    auto const& dhi = destbox.bigEnd();
-#endif
-    auto const& slo = srcbox.smallEnd();
-    auto const offset = slo - dlo;
-    auto const lenx = srcbox.length(0);
+    auto const& dlo = amrex::lbound(destbox);
+    auto const& dhi = amrex::ubound(destbox);
+    auto const& len = amrex::length(destbox);
+    auto const& slo = amrex::lbound(srcbox);
+    Dim3 const offset{slo.x-dlo.x, slo.y-dlo.y, slo.z-dlo.z};

+    int planedim;
+    int nplanes;
+    int plo;
+    if (len.z == 1) {
+        planedim = 1;
+        nplanes = len.y;
+        plo = dlo.y;
+    } else {
+        planedim = 2;
+        nplanes = len.z;
+        plo = dlo.z;
+    }
+
-    auto const nplanes = srcbox.length(AMREX_SPACEDIM-1);
     auto* mask = (bool*) amrex_mempool_alloc(sizeof(bool)*nplanes);
     for (int ip = 0; ip < nplanes; ++ip) {
         mask[ip] = false;
@@ -3348,27 +3358,31 @@ BaseFab<T>::lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbo
     int planes_left = nplanes;
     while (planes_left > 0) {
         AMREX_ASSERT(mm < nplanes);
-        auto const m = mm + dlo[AMREX_SPACEDIM-1];
+        auto const m = mm + plo;
         int ilock = m % OpenMP::nlocks;
         if (ilock < 0) { ilock += OpenMP::nlocks; }
         auto* lock = &(OpenMP::omp_locks[ilock]);
         if (omp_test_lock(lock))
         {
-            for (int n = 0; n < numcomp; ++n)
-            {
-#if (AMREX_SPACEDIM == 3)
-                for (int j = dlo[1]; j <= dhi[1]; ++j)
-                {
-                    IntVect div(dlo[0], j, m);
-#elif (AMREX_SPACEDIM == 2)
-                {
-                    IntVect div(dlo[0], m);
-#endif
-                    auto * pdst = d.ptr(div ,n+destcomp);
-                    auto const* psrc = s.ptr(div+offset,n+srccomp);
+            auto lo = dlo;
+            auto hi = dhi;
+            if (planedim == 1) {
+                lo.y = m;
+                hi.y = m;
+            } else {
+                lo.z = m;
+                hi.z = m;
+            }
+
+            for (int n = 0; n < numcomp; ++n) {
+                for (int k = lo.z; k <= hi.z; ++k) {
+                    for (int j = lo.y; j <= hi.y; ++j) {
+                        auto * pdst = d.ptr(dlo.x,j ,k ,n+destcomp);
+                        auto const* psrc = s.ptr(slo.x,j+offset.y,k+offset.z,n+ srccomp);
 #pragma omp simd
-                    for (int ii = 0; ii < lenx; ++ii) {
-                        pdst[ii] += psrc[ii];
+                        for (int ii = 0; ii < len.x; ++ii) {
+                            pdst[ii] += psrc[ii];
+                        }
                     }
                 }
             }
4 changes: 2 additions & 2 deletions Src/Base/AMReX_Extension.H
@@ -138,9 +138,9 @@
 #define AMREX_TO_STRING(X) AMREX_TO_STRING_HELPER(X)

 #if defined(__clang__) || defined(__CUDACC__) || defined(__HIP__) || defined(__INTEL_CLANG_COMPILER)
-#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(unroll (n)))
+#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(unroll n))
 #elif defined(__GNUC__)
-#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(GCC unroll (n)))
+#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(GCC unroll n))
 #else
 #define AMREX_UNROLL_LOOP(n)
 #endif
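The macro fixed above expands to a compiler-specific unroll pragma, or to nothing, so it is placed directly before the loop it should affect. A small usage sketch (the function and trip count are my own, not from the commit):

    #include <AMReX_Extension.H>

    float sum4 (const float* AMREX_RESTRICT a)
    {
        float s = 0.0f;
        AMREX_UNROLL_LOOP(4)          // hint the compiler to unroll the next loop 4x
        for (int i = 0; i < 4; ++i) {
            s += a[i];
        }
        return s;
    }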
18 changes: 18 additions & 0 deletions Src/Base/AMReX_Functional.H
@@ -63,6 +63,24 @@ struct LogicalOr
     }
 };

+template <typename T>
+struct Multiplies
+{
+    constexpr T operator() (const T & lhs, const T & rhs) const
+    {
+        return lhs * rhs;
+    }
+};
+
+template <typename T>
+struct Divides
+{
+    constexpr T operator() (const T & lhs, const T & rhs) const
+    {
+        return lhs / rhs;
+    }
+};
+
 }

 #endif
76 changes: 75 additions & 1 deletion Src/Base/AMReX_GpuAtomic.H
@@ -13,11 +13,13 @@ namespace amrex {
 namespace Gpu::Atomic {

 // For Add, Min and Max, we support int, unsigned int, long, unsigned long long, float and double.
+// For Multiply and Divide, we support generic types provided they are the same size as int or unsigned long long
+// and have *= and /= operators.
 // For LogicalOr and LogicalAnd, the data type is int.
 // For Exch and CAS, the data type is generic.
 // All these functions are non-atomic in host code!!!
 // If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only
-// HostDevice::Atomic is supported. We could certainly add more.
+// HostDevice::Atomic::Add is supported. We could certainly add more.

 namespace detail {

@@ -526,6 +528,78 @@
         ))
 #endif
     }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Multiply
+    ////////////////////////////////////////////////////////////////////////
+
+#ifdef AMREX_USE_GPU
+
+    template <typename T, std::enable_if_t<sizeof(T) == sizeof(int), int> = 0>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    T Multiply_device (T* const prod, T const value) noexcept
+    {
+        return detail::atomic_op<T, int>(prod,value,amrex::Multiplies<T>());
+    }
+
+    template <typename T, std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> = 0>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    T Multiply_device (T* const prod, T const value) noexcept
+    {
+        return detail::atomic_op<T, unsigned long long>(prod,value,amrex::Multiplies<T>());
+    }
+
+#endif
+
+    template<class T>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    T Multiply (T* const prod, T const value) noexcept
+    {
+        AMREX_IF_ON_DEVICE((
+            return Multiply_device(prod, value);
+        ))
+        AMREX_IF_ON_HOST((
+            auto const old = *prod;
+            *prod *= value;
+            return old;
+        ))
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Divide
+    ////////////////////////////////////////////////////////////////////////
+
+#ifdef AMREX_USE_GPU
+
+    template <typename T, std::enable_if_t<sizeof(T) == sizeof(int), int> = 0>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    T Divide_device (T* const quot, T const value) noexcept
+    {
+        return detail::atomic_op<T, int>(quot,value,amrex::Divides<T>());
+    }
+
+    template <typename T, std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> = 0>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    T Divide_device (T* const quot, T const value) noexcept
+    {
+        return detail::atomic_op<T, unsigned long long>(quot,value,amrex::Divides<T>());
+    }
+
+#endif
+
+    template<class T>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    T Divide (T* const quot, T const value) noexcept
+    {
+        AMREX_IF_ON_DEVICE((
+            return Divide_device(quot, value);
+        ))
+        AMREX_IF_ON_HOST((
+            auto const old = *quot;
+            *quot /= value;
+            return old;
+        ))
+    }
 }

 namespace HostDevice::Atomic {
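A brief usage sketch for the new Multiply/Divide atomics (my own example, not from the commit; it assumes amrex::Real is 8 bytes wide so the unsigned long long overload is selected, and, per the comment block above, the update is only atomic in device code):

    #include <AMReX_Gpu.H>
    #include <AMReX_GpuContainers.H>

    // Multiply every factor into a single shared accumulator.
    void accumulate_product (amrex::Real* accum,
                             amrex::Gpu::DeviceVector<amrex::Real> const& factors)
    {
        amrex::Real const* f = factors.data();
        int n = static_cast<int>(factors.size());
        amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
        {
            amrex::Gpu::Atomic::Multiply(accum, f[i]);
        });
    }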
32 changes: 30 additions & 2 deletions Src/Base/AMReX_Reduce.H
@@ -67,14 +67,14 @@ namespace Reduce::detail {

 template <std::size_t I, typename T, typename P>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void for_each_init (T& t)
+constexpr void for_each_init (T& t)
 {
     P().init(amrex::get<I>(t));
 }

 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void for_each_init (T& t)
+constexpr void for_each_init (T& t)
 {
     P().init(amrex::get<I>(t));
     for_each_init<I+1,T,P1,Ps...>(t);
@@ -1275,6 +1275,34 @@ bool AnyOf (Box const& box, P&&pred)

 #endif

+/**
+ * \brief Return a GpuTuple containing the identity element for each operation in ReduceOps.
+ * For example 0, +inf and -inf for ReduceOpSum, ReduceOpMin and ReduceOpMax respectively.
+ */
+template <typename... Ts, typename... Ps>
+AMREX_GPU_HOST_DEVICE
+constexpr GpuTuple<Ts...>
+IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...>) noexcept
+{
+    GpuTuple<Ts...> r{};
+    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
+    return r;
+}
+
+/**
+ * \brief Return a GpuTuple containing the identity element for each ReduceOp in TypeList.
+ * For example 0, +inf and -inf for ReduceOpSum, ReduceOpMin and ReduceOpMax respectively.
+ */
+template <typename... Ts, typename... Ps>
+AMREX_GPU_HOST_DEVICE
+constexpr GpuTuple<Ts...>
+IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
+{
+    GpuTuple<Ts...> r{};
+    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
+    return r;
+}
+
 }

 #endif
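A hedged usage sketch for IdentityTuple (the component types and the choice of reduce operations are mine, not from the commit):

    #include <AMReX_Reduce.H>
    #include <AMReX_Tuple.H>
    #include <AMReX_TypeList.H>

    // Start an accumulator at the identity of each operation: 0 for the sum slot,
    // +inf for the min slot and -inf for the max slot, as the doc comment states.
    amrex::GpuTuple<amrex::Real, amrex::Real, amrex::Real>
    make_accumulator ()
    {
        return amrex::IdentityTuple(
            amrex::GpuTuple<amrex::Real, amrex::Real, amrex::Real>{},
            amrex::TypeList<amrex::ReduceOpSum, amrex::ReduceOpMin, amrex::ReduceOpMax>{});
    }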
14 changes: 14 additions & 0 deletions Src/Base/AMReX_Tuple.H
@@ -378,6 +378,20 @@ ForwardAsTuple (Ts&&... args) noexcept
     return GpuTuple<Ts&&...>(std::forward<Ts>(args)...);
 }

+// MakeZeroTuple
+
+/**
+ * \brief Return a GpuTuple containing all zeros.
+ * Note that a default-constructed GpuTuple can have uninitialized values.
+ */
+template <typename... Ts>
+AMREX_GPU_HOST_DEVICE
+constexpr GpuTuple<Ts...>
+MakeZeroTuple (GpuTuple<Ts...>) noexcept
+{
+    return GpuTuple<Ts...>(static_cast<Ts>(0)...);
+}
+
 }

 #endif /*AMREX_TUPLE_H_*/
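A hedged usage sketch for MakeZeroTuple (the element types are my own choice):

    #include <AMReX_Tuple.H>

    // Zero-initialize explicitly, since the doc comment above warns that a
    // default-constructed GpuTuple can hold uninitialized values.
    amrex::GpuTuple<int, double, long> make_zeros ()
    {
        return amrex::MakeZeroTuple(amrex::GpuTuple<int, double, long>{});
    }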
49 changes: 48 additions & 1 deletion Src/Base/AMReX_TypeList.H
@@ -93,7 +93,7 @@ ForEach (TypeList<Ts...>, F&& f)
     // dst and src are either MultiFab or fMultiFab
     auto tt = CartesianProduct(TypeList<MultiFab,fMultiFab>{},
                                TypeList<MultiFab,fMultiFab>{});
-    bool r = ForEachUtil(tt, [&] (auto t) -> bool
+    bool r = ForEachUntil(tt, [&] (auto t) -> bool
     {
         using MF0 = TypeAt<0,decltype(t)>;
         using MF1 = TypeAt<1,decltype(t)>;
@@ -151,6 +151,53 @@ constexpr auto CartesianProduct (Ls...) {
     return (TypeList<TypeList<>>{} * ... * Ls{});
 }

+namespace detail {
+    // return TypeList<T, T, T, T, ... (N times)> by using the fast power algorithm
+    template <class T, std::size_t N>
+    constexpr auto SingleTypeMultiplier_impl () {
+        if constexpr (N == 0) {
+            return TypeList<>{};
+        } else if constexpr (N == 1) {
+            return TypeList<T>{};
+        } else if constexpr (N % 2 == 0) {
+            return SingleTypeMultiplier_impl<T, N / 2>() + SingleTypeMultiplier_impl<T, N / 2>();
+        } else {
+            return SingleTypeMultiplier_impl<T, N - 1>() + TypeList<T>{};
+        }
+    }
+
+    // overload of SingleTypeMultiplier for multiple types:
+    // convert T[N] to T, T, T, T, ... (N times with N >= 1)
+    template <class T, std::size_t N>
+    constexpr auto SingleTypeMultiplier (const T (&)[N]) {
+        return SingleTypeMultiplier_impl<T, N>();
+    }
+
+    // overload of SingleTypeMultiplier for one regular type
+    template <class T>
+    constexpr auto SingleTypeMultiplier (T) {
+        return TypeList<T>{};
+    }
+
+    // apply the types of the input TypeList as template arguments to TParam
+    template <template <class...> class TParam, class... Args>
+    constexpr auto TApply (TypeList<Args...>) {
+        return TypeList<TParam<Args...>>{};
+    }
+}
+
+/**
+ * \brief Return the first template argument with the later arguments applied to it.
+ * Types of the form T[N] are expanded to T, T, T, T, ... (N times with N >= 1).
+ *
+ * For example, TypeMultiplier<ReduceData, Real[4], int[2], Long>
+ * is an alias to the type ReduceData<Real, Real, Real, Real, int, int, Long>.
+ */
+template <template <class...> class TParam, class... Types>
+using TypeMultiplier = TypeAt<0, decltype(detail::TApply<TParam>(
+    (TypeList<>{} + ... + detail::SingleTypeMultiplier(Types{}))
+))>;
+
 }

 #endif
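A hedged compile-time check of the expansion described in the doc comment above (MyTypes is a hypothetical stand-in template; the mapping mirrors the ReduceData example):

    #include <type_traits>
    #include <AMReX_TypeList.H>

    template <class... Ts> struct MyTypes {};  // hypothetical stand-in for e.g. ReduceData

    static_assert(std::is_same_v<
                      amrex::TypeMultiplier<MyTypes, double[4], int[2], long>,
                      MyTypes<double, double, double, double, int, int, long>>,
                  "T[N] arguments are flattened into N repetitions of T");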
4 changes: 2 additions & 2 deletions Src/Particle/AMReX_DenseBins.H
@@ -481,7 +481,7 @@ public:
         m_offsets.resize(0);
         m_offsets.resize(nbins+1);

-        for (int i = 0; i < nitems; ++i) {
+        for (N i = 0; i < nitems; ++i) {
             m_bins[i] = call_f(f,v,i);
             ++m_counts[m_bins[i]];
         }

@@ -490,7 +490,7 @@

         Gpu::copy(Gpu::deviceToDevice, m_offsets.begin(), m_offsets.end(), m_counts.begin());

-        for (int i = 0; i < nitems; ++i) {
+        for (N i = 0; i < nitems; ++i) {
             index_type index = m_counts[m_bins[i]]++;
             m_perm[index] = i;
         }