Commit

Merge branch 'AMReX-Codes:development' into development
ruohai0925 authored Jan 26, 2024
2 parents cbb9f6a + d5cc579 commit a471816
Showing 14 changed files with 590 additions and 70 deletions.
2 changes: 1 addition & 1 deletion Docs/sphinx_documentation/source/EB.rst
@@ -48,7 +48,7 @@ Here is a simple example of initialize the database for an embedded sphere.
     EB2::Build(shop, geom, 0, 0);

 Alternatively, the EB information can be initialized from an STL file
-specified by a :cpp:`ParmParse` parameter ``eb2.stl_file``. The
+specified by a :cpp:`ParmParse` parameter ``eb2.stl_file``. (This also requires setting ``eb2.geom_type = stl``.) The
 initialization is done by calling

 .. highlight:: c++
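The paragraph above amounts to two runtime inputs. A minimal, hedged sketch (not part of this commit; the file name is a placeholder) of those settings and of how they could be read back through ParmParse:

    // Hedged sketch: inputs-file settings described above, shown as comments,
    // plus the ParmParse queries that would read them. "sphere.stl" is a
    // placeholder path, not something taken from the commit.
    //
    //   eb2.geom_type = stl
    //   eb2.stl_file  = sphere.stl
    //
    #include <AMReX_ParmParse.H>
    #include <string>

    void check_stl_inputs ()
    {
        amrex::ParmParse pp("eb2");
        std::string geom_type, stl_file;
        pp.query("geom_type", geom_type);   // expected to be "stl"
        pp.query("stl_file",  stl_file);    // path to the STL geometry
    }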
60 changes: 37 additions & 23 deletions Src/Base/AMReX_BaseFab.H
@@ -3330,15 +3330,25 @@ BaseFab<T>::lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbo

     Array4<T> const& d = this->array();
     Array4<T const> const& s = src.const_array();
-    auto const& dlo = destbox.smallEnd();
-#if (AMREX_SPACEDIM == 3)
-    auto const& dhi = destbox.bigEnd();
-#endif
-    auto const& slo = srcbox.smallEnd();
-    auto const offset = slo - dlo;
-    auto const lenx = srcbox.length(0);
+    auto const& dlo = amrex::lbound(destbox);
+    auto const& dhi = amrex::ubound(destbox);
+    auto const& len = amrex::length(destbox);
+    auto const& slo = amrex::lbound(srcbox);
+    Dim3 const offset{slo.x-dlo.x, slo.y-dlo.y, slo.z-dlo.z};

+    int planedim;
+    int nplanes;
+    int plo;
+    if (len.z == 1) {
+        planedim = 1;
+        nplanes = len.y;
+        plo = dlo.y;
+    } else {
+        planedim = 2;
+        nplanes = len.z;
+        plo = dlo.z;
+    }
+
-    auto const nplanes = srcbox.length(AMREX_SPACEDIM-1);
     auto* mask = (bool*) amrex_mempool_alloc(sizeof(bool)*nplanes);
     for (int ip = 0; ip < nplanes; ++ip) {
         mask[ip] = false;
@@ -3348,27 +3358,31 @@ BaseFab<T>::lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbo
     int planes_left = nplanes;
     while (planes_left > 0) {
         AMREX_ASSERT(mm < nplanes);
-        auto const m = mm + dlo[AMREX_SPACEDIM-1];
+        auto const m = mm + plo;
         int ilock = m % OpenMP::nlocks;
         if (ilock < 0) { ilock += OpenMP::nlocks; }
         auto* lock = &(OpenMP::omp_locks[ilock]);
         if (omp_test_lock(lock))
         {
-            for (int n = 0; n < numcomp; ++n)
-            {
-#if (AMREX_SPACEDIM == 3)
-                for (int j = dlo[1]; j <= dhi[1]; ++j)
-                {
-                    IntVect div(dlo[0], j, m);
-#elif (AMREX_SPACEDIM == 2)
-                {
-                    IntVect div(dlo[0], m);
-#endif
-                    auto * pdst = d.ptr(div ,n+destcomp);
-                    auto const* psrc = s.ptr(div+offset,n+srccomp);
+            auto lo = dlo;
+            auto hi = dhi;
+            if (planedim == 1) {
+                lo.y = m;
+                hi.y = m;
+            } else {
+                lo.z = m;
+                hi.z = m;
+            }
+
+            for (int n = 0; n < numcomp; ++n) {
+                for (int k = lo.z; k <= hi.z; ++k) {
+                    for (int j = lo.y; j <= hi.y; ++j) {
+                        auto * pdst = d.ptr(dlo.x,j ,k ,n+destcomp);
+                        auto const* psrc = s.ptr(slo.x,j+offset.y,k+offset.z,n+ srccomp);
 #pragma omp simd
-                    for (int ii = 0; ii < lenx; ++ii) {
-                        pdst[ii] += psrc[ii];
+                        for (int ii = 0; ii < len.x; ++ii) {
+                            pdst[ii] += psrc[ii];
+                        }
                     }
                 }
             }
4 changes: 2 additions & 2 deletions Src/Base/AMReX_Extension.H
@@ -138,9 +138,9 @@
 #define AMREX_TO_STRING(X) AMREX_TO_STRING_HELPER(X)

 #if defined(__clang__) || defined(__CUDACC__) || defined(__HIP__) || defined(__INTEL_CLANG_COMPILER)
-#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(unroll (n)))
+#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(unroll n))
 #elif defined(__GNUC__)
-#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(GCC unroll (n)))
+#define AMREX_UNROLL_LOOP(n) _Pragma(AMREX_TO_STRING(GCC unroll n))
 #else
 #define AMREX_UNROLL_LOOP(n)
 #endif
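The macro fixed above expands to a compiler-specific unroll pragma, or to nothing, so it is placed directly before the loop it should affect. A small usage sketch (the function and trip count are my own, not from the commit):

    #include <AMReX_Extension.H>

    float sum4 (const float* AMREX_RESTRICT a)
    {
        float s = 0.0f;
        AMREX_UNROLL_LOOP(4)          // hint the compiler to unroll the next loop 4x
        for (int i = 0; i < 4; ++i) {
            s += a[i];
        }
        return s;
    }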
18 changes: 18 additions & 0 deletions Src/Base/AMReX_Functional.H
@@ -63,6 +63,24 @@ struct LogicalOr
     }
 };

+template <typename T>
+struct Multiplies
+{
+    constexpr T operator() (const T & lhs, const T & rhs) const
+    {
+        return lhs * rhs;
+    }
+};
+
+template <typename T>
+struct Divides
+{
+    constexpr T operator() (const T & lhs, const T & rhs) const
+    {
+        return lhs / rhs;
+    }
+};
+
 }

 #endif
76 changes: 75 additions & 1 deletion Src/Base/AMReX_GpuAtomic.H
@@ -13,11 +13,13 @@ namespace amrex {
 namespace Gpu::Atomic {

 // For Add, Min and Max, we support int, unsigned int, long, unsigned long long, float and double.
+// For Multiply and Divide, we support generic types provided they are the same size as int or unsigned long long
+// and have *= and /= operators.
 // For LogicalOr and LogicalAnd, the data type is int.
 // For Exch and CAS, the data type is generic.
 // All these functions are non-atomic in host code!!!
 // If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only
-// HostDevice::Atomic is supported. We could certainly add more.
+// HostDevice::Atomic::Add is supported. We could certainly add more.

 namespace detail {

@@ -526,6 +528,78 @@
         ))
 #endif
     }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Multiply
+    ////////////////////////////////////////////////////////////////////////
+
+#ifdef AMREX_USE_GPU
+
+    template <typename T, std::enable_if_t<sizeof(T) == sizeof(int), int> = 0>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    T Multiply_device (T* const prod, T const value) noexcept
+    {
+        return detail::atomic_op<T, int>(prod,value,amrex::Multiplies<T>());
+    }
+
+    template <typename T, std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> = 0>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    T Multiply_device (T* const prod, T const value) noexcept
+    {
+        return detail::atomic_op<T, unsigned long long>(prod,value,amrex::Multiplies<T>());
+    }
+
+#endif
+
+    template<class T>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    T Multiply (T* const prod, T const value) noexcept
+    {
+        AMREX_IF_ON_DEVICE((
+            return Multiply_device(prod, value);
+        ))
+        AMREX_IF_ON_HOST((
+            auto const old = *prod;
+            *prod *= value;
+            return old;
+        ))
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Divide
+    ////////////////////////////////////////////////////////////////////////
+
+#ifdef AMREX_USE_GPU
+
+    template <typename T, std::enable_if_t<sizeof(T) == sizeof(int), int> = 0>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    T Divide_device (T* const quot, T const value) noexcept
+    {
+        return detail::atomic_op<T, int>(quot,value,amrex::Divides<T>());
+    }
+
+    template <typename T, std::enable_if_t<sizeof(T) == sizeof(unsigned long long), int> = 0>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    T Divide_device (T* const quot, T const value) noexcept
+    {
+        return detail::atomic_op<T, unsigned long long>(quot,value,amrex::Divides<T>());
+    }
+
+#endif
+
+    template<class T>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    T Divide (T* const quot, T const value) noexcept
+    {
+        AMREX_IF_ON_DEVICE((
+            return Divide_device(quot, value);
+        ))
+        AMREX_IF_ON_HOST((
+            auto const old = *quot;
+            *quot /= value;
+            return old;
+        ))
+    }
 }

 namespace HostDevice::Atomic {
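A brief usage sketch for the new Multiply/Divide atomics (my own example, not from the commit; it assumes amrex::Real is 8 bytes wide so the unsigned long long overload is selected, and, per the comment block above, the update is only atomic in device code):

    #include <AMReX_Gpu.H>
    #include <AMReX_GpuContainers.H>

    // Multiply every factor into a single shared accumulator.
    void accumulate_product (amrex::Real* accum,
                             amrex::Gpu::DeviceVector<amrex::Real> const& factors)
    {
        amrex::Real const* f = factors.data();
        int n = static_cast<int>(factors.size());
        amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
        {
            amrex::Gpu::Atomic::Multiply(accum, f[i]);
        });
    }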
32 changes: 30 additions & 2 deletions Src/Base/AMReX_Reduce.H
@@ -67,14 +67,14 @@ namespace Reduce::detail {

 template <std::size_t I, typename T, typename P>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void for_each_init (T& t)
+constexpr void for_each_init (T& t)
 {
     P().init(amrex::get<I>(t));
 }

 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void for_each_init (T& t)
+constexpr void for_each_init (T& t)
 {
     P().init(amrex::get<I>(t));
     for_each_init<I+1,T,P1,Ps...>(t);
@@ -1275,6 +1275,34 @@ bool AnyOf (Box const& box, P&&pred)

 #endif

+/**
+ * \brief Return a GpuTuple containing the identity element for each operation in ReduceOps.
+ * For example 0, +inf and -inf for ReduceOpSum, ReduceOpMin and ReduceOpMax respectively.
+ */
+template <typename... Ts, typename... Ps>
+AMREX_GPU_HOST_DEVICE
+constexpr GpuTuple<Ts...>
+IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...>) noexcept
+{
+    GpuTuple<Ts...> r{};
+    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
+    return r;
+}
+
+/**
+ * \brief Return a GpuTuple containing the identity element for each ReduceOp in TypeList.
+ * For example 0, +inf and -inf for ReduceOpSum, ReduceOpMin and ReduceOpMax respectively.
+ */
+template <typename... Ts, typename... Ps>
+AMREX_GPU_HOST_DEVICE
+constexpr GpuTuple<Ts...>
+IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
+{
+    GpuTuple<Ts...> r{};
+    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
+    return r;
+}
+
 }

 #endif
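A hedged usage sketch for IdentityTuple (the component types and the choice of reduce operations are mine, not from the commit):

    #include <AMReX_Reduce.H>
    #include <AMReX_Tuple.H>
    #include <AMReX_TypeList.H>

    // Start an accumulator at the identity of each operation: 0 for the sum slot,
    // +inf for the min slot and -inf for the max slot, as the doc comment states.
    amrex::GpuTuple<amrex::Real, amrex::Real, amrex::Real>
    make_accumulator ()
    {
        return amrex::IdentityTuple(
            amrex::GpuTuple<amrex::Real, amrex::Real, amrex::Real>{},
            amrex::TypeList<amrex::ReduceOpSum, amrex::ReduceOpMin, amrex::ReduceOpMax>{});
    }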
14 changes: 14 additions & 0 deletions Src/Base/AMReX_Tuple.H
@@ -378,6 +378,20 @@ ForwardAsTuple (Ts&&... args) noexcept
     return GpuTuple<Ts&&...>(std::forward<Ts>(args)...);
 }

+// MakeZeroTuple
+
+/**
+ * \brief Return a GpuTuple containing all zeros.
+ * Note that a default-constructed GpuTuple can have uninitialized values.
+ */
+template <typename... Ts>
+AMREX_GPU_HOST_DEVICE
+constexpr GpuTuple<Ts...>
+MakeZeroTuple (GpuTuple<Ts...>) noexcept
+{
+    return GpuTuple<Ts...>(static_cast<Ts>(0)...);
+}
+
 }

 #endif /*AMREX_TUPLE_H_*/
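A hedged usage sketch for MakeZeroTuple (the element types are my own choice):

    #include <AMReX_Tuple.H>

    // Zero-initialize explicitly, since the doc comment above warns that a
    // default-constructed GpuTuple can hold uninitialized values.
    amrex::GpuTuple<int, double, long> make_zeros ()
    {
        return amrex::MakeZeroTuple(amrex::GpuTuple<int, double, long>{});
    }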
49 changes: 48 additions & 1 deletion Src/Base/AMReX_TypeList.H
@@ -93,7 +93,7 @@ ForEach (TypeList<Ts...>, F&& f)
     // dst and src are either MultiFab or fMultiFab
     auto tt = CartesianProduct(TypeList<MultiFab,fMultiFab>{},
                                TypeList<MultiFab,fMultiFab>{});
-    bool r = ForEachUtil(tt, [&] (auto t) -> bool
+    bool r = ForEachUntil(tt, [&] (auto t) -> bool
     {
         using MF0 = TypeAt<0,decltype(t)>;
         using MF1 = TypeAt<1,decltype(t)>;
@@ -151,6 +151,53 @@ constexpr auto CartesianProduct (Ls...) {
     return (TypeList<TypeList<>>{} * ... * Ls{});
 }

+namespace detail {
+    // return TypeList<T, T, T, T, ... (N times)> by using the fast power algorithm
+    template <class T, std::size_t N>
+    constexpr auto SingleTypeMultiplier_impl () {
+        if constexpr (N == 0) {
+            return TypeList<>{};
+        } else if constexpr (N == 1) {
+            return TypeList<T>{};
+        } else if constexpr (N % 2 == 0) {
+            return SingleTypeMultiplier_impl<T, N / 2>() + SingleTypeMultiplier_impl<T, N / 2>();
+        } else {
+            return SingleTypeMultiplier_impl<T, N - 1>() + TypeList<T>{};
+        }
+    }
+
+    // overload of SingleTypeMultiplier for multiple types:
+    // convert T[N] to T, T, T, T, ... (N times with N >= 1)
+    template <class T, std::size_t N>
+    constexpr auto SingleTypeMultiplier (const T (&)[N]) {
+        return SingleTypeMultiplier_impl<T, N>();
+    }
+
+    // overload of SingleTypeMultiplier for one regular type
+    template <class T>
+    constexpr auto SingleTypeMultiplier (T) {
+        return TypeList<T>{};
+    }
+
+    // apply the types of the input TypeList as template arguments to TParam
+    template <template <class...> class TParam, class... Args>
+    constexpr auto TApply (TypeList<Args...>) {
+        return TypeList<TParam<Args...>>{};
+    }
+}
+
+/**
+ * \brief Return the first template argument with the later arguments applied to it.
+ * Types of the form T[N] are expanded to T, T, T, T, ... (N times with N >= 1).
+ *
+ * For example, TypeMultiplier<ReduceData, Real[4], int[2], Long>
+ * is an alias to the type ReduceData<Real, Real, Real, Real, int, int, Long>.
+ */
+template <template <class...> class TParam, class... Types>
+using TypeMultiplier = TypeAt<0, decltype(detail::TApply<TParam>(
+    (TypeList<>{} + ... + detail::SingleTypeMultiplier(Types{}))
+))>;
+
 }

 #endif
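A hedged compile-time check of the expansion described in the doc comment above (MyTypes is a hypothetical stand-in template; the mapping mirrors the ReduceData example):

    #include <type_traits>
    #include <AMReX_TypeList.H>

    template <class... Ts> struct MyTypes {};  // hypothetical stand-in for e.g. ReduceData

    static_assert(std::is_same_v<
                      amrex::TypeMultiplier<MyTypes, double[4], int[2], long>,
                      MyTypes<double, double, double, double, int, int, long>>,
                  "T[N] arguments are flattened into N repetitions of T");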
4 changes: 2 additions & 2 deletions Src/Particle/AMReX_DenseBins.H
@@ -481,7 +481,7 @@ public:
         m_offsets.resize(0);
         m_offsets.resize(nbins+1);

-        for (int i = 0; i < nitems; ++i) {
+        for (N i = 0; i < nitems; ++i) {
             m_bins[i] = call_f(f,v,i);
             ++m_counts[m_bins[i]];
         }

@@ -490,7 +490,7 @@

         Gpu::copy(Gpu::deviceToDevice, m_offsets.begin(), m_offsets.end(), m_counts.begin());

-        for (int i = 0; i < nitems; ++i) {
+        for (N i = 0; i < nitems; ++i) {
             index_type index = m_counts[m_bins[i]]++;
             m_perm[index] = i;
         }