Merge branch 'AMReX-Codes:development' into development
ruohai0925 authored May 23, 2024
2 parents c943626 + e6c93bf commit f7f6baf
Showing 6 changed files with 108 additions and 26 deletions.
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -1,5 +1,13 @@
cmake_minimum_required(VERSION 3.18)

+#
+# Setting a cmake_policy to OLD is deprecated by definition and will raise a
+# verbose warning
+#
+if(CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR)
+    set(CMAKE_WARN_DEPRECATED OFF CACHE BOOL "" FORCE)
+endif()

#
# Allow for MSVC Runtime library controls
#
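The guard limits this workaround to builds where AMReX itself is the top-level project: CMAKE_WARN_DEPRECATED is a standard CMake cache variable consulted before deprecation warnings are emitted, so forcing it to OFF silences the warning that setting a policy to OLD would otherwise produce on every configure.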
8 changes: 8 additions & 0 deletions Src/Base/AMReX_OpenMP.H
@@ -17,7 +17,15 @@ namespace amrex::OpenMP {
    void Initialize ();
    void Finalize ();

+#if defined(_WIN32)
+    void** get_lock_impl (int ilock);
+
+    inline omp_lock_t* get_lock (int ilock) {
+        return (omp_lock_t*)(*(get_lock_impl(ilock)));
+    }
+#else
    omp_lock_t* get_lock (int ilock);
+#endif
}

#else // AMREX_USE_OMP
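Both branches give callers the same interface: get_lock returns an omp_lock_t* suitable for the standard OpenMP lock routines, and the Windows build merely keeps each lock behind an opaque void* (get_lock_impl) that the inline wrapper casts back. A minimal usage sketch, assuming an OpenMP-enabled build (update_slot is a hypothetical caller; the key may be any int):

    #include <omp.h>
    #include <AMReX_OpenMP.H>

    // Hypothetical caller: serialize updates to one of a set of shared slots.
    void update_slot (int islot)
    {
        omp_lock_t* lock = amrex::OpenMP::get_lock(islot); // key is hashed into the pool
        omp_set_lock(lock);
        // ... critical section for slot islot ...
        omp_unset_lock(lock);
    }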
24 changes: 24 additions & 0 deletions Src/Base/AMReX_OpenMP.cpp
@@ -137,7 +137,11 @@ namespace amrex::OpenMP
{
    namespace {
        constexpr int nlocks = 128;
+#if defined(_WIN32)
+        void* omp_locks[nlocks];
+#else
        omp_lock_t omp_locks[nlocks];
+#endif
        unsigned int initialized = 0;
    }

@@ -183,9 +187,17 @@ namespace amrex::OpenMP
            }
        }

+#if defined(_WIN32)
+        for (auto& vp : omp_locks) {
+            auto* p = new omp_lock_t;
+            omp_init_lock(p);
+            vp = (void*) p;
+        }
+#else
        for (auto& lck : omp_locks) {
            omp_init_lock(&lck);
        }
+#endif

        ++initialized;
    }
@@ -195,14 +207,26 @@ namespace amrex::OpenMP
        if (initialized) {
            --initialized;
            if (initialized == 0) {
+#if defined(_WIN32)
+                for (auto vp : omp_locks) {
+                    auto* p = (omp_lock_t*)vp;
+                    omp_destroy_lock(p);
+                    delete p;
+                }
+#else
                for (auto& lck : omp_locks) {
                    omp_destroy_lock(&lck);
                }
+#endif
            }
        }
    }

+#if defined(_WIN32)
+    void** get_lock_impl (int ilock)
+#else
    omp_lock_t* get_lock (int ilock)
+#endif
    {
        ilock = ilock % nlocks;
        if (ilock < 0) { ilock += nlocks; }
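The shared body of get_lock/get_lock_impl wraps an arbitrary int key into [0, nlocks); the extra branch is needed because the C++ % operator truncates toward zero, so a negative key yields a negative remainder. The same arithmetic as a standalone sketch (wrap_lock_index is a hypothetical name):

    // (-1) % 128 == -1 in C++, hence the fix-up to land in [0, nlocks).
    int wrap_lock_index (int ilock, int nlocks = 128)
    {
        ilock = ilock % nlocks;              // now in (-nlocks, nlocks)
        if (ilock < 0) { ilock += nlocks; }  // now in [0, nlocks)
        return ilock;
    }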
2 changes: 1 addition & 1 deletion Src/Particle/AMReX_ParticleInit.H
@@ -873,7 +873,7 @@ InitFromBinaryFile (const std::string& file,
        auto& pmap = m_particles[lev];
        auto& tmp_pmap = tmp_particles[lev];

-        for (auto kv : pmap) {
+        for (auto& kv : pmap) {
            auto& aos = kv.second.GetArrayOfStructs()();
            auto& tmp_aos = tmp_pmap[kv.first].GetArrayOfStructs()();

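The one-character fix avoids a deep copy per iteration: for (auto kv : pmap) deduces a value type, so each pass copies the key and the entire particle container it maps to before GetArrayOfStructs() is called on the copy; for (auto& kv : pmap) binds to the stored element. A self-contained illustration, with std::map and std::vector standing in for the particle types:

    #include <map>
    #include <vector>

    int main ()
    {
        std::map<int, std::vector<double>> pmap{{0, {1.0, 2.0}}, {1, {3.0}}};

        for (auto kv : pmap) {        // copies each {key, value} element
            kv.second.push_back(0.0); // mutates the copy; pmap is unchanged
        }

        for (auto& kv : pmap) {       // binds to the element inside the map
            kv.second.push_back(0.0); // mutates pmap itself
        }
    }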
23 changes: 17 additions & 6 deletions Src/Particle/AMReX_ParticleLocator.H
@@ -35,13 +35,13 @@ struct AssignGrid
        m_plo(a_geom.ProbLoArray()), m_dxi(a_geom.InvCellSizeArray())
    {
        // clamp bin size and num_bins to 1 for AMREX_SPACEDIM < 3
-        m_bin_size.x = amrex::max(m_bin_size.x, 1);
-        m_bin_size.y = amrex::max(m_bin_size.y, 1);
-        m_bin_size.z = amrex::max(m_bin_size.z, 1);
+        if (m_bin_size.x >= 0) {m_bin_size.x = amrex::max(m_bin_size.x, 1);}
+        if (m_bin_size.y >= 0) {m_bin_size.y = amrex::max(m_bin_size.y, 1);}
+        if (m_bin_size.z >= 0) {m_bin_size.z = amrex::max(m_bin_size.z, 1);}

-        m_num_bins.x = amrex::max(m_num_bins.x, 1);
-        m_num_bins.y = amrex::max(m_num_bins.y, 1);
-        m_num_bins.z = amrex::max(m_num_bins.z, 1);
+        if (m_bin_size.x >= 0) {m_num_bins.x = amrex::max(m_num_bins.x, 1);}
+        if (m_bin_size.y >= 0) {m_num_bins.y = amrex::max(m_num_bins.y, 1);}
+        if (m_bin_size.z >= 0) {m_num_bins.z = amrex::max(m_num_bins.z, 1);}
    }

    template <typename P, typename Assignor = DefaultAssignor>
@@ -55,6 +55,9 @@ struct AssignGrid
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    int operator() (const IntVect& iv, int nGrow=0) const noexcept
    {
+        if (AMREX_D_TERM((m_num_bins.x == 0), && (m_num_bins.y == 0), && (m_num_bins.z == 0))) {
+            return -1;
+        }
        const auto lo = iv.dim3();
        int ix_lo = amrex::max((lo.x - nGrow - m_lo.x) / m_bin_size.x - 1, 0);
        int iy_lo = amrex::max((lo.y - nGrow - m_lo.y) / m_bin_size.y - 1, 0);
@@ -117,6 +120,14 @@ public:
        m_device_boxes.resize(num_boxes);
        Gpu::copyAsync(Gpu::hostToDevice, m_host_boxes.begin(), m_host_boxes.end(), m_device_boxes.begin());

+        if (num_boxes == 0) {
+            m_bins_lo = IntVect(AMREX_D_DECL( 0, 0, 0));
+            m_bins_hi = IntVect(AMREX_D_DECL(-1, -1, -1));
+            m_bin_size = IntVect(AMREX_D_DECL(-1, -1, -1));
+            m_num_bins = IntVect(AMREX_D_DECL( 0, 0, 0));
+            return;
+        }

        // compute the lo, hi and the max box size in each direction
        ReduceOps<AMREX_D_DECL(ReduceOpMin, ReduceOpMin, ReduceOpMin),
                  AMREX_D_DECL(ReduceOpMax, ReduceOpMax, ReduceOpMax),
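Together these changes make an empty locator representable: building from zero boxes now stores m_num_bins = 0 and the sentinel bin size -1 (which the guarded clamps in the AssignGrid constructor leave untouched), and every query through operator() short-circuits to -1, the conventional "no grid contains this cell" answer, instead of searching with meaningless bin geometry. A deliberately tiny sketch of the same convention (TinyLocator is hypothetical, not an AMReX class):

    // Hypothetical, simplified analogue of the empty-locator guard above.
    struct TinyLocator
    {
        int num_bins = 0; // 0 when the locator was built from zero boxes

        int operator() (int /*cell*/) const
        {
            if (num_bins == 0) { return -1; } // empty: nothing can match
            // ... otherwise search the candidate bins for a grid index ...
            return 0;
        }
    };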
69 changes: 50 additions & 19 deletions Src/Particle/AMReX_ParticleUtil.H
@@ -697,9 +697,20 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
{
    BL_PROFILE("PermutationForDeposition()");

-    constexpr index_type gpu_block_size = 1024;
-    constexpr index_type gpu_block_size_m1 = gpu_block_size - 1;
-    constexpr index_type llist_guard = std::numeric_limits<index_type>::max();
+#if defined(AMREX_USE_HIP)
+    // MI250X has a small L2 cache and is more tolerant of atomic add contention,
+    // so we use a small block size of 64 and the compressed layout.
+    static constexpr index_type gpu_block_size = 64;
+    static constexpr bool compressed_layout = true;
+#else
+    // A100 has a larger L2 cache and is very sensitive to atomic add contention,
+    // so we use a large block size of 1024 and not the compressed layout.
+    static constexpr index_type gpu_block_size = 1024;
+    static constexpr bool compressed_layout = false;
+#endif
+
+    static constexpr index_type gpu_block_size_m1 = gpu_block_size - 1;
+    static constexpr index_type llist_guard = std::numeric_limits<index_type>::max();

    // round up to gpu_block_size
    nbins = (nbins + gpu_block_size_m1) / gpu_block_size * gpu_block_size;
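Because compressed_layout is constexpr, the if constexpr branches in the kernel below are resolved at compile time and each backend compiles exactly one strategy. The round-up is the usual integer idiom: with gpu_block_size = 1024, nbins = 1000 becomes (1000 + 1023)/1024*1024 = 1024, nbins = 1024 stays 1024, and nbins = 1025 becomes 2048, so the subsequent launch of nbins / gpu_block_size blocks covers every bin with whole blocks.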
@@ -722,9 +733,34 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n

#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    amrex::launch<gpu_block_size>(nbins / gpu_block_size, Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE () {
+        [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE () {
            __shared__ index_type sdata[gpu_block_size];
-            index_type current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+            __shared__ index_type global_idx_start;
+            __shared__ index_type idx_start;
+
+            index_type current_idx = 0;
+
+            if constexpr (compressed_layout) {
+                // Compressed layout: subsequent sweeps of up to gpu_block_size contiguous particles
+                // are put right next to each other, while without the compressed layout,
+                // there can be other particle sweeps from different locations between them.
+                current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+
+                index_type num_particles_thread = 0;
+                while (current_idx != llist_guard) {
+                    ++num_particles_thread;
+                    current_idx = pllist_next[current_idx];
+                }
+
+                index_type num_particles_block =
+                    Gpu::blockReduceSum<gpu_block_size>(num_particles_thread);
+
+                if (threadIdx.x == 0) {
+                    global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block);
+                }
+            }
+
+            current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];

            while (true) {
                sdata[threadIdx.x] = index_type(current_idx != llist_guard);
Expand All @@ -745,30 +781,25 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
                if (sdata[gpu_block_size_m1] == 0) {
                    break;
                }
-                __syncthreads();
                if (threadIdx.x == gpu_block_size_m1) {
-                    x = sdata[gpu_block_size_m1];
-                    sdata[gpu_block_size_m1] = Gpu::Atomic::Add(pglobal_idx, x);
-                }
-                __syncthreads();
-                if (threadIdx.x < gpu_block_size_m1) {
-                    sdata[threadIdx.x] += sdata[gpu_block_size_m1];
-                }
-                __syncthreads();
-                if (threadIdx.x == gpu_block_size_m1) {
-                    sdata[gpu_block_size_m1] += x;
+                    if constexpr (compressed_layout) {
+                        idx_start = global_idx_start;
+                        global_idx_start += sdata[gpu_block_size_m1];
+                    } else {
+                        idx_start = Gpu::Atomic::Add(pglobal_idx, sdata[gpu_block_size_m1]);
+                    }
                }
                __syncthreads();

+                sdata[threadIdx.x] += idx_start;
                if (current_idx != llist_guard) {
                    pperm[sdata[threadIdx.x] - 1] = current_idx;
                    current_idx = pllist_next[current_idx];
                }
            }
        });
#else
-    amrex::ignore_unused(pperm, pglobal_idx);
-    Abort("Not implemented");
+    amrex::ignore_unused(pperm, pglobal_idx, compressed_layout);
+    Abort("PermutationForDeposition only implemented for CUDA and HIP");
#endif

    Gpu::Device::streamSynchronize();
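Within each sweep of the while (true) loop, the kernel ranks surviving threads with a block-wide inclusive prefix sum over "this thread still holds a particle" flags (the scan loop itself sits in the collapsed part of the diff); a surviving thread then writes its particle to slot idx_start + sdata[threadIdx.x] - 1, so each sweep emits one densely packed run of indices. A host-side sketch of the same sweep on a toy block of 8 flags (standalone C++, not AMReX code):

    #include <cstdio>

    int main ()
    {
        // 1 = this thread's linked list still has a particle this sweep.
        int sdata[8] = {1, 0, 1, 1, 0, 1, 0, 1};

        for (int i = 1; i < 8; i *= 2) {
            int x[8] = {};
            for (int t = i; t < 8; ++t) { x[t] = sdata[t - i]; } // read old values
            for (int t = i; t < 8; ++t) { sdata[t] += x[t]; }    // then add them
        }

        // Prints 1 1 2 3 3 4 4 5: the k-th surviving thread gets rank k.
        for (int t = 0; t < 8; ++t) { std::printf("%d ", sdata[t]); }
        std::printf("\n");
    }

The compressed_layout flag only changes where idx_start comes from: with it, a block reserves its entire output range with a single atomic in the counting pass and carves per-sweep ranges out of it locally; without it, every sweep performs its own Gpu::Atomic::Add on the global counter.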
