diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d450157e5..d3d99f6e7c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,13 @@
 cmake_minimum_required(VERSION 3.18)
 
+#
+# Setting a cmake_policy to OLD is deprecated by definition and will raise a
+# verbose warning
+#
+if(CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR)
+  set(CMAKE_WARN_DEPRECATED OFF CACHE BOOL "" FORCE)
+endif()
+
 #
 # Allow for MSVC Runtime library controls
 #
diff --git a/Src/Base/AMReX_OpenMP.H b/Src/Base/AMReX_OpenMP.H
index 15d6854c92..67e180003d 100644
--- a/Src/Base/AMReX_OpenMP.H
+++ b/Src/Base/AMReX_OpenMP.H
@@ -17,7 +17,15 @@ namespace amrex::OpenMP {
     void Initialize ();
     void Finalize ();
 
+#if defined(_WIN32)
+    void** get_lock_impl (int ilock);
+
+    inline omp_lock_t* get_lock (int ilock) {
+        return (omp_lock_t*)(*(get_lock_impl(ilock)));
+    }
+#else
     omp_lock_t* get_lock (int ilock);
+#endif
 }
 
 #else // AMREX_USE_OMP
diff --git a/Src/Base/AMReX_OpenMP.cpp b/Src/Base/AMReX_OpenMP.cpp
index fafb00c11c..53782c8c34 100644
--- a/Src/Base/AMReX_OpenMP.cpp
+++ b/Src/Base/AMReX_OpenMP.cpp
@@ -137,7 +137,11 @@ namespace amrex::OpenMP {
 
 namespace {
     constexpr int nlocks = 128;
+#if defined(_WIN32)
+    void* omp_locks[nlocks];
+#else
     omp_lock_t omp_locks[nlocks];
+#endif
     unsigned int initialized = 0;
 }
 
@@ -183,9 +187,17 @@
         }
     }
 
+#if defined(_WIN32)
+    for (auto& vp : omp_locks) {
+        auto* p = new omp_lock_t;
+        omp_init_lock(p);
+        vp = (void*) p;
+    }
+#else
     for (auto& lck : omp_locks) {
         omp_init_lock(&lck);
     }
+#endif
 
     ++initialized;
 }
@@ -195,14 +207,26 @@
     if (initialized) {
         --initialized;
         if (initialized == 0) {
+#if defined(_WIN32)
+            for (auto vp : omp_locks) {
+                auto* p = (omp_lock_t*)vp;
+                omp_destroy_lock(p);
+                delete p;
+            }
+#else
             for (auto& lck : omp_locks) {
                 omp_destroy_lock(&lck);
             }
+#endif
         }
     }
 }
 
+#if defined(_WIN32)
+void** get_lock_impl(int ilock)
+#else
 omp_lock_t* get_lock (int ilock)
+#endif
 {
     ilock = ilock % nlocks;
     if (ilock < 0) { ilock += nlocks; }
diff --git a/Src/Particle/AMReX_ParticleInit.H b/Src/Particle/AMReX_ParticleInit.H
index 75316dec74..9fee3729ab 100644
--- a/Src/Particle/AMReX_ParticleInit.H
+++ b/Src/Particle/AMReX_ParticleInit.H
@@ -873,7 +873,7 @@ InitFromBinaryFile (const std::string& file,
         auto& pmap = m_particles[lev];
         auto& tmp_pmap = tmp_particles[lev];
 
-        for (auto kv : pmap) {
+        for (auto& kv : pmap) {
             auto& aos = kv.second.GetArrayOfStructs()();
             auto& tmp_aos = tmp_pmap[kv.first].GetArrayOfStructs()();
 
diff --git a/Src/Particle/AMReX_ParticleLocator.H b/Src/Particle/AMReX_ParticleLocator.H
index 0a07c85553..9b75f50330 100644
--- a/Src/Particle/AMReX_ParticleLocator.H
+++ b/Src/Particle/AMReX_ParticleLocator.H
@@ -35,13 +35,13 @@ struct AssignGrid
           m_plo(a_geom.ProbLoArray()), m_dxi(a_geom.InvCellSizeArray())
     {
         // clamp bin size and num_bins to 1 for AMREX_SPACEDIM < 3
-        m_bin_size.x = amrex::max(m_bin_size.x, 1);
-        m_bin_size.y = amrex::max(m_bin_size.y, 1);
-        m_bin_size.z = amrex::max(m_bin_size.z, 1);
+        if (m_bin_size.x >= 0) {m_bin_size.x = amrex::max(m_bin_size.x, 1);}
+        if (m_bin_size.y >= 0) {m_bin_size.y = amrex::max(m_bin_size.y, 1);}
+        if (m_bin_size.z >= 0) {m_bin_size.z = amrex::max(m_bin_size.z, 1);}
 
-        m_num_bins.x = amrex::max(m_num_bins.x, 1);
-        m_num_bins.y = amrex::max(m_num_bins.y, 1);
-        m_num_bins.z = amrex::max(m_num_bins.z, 1);
+        if (m_bin_size.x >= 0) {m_num_bins.x = amrex::max(m_num_bins.x, 1);}
+        if (m_bin_size.y >= 0) {m_num_bins.y = amrex::max(m_num_bins.y, 1);}
+        if (m_bin_size.z >= 0) {m_num_bins.z = amrex::max(m_num_bins.z, 1);}
     }
 
     template <typename P>
@@ -55,6 +55,9 @@ struct AssignGrid
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     int operator() (const IntVect& iv, int nGrow=0) const noexcept
     {
+        if (AMREX_D_TERM((m_num_bins.x == 0), && (m_num_bins.y == 0), && (m_num_bins.z == 0))) {
+            return -1;
+        }
         const auto lo = iv.dim3();
         int ix_lo = amrex::max((lo.x - nGrow - m_lo.x) / m_bin_size.x - 1, 0);
         int iy_lo = amrex::max((lo.y - nGrow - m_lo.y) / m_bin_size.y - 1, 0);
@@ -117,6 +120,14 @@ public:
         m_device_boxes.resize(num_boxes);
         Gpu::copyAsync(Gpu::hostToDevice, m_host_boxes.begin(), m_host_boxes.end(), m_device_boxes.begin());
 
+        if (num_boxes == 0) {
+            m_bins_lo  = IntVect(AMREX_D_DECL( 0,  0,  0));
+            m_bins_hi  = IntVect(AMREX_D_DECL(-1, -1, -1));
+            m_bin_size = IntVect(AMREX_D_DECL(-1, -1, -1));
+            m_num_bins = IntVect(AMREX_D_DECL( 0,  0,  0));
+            return;
+        }
+
         // compute the lo, hi and the max box size in each direction
         ReduceOps<…
diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H
--- a/Src/Particle/AMReX_ParticleUtil.H
+++ b/Src/Particle/AMReX_ParticleUtil.H
@@ -…,9 +…,20 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
 {
     BL_PROFILE("PermutationForDeposition()");
 
-    constexpr index_type gpu_block_size = 1024;
-    constexpr index_type gpu_block_size_m1 = gpu_block_size - 1;
-    constexpr index_type llist_guard = std::numeric_limits<index_type>::max();
+#if defined(AMREX_USE_HIP)
+    // MI250X has a small L2 cache and is more tolerant of atomic add contention,
+    // so we use a small block size of 64 and the compressed layout.
+    static constexpr index_type gpu_block_size = 64;
+    static constexpr bool compressed_layout = true;
+#else
+    // A100 has a larger L2 cache and is very sensitive to atomic add contention,
+    // so we use a large block size of 1024 and not the compressed layout.
+    static constexpr index_type gpu_block_size = 1024;
+    static constexpr bool compressed_layout = false;
+#endif
+
+    static constexpr index_type gpu_block_size_m1 = gpu_block_size - 1;
+    static constexpr index_type llist_guard = std::numeric_limits<index_type>::max();
 
     // round up to gpu_block_size
     nbins = (nbins + gpu_block_size_m1) / gpu_block_size * gpu_block_size;
@@ -722,9 +733,34 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
 
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
     amrex::launch<gpu_block_size>(nbins / gpu_block_size, Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE () {
+        [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE () {
             __shared__ index_type sdata[gpu_block_size];
-            index_type current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+            __shared__ index_type global_idx_start;
+            __shared__ index_type idx_start;
+
+            index_type current_idx = 0;
+
+            if constexpr (compressed_layout) {
+                // Compressed layout: subsequent sweeps of up to gpu_block_size contiguous particles
+                // are put right next to each other, while without the compressed layout,
+                // there can be other particle sweeps from different locations between them.
+                current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+
+                index_type num_particles_thread = 0;
+                while (current_idx != llist_guard) {
+                    ++num_particles_thread;
+                    current_idx = pllist_next[current_idx];
+                }
+
+                index_type num_particles_block =
+                    Gpu::blockReduceSum(num_particles_thread);
+
+                if (threadIdx.x == 0) {
+                    global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block);
+                }
+            }
+
+            current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
 
             while (true) {
                 sdata[threadIdx.x] = index_type(current_idx != llist_guard);
@@ -745,21 +781,16 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
                 if (sdata[gpu_block_size_m1] == 0) {
                     break;
                 }
-                __syncthreads();
-                if (threadIdx.x == gpu_block_size_m1) {
-                    x = sdata[gpu_block_size_m1];
-                    sdata[gpu_block_size_m1] = Gpu::Atomic::Add(pglobal_idx, x);
-                }
-                __syncthreads();
-                if (threadIdx.x < gpu_block_size_m1) {
-                    sdata[threadIdx.x] += sdata[gpu_block_size_m1];
-                }
-                __syncthreads();
                 if (threadIdx.x == gpu_block_size_m1) {
-                    sdata[gpu_block_size_m1] += x;
+                    if constexpr (compressed_layout) {
+                        idx_start = global_idx_start;
+                        global_idx_start += sdata[gpu_block_size_m1];
+                    } else {
+                        idx_start = Gpu::Atomic::Add(pglobal_idx, sdata[gpu_block_size_m1]);
+                    }
                 }
                 __syncthreads();
-
+                sdata[threadIdx.x] += idx_start;
                 if (current_idx != llist_guard) {
                     pperm[sdata[threadIdx.x] - 1] = current_idx;
                     current_idx = pllist_next[current_idx];
@@ -767,8 +798,8 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
             }
         });
 #else
-    amrex::ignore_unused(pperm, pglobal_idx);
-    Abort("Not implemented");
+    amrex::ignore_unused(pperm, pglobal_idx, compressed_layout);
+    Abort("PermutationForDeposition only implemented for CUDA and HIP");
 #endif
 
     Gpu::Device::streamSynchronize();
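
For context, a minimal sketch of how the lock API touched above is used from caller code, assuming a build with `AMREX_USE_OMP`; the function `guarded_update` and the guarded work are hypothetical. Callers keep receiving an `omp_lock_t*` from `amrex::OpenMP::get_lock()` on every platform; on `_WIN32` the inline wrapper dereferences the `void*` slot that `Initialize()` fills with a heap-allocated, initialized lock, so no call sites need to change.

```cpp
#include <AMReX_OpenMP.H>
#include <omp.h>

// Hypothetical example: serialize updates to a resource shared between OpenMP
// threads, keyed by a bin index. get_lock() wraps the index modulo nlocks
// internally, so any non-negative or negative int is acceptable here.
void guarded_update (int ibin)
{
    omp_lock_t* lock = amrex::OpenMP::get_lock(ibin);
    omp_set_lock(lock);
    // ... update the shared data associated with this bin ...
    omp_unset_lock(lock);
}
```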