Merge branch 'AMReX-Codes:development' into development
ruohai0925 authored May 23, 2024
2 parents c943626 + e6c93bf commit f7f6baf
Showing 6 changed files with 108 additions and 26 deletions.
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -1,5 +1,13 @@
cmake_minimum_required(VERSION 3.18)

+#
+# Setting a cmake_policy to OLD is deprecated by definition and will raise a
+# verbose warning
+#
+if(CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR)
+    set(CMAKE_WARN_DEPRECATED OFF CACHE BOOL "" FORCE)
+endif()

#
# Allow for MSVC Runtime library controls
#
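The guard limits this workaround to builds where AMReX itself is the top-level project: CMAKE_WARN_DEPRECATED is a standard CMake cache variable consulted before deprecation warnings are emitted, so forcing it to OFF silences the warning that setting a policy to OLD would otherwise produce on every configure.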
8 changes: 8 additions & 0 deletions Src/Base/AMReX_OpenMP.H
@@ -17,7 +17,15 @@ namespace amrex::OpenMP {
    void Initialize ();
    void Finalize ();

+#if defined(_WIN32)
+    void** get_lock_impl (int ilock);
+
+    inline omp_lock_t* get_lock (int ilock) {
+        return (omp_lock_t*)(*(get_lock_impl(ilock)));
+    }
+#else
    omp_lock_t* get_lock (int ilock);
+#endif
}

#else // AMREX_USE_OMP
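Both branches give callers the same interface: get_lock returns an omp_lock_t* suitable for the standard OpenMP lock routines, and the Windows build merely keeps each lock behind an opaque void* (get_lock_impl) that the inline wrapper casts back. A minimal usage sketch, assuming an OpenMP-enabled build (update_slot is a hypothetical caller; the key may be any int):

    #include <omp.h>
    #include <AMReX_OpenMP.H>

    // Hypothetical caller: serialize updates to one of a set of shared slots.
    void update_slot (int islot)
    {
        omp_lock_t* lock = amrex::OpenMP::get_lock(islot); // key is hashed into the pool
        omp_set_lock(lock);
        // ... critical section for slot islot ...
        omp_unset_lock(lock);
    }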
24 changes: 24 additions & 0 deletions Src/Base/AMReX_OpenMP.cpp
@@ -137,7 +137,11 @@ namespace amrex::OpenMP
{
    namespace {
        constexpr int nlocks = 128;
+#if defined(_WIN32)
+        void* omp_locks[nlocks];
+#else
        omp_lock_t omp_locks[nlocks];
+#endif
        unsigned int initialized = 0;
    }

@@ -183,9 +187,17 @@ namespace amrex::OpenMP
            }
        }

+#if defined(_WIN32)
+        for (auto& vp : omp_locks) {
+            auto* p = new omp_lock_t;
+            omp_init_lock(p);
+            vp = (void*) p;
+        }
+#else
        for (auto& lck : omp_locks) {
            omp_init_lock(&lck);
        }
+#endif

        ++initialized;
    }
@@ -195,14 +207,26 @@ namespace amrex::OpenMP
        if (initialized) {
            --initialized;
            if (initialized == 0) {
+#if defined(_WIN32)
+                for (auto vp : omp_locks) {
+                    auto* p = (omp_lock_t*)vp;
+                    omp_destroy_lock(p);
+                    delete p;
+                }
+#else
                for (auto& lck : omp_locks) {
                    omp_destroy_lock(&lck);
                }
+#endif
            }
        }
    }

+#if defined(_WIN32)
+    void** get_lock_impl (int ilock)
+#else
    omp_lock_t* get_lock (int ilock)
+#endif
    {
        ilock = ilock % nlocks;
        if (ilock < 0) { ilock += nlocks; }
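The shared body of get_lock/get_lock_impl wraps an arbitrary int key into [0, nlocks); the extra branch is needed because the C++ % operator truncates toward zero, so a negative key yields a negative remainder. The same arithmetic as a standalone sketch (wrap_lock_index is a hypothetical name):

    // (-1) % 128 == -1 in C++, hence the fix-up to land in [0, nlocks).
    int wrap_lock_index (int ilock, int nlocks = 128)
    {
        ilock = ilock % nlocks;              // now in (-nlocks, nlocks)
        if (ilock < 0) { ilock += nlocks; }  // now in [0, nlocks)
        return ilock;
    }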
2 changes: 1 addition & 1 deletion Src/Particle/AMReX_ParticleInit.H
@@ -873,7 +873,7 @@ InitFromBinaryFile (const std::string& file,
        auto& pmap = m_particles[lev];
        auto& tmp_pmap = tmp_particles[lev];

-        for (auto kv : pmap) {
+        for (auto& kv : pmap) {
            auto& aos = kv.second.GetArrayOfStructs()();
            auto& tmp_aos = tmp_pmap[kv.first].GetArrayOfStructs()();

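The one-character fix avoids a deep copy per iteration: for (auto kv : pmap) deduces a value type, so each pass copies the key and the entire particle container it maps to before GetArrayOfStructs() is called on the copy; for (auto& kv : pmap) binds to the stored element. A self-contained illustration, with std::map and std::vector standing in for the particle types:

    #include <map>
    #include <vector>

    int main ()
    {
        std::map<int, std::vector<double>> pmap{{0, {1.0, 2.0}}, {1, {3.0}}};

        for (auto kv : pmap) {        // copies each {key, value} element
            kv.second.push_back(0.0); // mutates the copy; pmap is unchanged
        }

        for (auto& kv : pmap) {       // binds to the element inside the map
            kv.second.push_back(0.0); // mutates pmap itself
        }
    }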
23 changes: 17 additions & 6 deletions Src/Particle/AMReX_ParticleLocator.H
@@ -35,13 +35,13 @@ struct AssignGrid
        m_plo(a_geom.ProbLoArray()), m_dxi(a_geom.InvCellSizeArray())
    {
        // clamp bin size and num_bins to 1 for AMREX_SPACEDIM < 3
-        m_bin_size.x = amrex::max(m_bin_size.x, 1);
-        m_bin_size.y = amrex::max(m_bin_size.y, 1);
-        m_bin_size.z = amrex::max(m_bin_size.z, 1);
+        if (m_bin_size.x >= 0) {m_bin_size.x = amrex::max(m_bin_size.x, 1);}
+        if (m_bin_size.y >= 0) {m_bin_size.y = amrex::max(m_bin_size.y, 1);}
+        if (m_bin_size.z >= 0) {m_bin_size.z = amrex::max(m_bin_size.z, 1);}

-        m_num_bins.x = amrex::max(m_num_bins.x, 1);
-        m_num_bins.y = amrex::max(m_num_bins.y, 1);
-        m_num_bins.z = amrex::max(m_num_bins.z, 1);
+        if (m_bin_size.x >= 0) {m_num_bins.x = amrex::max(m_num_bins.x, 1);}
+        if (m_bin_size.y >= 0) {m_num_bins.y = amrex::max(m_num_bins.y, 1);}
+        if (m_bin_size.z >= 0) {m_num_bins.z = amrex::max(m_num_bins.z, 1);}
    }

    template <typename P, typename Assignor = DefaultAssignor>
@@ -55,6 +55,9 @@ struct AssignGrid
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    int operator() (const IntVect& iv, int nGrow=0) const noexcept
    {
+        if (AMREX_D_TERM((m_num_bins.x == 0), && (m_num_bins.y == 0), && (m_num_bins.z == 0))) {
+            return -1;
+        }
        const auto lo = iv.dim3();
        int ix_lo = amrex::max((lo.x - nGrow - m_lo.x) / m_bin_size.x - 1, 0);
        int iy_lo = amrex::max((lo.y - nGrow - m_lo.y) / m_bin_size.y - 1, 0);
@@ -117,6 +120,14 @@ public:
        m_device_boxes.resize(num_boxes);
        Gpu::copyAsync(Gpu::hostToDevice, m_host_boxes.begin(), m_host_boxes.end(), m_device_boxes.begin());

+        if (num_boxes == 0) {
+            m_bins_lo = IntVect(AMREX_D_DECL( 0, 0, 0));
+            m_bins_hi = IntVect(AMREX_D_DECL(-1, -1, -1));
+            m_bin_size = IntVect(AMREX_D_DECL(-1, -1, -1));
+            m_num_bins = IntVect(AMREX_D_DECL( 0, 0, 0));
+            return;
+        }

        // compute the lo, hi and the max box size in each direction
        ReduceOps<AMREX_D_DECL(ReduceOpMin, ReduceOpMin, ReduceOpMin),
                  AMREX_D_DECL(ReduceOpMax, ReduceOpMax, ReduceOpMax),
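Together these changes make an empty locator representable: building from zero boxes now stores m_num_bins = 0 and the sentinel bin size -1 (which the guarded clamps in the AssignGrid constructor leave untouched), and every query through operator() short-circuits to -1, the conventional "no grid contains this cell" answer, instead of searching with meaningless bin geometry. A deliberately tiny sketch of the same convention (TinyLocator is hypothetical, not an AMReX class):

    // Hypothetical, simplified analogue of the empty-locator guard above.
    struct TinyLocator
    {
        int num_bins = 0; // 0 when the locator was built from zero boxes

        int operator() (int /*cell*/) const
        {
            if (num_bins == 0) { return -1; } // empty: nothing can match
            // ... otherwise search the candidate bins for a grid index ...
            return 0;
        }
    };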
69 changes: 50 additions & 19 deletions Src/Particle/AMReX_ParticleUtil.H
@@ -697,9 +697,20 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
{
    BL_PROFILE("PermutationForDeposition()");

-    constexpr index_type gpu_block_size = 1024;
-    constexpr index_type gpu_block_size_m1 = gpu_block_size - 1;
-    constexpr index_type llist_guard = std::numeric_limits<index_type>::max();
+#if defined(AMREX_USE_HIP)
+    // MI250X has a small L2 cache and is more tolerant of atomic add contention,
+    // so we use a small block size of 64 and the compressed layout.
+    static constexpr index_type gpu_block_size = 64;
+    static constexpr bool compressed_layout = true;
+#else
+    // A100 has a larger L2 cache and is very sensitive to atomic add contention,
+    // so we use a large block size of 1024 and not the compressed layout.
+    static constexpr index_type gpu_block_size = 1024;
+    static constexpr bool compressed_layout = false;
+#endif
+
+    static constexpr index_type gpu_block_size_m1 = gpu_block_size - 1;
+    static constexpr index_type llist_guard = std::numeric_limits<index_type>::max();

    // round up to gpu_block_size
    nbins = (nbins + gpu_block_size_m1) / gpu_block_size * gpu_block_size;
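Because compressed_layout is constexpr, the if constexpr branches in the kernel below are resolved at compile time and each backend compiles exactly one strategy. The round-up is the usual integer idiom: with gpu_block_size = 1024, nbins = 1000 becomes (1000 + 1023)/1024*1024 = 1024, nbins = 1024 stays 1024, and nbins = 1025 becomes 2048, so the subsequent launch of nbins / gpu_block_size blocks covers every bin with whole blocks.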
@@ -722,9 +733,34 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n

#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    amrex::launch<gpu_block_size>(nbins / gpu_block_size, Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE () {
+        [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE () {
            __shared__ index_type sdata[gpu_block_size];
-            index_type current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+            __shared__ index_type global_idx_start;
+            __shared__ index_type idx_start;
+
+            index_type current_idx = 0;
+
+            if constexpr (compressed_layout) {
+                // Compressed layout: subsequent sweeps of up to gpu_block_size contiguous particles
+                // are put right next to each other, while without the compressed layout,
+                // there can be other particle sweeps from different locations between them.
+                current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+
+                index_type num_particles_thread = 0;
+                while (current_idx != llist_guard) {
+                    ++num_particles_thread;
+                    current_idx = pllist_next[current_idx];
+                }
+
+                index_type num_particles_block =
+                    Gpu::blockReduceSum<gpu_block_size>(num_particles_thread);
+
+                if (threadIdx.x == 0) {
+                    global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block);
+                }
+            }
+
+            current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];

            while (true) {
                sdata[threadIdx.x] = index_type(current_idx != llist_guard);
Expand All @@ -745,30 +781,25 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
                if (sdata[gpu_block_size_m1] == 0) {
                    break;
                }
-                __syncthreads();
                if (threadIdx.x == gpu_block_size_m1) {
-                    x = sdata[gpu_block_size_m1];
-                    sdata[gpu_block_size_m1] = Gpu::Atomic::Add(pglobal_idx, x);
-                }
-                __syncthreads();
-                if (threadIdx.x < gpu_block_size_m1) {
-                    sdata[threadIdx.x] += sdata[gpu_block_size_m1];
-                }
-                __syncthreads();
-                if (threadIdx.x == gpu_block_size_m1) {
-                    sdata[gpu_block_size_m1] += x;
+                    if constexpr (compressed_layout) {
+                        idx_start = global_idx_start;
+                        global_idx_start += sdata[gpu_block_size_m1];
+                    } else {
+                        idx_start = Gpu::Atomic::Add(pglobal_idx, sdata[gpu_block_size_m1]);
+                    }
                }
                __syncthreads();

+                sdata[threadIdx.x] += idx_start;
                if (current_idx != llist_guard) {
                    pperm[sdata[threadIdx.x] - 1] = current_idx;
                    current_idx = pllist_next[current_idx];
                }
            }
        });
#else
-    amrex::ignore_unused(pperm, pglobal_idx);
-    Abort("Not implemented");
+    amrex::ignore_unused(pperm, pglobal_idx, compressed_layout);
+    Abort("PermutationForDeposition only implemented for CUDA and HIP");
#endif

    Gpu::Device::streamSynchronize();
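Within each sweep of the while (true) loop, the kernel ranks surviving threads with a block-wide inclusive prefix sum over "this thread still holds a particle" flags (the scan loop itself sits in the collapsed part of the diff); a surviving thread then writes its particle to slot idx_start + sdata[threadIdx.x] - 1, so each sweep emits one densely packed run of indices. A host-side sketch of the same sweep on a toy block of 8 flags (standalone C++, not AMReX code):

    #include <cstdio>

    int main ()
    {
        // 1 = this thread's linked list still has a particle this sweep.
        int sdata[8] = {1, 0, 1, 1, 0, 1, 0, 1};

        for (int i = 1; i < 8; i *= 2) {
            int x[8] = {};
            for (int t = i; t < 8; ++t) { x[t] = sdata[t - i]; } // read old values
            for (int t = i; t < 8; ++t) { sdata[t] += x[t]; }    // then add them
        }

        // Prints 1 1 2 3 3 4 4 5: the k-th surviving thread gets rank k.
        for (int t = 0; t < 8; ++t) { std::printf("%d ", sdata[t]); }
        std::printf("\n");
    }

The compressed_layout flag only changes where idx_start comes from: with it, a block reserves its entire output range with a single atomic in the counting pass and carves per-sweep ranges out of it locally; without it, every sweep performs its own Gpu::Atomic::Add on the global counter.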
