From b523c3297d00526ae11a44585106ac454743c6cf Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Tue, 18 Jul 2023 18:50:39 -0700
Subject: [PATCH] Memory Efficient AoS Particle Sorting (#3427)

Because of the size of AoS particle, creating a temporary vector for all
the AoS data during reording might use too much memory. In fact, this is
usually the memory usage bottleneck in WarpX simulations. In this new
approach, we divide the AoS into smaller chunks and reorder them one at
a time, thus significantly reducing the peak memory usage.

Thanks @AlexanderSinn for the suggestion.
---
 Src/Particle/AMReX_ParticleContainerI.H | 50 +++++++++++++++----------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H
index 8f2d10bb7b8..9712a13a415 100644
--- a/Src/Particle/AMReX_ParticleContainerI.H
+++ b/Src/Particle/AMReX_ParticleContainerI.H
@@ -1097,31 +1097,41 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator>
 
     if (memEfficientSort) {
         if constexpr(!ParticleType::is_soa_particle) {
-            ParticleVector tmp_particles(np_total);
-            auto src = ptile.getParticleTileData();
-            ParticleType* dst = tmp_particles.data();
-
-            AMREX_HOST_DEVICE_FOR_1D( np_total, i,
-            {
-                dst[i] = i < np ? src.m_aos[permutations[i]] : src.m_aos[i];
-            });
-
+            static_assert(sizeof(ParticleType)%4 == 0 && sizeof(uint32_t) == 4);
+            using tmp_t = std::conditional_t<sizeof(ParticleType)%8 == 0,
+                                             uint64_t, uint32_t>;
+            constexpr std::size_t nchunks = sizeof(ParticleType) / sizeof(tmp_t);
+            Gpu::DeviceVector<tmp_t> tmp(np);
+            auto* ptmp = tmp.data();
+            auto* paos = (tmp_t*)(ptile.getParticleTileData().m_aos);
+            for (std::size_t ichunk = 0; ichunk < nchunks; ++ichunk) {
+                // Do not need to reorder neighbor particles
+                AMREX_HOST_DEVICE_FOR_1D(np, i,
+                {
+                    ptmp[i] = paos[permutations[i]*nchunks+ichunk];
+                });
+                AMREX_HOST_DEVICE_FOR_1D(np, i,
+                {
+                    paos[i*nchunks+ichunk] = ptmp[i];
+                });
+            }
             Gpu::streamSynchronize();
-            ptile.GetArrayOfStructs()().swap(tmp_particles);
         }
 
-        RealVector tmp_real(np_total);
-        for (int comp = 0; comp < NArrayReal + m_num_runtime_real; ++comp) {
-            auto src = ptile.GetStructOfArrays().GetRealData(comp).data();
-            ParticleReal* dst = tmp_real.data();
-            AMREX_HOST_DEVICE_FOR_1D( np_total, i,
-            {
-                dst[i] = i < np ? src[permutations[i]] : src[i];
-            });
+        { // Create a scope for the temporary vector below
+            RealVector tmp_real(np_total);
+            for (int comp = 0; comp < NArrayReal + m_num_runtime_real; ++comp) {
+                auto src = ptile.GetStructOfArrays().GetRealData(comp).data();
+                ParticleReal* dst = tmp_real.data();
+                AMREX_HOST_DEVICE_FOR_1D( np_total, i,
+                {
+                    dst[i] = i < np ? src[permutations[i]] : src[i];
+                });
 
-            Gpu::streamSynchronize();
+                Gpu::streamSynchronize();
 
-            ptile.GetStructOfArrays().GetRealData(comp).swap(tmp_real);
+                ptile.GetStructOfArrays().GetRealData(comp).swap(tmp_real);
+            }
         }
 
         IntVector tmp_int(np_total);