Pure SoA Particle: Separate Array for IdCPU (AMReX-Codes#3585)

## Summary This addresses a regression we see when moving to pure SoA particles: - slightly slower read/write to Ids when needed, e.g., for sorting - issues going up to the full 64bit range ## Additional background Once finished, this will close AMReX-Codes#3569. ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [x] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Andrew Myers <[email protected]>
guj · Dec 13, 2023 · 409b6d9 · 409b6d9
1 parent 7d0575d
commit 409b6d9
Show file tree

Hide file tree

Showing 8 changed files with 212 additions and 56 deletions.
diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H
@@ -18,10 +18,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  if (h_redistribute_int_comp[i]) {++num_int_comm_comps;}
  }
 
- if constexpr(!ParticleType::is_soa_particle) {
- particle_size = sizeof(ParticleType);
+ if constexpr (ParticleType::is_soa_particle) {
+ particle_size = sizeof(uint64_t); // idcpu
  } else {
- particle_size = 0;
+ particle_size = sizeof(ParticleType);
  }
  superparticle_size = particle_size +
  num_real_comm_comps*sizeof(ParticleReal) + num_int_comm_comps*sizeof(int);
@@ -1095,7 +1095,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  const size_t np_total = np + ptile.numNeighborParticles();
 
  if (memEfficientSort) {
- if constexpr(!ParticleType::is_soa_particle) {
+ if constexpr (!ParticleType::is_soa_particle) {
  static_assert(sizeof(ParticleType)%4 == 0 && sizeof(uint32_t) == 4);
  using tmp_t = std::conditional_t<sizeof(ParticleType)%8 == 0,
  uint64_t, uint32_t>;
@@ -1530,7 +1530,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  unsigned npart = ptile_ptrs[pmap_it]->numParticles();
  ParticleLocData pld;
 
- if constexpr(!ParticleType::is_soa_particle){
+ if constexpr (!ParticleType::is_soa_particle){
 
  if (npart != 0) {
  Long last = npart - 1;
@@ -1647,7 +1647,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  }
  }
 
- } else{ // soa particle
+ } else { // soa particle
 
  auto particle_tile = ptile_ptrs[pmap_it];
  if (npart != 0) {
@@ -1663,6 +1663,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  }
 
  if (p.id() < 0){
+ soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
  for (int comp = 0; comp < NumRealComps(); comp++) {
  soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
  }
@@ -1679,6 +1680,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  particlePostLocate(p, pld, lev);
 
  if (p.id() < 0) {
+ soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
  for (int comp = 0; comp < NumRealComps(); comp++) {
  soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
  }
@@ -1696,6 +1698,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  // We own it but must shift it to another place.
  auto index = std::make_pair(pld.m_grid, pld.m_tile);
  AMREX_ASSERT(soa_local[pld.m_lev][index].size() == num_threads);
+ {
+ auto& arr = soa_local[pld.m_lev][index][thread_num].GetIdCPUData();
+ arr.push_back(soa.GetIdCPUData()[pindex]);
+ }
  for (int comp = 0; comp < NumRealComps(); ++comp) {
  RealVector& arr = soa_local[pld.m_lev][index][thread_num].GetRealData(comp);
  arr.push_back(soa.GetRealData(comp)[pindex]);
@@ -1715,6 +1721,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  particles_to_send.resize(new_size);
 
  char* dst = &particles_to_send[old_size];
+ {
+ std::memcpy(dst, &soa.GetIdCPUData()[pindex], sizeof(uint64_t));
+ dst += sizeof(uint64_t);
+ }
  int array_comp_start = AMREX_SPACEDIM + NStructReal;
  for (int comp = 0; comp < NumRealComps(); comp++) {
  if (h_redistribute_real_comp[array_comp_start + comp]) {
@@ -1733,6 +1743,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  }
 
  if (p.id() < 0){
+ soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
  for (int comp = 0; comp < NumRealComps(); comp++) {
  soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
  }
@@ -1747,6 +1758,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  ++pindex;
  }
 
+ {
+ auto& iddata = soa.GetIdCPUData();
+ iddata.erase(iddata.begin() + last + 1, iddata.begin() + npart);
+ }
  for (int comp = 0; comp < NumRealComps(); comp++) {
  RealVector& rdata = soa.GetRealData(comp);
  rdata.erase(rdata.begin() + last + 1, rdata.begin() + npart);
@@ -1828,6 +1843,12 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  auto& soa = ptile.GetStructOfArrays();
  auto& soa_tmp = soa_local[lev][index];
  for (int i = 0; i < num_threads; ++i) {
+ {
+ auto& arr = soa.GetIdCPUData();
+ auto& tmp = soa_tmp[i].GetIdCPUData();
+ arr.insert(arr.end(), tmp.begin(), tmp.end());
+ tmp.erase(tmp.begin(), tmp.end());
+ }
  for (int comp = 0; comp < NumRealComps(); ++comp) {
  RealVector& arr = soa.GetRealData(comp);
  RealVector& tmp = soa_tmp[i].GetRealData(comp);
@@ -2045,20 +2066,16 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
 
  Particle<NStructReal, NStructInt> p;
 
- if constexpr (!ParticleType::is_soa_particle) {
- std::memcpy(&p, pbuf, sizeof(ParticleType));
- } else {
+ if constexpr (ParticleType::is_soa_particle) {
+  std::memcpy(&p.m_idcpu, pbuf, sizeof(uint64_t));
+
  ParticleReal pos[AMREX_SPACEDIM];
- std::memcpy(&pos[0], pbuf, AMREX_SPACEDIM*sizeof(ParticleReal));
+ std::memcpy(&pos[0], pbuf + sizeof(uint64_t), AMREX_SPACEDIM*sizeof(ParticleReal));
  AMREX_D_TERM(p.pos(0) = pos[0];,
  p.pos(1) = pos[1];,
  p.pos(2) = pos[2]);
-
- int idcpu[2];
- std::memcpy(&idcpu[0], pbuf + NumRealComps()*sizeof(ParticleReal), 2*sizeof(int));
-
- p.id() = idcpu[0];
- p.cpu() = idcpu[1];
+ } else {
+ std::memcpy(&p, pbuf, sizeof(ParticleType));
  }
 
  bool success = Where(p, pld, lev_min, lev_max, 0);
@@ -2097,7 +2114,12 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
  rcv_tile[ipart])];
  char* pbuf = ((char*) &recvdata[offset]) + j*superparticle_size;
 
- if constexpr(! ParticleType::is_soa_particle) {
+ if constexpr (ParticleType::is_soa_particle) {
+ uint64_t idcpudata;
+ std::memcpy(&idcpudata, pbuf, sizeof(uint64_t));
+ pbuf += sizeof(uint64_t);
+ ptile.GetStructOfArrays().GetIdCPUData().push_back(idcpudata);
+ } else {
  ParticleType p;
  std::memcpy(&p, pbuf, sizeof(ParticleType));
  pbuf += sizeof(ParticleType);
@@ -2146,6 +2168,10 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
  host_int_attribs.reserve(15);
  host_int_attribs.resize(finestLevel()+1);
 
+ Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
+ host_idcpu.reserve(15);
+ host_idcpu.resize(finestLevel()+1);
+
  ipart = 0;
  for (int i = 0; i < nrcvs; ++i)
  {
@@ -2159,7 +2185,15 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
 
  char* pbuf = ((char*) &recvdata[offset]) + j*superparticle_size;
 
- if constexpr(! ParticleType::is_soa_particle) {
+ host_real_attribs[lev][ind].resize(NumRealComps());
+ host_int_attribs[lev][ind].resize(NumIntComps());
+
+ if constexpr (ParticleType::is_soa_particle) {
+ uint64_t idcpudata;
+ std::memcpy(&idcpudata, pbuf, sizeof(uint64_t));
+ pbuf += sizeof(uint64_t);
+ host_idcpu[lev][ind].push_back(idcpudata);
+ } else {
  ParticleType p;
  std::memcpy(&p, pbuf, sizeof(ParticleType));
  pbuf += sizeof(ParticleType);
@@ -2210,7 +2244,12 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
  auto new_size = old_size + src_tile.size();
  dst_tile.resize(new_size);
 
- if constexpr(! ParticleType::is_soa_particle) {
+ if constexpr (ParticleType::is_soa_particle) {
+ Gpu::copyAsync(Gpu::hostToDevice,
+ host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
+ host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
+ dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
+ } else {
  Gpu::copyAsync(Gpu::hostToDevice,
  src_tile.begin(), src_tile.end(),
  dst_tile.GetArrayOfStructs().begin() + old_size);

diff --git a/Src/Particle/AMReX_ParticleIO.H b/Src/Particle/AMReX_ParticleIO.H
@@ -954,6 +954,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  host_int_attribs.reserve(15);
  host_int_attribs.resize(finest_level_in_file+1);
 
+ Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
+ host_idcpu.reserve(15);
+ host_idcpu.resize(finestLevel()+1);
+
  for (int i = 0; i < cnt; i++) {
  // note: for pure SoA particle layouts, we do write the id, cpu and positions as a struct
  // for backwards compatibility with readers
@@ -1021,8 +1025,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  host_real_attribs[pld.m_lev][ind][j].push_back(ptemp.pos(j));
  }
 
- host_int_attribs[pld.m_lev][ind][0].push_back(ptemp.id());
- host_int_attribs[pld.m_lev][ind][1].push_back(ptemp.cpu());
+ host_idcpu[pld.m_lev][ind].push_back(ptemp.m_idcpu);
 
  // read all other SoA
  // add the real...
@@ -1032,7 +1035,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  }
 
  // ... and int array data
- for (int icomp = 2; icomp < NumIntComps(); icomp++) {
+ for (int icomp = 0; icomp < NumIntComps(); icomp++) {
  host_int_attribs[lev][ind][icomp].push_back(*iptr);
  ++iptr;
  }
@@ -1061,6 +1064,11 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
  {
  Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(),
  dst_tile.GetArrayOfStructs().begin() + old_size);
+ } else {
+ Gpu::copyAsync(Gpu::hostToDevice,
+ host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
+ host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
+ dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
  }
 
  for (int i = 0; i < NumRealComps(); ++i) { // NOLINT(readability-misleading-indentation)

diff --git a/Src/Particle/AMReX_ParticleInit.H b/Src/Particle/AMReX_ParticleInit.H
@@ -1062,6 +1062,10 @@ InitRandom (Long icount,
  host_int_attribs.reserve(15);
  host_int_attribs.resize(finestLevel()+1);
 
+ Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
+ host_idcpu.reserve(15);
+ host_idcpu.resize(finestLevel()+1);
+
  for (Long j = 0; j < icount; j++)
  {
  Particle<0, 0> ptest;
@@ -1117,8 +1121,9 @@ InitRandom (Long icount,
  host_real_attribs[pld.m_lev][ind][i].push_back(pos[j*AMREX_SPACEDIM+i]);
  }
 
- host_int_attribs[pld.m_lev][ind][0].push_back(ParticleType::NextID());
- host_int_attribs[pld.m_lev][ind][1].push_back(MyProc);
+ host_idcpu[pld.m_lev][ind].push_back(0);
+ ParticleIDWrapper(host_idcpu[pld.m_lev][ind].back()) = ParticleType::NextID();
+ ParticleCPUWrapper(host_idcpu[pld.m_lev][ind].back()) = ParallelDescriptor::MyProc();
 
  host_particles[pld.m_lev][ind];
 
@@ -1157,6 +1162,11 @@ InitRandom (Long icount,
  {
  Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(),
  dst_tile.GetArrayOfStructs().begin() + old_size);
+ } else {
+ Gpu::copyAsync(Gpu::hostToDevice,
+ host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
+ host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
+ dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
  }
 
  for (int i = 0; i < NArrayReal; ++i) { // NOLINT(readability-misleading-indentation)
@@ -1201,6 +1211,10 @@ InitRandom (Long icount,
  host_int_attribs.reserve(15);
  host_int_attribs.resize(finestLevel()+1);
 
+ Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
+ host_idcpu.reserve(15);
+ host_idcpu.resize(finestLevel()+1);
+
  for (Long icnt = 0; icnt < M; icnt++) {
  Particle<0, 0> ptest;
  for (int i = 0; i < AMREX_SPACEDIM; i++) {
@@ -1261,8 +1275,9 @@ InitRandom (Long icount,
  host_real_attribs[pld.m_lev][ind][i].push_back(ptest.pos(i));
  }
 
- host_int_attribs[pld.m_lev][ind][0].push_back(ptest.id());
- host_int_attribs[pld.m_lev][ind][1].push_back(ptest.cpu());
+ host_idcpu[pld.m_lev][ind].push_back(0);
+ ParticleIDWrapper(host_idcpu[pld.m_lev][ind].back()) = ParticleType::NextID();
+ ParticleCPUWrapper(host_idcpu[pld.m_lev][ind].back()) = ParallelDescriptor::MyProc();
 
  host_particles[pld.m_lev][ind];
 
@@ -1300,6 +1315,11 @@ InitRandom (Long icount,
  {
  Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(),
  dst_tile.GetArrayOfStructs().begin() + old_size);
+ } else {
+ Gpu::copyAsync(Gpu::hostToDevice,
+ host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
+ host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
+ dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
  }
 
  for (int i = 0; i < NArrayReal; ++i) { // NOLINT(readability-misleading-indentation)