Skip to content

Commit

Permalink
Pure SoA Particle: Separate Array for IdCPU (AMReX-Codes#3585)
Browse files Browse the repository at this point in the history
## Summary

This addresses a regression we see when moving to pure SoA particles:
- slightly slower read/write to Ids when needed, e.g., for sorting
- issues going up to the full 64bit range

## Additional background

Once finished, this will close AMReX-Codes#3569.

## Checklist

The proposed changes:
- [ ] fix a bug or incorrect behavior in AMReX
- [x] add new capabilities to AMReX
- [ ] changes answers in the test suite to more than roundoff level
- [ ] are likely to significantly affect the results of downstream AMReX
users
- [ ] include documentation in the code and/or rst files, if appropriate

---------

Co-authored-by: Andrew Myers <[email protected]>
  • Loading branch information
2 people authored and guj committed Dec 13, 2023
1 parent 7d0575d commit 409b6d9
Show file tree
Hide file tree
Showing 8 changed files with 212 additions and 56 deletions.
77 changes: 58 additions & 19 deletions Src/Particle/AMReX_ParticleContainerI.H
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
if (h_redistribute_int_comp[i]) {++num_int_comm_comps;}
}

if constexpr(!ParticleType::is_soa_particle) {
particle_size = sizeof(ParticleType);
if constexpr (ParticleType::is_soa_particle) {
particle_size = sizeof(uint64_t); // idcpu
} else {
particle_size = 0;
particle_size = sizeof(ParticleType);
}
superparticle_size = particle_size +
num_real_comm_comps*sizeof(ParticleReal) + num_int_comm_comps*sizeof(int);
Expand Down Expand Up @@ -1095,7 +1095,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
const size_t np_total = np + ptile.numNeighborParticles();

if (memEfficientSort) {
if constexpr(!ParticleType::is_soa_particle) {
if constexpr (!ParticleType::is_soa_particle) {
static_assert(sizeof(ParticleType)%4 == 0 && sizeof(uint32_t) == 4);
using tmp_t = std::conditional_t<sizeof(ParticleType)%8 == 0,
uint64_t, uint32_t>;
Expand Down Expand Up @@ -1530,7 +1530,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
unsigned npart = ptile_ptrs[pmap_it]->numParticles();
ParticleLocData pld;

if constexpr(!ParticleType::is_soa_particle){
if constexpr (!ParticleType::is_soa_particle){

if (npart != 0) {
Long last = npart - 1;
Expand Down Expand Up @@ -1647,7 +1647,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
}
}

} else{ // soa particle
} else { // soa particle

auto particle_tile = ptile_ptrs[pmap_it];
if (npart != 0) {
Expand All @@ -1663,6 +1663,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
}

if (p.id() < 0){
soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
for (int comp = 0; comp < NumRealComps(); comp++) {
soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
}
Expand All @@ -1679,6 +1680,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
particlePostLocate(p, pld, lev);

if (p.id() < 0) {
soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
for (int comp = 0; comp < NumRealComps(); comp++) {
soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
}
Expand All @@ -1696,6 +1698,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
// We own it but must shift it to another place.
auto index = std::make_pair(pld.m_grid, pld.m_tile);
AMREX_ASSERT(soa_local[pld.m_lev][index].size() == num_threads);
{
auto& arr = soa_local[pld.m_lev][index][thread_num].GetIdCPUData();
arr.push_back(soa.GetIdCPUData()[pindex]);
}
for (int comp = 0; comp < NumRealComps(); ++comp) {
RealVector& arr = soa_local[pld.m_lev][index][thread_num].GetRealData(comp);
arr.push_back(soa.GetRealData(comp)[pindex]);
Expand All @@ -1715,6 +1721,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
particles_to_send.resize(new_size);

char* dst = &particles_to_send[old_size];
{
std::memcpy(dst, &soa.GetIdCPUData()[pindex], sizeof(uint64_t));
dst += sizeof(uint64_t);
}
int array_comp_start = AMREX_SPACEDIM + NStructReal;
for (int comp = 0; comp < NumRealComps(); comp++) {
if (h_redistribute_real_comp[array_comp_start + comp]) {
Expand All @@ -1733,6 +1743,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
}

if (p.id() < 0){
soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
for (int comp = 0; comp < NumRealComps(); comp++) {
soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
}
Expand All @@ -1747,6 +1758,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
++pindex;
}

{
auto& iddata = soa.GetIdCPUData();
iddata.erase(iddata.begin() + last + 1, iddata.begin() + npart);
}
for (int comp = 0; comp < NumRealComps(); comp++) {
RealVector& rdata = soa.GetRealData(comp);
rdata.erase(rdata.begin() + last + 1, rdata.begin() + npart);
Expand Down Expand Up @@ -1828,6 +1843,12 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
auto& soa = ptile.GetStructOfArrays();
auto& soa_tmp = soa_local[lev][index];
for (int i = 0; i < num_threads; ++i) {
{
auto& arr = soa.GetIdCPUData();
auto& tmp = soa_tmp[i].GetIdCPUData();
arr.insert(arr.end(), tmp.begin(), tmp.end());
tmp.erase(tmp.begin(), tmp.end());
}
for (int comp = 0; comp < NumRealComps(); ++comp) {
RealVector& arr = soa.GetRealData(comp);
RealVector& tmp = soa_tmp[i].GetRealData(comp);
Expand Down Expand Up @@ -2045,20 +2066,16 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,

Particle<NStructReal, NStructInt> p;

if constexpr (!ParticleType::is_soa_particle) {
std::memcpy(&p, pbuf, sizeof(ParticleType));
} else {
if constexpr (ParticleType::is_soa_particle) {
std::memcpy(&p.m_idcpu, pbuf, sizeof(uint64_t));

ParticleReal pos[AMREX_SPACEDIM];
std::memcpy(&pos[0], pbuf, AMREX_SPACEDIM*sizeof(ParticleReal));
std::memcpy(&pos[0], pbuf + sizeof(uint64_t), AMREX_SPACEDIM*sizeof(ParticleReal));
AMREX_D_TERM(p.pos(0) = pos[0];,
p.pos(1) = pos[1];,
p.pos(2) = pos[2]);

int idcpu[2];
std::memcpy(&idcpu[0], pbuf + NumRealComps()*sizeof(ParticleReal), 2*sizeof(int));

p.id() = idcpu[0];
p.cpu() = idcpu[1];
} else {
std::memcpy(&p, pbuf, sizeof(ParticleType));
}

bool success = Where(p, pld, lev_min, lev_max, 0);
Expand Down Expand Up @@ -2097,7 +2114,12 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
rcv_tile[ipart])];
char* pbuf = ((char*) &recvdata[offset]) + j*superparticle_size;

if constexpr(! ParticleType::is_soa_particle) {
if constexpr (ParticleType::is_soa_particle) {
uint64_t idcpudata;
std::memcpy(&idcpudata, pbuf, sizeof(uint64_t));
pbuf += sizeof(uint64_t);
ptile.GetStructOfArrays().GetIdCPUData().push_back(idcpudata);
} else {
ParticleType p;
std::memcpy(&p, pbuf, sizeof(ParticleType));
pbuf += sizeof(ParticleType);
Expand Down Expand Up @@ -2146,6 +2168,10 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
host_int_attribs.reserve(15);
host_int_attribs.resize(finestLevel()+1);

Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
host_idcpu.reserve(15);
host_idcpu.resize(finestLevel()+1);

ipart = 0;
for (int i = 0; i < nrcvs; ++i)
{
Expand All @@ -2159,7 +2185,15 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,

char* pbuf = ((char*) &recvdata[offset]) + j*superparticle_size;

if constexpr(! ParticleType::is_soa_particle) {
host_real_attribs[lev][ind].resize(NumRealComps());
host_int_attribs[lev][ind].resize(NumIntComps());

if constexpr (ParticleType::is_soa_particle) {
uint64_t idcpudata;
std::memcpy(&idcpudata, pbuf, sizeof(uint64_t));
pbuf += sizeof(uint64_t);
host_idcpu[lev][ind].push_back(idcpudata);
} else {
ParticleType p;
std::memcpy(&p, pbuf, sizeof(ParticleType));
pbuf += sizeof(ParticleType);
Expand Down Expand Up @@ -2210,7 +2244,12 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
auto new_size = old_size + src_tile.size();
dst_tile.resize(new_size);

if constexpr(! ParticleType::is_soa_particle) {
if constexpr (ParticleType::is_soa_particle) {
Gpu::copyAsync(Gpu::hostToDevice,
host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
} else {
Gpu::copyAsync(Gpu::hostToDevice,
src_tile.begin(), src_tile.end(),
dst_tile.GetArrayOfStructs().begin() + old_size);
Expand Down
14 changes: 11 additions & 3 deletions Src/Particle/AMReX_ParticleIO.H
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
host_int_attribs.reserve(15);
host_int_attribs.resize(finest_level_in_file+1);

Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
host_idcpu.reserve(15);
host_idcpu.resize(finestLevel()+1);

for (int i = 0; i < cnt; i++) {
// note: for pure SoA particle layouts, we do write the id, cpu and positions as a struct
// for backwards compatibility with readers
Expand Down Expand Up @@ -1021,8 +1025,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
host_real_attribs[pld.m_lev][ind][j].push_back(ptemp.pos(j));
}

host_int_attribs[pld.m_lev][ind][0].push_back(ptemp.id());
host_int_attribs[pld.m_lev][ind][1].push_back(ptemp.cpu());
host_idcpu[pld.m_lev][ind].push_back(ptemp.m_idcpu);

// read all other SoA
// add the real...
Expand All @@ -1032,7 +1035,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
}

// ... and int array data
for (int icomp = 2; icomp < NumIntComps(); icomp++) {
for (int icomp = 0; icomp < NumIntComps(); icomp++) {
host_int_attribs[lev][ind][icomp].push_back(*iptr);
++iptr;
}
Expand Down Expand Up @@ -1061,6 +1064,11 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
{
Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(),
dst_tile.GetArrayOfStructs().begin() + old_size);
} else {
Gpu::copyAsync(Gpu::hostToDevice,
host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
}

for (int i = 0; i < NumRealComps(); ++i) { // NOLINT(readability-misleading-indentation)
Expand Down
28 changes: 24 additions & 4 deletions Src/Particle/AMReX_ParticleInit.H
Original file line number Diff line number Diff line change
Expand Up @@ -1062,6 +1062,10 @@ InitRandom (Long icount,
host_int_attribs.reserve(15);
host_int_attribs.resize(finestLevel()+1);

Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
host_idcpu.reserve(15);
host_idcpu.resize(finestLevel()+1);

for (Long j = 0; j < icount; j++)
{
Particle<0, 0> ptest;
Expand Down Expand Up @@ -1117,8 +1121,9 @@ InitRandom (Long icount,
host_real_attribs[pld.m_lev][ind][i].push_back(pos[j*AMREX_SPACEDIM+i]);
}

host_int_attribs[pld.m_lev][ind][0].push_back(ParticleType::NextID());
host_int_attribs[pld.m_lev][ind][1].push_back(MyProc);
host_idcpu[pld.m_lev][ind].push_back(0);
ParticleIDWrapper(host_idcpu[pld.m_lev][ind].back()) = ParticleType::NextID();
ParticleCPUWrapper(host_idcpu[pld.m_lev][ind].back()) = ParallelDescriptor::MyProc();

host_particles[pld.m_lev][ind];

Expand Down Expand Up @@ -1157,6 +1162,11 @@ InitRandom (Long icount,
{
Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(),
dst_tile.GetArrayOfStructs().begin() + old_size);
} else {
Gpu::copyAsync(Gpu::hostToDevice,
host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
}

for (int i = 0; i < NArrayReal; ++i) { // NOLINT(readability-misleading-indentation)
Expand Down Expand Up @@ -1201,6 +1211,10 @@ InitRandom (Long icount,
host_int_attribs.reserve(15);
host_int_attribs.resize(finestLevel()+1);

Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
host_idcpu.reserve(15);
host_idcpu.resize(finestLevel()+1);

for (Long icnt = 0; icnt < M; icnt++) {
Particle<0, 0> ptest;
for (int i = 0; i < AMREX_SPACEDIM; i++) {
Expand Down Expand Up @@ -1261,8 +1275,9 @@ InitRandom (Long icount,
host_real_attribs[pld.m_lev][ind][i].push_back(ptest.pos(i));
}

host_int_attribs[pld.m_lev][ind][0].push_back(ptest.id());
host_int_attribs[pld.m_lev][ind][1].push_back(ptest.cpu());
host_idcpu[pld.m_lev][ind].push_back(0);
ParticleIDWrapper(host_idcpu[pld.m_lev][ind].back()) = ParticleType::NextID();
ParticleCPUWrapper(host_idcpu[pld.m_lev][ind].back()) = ParallelDescriptor::MyProc();

host_particles[pld.m_lev][ind];

Expand Down Expand Up @@ -1300,6 +1315,11 @@ InitRandom (Long icount,
{
Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(),
dst_tile.GetArrayOfStructs().begin() + old_size);
} else {
Gpu::copyAsync(Gpu::hostToDevice,
host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
}

for (int i = 0; i < NArrayReal; ++i) { // NOLINT(readability-misleading-indentation)
Expand Down
Loading

0 comments on commit 409b6d9

Please sign in to comment.