Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a variable sized SoA macro definition. #4

Open
wants to merge 5 commits into
base: clue_porting_to_cmssw
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
297 changes: 297 additions & 0 deletions CUDADataFormats/Common/interface/SoAmacros.h

Large diffs are not rendered by default.

16 changes: 0 additions & 16 deletions CUDADataFormats/HGCal/interface/ConstHGCCLUESoA.h

This file was deleted.

35 changes: 6 additions & 29 deletions CUDADataFormats/HGCal/interface/HGCCLUECPUProduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,14 @@
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

#include "CUDADataFormats/HGCal/interface/HGCCLUESoA.h"
#include "CUDADataFormats/HGCal/interface/ConstHGCCLUESoA.h"
#include "CUDADataFormats/HGCal/interface/HGCUncalibRecHitSoA.h"

class HGCCLUECPUProduct {
public:
HGCCLUECPUProduct() = default;
explicit HGCCLUECPUProduct(uint32_t nhits, const cudaStream_t &stream) : nhits_(nhits) {
size_tot_ = std::accumulate(sizes_.begin(), sizes_.end(), 0);
pad_ = ((nhits - 1) / 32 + 1) * 32; //align to warp boundary (assumption: warpSize = 32)
mMemCLUEHost = cms::cuda::make_host_unique<std::byte[]>(pad_ * size_tot_, stream);
mMemCLUEHost = cms::cuda::make_host_unique<std::byte[]>(
HGCCLUESoADescriptor::computeDataSize(nhits), stream);
}
~HGCCLUECPUProduct() = default;

Expand All @@ -25,44 +23,23 @@ class HGCCLUECPUProduct {
HGCCLUECPUProduct &operator=(HGCCLUECPUProduct &&) = default;

HGCCLUESoA get() {
HGCCLUESoA soa;
soa.rho = reinterpret_cast<float *>(mMemCLUEHost.get());
soa.delta = soa.rho + pad_;
soa.nearestHigher = reinterpret_cast<int32_t *>(soa.delta + pad_);
soa.clusterIndex = soa.nearestHigher + pad_;
soa.isSeed = reinterpret_cast<bool *>(soa.clusterIndex + pad_);
soa.nbytes = size_tot_;
HGCCLUESoA soa(mMemCLUEHost.get(), nhits_);
soa.nhits = nhits_;
soa.pad = pad_;
return soa;
}

ConstHGCCLUESoA get() const {
ConstHGCCLUESoA soa;
soa.rho = reinterpret_cast<float const*>(mMemCLUEHost.get());
soa.delta = soa.rho + pad_;
soa.nearestHigher = reinterpret_cast<int32_t const*>(soa.delta + pad_);
soa.clusterIndex = soa.nearestHigher + pad_;
soa.isSeed = reinterpret_cast<bool const*>(soa.clusterIndex + pad_);
const HGCCLUESoA get() const {
HGCCLUESoA soa(mMemCLUEHost.get(), nhits_);
soa.nhits = nhits_;
return soa;
}

//number of hits stored in the SoA
uint32_t nHits() const { return nhits_; }
//pad of memory block (used for warp alignment, slightly larger than 'nhits_')
uint32_t pad() const { return pad_; }
//number of bytes of the SoA
uint32_t nBytes() const { return size_tot_; }

private:
cms::cuda::host::unique_ptr<std::byte[]> mMemCLUEHost;
static constexpr std::array<uint32_t, memory::npointers::ntypes_hgcclue_soa> sizes_ = {
{memory::npointers::float_hgcclue_soa * sizeof(float),
memory::npointers::int32_hgcclue_soa * sizeof(uint32_t),
memory::npointers::bool_hgcclue_soa * sizeof(bool)}};
uint32_t pad_;
uint32_t nhits_;
uint32_t size_tot_;
};

#endif //CUDADAtaFormats_HGCal_HGCCLUECPUProduct_H
36 changes: 7 additions & 29 deletions CUDADataFormats/HGCal/interface/HGCCLUEGPUProduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,15 @@
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

#include "CUDADataFormats/HGCal/interface/HGCCLUESoA.h"
#include "CUDADataFormats/HGCal/interface/ConstHGCCLUESoA.h"
#include "CUDADataFormats/HGCal/interface/HGCUncalibRecHitSoA.h"

class HGCCLUEGPUProduct {
public:
HGCCLUEGPUProduct() = default;
explicit HGCCLUEGPUProduct(uint32_t nhits, const cudaStream_t &stream) : nhits_(nhits) {
size_tot_ = std::accumulate(sizes_.begin(), sizes_.end(), 0);
pad_ = ((nhits - 1) / 32 + 1) * 32; //align to warp boundary (assumption: warpSize = 32)
mMemCLUEDev = cms::cuda::make_device_unique<std::byte[]>(pad_ * size_tot_, stream);
/* CUDA allocations are already aligned */
mMemCLUEDev = cms::cuda::make_device_unique<std::byte[]>(
Comment on lines +15 to +16
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we mean different things here (I have to admit my comment in the code was misleading). I was not trying to align the overall memory block. Instead, I was making sure that the total size of the allocated memory is a multiple of the warpSize (32), and further that each variable within the SoA is also aligned, so that no warp has to allocate unnecessary memory blocks (that is why I later use pad_ when defining the layout of the SoA). If I did not do this, the only variable of the SoA guaranteed to be aligned would be the first.

HGCCLUESoADescriptor::computeDataSize(nhits), stream);
}
~HGCCLUEGPUProduct() = default;

Expand All @@ -25,44 +24,23 @@ class HGCCLUEGPUProduct {
HGCCLUEGPUProduct &operator=(HGCCLUEGPUProduct &&) = default;

HGCCLUESoA get() {
HGCCLUESoA soa;
soa.rho = reinterpret_cast<float *>(mMemCLUEDev.get());
soa.delta = soa.rho + pad_;
soa.nearestHigher = reinterpret_cast<int32_t *>(soa.delta + pad_);
soa.clusterIndex = soa.nearestHigher + pad_;
soa.isSeed = reinterpret_cast<bool *>(soa.clusterIndex + pad_);
soa.nbytes = size_tot_;
HGCCLUESoA soa(mMemCLUEDev.get(), nhits_);
soa.nhits = nhits_;
soa.pad = pad_;
return soa;
}

ConstHGCCLUESoA get() const {
ConstHGCCLUESoA soa;
soa.rho = reinterpret_cast<float const*>(mMemCLUEDev.get());
soa.delta = soa.rho + pad_;
soa.nearestHigher = reinterpret_cast<int32_t const*>(soa.delta + pad_);
soa.clusterIndex = soa.nearestHigher + pad_;
soa.isSeed = reinterpret_cast<bool const*>(soa.clusterIndex + pad_);
const HGCCLUESoA get() const {
HGCCLUESoA soa(mMemCLUEDev.get(), nhits_);
soa.nhits = nhits_;
return soa;
}

//number of hits stored in the SoA
uint32_t nHits() const { return nhits_; }
//pad of memory block (used for warp alignment, slightly larger than 'nhits_')
uint32_t pad() const { return pad_; }
//number of bytes of the SoA
uint32_t nBytes() const { return size_tot_; }

private:
cms::cuda::device::unique_ptr<std::byte[]> mMemCLUEDev;
static constexpr std::array<uint32_t, memory::npointers::ntypes_hgcclue_soa> sizes_ = {
{memory::npointers::float_hgcclue_soa * sizeof(float),
memory::npointers::int32_hgcclue_soa * sizeof(uint32_t),
memory::npointers::bool_hgcclue_soa * sizeof(bool)}};
uint32_t pad_;
uint32_t nhits_;
uint32_t size_tot_;
};

#endif //CUDADAtaFormats_HGCal_HGCCLUEGPUProduct_H
34 changes: 14 additions & 20 deletions CUDADataFormats/HGCal/interface/HGCCLUESoA.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,26 @@
#define CUDADataFormats_HGCal_HGCCLUESoA_h

#include <cstdint>
#include "CUDADataFormats/Common/interface/SoAmacros.h"

class HGCCLUESoA {
declare_SoA_template(HGCCLUESoADescriptor,
SoA_column(float, rho), /* energy density of the calibrated rechit */
SoA_column(float, delta), /* closest distance to a rechit with a higher density */
SoA_column(int32_t, nearestHigher), /* index of the nearest rechit with a higher density */
SoA_column(int32_t, clusterIndex), /* cluster index the rechit belongs to */
SoA_column(bool, isSeed) /* is the rechit a cluster seed? */
/* Note: isSeed is of type int in the CPU version due to std::vector optimizations */
);


class HGCCLUESoA: public HGCCLUESoADescriptor {
public:
float *rho; //energy density of the calibrated rechit
float *delta; //closest distance to a rechit with a higher density
int32_t *nearestHigher; //index of the nearest rechit with a higher density
int32_t *clusterIndex; //cluster index the rechit belongs to
bool *isSeed; // is the rechit a cluster seed?
//Note: isSeed is of type int in the CPU version due to std::vector optimizations
HGCCLUESoA(std::byte* mem, size_t nElements):
HGCCLUESoADescriptor(mem, nElements) {}

uint32_t nbytes; //number of bytes of the SoA
uint32_t nhits; //number of hits stored in the SoA
uint32_t pad; //pad of memory block (used for warp alignment, slightly larger than 'nhits_')
};

namespace memory {
namespace npointers {
//number of float pointers in the rechits SoA
constexpr unsigned float_hgcclue_soa = 2;
//number of int32 pointers in the rechits SoA
constexpr unsigned int32_hgcclue_soa = 2;
//number of bool pointers in the rechits SoA
constexpr unsigned bool_hgcclue_soa = 1;
//number of different pointer types in the rechits SoA
constexpr unsigned ntypes_hgcclue_soa = 3;
} // namespace npointers
} // namespace memory

#endif //CUDADataFormats_HGCal_HGCCLUESoA_h
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ void kernel_calculate_density( LayerTilesGPU *hist,
} // end of loop over bins in search box
}

out.rho[i] = (float)rhoi;
out[i].rho = (float)rhoi;
}
} //kernel

Expand Down Expand Up @@ -133,7 +133,7 @@ void kernel_calculate_distanceToHigher(LayerTilesGPU* hist,
int layeri = in.layer[i];
float xi = in.x[i];
float yi = in.y[i];
float rhoi = out.rho[i];
float rhoi = out[i].rho;

// get search box
int4 search_box = hist[layeri].searchBox(xi-dm, xi+dm, yi-dm, yi+dm);
Expand All @@ -155,9 +155,9 @@ void kernel_calculate_distanceToHigher(LayerTilesGPU* hist,
float xj = in.x[j];
float yj = in.y[j];
float dist_ij = std::sqrt((xi-xj)*(xi-xj) + (yi-yj)*(yi-yj));
bool foundHigher = (out.rho[j] > rhoi);
bool foundHigher = (out[j].rho > rhoi);
// in the rare case where rho is the same, use detid
foundHigher = foundHigher || ( (out.rho[j] == rhoi) && (j>i));
foundHigher = foundHigher || ( (out[j].rho == rhoi) && (j>i));
if(foundHigher && dist_ij <= dm) { // definition of N'_{dm}(i)
// find the nearest point within N'_{dm}(i)
if (dist_ij<deltai) {
Expand All @@ -173,8 +173,8 @@ void kernel_calculate_distanceToHigher(LayerTilesGPU* hist,

}

out.delta[i] = deltai;
out.nearestHigher[i] = nearestHigheri;
out[i].delta = deltai;
out[i].nearestHigher = nearestHigheri;
}
} //kernel

Expand All @@ -194,23 +194,23 @@ void kernel_find_clusters( cms::cuda::VecArray<int,clue_gpu::maxNSeeds>* d_seeds

if (i < numberOfPoints and is_energy_valid(in.energy[i])) {
// initialize clusterIndex
out.clusterIndex[i] = -1;
out[i].clusterIndex = -1;
// determine seed or outlier
float deltai = out.delta[i];
float rhoi = out.rho[i];
float deltai = out[i].delta;
float rhoi = out[i].rho;
float rhoc = kappa * in.sigmaNoise[i];
bool isSeed = (deltai > dc) && (rhoi >= rhoc);
bool isOutlier = (deltai > outlierDeltaFactor * dc) && (rhoi < rhoc);

if (isSeed) {
// set isSeed as 1
out.isSeed[i] = 1;
out[i].isSeed = 1;
d_seeds[0].push_back(i); // head of d_seeds
} else {
if (!isOutlier) {
assert(out.nearestHigher[i] < numberOfPoints);
assert(out[i].nearestHigher < numberOfPoints);
// register as follower of its nearest higher
d_followers[out.nearestHigher[i]].push_back(i);
d_followers[out[i].nearestHigher].push_back(i);
}
}
}
Expand All @@ -233,7 +233,7 @@ void kernel_assign_clusters( const cms::cuda::VecArray<int,clue_gpu::maxNSeeds>*

// assign cluster to seed[idxCls]
int idxThisSeed = seeds[idxCls];
out.clusterIndex[idxThisSeed] = idxCls;
out[idxThisSeed].clusterIndex = idxCls;
// push_back idThisSeed to localStack
localStack[localStackSize] = idxThisSeed;
localStackSize++;
Expand All @@ -242,15 +242,15 @@ void kernel_assign_clusters( const cms::cuda::VecArray<int,clue_gpu::maxNSeeds>*
// get last element of localStack
int idxEndOflocalStack = localStack[localStackSize-1];

int temp_clusterIndex = out.clusterIndex[idxEndOflocalStack];
int temp_clusterIndex = out[idxEndOflocalStack].clusterIndex;
// pop_back last element of localStack
localStack[localStackSize-1] = -1;
localStackSize--;

// loop over followers of last element of localStack
for( int j : d_followers[idxEndOflocalStack]){
// // pass id to follower
out.clusterIndex[j] = temp_clusterIndex;
out[j].clusterIndex = temp_clusterIndex;
// push_back follower to localStack
//localStack[localStackSize] = j;
localStackSize++;
Expand Down