bfonta · ericcano · Jun 28, 2021 · Jun 29, 2021 · Jun 30, 2021 · Jun 30, 2021
diff --git a/CUDADataFormats/Common/interface/SoAmacros.h b/CUDADataFormats/Common/interface/SoAmacros.h
diff --git a/CUDADataFormats/HGCal/interface/ConstHGCCLUESoA.h b/CUDADataFormats/HGCal/interface/ConstHGCCLUESoA.h
diff --git a/CUDADataFormats/HGCal/interface/HGCCLUECPUProduct.h b/CUDADataFormats/HGCal/interface/HGCCLUECPUProduct.h
@@ -6,16 +6,14 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 #include "CUDADataFormats/HGCal/interface/HGCCLUESoA.h"
-#include "CUDADataFormats/HGCal/interface/ConstHGCCLUESoA.h"
 #include "CUDADataFormats/HGCal/interface/HGCUncalibRecHitSoA.h"
 
 class HGCCLUECPUProduct {
 public:
   HGCCLUECPUProduct() = default;
   explicit HGCCLUECPUProduct(uint32_t nhits, const cudaStream_t &stream) : nhits_(nhits) {
-    size_tot_ = std::accumulate(sizes_.begin(), sizes_.end(), 0);
-    pad_ = ((nhits - 1) / 32 + 1) * 32; //align to warp boundary (assumption: warpSize = 32)
-    mMemCLUEHost = cms::cuda::make_host_unique<std::byte[]>(pad_ * size_tot_, stream);
+    mMemCLUEHost = cms::cuda::make_host_unique<std::byte[]>(
+            HGCCLUESoADescriptor::computeDataSize(nhits), stream);
   }
   ~HGCCLUECPUProduct() = default;
 
@@ -25,44 +23,23 @@ class HGCCLUECPUProduct {
   HGCCLUECPUProduct &operator=(HGCCLUECPUProduct &&) = default;
 
   HGCCLUESoA get() {
-    HGCCLUESoA soa;
-    soa.rho = reinterpret_cast<float *>(mMemCLUEHost.get());
-    soa.delta = soa.rho + pad_;
-    soa.nearestHigher = reinterpret_cast<int32_t *>(soa.delta + pad_);
-    soa.clusterIndex = soa.nearestHigher + pad_;
-    soa.isSeed = reinterpret_cast<bool *>(soa.clusterIndex + pad_);
-    soa.nbytes = size_tot_;
+    HGCCLUESoA soa(mMemCLUEHost.get(), nhits_);
     soa.nhits = nhits_;
-    soa.pad = pad_;
     return soa;
   }
 
-  ConstHGCCLUESoA get() const {
-    ConstHGCCLUESoA soa;
-    soa.rho = reinterpret_cast<float const*>(mMemCLUEHost.get());
-    soa.delta = soa.rho + pad_;
-    soa.nearestHigher = reinterpret_cast<int32_t const*>(soa.delta + pad_);
-    soa.clusterIndex = soa.nearestHigher + pad_;
-    soa.isSeed = reinterpret_cast<bool const*>(soa.clusterIndex + pad_);
+  const HGCCLUESoA get() const {
+    HGCCLUESoA soa(mMemCLUEHost.get(), nhits_);
+    soa.nhits = nhits_;
     return soa;
   }
 
   //number of hits stored in the SoA
   uint32_t nHits() const { return nhits_; }
-  //pad of memory block (used for warp alignment, slighlty larger than 'nhits_')
-  uint32_t pad() const { return pad_; }
-  //number of bytes of the SoA
-  uint32_t nBytes() const { return size_tot_; }
 
 private:
   cms::cuda::host::unique_ptr<std::byte[]> mMemCLUEHost;
-  static constexpr std::array<uint32_t, memory::npointers::ntypes_hgcclue_soa> sizes_ = {
-      {memory::npointers::float_hgcclue_soa * sizeof(float),
-       memory::npointers::int32_hgcclue_soa * sizeof(uint32_t),
-       memory::npointers::bool_hgcclue_soa * sizeof(bool)}};
-  uint32_t pad_;
   uint32_t nhits_;
-  uint32_t size_tot_;
 };
 
 #endif  //CUDADAtaFormats_HGCal_HGCCLUECPUProduct_H
diff --git a/CUDADataFormats/HGCal/interface/HGCCLUEGPUProduct.h b/CUDADataFormats/HGCal/interface/HGCCLUEGPUProduct.h
@@ -6,16 +6,15 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 
 #include "CUDADataFormats/HGCal/interface/HGCCLUESoA.h"
-#include "CUDADataFormats/HGCal/interface/ConstHGCCLUESoA.h"
 #include "CUDADataFormats/HGCal/interface/HGCUncalibRecHitSoA.h"
 
 class HGCCLUEGPUProduct {
 public:
   HGCCLUEGPUProduct() = default;
   explicit HGCCLUEGPUProduct(uint32_t nhits, const cudaStream_t &stream) : nhits_(nhits) {
-    size_tot_ = std::accumulate(sizes_.begin(), sizes_.end(), 0);
-    pad_ = ((nhits - 1) / 32 + 1) * 32; //align to warp boundary (assumption: warpSize = 32)
-    mMemCLUEDev = cms::cuda::make_device_unique<std::byte[]>(pad_ * size_tot_, stream);
+    /* CUDA allocations are already aligned */
+    mMemCLUEDev = cms::cuda::make_device_unique<std::byte[]>(
+        HGCCLUESoADescriptor::computeDataSize(nhits), stream);
   }
   ~HGCCLUEGPUProduct() = default;
 
@@ -25,44 +24,23 @@ class HGCCLUEGPUProduct {
   HGCCLUEGPUProduct &operator=(HGCCLUEGPUProduct &&) = default;
 
   HGCCLUESoA get() {
-    HGCCLUESoA soa;
-    soa.rho = reinterpret_cast<float *>(mMemCLUEDev.get());
-    soa.delta = soa.rho + pad_;
-    soa.nearestHigher = reinterpret_cast<int32_t *>(soa.delta + pad_);
-    soa.clusterIndex = soa.nearestHigher + pad_;
-    soa.isSeed = reinterpret_cast<bool *>(soa.clusterIndex + pad_);
-    soa.nbytes = size_tot_;
+    HGCCLUESoA soa(mMemCLUEDev.get(), nhits_);
     soa.nhits = nhits_;
-    soa.pad = pad_;
     return soa;
   }
 
-  ConstHGCCLUESoA get() const {
-    ConstHGCCLUESoA soa;
-    soa.rho = reinterpret_cast<float const*>(mMemCLUEDev.get());
-    soa.delta = soa.rho + pad_;
-    soa.nearestHigher = reinterpret_cast<int32_t const*>(soa.delta + pad_);
-    soa.clusterIndex = soa.nearestHigher + pad_;
-    soa.isSeed = reinterpret_cast<bool const*>(soa.clusterIndex + pad_);
+  const HGCCLUESoA get() const {
+    HGCCLUESoA soa(mMemCLUEDev.get(), nhits_);
+    soa.nhits = nhits_;
     return soa;
   }
 
   //number of hits stored in the SoA
   uint32_t nHits() const { return nhits_; }
-  //pad of memory block (used for warp alignment, slighlty larger than 'nhits_')
-  uint32_t pad() const { return pad_; }
-  //number of bytes of the SoA
-  uint32_t nBytes() const { return size_tot_; }
 
 private:
   cms::cuda::device::unique_ptr<std::byte[]> mMemCLUEDev;
-  static constexpr std::array<uint32_t, memory::npointers::ntypes_hgcclue_soa> sizes_ = {
-      {memory::npointers::float_hgcclue_soa * sizeof(float),
-       memory::npointers::int32_hgcclue_soa * sizeof(uint32_t),
-       memory::npointers::bool_hgcclue_soa * sizeof(bool)}};
-  uint32_t pad_;
   uint32_t nhits_;
-  uint32_t size_tot_;
 };
 
 #endif  //CUDADAtaFormats_HGCal_HGCCLUEGPUProduct_H
diff --git a/CUDADataFormats/HGCal/interface/HGCCLUESoA.h b/CUDADataFormats/HGCal/interface/HGCCLUESoA.h
@@ -2,32 +2,26 @@
 #define CUDADataFormats_HGCal_HGCCLUESoA_h
 
 #include <cstdint>
+#include "CUDADataFormats/Common/interface/SoAmacros.h"
 
-class HGCCLUESoA {
+declare_SoA_template(HGCCLUESoADescriptor,
+  SoA_column(float, rho),             /* energy density of the calibrated rechit */
+  SoA_column(float, delta),           /* closest distance to a rechit with a higher density */
+  SoA_column(int32_t, nearestHigher), /* index of the nearest rechit with a higher density */
+  SoA_column(int32_t, clusterIndex),  /* cluster index the rechit belongs to */
+  SoA_column(bool, isSeed)            /* is the rechit a cluster seed? */
+  /* Note: isSeed is of type int in the CPU version due to std::vector optimizations */
+);
+
+
+class HGCCLUESoA: public HGCCLUESoADescriptor {
 public:
-  float *rho; //energy density of the calibrated rechit
-  float *delta; //closest distance to a rechit with a higher density
-  int32_t *nearestHigher; //index of the nearest rechit with a higher density
-  int32_t *clusterIndex;  //cluster index the rechit belongs to
-  bool *isSeed; // is the rechit a cluster seed?
-  //Note: isSeed is of type int in the CPU version to to std::vector optimizations
+  HGCCLUESoA(std::byte* mem, size_t nElements):  //forwards the buffer and the element count to the descriptor
+    HGCCLUESoADescriptor(mem, nElements) {}
 
-  uint32_t nbytes;  //number of bytes of the SoA
-  uint32_t nhits;   //number of hits stored in the SoA
-  uint32_t pad;     //pad of memory block (used for warp alignment, slightly larger than 'nhits_')
+  uint32_t nbytes = 0;  //number of bytes of the SoA (defaults to 0: the constructor does not fill it)
+  uint32_t nhits = 0;   //number of hits stored in the SoA (defaults to 0 until the owner assigns it)
+  uint32_t pad = 0;     //pad of memory block (used for warp alignment, slightly larger than 'nhits_'; defaults to 0: the constructor does not fill it)
 };
 
-namespace memory {
-  namespace npointers {
-    //number of float pointers in the rechits SoA
-    constexpr unsigned float_hgcclue_soa = 2;
-    //number of int32 pointers in the rechits SoA
-    constexpr unsigned int32_hgcclue_soa = 2;
-    //number of bool pointers in the rechits SoA
-    constexpr unsigned bool_hgcclue_soa = 1;
-    //number of different pointer types in the rechits SoA
-    constexpr unsigned ntypes_hgcclue_soa = 3;
-  } // namespace npointers
-} // namespace memory
-
 #endif  //CUDADataFormats_HGCal_HGCCLUESoA_h
diff --git a/RecoLocalCalo/HGCalRecProducers/plugins/HGCalCLUEAlgoGPUEMKernelImpl.cu b/RecoLocalCalo/HGCalRecProducers/plugins/HGCalCLUEAlgoGPUEMKernelImpl.cu
@@ -105,7 +105,7 @@ void kernel_calculate_density( LayerTilesGPU *hist,
       } // end of loop over bins in search box
     }
 
-    out.rho[i] = (float)rhoi;
+    out[i].rho = (float)rhoi;
   }
 } //kernel
 
@@ -133,7 +133,7 @@ void kernel_calculate_distanceToHigher(LayerTilesGPU* hist,
       int layeri = in.layer[i];
       float xi = in.x[i];
       float yi = in.y[i];
-      float rhoi = out.rho[i];
+      float rhoi = out[i].rho;
 
       // get search box 
       int4 search_box = hist[layeri].searchBox(xi-dm, xi+dm, yi-dm, yi+dm);
@@ -155,9 +155,9 @@ void kernel_calculate_distanceToHigher(LayerTilesGPU* hist,
 	      float xj = in.x[j];
 	      float yj = in.y[j];
 	      float dist_ij = std::sqrt((xi-xj)*(xi-xj) + (yi-yj)*(yi-yj));
-	      bool foundHigher = (out.rho[j] > rhoi);
+	      bool foundHigher = (out[j].rho > rhoi);
 	      // in the rare case where rho is the same, use detid
-	      foundHigher = foundHigher || ( (out.rho[j] == rhoi) && (j>i));
+	      foundHigher = foundHigher || ( (out[j].rho == rhoi) && (j>i));
 	      if(foundHigher && dist_ij <= dm) { // definition of N'_{dm}(i)
 		// find the nearest point within N'_{dm}(i)
 		if (dist_ij<deltai) {
@@ -173,8 +173,8 @@ void kernel_calculate_distanceToHigher(LayerTilesGPU* hist,
 
     }
 
-    out.delta[i] = deltai;
-    out.nearestHigher[i] = nearestHigheri;
+    out[i].delta = deltai;
+    out[i].nearestHigher = nearestHigheri;
   }
 } //kernel
 
@@ -194,23 +194,23 @@ void kernel_find_clusters( cms::cuda::VecArray<int,clue_gpu::maxNSeeds>* d_seeds
 
   if (i < numberOfPoints and is_energy_valid(in.energy[i])) {
     // initialize clusterIndex
-    out.clusterIndex[i] = -1;
+    out[i].clusterIndex = -1;
     // determine seed or outlier
-    float deltai = out.delta[i];
-    float rhoi = out.rho[i];
+    float deltai = out[i].delta;
+    float rhoi = out[i].rho;
     float rhoc = kappa * in.sigmaNoise[i];
     bool isSeed = (deltai > dc) && (rhoi >= rhoc);
     bool isOutlier = (deltai > outlierDeltaFactor * dc) && (rhoi < rhoc);
 
     if (isSeed) {
       // set isSeed as 1
-      out.isSeed[i] = 1;
+      out[i].isSeed = 1;
       d_seeds[0].push_back(i); // head of d_seeds
     } else {
       if (!isOutlier) {
-        assert(out.nearestHigher[i] < numberOfPoints);
+        assert(out[i].nearestHigher < numberOfPoints);
         // register as follower of its nearest higher
-        d_followers[out.nearestHigher[i]].push_back(i);  
+        d_followers[out[i].nearestHigher].push_back(i);  
       }
     }
   }
@@ -233,7 +233,7 @@ void kernel_assign_clusters( const cms::cuda::VecArray<int,clue_gpu::maxNSeeds>*
 
     // asgine cluster to seed[idxCls]
     int idxThisSeed = seeds[idxCls];
-    out.clusterIndex[idxThisSeed] = idxCls;
+    out[idxThisSeed].clusterIndex = idxCls;
     // push_back idThisSeed to localStack
     localStack[localStackSize] = idxThisSeed;
     localStackSize++;
@@ -242,15 +242,15 @@ void kernel_assign_clusters( const cms::cuda::VecArray<int,clue_gpu::maxNSeeds>*
       // get last element of localStack
       int idxEndOflocalStack = localStack[localStackSize-1];
 
-      int temp_clusterIndex = out.clusterIndex[idxEndOflocalStack];
+      int temp_clusterIndex = out[idxEndOflocalStack].clusterIndex;
       // pop_back last element of localStack
       localStack[localStackSize-1] = -1;
       localStackSize--;
 
       // loop over followers of last element of localStack
       for( int j : d_followers[idxEndOflocalStack]){
         // // pass id to follower
-        out.clusterIndex[j] = temp_clusterIndex;
+        out[j].clusterIndex = temp_clusterIndex;
         // push_back follower to localStack
         //localStack[localStackSize] = j;
         localStackSize++;