From eb104c4468280080fd872bc99f38ef9aa541beb2 Mon Sep 17 00:00:00 2001
From: sbaldu <simone.balducci00@gmail.com>
Date: Thu, 14 Dec 2023 11:51:16 +0100
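Subject: [PATCH] Pass block_size to mainRun as a parameter

Replace the CLUEAlgoAlpaka::setBlockSize setter and its blockSize_ member
with a block_size argument that run_clue (default 1024) forwards through
mainRun, run2, make_clusters and setup. The set_blocksize binding in
binding_cpu.cc is removed.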
---
 .../alpaka/BindingModules/CLUEstering.py      | 11 ++++---
 .../alpaka/BindingModules/binding_cpu.cc      | 26 ++++++++++-------
 .../alpaka/BindingModules/binding_cpu_tbb.cc  | 24 +++++++++------
 .../alpaka/BindingModules/binding_gpu_cuda.cc | 24 +++++++++------
 CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h      | 29 +++++++++----------
 CLUEstering/alpaka/CLUE/Run.h                 | 15 ++++++----
 6 files changed, 75 insertions(+), 54 deletions(-)
diff --git a/CLUEstering/alpaka/BindingModules/CLUEstering.py b/CLUEstering/alpaka/BindingModules/CLUEstering.py
index f36b9d34..a1ad2efc 100644
--- a/CLUEstering/alpaka/BindingModules/CLUEstering.py
+++ b/CLUEstering/alpaka/BindingModules/CLUEstering.py
@@ -504,7 +504,10 @@ def choose_kernel(self,
             raise ValueError("Invalid kernel. The allowed choices for the"
                              + " kernels are: flat, exp, gaus and custom.")
 
-    def run_clue(self, backend: str = "cpu serial", verbose: bool = False) -> None:
+    def run_clue(self,
+                 backend: str = "cpu serial",
+                 block_size: int = 1024,
+                 verbose: bool = False) -> None:
         """
         Executes the CLUE clustering algorithm.
 
@@ -537,15 +540,15 @@ def run_clue(self, backend: str = "cpu serial", verbose: bool = False) -> None:
         if backend == "cpu serial":
             cluster_id_is_seed = cpu_serial.mainRun(self.dc_, self.rhoc, self.outlier, self.ppbin,
                                                     self.clust_data.coords, self.clust_data.weight,
-                                                    self.kernel, self.clust_data.n_dim)
+                                                    self.kernel, self.clust_data.n_dim, block_size)
         elif backend == "cpu tbb":
             cluster_id_is_seed = cpu_serial.mainRun(self.dc_, self.rhoc, self.outlier, self.ppbin,
                                                     self.clust_data.coords, self.clust_data.weight,
-                                                    self.kernel, self.clust_data.n_dim)
+                                                    self.kernel, self.clust_data.n_dim, block_size)
         elif backend == "gpu cuda":
             cluster_id_is_seed = gpu_cuda.mainRun(self.dc_, float(self.rhoc), self.outlier, self.ppbin,
                                                   self.clust_data.coords, self.clust_data.weight,
-                                                  self.kernel, self.clust_data.n_dim)
+                                                  self.kernel, self.clust_data.n_dim, block_size)
             # cluster_id_is_seed = cpu_tbb.mainRun(self.dc_, self.rhoc, self.outlier, self.ppbin,
             #                                      self.clust_data.coords, self.clust_data.weight,
             #                                      self.kernel, self.clust_data.n_dim)
diff --git a/CLUEstering/alpaka/BindingModules/binding_cpu.cc b/CLUEstering/alpaka/BindingModules/binding_cpu.cc
index bd2eae4e..82fb7f57 100644
--- a/CLUEstering/alpaka/BindingModules/binding_cpu.cc
+++ b/CLUEstering/alpaka/BindingModules/binding_cpu.cc
@@ -20,7 +20,8 @@ namespace alpaka_serial_sync {
                                         const std::vector<std::vector<float>>& coords,
                                         const std::vector<float>& weights,
                                         const FlatKernel& kernel,
-                                        int Ndim) {
+                                        int Ndim,
+										size_t block_size = 1024) {
     auto const dev_acc = alpaka::getDevByIdx<Acc1D>(0u);
 
     // Create the queue
@@ -41,7 +42,7 @@ namespace alpaka_serial_sync {
         /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
         break;
       [[likely]] case (2) :
-        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_);
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
         break;
       [[likely]] case (3) :
         /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
@@ -81,7 +82,8 @@ namespace alpaka_serial_sync {
                                         const std::vector<std::vector<float>>& coords,
                                         const std::vector<float>& weights,
                                         const ExponentialKernel& kernel,
-                                        int Ndim) {
+                                        int Ndim,
+										size_t block_size = 1024) {
     auto const dev_acc = alpaka::getDevByIdx<Acc1D>(0u);
 
     // Create the queue
@@ -102,7 +104,7 @@ namespace alpaka_serial_sync {
         /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
         break;
       [[likely]] case (2) :
-        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_);
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
         break;
       [[likely]] case (3) :
         /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
@@ -142,7 +144,8 @@ namespace alpaka_serial_sync {
                                         const std::vector<std::vector<float>>& coords,
                                         const std::vector<float>& weights,
                                         const GaussianKernel& kernel,
-                                        int Ndim) {
+                                        int Ndim,
+										size_t block_size = 1024) {
     auto const dev_acc = alpaka::getDevByIdx<Acc1D>(0u);
 
     // Create the queue
@@ -163,7 +166,7 @@ namespace alpaka_serial_sync {
         /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
         break;
       [[likely]] case (2) :
-        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_);
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
         break;
       [[likely]] case (3) :
         /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
@@ -207,7 +210,8 @@ namespace alpaka_serial_sync {
                                   const std::vector<std::vector<float>>&,
                                   const std::vector<float>&,
                                   const FlatKernel&,
-                                  int>(&mainRun),
+                                  int,
+								  size_t>(&mainRun),
           "mainRun");
     m.def("mainRun",
           pybind11::overload_cast<float,
@@ -217,7 +221,8 @@ namespace alpaka_serial_sync {
                                   const std::vector<std::vector<float>>&,
                                   const std::vector<float>&,
                                   const ExponentialKernel&,
-                                  int>(&mainRun),
+                                  int,
+								  size_t>(&mainRun),
           "mainRun");
     m.def("mainRun",
           pybind11::overload_cast<float,
@@ -227,9 +232,8 @@ namespace alpaka_serial_sync {
                                   const std::vector<std::vector<float>>&,
                                   const std::vector<float>&,
                                   const GaussianKernel&,
-                                  int>(&mainRun),
+                                  int,
+								  size_t>(&mainRun),
           "mainRun");
-
-	m.def("set_blocksize", CLUEAlgoAlpaka::setBlockSize, "set_blocksize");
   }
 };  // namespace alpaka_serial_sync
diff --git a/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc b/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc
index bd2b4797..cae0d353 100644
--- a/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc
+++ b/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc
@@ -20,7 +20,8 @@ namespace alpaka_tbb_async {
                                         const std::vector<std::vector<float>>& coords,
                                         const std::vector<float>& weights,
 										const FlatKernel& kernel,
-                                        int Ndim) {
+                                        int Ndim,
+										size_t block_size) {
     auto const dev_acc = alpaka::getDevByIdx<Acc1D>(0u);
 
     // Create the queue
@@ -41,7 +42,7 @@ namespace alpaka_tbb_async {
         /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
         break;
       [[likely]] case (2) :
-        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_);
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
         break;
       [[likely]] case (3) :
         /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
@@ -81,7 +82,8 @@ namespace alpaka_tbb_async {
                                         const std::vector<std::vector<float>>& coords,
                                         const std::vector<float>& weights,
 										const ExponentialKernel& kernel,
-                                        int Ndim) {
+                                        int Ndim,
+										size_t block_size) {
     auto const dev_acc = alpaka::getDevByIdx<Acc1D>(0u);
 
     // Create the queue
@@ -102,7 +104,7 @@ namespace alpaka_tbb_async {
         /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
         break;
       [[likely]] case (2) :
-        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_);
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
         break;
       [[likely]] case (3) :
         /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
@@ -142,7 +144,8 @@ namespace alpaka_tbb_async {
                                         const std::vector<std::vector<float>>& coords,
                                         const std::vector<float>& weights,
 										const GaussianKernel& kernel,
-                                        int Ndim) {
+                                        int Ndim,
+										size_t block_size) {
     auto const dev_acc = alpaka::getDevByIdx<Acc1D>(0u);
 
     // Create the queue
@@ -163,7 +166,7 @@ namespace alpaka_tbb_async {
         /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
         break;
       [[likely]] case (2) :
-        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_);
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
         break;
       [[likely]] case (3) :
         /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */
@@ -221,7 +224,8 @@ namespace alpaka_tbb_async {
                                   const std::vector<std::vector<float>>&,
                                   const std::vector<float>&,
                                   const FlatKernel&,
-                                  int>(&mainRun),
+                                  int,
+								  size_t>(&mainRun),
           "mainRun");
     m.def("mainRun",
           pybind11::overload_cast<float,
@@ -231,7 +235,8 @@ namespace alpaka_tbb_async {
                                   const std::vector<std::vector<float>>&,
                                   const std::vector<float>&,
                                   const ExponentialKernel&,
-                                  int>(&mainRun),
+                                  int,
+								  size_t>(&mainRun),
           "mainRun");
     m.def("mainRun",
           pybind11::overload_cast<float,
@@ -241,7 +246,8 @@ namespace alpaka_tbb_async {
                                   const std::vector<std::vector<float>>&,
                                   const std::vector<float>&,
                                   const GaussianKernel&,
-                                  int>(&mainRun),
+                                  int,
+								  size_t>(&mainRun),
           "mainRun");
 
     /* m.def("mainRun", &mainRun, "mainRun"); */
diff --git a/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc b/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc
index 5069bf71..0f36f115 100644
--- a/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc
+++ b/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc
@@ -22,7 +22,8 @@ namespace alpaka_cuda_async {
                                         const std::vector<std::vector<float>>& coords,
                                         const std::vector<float>& weights,
                                         const FlatKernel& kernel,
-                                        int Ndim) {
+                                        int Ndim,
+										size_t block_size) {
 	std::vector<Device> devices = alpaka::getDevs<Platform>();
 
     auto const dev_acc = alpaka::getDevByIdx<Acc1D>(0u);
@@ -47,7 +48,7 @@ namespace alpaka_cuda_async {
         /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
         break;
       [[likely]] case (2) :
-        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_);
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
         break;
       [[likely]] case (3) :
         /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
@@ -87,7 +88,8 @@ namespace alpaka_cuda_async {
                                         const std::vector<std::vector<float>>& coords,
                                         const std::vector<float>& weights,
                                         const ExponentialKernel& kernel,
-                                        int Ndim) {
+                                        int Ndim,
+										size_t block_size) {
     auto const dev_acc = alpaka::getDevByIdx<Acc1D>(0u);
 
     // Create the queue
@@ -108,7 +110,7 @@ namespace alpaka_cuda_async {
         /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
         break;
       [[likely]] case (2) :
-        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_);
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
         break;
       [[likely]] case (3) :
         /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
@@ -148,7 +150,8 @@ namespace alpaka_cuda_async {
                                         const std::vector<std::vector<float>>& coords,
                                         const std::vector<float>& weights,
                                         const GaussianKernel& kernel,
-                                        int Ndim) {
+                                        int Ndim,
+										size_t block_size) {
     auto const dev_acc = alpaka::getDevByIdx<Acc1D>(0u);
 
     // Create the queue
@@ -169,7 +172,7 @@ namespace alpaka_cuda_async {
         /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
         break;
       [[likely]] case (2) :
-        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_);
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
         break;
       [[likely]] case (3) :
         /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
@@ -213,7 +216,8 @@ namespace alpaka_cuda_async {
                                   const std::vector<std::vector<float>>&,
                                   const std::vector<float>&,
                                   const FlatKernel&,
-                                  int>(&mainRun),
+                                  int,
+								  size_t>(&mainRun),
           "mainRun");
     m.def("mainRun",
           pybind11::overload_cast<float,
@@ -223,7 +227,8 @@ namespace alpaka_cuda_async {
                                   const std::vector<std::vector<float>>&,
                                   const std::vector<float>&,
                                   const ExponentialKernel&,
-                                  int>(&mainRun),
+                                  int,
+								  size_t>(&mainRun),
           "mainRun");
     m.def("mainRun",
           pybind11::overload_cast<float,
@@ -233,7 +238,8 @@ namespace alpaka_cuda_async {
                                   const std::vector<std::vector<float>>&,
                                   const std::vector<float>&,
                                   const GaussianKernel&,
-                                  int>(&mainRun),
+                                  int,
+								  size_t>(&mainRun),
           "mainRun");
   }
 };  // namespace alpaka_tbb_async
diff --git a/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h b/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h
index 18859a3d..53b70e0e 100644
--- a/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h
+++ b/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h
@@ -37,13 +37,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
     VecArray<int32_t, max_seeds>* m_seeds;
     VecArray<int32_t, max_followers>* m_followers;
 
-	void setBlockSize(std::size_t blockSize) { blockSize_ = blockSize; }
-
     template <typename KernelType>
     std::vector<std::vector<int>> make_clusters(Points<Ndim>& h_points,
                                                 PointsAlpaka<Ndim>& d_points,
                                                 const KernelType& kernel,
-                                                Queue queue_);
+                                                Queue queue_,
+												size_t block_size);
 
   private:
     float dc_;
@@ -52,8 +51,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
     // average number of points found in a tile
     int pointsPerTile_;
 
-	std::size_t blockSize_ = 1024;
-
     /* domain_t<Ndim> m_domains; */
 
     // Buffers
@@ -66,7 +63,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
 
     // Private methods
     void init_device(Queue queue_);
-    void setup(const Points<Ndim>& h_points, PointsAlpaka<Ndim>& d_points, Queue queue_);
+    void setup(const Points<Ndim>& h_points, PointsAlpaka<Ndim>& d_points, Queue queue_, size_t block_size);
 
     // Construction of the tiles
     void calculate_tile_size(TilesAlpaka<Ndim>& h_tiles, const Points<Ndim>& h_points);
@@ -107,7 +104,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
   template <typename TAcc, uint8_t Ndim>
   void CLUEAlgoAlpaka<TAcc, Ndim>::setup(const Points<Ndim>& h_points,
                                          PointsAlpaka<Ndim>& d_points,
-                                         Queue queue_) {
+                                         Queue queue_,
+										 size_t block_size) {
     // Create temporary tiles object
     TilesAlpaka<Ndim> temp;
     calculate_tile_size(temp, h_points);
@@ -122,8 +120,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
     alpaka::memset(queue_, (*d_seeds), 0x00);
 
     // Define the working division
-    Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, blockSize_);
-    auto working_div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, blockSize_);
+    Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, block_size);
+    auto working_div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
     alpaka::enqueue(
         queue_,
         alpaka::createTaskKernel<Acc1D>(working_div, KernelResetFollowers{}, m_followers, h_points.n));
@@ -135,11 +133,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
   std::vector<std::vector<int>> CLUEAlgoAlpaka<TAcc, Ndim>::make_clusters(Points<Ndim>& h_points,
                                                                           PointsAlpaka<Ndim>& d_points,
                                                                           const KernelType& kernel,
-                                                                          Queue queue_) {
-    setup(h_points, d_points, queue_);
+                                                                          Queue queue_,
+																		  size_t block_size) {
+    setup(h_points, d_points, queue_, block_size);
 
-    const Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, blockSize_);
-    auto working_div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, blockSize_);
+    const Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, block_size);
+    auto working_div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
     alpaka::enqueue(queue_,
                     alpaka::createTaskKernel<Acc1D>(
                         working_div, KernelFillTiles(), d_points.view(), m_tiles, h_points.n));
@@ -174,8 +173,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
                                                     h_points.n));
 
     // We change the working division when assigning the clusters
-    const Idx grid_size_seeds = cms::alpakatools::divide_up_by(max_seeds, blockSize_);
-    auto working_div_seeds = cms::alpakatools::make_workdiv<Acc1D>(grid_size_seeds, blockSize_);
+    const Idx grid_size_seeds = cms::alpakatools::divide_up_by(max_seeds, block_size);
+    auto working_div_seeds = cms::alpakatools::make_workdiv<Acc1D>(grid_size_seeds, block_size);
     alpaka::enqueue(
         queue_,
         alpaka::createTaskKernel<Acc1D>(
diff --git a/CLUEstering/alpaka/CLUE/Run.h b/CLUEstering/alpaka/CLUE/Run.h
index 32233663..f9d3056d 100644
--- a/CLUEstering/alpaka/CLUE/Run.h
+++ b/CLUEstering/alpaka/CLUE/Run.h
@@ -31,14 +31,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
                                      std::vector<std::vector<float>> const &coordinates,
                                      std::vector<float> const &weight,
 									 const FlatKernel& kernel,
-									 Queue queue_) {
+									 Queue queue_,
+									 size_t block_size) {
     CLUEAlgoAlpaka<Acc1D, 2> algo(dc, rhoc, outlier, pPBin, queue_);
 
 	// Create the host and device points
 	Points<2> h_points(coordinates, weight);
 	PointsAlpaka<2> d_points(queue_, weight.size());
 
-    return algo.make_clusters(h_points, d_points, kernel, queue_);
+    return algo.make_clusters(h_points, d_points, kernel, queue_, block_size);
   }
 
   std::vector<std::vector<int>> run2(float dc,
@@ -48,14 +49,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
                                      std::vector<std::vector<float>> const &coordinates,
                                      std::vector<float> const &weight,
 									 const ExponentialKernel& kernel,
-									 Queue queue_) {
+									 Queue queue_,
+									 size_t block_size) {
     CLUEAlgoAlpaka<Acc1D, 2> algo(dc, rhoc, outlier, pPBin, queue_);
 
 	// Create the host and device points
 	Points<2> h_points(coordinates, weight);
 	PointsAlpaka<2> d_points(queue_, weight.size());
 
-    return algo.make_clusters(h_points, d_points, kernel, queue_);
+    return algo.make_clusters(h_points, d_points, kernel, queue_, block_size);
   }
 
   std::vector<std::vector<int>> run2(float dc,
@@ -65,14 +67,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
                                      std::vector<std::vector<float>> const &coordinates,
                                      std::vector<float> const &weight,
 									 const GaussianKernel& kernel,
-									 Queue queue_) {
+									 Queue queue_,
+									 size_t block_size) {
     CLUEAlgoAlpaka<Acc1D, 2> algo(dc, rhoc, outlier, pPBin, queue_);
 
 	// Create the host and device points
 	Points<2> h_points(coordinates, weight);
 	PointsAlpaka<2> d_points(queue_, weight.size());
 
-    return algo.make_clusters(h_points, d_points, kernel, queue_);
+    return algo.make_clusters(h_points, d_points, kernel, queue_, block_size);
   }
 
   /* std::vector<std::vector<int>> run3(float dc, */