From eb104c4468280080fd872bc99f38ef9aa541beb2 Mon Sep 17 00:00:00 2001 From: sbaldu Date: Thu, 14 Dec 2023 11:51:16 +0100 Subject: [PATCH] Change block_size to parameter for mainRun --- .../alpaka/BindingModules/CLUEstering.py | 11 ++++--- .../alpaka/BindingModules/binding_cpu.cc | 26 ++++++++++------- .../alpaka/BindingModules/binding_cpu_tbb.cc | 24 +++++++++------ .../alpaka/BindingModules/binding_gpu_cuda.cc | 24 +++++++++------ CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h | 29 +++++++++---------- CLUEstering/alpaka/CLUE/Run.h | 15 ++++++---- 6 files changed, 75 insertions(+), 54 deletions(-) diff --git a/CLUEstering/alpaka/BindingModules/CLUEstering.py b/CLUEstering/alpaka/BindingModules/CLUEstering.py index f36b9d34..a1ad2efc 100644 --- a/CLUEstering/alpaka/BindingModules/CLUEstering.py +++ b/CLUEstering/alpaka/BindingModules/CLUEstering.py @@ -504,7 +504,10 @@ def choose_kernel(self, raise ValueError("Invalid kernel. The allowed choices for the" + " kernels are: flat, exp, gaus and custom.") - def run_clue(self, backend: str = "cpu serial", verbose: bool = False) -> None: + def run_clue(self, + backend: str = "cpu serial", + block_size: int = 1024, + verbose: bool = False) -> None: """ Executes the CLUE clustering algorithm. @@ -537,15 +540,15 @@ def run_clue(self, backend: str = "cpu serial", verbose: bool = False) -> None: if backend == "cpu serial": cluster_id_is_seed = cpu_serial.mainRun(self.dc_, self.rhoc, self.outlier, self.ppbin, self.clust_data.coords, self.clust_data.weight, - self.kernel, self.clust_data.n_dim) + self.kernel, self.clust_data.n_dim, block_size) elif backend == "cpu tbb": cluster_id_is_seed = cpu_serial.mainRun(self.dc_, self.rhoc, self.outlier, self.ppbin, self.clust_data.coords, self.clust_data.weight, - self.kernel, self.clust_data.n_dim) + self.kernel, self.clust_data.n_dim, block_size) elif backend == "gpu cuda": cluster_id_is_seed = gpu_cuda.mainRun(self.dc_, float(self.rhoc), self.outlier, self.ppbin, self.clust_data.coords, self.clust_data.weight, - self.kernel, self.clust_data.n_dim) + self.kernel, self.clust_data.n_dim, block_size) # cluster_id_is_seed = cpu_tbb.mainRun(self.dc_, self.rhoc, self.outlier, self.ppbin, # self.clust_data.coords, self.clust_data.weight, # self.kernel, self.clust_data.n_dim) diff --git a/CLUEstering/alpaka/BindingModules/binding_cpu.cc b/CLUEstering/alpaka/BindingModules/binding_cpu.cc index bd2eae4e..82fb7f57 100644 --- a/CLUEstering/alpaka/BindingModules/binding_cpu.cc +++ b/CLUEstering/alpaka/BindingModules/binding_cpu.cc @@ -20,7 +20,8 @@ namespace alpaka_serial_sync { const std::vector>& coords, const std::vector& weights, const FlatKernel& kernel, - int Ndim) { + int Ndim, + size_t block_size = 1024) { auto const dev_acc = alpaka::getDevByIdx(0u); // Create the queue @@ -41,7 +42,7 @@ namespace alpaka_serial_sync { /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ break; [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); break; [[likely]] case (3) : /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ @@ -81,7 +82,8 @@ namespace alpaka_serial_sync { const std::vector>& coords, const std::vector& weights, const ExponentialKernel& kernel, - int Ndim) { + int Ndim, + size_t block_size = 1024) { auto const dev_acc = alpaka::getDevByIdx(0u); // Create the queue @@ -102,7 +104,7 @@ namespace alpaka_serial_sync { /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ break; [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); break; [[likely]] case (3) : /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ @@ -142,7 +144,8 @@ namespace alpaka_serial_sync { const std::vector>& coords, const std::vector& weights, const GaussianKernel& kernel, - int Ndim) { + int Ndim, + size_t block_size) { auto const dev_acc = alpaka::getDevByIdx(0u); // Create the queue @@ -163,7 +166,7 @@ namespace alpaka_serial_sync { /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ break; [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); break; [[likely]] case (3) : /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ @@ -207,7 +210,8 @@ namespace alpaka_serial_sync { const std::vector>&, const std::vector&, const FlatKernel&, - int>(&mainRun), + int, + size_t>(&mainRun), "mainRun"); m.def("mainRun", pybind11::overload_cast>&, const std::vector&, const ExponentialKernel&, - int>(&mainRun), + int, + size_t>(&mainRun), "mainRun"); m.def("mainRun", pybind11::overload_cast>&, const std::vector&, const GaussianKernel&, - int>(&mainRun), + int, + size_t>(&mainRun), "mainRun"); - - m.def("set_blocksize", CLUEAlgoAlpaka::setBlockSize, "set_blocksize"); } }; // namespace alpaka_serial_sync diff --git a/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc b/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc index bd2b4797..cae0d353 100644 --- a/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc +++ b/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc @@ -20,7 +20,8 @@ namespace alpaka_tbb_async { const std::vector>& coords, const std::vector& weights, const FlatKernel& kernel, - int Ndim) { + int Ndim, + size_t block_size) { auto const dev_acc = alpaka::getDevByIdx(0u); // Create the queue @@ -41,7 +42,7 @@ namespace alpaka_tbb_async { /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ break; [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); break; [[likely]] case (3) : /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ @@ -81,7 +82,8 @@ namespace alpaka_tbb_async { const std::vector>& coords, const std::vector& weights, const ExponentialKernel& kernel, - int Ndim) { + int Ndim, + size_t block_size) { auto const dev_acc = alpaka::getDevByIdx(0u); // Create the queue @@ -102,7 +104,7 @@ namespace alpaka_tbb_async { /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ break; [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); break; [[likely]] case (3) : /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ @@ -142,7 +144,8 @@ namespace alpaka_tbb_async { const std::vector>& coords, const std::vector& weights, const GaussianKernel& kernel, - int Ndim) { + int Ndim, + size_t block_size) { auto const dev_acc = alpaka::getDevByIdx(0u); // Create the queue @@ -163,7 +166,7 @@ namespace alpaka_tbb_async { /* return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ break; [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); break; [[likely]] case (3) : /* return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); */ @@ -221,7 +224,8 @@ namespace alpaka_tbb_async { const std::vector>&, const std::vector&, const FlatKernel&, - int>(&mainRun), + int, + size_t>(&mainRun), "mainRun"); m.def("mainRun", pybind11::overload_cast>&, const std::vector&, const ExponentialKernel&, - int>(&mainRun), + int, + size_t>(&mainRun), "mainRun"); m.def("mainRun", pybind11::overload_cast>&, const std::vector&, const GaussianKernel&, - int>(&mainRun), + int, + size_t>(&mainRun), "mainRun"); /* m.def("mainRun", &mainRun, "mainRun"); */ diff --git a/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc b/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc index 5069bf71..0f36f115 100644 --- a/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc +++ b/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc @@ -22,7 +22,8 @@ namespace alpaka_cuda_async { const std::vector>& coords, const std::vector& weights, const FlatKernel& kernel, - int Ndim) { + int Ndim, + size_t block_size) { std::vector devices = alpaka::getDevs(); auto const dev_acc = alpaka::getDevByIdx(0u); @@ -47,7 +48,7 @@ namespace alpaka_cuda_async { /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ break; [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); break; [[likely]] case (3) : /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ @@ -87,7 +88,8 @@ namespace alpaka_cuda_async { const std::vector>& coords, const std::vector& weights, const ExponentialKernel& kernel, - int Ndim) { + int Ndim, + size_t block_size) { auto const dev_acc = alpaka::getDevByIdx(0u); // Create the queue @@ -108,7 +110,7 @@ namespace alpaka_cuda_async { /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ break; [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); break; [[likely]] case (3) : /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ @@ -148,7 +150,8 @@ namespace alpaka_cuda_async { const std::vector>& coords, const std::vector& weights, const GaussianKernel& kernel, - int Ndim) { + int Ndim, + size_t block_size) { auto const dev_acc = alpaka::getDevByIdx(0u); // Create the queue @@ -169,7 +172,7 @@ namespace alpaka_cuda_async { /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ break; [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_); + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); break; [[likely]] case (3) : /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ @@ -213,7 +216,8 @@ namespace alpaka_cuda_async { const std::vector>&, const std::vector&, const FlatKernel&, - int>(&mainRun), + int, + size_t>(&mainRun), "mainRun"); m.def("mainRun", pybind11::overload_cast>&, const std::vector&, const ExponentialKernel&, - int>(&mainRun), + int, + size_t>(&mainRun), "mainRun"); m.def("mainRun", pybind11::overload_cast>&, const std::vector&, const GaussianKernel&, - int>(&mainRun), + int, + size_t>(&mainRun), "mainRun"); } }; // namespace alpaka_tbb_async diff --git a/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h b/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h index 18859a3d..53b70e0e 100644 --- a/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h +++ b/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h @@ -37,13 +37,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { VecArray* m_seeds; VecArray* m_followers; - void setBlockSize(std::size_t blockSize) { blockSize_ = blockSize; } - template std::vector> make_clusters(Points& h_points, PointsAlpaka& d_points, const KernelType& kernel, - Queue queue_); + Queue queue_, + size_t block_size); private: float dc_; @@ -52,8 +51,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // average number of points found in a tile int pointsPerTile_; - std::size_t blockSize_ = 1024; - /* domain_t m_domains; */ // Buffers @@ -66,7 +63,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // Private methods void init_device(Queue queue_); - void setup(const Points& h_points, PointsAlpaka& d_points, Queue queue_); + void setup(const Points& h_points, PointsAlpaka& d_points, Queue queue_, size_t block_size); // Construction of the tiles void calculate_tile_size(TilesAlpaka& h_tiles, const Points& h_points); @@ -107,7 +104,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template void CLUEAlgoAlpaka::setup(const Points& h_points, PointsAlpaka& d_points, - Queue queue_) { + Queue queue_, + size_t block_size) { // Create temporary tiles object TilesAlpaka temp; calculate_tile_size(temp, h_points); @@ -122,8 +120,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::memset(queue_, (*d_seeds), 0x00); // Define the working division - Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, blockSize_); - auto working_div = cms::alpakatools::make_workdiv(grid_size, blockSize_); + Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, block_size); + auto working_div = cms::alpakatools::make_workdiv(grid_size, block_size); alpaka::enqueue( queue_, alpaka::createTaskKernel(working_div, KernelResetFollowers{}, m_followers, h_points.n)); @@ -135,11 +133,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::vector> CLUEAlgoAlpaka::make_clusters(Points& h_points, PointsAlpaka& d_points, const KernelType& kernel, - Queue queue_) { - setup(h_points, d_points, queue_); + Queue queue_, + size_t block_size) { + setup(h_points, d_points, queue_, block_size); - const Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, blockSize_); - auto working_div = cms::alpakatools::make_workdiv(grid_size, blockSize_); + const Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, block_size); + auto working_div = cms::alpakatools::make_workdiv(grid_size, block_size); alpaka::enqueue(queue_, alpaka::createTaskKernel( working_div, KernelFillTiles(), d_points.view(), m_tiles, h_points.n)); @@ -174,8 +173,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { h_points.n)); // We change the working division when assigning the clusters - const Idx grid_size_seeds = cms::alpakatools::divide_up_by(max_seeds, blockSize_); - auto working_div_seeds = cms::alpakatools::make_workdiv(grid_size_seeds, blockSize_); + const Idx grid_size_seeds = cms::alpakatools::divide_up_by(max_seeds, block_size); + auto working_div_seeds = cms::alpakatools::make_workdiv(grid_size_seeds, block_size); alpaka::enqueue( queue_, alpaka::createTaskKernel( diff --git a/CLUEstering/alpaka/CLUE/Run.h b/CLUEstering/alpaka/CLUE/Run.h index 32233663..f9d3056d 100644 --- a/CLUEstering/alpaka/CLUE/Run.h +++ b/CLUEstering/alpaka/CLUE/Run.h @@ -31,14 +31,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::vector> const &coordinates, std::vector const &weight, const FlatKernel& kernel, - Queue queue_) { + Queue queue_, + size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); // Create the host and device points Points<2> h_points(coordinates, weight); PointsAlpaka<2> d_points(queue_, weight.size()); - return algo.make_clusters(h_points, d_points, kernel, queue_); + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); } std::vector> run2(float dc, @@ -48,14 +49,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::vector> const &coordinates, std::vector const &weight, const ExponentialKernel& kernel, - Queue queue_) { + Queue queue_, + size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); // Create the host and device points Points<2> h_points(coordinates, weight); PointsAlpaka<2> d_points(queue_, weight.size()); - return algo.make_clusters(h_points, d_points, kernel, queue_); + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); } std::vector> run2(float dc, @@ -65,14 +67,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::vector> const &coordinates, std::vector const &weight, const GaussianKernel& kernel, - Queue queue_) { + Queue queue_, + size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); // Create the host and device points Points<2> h_points(coordinates, weight); PointsAlpaka<2> d_points(queue_, weight.size()); - return algo.make_clusters(h_points, d_points, kernel, queue_); + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); } /* std::vector> run3(float dc, */