From 14024e9d51e84de2a18b25dc6ef0051e27568cca Mon Sep 17 00:00:00 2001
From: sbaldu <simone.balducci00@gmail.com>
Date: Fri, 5 Jan 2024 13:11:14 +0100
Subject: [PATCH] Prepare compilation for AMD GPUs

---
 CLUEstering/alpaka/BindingModules/Makefile    |   9 +
 .../alpaka/BindingModules/binding_gpu_hip.cc  | 237 ++++++++++++++++++
 2 files changed, 246 insertions(+)
 create mode 100644 CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc
diff --git a/CLUEstering/alpaka/BindingModules/Makefile b/CLUEstering/alpaka/BindingModules/Makefile
index ada50437..043b6566 100644
--- a/CLUEstering/alpaka/BindingModules/Makefile
+++ b/CLUEstering/alpaka/BindingModules/Makefile
@@ -2,6 +2,7 @@
 # Compilers
 export CXX := g++
 export CUDA := nvcc
+export HIP := hipcc
 
 export CUDA_ARCH := 50 60 61 62 70
 
@@ -12,6 +13,10 @@ export CUDA_FLAGS = -x cu --expt-relaxed-constexpr -gencode arch=compute_61,code
 
 # CUDA_FLAGS := $$(foreach ARCH,-gencode arch=compute_$$(CUDA_ARCH),code=[sm_$$(CUDA_ARCH),compute_$$(CUDA_ARCH)]) -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored --expt-relaxed-constexpr --expt-extended-lambda --generate-line-info --source-in-ptx --display-error-number --cudart=shared
 # $(2)NVCC_COMMON := -std=c++17 -O3 -g $$($(2)NVCC_FLAGS) -ccbin $(CXX) --compiler-options '$(HOST_CXXFLAGS) $(USER_CXXFLAGS)'
+export HIP_BASE = /opt/rocm/
+export HIP_FLAGS := -I$(HIP_BASE)/include \
+					-I$(HIP_BASE)/hiprand/include \
+					-I$(HIP_BASE)/rocrand/include
 
 TBB_FLAGS = -ltbb
 
@@ -27,6 +32,7 @@ BOOST_PATH = /usr/include/boost
 ALPAKA_SERIAL_FLAGS = -DALPAKA_HOST_ONLY -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND
 ALPAKA_TBB_FLAGS = -DALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND
 ALPAKA_CUDA_FLAGS = -DALPAKA_ACC_GPU_CUDA_PRESENT -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND 
+ALPAKA_HIP_FLAGS = -DALPAKA_ACC_GPU_HIP_PRESENT -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND
 
 # Binding flags
 PYTHON_VERS := $(shell python3 -V | awk -F '' '{print $$8$$9$$10$$11}' | sed 's/\.//g'  )
@@ -58,5 +64,8 @@ tbb:
 cuda:
 	$(CUDA) $(CUDA_FLAGS) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_CUDA_FLAGS) $(CUDA_BINDING_FLAGS) binding_gpu_cuda.cc -o $(CUDA_MODULE_NAME)
 
+hip:
+	$(HIP) $(HIP_FLAGS) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_HIP_FLAGS) $(BINDING_FLAGS) binding_gpu_hip.cc -o CLUE_GPU_HIP.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so
+
 kernel:
 	$(CXX) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_SERIAL_FLAGS) $(BINDING_FLAGS) binding_kernels.cc -o $(KERNELS_MODULE_NAME)
diff --git a/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc b/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc
new file mode 100644
index 00000000..58575061
--- /dev/null
+++ b/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc
@@ -0,0 +1,237 @@
+#include <alpaka/alpaka.hpp>
+#include <vector>
+
+#include "../CLUE/CLUEAlgoAlpaka.h"
+#include "../CLUE/Run.h"
+#include "../DataFormats/Points.h"
+#include "../DataFormats/alpaka/PointsAlpaka.h"
+#include "../AlpakaCore/initialise.h"
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/functional.h>
+#include <stdint.h>
+
+using cms::alpakatools::initialise;
+
+namespace alpaka_rocm_async {
+  /// Lists on stdout the devices alpaka::getDevs<Platform>() reports; `backend` only labels the text.
+  void listDevices(const std::string& backend) {
+    const std::vector<Device> found = alpaka::getDevs<Platform>();
+    if (found.empty()) {
+      std::cout << "No devices found for the " << backend << " backend." << std::endl;
+      return;
+    }
+    std::cout << backend << " devices found: \n";
+    size_t index = 0;
+    for (const auto& device : found) {
+      std::cout << '\t' << "device " << index++ << ": " << alpaka::getName(device) << '\n';
+    }
+  }
+
+  /// Runs the clustering with a FlatKernel on the device at index `device_id`, dispatching on
+  /// `Ndim`. Only Ndim == 2 is enabled (forwards to run2); any other value returns an empty vector.
+  std::vector<std::vector<int>> mainRun(float dc,
+                                        float rhoc,
+                                        float outlier,
+                                        int pPBin,
+                                        const std::vector<std::vector<float>>& coords,
+                                        const std::vector<float>& weights,
+                                        const FlatKernel& kernel,
+                                        int Ndim,
+                                        size_t block_size,
+                                        size_t device_id) {
+    auto const dev_acc = alpaka::getDevByIdx<Acc1D>(device_id);
+    /* initialise<Platform>(); */
+    Queue queue_(dev_acc);  // queue handed to the run function
+
+    // Dimensions whose call is still commented out leave the switch through `break`.
+    switch (Ndim) {
+      [[unlikely]] case (1) :
+        /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[likely]] case (2) :
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
+      [[likely]] case (3) :
+        /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (4) :
+        /* return run4(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (5) :
+        /* return run5(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (6) :
+        /* return run6(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (7) :
+        /* return run7(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (8) :
+        /* return run8(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (9) :
+        /* return run9(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (10) :
+        /* return run10(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] default:
+        std::cout << "This library only works up to 10 dimensions\n";
+        break;
+    }
+    // Flowing off the end of a non-void function is undefined behaviour, so every
+    // path that did not return above hands back an empty result explicitly.
+    return {};
+  }
+
+  /// Runs the clustering with an ExponentialKernel on the device at index `device_id`, dispatching
+  /// on `Ndim`. Only Ndim == 2 is enabled (forwards to run2); other values return an empty vector.
+  std::vector<std::vector<int>> mainRun(float dc,
+                                        float rhoc,
+                                        float outlier,
+                                        int pPBin,
+                                        const std::vector<std::vector<float>>& coords,
+                                        const std::vector<float>& weights,
+                                        const ExponentialKernel& kernel,
+                                        int Ndim,
+                                        size_t block_size,
+                                        size_t device_id) {
+    auto const dev_acc = alpaka::getDevByIdx<Acc1D>(device_id);
+    Queue queue_(dev_acc);  // queue handed to the run function
+    // Dimensions whose call is still commented out leave the switch through `break`.
+    switch (Ndim) {
+      [[unlikely]] case (1) :
+        /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[likely]] case (2) :
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
+      [[likely]] case (3) :
+        /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (4) :
+        /* return run4(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (5) :
+        /* return run5(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (6) :
+        /* return run6(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (7) :
+        /* return run7(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (8) :
+        /* return run8(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (9) :
+        /* return run9(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (10) :
+        /* return run10(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] default:
+        std::cout << "This library only works up to 10 dimensions\n";
+        break;
+    }
+    // Flowing off the end of a non-void function is undefined behaviour, so every
+    // path that did not return above hands back an empty result explicitly.
+    return {};
+  }
+
+  /// Runs the clustering with a GaussianKernel on the device at index `device_id`, dispatching
+  /// on `Ndim`. Only Ndim == 2 is enabled (forwards to run2); other values return an empty vector.
+  std::vector<std::vector<int>> mainRun(float dc,
+                                        float rhoc,
+                                        float outlier,
+                                        int pPBin,
+                                        const std::vector<std::vector<float>>& coords,
+                                        const std::vector<float>& weights,
+                                        const GaussianKernel& kernel,
+                                        int Ndim,
+                                        size_t block_size,
+                                        size_t device_id) {
+    auto const dev_acc = alpaka::getDevByIdx<Acc1D>(device_id);
+    Queue queue_(dev_acc);  // queue handed to the run function
+    // Dimensions whose call is still commented out leave the switch through `break`.
+    switch (Ndim) {
+      [[unlikely]] case (1) :
+        /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[likely]] case (2) :
+        return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
+      [[likely]] case (3) :
+        /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (4) :
+        /* return run4(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (5) :
+        /* return run5(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (6) :
+        /* return run6(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (7) :
+        /* return run7(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (8) :
+        /* return run8(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (9) :
+        /* return run9(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] case (10) :
+        /* return run10(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
+        break;
+      [[unlikely]] default:
+        std::cout << "This library only works up to 10 dimensions\n";
+        break;
+    }
+    // Flowing off the end of a non-void function is undefined behaviour, so every
+    // path that did not return above hands back an empty result explicitly.
+    return {};
+  }
+
+  // Python extension module CLUE_GPU_HIP. It exposes listDevices plus the three
+  // kernel-specific mainRun overloads, all registered under the same Python
+  // name; the argument lists below mirror the C++ signatures defined above.
+  PYBIND11_MODULE(CLUE_GPU_HIP, m) {
+    namespace py = pybind11;
+    using Coordinates = std::vector<std::vector<float>>;
+    using Weights = std::vector<float>;
+
+    m.doc() = "Binding of the CLUE algorithm running on AMD GPUs";
+
+    m.def("listDevices", &listDevices, "List the available devices for the HIP/ROCm backend");
+
+    // One registration per kernel type, kept in the order Flat, Exponential,
+    // Gaussian. overload_cast picks the matching C++ overload at compile time.
+    // FlatKernel overload.
+    m.def("mainRun",
+          py::overload_cast<float, float, float, int,  // dc, rhoc, outlier, pPBin
+                            const Coordinates&,        // coords
+                            const Weights&,            // weights
+                            const FlatKernel&,         // kernel
+                            int, size_t, size_t>(&mainRun),  // Ndim, block_size, device_id
+          "mainRun");
+
+    // ExponentialKernel overload.
+    m.def("mainRun",
+          py::overload_cast<float, float, float, int,  // dc, rhoc, outlier, pPBin
+                            const Coordinates&,        // coords
+                            const Weights&,            // weights
+                            const ExponentialKernel&,  // kernel
+                            int, size_t, size_t>(&mainRun),  // Ndim, block_size, device_id
+          "mainRun");
+
+    // GaussianKernel overload.
+    m.def("mainRun",
+          py::overload_cast<float, float, float, int,  // dc, rhoc, outlier, pPBin
+                            const Coordinates&,        // coords
+                            const Weights&,            // weights
+                            const GaussianKernel&,     // kernel
+                            int, size_t, size_t>(&mainRun),  // Ndim, block_size, device_id
+          "mainRun");
+  }
+};  // namespace alpaka_rocm_async