From 14024e9d51e84de2a18b25dc6ef0051e27568cca Mon Sep 17 00:00:00 2001 From: sbaldu Date: Fri, 5 Jan 2024 13:11:14 +0100 Subject: [PATCH] Prepare compilation for AMD GPUs --- CLUEstering/alpaka/BindingModules/Makefile | 9 + .../alpaka/BindingModules/binding_gpu_hip.cc | 248 ++++++++++++++++++ 2 files changed, 257 insertions(+) create mode 100644 CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc diff --git a/CLUEstering/alpaka/BindingModules/Makefile b/CLUEstering/alpaka/BindingModules/Makefile index ada50437..043b6566 100644 --- a/CLUEstering/alpaka/BindingModules/Makefile +++ b/CLUEstering/alpaka/BindingModules/Makefile @@ -2,6 +2,7 @@ # Compilers export CXX := g++ export CUDA := nvcc +export HIP := hipcc export CUDA_ARCH := 50 60 61 62 70 @@ -12,6 +13,10 @@ export CUDA_FLAGS = -x cu --expt-relaxed-constexpr -gencode arch=compute_61,code # CUDA_FLAGS := $$(foreach ARCH,-gencode arch=compute_$$(CUDA_ARCH),code=[sm_$$(CUDA_ARCH),compute_$$(CUDA_ARCH)]) -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored --expt-relaxed-constexpr --expt-extended-lambda --generate-line-info --source-in-ptx --display-error-number --cudart=shared # $(2)NVCC_COMMON := -std=c++17 -O3 -g $$($(2)NVCC_FLAGS) -ccbin $(CXX) --compiler-options '$(HOST_CXXFLAGS) $(USER_CXXFLAGS)' +export HIP_BASE = /opt/rocm/ +export HIP_FLAGS := -I$(HIP_BASE)/include \ + -I$(HIP_BASE)/hiprand/include \ + -I$(HIP_BASE)/rocrand/include TBB_FLAGS = -ltbb @@ -27,6 +32,7 @@ BOOST_PATH = /usr/include/boost ALPAKA_SERIAL_FLAGS = -DALPAKA_HOST_ONLY -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND ALPAKA_TBB_FLAGS = -DALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND ALPAKA_CUDA_FLAGS = -DALPAKA_ACC_GPU_CUDA_PRESENT -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +ALPAKA_HIP_FLAGS = -DALPAKA_ACC_GPU_HIP_PRESENT 
-DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND # Binding flags PYTHON_VERS := $(shell python3 -V | awk -F '' '{print $$8$$9$$10$$11}' | sed 's/\.//g' ) @@ -58,5 +64,8 @@ tbb: cuda: $(CUDA) $(CUDA_FLAGS) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_CUDA_FLAGS) $(CUDA_BINDING_FLAGS) binding_gpu_cuda.cc -o $(CUDA_MODULE_NAME) +hip: + $(HIP) $(HIP_FLAGS) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_HIP_FLAGS) $(BINDING_FLAGS) binding_gpu_hip.cc -o CLUE_GPU_HIP.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so + kernel: $(CXX) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_SERIAL_FLAGS) $(BINDING_FLAGS) binding_kernels.cc -o $(KERNELS_MODULE_NAME) diff --git a/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc b/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc new file mode 100644 index 00000000..58575061 --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc @@ -0,0 +1,248 @@ +#include +#include + +#include "../CLUE/CLUEAlgoAlpaka.h" +#include "../CLUE/Run.h" +#include "../DataFormats/Points.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" +#include "../AlpakaCore/initialise.h" + +#include +#include +#include +#include + +using cms::alpakatools::initialise; + +namespace alpaka_rocm_async { + /* Prints the name of each device returned by alpaka::getDevs(), or a notice when none is found; backend is only used as a label in the output. */ + void listDevices(const std::string& backend) { + const char tab = '\t'; + const std::vector devices = alpaka::getDevs(); + if (devices.empty()) { + std::cout << "No devices found for the " << backend << " backend." 
<< std::endl; + return; + } else { + std::cout << backend << " devices found: \n"; + for (size_t i{}; i < devices.size(); ++i) { + std::cout << tab << "device " << i << ": " << alpaka::getName(devices[i]) << '\n'; + } + } + } + + /* Builds a queue on the device selected by device_id and dispatches on Ndim: 2 calls run2; 1 and 3 to 10 are not enabled yet and return an empty result; any other value prints a message and returns an empty result. */ + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const FlatKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + /* initialise(); */ + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + break; + [[likely]] case (3) : + /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (4) : + /* return run4(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (5) : + /* return run5(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (6) : + /* return run6(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (7) : + /* return run7(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (8) : + /* return run8(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (9) : + /* return run9(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (10) : + /* return run10(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + break; + } + /* The cases whose runN call is commented out leave the switch through break: return an empty result rather than reaching the end of a non-void function. */ + return {}; + } + + /* Builds a queue on the device selected by device_id and dispatches on Ndim: 2 calls run2; 1 and 3 to 10 are not enabled yet and return an empty result; any other value prints a message and returns an empty result. */ + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const 
ExponentialKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + break; + [[likely]] case (3) : + /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (4) : + /* return run4(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (5) : + /* return run5(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (6) : + /* return run6(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (7) : + /* return run7(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (8) : + /* return run8(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (9) : + /* return run9(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (10) : + /* return run10(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + break; + } + /* The cases whose runN call is commented out leave the switch through break: return an empty result rather than reaching the end of a non-void function. */ + return {}; + } + + /* Builds a queue on the device selected by device_id and dispatches on Ndim: 2 calls run2; 1 and 3 to 10 are not enabled yet and return an empty result; any other value prints a message and returns an empty result. */ + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const GaussianKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[likely]] case (2) : + return run2(dc, rhoc, 
outlier, pPBin, coords, weights, kernel, queue_, block_size); + break; + [[likely]] case (3) : + /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (4) : + /* return run4(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (5) : + /* return run5(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (6) : + /* return run6(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (7) : + /* return run7(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (8) : + /* return run8(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (9) : + /* return run9(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] case (10) : + /* return run10(dc, rhoc, outlier, pPBin, coords, weights, queue_); */ + break; + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + break; + } + /* The cases whose runN call is commented out leave the switch through break: return an empty result rather than reaching the end of a non-void function. */ + return {}; + } + + /* Python module CLUE_GPU_HIP: exposes listDevices and one mainRun overload per kernel type (FlatKernel, ExponentialKernel, GaussianKernel). */ + PYBIND11_MODULE(CLUE_GPU_HIP, m) { + m.doc() = "Binding of the CLUE algorithm running on AMD GPUs"; + + m.def("listDevices", &listDevices, "List the available devices for the HIP/ROCm backend"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const FlatKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const ExponentialKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const GaussianKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + } +}; // namespace alpaka_rocm_async