Skip to content

Commit

Permalink
Prepare compilation for AMD GPUs
Browse files Browse the repository at this point in the history
  • Loading branch information
sbaldu committed Jan 5, 2024
1 parent d8d87b8 commit 14024e9
Show file tree
Hide file tree
Showing 2 changed files with 246 additions and 0 deletions.
9 changes: 9 additions & 0 deletions CLUEstering/alpaka/BindingModules/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Compilers
export CXX := g++
export CUDA := nvcc
export HIP := hipcc

export CUDA_ARCH := 50 60 61 62 70

Expand All @@ -12,6 +13,10 @@ export CUDA_FLAGS = -x cu --expt-relaxed-constexpr -gencode arch=compute_61,code

# CUDA_FLAGS := $$(foreach ARCH,-gencode arch=compute_$$(CUDA_ARCH),code=[sm_$$(CUDA_ARCH),compute_$$(CUDA_ARCH)]) -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored --expt-relaxed-constexpr --expt-extended-lambda --generate-line-info --source-in-ptx --display-error-number --cudart=shared
# $(2)NVCC_COMMON := -std=c++17 -O3 -g $$($(2)NVCC_FLAGS) -ccbin $(CXX) --compiler-options '$(HOST_CXXFLAGS) $(USER_CXXFLAGS)'
# Root of the ROCm installation; override HIP_BASE on the command line if
# ROCm is installed somewhere other than the default /opt/rocm/.
export HIP_BASE = /opt/rocm/
# Include paths for the HIP runtime plus the hipRAND/rocRAND RNG libraries.
export HIP_FLAGS := -I$(HIP_BASE)/include \
-I$(HIP_BASE)/hiprand/include \
-I$(HIP_BASE)/rocrand/include

TBB_FLAGS = -ltbb

Expand All @@ -27,6 +32,7 @@ BOOST_PATH = /usr/include/boost
ALPAKA_SERIAL_FLAGS = -DALPAKA_HOST_ONLY -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND
ALPAKA_TBB_FLAGS = -DALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND
ALPAKA_CUDA_FLAGS = -DALPAKA_ACC_GPU_CUDA_PRESENT -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND
ALPAKA_HIP_FLAGS = -DALPAKA_ACC_GPU_HIP_PRESENT -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND

# Binding flags
PYTHON_VERS := $(shell python3 -V | awk -F '' '{print $$8$$9$$10$$11}' | sed 's/\.//g' )
Expand Down Expand Up @@ -58,5 +64,8 @@ tbb:
cuda:
$(CUDA) $(CUDA_FLAGS) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_CUDA_FLAGS) $(CUDA_BINDING_FLAGS) binding_gpu_cuda.cc -o $(CUDA_MODULE_NAME)

# Build the Python binding module for the HIP/ROCm (AMD GPU) backend.
# NOTE(review): unlike the `cuda` target, this uses the generic
# $(BINDING_FLAGS) rather than a HIP-specific binding-flags variable, and
# hard-codes the output name and target triple (x86_64-linux-gnu) instead of
# using a $(CUDA_MODULE_NAME)-style variable — confirm both are intentional.
hip:
	$(HIP) $(HIP_FLAGS) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_HIP_FLAGS) $(BINDING_FLAGS) binding_gpu_hip.cc -o CLUE_GPU_HIP.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so

kernel:
$(CXX) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_SERIAL_FLAGS) $(BINDING_FLAGS) binding_kernels.cc -o $(KERNELS_MODULE_NAME)
237 changes: 237 additions & 0 deletions CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
#include <iostream>
#include <stdint.h>
#include <string>
#include <vector>

#include <alpaka/alpaka.hpp>

#include <pybind11/functional.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include "../AlpakaCore/initialise.h"
#include "../CLUE/CLUEAlgoAlpaka.h"
#include "../CLUE/Run.h"
#include "../DataFormats/Points.h"
#include "../DataFormats/alpaka/PointsAlpaka.h"

using cms::alpakatools::initialise;

namespace alpaka_rocm_async {
// Print every device the alpaka HIP/ROCm platform can see, one per line,
// or a "no devices" message when the platform reports none.
void listDevices(const std::string& backend) {
  const auto devices = alpaka::getDevs<Platform>();
  if (devices.empty()) {
    std::cout << "No devices found for the " << backend << " backend." << std::endl;
    return;
  }
  std::cout << backend << " devices found: \n";
  size_t device_index = 0;
  for (const auto& device : devices) {
    std::cout << '\t' << "device " << device_index << ": " << alpaka::getName(device) << '\n';
    ++device_index;
  }
}

/// @brief Run the CLUE clustering algorithm on a HIP device using a flat
///        convolution kernel.
/// @param dc        Critical distance parameter of CLUE.
/// @param rhoc      Critical density parameter of CLUE.
/// @param outlier   Outlier rejection parameter.
/// @param pPBin     Average number of points per tile bin.
/// @param coords    Point coordinates, one inner vector per dimension.
/// @param weights   Per-point weights (energies).
/// @param kernel    Flat convolution kernel used for the density calculation.
/// @param Ndim      Number of dimensions of the input points (only 2 is
///                  currently wired up; the other cases are stubbed out).
/// @param block_size  Work-group size used on the device.
/// @param device_id   Index of the HIP device to run on.
/// @return Cluster ids and is-seed flags per point, or an empty vector when
///         Ndim is unsupported or its runner is not yet enabled.
std::vector<std::vector<int>> mainRun(float dc,
                                      float rhoc,
                                      float outlier,
                                      int pPBin,
                                      const std::vector<std::vector<float>>& coords,
                                      const std::vector<float>& weights,
                                      const FlatKernel& kernel,
                                      int Ndim,
                                      size_t block_size,
                                      size_t device_id) {
  auto const dev_acc = alpaka::getDevByIdx<Acc1D>(device_id);

  /* initialise<Platform>(); */

  // Create the queue
  Queue queue_(dev_acc);

  // Running the clustering algorithm //
  switch (Ndim) {
    [[unlikely]] case (1) :
      /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[likely]] case (2) :
      return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
    [[likely]] case (3) :
      /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (4) :
      /* return run4(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (5) :
      /* return run5(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (6) :
      /* return run6(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (7) :
      /* return run7(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (8) :
      /* return run8(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (9) :
      /* return run9(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (10) :
      /* return run10(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] default:
      std::cout << "This library only works up to 10 dimensions\n";
      return {};
  }

  // Every case above other than Ndim == 2 only breaks out of the switch;
  // previously control then fell off the end of this non-void function,
  // which is undefined behavior. Return an empty result instead.
  return {};
}

/// @brief Run the CLUE clustering algorithm on a HIP device using an
///        exponential convolution kernel.
/// @param dc        Critical distance parameter of CLUE.
/// @param rhoc      Critical density parameter of CLUE.
/// @param outlier   Outlier rejection parameter.
/// @param pPBin     Average number of points per tile bin.
/// @param coords    Point coordinates, one inner vector per dimension.
/// @param weights   Per-point weights (energies).
/// @param kernel    Exponential convolution kernel used for the density
///                  calculation.
/// @param Ndim      Number of dimensions of the input points (only 2 is
///                  currently wired up; the other cases are stubbed out).
/// @param block_size  Work-group size used on the device.
/// @param device_id   Index of the HIP device to run on.
/// @return Cluster ids and is-seed flags per point, or an empty vector when
///         Ndim is unsupported or its runner is not yet enabled.
std::vector<std::vector<int>> mainRun(float dc,
                                      float rhoc,
                                      float outlier,
                                      int pPBin,
                                      const std::vector<std::vector<float>>& coords,
                                      const std::vector<float>& weights,
                                      const ExponentialKernel& kernel,
                                      int Ndim,
                                      size_t block_size,
                                      size_t device_id) {
  auto const dev_acc = alpaka::getDevByIdx<Acc1D>(device_id);

  // Create the queue
  Queue queue_(dev_acc);

  // Running the clustering algorithm //
  switch (Ndim) {
    [[unlikely]] case (1) :
      /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[likely]] case (2) :
      return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
    [[likely]] case (3) :
      /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (4) :
      /* return run4(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (5) :
      /* return run5(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (6) :
      /* return run6(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (7) :
      /* return run7(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (8) :
      /* return run8(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (9) :
      /* return run9(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (10) :
      /* return run10(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] default:
      std::cout << "This library only works up to 10 dimensions\n";
      return {};
  }

  // Every case above other than Ndim == 2 only breaks out of the switch;
  // previously control then fell off the end of this non-void function,
  // which is undefined behavior. Return an empty result instead.
  return {};
}

/// @brief Run the CLUE clustering algorithm on a HIP device using a Gaussian
///        convolution kernel.
/// @param dc        Critical distance parameter of CLUE.
/// @param rhoc      Critical density parameter of CLUE.
/// @param outlier   Outlier rejection parameter.
/// @param pPBin     Average number of points per tile bin.
/// @param coords    Point coordinates, one inner vector per dimension.
/// @param weights   Per-point weights (energies).
/// @param kernel    Gaussian convolution kernel used for the density
///                  calculation.
/// @param Ndim      Number of dimensions of the input points (only 2 is
///                  currently wired up; the other cases are stubbed out).
/// @param block_size  Work-group size used on the device.
/// @param device_id   Index of the HIP device to run on.
/// @return Cluster ids and is-seed flags per point, or an empty vector when
///         Ndim is unsupported or its runner is not yet enabled.
std::vector<std::vector<int>> mainRun(float dc,
                                      float rhoc,
                                      float outlier,
                                      int pPBin,
                                      const std::vector<std::vector<float>>& coords,
                                      const std::vector<float>& weights,
                                      const GaussianKernel& kernel,
                                      int Ndim,
                                      size_t block_size,
                                      size_t device_id) {
  auto const dev_acc = alpaka::getDevByIdx<Acc1D>(device_id);

  // Create the queue
  Queue queue_(dev_acc);

  // Running the clustering algorithm //
  switch (Ndim) {
    [[unlikely]] case (1) :
      /* return run1(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[likely]] case (2) :
      return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size);
    [[likely]] case (3) :
      /* return run3(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (4) :
      /* return run4(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (5) :
      /* return run5(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (6) :
      /* return run6(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (7) :
      /* return run7(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (8) :
      /* return run8(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (9) :
      /* return run9(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] case (10) :
      /* return run10(dc, rhoc, outlier, pPBin, coords, weights, queue_); */
      break;
    [[unlikely]] default:
      std::cout << "This library only works up to 10 dimensions\n";
      return {};
  }

  // Every case above other than Ndim == 2 only breaks out of the switch;
  // previously control then fell off the end of this non-void function,
  // which is undefined behavior. Return an empty result instead.
  return {};
}

// Python module definition for the HIP/ROCm backend. Exposes device
// enumeration plus the three kernel-specific overloads of mainRun; the
// module name here must match the -o name used by the Makefile's `hip`
// target so Python can import it.
PYBIND11_MODULE(CLUE_GPU_HIP, m) {
m.doc() = "Binding of the CLUE algorithm running on AMD GPUs";

m.def("listDevices", &listDevices, "List the available devices for the HIP/ROCm backend");
// overload_cast disambiguates between the three mainRun overloads, which
// differ only in the convolution-kernel parameter type.
m.def("mainRun",
pybind11::overload_cast<float,
float,
float,
int,
const std::vector<std::vector<float>>&,
const std::vector<float>&,
const FlatKernel&,
int,
size_t,
size_t>(&mainRun),
"mainRun");
m.def("mainRun",
pybind11::overload_cast<float,
float,
float,
int,
const std::vector<std::vector<float>>&,
const std::vector<float>&,
const ExponentialKernel&,
int,
size_t,
size_t>(&mainRun),
"mainRun");
m.def("mainRun",
pybind11::overload_cast<float,
float,
float,
int,
const std::vector<std::vector<float>>&,
const std::vector<float>&,
const GaussianKernel&,
int,
size_t,
size_t>(&mainRun),
"mainRun");
}
}; // namespace alpaka_rocm_async

0 comments on commit 14024e9

Please sign in to comment.