Skip to content

Commit

Permalink
workaround RCCL
Browse files Browse the repository at this point in the history
  • Loading branch information
hliuca committed Dec 18, 2024
2 parents f199039 + 194c73c commit 1614bee
Show file tree
Hide file tree
Showing 216 changed files with 2,388 additions and 293 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@
[submodule "gputreeshap"]
path = gputreeshap
url = https://github.com/rapidsai/gputreeshap.git
[submodule "rocgputreeshap"]
path = rocgputreeshap
url = https://github.com/ROCmSoftwarePlatform/rocgputreeshap
42 changes: 41 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ option(ENABLE_ALL_WARNINGS "Enable all compiler warnings. Only effective for GCC
option(LOG_CAPI_INVOCATION "Log all C API invocations for debugging" OFF)
option(GOOGLE_TEST "Build google tests" OFF)
option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule" OFF)
option(USE_DEVICE_DEBUG "Generate CUDA device debug info." OFF)
option(USE_DEVICE_DEBUG "Generate CUDA/HIP device debug info." OFF)
option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
option(HIDE_CXX_SYMBOLS "Build shared library and hide all C++ symbols" OFF)
Expand Down Expand Up @@ -98,6 +98,12 @@ cmake_dependent_option(USE_CUDA_LTO
"${CMAKE_INTERPROCEDURAL_OPTIMIZATION}"
"CMAKE_VERSION VERSION_GREATER_EQUAL 3.25;USE_CUDA;CMAKE_INTERPROCEDURAL_OPTIMIZATION"
OFF)
## HIP
option(USE_HIP "Build with GPU acceleration" OFF)
option(USE_RCCL "Build with RCCL to enable distributed GPU support." OFF)
# This is specifically designed for PyPI binary release and should be disabled for most of the cases.
option(USE_DLOPEN_RCCL "Whether to load nccl dynamically." OFF)
option(BUILD_WITH_SHARED_RCCL "Build with shared RCCL library." OFF)
## Sanitizers
option(USE_SANITIZER "Use santizer flags" OFF)
option(SANITIZER_PATH "Path to sanitizes.")
Expand Down Expand Up @@ -130,6 +136,18 @@ endif()
if(USE_DLOPEN_NCCL AND (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")))
message(SEND_ERROR "`USE_DLOPEN_NCCL` supports only Linux at the moment.")
endif()
if(USE_RCCL AND NOT (USE_HIP))
message(SEND_ERROR "`USE_RCCL` must be enabled with `USE_HIP` flag.")
endif()
if(BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
message(SEND_ERROR "Build XGBoost with -DUSE_RCCL=ON to enable BUILD_WITH_SHARED_RCCL.")
endif()
if(USE_DLOPEN_RCCL AND (NOT USE_RCCL))
message(SEND_ERROR "Build XGBoost with -DUSE_RCCL=ON to enable USE_DLOPEN_RCCL.")
endif()
if(USE_DLOPEN_RCCL AND (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")))
message(SEND_ERROR "`USE_DLOPEN_RCCL` supports only Linux at the moment.")
endif()
if(JVM_BINDINGS AND R_LIB)
message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
endif()
Expand Down Expand Up @@ -244,6 +262,24 @@ if(USE_CUDA)
endif()
endif()

if (USE_HIP)
set(USE_OPENMP ON CACHE BOOL "HIP requires OpenMP" FORCE)
# `export CXX=' is ignored by CMake HIP.
set(CMAKE_HIP_HOST_COMPILER ${CMAKE_CXX_COMPILER})
message(STATUS "Configured HIP host compiler: ${CMAKE_HIP_HOST_COMPILER}")

enable_language(HIP)
find_package(hip REQUIRED)
find_package(rocthrust REQUIRED)
find_package(hipcub REQUIRED)

set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS}")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${HIP_INCLUDE_DIRS}")
add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
endif (USE_HIP)

if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
Expand Down Expand Up @@ -272,6 +308,10 @@ if(USE_NCCL)
find_package(Nccl REQUIRED)
endif()

if(USE_RCCL)
find_package(rccl REQUIRED)
endif()

if(MSVC)
if(FORCE_SHARED_CRT)
message(STATUS "XGBoost: Using dynamically linked MSVC runtime...")
Expand Down
2 changes: 1 addition & 1 deletion R-package/src/xgboost_custom.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace common {
bool CheckNAN(double v) {
return ISNAN(v);
}
#if !defined(XGBOOST_USE_CUDA)
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
double LogGamma(double v) {
return lgammafn(v);
}
Expand Down
64 changes: 64 additions & 0 deletions README-ROCm.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# ROCm version

ROCm 5.5 and newer

# Code
Clone the code from our repo

1. `git clone https://github.com/ROCmSoftwarePlatform/xgboost`
1. `cd xgboost`
1. `git checkout master-rocm`

or a tag/branch with rocm suffix, such as v2.0.1-rocm

# Submodules
XGBoost ROCm support requires a few modules, which can be initialized as,

`git submodule update --init --recursive`

# Configure
The following export may be required for some systems, and the ROCm path depends on installation,

1. `export CMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH:/opt/rocm/lib/cmake:/opt/rocm/lib/cmake/AMDDeviceLibs/`
1. `mkdir build`
1. `cd build`
1. `cmake -DUSE_HIP=ON ../`
1. or `cmake -DUSE_HIP=1 ../`
1. or `cmake -DUSE_HIP=1 -DUSE_RCCL=1 ../`
1. or `cmake -DUSE_HIP=1 -DGOOGLE_TEST=1 ../`

The first command may be optional depending on system configure.

The **USE_HIP** macro enables HIP/ROCm support. **USE_RCCL** enables RCCL. **GOOGLE_TEST** enables Google test.

apt-get install libgtest-dev libgmock-dev

# Compile
To compile, run command,

`make -j`

# Python Support
After compilation, XGBoost can be installed as a Python package and supports a wide range of applications,

1. `cd python-package/`
1. `pip3 install .`

# Use AMD GPUs
When calling XGBoost, set the parameter `device` to `gpu` or `cuda`. Python sample,

```
params = dict()
params["device"] = "gpu"
params["tree_method"] = "hist"
...
```

or

```
params = dict()
params["device"] = "cuda"
params["tree_method"] = "hist"
...
```
47 changes: 46 additions & 1 deletion cmake/Utils.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Automatically set source group based on folder
function(auto_source_group SOURCES)

foreach(FILE ${SOURCES})
get_filename_component(PARENT_DIR "${FILE}" PATH)

Expand Down Expand Up @@ -145,6 +144,23 @@ function(xgboost_set_cuda_flags target)
endif()
endfunction()

# Set HIP related flags to target.
function(xgboost_set_hip_flags target)
if (USE_DEVICE_DEBUG)
target_compile_options(${target} PRIVATE
$<$<AND:$<CONFIG:DEBUG>,$<COMPILE_LANGUAGE:HIP>>:-G>)
endif (USE_DEVICE_DEBUG)

target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_HIP=1)
target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/rocgputreeshap)
target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/warp-primitives/include)

set_target_properties(${target} PROPERTIES
HIP_STANDARD 17
HIP_STANDARD_REQUIRED ON
HIP_SEPARABLE_COMPILATION OFF)
endfunction(xgboost_set_hip_flags)

function(xgboost_link_nccl target)
set(xgboost_nccl_flags -DXGBOOST_USE_NCCL=1)
if(USE_DLOPEN_NCCL)
Expand All @@ -165,6 +181,27 @@ function(xgboost_link_nccl target)
endif()
endfunction()

function(xgboost_link_rccl target)
set(xgboost_rccl_flags -DXGBOOST_USE_RCCL=1)
if(USE_DLOPEN_RCCL)
list(APPEND xgboost_rccl_flags -DXGBOOST_USE_DLOPEN_RCCL=1)
endif()

if(BUILD_STATIC_LIB)
target_include_directories(${target} PUBLIC ${RCCL_INCLUDE_DIR}/rccl)
target_compile_definitions(${target} PUBLIC ${xgboost_rccl_flags})
target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
target_link_libraries(${target} PUBLIC ${RCCL_LIBRARY})
else()
target_include_directories(${target} PRIVATE ${RCCL_INCLUDE_DIR}/rccl)
target_compile_definitions(${target} PRIVATE ${xgboost_rccl_flags})
target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
if(NOT USE_DLOPEN_RCCL)
target_link_libraries(${target} PRIVATE ${RCCL_LIBRARY})
endif()
endif()
endfunction()

# compile options
macro(xgboost_target_properties target)
set_target_properties(${target} PROPERTIES
Expand Down Expand Up @@ -254,6 +291,10 @@ macro(xgboost_target_link_libraries target)
xgboost_set_cuda_flags(${target})
endif()

if (USE_HIP)
xgboost_set_hip_flags(${target})
endif (USE_HIP)

if(PLUGIN_RMM)
target_link_libraries(${target} PRIVATE rmm::rmm)
endif()
Expand All @@ -262,6 +303,10 @@ macro(xgboost_target_link_libraries target)
xgboost_link_nccl(${target})
endif()

if(USE_RCCL)
xgboost_link_rccl(${target})
endif()

if(USE_NVTX)
target_link_libraries(${target} PRIVATE CUDA::nvtx3)
endif()
Expand Down
5 changes: 5 additions & 0 deletions cmake/xgboost-config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
set(USE_OPENMP @USE_OPENMP@)
set(USE_CUDA @USE_CUDA@)
set(USE_NCCL @USE_NCCL@)
set(USE_HIP @USE_HIP@)
set(USE_RCCL @USE_RCCL@)
set(XGBOOST_BUILD_STATIC_LIB @BUILD_STATIC_LIB@)

include(CMakeFindDependencyMacro)
Expand All @@ -15,6 +17,9 @@ if (XGBOOST_BUILD_STATIC_LIB)
if(USE_CUDA)
find_dependency(CUDA)
endif()
if(USE_HIP)
find_dependency(HIP)
endif()
# nccl should be linked statically if xgboost is built as static library.
endif (XGBOOST_BUILD_STATIC_LIB)

Expand Down
8 changes: 4 additions & 4 deletions demo/CLI/regression/runexp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ python mapfeat.py
# split train and test
python mknfold.py machine.txt 1
# training and output the models
../../xgboost machine.conf
../../../xgboost machine.conf
# output predictions of test data
../../xgboost machine.conf task=pred model_in=0002.model
../../../xgboost machine.conf task=pred model_in=0002.model
# print the boosters of 0002.model in dump.raw.txt
../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
../../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
# print the boosters of 0002.model in dump.nice.txt with feature map
../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
../../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt

# cat the result
cat dump.nice.txt
8 changes: 4 additions & 4 deletions include/xgboost/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,19 +56,19 @@
/*!
* \brief Tag function as usable by device
*/
#if defined (__CUDA__) || defined(__NVCC__)
#if defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
#define XGBOOST_DEVICE __host__ __device__
#else
#define XGBOOST_DEVICE
#endif // defined (__CUDA__) || defined(__NVCC__)
#endif // defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)

#if defined(__CUDA__) || defined(__CUDACC__)
#if defined(__CUDA__) || defined(__CUDACC__) || defined(__HIPCC__)
#define XGBOOST_HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
#define XGBOOST_DEV_INLINE __device__ __forceinline__
#else
#define XGBOOST_HOST_DEV_INLINE
#define XGBOOST_DEV_INLINE
#endif // defined(__CUDA__) || defined(__CUDACC__)
#endif // defined(__CUDA__) || defined(__CUDACC__) || defined(__HIPCC__)

// These check are for Makefile.
#if !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
Expand Down
4 changes: 2 additions & 2 deletions include/xgboost/host_device_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@

namespace xgboost {

#ifdef __CUDACC__
#if defined(__CUDACC__) || defined(__HIPCC__)
// Sets a function to call instead of cudaSetDevice();
// only added for testing
void SetCudaSetDeviceHandler(void (*handler)(int));
#endif // __CUDACC__
#endif // __CUDACC__ || __HIPCC__

template <typename T> struct HostDeviceVectorImpl;

Expand Down
10 changes: 5 additions & 5 deletions include/xgboost/linalg.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@

// decouple it from xgboost.
#ifndef LINALG_HD
#if defined(__CUDA__) || defined(__NVCC__)
#if defined(__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
#define LINALG_HD __host__ __device__
#else
#define LINALG_HD
#endif // defined (__CUDA__) || defined(__NVCC__)
#endif // defined (__CUDA__) || defined(__NVCC__) || defined(__HIPCC__)
#endif // LINALG_HD

namespace xgboost::linalg {
Expand Down Expand Up @@ -118,7 +118,7 @@ using IndexToTag = std::conditional_t<std::is_integral<RemoveCRType<S>>::value,

template <int32_t n, typename Fn>
LINALG_HD constexpr auto UnrollLoop(Fn fn) {
#if defined __CUDA_ARCH__
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
#pragma unroll n
#endif // defined __CUDA_ARCH__
for (int32_t i = 0; i < n; ++i) {
Expand All @@ -136,7 +136,7 @@ int32_t NativePopc(T v) {
inline LINALG_HD int Popc(uint32_t v) {
#if defined(__CUDA_ARCH__)
return __popc(v);
#elif defined(__GNUC__) || defined(__clang__)
#elif defined(__GNUC__) || defined(__clang__) || defined(__HIPCC__)
return __builtin_popcount(v);
#elif defined(_MSC_VER)
return __popcnt(v);
Expand All @@ -148,7 +148,7 @@ inline LINALG_HD int Popc(uint32_t v) {
inline LINALG_HD int Popc(uint64_t v) {
#if defined(__CUDA_ARCH__)
return __popcll(v);
#elif defined(__GNUC__) || defined(__clang__)
#elif defined(__GNUC__) || defined(__clang__) || defined(__HIPCC__)
return __builtin_popcountll(v);
#elif defined(_MSC_VER) && defined(_M_X64)
return __popcnt64(v);
Expand Down
Loading

0 comments on commit 1614bee

Please sign in to comment.