Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use CUDA virtual memory for pinned memory allocator. #10850

Merged
merged 9 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions src/common/cuda_dr_utils.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/**
* Copyright 2024, XGBoost contributors
*/
#if defined(XGBOOST_USE_CUDA)
#include "cuda_dr_utils.h"

#include <algorithm> // for max
#include <cstdint> // for int32_t
#include <cstring> // for memset
#include <memory> // for make_unique
#include <mutex> // for call_once
#include <sstream> // for stringstream
#include <string> // for string

#include "common.h" // for safe_cuda
#include "cuda_rt_utils.h" // for CurrentDevice
#include "xgboost/string_view.h"  // for StringView

namespace xgboost::cudr {
CuDriverApi::CuDriverApi() {
  // Retrieve a driver API entry point through the runtime API.  Similar to dlopen, but
  // without the need to release a handle.  Aborts (CHECK) if the symbol cannot be found
  // or the returned pointer is null.
  auto safe_load = [](xgboost::StringView name, auto **fnptr) {
    cudaDriverEntryPointQueryResult status;
    dh::safe_cuda(cudaGetDriverEntryPoint(name.c_str(), reinterpret_cast<void **>(fnptr),
                                          cudaEnablePerThreadDefaultStream, &status));
    CHECK(status == cudaDriverEntryPointSuccess) << name;
    CHECK(*fnptr);
  };

  safe_load("cuMemGetAllocationGranularity", &this->cuMemGetAllocationGranularity);
  safe_load("cuMemCreate", &this->cuMemCreate);
  safe_load("cuMemMap", &this->cuMemMap);
  safe_load("cuMemAddressReserve", &this->cuMemAddressReserve);
  safe_load("cuMemSetAccess", &this->cuMemSetAccess);
  safe_load("cuMemUnmap", &this->cuMemUnmap);
  safe_load("cuMemRelease", &this->cuMemRelease);
  safe_load("cuMemAddressFree", &this->cuMemAddressFree);
  safe_load("cuGetErrorString", &this->cuGetErrorString);
  safe_load("cuGetErrorName", &this->cuGetErrorName);
  safe_load("cuDeviceGetAttribute", &this->cuDeviceGetAttribute);
  safe_load("cuDeviceGet", &this->cuDeviceGet);
  // No trailing CHECK needed: safe_load already verifies every pointer it loads.
}

/**
 * Raise a fatal error for a failed driver call.
 *
 * @param status Driver return code (no-op when CUDA_SUCCESS).
 * @param fn     Stringified call expression (from the safe_cu macro).
 * @param line   Source line of the call site.
 * @param file   Source file of the call site.
 */
void CuDriverApi::ThrowIfError(CUresult status, StringView fn, std::int32_t line,
                               char const *file) const {
  if (status == CUDA_SUCCESS) {
    return;
  }
  std::string cuerr{"CUDA driver error:"};

  // Best-effort lookup of the symbolic name and description of the error code; these
  // lookups can themselves fail for unknown codes.
  char const *name{nullptr};
  auto err0 = this->cuGetErrorName(status, &name);
  if (err0 != CUDA_SUCCESS) {
    LOG(WARNING) << cuerr << status << ". Then we failed to get error name:" << err0;
  }
  char const *msg{nullptr};
  auto err1 = this->cuGetErrorString(status, &msg);
  if (err1 != CUDA_SUCCESS) {
    LOG(WARNING) << cuerr << status << ". Then we failed to get error string:" << err1;
  }

  std::stringstream ss;
  ss << fn << "[" << file << ":" << line << "]:";
  if (name != nullptr && err0 == CUDA_SUCCESS) {
    ss << cuerr << " " << name << ".";
  } else {
    // Fall back to the numeric code so that the fatal message always identifies the
    // error, even when the name lookup fails.
    ss << cuerr << " " << static_cast<std::int32_t>(status) << ".";
  }
  if (msg != nullptr && err1 == CUDA_SUCCESS) {
    ss << " " << msg << "\n";
  }
  LOG(FATAL) << ss.str();
}

// Process-wide accessor for the loaded driver API table.  Thread-safe lazy
// initialization via a function-local static (C++11 magic static).
[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi() {
  static CuDriverApi cu_driver_api;
  return cu_driver_api;
}

// Fill `loc` with the given location type and an appropriate id for the current device.
void MakeCuMemLocation(CUmemLocationType type, CUmemLocation *loc) {
  std::int32_t const ordinal = curt::CurrentDevice();
  loc->type = type;

  if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
    // Device-resident memory: the id is the CUDA device ordinal.
    loc->id = ordinal;
    return;
  }

  // Otherwise use the host NUMA node associated with the current device.
  auto &cu = GetGlobalCuDriverApi();
  CUdevice dev;
  safe_cu(cu.cuDeviceGet(&dev, ordinal));
  std::int32_t numa_id = -1;
  safe_cu(cu.cuDeviceGetAttribute(&numa_id, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, dev));
  // Clamp in case the attribute query yields a negative id.
  loc->id = std::max(numa_id, 0);
}

// Build a pinned-memory allocation property with the requested location type.
[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type) {
  // Value-initialization zeroes every field of the C aggregate, then we set the
  // fields we care about.
  CUmemAllocationProp prop{};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  MakeCuMemLocation(type, &prop.location);
  return prop;
}
} // namespace xgboost::cudr
#endif
105 changes: 105 additions & 0 deletions src/common/cuda_dr_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/**
* Copyright 2024, XGBoost contributors
*
* @brief Utility for CUDA driver API.
*
* XGBoost doesn't link libcuda.so at build time. The utilities here load the shared
* object at runtime.
*/
#pragma once

#include <cuda.h>
#include <cuda_runtime_api.h>

#include <cstdint> // for int32_t

#include "xgboost/string_view.h" // for StringView

namespace xgboost::cudr {
/**
 * @brief A struct for retrieving CUDA driver API from the runtime API.
 *
 * Every member is a function pointer into libcuda, loaded in the constructor via
 * `cudaGetDriverEntryPoint`; construction aborts if any symbol is missing.  The
 * signatures mirror the corresponding CUDA driver API functions.
 */
struct CuDriverApi {
  using Flags = unsigned long long;  // NOLINT

  // Memory manipulation functions.
  using MemGetAllocationGranularityFn = CUresult(size_t *granularity,
                                                 const CUmemAllocationProp *prop,
                                                 CUmemAllocationGranularity_flags option);
  using MemCreateFn = CUresult(CUmemGenericAllocationHandle *handle, size_t size,
                               const CUmemAllocationProp *prop, Flags flags);
  using MemMapFn = CUresult(CUdeviceptr ptr, size_t size, size_t offset,
                            CUmemGenericAllocationHandle handle, Flags flags);
  using MemAddressReserveFn = CUresult(CUdeviceptr *ptr, size_t size, size_t alignment,
                                       CUdeviceptr addr, Flags flags);
  using MemSetAccessFn = CUresult(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc,
                                  size_t count);
  using MemUnmapFn = CUresult(CUdeviceptr ptr, size_t size);
  using MemReleaseFn = CUresult(CUmemGenericAllocationHandle handle);
  using MemAddressFreeFn = CUresult(CUdeviceptr ptr, size_t size);
  // Error handling
  using GetErrorString = CUresult(CUresult error, const char **pStr);
  using GetErrorName = CUresult(CUresult error, const char **pStr);
  // Device attributes
  using DeviceGetAttribute = CUresult(int *pi, CUdevice_attribute attrib, CUdevice dev);
  using DeviceGet = CUresult(CUdevice *device, int ordinal);

  // Query the allocation granularity for a physical allocation property.
  MemGetAllocationGranularityFn *cuMemGetAllocationGranularity{nullptr};  // NOLINT
  // Create a physical memory allocation handle.
  MemCreateFn *cuMemCreate{nullptr};  // NOLINT
  /**
   * @brief Map a physical allocation into a reserved virtual address range.
   *
   * @param[in] offset - Must be zero.
   */
  MemMapFn *cuMemMap{nullptr};  // NOLINT
  /**
   * @param[out] ptr - Resulting pointer to start of virtual address range allocated
   * @param[in] size - Size of the reserved virtual address range requested
   * @param[in] alignment - Alignment of the reserved virtual address range requested
   * @param[in] addr - Fixed starting address range requested
   * @param[in] flags - Currently unused, must be zero
   */
  MemAddressReserveFn *cuMemAddressReserve{nullptr};  // NOLINT
  // Set access permissions on a mapped virtual address range.
  MemSetAccessFn *cuMemSetAccess{nullptr};  // NOLINT
  // Unmap a previously mapped range (inverse of cuMemMap).
  MemUnmapFn *cuMemUnmap{nullptr};  // NOLINT
  // Release a physical allocation handle (inverse of cuMemCreate).
  MemReleaseFn *cuMemRelease{nullptr};  // NOLINT
  // Free a reserved virtual address range (inverse of cuMemAddressReserve).
  MemAddressFreeFn *cuMemAddressFree{nullptr};  // NOLINT
  GetErrorString *cuGetErrorString{nullptr};  // NOLINT
  GetErrorName *cuGetErrorName{nullptr};  // NOLINT
  DeviceGetAttribute *cuDeviceGetAttribute{nullptr};  // NOLINT
  DeviceGet *cuDeviceGet{nullptr};  // NOLINT

  // Loads all of the above symbols; aborts on failure.
  CuDriverApi();

  // Log a fatal error for a failed driver call; no-op when status is CUDA_SUCCESS.
  void ThrowIfError(CUresult status, StringView fn, std::int32_t line, char const *file) const;
};

[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi();

/**
 * @brief Macro for guarding CUDA driver API calls.
 *
 * On failure, reports the stringified call expression together with the call site
 * via CuDriverApi::ThrowIfError, which raises a fatal error.
 */
#define safe_cu(call)                                                                          \
  do {                                                                                         \
    /* Not named `__status`: identifiers containing `__` are reserved in C++. */               \
    auto xgb_cu_status_ = (call);                                                              \
    if (xgb_cu_status_ != CUDA_SUCCESS) {                                                      \
      ::xgboost::cudr::GetGlobalCuDriverApi().ThrowIfError(xgb_cu_status_, #call, __LINE__,    \
                                                           __FILE__);                          \
    }                                                                                          \
  } while (0)

/**
 * @brief Get the recommended allocation granularity for the given allocation property.
 */
[[nodiscard]] inline auto GetAllocGranularity(CUmemAllocationProp const *prop) {
  // Initialize so the return value is never indeterminate, even if the error handler
  // is configured not to abort.
  std::size_t granularity = 0;
  safe_cu(GetGlobalCuDriverApi().cuMemGetAllocationGranularity(
      &granularity, prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
  return granularity;
}

/**
* @brief Obtain appropriate device ordinal for `CUmemLocation`.
*/
void MakeCuMemLocation(CUmemLocationType type, CUmemLocation* loc);

/**
* @brief Construct a `CUmemAllocationProp`.
*/
[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type);
} // namespace xgboost::cudr
36 changes: 31 additions & 5 deletions src/common/cuda_rt_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,19 @@
#endif // defined(XGBOOST_USE_CUDA)

#include <cstdint> // for int32_t
#include <mutex> // for once_flag, call_once

#include "common.h" // for safe_cuda

namespace xgboost::common {
namespace xgboost::curt {
#if defined(XGBOOST_USE_CUDA)
std::int32_t AllVisibleGPUs() {
int n_visgpus = 0;
try {
// When compiled with CUDA but running on CPU only device,
// cudaGetDeviceCount will fail.
dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
} catch (const dmlc::Error &) {
} catch (const dmlc::Error&) {
cudaGetLastError(); // reset error.
return 0;
}
Expand Down Expand Up @@ -63,11 +64,36 @@ void SetDevice(std::int32_t device) {
dh::safe_cuda(cudaSetDevice(device));
}
}

namespace {
// Query a CUDA version number once and cache it.  The raw value follows the CUDA
// encoding used by cudaRuntimeGetVersion/cudaDriverGetVersion: major * 1000 +
// minor * 10.  Each template instantiation (one per lambda type at the call site)
// gets its own `version`/`flag` statics, so runtime and driver versions are cached
// independently.
template <typename Fn>
void GetVersionImpl(Fn&& fn, std::int32_t* major, std::int32_t* minor) {
  static std::int32_t version = 0;
  static std::once_flag flag;
  // Thread-safe: the query runs exactly once per instantiation.
  std::call_once(flag, [&] { fn(&version); });
  if (major) {
    *major = version / 1000;
  }
  if (minor) {
    *minor = version % 100 / 10;
  }
}
}  // namespace

// Returns the CUDA runtime (libcudart) version as major/minor.
void RtVersion(std::int32_t* major, std::int32_t* minor) {
  auto query = [](std::int32_t* ver) {
    dh::safe_cuda(cudaRuntimeGetVersion(ver));
  };
  GetVersionImpl(query, major, minor);
}

// Returns the latest CUDA version supported by the installed driver as major/minor.
void DrVersion(std::int32_t* major, std::int32_t* minor) {
  auto query = [](std::int32_t* ver) {
    dh::safe_cuda(cudaDriverGetVersion(ver));
  };
  GetVersionImpl(query, major, minor);
}

#else
std::int32_t AllVisibleGPUs() { return 0; }

std::int32_t CurrentDevice() {
AssertGPUSupport();
common::AssertGPUSupport();
return -1;
}

Expand All @@ -79,8 +105,8 @@ void CheckComputeCapability() {}

void SetDevice(std::int32_t device) {
if (device >= 0) {
AssertGPUSupport();
common::AssertGPUSupport();
}
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace xgboost::common
} // namespace xgboost::curt
12 changes: 9 additions & 3 deletions src/common/cuda_rt_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include <nvtx3/nvtx3.hpp>
#endif // defined(XGBOOST_USE_NVTX)

namespace xgboost::common {
namespace xgboost::curt {
std::int32_t AllVisibleGPUs();

std::int32_t CurrentDevice();
Expand All @@ -24,6 +24,12 @@ void CheckComputeCapability();

void SetDevice(std::int32_t device);

// Returns the CUDA Runtime version.
void RtVersion(std::int32_t* major, std::int32_t* minor);

// Returns the latest version of CUDA supported by the driver.
void DrVersion(std::int32_t* major, std::int32_t* minor);

struct NvtxDomain {
static constexpr char const *name{"libxgboost"}; // NOLINT
};
Expand All @@ -49,10 +55,10 @@ class NvtxRgb {
explicit NvtxRgb(Args &&...) {}
};
#endif // defined(XGBOOST_USE_NVTX)
} // namespace xgboost::common
} // namespace xgboost::curt

#if defined(XGBOOST_USE_NVTX)
#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain)
#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::curt::NvtxDomain)
#else
#define xgboost_NVTX_FN_RANGE()
#endif // defined(XGBOOST_USE_NVTX)
23 changes: 23 additions & 0 deletions src/common/device_helpers.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/**
* Copyright 2024, XGBoost contributors
*/
#include "cuda_rt_utils.h" // for RtVersion
#include "device_helpers.cuh"
#include "xgboost/windefs.h" // for xgboost_IS_WIN

namespace dh {
PinnedMemory::PinnedMemory() {
#if defined(xgboost_IS_WIN)
  // Fall back to the plain pinned allocator on Windows.
  this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
#else
  std::int32_t major{0}, minor{0};
  xgboost::curt::DrVersion(&major, &minor);
  // Host NUMA allocation requires driver that supports CTK >= 12.5 to be stable.
  // Compare (major, minor) lexicographically: the previous `major >= 12 && minor >= 5`
  // check wrongly rejected newer majors such as 13.0 (minor 0 < 5).
  if (major > 12 || (major == 12 && minor >= 5)) {
    this->impl_.emplace<detail::GrowOnlyVirtualMemVec>(CU_MEM_LOCATION_TYPE_HOST_NUMA);
  } else {
    this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
  }
#endif
}
} // namespace dh
Loading
Loading