Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use CUDA virtual memory for pinned memory allocator. #10850

Merged
merged 9 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions src/common/cuda_dr_utils.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/**
* Copyright 2024, XGBoost contributors
*/
#if defined(XGBOOST_USE_CUDA)
#include "cuda_dr_utils.h"

#include <algorithm> // for max
#include <cstdint> // for int32_t
#include <cstring> // for memset
#include <memory> // for make_unique
#include <mutex> // for call_once
#include <sstream> // for stringstream
#include <string> // for string

#include "common.h" // for safe_cuda
#include "cuda_rt_utils.h" // for CurrentDevice
#include "xgboost/string_view.h"  // for StringView

namespace xgboost::cudr {
CuDriverApi::CuDriverApi() {
  // Retrieve a driver API entry point through the runtime API.  Similar to dlopen, but
  // without the need to release a handle.  Aborts (CHECK) if the symbol cannot be found
  // or the returned pointer is null.
  auto safe_load = [](xgboost::StringView name, auto **fnptr) {
    cudaDriverEntryPointQueryResult status;
    dh::safe_cuda(cudaGetDriverEntryPoint(name.c_str(), reinterpret_cast<void **>(fnptr),
                                          cudaEnablePerThreadDefaultStream, &status));
    CHECK(status == cudaDriverEntryPointSuccess) << name;
    CHECK(*fnptr);
  };

  safe_load("cuMemGetAllocationGranularity", &this->cuMemGetAllocationGranularity);
  safe_load("cuMemCreate", &this->cuMemCreate);
  safe_load("cuMemMap", &this->cuMemMap);
  safe_load("cuMemAddressReserve", &this->cuMemAddressReserve);
  safe_load("cuMemSetAccess", &this->cuMemSetAccess);
  safe_load("cuMemUnmap", &this->cuMemUnmap);
  safe_load("cuMemRelease", &this->cuMemRelease);
  safe_load("cuMemAddressFree", &this->cuMemAddressFree);
  safe_load("cuGetErrorString", &this->cuGetErrorString);
  safe_load("cuGetErrorName", &this->cuGetErrorName);
  safe_load("cuDeviceGetAttribute", &this->cuDeviceGetAttribute);
  safe_load("cuDeviceGet", &this->cuDeviceGet);
  // No trailing CHECK needed: safe_load already verifies every pointer it loads.
}

/**
 * Raise a fatal error for a failed driver call.
 *
 * @param status Driver return code (no-op when CUDA_SUCCESS).
 * @param fn     Stringified call expression (from the safe_cu macro).
 * @param line   Source line of the call site.
 * @param file   Source file of the call site.
 */
void CuDriverApi::ThrowIfError(CUresult status, StringView fn, std::int32_t line,
                               char const *file) const {
  if (status == CUDA_SUCCESS) {
    return;
  }
  std::string cuerr{"CUDA driver error:"};

  // Best-effort lookup of the symbolic name and description of the error code; these
  // lookups can themselves fail for unknown codes.
  char const *name{nullptr};
  auto err0 = this->cuGetErrorName(status, &name);
  if (err0 != CUDA_SUCCESS) {
    LOG(WARNING) << cuerr << status << ". Then we failed to get error name:" << err0;
  }
  char const *msg{nullptr};
  auto err1 = this->cuGetErrorString(status, &msg);
  if (err1 != CUDA_SUCCESS) {
    LOG(WARNING) << cuerr << status << ". Then we failed to get error string:" << err1;
  }

  std::stringstream ss;
  ss << fn << "[" << file << ":" << line << "]:";
  if (name != nullptr && err0 == CUDA_SUCCESS) {
    ss << cuerr << " " << name << ".";
  } else {
    // Fall back to the numeric code so that the fatal message always identifies the
    // error, even when the name lookup fails.
    ss << cuerr << " " << static_cast<std::int32_t>(status) << ".";
  }
  if (msg != nullptr && err1 == CUDA_SUCCESS) {
    ss << " " << msg << "\n";
  }
  LOG(FATAL) << ss.str();
}

// Process-wide accessor for the loaded driver API table.  Thread-safe lazy
// initialization via a function-local static (C++11 magic static).
[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi() {
  static CuDriverApi cu_driver_api;
  return cu_driver_api;
}

// Fill `loc` with the given location type and an appropriate id for the current device.
void MakeCuMemLocation(CUmemLocationType type, CUmemLocation *loc) {
  std::int32_t const ordinal = curt::CurrentDevice();
  loc->type = type;

  if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
    // Device-resident memory: the id is the CUDA device ordinal.
    loc->id = ordinal;
    return;
  }

  // Otherwise use the host NUMA node associated with the current device.
  auto &cu = GetGlobalCuDriverApi();
  CUdevice dev;
  safe_cu(cu.cuDeviceGet(&dev, ordinal));
  std::int32_t numa_id = -1;
  safe_cu(cu.cuDeviceGetAttribute(&numa_id, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, dev));
  // Clamp in case the attribute query yields a negative id.
  loc->id = std::max(numa_id, 0);
}

// Build a pinned-memory allocation property with the requested location type.
[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type) {
  // Value-initialization zeroes every field of the C aggregate, then we set the
  // fields we care about.
  CUmemAllocationProp prop{};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  MakeCuMemLocation(type, &prop.location);
  return prop;
}
} // namespace xgboost::cudr
#endif
105 changes: 105 additions & 0 deletions src/common/cuda_dr_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/**
* Copyright 2024, XGBoost contributors
*
* @brief Utility for CUDA driver API.
*
* XGBoost doesn't link libcuda.so at build time. The utilities here load the shared
* object at runtime.
*/
#pragma once

#include <cuda.h>
#include <cuda_runtime_api.h>

#include <cstdint> // for int32_t

#include "xgboost/string_view.h" // for StringView

namespace xgboost::cudr {
/**
 * @brief A struct for retrieving CUDA driver API from the runtime API.
 *
 * Every member is a function pointer into libcuda, loaded in the constructor via
 * `cudaGetDriverEntryPoint`; construction aborts if any symbol is missing.  The
 * signatures mirror the corresponding CUDA driver API functions.
 */
struct CuDriverApi {
  using Flags = unsigned long long;  // NOLINT

  // Memory manipulation functions.
  using MemGetAllocationGranularityFn = CUresult(size_t *granularity,
                                                 const CUmemAllocationProp *prop,
                                                 CUmemAllocationGranularity_flags option);
  using MemCreateFn = CUresult(CUmemGenericAllocationHandle *handle, size_t size,
                               const CUmemAllocationProp *prop, Flags flags);
  using MemMapFn = CUresult(CUdeviceptr ptr, size_t size, size_t offset,
                            CUmemGenericAllocationHandle handle, Flags flags);
  using MemAddressReserveFn = CUresult(CUdeviceptr *ptr, size_t size, size_t alignment,
                                       CUdeviceptr addr, Flags flags);
  using MemSetAccessFn = CUresult(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc,
                                  size_t count);
  using MemUnmapFn = CUresult(CUdeviceptr ptr, size_t size);
  using MemReleaseFn = CUresult(CUmemGenericAllocationHandle handle);
  using MemAddressFreeFn = CUresult(CUdeviceptr ptr, size_t size);
  // Error handling
  using GetErrorString = CUresult(CUresult error, const char **pStr);
  using GetErrorName = CUresult(CUresult error, const char **pStr);
  // Device attributes
  using DeviceGetAttribute = CUresult(int *pi, CUdevice_attribute attrib, CUdevice dev);
  using DeviceGet = CUresult(CUdevice *device, int ordinal);

  // Query the allocation granularity for a physical allocation property.
  MemGetAllocationGranularityFn *cuMemGetAllocationGranularity{nullptr};  // NOLINT
  // Create a physical memory allocation handle.
  MemCreateFn *cuMemCreate{nullptr};  // NOLINT
  /**
   * @brief Map a physical allocation into a reserved virtual address range.
   *
   * @param[in] offset - Must be zero.
   */
  MemMapFn *cuMemMap{nullptr};  // NOLINT
  /**
   * @param[out] ptr - Resulting pointer to start of virtual address range allocated
   * @param[in] size - Size of the reserved virtual address range requested
   * @param[in] alignment - Alignment of the reserved virtual address range requested
   * @param[in] addr - Fixed starting address range requested
   * @param[in] flags - Currently unused, must be zero
   */
  MemAddressReserveFn *cuMemAddressReserve{nullptr};  // NOLINT
  // Set access permissions on a mapped virtual address range.
  MemSetAccessFn *cuMemSetAccess{nullptr};  // NOLINT
  // Unmap a previously mapped range (inverse of cuMemMap).
  MemUnmapFn *cuMemUnmap{nullptr};  // NOLINT
  // Release a physical allocation handle (inverse of cuMemCreate).
  MemReleaseFn *cuMemRelease{nullptr};  // NOLINT
  // Free a reserved virtual address range (inverse of cuMemAddressReserve).
  MemAddressFreeFn *cuMemAddressFree{nullptr};  // NOLINT
  GetErrorString *cuGetErrorString{nullptr};  // NOLINT
  GetErrorName *cuGetErrorName{nullptr};  // NOLINT
  DeviceGetAttribute *cuDeviceGetAttribute{nullptr};  // NOLINT
  DeviceGet *cuDeviceGet{nullptr};  // NOLINT

  // Loads all of the above symbols; aborts on failure.
  CuDriverApi();

  // Log a fatal error for a failed driver call; no-op when status is CUDA_SUCCESS.
  void ThrowIfError(CUresult status, StringView fn, std::int32_t line, char const *file) const;
};

[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi();

/**
 * @brief Macro for guarding CUDA driver API calls.
 *
 * On failure, reports the stringified call expression together with the call site
 * via CuDriverApi::ThrowIfError, which raises a fatal error.
 */
#define safe_cu(call)                                                                          \
  do {                                                                                         \
    /* Not named `__status`: identifiers containing `__` are reserved in C++. */               \
    auto xgb_cu_status_ = (call);                                                              \
    if (xgb_cu_status_ != CUDA_SUCCESS) {                                                      \
      ::xgboost::cudr::GetGlobalCuDriverApi().ThrowIfError(xgb_cu_status_, #call, __LINE__,    \
                                                           __FILE__);                          \
    }                                                                                          \
  } while (0)

/**
 * @brief Get the recommended allocation granularity for the given allocation property.
 */
[[nodiscard]] inline auto GetAllocGranularity(CUmemAllocationProp const *prop) {
  // Initialize so the return value is never indeterminate, even if the error handler
  // is configured not to abort.
  std::size_t granularity = 0;
  safe_cu(GetGlobalCuDriverApi().cuMemGetAllocationGranularity(
      &granularity, prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
  return granularity;
}

/**
* @brief Obtain appropriate device ordinal for `CUmemLocation`.
*/
void MakeCuMemLocation(CUmemLocationType type, CUmemLocation* loc);

/**
* @brief Construct a `CUmemAllocationProp`.
*/
[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type);
} // namespace xgboost::cudr
36 changes: 31 additions & 5 deletions src/common/cuda_rt_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,19 @@
#endif // defined(XGBOOST_USE_CUDA)

#include <cstdint> // for int32_t
#include <mutex> // for once_flag, call_once

#include "common.h" // for safe_cuda

namespace xgboost::common {
namespace xgboost::curt {
#if defined(XGBOOST_USE_CUDA)
std::int32_t AllVisibleGPUs() {
int n_visgpus = 0;
try {
// When compiled with CUDA but running on CPU only device,
// cudaGetDeviceCount will fail.
dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
} catch (const dmlc::Error &) {
} catch (const dmlc::Error&) {
cudaGetLastError(); // reset error.
return 0;
}
Expand Down Expand Up @@ -63,11 +64,36 @@ void SetDevice(std::int32_t device) {
dh::safe_cuda(cudaSetDevice(device));
}
}

namespace {
// Query a CUDA version number once and cache it.  The raw value follows the CUDA
// encoding used by cudaRuntimeGetVersion/cudaDriverGetVersion: major * 1000 +
// minor * 10.  Each template instantiation (one per lambda type at the call site)
// gets its own `version`/`flag` statics, so runtime and driver versions are cached
// independently.
template <typename Fn>
void GetVersionImpl(Fn&& fn, std::int32_t* major, std::int32_t* minor) {
  static std::int32_t version = 0;
  static std::once_flag flag;
  // Thread-safe: the query runs exactly once per instantiation.
  std::call_once(flag, [&] { fn(&version); });
  if (major) {
    *major = version / 1000;
  }
  if (minor) {
    *minor = version % 100 / 10;
  }
}
}  // namespace

// Returns the CUDA runtime (libcudart) version as major/minor.
void RtVersion(std::int32_t* major, std::int32_t* minor) {
  auto query = [](std::int32_t* ver) {
    dh::safe_cuda(cudaRuntimeGetVersion(ver));
  };
  GetVersionImpl(query, major, minor);
}

// Returns the latest CUDA version supported by the installed driver as major/minor.
void DrVersion(std::int32_t* major, std::int32_t* minor) {
  auto query = [](std::int32_t* ver) {
    dh::safe_cuda(cudaDriverGetVersion(ver));
  };
  GetVersionImpl(query, major, minor);
}

#else
std::int32_t AllVisibleGPUs() { return 0; }

std::int32_t CurrentDevice() {
AssertGPUSupport();
common::AssertGPUSupport();
return -1;
}

Expand All @@ -79,8 +105,8 @@ void CheckComputeCapability() {}

void SetDevice(std::int32_t device) {
if (device >= 0) {
AssertGPUSupport();
common::AssertGPUSupport();
}
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace xgboost::common
} // namespace xgboost::curt
12 changes: 9 additions & 3 deletions src/common/cuda_rt_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include <nvtx3/nvtx3.hpp>
#endif // defined(XGBOOST_USE_NVTX)

namespace xgboost::common {
namespace xgboost::curt {
std::int32_t AllVisibleGPUs();

std::int32_t CurrentDevice();
Expand All @@ -24,6 +24,12 @@ void CheckComputeCapability();

void SetDevice(std::int32_t device);

// Returns the CUDA Runtime version.
void RtVersion(std::int32_t* major, std::int32_t* minor);

// Returns the latest version of CUDA supported by the driver.
void DrVersion(std::int32_t* major, std::int32_t* minor);

struct NvtxDomain {
static constexpr char const *name{"libxgboost"}; // NOLINT
};
Expand All @@ -49,10 +55,10 @@ class NvtxRgb {
explicit NvtxRgb(Args &&...) {}
};
#endif // defined(XGBOOST_USE_NVTX)
} // namespace xgboost::common
} // namespace xgboost::curt

#if defined(XGBOOST_USE_NVTX)
#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain)
#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::curt::NvtxDomain)
#else
#define xgboost_NVTX_FN_RANGE()
#endif // defined(XGBOOST_USE_NVTX)
23 changes: 23 additions & 0 deletions src/common/device_helpers.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/**
* Copyright 2024, XGBoost contributors
*/
#include "cuda_rt_utils.h" // for RtVersion
#include "device_helpers.cuh"
#include "xgboost/windefs.h" // for xgboost_IS_WIN

namespace dh {
PinnedMemory::PinnedMemory() {
#if defined(xgboost_IS_WIN)
  // Fall back to the plain pinned allocator on Windows.
  this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
#else
  std::int32_t major{0}, minor{0};
  xgboost::curt::DrVersion(&major, &minor);
  // Host NUMA allocation requires driver that supports CTK >= 12.5 to be stable.
  // Compare (major, minor) lexicographically: the previous `major >= 12 && minor >= 5`
  // check wrongly rejected newer majors such as 13.0 (minor 0 < 5).
  if (major > 12 || (major == 12 && minor >= 5)) {
    this->impl_.emplace<detail::GrowOnlyVirtualMemVec>(CU_MEM_LOCATION_TYPE_HOST_NUMA);
  } else {
    this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
  }
#endif
}
} // namespace dh
Loading
Loading