Skip to content

Commit

Permalink
Add support for NCCL user buffer registration (#23)
Browse files Browse the repository at this point in the history
* Add support for NCCL user buffer registration feature.

* Update docs.

* Fix env var naming in docs.
  • Loading branch information
romerojosh authored Feb 27, 2024
1 parent 5be1865 commit 0dad43e
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 7 deletions.
17 changes: 17 additions & 0 deletions docs/env_vars.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
.. _env-var-section-ref:

Environment Variables
==============================

The following section lists the environment variables available to configure the cuDecomp library.

CUDECOMP_ENABLE_NCCL_UBR
------------------------
(since v0.4.0, requires NCCL v2.19 or newer)

:code:`CUDECOMP_ENABLE_NCCL_UBR` controls whether cuDecomp registers its communication buffers with the NCCL library using :code:`ncclCommRegister`/:code:`ncclCommDeregister` (i.e., user buffer registration).
Registration can improve NCCL send/receive performance in some scenarios. See the `User Buffer Registration <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html>`_
section of the NCCL documentation for more details.

Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.

1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Table of Contents
autotuning
nvshmem
api
env_vars


Indices and tables
Expand Down
3 changes: 3 additions & 0 deletions include/internal/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <cuda_runtime.h>
Expand All @@ -59,6 +60,8 @@ struct cudecompHandle {
int n_grid_descs_using_nccl = 0; // Count of grid descriptors using NCCL
ncclComm_t nccl_comm = nullptr; // NCCL communicator (global)
ncclComm_t nccl_local_comm = nullptr; // NCCL communicator (intranode)
bool nccl_enable_ubr = false; // Flag to control NCCL user buffer registration usage
std::unordered_map<void*, std::vector<std::pair<ncclComm_t, void*>>> nccl_ubr_handles; // map of allocated buffer address to NCCL registration handle(s)

cudaStream_t pl_stream = nullptr; // stream used for pipelined backends

Expand Down
116 changes: 109 additions & 7 deletions src/autotune.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,16 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0);

std::vector<cudecompTransposeCommBackend_t> comm_backend_list;
bool need_nccl = false;
std::array<void*, 2> nccl_work_ubr_handles{nullptr, nullptr};
bool need_nvshmem = false;
if (autotune_comm) {
comm_backend_list = {CUDECOMP_TRANSPOSE_COMM_MPI_P2P, CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL,
CUDECOMP_TRANSPOSE_COMM_MPI_A2A};
if (!options->disable_nccl_backends) {
comm_backend_list.push_back(CUDECOMP_TRANSPOSE_COMM_NCCL);
comm_backend_list.push_back(CUDECOMP_TRANSPOSE_COMM_NCCL_PL);
need_nccl = true;
}
#ifdef ENABLE_NVSHMEM
if (!options->disable_nvshmem_backends) {
Expand All @@ -115,6 +118,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
#endif
} else {
comm_backend_list = {grid_desc->config.transpose_comm_backend};
if (transposeBackendRequiresNccl(comm_backend_list[0])) { need_nccl = true; }
#ifdef ENABLE_NVSHMEM
if (transposeBackendRequiresNvshmem(comm_backend_list[0])) { need_nvshmem = true; }
#endif
Expand Down Expand Up @@ -193,7 +197,16 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
work_sz = work_sz_new;
if (need_nvshmem) {
#ifdef ENABLE_NVSHMEM
if (work && work != work_nvshmem) CHECK_CUDA(cudaFree(work));
if (work && work != work_nvshmem) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
// Temporarily set backend to force nvshmem_malloc patch in cudecompMalloc/Free
auto tmp = grid_desc->config.transpose_comm_backend;
grid_desc->config.transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_NVSHMEM;
Expand All @@ -212,11 +225,32 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
cudaGetLastError(); // Reset CUDA error state
} else {
CHECK_CUDA(ret);
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (need_nccl && handle->nccl_enable_ubr) {
CHECK_NCCL(ncclCommRegister(handle->nccl_comm, work, work_sz, &nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommRegister(handle->nccl_local_comm, work, work_sz, &nccl_work_ubr_handles[1]));
}
#endif
}
#endif
} else {
if (work) CHECK_CUDA(cudaFree(work));
if (work) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
CHECK_CUDA(cudaMalloc(&work, work_sz));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (need_nccl && handle->nccl_enable_ubr) {
CHECK_NCCL(ncclCommRegister(handle->nccl_comm, work, work_sz, &nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommRegister(handle->nccl_local_comm, work, work_sz, &nccl_work_ubr_handles[1]));
}
#endif
}
}

Expand Down Expand Up @@ -394,7 +428,16 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d

// Free test data and workspace
if (need_nvshmem) {
if (work != work_nvshmem) { CHECK_CUDA(cudaFree(work)); }
if (work != work_nvshmem) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
#ifdef ENABLE_NVSHMEM
// Temporarily set backend to force nvshmem_malloc patch in cudecompMalloc/Free
auto tmp = grid_desc->config.transpose_comm_backend;
Expand All @@ -404,6 +447,13 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
#endif
} else {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}

CHECK_CUDA(cudaFree(data));
Expand Down Expand Up @@ -444,10 +494,15 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0);

std::vector<cudecompHaloCommBackend_t> comm_backend_list;
bool need_nccl = false;
std::array<void*, 2> nccl_work_ubr_handles{nullptr, nullptr};
bool need_nvshmem = false;
if (autotune_comm) {
comm_backend_list = {CUDECOMP_HALO_COMM_MPI, CUDECOMP_HALO_COMM_MPI_BLOCKING};
if (!options->disable_nccl_backends) { comm_backend_list.push_back(CUDECOMP_HALO_COMM_NCCL); }
if (!options->disable_nccl_backends) {
comm_backend_list.push_back(CUDECOMP_HALO_COMM_NCCL);
need_nccl = true;
}
#ifdef ENABLE_NVSHMEM
if (!options->disable_nvshmem_backends) {
comm_backend_list.push_back(CUDECOMP_HALO_COMM_NVSHMEM);
Expand All @@ -457,6 +512,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
#endif
} else {
comm_backend_list = {grid_desc->config.halo_comm_backend};
if (haloBackendRequiresNccl(comm_backend_list[0])) { need_nccl = true; }
#ifdef ENABLE_NVSHMEM
if (haloBackendRequiresNvshmem(comm_backend_list[0])) { need_nvshmem = true; }
#endif
Expand Down Expand Up @@ -523,7 +579,16 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
work_sz = work_sz_new;
if (need_nvshmem) {
#ifdef ENABLE_NVSHMEM
if (work && work != work_nvshmem) CHECK_CUDA(cudaFree(work));
if (work && work != work_nvshmem) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
// Temporarily set backend to force nvshmem_malloc patch in cudecompMalloc/Free
auto tmp = grid_desc->config.halo_comm_backend;
grid_desc->config.halo_comm_backend = CUDECOMP_HALO_COMM_NVSHMEM;
Expand All @@ -542,11 +607,32 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
cudaGetLastError(); // Reset CUDA error state
} else {
CHECK_CUDA(ret);
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (need_nccl && handle->nccl_enable_ubr) {
CHECK_NCCL(ncclCommRegister(handle->nccl_comm, work, work_sz, &nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommRegister(handle->nccl_local_comm, work, work_sz, &nccl_work_ubr_handles[1]));
}
#endif
}
#endif
} else {
if (work) CHECK_CUDA(cudaFree(work));
if (work) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
CHECK_CUDA(cudaMalloc(&work, work_sz));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (need_nccl && handle->nccl_enable_ubr) {
CHECK_NCCL(ncclCommRegister(handle->nccl_comm, work, work_sz, &nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommRegister(handle->nccl_local_comm, work, work_sz, &nccl_work_ubr_handles[1]));
}
#endif
}
}

Expand Down Expand Up @@ -680,7 +766,16 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,

// Free test data and workspace
if (need_nvshmem) {
if (work != work_nvshmem) { CHECK_CUDA(cudaFree(work)); }
if (work != work_nvshmem) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
#ifdef ENABLE_NVSHMEM
// Temporarily set backend to force nvshmem_malloc patch in cudecompMalloc/Free
auto tmp = grid_desc->config.halo_comm_backend;
Expand All @@ -690,6 +785,13 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
#endif
} else {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}

CHECK_CUDA(cudaFree(data));
Expand Down
35 changes: 35 additions & 0 deletions src/cudecomp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@ static void gatherGlobalMPIInfo(cudecompHandle_t& handle) {
handle->mpi_comm));
}

// Reads cuDecomp-specific environment variables and records the settings on the handle.
// Currently handles CUDECOMP_ENABLE_NCCL_UBR: when set to "1", enables NCCL user
// buffer registration; any other value (or unset) leaves the default (disabled).
static void getCudecompEnvVars(cudecompHandle_t& handle) {
  const char* ubr_env = std::getenv("CUDECOMP_ENABLE_NCCL_UBR");
  if (ubr_env != nullptr) {
    // Only an exact integer value of 1 turns the feature on.
    handle->nccl_enable_ubr = (std::strtol(ubr_env, nullptr, 10) == 1);
  }
}

#ifdef ENABLE_NVSHMEM
static void inspectNvshmemEnvVars(cudecompHandle_t& handle) {
// Check NVSHMEM_DISABLE_CUDA_VMM
Expand Down Expand Up @@ -220,6 +226,9 @@ cudecompResult_t cudecompInit(cudecompHandle_t* handle_in, MPI_Comm mpi_comm) {
// Gather extra MPI info from all communicator ranks
gatherGlobalMPIInfo(handle);

// Gather cuDecomp environment variable settings
getCudecompEnvVars(handle);

handle->initialized = true;
cudecomp_initialized = true;

Expand Down Expand Up @@ -720,6 +729,19 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid
#endif
} else {
CHECK_CUDA(cudaMalloc(buffer, buffer_size_bytes));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {

if (handle->nccl_enable_ubr) {
void* nccl_ubr_handle;
CHECK_NCCL(ncclCommRegister(handle->nccl_comm, buffer, buffer_size_bytes, &nccl_ubr_handle));
handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(handle->nccl_comm, nccl_ubr_handle));
CHECK_NCCL(ncclCommRegister(handle->nccl_local_comm, buffer, buffer_size_bytes, &nccl_ubr_handle));
handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(handle->nccl_local_comm, nccl_ubr_handle));
}
}
#endif
}

} catch (const cudecomp::BaseException& e) {
Expand All @@ -735,6 +757,19 @@ cudecompResult_t cudecompFree(cudecompHandle_t handle, cudecompGridDesc_t grid_d
checkHandle(handle);
checkGridDesc(grid_desc);

#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {

if (handle->nccl_ubr_handles.count(buffer) != 0) {
for (const auto &entry : handle->nccl_ubr_handles[buffer]) {
CHECK_NCCL(ncclCommDeregister(entry.first, entry.second));
}
handle->nccl_ubr_handles.erase(buffer);
}
}
#endif

if (transposeBackendRequiresNvshmem(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend)) {
#ifdef ENABLE_NVSHMEM
Expand Down

0 comments on commit 0dad43e

Please sign in to comment.