Skip to content

Commit

Permalink
Add support for NCCL user buffer registration (#23)
Browse files Browse the repository at this point in the history
* Add support for NCCL user buffer registration feature.

* Update docs.

* Fix env var naming in docs.
  • Loading branch information
romerojosh authored Feb 27, 2024
1 parent 5be1865 commit 0dad43e
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 7 deletions.
17 changes: 17 additions & 0 deletions docs/env_vars.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
.. _env-var-section-ref:

Environment Variables
==============================

The following section lists the environment variables available to configure the cuDecomp library.

CUDECOMP_ENABLE_NCCL_UBR
------------------------
(since v0.4.0, requires NCCL v2.19 or newer)

:code:`CUDECOMP_ENABLE_NCCL_UBR` controls whether cuDecomp registers its communication buffers with the NCCL library using :code:`ncclCommRegister`/:code:`ncclCommDeregister` (i.e., user buffer registration).
Registration can improve NCCL send/receive performance in some scenarios. See the `User Buffer Registration <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html>`_
section of the NCCL documentation for more details.

Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable this feature.

1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Table of Contents
autotuning
nvshmem
api
env_vars


Indices and tables
Expand Down
3 changes: 3 additions & 0 deletions include/internal/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <cuda_runtime.h>
Expand All @@ -59,6 +60,8 @@ struct cudecompHandle {
int n_grid_descs_using_nccl = 0; // Count of grid descriptors using NCCL
ncclComm_t nccl_comm = nullptr; // NCCL communicator (global)
ncclComm_t nccl_local_comm = nullptr; // NCCL communicator (intranode)
bool nccl_enable_ubr = false; // Flag to control NCCL user buffer registration usage
std::unordered_map<void*, std::vector<std::pair<ncclComm_t, void*>>> nccl_ubr_handles; // map of allocated buffer address to NCCL registration handle(s)

cudaStream_t pl_stream = nullptr; // stream used for pipelined backends

Expand Down
116 changes: 109 additions & 7 deletions src/autotune.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,16 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0);

std::vector<cudecompTransposeCommBackend_t> comm_backend_list;
bool need_nccl = false;
std::array<void*, 2> nccl_work_ubr_handles{nullptr, nullptr};
bool need_nvshmem = false;
if (autotune_comm) {
comm_backend_list = {CUDECOMP_TRANSPOSE_COMM_MPI_P2P, CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL,
CUDECOMP_TRANSPOSE_COMM_MPI_A2A};
if (!options->disable_nccl_backends) {
comm_backend_list.push_back(CUDECOMP_TRANSPOSE_COMM_NCCL);
comm_backend_list.push_back(CUDECOMP_TRANSPOSE_COMM_NCCL_PL);
need_nccl = true;
}
#ifdef ENABLE_NVSHMEM
if (!options->disable_nvshmem_backends) {
Expand All @@ -115,6 +118,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
#endif
} else {
comm_backend_list = {grid_desc->config.transpose_comm_backend};
if (transposeBackendRequiresNccl(comm_backend_list[0])) { need_nccl = true; }
#ifdef ENABLE_NVSHMEM
if (transposeBackendRequiresNvshmem(comm_backend_list[0])) { need_nvshmem = true; }
#endif
Expand Down Expand Up @@ -193,7 +197,16 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
work_sz = work_sz_new;
if (need_nvshmem) {
#ifdef ENABLE_NVSHMEM
if (work && work != work_nvshmem) CHECK_CUDA(cudaFree(work));
if (work && work != work_nvshmem) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
// Temporarily set backend to force nvshmem_malloc patch in cudecompMalloc/Free
auto tmp = grid_desc->config.transpose_comm_backend;
grid_desc->config.transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_NVSHMEM;
Expand All @@ -212,11 +225,32 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
cudaGetLastError(); // Reset CUDA error state
} else {
CHECK_CUDA(ret);
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (need_nccl && handle->nccl_enable_ubr) {
CHECK_NCCL(ncclCommRegister(handle->nccl_comm, work, work_sz, &nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommRegister(handle->nccl_local_comm, work, work_sz, &nccl_work_ubr_handles[1]));
}
#endif
}
#endif
} else {
if (work) CHECK_CUDA(cudaFree(work));
if (work) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
CHECK_CUDA(cudaMalloc(&work, work_sz));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (need_nccl && handle->nccl_enable_ubr) {
CHECK_NCCL(ncclCommRegister(handle->nccl_comm, work, work_sz, &nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommRegister(handle->nccl_local_comm, work, work_sz, &nccl_work_ubr_handles[1]));
}
#endif
}
}

Expand Down Expand Up @@ -394,7 +428,16 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d

// Free test data and workspace
if (need_nvshmem) {
if (work != work_nvshmem) { CHECK_CUDA(cudaFree(work)); }
if (work != work_nvshmem) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
#ifdef ENABLE_NVSHMEM
// Temporarily set backend to force nvshmem_malloc patch in cudecompMalloc/Free
auto tmp = grid_desc->config.transpose_comm_backend;
Expand All @@ -404,6 +447,13 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
#endif
} else {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}

CHECK_CUDA(cudaFree(data));
Expand Down Expand Up @@ -444,10 +494,15 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0);

std::vector<cudecompHaloCommBackend_t> comm_backend_list;
bool need_nccl = false;
std::array<void*, 2> nccl_work_ubr_handles{nullptr, nullptr};
bool need_nvshmem = false;
if (autotune_comm) {
comm_backend_list = {CUDECOMP_HALO_COMM_MPI, CUDECOMP_HALO_COMM_MPI_BLOCKING};
if (!options->disable_nccl_backends) { comm_backend_list.push_back(CUDECOMP_HALO_COMM_NCCL); }
if (!options->disable_nccl_backends) {
comm_backend_list.push_back(CUDECOMP_HALO_COMM_NCCL);
need_nccl = true;
}
#ifdef ENABLE_NVSHMEM
if (!options->disable_nvshmem_backends) {
comm_backend_list.push_back(CUDECOMP_HALO_COMM_NVSHMEM);
Expand All @@ -457,6 +512,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
#endif
} else {
comm_backend_list = {grid_desc->config.halo_comm_backend};
if (haloBackendRequiresNccl(comm_backend_list[0])) { need_nccl = true; }
#ifdef ENABLE_NVSHMEM
if (haloBackendRequiresNvshmem(comm_backend_list[0])) { need_nvshmem = true; }
#endif
Expand Down Expand Up @@ -523,7 +579,16 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
work_sz = work_sz_new;
if (need_nvshmem) {
#ifdef ENABLE_NVSHMEM
if (work && work != work_nvshmem) CHECK_CUDA(cudaFree(work));
if (work && work != work_nvshmem) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
// Temporarily set backend to force nvshmem_malloc patch in cudecompMalloc/Free
auto tmp = grid_desc->config.halo_comm_backend;
grid_desc->config.halo_comm_backend = CUDECOMP_HALO_COMM_NVSHMEM;
Expand All @@ -542,11 +607,32 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
cudaGetLastError(); // Reset CUDA error state
} else {
CHECK_CUDA(ret);
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (need_nccl && handle->nccl_enable_ubr) {
CHECK_NCCL(ncclCommRegister(handle->nccl_comm, work, work_sz, &nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommRegister(handle->nccl_local_comm, work, work_sz, &nccl_work_ubr_handles[1]));
}
#endif
}
#endif
} else {
if (work) CHECK_CUDA(cudaFree(work));
if (work) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
CHECK_CUDA(cudaMalloc(&work, work_sz));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (need_nccl && handle->nccl_enable_ubr) {
CHECK_NCCL(ncclCommRegister(handle->nccl_comm, work, work_sz, &nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommRegister(handle->nccl_local_comm, work, work_sz, &nccl_work_ubr_handles[1]));
}
#endif
}
}

Expand Down Expand Up @@ -680,7 +766,16 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,

// Free test data and workspace
if (need_nvshmem) {
if (work != work_nvshmem) { CHECK_CUDA(cudaFree(work)); }
if (work != work_nvshmem) {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}
#ifdef ENABLE_NVSHMEM
// Temporarily set backend to force nvshmem_malloc patch in cudecompMalloc/Free
auto tmp = grid_desc->config.halo_comm_backend;
Expand All @@ -690,6 +785,13 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
#endif
} else {
CHECK_CUDA(cudaFree(work));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (nccl_work_ubr_handles[0]) {
CHECK_NCCL(ncclCommDeregister(handle->nccl_comm, nccl_work_ubr_handles[0]));
CHECK_NCCL(ncclCommDeregister(handle->nccl_local_comm, nccl_work_ubr_handles[1]));
nccl_work_ubr_handles = {nullptr, nullptr};
}
#endif
}

CHECK_CUDA(cudaFree(data));
Expand Down
35 changes: 35 additions & 0 deletions src/cudecomp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@ static void gatherGlobalMPIInfo(cudecompHandle_t& handle) {
handle->mpi_comm));
}

// Reads cuDecomp-specific environment variables and records the settings on the handle.
// Currently handles CUDECOMP_ENABLE_NCCL_UBR: when set to "1", enables NCCL user
// buffer registration; any other value (or unset) leaves the default (disabled).
static void getCudecompEnvVars(cudecompHandle_t& handle) {
  const char* ubr_env = std::getenv("CUDECOMP_ENABLE_NCCL_UBR");
  if (ubr_env != nullptr) {
    // Only an exact integer value of 1 turns the feature on.
    handle->nccl_enable_ubr = (std::strtol(ubr_env, nullptr, 10) == 1);
  }
}

#ifdef ENABLE_NVSHMEM
static void inspectNvshmemEnvVars(cudecompHandle_t& handle) {
// Check NVSHMEM_DISABLE_CUDA_VMM
Expand Down Expand Up @@ -220,6 +226,9 @@ cudecompResult_t cudecompInit(cudecompHandle_t* handle_in, MPI_Comm mpi_comm) {
// Gather extra MPI info from all communicator ranks
gatherGlobalMPIInfo(handle);

// Gather cuDecomp environment variable settings
getCudecompEnvVars(handle);

handle->initialized = true;
cudecomp_initialized = true;

Expand Down Expand Up @@ -720,6 +729,19 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid
#endif
} else {
CHECK_CUDA(cudaMalloc(buffer, buffer_size_bytes));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {

if (handle->nccl_enable_ubr) {
void* nccl_ubr_handle;
CHECK_NCCL(ncclCommRegister(handle->nccl_comm, buffer, buffer_size_bytes, &nccl_ubr_handle));
handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(handle->nccl_comm, nccl_ubr_handle));
CHECK_NCCL(ncclCommRegister(handle->nccl_local_comm, buffer, buffer_size_bytes, &nccl_ubr_handle));
handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(handle->nccl_local_comm, nccl_ubr_handle));
}
}
#endif
}

} catch (const cudecomp::BaseException& e) {
Expand All @@ -735,6 +757,19 @@ cudecompResult_t cudecompFree(cudecompHandle_t handle, cudecompGridDesc_t grid_d
checkHandle(handle);
checkGridDesc(grid_desc);

#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {

if (handle->nccl_ubr_handles.count(buffer) != 0) {
for (const auto &entry : handle->nccl_ubr_handles[buffer]) {
CHECK_NCCL(ncclCommDeregister(entry.first, entry.second));
}
handle->nccl_ubr_handles.erase(buffer);
}
}
#endif

if (transposeBackendRequiresNvshmem(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend)) {
#ifdef ENABLE_NVSHMEM
Expand Down

0 comments on commit 0dad43e

Please sign in to comment.