From 7e2b0ecc8d4c4aac3270774fe66c1dde0164e649 Mon Sep 17 00:00:00 2001
From: Mikhail Brinskii
Date: Wed, 2 Oct 2024 14:24:31 +0300
Subject: [PATCH] UCT: Use fabric info for cuda-ipc reachability check

---
 config/m4/cuda.m4                      |   7 ++
 src/ucp/wireup/wireup.c                |  10 ++-
 src/uct/api/v2/uct_v2.h                |  10 ++-
 src/uct/cuda/base/cuda_md.c            |  24 ++++++
 src/uct/cuda/base/cuda_md.h            |   8 ++
 src/uct/cuda/cuda_ipc/cuda_ipc_iface.c | 100 ++++++++++++++++++-------
 src/uct/cuda/cuda_ipc/cuda_ipc_md.c    |  57 +++++++++++++-
 src/uct/cuda/cuda_ipc/cuda_ipc_md.h    |   5 +-
 8 files changed, 184 insertions(+), 37 deletions(-)

diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4
index a30548534e4..4163f898e19 100644
--- a/config/m4/cuda.m4
+++ b/config/m4/cuda.m4
@@ -66,6 +66,13 @@ AS_IF([test "x$cuda_checked" != "xyes"],
                        [AC_MSG_ERROR([libnvidia-ml not found. Install appropriate nvidia-driver package])])
                     cuda_happy="no"])])
 
+    # Check for nvmlDeviceGetGpuFabricInfoV
+    AC_CHECK_DECLS([nvmlDeviceGetGpuFabricInfoV],
+                   [AC_DEFINE([HAVE_NVML_FABRIC_INFO], 1, [Enable NVML GPU fabric info support])],
+                   [AC_MSG_NOTICE([nvmlDeviceGetGpuFabricInfoV function not found in libnvidia-ml. MNNVL support will be disabled.])],
+                   [[#include <nvml.h>]])
+
+
     # Check for cuda static library
     have_cuda_static="no"
     AS_IF([test "x$cuda_happy" = "xyes"],
diff --git a/src/ucp/wireup/wireup.c b/src/ucp/wireup/wireup.c
index 009feb46426..4a22c34c73f 100644
--- a/src/ucp/wireup/wireup.c
+++ b/src/ucp/wireup/wireup.c
@@ -1249,10 +1249,12 @@ int ucp_wireup_is_reachable(ucp_ep_h ep, unsigned ep_init_flags,
     ucp_context_h context      = ep->worker->context;
     ucp_worker_iface_t *wiface = ucp_worker_iface(ep->worker, rsc_index);
     uct_iface_is_reachable_params_t params = {
-        .field_mask  = UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR |
-                       UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR,
-        .device_addr = ae->dev_addr,
-        .iface_addr  = ae->iface_addr,
+        .field_mask         = UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR |
+                              UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR |
+                              UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH,
+        .device_addr        = ae->dev_addr,
+        .iface_addr         = ae->iface_addr,
+        .device_addr_length = ae->dev_addr_len
     };
 
     if (info_str != NULL) {
diff --git a/src/uct/api/v2/uct_v2.h b/src/uct/api/v2/uct_v2.h
index a6649903483..3c9c7707e2c 100644
--- a/src/uct/api/v2/uct_v2.h
+++ b/src/uct/api/v2/uct_v2.h
@@ -273,7 +273,8 @@ typedef enum {
     UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR         = UCS_BIT(1), /**< iface_addr field */
     UCT_IFACE_IS_REACHABLE_FIELD_INFO_STRING        = UCS_BIT(2), /**< info_string field */
     UCT_IFACE_IS_REACHABLE_FIELD_INFO_STRING_LENGTH = UCS_BIT(3), /**< info_string_length field */
-    UCT_IFACE_IS_REACHABLE_FIELD_SCOPE              = UCS_BIT(4)  /**< scope field */
+    UCT_IFACE_IS_REACHABLE_FIELD_SCOPE              = UCS_BIT(4), /**< scope field */
+    UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH = UCS_BIT(5)  /**< device_addr_length field */
 } uct_iface_is_reachable_field_mask_t;
 
 
@@ -611,6 +612,13 @@ typedef struct uct_iface_is_reachable_params {
 
     /**
      * Reachability scope.
      */
     uct_iface_reachability_scope_t scope;
+
+    /**
+     * Device address length. If not provided, the transport will assume a
+     * default minimal length according to the address buffer contents.
+     */
+    size_t device_addr_length;
+
 } uct_iface_is_reachable_params_t;
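A caller opts into the new field by setting UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH in field_mask, exactly as the wireup change above does with ae->dev_addr_len. A minimal caller-side sketch, assuming a hypothetical uct_iface_h iface and remote_* buffers already unpacked from the peer's address:

    /* Reachability query with an explicit device address length; the
     * remote_dev_addr / remote_iface_addr / remote_dev_addr_len variables
     * are assumed to hold the peer's unpacked addresses */
    uct_iface_is_reachable_params_t params = {
        .field_mask         = UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR |
                              UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR |
                              UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH,
        .device_addr        = remote_dev_addr,
        .iface_addr         = remote_iface_addr,
        .device_addr_length = remote_dev_addr_len
    };

    if (uct_iface_is_reachable_v2(iface, &params)) {
        /* the peer can be reached through this transport */
    }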
diff --git a/src/uct/cuda/base/cuda_md.c b/src/uct/cuda/base/cuda_md.c
index da8e7126af8..10192744445 100644
--- a/src/uct/cuda/base/cuda_md.c
+++ b/src/uct/cuda/base/cuda_md.c
@@ -101,6 +101,30 @@ uct_cuda_base_query_md_resources(uct_component_t *component,
                                        num_resources_p);
 }
 
+int uct_cuda_base_is_coherent(void)
+{
+    int coherent = 0;
+#if HAVE_CUDA_FABRIC
+    CUdevice cu_device;
+    ucs_status_t status;
+
+    status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cu_device, 0));
+    if (status != UCS_OK) {
+        return 0;
+    }
+
+    status = UCT_CUDADRV_FUNC_LOG_ERR(
+            cuDeviceGetAttribute(&coherent,
+                                 CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+                                 cu_device));
+    if (status != UCS_OK) {
+        return 0;
+    }
+
+#endif
+    return coherent;
+}
+
 UCS_STATIC_INIT {
     /* coverity[check_return] */
diff --git a/src/uct/cuda/base/cuda_md.h b/src/uct/cuda/base/cuda_md.h
index 23eb233796f..9c1d2660ab4 100644
--- a/src/uct/cuda/base/cuda_md.h
+++ b/src/uct/cuda/base/cuda_md.h
@@ -42,4 +42,12 @@ uct_cuda_base_query_devices(uct_md_h md,
  */
 ucs_status_t uct_cuda_base_check_device_name(const uct_iface_params_t *params);
 
+
+/**
+ * Check whether the platform provides coherent CPU-GPU memory access.
+ *
+ * @return 1 if coherent, or 0 otherwise.
+ */
+int uct_cuda_base_is_coherent(void);
+
 #endif
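The coherence test keys off CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, which is nonzero on platforms where the CPU and GPU share page tables (e.g., systems with NVLink chip-to-chip coherence). A standalone sketch of the same probe using the plain CUDA driver API, without the UCS error-checking wrappers:

    #include <cuda.h>
    #include <stdio.h>

    int main(void)
    {
        CUdevice dev;
        int coherent = 0;

        /* same probe as uct_cuda_base_is_coherent(), on device 0 */
        if ((cuInit(0) != CUDA_SUCCESS) || (cuDeviceGet(&dev, 0) != CUDA_SUCCESS)) {
            return 1;
        }

        cuDeviceGetAttribute(&coherent,
                             CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
                             dev);
        printf("coherent: %d\n", coherent);
        return 0;
    }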
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
index d18a8156006..3f3ddf52248 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
@@ -20,6 +20,18 @@
 #include 
 #include 
 
+
+typedef struct {
+    uint64_t system_uuid;
+#if HAVE_NVML_FABRIC_INFO
+    struct {
+        uint32_t clique_id;
+        uint8_t  cluster_uuid[NVML_GPU_FABRIC_UUID_LEN];
+    } mnnvl_addr;
+#endif
+} UCS_S_PACKED uct_cuda_ipc_device_addr_t;
+
+
 static ucs_config_field_t uct_cuda_ipc_iface_config_table[] = {
 
     {"", "", NULL,
@@ -63,7 +75,22 @@ static void UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_ipc_iface_t)(uct_iface_t*);
 ucs_status_t uct_cuda_ipc_iface_get_device_address(uct_iface_t *tl_iface,
                                                    uct_device_addr_t *addr)
 {
-    *(uint64_t*)addr = ucs_get_system_id();
+    uct_cuda_ipc_device_addr_t *dev_addr = (uct_cuda_ipc_device_addr_t*)addr;
+#if HAVE_NVML_FABRIC_INFO
+    uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface,
+                                                 uct_cuda_ipc_iface_t);
+    uct_cuda_ipc_md_t *md       = ucs_derived_of(iface->super.super.md,
+                                                 uct_cuda_ipc_md_t);
+
+    if (md->enable_mnnvl) {
+        memcpy(dev_addr->mnnvl_addr.cluster_uuid, &md->fabric_info.clusterUuid,
+               sizeof(dev_addr->mnnvl_addr.cluster_uuid));
+        dev_addr->mnnvl_addr.clique_id = md->fabric_info.cliqueId;
+    }
+#endif
+
+    dev_addr->system_uuid = ucs_get_system_id();
+
     return UCS_OK;
 }
 
@@ -74,56 +101,69 @@ static ucs_status_t uct_cuda_ipc_iface_get_address(uct_iface_h tl_iface,
     return UCS_OK;
 }
 
-static int uct_cuda_ipc_iface_is_mnnvl_supported(uct_cuda_ipc_md_t *md)
+static int
+uct_cuda_ipc_iface_mnnvl_reachable(uct_cuda_ipc_md_t *md,
+                                   const uct_cuda_ipc_device_addr_t *dev_addr,
+                                   size_t dev_addr_len,
+                                   const uct_iface_is_reachable_params_t *params)
 {
-#if HAVE_CUDA_FABRIC
-    CUdevice cu_device;
-    int coherent;
-    ucs_status_t status;
-
-    status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cu_device, 0));
-    if (status != UCS_OK) {
+#if HAVE_NVML_FABRIC_INFO
+    if (memcmp(&dev_addr->mnnvl_addr.cluster_uuid,
+               &md->fabric_info.clusterUuid,
+               sizeof(dev_addr->mnnvl_addr.cluster_uuid))) {
+        uct_iface_fill_info_str_buf(params, "cluster uuid doesn't match");
         return 0;
     }
 
-    status = UCT_CUDADRV_FUNC_LOG_ERR(
-            cuDeviceGetAttribute(&coherent,
-                                 CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
-                                 cu_device));
-    if (status != UCS_OK) {
+    if (dev_addr->mnnvl_addr.clique_id != md->fabric_info.cliqueId) {
+        uct_iface_fill_info_str_buf(params, "clique id doesn't match");
         return 0;
     }
 
-    return coherent && (md->enable_mnnvl != UCS_NO);
-#else
-    return 0;
+    return 1;
 #endif
+
+    return 0;
 }
 
 static int
 uct_cuda_ipc_iface_is_reachable_v2(const uct_iface_h tl_iface,
                                    const uct_iface_is_reachable_params_t *params)
 {
-    uct_base_iface_t *base_iface = ucs_derived_of(tl_iface, uct_base_iface_t);
-    uct_cuda_ipc_md_t *md = ucs_derived_of(base_iface->md, uct_cuda_ipc_md_t);
+    uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface,
+                                                 uct_cuda_ipc_iface_t);
+    uct_cuda_ipc_md_t *md       = ucs_derived_of(iface->super.super.md,
+                                                 uct_cuda_ipc_md_t);
+    const uct_cuda_ipc_device_addr_t *dev_addr;
+    size_t dev_addr_len;
+    int same_uuid;
 
     if (!uct_iface_is_reachable_params_addrs_valid(params)) {
         return 0;
     }
 
-    if (getpid() == *(pid_t*)params->iface_addr) {
+    dev_addr_len = UCS_PARAM_VALUE(UCT_IFACE_IS_REACHABLE_FIELD, params,
+                                   device_addr_length, DEVICE_ADDR_LENGTH,
+                                   sizeof(uint64_t));
+    dev_addr     = (const uct_cuda_ipc_device_addr_t*)UCS_PARAM_VALUE(
+                           UCT_IFACE_IS_REACHABLE_FIELD, params, device_addr,
+                           DEVICE_ADDR, NULL);
+    same_uuid    = (ucs_get_system_id() == dev_addr->system_uuid);
+
+    if ((getpid() == *(pid_t*)params->iface_addr) && same_uuid) {
         uct_iface_fill_info_str_buf(params, "same process");
         return 0;
     }
 
-    /* Either multi-node NVLINK should be supported or iface has to be on the
-     * same node for cuda-ipc to be reachable */
-    if ((ucs_get_system_id() != *((const uint64_t*)params->device_addr)) &&
-        !uct_cuda_ipc_iface_is_mnnvl_supported(md)) {
+    if (md->enable_mnnvl && (dev_addr_len != sizeof(uint64_t))) {
+        if (!uct_cuda_ipc_iface_mnnvl_reachable(md, dev_addr, dev_addr_len,
+                                                params)) {
+            return 0;
+        }
+    } else if (!same_uuid) {
         uct_iface_fill_info_str_buf(params,
                                     "different system id %"PRIx64" vs %"PRIx64"",
-                                    ucs_get_system_id(),
-                                    *((const uint64_t*)params->device_addr));
+                                    ucs_get_system_id(), dev_addr->system_uuid);
         return 0;
     }
 
@@ -245,7 +285,9 @@ static ucs_status_t uct_cuda_ipc_iface_query(uct_iface_h tl_iface,
     uct_base_iface_query(&iface->super.super, iface_attr);
 
     iface_attr->iface_addr_len  = sizeof(pid_t);
-    iface_attr->device_addr_len = sizeof(uint64_t);
+    iface_attr->device_addr_len = (md->enable_mnnvl) ?
+                                  sizeof(uct_cuda_ipc_device_addr_t) :
+                                  sizeof(uint64_t);
     iface_attr->ep_addr_len     = 0;
     iface_attr->max_conn_priv   = 0;
     iface_attr->cap.flags       = UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE |
@@ -253,7 +295,7 @@
                                   UCT_IFACE_FLAG_PENDING |
                                   UCT_IFACE_FLAG_GET_ZCOPY |
                                   UCT_IFACE_FLAG_PUT_ZCOPY;
-    if (uct_cuda_ipc_iface_is_mnnvl_supported(md)) {
+    if (md->enable_mnnvl) {
         iface_attr->cap.flags |= UCT_IFACE_FLAG_INTER_NODE;
     }
 
@@ -614,7 +656,7 @@ uct_cuda_ipc_query_devices(
     uct_device_type_t dev_type = UCT_DEVICE_TYPE_SHM;
     uct_cuda_ipc_md_t *md      = ucs_derived_of(uct_md, uct_cuda_ipc_md_t);
 
-    if (uct_cuda_ipc_iface_is_mnnvl_supported(md)) {
+    if (md->enable_mnnvl) {
         dev_type = UCT_DEVICE_TYPE_NET;
     }
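The device address is now variable-length: a legacy peer advertises only the 8-byte system uuid, while an MNNVL-enabled peer appends the clique id and cluster uuid, and the receiver tells the two apart purely by dev_addr_len, which is what the dev_addr_len != sizeof(uint64_t) test above relies on. A layout sanity check one could place next to the struct; this is a sketch, assuming NVML_GPU_FABRIC_UUID_LEN is 16 as in current NVML headers, so the packed address grows from 8 to 28 bytes:

    /* UCS_S_PACKED guarantees no padding, so the wire size is the plain sum:
     * 8 (system_uuid) + 4 (clique_id) + 16 (cluster_uuid) = 28 bytes */
    UCS_STATIC_ASSERT(sizeof(uct_cuda_ipc_device_addr_t) ==
                      sizeof(uint64_t) + sizeof(uint32_t) +
                      NVML_GPU_FABRIC_UUID_LEN);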
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c
index f7edbc99d70..511cace1708 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c
@@ -407,6 +407,58 @@ uct_cuda_ipc_mem_dereg(uct_md_h md, const uct_md_mem_dereg_params_t *params)
     return UCS_OK;
 }
 
+static int
+uct_cuda_ipc_md_init_fabric_info(uct_cuda_ipc_md_t *md,
+                                 ucs_ternary_auto_value_t mnnvl_enable)
+{
+    int mnnvl_supported = 0;
+#if HAVE_NVML_FABRIC_INFO
+    nvmlDevice_t device;
+    ucs_status_t status;
+
+    if (!uct_cuda_base_is_coherent() || (mnnvl_enable == UCS_NO)) {
+        goto err;
+    }
+
+    status = UCT_NVML_FUNC(nvmlInit_v2(), UCS_LOG_LEVEL_DIAG);
+    if (status != UCS_OK) {
+        goto err;
+    }
+
+    status = UCT_NVML_FUNC_LOG_ERR(nvmlDeviceGetHandleByIndex(0, &device));
+    if (status != UCS_OK) {
+        goto err_sd;
+    }
+
+    md->fabric_info.version = nvmlGpuFabricInfo_v2;
+    status = UCT_NVML_FUNC_LOG_ERR(
+            nvmlDeviceGetGpuFabricInfoV(device, &md->fabric_info));
+    if (status != UCS_OK) {
+        goto err_sd;
+    }
+
+    ucs_debug("Fabric_info: clique %u healthmask %u state %u status %u",
+              md->fabric_info.cliqueId, md->fabric_info.healthMask,
+              md->fabric_info.state, md->fabric_info.status);
+
+    if ((md->fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED) ||
+        (md->fabric_info.status != NVML_SUCCESS)) {
+        goto err_sd;
+    }
+
+    mnnvl_supported = 1;
+
+err_sd:
+    UCT_NVML_FUNC_LOG_ERR(nvmlShutdown());
+err:
+#endif
+    if ((mnnvl_enable == UCS_YES) && !mnnvl_supported) {
+        ucs_warn("multi-node NVLINK support is requested but not supported");
+    }
+
+    return mnnvl_supported;
+}
+
 static void uct_cuda_ipc_md_close(uct_md_h md)
 {
     ucs_free(md);
@@ -436,9 +488,10 @@
     md->super.ops       = &md_ops;
     md->super.component = &uct_cuda_ipc_component.super;
-    md->enable_mnnvl    = ipc_config->enable_mnnvl;
     *md_p               = &md->super;
-
+    md->enable_mnnvl    = uct_cuda_ipc_md_init_fabric_info(
+                                  md, ipc_config->enable_mnnvl);
+
     return UCS_OK;
 }
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.h b/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
index da2a83d6d80..5aca1416ce0 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
@@ -42,7 +42,10 @@ typedef CUipcMemHandle uct_cuda_ipc_md_handle_t;
  */
 typedef struct uct_cuda_ipc_md {
     uct_md_t                 super;        /**< Domain info */
-    ucs_ternary_auto_value_t enable_mnnvl;
+    int                      enable_mnnvl; /**< MNNVL support is enabled */
+#if HAVE_NVML_FABRIC_INFO
+    nvmlGpuFabricInfoV_t     fabric_info;  /**< NVML GPU fabric info */
+#endif
 } uct_cuda_ipc_md_t;
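For reference, the fabric state consumed by uct_cuda_ipc_md_init_fabric_info() can also be inspected outside of UCX with the same NVML calls; a standalone probe sketch, assuming NVML headers recent enough to provide nvmlDeviceGetGpuFabricInfoV():

    #include <nvml.h>
    #include <stdio.h>

    int main(void)
    {
        nvmlDevice_t device;
        nvmlGpuFabricInfoV_t info = {.version = nvmlGpuFabricInfo_v2};

        if ((nvmlInit_v2() != NVML_SUCCESS) ||
            (nvmlDeviceGetHandleByIndex(0, &device) != NVML_SUCCESS) ||
            (nvmlDeviceGetGpuFabricInfoV(device, &info) != NVML_SUCCESS)) {
            return 1;
        }

        /* MNNVL is considered usable only when state is COMPLETED and status
         * is NVML_SUCCESS, the same conditions the MD init checks */
        printf("state %u status %u clique %u\n", info.state, info.status,
               info.cliqueId);
        nvmlShutdown();
        return 0;
    }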