From 7e2b0ecc8d4c4aac3270774fe66c1dde0164e649 Mon Sep 17 00:00:00 2001
From: Mikhail Brinskii
Date: Wed, 2 Oct 2024 14:24:31 +0300
Subject: [PATCH] UCT: Use fabric info for cuda-ipc reachability check

---
 config/m4/cuda.m4                      |   7 ++
 src/ucp/wireup/wireup.c                |  10 ++-
 src/uct/api/v2/uct_v2.h                |  10 ++-
 src/uct/cuda/base/cuda_md.c            |  24 ++++++
 src/uct/cuda/base/cuda_md.h            |   8 ++
 src/uct/cuda/cuda_ipc/cuda_ipc_iface.c | 100 ++++++++++++++++++-------
 src/uct/cuda/cuda_ipc/cuda_ipc_md.c    |  57 +++++++++++++-
 src/uct/cuda/cuda_ipc/cuda_ipc_md.h    |   5 +-
 8 files changed, 184 insertions(+), 37 deletions(-)

diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4
index a30548534e4..4163f898e19 100644
--- a/config/m4/cuda.m4
+++ b/config/m4/cuda.m4
@@ -66,6 +66,13 @@ AS_IF([test "x$cuda_checked" != "xyes"],
                        [AC_MSG_ERROR([libnvidia-ml not found. Install appropriate nvidia-driver package])])
                     cuda_happy="no"])])
 
+    # Check for nvmlDeviceGetGpuFabricInfoV
+    AC_CHECK_DECLS([nvmlDeviceGetGpuFabricInfoV],
+                   [AC_DEFINE([HAVE_NVML_FABRIC_INFO], 1, [Enable NVML GPU fabric info support])],
+                   [AC_MSG_NOTICE([nvmlDeviceGetGpuFabricInfoV function not found in libnvidia-ml. MNNVL support will be disabled.])],
+                   [[#include <nvml.h>]])
+
+
     # Check for cuda static library
     have_cuda_static="no"
     AS_IF([test "x$cuda_happy" = "xyes"],
diff --git a/src/ucp/wireup/wireup.c b/src/ucp/wireup/wireup.c
index 009feb46426..4a22c34c73f 100644
--- a/src/ucp/wireup/wireup.c
+++ b/src/ucp/wireup/wireup.c
@@ -1249,10 +1249,12 @@ int ucp_wireup_is_reachable(ucp_ep_h ep, unsigned ep_init_flags,
     ucp_context_h context      = ep->worker->context;
     ucp_worker_iface_t *wiface = ucp_worker_iface(ep->worker, rsc_index);
     uct_iface_is_reachable_params_t params = {
-        .field_mask  = UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR |
-                       UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR,
-        .device_addr = ae->dev_addr,
-        .iface_addr  = ae->iface_addr,
+        .field_mask         = UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR |
+                              UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR |
+                              UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH,
+        .device_addr        = ae->dev_addr,
+        .iface_addr         = ae->iface_addr,
+        .device_addr_length = ae->dev_addr_len
     };
 
     if (info_str != NULL) {
diff --git a/src/uct/api/v2/uct_v2.h b/src/uct/api/v2/uct_v2.h
index a6649903483..3c9c7707e2c 100644
--- a/src/uct/api/v2/uct_v2.h
+++ b/src/uct/api/v2/uct_v2.h
@@ -273,7 +273,8 @@ typedef enum {
     UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR         = UCS_BIT(1), /**< iface_addr field */
     UCT_IFACE_IS_REACHABLE_FIELD_INFO_STRING        = UCS_BIT(2), /**< info_string field */
     UCT_IFACE_IS_REACHABLE_FIELD_INFO_STRING_LENGTH = UCS_BIT(3), /**< info_string_length field */
-    UCT_IFACE_IS_REACHABLE_FIELD_SCOPE              = UCS_BIT(4)  /**< scope field */
+    UCT_IFACE_IS_REACHABLE_FIELD_SCOPE              = UCS_BIT(4), /**< scope field */
+    UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH = UCS_BIT(5)  /**< device_addr_length field */
 } uct_iface_is_reachable_field_mask_t;
 
 
@@ -611,6 +612,13 @@ typedef struct uct_iface_is_reachable_params {
 
     /**
      * Reachability scope.
      */
     uct_iface_reachability_scope_t scope;
+
+    /**
+     * Device address length. If not provided, the transport will assume a
+     * default minimal length according to the address buffer contents.
+     */
+    size_t device_addr_length;
+
 } uct_iface_is_reachable_params_t;
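A caller opts into the new field by setting UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH in field_mask, exactly as the wireup change above does with ae->dev_addr_len. A minimal caller-side sketch, assuming a hypothetical uct_iface_h iface and remote_* buffers already unpacked from the peer's address:

    /* Reachability query with an explicit device address length; the
     * remote_dev_addr / remote_iface_addr / remote_dev_addr_len variables
     * are assumed to hold the peer's unpacked addresses */
    uct_iface_is_reachable_params_t params = {
        .field_mask         = UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR |
                              UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR |
                              UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH,
        .device_addr        = remote_dev_addr,
        .iface_addr         = remote_iface_addr,
        .device_addr_length = remote_dev_addr_len
    };

    if (uct_iface_is_reachable_v2(iface, &params)) {
        /* the peer can be reached through this transport */
    }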
diff --git a/src/uct/cuda/base/cuda_md.c b/src/uct/cuda/base/cuda_md.c
index da8e7126af8..10192744445 100644
--- a/src/uct/cuda/base/cuda_md.c
+++ b/src/uct/cuda/base/cuda_md.c
@@ -101,6 +101,30 @@ uct_cuda_base_query_md_resources(uct_component_t *component,
                                        num_resources_p);
 }
 
+int uct_cuda_base_is_coherent(void)
+{
+    int coherent = 0;
+#if HAVE_CUDA_FABRIC
+    CUdevice cu_device;
+    ucs_status_t status;
+
+    status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cu_device, 0));
+    if (status != UCS_OK) {
+        return 0;
+    }
+
+    status = UCT_CUDADRV_FUNC_LOG_ERR(
+            cuDeviceGetAttribute(&coherent,
+                                 CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+                                 cu_device));
+    if (status != UCS_OK) {
+        return 0;
+    }
+
+#endif
+    return coherent;
+}
+
 UCS_STATIC_INIT {
     /* coverity[check_return] */
diff --git a/src/uct/cuda/base/cuda_md.h b/src/uct/cuda/base/cuda_md.h
index 23eb233796f..9c1d2660ab4 100644
--- a/src/uct/cuda/base/cuda_md.h
+++ b/src/uct/cuda/base/cuda_md.h
@@ -42,4 +42,12 @@ uct_cuda_base_query_devices(uct_md_h md,
  */
 ucs_status_t uct_cuda_base_check_device_name(const uct_iface_params_t *params);
 
+
+/**
+ * Check whether the platform provides coherent CPU-GPU memory access.
+ *
+ * @return 1 if coherent, or 0 otherwise.
+ */
+int uct_cuda_base_is_coherent(void);
+
 #endif
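The coherence test keys off CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, which is nonzero on platforms where the CPU and GPU share page tables (e.g., systems with NVLink chip-to-chip coherence). A standalone sketch of the same probe using the plain CUDA driver API, without the UCS error-checking wrappers:

    #include <cuda.h>
    #include <stdio.h>

    int main(void)
    {
        CUdevice dev;
        int coherent = 0;

        /* same probe as uct_cuda_base_is_coherent(), on device 0 */
        if ((cuInit(0) != CUDA_SUCCESS) || (cuDeviceGet(&dev, 0) != CUDA_SUCCESS)) {
            return 1;
        }

        cuDeviceGetAttribute(&coherent,
                             CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
                             dev);
        printf("coherent: %d\n", coherent);
        return 0;
    }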
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
index d18a8156006..3f3ddf52248 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
@@ -20,6 +20,18 @@
 #include 
 #include 
 
+
+typedef struct {
+    uint64_t system_uuid;
+#if HAVE_NVML_FABRIC_INFO
+    struct {
+        uint32_t clique_id;
+        uint8_t  cluster_uuid[NVML_GPU_FABRIC_UUID_LEN];
+    } mnnvl_addr;
+#endif
+} UCS_S_PACKED uct_cuda_ipc_device_addr_t;
+
+
 static ucs_config_field_t uct_cuda_ipc_iface_config_table[] = {
 
     {"", "", NULL,
@@ -63,7 +75,22 @@ static void UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_ipc_iface_t)(uct_iface_t*);
 ucs_status_t uct_cuda_ipc_iface_get_device_address(uct_iface_t *tl_iface,
                                                    uct_device_addr_t *addr)
 {
-    *(uint64_t*)addr = ucs_get_system_id();
+    uct_cuda_ipc_device_addr_t *dev_addr = (uct_cuda_ipc_device_addr_t*)addr;
+#if HAVE_NVML_FABRIC_INFO
+    uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface,
+                                                 uct_cuda_ipc_iface_t);
+    uct_cuda_ipc_md_t *md       = ucs_derived_of(iface->super.super.md,
+                                                 uct_cuda_ipc_md_t);
+
+    if (md->enable_mnnvl) {
+        memcpy(dev_addr->mnnvl_addr.cluster_uuid, &md->fabric_info.clusterUuid,
+               sizeof(dev_addr->mnnvl_addr.cluster_uuid));
+        dev_addr->mnnvl_addr.clique_id = md->fabric_info.cliqueId;
+    }
+#endif
+
+    dev_addr->system_uuid = ucs_get_system_id();
+
     return UCS_OK;
 }
 
@@ -74,56 +101,69 @@ static ucs_status_t uct_cuda_ipc_iface_get_address(uct_iface_h tl_iface,
     return UCS_OK;
 }
 
-static int uct_cuda_ipc_iface_is_mnnvl_supported(uct_cuda_ipc_md_t *md)
+static int
+uct_cuda_ipc_iface_mnnvl_reachable(uct_cuda_ipc_md_t *md,
+                                   const uct_cuda_ipc_device_addr_t *dev_addr,
+                                   size_t dev_addr_len,
+                                   const uct_iface_is_reachable_params_t *params)
 {
-#if HAVE_CUDA_FABRIC
-    CUdevice cu_device;
-    int coherent;
-    ucs_status_t status;
-
-    status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cu_device, 0));
-    if (status != UCS_OK) {
+#if HAVE_NVML_FABRIC_INFO
+    if (memcmp(&dev_addr->mnnvl_addr.cluster_uuid,
+               &md->fabric_info.clusterUuid,
+               sizeof(dev_addr->mnnvl_addr.cluster_uuid))) {
+        uct_iface_fill_info_str_buf(params, "cluster uuid doesn't match");
         return 0;
     }
 
-    status = UCT_CUDADRV_FUNC_LOG_ERR(
-            cuDeviceGetAttribute(&coherent,
-                                 CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
-                                 cu_device));
-    if (status != UCS_OK) {
+    if (dev_addr->mnnvl_addr.clique_id != md->fabric_info.cliqueId) {
+        uct_iface_fill_info_str_buf(params, "clique id doesn't match");
         return 0;
     }
 
-    return coherent && (md->enable_mnnvl != UCS_NO);
-#else
-    return 0;
+    return 1;
 #endif
+
+    return 0;
 }
 
 static int
 uct_cuda_ipc_iface_is_reachable_v2(const uct_iface_h tl_iface,
                                    const uct_iface_is_reachable_params_t *params)
 {
-    uct_base_iface_t *base_iface = ucs_derived_of(tl_iface, uct_base_iface_t);
-    uct_cuda_ipc_md_t *md = ucs_derived_of(base_iface->md, uct_cuda_ipc_md_t);
+    uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface,
+                                                 uct_cuda_ipc_iface_t);
+    uct_cuda_ipc_md_t *md       = ucs_derived_of(iface->super.super.md,
+                                                 uct_cuda_ipc_md_t);
+    const uct_cuda_ipc_device_addr_t *dev_addr;
+    size_t dev_addr_len;
+    int same_uuid;
 
     if (!uct_iface_is_reachable_params_addrs_valid(params)) {
         return 0;
     }
 
-    if (getpid() == *(pid_t*)params->iface_addr) {
+    dev_addr_len = UCS_PARAM_VALUE(UCT_IFACE_IS_REACHABLE_FIELD, params,
+                                   device_addr_length, DEVICE_ADDR_LENGTH,
+                                   sizeof(uint64_t));
+    dev_addr     = (const uct_cuda_ipc_device_addr_t*)UCS_PARAM_VALUE(
+                           UCT_IFACE_IS_REACHABLE_FIELD, params, device_addr,
+                           DEVICE_ADDR, NULL);
+    same_uuid    = (ucs_get_system_id() == dev_addr->system_uuid);
+
+    if ((getpid() == *(pid_t*)params->iface_addr) && same_uuid) {
         uct_iface_fill_info_str_buf(params, "same process");
         return 0;
     }
 
-    /* Either multi-node NVLINK should be supported or iface has to be on the
-     * same node for cuda-ipc to be reachable */
-    if ((ucs_get_system_id() != *((const uint64_t*)params->device_addr)) &&
-        !uct_cuda_ipc_iface_is_mnnvl_supported(md)) {
+    if (md->enable_mnnvl && (dev_addr_len != sizeof(uint64_t))) {
+        if (!uct_cuda_ipc_iface_mnnvl_reachable(md, dev_addr, dev_addr_len,
+                                                params)) {
+            return 0;
+        }
+    } else if (!same_uuid) {
         uct_iface_fill_info_str_buf(params,
                                     "different system id %"PRIx64" vs %"PRIx64"",
-                                    ucs_get_system_id(),
-                                    *((const uint64_t*)params->device_addr));
+                                    ucs_get_system_id(), dev_addr->system_uuid);
         return 0;
     }
 
@@ -245,7 +285,9 @@ static ucs_status_t uct_cuda_ipc_iface_query(uct_iface_h tl_iface,
     uct_base_iface_query(&iface->super.super, iface_attr);
 
     iface_attr->iface_addr_len  = sizeof(pid_t);
-    iface_attr->device_addr_len = sizeof(uint64_t);
+    iface_attr->device_addr_len = (md->enable_mnnvl) ?
+                                  sizeof(uct_cuda_ipc_device_addr_t) :
+                                  sizeof(uint64_t);
     iface_attr->ep_addr_len     = 0;
     iface_attr->max_conn_priv   = 0;
     iface_attr->cap.flags       = UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE |
@@ -253,7 +295,7 @@
                                   UCT_IFACE_FLAG_PENDING |
                                   UCT_IFACE_FLAG_GET_ZCOPY |
                                   UCT_IFACE_FLAG_PUT_ZCOPY;
-    if (uct_cuda_ipc_iface_is_mnnvl_supported(md)) {
+    if (md->enable_mnnvl) {
         iface_attr->cap.flags |= UCT_IFACE_FLAG_INTER_NODE;
     }
 
@@ -614,7 +656,7 @@ uct_cuda_ipc_query_devices(
     uct_device_type_t dev_type = UCT_DEVICE_TYPE_SHM;
     uct_cuda_ipc_md_t *md      = ucs_derived_of(uct_md, uct_cuda_ipc_md_t);
 
-    if (uct_cuda_ipc_iface_is_mnnvl_supported(md)) {
+    if (md->enable_mnnvl) {
         dev_type = UCT_DEVICE_TYPE_NET;
     }
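The device address is now variable-length: a legacy peer advertises only the 8-byte system uuid, while an MNNVL-enabled peer appends the clique id and cluster uuid, and the receiver tells the two apart purely by dev_addr_len, which is what the dev_addr_len != sizeof(uint64_t) test above relies on. A layout sanity check one could place next to the struct; this is a sketch, assuming NVML_GPU_FABRIC_UUID_LEN is 16 as in current NVML headers, so the packed address grows from 8 to 28 bytes:

    /* UCS_S_PACKED guarantees no padding, so the wire size is the plain sum:
     * 8 (system_uuid) + 4 (clique_id) + 16 (cluster_uuid) = 28 bytes */
    UCS_STATIC_ASSERT(sizeof(uct_cuda_ipc_device_addr_t) ==
                      sizeof(uint64_t) + sizeof(uint32_t) +
                      NVML_GPU_FABRIC_UUID_LEN);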
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c
index f7edbc99d70..511cace1708 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c
@@ -407,6 +407,58 @@ uct_cuda_ipc_mem_dereg(uct_md_h md, const uct_md_mem_dereg_params_t *params)
     return UCS_OK;
 }
 
+static int
+uct_cuda_ipc_md_init_fabric_info(uct_cuda_ipc_md_t *md,
+                                 ucs_ternary_auto_value_t mnnvl_enable)
+{
+    int mnnvl_supported = 0;
+#if HAVE_NVML_FABRIC_INFO
+    nvmlDevice_t device;
+    ucs_status_t status;
+
+    if (!uct_cuda_base_is_coherent() || (mnnvl_enable == UCS_NO)) {
+        goto err;
+    }
+
+    status = UCT_NVML_FUNC(nvmlInit_v2(), UCS_LOG_LEVEL_DIAG);
+    if (status != UCS_OK) {
+        goto err;
+    }
+
+    status = UCT_NVML_FUNC_LOG_ERR(nvmlDeviceGetHandleByIndex(0, &device));
+    if (status != UCS_OK) {
+        goto err_sd;
+    }
+
+    md->fabric_info.version = nvmlGpuFabricInfo_v2;
+    status = UCT_NVML_FUNC_LOG_ERR(
+            nvmlDeviceGetGpuFabricInfoV(device, &md->fabric_info));
+    if (status != UCS_OK) {
+        goto err_sd;
+    }
+
+    ucs_debug("Fabric_info: clique %u healthmask %u state %u status %u",
+              md->fabric_info.cliqueId, md->fabric_info.healthMask,
+              md->fabric_info.state, md->fabric_info.status);
+
+    if ((md->fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED) ||
+        (md->fabric_info.status != NVML_SUCCESS)) {
+        goto err_sd;
+    }
+
+    mnnvl_supported = 1;
+
+err_sd:
+    UCT_NVML_FUNC_LOG_ERR(nvmlShutdown());
+err:
+#endif
+    if ((mnnvl_enable == UCS_YES) && !mnnvl_supported) {
+        ucs_warn("multi-node NVLINK support is requested but not supported");
+    }
+
+    return mnnvl_supported;
+}
+
 static void uct_cuda_ipc_md_close(uct_md_h md)
 {
     ucs_free(md);
@@ -436,9 +488,10 @@
     md->super.ops       = &md_ops;
     md->super.component = &uct_cuda_ipc_component.super;
-    md->enable_mnnvl    = ipc_config->enable_mnnvl;
     *md_p               = &md->super;
-
+    md->enable_mnnvl    = uct_cuda_ipc_md_init_fabric_info(
+                                  md, ipc_config->enable_mnnvl);
+
     return UCS_OK;
 }
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.h b/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
index da2a83d6d80..5aca1416ce0 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
@@ -42,7 +42,10 @@ typedef CUipcMemHandle uct_cuda_ipc_md_handle_t;
  */
 typedef struct uct_cuda_ipc_md {
     uct_md_t                 super;        /**< Domain info */
-    ucs_ternary_auto_value_t enable_mnnvl;
+    int                      enable_mnnvl; /**< MNNVL support is enabled */
+#if HAVE_NVML_FABRIC_INFO
+    nvmlGpuFabricInfoV_t     fabric_info;  /**< NVML GPU fabric info */
+#endif
 } uct_cuda_ipc_md_t;
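For reference, the fabric state consumed by uct_cuda_ipc_md_init_fabric_info() can also be inspected outside of UCX with the same NVML calls; a standalone probe sketch, assuming NVML headers recent enough to provide nvmlDeviceGetGpuFabricInfoV():

    #include <nvml.h>
    #include <stdio.h>

    int main(void)
    {
        nvmlDevice_t device;
        nvmlGpuFabricInfoV_t info = {.version = nvmlGpuFabricInfo_v2};

        if ((nvmlInit_v2() != NVML_SUCCESS) ||
            (nvmlDeviceGetHandleByIndex(0, &device) != NVML_SUCCESS) ||
            (nvmlDeviceGetGpuFabricInfoV(device, &info) != NVML_SUCCESS)) {
            return 1;
        }

        /* MNNVL is considered usable only when state is COMPLETED and status
         * is NVML_SUCCESS, the same conditions the MD init checks */
        printf("state %u status %u clique %u\n", info.state, info.status,
               info.cliqueId);
        nvmlShutdown();
        return 0;
    }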