UCT/CUDA: Update cuda_copy perf estimates for Grace-Hopper #10155

Open. Wants to merge 1 commit into base: master.
1 change: 1 addition & 0 deletions config/ucx.conf
@@ -6,6 +6,7 @@ UCX_IB_MLX5_DEVX_OBJECTS=
 UCX_GDR_COPY_BW=0MBs,get_dedicated:30GBs,put_dedicated:30GBs
 UCX_GDR_COPY_LAT=30e-9
 UCX_DISTANCE_BW=auto,sys:16500MBs
+UCX_CUDA_COPY_BW=h2d:400GBs,d2h:300GBs,d2d:400GBs,other:10000MBs

 [Fujitsu ARM]
 CPU vendor=Fujitsu ARM
36 changes: 25 additions & 11 deletions src/uct/cuda/cuda_copy/cuda_copy_iface.c
@@ -37,14 +37,24 @@ static ucs_config_field_t uct_cuda_copy_iface_config_table[] = {
      "Max number of cuda events. -1 is infinite",
      ucs_offsetof(uct_cuda_copy_iface_config_t, max_cuda_events), UCS_CONFIG_TYPE_UINT},

-    {"BW", "10000MBs",
-     "Effective memory bandwidth",
-     ucs_offsetof(uct_cuda_copy_iface_config_t, bandwidth), UCS_CONFIG_TYPE_BW},
+    /* TODO: 1. Add separate keys for shared and dedicated bandwidth
+             2. Remove the "other" key (use pref_loc for managed memory) */
+    {"BW", "10000MBs,h2d:8300MBs,d2h:11660MBs,d2d:320GBs",
+     "Effective memory bandwidth", 0,
+     UCS_CONFIG_TYPE_KEY_VALUE(UCS_CONFIG_TYPE_BW,
+         {"h2d", "host to device bandwidth",
+          ucs_offsetof(uct_cuda_copy_iface_config_t, bw.h2d)},
+         {"d2h", "device to host bandwidth",
+          ucs_offsetof(uct_cuda_copy_iface_config_t, bw.d2h)},
+         {"d2d", "device to device bandwidth",
+          ucs_offsetof(uct_cuda_copy_iface_config_t, bw.d2d)},
+         {"other", "bandwidth for any other memory type combination",
+          ucs_offsetof(uct_cuda_copy_iface_config_t, bw.other)},
+         {NULL})},

     {NULL}
 };


 /* Forward declaration for the delete function */
 static void UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_copy_iface_t)(uct_iface_t*);

@@ -134,7 +144,7 @@ static ucs_status_t uct_cuda_copy_iface_query(uct_iface_h tl_iface,

     iface_attr->latency = UCT_CUDA_COPY_IFACE_LATENCY;
     iface_attr->bandwidth.dedicated = 0;
-    iface_attr->bandwidth.shared = iface->config.bandwidth;
+    iface_attr->bandwidth.shared = iface->config.bw.other;
     iface_attr->overhead = UCT_CUDA_COPY_IFACE_OVERHEAD;
     iface_attr->priority = 0;

@@ -407,16 +417,17 @@ uct_cuda_copy_estimate_perf(uct_iface_h tl_iface, uct_perf_attr_t *perf_attr)
     perf_attr->bandwidth.dedicated = 0;
     if ((src_mem_type == UCS_MEMORY_TYPE_HOST) &&
         (dst_mem_type == UCS_MEMORY_TYPE_CUDA)) {
-        perf_attr->bandwidth.shared = (zcopy ? 8300.0 : 7900.0) * UCS_MBYTE;
+        perf_attr->bandwidth.shared = zcopy ? iface->config.bw.h2d :
+                                              iface->config.bw.h2d * 0.95;
     } else if ((src_mem_type == UCS_MEMORY_TYPE_CUDA) &&
                (dst_mem_type == UCS_MEMORY_TYPE_HOST)) {
-        perf_attr->bandwidth.shared = (zcopy ? 11660.0 : 9320.0) *
-                                      UCS_MBYTE;
+        perf_attr->bandwidth.shared = zcopy ? iface->config.bw.d2h :
+                                              iface->config.bw.d2h * 0.95;
Comment on lines +420 to +425

Contributor:
Why is the bcopy bandwidth slower than the zcopy one? By the way, 11660 * 0.95 is not equal to 9320. Maybe we need to introduce two different env variables, such as BCOPY_BW and ZCOPY_BW, to control these values accurately. Or, if we are OK with changing performance in the common case, maybe it is better not to distinguish bcopy/zcopy perf at all and to set one value for both cases.

Collaborator (Author):
It's actually not zcopy vs. bcopy; it's zcopy vs. short. Unlike zcopy, put/get short operations invoke cuStreamSynchronize per operation. Therefore, we want to advertise a slightly lower bandwidth for short than for zcopy operations in cuda_copy (see the cost-model sketch after this hunk).
I'm not sure what 9320 represents. Why do you want it to be equal to 9320?

Contributor:
Thanks for the explanation. I am wondering whether we need to make this distinction in this way, because any change to the performance estimation without proper performance testing can lead to unforeseen degradation in some cases. If we only want to tune performance for GH systems, I would prefer to leave performance on other platforms untouched. And if we are OK with changing performance on all platforms in this PR, does this 5% difference really matter, or can we follow the KISS principle and set the same value for both the zcopy and short cases?

@brminich @yosefe WDYT?

     } else if ((src_mem_type == UCS_MEMORY_TYPE_CUDA) &&
                (dst_mem_type == UCS_MEMORY_TYPE_CUDA)) {
-        perf_attr->bandwidth.shared = 320.0 * UCS_GBYTE;
+        perf_attr->bandwidth.shared = iface->config.bw.d2d;
     } else {
-        perf_attr->bandwidth.shared = iface->config.bandwidth;
+        perf_attr->bandwidth.shared = iface->config.bw.other;
     }
 }

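The reply above explains that short put/get operations pay a cuStreamSynchronize per operation, which is why the PR advertises a slightly lower bandwidth for them than for zcopy. The following is a small, self-contained illustration of that cost model, not UCX code; the synchronization cost and message size are hypothetical placeholders chosen only to show the shape of the effect:

#include <stdio.h>

/* Illustrative model: effective bandwidth when every operation pays a fixed
 * synchronization cost on top of the pure copy time. */
static double effective_bw(double peak_bw, double per_op_sync_sec, double msg_size)
{
    double copy_time = msg_size / peak_bw;

    return msg_size / (copy_time + per_op_sync_sec);
}

int main(void)
{
    const double d2h_zcopy_bw = 11660e6;    /* d2h default from this PR, bytes/s */
    const double sync_cost    = 2e-6;       /* hypothetical per-op cuStreamSynchronize cost, s */
    const double msg_size     = 256 * 1024; /* hypothetical message size, bytes */

    printf("zcopy estimate: %.0f MB/s\n", d2h_zcopy_bw / 1e6);
    printf("short estimate: %.0f MB/s (flat 0.95 derate: %.0f MB/s)\n",
           effective_bw(d2h_zcopy_bw, sync_cost, msg_size) / 1e6,
           d2h_zcopy_bw * 0.95 / 1e6);
    return 0;
}

With these placeholder numbers the model lands within a few percent of the flat 5% derate the PR applies, which is the intuition behind scaling the configured zcopy bandwidth by 0.95 instead of keeping separate hard-coded constants.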
@@ -491,7 +502,10 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h work
     self->id = ucs_generate_uuid((uintptr_t)self);
     self->config.max_poll = config->max_poll;
     self->config.max_cuda_events = config->max_cuda_events;
-    self->config.bandwidth = config->bandwidth;
+    self->config.bw.h2d = config->bw.h2d;
+    self->config.bw.d2h = config->bw.d2h;
+    self->config.bw.d2d = config->bw.d2d;
+    self->config.bw.other = config->bw.other;
     UCS_STATIC_BITMAP_RESET_ALL(&self->streams_to_sync);

     ucs_mpool_params_reset(&mp_params);
14 changes: 12 additions & 2 deletions src/uct/cuda/cuda_copy/cuda_copy_iface.h
@@ -69,7 +69,12 @@ typedef struct uct_cuda_copy_iface {
     struct {
         unsigned max_poll;
         unsigned max_cuda_events;
-        double bandwidth;
+        struct {
+            double h2d;
+            double d2h;
+            double d2d;
+            double other;
+        } bw;
     } config;
     /* handler to support arm/wakeup feature */
     struct {
@@ -87,7 +92,12 @@ typedef struct uct_cuda_copy_iface_config {
     uct_iface_config_t super;
     unsigned max_poll;
     unsigned max_cuda_events;
-    double bandwidth;
+    struct {
Contributor (@Akshay-Venkatesh, Sep 26, 2024):
@SeyedMir why not use bw[UCS_MEMORY_TYPE_LAST][UCS_MEMORY_TYPE_LAST] and avoid explicit fields for each direction? This way we don't have to introduce the "other" field, and we can populate the bandwidths by referring to the specific source-destination combination. For example, replace:

         {"h2d", "host to device bandwidth",
          ucs_offsetof(uct_cuda_copy_iface_config_t, bw.h2d)},
          ...

with

         {"h2d", "host to device bandwidth",
          ucs_offsetof(uct_cuda_copy_iface_config_t, bw[UCS_MEMORY_TYPE_UNKNOWN][UCS_MEMORY_TYPE_CUDA])},

(A sketch of this alternative layout follows the hunk below.)

Collaborator (Author):
I actually thought about doing it that way initially. But then I figured the bw matrix would have entries for memory types that are completely irrelevant to CUDA, and there would be many more entries than the four in this struct. Having said that, I'm not strongly against using the matrix.

+        double h2d;
+        double d2h;
+        double d2d;
+        double other;
+    } bw;
 } uct_cuda_copy_iface_config_t;


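For reference, here is a minimal sketch of the matrix-based layout suggested in the review thread above, indexing bandwidth by source and destination memory type instead of using named fields. It is not part of this PR; the _alt names are hypothetical, and it assumes the same headers that cuda_copy_iface.h already includes for uct_iface_config_t and the UCS_MEMORY_TYPE_* enumeration:

/* Hypothetical alternative to the named-field layout above: a bandwidth
 * matrix indexed by (source, destination) memory type. As the reply notes,
 * most entries would stay unused for cuda_copy. */
typedef struct uct_cuda_copy_iface_config_alt {
    uct_iface_config_t super;
    unsigned           max_poll;
    unsigned           max_cuda_events;
    /* bw[src][dst], e.g. bw[UCS_MEMORY_TYPE_HOST][UCS_MEMORY_TYPE_CUDA] for h2d */
    double             bw[UCS_MEMORY_TYPE_LAST][UCS_MEMORY_TYPE_LAST];
} uct_cuda_copy_iface_config_alt_t;

The config table entries would then point each "h2d"/"d2h"/"d2d" key at the corresponding matrix element via ucs_offsetof, as in the reviewer's example, and the "other" key could be dropped.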