From be116611c7a7206bf056d7419313df3a3d137616 Mon Sep 17 00:00:00 2001 From: Michael Heinz Date: Wed, 13 May 2020 09:31:02 -0400 Subject: [PATCH] Updates the PSM2 library to match release 11.2.173 Fixes include: Improved handling of the CUDA memory handle cache. Improved handling of the case where the libpsm2 library is opened multiple times by a single process. (This can happen when, for example, multiple Open MPI transport layers all access libpsm2.) Signed-off-by: Michael Heinz --- COMMIT | 2 +- Makefile | 2 +- README | 7 + include/rbtree.c | 55 +++++- psm.c | 36 ++-- psm2.h | 1 + psm2_hal.c | 12 +- psm2_hal.h | 4 +- psm_user.h | 30 ++++ ptl_am/am_cuda_memhandle_cache.c | 281 +++++++++++++++++++++++-------- ptl_am/am_cuda_memhandle_cache.h | 58 +------ ptl_am/am_reqrep_shmem.c | 5 +- rpm_release_extension | 2 +- 13 files changed, 344 insertions(+), 151 deletions(-) diff --git a/COMMIT b/COMMIT index c8041e5..a0dafc7 100644 --- a/COMMIT +++ b/COMMIT @@ -1 +1 @@ -bc17e0522f6b64e8e054d3cfea4506ac155724c2 \ No newline at end of file +61553edd6b9fefce4a246b4701abc213e7c36b2b \ No newline at end of file diff --git a/Makefile b/Makefile index 21ac7b9..5a31d64 100644 --- a/Makefile +++ b/Makefile @@ -483,7 +483,7 @@ dist: distclean PRUNE_LIST=""; \ for pd in ".git" "cscope*" "$(shell realpath --relative-to=${top_srcdir} ${OUTDIR})" \ "*.orig" "*~" "#*" ".gitignore" "doc" "libcm" "psm.supp" "test" "psm_hal_MOCK" \ - "tools" "artifacts" "*.rej.patch"; do \ + "psm_test" "tools" "artifacts" "*.rej.patch"; do \ PRUNE_LIST="$$PRUNE_LIST -name $$pd -prune -o"; \ done; \ for hid in psm_hal_* ; do \ diff --git a/README b/README index 7990555..2db5fc4 100644 --- a/README +++ b/README @@ -220,6 +220,13 @@ Note: It is also possible to use rpm command to install rpm's, but it is recomme that one use yum/dnf as rpm tool has issues with name changes and obsoletes tags. yum or dnf should be better able to resolve dependency issues. 
+TESTING +======= + +Please see the subdirectory psm_test, starting with the file: psm_test/README for +code and instructions on testing the psm2 library. Additionally, the +directory: psm_test/samples contains sample code to test. + RELATED SOFTWARE TO PSM2 ======================== diff --git a/include/rbtree.c b/include/rbtree.c index 9d6930d..b79f135 100644 --- a/include/rbtree.c +++ b/include/rbtree.c @@ -85,13 +85,22 @@ #include /* for memset declaration */ -#if !defined ( RBTREE_GET_LEFTMOST ) || \ +// RBTREE_CMP should be a comparator, i.e. RBTREE_CMP(a, b) should evaluate to +// -1, 0, or 1 depending on if a < b, a == b, or a > b, respectively. +#ifdef RBTREE_CMP + +#if defined(RBTREE_GET_LEFTMOST) || defined(RBTREE_GET_RIGHTMOST) +#error Cannot define both RBTREE_CMP and RBTREE_GET_(LEFT|RIGHT)MOST +#endif + +#elif !defined ( RBTREE_GET_LEFTMOST ) || \ ! defined ( RBTREE_GET_RIGHTMOST ) || \ ! defined ( RBTREE_MAP_COUNT ) || \ ! defined ( RBTREE_ASSERT ) #error "You must define RBTREE_GET_LEFTMOST and RBTREE_GET_RIGHTMOST and \ RBTREE_MAP_COUNT and RBTREE_ASSERT before including rbtree.c" -#endif + +#endif /* RBTREE_CMP */ #define IN /* nothing */ @@ -117,13 +126,24 @@ static void ips_cl_qmap_remove_item( static cl_map_item_t* ips_cl_qmap_successor( IN cl_qmap_t* const p_map, IN const cl_map_item_t* p_item); + + +#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR static cl_map_item_t* ips_cl_qmap_predecessor( IN cl_qmap_t* const p_map, IN const cl_map_item_t* p_item); +#endif + +#if defined(RBTREE_GET_LEFTMOST) static cl_map_item_t* ips_cl_qmap_search( IN cl_qmap_t* const p_map, IN unsigned long start, IN unsigned long end); +#else +static cl_map_item_t* ips_cl_qmap_searchv( + cl_qmap_t* const p_map, + const RBTREE_MI_PL *key); +#endif /* * Get the root. @@ -380,7 +400,11 @@ ips_cl_qmap_insert_item( p_insert_at = p_comp_item; /* Traverse the tree until the correct insertion point is found. 
*/ +#ifdef RBTREE_GET_LEFTMOST if( RBTREE_GET_LEFTMOST(&p_item->payload) < RBTREE_GET_LEFTMOST(&p_insert_at->payload) ) +#else + if(RBTREE_CMP(&p_item->payload, &p_insert_at->payload) < 0) +#endif { p_comp_item = p_insert_at->p_left; compare_res = 1; @@ -604,6 +628,11 @@ ips_cl_qmap_successor( } } +// When includer defines RBTREE_CMP, ips_cl_qmap_search() is not emitted. +// When this happens, ips_cl_qmap_predecessor() may not be called. +// Combined with -Werror -Wunused-function, libpsm2 fails to build. +// So provide macro to control emitting this function +#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR static cl_map_item_t * ips_cl_qmap_predecessor( IN cl_qmap_t* const p_map, @@ -627,7 +656,9 @@ ips_cl_qmap_predecessor( return p_tmp; } } +#endif /* RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR */ +#if defined(RBTREE_GET_LEFTMOST) /* * return the first node with buffer overlapping or zero. */ @@ -690,3 +721,23 @@ ips_cl_qmap_search(cl_qmap_t * const p_map, return p_item; } +#else /* defined(...LEFTMOST) || defined(...RIGHTMOST) */ +static cl_map_item_t * +ips_cl_qmap_searchv(cl_qmap_t * const p_map, const RBTREE_MI_PL *key) +{ + RBTREE_ASSERT( p_map ); + cl_map_item_t *p_item = __cl_map_root(p_map); + + while (p_item != p_map->nil_item) { + if (RBTREE_CMP(key, &p_item->payload) > 0) { + p_item = p_item->p_right; + } else if (RBTREE_CMP(key, &p_item->payload) < 0) { + p_item = p_item->p_left; + } else { + break; + } + } + + return p_item; +} +#endif /* defined(...LEFTMOST) || defined(...RIGHTMOST) */ diff --git a/psm.c b/psm.c index 3aec403..a4a47be 100644 --- a/psm.c +++ b/psm.c @@ -65,11 +65,14 @@ static int psmi_verno = PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR); static int psmi_verno_client_val; int psmi_epid_ver; +// Special psmi_refcount values #define PSMI_NOT_INITIALIZED 0 -#define PSMI_INITIALIZED 1 -#define PSMI_FINALIZED -1 /* Prevent the user from calling psm2_init - * once psm_finalize has been called. 
*/ -static int psmi_isinit = PSMI_NOT_INITIALIZED; +#define PSMI_FINALIZED -1 + +// PSM2 doesn't support transitioning out of the PSMI_FINALIZED state +// once psmi_refcount is set to PSMI_FINALIZED, any further attempts to change +// psmi_refcount should be treated as an error +static int psmi_refcount = PSMI_NOT_INITIALIZED; /* Global lock used for endpoint creation and destroy * (in functions psm2_ep_open and psm2_ep_close) and also @@ -104,9 +107,8 @@ uint32_t gdr_copy_threshold_recv; * It is supposed to be filled with logical OR * on conditional compilation basis * along with future features/capabilities. - * At the very beginning we start with Multi EPs. */ -uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP; +uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP | PSM2_LIB_REFCOUNT_CAP; int psmi_verno_client() { @@ -130,7 +132,7 @@ int psmi_verno_isinteroperable(uint16_t verno) int MOCKABLE(psmi_isinitialized)() { - return (psmi_isinit == PSMI_INITIALIZED); + return (psmi_refcount > 0); } MOCK_DEF_EPILOGUE(psmi_isinitialized); @@ -356,10 +358,12 @@ psm2_error_t __psm2_init(int *major, int *minor) GENERIC_PERF_SET_SLOT_NAME(PSM_TX_SPEEDPATH_CTR, "TX"); GENERIC_PERF_SET_SLOT_NAME(PSM_RX_SPEEDPATH_CTR, "RX"); - if (psmi_isinit == PSMI_INITIALIZED) + if (psmi_refcount > 0) { + psmi_refcount++; goto update; + } - if (psmi_isinit == PSMI_FINALIZED) { + if (psmi_refcount == PSMI_FINALIZED) { err = PSM2_IS_FINALIZED; goto fail; } @@ -435,7 +439,7 @@ psm2_error_t __psm2_init(int *major, int *minor) ((id.eax & CPUID_EXMODEL_MASK) >> 12); } - psmi_isinit = PSMI_INITIALIZED; + psmi_refcount++; /* hfi_debug lives in libhfi.so */ psmi_getenv("PSM2_TRACEMASK", "Mask flags for tracing", @@ -520,7 +524,6 @@ psm2_error_t __psm2_init(int *major, int *minor) #endif update: - if (getenv("PSM2_IDENTIFY")) { Dl_info info_psm; char ofed_delta[100] = ""; @@ -557,6 +560,8 @@ psm2_error_t __psm2_init(int *major, int *minor) *major = (int)psmi_verno_major; *minor = 
(int)psmi_verno_minor; fail: + _HFI_DBG("psmi_refcount=%d,err=%u\n", psmi_refcount, err); + PSM2_LOG_MSG("leaving"); return err; } @@ -779,7 +784,14 @@ psm2_error_t __psm2_finalize(void) PSM2_LOG_MSG("entering"); + _HFI_DBG("psmi_refcount=%d\n", psmi_refcount); PSMI_ERR_UNLESS_INITIALIZED(NULL); + psmi_assert(psmi_refcount > 0); + psmi_refcount--; + + if (psmi_refcount > 0) { + return PSM2_OK; + } /* When PSM_PERF is enabled, the following line causes the instruction cycles gathered in the current run to be dumped @@ -856,7 +868,7 @@ psm2_error_t __psm2_finalize(void) } #endif - psmi_isinit = PSMI_FINALIZED; + psmi_refcount = PSMI_FINALIZED; PSM2_LOG_MSG("leaving"); psmi_log_fini(); diff --git a/psm2.h b/psm2.h index f665970..84f59bb 100644 --- a/psm2.h +++ b/psm2.h @@ -488,6 +488,7 @@ psm2_error_t psm2_init(int *api_verno_major, int *api_verno_minor); * consecutive bits : 0x2, 0x4 ... and so on. */ #define PSM2_MULTI_EP_CAP 0x1 /* Multiple Endpoints capability */ +#define PSM2_LIB_REFCOUNT_CAP 0x2 /* Library finalization is managed with reference count */ /** @brief PSM2 capabilities provider * diff --git a/psm2_hal.c b/psm2_hal.c index 6ba281a..b4b9d9a 100644 --- a/psm2_hal.c +++ b/psm2_hal.c @@ -326,14 +326,10 @@ static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void) p->params.num_ports = nports; p->params.default_pkey = dflt_pkey; p->params.sw_status |= valid_flags; - p->params.unit_active = (uint8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, - sizeof(uint8_t)); - p->params.unit_active_valid = (uint8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, - sizeof(uint8_t)); - p->params.port_active = (uint8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, - sizeof(uint8_t)); - p->params.port_active_valid = (uint8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, - sizeof(uint8_t)); + p->params.unit_active = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(int8_t)); + p->params.unit_active_valid = (int8_t *) 
psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(int8_t)); + p->params.port_active = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, sizeof(int8_t)); + p->params.port_active_valid = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, sizeof(int8_t)); p->params.num_contexts = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(uint16_t)); p->params.num_contexts_valid = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, diff --git a/psm2_hal.h b/psm2_hal.h index 02d62a2..1bec596 100644 --- a/psm2_hal.h +++ b/psm2_hal.h @@ -183,8 +183,8 @@ typedef struct _psmi_hal_params uint16_t num_units; uint16_t num_ports; uint16_t default_pkey; - uint8_t *unit_active,*unit_active_valid; - uint8_t *port_active,*port_active_valid; + int8_t *unit_active,*unit_active_valid; + int8_t *port_active,*port_active_valid; uint16_t *num_contexts,*num_contexts_valid; uint16_t *num_free_contexts,*num_free_contexts_valid; } psmi_hal_params_t; diff --git a/psm_user.h b/psm_user.h index 157bc8d..e412ff4 100644 --- a/psm_user.h +++ b/psm_user.h @@ -362,6 +362,36 @@ CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); } \ } while (0) +/** + * Similar to PSMI_CUDA_CALL() except does not error out + * if func(args) returns CUDA_SUCCESS or except_err + * + * Invoker must provide 'CUresult cudaerr' in invoked scope + * so invoker can inspect whether cudaerr == CUDA_SUCCESS or + * cudaerr == except_err after expanded code is executed. + */ +#define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) 
do { \ + cudaerr = psmi_##func(args); \ + if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) { \ + if (ctxt == NULL) \ + _HFI_ERROR( \ + "Check if CUDA is initialized" \ + "before psm2_ep_open call \n"); \ + _HFI_ERROR( \ + "CUDA failure: %s() (at %s:%d)" \ + "returned %d\n", \ + #func, __FILE__, __LINE__, cudaerr); \ + psmi_handle_error( \ + PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function.\n");\ + } else if (cudaerr == except_err) { \ + _HFI_INFO( \ + "CUDA warning: %s() (at %s:%d)" \ + "returned %d\n", \ + #func, __FILE__, __LINE__, cudaerr); \ + } \ + } while (0) + #define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do { \ cudaerr = psmi_cuEventQuery(event); \ if ((cudaerr != CUDA_SUCCESS) && \ diff --git a/ptl_am/am_cuda_memhandle_cache.c b/ptl_am/am_cuda_memhandle_cache.c index 8406a37..4173008 100644 --- a/ptl_am/am_cuda_memhandle_cache.c +++ b/ptl_am/am_cuda_memhandle_cache.c @@ -55,22 +55,132 @@ #include "psm_user.h" #include "am_cuda_memhandle_cache.h" -#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start) -#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length)) + +/* + * rbtree cruft + */ +struct _cl_map_item; + +typedef struct +{ + unsigned long start; /* start virtual address */ + CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ + CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ + uint16_t length; /* length*/ + psm2_epid_t epid; + struct _cl_map_item* i_prev; /* idle queue previous */ + struct _cl_map_item* i_next; /* idle queue next */ +}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t; + +typedef struct { + uint32_t nelems; /* number of elements in the cache */ +} rbtree_cuda_memhandle_cache_map_pl_t; + +static psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size); + +/* + * Custom comparator + */ +typedef rbtree_cuda_memhandle_cache_mapitem_pl_t cuda_cache_item; + +static int cuda_cache_key_cmp(const cuda_cache_item *a, const 
cuda_cache_item *b) +{ + // When multi-ep is disabled, cache can assume + // 1 epid == 1 remote process == 1 CUDA address space + // But when multi-ep is enabled, one process can have many epids, so in this case + // cannot use epid as part of cache key. + if (!psmi_multi_ep_enabled) { + if (a->epid < b->epid) + return -1; + if (a->epid > b->epid) + return 1; + } + + unsigned long a_end, b_end; + // normalize into inclusive upper bounds to handle + // 0-length entries + a_end = (a->start + a->length); + b_end = (b->start + b->length); + if (a->length > 0) + a_end--; + + if (b->length > 0) + b_end--; + + if (a_end < b->start) + return -1; + if (b_end < a->start) + return 1; + + return 0; +} + + +/* + * Necessary rbtree cruft + */ +#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t +#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t +#define RBTREE_CMP(a,b) cuda_cache_key_cmp((a), (b)) #define RBTREE_ASSERT psmi_assert #define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) +#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR +#include "rbtree.h" #include "rbtree.c" -#ifdef PSM_DEBUG -static int cache_hit_counter; -static int cache_miss_counter; -#endif +/* + * Convenience rbtree cruft + */ +#define NELEMS cuda_memhandle_cachemap.payload.nelems + +#define IHEAD cuda_memhandle_cachemap.root +#define LAST IHEAD->payload.i_prev +#define FIRST IHEAD->payload.i_next +#define INEXT(x) x->payload.i_next +#define IPREV(x) x->payload.i_prev + +/* + * Actual module data + */ +static cl_qmap_t cuda_memhandle_cachemap; /* Global cache */ +static uint8_t cuda_memhandle_cache_enabled; +static mpool_t cuda_memhandle_mpool; +static uint32_t cuda_memhandle_cache_size; + +static uint64_t cache_hit_counter; +static uint64_t cache_miss_counter; +static uint64_t cache_evict_counter; +static uint64_t cache_collide_counter; + +static void print_cuda_memhandle_cache_stats(void) +{ + _HFI_DBG("enabled=%u,size=%u,hit=%lu,miss=%lu,evict=%lu,collide=%lu\n", + 
cuda_memhandle_cache_enabled, cuda_memhandle_cache_size, + cache_hit_counter, cache_miss_counter, + cache_evict_counter, cache_collide_counter); +} + +/* + * This is the callback function invoked when the mempool is resized or destroyed. + * Upon calling cache fini, the mpool is destroyed, which in turn calls this callback, + * which helps in closing all memhandles. + */ +static void +psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) +{ + cl_map_item_t* memcache_item = (cl_map_item_t*)obj; + if (!is_alloc) { + if(memcache_item->payload.start) + PSMI_CUDA_CALL(cuIpcCloseMemHandle, + memcache_item->payload.cuda_ipc_dev_ptr); + } +} /* * Creating mempool for cuda memhandle cache nodes. */ -psm2_error_t +static psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size) { psm2_error_t err; @@ -95,8 +205,12 @@ am_cuda_memhandle_mpool_init(uint32_t memcache_size) /* * Initialize rbtree. */ -psm2_error_t am_cuda_memhandle_cache_map_init() +psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size) { + psm2_error_t err = am_cuda_memhandle_mpool_init(memcache_size); + if (err != PSM2_OK) + return err; + cl_map_item_t *root, *nil_item; root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); if (root == NULL) @@ -115,10 +229,7 @@ psm2_error_t am_cuda_memhandle_cache_map_init() void am_cuda_memhandle_cache_map_fini() { -#ifdef PSM_DEBUG - _HFI_DBG("cache hit counter: %d\n", cache_hit_counter); - _HFI_DBG("cache miss counter: %d\n", cache_miss_counter); -#endif + print_cuda_memhandle_cache_stats(); if (cuda_memhandle_cachemap.nil_item) psmi_free(cuda_memhandle_cachemap.nil_item); @@ -143,6 +254,7 @@ am_cuda_idleq_insert(cl_map_item_t* memcache_item) INEXT(FIRST) = memcache_item; IPREV(memcache_item) = FIRST; FIRST = memcache_item; + INEXT(FIRST) = NULL; return; } @@ -155,11 +267,13 @@ am_cuda_idleq_remove_last(cl_map_item_t* memcache_item) if (!INEXT(memcache_item)) { LAST = NULL; FIRST = NULL; - return; + } else { + LAST = 
INEXT(memcache_item); + IPREV(LAST) = NULL; } - LAST = INEXT(memcache_item); - IPREV(LAST) = NULL; - return; + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; } static void @@ -167,15 +281,16 @@ am_cuda_idleq_remove(cl_map_item_t* memcache_item) { if (LAST == memcache_item) { am_cuda_idleq_remove_last(memcache_item); - return; - } - if (INEXT(memcache_item) == NULL) { - INEXT(IPREV(memcache_item)) = NULL; - return; + } else if (FIRST == memcache_item) { + FIRST = IPREV(memcache_item); + INEXT(FIRST) = NULL; + } else { + INEXT(IPREV(memcache_item)) = INEXT(memcache_item); + IPREV(INEXT(memcache_item)) = IPREV(memcache_item); } - INEXT(IPREV(memcache_item)) = INEXT(memcache_item); - IPREV(INEXT(memcache_item)) = IPREV(memcache_item); - return; + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; } static void @@ -207,10 +322,14 @@ am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item, && epid == memcache_item->payload.epid) { return PSM2_OK; } + _HFI_DBG("cache collision: new entry start=%lu,length=%u\n", sbuf, length); + + cache_collide_counter++; ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, memcache_item); PSMI_CUDA_CALL(cuIpcCloseMemHandle, memcache_item->payload.cuda_ipc_dev_ptr); am_cuda_idleq_remove(memcache_item); + memset(memcache_item, 0, sizeof(*memcache_item)); psmi_mpool_put(memcache_item); return PSM2_OK_NO_PROGRESS; } @@ -219,14 +338,18 @@ am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item, * Current eviction policy: Least Recently Used. 
*/ static void -am_cuda_memhandle_cache_evict() +am_cuda_memhandle_cache_evict(void) { + cache_evict_counter++; cl_map_item_t *p_item = LAST; + _HFI_VDBG("Removing (epid=%lu,start=%lu,length=%u,dev_ptr=0x%llX,it=%p) from cuda_memhandle_cachemap.\n", + p_item->payload.epid, p_item->payload.start, p_item->payload.length, + p_item->payload.cuda_ipc_dev_ptr, p_item); ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, p_item); PSMI_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr); am_cuda_idleq_remove_last(p_item); + memset(p_item, 0, sizeof(*p_item)); psmi_mpool_put(p_item); - return; } static psm2_error_t @@ -236,6 +359,7 @@ am_cuda_memhandle_cache_register(uintptr_t sbuf, CUipcMemHandle* handle, { if (NELEMS == cuda_memhandle_cache_size) am_cuda_memhandle_cache_evict(); + cl_map_item_t* memcache_item = psmi_mpool_get(cuda_memhandle_mpool); /* memcache_item cannot be NULL as we evict * before the call to mpool_get. Check has @@ -253,6 +377,15 @@ am_cuda_memhandle_cache_register(uintptr_t sbuf, CUipcMemHandle* handle, return PSM2_OK; } +static void am_cuda_memhandle_cache_clear(void) +{ + _HFI_DBG("Closing all handles, clearing cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS); + while (NELEMS) { + am_cuda_memhandle_cache_evict(); + } + _HFI_DBG("Closed all handles, cleared cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS); +} + /* * The key used to search the cache is the senders buf address pointer. 
* Upon a succesful hit in the cache, additional validation is required @@ -262,36 +395,66 @@ CUdeviceptr am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle, uint32_t length, psm2_epid_t epid) { + _HFI_VDBG("sbuf=%lu,handle=%p,length=%u,epid=%lu\n", + sbuf, handle, length, epid); + CUdeviceptr cuda_ipc_dev_ptr; - if(cuda_memhandle_cache_enabled) { - cl_qmap_t *p_map = &cuda_memhandle_cachemap; - cl_map_item_t *p_item; - unsigned long start = (unsigned long)sbuf; - unsigned long end = start + length; - p_item = ips_cl_qmap_search(p_map, start, end); - if (p_item->payload.start) { - if (am_cuda_memhandle_cache_validate(p_item, sbuf, - handle, length, epid) == PSM2_OK) { -#ifdef PSM_DEBUG - cache_hit_counter++; -#endif - am_cuda_idleq_reorder(p_item); - return p_item->payload.cuda_ipc_dev_ptr; - } - } -#ifdef PSM_DEBUG - cache_miss_counter++; -#endif - PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, - *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - am_cuda_memhandle_cache_register(sbuf, handle, - length, epid, cuda_ipc_dev_ptr); - return cuda_ipc_dev_ptr; - } else { + if(!cuda_memhandle_cache_enabled) { PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); return cuda_ipc_dev_ptr; } + + cuda_cache_item key = { + .start = (unsigned long) sbuf, + .length= length, + .epid = epid + }; + + /* + * preconditions: + * 1) newrange [start,end) may or may not be in cachemap already + * 2) there are no overlapping address ranges in cachemap + * postconditions: + * 1) newrange is in cachemap + * 2) there are no overlapping address ranges in cachemap + * + * The key used to search the cache is the sender's buf address pointer. + * Upon a successful hit in the cache, additional validation is required + * as multiple senders could potentially send the same buf address value. 
+ */ + cl_map_item_t *p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key); + while (p_item->payload.start) { + // Since a precondition is that there are no overlapping ranges in cachemap, + // an exact match implies no need to check further + if (am_cuda_memhandle_cache_validate(p_item, sbuf, handle, length, epid) == PSM2_OK) { + cache_hit_counter++; + am_cuda_idleq_reorder(p_item); + return p_item->payload.cuda_ipc_dev_ptr; + } + + // newrange is not in the cache and overlaps at least one existing range. + // am_cuda_memhandle_cache_validate() closed and removed existing range. + // Continue searching for more overlapping ranges + p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key); + } + cache_miss_counter++; + + CUresult cudaerr; + PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_ALREADY_MAPPED, cuIpcOpenMemHandle, + &cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + + if (cudaerr == CUDA_ERROR_ALREADY_MAPPED) { + // remote memory already mapped. Close all handles, clear cache, + // and try again + am_cuda_memhandle_cache_clear(); + PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + } + + am_cuda_memhandle_cache_register(sbuf, handle, + length, epid, cuda_ipc_dev_ptr); + return cuda_ipc_dev_ptr; } void @@ -302,20 +465,4 @@ am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr) return; } -/* - * This is the callback function when mempool are resized or destroyed. - * Upon calling cache fini mpool is detroyed which in turn calls this callback - * which helps in closing all memhandles. 
- */ -void -psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) -{ - cl_map_item_t* memcache_item = (cl_map_item_t*)obj; - if (!is_alloc) { - if(memcache_item->payload.start) - PSMI_CUDA_CALL(cuIpcCloseMemHandle, - memcache_item->payload.cuda_ipc_dev_ptr); - } -} - #endif diff --git a/ptl_am/am_cuda_memhandle_cache.h b/ptl_am/am_cuda_memhandle_cache.h index 494de32..f25b3af 100644 --- a/ptl_am/am_cuda_memhandle_cache.h +++ b/ptl_am/am_cuda_memhandle_cache.h @@ -56,58 +56,12 @@ #ifndef _AM_CUDA_MEMHANDLE_CACHE_H #define _AM_CUDA_MEMHANDLE_CACHE_H -#include -#include +#include "psm_user.h" #include -#include - -struct _cl_map_item; - -typedef struct -{ - unsigned long start; /* start virtual address */ - CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ - CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ - uint16_t length; /* length*/ - psm2_epid_t epid; - struct _cl_map_item* i_prev; /* idle queue previous */ - struct _cl_map_item* i_next; /* idle queue next */ -}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t; - -typedef struct { - uint32_t nelems; /* number of elements in the cache */ -} rbtree_cuda_memhandle_cache_map_pl_t; - -#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t -#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t - -#include "rbtree.h" - -cl_qmap_t cuda_memhandle_cachemap; /* Global cache */ -uint8_t cuda_memhandle_cache_enabled; -mpool_t cuda_memhandle_mpool; -uint32_t cuda_memhandle_cache_size; -#define CUDA_MEMHANDLE_CACHE_SIZE 64 - -/* - * Macro definition for easy programming. - */ -#define NELEMS cuda_memhandle_cachemap.payload.nelems - -/* - * Macro for idle queue management. 
- */ -#define IHEAD cuda_memhandle_cachemap.root -#define LAST IHEAD->payload.i_prev -#define FIRST IHEAD->payload.i_next -#define INEXT(x) x->payload.i_next -#define IPREV(x) x->payload.i_prev - - -psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size); +#define CUDA_MEMHANDLE_CACHE_SIZE 64 -psm2_error_t am_cuda_memhandle_cache_map_init(); +psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size); CUdeviceptr am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle, @@ -115,10 +69,8 @@ am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle, void am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr); -void psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj); - void am_cuda_memhandle_cache_map_fini(); -#endif +#endif /* _AM_CUDA_MEMHANDLE_CACHE_H */ -#endif +#endif /* PSM_CUDA */ diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c index 42819db..9be72f9 100644 --- a/ptl_am/am_reqrep_shmem.c +++ b/ptl_am/am_reqrep_shmem.c @@ -2574,10 +2574,7 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val) CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size); - if ((err = am_cuda_memhandle_mpool_init(env_memcache_size.e_uint) - != PSM2_OK)) - goto fail; - if ((err = am_cuda_memhandle_cache_map_init() != PSM2_OK)) + if ((err = am_cuda_memhandle_cache_init(env_memcache_size.e_uint) != PSM2_OK)) goto fail; } #endif diff --git a/rpm_release_extension b/rpm_release_extension index 91b629b..c4597e5 100644 --- a/rpm_release_extension +++ b/rpm_release_extension @@ -1 +1 @@ -156 +173