From be116611c7a7206bf056d7419313df3a3d137616 Mon Sep 17 00:00:00 2001
From: Michael Heinz <michael.william.heinz@intel.com>
Date: Wed, 13 May 2020 09:31:02 -0400
Subject: [PATCH] Updates the PSM2 library to match release 11.2.173

Fixes include:

Improved handling of the CUDA memory handle cache.

Improved handling of the case where the libpsm2 library is opened
multiple times by a single process. (This can happen when, for example,
multiple Open MPI transport layers all access libpsm2.)

Signed-off-by: Michael Heinz <michael.william.heinz@intel.com>
---
 COMMIT                           |   2 +-
 Makefile                         |   2 +-
 README                           |   7 +
 include/rbtree.c                 |  55 +++++-
 psm.c                            |  36 ++--
 psm2.h                           |   1 +
 psm2_hal.c                       |  12 +-
 psm2_hal.h                       |   4 +-
 psm_user.h                       |  30 ++++
 ptl_am/am_cuda_memhandle_cache.c | 281 +++++++++++++++++++++++--------
 ptl_am/am_cuda_memhandle_cache.h |  58 +------
 ptl_am/am_reqrep_shmem.c         |   5 +-
 rpm_release_extension            |   2 +-
 13 files changed, 344 insertions(+), 151 deletions(-)

diff --git a/COMMIT b/COMMIT
index c8041e5..a0dafc7 100644
--- a/COMMIT
+++ b/COMMIT
@@ -1 +1 @@
-bc17e0522f6b64e8e054d3cfea4506ac155724c2
\ No newline at end of file
+61553edd6b9fefce4a246b4701abc213e7c36b2b
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 21ac7b9..5a31d64 100644
--- a/Makefile
+++ b/Makefile
@@ -483,7 +483,7 @@ dist: distclean
 	PRUNE_LIST="";										\
 	for pd in ".git" "cscope*" "$(shell realpath --relative-to=${top_srcdir} ${OUTDIR})"	\
 		"*.orig" "*~" "#*" ".gitignore" "doc" "libcm" "psm.supp" "test" "psm_hal_MOCK"	\
-		 "tools" "artifacts" "*.rej.patch"; do			\
+		 "psm_test" "tools" "artifacts" "*.rej.patch"; do			\
 		PRUNE_LIST="$$PRUNE_LIST -name $$pd -prune -o";					\
 	done;											\
 	for hid in psm_hal_* ; do								\
diff --git a/README b/README
index 7990555..2db5fc4 100644
--- a/README
+++ b/README
@@ -220,6 +220,13 @@ Note: It is also possible to use rpm command to install rpm's, but it is recomme
 that one use yum/dnf as rpm tool has issues with name changes and obsoletes tags.
 yum or dnf should be better able to resolve dependency issues.
 
+TESTING
+=======
+
+Please see the psm_test subdirectory, starting with the file psm_test/README, for
+code and instructions on testing the psm2 library.  Additionally, the
+directory psm_test/samples contains sample test code.
+
 RELATED SOFTWARE TO PSM2
 ========================
 
diff --git a/include/rbtree.c b/include/rbtree.c
index 9d6930d..b79f135 100644
--- a/include/rbtree.c
+++ b/include/rbtree.c
@@ -85,13 +85,22 @@
 
 #include <string.h> /* for memset declaration */
 
-#if !defined ( RBTREE_GET_LEFTMOST )       || \
+// RBTREE_CMP should be a comparator, i.e. RBTREE_CMP(a, b) should evaluate to
+// -1, 0, or 1 depending on if a < b, a == b, or a > b, respectively.
+#ifdef RBTREE_CMP
+
+#if defined(RBTREE_GET_LEFTMOST) || defined(RBTREE_GET_RIGHTMOST)
+#error Cannot define both RBTREE_CMP and RBTREE_GET_(LEFT|RIGHT)MOST
+#endif
+
+#elif !defined ( RBTREE_GET_LEFTMOST )       || \
 	! defined ( RBTREE_GET_RIGHTMOST ) || \
 	! defined ( RBTREE_MAP_COUNT )     || \
 	! defined ( RBTREE_ASSERT )
 #error "You must define RBTREE_GET_LEFTMOST and RBTREE_GET_RIGHTMOST and \
         RBTREE_MAP_COUNT and RBTREE_ASSERT before including rbtree.c"
-#endif
+
+#endif /* RBTREE_CMP */
 
 #define IN /* nothing */
 
@@ -117,13 +126,24 @@ static void ips_cl_qmap_remove_item(
 static cl_map_item_t* ips_cl_qmap_successor(
 				IN	cl_qmap_t* const	p_map,
 				IN	const cl_map_item_t*	p_item);
+
+
+#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR
 static cl_map_item_t* ips_cl_qmap_predecessor(
 				IN	cl_qmap_t* const	p_map,
 				IN	const cl_map_item_t*	p_item);
+#endif
+
+#if defined(RBTREE_GET_LEFTMOST)
 static cl_map_item_t* ips_cl_qmap_search(
 				IN	cl_qmap_t* const	p_map,
 				IN	unsigned long		start,
 				IN	unsigned long		end);
+#else
+static cl_map_item_t* ips_cl_qmap_searchv(
+				cl_qmap_t* const	p_map,
+				const RBTREE_MI_PL *key);
+#endif
 
 /*
  * Get the root.
@@ -380,7 +400,11 @@ ips_cl_qmap_insert_item(
 		p_insert_at = p_comp_item;
 
 		/* Traverse the tree until the correct insertion point is found. */
+#ifdef RBTREE_GET_LEFTMOST
 		if( RBTREE_GET_LEFTMOST(&p_item->payload) < RBTREE_GET_LEFTMOST(&p_insert_at->payload) )
+#else
+		if(RBTREE_CMP(&p_item->payload, &p_insert_at->payload) < 0)
+#endif
 		{
 			p_comp_item = p_insert_at->p_left;
 			compare_res = 1;
@@ -604,6 +628,11 @@ ips_cl_qmap_successor(
 	}
 }
 
+// When includer defines RBTREE_CMP, ips_cl_qmap_search() is not emitted.
+// When this happens, ips_cl_qmap_predecessor() may not be called.
+// Combined with -Werror -Wunused-function, libpsm2 fails to build.
+// So provide macro to control emitting this function
+#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR
 static cl_map_item_t *
 ips_cl_qmap_predecessor(
 	IN	cl_qmap_t* const		p_map,
@@ -627,7 +656,9 @@ ips_cl_qmap_predecessor(
 		return p_tmp;
 	}
 }
+#endif /* RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR */
 
+#if defined(RBTREE_GET_LEFTMOST)
 /*
  * return the first node with buffer overlapping or zero.
  */
@@ -690,3 +721,23 @@ ips_cl_qmap_search(cl_qmap_t * const p_map,
 
 	return p_item;
 }
+#else /* !defined(RBTREE_GET_LEFTMOST): comparator-based (RBTREE_CMP) lookup */
+/* Return the item whose payload compares equal to key, or p_map->nil_item. */
+static cl_map_item_t *
+ips_cl_qmap_searchv(cl_qmap_t * const p_map, const RBTREE_MI_PL *key)
+{
+	RBTREE_ASSERT( p_map );
+	cl_map_item_t *p_item = __cl_map_root(p_map);
+
+	while (p_item != p_map->nil_item) {
+		/* Evaluate the comparator once per node instead of up to twice. */
+		const int cmp = RBTREE_CMP(key, &p_item->payload);
+
+		if (cmp == 0)
+			break;
+		p_item = (cmp > 0) ? p_item->p_right : p_item->p_left;
+	}
+
+	return p_item;
+}
+#endif /* !defined(RBTREE_GET_LEFTMOST) */
diff --git a/psm.c b/psm.c
index 3aec403..a4a47be 100644
--- a/psm.c
+++ b/psm.c
@@ -65,11 +65,14 @@ static int psmi_verno = PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR);
 static int psmi_verno_client_val;
 int psmi_epid_ver;
 
+// Special psmi_refcount values
 #define PSMI_NOT_INITIALIZED    0
-#define PSMI_INITIALIZED        1
-#define PSMI_FINALIZED         -1	/* Prevent the user from calling psm2_init
-					 * once psm_finalize has been called. */
-static int psmi_isinit = PSMI_NOT_INITIALIZED;
+#define PSMI_FINALIZED         -1
+
+// PSM2 doesn't support transitioning out of the PSMI_FINALIZED state.
+// Once psmi_refcount is set to PSMI_FINALIZED, any further attempts to change
+// psmi_refcount should be treated as an error.
+static int psmi_refcount = PSMI_NOT_INITIALIZED;
 
 /* Global lock used for endpoint creation and destroy
  * (in functions psm2_ep_open and psm2_ep_close) and also
@@ -104,9 +107,8 @@ uint32_t gdr_copy_threshold_recv;
  * It is supposed to be filled with logical OR
  * on conditional compilation basis
  * along with future features/capabilities.
- * At the very beginning we start with Multi EPs.
  */
-uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP;
+uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP | PSM2_LIB_REFCOUNT_CAP;
 
 int psmi_verno_client()
 {
@@ -130,7 +132,7 @@ int psmi_verno_isinteroperable(uint16_t verno)
 
 int MOCKABLE(psmi_isinitialized)()
 {
-	return (psmi_isinit == PSMI_INITIALIZED);
+	return (psmi_refcount > 0);
 }
 MOCK_DEF_EPILOGUE(psmi_isinitialized);
 
@@ -356,10 +358,12 @@ psm2_error_t __psm2_init(int *major, int *minor)
 	GENERIC_PERF_SET_SLOT_NAME(PSM_TX_SPEEDPATH_CTR, "TX");
 	GENERIC_PERF_SET_SLOT_NAME(PSM_RX_SPEEDPATH_CTR, "RX");
 
-	if (psmi_isinit == PSMI_INITIALIZED)
+	if (psmi_refcount > 0) {
+		psmi_refcount++;
 		goto update;
+	}
 
-	if (psmi_isinit == PSMI_FINALIZED) {
+	if (psmi_refcount == PSMI_FINALIZED) {
 		err = PSM2_IS_FINALIZED;
 		goto fail;
 	}
@@ -435,7 +439,7 @@ psm2_error_t __psm2_init(int *major, int *minor)
 				((id.eax & CPUID_EXMODEL_MASK) >> 12);
 	}
 
-	psmi_isinit = PSMI_INITIALIZED;
+	psmi_refcount++;
 	/* hfi_debug lives in libhfi.so */
 	psmi_getenv("PSM2_TRACEMASK",
 		    "Mask flags for tracing",
@@ -520,7 +524,6 @@ psm2_error_t __psm2_init(int *major, int *minor)
 #endif
 
 update:
-
 	if (getenv("PSM2_IDENTIFY")) {
                 Dl_info info_psm;
 		char ofed_delta[100] = "";
@@ -557,6 +560,8 @@ psm2_error_t __psm2_init(int *major, int *minor)
 	*major = (int)psmi_verno_major;
 	*minor = (int)psmi_verno_minor;
 fail:
+	_HFI_DBG("psmi_refcount=%d,err=%u\n", psmi_refcount, err);
+
 	PSM2_LOG_MSG("leaving");
 	return err;
 }
@@ -779,7 +784,14 @@ psm2_error_t __psm2_finalize(void)
 
 	PSM2_LOG_MSG("entering");
 
+	_HFI_DBG("psmi_refcount=%d\n", psmi_refcount);
 	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+	psmi_assert(psmi_refcount > 0);
+	psmi_refcount--;
+
+	if (psmi_refcount > 0) {
+		return PSM2_OK;
+	}
 
 	/* When PSM_PERF is enabled, the following line causes the
 	   instruction cycles gathered in the current run to be dumped
@@ -856,7 +868,7 @@ psm2_error_t __psm2_finalize(void)
 	}
 #endif
 
-	psmi_isinit = PSMI_FINALIZED;
+	psmi_refcount = PSMI_FINALIZED;
 	PSM2_LOG_MSG("leaving");
 	psmi_log_fini();
 
diff --git a/psm2.h b/psm2.h
index f665970..84f59bb 100644
--- a/psm2.h
+++ b/psm2.h
@@ -488,6 +488,7 @@ psm2_error_t psm2_init(int *api_verno_major, int *api_verno_minor);
  * consecutive bits : 0x2, 0x4 ... and so on.
  */
 #define PSM2_MULTI_EP_CAP 0x1	/* Multiple Endpoints capability */
+#define PSM2_LIB_REFCOUNT_CAP 0x2	/* Library finalization is managed with reference count */
 
 /** @brief PSM2 capabilities provider
  *
diff --git a/psm2_hal.c b/psm2_hal.c
index 6ba281a..b4b9d9a 100644
--- a/psm2_hal.c
+++ b/psm2_hal.c
@@ -326,14 +326,10 @@ static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void)
 				p->params.num_ports = nports;
 				p->params.default_pkey = dflt_pkey;
 				p->params.sw_status |= valid_flags;
-				p->params.unit_active = (uint8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits,
-										sizeof(uint8_t));
-				p->params.unit_active_valid = (uint8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits,
-										      sizeof(uint8_t));
-				p->params.port_active = (uint8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports,
-										sizeof(uint8_t));
-				p->params.port_active_valid = (uint8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports,
-										      sizeof(uint8_t));
+				p->params.unit_active = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(int8_t));
+				p->params.unit_active_valid = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(int8_t));
+				p->params.port_active = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, sizeof(int8_t));
+				p->params.port_active_valid = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, sizeof(int8_t));
 				p->params.num_contexts = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits,
 										  sizeof(uint16_t));
 				p->params.num_contexts_valid = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits,
diff --git a/psm2_hal.h b/psm2_hal.h
index 02d62a2..1bec596 100644
--- a/psm2_hal.h
+++ b/psm2_hal.h
@@ -183,8 +183,8 @@ typedef struct _psmi_hal_params
 	uint16_t   num_units;
 	uint16_t   num_ports;
 	uint16_t   default_pkey;
-	uint8_t    *unit_active,*unit_active_valid;
-	uint8_t    *port_active,*port_active_valid;
+	int8_t     *unit_active,*unit_active_valid;
+	int8_t     *port_active,*port_active_valid;
 	uint16_t   *num_contexts,*num_contexts_valid;
 	uint16_t   *num_free_contexts,*num_free_contexts_valid;
 } psmi_hal_params_t;
diff --git a/psm_user.h b/psm_user.h
index 157bc8d..e412ff4 100644
--- a/psm_user.h
+++ b/psm_user.h
@@ -362,6 +362,36 @@ CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
 		}							\
 	} while (0)
 
+/**
+ * Similar to PSMI_CUDA_CALL(), except that func(args) returning except_err is
+ * only logged via _HFI_INFO instead of being reported to psmi_handle_error().
+ *
+ * Invoker must provide 'CUresult cudaerr' in the invoking scope so it can
+ * inspect whether cudaerr == CUDA_SUCCESS or cudaerr == except_err after the
+ * expansion runs. The expansion also reads 'ctxt' from the invoking scope.
+ */
+#define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) do { \
+		cudaerr = psmi_##func(args);				\
+		if (cudaerr != CUDA_SUCCESS && cudaerr != (except_err)) {	\
+			if (ctxt == NULL)				\
+				_HFI_ERROR(				\
+				"Check if CUDA is initialized "	\
+				"before psm2_ep_open call \n");		\
+			_HFI_ERROR(					\
+				"CUDA failure: %s() (at %s:%d) "		\
+				"returned %d\n",			\
+				#func, __FILE__, __LINE__, cudaerr);	\
+			psmi_handle_error(				\
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,	\
+				"Error returned from CUDA function.\n");\
+		} else if (cudaerr == (except_err)) { \
+			_HFI_INFO( \
+				"CUDA warning: %s() (at %s:%d) "		\
+				"returned %d\n",			\
+				#func, __FILE__, __LINE__, cudaerr);	\
+		} \
+	} while (0)
+
 #define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do {			\
 		cudaerr = psmi_cuEventQuery(event);			\
 		if ((cudaerr != CUDA_SUCCESS) &&			\
diff --git a/ptl_am/am_cuda_memhandle_cache.c b/ptl_am/am_cuda_memhandle_cache.c
index 8406a37..4173008 100644
--- a/ptl_am/am_cuda_memhandle_cache.c
+++ b/ptl_am/am_cuda_memhandle_cache.c
@@ -55,22 +55,132 @@
 
 #include "psm_user.h"
 #include "am_cuda_memhandle_cache.h"
-#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR)  ((PAYLOAD_PTR)->start)
-#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length))
+
+/*
+ * rbtree cruft
+ */
+struct _cl_map_item;
+
+typedef struct
+{
+	unsigned long		start;		 /* start virtual address */
+	CUipcMemHandle		cuda_ipc_handle; /* cuda ipc mem handle */
+	CUdeviceptr		cuda_ipc_dev_ptr;/* Cuda device pointer */
+	uint32_t		length;	 /* length; 32-bit so the uint32_t passed by callers is not truncated */
+	psm2_epid_t             epid;
+	struct _cl_map_item*	i_prev;	 /* idle queue previous */
+	struct _cl_map_item*	i_next;	 /* idle queue next */
+}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t;
+
+typedef struct {
+	uint32_t		nelems;	/* number of elements in the cache */
+} rbtree_cuda_memhandle_cache_map_pl_t;
+
+static psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size);
+
+/*
+ * Custom comparator
+ */
+typedef rbtree_cuda_memhandle_cache_mapitem_pl_t cuda_cache_item;
+
+/*
+ * Three-way comparison of two cache keys; returns -1, 0 or 1.
+ *
+ * When multi-ep is disabled the epid is compared first, because then
+ *   1 epid == 1 remote process == 1 CUDA address space.
+ * When multi-ep is enabled one process can have many epids, so the epid
+ * cannot be part of the cache key and only the address ranges are compared.
+ *
+ * Address ranges that overlap compare equal (0); otherwise the result tells
+ * whether a lies entirely below (-1) or entirely above (1) b.
+ */
+static int cuda_cache_key_cmp(const cuda_cache_item *a, const cuda_cache_item *b)
+{
+	if (!psmi_multi_ep_enabled && a->epid != b->epid)
+		return (a->epid < b->epid) ? -1 : 1;
+
+	// Work with inclusive upper bounds so that 0-length entries are handled:
+	// such an entry is treated as covering only its start address.
+	unsigned long a_last = a->start + a->length;
+	unsigned long b_last = b->start + b->length;
+
+	if (a->length > 0)
+		a_last -= 1;
+	if (b->length > 0)
+		b_last -= 1;
+
+	if (a_last < b->start)
+		return -1;
+
+	return (b_last < a->start) ? 1 : 0;
+}
+
+
+/*
+ * Necessary rbtree cruft
+ */
+#define RBTREE_MI_PL  rbtree_cuda_memhandle_cache_mapitem_pl_t
+#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t
+#define RBTREE_CMP(a,b) cuda_cache_key_cmp((a), (b))
 #define RBTREE_ASSERT                     psmi_assert
 #define RBTREE_MAP_COUNT(PAYLOAD_PTR)     ((PAYLOAD_PTR)->nelems)
+#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR
 
+#include "rbtree.h"
 #include "rbtree.c"
 
-#ifdef PSM_DEBUG
-static int cache_hit_counter;
-static int cache_miss_counter;
-#endif
+/*
+ * Convenience rbtree cruft
+ */
+#define NELEMS			cuda_memhandle_cachemap.payload.nelems
+
+#define IHEAD			cuda_memhandle_cachemap.root
+#define LAST			IHEAD->payload.i_prev
+#define FIRST			IHEAD->payload.i_next
+#define INEXT(x)		x->payload.i_next
+#define IPREV(x)		x->payload.i_prev
+
+/*
+ * Actual module data
+ */
+static cl_qmap_t cuda_memhandle_cachemap; /* Global cache */
+static uint8_t cuda_memhandle_cache_enabled;
+static mpool_t cuda_memhandle_mpool;
+static uint32_t cuda_memhandle_cache_size;
+
+static uint64_t cache_hit_counter;
+static uint64_t cache_miss_counter;
+static uint64_t cache_evict_counter;
+static uint64_t cache_collide_counter;
+
+static void print_cuda_memhandle_cache_stats(void)
+{
+	_HFI_DBG("enabled=%u,size=%u,hit=%lu,miss=%lu,evict=%lu,collide=%lu\n",
+		cuda_memhandle_cache_enabled, cuda_memhandle_cache_size,
+		cache_hit_counter, cache_miss_counter,
+		cache_evict_counter, cache_collide_counter);
+}
+
+/*
+ * Mempool object callback. Does nothing when is_alloc is non-zero; otherwise,
+ * if the item's payload.start is non-zero, closes the item's CUDA IPC mapping
+ * with cuIpcCloseMemHandle. The context argument is not used.
+ */
+static void
+psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj)
+{
+	const cl_map_item_t *item = obj;
+
+	if (is_alloc || !item->payload.start)
+		return;
+
+	PSMI_CUDA_CALL(cuIpcCloseMemHandle, item->payload.cuda_ipc_dev_ptr);
+}
 
 /*
  * Creating mempool for cuda memhandle cache nodes.
  */
-psm2_error_t
+static psm2_error_t
 am_cuda_memhandle_mpool_init(uint32_t memcache_size)
 {
 	psm2_error_t err;
@@ -95,8 +205,12 @@ am_cuda_memhandle_mpool_init(uint32_t memcache_size)
 /*
  * Initialize rbtree.
  */
-psm2_error_t am_cuda_memhandle_cache_map_init()
+psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size)
 {
+	psm2_error_t err = am_cuda_memhandle_mpool_init(memcache_size);
+	if (err != PSM2_OK)
+		return err;
+
 	cl_map_item_t *root, *nil_item;
 	root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t));
 	if (root == NULL)
@@ -115,10 +229,7 @@ psm2_error_t am_cuda_memhandle_cache_map_init()
 
 void am_cuda_memhandle_cache_map_fini()
 {
-#ifdef PSM_DEBUG
-	_HFI_DBG("cache hit counter: %d\n", cache_hit_counter);
-	_HFI_DBG("cache miss counter: %d\n", cache_miss_counter);
-#endif
+	print_cuda_memhandle_cache_stats();
 
 	if (cuda_memhandle_cachemap.nil_item)
 		psmi_free(cuda_memhandle_cachemap.nil_item);
@@ -143,6 +254,7 @@ am_cuda_idleq_insert(cl_map_item_t* memcache_item)
 	INEXT(FIRST) = memcache_item;
 	IPREV(memcache_item) = FIRST;
 	FIRST = memcache_item;
+	INEXT(FIRST) = NULL;
 	return;
 }
 
@@ -155,11 +267,13 @@ am_cuda_idleq_remove_last(cl_map_item_t* memcache_item)
 	if (!INEXT(memcache_item)) {
 		LAST = NULL;
 		FIRST = NULL;
-		return;
+	} else {
+		LAST = INEXT(memcache_item);
+		IPREV(LAST) = NULL;
 	}
-	LAST = INEXT(memcache_item);
-	IPREV(LAST) = NULL;
-	return;
+	// Null-out now-removed memcache_item's next and prev pointers out of
+	// an abundance of caution
+	INEXT(memcache_item) = IPREV(memcache_item) = NULL;
 }
 
 static void
@@ -167,15 +281,16 @@ am_cuda_idleq_remove(cl_map_item_t* memcache_item)
 {
 	if (LAST == memcache_item) {
 		am_cuda_idleq_remove_last(memcache_item);
-		return;
-	}
-	if (INEXT(memcache_item) == NULL) {
-		INEXT(IPREV(memcache_item)) = NULL;
-		return;
+	} else if (FIRST == memcache_item) {
+		FIRST = IPREV(memcache_item);
+		INEXT(FIRST) = NULL;
+	} else {
+		INEXT(IPREV(memcache_item)) = INEXT(memcache_item);
+		IPREV(INEXT(memcache_item)) = IPREV(memcache_item);
 	}
-	INEXT(IPREV(memcache_item)) = INEXT(memcache_item);
-	IPREV(INEXT(memcache_item)) = IPREV(memcache_item);
-	return;
+	// Null-out now-removed memcache_item's next and prev pointers out of
+	// an abundance of caution
+	INEXT(memcache_item) = IPREV(memcache_item) = NULL;
 }
 
 static void
@@ -207,10 +322,14 @@ am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item,
 			 && epid == memcache_item->payload.epid) {
 		return PSM2_OK;
 	}
+	_HFI_DBG("cache collision: new entry start=%lu,length=%u\n", sbuf, length);
+
+	cache_collide_counter++;
 	ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, memcache_item);
 	PSMI_CUDA_CALL(cuIpcCloseMemHandle,
 		       memcache_item->payload.cuda_ipc_dev_ptr);
 	am_cuda_idleq_remove(memcache_item);
+	memset(memcache_item, 0, sizeof(*memcache_item));
 	psmi_mpool_put(memcache_item);
 	return PSM2_OK_NO_PROGRESS;
 }
@@ -219,14 +338,18 @@ am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item,
  * Current eviction policy: Least Recently Used.
  */
 static void
-am_cuda_memhandle_cache_evict()
+am_cuda_memhandle_cache_evict(void)
 {
+	cache_evict_counter++;
 	cl_map_item_t *p_item = LAST;
+	_HFI_VDBG("Removing (epid=%lu,start=%lu,length=%u,dev_ptr=0x%llX,it=%p) from cuda_memhandle_cachemap.\n",
+		p_item->payload.epid, p_item->payload.start, p_item->payload.length,
+		p_item->payload.cuda_ipc_dev_ptr, p_item);
 	ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, p_item);
 	PSMI_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr);
 	am_cuda_idleq_remove_last(p_item);
+	memset(p_item, 0, sizeof(*p_item));
 	psmi_mpool_put(p_item);
-	return;
 }
 
 static psm2_error_t
@@ -236,6 +359,7 @@ am_cuda_memhandle_cache_register(uintptr_t sbuf, CUipcMemHandle* handle,
 {
 	if (NELEMS == cuda_memhandle_cache_size)
 		am_cuda_memhandle_cache_evict();
+
 	cl_map_item_t* memcache_item = psmi_mpool_get(cuda_memhandle_mpool);
 	/* memcache_item cannot be NULL as we evict
 	 * before the call to mpool_get. Check has
@@ -253,6 +377,15 @@ am_cuda_memhandle_cache_register(uintptr_t sbuf, CUipcMemHandle* handle,
 	return PSM2_OK;
 }
 
+static void am_cuda_memhandle_cache_clear(void)
+{
+	_HFI_DBG("Closing all handles, clearing cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS);
+	while (NELEMS) {
+		am_cuda_memhandle_cache_evict();
+	}
+	_HFI_DBG("Closed all handles, cleared cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS);
+}
+
 /*
  * The key used to search the cache is the senders buf address pointer.
  * Upon a succesful hit in the cache, additional validation is required
@@ -262,36 +395,66 @@ CUdeviceptr
 am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle,
 				uint32_t length, psm2_epid_t epid)
 {
+	_HFI_VDBG("sbuf=%lu,handle=%p,length=%u,epid=%lu\n",
+		sbuf, handle, length, epid);
+
 	CUdeviceptr cuda_ipc_dev_ptr;
-	if(cuda_memhandle_cache_enabled) {
-		cl_qmap_t *p_map = &cuda_memhandle_cachemap;
-		cl_map_item_t *p_item;
-		unsigned long start = (unsigned long)sbuf;
-		unsigned long end = start + length;
-		p_item = ips_cl_qmap_search(p_map, start, end);
-		if (p_item->payload.start) {
-			if (am_cuda_memhandle_cache_validate(p_item, sbuf,
-					       handle, length, epid) == PSM2_OK) {
-#ifdef PSM_DEBUG
-				cache_hit_counter++;
-#endif
-				am_cuda_idleq_reorder(p_item);
-				return p_item->payload.cuda_ipc_dev_ptr;
-			}
-		}
-#ifdef PSM_DEBUG
-		cache_miss_counter++;
-#endif
-		PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr,
-				 *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
-		am_cuda_memhandle_cache_register(sbuf, handle,
-					       length, epid, cuda_ipc_dev_ptr);
-		return cuda_ipc_dev_ptr;
-	} else {
+	if(!cuda_memhandle_cache_enabled) {
 		PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr,
 				 *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
 		return cuda_ipc_dev_ptr;
 	}
+
+	cuda_cache_item key = {
+		.start = (unsigned long) sbuf,
+		.length= length,
+		.epid = epid
+	};
+
+	/*
+	 * preconditions:
+	 *  1) newrange [start,end) may or may not be in cachemap already
+	 *  2) there are no overlapping address ranges in cachemap
+	 * postconditions:
+	 *  1) newrange is in cachemap
+	 *  2) there are no overlapping address ranges in cachemap
+	 *
+	 * The key used to search the cache is the senders buf address pointer.
+	 * Upon a succesful hit in the cache, additional validation is required
+	 * as multiple senders could potentially send the same buf address value.
+	 */
+	cl_map_item_t *p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key);
+	while (p_item->payload.start) {
+		// Since a precondition is that there are no overlapping ranges in cachemap,
+		// an exact match implies no need to check further
+		if (am_cuda_memhandle_cache_validate(p_item, sbuf, handle, length, epid) == PSM2_OK) {
+			cache_hit_counter++;
+			am_cuda_idleq_reorder(p_item);
+			return p_item->payload.cuda_ipc_dev_ptr;
+		}
+
+		// newrange is not in the cache and overlaps at least one existing range.
+		// am_cuda_memhandle_cache_validate() closed and removed existing range.
+		// Continue searching for more overlapping ranges
+		p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key);
+	}
+	cache_miss_counter++;
+
+	CUresult cudaerr;
+	PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_ALREADY_MAPPED, cuIpcOpenMemHandle,
+		&cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
+
+	if (cudaerr == CUDA_ERROR_ALREADY_MAPPED) {
+		// remote memory already mapped. Close all handles, clear cache,
+		// and try again
+		am_cuda_memhandle_cache_clear();
+		PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle,
+			CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
+	}
+
+	am_cuda_memhandle_cache_register(sbuf, handle,
+					   length, epid, cuda_ipc_dev_ptr);
+	return cuda_ipc_dev_ptr;
 }
 
 void
@@ -302,20 +465,4 @@ am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr)
 	return;
 }
 
-/*
- * This is the callback function when mempool are resized or destroyed.
- * Upon calling cache fini mpool is detroyed which in turn calls this callback
- * which helps in closing all memhandles.
- */
-void
-psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj)
-{
-	cl_map_item_t* memcache_item = (cl_map_item_t*)obj;
-	if (!is_alloc) {
-		if(memcache_item->payload.start)
-			PSMI_CUDA_CALL(cuIpcCloseMemHandle,
-				       memcache_item->payload.cuda_ipc_dev_ptr);
-	}
-}
-
 #endif
diff --git a/ptl_am/am_cuda_memhandle_cache.h b/ptl_am/am_cuda_memhandle_cache.h
index 494de32..f25b3af 100644
--- a/ptl_am/am_cuda_memhandle_cache.h
+++ b/ptl_am/am_cuda_memhandle_cache.h
@@ -56,58 +56,12 @@
 #ifndef _AM_CUDA_MEMHANDLE_CACHE_H
 #define _AM_CUDA_MEMHANDLE_CACHE_H
 
-#include <stdio.h>
-#include <stdlib.h>
+#include "psm_user.h"
 #include <stdint.h>
-#include <unistd.h>
-
-struct _cl_map_item;
-
-typedef struct
-{
-	unsigned long		start;		 /* start virtual address */
-	CUipcMemHandle		cuda_ipc_handle; /* cuda ipc mem handle */
-	CUdeviceptr		cuda_ipc_dev_ptr;/* Cuda device pointer */
-	uint16_t		length;	 /* length*/
-	psm2_epid_t             epid;
-	struct _cl_map_item*	i_prev;	 /* idle queue previous */
-	struct _cl_map_item*	i_next;	 /* idle queue next */
-}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t;
-
-typedef struct {
-	uint32_t		nelems;	/* number of elements in the cache */
-} rbtree_cuda_memhandle_cache_map_pl_t;
-
-#define RBTREE_MI_PL  rbtree_cuda_memhandle_cache_mapitem_pl_t
-#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t
-
-#include "rbtree.h"
-
-cl_qmap_t cuda_memhandle_cachemap; /* Global cache */
-uint8_t cuda_memhandle_cache_enabled;
-mpool_t cuda_memhandle_mpool;
-uint32_t cuda_memhandle_cache_size;
-#define CUDA_MEMHANDLE_CACHE_SIZE 64
-
-/*
- * Macro definition for easy programming.
- */
 
-#define NELEMS			cuda_memhandle_cachemap.payload.nelems
-
-/*
- * Macro for idle queue management.
- */
-#define IHEAD			cuda_memhandle_cachemap.root
-#define LAST			IHEAD->payload.i_prev
-#define FIRST			IHEAD->payload.i_next
-#define INEXT(x)		x->payload.i_next
-#define IPREV(x)		x->payload.i_prev
-
-
-psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size);
+#define CUDA_MEMHANDLE_CACHE_SIZE 64
 
-psm2_error_t am_cuda_memhandle_cache_map_init();
+psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size);
 
 CUdeviceptr
 am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle,
@@ -115,10 +69,8 @@ am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle,
 void
 am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr);
 
-void psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj);
-
 void am_cuda_memhandle_cache_map_fini();
 
-#endif
+#endif /* _AM_CUDA_MEMHANDLE_CACHE_H */
 
-#endif
+#endif /* PSM_CUDA */
diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c
index 42819db..9be72f9 100644
--- a/ptl_am/am_reqrep_shmem.c
+++ b/ptl_am/am_reqrep_shmem.c
@@ -2574,10 +2574,7 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl)
 			    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
 			    (union psmi_envvar_val)
 			    CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size);
-		if ((err = am_cuda_memhandle_mpool_init(env_memcache_size.e_uint)
-		     != PSM2_OK))
-			goto fail;
-		if ((err = am_cuda_memhandle_cache_map_init() != PSM2_OK))
+		if ((err = am_cuda_memhandle_cache_init(env_memcache_size.e_uint)) != PSM2_OK) /* err keeps the returned code, not the comparison result */
 			goto fail;
 	}
 #endif
diff --git a/rpm_release_extension b/rpm_release_extension
index 91b629b..c4597e5 100644
--- a/rpm_release_extension
+++ b/rpm_release_extension
@@ -1 +1 @@
-156
+173