From 0f9213e7af8d32c291d4657ff4a3279918de1e60 Mon Sep 17 00:00:00 2001
From: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@intel.com>
Date: Mon, 7 May 2018 12:38:17 -0700
Subject: [PATCH] Sync with PSM2 master up to correction of URL for GitHub PSM2
 repo

This code surface corresponds to version 10.3.58 of PSM2. In addition
to GitHub URL fixes, the change also includes-
  - PSM2 GDR Copy feature and tuning of thresholds to fully utilize the feature
  - Fix uniformity of process mapping to HFIs
  - PSM2 specific perf benchmarks
  - Fix for issue 27: Null termination of string
  - Update README

Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@intel.com>
---
 COMMIT                       |   2 +-
 Makefile                     |   3 +-
 README                       |  48 +++---
 include/opa_service.h        |   5 +
 libpsm2.spec.in              |   4 +-
 makesrpm.sh                  |  17 +-
 opa/opa_sysfs.c              |   4 +
 perf_test/Makefile           |  72 ++++++++
 perf_test/README             | 184 ++++++++++++++++++++
 perf_test/bi-bw-mrate.c      | 175 +++++++++++++++++++
 perf_test/bw-mrate.c         | 171 +++++++++++++++++++
 perf_test/latency.c          | 156 +++++++++++++++++
 perf_test/libpsm2.c          | 206 +++++++++++++++++++++++
 perf_test/libpsm2.h          | 160 ++++++++++++++++++
 perf_test/psm2perf.c         | 315 +++++++++++++++++++++++++++++++++++
 perf_test/psm2perf.h         | 138 +++++++++++++++
 psm.c                        |  91 ++++++++++
 psm_context.c                | 185 ++++++++++++++++++--
 psm_ep.c                     |  20 ++-
 psm_gdrcpy.c                 | 226 +++++++++++++++++++++++++
 psm_gdrcpy.h                 |  77 +++++++++
 psm_lock.h                   |  48 ++++++
 psm_mq.c                     | 181 ++++++++++++++++----
 psm_mq_internal.h            |  27 ++-
 psm_mq_recv.c                |  54 +++++-
 psm_mq_utils.c               |   4 +
 psm_user.h                   |  50 +++++-
 ptl_ips/ips_proto.c          |  34 +++-
 ptl_ips/ips_proto_expected.c |   1 +
 ptl_ips/ips_proto_mq.c       | 175 ++++++++++++++-----
 ptl_ips/ips_proto_params.h   |  11 +-
 ptl_ips/ips_scb.h            |   8 +-
 rpm_release_extension        |   2 +-
 33 files changed, 2708 insertions(+), 146 deletions(-)
 create mode 100644 perf_test/Makefile
 create mode 100644 perf_test/README
 create mode 100644 perf_test/bi-bw-mrate.c
 create mode 100644 perf_test/bw-mrate.c
 create mode 100644 perf_test/latency.c
 create mode 100644 perf_test/libpsm2.c
 create mode 100644 perf_test/libpsm2.h
 create mode 100644 perf_test/psm2perf.c
 create mode 100644 perf_test/psm2perf.h
 create mode 100644 psm_gdrcpy.c
 create mode 100644 psm_gdrcpy.h

diff --git a/COMMIT b/COMMIT
index 50cba6e..53ed157 100644
--- a/COMMIT
+++ b/COMMIT
@@ -1 +1 @@
-8b6ba42b45df1815a1da540ebb088b10cc8d88ea
\ No newline at end of file
+8a6c3e5b8d873b8ff4375a4967610d5931691ec2
\ No newline at end of file
diff --git a/Makefile b/Makefile
index dc11046..7f4e5ca 100644
--- a/Makefile
+++ b/Makefile
@@ -489,7 +489,8 @@ ${TARGLIB}-objs := ptl_am/am_reqrep_shmem.o	\
 		   ptl_self/ptl.o		\
 		   opa/*.o			\
 		   psm_diags.o 			\
-		   psmi_wrappers.o
+		   psmi_wrappers.o              \
+		   psm_gdrcpy.o
 
 ${TARGLIB}-objs := $(patsubst %.o, ${OUTDIR}/%.o, ${${TARGLIB}-objs})
 
diff --git a/README b/README
index e74c865..f9f0537 100644
--- a/README
+++ b/README
@@ -213,34 +213,31 @@ RELATED SOFTWARE TO PSM2
 
 MPI Libraries supported
 -----------------------
-A large number of open source (OpenMPI, MVAPICH2) and Vendor MPI
+A large number of open source (Open MPI, MVAPICH2) and Vendor MPI
 implementations support PSM2 for optimized communication on HCAs. Vendor MPI
 implementations (HP-MPI, Intel MPI 4.0 with PMI, Platform/Scali MPI)
 require that the PSM2 runtime libraries be installed and available on
 each node. Usually a configuration file or a command line switch to mpirun
 needs to be specified to utilize the PSM2 transport.
 
-OpenMPI support
+Open MPI support
 ---------------
-It is recommended to use the v1.10.4 or newer version of OpenMPI.
-Prior versions of OpenMPI have an issue with support PSM2 network transports
-mixed with standard Verbs transport (BTL openib). This prevents an OpenMPI
-installation with network modules available for PSM2 and Verbs to work
-correctly on nodes with no HFI hardware. This has been fixed in the
-latest development branch allowing a single OpenMPI installation to target
-HFI hardware via PSM2 or Verbs as well as alternate transports seamlessly.
-
-If NVIDIA CUDA support is desired, you can use the OpenMPI build
-(v1.10.4-cuda-hfi) provided by Intel in the IFS installer v10.4.X or newer.
-The changes have also been accepted into v3.0.x branch of upstream OpenMPI
-repository. Therefore subsequent v3.0.x versions of OpenMPI should carry the
-required OpenMPI support for PSM2 GPUDirect feature.
-
-PSM2 header and runtime files need to be installed on a node where the OpenMPI
+If using a version of Open MPI that is not packaged within an IFS release, it
+is required to use at least v1.10.4. Older versions are not supported. Since
+v1.10.4 is not in active development, it is further recommended to use upstream
+versions v2.1.2 or newer.
+
+If NVIDIA* CUDA* support is desired, you can use Open MPI built with CUDA*
+support provided by Intel in the IFS installer 10.4 or newer. This Open MPI
+build is identified by the "-cuda-hfi" tag appended to the Open MPI base version
+name.  The NVIDIA* CUDA* support changes have also been accepted into the v2.1.3,
+v3.0.1 and v3.1.0 branches of the upstream Open MPI repository.
+
+PSM2 header and runtime files need to be installed on a node where the Open MPI
 build is performed. All compute nodes additionally should have the PSM2 runtime
-libraries available on them. OpenMPI provides a standard configure, make and
+libraries available on them. Open MPI provides a standard configure, make and
 make install mechanism which will detect and build the relevant PSM2 network
-modules for OpenMPI once the header and runtime files are detected.
+modules for Open MPI once the header and runtime files are detected.
 
 MVAPICH2 support
 ----------------
@@ -251,30 +248,25 @@ compute nodes should also have the PSM2 runtime libraries available on them.
 
 For building and installing MVAPICH2 with OPA support, refer to MVAPICH2
 user guides here:
-http://mvapich.cse.ohio-state.edu/static/media/mvapich/mvapich2-2.2rc1-userguide.html
+http://mvapich.cse.ohio-state.edu/userguide/
 
-(Note: Support for PSM2 is currently on v2.2rc1 of OSU MVAPICH2 code base.
-The above link might change when a stable v2.2 is released.)
+(Note: Support for PSM2 is included in v2.2 and newer)
 
 OFED Support
 ------------
 Intel OPA is not yet included within OFED. But the hfi1 driver is available
-publicly at kernel.org. Please do pull the driver from either kernel.org or
-the github page for opa-hfi1 driver (https://github.com/01org/opa-hfi1)
+publicly at kernel.org.
 
 SUPPORTING DOCUMENTATION
 ------------------------
 PSM2 Programmer's Guide is published along with documentation for "Intel® Omni-Path
 Host Fabric Interface PCIe Adapter 100 Series"
-(http://www.intel.com/content/www/us/en/support/network-and-i-o/fabric-products/000016242.html)
+(https://www.intel.com/content/www/us/en/support/articles/000016242/network-and-i-o/fabric-products.html)
 
 Refer to this document for description on APIs and environment variables that
 are available for use. For sample code on writing applications leveraging the
 PSM2 APIs, refer to Section 5.
 
-Link to latest (as of Sep 2017) PSM2 Programmer's Guide:
-https://www.intel.com/content/dam/support/us/en/documents/network-and-i-o/fabric-products/Intel_PSM2_PG_H76473_v7_0.pdf
-
 PSM Compatibility Support
 ------------
 
diff --git a/include/opa_service.h b/include/opa_service.h
index 1ddfcac..728dd90 100644
--- a/include/opa_service.h
+++ b/include/opa_service.h
@@ -75,6 +75,11 @@
 
 /* base name of path (without unit #) for qib driver */
 #define HFI_DEVICE_PATH "/dev/hfi1"
+
+#ifdef PSM_CUDA
+#define GDR_DEVICE_PATH "/dev/hfi1_gdr" /* GDR device node path; defined only in PSM_CUDA builds */
+#endif /* PSM_CUDA */
+
 #define HFI_CLASS_PATH "/sys/class/infiniband/hfi1"
 
 /* Commands used to communicate with driver. */
diff --git a/libpsm2.spec.in b/libpsm2.spec.in
index 7bd7836..382e73f 100644
--- a/libpsm2.spec.in
+++ b/libpsm2.spec.in
@@ -53,10 +53,10 @@ Name: @RPM_NAME@
 Version: @VERSION@
 Release: 1@SPEC_FILE_RELEASE_DIST@
 License: BSD or GPLv2
-URL: https://github.com/01org/opa-psm2/
+URL: https://github.com/intel/opa-psm2/
 
 # The tarball can be created by:
-# git clone https://github.com/01org/opa-psm2
+# git clone https://github.com/intel/opa-psm2
 # cd opa-psm2
 # git checkout @DIST_SHA@
 # make dist
diff --git a/makesrpm.sh b/makesrpm.sh
index 5fc4939..31caa01 100755
--- a/makesrpm.sh
+++ b/makesrpm.sh
@@ -79,12 +79,16 @@ function usage()
     echo "           Sets PSM_CUDA=1, creating -cuda based spec and rpms"
     echo "     -d <path>, -dir <path>"
     echo "           Optionally sets output folder for rpmbuild to use"
+    echo "     -h <hal_gen>, -halgen <hal_gen>"
+    echo "           Optional, default includes all HAL generations"
+    echo "           Sets HAL generations for rpmbuild to use"
     echo "     Examples:"
     echo "           $0 b"
     echo "           $0 s -cuda"
     echo "           $0 -cuda"
     echo "           $0 -d ./temp"
     echo "           $0 b -cuda -dir output"
+    echo "           $0 -h gen1"
     exit 1
 }
 
@@ -98,6 +102,8 @@ OUTDIR=build_release
 # It can be set the same as OUTDIR, and work just fine if desired.
 TEMPDIR=temp.$$
 
+HAL_GENS=""
+
 while [ "$1" != "" ]; do
     case $1 in
         -d | -dir)      shift
@@ -116,6 +122,9 @@ while [ "$1" != "" ]; do
                         RPM_NAME_BASEEXT="$1"
                         export RPM_NAME_BASEEXT="$1"
                         ;;
+        -h | -halgen)   shift
+                        HAL_GENS="$1 $HAL_GENS"
+	                ;;
         -r | -rpmname)  shift
                         if [ -z "$1" ]; then
                             usage
@@ -132,11 +141,15 @@ while [ "$1" != "" ]; do
     shift
 done
 
+if [ "$HAL_GENS" = "" ]; then
+    HAL_GENS="*"
+fi
+
 # Generic cleanup, build, and tmp folder creation
 make distclean OUTDIR=$OUTDIR
-make RPM_NAME=$RPM_NAME RPM_NAME_BASEEXT=$RPM_NAME_BASEEXT dist OUTDIR=$OUTDIR
+make RPM_NAME=$RPM_NAME RPM_NAME_BASEEXT=$RPM_NAME_BASEEXT "PSM_HAL_ENABLE=$HAL_GENS" dist OUTDIR=$OUTDIR
 mkdir -p ./$TEMPDIR/{BUILD,RPMS,SOURCES,SPECS,SRPMS,BUILDROOT}
-# Differnet paths based on RPM_EXT
+# Different paths based on RPM_EXT
 cp ${OUTDIR}/$RPM_NAME-*.tar.gz $TEMPDIR/SOURCES
 make RPM_NAME=$RPM_NAME RPM_NAME_BASEEXT=$RPM_NAME_BASEEXT specfile OUTDIR=$OUTDIR
 cp ${OUTDIR}/$RPM_NAME.spec $TEMPDIR/SPECS
diff --git a/opa/opa_sysfs.c b/opa/opa_sysfs.c
index 9a27698..4444426 100644
--- a/opa/opa_sysfs.c
+++ b/opa/opa_sysfs.c
@@ -374,6 +374,10 @@ static int read_page(int fd, char **datap)
 	if (ret == -1) {
 		free(data);
 	} else {
+		if (ret < sysfs_page_size)
+			data[ret] = 0;
+		else
+			data[sysfs_page_size-1] = 0;
 		*datap = data;
 	}
 
diff --git a/perf_test/Makefile b/perf_test/Makefile
new file mode 100644
index 0000000..2f0b362
--- /dev/null
+++ b/perf_test/Makefile
@@ -0,0 +1,72 @@
+#
+#  This file is provided under a dual BSD/GPLv2 license.  When using or
+#  redistributing this file, you may do so under either license.
+#
+#  GPL LICENSE SUMMARY
+#
+#  Copyright(c) 2018 Intel Corporation.
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of version 2 of the GNU General Public License as
+#  published by the Free Software Foundation.
+#
+#  This program is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  General Public License for more details.
+#
+#  Contact Information:
+#  Intel Corporation, www.intel.com
+#
+#  BSD LICENSE
+#
+#  Copyright(c) 2018 Intel Corporation.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Intel Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+CFLAGS+=-Wall -g -I.
+LIBS=-lpsm2
+
+SRC=$(wildcard *.c)
+OBJS=$(patsubst %.c, %.o, $(SRC))
+MAIN_SRC=latency.o bi-bw-mrate.o bw-mrate.o
+BIN_OBJS=$(filter-out $(MAIN_SRC), $(OBJS))
+# Link the already-built objects and keep $(LIBS) last so -lpsm2 can resolve their symbols.
+all: latency bw-mrate bi-bw-mrate
+latency: $(BIN_OBJS) latency.o
+	$(CC) $(CFLAGS) -o latency latency.o $(BIN_OBJS) $(LIBS)
+bw-mrate: $(BIN_OBJS) bw-mrate.o
+	$(CC) $(CFLAGS) -o bw-mrate bw-mrate.o $(BIN_OBJS) $(LIBS)
+bi-bw-mrate: $(BIN_OBJS) bi-bw-mrate.o
+	$(CC) $(CFLAGS) -o bi-bw-mrate bi-bw-mrate.o $(BIN_OBJS) $(LIBS)
+# Compile-only rule: libraries are not used with -c, so $(LIBS) is not passed here.
+$(OBJS): %.o : %.c
+	$(CC) $(CFLAGS) -c $< -o $@
+
+clean:
+	rm -f *.o bi-bw-mrate bw-mrate latency
diff --git a/perf_test/README b/perf_test/README
new file mode 100644
index 0000000..1426cde
--- /dev/null
+++ b/perf_test/README
@@ -0,0 +1,184 @@
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  Copyright (c) 2018 Intel Corporation. All rights reserved.
+
+================================================================================
+
+INTRODUCTION
+============
+This directory contains psm2-level bandwidth and latency benchmarks.
+These benchmarks do not rely on MPI or any other non-psm2 protocols to set up or
+run. Latency, uni-directional bandwidth and bi-directional bandwidth benchmarks
+are provided.
+
+COMPILATION
+===========
+Please install the libpsm2-devel package before attempting to build as the
+benchmarks depend on the libpsm2-devel package.
+
+Running 'make' on the command line will produce the following binaries:
+1. latency
+This is a uni-directional latency benchmark.
+2. bw-mrate
+This is a uni-directional bandwidth benchmark.
+3. bi-bw-mrate
+This is a bi-directional bandwidth benchmark.
+
+USAGE REQUIREMENTS
+==================
+Two nodes are utilized to run these benchmarks.
+One node acts as a server and the other will be a client.
+The only runtime requirement is libpsm2.
+
+RUNNING THE BENCHMARKS
+======================
+All three benchmarks have the same runtime arguments.
+
+Example for latency:
+Server:
+./latency
+
+Client:
+./latency server_hostname
+
+BENCHMARK ARGUMENTS:
+====================
+usage: ./latency [server] [-m size] [-M size] [-f --flush] [--mqstats] [-h --help]
+options:
+server, server node to connect to, this node will be the client
+-m, starting message size in bytes (default 1)
+-M, ending message size in bytes (default 4194304)
+-f/--flush, flush L3 cache before benchmark
+--mqstats, show psm2 mq counters
+-h/--help, show this help message
+
+
+SAMPLE OUTPUT
+=============
+
+-bash-4.2$ ./latency
+# PSM2 Ping Pong Latency Test
+# Message Size(B)      Latency(us)
+1                         xxx.yy
+2                         xxx.yy
+4                         xxx.yy
+8                         xxx.yy
+16                        xxx.yy
+32                        xxx.yy
+64                        xxx.yy
+128                       xxx.yy
+256                       xxx.yy
+512                       xxx.yy
+1024                      xxx.yy
+2048                      xxx.yy
+4096                      xxx.yy
+8192                      xxx.yy
+16384                     xxx.yy
+32768                     xxx.yy
+65536                     xxx.yy
+131072                    xxx.yy
+262144                    xxx.yy
+524288                    xxx.yy
+1048576                   xxx.yy
+2097152                   xxx.yy
+4194304                   xxx.yy
+-bash-4.2$ ./bi-bw-mrate
+# PSM2 Bi-directional Bandwidth, Message Rate Test
+# Message Size(B)  Bandwidth(MB/s)  Message Rate(Mmps)
+1                       xxxxx.yy                x.yy
+2                       xxxxx.yy                x.yy
+4                       xxxxx.yy                x.yy
+8                       xxxxx.yy                x.yy
+16                      xxxxx.yy                x.yy
+32                      xxxxx.yy                x.yy
+64                      xxxxx.yy                x.yy
+128                     xxxxx.yy                x.yy
+256                     xxxxx.yy                x.yy
+512                     xxxxx.yy                x.yy
+1024                    xxxxx.yy                x.yy
+2048                    xxxxx.yy                x.yy
+4096                    xxxxx.yy                x.yy
+8192                    xxxxx.yy                x.yy
+16384                   xxxxx.yy                x.yy
+32768                   xxxxx.yy                x.yy
+65536                   xxxxx.yy                x.yy
+131072                  xxxxx.yy                x.yy
+262144                  xxxxx.yy                x.yy
+524288                  xxxxx.yy                x.yy
+1048576                 xxxxx.yy                x.yy
+2097152                 xxxxx.yy                x.yy
+4194304                 xxxxx.yy                x.yy
+-bash-4.2$ ./bw-mrate
+# PSM2 Uni-directional Bandwidth, Message Rate Test
+# Message Size(B)  Bandwidth(MB/s)  Message Rate(Mmps)
+1                       xxxxx.yy                x.yy
+2                       xxxxx.yy                x.yy
+4                       xxxxx.yy                x.yy
+8                       xxxxx.yy                x.yy
+16                      xxxxx.yy                x.yy
+32                      xxxxx.yy                x.yy
+64                      xxxxx.yy                x.yy
+128                     xxxxx.yy                x.yy
+256                     xxxxx.yy                x.yy
+512                     xxxxx.yy                x.yy
+1024                    xxxxx.yy                x.yy
+2048                    xxxxx.yy                x.yy
+4096                    xxxxx.yy                x.yy
+8192                    xxxxx.yy                x.yy
+16384                   xxxxx.yy                x.yy
+32768                   xxxxx.yy                x.yy
+65536                   xxxxx.yy                x.yy
+131072                  xxxxx.yy                x.yy
+262144                  xxxxx.yy                x.yy
+524288                  xxxxx.yy                x.yy
+1048576                 xxxxx.yy                x.yy
+2097152                 xxxxx.yy                x.yy
+4194304                 xxxxx.yy                x.yy
diff --git a/perf_test/bi-bw-mrate.c b/perf_test/bi-bw-mrate.c
new file mode 100644
index 0000000..7e41b7c
--- /dev/null
+++ b/perf_test/bi-bw-mrate.c
@@ -0,0 +1,175 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "libpsm2.h"
+#include "psm2perf.h"
+
+int run_bi_bw_mrate(struct benchmark_info *info, int sock);
+
+/* Set up the benchmark, run the bi-directional test, clean up; returns 0 on success, -1 on failure. */
+int main(int argc, char **argv)
+{
+	int ret = 0;
+	int sock = -1;	/* "bail" can be reached before the socket is opened */
+	struct benchmark_info *info = init_benchmark(argc, argv);
+	if (info == NULL) {
+		ret = -1;
+		goto bail;
+	}
+
+	sock = open_socket(info->server, info->is_server, SERVER_PORT);
+	if (sock < 0) {
+		ret = -1;
+		goto bail;
+	}
+
+	ret = exchange_info(sock, info);
+	if (ret == -1)
+		goto bail;
+
+	ret = libpsm2_init(sock, info->is_server);
+	if (ret == -1)
+		goto bail;
+
+	if (info->run_flush) {
+		printf("Flushing L3 Cache... will take a few seconds\n");
+		printf("Flushed L3 cache (%ld)\n",flush_l3cache());
+	}
+
+	ret = run_bi_bw_mrate(info, sock);
+	if (info->show_mqstats)
+		print_psm2_stats();
+	libpsm2_shutdown();
+
+bail:
+	if (sock >= 0)	/* 0 is a valid descriptor as well */
+		close(sock);
+	free(info);	/* free(NULL) is a no-op, so no guard is needed */
+	return ret;
+}
+
+/* Bi-directional bandwidth/message-rate loop over message sizes min_msg_sz..max_msg_sz
+ * (doubling each step). Returns 0; bail returns -1 (presumably reached via SEND/RECV - confirm). */
+int run_bi_bw_mrate(struct benchmark_info *info, int sock)
+{
+	int msize, i, w;
+	psm2_mq_req_t sreq[WINDOW], rreq[WINDOW];
+	unsigned long long time_start, time_end;
+	double time_elapsed, bw, mrate;
+	int iters[2] = {ITERS_LARGE, ITERS_SMALL}, iter = iters[0];
+
+	printf("# PSM2 Bi-directional Bandwidth, Message Rate Test\n");
+	printf("# Message Size(B)  Bandwidth(MB/s)  Message Rate(Mmps)\n");
+
+	for (msize = info->min_msg_sz; msize <= info->max_msg_sz; msize *= 2) {
+		if (msize > LARGE_MSG)
+			iter = iters[1];	/* switch to the ITERS_SMALL count */
+
+		if (info->is_server) {
+			// warmup (runs before the timer is started)
+			for (i = 0; i < iter; i++) {
+				for (w = 0; w < WINDOW; w++)
+					post_irecv(rbuff, msize, PSM2_TAG, PSM2_TAGSEL, info->partner, &rreq[w]);
+				for (w = 0; w < WINDOW; w++)
+					post_isend(sbuff, msize, PSM2_TAG, info->partner, &sreq[w]);
+				/* completion order unchanged: sends first, then receives */
+				psm2_waitall(WINDOW, &sreq[0], NULL);
+				psm2_waitall(WINDOW, &rreq[0], NULL);
+			}
+
+			TIMER(time_start);
+			for (i = 0; i < iter; i++) {
+				for (w = 0; w < WINDOW; w++)
+					post_irecv(rbuff, msize, PSM2_TAG, PSM2_TAGSEL, info->partner, &rreq[w]);
+				for (w = 0; w < WINDOW; w++)
+					post_isend(sbuff, msize, PSM2_TAG, info->partner, &sreq[w]);
+
+				psm2_waitall(WINDOW, &sreq[0], NULL);
+				psm2_waitall(WINDOW, &rreq[0], NULL);
+			}
+			TIMER(time_end);
+
+			time_end -= time_start;
+			time_elapsed = (double) time_end /  info->cpu_freq;
+			bw = 2.0 * msize / 1e6 / time_elapsed * iter * WINDOW;	/* 2.0: no int overflow */
+			mrate = bw / msize;
+		} else {	/* client: 2 * iter rounds mirror the server's warmup + timed loops */
+			for (i = 0; i < 2 * iter; i++) {
+				for (w = 0; w < WINDOW; w++)
+					post_irecv(rbuff, msize, PSM2_TAG, PSM2_TAGSEL, info->partner, &rreq[w]);
+				for (w = 0; w < WINDOW; w++)
+					post_isend(sbuff, msize, PSM2_TAG, info->partner, &sreq[w]);
+
+				psm2_waitall(WINDOW, &rreq[0], NULL);
+				psm2_waitall(WINDOW, &sreq[0], NULL);
+			}
+		}
+
+		if (info->is_server) {
+			SEND(sock, bw, double);
+			SEND(sock, mrate, double);
+		} else {
+			RECV(sock, bw, double);
+			RECV(sock, mrate, double);
+		}
+
+		printf("%-15d  %15.2f  %18.2f\n", msize, bw, mrate);
+	}
+
+	return 0;
+bail:
+	return -1;
+}
diff --git a/perf_test/bw-mrate.c b/perf_test/bw-mrate.c
new file mode 100644
index 0000000..d191b11
--- /dev/null
+++ b/perf_test/bw-mrate.c
@@ -0,0 +1,171 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "libpsm2.h"
+#include "psm2perf.h"
+
+int run_bw_mrate(struct benchmark_info *info, int sock);
+
+/* Set up the benchmark, run the uni-directional test, clean up; returns 0 on success, -1 on failure. */
+int main(int argc, char **argv)
+{
+	int ret = 0;
+	int sock = -1;	/* "bail" can be reached before the socket is opened */
+	struct benchmark_info *info = init_benchmark(argc, argv);
+	if (info == NULL) {
+		ret = -1;
+		goto bail;
+	}
+
+	sock = open_socket(info->server, info->is_server, SERVER_PORT);
+	if (sock < 0) {
+		ret = -1;
+		goto bail;
+	}
+
+	ret = exchange_info(sock, info);
+	if (ret == -1)
+		goto bail;
+
+	ret = libpsm2_init(sock, info->is_server);
+	if (ret == -1)
+		goto bail;
+
+	if (info->run_flush) {
+		printf("Flushing L3 Cache... will take a few seconds\n");
+		printf("Flushed L3 cache (%ld)\n",flush_l3cache());
+	}
+
+	ret = run_bw_mrate(info, sock);
+	if (info->show_mqstats)
+		print_psm2_stats();
+	libpsm2_shutdown();
+
+bail:
+	if (sock >= 0)	/* 0 is a valid descriptor as well */
+		close(sock);
+	free(info);	/* free(NULL) is a no-op, so no guard is needed */
+	return ret;
+}
+
+/* Uni-directional bandwidth/message-rate loop over message sizes min_msg_sz..max_msg_sz
+ * (doubling each step). Returns 0; bail returns -1 (presumably reached via SEND/RECV - confirm). */
+int run_bw_mrate(struct benchmark_info *info, int sock)
+{
+	int i, w, ack = 0, msize;	/* ack is sent by the client, so give it a defined value */
+	int iters[2] = {ITERS_MEDIUM, ITERS_SMALL}, iter = iters[0];
+	unsigned long long time_start, time_end;
+	double time_elapsed, bw, mrate;
+	psm2_mq_req_t req[WINDOW], ack_req;
+
+	printf("# PSM2 Uni-directional Bandwidth, Message Rate Test\n");
+	printf("# Message Size(B)  Bandwidth(MB/s)  Message Rate(Mmps)\n");
+
+	for (msize = info->min_msg_sz; msize <= info->max_msg_sz; msize *= 2) {
+		if (msize > LARGE_MSG)
+			iter = iters[1];	/* switch to the ITERS_SMALL count */
+		if (info->is_server) {
+			// warmup (runs before the timer is started)
+			for (i = 0; i < iter; i++) {
+				for (w = 0; w < WINDOW; w++)
+					post_isend(rbuff, msize, PSM2_TAG, info->partner, &req[w]);
+				/* wait for the whole window, then for the partner's one-int ack */
+				psm2_waitall(WINDOW, &req[0], NULL);
+				post_irecv(&ack, sizeof(int), PSM2_TAG, PSM2_TAGSEL,
+						info->partner, &ack_req);
+				psm2_mq_wait(&ack_req, NULL);
+			}
+
+			TIMER(time_start);
+			for (i = 0; i < iter; i++) {
+				for (w = 0; w < WINDOW; w++)
+					post_isend(rbuff, msize, PSM2_TAG, info->partner, &req[w]);
+
+				psm2_waitall(WINDOW, &req[0], NULL);
+				post_irecv(&ack, sizeof(int), PSM2_TAG, PSM2_TAGSEL,
+						info->partner, &ack_req);
+				psm2_mq_wait(&ack_req, NULL);
+			}
+			TIMER(time_end);
+
+			time_end -= time_start;
+			time_elapsed = (double) time_end /  info->cpu_freq;
+			bw = msize / 1e6 / time_elapsed * iter * WINDOW;
+			mrate = bw / msize;
+		} else {	/* client: 2 * iter rounds mirror the server's warmup + timed loops */
+			for (i = 0; i < 2 * iter; i++) {
+				for (w = 0; w < WINDOW; w++)
+					post_irecv(rbuff, msize, PSM2_TAG, PSM2_TAGSEL, info->partner, &req[w]);
+
+				psm2_waitall(WINDOW, &req[0], NULL);
+				post_send(&ack, sizeof(int), PSM2_TAG, info->partner);
+			}
+		}
+		if (info->is_server) {
+			SEND(sock, bw, double);
+			SEND(sock, mrate, double);
+		} else {
+			RECV(sock, bw, double);
+			RECV(sock, mrate, double);
+		}
+
+		printf("%-15d  %15.2f  %18.2f\n", msize, bw, mrate);
+	}
+
+	return 0;
+bail:
+	return -1;
+}
diff --git a/perf_test/latency.c b/perf_test/latency.c
new file mode 100644
index 0000000..598dc6b
--- /dev/null
+++ b/perf_test/latency.c
@@ -0,0 +1,156 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "libpsm2.h"
+#include "psm2perf.h"
+
+int run_latency(struct benchmark_info *info, int sock);
+
+/* Latency benchmark driver: parse options, open the control socket, swap
+ * settings with the peer, bring up PSM2, run the ping-pong test and clean
+ * up.  Returns run_latency()'s result, or -1 if any setup step fails.
+ */
+int main(int argc, char **argv)
+{
+	int ret = -1;
+	int sock = -1;	/* initialized up front: bail tests it on every path */
+	struct benchmark_info *info = init_benchmark(argc, argv);
+
+	if (info == NULL)
+		goto bail;
+
+	sock = open_socket(info->server, info->is_server, SERVER_PORT);
+	if (sock < 0)
+		goto bail;
+
+	ret = exchange_info(sock, info);
+	if (ret == -1)
+		goto bail;
+
+	ret = libpsm2_init(sock, info->is_server);
+	if (ret == -1)
+		goto bail;
+
+	if (info->run_flush) {
+		printf("Flushing L3 Cache... will take a few seconds\n");
+		printf("Flushed L3 cache (%ld)\n",flush_l3cache());
+	}
+
+	ret = run_latency(info, sock);
+	if (info->show_mqstats)
+		print_psm2_stats();
+	libpsm2_shutdown();
+
+bail:
+	if (sock >= 0)	/* 0 is a valid descriptor too */
+		close(sock);
+	free(info);	/* free(NULL) is a no-op */
+	return ret;
+}
+
+/* Ping-pong latency test over message sizes min_msg_sz..max_msg_sz (doubling).
+ * The server times the exchange and sends the result to the client to print. */
+int run_latency(struct benchmark_info *info, int sock)
+{
+	unsigned long long time_start, time_end;
+	double time_elapsed = 0;
+	int msize, i;
+	int iters[2] = {ITERS_LARGE, ITERS_SMALL}, iter = iters[0];
+	psm2_mq_req_t req;
+	printf("# PSM2 Ping Pong Latency Test\n");
+	printf("# Message Size(B)      Latency(us)\n");
+
+	iter = iters[0];	/* redundant: iter already starts as iters[0] */
+	for (msize = info->min_msg_sz; msize <= info->max_msg_sz; msize *= 2) {
+		if (msize > LARGE_MSG)	/* fewer iterations once messages get large */
+			iter = iters[1];
+		if (info->is_server) {
+			// warmup
+			for (i = 0; i < iter; i++) {
+				post_send(sbuff, msize, 0xF, info->partner);
+				post_irecv(rbuff, msize, 0xF, 0xF, info->partner, &req);
+				psm2_mq_wait(&req, NULL);
+			}
+
+			TIMER(time_start);
+			for (i = 0; i < iter; i++) {
+				post_send(sbuff, msize, 0xF, info->partner);
+				post_irecv(rbuff, msize, 0xF, 0xF, info->partner, &req);
+				psm2_mq_wait(&req, NULL);
+			}
+			TIMER(time_end);
+
+			time_end -= time_start;	/* elapsed cycles; next line: seconds per one-way trip */
+			time_elapsed = (double) time_end / info->cpu_freq / iter / 2;
+		} else {
+			for (i = 0; i < 2 * iter; i++) {	/* 2x: answers the warmup and the timed loop */
+				post_irecv(rbuff, msize, 0xF, 0xF, info->partner, &req);
+				psm2_mq_wait(&req, NULL);
+				post_send(sbuff, msize, 0xF, info->partner);
+			}
+		}
+
+		if (info->is_server)
+			SEND(sock, time_elapsed, double);
+		else
+			RECV(sock, time_elapsed, double);
+
+		printf("%-15d  %15.2f\n", msize, 1e6 * time_elapsed);
+	}
+	return 0;
+bail:	/* SEND/RECV jump here when the socket call fails */
+	return -1;
+}
diff --git a/perf_test/libpsm2.c b/perf_test/libpsm2.c
new file mode 100644
index 0000000..b9453d7
--- /dev/null
+++ b/perf_test/libpsm2.c
@@ -0,0 +1,206 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include "libpsm2.h"
+#include "psm2perf.h"
+
+int libpsm2_rank;	/* 0 on the server, 1 on the client; set by libpsm2_init() */
+
+psm2_epaddr_t *libpsm2_epaddrs;	/* MAX_PSM2_RANKS peer addresses, indexed by rank */
+psm2_ep_t libpsm2_ep;		/* endpoint opened by libpsm2_init() */
+psm2_mq_t libpsm2_mq;		/* MQ created on libpsm2_ep */
+static psm2_epid_t libpsm2_epid;	/* epid of libpsm2_ep, sent to the peer */
+
+/* Initialize psm2 for a two-rank job: open the endpoint and MQ, exchange
+ * epids with the peer over sock and connect to it.  The server is rank 0
+ * and generates the job uuid; the client is rank 1.
+ * Return 0 on success, -1 on failure (partial state is released first). */
+int libpsm2_init(int sock, int is_server)
+{
+	int i;
+	int psm2_up = 0;	/* set once psm2_init() has succeeded */
+	psm2_error_t err, epid_errs[MAX_PSM2_RANKS];
+	psm2_uuid_t uuid;
+	psm2_epid_t *epids = NULL;
+	psm2_epaddr_t *epaddrs = NULL;
+	int ver_major = PSM2_VERNO_MAJOR;
+	int ver_minor = PSM2_VERNO_MINOR;
+
+	libpsm2_rank = is_server ? 0 : 1;
+
+	libpsm2_epaddrs = malloc(sizeof(*libpsm2_epaddrs) * MAX_PSM2_RANKS);
+	if (libpsm2_epaddrs == NULL) {
+		perror("malloc libpsm2_epaddrs");
+		goto bail;
+	}
+
+	epids = malloc(sizeof(*epids) * MAX_PSM2_RANKS);
+	if (epids == NULL) {
+		perror("malloc epids");
+		goto bail;
+	}
+
+	epaddrs = malloc(sizeof(*epaddrs) * MAX_PSM2_RANKS);
+	if (epaddrs == NULL) {
+		perror("malloc epaddrs");
+		goto bail;
+	}
+
+	err = psm2_init(&ver_major, &ver_minor);
+	if (err != PSM2_OK) {
+		PSM2_ERR(err, "psm2_init failure\n");
+		goto bail;
+	}
+	psm2_up = 1;
+
+	// Generate and exchange the uuid for this job
+	if (is_server) {
+		psm2_uuid_generate(uuid);
+		SEND(sock, uuid, psm2_uuid_t);
+	} else
+		RECV(sock, uuid, psm2_uuid_t);
+
+	err = psm2_ep_open(uuid, NULL, &libpsm2_ep, &libpsm2_epid);
+	if (err != PSM2_OK) {
+		PSM2_ERR(err, "psm2_ep_open error\n");
+		goto bail;
+	}
+	epids[libpsm2_rank] = libpsm2_epid;
+
+	err = psm2_mq_init(libpsm2_ep, PSM2_MQ_ORDERMASK_NONE,
+			NULL, 0, &libpsm2_mq);
+	if (err != PSM2_OK) {
+		PSM2_ERR(err, "psm2_mq_init failure\n");
+		goto bail;
+	}
+
+	// Exchange server and client epids
+	SEND(sock, libpsm2_epid, psm2_epid_t);
+	RECV(sock, epids[(libpsm2_rank + 1) % MAX_PSM2_RANKS], psm2_epid_t);
+
+	err = psm2_ep_connect(libpsm2_ep, MAX_PSM2_RANKS, epids, NULL,
+			epid_errs, epaddrs, 0);
+	if (err != PSM2_OK) {
+		PSM2_ERR(err, "psm2_ep_connect failure\n");
+		goto bail;
+	}
+
+	// Save the epaddrs for later
+	for (i = 0; i < MAX_PSM2_RANKS; i++)
+		libpsm2_epaddrs[i] = epaddrs[i];
+
+	free(epids);
+	free(epaddrs);
+	return 0;
+
+bail:
+	fprintf(stderr, "%s failed\n", __func__);
+	free(epids);
+	free(epaddrs);
+	/* Reset each global as it is released so that no dangling pointer
+	 * or stale handle is left behind after a failed init. */
+	free(libpsm2_epaddrs);
+	libpsm2_epaddrs = NULL;
+	if (libpsm2_mq != NULL)
+		psm2_mq_finalize(libpsm2_mq);
+	libpsm2_mq = NULL;
+	if (libpsm2_ep != NULL)
+		psm2_ep_close(libpsm2_ep, PSM2_EP_CLOSE_GRACEFUL, -1);
+	libpsm2_ep = NULL;
+	if (psm2_up)	/* only finalize what psm2_init() actually started */
+		psm2_finalize();
+	return -1;
+}
+
+/* Tear down PSM2: release the saved epaddrs, finalize the MQ, close the
+ * endpoint and finalize the library.  A psm2_mq_finalize() error is
+ * reported on stderr but does not stop the rest of the teardown. */
+void libpsm2_shutdown(void)
+{
+	int rc;
+
+	free(libpsm2_epaddrs);
+	rc = psm2_mq_finalize(libpsm2_mq);
+	if (rc != PSM2_OK)
+		PSM2_ERR(rc, "psm2_mq_finalize failure\n");
+
+	psm2_ep_close(libpsm2_ep, PSM2_EP_CLOSE_GRACEFUL, -1);
+	psm2_finalize();
+}
+
+/* Print the counters psm2_mq_get_stats() reports for libpsm2_mq to stdout. */
+void print_psm2_stats(void)
+{
+	psm2_mq_stats_t stats;
+/* Emits "<field> <value>\n"; the label is the stringized field name. */
+#define SHOW_STAT(field) printf(#field " %lu\n", stats.field)
+	psm2_mq_get_stats(libpsm2_mq, &stats);
+	printf("PSM2 MQ STATS:\n");
+	SHOW_STAT(rx_user_bytes);
+	SHOW_STAT(rx_user_num);
+	SHOW_STAT(rx_sys_bytes);
+	SHOW_STAT(rx_sys_num);
+	SHOW_STAT(tx_num);
+	SHOW_STAT(tx_eager_num);
+	SHOW_STAT(tx_eager_bytes);
+	SHOW_STAT(tx_rndv_num);
+	SHOW_STAT(tx_rndv_bytes);
+	SHOW_STAT(tx_shm_num);
+	SHOW_STAT(rx_shm_num);
+	SHOW_STAT(rx_sysbuf_num);
+	SHOW_STAT(rx_sysbuf_bytes);
+#undef SHOW_STAT
+}
diff --git a/perf_test/libpsm2.h b/perf_test/libpsm2.h
new file mode 100644
index 0000000..6ba245e
--- /dev/null
+++ b/perf_test/libpsm2.h
@@ -0,0 +1,160 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _LIBPSM_H_
+#define _LIBPSM_H_
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <strings.h>
+#include <string.h>
+#include <unistd.h>
+#include <psm2.h>
+#include <psm2_mq.h>
+
+extern psm2_epaddr_t *libpsm2_epaddrs;
+extern psm2_ep_t libpsm2_ep;
+extern psm2_mq_t libpsm2_mq;
+extern int libpsm2_rank;	/* matches the definition in libpsm2.c */
+
+#define PSM2_TAG 0xF
+#define PSM2_TAGSEL 0xF
+#define MAX_PSM2_RANKS 2 /* only one server/client supported now */
+
+#define PSM2_ERR(err, msg)	/* no trailing ';': the caller supplies it */ \
+	fprintf(stderr, "%s %s\n", (msg), psm2_error_get_string(err))
+
+int libpsm2_init(int sock, int is_server);
+void libpsm2_shutdown(void);
+
+static inline void post_irecv(void *buf, uint32_t len, uint64_t tag,
+		uint64_t tagsel, uint32_t rank, psm2_mq_req_t *req)
+{	/* Post a receive on libpsm2_mq; rank is unused, matching is by tag/tagsel */
+	psm2_mq_irecv(libpsm2_mq, tag, tagsel, 0, buf, len, NULL, req);
+}
+
+static inline void post_isend(void *buf, uint32_t len, uint64_t tag,
+		uint32_t rank, psm2_mq_req_t *req)
+{	/* Start a send of len bytes to libpsm2_epaddrs[rank]; *req receives the request */
+	psm2_mq_isend(libpsm2_mq, libpsm2_epaddrs[rank],
+			0, tag, buf, len, NULL, req);
+}
+
+static inline void post_send(void *buf, uint32_t len,
+		uint64_t tag, uint32_t rank)
+{	/* psm2_mq_send() len bytes to libpsm2_epaddrs[rank]; the result is ignored */
+	psm2_mq_send(libpsm2_mq, libpsm2_epaddrs[rank], 0, tag, buf, len);
+}
+
+static inline int cancel(psm2_mq_req_t *req)
+{	/* Returns 1 if psm2_mq_cancel() reported PSM2_OK for *req, else 0 */
+	if (psm2_mq_cancel(req) != PSM2_OK)
+		return 0;
+	/* Cancelled: test the request once; that result is not checked */
+	psm2_mq_test(req, NULL);
+	return 1;
+}
+
+static inline int test(psm2_mq_req_t *req, psm2_mq_status_t *status)
+{	/* Returns 1 if psm2_mq_test() reports PSM2_OK for *req, else 0 */
+	return (psm2_mq_test(req, status) == PSM2_OK);
+}
+
+/* Read the CPU time-stamp counter with rdtsc (x86 only). */
+static inline uint64_t get_cycles(void)
+{
+	uint32_t lo, hi;
+
+	/* rdtsc leaves the low half in EAX ("=a") and the high half in EDX ("=d") */
+	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
+
+	return ((uint64_t)hi << 32) | (uint64_t)lo;
+}
+
+/* Poll libpsm2_ep until every one of the num_req requests in req_list has
+ * tested PSM2_OK.  When st_list is non-NULL the status of request k is
+ * stored in st_list[k]; otherwise no status is collected.
+ */
+static inline void psm2_waitall(int num_req, psm2_mq_req_t *req_list,
+		psm2_mq_status_t *st_list)
+{
+	int done[num_req];	/* done[k] is set once request k has tested OK */
+	int remaining = num_req;
+	int k;
+
+	for (k = 0; k < num_req; k++)
+		done[k] = 0;
+
+	/* do/while: the endpoint is polled at least once before the
+	 * remaining count is checked.
+	 */
+	do {
+		psm2_poll(libpsm2_ep);
+
+		for (k = 0; k < num_req; k++) {
+			psm2_mq_status_t *st = st_list ? &st_list[k] : NULL;
+
+			/* Skip requests that already completed. */
+			if (done[k])
+				continue;
+			if (psm2_mq_test(&req_list[k], st) != PSM2_OK)
+				continue;
+
+			done[k] = 1;
+			remaining--;
+		}
+	} while (remaining > 0);
+}
+
+void print_psm2_stats(void);
+#endif
diff --git a/perf_test/psm2perf.c b/perf_test/psm2perf.c
new file mode 100644
index 0000000..e08ea08
--- /dev/null
+++ b/perf_test/psm2perf.c
@@ -0,0 +1,315 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <limits.h>
+#include "psm2perf.h"
+
+/* Parse str as a base-10 long; return it if > 0, else print why and return -1 */
+static long str_to_positive_long(const char *str)
+{
+	char *end;
+	long num;
+
+	errno = 0;	/* strtol() sets errno on overflow but never clears it */
+	num = strtol(str, &end, 10);	/* parse the argument, not the global optarg */
+	if (end == str) {
+		fprintf(stderr, "Could not parse input %s\n", str);
+		return -1;
+	} else if ((num == LONG_MAX || num == LONG_MIN) && errno == ERANGE) {
+		fprintf(stderr, "Underflow or overflow in %s\n", str);
+		return -1;
+	} else if (num > 0)
+		return num;
+	fprintf(stderr, "Expected a positive, non-zero number, got %ld\n", num);
+	return -1;
+}
+
+static void print_usage(char *name)
+{	/* Print the option summary to stderr; name fills the leading %s */
+	const char usage[] =
+		"usage: %s [server] [-m size] [-M size] [-f --flush] [-h --help]\n"
+		"options:\n"
+		"server, server node to connect to, this node will be the client\n"
+		"-m, starting message size in bytes (default %i)\n"
+		"-M, ending message size in bytes (default %i)\n"
+		"-f/--flush, flush L3 cache before benchmark\n"
+		"--mqstats, show psm2 mq counters\n"
+		"-h/--help, show this help message\n";
+	fprintf(stderr, usage, name, MIN_MSG_SZ, MAX_MSG_SZ);	/* the %i slots: default sizes */
+}
+
+/* Build a benchmark_info from argv (caller frees).  No positional argument:
+ * this process is the server; exactly one: it names the server and this
+ * process is the client.  Returns NULL, after a diagnostic, on bad input. */
+struct benchmark_info *init_benchmark(int argc, char **argv)
+{
+	int opt_idx = 0, got_args = 0, c;
+	struct benchmark_info *info = malloc(sizeof(*info));
+
+	if (info == NULL) {
+		perror("benchmark_info malloc");
+		return NULL;
+	}
+
+	const struct option long_options[] = {
+		{"flush", no_argument, NULL, 'f'},
+		{"help", no_argument, NULL, 'h'},
+		{"mqstats", no_argument, &info->show_mqstats, 1},
+		{0, 0, 0, 0}
+	};
+
+	optind = 1;	// Force reset of optind
+
+	// Set default values
+	info->run_flush = 0;
+	info->min_msg_sz = MIN_MSG_SZ;
+	info->max_msg_sz = MAX_MSG_SZ;
+	info->show_mqstats = 0;
+
+	while ((c = getopt_long(argc, argv, "fm:M:h",
+				long_options, &opt_idx)) != -1) {
+		switch (c) {
+		case 0:	/* --mqstats: getopt_long() already stored the flag */
+			break;
+		case 'f':
+			info->run_flush = 1;
+			got_args = 1;
+			break;
+		case 'm':
+			info->min_msg_sz = str_to_positive_long(optarg);
+			got_args = 1;
+			if (info->min_msg_sz == -1) {
+				fprintf(stderr, "Invalid number for m\n");
+				goto bail;
+			}
+			break;
+		case 'M':
+			info->max_msg_sz = str_to_positive_long(optarg);
+			got_args = 1;
+			if (info->max_msg_sz == -1) {
+				fprintf(stderr, "Invalid number for M\n");
+				goto bail;
+			}
+			break;
+		case 'h':
+		default:
+			print_usage(argv[0]);
+			goto bail;
+		}
+	}
+
+	/* Validate once everything is parsed so -m/-M order does not matter */
+	if (info->max_msg_sz < info->min_msg_sz) {
+		fprintf(stderr, "Max msg size smaller than Min size\n");
+		goto bail;
+	}
+	/* sbuff/rbuff hold MAX_MSG_SZ bytes; larger messages would overrun them */
+	if (info->max_msg_sz > MAX_MSG_SZ) {
+		fprintf(stderr, "Max msg size must not exceed %d\n", MAX_MSG_SZ);
+		goto bail;
+	}
+
+	if (gethostname(info->hostname, HOSTNAME_SZ) != 0) {
+		perror("gethostname");
+		goto bail;
+	}
+	info->hostname[HOSTNAME_SZ - 1] = '\0';	/* a truncated name may lack the NUL */
+
+	info->cpu_freq = get_cpu_rate() * 1e6;
+	if (info->cpu_freq == 0.0)
+		goto bail;
+
+	// Server will be the process with no positional arguments
+	if (argc == optind) {
+		info->is_server = 1;
+		info->partner = 1;
+		snprintf(info->server, HOSTNAME_SZ, "%s", info->hostname);
+		if (got_args)
+			printf("WARN: all arguments ignored for server\n");
+	// Found a positional argument, designates client process
+	} else if (optind == (argc - 1)) {
+		info->is_server = 0;
+		info->partner = 0;
+		/* snprintf() always NUL-terminates, unlike strncpy() */
+		snprintf(info->server, HOSTNAME_SZ, "%s", argv[optind]);
+	} else {
+		print_usage(argv[0]);
+		fprintf(stderr, "Found extra positional arguments\n");
+		goto bail;
+	}
+	return info;
+bail:
+	free(info);
+	return NULL;
+}
+
+/* Return the value of the first "cpu MHz" line in /proc/cpuinfo, or 0.0
+ * if the file cannot be opened or no such line can be parsed. */
+float get_cpu_rate(void)
+{
+	static const char key[] = "cpu MHz";
+	char line[STR_SZ];
+	char label[STR_SZ];
+	float mhz = 0.0;
+	int found = 0;
+	FILE *cpuinfo = fopen("/proc/cpuinfo", "r");
+
+	if (cpuinfo == NULL) {
+		perror("fopen /proc/cpuinfo");
+		return 0.0;
+	}
+
+	while (!found && fgets(line, STR_SZ, cpuinfo) != NULL) {
+		if (strncmp(line, key, strlen(key)) != 0)
+			continue;
+		found = (sscanf(line, "%[^:]:%f", label, &mhz) == 2);
+	}
+	fclose(cpuinfo);
+	if (!found) {
+		fprintf(stderr, "Could not find cpu MHz in /proc/cpuinfo\n");
+		return 0.0;
+	}
+	return mhz;
+}
+
+/* Open the TCP control connection.  The server accepts one connection on
+ * port; the client connects to server_name:port.  Returns the connected
+ * socket descriptor, or a negative value on failure. */
+int open_socket(char *server_name, int is_server, int port)
+{
+	int sock, conn, close_immed = 1;
+	socklen_t remote_len;
+	struct hostent *server;
+	struct sockaddr_in server_addr, remote_addr;
+
+	sock = socket(AF_INET, SOCK_STREAM, 0);
+	if (sock < 0) {
+		perror("Could not open socket");
+		return sock;
+	}
+
+	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+			(char *)&close_immed, sizeof(close_immed))) {
+		perror("setsockopt");
+		goto bail;
+	}
+
+	server = gethostbyname(server_name);
+	if (server == NULL || server->h_addrtype != AF_INET) {
+		fprintf(stderr, "Error in gethostbyname\n");
+		goto bail;
+	}
+
+	/* Zero the address first so no field is passed on as stack garbage */
+	memset(&server_addr, 0, sizeof(server_addr));
+	server_addr.sin_family = AF_INET;
+	server_addr.sin_port = htons(port);
+
+	if (is_server) {
+		server_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+		if (bind(sock, (struct sockaddr *) &server_addr,
+				sizeof(server_addr)) == -1) {
+			perror("bind");
+			goto bail;
+		}
+		if (listen(sock, MAX_CLIENTS) == -1) {
+			perror("listen");
+			goto bail;
+		}
+
+		remote_len = sizeof(remote_addr);
+		conn = accept(sock, (struct sockaddr *) &remote_addr, &remote_len);
+		if (conn < 0) {
+			perror("accept");
+			goto bail;
+		}
+		close(sock);	/* done listening; don't leak the listening socket */
+		sock = conn;
+	} else {
+		memcpy(&server_addr.sin_addr, server->h_addr_list[0], server->h_length);
+		if (connect(sock, (struct sockaddr *)&server_addr,
+				sizeof(server_addr)) == -1) {
+			perror("connect");
+			goto bail;
+		}
+	}
+	return sock;
+bail:
+	close(sock);
+	return -1;
+}
+
+/* Send client options to server, return 0 on success, -1 on failure */
+int exchange_info(int sock, struct benchmark_info *info)
+{
+	if (info->is_server) {	/* server overwrites its settings with the client's */
+		RECV(sock, info->min_msg_sz, long);
+		RECV(sock, info->max_msg_sz, long);
+		RECV(sock, info->run_flush, int);
+		RECV(sock, info->show_mqstats, int);
+	} else {	/* field order must mirror the RECV order above */
+		SEND(sock, info->min_msg_sz, long);
+		SEND(sock, info->max_msg_sz, long);
+		SEND(sock, info->run_flush, int);
+		SEND(sock, info->show_mqstats, int);
+	}
+
+	return 0;
+bail:	/* SEND/RECV jump here when the socket call fails */
+	fprintf(stderr, "%s failure\n", __func__);
+	return -1;
+}
diff --git a/perf_test/psm2perf.h b/perf_test/psm2perf.h
new file mode 100644
index 0000000..dd053d4
--- /dev/null
+++ b/perf_test/psm2perf.h
@@ -0,0 +1,138 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSM2PERF_H_
+#define _PSM2PERF_H_
+#include <stdio.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+
+/* Scribble over a buffer twice the size of the L3 cache to evict cached data;
+ * doing this before a benchmark can help get more consistent results.
+ * Returns the L3 size used in bytes, or -1 if the buffer cannot be allocated. */
+static inline long flush_l3cache(void)
+{
+	long l3_cache_size = sysconf(_SC_LEVEL3_CACHE_SIZE);
+	long i, j;	/* long: twice the L3 size need not fit in an int */
+	if (l3_cache_size <= 0) {	/* -1 or 0: sysconf() gave no usable size */
+		l3_cache_size = 28*1024*1024;
+		printf("WARN: could not get L3 cache size from sysconf().\n"
+		       "using: %ld instead.\n", l3_cache_size);
+	}
+	volatile char *_cache_flush = malloc(2*l3_cache_size);	/* volatile: keep the stores */
+	if (_cache_flush == NULL)
+		return -1;
+	for (i = 0; i < 80; i++)
+		for (j = 0; j < l3_cache_size*2; j++)
+			_cache_flush[j] = i * j;
+	free((void *)_cache_flush);
+	return l3_cache_size;
+}
+
+#define SEND(sock, data, type)						\
+	do {	/* anything but a full sizeof(type) transfer is an error */ \
+		if (send(sock, (void *)&(data), sizeof(type), 0) != (ssize_t)sizeof(type)) { \
+			perror("send");					\
+			goto bail;					\
+		}							\
+	} while (0)
+
+#define RECV(sock, data, type)						\
+	do {	/* MSG_WAITALL: wait for every byte; EOF (0) also bails */ \
+		if (recv(sock, (void *)&(data), sizeof(type), MSG_WAITALL) != (ssize_t)sizeof(type)) { \
+			perror("recv");					\
+			goto bail;					\
+		}							\
+	} while (0)
+
+#define TIMER(a) do { (a) = get_cycles(); } while (0)	/* safe in unbraced if/else */
+
+#define LARGE_MSG    65536	/* sizes above this use the reduced iteration count */
+#define HOSTNAME_SZ  256	/* size of the hostname and server name buffers */
+#define ITERS_SMALL  50	/* iterations for messages larger than LARGE_MSG */
+#define ITERS_MEDIUM 500	/* bandwidth test iterations up to LARGE_MSG */
+#define ITERS_LARGE  50000	/* latency test iterations up to LARGE_MSG */
+#define SERVER_PORT  33087	/* TCP port of the control socket */
+#define MAX_CLIENTS  1	/* listen() backlog */
+#define STR_SZ       1024	/* line buffer size when reading /proc/cpuinfo */
+
+/* Defaults */
+#define MIN_MSG_SZ   1
+#define MAX_MSG_SZ   (4*1048576)	/* also the size of sbuff and rbuff */
+#define WINDOW       64	/* messages posted per iteration in the bandwidth test */
+
+char server_name[HOSTNAME_SZ];	/* NOTE(review): appears unused in the code shown -- confirm */
+/* NOTE(review): objects defined in a header; several includers rely on -fcommon -- confirm */
+char sbuff[MAX_MSG_SZ];	/* message payload buffer passed to the send calls */
+char rbuff[MAX_MSG_SZ];	/* message payload buffer passed to the receive calls */
+
+struct benchmark_info {
+	double cpu_freq;		/* cycles per second: get_cpu_rate() MHz * 1e6 */
+	char hostname[HOSTNAME_SZ];	/* local host name from gethostname() */
+	char server[HOSTNAME_SZ];	/* server host name; own hostname on the server */
+	int is_server;			/* 1 when run without a positional argument */
+	int partner;			/* peer's rank: 1 on the server, 0 on the client */
+
+	/* These are selected by the client, sent to server */
+	long min_msg_sz;		/* first message size in bytes (-m) */
+	long max_msg_sz;		/* last message size in bytes (-M) */
+	int run_flush;			/* nonzero: flush the L3 cache first (-f) */
+	int show_mqstats;		/* nonzero: print MQ counters (--mqstats) */
+};
+
+struct benchmark_info *init_benchmark(int argc, char **argv);
+int open_socket(char *server_name, int is_server, int port);
+int exchange_info(int sock, struct benchmark_info *info);
+float get_cpu_rate(void);
+#endif
diff --git a/psm.c b/psm.c
index 287c290..fc9f769 100644
--- a/psm.c
+++ b/psm.c
@@ -77,11 +77,22 @@ static int psmi_isinit = PSMI_NOT_INITIALIZED;
  * will not work on an endpoint which is in a middle of closing). */
 psmi_lock_t psmi_creation_lock;
 
+sem_t *sem_affinity_shm_rw = NULL;
+int psmi_affinity_shared_file_opened = 0;
+int psmi_affinity_semaphore_open = 0;
+uint64_t *shared_affinity_ptr;
+char *sem_affinity_shm_rw_name;
+char *affinity_shm_name;
+
 #ifdef PSM_CUDA
 int is_cuda_enabled;
+int is_gdr_copy_enabled;
 int device_support_gpudirect;
 int cuda_runtime_version;
 int is_driver_gpudirect_enabled;
+uint32_t cuda_thresh_rndv;
+uint32_t gdr_copy_threshold_send;
+uint32_t gdr_copy_threshold_recv;
 #endif
 
 /*
@@ -212,6 +223,49 @@ int psmi_cuda_initialize()
     		}
 
 	}
+
+#ifdef PSM_CUDA
+	union psmi_envvar_val env_enable_gdr_copy;
+	psmi_getenv("PSM2_GDRCOPY",
+				"Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)",
+				PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+				(union psmi_envvar_val)1, &env_enable_gdr_copy);
+	is_gdr_copy_enabled = env_enable_gdr_copy.e_int;
+
+	union psmi_envvar_val env_cuda_thresh_rndv;
+	psmi_getenv("PSM2_CUDA_THRESH_RNDV",
+				"RNDV protocol is used for message sizes greater than the threshold \n",
+				PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+				(union psmi_envvar_val)CUDA_THRESH_RNDV, &env_cuda_thresh_rndv);
+	cuda_thresh_rndv = env_cuda_thresh_rndv.e_int;
+
+	if (cuda_thresh_rndv < 0 || cuda_thresh_rndv > CUDA_THRESH_RNDV)
+	    cuda_thresh_rndv = CUDA_THRESH_RNDV;
+
+	union psmi_envvar_val env_gdr_copy_thresh_send;
+	psmi_getenv("PSM2_GDRCOPY_THRESH_SEND",
+				"GDR Copy is turned off on the send side"
+				" for message sizes greater than the threshold \n",
+				PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+				(union psmi_envvar_val)GDR_COPY_THRESH_SEND, &env_gdr_copy_thresh_send);
+	gdr_copy_threshold_send = env_gdr_copy_thresh_send.e_int;
+
+	if (gdr_copy_threshold_send < 8 || gdr_copy_threshold_send > cuda_thresh_rndv)
+		gdr_copy_threshold_send = GDR_COPY_THRESH_SEND;
+
+	union psmi_envvar_val env_gdr_copy_thresh_recv;
+	psmi_getenv("PSM2_GDRCOPY_THRESH_RECV",
+				"GDR Copy is turned off on the recv side"
+				" for message sizes greater than the threshold \n",
+				PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+				(union psmi_envvar_val)GDR_COPY_THRESH_RECV, &env_gdr_copy_thresh_recv);
+	gdr_copy_threshold_recv = env_gdr_copy_thresh_recv.e_int;
+
+	if (gdr_copy_threshold_recv < 8)
+		gdr_copy_threshold_recv = GDR_COPY_THRESH_RECV;
+
+#endif
+
 	PSM2_LOG_MSG("leaving");
 	return err;
 fail:
@@ -466,6 +520,43 @@ psm2_error_t __psm2_finalize(void)
 		psmi_free(hostname);
 	psmi_epid_itor_fini(&itor);
 
+	/* unmap shared mem object for affinity */
+	if (psmi_affinity_shared_file_opened) {
+		/*
+		 * Start critical section to decrement ref count and unlink
+		 * affinity shm file.
+		 */
+		psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
+
+		shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] -= 1;
+		if (shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] <= 0) {
+			_HFI_VDBG("Unlink shm file for HFI affinity as there are no more users\n");
+			shm_unlink(affinity_shm_name);
+		} else {
+			_HFI_VDBG("Number of affinity shared memory users left=%ld\n",
+				  shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]);
+		}
+
+		msync(shared_affinity_ptr, AFFINITY_SHMEMSIZE, MS_SYNC);
+
+		/* End critical section */
+		psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
+
+		munmap(shared_affinity_ptr, AFFINITY_SHMEMSIZE);
+		psmi_free(affinity_shm_name);
+		affinity_shm_name = NULL;
+		psmi_affinity_shared_file_opened = 0;
+	}
+
+	if (psmi_affinity_semaphore_open) {
+		_HFI_VDBG("Closing and Unlinking Semaphore: %s.\n", sem_affinity_shm_rw_name);
+		sem_close(sem_affinity_shm_rw);
+		sem_unlink(sem_affinity_shm_rw_name);
+		psmi_free(sem_affinity_shm_rw_name);
+		sem_affinity_shm_rw_name = NULL;
+		psmi_affinity_semaphore_open = 0;
+	}
+
 	psmi_isinit = PSMI_FINALIZED;
 	PSM2_LOG_MSG("leaving");
 	psmi_log_fini();
diff --git a/psm_context.c b/psm_context.c
index b2181b1..db7f0a6 100644
--- a/psm_context.c
+++ b/psm_context.c
@@ -55,7 +55,6 @@
 
 #include <sys/types.h>
 #include <sys/stat.h>
-#include <fcntl.h>
 
 #include "psm_user.h"
 
@@ -204,6 +203,169 @@ psmi_spread_hfi_selection(psm2_uuid_t const job_key, long *unit_start,
 	}
 }
 
+/*
+ * Create, or attach to, the per-job shm object used for HFI affinity and
+ * take a reference on it. Returns 0 on success, a negative value on failure.
+ */
+static int
+psmi_create_and_open_affinity_shm(psm2_uuid_t const job_key)
+{
+	int shm_fd, ret, first_to_create = 0;
+	size_t shm_name_len = 256;
+
+	shared_affinity_ptr = NULL;
+	affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len);
+	snprintf(affinity_shm_name, shm_name_len,
+		 AFFINITY_SHM_BASENAME".%d",
+		 psmi_get_uuid_hash(job_key));
+	shm_fd = shm_open(affinity_shm_name, O_RDWR | O_CREAT | O_EXCL,
+			  S_IRUSR | S_IWUSR);
+	if (shm_fd >= 0) {	/* 0 is a valid descriptor too */
+		first_to_create = 1;
+	} else if (errno == EEXIST) {
+		shm_fd = shm_open(affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR);
+		if (shm_fd < 0) {
+			_HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n",
+				  affinity_shm_name, errno);
+			return shm_fd;
+		}
+	} else {
+		_HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n",
+			  affinity_shm_name, errno);
+		return -1;	/* nothing to truncate or map */
+	}
+
+	ret = ftruncate(shm_fd, AFFINITY_SHMEMSIZE);
+	if (ret < 0) {
+		close(shm_fd);	/* do not leak the descriptor */
+		return ret;
+	}
+
+	shared_affinity_ptr = (uint64_t *) mmap(NULL, AFFINITY_SHMEMSIZE, PROT_READ | PROT_WRITE,
+					MAP_SHARED, shm_fd, 0);
+	if (shared_affinity_ptr == MAP_FAILED) {
+		_HFI_VDBG("Cannot mmap affinity shared memory. errno=%d\n",
+			  errno);
+		close(shm_fd);
+		return -1;
+	}
+	close(shm_fd);
+
+	psmi_affinity_shared_file_opened = 1;
+
+	if (first_to_create) {
+		_HFI_VDBG("Creating shm to store HFI affinity per socket\n");
+		memset(shared_affinity_ptr, 0, AFFINITY_SHMEMSIZE);
+		/* Shm is initialized; unlock others so they can use it. */
+		psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
+	} else {
+		_HFI_VDBG("Opening shm object to read/write HFI affinity per socket\n");
+	}
+
+	/*
+	 * Start critical section to increment reference count when creating
+	 * or opening shm object. Decrement of ref count will be done before
+	 * closing the shm.
+	 */
+	if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) {
+		_HFI_VDBG("Could not enter critical section to update shm refcount\n");
+		return -1;
+	}
+
+	shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1;
+
+	/* End critical section */
+	psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
+
+	return 0;
+}
+
+/*
+ * Spread HFI selection between units if we find more than one within a socket.
+ * On any failure, fall back to the first HFI found (saved_hfis[0]).
+ */
+static void
+psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id,
+			      int *saved_hfis, int found, psm2_uuid_t const job_key)
+{
+	int shm_location;
+	long hfi_idx;
+
+	/* The affinity lock and the shared region are needed to pick an HFI. */
+	if (!psmi_affinity_semaphore_open)
+		goto spread_hfi_fallback;
+
+	if (psmi_create_and_open_affinity_shm(job_key) < 0)
+		goto spread_hfi_fallback;
+
+	/* shm_location indexes shared_affinity_ptr[]: bound it in elements, not bytes */
+	shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id;
+	if (shm_location < 0 || (size_t)shm_location >=
+	    (size_t)AFFINITY_SHMEMSIZE / sizeof(*shared_affinity_ptr))
+		goto spread_hfi_fallback;
+
+	/* Start critical section to read/write shm object */
+	if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) {
+		_HFI_VDBG("Could not enter critical section to update HFI index\n");
+		goto spread_hfi_fallback;
+	}
+
+	/* The slot holds a round-robin index into saved_hfis[], not a unit id */
+	hfi_idx = shared_affinity_ptr[shm_location] % found;
+	*unit_start = *unit_end = saved_hfis[hfi_idx];
+	shared_affinity_ptr[shm_location] = (hfi_idx + 1) % found;
+	_HFI_VDBG("Selected HFI index= %ld, Next HFI=%ld, node = %d, local rank=%d, found=%d.\n",
+		  *unit_start, shared_affinity_ptr[shm_location], node_id,
+		  psmi_get_envvar("MPI_LOCALRANKID"), found);
+	/* End Critical Section */
+	psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
+	return;	/* selection made: do not fall into the fallback below */
+
+spread_hfi_fallback:
+	*unit_start = *unit_end = saved_hfis[0];
+}
+
+/*
+ * Create or open the per-job named semaphore guarding the affinity shm
+ * object. Sets psmi_affinity_semaphore_open on success only.
+ */
+static void
+psmi_create_affinity_semaphores(psm2_uuid_t const job_key)
+{
+	int ret;
+	size_t sem_len = 256;
+
+	/*
+	 * If already opened, no need to do anything else.
+	 * This could be true for Multi-EP cases where a different thread has
+	 * already created the semaphores. We don't need separate locks here as
+	 * we are protected by the overall "psmi_creation_lock" which each
+	 * thread will take in psm2_ep_open()
+	 */
+	if (psmi_affinity_semaphore_open)
+		return;	/* keep the existing name: finalize still needs it */
+
+	sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len);
+	snprintf(sem_affinity_shm_rw_name, sem_len,
+		 SEM_AFFINITY_SHM_RW_BASENAME".%d",
+		 psmi_get_uuid_hash(job_key));
+
+	ret = psmi_init_semaphore(&sem_affinity_shm_rw, sem_affinity_shm_rw_name,
+				  S_IRUSR | S_IWUSR, 0);
+	if (ret) {
+		_HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n",
+			  sem_affinity_shm_rw_name);
+		/* sem_open() failed: there is no semaphore to sem_close() */
+		psmi_free(sem_affinity_shm_rw_name);
+		sem_affinity_shm_rw_name = NULL;
+		return;
+	}
+
+	_HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n",
+		  sem_affinity_shm_rw_name);
+	psmi_affinity_semaphore_open = 1;
+}
+
 static
 psm2_error_t
 psmi_compute_start_and_end_unit(psmi_context_t *context,long unit_param,
@@ -212,6 +374,7 @@ psmi_compute_start_and_end_unit(psmi_context_t *context,long unit_param,
 {
 	int node_id, unit_id, found = 0;
 	int saved_hfis[nunits];
+
 	context->user_info.hfi1_alg = HFI1_ALG_ACROSS;
 	/* if the user did not set HFI_UNIT then ... */
 	if (unit_param == HFI_UNIT_ID_ANY)
@@ -237,20 +400,14 @@ psmi_compute_start_and_end_unit(psmi_context_t *context,long unit_param,
 					if (hfi_sysfs_unit_read_node_s64(unit_id) == node_id) {
 						saved_hfis[found] = unit_id;
 						found++;
-						_HFI_VDBG("Picking unit: %d for current task"
-							  " which is on node:%d\n", unit_id, node_id);
 					}
 				}
 
-				/*
-				 * Spread HFI selection between units if
-				 * we find more than one within a socket.
-				 */
 				if (found > 1) {
-					*unit_start = (psmi_get_envvar("MPI_LOCALRANKID") +
-						psmi_get_uuid_hash(job_key)) % found;
-
-					*unit_start = *unit_end = saved_hfis[*unit_start];
+					psmi_create_affinity_semaphores(job_key);
+					psmi_spread_hfi_within_socket(unit_start, unit_end,
+								      node_id, saved_hfis,
+								      found, job_key);
 				} else if (found == 1) {
 					*unit_start = *unit_end = saved_hfis[0];
 				}
@@ -270,12 +427,12 @@ psmi_compute_start_and_end_unit(psmi_context_t *context,long unit_param,
 			*unit_end = nunits - 1;
 		}
 	} else if (unit_param >= 0) {
-	/* the user specified HFI_UNIT, we use it. */
+		/* the user specified HFI_UNIT, we use it. */
 		*unit_start = *unit_end = unit_param;
 	} else {
 		psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
-					"PSM2 can't open unit: %ld for reading and writing",
-					unit_param);
+				 "PSM2 can't open unit: %ld for reading and writing",
+				 unit_param);
 		return PSM2_EP_DEVICE_FAILURE;
 	}
 
diff --git a/psm_ep.c b/psm_ep.c
index 957aed9..f463f48 100644
--- a/psm_ep.c
+++ b/psm_ep.c
@@ -55,7 +55,6 @@
 
 #include <sys/types.h>
 #include <sys/stat.h>
-#include <fcntl.h>
 #include <sched.h>		/* cpu_set */
 #include <ctype.h>		/* isalpha */
 
@@ -63,6 +62,9 @@
 #include "psm_mq_internal.h"
 #include "psm_am_internal.h"
 
+#ifdef PSM_CUDA
+#include "psm_gdrcpy.h"
+#endif
 /*
  * Endpoint management
  */
@@ -1076,6 +1078,11 @@ __psm2_ep_open(psm2_uuid_t const unique_job_key,
 		}
 	}
 
+#ifdef PSM_CUDA
+	if (PSMI_IS_GDR_COPY_ENABLED)
+		hfi_gdr_open();
+#endif
+
 	err = __psm2_ep_open_internal(unique_job_key,
 				     devid_enabled, opts_i, mq, &ep, &epid);
 	if (err != PSM2_OK)
@@ -1142,6 +1149,17 @@ psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
 		t_start = get_cycles();
 	}
 #endif
+
+#ifdef PSM_CUDA
+	/*
+	 * The close on the gdr fd needs to be called before the
+	 * close on the hfi fd as the gdr device will hold
+	 * reference count on the hfi device which will make the close
+	 * on the hfi fd return without actually closing the fd.
+	 */
+	if (PSMI_IS_GDR_COPY_ENABLED)
+		hfi_gdr_close();
+#endif
 	union psmi_envvar_val timeout_intval;
 	psm2_ep_t tmp;
 	psm2_mq_t mmq;
diff --git a/psm_gdrcpy.c b/psm_gdrcpy.c
new file mode 100644
index 0000000..daf14ba
--- /dev/null
+++ b/psm_gdrcpy.c
@@ -0,0 +1,226 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#ifdef PSM_CUDA
+#include "psm_user.h"
+#include "psm_gdrcpy.h"
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include "ptl_ips/ips_tid.h"
+#include "ptl_ips/ips_expected_proto.h"
+
+
+static int gdr_fd;
+
+int is_gdr_copy_enabled;
+
+
+int get_gdr_fd(){
+	return gdr_fd;
+}
+
+#define GPU_PAGE_OFFSET_MASK (PSMI_GPU_PAGESIZE -1)
+#define GPU_PAGE_MASK ~GPU_PAGE_OFFSET_MASK
+
+/* Issue the GDR GPU cache evict ioctl; returns pages_evicted from its reply. */
+uint64_t
+gdr_cache_evict() {
+	int ret;
+	struct hfi1_gdr_cache_evict_params params;
+	params.evict_params_in.version = HFI1_GDR_VERSION;
+	params.evict_params_in.pages_to_evict = 4;
+	ret = ioctl(gdr_fd, HFI1_IOCTL_GDR_GPU_CACHE_EVICT, &params);
+	if (ret) {
+		/* Fatal error */
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+						  "GDR cache evict ioctl failed ret %d errno %d\n",
+						  ret, errno);
+		return ret;
+	}
+
+	return params.evict_params_out.pages_evicted;
+}
+
+
+uint64_t
+ips_sdma_gpu_cache_evict(int fd) {
+	int ret;
+	struct hfi1_sdma_gpu_cache_evict_params params;
+	params.evict_params_in.version = HFI1_GDR_VERSION;
+	params.evict_params_in.pages_to_evict = 2;
+
+	ret = ioctl(fd, HFI1_IOCTL_SDMA_CACHE_EVICT, &params);
+	if (ret) {
+		/* Fatal error */
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+						  "SDMA Cache Evict failed ret %d errno %d\n",
+						  ret, errno);
+		return ret;
+	}
+
+	return params.evict_params_out.pages_evicted;
+}
+
+/* handle_out_of_bar_space is called when the driver tries
+ * to self evict in the GDR cache and finds no entries.
+ * This could be due to the fact that all the pages pinned
+ * in the BAR1 region are cached in the SDMA and TID cache.
+ * We try to evict from both the caches for 30 seconds after
+ * which we bail out. If successful we retry to PIN/MMAP once
+ * again
+ */
+uint64_t
+handle_out_of_bar_space(struct ips_proto *proto)
+{
+	time_t lastEvictTime = 0;
+	uint64_t lengthEvicted;
+	time_t now;
+ retry:
+	now = time(NULL);
+
+	if (!lastEvictTime)
+		lastEvictTime = now;
+
+	if (proto->protoexp && proto->protoexp->tidc.tid_cachemap.payload.nidle) {
+		lengthEvicted =
+			ips_tidcache_evict(&proto->protoexp->tidc, -1);
+
+		if (lengthEvicted) {
+			lastEvictTime = 0;
+			return lengthEvicted; /* signals a retry of the writev command. */
+		}
+	}
+
+	lengthEvicted = ips_sdma_gpu_cache_evict(proto->fd);
+	if (lengthEvicted) {
+		lastEvictTime = 0;
+		return lengthEvicted;
+	}
+	static const double thirtySeconds = 30.0;
+	if (difftime(now, lastEvictTime) >
+		thirtySeconds) {
+		return 0;
+	} else {
+		goto retry;
+	}
+}
+
+void *
+gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf,
+							 size_t size, int flags,
+							 struct ips_proto* proto)
+{
+	struct hfi1_gdr_query_params query_params;
+	void *host_addr_buf;
+	int ret;
+
+	query_params.query_params_in.version = HFI1_GDR_VERSION;
+	uintptr_t pageaddr = buf & GPU_PAGE_MASK;
+	/* As size is guaranteed to be in the range of 0-8kB
+	 * there is a guarantee that buf+size-1 does not overflow
+	 * 64 bits.
+	 */
+	uint32_t pagelen = (uint32_t) (PSMI_GPU_PAGESIZE +
+					   ((buf + size - 1) & GPU_PAGE_MASK) -
+					   pageaddr);
+
+	query_params.query_params_in.gpu_buf_addr = pageaddr;
+	query_params.query_params_in.gpu_buf_size = pagelen;
+ retry:
+
+	ret = ioctl(gdr_fd, HFI1_IOCTL_GDR_GPU_PIN_MMAP, &query_params);
+
+	if (ret) {
+		if (errno == ENOMEM || errno == EINVAL) {
+			if (!handle_out_of_bar_space(proto)) {
+				/* Fatal error */
+				psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+								  "Unable to PIN GPU pages(Out of BAR1 space)\n");
+				return NULL;
+			} else {
+				goto retry;
+			}
+		} else {
+			/* Fatal error */
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+							  "PIN/MMAP ioctl failed ret %d errno %d\n",
+							  ret, errno);
+			return NULL;
+		}
+	}
+	host_addr_buf = (void *)query_params.query_params_out.host_buf_addr;
+	return host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK);
+}
+
+
+void hfi_gdr_open(){
+	gdr_fd = open(GDR_DEVICE_PATH, O_RDWR);
+	if (-1 == gdr_fd ) {
+		/* Non-Fatal error. If device cannot be found we assume
+		 * that the driver does not support GDR Copy and we fallback
+		 * to sending all GPU messages using rndv protocol
+		 */
+		_HFI_INFO(" Warning: The HFI1 driver installed does not support GPUDirect RDMA"
+				  " fast copy. Turning off GDR fast copy in PSM \n");
+		is_gdr_copy_enabled = 0;
+		return;
+	}
+	return;
+}
+
+void hfi_gdr_close()
+{
+	close(GDR_FD);
+}
+
+#endif
diff --git a/psm_gdrcpy.h b/psm_gdrcpy.h
new file mode 100644
index 0000000..2773454
--- /dev/null
+++ b/psm_gdrcpy.h
@@ -0,0 +1,77 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2018 Intel Corporation. All rights reserved. */
+#ifndef GDR_CPY_H
+#define GDR_CPY_H
+#ifdef PSM_CUDA
+
+#include "ptl_ips/ips_proto.h"
+
+#define GDR_FD get_gdr_fd()
+
+int get_gdr_fd();
+
+void hfi_gdr_open();
+
+void hfi_gdr_close();
+
+void *
+gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf,
+				size_t size, int flags,
+				struct ips_proto* proto);
+
+uint64_t
+gdr_cache_evict();
+#endif
+#endif
diff --git a/psm_lock.h b/psm_lock.h
index 56e82a8..c82960c 100644
--- a/psm_lock.h
+++ b/psm_lock.h
@@ -139,4 +139,52 @@ PSMI_ALWAYS_INLINE(void psmi_init_lock(psmi_lock_t *lock))
 #endif
 }
 
+PSMI_ALWAYS_INLINE(int psmi_sem_post(sem_t *sem, const char *name))
+{
+	if (sem_post(sem) == -1) {
+		_HFI_VDBG("Semaphore %s: post failed\n", name ? name : "NULL" );
+		return -1;
+	}
+
+	_HFI_VDBG("Semaphore %s: post succeeded\n", name ? name : "NULL");
+
+	return 0;
+}
+
+PSMI_ALWAYS_INLINE(int psmi_sem_timedwait(sem_t *sem, const char *name))
+{
+	/* Wait 5 seconds for shm read-write lock to open */
+	struct timespec ts;
+	clock_gettime(CLOCK_REALTIME, &ts);
+	ts.tv_sec += 5;
+
+	if (sem_timedwait(sem, &ts) == -1) {
+		_HFI_VDBG("Semaphore %s: Timedwait failed\n", name ? name : "NULL" );
+		return -1;
+	}
+
+	_HFI_VDBG("Semaphore %s: Timedwait succeeded\n", name ? name : "NULL");
+
+	return 0;
+}
+
+PSMI_ALWAYS_INLINE(int psmi_init_semaphore(sem_t **sem, const char *name,
+					   mode_t mode, int value))
+{
+	*sem = sem_open(name, O_CREAT | O_EXCL, mode, value);
+	if ((*sem == SEM_FAILED) && (errno == EEXIST)) {
+		*sem = sem_open(name, O_CREAT, mode, value);
+		if (*sem == SEM_FAILED) {
+			_HFI_VDBG("Cannot open semaphore %s, errno=%d\n",
+				  name, errno);
+			return -1;
+		}
+	} else if (*sem == SEM_FAILED) {
+		_HFI_VDBG("Cannot create semaphore %s, errno=%d\n", name, errno);
+		return -1;
+	}
+
+	return 0;
+}
+
 #endif /* _PSMI_LOCK_H */
diff --git a/psm_mq.c b/psm_mq.c
index 37290bd..0574493 100644
--- a/psm_mq.c
+++ b/psm_mq.c
@@ -58,6 +58,10 @@
 #include "psm_user.h"
 #include "psm_mq_internal.h"
 
+#ifdef PSM_CUDA
+#include "psm_gdrcpy.h"
+#endif
+
 /*
  * Functions to manipulate the expected queue in mq_ep.
  */
@@ -859,12 +863,18 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len)
 	case MQ_STATE_COMPLETE:
 		if (req->buf != NULL) {	/* 0-byte messages don't alloc a sysbuf */
 			copysz = mq_set_msglen(req, len, req->send_msglen);
+			void *ubuf = buf;
 #ifdef PSM_CUDA
-			psmi_mtucpy_fn
+			if (PSMI_USE_GDR_COPY(req, len)) {
+				ubuf = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)buf,
+								    len, 1,
+								    mq->ep->epaddr->proto);
+				psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+			}
+			psmi_mtucpy_fn(ubuf, (const void *)req->buf, copysz);
 #else
-			psmi_mq_mtucpy
+			psmi_mq_mtucpy(ubuf, (const void *)req->buf, copysz);
 #endif
-				(buf, (const void *)req->buf, copysz);
 			psmi_mq_sysbuf_free(mq, req->buf);
 		}
 		req->buf = buf;
@@ -878,6 +888,16 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len)
 		 * any more than copysz.  After that, swap system with user buffer
 		 */
 		req->recv_msgoff = min(req->recv_msgoff, copysz);
+
+#ifdef PSM_CUDA
+		if (PSMI_USE_GDR_COPY(req, req->send_msglen)) {
+			buf = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)req->user_gpu_buffer,
+							   req->send_msglen, 1,
+							   mq->ep->epaddr->proto);
+			psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+		}
+#endif
+
 		if (req->recv_msgoff) {
 #ifdef PSM_CUDA
 			psmi_mtucpy_fn
@@ -994,6 +1014,10 @@ __psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src,
 
 #ifdef PSM_CUDA
 		req->is_buf_gpu_mem = gpu_mem;
+		if (gpu_mem)
+			req->user_gpu_buffer = buf;
+		else
+			req->user_gpu_buffer = NULL;
 #endif
 
 		/* Nobody should touch the buffer after it's posted */
@@ -1011,6 +1035,10 @@ __psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src,
 			  tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req);
 #ifdef PSM_CUDA
 		req->is_buf_gpu_mem = gpu_mem;
+		if (gpu_mem)
+			req->user_gpu_buffer = buf;
+		else
+			req->user_gpu_buffer = NULL;
 #endif
 
 		req->context = context;
@@ -1261,6 +1289,112 @@ psm2_error_t __psm2_mq_setopt(psm2_mq_t mq, int key, const void *value)
 }
 PSMI_API_DECL(psm2_mq_setopt)
 
+#define TAB_SIZE 16
+#define STATS				\
+	STAT(rx_user_num)		\
+	STAT(rx_sys_bytes)		\
+	STAT(rx_sys_num)		\
+	STAT(tx_num)			\
+	STAT(tx_eager_num)		\
+	STAT(tx_eager_bytes)		\
+	STAT(tx_rndv_num)		\
+	STAT(tx_rndv_bytes)		\
+	STAT(tx_shm_num)		\
+	STAT(rx_shm_num)		\
+	STAT(rx_sysbuf_num)		\
+	STAT(rx_sysbuf_bytes)
+
+static
+void
+psmi_mq_print_stats(psm2_mq_t mq, FILE *perf_stats_fd)
+{
+	psm2_mq_stats_t stats;
+	char msg_buffer[MSG_BUFFER_LEN];
+
+	psm2_mq_get_stats(mq, &stats);
+
+#define STAT(x) \
+	snprintf(msg_buffer, MSG_BUFFER_LEN, "%*lu",TAB_SIZE, stats.x); \
+	fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd);
+
+	STATS
+
+#undef STAT
+
+	fwrite("\n", sizeof(char), 1, perf_stats_fd);
+}
+
+
+static
+void
+*psmi_mq_print_stats_thread(void *_mq)
+{
+	/* Thread entry: writes a header row, then one stats row per period. */
+	psm2_mq_t mq = (psm2_mq_t)_mq;
+	char perf_file_name[MSG_BUFFER_LEN];
+	char msg_buffer[MSG_BUFFER_LEN];
+	int delta_t = 0;
+
+	snprintf(perf_file_name, MSG_BUFFER_LEN, "./psm2-perf-stat-ep-%" PRIu64 "-pid-%d",
+			(uint64_t)(mq->ep->epid),
+			getpid());
+	FILE *perf_stats_fd = fopen(perf_file_name, "w+");
+	if (!perf_stats_fd)
+	{
+		_HFI_ERROR("Failed to create fd for performance logging\n");
+		goto end;
+	}
+
+#define STAT(x) \
+	snprintf(msg_buffer, MSG_BUFFER_LEN, "%*s",TAB_SIZE, #x);\
+	fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd);
+
+	STAT(delta_t)
+	STATS
+
+#undef STAT
+
+	fwrite("\n", sizeof(char), 1, perf_stats_fd);
+
+	/* Performance stats will be printed every $PSM2_MQ_PRINT_STATS seconds */
+	do {
+		snprintf(msg_buffer, MSG_BUFFER_LEN, "%*d",TAB_SIZE, delta_t);
+		fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd);
+		psmi_mq_print_stats(mq, perf_stats_fd);
+		fflush(perf_stats_fd);
+		sleep((unsigned int)mq->print_stats);	/* seconds; MICRO_SEC * n overflowed int for n > 2147 */
+		delta_t += mq->print_stats;
+	} while (mq->mq_perf_data.perf_print_stats);
+
+	fclose(perf_stats_fd);
+end:
+	pthread_exit(NULL);
+}
+
+static
+void
+psmi_mq_print_stats_init(psm2_mq_t mq)
+{
+	mq->mq_perf_data.perf_print_stats = 1;
+	if (pthread_create(&(mq->mq_perf_data.perf_print_thread), NULL,
+				psmi_mq_print_stats_thread, (void*)mq))
+	{
+		mq->mq_perf_data.perf_print_stats = 0;
+		_HFI_ERROR("Failed to create logging thread\n");
+	}
+}
+
+static
+void
+psmi_mq_print_stats_finalize(psm2_mq_t mq)
+{
+	if (mq->mq_perf_data.perf_print_stats)
+	{
+		mq->mq_perf_data.perf_print_stats = 0;
+		pthread_join(mq->mq_perf_data.perf_print_thread, NULL);
+	}
+}
+
 /*
  * This is the API for the user.  We actually allocate the MQ much earlier, but
  * the user can set options after obtaining an endpoint
@@ -1300,37 +1434,15 @@ __psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask,
 
 	*mqo = mq;
 
+	if (mq->print_stats > 0)
+		psmi_mq_print_stats_init(mq);
+
 fail:
 	PSM2_LOG_MSG("leaving");
 	return err;
 }
 PSMI_API_DECL(psm2_mq_init)
 
-static
-void
-psmi_mq_print_stats(psm2_mq_t mq)
-{
-	psm2_mq_stats_t stats;
-
-	psm2_mq_get_stats(mq, &stats);
-	_HFI_INFO("rx_user_bytes %lu\n", stats.rx_user_bytes);
-	_HFI_INFO("rx_user_num %lu\n", stats.rx_user_num);
-	_HFI_INFO("rx_sys_bytes %lu\n", stats.rx_sys_bytes);
-	_HFI_INFO("rx_sys_num %lu\n", stats.rx_sys_num);
-
-	_HFI_INFO("tx_num %lu\n", stats.tx_num);
-	_HFI_INFO("tx_eager_num %lu\n", stats.tx_eager_num);
-	_HFI_INFO("tx_eager_bytes %lu\n", stats.tx_eager_bytes);
-	_HFI_INFO("tx_rndv_num %lu\n", stats.tx_rndv_num);
-	_HFI_INFO("tx_rndv_bytes %lu\n", stats.tx_rndv_bytes);
-
-	_HFI_INFO("tx_shm_num %lu\n", stats.tx_shm_num);
-	_HFI_INFO("rx_shm_num %lu\n", stats.rx_shm_num);
-
-	_HFI_INFO("rx_sysbuf_num %lu\n", stats.rx_sysbuf_num);
-	_HFI_INFO("rx_sysbuf_bytes %lu\n", stats.rx_sysbuf_bytes);
-}
-
 psm2_error_t __psm2_mq_finalize(psm2_mq_t mq)
 {
 	psm2_error_t rv = PSM2_OK;
@@ -1339,8 +1451,13 @@ psm2_error_t __psm2_mq_finalize(psm2_mq_t mq)
 
 	PSMI_ERR_UNLESS_INITIALIZED(mq->ep);
 
+	if (mq->print_stats == -1)
+	{
+		mq->print_stats = 1;
+		psmi_mq_print_stats_init(mq);
+	}
 	if (mq->print_stats != 0)
-		psmi_mq_print_stats(mq);
+		psmi_mq_print_stats_finalize(mq);
 
 	PSM2_LOG_MSG("leaving");
 	return rv;
@@ -1395,7 +1512,7 @@ psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo)
 	mq->hfi_thresh_tiny = MQ_HFI_THRESH_TINY;
 #ifdef PSM_CUDA
 	if (PSMI_IS_CUDA_ENABLED)
-		mq->hfi_base_window_rv = MQ_HFI_THRESH_RNDV_CUDA;
+		mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_CUDA;
 #endif
 	mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV;
 
@@ -1446,7 +1563,9 @@ psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq)
 	mq->shm_thresh_rv = env_shmrv.e_uint;
 
 	psmi_getenv("PSM2_MQ_PRINT_STATS",
-		    "Print MQ stats during finalization",
+		    "Prints MQ performance stats every n seconds to file"
+			"./psm2-perf-stat-ep-[epid]-[pid] when set to -1 stats are "
+			"printed only once during finalization",
 		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
 		    (union psmi_envvar_val) 0, &env_stats);
 	mq->print_stats = env_stats.e_uint;
diff --git a/psm_mq_internal.h b/psm_mq_internal.h
index 0f30e5c..dd2b255 100644
--- a/psm_mq_internal.h
+++ b/psm_mq_internal.h
@@ -81,6 +81,14 @@ typedef psm2_error_t(*psm_mq_unexpected_callback_fn_t)
 #define NUM_HASH_CONFIGS 3
 #define NUM_MQ_SUBLISTS (NUM_HASH_CONFIGS + 1)
 #define REMOVE_ENTRY 1
+#define MICRO_SEC 1000000
+#define MSG_BUFFER_LEN 100
+
+struct psm2_mq_perf_data
+{
+	pthread_t perf_print_thread;
+	int perf_print_stats;
+};
 
 enum psm2_mq_tag_pattern {
 	PSM2_TAG_SRC = 0,
@@ -118,8 +126,12 @@ struct psm2_mq {
 	int memmode;
 
 	uint64_t timestamp;
+
 	psm2_mq_stats_t stats;	/**> MQ stats, accumulated by each PTL */
+
 	int print_stats;
+	struct psm2_mq_perf_data mq_perf_data;
+
 	int nohash_fastpath;
 	unsigned unexpected_hash_len;
 	unsigned unexpected_list_len;
@@ -146,7 +158,7 @@ struct psm2_mq {
 #define MQ_HFI_WINDOW_RNDV_XEON  131072
 
 #ifdef PSM_CUDA
-#define MQ_HFI_THRESH_RNDV_CUDA 2097152
+#define MQ_HFI_WINDOW_RNDV_CUDA 2097152
 #endif
 
 #define MQ_SHM_THRESH_RNDV 16000
@@ -268,6 +280,13 @@ struct psm2_mq_req {
 	uintptr_t rts_sbuf;
 
 #ifdef PSM_CUDA
+	uint8_t* user_gpu_buffer;
+	STAILQ_HEAD(sendreq_spec_, ips_cuda_hostbuf) sendreq_prefetch;
+	uint32_t prefetch_send_msgoff;
+	int cuda_hostbuf_used;
+	cudaIpcMemHandle_t cuda_ipc_handle;
+	cudaEvent_t cuda_ipc_event;
+	uint8_t cuda_ipc_handle_attached;
 	/* is_buf_gpu_mem - used to indicate if the send or receive is issued
 	 * on a device/host buffer.
 	 * is_sendbuf_gpu_mem - Used to always select TID path on the receiver
@@ -275,12 +294,6 @@ struct psm2_mq_req {
 	 */
 	uint8_t is_buf_gpu_mem;
 	uint8_t is_sendbuf_gpu_mem;
-	STAILQ_HEAD(sendreq_spec_, ips_cuda_hostbuf) sendreq_prefetch;
-	uint32_t prefetch_send_msgoff;
-	int cuda_hostbuf_used;
-	cudaIpcMemHandle_t cuda_ipc_handle;
-	cudaEvent_t cuda_ipc_event;
-	uint8_t cuda_ipc_handle_attached;
 #endif
 
 	uint64_t user_reserved[4];
diff --git a/psm_mq_recv.c b/psm_mq_recv.c
index 3ac481c..1948574 100644
--- a/psm_mq_recv.c
+++ b/psm_mq_recv.c
@@ -57,6 +57,10 @@
 #include "psm_mq_internal.h"
 #include "ptl_ips/ips_proto_header.h"
 
+#ifdef PSM_CUDA
+#include "psm_gdrcpy.h"
+#endif
+
 #if 0
 /* Not exposed in public psm, but may extend parts of PSM 2.1 to support
  * this feature before 2.3 */
@@ -362,8 +366,25 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 			PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len,
 						    msglen);
 			/* mq_copy_tiny() can handle zero byte */
+
+#ifdef PSM_CUDA
+			if (PSMI_USE_GDR_COPY(req, msglen)) {
+				void* mmaped_host = gdr_convert_gpu_to_host_addr(GDR_FD,
+								(unsigned long)req->buf,
+								msglen, 1, src->proto);
+				mq_copy_tiny((uint32_t *) mmaped_host,
+							 (uint32_t *) payload, msglen);
+			}
+			else {
+				mq_copy_tiny((uint32_t *) req->buf,
+							 (uint32_t *) payload, msglen);
+			}
+#else
+
 			mq_copy_tiny((uint32_t *) req->buf,
-				     (uint32_t *) payload, msglen);
+						 (uint32_t *) payload, msglen);
+#endif
+
 			req->state = MQ_STATE_COMPLETE;
 			ips_barrier();
 			mq_qq_append(&mq->completed_q, req);
@@ -372,16 +393,34 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 		case MQ_MSG_SHORT:	/* message fits in 1 payload */
 			PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len,
 						    msglen);
+			void* user_buffer = req->buf;
+#ifdef PSM_CUDA
+			psmi_mtucpy_fn_t psmi_mtucpy_fn = psmi_mq_mtucpy;
+			if (PSMI_USE_GDR_COPY(req, msglen)) {
+				user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+							(unsigned long)req->buf,
+							msglen, 1, src->proto);
+				psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+			}
+#endif
 			if (msglen <= paylen) {
-				psmi_mq_mtucpy(req->buf, payload, msglen);
+#ifdef PSM_CUDA
+				psmi_mtucpy_fn(user_buffer, payload, msglen);
+#else
+				psmi_mq_mtucpy(user_buffer, payload, msglen);
+#endif
 			} else {
 				psmi_assert((msglen & ~0x3) == paylen);
-				psmi_mq_mtucpy(req->buf, payload, paylen);
+#ifdef PSM_CUDA
+				psmi_mtucpy_fn(user_buffer, payload, paylen);
+#else
+				psmi_mq_mtucpy(user_buffer, payload, paylen);
+#endif
 				/*
 				 * there are nonDW bytes attached in header,
 				 * copy after the DW payload.
 				 */
-				mq_copy_tiny((uint32_t *)(req->buf+paylen),
+				mq_copy_tiny((uint32_t *)(user_buffer+paylen),
 					(uint32_t *)&offset, msglen & 0x3);
 			}
 			req->state = MQ_STATE_COMPLETE;
@@ -396,6 +435,13 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 			STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq);
 			_HFI_VDBG("exp MSG_EAGER of length %d bytes pay=%d\n",
 				  msglen, paylen);
+#ifdef PSM_CUDA
+			if (PSMI_USE_GDR_COPY(req, req->send_msglen)) {
+				req->buf = gdr_convert_gpu_to_host_addr(GDR_FD,
+						(unsigned long)req->user_gpu_buffer,
+						req->send_msglen, 1, src->proto);
+			}
+#endif
 			if (paylen > 0)
 				psmi_mq_handle_data(mq, req, offset, payload,
 						    paylen);
diff --git a/psm_mq_utils.c b/psm_mq_utils.c
index ff8a52a..1f3da7c 100644
--- a/psm_mq_utils.c
+++ b/psm_mq_utils.c
@@ -91,6 +91,10 @@ psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type)
 		req->peer = NULL;
 		req->ptl_req_ptr = NULL;
 		req->flags = 0;
+#ifdef PSM_CUDA
+		req->is_buf_gpu_mem = 0;
+		req->user_gpu_buffer = NULL;
+#endif
 		return req;
 	} else {	/* we're out of reqs */
 		int issend = (type == MQE_TYPE_SEND);
diff --git a/psm_user.h b/psm_user.h
index 3c46f4f..3b7689c 100644
--- a/psm_user.h
+++ b/psm_user.h
@@ -61,6 +61,8 @@
 
 #include <sched.h>
 #include <numa.h>
+#include <semaphore.h>
+#include <fcntl.h>
 
 #include "psm2.h"
 #include "psm2_mq.h"
@@ -145,9 +147,22 @@ int psmi_get_current_proc_location();
 extern int psmi_epid_ver;
 extern uint32_t non_dw_mul_sdma;
 extern psmi_lock_t psmi_creation_lock;
-
 extern psm2_ep_t psmi_opened_endpoint;
 
+extern int psmi_affinity_shared_file_opened;
+extern uint64_t *shared_affinity_ptr;
+extern char *affinity_shm_name;
+
+extern sem_t *sem_affinity_shm_rw;
+extern int psmi_affinity_semaphore_open;
+extern char *sem_affinity_shm_rw_name;
+
+#define AFFINITY_SHM_BASENAME			"/psm2_hfi_affinity_shm"
+#define AFFINITY_SHMEMSIZE			sysconf(_SC_PAGE_SIZE)
+#define AFFINITY_SHM_REF_COUNT_LOCATION		0
+#define AFFINITY_SHM_HFI_INDEX_LOCATION		1
+#define SEM_AFFINITY_SHM_RW_BASENAME		"/psm2_hfi_affinity_shm_rw_mutex"
+
 PSMI_ALWAYS_INLINE(
 int
 _psmi_get_epid_version()) {
@@ -337,6 +352,7 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak));
 #endif
 
 extern int is_cuda_enabled;
+extern int is_gdr_copy_enabled;
 extern int device_support_gpudirect;
 extern int cuda_runtime_version;
 
@@ -454,6 +470,15 @@ _psmi_is_cuda_enabled())
 
 #define PSMI_IS_CUDA_ENABLED _psmi_is_cuda_enabled()
 
+PSMI_ALWAYS_INLINE(
+int
+_psmi_is_gdr_copy_enabled())
+{
+        return is_gdr_copy_enabled;
+}
+
+#define PSMI_IS_GDR_COPY_ENABLED _psmi_is_gdr_copy_enabled()
+
 #define PSMI_IS_CUDA_MEM(p) _psmi_is_cuda_mem(p)
 /* XXX TODO: Getting the gpu page size from driver at init time */
 #define PSMI_GPU_PAGESIZE 65536
@@ -493,6 +518,29 @@ void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj);
 
 extern uint32_t gpudirect_send_threshold;
 extern uint32_t gpudirect_recv_threshold;
+extern uint32_t cuda_thresh_rndv;
+/* This threshold dictates when the sender turns off
+ * GDR Copy. The threshold needs to be less than
+ * CUDA RNDV threshold.
+ */
+extern uint32_t gdr_copy_threshold_send;
+/* This threshold dictates when the receiver turns off
+ * GDR Copy. The threshold needs to be less than
+ * CUDA RNDV threshold.
+ */
+extern uint32_t gdr_copy_threshold_recv;
+
+#define GDR_COPY_THRESH_SEND 32
+#define GDR_COPY_THRESH_RECV 64000
+/* True iff req is a GPU buffer, GDR copy is on and 1 <= len <= recv threshold. */
+#define PSMI_USE_GDR_COPY(req, len) ((req)->is_buf_gpu_mem &&     \
+				     PSMI_IS_GDR_COPY_ENABLED &&   \
+				     (len) >= 1 && (len) <= gdr_copy_threshold_recv)
+
+/* All GPU transfers beyond this threshold use
+ * RNDV protocol. It is mostly a send side knob.
+ */
+#define CUDA_THRESH_RNDV 32768
 
 enum psm2_chb_match_type {
 	/* Complete data found in a single chb */
diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c
index cf87c33..5b6c058 100644
--- a/ptl_ips/ips_proto.c
+++ b/ptl_ips/ips_proto.c
@@ -66,6 +66,10 @@
 #include "ips_proto_help.h"
 #include "psmi_wrappers.h"
 
+#ifdef PSM_CUDA
+#include "psm_gdrcpy.h"
+#endif
+
 /*
  * Control message types have their own flag to determine whether a message of
  * that type is queued or not.  These flags are kept in a state bitfield.
@@ -620,6 +624,14 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
 				PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
 				(union psmi_envvar_val)0, /* Disabled by default */
 				&env_gpudirect_rdma);
+	/* The following cases need to be handled:
+	 * 1) GPU DIRECT is turned off but GDR COPY is turned on by the user or
+	 *    by default - Turn off GDR COPY
+	 * 2) GPU DIRECT is on but GDR COPY is turned off by the user - Leave
+	 *    this config as it is.
+	 */
+	if (!env_gpudirect_rdma.e_uint)
+		is_gdr_copy_enabled = 0;
 
 	/* Default Send threshold for Gpu-direct set to 30000 */
 	union psmi_envvar_val env_gpudirect_send_thresh;
@@ -1688,10 +1700,28 @@ handle_ENOMEM_on_DMA_completion(struct ips_proto *proto)
 
 		if (lengthEvicted)
 			return PSM2_OK; /* signals a retry of the writev command. */
-		else
-			return PSM2_EP_NO_RESOURCES;  /* should signal a return of
+		else {
+#ifdef PSM_CUDA
+			if (PSMI_IS_GDR_COPY_ENABLED && gdr_cache_evict()) {
+				return PSM2_OK;
+			} else
+#endif
+				return PSM2_EP_NO_RESOURCES;  /* should signal a return of
 							no progress, and retry later */
+		}
+	}
+#ifdef PSM_CUDA
+	else if (PSMI_IS_GDR_COPY_ENABLED) {
+		uint64_t lengthEvicted = gdr_cache_evict();
+		if (!proto->writevFailTime)
+			proto->writevFailTime = now;
+
+		if (lengthEvicted)
+			return PSM2_OK;
+		else
+			return PSM2_EP_NO_RESOURCES;
 	}
+#endif
 	else if (!proto->writevFailTime)
 	{
 		proto->writevFailTime = now;
diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c
index d7a37b9..069165c 100644
--- a/ptl_ips/ips_proto_expected.c
+++ b/ptl_ips/ips_proto_expected.c
@@ -1785,6 +1785,7 @@ ips_scb_prepare_tid_sendctrl(struct ips_flow *flow,
 	if (tidsendc->mqreq->is_buf_gpu_mem &&		/* request's buffer comes from GPU realm */
 	   !tidsendc->mqreq->cuda_hostbuf_used) {	/* and it was NOT moved to HOST memory */
 		scb->mq_req = tidsendc->mqreq;		/* so let's mark it per scb, not to check its locality again */
+		ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU;
 	}
 #endif
 
diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c
index 6f13473..23802e1 100644
--- a/ptl_ips/ips_proto_mq.c
+++ b/ptl_ips/ips_proto_mq.c
@@ -59,6 +59,10 @@
 #include "ips_proto.h"
 #include "ips_proto_internal.h"
 
+#ifdef PSM_CUDA
+#include "psm_gdrcpy.h"
+#endif
+
 uint32_t non_dw_mul_sdma = 0;
 
 void
@@ -335,6 +339,17 @@ ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req,
 		     (void *)buf, pktlen, flow->frag_size, nbytes_left);
 		ips_scb_buffer(scb) = (void *)buf;
 
+#ifdef PSM_CUDA
+		/* PSM would never send packets using eager protocol
+		 * if GPU Direct RDMA is turned off, which makes setting
+		 * these flags safe.
+		 */
+		if (req->is_buf_gpu_mem) {
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU;
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+		}
+#endif
+
 		buf += pktlen;
 		offset += pktlen;
 		nbytes_left -= pktlen;
@@ -438,7 +453,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req,
 	 * receiver select TID instead of using eager buffers.
 	 */
 	if (req->is_buf_gpu_mem) {
-		ips_scb_flags(scb) |= IPS_SEND_FLAG_GPU_BUF;
+		ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
 		scb->mq_req = req;	/* request comes from GPU domain (device) ... */
 	}
 	req->cuda_hostbuf_used = 0;
@@ -456,11 +471,6 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req,
 		STAILQ_INIT(&req->sendreq_prefetch);
 		offset = 0;
 		req->cuda_hostbuf_used = 1;
-		scb->mq_req = NULL;	/*  ... but it is transferred to host memory,
-					   so setting req = NULL lets us take a faster
-					   decision on scb's locality while sending
-					   (see IS_CUDA_BUF() macro) */
-
 		/* start prefetching */
 		req->prefetch_send_msgoff = 0;
 		while ((offset < len) &&
@@ -544,7 +554,7 @@ ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 
 	req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
 	if_pf(req == NULL)
-	    return PSM2_NO_MEMORY;
+		return PSM2_NO_MEMORY;
 
 	ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
 	ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
@@ -567,7 +577,11 @@ ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 			       CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 			      (CUdeviceptr)ubuf);
 		req->is_buf_gpu_mem = 1;
-		goto do_rendezvous;
+		if (!(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) ||
+		    !PSMI_IS_GDR_COPY_ENABLED ||
+			 len < 1 || len > cuda_thresh_rndv){
+			goto do_rendezvous;
+		}
 	} else
 		req->is_buf_gpu_mem = 0;
 #endif
@@ -584,13 +598,25 @@ ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 		    ipsaddr->msgctl->mq_send_seqnum++;
 		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
 
+		const void *user_buffer = ubuf;
 #ifdef PSM_CUDA
-		mq_copy_tiny_host_mem
+		if (req->is_buf_gpu_mem) {
+			/* The following function pins the GPU pages
+			 * and mmaps the pages into the process virtual
+			 * space. This allows PSM to issue a standard
+			 * memcpy to move data between HFI resources
+			 * and the GPU
+			 */
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+						(unsigned long)ubuf, len, 0, proto);
+		}
+		mq_copy_tiny_host_mem((uint32_t *) &ips_scb_hdrdata(scb),
+							  (uint32_t *) user_buffer, len);
 #else
-		mq_copy_tiny
+		mq_copy_tiny((uint32_t *) &ips_scb_hdrdata(scb),
+					 (uint32_t *) user_buffer, len);
 #endif
-			((uint32_t *) &ips_scb_hdrdata(scb),
-			     (uint32_t *) ubuf, len);
 		err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
 		if (err != PSM2_OK)
 			return err;
@@ -614,18 +640,23 @@ ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 		scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
 		ips_scb_hdrdata(scb).u32w1 = len;
 		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+		const void * user_buffer = ubuf;
+#ifdef PSM_CUDA
+		if (req->is_buf_gpu_mem && len <= gdr_copy_threshold_send){
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+						(unsigned long)ubuf, len , 0, proto);
+		}
+#endif
+
+		ips_scb_buffer(scb) = (void *)user_buffer;
 
-		ips_scb_buffer(scb) = (void *)ubuf;
 		ips_scb_length(scb) = paylen;
 		if (len > paylen) {
 			/* there are nonDW bytes, copy to header */
-#ifdef PSM_CUDA
-			mq_copy_tiny_host_mem
-#else
 			mq_copy_tiny
-#endif
 				((uint32_t *)&ips_scb_hdrdata(scb).u32w0,
-				(uint32_t *)((uintptr_t)ubuf + paylen),
+				(uint32_t *)((uintptr_t)user_buffer + paylen),
 				len - paylen);
 
 			/* for complete callback */
@@ -639,8 +670,14 @@ ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 		 * send from user buffer.
 		 */
 		ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
-
-		flow = &ipsaddr->flows[proto->msgflowid];
+#ifdef PSM_CUDA
+		if (req->is_buf_gpu_mem && len > gdr_copy_threshold_send) {
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU;
+			flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA];
+		} else
+#endif
+			flow = &ipsaddr->flows[proto->msgflowid];
 		err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
 		if (err != PSM2_OK)
 			return err;
@@ -652,7 +689,7 @@ ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 		 * scb pool. Even if the same scb is re-used, it
 		 * is not possible to set to this 'buf' address.
 		 */
-		if (ips_scb_buffer(scb) == (void *)ubuf) {
+		if (ips_scb_buffer(scb) == (void *)user_buffer) {
 			/* continue to send from user buffer */
 			ips_scb_cb(scb) = ips_proto_mq_eager_complete;
 			ips_scb_cb_param(scb) = req;
@@ -667,7 +704,13 @@ ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 		     psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
 		     len, tag->tag[0], tag->tag[1], tag->tag[2], req);
 	} else if (len <= mq->hfi_thresh_rv) {
+
+#ifdef PSM_CUDA
+		/* GPU buffers will always use DMA flow for Eager protocol */
+		if (!req->is_buf_gpu_mem && len <= proto->iovec_thresh_eager) {
+#else
 		if (len <= proto->iovec_thresh_eager) {
+#endif
 			/* use PIO transfer */
 			psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0);
 			flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
@@ -720,7 +763,15 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 	int gpu_mem;
 	if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
 		gpu_mem = 1;
-		goto do_rendezvous;
+		int trueflag = 1;
+		PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+				       CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+					   (CUdeviceptr)ubuf);
+		if (!(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) ||
+			!PSMI_IS_GDR_COPY_ENABLED ||
+			len < 1 || len > cuda_thresh_rndv){
+			goto do_rendezvous;
+		}
 	} else
 		gpu_mem = 0;
 #endif
@@ -736,14 +787,26 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 		    ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) |
 		    ipsaddr->msgctl->mq_send_seqnum++;
 		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
-
+		const void *user_buffer = ubuf;
 #ifdef PSM_CUDA
-		mq_copy_tiny_host_mem
+		if (gpu_mem){
+			/* The following function pins the GPU pages
+			 * and mmaps the pages into the process virtual
+			 * space. This allows PSM to issue a standard
+			 * memcpy to move data between HFI resources
+			 * and the GPU
+			 */
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+						(unsigned long)ubuf, len, 0, proto);
+		}
+		mq_copy_tiny_host_mem((uint32_t *) &ips_scb_hdrdata(scb),
+							  (uint32_t *) user_buffer, len);
 #else
 		mq_copy_tiny
-#endif
 			((uint32_t *) &ips_scb_hdrdata(scb),
-			     (uint32_t *) ubuf, len);
+			 (uint32_t *) user_buffer, len);
+#endif
 		err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
 		if (err != PSM2_OK)
 			return err;
@@ -763,17 +826,23 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 		ips_scb_hdrdata(scb).u32w1 = len;
 		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
 
-		ips_scb_buffer(scb) = (void *)ubuf;
+		const void * user_buffer = ubuf;
+#ifdef PSM_CUDA
+		if (gpu_mem && len <= gdr_copy_threshold_send) {
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+						(unsigned long)ubuf, len, 0, proto);
+		}
+#endif
+
+
+		ips_scb_buffer(scb) = (void *)user_buffer;
 		ips_scb_length(scb) = paylen;
 		if (len > paylen) {
 			/* there are nonDW bytes, copy to header */
-#ifdef PSM_CUDA
-			mq_copy_tiny_host_mem
-#else
 			mq_copy_tiny
-#endif
 				((uint32_t *)&ips_scb_hdrdata(scb).u32w0,
-				(uint32_t *)((uintptr_t)ubuf + paylen),
+				(uint32_t *)((uintptr_t)user_buffer + paylen),
 				len - paylen);
 		}
 
@@ -782,8 +851,14 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 		 * send from user buffer.
 		 */
 		ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
-
-		flow = &ipsaddr->flows[proto->msgflowid];
+#ifdef PSM_CUDA
+		if (gpu_mem && len > gdr_copy_threshold_send) {
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU;
+			flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA];
+		} else
+#endif
+			flow = &ipsaddr->flows[proto->msgflowid];
 		err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
 		if (err != PSM2_OK)
 			return err;
@@ -795,7 +870,7 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 		 * scb pool. Even if the same scb is re-used, it
 		 * is not possible to set to this 'ubuf' address.
 		 */
-		if (ips_scb_buffer(scb) == (void *)ubuf) {
+		if (ips_scb_buffer(scb) == (void *)user_buffer) {
 			if (flow->transfer != PSM_TRANSFER_PIO ||
 			    paylen > proto->scb_bufsize ||
 			    !ips_scbctrl_bufalloc(scb)) {
@@ -804,7 +879,7 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 				 * or, can't allocate bounce buffer,
 				 * send from user buffer till complete */
 				PSMI_BLOCKUNTIL(mq->ep, err,
-					ips_scb_buffer(scb) != (void*)ubuf);
+					ips_scb_buffer(scb) != (void*)user_buffer);
 				if (err > PSM2_OK_NO_PROGRESS)
 					return err;
 				err = PSM2_OK;
@@ -816,7 +891,7 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 				ips_shortcpy
 #endif
 					(ips_scb_buffer(scb),
-					(void*)ubuf, paylen);
+					 (void*)user_buffer, paylen);
 			}
 		}
 		_HFI_VDBG("[shrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
@@ -826,7 +901,12 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 	} else if (len <= mq->hfi_thresh_rv) {
 		psm2_mq_req_t req;
 
+#ifdef PSM_CUDA
+		/* GPU buffers will always use DMA flow for Eager protocol */
+		if (!gpu_mem && len <= proto->iovec_thresh_eager_blocking) {
+#else
 		if (len <= proto->iovec_thresh_eager_blocking) {
+#endif
 			/* use PIO transfer */
 			psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0);
 			flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
@@ -835,7 +915,6 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 			psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0);
 			flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA];
 		}
-
 		/* Block until we can get a req */
 		PSMI_BLOCKUNTIL(mq->ep, err,
 				(req =
@@ -843,6 +922,13 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 		if (err > PSM2_OK_NO_PROGRESS)
 			return err;
 
+#ifdef PSM_CUDA
+		if (gpu_mem) {
+			req->is_buf_gpu_mem = 1;
+		} else
+			req->is_buf_gpu_mem = 0;
+#endif
+
 		req->type |= MQE_TYPE_WAITING;
 		req->send_msglen = len;
 		req->tag = *tag;
@@ -880,10 +966,6 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 		 * always synchronize
 		 */
 		if (gpu_mem) {
-			int trueflag = 1;
-			PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
-				       CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
-				      (CUdeviceptr)ubuf);
 			req->is_buf_gpu_mem = 1;
 		} else
 			req->is_buf_gpu_mem = 0;
@@ -1317,7 +1399,7 @@ ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev)
 		req->type |= MQE_TYPE_WAITING_PEER;
 
 #ifdef PSM_CUDA
-	if (p_hdr->flags & IPS_SEND_FLAG_GPU_BUF)
+	if (p_hdr->flags & IPS_SEND_FLAG_USER_BUF_GPU)
 		req->is_sendbuf_gpu_mem = 1;
 	else
 		req->is_sendbuf_gpu_mem = 0;
@@ -1574,6 +1656,13 @@ ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
 		 * error is caught below.
 		 */
 		if (req) {
+#ifdef PSM_CUDA
+			if (PSMI_USE_GDR_COPY(req, req->send_msglen)) {
+				req->buf = gdr_convert_gpu_to_host_addr(GDR_FD,
+							(unsigned long)req->user_gpu_buffer,
+							req->send_msglen, 1, rcv_ev->proto);
+			}
+#endif
 			psmi_mq_handle_data(mq, req,
 				p_hdr->data[1].u32w0, payload, paylen);
 
diff --git a/ptl_ips/ips_proto_params.h b/ptl_ips/ips_proto_params.h
index 6e5e49a..6a77967 100644
--- a/ptl_ips/ips_proto_params.h
+++ b/ptl_ips/ips_proto_params.h
@@ -159,7 +159,7 @@
  * in a special case where the send is on a device
  * buffer and the receive is on a host buffer.
  */
-#define IPS_SEND_FLAG_GPU_BUF           0x08
+#define IPS_SEND_FLAG_USER_BUF_GPU      0x08
 #endif
 
 #define IPS_SEND_FLAG_PROTO_OPTS        0x3f	/* only 6bits wire flags */
@@ -168,6 +168,15 @@
 #define IPS_SEND_FLAG_PENDING		0x0100
 #define IPS_SEND_FLAG_PERSISTENT	0x0200
 
+/* This flag is used to indicate if the send is on
+ * a GPU buffer. This helps PIO/SDMA paths to detect
+ * if payload is GPU buffer without having to call
+ * cuPointerGetAttribute.
+ */
+#ifdef PSM_CUDA
+#define IPS_SEND_FLAG_PAYLOAD_BUF_GPU   0x0400
+#endif
+
 /* 0x10000000, interrupt when done */
 #define IPS_SEND_FLAG_INTR		(1<<HFI_KHDR_INTR_SHIFT)
 /* 0x20000000, header suppression */
diff --git a/ptl_ips/ips_scb.h b/ptl_ips/ips_scb.h
index 62a509b..2d97c8e 100644
--- a/ptl_ips/ips_scb.h
+++ b/ptl_ips/ips_scb.h
@@ -196,13 +196,7 @@ struct ips_scb {
 };
 
 #ifdef PSM_CUDA
-#define IS_TRANSFER_BUF_GPU_MEM(scb) (scb->mq_req != NULL)
-/* In case we need to be more precise about scb's locality
- * we can expand the macro in place, e.g.
- * #define IS_TRANSFER_BUF_GPU_MEM(scb) (scb->mq_req != NULL && \
- * 					 scb->mq_req->is_buf_gpu_mem && \
- * 					!scb->mq_req->cuda_hostbuf_used)
- */
+#define IS_TRANSFER_BUF_GPU_MEM(scb) (ips_scb_flags(scb) & IPS_SEND_FLAG_PAYLOAD_BUF_GPU)
 #endif
 
 void ips_scbctrl_free(ips_scb_t *scb);
diff --git a/rpm_release_extension b/rpm_release_extension
index 9e5feb5..8c61d23 100644
--- a/rpm_release_extension
+++ b/rpm_release_extension
@@ -1 +1 @@
-46
+58