From 5c588c791b0e6600458824a4c8162d6864eb7e69 Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Mon, 24 Dec 2018 10:37:55 +0100 Subject: [PATCH 1/7] shm: refactor shared memory segment allocation functions This is a refactoring patch for shared memory segment allocation functions. 'MPIDU_shm_seg_alloc' now also takes a memory type parameter defining in which target memory the requested allocation should be placed. 'MPIDU_shm_seg_commit' now also takes an hwloc numa node logical identifier and a shared memory object identifier. The numa node id allows binding memory to a specific memory domain while the object identifier allows user defined binding information for that specific object. The patch also introduces shared memory object definitions in 'src/mpid/common/shm/mpidu_shm_obj.h', and memory type definitions to which objects can be bound to in 'src/include/mpir_memtype.h'. --- src/include/Makefile.mk | 3 +- src/include/mpir_memtype.h | 18 ++++ .../ch3/channels/nemesis/src/mpid_nem_init.c | 16 ++-- src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h | 7 +- src/mpid/common/bc/mpidu_bc.c | 18 ++-- src/mpid/common/shm/Makefile.mk | 3 +- src/mpid/common/shm/mpidu_shm.h | 7 +- src/mpid/common/shm/mpidu_shm_alloc.c | 92 ++++++++++++------- src/mpid/common/shm/mpidu_shm_obj.h | 19 ++++ 9 files changed, 133 insertions(+), 50 deletions(-) create mode 100644 src/include/mpir_memtype.h create mode 100644 src/mpid/common/shm/mpidu_shm_obj.h diff --git a/src/include/Makefile.mk b/src/include/Makefile.mk index 8c71f811d35..3d82d23a9a8 100644 --- a/src/include/Makefile.mk +++ b/src/include/Makefile.mk @@ -70,7 +70,8 @@ noinst_HEADERS += \ src/include/nopackage.h \ src/include/rlog.h \ src/include/rlog_macros.h \ - src/include/mpir_op_util.h + src/include/mpir_op_util.h \ + src/include/mpir_memtype.h src/include/mpir_cvars.h: $(top_srcdir)/maint/extractcvars --dirs="`cat $(top_srcdir)/maint/cvardirs`" diff --git a/src/include/mpir_memtype.h b/src/include/mpir_memtype.h new file mode 100644 index 00000000000..416fb279270 --- /dev/null +++ b/src/include/mpir_memtype.h @@ -0,0 +1,18 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ +/* + * (C) 2019 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef MPIR_MEMTYPE_H_INCLUDED +#define MPIR_MEMTYPE_H_INCLUDED + +typedef enum MPIR_Memtype { + MPIR_MEMTYPE__NONE = -1, + MPIR_MEMTYPE__DDR = 0, + MPIR_MEMTYPE__MCDRAM, + MPIR_MEMTYPE__NUM, + MPIR_MEMTYPE__DEFAULT = MPIR_MEMTYPE__DDR +} MPIR_Memtype; + +#endif /* MPIR_MEMTYPE_H_INCLUDED */ diff --git a/src/mpid/ch3/channels/nemesis/src/mpid_nem_init.c b/src/mpid/ch3/channels/nemesis/src/mpid_nem_init.c index de0c7e64c70..ab6d3291b06 100644 --- a/src/mpid/ch3/channels/nemesis/src/mpid_nem_init.c +++ b/src/mpid/ch3/channels/nemesis/src/mpid_nem_init.c @@ -239,30 +239,34 @@ MPID_nem_init(int pg_rank, MPIDI_PG_t *pg_p, int has_parent ATTRIBUTE((unused))) /* Request fastboxes region */ mpi_errno = MPIDU_shm_seg_alloc(MPL_MAX((num_local*((num_local-1)*sizeof(MPID_nem_fastbox_t))), MPID_NEM_ASYMM_NULL_VAL), - (void **)&fastboxes_p, MPL_MEM_SHM); + (void **)&fastboxes_p, MPIR_MEMTYPE__DEFAULT, MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Request data cells region */ - mpi_errno = MPIDU_shm_seg_alloc(num_local * MPID_NEM_NUM_CELLS * sizeof(MPID_nem_cell_t), (void **)&cells_p, MPL_MEM_SHM); + mpi_errno = MPIDU_shm_seg_alloc(num_local * MPID_NEM_NUM_CELLS * sizeof(MPID_nem_cell_t), (void **)&cells_p, + MPIR_MEMTYPE__DEFAULT, MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Request free q region */ - mpi_errno = MPIDU_shm_seg_alloc(num_local * sizeof(MPID_nem_queue_t), (void **)&free_queues_p, MPL_MEM_SHM); + mpi_errno = MPIDU_shm_seg_alloc(num_local * sizeof(MPID_nem_queue_t), (void **)&free_queues_p, + MPIR_MEMTYPE__DEFAULT, MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Request recv q region */ - mpi_errno = MPIDU_shm_seg_alloc(num_local * sizeof(MPID_nem_queue_t), (void **)&recv_queues_p, MPL_MEM_SHM); + mpi_errno = MPIDU_shm_seg_alloc(num_local * sizeof(MPID_nem_queue_t), (void **)&recv_queues_p, + MPIR_MEMTYPE__DEFAULT, MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Request shared collectives barrier vars region */ mpi_errno = MPIDU_shm_seg_alloc(MPID_NEM_NUM_BARRIER_VARS * sizeof(MPID_nem_barrier_vars_t), - (void **)&MPID_nem_mem_region.barrier_vars, MPL_MEM_SHM); + (void **)&MPID_nem_mem_region.barrier_vars, MPIR_MEMTYPE__DEFAULT, + MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Actually allocate the segment and assign regions to the pointers */ mpi_errno = MPIDU_shm_seg_commit(&MPID_nem_mem_region.memory, &MPID_nem_mem_region.barrier, num_local, local_rank, MPID_nem_mem_region.local_procs[0], - MPID_nem_mem_region.rank, MPL_MEM_SHM); + MPID_nem_mem_region.rank, 0, MPIDU_SHM_OBJ__NONE, MPL_MEM_SHM); /* check_alloc steps */ if (MPID_nem_mem_region.memory.symmetrical == 1) { MPID_nem_asymm_base_addr = NULL; diff --git a/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h b/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h index 6d00b85f555..c83f5b186b6 100644 --- a/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h +++ b/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h @@ -79,21 +79,22 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) /* Create region with one fastbox for every pair of local processes. 
*/ mpi_errno = MPIDU_shm_seg_alloc(num_local * num_local * sizeof(MPIDI_POSIX_fastbox_t), - (void **) &fastboxes_p, MPL_MEM_SHM); + (void **) &fastboxes_p, MPIR_MEMTYPE__DEFAULT, MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Request shared collective barrier vars region */ mpi_errno = MPIDU_shm_seg_alloc(MAX(sizeof(MPIDU_shm_barrier_t), MPIDU_SHM_CACHE_LINE_LEN), (void **) &MPIDI_POSIX_eager_fbox_control_global.barrier_region, - MPL_MEM_SHM); + MPIR_MEMTYPE__DEFAULT, MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Actually allocate the segment and assign regions to the pointers */ mpi_errno = MPIDU_shm_seg_commit(&MPIDI_POSIX_eager_fbox_control_global.memory, &MPIDI_POSIX_eager_fbox_control_global.barrier, - num_local, my_local_rank, local_rank_0, rank, MPL_MEM_SHM); + num_local, my_local_rank, local_rank_0, rank, 0, + MPIDU_SHM_OBJ__NONE, MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); diff --git a/src/mpid/common/bc/mpidu_bc.c b/src/mpid/common/bc/mpidu_bc.c index bfab89278e6..0a6049706ec 100644 --- a/src/mpid/common/bc/mpidu_bc.c +++ b/src/mpid/common/bc/mpidu_bc.c @@ -148,12 +148,14 @@ int MPIDU_bc_table_create(int rank, int size, int *nodemap, void *bc, int bc_len /* if business cards can be different length, allocate 2x the space */ if (!same_len) bc_len = VALLEN; - mpi_errno = MPIDU_shm_seg_alloc(bc_len * size, (void **) &segment, MPL_MEM_ADDRESS); + mpi_errno = + MPIDU_shm_seg_alloc(bc_len * size, (void **) &segment, MPIR_MEMTYPE__DEFAULT, + MPL_MEM_ADDRESS); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIDU_shm_seg_commit(&memory, &barrier, local_size, local_rank, local_leader, rank, - MPL_MEM_ADDRESS); + 0, MPIDU_SHM_OBJ__NONE, MPL_MEM_ADDRESS); if (mpi_errno) MPIR_ERR_POP(mpi_errno); @@ -254,12 +256,14 @@ int MPIDU_bc_table_create(int rank, int size, int *nodemap, void *bc, int bc_len /* if business cards can be different length, allocate 2x the space */ if (!same_len) bc_len = PMI2_MAX_VALLEN; - mpi_errno = MPIDU_shm_seg_alloc(bc_len * size, (void **) &segment, MPL_MEM_ADDRESS); + mpi_errno = + MPIDU_shm_seg_alloc(bc_len * size, (void **) &segment, MPIR_MEMTYPE__DEFAULT, + MPL_MEM_ADDRESS); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIDU_shm_seg_commit(&memory, &barrier, local_size, local_rank, local_leader, rank, - MPL_MEM_ADDRESS); + 0, MPIDU_SHM_OBJ__NONE, MPL_MEM_ADDRESS); if (mpi_errno) MPIR_ERR_POP(mpi_errno); @@ -365,12 +369,14 @@ int MPIDU_bc_table_create(int rank, int size, int *nodemap, void *bc, int bc_len /* if business cards can be different length, allocate 2x the space */ if (!same_len) bc_len = val_max; - mpi_errno = MPIDU_shm_seg_alloc(bc_len * size, (void **) &segment, MPL_MEM_ADDRESS); + mpi_errno = + MPIDU_shm_seg_alloc(bc_len * size, (void **) &segment, MPIR_MEMTYPE__DEFAULT, + MPL_MEM_ADDRESS); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIDU_shm_seg_commit(&memory, &barrier, local_size, local_rank, local_leader, rank, - MPL_MEM_ADDRESS); + 0, MPIDU_SHM_OBJ__NONE, MPL_MEM_ADDRESS); if (mpi_errno) MPIR_ERR_POP(mpi_errno); diff --git a/src/mpid/common/shm/Makefile.mk b/src/mpid/common/shm/Makefile.mk index 3a75980d4b3..5e820a94b58 100644 --- a/src/mpid/common/shm/Makefile.mk +++ b/src/mpid/common/shm/Makefile.mk @@ -17,6 +17,7 @@ AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/common/shm noinst_HEADERS += \ src/mpid/common/shm/mpidu_generic_queue.h \ src/mpid/common/shm/mpidu_shm_impl.h \ - src/mpid/common/shm/mpidu_shm.h + src/mpid/common/shm/mpidu_shm.h \ + src/mpid/common/shm/mpidu_shm_obj.h endif 
BUILD_MPID_COMMON_SHM diff --git a/src/mpid/common/shm/mpidu_shm.h b/src/mpid/common/shm/mpidu_shm.h index 84820068b23..97ad0687329 100644 --- a/src/mpid/common/shm/mpidu_shm.h +++ b/src/mpid/common/shm/mpidu_shm.h @@ -8,6 +8,9 @@ #ifndef MPIDU_SHM_H_INCLUDED #define MPIDU_SHM_H_INCLUDED +#include "mpidu_shm_obj.h" +#include "mpir_memtype.h" + #define MPIDU_SHM_MAX_FNAME_LEN 256 #define MPIDU_SHM_CACHE_LINE_LEN 64 @@ -33,10 +36,10 @@ typedef struct MPIDU_shm_seg_info { char *addr; } MPIDU_shm_seg_info_t; -int MPIDU_shm_seg_alloc(size_t len, void **ptr_p, MPL_memory_class class); +int MPIDU_shm_seg_alloc(size_t len, void **ptr_p, MPIR_Memtype type, MPL_memory_class class); int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrier, int num_local, int local_rank, int local_procs_0, int rank, - MPL_memory_class class); + int node_id, MPIDU_shm_obj_t object, MPL_memory_class class); int MPIDU_shm_seg_destroy(MPIDU_shm_seg_t * memory, int num_local); int MPIDU_shm_barrier_init(MPIDU_shm_barrier_t * barrier_region, diff --git a/src/mpid/common/shm/mpidu_shm_alloc.c b/src/mpid/common/shm/mpidu_shm_alloc.c index 819317fbe40..d3bc9f934b5 100644 --- a/src/mpid/common/shm/mpidu_shm_alloc.c +++ b/src/mpid/common/shm/mpidu_shm_alloc.c @@ -40,20 +40,18 @@ typedef struct alloc_elem { static struct { alloc_elem_t *head, *tail; -} allocq = { +} allocq_default = { +0}, allocq_sibling = { + 0}; static int check_alloc(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t * barrier, int num_local, int local_rank); -#define ALLOCQ_HEAD() GENERIC_Q_HEAD(allocq) -#define ALLOCQ_EMPTY() GENERIC_Q_EMPTY(allocq) -#define ALLOCQ_ENQUEUE(ep) GENERIC_Q_ENQUEUE(&allocq, ep, next) -#define ALLOCQ_DEQUEUE(epp) GENERIC_Q_DEQUEUE(&allocq, epp, next) - #define ROUND_UP_8(x) (((x) + (size_t)7) & ~(size_t)7) /* rounds up to multiple of 8 */ -static size_t segment_len = 0; +static size_t default_segment_len = 0; +static size_t sibling_segment_len = 0; static int num_segments = 0; @@ -80,7 +78,7 @@ static asym_check_region *asym_check_region_p = NULL; #define FUNCNAME MPIDU_shm_seg_alloc #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) -int MPIDU_shm_seg_alloc(size_t len, void **ptr_p, MPL_memory_class class) +int MPIDU_shm_seg_alloc(size_t len, void **ptr_p, MPIR_Memtype type, MPL_memory_class class) { int mpi_errno = MPI_SUCCESS; alloc_elem_t *ep; @@ -101,9 +99,13 @@ int MPIDU_shm_seg_alloc(size_t len, void **ptr_p, MPL_memory_class class) ep->ptr_p = ptr_p; ep->len = len; - ALLOCQ_ENQUEUE(ep); - - segment_len += len; + if (type == MPIR_MEMTYPE__DEFAULT || type == MPIR_MEMTYPE__NONE) { + GENERIC_Q_ENQUEUE(&allocq_default, ep, next); + default_segment_len += len; + } else { + GENERIC_Q_ENQUEUE(&allocq_sibling, ep, next); + sibling_segment_len += len; + } fn_exit: MPIR_CHKPMEM_COMMIT(); @@ -135,7 +137,7 @@ int MPIDU_shm_seg_alloc(size_t len, void **ptr_p, MPL_memory_class class) #define FCNAME MPL_QUOTE(FUNCNAME) int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrier, int num_local, int local_rank, int local_procs_0, int rank, - MPL_memory_class class) + int node_id, MPIDU_shm_obj_t object, MPL_memory_class class) { int mpi_errno = MPI_SUCCESS, mpl_err = 0; int pmi_errno; @@ -153,6 +155,7 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie void *current_addr; void *start_addr ATTRIBUTE((unused)); size_t size_left; + size_t padding = 0; MPIR_CHKPMEM_DECL(1); MPIR_CHKLMEM_DECL(2); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDU_SHM_SEG_COMMIT); @@ -160,11 
+163,13 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDU_SHM_SEG_COMMIT); /* MPIDU_shm_seg_alloc() needs to have been called before this function */ - MPIR_Assert(!ALLOCQ_EMPTY()); - MPIR_Assert(segment_len > 0); + MPIR_Assert(!GENERIC_Q_EMPTY(allocq_default) || !GENERIC_Q_EMPTY(allocq_sibling)); + MPIR_Assert(default_segment_len > 0 || sibling_segment_len > 0); /* allocate an area to check if the segment was allocated symmetrically */ - mpl_err = MPIDU_shm_seg_alloc(sizeof(asym_check_region), (void **) &asym_check_region_p, class); + mpl_err = + MPIDU_shm_seg_alloc(sizeof(asym_check_region), (void **) &asym_check_region_p, + MPIR_MEMTYPE__DEFAULT, class); MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**alloc_shar_mem"); mpl_err = MPL_shm_hnd_init(&(memory->hnd)); @@ -182,7 +187,7 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie /* add space for local barrier region. Use a whole cacheline. */ MPIR_Assert(MPIDU_SHM_CACHE_LINE_LEN >= sizeof(MPIDU_shm_barrier_t)); - segment_len += MPIDU_SHM_CACHE_LINE_LEN; + default_segment_len += MPIDU_SHM_CACHE_LINE_LEN; #ifdef OPA_USE_LOCK_BASED_PRIMITIVES /* We have a similar bootstrapping problem when using OpenPA in @@ -195,18 +200,28 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie ipc_lock_offset = MPIDU_SHM_CACHE_LINE_LEN; MPIR_Assert(ipc_lock_offset >= sizeof(OPA_emulation_ipl_t)); - segment_len += MPIDU_SHM_CACHE_LINE_LEN; + default_segment_len += MPIDU_SHM_CACHE_LINE_LEN; #endif - memory->segment_len = segment_len; +#ifdef HAVE_HWLOC + /* add padding to align sibling region to page boundaries */ + if (num_local > 1 && default_segment_len > 0 && sibling_segment_len > 0) { + long page_sz = sysconf(_SC_PAGESIZE); + padding = + ((default_segment_len + (size_t) (page_sz - 1)) & ~((size_t) (page_sz - 1))) - + default_segment_len; + } +#endif + /* segment has two contributions: default (e.g., DRAM) and sibling (e.g., MCDRAM) */ + memory->segment_len = (default_segment_len + padding) + sibling_segment_len; #ifdef USE_PMI2_API /* if there is only one process on this processor, don't use shared memory */ if (num_local == 1) { char *addr; - MPIR_CHKPMEM_MALLOC(addr, char *, segment_len + MPIDU_SHM_CACHE_LINE_LEN, mpi_errno, - "segment", class); + MPIR_CHKPMEM_MALLOC(addr, char *, memory->segment_len + + MPIDU_SHM_CACHE_LINE_LEN, mpi_errno, "segment", class); memory->base_addr = addr; current_addr = @@ -316,8 +331,8 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie if (num_local == 1) { char *addr; - MPIR_CHKPMEM_MALLOC(addr, char *, segment_len + MPIDU_SHM_CACHE_LINE_LEN, mpi_errno, - "segment", class); + MPIR_CHKPMEM_MALLOC(addr, char *, memory->segment_len + + MPIDU_SHM_CACHE_LINE_LEN, mpi_errno, "segment", class); memory->base_addr = addr; current_addr = @@ -461,8 +476,8 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie if (num_local == 1) { char *addr; - MPIR_CHKPMEM_MALLOC(addr, char *, segment_len + MPIDU_SHM_CACHE_LINE_LEN, mpi_errno, - "segment", class); + MPIR_CHKPMEM_MALLOC(addr, char *, memory->segment_len + + MPIDU_SHM_CACHE_LINE_LEN, mpi_errno, "segment", class); memory->base_addr = addr; current_addr = @@ -583,7 +598,7 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie /* assign sections of the shared memory segment to their pointers */ start_addr = current_addr; - size_left = segment_len; + 
size_left = default_segment_len + padding + sibling_segment_len; /* reserve room for shared mem barrier (We used a whole cacheline) */ current_addr = (char *) current_addr + MPIDU_SHM_CACHE_LINE_LEN; @@ -597,11 +612,26 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie size_left -= MPIDU_SHM_CACHE_LINE_LEN; #endif - do { + while (!GENERIC_Q_EMPTY(allocq_default)) { alloc_elem_t *ep; + GENERIC_Q_DEQUEUE(&allocq_default, &ep, next); + *(ep->ptr_p) = current_addr; + MPIR_Assert(size_left >= ep->len); + size_left -= ep->len; + current_addr = (char *) current_addr + ep->len; + + MPL_free(ep); - ALLOCQ_DEQUEUE(&ep); + MPIR_Assert((char *) current_addr <= (char *) start_addr + default_segment_len); + } + + /* discard padding region */ + current_addr = (char *) current_addr + padding; + start_addr = current_addr; + while (!GENERIC_Q_EMPTY(allocq_sibling)) { + alloc_elem_t *ep; + GENERIC_Q_DEQUEUE(&allocq_sibling, &ep, next); *(ep->ptr_p) = current_addr; MPIR_Assert(size_left >= ep->len); size_left -= ep->len; @@ -609,9 +639,8 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie MPL_free(ep); - MPIR_Assert((char *) current_addr <= (char *) start_addr + segment_len); + MPIR_Assert((char *) current_addr <= (char *) start_addr + sibling_segment_len); } - while (!ALLOCQ_EMPTY()); mpi_errno = check_alloc(memory, *barrier, num_local, local_rank); if (mpi_errno) @@ -619,8 +648,9 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie MPIR_CHKPMEM_COMMIT(); fn_exit: - /* reset segment_len to zero */ - segment_len = 0; + /* reset default_segment_len and sibling_segment_len to zero */ + default_segment_len = 0; + sibling_segment_len = 0; MPIR_CHKLMEM_FREEALL(); MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDU_SHM_SEG_COMMIT); diff --git a/src/mpid/common/shm/mpidu_shm_obj.h b/src/mpid/common/shm/mpidu_shm_obj.h new file mode 100644 index 00000000000..7e74d82d1fb --- /dev/null +++ b/src/mpid/common/shm/mpidu_shm_obj.h @@ -0,0 +1,19 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ +/* + * (C) 2019 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef MPIDU_SHM_OBJ_H_INCLUDED +#define MPIDU_SHM_OBJ_H_INCLUDED + +typedef enum MPIDU_shm_obj { + MPIDU_SHM_OBJ__NONE = -1, + MPIDU_SHM_OBJ__FASTBOXES = 0, + MPIDU_SHM_OBJ__CELLS, + MPIDU_SHM_OBJ__COPYBUFS, + MPIDU_SHM_OBJ__WIN, + MPIDU_SHM_OBJ__NUM +} MPIDU_shm_obj_t; + +#endif /* MPIDU_SHM_OBJ_H_INCLUDED */ From 1273c0f983e88865e0cefe78470ef3c5ff7ff699 Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Tue, 26 Jun 2018 14:27:45 -0500 Subject: [PATCH 2/7] shm: add support for numa architectures This patch introduces support for numa architectures, including detection and usage of heterogeneous memory, e.g., KNL MCDRAM. The patch adds functionalities to detect numa nodes of different type and set up information useful for binding allocated objects to different types of memory. --- src/mpi/errhan/errnames.txt | 7 + src/mpid/common/shm/Makefile.mk | 3 +- src/mpid/common/shm/mpidu_shm.h | 37 +++ src/mpid/common/shm/mpidu_shm_numa.c | 432 +++++++++++++++++++++++++++ 4 files changed, 478 insertions(+), 1 deletion(-) create mode 100644 src/mpid/common/shm/mpidu_shm_numa.c diff --git a/src/mpi/errhan/errnames.txt b/src/mpi/errhan/errnames.txt index 809bce373d9..c3d94154e4b 100644 --- a/src/mpi/errhan/errnames.txt +++ b/src/mpi/errhan/errnames.txt @@ -479,6 +479,13 @@ unexpected messages queued. 
**init:Initialization failed **progresshookstoomany: too many progress hooks are registered +# +# Errors for NUMA detection +# +**nobind: not initialized (probably due to no process binding) +**memtype: no memory type found +**hwloc: not enabled + # # To be removed # diff --git a/src/mpid/common/shm/Makefile.mk b/src/mpid/common/shm/Makefile.mk index 5e820a94b58..d95f5f54f5a 100644 --- a/src/mpid/common/shm/Makefile.mk +++ b/src/mpid/common/shm/Makefile.mk @@ -9,7 +9,8 @@ if BUILD_MPID_COMMON_SHM mpi_core_sources += \ src/mpid/common/shm/mpidu_shm_alloc.c \ - src/mpid/common/shm/mpidu_shm_barrier.c + src/mpid/common/shm/mpidu_shm_barrier.c \ + src/mpid/common/shm/mpidu_shm_numa.c # there are no AC_OUTPUT_FILES headers, so builddir is unnecessary AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/common/shm diff --git a/src/mpid/common/shm/mpidu_shm.h b/src/mpid/common/shm/mpidu_shm.h index 97ad0687329..dfaae7b44d2 100644 --- a/src/mpid/common/shm/mpidu_shm.h +++ b/src/mpid/common/shm/mpidu_shm.h @@ -11,8 +11,17 @@ #include "mpidu_shm_obj.h" #include "mpir_memtype.h" +#ifdef HAVE_HWLOC +#include "hwloc.h" +#define MPIDU_Mempolicy hwloc_membind_policy_t +#else +#define MPIDU_Mempolicy enum +#endif + +#define MPIDU_shm_numa_bitmap int #define MPIDU_SHM_MAX_FNAME_LEN 256 #define MPIDU_SHM_CACHE_LINE_LEN 64 +#define MPIDU_SHM_MAX_NUMA_NUM (sizeof(MPIDU_shm_numa_bitmap) * 8) typedef struct MPIDU_shm_barrier { OPA_int_t val; @@ -36,6 +45,28 @@ typedef struct MPIDU_shm_seg_info { char *addr; } MPIDU_shm_seg_info_t; +/* Per object memory binding information */ +typedef struct MPIDU_shm_obj_info_t { + MPIR_Memtype object_type[MPIDU_SHM_OBJ__NUM]; + MPIDU_Mempolicy object_policy[MPIDU_SHM_OBJ__NUM]; + int object_flags[MPIDU_SHM_OBJ__NUM]; +} MPIDU_shm_obj_info_t; + +extern MPIDU_shm_obj_info_t MPIDU_shm_obj_info; + +/* Numa node specific information for each process */ +typedef struct MPIDU_shm_numa_info_t { + int nodeset_is_valid; /* set to 1 if valid node binding exists */ + int nnodes; /* number of nodes */ + int *nodeid; /* hwloc node id for each local rank */ + int *siblid; /* sibling id for each node id. Sibling contains, + * if available, the closest HBM node */ + MPIR_Memtype *type; /* memory type for each node */ + MPIDU_shm_numa_bitmap bitmap; /* nodes that have procs bound to them */ +} MPIDU_shm_numa_info_t; + +extern MPIDU_shm_numa_info_t MPIDU_shm_numa_info; + int MPIDU_shm_seg_alloc(size_t len, void **ptr_p, MPIR_Memtype type, MPL_memory_class class); int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrier, int num_local, int local_rank, int local_procs_0, int rank, @@ -46,4 +77,10 @@ int MPIDU_shm_barrier_init(MPIDU_shm_barrier_t * barrier_region, MPIDU_shm_barrier_t ** barrier, int init_values); int MPIDU_shm_barrier(MPIDU_shm_barrier_t * barrier, int num_local); +/* NUMA utility functions */ +int MPIDU_shm_numa_info_init(int rank, int size, int *nodemap); +int MPIDU_shm_numa_info_finalize(void); +int MPIDU_shm_numa_bind_set(void *addr, size_t len, int numa_id, MPIR_Info * info, + MPIDU_shm_obj_t object); + #endif /* MPIDU_SHM_H_INCLUDED */ diff --git a/src/mpid/common/shm/mpidu_shm_numa.c b/src/mpid/common/shm/mpidu_shm_numa.c new file mode 100644 index 00000000000..4669a72e350 --- /dev/null +++ b/src/mpid/common/shm/mpidu_shm_numa.c @@ -0,0 +1,432 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ +/* + * (C) 2018 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#include "mpidu_shm_impl.h" +#include "build_nodemap.h" +#include "mpir_mem.h" + +/* +=== BEGIN_MPI_T_CVAR_INFO_BLOCK === + +categories: + - name : MEMBIND + description : cvars controlling shared memory objects binding + +cvars: + - name : MPIR_CVAR_MEMBIND_NUMA_ENABLE + category : MEMBIND + type : string + default : "NO" + class : none + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Enable NUMA architecture support. NUMA support is needed to + use heterogeneous memory. + + - name : MPIR_CVAR_MEMBIND_TYPE_LIST + category : MEMBIND + type : string + default : "" + class : none + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + The memory types used for allocating MPICH objects in a + heterogeneous memory system. + + - name : MPIR_CVAR_MEMBIND_POLICY_LIST + category : MEMBIND + type : string + default : "" + class : none + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL + description : >- + The memory policy used for allocating MPICH objects in a + heterogeneous memory system (e.g., BIND, INTERLEAVE). + + - name : MPIR_CVAR_MEMBIND_FLAGS_LIST + category : MEMBIND + type : string + default : "" + class : none + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL + description : >- + The memory flags used for allocating MPICH objects in a + heterogeneous memory system (e.g., STRICT). + +=== END_MPI_T_CVAR_INFO_BLOCK === +*/ + +#define MEMBIND_INFO_SET(cvar, key, val) \ + do { \ + char *token, *tok, *brk_o, *brk_p, *info_str, *val_str; \ + for (token = strtok_r(cvar, ",", &brk_o); token; \ + token = strtok_r(NULL, ",", &brk_o)) { \ + tok = MPL_strdup(token); \ + MPIR_Memtype mbind_type; \ + hwloc_membind_policy_t mbind_policy; \ + int mbind_flags; \ + MPIDU_shm_obj_t mbind_object; \ + \ + info_str = strtok_r(tok, ":", &brk_p); \ + val_str = strtok_r(NULL, ":", &brk_p); \ + \ + if (!strcmp(info_str, "FASTBOXES")) { \ + mbind_object = MPIDU_SHM_OBJ__FASTBOXES; \ + } else if (!strcmp(info_str, "CELLS")) { \ + mbind_object = MPIDU_SHM_OBJ__CELLS; \ + } else if (!strcmp(info_str, "COPYBUFS")) { \ + mbind_object = MPIDU_SHM_OBJ__COPYBUFS; \ + } else if (!strcmp(info_str, "WIN")) { \ + mbind_object = MPIDU_SHM_OBJ__WIN; \ + } else { \ + continue; \ + } \ + \ + if (!strcmp(key, "type")) { \ + if (!strcmp(val_str, "AUTO")) { \ + mbind_type = MPIR_MEMTYPE__DEFAULT; \ + } else if (!strcmp(val_str, "DDR")) { \ + mbind_type = MPIR_MEMTYPE__DDR; \ + } else if (!strcmp(val_str, "MCDRAM")) { \ + mbind_type = MPIR_MEMTYPE__MCDRAM; \ + } else { \ + continue; \ + } \ + } else if (!strcmp(key, "policy")) { \ + if (!strcmp(val_str, "BIND")) { \ + mbind_policy = HWLOC_MEMBIND_BIND; \ + } else if (!strcmp(val_str, "INTERLEAVE")) { \ + mbind_policy = HWLOC_MEMBIND_INTERLEAVE; \ + } else { \ + continue; \ + } \ + } else { \ + if (!strcmp(val_str, "STRICT")) { \ + mbind_flags = HWLOC_MEMBIND_STRICT; \ + } \ + } \ + \ + MPIDU_shm_obj_info.object_##val[mbind_object] = mbind_##val; \ + \ + MPL_free(tok); \ + } \ + } while (0) + +#define ENABLE_NUMA ((!strcmp(MPIR_CVAR_MEMBIND_NUMA_ENABLE, "YES")) ? 
1 : 0) + +MPIDU_shm_obj_info_t MPIDU_shm_obj_info = { 0 }; +MPIDU_shm_numa_info_t MPIDU_shm_numa_info = { 0 }; + +static MPIDU_shm_seg_t memory; +static MPIDU_shm_barrier_t *barrier; + +#undef FUNCNAME +#define FUNCNAME MPIDU_shm_numa_info_init +#undef FCNAME +#define FCNAME MPL_QUOTE(FUNCNAME) +int MPIDU_shm_numa_info_init(int rank, int size, int *nodemap) +{ + int i, mpi_errno = MPI_SUCCESS; + int num_local; + int local_rank; + int local_rank_0; + int nnodes = 1; + int num_bound_nodes = 0; + int *nodeid_p = NULL; + +#ifdef HAVE_HWLOC + if (ENABLE_NUMA && MPIR_Process.bindset_is_valid) { + /* Detect number of NUMA nodes in the system */ + nnodes = hwloc_get_nbobjs_by_type(MPIR_Process.hwloc_topology, HWLOC_OBJ_NUMANODE); + + /* Detect whether processes are bound to only one numa node or not */ + hwloc_nodeset_t nodeset = hwloc_bitmap_alloc(); + hwloc_cpuset_to_nodeset(MPIR_Process.hwloc_topology, MPIR_Process.bindset, nodeset); + for (i = 0; i < nnodes; i++) { + hwloc_obj_t obj = + hwloc_get_obj_by_type(MPIR_Process.hwloc_topology, HWLOC_OBJ_NUMANODE, i); + /* NOTE: need 'hwloc_bitmap_intersects' instead of 'hwloc_bitmap_isequal' + * because in hwloc numa nodes in the same Group (Cluster) share the + * cpuset. Thus, converting cpusets to nodesets on systems like KNL + * will result on more than one node to be set in the bitmap (one for + * DDR and one for MCDRAM). However, in such systems, we only need to + * detect binding to DDR nodes (which have no 'subtype'). For more + * information see: B. Goglin "Exposing the Locality of Heterogeneous + * Memory Architectures to HPC Applications" */ + if (hwloc_bitmap_intersects(obj->nodeset, nodeset) && !obj->subtype) { + num_bound_nodes++; + } + } + hwloc_bitmap_free(nodeset); + } +#endif + + /* Initialize numa info */ + MPIDU_shm_numa_info.nodeset_is_valid = (num_bound_nodes == 1); + MPIDU_shm_numa_info.nnodes = nnodes; + MPIDU_shm_numa_info.nodeid = NULL; + MPIDU_shm_numa_info.siblid = NULL; + MPIDU_shm_numa_info.type = NULL; + MPIDU_shm_numa_info.bitmap = 0; + + /* Get nodemap info to initialize shm segment */ + MPIR_NODEMAP_get_local_info(rank, size, nodemap, &num_local, &local_rank, &local_rank_0); + + MPIR_CHKPMEM_DECL(3); + MPIR_CHKPMEM_MALLOC(MPIDU_shm_numa_info.nodeid, int *, num_local * sizeof(int), mpi_errno, + "node ids", MPL_MEM_OTHER); + MPIR_CHKPMEM_MALLOC(MPIDU_shm_numa_info.siblid, int *, nnodes * sizeof(int), mpi_errno, + "sibling ids", MPL_MEM_OTHER); + MPIR_CHKPMEM_MALLOC(MPIDU_shm_numa_info.type, MPIR_Memtype *, nnodes * sizeof(MPIR_Memtype), + mpi_errno, "node memory type", MPL_MEM_OTHER); + + /* Initialize node ids */ + for (i = 0; i < num_local; i++) + MPIDU_shm_numa_info.nodeid[i] = 0; + + /* Initialize sibling ids */ + for (i = 0; i < nnodes; i++) + MPIDU_shm_numa_info.siblid[i] = -1; + + /* Request node id per local rank region */ + mpi_errno = + MPIDU_shm_seg_alloc(num_local * sizeof(int), (void **) &nodeid_p, MPIR_MEMTYPE__DEFAULT, + MPL_MEM_SHM); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + /* Actually allocate segment and assign regions to pointer */ + mpi_errno = + MPIDU_shm_seg_commit(&memory, &barrier, num_local, local_rank, local_rank_0, rank, 0, + MPIDU_SHM_OBJ__NONE, MPL_MEM_SHM); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + /* Synchronize all processes */ + MPIDU_shm_barrier(barrier, num_local); + +#ifdef HAVE_HWLOC + /* Initialize default binding values */ + for (i = 0; i < MPIDU_SHM_OBJ__NUM; i++) { + MPIDU_shm_obj_info.object_type[i] = MPIR_MEMTYPE__NONE; + MPIDU_shm_obj_info.object_policy[i] = 
HWLOC_MEMBIND_BIND; + MPIDU_shm_obj_info.object_flags[i] = HWLOC_MEMBIND_STRICT; + } + + /* Detect NUMAs only if node binding is valid */ + if (MPIDU_shm_numa_info.nodeset_is_valid) { + hwloc_nodeset_t nodeset = hwloc_bitmap_alloc(); + hwloc_cpuset_to_nodeset(MPIR_Process.hwloc_topology, MPIR_Process.bindset, nodeset); + + for (i = 0; i < nnodes; i++) { + hwloc_obj_t obj = + hwloc_get_obj_by_type(MPIR_Process.hwloc_topology, HWLOC_OBJ_NUMANODE, i); + + /* Every rank sets its node id */ + if (hwloc_bitmap_intersects(obj->nodeset, nodeset) && !obj->subtype) { + nodeid_p[local_rank] = i; + } + + /* Set memory type for node to default */ + MPIDU_shm_numa_info.type[i] = MPIR_MEMTYPE__DEFAULT; + + /* Detect actual memory type for the node */ + if (obj->subtype && !strcmp(obj->subtype, "MCDRAM")) { + MPIDU_shm_numa_info.type[i] = MPIR_MEMTYPE__MCDRAM; + } + + /* Detect the sibling NUMA memory for the node */ + if (obj->next_sibling && obj->next_sibling->subtype && + !strcmp(obj->next_sibling->subtype, "MCDRAM")) { + MPIDU_shm_numa_info.siblid[i] = obj->next_sibling->logical_index; + } + } + + /* Wait for all ranks to update their shm segment information */ + MPIDU_shm_barrier(barrier, num_local); + + /* Each rank updates its private bitmap and nodeid array */ + for (i = 0; i < num_local; i++) { + if (!(MPIDU_shm_numa_info.bitmap & (1 << nodeid_p[i]))) { + MPIDU_shm_numa_info.bitmap |= (1 << nodeid_p[i]); + } + MPIDU_shm_numa_info.nodeid[i] = nodeid_p[i]; + } + + /* Set user defined binding values, if any */ + MEMBIND_INFO_SET(MPIR_CVAR_MEMBIND_TYPE_LIST, "type", type); + MEMBIND_INFO_SET(MPIR_CVAR_MEMBIND_POLICY_LIST, "policy", policy); + MEMBIND_INFO_SET(MPIR_CVAR_MEMBIND_FLAGS_LIST, "flags", flags); + + hwloc_bitmap_free(nodeset); + } else { + /* If no binding we need at least one node for later + * shared memory segment allocation, however we won't + * need to bind memory to the node in this case. 
*/ + MPIDU_shm_numa_info.bitmap = 1; + } +#else + /* Initialize default binding values */ + for (i = 0; i < MPIDU_SHM_OBJ__NUM; i++) { + MPIDU_shm_obj_info.object_type[i] = MPIR_MEMTYPE__NONE; + } + + /* No hwloc, init numa_info to have at least one node */ + MPIDU_shm_numa_info.bitmap = 1; + MPIR_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**hwloc"); +#endif + + MPIDU_shm_barrier(barrier, num_local); + + fn_exit: + /* Destroy shared memory */ + MPIDU_shm_seg_destroy(&memory, num_local); + return mpi_errno; + fn_fail: + MPIR_CHKPMEM_REAP(); + goto fn_exit; +} + +#undef FUNCNAME +#define FUNCNAME MPIDU_shm_numa_info_finalize +#undef FCNAME +#define FCNAME MPL_QUOTE(FUNCNAME) +int MPIDU_shm_numa_info_finalize() +{ + MPL_free(MPIDU_shm_numa_info.nodeid); + MPL_free(MPIDU_shm_numa_info.siblid); + MPL_free(MPIDU_shm_numa_info.type); + return MPI_SUCCESS; +} + +#undef FUNCNAME +#define FUNCNAME MPIDU_shm_numa_bind_set +#undef FCNAME +#define FCNAME MPL_QUOTE(FUNCNAME) +int MPIDU_shm_numa_bind_set(void *addr, size_t len, int node_id, MPIR_Info * info, + MPIDU_shm_obj_t object) +{ + int mpi_errno = MPI_SUCCESS; +#ifdef HAVE_HWLOC + int i; + int info_flag; + int flags = 0; + hwloc_membind_policy_t policy = HWLOC_MEMBIND_BIND; + hwloc_obj_t obj; + hwloc_nodeset_t nodeset; + MPIR_Memtype type = MPIR_MEMTYPE__NONE; + char info_value[MPI_MAX_INFO_VAL + 1]; + + /* If cannot bind fail */ + if (!MPIDU_shm_numa_info.nodeset_is_valid) { + MPIR_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**nobind"); + } + + /* If object is none skip binding */ + if (object == MPIDU_SHM_OBJ__NONE) { + goto fn_exit; + } + + if (object == MPIDU_SHM_OBJ__WIN) { + if (info) { + MPIR_Info_get_impl(info, "mpich_win_membind_type", MPI_MAX_INFO_VAL, info_value, + &info_flag); + if (info_flag) { + if (!strcmp(info_value, "dram")) { + type = MPIR_MEMTYPE__DDR; + } else if (!strcmp(info_value, "mcdram")) { + type = MPIR_MEMTYPE__MCDRAM; + } else { + type = MPIR_MEMTYPE__DEFAULT; + } + } + + MPIR_Info_get_impl(info, "mpich_win_membind_policy", MPI_MAX_INFO_VAL, info_value, + &info_flag); + if (info_flag) { + if (!strcmp(info_value, "bind")) { + policy = HWLOC_MEMBIND_BIND; + } else if (!strcmp(info_value, "interleave")) { + policy = HWLOC_MEMBIND_INTERLEAVE; + } + } + + MPIR_Info_get_impl(info, "mpich_win_membind_flags", MPI_MAX_INFO_VAL, info_value, + &info_flag); + if (info_flag) { + if (!strcmp(info_value, "strict")) { + flags = HWLOC_MEMBIND_STRICT; + } + } + } + } + + /* Overwrite user hints with environment */ + if (MPIDU_shm_obj_info.object_type[object] != MPIR_MEMTYPE__NONE) { + type = MPIDU_shm_obj_info.object_type[object]; + policy = MPIDU_shm_obj_info.object_policy[object]; + flags = MPIDU_shm_obj_info.object_flags[object]; + } + + /* No binding set by user, terminate */ + if (type == MPIR_MEMTYPE__NONE) { + goto fn_exit; + } + + int target_id = node_id; + int node_bitmap = MPIDU_shm_numa_info.bitmap; + int *nodeid_p = MPIDU_shm_numa_info.nodeid; + int *siblid_p = MPIDU_shm_numa_info.siblid; + + if (type != MPIR_MEMTYPE__DEFAULT) { + target_id = siblid_p[node_id]; + if (target_id >= 0) { + /* If non-default memory is not available, fail */ + if (MPIDU_shm_obj_info.object_type[object] != MPIDU_shm_numa_info.type[target_id]) { + MPIR_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**memtype"); + } + } else { + MPIR_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**memtype"); + } + } + + /* Get hwloc nodeset for memory binding */ + if (policy == HWLOC_MEMBIND_BIND) { + obj = hwloc_get_obj_by_type(MPIR_Process.hwloc_topology, 
HWLOC_OBJ_NUMANODE, target_id); + nodeset = hwloc_bitmap_dup(obj->nodeset); + } else if (policy == HWLOC_MEMBIND_INTERLEAVE) { + nodeset = hwloc_bitmap_alloc(); + for (i = 0; i < MPIDU_shm_numa_info.nnodes; i++) { + if (node_bitmap & (1 << i)) { + if (type == MPIR_MEMTYPE__DEFAULT) + obj = hwloc_get_obj_by_type(MPIR_Process.hwloc_topology, HWLOC_OBJ_NUMANODE, i); + else + obj = + hwloc_get_obj_by_type(MPIR_Process.hwloc_topology, HWLOC_OBJ_NUMANODE, + siblid_p[i]); + hwloc_bitmap_or(nodeset, obj->nodeset, nodeset); + } + } + } + + /* Do memory binding */ + hwloc_set_area_membind(MPIR_Process.hwloc_topology, addr, len, nodeset, + policy, HWLOC_MEMBIND_BYNODESET | flags); + + hwloc_bitmap_free(nodeset); +#else + MPIR_ERR_CHKANDJUMP(1, mpi_errno, MPI_ERR_OTHER, "**hwloc"); +#endif + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} From d8dfc407735c865478c11e6aaca3f67266552021 Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Tue, 12 Feb 2019 21:47:58 -0600 Subject: [PATCH 3/7] shm: add binding for shared segment allocations --- src/mpid/common/shm/mpidu_shm_alloc.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/mpid/common/shm/mpidu_shm_alloc.c b/src/mpid/common/shm/mpidu_shm_alloc.c index d3bc9f934b5..ec8d2d7b3fc 100644 --- a/src/mpid/common/shm/mpidu_shm_alloc.c +++ b/src/mpid/common/shm/mpidu_shm_alloc.c @@ -250,6 +250,13 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie (void **) &(memory->base_addr), 0); MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**alloc_shar_mem"); + if (sibling_segment_len > 0) + MPIDU_shm_numa_bind_set(memory->base_addr + default_segment_len + padding, + sibling_segment_len, node_id, NULL, object); + else + MPIDU_shm_numa_bind_set(memory->base_addr, default_segment_len, node_id, NULL, + object); + /* post name of shared file */ MPIR_Assert(local_procs_0 == rank); @@ -368,6 +375,13 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie (void **) &(memory->base_addr), 0); MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**alloc_shar_mem"); + if (sibling_segment_len > 0) + MPIDU_shm_numa_bind_set(memory->base_addr + default_segment_len + padding, + sibling_segment_len, node_id, NULL, object); + else + MPIDU_shm_numa_bind_set(memory->base_addr, default_segment_len, node_id, NULL, + object); + /* post name of shared file */ MPIR_Assert(local_procs_0 == rank); @@ -520,6 +534,13 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t * memory, MPIDU_shm_barrier_t ** barrie (void **) &(memory->base_addr), 0); MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**alloc_shar_mem"); + if (sibling_segment_len > 0) + MPIDU_shm_numa_bind_set(memory->base_addr + default_segment_len + padding, + sibling_segment_len, node_id, NULL, object); + else + MPIDU_shm_numa_bind_set(memory->base_addr, default_segment_len, node_id, NULL, + object); + /* post name of shared file */ MPIR_Assert(local_procs_0 == rank); MPL_snprintf(key, key_max_sz, "sharedFilename[%i]-%i", rank, num_segments); From a97ef594106135d720eef2b9240528fbe3df44df Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Tue, 12 Feb 2019 22:52:09 -0600 Subject: [PATCH 4/7] posix: integrate numa-aware shared memory segment allocation in fbox This patch modifies the previous fbox segment allocation mechanism to make it numa and heterogeneous memory-aware. This is done by counting the number of available numa nodes used by MPI processes and creating an equal number of shared memory segments (instead of just one). 
Each of these segments will contain the fbox elements for the processes located in the corresponding numa node and can be bound to the requested type of memory (i.e., DDR or MCDRAM). --- src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h | 121 +++++++++++++----- .../ch4/shm/posix/eager/fbox/fbox_types.h | 4 +- src/mpid/ch4/src/ch4_init.h | 4 + 3 files changed, 95 insertions(+), 34 deletions(-) diff --git a/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h b/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h index c83f5b186b6..ed217d395dc 100644 --- a/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h +++ b/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h @@ -13,7 +13,8 @@ #include "fbox_types.h" -#define MPIDI_POSIX_MAILBOX_INDEX(sender, receiver) ((num_local) * (sender) + (receiver)) +#define MPIDI_POSIX_SEGMENT_INDEX(receiver) (rank_to_seg[receiver]) +#define MPIDI_POSIX_MAILBOX_INDEX(sender,receiver) (((num_in_seg[receiver]) * (num_local)) + (sender)) extern MPIDI_POSIX_eager_fbox_control_t MPIDI_POSIX_eager_fbox_control_global; @@ -29,7 +30,13 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) int *local_ranks, *local_procs; MPIDI_av_entry_t *av = NULL; - MPIDI_POSIX_fastbox_t *fastboxes_p = NULL; + MPIDI_POSIX_fastbox_t **fastboxes_p = NULL; + int seg_to_nodeid[MPIDU_SHM_MAX_NUMA_NUM]; + int nodeid_to_seg[MPIDU_SHM_MAX_NUMA_NUM]; + int num_seg = 0; + int *num_in_seg = NULL; + int *ranks_per_seg = NULL; + int *rank_to_seg = NULL; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_EAGER_INIT); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_EAGER_INIT); @@ -38,9 +45,18 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) MPIDI_CH4_SHM_POSIX_FBOX_GENERAL = MPL_dbg_class_alloc("SHM_POSIX_FBOX", "shm_posix_fbox"); #endif /* MPL_USE_DBG_LOGGING */ - MPIR_CHKPMEM_DECL(5); + MPIR_CHKPMEM_DECL(7); - MPIDI_POSIX_eager_fbox_control_global.num_seg = 1; + /* Get number of shared memory segments & map segment/node to node/segment */ + for (i = 0; i < MPIDU_shm_numa_info.nnodes; i++) { + if (MPIDU_shm_numa_info.bitmap & (1 << i)) { + seg_to_nodeid[num_seg] = i; + nodeid_to_seg[i] = num_seg; + num_seg++; + } + } + + MPIDI_POSIX_eager_fbox_control_global.num_seg = num_seg; MPIDI_POSIX_eager_fbox_control_global.next_poll_local_rank = 0; MPIR_CHKPMEM_MALLOC(local_procs, int *, size * sizeof(int), mpi_errno, @@ -70,33 +86,63 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) MPIDI_POSIX_eager_fbox_control_global.local_ranks = local_ranks; MPIDI_POSIX_eager_fbox_control_global.local_procs = local_procs; - MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_eager_fbox_control_global.seg, - MPIDU_shm_seg_info_t *, + /* Allocate memory for fastboxes and mapping info */ + MPIR_CHKLMEM_DECL(4); + MPIR_CHKLMEM_MALLOC(fastboxes_p, MPIDI_POSIX_fastbox_t **, + num_seg * sizeof(MPIDI_POSIX_fastbox_t), mpi_errno, "fbox pointers", + MPL_MEM_OTHER); + MPIR_CHKLMEM_MALLOC(ranks_per_seg, int *, num_seg * sizeof(int), mpi_errno, + "processes per segment", MPL_MEM_OTHER); + MPIR_CHKLMEM_MALLOC(rank_to_seg, int *, num_local * sizeof(int), + mpi_errno, "rank to segment map", MPL_MEM_OTHER); + MPIR_CHKLMEM_MALLOC(num_in_seg, int *, num_local * sizeof(int), + mpi_errno, "my number in segment", MPL_MEM_OTHER); + + /* Populate rank to segment mapping info */ + memset(ranks_per_seg, 0, sizeof(int) * num_seg); + for (i = 0; i < num_local; i++) { + rank_to_seg[i] = nodeid_to_seg[MPIDU_shm_numa_info.nodeid[i]]; + num_in_seg[i] = ranks_per_seg[rank_to_seg[i]]++; + } + + 
MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_eager_fbox_control_global.memory, + MPIDU_shm_seg_t *, + num_seg * sizeof(MPIDU_shm_seg_t), mpi_errno, "mem_region", MPL_MEM_SHM); + MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_eager_fbox_control_global.barrier_region, + void **, num_seg * sizeof(void *), mpi_errno, "barrier_region", + MPL_MEM_SHM); + MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_eager_fbox_control_global.seg, MPIDU_shm_seg_info_t *, MPIDI_POSIX_eager_fbox_control_global.num_seg * sizeof(MPIDU_shm_seg_info_t), mpi_errno, "mem_region segments", MPL_MEM_SHM); - /* Create region with one fastbox for every pair of local processes. */ - mpi_errno = - MPIDU_shm_seg_alloc(num_local * num_local * sizeof(MPIDI_POSIX_fastbox_t), - (void **) &fastboxes_p, MPIR_MEMTYPE__DEFAULT, MPL_MEM_SHM); - if (mpi_errno) - MPIR_ERR_POP(mpi_errno); - - /* Request shared collective barrier vars region */ - mpi_errno = MPIDU_shm_seg_alloc(MAX(sizeof(MPIDU_shm_barrier_t), MPIDU_SHM_CACHE_LINE_LEN), - (void **) &MPIDI_POSIX_eager_fbox_control_global.barrier_region, - MPIR_MEMTYPE__DEFAULT, MPL_MEM_SHM); - if (mpi_errno) - MPIR_ERR_POP(mpi_errno); - - /* Actually allocate the segment and assign regions to the pointers */ - mpi_errno = MPIDU_shm_seg_commit(&MPIDI_POSIX_eager_fbox_control_global.memory, - &MPIDI_POSIX_eager_fbox_control_global.barrier, - num_local, my_local_rank, local_rank_0, rank, 0, - MPIDU_SHM_OBJ__NONE, MPL_MEM_SHM); - if (mpi_errno) - MPIR_ERR_POP(mpi_errno); + for (i = 0; i < num_seg; i++) { + /* Create region with one fastbox for every pair of local processes. */ + mpi_errno = + MPIDU_shm_seg_alloc(ranks_per_seg[i] * num_local * sizeof(MPIDI_POSIX_fastbox_t), + (void **) &fastboxes_p[i], + MPIDU_shm_obj_info.object_type[MPIDU_SHM_OBJ__FASTBOXES], + MPL_MEM_SHM); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + /* Request shared collective barrier vars region */ + mpi_errno = MPIDU_shm_seg_alloc(MAX(sizeof(MPIDU_shm_barrier_t), MPIDU_SHM_CACHE_LINE_LEN), + (void **) + &MPIDI_POSIX_eager_fbox_control_global.barrier_region[i], + MPIR_MEMTYPE__DEFAULT, MPL_MEM_SHM); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + /* Actually allocate the segment and assign regions to the pointers */ + mpi_errno = MPIDU_shm_seg_commit(&MPIDI_POSIX_eager_fbox_control_global.memory[i], + &MPIDI_POSIX_eager_fbox_control_global.barrier, + num_local, my_local_rank, local_rank_0, rank, + seg_to_nodeid[i], MPIDU_SHM_OBJ__FASTBOXES, + MPL_MEM_SHM); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + } /* Allocate table of pointers to fastboxes */ MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_eager_fbox_control_global.mailboxes.in, @@ -107,11 +153,17 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) mpi_errno, "fastboxes", MPL_MEM_SHM); /* Fill in fbox tables */ + int segment_idx, mailbox_idx; for (i = 0; i < num_local; i++) { + segment_idx = MPIDI_POSIX_SEGMENT_INDEX(my_local_rank); + mailbox_idx = MPIDI_POSIX_MAILBOX_INDEX(i, my_local_rank); MPIDI_POSIX_eager_fbox_control_global.mailboxes.in[i] = - &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(i, my_local_rank)]; + &fastboxes_p[segment_idx][mailbox_idx]; + + segment_idx = MPIDI_POSIX_SEGMENT_INDEX(i); + mailbox_idx = MPIDI_POSIX_MAILBOX_INDEX(my_local_rank, i); MPIDI_POSIX_eager_fbox_control_global.mailboxes.out[i] = - &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(my_local_rank, i)]; + &fastboxes_p[segment_idx][mailbox_idx]; memset(MPIDI_POSIX_eager_fbox_control_global.mailboxes.in[i], 0, sizeof(MPIDI_POSIX_fastbox_t)); @@ -124,6 +176,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) 
MPIR_CHKPMEM_COMMIT(); fn_exit: + MPIR_CHKLMEM_FREEALL(); MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_EAGER_INIT); return mpi_errno; fn_fail: @@ -139,7 +192,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) #define FCNAME MPL_QUOTE(FUNCNAME) MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_finalize() { - int mpi_errno = MPI_SUCCESS; + int i, mpi_errno = MPI_SUCCESS; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_EAGER_FINALIZE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_EAGER_FINALIZE); @@ -156,8 +209,12 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_finalize() MPL_free(MPIDI_POSIX_eager_fbox_control_global.local_ranks); MPL_free(MPIDI_POSIX_eager_fbox_control_global.local_procs); - mpi_errno = MPIDU_shm_seg_destroy(&MPIDI_POSIX_eager_fbox_control_global.memory, - MPIDI_POSIX_eager_fbox_control_global.num_local); + for (i = 0; i < MPIDI_POSIX_eager_fbox_control_global.num_seg; i++) { + mpi_errno = MPIDU_shm_seg_destroy(&MPIDI_POSIX_eager_fbox_control_global.memory[i], + MPIDI_POSIX_eager_fbox_control_global.num_local); + } + MPL_free(MPIDI_POSIX_eager_fbox_control_global.memory); + MPL_free(MPIDI_POSIX_eager_fbox_control_global.barrier_region); fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_EAGER_FINALIZE); diff --git a/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h b/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h index 51b40f2acb0..ebb2b111e45 100644 --- a/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h +++ b/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h @@ -35,7 +35,7 @@ typedef struct MPIDI_POSIX_fbox_arrays { typedef struct MPIDI_POSIX_eager_fbox_control { - MPIDU_shm_seg_t memory; + MPIDU_shm_seg_t *memory; MPIDU_shm_seg_info_t *seg; int num_seg; @@ -44,7 +44,7 @@ typedef struct MPIDI_POSIX_eager_fbox_control { * of mailboxes */ MPIDU_shm_barrier_t *barrier; - void *barrier_region; + void **barrier_region; /* Keep track of all of the local processes in MPI_COMM_WORLD and what their original rank was * in that communicator. 
*/ diff --git a/src/mpid/ch4/src/ch4_init.h b/src/mpid/ch4/src/ch4_init.h index 61006b6cb82..3725b1174f6 100644 --- a/src/mpid/ch4/src/ch4_init.h +++ b/src/mpid/ch4/src/ch4_init.h @@ -430,6 +430,8 @@ MPL_STATIC_INLINE_PREFIX int MPID_Init(int *argc, #endif { + MPIDU_shm_numa_info_init(rank, size, MPIDI_CH4_Global.node_map[0]); + int shm_tag_bits = MPIR_TAG_BITS_DEFAULT, nm_tag_bits = MPIR_TAG_BITS_DEFAULT; #ifndef MPIDI_CH4_DIRECT_NETMOD mpi_errno = MPIDI_SHM_mpi_init_hook(rank, size, &n_shm_vnis_provided, &shm_tag_bits); @@ -528,6 +530,8 @@ MPL_STATIC_INLINE_PREFIX int MPID_Finalize(void) MPIR_ERR_POP(mpi_errno); #endif + MPIDU_shm_numa_info_finalize(); + int i; int max_n_avts; max_n_avts = MPIDIU_get_max_n_avts(); From 1062cfbcb3969412987f0925a7b45c6c67e9864b Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Tue, 22 Jan 2019 12:09:58 -0600 Subject: [PATCH 5/7] test/mpi: add pt2pt tests for heterogeneous memory --- test/mpi/pt2pt/testlist.def | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/mpi/pt2pt/testlist.def b/test/mpi/pt2pt/testlist.def index 2a31f743325..701c1f4a38b 100644 --- a/test/mpi/pt2pt/testlist.def +++ b/test/mpi/pt2pt/testlist.def @@ -50,3 +50,11 @@ dtype_send 2 recv_any 2 irecv_any 2 large_tag 2 + +# Heterogeneous memory tests +sendflood 8 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=FASTBOXES:AUTO env=MPIR_CVAR_MEMBIND_POLICY_LIST=FASTBOXES:BIND env=MPIR_CVAR_MEMBIND_FLAGS_LIST=FASTBOXES:STRICT timeLimit=600 +sendflood 8 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=FASTBOXES:AUTO env=MPIR_CVAR_MEMBIND_POLICY_LIST=FASTBOXES:INTERLEAVE env=MPIR_CVAR_MEMBIND_FLAGS_LIST=FASTBOXES:STRICT timeLimit=600 +sendflood 8 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=FASTBOXES:DRAM env=MPIR_CVAR_MEMBIND_POLICY_LIST=FASTBOXES:BIND env=MPIR_CVAR_MEMBIND_FLAGS_LIST=FASTBOXES:STRICT timeLimit=600 +sendflood 8 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=FASTBOXES:DRAM env=MPIR_CVAR_MEMBIND_POLICY_LIST=FASTBOXES:INTERLEAVE env=MPIR_CVAR_MEMBIND_FLAGS_LIST=FASTBOXES:STRICT timeLimit=600 +sendflood 8 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=FASTBOXES:MCDRAM env=MPIR_CVAR_MEMBIND_POLICY_LIST=FASTBOXES:BIND env=MPIR_CVAR_MEMBIND_FLAGS_LIST=FASTBOXES:STRICT timeLimit=600 +sendflood 8 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=FASTBOXES:MCDRAM env=MPIR_CVAR_MEMBIND_POLICY_LIST=FASTBOXES:INTERLEAVE env=MPIR_CVAR_MEMBIND_FLAGS_LIST=FASTBOXES:STRICT timeLimit=600 From e9b75764c9943313ebbfb159ae2d05b7f857b7e6 Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Thu, 24 Jan 2019 18:03:45 -0600 Subject: [PATCH 6/7] posix: add numa support to RMA shared memory window code Similarly to pt2pt fastbox integration this patch decomposes current single shared segment into multiple segments, one per numa node, that can then be separately bound using hwloc. Moreover, when using symheap either all single segment allocations succeed or none of them does. If a symheap segment allocation fails all the previous should be reverted. In order to accomplish this the new function: `MPIDI_CH4R_release_shm_symheap` has been introduced. 
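
For reference, the intended all-or-nothing behavior can be sketched as follows (illustrative only, not part of the patch; the alloc_all_win_segments() wrapper and its argument layout are hypothetical, while the two symheap helpers follow the signatures introduced in ch4r_symheap.h below):

/* Sketch: map one shared segment per NUMA node in use; if any symheap
 * mapping fails, release the segments that already succeeded so the
 * caller can fall back to a non-symmetric allocation. */
static int alloc_all_win_segments(MPI_Aint *shm_size, MPI_Aint **shm_offsets,
                                  MPIR_Comm *comm, MPIR_Win *win, int num_seg)
{
    int mpi_errno = MPI_SUCCESS;
    int seg, fail_flag = 0;

    for (seg = 0; seg < num_seg; seg++) {
        mpi_errno = MPIDI_CH4R_get_shm_symheap(shm_size[seg], shm_offsets[seg],
                                               comm, win, seg, &fail_flag);
        if (mpi_errno != MPI_SUCCESS || fail_flag)
            break;
    }

    if (seg < num_seg) {
        /* revert every segment that was already mapped successfully */
        while (--seg >= 0)
            MPIDI_CH4R_release_shm_symheap(shm_size[seg], win, seg);
    }

    return mpi_errno;
}
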
--- src/mpid/ch4/include/mpidpre.h | 7 +- src/mpid/ch4/src/ch4r_symheap.h | 31 +++- src/mpid/ch4/src/ch4r_win.h | 290 ++++++++++++++++++++++++++------ 3 files changed, 269 insertions(+), 59 deletions(-) diff --git a/src/mpid/ch4/include/mpidpre.h b/src/mpid/ch4/include/mpidpre.h index c1f2563f43a..491fa74d03d 100644 --- a/src/mpid/ch4/include/mpidpre.h +++ b/src/mpid/ch4/include/mpidpre.h @@ -366,10 +366,11 @@ typedef struct MPIDI_CH4U_win_target { typedef struct MPIDI_CH4U_win_t { uint64_t win_id; - void *mmap_addr; - int64_t mmap_sz; + void **mmap_addr; + int64_t *mmap_sz; + int num_seg; - MPL_shm_hnd_t shm_segment_handle; + MPL_shm_hnd_t *shm_segment_handle; /* per-window OP completion for fence */ MPIR_cc_t local_cmpl_cnts; /* increase at OP issuing, decrease at local completion */ diff --git a/src/mpid/ch4/src/ch4r_symheap.h b/src/mpid/ch4/src/ch4r_symheap.h index 34dcb3b8b3c..a44c423a744 100644 --- a/src/mpid/ch4/src/ch4r_symheap.h +++ b/src/mpid/ch4/src/ch4r_symheap.h @@ -388,7 +388,8 @@ static inline int MPIDI_CH4I_allreduce_maxloc(size_t mysz, int myloc, MPIR_Comm #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) static inline int MPIDI_CH4R_get_shm_symheap(MPI_Aint shm_size, MPI_Aint * shm_offsets, - MPIR_Comm * comm, MPIR_Win * win, int *fail_flag) + MPIR_Comm * comm, MPIR_Win * win, int seg_num, + int *fail_flag) { int mpi_errno = MPI_SUCCESS; unsigned any_mapfail_flag = 1; @@ -398,8 +399,8 @@ static inline int MPIDI_CH4R_get_shm_symheap(MPI_Aint shm_size, MPI_Aint * shm_o #ifdef USE_SYM_HEAP int iter = MPIR_CVAR_CH4_SHM_SYMHEAP_RETRY; - MPL_shm_hnd_t *shm_segment_hdl_ptr = &MPIDI_CH4U_WIN(win, shm_segment_handle); - void **base_ptr = &MPIDI_CH4U_WIN(win, mmap_addr); + MPL_shm_hnd_t *shm_segment_hdl_ptr = &MPIDI_CH4U_WIN(win, shm_segment_handle[seg_num]); + void **base_ptr = &MPIDI_CH4U_WIN(win, mmap_addr[seg_num]); size_t mapsize = 0, page_sz = 0, maxsz = 0; int maxsz_loc = 0; @@ -505,4 +506,28 @@ static inline int MPIDI_CH4R_get_shm_symheap(MPI_Aint shm_size, MPI_Aint * shm_o return mpi_errno; } +#undef FUNCNAME +#define FUNCNAME MPIDI_CH4R_release_shm_symheap +#undef FCNAME +#define FCNAME MPL_QUOTE(FUNCNAME) +static inline int MPIDI_CH4R_release_shm_symheap(MPI_Aint shm_size, MPIR_Win * win, int seg_num) +{ + int mpi_errno = MPI_SUCCESS; + +#ifdef USE_SYM_HEAP + MPL_shm_hnd_t *shm_segment_hdl_ptr = &MPIDI_CH4U_WIN(win, shm_segment_handle[seg_num]); + void *base_ptr = MPIDI_CH4U_WIN(win, mmap_addr[seg_num]); + + /* destroy successful shm segment */ + mpi_errno = MPIDI_CH4U_destroy_shm_segment(shm_size, shm_segment_hdl_ptr, base_ptr); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); +#endif + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + #endif /* CH4R_SYMHEAP_H_INCLUDED */ diff --git a/src/mpid/ch4/src/ch4r_win.h b/src/mpid/ch4/src/ch4r_win.h index 48f1c780a34..142f6753cb4 100644 --- a/src/mpid/ch4/src/ch4r_win.h +++ b/src/mpid/ch4/src/ch4r_win.h @@ -17,6 +17,8 @@ #include "mpir_info.h" #include "ch4r_symheap.h" #include "uthash.h" +#include "mpir_memtype.h" +#include "mpidu_shm.h" #ifdef HAVE_SYS_MMAN_H #include #endif /* HAVE_SYS_MMAN_H */ @@ -45,6 +47,12 @@ extern MPIR_T_pvar_timer_t PVAR_TIMER_rma_winlock_getlocallock ATTRIBUTE((unused extern MPIR_T_pvar_timer_t PVAR_TIMER_rma_wincreate_allgather ATTRIBUTE((unused)); extern MPIR_T_pvar_timer_t PVAR_TIMER_rma_amhdr_set ATTRIBUTE((unused)); +/* arrays used for mapping segments to NUMA nodes */ +#define LOCAL_SEGMAP_ARRAY_NUM (1) +static int win_nodeid_to_seg[MPIDU_SHM_MAX_NUMA_NUM]; +static int 
win_seg_to_nodeid[MPIDU_SHM_MAX_NUMA_NUM]; +static int *win_rank_to_seg = NULL; + MPL_STATIC_INLINE_PREFIX void MPIDI_CH4I_parse_info_accu_ops_str(const char *str, uint32_t * ops_ptr) { @@ -351,11 +359,19 @@ static inline int MPIDI_CH4R_win_init(MPI_Aint length, MPIR_Comm * comm_ptr, int create_flavor, int model) { int mpi_errno = MPI_SUCCESS; + int i, j; + int num_local = 0; + int num_seg = 0; + int *local_rank = NULL; MPIR_Win *win = (MPIR_Win *) MPIR_Handle_obj_alloc(&MPIR_Win_mem); MPIDI_CH4U_win_target_t *targets = NULL; MPIR_Comm *win_comm_ptr; + MPIR_Comm *shm_comm_ptr = NULL; MPIDI_CH4U_win_info_accu_op_shift_t op_shift; + MPIR_Errflag_t errflag = MPIR_ERR_NONE; + MPIR_CHKPMEM_DECL(3 + LOCAL_SEGMAP_ARRAY_NUM); + MPIR_CHKLMEM_DECL(2); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_WIN_INIT); MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH4R_WIN_INIT); @@ -419,10 +435,92 @@ static inline int MPIDI_CH4R_win_init(MPI_Aint length, MPIR_ERR_POP(mpi_errno); } + shm_comm_ptr = win_comm_ptr->node_comm; - MPIDI_CH4U_WIN(win, mmap_sz) = 0; + MPIDI_CH4U_WIN(win, mmap_sz) = NULL; MPIDI_CH4U_WIN(win, mmap_addr) = NULL; + if (create_flavor == MPI_WIN_FLAVOR_ALLOCATE || create_flavor == MPI_WIN_FLAVOR_SHARED) { + if (shm_comm_ptr != NULL) { + int *comm_world_rank = NULL; + MPIR_CHKLMEM_MALLOC(comm_world_rank, int *, shm_comm_ptr->local_size * sizeof(int), + mpi_errno, "win comm world ranks", MPL_MEM_RMA); + + /* convert shm_comm rank to comm_world rank */ + comm_world_rank[shm_comm_ptr->rank] = MPIR_Process.comm_world->rank; + + /* exchange comm_world ranks inside node */ + mpi_errno = MPIR_Allgather(MPI_IN_PLACE, + 0, + MPI_DATATYPE_NULL, + comm_world_rank, sizeof(int), MPI_BYTE, shm_comm_ptr, + &errflag); + if (mpi_errno != MPI_SUCCESS) + goto fn_fail; + + /* allocate array to keep local ranks in comm_world */ + MPIR_CHKLMEM_MALLOC(local_rank, int *, + shm_comm_ptr->local_size * sizeof(int), mpi_errno, + "win comm local ranks", MPL_MEM_RMA); + + /* convert comm_world ranks to local ranks */ + for (i = 0, j = 0; i < MPIR_Process.comm_world->local_size; i++) { + MPIDI_av_entry_t *av = MPIDIU_comm_rank_to_av(MPIR_Process.comm_world, i); + if (MPIDI_av_is_local(av)) { + if (i == comm_world_rank[num_local]) { + local_rank[num_local++] = j; + } + j++; + } + } + MPIR_Assert(num_local == shm_comm_ptr->local_size); + } else { + num_local = 1; /* only one proc per node */ + MPIR_CHKLMEM_MALLOC(local_rank, int *, sizeof(int), mpi_errno, "win comm local ranks", + MPL_MEM_RMA); + *local_rank = 0; + } + + /* get number of segments to be allocated and node mappings */ + int bitmap = 0, bit_val; + for (i = 0; i < num_local; i++) { + j = MPIDU_shm_numa_info.nodeid[local_rank[i]]; + bit_val = (1 << j); + if (!(bitmap & bit_val)) { + win_nodeid_to_seg[j] = num_seg; + win_seg_to_nodeid[num_seg] = j; + num_seg++; + bitmap |= bit_val; + } + } + + /* allocate segment mapping arrays */ + MPIR_CHKPMEM_MALLOC(win_rank_to_seg, int *, num_local * sizeof(int), + mpi_errno, "win rank to segment map", MPL_MEM_RMA); + + /* compute rank to seg mapping */ + for (i = 0; i < num_local; i++) { + win_rank_to_seg[i] = win_nodeid_to_seg[MPIDU_shm_numa_info.nodeid[local_rank[i]]]; + } + + /* register number of segments in win object */ + MPIDI_CH4U_WIN(win, num_seg) = num_seg; + + MPIR_CHKPMEM_MALLOC(MPIDI_CH4U_WIN(win, shm_segment_handle), MPL_shm_hnd_t *, + num_seg * sizeof(MPL_shm_hnd_t), mpi_errno, "win shm segment handle", + MPL_MEM_RMA); + MPIR_CHKPMEM_MALLOC(MPIDI_CH4U_WIN(win, mmap_sz), int64_t *, num_seg * sizeof(int64_t), 
+ mpi_errno, "win mmap sizes", MPL_MEM_RMA); + MPIR_CHKPMEM_MALLOC(MPIDI_CH4U_WIN(win, mmap_addr), void **, num_seg * sizeof(void *), + mpi_errno, "win mmap address pointers", MPL_MEM_RMA); + + /* init segments size and address */ + for (i = 0; i < num_seg; i++) { + MPIDI_CH4U_WIN(win, mmap_sz[i]) = 0; + MPIDI_CH4U_WIN(win, mmap_addr[i]) = NULL; + } + } + MPIR_cc_set(&MPIDI_CH4U_WIN(win, local_cmpl_cnts), 0); MPIR_cc_set(&MPIDI_CH4U_WIN(win, remote_cmpl_cnts), 0); MPIR_cc_set(&MPIDI_CH4U_WIN(win, remote_acc_cmpl_cnts), 0); @@ -430,10 +528,14 @@ static inline int MPIDI_CH4R_win_init(MPI_Aint length, MPIDI_CH4U_WIN(win, win_id) = MPIDI_CH4U_generate_win_id(comm_ptr); MPIDI_CH4U_map_set(MPIDI_CH4_Global.win_map, MPIDI_CH4U_WIN(win, win_id), win, MPL_MEM_RMA); + MPIR_CHKPMEM_COMMIT(); + fn_exit: MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH4R_WIN_INIT); + MPIR_CHKLMEM_FREEALL(); return mpi_errno; fn_fail: + MPIR_CHKPMEM_REAP(); goto fn_exit; } @@ -1027,6 +1129,7 @@ static inline int MPIDI_CH4R_win_finalize(MPIR_Win ** win_ptr) { int mpi_errno = MPI_SUCCESS; int all_completed = 0; + int i; MPIR_Win *win = *win_ptr; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_WIN_FINALIZE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_WIN_FINALIZE); @@ -1069,21 +1172,30 @@ static inline int MPIDI_CH4R_win_finalize(MPIR_Win ** win_ptr) win->create_flavor == MPI_WIN_FLAVOR_SHARED) { /* if more than one process on a node, we always use shared memory */ if (win->comm_ptr->node_comm != NULL) { - if (MPIDI_CH4U_WIN(win, mmap_sz) > 0) { - /* destroy shared window memory */ - mpi_errno = MPIDI_CH4U_destroy_shm_segment(MPIDI_CH4U_WIN(win, mmap_sz), - &MPIDI_CH4U_WIN(win, shm_segment_handle), - &MPIDI_CH4U_WIN(win, mmap_addr)); - if (mpi_errno) - MPIR_ERR_POP(mpi_errno); + for (i = 0; i < MPIDI_CH4U_WIN(win, num_seg); i++) { + if (MPIDI_CH4U_WIN(win, mmap_sz[i]) > 0) { + /* destroy shared window memory */ + mpi_errno = MPIDI_CH4U_destroy_shm_segment(MPIDI_CH4U_WIN(win, mmap_sz[i]), + &MPIDI_CH4U_WIN(win, + shm_segment_handle + [i]), + &MPIDI_CH4U_WIN(win, mmap_addr[i])); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + } } MPL_free(MPIDI_CH4U_WIN(win, shared_table)); - } else if (MPIDI_CH4U_WIN(win, mmap_sz) > 0) { + } else if (MPIDI_CH4U_WIN(win, mmap_sz[0]) > 0) { /* if single process on the node, we use mmap with symm heap */ - MPL_munmap(MPIDI_CH4U_WIN(win, mmap_addr), MPIDI_CH4U_WIN(win, mmap_sz), MPL_MEM_RMA); + MPL_munmap(MPIDI_CH4U_WIN(win, mmap_addr[0]), MPIDI_CH4U_WIN(win, mmap_sz[0]), + MPL_MEM_RMA); } else MPL_free(win->base); + + MPL_free(MPIDI_CH4U_WIN(win, mmap_sz)); + MPL_free(MPIDI_CH4U_WIN(win, shm_segment_handle)); + MPL_free(MPIDI_CH4U_WIN(win, mmap_addr)); } MPIDI_CH4U_map_erase(MPIDI_CH4_Global.win_map, MPIDI_CH4U_WIN(win, win_id)); @@ -1279,19 +1391,20 @@ static inline int MPIDI_CH4I_win_shm_alloc_impl(MPI_Aint size, MPIR_Comm * comm_ptr, void **base_ptr, MPIR_Win ** win_ptr) { - int i, mpi_errno = MPI_SUCCESS; + int i, j, mpi_errno = MPI_SUCCESS; MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPIR_Win *win = NULL; - size_t total_shm_size = 0LL; + size_t *total_shm_size = NULL; MPIDI_CH4U_win_shared_info_t *shared_table = NULL; - MPI_Aint *shm_offsets = NULL; + MPI_Aint **shm_offsets = NULL; MPIR_Comm *shm_comm_ptr = comm_ptr->node_comm; - size_t page_sz = 0, mapsize; + size_t page_sz = 0; int mapfail_flag = 0; unsigned symheap_flag = 1, global_symheap_flag = 0; + int num_seg = MPIDI_CH4U_WIN(*win_ptr, num_seg); MPIR_CHKPMEM_DECL(2); - MPIR_CHKLMEM_DECL(1); + MPIR_CHKLMEM_DECL(2 + 
MPIDU_SHM_MAX_NUMA_NUM); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4I_WIN_SHM_ALLOC_IMPL); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4I_WIN_SHM_ALLOC_IMPL); @@ -1323,31 +1436,56 @@ static inline int MPIDI_CH4I_win_shm_alloc_impl(MPI_Aint size, if (mpi_errno != MPI_SUCCESS) goto fn_fail; - MPIR_CHKLMEM_MALLOC(shm_offsets, MPI_Aint *, shm_comm_ptr->local_size * sizeof(MPI_Aint), - mpi_errno, "shm offset", MPL_MEM_RMA); + MPIR_CHKLMEM_MALLOC(shm_offsets, MPI_Aint **, num_seg * sizeof(MPI_Aint *), + mpi_errno, "shm offset pointers", MPL_MEM_RMA); + MPIR_CHKLMEM_MALLOC(total_shm_size, size_t *, num_seg * sizeof(size_t), + mpi_errno, "total shm size", MPL_MEM_RMA); + + for (i = 0; i < num_seg; i++) { + MPIR_CHKLMEM_MALLOC(shm_offsets[i], MPI_Aint *, + shm_comm_ptr->local_size * sizeof(MPI_Aint), mpi_errno, + "shm offset", MPL_MEM_RMA); + } /* No allreduce here because this is a shared memory domain * and should be a relatively small number of processes * and a non performance sensitive API. */ - for (i = 0; i < shm_comm_ptr->local_size; i++) { - shm_offsets[i] = (MPI_Aint) total_shm_size; - if (MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig) - total_shm_size += MPIDI_CH4R_get_mapsize(shared_table[i].size, &page_sz); - else - total_shm_size += shared_table[i].size; + for (i = 0; i < num_seg; i++) { + total_shm_size[i] = 0LL; + for (j = 0; j < shm_comm_ptr->local_size; j++) { + if (win_rank_to_seg[j] == i) { + shm_offsets[i][j] = (MPI_Aint) total_shm_size[i]; + if (MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig) + total_shm_size[i] += MPIDI_CH4R_get_mapsize(shared_table[j].size, &page_sz); + else + total_shm_size[i] += shared_table[j].size; + } else { + shm_offsets[i][j] = 0; + } + } } /* if all processes give zero size on a single node window, simply return. */ - if (total_shm_size == 0 && shm_comm_ptr->local_size == comm_ptr->local_size) + int zero_size_seg = 0; + for (i = 0; i < num_seg; i++) + if (total_shm_size[i] == 0) + zero_size_seg++; + + if (zero_size_seg == num_seg && shm_comm_ptr->local_size == comm_ptr->local_size) goto fn_exit; /* if my size is not page aligned and noncontig is disabled, skip global symheap. */ if (size != MPIDI_CH4R_get_mapsize(size, &page_sz) && !MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig) symheap_flag = 0; - } else - total_shm_size = size; + } else { + MPIR_CHKLMEM_MALLOC(shm_offsets, MPI_Aint **, sizeof(MPI_Aint *), + mpi_errno, "shm offset pointers", MPL_MEM_RMA); + MPIR_CHKLMEM_MALLOC(total_shm_size, size_t *, sizeof(size_t), + mpi_errno, "total shm size", MPL_MEM_RMA); + *total_shm_size = size; + } /* try global symm heap only when multiple processes exist */ if (comm_ptr->local_size > 1) { @@ -1367,55 +1505,85 @@ static inline int MPIDI_CH4I_win_shm_alloc_impl(MPI_Aint size, /* because MPI_shm follows a create & attach mode, we need to set the * size of entire shared memory segment on each node as the size of * each process. 
*/ - mapsize = MPIDI_CH4R_get_mapsize(total_shm_size, &page_sz); - MPIDI_CH4U_WIN(win, mmap_sz) = mapsize; + for (i = 0; i < num_seg; i++) { + MPIDI_CH4U_WIN(win, mmap_sz[i]) = MPIDI_CH4R_get_mapsize(total_shm_size[i], &page_sz); + } /* first try global symmetric heap segment allocation */ if (global_symheap_flag) { - mpi_errno = MPIDI_CH4R_get_shm_symheap(mapsize, shm_offsets, comm_ptr, win, &mapfail_flag); - if (mpi_errno != MPI_SUCCESS) - goto fn_fail; + for (i = 0; i < num_seg; i++) { + mpi_errno = + MPIDI_CH4R_get_shm_symheap(MPIDI_CH4U_WIN(win, mmap_sz[i]), shm_offsets[i], + comm_ptr, win, i, &mapfail_flag); + if (mpi_errno != MPI_SUCCESS) + goto fn_fail; + + if (mapfail_flag) { + for (j = 0; j < i; j++) { + /* release successfully allocated segments */ + mpi_errno = + MPIDI_CH4R_release_shm_symheap(MPIDI_CH4U_WIN(win, mmap_sz[j]), win, j); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + } + break; + } + } } /* if fails, try normal shm segment allocation or malloc */ if (!global_symheap_flag || mapfail_flag) { - if (shm_comm_ptr != NULL && mapsize) { - mpi_errno = MPIDI_CH4U_allocate_shm_segment(shm_comm_ptr, mapsize, - &MPIDI_CH4U_WIN(win, shm_segment_handle), - &MPIDI_CH4U_WIN(win, mmap_addr)); - if (mpi_errno != MPI_SUCCESS) - goto fn_fail; + if (shm_comm_ptr != NULL) { + for (i = 0; i < num_seg; i++) { + if (MPIDI_CH4U_WIN(win, mmap_sz[i])) { + mpi_errno = + MPIDI_CH4U_allocate_shm_segment(shm_comm_ptr, + MPIDI_CH4U_WIN(win, mmap_sz[i]), + &MPIDI_CH4U_WIN(win, shm_segment_handle[i]), + &MPIDI_CH4U_WIN(win, mmap_addr[i])); + if (mpi_errno != MPI_SUCCESS) + goto fn_fail; + } + } } else if (size > 0) { MPIR_CHKPMEM_MALLOC(*base_ptr, void *, size, mpi_errno, "(*win_ptr)->base", MPL_MEM_RMA); MPL_VG_MEM_INIT(*base_ptr, size); - MPIDI_CH4U_WIN(win, mmap_sz) = 0; /* reset mmap_sz if use malloc */ + MPIDI_CH4U_WIN(win, mmap_sz[0]) = 0; /* reset mmap_sz if use malloc */ } } /* compute the base addresses of each process within the shared memory segment */ + /* need number of processes per segment too */ if (shm_comm_ptr != NULL) { - char *cur_base = (char *) MPIDI_CH4U_WIN(win, mmap_addr); - for (i = 0; i < shm_comm_ptr->local_size; i++) { - if (shared_table[i].size) - shared_table[i].shm_base_addr = cur_base; - else - shared_table[i].shm_base_addr = NULL; + for (i = 0; i < num_seg; i++) { + char *cur_base = (char *) MPIDI_CH4U_WIN(win, mmap_addr[i]); + for (j = 0; j < shm_comm_ptr->local_size; j++) { + if (win_rank_to_seg[j] != i) + continue; + + if (shared_table[j].size) + shared_table[j].shm_base_addr = cur_base; + else + shared_table[j].shm_base_addr = NULL; - if (MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig) - cur_base += MPIDI_CH4R_get_mapsize(shared_table[i].size, &page_sz); - else - cur_base += shared_table[i].size; + if (MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig) + cur_base += MPIDI_CH4R_get_mapsize(shared_table[j].size, &page_sz); + else + cur_base += shared_table[j].size; + } } - *base_ptr = shared_table[shm_comm_ptr->rank].shm_base_addr; - } else if (MPIDI_CH4U_WIN(win, mmap_sz) > 0) { + } else if (MPIDI_CH4U_WIN(win, mmap_sz[0]) > 0) { /* if symm heap is allocated without shared memory, use the mapping address */ - *base_ptr = MPIDI_CH4U_WIN(win, mmap_addr); + *base_ptr = MPIDI_CH4U_WIN(win, mmap_addr[0]); } /* otherwise, it has already be assigned with a local memory region or NULL (zero size). 
*/ fn_exit: + /* free segment mapping arrays */ + MPL_free(win_rank_to_seg); + MPIR_CHKLMEM_FREEALL(); MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4I_WIN_SHM_ALLOC_IMPL); return mpi_errno; @@ -1434,7 +1602,7 @@ static inline int MPIDI_CH4R_mpi_win_allocate_shared(MPI_Aint size, MPIR_Comm * comm_ptr, void **base_ptr, MPIR_Win ** win_ptr) { - int mpi_errno = MPI_SUCCESS; + int i, mpi_errno = MPI_SUCCESS; MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPIR_Win *win = NULL; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_ALLOCATE_SHARED); @@ -1449,6 +1617,14 @@ static inline int MPIDI_CH4R_mpi_win_allocate_shared(MPI_Aint size, if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno); + /* bind segments to nodes */ + if (comm_ptr->node_comm && comm_ptr->node_comm->rank == 0) { + for (i = 0; i < MPIDI_CH4U_WIN(*win_ptr, num_seg); i++) + MPIDU_shm_numa_bind_set(MPIDI_CH4U_WIN(*win_ptr, mmap_addr[i]), + MPIDI_CH4U_WIN(*win_ptr, mmap_sz[i]), win_seg_to_nodeid[i], + info_ptr, MPIDU_SHM_OBJ__WIN); + } + win = *win_ptr; win->base = *base_ptr; win->size = size; @@ -1563,7 +1739,7 @@ static inline int MPIDI_CH4R_mpi_win_allocate(MPI_Aint size, MPIR_Info * info, MPIR_Comm * comm, void *baseptr, MPIR_Win ** win_ptr) { - int mpi_errno = MPI_SUCCESS; + int i, mpi_errno = MPI_SUCCESS; MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPIR_Win *win; void **base_ptr = (void **) baseptr; @@ -1581,6 +1757,14 @@ static inline int MPIDI_CH4R_mpi_win_allocate(MPI_Aint size, if (mpi_errno != MPI_SUCCESS) goto fn_fail; + /* bind segments to nodes */ + if (comm->node_comm && comm->node_comm->rank == 0) { + for (i = 0; i < MPIDI_CH4U_WIN(*win_ptr, num_seg); i++) + MPIDU_shm_numa_bind_set(MPIDI_CH4U_WIN(*win_ptr, mmap_addr[i]), + MPIDI_CH4U_WIN(*win_ptr, mmap_sz[i]), win_seg_to_nodeid[i], + info, MPIDU_SHM_OBJ__WIN); + } + win = *win_ptr; win->base = *(void **) baseptr; win->size = size; From 81c24b1d6fec7f6bc2cd903c54b1ebe46e316c56 Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Fri, 8 Feb 2019 20:53:47 -0600 Subject: [PATCH 7/7] test/mpi: add rma tests for heterogeneous memory --- test/mpi/rma/testlist.def | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/mpi/rma/testlist.def b/test/mpi/rma/testlist.def index 4e963cce330..9fcc243531b 100644 --- a/test/mpi/rma/testlist.def +++ b/test/mpi/rma/testlist.def @@ -139,6 +139,12 @@ manyget 2 timeLimit=300 derived_acc_flush_local 3 mpiversion=3.0 large_acc_flush_local 3 mpiversion=3.0 large_small_acc 2 +win_shared_put_flush_load 3 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=WIN:AUTO env=MPIR_CVAR_MEMBIND_POLICY_LIST=WIN:BIND env=MPIR_CVAR_MEMBIND_FLAGS_LIST=WIN:STRICT mpiversion=3.0 +win_shared_put_flush_load 3 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=WIN:AUTO env=MPIR_CVAR_MEMBIND_POLICY_LIST=WIN:INTERLEAVE env=MPIR_CVAR_MEMBIND_FLAGS_LIST=WIN:STRICT mpiversion=3.0 +win_shared_put_flush_load 3 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=WIN:DRAM env=MPIR_CVAR_MEMBIND_POLICY_LIST=WIN:BIND env=MPIR_CVAR_MEMBIND_FLAGS_LIST=WIN:STRICT mpiversion=3.0 +win_shared_put_flush_load 3 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=WIN:DRAM env=MPIR_CVAR_MEMBIND_POLICY_LIST=WIN:INTERLEAVE env=MPIR_CVAR_MEMBIND_FLAGS_LIST=WIN:STRICT mpiversion=3.0 +win_shared_put_flush_load 3 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=WIN:MCDRAM env=MPIR_CVAR_MEMBIND_POLICY_LIST=WIN:BIND env=MPIR_CVAR_MEMBIND_FLAGS_LIST=WIN:STRICT mpiversion=3.0 
+win_shared_put_flush_load 3 env=MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES env=MPIR_CVAR_MEMBIND_TYPE_LIST=WIN:MCDRAM env=MPIR_CVAR_MEMBIND_POLICY_LIST=WIN:INTERLEAVE env=MPIR_CVAR_MEMBIND_FLAGS_LIST=WIN:STRICT mpiversion=3.0
 win_shared_put_flush_load 3 mpiversion=3.0
 win_shared_acc_flush_load 3 mpiversion=3.0
 win_shared_gacc_flush_load 3 mpiversion=3.0
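
Note on the new testlist entries: the first field is the test binary, "3" is the process count, and each "env=" field is an environment variable set for that particular run, so the same win_shared_put_flush_load test is exercised under the different window memory bindings (AUTO, DRAM, MCDRAM target types with BIND and INTERLEAVE policies). As a rough, illustrative sketch only, assuming the usual test/mpi/rma build directory and that the harness simply exports the env= settings before launching, one of the MCDRAM entries corresponds to an invocation like:

    MPIR_CVAR_MEMBIND_NUMA_ENABLE=YES \
    MPIR_CVAR_MEMBIND_TYPE_LIST=WIN:MCDRAM \
    MPIR_CVAR_MEMBIND_POLICY_LIST=WIN:BIND \
    MPIR_CVAR_MEMBIND_FLAGS_LIST=WIN:STRICT \
    mpiexec -n 3 ./win_shared_put_flush_load

The pre-existing "win_shared_put_flush_load 3 mpiversion=3.0" entry is unchanged, so the test also continues to run once without any window memory binding.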