diff --git a/configure.ac b/configure.ac index 88e6879e6ff..b0ee91a7915 100644 --- a/configure.ac +++ b/configure.ac @@ -1173,6 +1173,7 @@ fi if test "$izem_atomic" = "yes" ; then AC_DEFINE(ENABLE_IZEM_ATOMIC,1,[Define to enable using Izem CPU atomics]) fi +AM_CONDITIONAL([ENABLE_IZEM_ATOMIC], [test x$izem_atomic = xyes]) AC_ARG_VAR([ZMLIBNAME],[can be used to override the name of the Izem library (default: "zm")]) ZMLIBNAME=${ZMLIBNAME:-"zm"} diff --git a/maint/extracterrmsgs b/maint/extracterrmsgs index 05c401d74fe..d795f54810b 100755 --- a/maint/extracterrmsgs +++ b/maint/extracterrmsgs @@ -731,7 +731,7 @@ sub ProcessFile !($args[$errClassLoc] =~ /^MPIDI_CH3I_SOCK_ERR_/) && !($args[$errClassLoc] =~ /^MPIX_ERR_/) && !($args[$errClassLoc] =~ /^errclass/) && - !($args[$errClassLoc] =~ /^\*\(errflag_\)/) && + !($args[$errClassLoc] =~ /^errflag/) && !($args[$errClassLoc] =~ /^\*errflag/)) { $bad_syntax_in_file{$filename} = 1; print STDERR "Invalid argument $args[$errClassLoc] for the MPI Error class in $routineName in $filename\n"; diff --git a/src/mpi/coll/algorithms/common/algo_common.h b/src/mpi/coll/algorithms/common/algo_common.h index 9914d0c7b45..6e4f35dd52b 100644 --- a/src/mpi/coll/algorithms/common/algo_common.h +++ b/src/mpi/coll/algorithms/common/algo_common.h @@ -22,19 +22,19 @@ static int MPII_Algo_compare_int(const void *a, const void *b) } /* Avoid unused function warning in certain configurations */ -static int MPII_Algo_calculate_pipeline_chunk_info(int maxbytes, int type_size, int count, - int *num_segments, int *segsize_floor, - int *segsize_ceil) ATTRIBUTE((unused)); -static int MPII_Algo_calculate_pipeline_chunk_info(int maxbytes, - int type_size, int count, - int *num_segments, - int *segsize_floor, int *segsize_ceil) +static inline int MPIR_Algo_calculate_pipeline_chunk_info(int maxbytes, int type_size, int count, + int *num_segments, int *segsize_floor, + int *segsize_ceil) ATTRIBUTE((unused)); +static inline int MPIR_Algo_calculate_pipeline_chunk_info(int maxbytes, + int type_size, int count, + int *num_segments, + int *segsize_floor, int *segsize_ceil) { int maxelems; int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); - MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIR_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIR_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); if (count == 0 || type_size == 0) { *num_segments = *segsize_floor = *segsize_ceil = 0; @@ -57,7 +57,7 @@ static int MPII_Algo_calculate_pipeline_chunk_info(int maxbytes, MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, "num_segments %d", *num_segments)); - MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIR_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); fn_exit: return mpi_errno; diff --git a/src/mpi/coll/algorithms/treealgo/treealgo.c b/src/mpi/coll/algorithms/treealgo/treealgo.c index 67bb3807cca..317adc4eef1 100644 --- a/src/mpi/coll/algorithms/treealgo/treealgo.c +++ b/src/mpi/coll/algorithms/treealgo/treealgo.c @@ -35,13 +35,13 @@ int MPII_Treealgo_comm_cleanup(MPIR_Comm * comm) } -int MPII_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int root, - MPII_Treealgo_tree_t * ct) +int MPIR_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int root, + MPIR_Treealgo_tree_t * ct) { int mpi_errno = MPI_SUCCESS; - 
MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_TREEALGO_TREE_INIT); - MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_TREEALGO_TREE_INIT); + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIR_TREEALGO_TREE_INIT); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIR_TREEALGO_TREE_INIT); switch (tree_type) { case MPIR_TREE_TYPE_KARY: @@ -68,7 +68,7 @@ int MPII_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int ro break; } - MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_TREEALGO_TREE_INIT); + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIR_TREEALGO_TREE_INIT); fn_exit: return mpi_errno; @@ -78,7 +78,7 @@ int MPII_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int ro } -void MPII_Treealgo_tree_free(MPII_Treealgo_tree_t * tree) +void MPIR_Treealgo_tree_free(MPIR_Treealgo_tree_t * tree) { utarray_free(tree->children); } diff --git a/src/mpi/coll/algorithms/treealgo/treealgo.h b/src/mpi/coll/algorithms/treealgo/treealgo.h index e4f6418410b..7a96b26cd99 100644 --- a/src/mpi/coll/algorithms/treealgo/treealgo.h +++ b/src/mpi/coll/algorithms/treealgo/treealgo.h @@ -16,8 +16,8 @@ int MPII_Treealgo_init(void); int MPII_Treealgo_comm_init(MPIR_Comm * comm); int MPII_Treealgo_comm_cleanup(MPIR_Comm * comm); -int MPII_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int root, - MPII_Treealgo_tree_t * ct); -void MPII_Treealgo_tree_free(MPII_Treealgo_tree_t * tree); +int MPIR_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int root, + MPIR_Treealgo_tree_t * ct); +void MPIR_Treealgo_tree_free(MPIR_Treealgo_tree_t * tree); #endif /* TREEALGO_H_INCLUDED */ diff --git a/src/mpi/coll/algorithms/treealgo/treealgo_types.h b/src/mpi/coll/algorithms/treealgo/treealgo_types.h index ad98f26a5b0..fd471c46709 100644 --- a/src/mpi/coll/algorithms/treealgo/treealgo_types.h +++ b/src/mpi/coll/algorithms/treealgo/treealgo_types.h @@ -18,6 +18,6 @@ typedef struct { int parent; int num_children; UT_array *children; -} MPII_Treealgo_tree_t; +} MPIR_Treealgo_tree_t; #endif /* TREEALGO_TYPES_H_INCLUDED */ diff --git a/src/mpi/coll/algorithms/treealgo/treeutil.c b/src/mpi/coll/algorithms/treealgo/treeutil.c index ebc88e3fce5..53eff66ebac 100644 --- a/src/mpi/coll/algorithms/treealgo/treeutil.c +++ b/src/mpi/coll/algorithms/treealgo/treeutil.c @@ -15,7 +15,7 @@ #include "treeutil.h" #include "mpiimpl.h" -static int tree_add_child(MPII_Treealgo_tree_t * t, int rank) +static int tree_add_child(MPIR_Treealgo_tree_t * t, int rank) { int mpi_errno = MPI_SUCCESS; @@ -26,7 +26,7 @@ static int tree_add_child(MPII_Treealgo_tree_t * t, int rank) } -int MPII_Treeutil_tree_kary_init(int rank, int nranks, int k, int root, MPII_Treealgo_tree_t * ct) +int MPII_Treeutil_tree_kary_init(int rank, int nranks, int k, int root, MPIR_Treealgo_tree_t * ct) { int lrank, child; int mpi_errno = MPI_SUCCESS; @@ -76,7 +76,7 @@ int MPII_Treeutil_tree_kary_init(int rank, int nranks, int k, int root, MPII_Tre * 3 */ int MPII_Treeutil_tree_knomial_1_init(int rank, int nranks, int k, int root, - MPII_Treealgo_tree_t * ct) + MPIR_Treealgo_tree_t * ct) { int lrank, i, j, maxtime, tmp, time, parent, current_rank, running_rank, crank; int mpi_errno = MPI_SUCCESS; @@ -171,7 +171,7 @@ int MPII_Treeutil_tree_knomial_1_init(int rank, int nranks, int k, int root, * 7 */ int MPII_Treeutil_tree_knomial_2_init(int rank, int nranks, int k, int root, - MPII_Treealgo_tree_t * ct) + MPIR_Treealgo_tree_t * ct) { int mpi_errno = MPI_SUCCESS; int lrank, i, j, depth; diff --git a/src/mpi/coll/algorithms/treealgo/treeutil.h 
b/src/mpi/coll/algorithms/treealgo/treeutil.h index 3105083a3f4..59a324df79a 100644 --- a/src/mpi/coll/algorithms/treealgo/treeutil.h +++ b/src/mpi/coll/algorithms/treealgo/treeutil.h @@ -13,14 +13,14 @@ #define TREEUTIL_H_INCLUDED /* Generate kary tree information for rank 'rank' */ -int MPII_Treeutil_tree_kary_init(int rank, int nranks, int k, int root, MPII_Treealgo_tree_t * ct); +int MPII_Treeutil_tree_kary_init(int rank, int nranks, int k, int root, MPIR_Treealgo_tree_t * ct); /* Generate knomial_1 tree information for rank 'rank' */ int MPII_Treeutil_tree_knomial_1_init(int rank, int nranks, int k, int root, - MPII_Treealgo_tree_t * ct); + MPIR_Treealgo_tree_t * ct); /* Generate knomial_2 tree information for rank 'rank' */ int MPII_Treeutil_tree_knomial_2_init(int rank, int nranks, int k, int root, - MPII_Treealgo_tree_t * ct); + MPIR_Treealgo_tree_t * ct); #endif /* TREEUTIL_H_INCLUDED */ diff --git a/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h b/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h index 9a156db48a3..6ed093d8e41 100644 --- a/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h +++ b/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h @@ -33,7 +33,7 @@ int MPIR_TSP_Iallreduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int int size; int rank; int num_children = 0; - MPII_Treealgo_tree_t my_tree; + MPIR_Treealgo_tree_t my_tree; void **child_buffer; /* Buffer array in which data from children is received */ void *reduce_buffer; /* Buffer in which allreduced data is present */ int *vtcs = NULL, *recv_id = NULL, *reduce_id = NULL; /* Arrays to store graph vertex ids */ @@ -61,7 +61,7 @@ int MPIR_TSP_Iallreduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int /* calculate chunking information for pipelining */ - MPII_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, count, &num_chunks, + MPIR_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, count, &num_chunks, &chunk_size_floor, &chunk_size_ceil); /* print chunking information */ MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, @@ -76,7 +76,7 @@ int MPIR_TSP_Iallreduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int /* initialize the tree */ my_tree.children = NULL; - mpi_errno = MPII_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); + mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); if (mpi_errno) MPIR_ERR_POP(mpi_errno); num_children = my_tree.num_children; @@ -216,7 +216,7 @@ int MPIR_TSP_Iallreduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int offset += msgsize; } - MPII_Treealgo_tree_free(&my_tree); + MPIR_Treealgo_tree_free(&my_tree); fn_exit: MPL_free(vtcs); diff --git a/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h b/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h index 62c6c6106e7..ea853623fa9 100644 --- a/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h +++ b/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h @@ -32,7 +32,7 @@ int MPIR_TSP_Ibcast_sched_intra_tree(void *buffer, int count, MPI_Datatype datat int rank; int recv_id; int num_children; - MPII_Treealgo_tree_t my_tree; + MPIR_Treealgo_tree_t my_tree; int tag; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIR_TSP_IBCAST_SCHED_INTRA_TREE); @@ -51,7 +51,7 @@ int MPIR_TSP_Ibcast_sched_intra_tree(void *buffer, int count, MPI_Datatype datat extent = MPL_MAX(extent, true_extent); /* calculate chunking information for pipelining */ - MPII_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, count, &num_chunks, + MPIR_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, 
count, &num_chunks, &chunk_size_floor, &chunk_size_ceil); /* print chunking information */ MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, @@ -59,7 +59,7 @@ int MPIR_TSP_Ibcast_sched_intra_tree(void *buffer, int count, MPI_Datatype datat maxbytes, count, num_chunks, chunk_size_floor, chunk_size_ceil)); - mpi_errno = MPII_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); + mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); if (mpi_errno) MPIR_ERR_POP(mpi_errno); num_children = my_tree.num_children; @@ -93,7 +93,7 @@ int MPIR_TSP_Ibcast_sched_intra_tree(void *buffer, int count, MPI_Datatype datat offset += msgsize; } - MPII_Treealgo_tree_free(&my_tree); + MPIR_Treealgo_tree_free(&my_tree); fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIR_TSP_IBCAST_SCHED_INTRA_TREE); diff --git a/src/mpi/coll/igather/igather_tsp_tree_algos.h b/src/mpi/coll/igather/igather_tsp_tree_algos.h index 6e70ac73a57..e4229deb56d 100644 --- a/src/mpi/coll/igather/igather_tsp_tree_algos.h +++ b/src/mpi/coll/igather/igather_tsp_tree_algos.h @@ -32,7 +32,7 @@ int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount, void *tmp_buf = NULL; const void *data_buf = NULL; int tree_type; - MPII_Treealgo_tree_t my_tree, parents_tree; + MPIR_Treealgo_tree_t my_tree, parents_tree; int next_child, num_children, *child_subtree_size = NULL, *child_data_offset = NULL; int offset, recv_size, num_dependencies; @@ -48,7 +48,7 @@ int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount, is_inplace = (sendbuf == MPI_IN_PLACE); /* For gather, MPI_IN_PLACE is significant only at root */ tree_type = MPIR_TREE_TYPE_KNOMIAL_1; /* currently only tree_type=MPIR_TREE_TYPE_KNOMIAL_1 is supported for gather */ - mpi_errno = MPII_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); + mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); if (mpi_errno) MPIR_ERR_POP(mpi_errno); num_children = my_tree.num_children; @@ -83,7 +83,7 @@ int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount, /* get tree information of the parent */ if (my_tree.parent != -1) { - MPII_Treealgo_tree_create(my_tree.parent, size, tree_type, k, root, &parents_tree); + MPIR_Treealgo_tree_create(my_tree.parent, size, tree_type, k, root, &parents_tree); } else { /* initialize an empty children array */ utarray_new(parents_tree.children, &ut_int_icd, MPL_MEM_COLL); parents_tree.num_children = 0; @@ -116,7 +116,7 @@ int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount, recv_size += child_subtree_size[i]; } - MPII_Treealgo_tree_free(&parents_tree); + MPIR_Treealgo_tree_free(&parents_tree); recv_size *= (lrank == 0) ? recvcount : sendcount; offset = (lrank == 0) ? recvcount : sendcount; @@ -184,7 +184,7 @@ int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount, } - MPII_Treealgo_tree_free(&my_tree); + MPIR_Treealgo_tree_free(&my_tree); fn_exit: MPL_free(child_subtree_size); diff --git a/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h b/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h index f5a66d80745..c1188ba9030 100644 --- a/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h +++ b/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h @@ -40,7 +40,7 @@ int MPIR_TSP_Ireduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int co * rank 0 sends the reduced data to the root of the ecollective op. 
*/ int is_tree_root, is_tree_leaf, is_tree_intermediate; /* Variables to store location of this rank in the tree */ int is_root; - MPII_Treealgo_tree_t my_tree; + MPIR_Treealgo_tree_t my_tree; void **child_buffer = NULL; /* Buffer array in which data from children is received */ void *reduce_buffer; /* Buffer in which reduced data is present */ int *vtcs = NULL, *recv_id = NULL, *reduce_id = NULL; /* Arrays to store graph vertex ids */ @@ -66,7 +66,7 @@ int MPIR_TSP_Ireduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int co is_commutative = MPIR_Op_is_commutative(op); /* calculate chunking information for pipelining */ - MPII_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, count, &num_chunks, + MPIR_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, count, &num_chunks, &chunk_size_floor, &chunk_size_ceil); /* print chunking information */ MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, @@ -85,7 +85,7 @@ int MPIR_TSP_Ireduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int co /* initialize the tree */ my_tree.children = NULL; - mpi_errno = MPII_Treealgo_tree_create(rank, size, tree_type, k, tree_root, &my_tree); + mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, tree_root, &my_tree); if (mpi_errno) MPIR_ERR_POP(mpi_errno); num_children = my_tree.num_children; @@ -240,7 +240,7 @@ int MPIR_TSP_Ireduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int co offset += msgsize; } - MPII_Treealgo_tree_free(&my_tree); + MPIR_Treealgo_tree_free(&my_tree); fn_exit: MPL_free(vtcs); diff --git a/src/mpi/coll/iscatter/iscatter_tsp_tree_algos.h b/src/mpi/coll/iscatter/iscatter_tsp_tree_algos.h index e771a5391be..4296065189e 100644 --- a/src/mpi/coll/iscatter/iscatter_tsp_tree_algos.h +++ b/src/mpi/coll/iscatter/iscatter_tsp_tree_algos.h @@ -35,7 +35,7 @@ int MPIR_TSP_Iscatter_sched_intra_tree(const void *sendbuf, int sendcount, void *tmp_buf = NULL; int recv_id; int tree_type; - MPII_Treealgo_tree_t my_tree, parents_tree; + MPIR_Treealgo_tree_t my_tree, parents_tree; int next_child; int num_children, *child_subtree_size = NULL, *child_data_offset = NULL; int offset, recv_size; @@ -50,7 +50,7 @@ int MPIR_TSP_Iscatter_sched_intra_tree(const void *sendbuf, int sendcount, is_inplace = (recvbuf == MPI_IN_PLACE); /* For scatter, MPI_IN_PLACE is significant only at root */ tree_type = MPIR_TREE_TYPE_KNOMIAL_1; /* currently only tree_type=MPIR_TREE_TYPE_KNOMIAL_1 is supported for scatter */ - mpi_errno = MPII_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); + mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); if (mpi_errno) MPIR_ERR_POP(mpi_errno); num_children = my_tree.num_children; @@ -85,7 +85,7 @@ int MPIR_TSP_Iscatter_sched_intra_tree(const void *sendbuf, int sendcount, /* get tree information of the parent */ if (my_tree.parent != -1) { - MPII_Treealgo_tree_create(my_tree.parent, size, tree_type, k, root, &parents_tree); + MPIR_Treealgo_tree_create(my_tree.parent, size, tree_type, k, root, &parents_tree); } else { /* initialize an empty children array */ utarray_new(parents_tree.children, &ut_int_icd, MPL_MEM_COLL); parents_tree.num_children = 0; @@ -118,7 +118,7 @@ int MPIR_TSP_Iscatter_sched_intra_tree(const void *sendbuf, int sendcount, recv_size += child_subtree_size[i]; } - MPII_Treealgo_tree_free(&parents_tree); + MPIR_Treealgo_tree_free(&parents_tree); recv_size *= (lrank == 0) ? sendcount : recvcount; offset = (lrank == 0) ? 
sendcount : recvcount; @@ -177,7 +177,7 @@ int MPIR_TSP_Iscatter_sched_intra_tree(const void *sendbuf, int sendcount, recvcount, recvtype, sched, 0, NULL); } - MPII_Treealgo_tree_free(&my_tree); + MPIR_Treealgo_tree_free(&my_tree); fn_exit: MPL_free(child_subtree_size); diff --git a/src/mpi/errhan/errnames.txt b/src/mpi/errhan/errnames.txt index f5c4eecea2d..d3faab09b13 100644 --- a/src/mpi/errhan/errnames.txt +++ b/src/mpi/errhan/errnames.txt @@ -213,6 +213,9 @@ MPI_TYPECLASS_INTEGER, or MPI_TYPECLASS_COMPLEX **treetype: Invalid tree type used for initializing Tree algorithms **treetype %d: Invalid tree type (%d) used for initializing Tree algorithms +**noizem: release_gather based intra-node collectives cannot be used without izem submodule. \ +Reconfigure mpich with --enable-izem=atomic --with-zm-prefix=yes + # -- FIXME: Some (but not all) of the messages below this line have been used #---- The messages below this line haven't been used yet. # diff --git a/src/mpid/ch4/shm/posix/Makefile.mk b/src/mpid/ch4/shm/posix/Makefile.mk index fbef5357dfc..39375e9c086 100644 --- a/src/mpid/ch4/shm/posix/Makefile.mk +++ b/src/mpid/ch4/shm/posix/Makefile.mk @@ -12,6 +12,10 @@ if BUILD_SHM_POSIX +if ENABLE_IZEM_ATOMIC +include $(top_srcdir)/src/mpid/ch4/shm/posix/release_gather/Makefile.mk +endif + noinst_HEADERS += src/mpid/ch4/shm/posix/posix_am.h \ src/mpid/ch4/shm/posix/posix_coll.h \ src/mpid/ch4/shm/posix/shm_inline.h \ @@ -31,6 +35,9 @@ noinst_HEADERS += src/mpid/ch4/shm/posix/posix_am.h \ src/mpid/ch4/shm/posix/posix_proc.h \ src/mpid/ch4/shm/posix/posix_types.h +if ENABLE_IZEM_ATOMIC +noinst_HEADERS += src/mpid/ch4/shm/posix/posix_coll_release_gather.h +endif mpi_core_sources += src/mpid/ch4/shm/posix/globals.c \ src/mpid/ch4/shm/posix/posix_comm.c \ src/mpid/ch4/shm/posix/posix_init.c \ @@ -38,7 +45,8 @@ mpi_core_sources += src/mpid/ch4/shm/posix/globals.c \ src/mpid/ch4/shm/posix/posix_datatype.c \ src/mpid/ch4/shm/posix/posix_spawn.c \ src/mpid/ch4/shm/posix/posix_win.c \ - src/mpid/ch4/shm/posix/posix_eager_array.c + src/mpid/ch4/shm/posix/posix_eager_array.c \ + src/mpid/ch4/shm/posix/posix_coll_init.c include $(top_srcdir)/src/mpid/ch4/shm/posix/eager/Makefile.mk diff --git a/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h b/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h index 5810d94fa03..05eaa08325a 100644 --- a/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h +++ b/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h @@ -31,16 +31,14 @@ === END_MPI_T_CVAR_INFO_BLOCK === */ -#define MPIDI_POSIX_MAILBOX_INDEX(sender, receiver) ((num_local) * (sender) + (receiver)) +#define MPIDI_POSIX_MAILBOX_INDEX(sender, receiver) ((MPIDI_POSIX_global.num_local) * (sender) + (receiver)) extern MPIDI_POSIX_eager_fbox_control_t MPIDI_POSIX_eager_fbox_control_global; MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) { int mpi_errno = MPI_SUCCESS; - - int i, num_local = 0, local_rank_0 = -1, my_local_rank = -1; - + int i; MPIDI_POSIX_fastbox_t *fastboxes_p = NULL; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_EAGER_INIT); @@ -50,7 +48,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) MPIDI_CH4_SHM_POSIX_FBOX_GENERAL = MPL_dbg_class_alloc("SHM_POSIX_FBOX", "shm_posix_fbox"); #endif /* MPL_USE_DBG_LOGGING */ - MPIR_CHKPMEM_DECL(5); + MPIR_CHKPMEM_DECL(3); MPIDI_POSIX_eager_fbox_control_global.num_seg = 1; MPIDI_POSIX_eager_fbox_control_global.first_poll_local_ranks = @@ -68,19 +66,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) * 
messages if all other entries in the cache are empty. */ MPIDI_POSIX_eager_fbox_control_global.first_poll_local_ranks[i] = 0; - /* Populate these values with transformation information about each rank and its original - * information in MPI_COMM_WORLD. */ - - mpi_errno = MPIR_Find_local(MPIR_Process.comm_world, &num_local, &my_local_rank, - /* comm_world rank of each local process */ - &MPIDI_POSIX_eager_fbox_control_global.local_procs, - /* local rank of each process in comm_world if it is on the same node */ - &MPIDI_POSIX_eager_fbox_control_global.local_ranks); - - local_rank_0 = MPIDI_POSIX_eager_fbox_control_global.local_procs[0]; - MPIDI_POSIX_eager_fbox_control_global.num_local = num_local; - MPIDI_POSIX_eager_fbox_control_global.my_local_rank = my_local_rank; - MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_eager_fbox_control_global.seg, MPIDU_shm_seg_info_t *, MPIDI_POSIX_eager_fbox_control_global.num_seg * @@ -89,45 +74,41 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) /* Create region with one fastbox for every pair of local processes. */ mpi_errno = - MPIDU_shm_seg_alloc(num_local * num_local * sizeof(MPIDI_POSIX_fastbox_t), - (void **) &fastboxes_p, MPL_MEM_SHM); - if (mpi_errno) - MPIR_ERR_POP(mpi_errno); - - /* Request shared collective barrier vars region */ - mpi_errno = MPIDU_shm_seg_alloc(MAX(sizeof(MPIDU_shm_barrier_t), MPIDU_SHM_CACHE_LINE_LEN), - (void **) &MPIDI_POSIX_eager_fbox_control_global.barrier_region, - MPL_MEM_SHM); + MPIDU_shm_seg_alloc(MPIDI_POSIX_global.num_local * MPIDI_POSIX_global.num_local * + sizeof(MPIDI_POSIX_fastbox_t), (void **) &fastboxes_p, MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Actually allocate the segment and assign regions to the pointers */ mpi_errno = MPIDU_shm_seg_commit(&MPIDI_POSIX_eager_fbox_control_global.memory, - &MPIDI_POSIX_eager_fbox_control_global.barrier, - num_local, my_local_rank, local_rank_0, rank, MPL_MEM_SHM); + &MPIDI_POSIX_global.barrier, + MPIDI_POSIX_global.num_local, MPIDI_POSIX_global.my_local_rank, + MPIDI_POSIX_global.local_rank_0, rank, MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Allocate table of pointers to fastboxes */ MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_eager_fbox_control_global.mailboxes.in, - MPIDI_POSIX_fastbox_t **, num_local * sizeof(MPIDI_POSIX_fastbox_t *), - mpi_errno, "fastboxes", MPL_MEM_SHM); + MPIDI_POSIX_fastbox_t **, + MPIDI_POSIX_global.num_local * sizeof(MPIDI_POSIX_fastbox_t *), mpi_errno, + "fastboxes", MPL_MEM_SHM); MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_eager_fbox_control_global.mailboxes.out, - MPIDI_POSIX_fastbox_t **, num_local * sizeof(MPIDI_POSIX_fastbox_t *), - mpi_errno, "fastboxes", MPL_MEM_SHM); + MPIDI_POSIX_fastbox_t **, + MPIDI_POSIX_global.num_local * sizeof(MPIDI_POSIX_fastbox_t *), mpi_errno, + "fastboxes", MPL_MEM_SHM); /* Fill in fbox tables */ - for (i = 0; i < num_local; i++) { + for (i = 0; i < MPIDI_POSIX_global.num_local; i++) { MPIDI_POSIX_eager_fbox_control_global.mailboxes.in[i] = - &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(i, my_local_rank)]; + &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(i, MPIDI_POSIX_global.my_local_rank)]; MPIDI_POSIX_eager_fbox_control_global.mailboxes.out[i] = - &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(my_local_rank, i)]; + &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(MPIDI_POSIX_global.my_local_rank, i)]; memset(MPIDI_POSIX_eager_fbox_control_global.mailboxes.in[i], 0, sizeof(MPIDI_POSIX_fastbox_t)); } - mpi_errno = MPIDU_shm_barrier(MPIDI_POSIX_eager_fbox_control_global.barrier, num_local); + mpi_errno = 
MPIDU_shm_barrier(MPIDI_POSIX_global.barrier, MPIDI_POSIX_global.num_local); if (mpi_errno) MPIR_ERR_POP(mpi_errno); @@ -150,8 +131,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_finalize() MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_EAGER_FINALIZE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_EAGER_FINALIZE); - mpi_errno = MPIDU_shm_barrier(MPIDI_POSIX_eager_fbox_control_global.barrier, - MPIDI_POSIX_eager_fbox_control_global.num_local); + mpi_errno = MPIDU_shm_barrier(MPIDI_POSIX_global.barrier, MPIDI_POSIX_global.num_local); if (mpi_errno) MPIR_ERR_POP(mpi_errno); @@ -159,12 +139,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_finalize() MPL_free(MPIDI_POSIX_eager_fbox_control_global.seg); MPL_free(MPIDI_POSIX_eager_fbox_control_global.mailboxes.in); MPL_free(MPIDI_POSIX_eager_fbox_control_global.mailboxes.out); - MPL_free(MPIDI_POSIX_eager_fbox_control_global.local_ranks); - MPL_free(MPIDI_POSIX_eager_fbox_control_global.local_procs); MPL_free(MPIDI_POSIX_eager_fbox_control_global.first_poll_local_ranks); mpi_errno = MPIDU_shm_seg_destroy(&MPIDI_POSIX_eager_fbox_control_global.memory, - MPIDI_POSIX_eager_fbox_control_global.num_local); + MPIDI_POSIX_global.num_local); fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_EAGER_FINALIZE); diff --git a/src/mpid/ch4/shm/posix/eager/fbox/fbox_recv.h b/src/mpid/ch4/shm/posix/eager/fbox/fbox_recv.h index 07d5eb919ca..4b589b4f3ab 100644 --- a/src/mpid/ch4/shm/posix/eager/fbox/fbox_recv.h +++ b/src/mpid/ch4/shm/posix/eager/fbox/fbox_recv.h @@ -58,8 +58,7 @@ MPIDI_POSIX_eager_recv_begin(MPIDI_POSIX_eager_recv_transaction_t * transaction) MPIDI_POSIX_eager_fbox_control_global.first_poll_local_ranks [MPIR_CVAR_CH4_POSIX_EAGER_FBOX_POLL_CACHE_SIZE]; /* Figure out the next fastbox to poll by incrementing the counter. */ - last_cache = - (last_cache + 1) % (int16_t) MPIDI_POSIX_eager_fbox_control_global.num_local; + last_cache = (last_cache + 1) % (int16_t) MPIDI_POSIX_global.num_local; local_rank = last_cache; MPIDI_POSIX_eager_fbox_control_global.first_poll_local_ranks [MPIR_CVAR_CH4_POSIX_EAGER_FBOX_POLL_CACHE_SIZE] = last_cache; @@ -75,7 +74,7 @@ MPIDI_POSIX_eager_recv_begin(MPIDI_POSIX_eager_recv_transaction_t * transaction) /* If the data ready flag is set, there is a message waiting. */ if (fbox_in->data_ready) { /* Initialize public transaction part */ - grank = MPIDI_POSIX_eager_fbox_control_global.local_procs[local_rank]; + grank = MPIDI_POSIX_global.local_procs[local_rank]; if (likely(fbox_in->is_header)) { /* Only received the header for the message */ @@ -154,7 +153,7 @@ MPL_STATIC_INLINE_PREFIX void MPIDI_POSIX_eager_recv_posted_hook(int grank) int local_rank, i; if (grank >= 0) { - local_rank = MPIDI_POSIX_eager_fbox_control_global.local_ranks[grank]; + local_rank = MPIDI_POSIX_global.local_ranks[grank]; /* Put the posted receive in the list of fastboxes to be polled first. If the list is full, * it will get polled after the boxes in the list are polled, which will be slower, but will @@ -183,7 +182,7 @@ MPL_STATIC_INLINE_PREFIX void MPIDI_POSIX_eager_recv_completed_hook(int grank) MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_FBOX_EAGER_RECV_COMPLETED_HOOK); if (grank >= 0) { - local_rank = MPIDI_POSIX_eager_fbox_control_global.local_ranks[grank]; + local_rank = MPIDI_POSIX_global.local_ranks[grank]; /* Remove the posted receive from the list of fastboxes to be polled first now that the * request is done. 
*/ diff --git a/src/mpid/ch4/shm/posix/eager/fbox/fbox_send.h b/src/mpid/ch4/shm/posix/eager/fbox/fbox_send.h index d850d0c70d0..a9514893036 100644 --- a/src/mpid/ch4/shm/posix/eager/fbox/fbox_send.h +++ b/src/mpid/ch4/shm/posix/eager/fbox/fbox_send.h @@ -40,8 +40,7 @@ MPIDI_POSIX_eager_send(int grank, MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_EAGER_SEND); fbox_out = - MPIDI_POSIX_eager_fbox_control_global.mailboxes. - out[MPIDI_POSIX_eager_fbox_control_global.local_ranks[grank]]; + MPIDI_POSIX_eager_fbox_control_global.mailboxes.out[MPIDI_POSIX_global.local_ranks[grank]]; /* Check if the fastbox is already full and if so, return to the caller, which will cause this * message to be queued. */ diff --git a/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h b/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h index 25cd409418c..d04b6878b10 100644 --- a/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h +++ b/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h @@ -43,16 +43,6 @@ typedef struct MPIDI_POSIX_eager_fbox_control { MPIDI_POSIX_fbox_arrays_t mailboxes; /* The array of buffers that make up the total collection * of mailboxes */ - MPIDU_shm_barrier_t *barrier; - void *barrier_region; - - /* Keep track of all of the local processes in MPI_COMM_WORLD and what their original rank was - * in that communicator. */ - int num_local; - int my_local_rank; - int *local_ranks; - int *local_procs; - /* A small cache of local ranks that have posted receives that we use to poll fastboxes more * efficiently. The last entry in this array is a counter to keep track of the most recently * checked fastbox so we can make sure we don't starve the fastboxes where receives haven't been @@ -62,6 +52,7 @@ typedef struct MPIDI_POSIX_eager_fbox_control { * is much bigger than any currently immaginable single-node without something like wildly * oversubscribed ranks as threads). 
*/ int16_t *first_poll_local_ranks; + int next_poll_local_rank; } MPIDI_POSIX_eager_fbox_control_t; diff --git a/src/mpid/ch4/shm/posix/globals.c b/src/mpid/ch4/shm/posix/globals.c index bf419d1041f..74192fa2d86 100644 --- a/src/mpid/ch4/shm/posix/globals.c +++ b/src/mpid/ch4/shm/posix/globals.c @@ -17,3 +17,6 @@ MPIDI_POSIX_global_t MPIDI_POSIX_global = { 0 }; MPIDI_POSIX_eager_funcs_t *MPIDI_POSIX_eager_func = NULL; MPL_dbg_class MPIDI_CH4_SHM_POSIX_GENERAL; +#ifdef ENABLE_IZEM_ATOMIC +zm_atomic_uint_t *MPIDI_POSIX_shm_limit_counter = NULL; +#endif diff --git a/src/mpid/ch4/shm/posix/posix_coll.h b/src/mpid/ch4/shm/posix/posix_coll.h index 0e2340d1766..4cab059693b 100644 --- a/src/mpid/ch4/shm/posix/posix_coll.h +++ b/src/mpid/ch4/shm/posix/posix_coll.h @@ -16,6 +16,9 @@ #include "ch4_coll_select.h" #include "posix_coll_params.h" #include "posix_coll_select.h" +#ifdef ENABLE_IZEM_ATOMIC +#include "posix_coll_release_gather.h" +#endif static inline int MPIDI_POSIX_mpi_barrier(MPIR_Comm * comm, MPIR_Errflag_t * errflag, const void *ch4_algo_parameters_container_in) @@ -77,6 +80,18 @@ static inline int MPIDI_POSIX_mpi_bcast(void *buffer, int count, MPI_Datatype da MPIR_Bcast_intra_scatter_ring_allgather(buffer, count, datatype, root, comm, errflag); break; + case MPIDI_POSIX_Bcast_intra_release_gather_id: +#ifdef ENABLE_IZEM_ATOMIC + mpi_errno = + MPIDI_POSIX_mpi_bcast_release_gather(buffer, count, datatype, root, comm, errflag); +#else + /* else block is needed to keep the compiler happy */ + /* release_gather based algorithms cannot be used without izem submodule */ + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**noizem"); +#endif + break; + case MPIDI_POSIX_Bcast_intra_invalid_id: + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**noizem"); default: mpi_errno = MPIR_Bcast_impl(buffer, count, datatype, root, comm, errflag); break; @@ -577,6 +592,19 @@ static inline int MPIDI_POSIX_mpi_reduce(const void *sendbuf, void *recvbuf, int MPIR_Reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, errflag); break; + case MPIDI_POSIX_Reduce_intra_release_gather_id: +#ifdef ENABLE_IZEM_ATOMIC + mpi_errno = + MPIDI_POSIX_mpi_reduce_release_gather(sendbuf, recvbuf, count, datatype, + op, root, comm, errflag); +#else + /* else block is needed to keep the compiler happy */ + /* release_gather based algorithms cannot be used without izem submodule */ + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**noizem"); +#endif + break; + case MPIDI_POSIX_Reduce_intra_invalid_id: + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**noizem"); default: mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype, op, root, comm, errflag); diff --git a/src/mpid/ch4/shm/posix/posix_coll_containers.h b/src/mpid/ch4/shm/posix/posix_coll_containers.h index 01726db7be3..f335566d5c0 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_containers.h +++ b/src/mpid/ch4/shm/posix/posix_coll_containers.h @@ -9,10 +9,14 @@ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_binomial_ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_scatter_recursive_doubling_allgather_cnt; extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_scatter_ring_allgather_cnt; +extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_invalid_cnt; +extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_release_gather_cnt; /* Reduce POSIX containers declaration */ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_binomial_cnt; 
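
Aside for review (not part of the patch): the new *_invalid_cnt and *_release_gather_cnt declarations follow the pattern already used in this header, where each algorithm is a const container carrying only an id, the Bcast/Reduce selection functions in posix_coll_select.h return a pointer to one of these containers, and the switch in posix_coll.h dispatches on that id (the invalid id is the one that raises the **noizem error). A minimal standalone sketch of the same pattern, using hypothetical names rather than the MPIDI_POSIX_* symbols:

#include <stdio.h>

typedef enum { ALGO_AUTO_ID, ALGO_RELEASE_GATHER_ID, ALGO_INVALID_ID } algo_id_t;
typedef struct { algo_id_t id; } algo_container_t;

/* One const container per algorithm, mirroring the *_cnt globals declared here. */
static const algo_container_t auto_cnt = {.id = ALGO_AUTO_ID };
static const algo_container_t release_gather_cnt = {.id = ALGO_RELEASE_GATHER_ID };
static const algo_container_t invalid_cnt = {.id = ALGO_INVALID_ID };

/* Selection: force release_gather only when the build supports it; otherwise hand back
 * the "invalid" container so the caller can raise an error instead of crashing. */
static const algo_container_t *select_bcast(int forced, int have_izem)
{
    if (forced)
        return have_izem ? &release_gather_cnt : &invalid_cnt;
    return &auto_cnt;
}

int main(void)
{
    const algo_container_t *c = select_bcast(1 /* forced via CVAR */, 0 /* izem not built */);
    printf("selected algorithm id: %d\n", (int) c->id);        /* prints the invalid id */
    return 0;
}

Compiled on its own this prints the invalid id, which corresponds to the **noizem error path added to posix_coll.h above.
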
extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_cnt; +extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_invalid_cnt; +extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_release_gather_cnt; /* Allreduce POSIX containers declaration */ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Allreduce_intra_recursive_doubling_cnt; diff --git a/src/mpid/ch4/shm/posix/posix_coll_globals_default.c b/src/mpid/ch4/shm/posix/posix_coll_globals_default.c index 5afd5e9a168..ab6f3ad95cd 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_globals_default.c +++ b/src/mpid/ch4/shm/posix/posix_coll_globals_default.c @@ -22,6 +22,14 @@ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_scatter_ring_all .id = MPIDI_POSIX_Bcast_intra_scatter_ring_allgather_id }; +const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_invalid_cnt = { + .id = MPIDI_POSIX_Bcast_intra_invalid_id +}; + +const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_release_gather_cnt = { + .id = MPIDI_POSIX_Bcast_intra_release_gather_id +}; + /* Reduce default POSIX containers initialization*/ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_cnt = { .id = MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_id @@ -31,6 +39,14 @@ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_binomial_cnt = .id = MPIDI_POSIX_Reduce_intra_binomial_id }; +const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_invalid_cnt = { + .id = MPIDI_POSIX_Reduce_intra_invalid_id +}; + +const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_release_gather_cnt = { + .id = MPIDI_POSIX_Reduce_intra_release_gather_id +}; + /* Allreduce default POSIX containers initialization*/ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Allreduce_intra_recursive_doubling_cnt = { .id = MPIDI_POSIX_Allreduce_intra_recursive_doubling_id diff --git a/src/mpid/ch4/shm/posix/posix_coll_init.c b/src/mpid/ch4/shm/posix/posix_coll_init.c new file mode 100644 index 00000000000..2e6d61617a6 --- /dev/null +++ b/src/mpid/ch4/shm/posix/posix_coll_init.c @@ -0,0 +1,73 @@ + +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ +/* + * (C) 2018 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * Portions of this code were written by Intel Corporation. + * Copyright (C) 2011-2017 Intel Corporation. Intel provides this material + * to Argonne National Laboratory subject to Software Grant and Corporate + * Contributor License Agreement dated February 8, 2012. 
+ */ + +#include "mpidimpl.h" + +/* +=== BEGIN_MPI_T_CVAR_INFO_BLOCK === + +cvars: + - name : MPIR_CVAR_BCAST_POSIX_INTRA_ALGORITHM + category : COLLECTIVE + type : string + default : auto + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Variable to select algorithm for intra-node bcast + auto - Internal algorithm selection from pt2pt based algorithms + release_gather - Force shm optimized algo using release, gather primitives + (izem submodule should be built and enabled for this) + + - name : MPIR_CVAR_REDUCE_POSIX_INTRA_ALGORITHM + category : COLLECTIVE + type : string + default : auto + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Variable to select algorithm for intra-node reduce + auto - Internal algorithm selection from pt2pt based algorithms + release_gather - Force shm optimized algo using release, gather primitives + (izem submodule should be built and enabled for this) + +=== END_MPI_T_CVAR_INFO_BLOCK === +*/ + +MPIDI_POSIX_Bcast_id_t MPIDI_POSIX_Bcast_algo_choice = MPIDI_POSIX_Bcast_intra_auto_id; +MPIDI_POSIX_Reduce_id_t MPIDI_POSIX_Reduce_algo_choice = MPIDI_POSIX_Reduce_intra_auto_id; + +int collective_cvars_init(void); + +int collective_cvars_init(void) +{ + int mpi_errno = MPI_SUCCESS; + + /* Bcast */ + if (0 == strcmp(MPIR_CVAR_BCAST_POSIX_INTRA_ALGORITHM, "release_gather")) + MPIDI_POSIX_Bcast_algo_choice = MPIDI_POSIX_Bcast_intra_release_gather_id; + else + MPIDI_POSIX_Bcast_algo_choice = MPIDI_POSIX_Bcast_intra_auto_id; + + /* Reduce */ + if (0 == strcmp(MPIR_CVAR_REDUCE_POSIX_INTRA_ALGORITHM, "release_gather")) + MPIDI_POSIX_Reduce_algo_choice = MPIDI_POSIX_Reduce_intra_release_gather_id; + else + MPIDI_POSIX_Reduce_algo_choice = MPIDI_POSIX_Reduce_intra_auto_id; + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} diff --git a/src/mpid/ch4/shm/posix/posix_coll_params.h b/src/mpid/ch4/shm/posix/posix_coll_params.h index bbd43c3c4b2..bbf08fdd377 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_params.h +++ b/src/mpid/ch4/shm/posix/posix_coll_params.h @@ -15,7 +15,10 @@ typedef union { typedef enum { MPIDI_POSIX_Bcast_intra_binomial_id, MPIDI_POSIX_Bcast_intra_scatter_recursive_doubling_allgather_id, - MPIDI_POSIX_Bcast_intra_scatter_ring_allgather_id + MPIDI_POSIX_Bcast_intra_scatter_ring_allgather_id, + MPIDI_POSIX_Bcast_intra_auto_id, + MPIDI_POSIX_Bcast_intra_invalid_id, + MPIDI_POSIX_Bcast_intra_release_gather_id } MPIDI_POSIX_Bcast_id_t; typedef union { @@ -24,23 +27,42 @@ typedef union { int radix; int block_size; } posix_bcast_knomial_parameters; + struct MPIDI_POSIX_Bcast_release_gather_parameters { + int radix; + int tree_type; + int buffer_size; + int num_buffers; + } posix_bcast_release_gather_parameters; struct MPIDI_POSIX_Bcast_empty_parameters { int empty; } posix_bcast_empty_parameters; } MPIDI_POSIX_Bcast_params_t; +extern MPIDI_POSIX_Bcast_id_t MPIDI_POSIX_Bcast_algo_choice; + typedef enum { MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_id, - MPIDI_POSIX_Reduce_intra_binomial_id + MPIDI_POSIX_Reduce_intra_binomial_id, + MPIDI_POSIX_Reduce_intra_auto_id, + MPIDI_POSIX_Reduce_intra_invalid_id, + MPIDI_POSIX_Reduce_intra_release_gather_id } MPIDI_POSIX_Reduce_id_t; typedef union { /* reserved for parameters related to SHM specific collectives */ + struct MPIDI_POSIX_Reduce_release_gather_parameters { + int radix; + int tree_type; + int buffer_size; + int num_buffers; + } posix_reduce_release_gather_parameters; struct 
MPIDI_POSIX_Reduce_empty_parameters { int empty; } posix_reduce_empty_parameters; } MPIDI_POSIX_Reduce_params_t; +extern MPIDI_POSIX_Reduce_id_t MPIDI_POSIX_Reduce_algo_choice; + typedef enum { MPIDI_POSIX_Allreduce_intra_recursive_doubling_id, MPIDI_POSIX_Allreduce_intra_reduce_scatter_allgather_id diff --git a/src/mpid/ch4/shm/posix/posix_coll_release_gather.h b/src/mpid/ch4/shm/posix/posix_coll_release_gather.h new file mode 100644 index 00000000000..bb49b786409 --- /dev/null +++ b/src/mpid/ch4/shm/posix/posix_coll_release_gather.h @@ -0,0 +1,273 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ +/* + * + * (C) 2018 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef POSIX_COLL_RELEASE_GATHER_H_INCLUDED +#define POSIX_COLL_RELEASE_GATHER_H_INCLUDED + +#include "mpiimpl.h" +#include "algo_common.h" +#include "release_gather.h" + +/* Intra-node bcast is implemented as a release step followed by gather step in release_gather + * framework. The actual data movement happens in release step. Gather step makes sure that + * the shared bcast buffer can be reused for next bcast call. Release gather framework has + * multitple cells in bcast buffer, so that the copying in next cell can be overlapped with + * copying out of previous cells (pipelining). + */ +MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_bcast_release_gather(void *buffer, + int count, + MPI_Datatype datatype, + int root, MPIR_Comm * comm_ptr, + MPIR_Errflag_t * errflag) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_BCAST_RELEASE_GATHER); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_BCAST_RELEASE_GATHER); + + int i, my_rank, num_chunks, chunk_count_floor, chunk_count_ceil; + int offset = 0, is_contig, ori_count = count; + int mpi_errno = MPI_SUCCESS, mpi_errno_ret = MPI_SUCCESS; + MPI_Aint position; + MPI_Aint lb, true_extent, extent, type_size; + void *ori_buffer = buffer; + MPI_Datatype ori_datatype = datatype; + + /* If there is only one process or no data, return */ + if (count == 0 || (MPIR_Comm_size(comm_ptr) == 1)) { + goto fn_exit; + } + + /* Lazy initialization of release_gather specific struct */ + mpi_errno = + MPIDI_POSIX_mpi_release_gather_comm_init(comm_ptr, MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST); + if (mpi_errno) { + /* Fall back to other algo as release_gather based bcast cannot be used */ + mpi_errno = MPIR_Bcast_impl(buffer, count, datatype, root, comm_ptr, errflag); + goto fn_exit; + } + + my_rank = MPIR_Comm_rank(comm_ptr); + MPIR_Type_get_extent_impl(datatype, &lb, &extent); + MPIR_Type_get_true_extent_impl(datatype, &lb, &true_extent); + extent = MPL_MAX(extent, true_extent); + + MPIR_Datatype_is_contig(datatype, &is_contig); + + if (is_contig) { + MPIR_Datatype_get_size_macro(datatype, type_size); + } else { + MPIR_Pack_size_impl(1, datatype, &type_size); + } + + if (!is_contig || type_size >= MPIDI_POSIX_RELEASE_GATHER_BCAST_CELLSIZE) { + /* Convert to MPI_BYTE datatype */ + count = type_size * count; + datatype = MPI_BYTE; + type_size = 1; + extent = 1; + + if (!is_contig) { + buffer = MPL_malloc(count, MPL_MEM_COLL); + if (my_rank == root) { + /* Root packs the data before sending, for non contiguous datatypes */ + position = 0; + mpi_errno = + MPIR_Pack_impl(ori_buffer, ori_count, ori_datatype, buffer, count, &position); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } + } 
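    /* [Reviewer note, illustration only; not part of the original patch. At this point
     * 'buffer' holds the data to broadcast: the caller's buffer, possibly reinterpreted as
     * MPI_BYTE, or a packed copy for non-contiguous datatypes. The helper call below splits
     * 'count' into num_chunks pieces sized to fit one bcast cell, and the loop that follows
     * moves one chunk per iteration (chunk_count_floor elements on the first iteration,
     * chunk_count_ceil on the rest). Each iteration is one release step, in which the root
     * copies the chunk into a shared cell and the other ranks copy it out, followed by one
     * gather step, which confirms the cell has been drained so it can be reused by a later
     * chunk of the same bcast.] */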
+ + /* Calculate chunking information for pipelining */ + MPIR_Algo_calculate_pipeline_chunk_info(MPIDI_POSIX_RELEASE_GATHER_BCAST_CELLSIZE, type_size, + count, &num_chunks, &chunk_count_floor, + &chunk_count_ceil); + /* Print chunking information */ + MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, + "Bcast shmgr pipeline info: segsize=%d count=%d num_chunks=%d chunk_count_floor=%d chunk_count_ceil=%d \n", + MPIDI_POSIX_RELEASE_GATHER_BCAST_CELLSIZE, count, + num_chunks, chunk_count_floor, chunk_count_ceil)); + + /* Do pipelined release-gather */ + for (i = 0; i < num_chunks; i++) { + int chunk_count = (i == 0) ? chunk_count_floor : chunk_count_ceil; + + mpi_errno = + MPIDI_POSIX_mpi_release_gather_release((char *) buffer + offset * extent, + chunk_count, datatype, root, comm_ptr, + errflag, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + mpi_errno = + MPIDI_POSIX_mpi_release_gather_gather(NULL, NULL, 0, MPI_DATATYPE_NULL, + MPI_OP_NULL, root, comm_ptr, errflag, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + offset += chunk_count; + } + + if (!is_contig) { + if (my_rank != root) { + /* Non-root unpack the data if expecting non-contiguous datatypes */ + position = 0; + mpi_errno = + MPIR_Unpack_impl(buffer, count, &position, ori_buffer, ori_count, ori_datatype); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + MPL_free(buffer); + } + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_MPI_BCAST_RELEASE_GATHER); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +/* Intra-node reduce is implemented as a release step followed by gather step in release_gather + * framework. The actual data movement happens in gather step. Release step makes sure that + * the shared reduce buffer can be reused for next reduce call. Release gather framework has + * multitple cells in reduce buffer, so that the copying in next cell can be overlapped with + * reduction and copying out of previous cells (pipelining). 
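 * [Reviewer illustration, not part of the original patch: as a rough worked example, if the
 *  cell size is the per-rank reduce buffer size divided by the number of cells (an
 *  assumption about MPIDI_POSIX_RELEASE_GATHER_REDUCE_CELLSIZE; with the CVAR defaults
 *  introduced later in this patch that would be 32768 / 4 = 8192 bytes), a reduce of
 *  102400 contiguous bytes is pipelined as ceil(102400 / 8192) = 13 chunks, of which at
 *  most 4 can occupy the shared buffer at any one time.]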
+ */ +MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_reduce_release_gather(const void *sendbuf, + void *recvbuf, int count, + MPI_Datatype datatype, + MPI_Op op, int root, + MPIR_Comm * comm_ptr, + MPIR_Errflag_t * errflag) +{ + int i, num_chunks, chunk_size_floor, chunk_size_ceil; + int offset = 0, is_contig; + int mpi_errno = MPI_SUCCESS, mpi_errno_ret = MPI_SUCCESS; + MPI_Aint lb, true_extent, extent, type_size; + + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_REDUCE_RELEASE_GATHER); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_REDUCE_RELEASE_GATHER); + + /* If there is no data, return */ + if (count == 0) { + goto fn_exit; + } + + if ((MPIR_Comm_size(comm_ptr) == 1) && (sendbuf != MPI_IN_PLACE)) { + /* Simply copy the data from sendbuf to recvbuf if there is only 1 rank and MPI_IN_PLACE + * is not used */ + mpi_errno = MPIR_Localcopy(sendbuf, count, datatype, recvbuf, count, datatype); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + goto fn_exit; + } + + /* Lazy initialization of release_gather specific struct */ + mpi_errno = + MPIDI_POSIX_mpi_release_gather_comm_init(comm_ptr, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE); + if (mpi_errno) { + /* Fall back to other algo as release_gather algo cannot be used */ + mpi_errno = + MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag); + goto fn_exit; + } + + MPIR_Type_get_extent_impl(datatype, &lb, &extent); + MPIR_Type_get_true_extent_impl(datatype, &lb, &true_extent); + extent = MPL_MAX(extent, true_extent); + + MPIR_Datatype_is_contig(datatype, &is_contig); + + if (is_contig) { + MPIR_Datatype_get_size_macro(datatype, type_size); + } else { + MPIR_Pack_size_impl(1, datatype, &type_size); + } + + if (sendbuf == MPI_IN_PLACE) { + sendbuf = recvbuf; + } + + /* Calculate chunking information, taking max(extent, type_size) handles contiguous and non-contiguous datatypes both */ + MPIR_Algo_calculate_pipeline_chunk_info(MPIDI_POSIX_RELEASE_GATHER_REDUCE_CELLSIZE, + MPL_MAX(extent, type_size), count, &num_chunks, + &chunk_size_floor, &chunk_size_ceil); + + /* Print chunking information */ + MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, + "Reduce shmgr pipeline info: segsize=%d count=%d num_chunks=%d chunk_size_floor=%d chunk_size_ceil=%d \n", + MPIDI_POSIX_RELEASE_GATHER_REDUCE_CELLSIZE, count, + num_chunks, chunk_size_floor, chunk_size_ceil)); + + /* Do pipelined release-gather */ + for (i = 0; i < num_chunks; i++) { + int chunk_count = (i == 0) ? chunk_size_floor : chunk_size_ceil; + + mpi_errno = + MPIDI_POSIX_mpi_release_gather_release(NULL, 0, MPI_DATATYPE_NULL, root, + comm_ptr, errflag, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + mpi_errno = + MPIDI_POSIX_mpi_release_gather_gather((char *) sendbuf + offset * extent, + (char *) recvbuf + offset * extent, + chunk_count, datatype, op, root, comm_ptr, + errflag, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? 
MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + offset += chunk_count; + } + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_REDUCE_RELEASE_GATHER); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +#endif /* POSIX_COLL_RELEASE_GATHER_H_INCLUDED */ diff --git a/src/mpid/ch4/shm/posix/posix_coll_select.h b/src/mpid/ch4/shm/posix/posix_coll_select.h index e94c1a5499c..d6aa56a4a2b 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_select.h +++ b/src/mpid/ch4/shm/posix/posix_coll_select.h @@ -26,8 +26,27 @@ MPIDI_POSIX_coll_algo_container_t *MPIDI_POSIX_Bcast_select(void *buffer, int nbytes = 0; MPI_Aint type_size = 0; - MPIR_Datatype_get_size_macro(datatype, type_size); + if (MPIDI_POSIX_Bcast_algo_choice == MPIDI_POSIX_Bcast_intra_release_gather_id) { + /* release_gather based algorithm can be used only if izem submodule is built (and enabled) + * and MPICH is not multi-threaded */ +#ifdef ENABLE_IZEM_ATOMIC +#ifdef MPICH_IS_THREADED + if (!MPIR_ThreadInfo.isThreaded) { + /* MPICH configured with threading support but not actually used */ + return &MPIDI_POSIX_Bcast_intra_release_gather_cnt; + } +#else + /* MPICH not configured with threading support */ + return &MPIDI_POSIX_Bcast_intra_release_gather_cnt; +#endif /* MPICH_IS_THREADED */ +#else + /* release_gather algo is chosen through CVAR but izem is not built */ + return &MPIDI_POSIX_Bcast_intra_invalid_cnt; +#endif /* ENABLE_IZEM_ATOMIC */ + } + /* Choose from pt2pt based algorithms */ + MPIR_Datatype_get_size_macro(datatype, type_size); nbytes = type_size * count; if ((nbytes < MPIR_CVAR_BCAST_SHORT_MSG_SIZE) || (comm->local_size < MPIR_CVAR_BCAST_MIN_PROCS)) { @@ -82,6 +101,27 @@ MPIDI_POSIX_coll_algo_container_t *MPIDI_POSIX_Reduce_select(const void *sendbuf MPI_Aint type_size = 0; int pof2 = 0; + if (MPIDI_POSIX_Reduce_algo_choice == MPIDI_POSIX_Reduce_intra_release_gather_id && + MPIR_Op_is_commutative(op)) { + /* release_gather based algorithm can be used only if izem submodule is built (and enabled) + * and MPICH is not multi-threaded. 
Also when the op is commutative */ +#ifdef ENABLE_IZEM_ATOMIC +#ifdef MPICH_IS_THREADED + if (!MPIR_ThreadInfo.isThreaded) { + /* MPICH configured with threading support but not actually used */ + return &MPIDI_POSIX_Reduce_intra_release_gather_cnt; + } +#else + /* MPICH not configured with threading support */ + return &MPIDI_POSIX_Reduce_intra_release_gather_cnt; +#endif /* MPICH_IS_THREADED */ +#else + /* release_gather algo is chosen through CVAR but izem is not built */ + return &MPIDI_POSIX_Reduce_intra_invalid_cnt; +#endif /* ENABLE_IZEM_ATOMIC */ + } + + /* Choose from pt2pt based algorithms */ MPIR_Datatype_get_size_macro(datatype, type_size); pof2 = comm->pof2; if ((count * type_size > MPIR_CVAR_REDUCE_SHORT_MSG_SIZE) && diff --git a/src/mpid/ch4/shm/posix/posix_comm.c b/src/mpid/ch4/shm/posix/posix_comm.c index 4d0ad27e2cd..aab7219e654 100644 --- a/src/mpid/ch4/shm/posix/posix_comm.c +++ b/src/mpid/ch4/shm/posix/posix_comm.c @@ -10,6 +10,9 @@ #include "mpidimpl.h" #include "posix_noinline.h" +#ifdef ENABLE_IZEM_ATOMIC +#include "release_gather.h" +#endif int MPIDI_POSIX_mpi_comm_create_hook(MPIR_Comm * comm) { @@ -17,6 +20,13 @@ int MPIDI_POSIX_mpi_comm_create_hook(MPIR_Comm * comm) MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_MPI_COMM_CREATE_HOOK); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_MPI_COMM_CREATE_HOOK); +#ifdef ENABLE_IZEM_ATOMIC + /* Release_gather primitives based collective algorithm works for Intra-comms only */ + if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { + mpi_errno = MPIDI_POSIX_mpi_release_gather_comm_init_null(comm); + } +#endif + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_MPI_COMM_CREATE_HOOK); return mpi_errno; } @@ -27,6 +37,13 @@ int MPIDI_POSIX_mpi_comm_free_hook(MPIR_Comm * comm) MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_MPI_COMM_FREE_HOOK); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_MPI_COMM_FREE_HOOK); +#ifdef ENABLE_IZEM_ATOMIC + /* Release_gather primitives based collective algorithm works for Intra-comms only */ + if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { + mpi_errno = MPIDI_POSIX_mpi_release_gather_comm_free(comm); + } +#endif + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_MPI_COMM_FREE_HOOK); return mpi_errno; } diff --git a/src/mpid/ch4/shm/posix/posix_init.c b/src/mpid/ch4/shm/posix/posix_init.c index 8a92dfefb1b..7e595676c43 100644 --- a/src/mpid/ch4/shm/posix/posix_init.c +++ b/src/mpid/ch4/shm/posix/posix_init.c @@ -32,6 +32,9 @@ #include "posix_eager.h" #include "posix_noinline.h" +#ifdef ENABLE_IZEM_ATOMIC +extern zm_atomic_uint_t *MPIDI_POSIX_shm_limit_counter; +#endif static int choose_posix_eager(void); @@ -73,7 +76,7 @@ static int choose_posix_eager(void) int MPIDI_POSIX_mpi_init_hook(int rank, int size, int *n_vcis_provided, int *tag_bits) { int mpi_errno = MPI_SUCCESS; - int i; + int i, num_local = 0, local_rank_0 = -1, my_local_rank = -1; #ifdef MPL_USE_DBG_LOGGING MPIDI_CH4_SHM_POSIX_GENERAL = MPL_dbg_class_alloc("SHM_POSIX", "shm_posix"); @@ -87,6 +90,20 @@ int MPIDI_POSIX_mpi_init_hook(int rank, int size, int *n_vcis_provided, int *tag MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_INIT_HOOK); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_INIT_HOOK); + /* Populate these values with transformation information about each rank and its original + * information in MPI_COMM_WORLD. 
*/ + + mpi_errno = MPIR_Find_local(MPIR_Process.comm_world, &num_local, &my_local_rank, + /* comm_world rank of each local process */ + &MPIDI_POSIX_global.local_procs, + /* local rank of each process in comm_world if it is on the same node */ + &MPIDI_POSIX_global.local_ranks); + + local_rank_0 = MPIDI_POSIX_global.local_procs[0]; + MPIDI_POSIX_global.num_local = num_local; + MPIDI_POSIX_global.my_local_rank = my_local_rank; + + MPIDI_POSIX_global.local_rank_0 = local_rank_0; *n_vcis_provided = 1; /* This is used to track messages that the eager submodule was not ready to send. */ @@ -103,10 +120,16 @@ int MPIDI_POSIX_mpi_init_hook(int rank, int size, int *n_vcis_provided, int *tag choose_posix_eager(); mpi_errno = MPIDI_POSIX_eager_init(rank, size); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); /* There is no restriction on the tag_bits from the posix shmod side */ *tag_bits = MPIR_TAG_BITS_DEFAULT; + mpi_errno = MPIDI_POSIX_coll_init(rank, size); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + MPIR_CHKPMEM_COMMIT(); fn_exit: @@ -126,14 +149,20 @@ int MPIDI_POSIX_mpi_finalize_hook(void) MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_FINALIZE_HOOK); mpi_errno = MPIDI_POSIX_eager_finalize(); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); MPIDIU_destroy_buf_pool(MPIDI_POSIX_global.am_buf_pool); MPL_free(MPIDI_POSIX_global.active_rreq); + mpi_errno = MPIDI_POSIX_coll_finalize(); if (mpi_errno) MPIR_ERR_POP(mpi_errno); + MPL_free(MPIDI_POSIX_global.local_ranks); + MPL_free(MPIDI_POSIX_global.local_procs); + fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_FINALIZE_HOOK); return mpi_errno; @@ -141,6 +170,67 @@ int MPIDI_POSIX_mpi_finalize_hook(void) goto fn_exit; } +int MPIDI_POSIX_coll_init(int rank, int size) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_COLL_INIT); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_COLL_INIT); + +#ifdef ENABLE_IZEM_ATOMIC + /* Allocate a shared counter to track the amount of shared memory created per node for + * intra-node collectives */ + mpi_errno = + MPIDU_shm_seg_alloc(sizeof(int), (void **) &MPIDI_POSIX_shm_limit_counter, MPL_MEM_SHM); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + /* Actually allocate the segment and assign regions to the pointers */ + mpi_errno = + MPIDU_shm_seg_commit(&MPIDI_POSIX_global.memory, &MPIDI_POSIX_global.barrier, + MPIDI_POSIX_global.num_local, MPIDI_POSIX_global.my_local_rank, + MPIDI_POSIX_global.local_rank_0, rank, MPL_MEM_SHM); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + mpi_errno = MPIDU_shm_barrier(MPIDI_POSIX_global.barrier, MPIDI_POSIX_global.num_local); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + /* Set the counter to 0 */ + zm_atomic_store(MPIDI_POSIX_shm_limit_counter, 0, zm_memord_relaxed); +#endif + + mpi_errno = collective_cvars_init(); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_COLL_INIT); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIDI_POSIX_coll_finalize(void) +{ + int mpi_errno = MPI_SUCCESS; + + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_COLL_FINALIZE); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_COLL_FINALIZE); + +#ifdef ENABLE_IZEM_ATOMIC + /* Destroy the shared counter which was used to track the amount of shared memory created + * per node for intra-node collectives */ + mpi_errno = MPIDU_shm_seg_destroy(&MPIDI_POSIX_global.memory, MPIDI_POSIX_global.num_local); +#endif + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_COLL_FINALIZE); + return 
mpi_errno; + fn_fail: + goto fn_exit; +} + int MPIDI_POSIX_get_vci_attr(int vci) { MPIR_Assert(0 <= vci && vci < 1); diff --git a/src/mpid/ch4/shm/posix/posix_noinline.h b/src/mpid/ch4/shm/posix/posix_noinline.h index d0944c4b4f1..acfd322588c 100644 --- a/src/mpid/ch4/shm/posix/posix_noinline.h +++ b/src/mpid/ch4/shm/posix/posix_noinline.h @@ -13,6 +13,10 @@ int MPIDI_POSIX_mpi_init_hook(int rank, int size, int *n_vcis_provided, int *tag int MPIDI_POSIX_mpi_finalize_hook(void); int MPIDI_POSIX_get_vci_attr(int vci); +int collective_cvars_init(void); +int MPIDI_POSIX_coll_init(int rank, int size); +int MPIDI_POSIX_coll_finalize(void); + int MPIDI_POSIX_mpi_comm_create_hook(MPIR_Comm * comm); int MPIDI_POSIX_mpi_comm_free_hook(MPIR_Comm * comm); int MPIDI_POSIX_mpi_type_commit_hook(MPIR_Datatype * type); diff --git a/src/mpid/ch4/shm/posix/posix_pre.h b/src/mpid/ch4/shm/posix/posix_pre.h index f48f4c41c4b..91f8710bb1d 100644 --- a/src/mpid/ch4/shm/posix/posix_pre.h +++ b/src/mpid/ch4/shm/posix/posix_pre.h @@ -13,6 +13,9 @@ #define POSIX_PRE_H_INCLUDED #include +#ifdef ENABLE_IZEM_ATOMIC +#include "release_gather_types.h" +#endif #define MPIDI_POSIX_MAX_AM_HDR_SIZE (32) @@ -35,7 +38,11 @@ struct MPIR_Segment; /* These structs are populated with dummy variables because empty structs are not supported in all * compilers: https://stackoverflow.com/a/755339/491687 */ typedef struct { +#ifdef ENABLE_IZEM_ATOMIC + MPIDI_POSIX_release_gather_comm_t *release_gather; +#else int dummy; +#endif } MPIDI_POSIX_comm_t; typedef struct { diff --git a/src/mpid/ch4/shm/posix/posix_types.h b/src/mpid/ch4/shm/posix/posix_types.h index 1d9f96056f1..ff8bab488b1 100644 --- a/src/mpid/ch4/shm/posix/posix_types.h +++ b/src/mpid/ch4/shm/posix/posix_types.h @@ -28,6 +28,7 @@ enum { #define MPIDI_POSIX_AMREQUEST_HDR(req, field) ((req)->dev.ch4.am.shm_am.posix.req_hdr->field) #define MPIDI_POSIX_AMREQUEST_HDR_PTR(req) ((req)->dev.ch4.am.shm_am.posix.req_hdr) #define MPIDI_POSIX_REQUEST(req, field) ((req)->dev.ch4.shm.posix.field) +#define MPIDI_POSIX_COMM(comm) (&(comm)->dev.ch4.shm.posix) typedef struct { MPIDIU_buf_pool_t *am_buf_pool; @@ -37,6 +38,17 @@ typedef struct { /* Active recv requests array */ MPIR_Request **active_rreq; + + MPIDU_shm_seg_t memory; + MPIDU_shm_barrier_t *barrier; + + /* Keep track of all of the local processes in MPI_COMM_WORLD and what their original rank was + * in that communicator. */ + int num_local; + int my_local_rank; + int *local_ranks; + int *local_procs; + int local_rank_0; } MPIDI_POSIX_global_t; extern MPIDI_POSIX_global_t MPIDI_POSIX_global; diff --git a/src/mpid/ch4/shm/posix/release_gather/Makefile.mk b/src/mpid/ch4/shm/posix/release_gather/Makefile.mk new file mode 100644 index 00000000000..45808c6bb26 --- /dev/null +++ b/src/mpid/ch4/shm/posix/release_gather/Makefile.mk @@ -0,0 +1,14 @@ +## (C) 2018 by Argonne National Laboratory. +## See COPYRIGHT in top-level directory. +## +## Portions of this code were written by Intel Corporation. +## Copyright (C) 2011-2017 Intel Corporation. Intel provides this material +## to Argonne National Laboratory subject to Software Grant and Corporate +## Contributor License Agreement dated February 8, 2012. 
+ +AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/ch4/shm/posix/release_gather + +noinst_HEADERS += src/mpid/ch4/shm/posix/release_gather/release_gather_types.h \ + src/mpid/ch4/shm/posix/release_gather/release_gather.h + +mpi_core_sources += src/mpid/ch4/shm/posix/release_gather/release_gather.c diff --git a/src/mpid/ch4/shm/posix/release_gather/release_gather.c b/src/mpid/ch4/shm/posix/release_gather/release_gather.c new file mode 100644 index 00000000000..d8ea640a159 --- /dev/null +++ b/src/mpid/ch4/shm/posix/release_gather/release_gather.c @@ -0,0 +1,495 @@ +/* + * (C) 2018 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * Portions of this code were written by Intel Corporation. + * Copyright (C) 2011-2017 Intel Corporation. Intel provides this material + * to Argonne National Laboratory subject to Software Grant and Corporate + * Contributor License Agreement dated February 8, 2012. + */ + +/* +=== BEGIN_MPI_T_CVAR_INFO_BLOCK === + +cvars: + - name : MPIR_CVAR_COLL_SHM_LIMIT_PER_NODE + category : COLLECTIVE + type : int + default : 65536 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Maximum shared memory created per node for optimized intra-node collectives (in KB) + + - name : MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE + category : COLLECTIVE + type : int + default : 32768 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Total size of the bcast buffer (in bytes) + + - name : MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS + category : COLLECTIVE + type : int + default : 4 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Number of cells the bcast buffer is divided into + + - name : MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE + category : COLLECTIVE + type : int + default : 32768 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Total size of the reduce buffer per rank (in bytes) + + - name : MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS + category : COLLECTIVE + type : int + default : 4 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Number of cells the reduce buffer is divided into, for each rank + + - name : MPIR_CVAR_BCAST_INTRANODE_TREE_KVAL + category : COLLECTIVE + type : int + default : 64 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + K value for the kary/knomial tree for intra-node bcast + + - name : MPIR_CVAR_BCAST_INTRANODE_TREE_TYPE + category : COLLECTIVE + type : string + default : kary + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Tree type for intra-node bcast tree + kary - kary tree type + knomial_1 - knomial_1 tree type (ranks are added in order from the left side) + knomial_2 - knomial_2 tree type (ranks are added in order from the right side) + + - name : MPIR_CVAR_REDUCE_INTRANODE_TREE_KVAL + category : COLLECTIVE + type : int + default : 4 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + K value for the kary/knomial tree for intra-node reduce + + - name : MPIR_CVAR_REDUCE_INTRANODE_TREE_TYPE + category : COLLECTIVE + type : string + default : kary + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Tree type for intra-node reduce 
tree + kary - kary tree type + knomial_1 - knomial_1 tree type (ranks are added in order from the left side) + knomial_2 - knomial_2 tree type (ranks are added in order from the right side) + +=== END_MPI_T_CVAR_INFO_BLOCK === +*/ + +#include "mpiimpl.h" +#include "release_gather.h" + +#define COMM_FIELD(comm, field) \ + MPIDI_POSIX_COMM(comm)->release_gather->field + + +MPIDI_POSIX_release_gather_tree_type_t MPIDI_POSIX_Bcast_tree_type, MPIDI_POSIX_Reduce_tree_type; + +/* Initialize the release_gather struct to NULL */ +int MPIDI_POSIX_mpi_release_gather_comm_init_null(MPIR_Comm * comm_ptr) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT_NULL); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT_NULL); + + MPIDI_POSIX_COMM(comm_ptr)->release_gather = NULL; + + if (0 == strcmp(MPIR_CVAR_BCAST_INTRANODE_TREE_TYPE, "kary")) + MPIDI_POSIX_Bcast_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KARY; + else if (0 == strcmp(MPIR_CVAR_BCAST_INTRANODE_TREE_TYPE, "knomial_1")) + MPIDI_POSIX_Bcast_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_1; + else if (0 == strcmp(MPIR_CVAR_BCAST_INTRANODE_TREE_TYPE, "knomial_2")) + MPIDI_POSIX_Bcast_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_2; + else + MPIDI_POSIX_Bcast_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KARY; + + if (0 == strcmp(MPIR_CVAR_REDUCE_INTRANODE_TREE_TYPE, "kary")) + MPIDI_POSIX_Reduce_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KARY; + else if (0 == strcmp(MPIR_CVAR_REDUCE_INTRANODE_TREE_TYPE, "knomial_1")) + MPIDI_POSIX_Reduce_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_1; + else if (0 == strcmp(MPIR_CVAR_REDUCE_INTRANODE_TREE_TYPE, "knomial_2")) + MPIDI_POSIX_Reduce_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_2; + else + MPIDI_POSIX_Reduce_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KARY; + + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT_NULL); + return MPI_SUCCESS; +} + +/* Initialize the data structures and allocate the shared memory (flags, bcast buffer and reduce + * buffer) */ +int MPIDI_POSIX_mpi_release_gather_comm_init(MPIR_Comm * comm_ptr, + const MPIDI_POSIX_release_gather_opcode_t operation) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT); + + int mpi_errno = MPI_SUCCESS; + int mpi_errno_ret = MPI_SUCCESS; + int rank, num_ranks; + bool initialize_flags = false, initialize_bcast_buf = false, initialize_reduce_buf = false; + int flags_num_pages, fallback = 0; + size_t flags_shm_size = 0; + const long pg_sz = sysconf(_SC_PAGESIZE); + MPIR_Errflag_t errflag = MPIR_ERR_NONE; + bool mapfail_flag = false; + + rank = MPIR_Comm_rank(comm_ptr); + num_ranks = MPIR_Comm_size(comm_ptr); + /* Layout of the shm region: 1 gather and release flag each per rank, followed by + * bcast buffer (divided into multiple cells), followed by + * reduce buffer (divided into multiple cells) per rank. 
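+     * For example, with the default CVAR values above and 4 local ranks, and assuming (these
+     * values are not part of this patch) a 64-byte MPIDU_SHM_CACHE_LINE_LEN and a 4096-byte
+     * page: the flag space is 4 * 128 = 512 bytes, rounded up to one 4096-byte page; the
+     * bcast buffer is 32768 bytes split into 4 cells of 8192 bytes; and the reduce buffers
+     * occupy 4 * 32768 = 131072 bytes in total.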
*/ + + if (MPIDI_POSIX_COMM(comm_ptr)->release_gather == NULL) { + /* release_gather based collectives have not been used before on this comm */ + initialize_flags = true; + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST) { + initialize_bcast_buf = true; + } else if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) { + initialize_reduce_buf = true; + } + } else { + /* at least one release_gather based collective was used on this comm */ + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST && + COMM_FIELD(comm_ptr, bcast_buf_addr) == NULL) { + initialize_bcast_buf = true; + } else if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE && + COMM_FIELD(comm_ptr, reduce_buf_addr) == NULL) { + initialize_reduce_buf = true; + } + } + + if (initialize_flags || initialize_bcast_buf || initialize_reduce_buf) { + /* Calculate the amount of shm to be created */ + size_t tmp_shm_counter; + size_t memory_to_be_allocated = 0; + if (initialize_flags) { + /* Calculate the amount of memory that would be allocated for flags */ + flags_shm_size = MPIDI_POSIX_RELEASE_GATHER_FLAG_SPACE_PER_RANK * num_ranks; + /* Reset flags_shm_size so that the data buffers are aligned to the system pages */ + flags_num_pages = flags_shm_size / (int) (pg_sz); + if (flags_shm_size % pg_sz != 0) { + flags_num_pages++; + } + flags_shm_size = flags_num_pages * pg_sz; + + /* Update the amount of memory to be allocated */ + memory_to_be_allocated += flags_shm_size; + } + + if (initialize_bcast_buf) { + memory_to_be_allocated += MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE; + } + if (initialize_reduce_buf) { + memory_to_be_allocated += (num_ranks * MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE); + } + + if (rank == 0) { + /* rank 0 decides if more memory can be created and broadcasts the decision to other ranks */ + tmp_shm_counter = zm_atomic_load(MPIDI_POSIX_shm_limit_counter, zm_memord_acquire); + + /* Check if it is allowed to create more shm on this node */ + if ((tmp_shm_counter + memory_to_be_allocated) > + (MPIR_CVAR_COLL_SHM_LIMIT_PER_NODE * 1024)) { + /* cannot create more shm, fallback to MPIR level algorithms, and broadcast the decision to other ranks */ + fallback = 1; + MPIR_Bcast_impl(&fallback, 1, MPI_INT, 0, comm_ptr, &errflag); + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_NO_MEM, "**nomem"); + } else { + /* More shm can be created, update the shared counter */ + zm_atomic_fetch_add(MPIDI_POSIX_shm_limit_counter, memory_to_be_allocated, + zm_memord_seq_cst); + fallback = 0; + mpi_errno = MPIR_Bcast_impl(&fallback, 1, MPI_INT, 0, comm_ptr, &errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } else { + mpi_errno = MPIR_Bcast_impl(&fallback, 1, MPI_INT, 0, comm_ptr, &errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? 
MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + if (fallback) { + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_NO_MEM, "**nomem"); + } + } + } + + if (initialize_flags) { + /* Initialize the release_gather struct and allocate shm for flags */ + MPIDI_POSIX_release_gather_comm_t *release_gather_info_ptr; + release_gather_info_ptr = + MPL_malloc(sizeof(struct MPIDI_POSIX_release_gather_comm_t), MPL_MEM_COLL); + MPIR_ERR_CHKANDJUMP(!release_gather_info_ptr, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + release_gather_info_ptr->flags_shm_size = flags_shm_size; + + /* Create bcast_tree and reduce_tree with root of the tree as 0 */ + mpi_errno = + MPIR_Treealgo_tree_create(rank, num_ranks, MPIDI_POSIX_Bcast_tree_type, + MPIR_CVAR_BCAST_INTRANODE_TREE_KVAL, 0, + &release_gather_info_ptr->bcast_tree); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + mpi_errno = + MPIR_Treealgo_tree_create(rank, num_ranks, MPIDI_POSIX_Reduce_tree_type, + MPIR_CVAR_REDUCE_INTRANODE_TREE_KVAL, 0, + &release_gather_info_ptr->reduce_tree); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + release_gather_info_ptr->gather_state = release_gather_info_ptr->release_state + = MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS + MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS; + + release_gather_info_ptr->bcast_buf_addr = NULL; + release_gather_info_ptr->reduce_buf_addr = NULL; + release_gather_info_ptr->child_reduce_buf_addr = NULL; + + mpi_errno = MPIDIU_allocate_shm_segment(comm_ptr, flags_shm_size, + &(release_gather_info_ptr->shm_flags_handle), + (void **) + &(release_gather_info_ptr->flags_addr), + &mapfail_flag); + if (mpi_errno || mapfail_flag) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + /* Calculate gather and release flag address and initialize to the gather and release states */ + release_gather_info_ptr->gather_flag_addr = + MPIDI_POSIX_RELEASE_GATHER_GATHER_FLAG_ADDR(rank); + release_gather_info_ptr->release_flag_addr = + MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_ADDR(rank); + *(release_gather_info_ptr->gather_flag_addr) = (release_gather_info_ptr->gather_state); + *(release_gather_info_ptr->release_flag_addr) = (release_gather_info_ptr->release_state); + + /* Make sure all the flags are set before ranks start reading each other's flags from shm */ + mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? 
MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + MPIDI_POSIX_COMM(comm_ptr)->release_gather = release_gather_info_ptr; + } + + if (initialize_bcast_buf) { + /* Allocate the shared memory for bcast buffer */ + mpi_errno = + MPIDIU_allocate_shm_segment(comm_ptr, MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE, + &(COMM_FIELD(comm_ptr, shm_bcast_buf_handle)), + (void **) &(COMM_FIELD(comm_ptr, bcast_buf_addr)), + &mapfail_flag); + if (mpi_errno || mapfail_flag) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + + if (initialize_reduce_buf) { + /* Allocate the shared memory for a reduce buffer per rank */ + int i; + COMM_FIELD(comm_ptr, child_reduce_buf_addr) = + MPL_malloc(num_ranks * sizeof(void *), MPL_MEM_COLL); + + mpi_errno = + MPIDIU_allocate_shm_segment(comm_ptr, + num_ranks * + MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE, + &(COMM_FIELD(comm_ptr, shm_reduce_buf_handle)), + (void **) &(COMM_FIELD(comm_ptr, reduce_buf_addr)), + &mapfail_flag); + if (mpi_errno || mapfail_flag) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + /* Store address of each of the children's reduce buffer */ + for (i = 0; i < COMM_FIELD(comm_ptr, reduce_tree.num_children); i++) { + MPIR_ERR_CHKANDJUMP(!utarray_eltptr(COMM_FIELD(comm_ptr, reduce_tree.children), i), + mpi_errno, MPI_ERR_OTHER, "**nomem"); + COMM_FIELD(comm_ptr, child_reduce_buf_addr[i]) = + (char *) COMM_FIELD(comm_ptr, reduce_buf_addr) + + ((*utarray_eltptr(COMM_FIELD(comm_ptr, reduce_tree.children), i)) + * MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE); + } + } + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +/* Cleanup the release_gather data structures and free the allocated memory */ +int MPIDI_POSIX_mpi_release_gather_comm_free(MPIR_Comm * comm_ptr) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_FREE); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_FREE); + + int mpi_errno = MPI_SUCCESS; + int mpi_errno_ret = MPI_SUCCESS; + MPIR_Errflag_t errflag = MPIR_ERR_NONE; + + /* Clean up is not required for NULL struct */ + if (MPIDI_POSIX_COMM(comm_ptr)->release_gather == NULL) { + goto fn_exit; + } + + /* destroy and detach shared memory used for flags */ + mpi_errno = MPL_shm_seg_detach(COMM_FIELD(comm_ptr, shm_flags_handle), + (void **) &COMM_FIELD(comm_ptr, flags_addr), + COMM_FIELD(comm_ptr, flags_shm_size)); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + mpi_errno = MPL_shm_hnd_finalize(&COMM_FIELD(comm_ptr, shm_flags_handle)); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? 
MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + if (COMM_FIELD(comm_ptr, bcast_buf_addr) != NULL) { + /* destroy and detach shared memory used for bcast buffer */ + mpi_errno = MPL_shm_seg_detach(COMM_FIELD(comm_ptr, shm_bcast_buf_handle), + (void **) &COMM_FIELD(comm_ptr, bcast_buf_addr), + MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + mpi_errno = MPL_shm_hnd_finalize(&COMM_FIELD(comm_ptr, shm_bcast_buf_handle)); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + + if (COMM_FIELD(comm_ptr, reduce_buf_addr) != NULL) { + /* destroy and detach shared memory used for reduce buffers */ + mpi_errno = MPL_shm_seg_detach(COMM_FIELD(comm_ptr, shm_reduce_buf_handle), + (void **) &COMM_FIELD(comm_ptr, reduce_buf_addr), + MPIR_Comm_size(comm_ptr) + * MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + mpi_errno = MPL_shm_hnd_finalize(&COMM_FIELD(comm_ptr, shm_reduce_buf_handle)); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + MPL_free(COMM_FIELD(comm_ptr, child_reduce_buf_addr)); + } + + MPIR_Treealgo_tree_free(&(COMM_FIELD(comm_ptr, bcast_tree))); + MPIR_Treealgo_tree_free(&(COMM_FIELD(comm_ptr, reduce_tree))); + MPL_free(MPIDI_POSIX_COMM(comm_ptr)->release_gather); + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_FREE); + return mpi_errno; +} diff --git a/src/mpid/ch4/shm/posix/release_gather/release_gather.h b/src/mpid/ch4/shm/posix/release_gather/release_gather.h new file mode 100644 index 00000000000..1991526b0b8 --- /dev/null +++ b/src/mpid/ch4/shm/posix/release_gather/release_gather.h @@ -0,0 +1,342 @@ +/* + * (C) 2018 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * Portions of this code were written by Intel Corporation. + * Copyright (C) 2011-2017 Intel Corporation. Intel provides this material + * to Argonne National Laboratory subject to Software Grant and Corporate + * Contributor License Agreement dated February 8, 2012. 
+ */ + +#ifndef RELEASE_GATHER_H_INCLUDED +#define RELEASE_GATHER_H_INCLUDED + +extern zm_atomic_uint_t *MPIDI_POSIX_shm_limit_counter; +extern MPL_shm_hnd_t shm_limit_handle; +extern MPIDI_POSIX_release_gather_tree_type_t MPIDI_POSIX_Bcast_tree_type, + MPIDI_POSIX_Reduce_tree_type; + +/*Blocking wait implementation*/ +/* zm_memord_acquire makes sure no writes/reads are reordered before this load */ +#define MPIDI_POSIX_RELEASE_GATHER_WAIT_WHILE_LESS_THAN(ptr, value) \ + do { \ + int spin_count = 0; \ + while (zm_atomic_load(ptr, zm_memord_acquire) < (value)) { \ + if (++spin_count >= 10000) { \ + /* Call progress only after waiting for a while */ \ + MPID_Progress_test(); \ + spin_count = 0; \ + } \ + } \ + } \ + while (0) +#define MPIDI_POSIX_RELEASE_GATHER_FLAG_SIZE (sizeof(zm_atomic_uint_t)) +/* 1 cache_line each for gather and release flag */ +#define MPIDI_POSIX_RELEASE_GATHER_FLAG_SPACE_PER_RANK (MPIDU_SHM_CACHE_LINE_LEN * 2) +#define MPIDI_POSIX_RELEASE_GATHER_GATHER_FLAG_OFFSET (0) +#define MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_OFFSET (MPIDU_SHM_CACHE_LINE_LEN) +#define MPIDI_POSIX_RELEASE_GATHER_GATHER_FLAG_ADDR(rank) \ + (((zm_atomic_uint_t *)release_gather_info_ptr->flags_addr) + \ + ((rank * MPIDI_POSIX_RELEASE_GATHER_FLAG_SPACE_PER_RANK + MPIDI_POSIX_RELEASE_GATHER_GATHER_FLAG_OFFSET)/(MPIDI_POSIX_RELEASE_GATHER_FLAG_SIZE))) +#define MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_ADDR(rank) \ + (((zm_atomic_uint_t *)release_gather_info_ptr->flags_addr) + \ + ((rank * MPIDI_POSIX_RELEASE_GATHER_FLAG_SPACE_PER_RANK + MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_OFFSET)/(MPIDI_POSIX_RELEASE_GATHER_FLAG_SIZE))) +#define MPIDI_POSIX_RELEASE_GATHER_BCAST_DATA_ADDR(buf) \ + (char *) release_gather_info_ptr->bcast_buf_addr + \ + (buf * MPIDI_POSIX_RELEASE_GATHER_BCAST_CELLSIZE) +#define MPIDI_POSIX_RELEASE_GATHER_REDUCE_DATA_ADDR(rank, buf) \ + (((char *) release_gather_info_ptr->reduce_buf_addr) + \ + (rank * MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE) + \ + (buf * MPIDI_POSIX_RELEASE_GATHER_REDUCE_CELLSIZE)) +#define MPIDI_POSIX_RELEASE_GATHER_BCAST_CELLSIZE \ + (MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE / MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS) +#define MPIDI_POSIX_RELEASE_GATHER_REDUCE_CELLSIZE \ + (MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE / MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS) + +int MPIDI_POSIX_mpi_release_gather_comm_init_null(MPIR_Comm * comm_ptr); +int MPIDI_POSIX_mpi_release_gather_comm_init(MPIR_Comm * comm_ptr, + const MPIDI_POSIX_release_gather_opcode_t operation); +int MPIDI_POSIX_mpi_release_gather_comm_free(MPIR_Comm * comm_ptr); + +/* Release step of the release_gather framework. This is top-down step in the release_tree. + * Parent notifies the children to go, once it arrives. In case of Bcast, root places the data in + * shm bcast buffer before notifying the children. 
Children copy the data out of shm buffer when + * notified by the parent */ +MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_release_gather_release(void *local_buf, + const int count, + MPI_Datatype datatype, + const int root, + MPIR_Comm * comm_ptr, + MPIR_Errflag_t * errflag, + const + MPIDI_POSIX_release_gather_opcode_t + operation) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_RELEASE_GATHER_RELEASE); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_RELEASE_GATHER_RELEASE); + + int mpi_errno = MPI_SUCCESS; + int mpi_errno_ret = MPI_SUCCESS; + MPIDI_POSIX_release_gather_comm_t *release_gather_info_ptr; + int segment, rank; + void *bcast_data_addr = NULL; + volatile zm_atomic_uint_t *parent_flag_addr; + /* Set the relaxation to 0 because in Bcast, gather step is "relaxed" to make sure multiple + * buffers can be used to pipeline the copying in and out of shared memory, and data is not + * overwritten */ + const int relaxation = + (operation == + MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST) ? 0 : MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS - 1; + + rank = MPIR_Comm_rank(comm_ptr); + release_gather_info_ptr = MPIDI_POSIX_COMM(comm_ptr)->release_gather; + + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST) { + segment = release_gather_info_ptr->release_state % MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS; + bcast_data_addr = MPIDI_POSIX_RELEASE_GATHER_BCAST_DATA_ADDR(segment); + + if (root != 0) { + /* Root sends data to rank 0 */ + if (rank == root) { + mpi_errno = + MPIC_Send(local_buf, count, datatype, 0, MPIR_BCAST_TAG, comm_ptr, errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } else if (rank == 0) { + mpi_errno = + MPIC_Recv(bcast_data_addr, count, datatype, root, MPIR_BCAST_TAG, comm_ptr, + MPI_STATUS_IGNORE, errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? 
MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } else if (rank == 0) { + mpi_errno = MPIR_Localcopy(local_buf, count, datatype, + bcast_data_addr, count, datatype); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } + + release_gather_info_ptr->release_state++; + + if (rank == 0) { + /* Rank 0 updates its flag when it arrives and data is ready in shm buffer (if bcast) */ + /* zm_memord_release makes sure that the write of data does not get reordered after this + * store */ + zm_atomic_store((release_gather_info_ptr->release_flag_addr), + release_gather_info_ptr->release_state, zm_memord_release); + } else { + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST) { + parent_flag_addr = + MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_ADDR(release_gather_info_ptr-> + bcast_tree.parent); + } else { + parent_flag_addr = + MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_ADDR(release_gather_info_ptr-> + reduce_tree.parent); + } + + /* Wait until the parent has updated its flag */ + MPIDI_POSIX_RELEASE_GATHER_WAIT_WHILE_LESS_THAN(parent_flag_addr, + release_gather_info_ptr->release_state - + relaxation); + /* Update its own flag */ + /* zm_memord_release makes sure that the read of parent's flag does not get reordered after + * this store */ + zm_atomic_store((release_gather_info_ptr->release_flag_addr), + release_gather_info_ptr->release_state, zm_memord_release); + } + + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST) { + /* Non-root ranks copy data from shm buffer to user buffer */ + if (rank != root) { + MPIR_ERR_CHKANDJUMP(!bcast_data_addr, mpi_errno, MPI_ERR_OTHER, "**nomem"); + mpi_errno = MPIR_Localcopy(bcast_data_addr, count, datatype, + local_buf, count, datatype); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_RELEASE_GATHER_RELEASE); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +/* Gather step of the release_gather framework. This is bottom-up step in the gather_tree. + * Children notify the parent when it arrives. In case of Reduce, each rank places its data in shm + * reduce buffer. A parent reduces all its children data with its own before notifying its parent. 
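+ * For example (an illustrative walk-through): in a reduce over four local ranks whose reduce
+ * tree has rank 0 as the root and ranks 1-3 as its children, ranks 1-3 copy their input into
+ * their per-rank cells of the shm reduce buffer and update their gather flags; rank 0 waits
+ * on each child's flag, folds each child's cell into its own output buffer with
+ * MPIR_Reduce_local(), and then publishes the minimum of the gather counts it observed in its
+ * own gather flag.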
*/ +MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_release_gather_gather(const void *inbuf, void *outbuf, + const int count, + MPI_Datatype datatype, MPI_Op op, + const int root, + MPIR_Comm * comm_ptr, + MPIR_Errflag_t * errflag, + const + MPIDI_POSIX_release_gather_opcode_t + operation) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_RELEASE_GATHER_GATHER); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_RELEASE_GATHER_GATHER); + + MPIDI_POSIX_release_gather_comm_t *release_gather_info_ptr; + int segment, rank, num_children; + volatile void *child_data_addr; + volatile zm_atomic_uint_t child_gather_flag, *child_flag_addr; + volatile void *reduce_data_addr = NULL; + int i, mpi_errno = MPI_SUCCESS, mpi_errno_ret = MPI_SUCCESS; + bool skip_checking = false; + /* Set the relaxation to 0 because in Reduce, release step is "relaxed" to make sure multiple + * buffers can be used to pipeline the copying in and out of shared memory, and data is not + * overwritten */ + const int relaxation = + (operation == + MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) ? 0 : MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS - 1; + zm_atomic_uint_t min_gather; + UT_array *children; + + release_gather_info_ptr = MPIDI_POSIX_COMM(comm_ptr)->release_gather; + segment = release_gather_info_ptr->gather_state % MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS; + children = release_gather_info_ptr->bcast_tree.children; + num_children = release_gather_info_ptr->bcast_tree.num_children; + rank = MPIR_Comm_rank(comm_ptr); + + release_gather_info_ptr->gather_state++; + min_gather = release_gather_info_ptr->gather_state; + + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) { + if (rank == 0) { + /* Rank 0 reduces the data directly in its outbuf. Copy the data from inbuf to outbuf + * if needed */ + if (inbuf != outbuf) { + mpi_errno = MPIR_Localcopy(inbuf, count, datatype, outbuf, count, datatype); + } + reduce_data_addr = outbuf; + } else { + reduce_data_addr = MPIDI_POSIX_RELEASE_GATHER_REDUCE_DATA_ADDR(rank, segment); + /* Copy data from user buffer to shared buffer */ + mpi_errno = + MPIR_Localcopy(inbuf, count, datatype, (void *) reduce_data_addr, count, datatype); + } + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + num_children = release_gather_info_ptr->reduce_tree.num_children; + children = release_gather_info_ptr->reduce_tree.children; + } + + /* Avoid checking for availabilty of next buffer if it is guaranteed to be available */ + /* zm_memord_acquire makes sure no writes/reads are reordered before this load */ + if ((operation != MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) && + (zm_atomic_load(release_gather_info_ptr->gather_flag_addr, zm_memord_acquire)) >= + (release_gather_info_ptr->gather_state - relaxation)) { + skip_checking = true; + } + + /* Leaf nodes never skip checking */ + if (num_children == 0 || !skip_checking) { + for (i = 0; i < num_children; i++) { + MPIR_ERR_CHKANDJUMP(!utarray_eltptr(children, i), mpi_errno, MPI_ERR_OTHER, "**nomem"); + child_flag_addr = + MPIDI_POSIX_RELEASE_GATHER_GATHER_FLAG_ADDR(*utarray_eltptr(children, i)); + /* Wait until the child has arrived */ + MPIDI_POSIX_RELEASE_GATHER_WAIT_WHILE_LESS_THAN(child_flag_addr, + release_gather_info_ptr->gather_state - + relaxation); + + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) { + child_data_addr = + (char *) release_gather_info_ptr->child_reduce_buf_addr[i] + + segment * 
MPIDI_POSIX_RELEASE_GATHER_REDUCE_CELLSIZE; + /* zm_memord_acquire in MPIDI_POSIX_RELEASE_GATHER_WAIT_WHILE_LESS_THAN makes sure + * that the reduce_local call does not get reordered before read of children's flag + * in MPIDI_POSIX_RELEASE_GATHER_WAIT_WHILE_LESS_THAN */ + mpi_errno = + MPIR_Reduce_local((void *) child_data_addr, (void *) reduce_data_addr, + count, datatype, op); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + /* Read child_flag_addr which 'may' be larger than the strongest waiting condition + * so, it is safe */ + child_gather_flag = *child_flag_addr; + min_gather = MPL_MIN(child_gather_flag, min_gather); + } + /* zm_memord_release makes sure that the write of data (reduce_local) does not get + * reordered after this store */ + zm_atomic_store((release_gather_info_ptr->gather_flag_addr), min_gather, zm_memord_release); + } + + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) { + if (root != 0) { + /* send-recv between root and rank 0 */ + if (rank == root) { + mpi_errno = + MPIC_Recv(outbuf, count, datatype, 0, MPIR_REDUCE_TAG, comm_ptr, + MPI_STATUS_IGNORE, errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } else if (rank == 0) { + MPIR_ERR_CHKANDJUMP(!reduce_data_addr, mpi_errno, MPI_ERR_OTHER, "**nomem"); + mpi_errno = + MPIC_Send((void *) reduce_data_addr, count, datatype, root, MPIR_REDUCE_TAG, + comm_ptr, errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } + /* No data copy is required if root was rank 0, because it reduced the data directly in its + * outbuf */ + } + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_RELEASE_GATHER_GATHER); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +#endif /* RELEASE_GATHER_H_INCLUDED */ diff --git a/src/mpid/ch4/shm/posix/release_gather/release_gather_types.h b/src/mpid/ch4/shm/posix/release_gather/release_gather_types.h new file mode 100644 index 00000000000..2e722896107 --- /dev/null +++ b/src/mpid/ch4/shm/posix/release_gather/release_gather_types.h @@ -0,0 +1,40 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ +/* + * (C) 2018 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * Portions of this code were written by Intel Corporation. + * Copyright (C) 2011-2017 Corporation. Intel provides this material + * to Argonne National Laboratory subject to Software Grant and Corporate + * Contributor License Agreement dated February 8, 2012. 
+ */ + +#ifndef RELEASE_GATHER_TYPES_H_INCLUDED +#define RELEASE_GATHER_TYPES_H_INCLUDED + +#include "treealgo_types.h" +#include "common/zm_common.h" + +typedef enum { + MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE, +} MPIDI_POSIX_release_gather_opcode_t; + +typedef enum MPIDI_POSIX_release_gather_tree_type_t { + MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KARY, + MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_1, + MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_2, +} MPIDI_POSIX_release_gather_tree_type_t; + +typedef struct MPIDI_POSIX_release_gather_comm_t { + MPIR_Treealgo_tree_t bcast_tree, reduce_tree; + MPL_shm_hnd_t shm_flags_handle, shm_bcast_buf_handle, shm_reduce_buf_handle; + int flags_shm_size; + uint64_t gather_state, release_state; + + volatile void *flags_addr, *bcast_buf_addr, *reduce_buf_addr; + volatile void **child_reduce_buf_addr; + volatile zm_atomic_uint_t *release_flag_addr, *gather_flag_addr; +} MPIDI_POSIX_release_gather_comm_t; + +#endif /* RELEASE_GATHER_TYPES_H_INCLUDED */ diff --git a/test/mpi/coll/test_coll_algos.sh b/test/mpi/coll/test_coll_algos.sh index 8fdf7de38ca..d86bdb4a5bc 100755 --- a/test/mpi/coll/test_coll_algos.sh +++ b/test/mpi/coll/test_coll_algos.sh @@ -443,4 +443,62 @@ for algo_name in ${algo_names}; do coll_algo_tests+="neighb_alltoallv 4 mpiversion=3.0 ${env}${nl}" done +########## Add tests for intra-node bcast algorithms ############ + +#use release gather based intra-node bcast +testing_env="env=MPIR_CVAR_BCAST_POSIX_INTRA_ALGORITHM=release_gather " + +testing_env+="env=MPIR_CVAR_COLL_SHM_LIMIT_PER_NODE=131072 " #128MB +buffer_sizes="16384 32768" +num_cells="2 4" +tree_types="knomial_1 knomial_2" +kvalues="8 64" + +for buf_size in ${buffer_sizes}; do + for num_cell in ${num_cells}; do + for kval in ${kvalues}; do + for tree_type in ${tree_types}; do + #set the environment + env="${testing_env} env=MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE=${buf_size} " + env+="env=MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS=${num_cell} " + env+="env=MPIR_CVAR_BCAST_INTRANODE_TREE_KVAL=${kval} " + env+="env=MPIR_CVAR_BCAST_INTRANODE_TREE_TYPE=${tree_type} " + + coll_algo_tests+="bcasttest 10 ${env}${nl}" + coll_algo_tests+="bcastzerotype 5 ${env}${nl}" + done + done + done +done + +########## Add tests for intra-node reduce algorithms ############ + +#use release gather based intra-node reduce +testing_env="env=MPIR_CVAR_REDUCE_POSIX_INTRA_ALGORITHM=release_gather " + +testing_env+="env=MPIR_CVAR_COLL_SHM_LIMIT_PER_NODE=131072 " #128MB +buffer_sizes="16384 32768" +num_cells="2 4" +tree_types="knomial_1 knomial_2" +kvalues="4 8" + +for buf_size in ${buffer_sizes}; do + for num_cell in ${num_cells}; do + for kval in ${kvalues}; do + for tree_type in ${tree_types}; do + #set the environment + env="${testing_env} env=MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE=${buf_size} " + env+="env=MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS=${num_cell} " + env+="env=MPIR_CVAR_REDUCE_INTRANODE_TREE_KVAL=${kval} " + env+="env=MPIR_CVAR_REDUCE_INTRANODE_TREE_TYPE=${tree_type} " + + coll_algo_tests+="reduce 5 ${env}${nl}" + coll_algo_tests+="reduce 10 ${env}${nl}" + coll_algo_tests+="red3 10 ${env}${nl}" + coll_algo_tests+="red4 10 ${env}${nl}" + done + done + done +done + export coll_algo_tests diff --git a/test/mpi/maint/jenkins/xfail.conf b/test/mpi/maint/jenkins/xfail.conf index ff01044d9f3..6b3d78298eb 100644 --- a/test/mpi/maint/jenkins/xfail.conf +++ b/test/mpi/maint/jenkins/xfail.conf @@ -206,3 +206,6 @@ valgrind * * * * sed -i "s+\(^accfence1 .*\)+\1 
xfail=ticket0+g" test/mpi/rma/te
 * * debug ch4:ucx * sed -i "s+\(^rqfreeb .*\)+\1 xfail=ticket0+g" test/mpi/pt2pt/testlist
 # Bug - Github Issue https://github.com/pmodels/mpich/issues/3618
 * * * ch4:ucx * sed -i "s+\(^darray_pack .*\)+\1 xfail=ticket0+g" test/mpi/datatype/testlist
+# release_gather intra-node coll algorithms rely on izem, which is not built by default.
+* * * * * sed -i "s+\(env=MPIR_CVAR_BCAST_POSIX_INTRA_ALGORITHM=release_gather .*\)+\1 xfail=ticket0+g" test/mpi/coll/testlist
+* * * * * sed -i "s+\(env=MPIR_CVAR_REDUCE_POSIX_INTRA_ALGORITHM=release_gather .*\)+\1 xfail=ticket0+g" test/mpi/coll/testlist
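Usage sketch: the release_gather algorithms added here are selected at run time through the
MPIR_CVAR_BCAST_POSIX_INTRA_ALGORITHM / MPIR_CVAR_REDUCE_POSIX_INTRA_ALGORITHM CVARs, just as
the test additions above do via env=. For example, assuming an MPICH build with Izem atomics
enabled (ENABLE_IZEM_ATOMIC) and a standard mpiexec launcher (both assumptions, not taken from
this patch), one of the new bcast configurations can be exercised by hand with:

    MPIR_CVAR_BCAST_POSIX_INTRA_ALGORITHM=release_gather \
    MPIR_CVAR_BCAST_INTRANODE_TREE_TYPE=knomial_1 \
    MPIR_CVAR_BCAST_INTRANODE_TREE_KVAL=8 \
    MPIR_CVAR_COLL_SHM_LIMIT_PER_NODE=131072 \
    mpiexec -n 10 ./bcasttest

MPIR_CVAR_COLL_SHM_LIMIT_PER_NODE is in KB, so 131072 corresponds to the 128 MB limit used in
test_coll_algos.sh; when a communicator would exceed the limit, the code above falls back to
the MPIR-level algorithms instead of allocating more shared memory.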