From 5bdd117dcca7d8fba6a6b4ce96928949d5f40436 Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Fri, 11 May 2018 13:41:51 -0500 Subject: [PATCH 01/13] coll: Namespacing change in calculate_chunking_info Change MPII to MPIR so that it could be used from device Inlining fixes the linking error in fortran tests using gcc, debug mode when this function is used from posix Signed-off-by: Yanfei Guo --- src/mpi/coll/algorithms/common/algo_common.h | 20 +++++++++---------- .../iallreduce/iallreduce_tsp_tree_algos.h | 2 +- src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h | 2 +- src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/mpi/coll/algorithms/common/algo_common.h b/src/mpi/coll/algorithms/common/algo_common.h index 9914d0c7b45..6e4f35dd52b 100644 --- a/src/mpi/coll/algorithms/common/algo_common.h +++ b/src/mpi/coll/algorithms/common/algo_common.h @@ -22,19 +22,19 @@ static int MPII_Algo_compare_int(const void *a, const void *b) } /* Avoid unused function warning in certain configurations */ -static int MPII_Algo_calculate_pipeline_chunk_info(int maxbytes, int type_size, int count, - int *num_segments, int *segsize_floor, - int *segsize_ceil) ATTRIBUTE((unused)); -static int MPII_Algo_calculate_pipeline_chunk_info(int maxbytes, - int type_size, int count, - int *num_segments, - int *segsize_floor, int *segsize_ceil) +static inline int MPIR_Algo_calculate_pipeline_chunk_info(int maxbytes, int type_size, int count, + int *num_segments, int *segsize_floor, + int *segsize_ceil) ATTRIBUTE((unused)); +static inline int MPIR_Algo_calculate_pipeline_chunk_info(int maxbytes, + int type_size, int count, + int *num_segments, + int *segsize_floor, int *segsize_ceil) { int maxelems; int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); - MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIR_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIR_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); if (count == 0 || type_size == 0) { *num_segments = *segsize_floor = *segsize_ceil = 0; @@ -57,7 +57,7 @@ static int MPII_Algo_calculate_pipeline_chunk_info(int maxbytes, MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, "num_segments %d", *num_segments)); - MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIR_ALGO_CALCULATE_PIPELINE_CHUNK_INFO); fn_exit: return mpi_errno; diff --git a/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h b/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h index 9a156db48a3..e2ed8736f44 100644 --- a/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h +++ b/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h @@ -61,7 +61,7 @@ int MPIR_TSP_Iallreduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int /* calculate chunking information for pipelining */ - MPII_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, count, &num_chunks, + MPIR_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, count, &num_chunks, &chunk_size_floor, &chunk_size_ceil); /* print chunking information */ MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, diff --git a/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h b/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h index 62c6c6106e7..06172c99655 100644 --- a/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h +++ b/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h @@ -51,7 +51,7 @@ int MPIR_TSP_Ibcast_sched_intra_tree(void *buffer, int count, MPI_Datatype datat extent = MPL_MAX(extent, true_extent); /* calculate chunking information for pipelining */ - MPII_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, count, &num_chunks, + MPIR_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, count, &num_chunks, &chunk_size_floor, &chunk_size_ceil); /* print chunking information */ MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, diff --git a/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h b/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h index f5a66d80745..b7700262ee2 100644 --- a/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h +++ b/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h @@ -66,7 +66,7 @@ int MPIR_TSP_Ireduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int co is_commutative = MPIR_Op_is_commutative(op); /* calculate chunking information for pipelining */ - MPII_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, count, &num_chunks, + MPIR_Algo_calculate_pipeline_chunk_info(maxbytes, type_size, count, &num_chunks, &chunk_size_floor, &chunk_size_ceil); /* print chunking information */ MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, From 8c3835f600fb233b1995d9aca73b18a54a823b95 Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Thu, 13 Dec 2018 15:42:42 -0600 Subject: [PATCH 02/13] coll: Namespacing change in MPII Treealgo tree Changing the related functions and data structures prefix to MPIR so that it could be used from device Signed-off-by: Yanfei Guo --- src/mpi/coll/algorithms/treealgo/treealgo.c | 12 ++++++------ src/mpi/coll/algorithms/treealgo/treealgo.h | 6 +++--- src/mpi/coll/algorithms/treealgo/treealgo_types.h | 2 +- src/mpi/coll/algorithms/treealgo/treeutil.c | 8 ++++---- src/mpi/coll/algorithms/treealgo/treeutil.h | 6 +++--- src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h | 6 +++--- src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h | 6 +++--- src/mpi/coll/igather/igather_tsp_tree_algos.h | 10 +++++----- src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h | 6 +++--- src/mpi/coll/iscatter/iscatter_tsp_tree_algos.h | 10 +++++----- 10 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/mpi/coll/algorithms/treealgo/treealgo.c b/src/mpi/coll/algorithms/treealgo/treealgo.c index 67bb3807cca..317adc4eef1 100644 --- a/src/mpi/coll/algorithms/treealgo/treealgo.c +++ b/src/mpi/coll/algorithms/treealgo/treealgo.c @@ -35,13 +35,13 @@ int MPII_Treealgo_comm_cleanup(MPIR_Comm * comm) } -int MPII_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int root, - MPII_Treealgo_tree_t * ct) +int MPIR_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int root, + MPIR_Treealgo_tree_t * ct) { int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_TREEALGO_TREE_INIT); - MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_TREEALGO_TREE_INIT); + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIR_TREEALGO_TREE_INIT); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIR_TREEALGO_TREE_INIT); switch (tree_type) { case MPIR_TREE_TYPE_KARY: @@ -68,7 +68,7 @@ int MPII_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int ro break; } - MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_TREEALGO_TREE_INIT); + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIR_TREEALGO_TREE_INIT); fn_exit: return mpi_errno; @@ -78,7 +78,7 @@ int MPII_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int ro } -void MPII_Treealgo_tree_free(MPII_Treealgo_tree_t * tree) +void MPIR_Treealgo_tree_free(MPIR_Treealgo_tree_t * tree) { utarray_free(tree->children); } diff --git a/src/mpi/coll/algorithms/treealgo/treealgo.h b/src/mpi/coll/algorithms/treealgo/treealgo.h index e4f6418410b..7a96b26cd99 100644 --- a/src/mpi/coll/algorithms/treealgo/treealgo.h +++ b/src/mpi/coll/algorithms/treealgo/treealgo.h @@ -16,8 +16,8 @@ int MPII_Treealgo_init(void); int MPII_Treealgo_comm_init(MPIR_Comm * comm); int MPII_Treealgo_comm_cleanup(MPIR_Comm * comm); -int MPII_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int root, - MPII_Treealgo_tree_t * ct); -void MPII_Treealgo_tree_free(MPII_Treealgo_tree_t * tree); +int MPIR_Treealgo_tree_create(int rank, int nranks, int tree_type, int k, int root, + MPIR_Treealgo_tree_t * ct); +void MPIR_Treealgo_tree_free(MPIR_Treealgo_tree_t * tree); #endif /* TREEALGO_H_INCLUDED */ diff --git a/src/mpi/coll/algorithms/treealgo/treealgo_types.h b/src/mpi/coll/algorithms/treealgo/treealgo_types.h index ad98f26a5b0..fd471c46709 100644 --- a/src/mpi/coll/algorithms/treealgo/treealgo_types.h +++ b/src/mpi/coll/algorithms/treealgo/treealgo_types.h @@ -18,6 +18,6 @@ typedef struct { int parent; int num_children; UT_array *children; -} MPII_Treealgo_tree_t; +} MPIR_Treealgo_tree_t; #endif /* TREEALGO_TYPES_H_INCLUDED */ diff --git a/src/mpi/coll/algorithms/treealgo/treeutil.c b/src/mpi/coll/algorithms/treealgo/treeutil.c index ebc88e3fce5..53eff66ebac 100644 --- a/src/mpi/coll/algorithms/treealgo/treeutil.c +++ b/src/mpi/coll/algorithms/treealgo/treeutil.c @@ -15,7 +15,7 @@ #include "treeutil.h" #include "mpiimpl.h" -static int tree_add_child(MPII_Treealgo_tree_t * t, int rank) +static int tree_add_child(MPIR_Treealgo_tree_t * t, int rank) { int mpi_errno = MPI_SUCCESS; @@ -26,7 +26,7 @@ static int tree_add_child(MPII_Treealgo_tree_t * t, int rank) } -int MPII_Treeutil_tree_kary_init(int rank, int nranks, int k, int root, MPII_Treealgo_tree_t * ct) +int MPII_Treeutil_tree_kary_init(int rank, int nranks, int k, int root, MPIR_Treealgo_tree_t * ct) { int lrank, child; int mpi_errno = MPI_SUCCESS; @@ -76,7 +76,7 @@ int MPII_Treeutil_tree_kary_init(int rank, int nranks, int k, int root, MPII_Tre * 3 */ int MPII_Treeutil_tree_knomial_1_init(int rank, int nranks, int k, int root, - MPII_Treealgo_tree_t * ct) + MPIR_Treealgo_tree_t * ct) { int lrank, i, j, maxtime, tmp, time, parent, current_rank, running_rank, crank; int mpi_errno = MPI_SUCCESS; @@ -171,7 +171,7 @@ int MPII_Treeutil_tree_knomial_1_init(int rank, int nranks, int k, int root, * 7 */ int MPII_Treeutil_tree_knomial_2_init(int rank, int nranks, int k, int root, - MPII_Treealgo_tree_t * ct) + MPIR_Treealgo_tree_t * ct) { int mpi_errno = MPI_SUCCESS; int lrank, i, j, depth; diff --git a/src/mpi/coll/algorithms/treealgo/treeutil.h b/src/mpi/coll/algorithms/treealgo/treeutil.h index 3105083a3f4..59a324df79a 100644 --- a/src/mpi/coll/algorithms/treealgo/treeutil.h +++ b/src/mpi/coll/algorithms/treealgo/treeutil.h @@ -13,14 +13,14 @@ #define TREEUTIL_H_INCLUDED /* Generate kary tree information for rank 'rank' */ -int MPII_Treeutil_tree_kary_init(int rank, int nranks, int k, int root, MPII_Treealgo_tree_t * ct); +int MPII_Treeutil_tree_kary_init(int rank, int nranks, int k, int root, MPIR_Treealgo_tree_t * ct); /* Generate knomial_1 tree information for rank 'rank' */ int MPII_Treeutil_tree_knomial_1_init(int rank, int nranks, int k, int root, - MPII_Treealgo_tree_t * ct); + MPIR_Treealgo_tree_t * ct); /* Generate knomial_2 tree information for rank 'rank' */ int MPII_Treeutil_tree_knomial_2_init(int rank, int nranks, int k, int root, - MPII_Treealgo_tree_t * ct); + MPIR_Treealgo_tree_t * ct); #endif /* TREEUTIL_H_INCLUDED */ diff --git a/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h b/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h index e2ed8736f44..6ed093d8e41 100644 --- a/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h +++ b/src/mpi/coll/iallreduce/iallreduce_tsp_tree_algos.h @@ -33,7 +33,7 @@ int MPIR_TSP_Iallreduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int int size; int rank; int num_children = 0; - MPII_Treealgo_tree_t my_tree; + MPIR_Treealgo_tree_t my_tree; void **child_buffer; /* Buffer array in which data from children is received */ void *reduce_buffer; /* Buffer in which allreduced data is present */ int *vtcs = NULL, *recv_id = NULL, *reduce_id = NULL; /* Arrays to store graph vertex ids */ @@ -76,7 +76,7 @@ int MPIR_TSP_Iallreduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int /* initialize the tree */ my_tree.children = NULL; - mpi_errno = MPII_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); + mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); if (mpi_errno) MPIR_ERR_POP(mpi_errno); num_children = my_tree.num_children; @@ -216,7 +216,7 @@ int MPIR_TSP_Iallreduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int offset += msgsize; } - MPII_Treealgo_tree_free(&my_tree); + MPIR_Treealgo_tree_free(&my_tree); fn_exit: MPL_free(vtcs); diff --git a/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h b/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h index 06172c99655..ea853623fa9 100644 --- a/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h +++ b/src/mpi/coll/ibcast/ibcast_tsp_tree_algos.h @@ -32,7 +32,7 @@ int MPIR_TSP_Ibcast_sched_intra_tree(void *buffer, int count, MPI_Datatype datat int rank; int recv_id; int num_children; - MPII_Treealgo_tree_t my_tree; + MPIR_Treealgo_tree_t my_tree; int tag; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIR_TSP_IBCAST_SCHED_INTRA_TREE); @@ -59,7 +59,7 @@ int MPIR_TSP_Ibcast_sched_intra_tree(void *buffer, int count, MPI_Datatype datat maxbytes, count, num_chunks, chunk_size_floor, chunk_size_ceil)); - mpi_errno = MPII_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); + mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); if (mpi_errno) MPIR_ERR_POP(mpi_errno); num_children = my_tree.num_children; @@ -93,7 +93,7 @@ int MPIR_TSP_Ibcast_sched_intra_tree(void *buffer, int count, MPI_Datatype datat offset += msgsize; } - MPII_Treealgo_tree_free(&my_tree); + MPIR_Treealgo_tree_free(&my_tree); fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIR_TSP_IBCAST_SCHED_INTRA_TREE); diff --git a/src/mpi/coll/igather/igather_tsp_tree_algos.h b/src/mpi/coll/igather/igather_tsp_tree_algos.h index 6e70ac73a57..e4229deb56d 100644 --- a/src/mpi/coll/igather/igather_tsp_tree_algos.h +++ b/src/mpi/coll/igather/igather_tsp_tree_algos.h @@ -32,7 +32,7 @@ int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount, void *tmp_buf = NULL; const void *data_buf = NULL; int tree_type; - MPII_Treealgo_tree_t my_tree, parents_tree; + MPIR_Treealgo_tree_t my_tree, parents_tree; int next_child, num_children, *child_subtree_size = NULL, *child_data_offset = NULL; int offset, recv_size, num_dependencies; @@ -48,7 +48,7 @@ int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount, is_inplace = (sendbuf == MPI_IN_PLACE); /* For gather, MPI_IN_PLACE is significant only at root */ tree_type = MPIR_TREE_TYPE_KNOMIAL_1; /* currently only tree_type=MPIR_TREE_TYPE_KNOMIAL_1 is supported for gather */ - mpi_errno = MPII_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); + mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); if (mpi_errno) MPIR_ERR_POP(mpi_errno); num_children = my_tree.num_children; @@ -83,7 +83,7 @@ int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount, /* get tree information of the parent */ if (my_tree.parent != -1) { - MPII_Treealgo_tree_create(my_tree.parent, size, tree_type, k, root, &parents_tree); + MPIR_Treealgo_tree_create(my_tree.parent, size, tree_type, k, root, &parents_tree); } else { /* initialize an empty children array */ utarray_new(parents_tree.children, &ut_int_icd, MPL_MEM_COLL); parents_tree.num_children = 0; @@ -116,7 +116,7 @@ int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount, recv_size += child_subtree_size[i]; } - MPII_Treealgo_tree_free(&parents_tree); + MPIR_Treealgo_tree_free(&parents_tree); recv_size *= (lrank == 0) ? recvcount : sendcount; offset = (lrank == 0) ? recvcount : sendcount; @@ -184,7 +184,7 @@ int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount, } - MPII_Treealgo_tree_free(&my_tree); + MPIR_Treealgo_tree_free(&my_tree); fn_exit: MPL_free(child_subtree_size); diff --git a/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h b/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h index b7700262ee2..c1188ba9030 100644 --- a/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h +++ b/src/mpi/coll/ireduce/ireduce_tsp_tree_algos.h @@ -40,7 +40,7 @@ int MPIR_TSP_Ireduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int co * rank 0 sends the reduced data to the root of the ecollective op. */ int is_tree_root, is_tree_leaf, is_tree_intermediate; /* Variables to store location of this rank in the tree */ int is_root; - MPII_Treealgo_tree_t my_tree; + MPIR_Treealgo_tree_t my_tree; void **child_buffer = NULL; /* Buffer array in which data from children is received */ void *reduce_buffer; /* Buffer in which reduced data is present */ int *vtcs = NULL, *recv_id = NULL, *reduce_id = NULL; /* Arrays to store graph vertex ids */ @@ -85,7 +85,7 @@ int MPIR_TSP_Ireduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int co /* initialize the tree */ my_tree.children = NULL; - mpi_errno = MPII_Treealgo_tree_create(rank, size, tree_type, k, tree_root, &my_tree); + mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, tree_root, &my_tree); if (mpi_errno) MPIR_ERR_POP(mpi_errno); num_children = my_tree.num_children; @@ -240,7 +240,7 @@ int MPIR_TSP_Ireduce_sched_intra_tree(const void *sendbuf, void *recvbuf, int co offset += msgsize; } - MPII_Treealgo_tree_free(&my_tree); + MPIR_Treealgo_tree_free(&my_tree); fn_exit: MPL_free(vtcs); diff --git a/src/mpi/coll/iscatter/iscatter_tsp_tree_algos.h b/src/mpi/coll/iscatter/iscatter_tsp_tree_algos.h index e771a5391be..4296065189e 100644 --- a/src/mpi/coll/iscatter/iscatter_tsp_tree_algos.h +++ b/src/mpi/coll/iscatter/iscatter_tsp_tree_algos.h @@ -35,7 +35,7 @@ int MPIR_TSP_Iscatter_sched_intra_tree(const void *sendbuf, int sendcount, void *tmp_buf = NULL; int recv_id; int tree_type; - MPII_Treealgo_tree_t my_tree, parents_tree; + MPIR_Treealgo_tree_t my_tree, parents_tree; int next_child; int num_children, *child_subtree_size = NULL, *child_data_offset = NULL; int offset, recv_size; @@ -50,7 +50,7 @@ int MPIR_TSP_Iscatter_sched_intra_tree(const void *sendbuf, int sendcount, is_inplace = (recvbuf == MPI_IN_PLACE); /* For scatter, MPI_IN_PLACE is significant only at root */ tree_type = MPIR_TREE_TYPE_KNOMIAL_1; /* currently only tree_type=MPIR_TREE_TYPE_KNOMIAL_1 is supported for scatter */ - mpi_errno = MPII_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); + mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree); if (mpi_errno) MPIR_ERR_POP(mpi_errno); num_children = my_tree.num_children; @@ -85,7 +85,7 @@ int MPIR_TSP_Iscatter_sched_intra_tree(const void *sendbuf, int sendcount, /* get tree information of the parent */ if (my_tree.parent != -1) { - MPII_Treealgo_tree_create(my_tree.parent, size, tree_type, k, root, &parents_tree); + MPIR_Treealgo_tree_create(my_tree.parent, size, tree_type, k, root, &parents_tree); } else { /* initialize an empty children array */ utarray_new(parents_tree.children, &ut_int_icd, MPL_MEM_COLL); parents_tree.num_children = 0; @@ -118,7 +118,7 @@ int MPIR_TSP_Iscatter_sched_intra_tree(const void *sendbuf, int sendcount, recv_size += child_subtree_size[i]; } - MPII_Treealgo_tree_free(&parents_tree); + MPIR_Treealgo_tree_free(&parents_tree); recv_size *= (lrank == 0) ? sendcount : recvcount; offset = (lrank == 0) ? sendcount : recvcount; @@ -177,7 +177,7 @@ int MPIR_TSP_Iscatter_sched_intra_tree(const void *sendbuf, int sendcount, recvcount, recvtype, sched, 0, NULL); } - MPII_Treealgo_tree_free(&my_tree); + MPIR_Treealgo_tree_free(&my_tree); fn_exit: MPL_free(child_subtree_size); From a32680b99dd43a776edf385eee29c4393c616301 Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Tue, 28 Aug 2018 17:09:03 -0500 Subject: [PATCH 03/13] configure: Add AM_CONDITIONAL for izem_atomic Signed-off-by: Yanfei Guo --- configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.ac b/configure.ac index 88e6879e6ff..b0ee91a7915 100644 --- a/configure.ac +++ b/configure.ac @@ -1173,6 +1173,7 @@ fi if test "$izem_atomic" = "yes" ; then AC_DEFINE(ENABLE_IZEM_ATOMIC,1,[Define to enable using Izem CPU atomics]) fi +AM_CONDITIONAL([ENABLE_IZEM_ATOMIC], [test x$izem_atomic = xyes]) AC_ARG_VAR([ZMLIBNAME],[can be used to override the name of the Izem library (default: "zm")]) ZMLIBNAME=${ZMLIBNAME:-"zm"} From 0bb172aa2f8e0a237716f86b766221dfcca0e295 Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Fri, 30 Nov 2018 16:11:52 -0600 Subject: [PATCH 04/13] err: Add new err for noizem Signed-off-by: Yanfei Guo --- src/mpi/errhan/errnames.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mpi/errhan/errnames.txt b/src/mpi/errhan/errnames.txt index f5c4eecea2d..d3faab09b13 100644 --- a/src/mpi/errhan/errnames.txt +++ b/src/mpi/errhan/errnames.txt @@ -213,6 +213,9 @@ MPI_TYPECLASS_INTEGER, or MPI_TYPECLASS_COMPLEX **treetype: Invalid tree type used for initializing Tree algorithms **treetype %d: Invalid tree type (%d) used for initializing Tree algorithms +**noizem: release_gather based intra-node collectives cannot be used without izem submodule. \ +Reconfigure mpich with --enable-izem=atomic --with-zm-prefix=yes + # -- FIXME: Some (but not all) of the messages below this line have been used #---- The messages below this line haven't been used yet. # From 5a531d4566ed0b7c6fdd89449e2b78e8323abd1e Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Mon, 17 Dec 2018 15:10:46 -0600 Subject: [PATCH 05/13] maint/extracterrmsgs: Allow errflag creation in func This change allows to create errflag in a function and propagate it further. Needed for init and finalize calls which don't have errflag passed to them. Signed-off-by: Yanfei Guo --- maint/extracterrmsgs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maint/extracterrmsgs b/maint/extracterrmsgs index 05c401d74fe..d795f54810b 100755 --- a/maint/extracterrmsgs +++ b/maint/extracterrmsgs @@ -731,7 +731,7 @@ sub ProcessFile !($args[$errClassLoc] =~ /^MPIDI_CH3I_SOCK_ERR_/) && !($args[$errClassLoc] =~ /^MPIX_ERR_/) && !($args[$errClassLoc] =~ /^errclass/) && - !($args[$errClassLoc] =~ /^\*\(errflag_\)/) && + !($args[$errClassLoc] =~ /^errflag/) && !($args[$errClassLoc] =~ /^\*errflag/)) { $bad_syntax_in_file{$filename} = 1; print STDERR "Invalid argument $args[$errClassLoc] for the MPI Error class in $routineName in $filename\n"; From 66a4397eb9e2dce60b10772764e129538ff9e756 Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Thu, 29 Nov 2018 11:15:36 -0600 Subject: [PATCH 06/13] posix: Fix mpi_errno in posix_init Signed-off-by: Yanfei Guo --- src/mpid/ch4/shm/posix/posix_init.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/mpid/ch4/shm/posix/posix_init.c b/src/mpid/ch4/shm/posix/posix_init.c index 8a92dfefb1b..5a09810277a 100644 --- a/src/mpid/ch4/shm/posix/posix_init.c +++ b/src/mpid/ch4/shm/posix/posix_init.c @@ -103,6 +103,8 @@ int MPIDI_POSIX_mpi_init_hook(int rank, int size, int *n_vcis_provided, int *tag choose_posix_eager(); mpi_errno = MPIDI_POSIX_eager_init(rank, size); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); /* There is no restriction on the tag_bits from the posix shmod side */ *tag_bits = MPIR_TAG_BITS_DEFAULT; @@ -126,14 +128,13 @@ int MPIDI_POSIX_mpi_finalize_hook(void) MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_FINALIZE_HOOK); mpi_errno = MPIDI_POSIX_eager_finalize(); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); MPIDIU_destroy_buf_pool(MPIDI_POSIX_global.am_buf_pool); MPL_free(MPIDI_POSIX_global.active_rreq); - if (mpi_errno) - MPIR_ERR_POP(mpi_errno); - fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_FINALIZE_HOOK); return mpi_errno; From 41010ed2d40cdf1b0b849f0210c60359d9d22a09 Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Fri, 30 Nov 2018 11:49:02 -0600 Subject: [PATCH 07/13] posix: Add collective algorithm CVARS at shm level Give user ability to choose an algorithm for intra-node bcast, reduce Also set up infrastructure for posix_coll_init and posix_coll_finalize Signed-off-by: Yanfei Guo --- src/mpid/ch4/shm/posix/Makefile.mk | 3 +- .../ch4/shm/posix/posix_coll_containers.h | 2 + .../shm/posix/posix_coll_globals_default.c | 8 +++ src/mpid/ch4/shm/posix/posix_coll_init.c | 63 +++++++++++++++++++ src/mpid/ch4/shm/posix/posix_coll_params.h | 12 +++- src/mpid/ch4/shm/posix/posix_init.c | 39 ++++++++++++ src/mpid/ch4/shm/posix/posix_noinline.h | 4 ++ 7 files changed, 128 insertions(+), 3 deletions(-) create mode 100644 src/mpid/ch4/shm/posix/posix_coll_init.c diff --git a/src/mpid/ch4/shm/posix/Makefile.mk b/src/mpid/ch4/shm/posix/Makefile.mk index fbef5357dfc..3d5dc149273 100644 --- a/src/mpid/ch4/shm/posix/Makefile.mk +++ b/src/mpid/ch4/shm/posix/Makefile.mk @@ -38,7 +38,8 @@ mpi_core_sources += src/mpid/ch4/shm/posix/globals.c \ src/mpid/ch4/shm/posix/posix_datatype.c \ src/mpid/ch4/shm/posix/posix_spawn.c \ src/mpid/ch4/shm/posix/posix_win.c \ - src/mpid/ch4/shm/posix/posix_eager_array.c + src/mpid/ch4/shm/posix/posix_eager_array.c \ + src/mpid/ch4/shm/posix/posix_coll_init.c include $(top_srcdir)/src/mpid/ch4/shm/posix/eager/Makefile.mk diff --git a/src/mpid/ch4/shm/posix/posix_coll_containers.h b/src/mpid/ch4/shm/posix/posix_coll_containers.h index 01726db7be3..df87fba777a 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_containers.h +++ b/src/mpid/ch4/shm/posix/posix_coll_containers.h @@ -9,10 +9,12 @@ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_binomial_ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_scatter_recursive_doubling_allgather_cnt; extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_scatter_ring_allgather_cnt; +extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_invalid_cnt; /* Reduce POSIX containers declaration */ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_binomial_cnt; extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_cnt; +extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_reduce_invalid_cnt; /* Allreduce POSIX containers declaration */ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Allreduce_intra_recursive_doubling_cnt; diff --git a/src/mpid/ch4/shm/posix/posix_coll_globals_default.c b/src/mpid/ch4/shm/posix/posix_coll_globals_default.c index 5afd5e9a168..e84a5261aca 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_globals_default.c +++ b/src/mpid/ch4/shm/posix/posix_coll_globals_default.c @@ -22,6 +22,10 @@ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_scatter_ring_all .id = MPIDI_POSIX_Bcast_intra_scatter_ring_allgather_id }; +const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_invalid_cnt = { + .id = MPIDI_POSIX_Bcast_intra_invalid_id +}; + /* Reduce default POSIX containers initialization*/ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_cnt = { .id = MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_id @@ -31,6 +35,10 @@ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_binomial_cnt = .id = MPIDI_POSIX_Reduce_intra_binomial_id }; +const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_invalid_cnt = { + .id = MPIDI_POSIX_Reduce_intra_invalid_id +}; + /* Allreduce default POSIX containers initialization*/ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Allreduce_intra_recursive_doubling_cnt = { .id = MPIDI_POSIX_Allreduce_intra_recursive_doubling_id diff --git a/src/mpid/ch4/shm/posix/posix_coll_init.c b/src/mpid/ch4/shm/posix/posix_coll_init.c new file mode 100644 index 00000000000..17287cdaa8b --- /dev/null +++ b/src/mpid/ch4/shm/posix/posix_coll_init.c @@ -0,0 +1,63 @@ + +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ +/* + * (C) 2018 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * Portions of this code were written by Intel Corporation. + * Copyright (C) 2011-2017 Intel Corporation. Intel provides this material + * to Argonne National Laboratory subject to Software Grant and Corporate + * Contributor License Agreement dated February 8, 2012. + */ + +#include "mpidimpl.h" + +/* +=== BEGIN_MPI_T_CVAR_INFO_BLOCK === + +cvars: + - name : MPIR_CVAR_BCAST_POSIX_INTRA_ALGORITHM + category : COLLECTIVE + type : string + default : auto + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Variable to select algorithm for intra-node bcast + auto - Internal algorithm selection from pt2pt based algorithms + + - name : MPIR_CVAR_REDUCE_POSIX_INTRA_ALGORITHM + category : COLLECTIVE + type : string + default : auto + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Variable to select algorithm for intra-node reduce + auto - Internal algorithm selection from pt2pt based algorithms + +=== END_MPI_T_CVAR_INFO_BLOCK === +*/ + +MPIDI_POSIX_Bcast_id_t MPIDI_POSIX_Bcast_algo_choice = MPIDI_POSIX_Bcast_intra_auto_id; +MPIDI_POSIX_Reduce_id_t MPIDI_POSIX_Reduce_algo_choice = MPIDI_POSIX_Reduce_intra_auto_id; + +int collective_cvars_init(void); + +int collective_cvars_init(void) +{ + int mpi_errno = MPI_SUCCESS; + + /* Bcast */ + MPIDI_POSIX_Bcast_algo_choice = MPIDI_POSIX_Bcast_intra_auto_id; + + /* Reduce */ + MPIDI_POSIX_Reduce_algo_choice = MPIDI_POSIX_Reduce_intra_auto_id; + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} diff --git a/src/mpid/ch4/shm/posix/posix_coll_params.h b/src/mpid/ch4/shm/posix/posix_coll_params.h index bbd43c3c4b2..ed2fb2503ea 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_params.h +++ b/src/mpid/ch4/shm/posix/posix_coll_params.h @@ -15,7 +15,9 @@ typedef union { typedef enum { MPIDI_POSIX_Bcast_intra_binomial_id, MPIDI_POSIX_Bcast_intra_scatter_recursive_doubling_allgather_id, - MPIDI_POSIX_Bcast_intra_scatter_ring_allgather_id + MPIDI_POSIX_Bcast_intra_scatter_ring_allgather_id, + MPIDI_POSIX_Bcast_intra_auto_id, + MPIDI_POSIX_Bcast_intra_invalid_id } MPIDI_POSIX_Bcast_id_t; typedef union { @@ -29,9 +31,13 @@ typedef union { } posix_bcast_empty_parameters; } MPIDI_POSIX_Bcast_params_t; +extern MPIDI_POSIX_Bcast_id_t MPIDI_POSIX_Bcast_algo_choice; + typedef enum { MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_id, - MPIDI_POSIX_Reduce_intra_binomial_id + MPIDI_POSIX_Reduce_intra_binomial_id, + MPIDI_POSIX_Reduce_intra_auto_id, + MPIDI_POSIX_Reduce_intra_invalid_id } MPIDI_POSIX_Reduce_id_t; typedef union { @@ -41,6 +47,8 @@ typedef union { } posix_reduce_empty_parameters; } MPIDI_POSIX_Reduce_params_t; +extern MPIDI_POSIX_Reduce_id_t MPIDI_POSIX_Reduce_algo_choice; + typedef enum { MPIDI_POSIX_Allreduce_intra_recursive_doubling_id, MPIDI_POSIX_Allreduce_intra_reduce_scatter_allgather_id diff --git a/src/mpid/ch4/shm/posix/posix_init.c b/src/mpid/ch4/shm/posix/posix_init.c index 5a09810277a..f670408580f 100644 --- a/src/mpid/ch4/shm/posix/posix_init.c +++ b/src/mpid/ch4/shm/posix/posix_init.c @@ -109,6 +109,10 @@ int MPIDI_POSIX_mpi_init_hook(int rank, int size, int *n_vcis_provided, int *tag /* There is no restriction on the tag_bits from the posix shmod side */ *tag_bits = MPIR_TAG_BITS_DEFAULT; + mpi_errno = MPIDI_POSIX_coll_init(rank, size); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + MPIR_CHKPMEM_COMMIT(); fn_exit: @@ -135,6 +139,10 @@ int MPIDI_POSIX_mpi_finalize_hook(void) MPL_free(MPIDI_POSIX_global.active_rreq); + mpi_errno = MPIDI_POSIX_coll_finalize(); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_FINALIZE_HOOK); return mpi_errno; @@ -142,6 +150,37 @@ int MPIDI_POSIX_mpi_finalize_hook(void) goto fn_exit; } +int MPIDI_POSIX_coll_init(int rank, int size) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_COLL_INIT); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_COLL_INIT); + + mpi_errno = collective_cvars_init(); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_COLL_INIT); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIDI_POSIX_coll_finalize(void) +{ + int mpi_errno = MPI_SUCCESS; + + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_COLL_FINALIZE); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_COLL_FINALIZE); + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_COLL_FINALIZE); + return mpi_errno; + fn_fail: + goto fn_exit; +} + int MPIDI_POSIX_get_vci_attr(int vci) { MPIR_Assert(0 <= vci && vci < 1); diff --git a/src/mpid/ch4/shm/posix/posix_noinline.h b/src/mpid/ch4/shm/posix/posix_noinline.h index d0944c4b4f1..acfd322588c 100644 --- a/src/mpid/ch4/shm/posix/posix_noinline.h +++ b/src/mpid/ch4/shm/posix/posix_noinline.h @@ -13,6 +13,10 @@ int MPIDI_POSIX_mpi_init_hook(int rank, int size, int *n_vcis_provided, int *tag int MPIDI_POSIX_mpi_finalize_hook(void); int MPIDI_POSIX_get_vci_attr(int vci); +int collective_cvars_init(void); +int MPIDI_POSIX_coll_init(int rank, int size); +int MPIDI_POSIX_coll_finalize(void); + int MPIDI_POSIX_mpi_comm_create_hook(MPIR_Comm * comm); int MPIDI_POSIX_mpi_comm_free_hook(MPIR_Comm * comm); int MPIDI_POSIX_mpi_type_commit_hook(MPIR_Datatype * type); From 71690fa0070b2a2bcd14323709972a57afd786e8 Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Thu, 13 Dec 2018 15:19:29 -0600 Subject: [PATCH 08/13] posix: Reuse POSIX global data structures in fbox The global data structures can be reused by posix level intra-node collectives as well Signed-off-by: Yanfei Guo --- src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h | 62 ++++++------------- src/mpid/ch4/shm/posix/eager/fbox/fbox_recv.h | 9 ++- src/mpid/ch4/shm/posix/eager/fbox/fbox_send.h | 3 +- .../ch4/shm/posix/eager/fbox/fbox_types.h | 11 +--- src/mpid/ch4/shm/posix/posix_init.c | 19 +++++- src/mpid/ch4/shm/posix/posix_types.h | 11 ++++ 6 files changed, 55 insertions(+), 60 deletions(-) diff --git a/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h b/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h index 5810d94fa03..05eaa08325a 100644 --- a/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h +++ b/src/mpid/ch4/shm/posix/eager/fbox/fbox_init.h @@ -31,16 +31,14 @@ === END_MPI_T_CVAR_INFO_BLOCK === */ -#define MPIDI_POSIX_MAILBOX_INDEX(sender, receiver) ((num_local) * (sender) + (receiver)) +#define MPIDI_POSIX_MAILBOX_INDEX(sender, receiver) ((MPIDI_POSIX_global.num_local) * (sender) + (receiver)) extern MPIDI_POSIX_eager_fbox_control_t MPIDI_POSIX_eager_fbox_control_global; MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) { int mpi_errno = MPI_SUCCESS; - - int i, num_local = 0, local_rank_0 = -1, my_local_rank = -1; - + int i; MPIDI_POSIX_fastbox_t *fastboxes_p = NULL; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_EAGER_INIT); @@ -50,7 +48,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) MPIDI_CH4_SHM_POSIX_FBOX_GENERAL = MPL_dbg_class_alloc("SHM_POSIX_FBOX", "shm_posix_fbox"); #endif /* MPL_USE_DBG_LOGGING */ - MPIR_CHKPMEM_DECL(5); + MPIR_CHKPMEM_DECL(3); MPIDI_POSIX_eager_fbox_control_global.num_seg = 1; MPIDI_POSIX_eager_fbox_control_global.first_poll_local_ranks = @@ -68,19 +66,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) * messages if all other entries in the cache are empty. */ MPIDI_POSIX_eager_fbox_control_global.first_poll_local_ranks[i] = 0; - /* Populate these values with transformation information about each rank and its original - * information in MPI_COMM_WORLD. */ - - mpi_errno = MPIR_Find_local(MPIR_Process.comm_world, &num_local, &my_local_rank, - /* comm_world rank of each local process */ - &MPIDI_POSIX_eager_fbox_control_global.local_procs, - /* local rank of each process in comm_world if it is on the same node */ - &MPIDI_POSIX_eager_fbox_control_global.local_ranks); - - local_rank_0 = MPIDI_POSIX_eager_fbox_control_global.local_procs[0]; - MPIDI_POSIX_eager_fbox_control_global.num_local = num_local; - MPIDI_POSIX_eager_fbox_control_global.my_local_rank = my_local_rank; - MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_eager_fbox_control_global.seg, MPIDU_shm_seg_info_t *, MPIDI_POSIX_eager_fbox_control_global.num_seg * @@ -89,45 +74,41 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_init(int rank, int size) /* Create region with one fastbox for every pair of local processes. */ mpi_errno = - MPIDU_shm_seg_alloc(num_local * num_local * sizeof(MPIDI_POSIX_fastbox_t), - (void **) &fastboxes_p, MPL_MEM_SHM); - if (mpi_errno) - MPIR_ERR_POP(mpi_errno); - - /* Request shared collective barrier vars region */ - mpi_errno = MPIDU_shm_seg_alloc(MAX(sizeof(MPIDU_shm_barrier_t), MPIDU_SHM_CACHE_LINE_LEN), - (void **) &MPIDI_POSIX_eager_fbox_control_global.barrier_region, - MPL_MEM_SHM); + MPIDU_shm_seg_alloc(MPIDI_POSIX_global.num_local * MPIDI_POSIX_global.num_local * + sizeof(MPIDI_POSIX_fastbox_t), (void **) &fastboxes_p, MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Actually allocate the segment and assign regions to the pointers */ mpi_errno = MPIDU_shm_seg_commit(&MPIDI_POSIX_eager_fbox_control_global.memory, - &MPIDI_POSIX_eager_fbox_control_global.barrier, - num_local, my_local_rank, local_rank_0, rank, MPL_MEM_SHM); + &MPIDI_POSIX_global.barrier, + MPIDI_POSIX_global.num_local, MPIDI_POSIX_global.my_local_rank, + MPIDI_POSIX_global.local_rank_0, rank, MPL_MEM_SHM); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Allocate table of pointers to fastboxes */ MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_eager_fbox_control_global.mailboxes.in, - MPIDI_POSIX_fastbox_t **, num_local * sizeof(MPIDI_POSIX_fastbox_t *), - mpi_errno, "fastboxes", MPL_MEM_SHM); + MPIDI_POSIX_fastbox_t **, + MPIDI_POSIX_global.num_local * sizeof(MPIDI_POSIX_fastbox_t *), mpi_errno, + "fastboxes", MPL_MEM_SHM); MPIR_CHKPMEM_MALLOC(MPIDI_POSIX_eager_fbox_control_global.mailboxes.out, - MPIDI_POSIX_fastbox_t **, num_local * sizeof(MPIDI_POSIX_fastbox_t *), - mpi_errno, "fastboxes", MPL_MEM_SHM); + MPIDI_POSIX_fastbox_t **, + MPIDI_POSIX_global.num_local * sizeof(MPIDI_POSIX_fastbox_t *), mpi_errno, + "fastboxes", MPL_MEM_SHM); /* Fill in fbox tables */ - for (i = 0; i < num_local; i++) { + for (i = 0; i < MPIDI_POSIX_global.num_local; i++) { MPIDI_POSIX_eager_fbox_control_global.mailboxes.in[i] = - &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(i, my_local_rank)]; + &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(i, MPIDI_POSIX_global.my_local_rank)]; MPIDI_POSIX_eager_fbox_control_global.mailboxes.out[i] = - &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(my_local_rank, i)]; + &fastboxes_p[MPIDI_POSIX_MAILBOX_INDEX(MPIDI_POSIX_global.my_local_rank, i)]; memset(MPIDI_POSIX_eager_fbox_control_global.mailboxes.in[i], 0, sizeof(MPIDI_POSIX_fastbox_t)); } - mpi_errno = MPIDU_shm_barrier(MPIDI_POSIX_eager_fbox_control_global.barrier, num_local); + mpi_errno = MPIDU_shm_barrier(MPIDI_POSIX_global.barrier, MPIDI_POSIX_global.num_local); if (mpi_errno) MPIR_ERR_POP(mpi_errno); @@ -150,8 +131,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_finalize() MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_EAGER_FINALIZE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_EAGER_FINALIZE); - mpi_errno = MPIDU_shm_barrier(MPIDI_POSIX_eager_fbox_control_global.barrier, - MPIDI_POSIX_eager_fbox_control_global.num_local); + mpi_errno = MPIDU_shm_barrier(MPIDI_POSIX_global.barrier, MPIDI_POSIX_global.num_local); if (mpi_errno) MPIR_ERR_POP(mpi_errno); @@ -159,12 +139,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_eager_finalize() MPL_free(MPIDI_POSIX_eager_fbox_control_global.seg); MPL_free(MPIDI_POSIX_eager_fbox_control_global.mailboxes.in); MPL_free(MPIDI_POSIX_eager_fbox_control_global.mailboxes.out); - MPL_free(MPIDI_POSIX_eager_fbox_control_global.local_ranks); - MPL_free(MPIDI_POSIX_eager_fbox_control_global.local_procs); MPL_free(MPIDI_POSIX_eager_fbox_control_global.first_poll_local_ranks); mpi_errno = MPIDU_shm_seg_destroy(&MPIDI_POSIX_eager_fbox_control_global.memory, - MPIDI_POSIX_eager_fbox_control_global.num_local); + MPIDI_POSIX_global.num_local); fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_EAGER_FINALIZE); diff --git a/src/mpid/ch4/shm/posix/eager/fbox/fbox_recv.h b/src/mpid/ch4/shm/posix/eager/fbox/fbox_recv.h index 07d5eb919ca..4b589b4f3ab 100644 --- a/src/mpid/ch4/shm/posix/eager/fbox/fbox_recv.h +++ b/src/mpid/ch4/shm/posix/eager/fbox/fbox_recv.h @@ -58,8 +58,7 @@ MPIDI_POSIX_eager_recv_begin(MPIDI_POSIX_eager_recv_transaction_t * transaction) MPIDI_POSIX_eager_fbox_control_global.first_poll_local_ranks [MPIR_CVAR_CH4_POSIX_EAGER_FBOX_POLL_CACHE_SIZE]; /* Figure out the next fastbox to poll by incrementing the counter. */ - last_cache = - (last_cache + 1) % (int16_t) MPIDI_POSIX_eager_fbox_control_global.num_local; + last_cache = (last_cache + 1) % (int16_t) MPIDI_POSIX_global.num_local; local_rank = last_cache; MPIDI_POSIX_eager_fbox_control_global.first_poll_local_ranks [MPIR_CVAR_CH4_POSIX_EAGER_FBOX_POLL_CACHE_SIZE] = last_cache; @@ -75,7 +74,7 @@ MPIDI_POSIX_eager_recv_begin(MPIDI_POSIX_eager_recv_transaction_t * transaction) /* If the data ready flag is set, there is a message waiting. */ if (fbox_in->data_ready) { /* Initialize public transaction part */ - grank = MPIDI_POSIX_eager_fbox_control_global.local_procs[local_rank]; + grank = MPIDI_POSIX_global.local_procs[local_rank]; if (likely(fbox_in->is_header)) { /* Only received the header for the message */ @@ -154,7 +153,7 @@ MPL_STATIC_INLINE_PREFIX void MPIDI_POSIX_eager_recv_posted_hook(int grank) int local_rank, i; if (grank >= 0) { - local_rank = MPIDI_POSIX_eager_fbox_control_global.local_ranks[grank]; + local_rank = MPIDI_POSIX_global.local_ranks[grank]; /* Put the posted receive in the list of fastboxes to be polled first. If the list is full, * it will get polled after the boxes in the list are polled, which will be slower, but will @@ -183,7 +182,7 @@ MPL_STATIC_INLINE_PREFIX void MPIDI_POSIX_eager_recv_completed_hook(int grank) MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_FBOX_EAGER_RECV_COMPLETED_HOOK); if (grank >= 0) { - local_rank = MPIDI_POSIX_eager_fbox_control_global.local_ranks[grank]; + local_rank = MPIDI_POSIX_global.local_ranks[grank]; /* Remove the posted receive from the list of fastboxes to be polled first now that the * request is done. */ diff --git a/src/mpid/ch4/shm/posix/eager/fbox/fbox_send.h b/src/mpid/ch4/shm/posix/eager/fbox/fbox_send.h index d850d0c70d0..a9514893036 100644 --- a/src/mpid/ch4/shm/posix/eager/fbox/fbox_send.h +++ b/src/mpid/ch4/shm/posix/eager/fbox/fbox_send.h @@ -40,8 +40,7 @@ MPIDI_POSIX_eager_send(int grank, MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_EAGER_SEND); fbox_out = - MPIDI_POSIX_eager_fbox_control_global.mailboxes. - out[MPIDI_POSIX_eager_fbox_control_global.local_ranks[grank]]; + MPIDI_POSIX_eager_fbox_control_global.mailboxes.out[MPIDI_POSIX_global.local_ranks[grank]]; /* Check if the fastbox is already full and if so, return to the caller, which will cause this * message to be queued. */ diff --git a/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h b/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h index 25cd409418c..d04b6878b10 100644 --- a/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h +++ b/src/mpid/ch4/shm/posix/eager/fbox/fbox_types.h @@ -43,16 +43,6 @@ typedef struct MPIDI_POSIX_eager_fbox_control { MPIDI_POSIX_fbox_arrays_t mailboxes; /* The array of buffers that make up the total collection * of mailboxes */ - MPIDU_shm_barrier_t *barrier; - void *barrier_region; - - /* Keep track of all of the local processes in MPI_COMM_WORLD and what their original rank was - * in that communicator. */ - int num_local; - int my_local_rank; - int *local_ranks; - int *local_procs; - /* A small cache of local ranks that have posted receives that we use to poll fastboxes more * efficiently. The last entry in this array is a counter to keep track of the most recently * checked fastbox so we can make sure we don't starve the fastboxes where receives haven't been @@ -62,6 +52,7 @@ typedef struct MPIDI_POSIX_eager_fbox_control { * is much bigger than any currently immaginable single-node without something like wildly * oversubscribed ranks as threads). */ int16_t *first_poll_local_ranks; + int next_poll_local_rank; } MPIDI_POSIX_eager_fbox_control_t; diff --git a/src/mpid/ch4/shm/posix/posix_init.c b/src/mpid/ch4/shm/posix/posix_init.c index f670408580f..0cacf3403c0 100644 --- a/src/mpid/ch4/shm/posix/posix_init.c +++ b/src/mpid/ch4/shm/posix/posix_init.c @@ -73,7 +73,7 @@ static int choose_posix_eager(void) int MPIDI_POSIX_mpi_init_hook(int rank, int size, int *n_vcis_provided, int *tag_bits) { int mpi_errno = MPI_SUCCESS; - int i; + int i, num_local = 0, local_rank_0 = -1, my_local_rank = -1; #ifdef MPL_USE_DBG_LOGGING MPIDI_CH4_SHM_POSIX_GENERAL = MPL_dbg_class_alloc("SHM_POSIX", "shm_posix"); @@ -87,6 +87,20 @@ int MPIDI_POSIX_mpi_init_hook(int rank, int size, int *n_vcis_provided, int *tag MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_INIT_HOOK); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_INIT_HOOK); + /* Populate these values with transformation information about each rank and its original + * information in MPI_COMM_WORLD. */ + + mpi_errno = MPIR_Find_local(MPIR_Process.comm_world, &num_local, &my_local_rank, + /* comm_world rank of each local process */ + &MPIDI_POSIX_global.local_procs, + /* local rank of each process in comm_world if it is on the same node */ + &MPIDI_POSIX_global.local_ranks); + + local_rank_0 = MPIDI_POSIX_global.local_procs[0]; + MPIDI_POSIX_global.num_local = num_local; + MPIDI_POSIX_global.my_local_rank = my_local_rank; + + MPIDI_POSIX_global.local_rank_0 = local_rank_0; *n_vcis_provided = 1; /* This is used to track messages that the eager submodule was not ready to send. */ @@ -143,6 +157,9 @@ int MPIDI_POSIX_mpi_finalize_hook(void) if (mpi_errno) MPIR_ERR_POP(mpi_errno); + MPL_free(MPIDI_POSIX_global.local_ranks); + MPL_free(MPIDI_POSIX_global.local_procs); + fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_FINALIZE_HOOK); return mpi_errno; diff --git a/src/mpid/ch4/shm/posix/posix_types.h b/src/mpid/ch4/shm/posix/posix_types.h index 1d9f96056f1..7dd45ae3d6a 100644 --- a/src/mpid/ch4/shm/posix/posix_types.h +++ b/src/mpid/ch4/shm/posix/posix_types.h @@ -37,6 +37,17 @@ typedef struct { /* Active recv requests array */ MPIR_Request **active_rreq; + + MPIDU_shm_seg_t memory; + MPIDU_shm_barrier_t *barrier; + + /* Keep track of all of the local processes in MPI_COMM_WORLD and what their original rank was + * in that communicator. */ + int num_local; + int my_local_rank; + int *local_ranks; + int *local_procs; + int local_rank_0; } MPIDI_POSIX_global_t; extern MPIDI_POSIX_global_t MPIDI_POSIX_global; From c6be9f1374c6c9614866996e28089a57a137df8b Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Fri, 31 Aug 2018 15:42:30 -0500 Subject: [PATCH 09/13] posix: Set up release, gather based infrastructure Implement the release and gather building blocks which will be used to implement intra-node bcast and intra-node reduce. Shared memory is created per communicator, which is used to place the data to be broadcasted, the data which is to be redued, and flags to update the children or parent in the tree. Release is top-down step in tree, while gather is bottom-up step. A shared limit counter is implemented to track and limit the amount of shared memory created per node for optimized intra-node collectives. Signed-off-by: Yanfei Guo --- src/mpid/ch4/shm/posix/Makefile.mk | 4 + src/mpid/ch4/shm/posix/globals.c | 3 + src/mpid/ch4/shm/posix/posix_comm.c | 17 + src/mpid/ch4/shm/posix/posix_init.c | 33 ++ src/mpid/ch4/shm/posix/posix_pre.h | 7 + src/mpid/ch4/shm/posix/posix_types.h | 1 + .../ch4/shm/posix/release_gather/Makefile.mk | 14 + .../shm/posix/release_gather/release_gather.c | 495 ++++++++++++++++++ .../shm/posix/release_gather/release_gather.h | 342 ++++++++++++ .../release_gather/release_gather_types.h | 40 ++ 10 files changed, 956 insertions(+) create mode 100644 src/mpid/ch4/shm/posix/release_gather/Makefile.mk create mode 100644 src/mpid/ch4/shm/posix/release_gather/release_gather.c create mode 100644 src/mpid/ch4/shm/posix/release_gather/release_gather.h create mode 100644 src/mpid/ch4/shm/posix/release_gather/release_gather_types.h diff --git a/src/mpid/ch4/shm/posix/Makefile.mk b/src/mpid/ch4/shm/posix/Makefile.mk index 3d5dc149273..06ee0ffcd02 100644 --- a/src/mpid/ch4/shm/posix/Makefile.mk +++ b/src/mpid/ch4/shm/posix/Makefile.mk @@ -12,6 +12,10 @@ if BUILD_SHM_POSIX +if ENABLE_IZEM_ATOMIC +include $(top_srcdir)/src/mpid/ch4/shm/posix/release_gather/Makefile.mk +endif + noinst_HEADERS += src/mpid/ch4/shm/posix/posix_am.h \ src/mpid/ch4/shm/posix/posix_coll.h \ src/mpid/ch4/shm/posix/shm_inline.h \ diff --git a/src/mpid/ch4/shm/posix/globals.c b/src/mpid/ch4/shm/posix/globals.c index bf419d1041f..74192fa2d86 100644 --- a/src/mpid/ch4/shm/posix/globals.c +++ b/src/mpid/ch4/shm/posix/globals.c @@ -17,3 +17,6 @@ MPIDI_POSIX_global_t MPIDI_POSIX_global = { 0 }; MPIDI_POSIX_eager_funcs_t *MPIDI_POSIX_eager_func = NULL; MPL_dbg_class MPIDI_CH4_SHM_POSIX_GENERAL; +#ifdef ENABLE_IZEM_ATOMIC +zm_atomic_uint_t *MPIDI_POSIX_shm_limit_counter = NULL; +#endif diff --git a/src/mpid/ch4/shm/posix/posix_comm.c b/src/mpid/ch4/shm/posix/posix_comm.c index 4d0ad27e2cd..aab7219e654 100644 --- a/src/mpid/ch4/shm/posix/posix_comm.c +++ b/src/mpid/ch4/shm/posix/posix_comm.c @@ -10,6 +10,9 @@ #include "mpidimpl.h" #include "posix_noinline.h" +#ifdef ENABLE_IZEM_ATOMIC +#include "release_gather.h" +#endif int MPIDI_POSIX_mpi_comm_create_hook(MPIR_Comm * comm) { @@ -17,6 +20,13 @@ int MPIDI_POSIX_mpi_comm_create_hook(MPIR_Comm * comm) MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_MPI_COMM_CREATE_HOOK); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_MPI_COMM_CREATE_HOOK); +#ifdef ENABLE_IZEM_ATOMIC + /* Release_gather primitives based collective algorithm works for Intra-comms only */ + if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { + mpi_errno = MPIDI_POSIX_mpi_release_gather_comm_init_null(comm); + } +#endif + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_MPI_COMM_CREATE_HOOK); return mpi_errno; } @@ -27,6 +37,13 @@ int MPIDI_POSIX_mpi_comm_free_hook(MPIR_Comm * comm) MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_MPI_COMM_FREE_HOOK); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_MPI_COMM_FREE_HOOK); +#ifdef ENABLE_IZEM_ATOMIC + /* Release_gather primitives based collective algorithm works for Intra-comms only */ + if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { + mpi_errno = MPIDI_POSIX_mpi_release_gather_comm_free(comm); + } +#endif + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_MPI_COMM_FREE_HOOK); return mpi_errno; } diff --git a/src/mpid/ch4/shm/posix/posix_init.c b/src/mpid/ch4/shm/posix/posix_init.c index 0cacf3403c0..7e595676c43 100644 --- a/src/mpid/ch4/shm/posix/posix_init.c +++ b/src/mpid/ch4/shm/posix/posix_init.c @@ -32,6 +32,9 @@ #include "posix_eager.h" #include "posix_noinline.h" +#ifdef ENABLE_IZEM_ATOMIC +extern zm_atomic_uint_t *MPIDI_POSIX_shm_limit_counter; +#endif static int choose_posix_eager(void); @@ -173,6 +176,30 @@ int MPIDI_POSIX_coll_init(int rank, int size) MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_COLL_INIT); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_COLL_INIT); +#ifdef ENABLE_IZEM_ATOMIC + /* Allocate a shared counter to track the amount of shared memory created per node for + * intra-node collectives */ + mpi_errno = + MPIDU_shm_seg_alloc(sizeof(int), (void **) &MPIDI_POSIX_shm_limit_counter, MPL_MEM_SHM); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + /* Actually allocate the segment and assign regions to the pointers */ + mpi_errno = + MPIDU_shm_seg_commit(&MPIDI_POSIX_global.memory, &MPIDI_POSIX_global.barrier, + MPIDI_POSIX_global.num_local, MPIDI_POSIX_global.my_local_rank, + MPIDI_POSIX_global.local_rank_0, rank, MPL_MEM_SHM); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + mpi_errno = MPIDU_shm_barrier(MPIDI_POSIX_global.barrier, MPIDI_POSIX_global.num_local); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); + + /* Set the counter to 0 */ + zm_atomic_store(MPIDI_POSIX_shm_limit_counter, 0, zm_memord_relaxed); +#endif + mpi_errno = collective_cvars_init(); if (mpi_errno) MPIR_ERR_POP(mpi_errno); @@ -191,6 +218,12 @@ int MPIDI_POSIX_coll_finalize(void) MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_COLL_FINALIZE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_COLL_FINALIZE); +#ifdef ENABLE_IZEM_ATOMIC + /* Destroy the shared counter which was used to track the amount of shared memory created + * per node for intra-node collectives */ + mpi_errno = MPIDU_shm_seg_destroy(&MPIDI_POSIX_global.memory, MPIDI_POSIX_global.num_local); +#endif + fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_COLL_FINALIZE); return mpi_errno; diff --git a/src/mpid/ch4/shm/posix/posix_pre.h b/src/mpid/ch4/shm/posix/posix_pre.h index f48f4c41c4b..91f8710bb1d 100644 --- a/src/mpid/ch4/shm/posix/posix_pre.h +++ b/src/mpid/ch4/shm/posix/posix_pre.h @@ -13,6 +13,9 @@ #define POSIX_PRE_H_INCLUDED #include +#ifdef ENABLE_IZEM_ATOMIC +#include "release_gather_types.h" +#endif #define MPIDI_POSIX_MAX_AM_HDR_SIZE (32) @@ -35,7 +38,11 @@ struct MPIR_Segment; /* These structs are populated with dummy variables because empty structs are not supported in all * compilers: https://stackoverflow.com/a/755339/491687 */ typedef struct { +#ifdef ENABLE_IZEM_ATOMIC + MPIDI_POSIX_release_gather_comm_t *release_gather; +#else int dummy; +#endif } MPIDI_POSIX_comm_t; typedef struct { diff --git a/src/mpid/ch4/shm/posix/posix_types.h b/src/mpid/ch4/shm/posix/posix_types.h index 7dd45ae3d6a..ff8bab488b1 100644 --- a/src/mpid/ch4/shm/posix/posix_types.h +++ b/src/mpid/ch4/shm/posix/posix_types.h @@ -28,6 +28,7 @@ enum { #define MPIDI_POSIX_AMREQUEST_HDR(req, field) ((req)->dev.ch4.am.shm_am.posix.req_hdr->field) #define MPIDI_POSIX_AMREQUEST_HDR_PTR(req) ((req)->dev.ch4.am.shm_am.posix.req_hdr) #define MPIDI_POSIX_REQUEST(req, field) ((req)->dev.ch4.shm.posix.field) +#define MPIDI_POSIX_COMM(comm) (&(comm)->dev.ch4.shm.posix) typedef struct { MPIDIU_buf_pool_t *am_buf_pool; diff --git a/src/mpid/ch4/shm/posix/release_gather/Makefile.mk b/src/mpid/ch4/shm/posix/release_gather/Makefile.mk new file mode 100644 index 00000000000..45808c6bb26 --- /dev/null +++ b/src/mpid/ch4/shm/posix/release_gather/Makefile.mk @@ -0,0 +1,14 @@ +## (C) 2018 by Argonne National Laboratory. +## See COPYRIGHT in top-level directory. +## +## Portions of this code were written by Intel Corporation. +## Copyright (C) 2011-2017 Intel Corporation. Intel provides this material +## to Argonne National Laboratory subject to Software Grant and Corporate +## Contributor License Agreement dated February 8, 2012. + +AM_CPPFLAGS += -I$(top_srcdir)/src/mpid/ch4/shm/posix/release_gather + +noinst_HEADERS += src/mpid/ch4/shm/posix/release_gather/release_gather_types.h \ + src/mpid/ch4/shm/posix/release_gather/release_gather.h + +mpi_core_sources += src/mpid/ch4/shm/posix/release_gather/release_gather.c diff --git a/src/mpid/ch4/shm/posix/release_gather/release_gather.c b/src/mpid/ch4/shm/posix/release_gather/release_gather.c new file mode 100644 index 00000000000..d8ea640a159 --- /dev/null +++ b/src/mpid/ch4/shm/posix/release_gather/release_gather.c @@ -0,0 +1,495 @@ +/* + * (C) 2018 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * Portions of this code were written by Intel Corporation. + * Copyright (C) 2011-2017 Intel Corporation. Intel provides this material + * to Argonne National Laboratory subject to Software Grant and Corporate + * Contributor License Agreement dated February 8, 2012. + */ + +/* +=== BEGIN_MPI_T_CVAR_INFO_BLOCK === + +cvars: + - name : MPIR_CVAR_COLL_SHM_LIMIT_PER_NODE + category : COLLECTIVE + type : int + default : 65536 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Maximum shared memory created per node for optimized intra-node collectives (in KB) + + - name : MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE + category : COLLECTIVE + type : int + default : 32768 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Total size of the bcast buffer (in bytes) + + - name : MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS + category : COLLECTIVE + type : int + default : 4 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Number of cells the bcast buffer is divided into + + - name : MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE + category : COLLECTIVE + type : int + default : 32768 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Total size of the reduce buffer per rank (in bytes) + + - name : MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS + category : COLLECTIVE + type : int + default : 4 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Number of cells the reduce buffer is divided into, for each rank + + - name : MPIR_CVAR_BCAST_INTRANODE_TREE_KVAL + category : COLLECTIVE + type : int + default : 64 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + K value for the kary/knomial tree for intra-node bcast + + - name : MPIR_CVAR_BCAST_INTRANODE_TREE_TYPE + category : COLLECTIVE + type : string + default : kary + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Tree type for intra-node bcast tree + kary - kary tree type + knomial_1 - knomial_1 tree type (ranks are added in order from the left side) + knomial_2 - knomial_2 tree type (ranks are added in order from the right side) + + - name : MPIR_CVAR_REDUCE_INTRANODE_TREE_KVAL + category : COLLECTIVE + type : int + default : 4 + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + K value for the kary/knomial tree for intra-node reduce + + - name : MPIR_CVAR_REDUCE_INTRANODE_TREE_TYPE + category : COLLECTIVE + type : string + default : kary + class : device + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + Tree type for intra-node reduce tree + kary - kary tree type + knomial_1 - knomial_1 tree type (ranks are added in order from the left side) + knomial_2 - knomial_2 tree type (ranks are added in order from the right side) + +=== END_MPI_T_CVAR_INFO_BLOCK === +*/ + +#include "mpiimpl.h" +#include "release_gather.h" + +#define COMM_FIELD(comm, field) \ + MPIDI_POSIX_COMM(comm)->release_gather->field + + +MPIDI_POSIX_release_gather_tree_type_t MPIDI_POSIX_Bcast_tree_type, MPIDI_POSIX_Reduce_tree_type; + +/* Initialize the release_gather struct to NULL */ +int MPIDI_POSIX_mpi_release_gather_comm_init_null(MPIR_Comm * comm_ptr) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT_NULL); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT_NULL); + + MPIDI_POSIX_COMM(comm_ptr)->release_gather = NULL; + + if (0 == strcmp(MPIR_CVAR_BCAST_INTRANODE_TREE_TYPE, "kary")) + MPIDI_POSIX_Bcast_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KARY; + else if (0 == strcmp(MPIR_CVAR_BCAST_INTRANODE_TREE_TYPE, "knomial_1")) + MPIDI_POSIX_Bcast_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_1; + else if (0 == strcmp(MPIR_CVAR_BCAST_INTRANODE_TREE_TYPE, "knomial_2")) + MPIDI_POSIX_Bcast_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_2; + else + MPIDI_POSIX_Bcast_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KARY; + + if (0 == strcmp(MPIR_CVAR_REDUCE_INTRANODE_TREE_TYPE, "kary")) + MPIDI_POSIX_Reduce_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KARY; + else if (0 == strcmp(MPIR_CVAR_REDUCE_INTRANODE_TREE_TYPE, "knomial_1")) + MPIDI_POSIX_Reduce_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_1; + else if (0 == strcmp(MPIR_CVAR_REDUCE_INTRANODE_TREE_TYPE, "knomial_2")) + MPIDI_POSIX_Reduce_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_2; + else + MPIDI_POSIX_Reduce_tree_type = MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KARY; + + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT_NULL); + return MPI_SUCCESS; +} + +/* Initialize the data structures and allocate the shared memory (flags, bcast buffer and reduce + * buffer) */ +int MPIDI_POSIX_mpi_release_gather_comm_init(MPIR_Comm * comm_ptr, + const MPIDI_POSIX_release_gather_opcode_t operation) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT); + + int mpi_errno = MPI_SUCCESS; + int mpi_errno_ret = MPI_SUCCESS; + int rank, num_ranks; + bool initialize_flags = false, initialize_bcast_buf = false, initialize_reduce_buf = false; + int flags_num_pages, fallback = 0; + size_t flags_shm_size = 0; + const long pg_sz = sysconf(_SC_PAGESIZE); + MPIR_Errflag_t errflag = MPIR_ERR_NONE; + bool mapfail_flag = false; + + rank = MPIR_Comm_rank(comm_ptr); + num_ranks = MPIR_Comm_size(comm_ptr); + /* Layout of the shm region: 1 gather and release flag each per rank, followed by + * bcast buffer (divided into multiple cells), followed by + * reduce buffer (divided into multiple cells) per rank. */ + + if (MPIDI_POSIX_COMM(comm_ptr)->release_gather == NULL) { + /* release_gather based collectives have not been used before on this comm */ + initialize_flags = true; + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST) { + initialize_bcast_buf = true; + } else if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) { + initialize_reduce_buf = true; + } + } else { + /* at least one release_gather based collective was used on this comm */ + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST && + COMM_FIELD(comm_ptr, bcast_buf_addr) == NULL) { + initialize_bcast_buf = true; + } else if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE && + COMM_FIELD(comm_ptr, reduce_buf_addr) == NULL) { + initialize_reduce_buf = true; + } + } + + if (initialize_flags || initialize_bcast_buf || initialize_reduce_buf) { + /* Calculate the amount of shm to be created */ + size_t tmp_shm_counter; + size_t memory_to_be_allocated = 0; + if (initialize_flags) { + /* Calculate the amount of memory that would be allocated for flags */ + flags_shm_size = MPIDI_POSIX_RELEASE_GATHER_FLAG_SPACE_PER_RANK * num_ranks; + /* Reset flags_shm_size so that the data buffers are aligned to the system pages */ + flags_num_pages = flags_shm_size / (int) (pg_sz); + if (flags_shm_size % pg_sz != 0) { + flags_num_pages++; + } + flags_shm_size = flags_num_pages * pg_sz; + + /* Update the amount of memory to be allocated */ + memory_to_be_allocated += flags_shm_size; + } + + if (initialize_bcast_buf) { + memory_to_be_allocated += MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE; + } + if (initialize_reduce_buf) { + memory_to_be_allocated += (num_ranks * MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE); + } + + if (rank == 0) { + /* rank 0 decides if more memory can be created and broadcasts the decision to other ranks */ + tmp_shm_counter = zm_atomic_load(MPIDI_POSIX_shm_limit_counter, zm_memord_acquire); + + /* Check if it is allowed to create more shm on this node */ + if ((tmp_shm_counter + memory_to_be_allocated) > + (MPIR_CVAR_COLL_SHM_LIMIT_PER_NODE * 1024)) { + /* cannot create more shm, fallback to MPIR level algorithms, and broadcast the decision to other ranks */ + fallback = 1; + MPIR_Bcast_impl(&fallback, 1, MPI_INT, 0, comm_ptr, &errflag); + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_NO_MEM, "**nomem"); + } else { + /* More shm can be created, update the shared counter */ + zm_atomic_fetch_add(MPIDI_POSIX_shm_limit_counter, memory_to_be_allocated, + zm_memord_seq_cst); + fallback = 0; + mpi_errno = MPIR_Bcast_impl(&fallback, 1, MPI_INT, 0, comm_ptr, &errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } else { + mpi_errno = MPIR_Bcast_impl(&fallback, 1, MPI_INT, 0, comm_ptr, &errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + if (fallback) { + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_NO_MEM, "**nomem"); + } + } + } + + if (initialize_flags) { + /* Initialize the release_gather struct and allocate shm for flags */ + MPIDI_POSIX_release_gather_comm_t *release_gather_info_ptr; + release_gather_info_ptr = + MPL_malloc(sizeof(struct MPIDI_POSIX_release_gather_comm_t), MPL_MEM_COLL); + MPIR_ERR_CHKANDJUMP(!release_gather_info_ptr, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + release_gather_info_ptr->flags_shm_size = flags_shm_size; + + /* Create bcast_tree and reduce_tree with root of the tree as 0 */ + mpi_errno = + MPIR_Treealgo_tree_create(rank, num_ranks, MPIDI_POSIX_Bcast_tree_type, + MPIR_CVAR_BCAST_INTRANODE_TREE_KVAL, 0, + &release_gather_info_ptr->bcast_tree); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + mpi_errno = + MPIR_Treealgo_tree_create(rank, num_ranks, MPIDI_POSIX_Reduce_tree_type, + MPIR_CVAR_REDUCE_INTRANODE_TREE_KVAL, 0, + &release_gather_info_ptr->reduce_tree); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + release_gather_info_ptr->gather_state = release_gather_info_ptr->release_state + = MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS + MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS; + + release_gather_info_ptr->bcast_buf_addr = NULL; + release_gather_info_ptr->reduce_buf_addr = NULL; + release_gather_info_ptr->child_reduce_buf_addr = NULL; + + mpi_errno = MPIDIU_allocate_shm_segment(comm_ptr, flags_shm_size, + &(release_gather_info_ptr->shm_flags_handle), + (void **) + &(release_gather_info_ptr->flags_addr), + &mapfail_flag); + if (mpi_errno || mapfail_flag) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + /* Calculate gather and release flag address and initialize to the gather and release states */ + release_gather_info_ptr->gather_flag_addr = + MPIDI_POSIX_RELEASE_GATHER_GATHER_FLAG_ADDR(rank); + release_gather_info_ptr->release_flag_addr = + MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_ADDR(rank); + *(release_gather_info_ptr->gather_flag_addr) = (release_gather_info_ptr->gather_state); + *(release_gather_info_ptr->release_flag_addr) = (release_gather_info_ptr->release_state); + + /* Make sure all the flags are set before ranks start reading each other's flags from shm */ + mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + MPIDI_POSIX_COMM(comm_ptr)->release_gather = release_gather_info_ptr; + } + + if (initialize_bcast_buf) { + /* Allocate the shared memory for bcast buffer */ + mpi_errno = + MPIDIU_allocate_shm_segment(comm_ptr, MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE, + &(COMM_FIELD(comm_ptr, shm_bcast_buf_handle)), + (void **) &(COMM_FIELD(comm_ptr, bcast_buf_addr)), + &mapfail_flag); + if (mpi_errno || mapfail_flag) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + + if (initialize_reduce_buf) { + /* Allocate the shared memory for a reduce buffer per rank */ + int i; + COMM_FIELD(comm_ptr, child_reduce_buf_addr) = + MPL_malloc(num_ranks * sizeof(void *), MPL_MEM_COLL); + + mpi_errno = + MPIDIU_allocate_shm_segment(comm_ptr, + num_ranks * + MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE, + &(COMM_FIELD(comm_ptr, shm_reduce_buf_handle)), + (void **) &(COMM_FIELD(comm_ptr, reduce_buf_addr)), + &mapfail_flag); + if (mpi_errno || mapfail_flag) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + /* Store address of each of the children's reduce buffer */ + for (i = 0; i < COMM_FIELD(comm_ptr, reduce_tree.num_children); i++) { + MPIR_ERR_CHKANDJUMP(!utarray_eltptr(COMM_FIELD(comm_ptr, reduce_tree.children), i), + mpi_errno, MPI_ERR_OTHER, "**nomem"); + COMM_FIELD(comm_ptr, child_reduce_buf_addr[i]) = + (char *) COMM_FIELD(comm_ptr, reduce_buf_addr) + + ((*utarray_eltptr(COMM_FIELD(comm_ptr, reduce_tree.children), i)) + * MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE); + } + } + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_INIT); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +/* Cleanup the release_gather data structures and free the allocated memory */ +int MPIDI_POSIX_mpi_release_gather_comm_free(MPIR_Comm * comm_ptr) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_FREE); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_FREE); + + int mpi_errno = MPI_SUCCESS; + int mpi_errno_ret = MPI_SUCCESS; + MPIR_Errflag_t errflag = MPIR_ERR_NONE; + + /* Clean up is not required for NULL struct */ + if (MPIDI_POSIX_COMM(comm_ptr)->release_gather == NULL) { + goto fn_exit; + } + + /* destroy and detach shared memory used for flags */ + mpi_errno = MPL_shm_seg_detach(COMM_FIELD(comm_ptr, shm_flags_handle), + (void **) &COMM_FIELD(comm_ptr, flags_addr), + COMM_FIELD(comm_ptr, flags_shm_size)); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + mpi_errno = MPL_shm_hnd_finalize(&COMM_FIELD(comm_ptr, shm_flags_handle)); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + if (COMM_FIELD(comm_ptr, bcast_buf_addr) != NULL) { + /* destroy and detach shared memory used for bcast buffer */ + mpi_errno = MPL_shm_seg_detach(COMM_FIELD(comm_ptr, shm_bcast_buf_handle), + (void **) &COMM_FIELD(comm_ptr, bcast_buf_addr), + MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + mpi_errno = MPL_shm_hnd_finalize(&COMM_FIELD(comm_ptr, shm_bcast_buf_handle)); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + + if (COMM_FIELD(comm_ptr, reduce_buf_addr) != NULL) { + /* destroy and detach shared memory used for reduce buffers */ + mpi_errno = MPL_shm_seg_detach(COMM_FIELD(comm_ptr, shm_reduce_buf_handle), + (void **) &COMM_FIELD(comm_ptr, reduce_buf_addr), + MPIR_Comm_size(comm_ptr) + * MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + mpi_errno = MPL_shm_hnd_finalize(&COMM_FIELD(comm_ptr, shm_reduce_buf_handle)); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + MPL_free(COMM_FIELD(comm_ptr, child_reduce_buf_addr)); + } + + MPIR_Treealgo_tree_free(&(COMM_FIELD(comm_ptr, bcast_tree))); + MPIR_Treealgo_tree_free(&(COMM_FIELD(comm_ptr, reduce_tree))); + MPL_free(MPIDI_POSIX_COMM(comm_ptr)->release_gather); + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_RELEASE_GATHER_COMM_FREE); + return mpi_errno; +} diff --git a/src/mpid/ch4/shm/posix/release_gather/release_gather.h b/src/mpid/ch4/shm/posix/release_gather/release_gather.h new file mode 100644 index 00000000000..1991526b0b8 --- /dev/null +++ b/src/mpid/ch4/shm/posix/release_gather/release_gather.h @@ -0,0 +1,342 @@ +/* + * (C) 2018 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * Portions of this code were written by Intel Corporation. + * Copyright (C) 2011-2017 Intel Corporation. Intel provides this material + * to Argonne National Laboratory subject to Software Grant and Corporate + * Contributor License Agreement dated February 8, 2012. + */ + +#ifndef RELEASE_GATHER_H_INCLUDED +#define RELEASE_GATHER_H_INCLUDED + +extern zm_atomic_uint_t *MPIDI_POSIX_shm_limit_counter; +extern MPL_shm_hnd_t shm_limit_handle; +extern MPIDI_POSIX_release_gather_tree_type_t MPIDI_POSIX_Bcast_tree_type, + MPIDI_POSIX_Reduce_tree_type; + +/*Blocking wait implementation*/ +/* zm_memord_acquire makes sure no writes/reads are reordered before this load */ +#define MPIDI_POSIX_RELEASE_GATHER_WAIT_WHILE_LESS_THAN(ptr, value) \ + do { \ + int spin_count = 0; \ + while (zm_atomic_load(ptr, zm_memord_acquire) < (value)) { \ + if (++spin_count >= 10000) { \ + /* Call progress only after waiting for a while */ \ + MPID_Progress_test(); \ + spin_count = 0; \ + } \ + } \ + } \ + while (0) +#define MPIDI_POSIX_RELEASE_GATHER_FLAG_SIZE (sizeof(zm_atomic_uint_t)) +/* 1 cache_line each for gather and release flag */ +#define MPIDI_POSIX_RELEASE_GATHER_FLAG_SPACE_PER_RANK (MPIDU_SHM_CACHE_LINE_LEN * 2) +#define MPIDI_POSIX_RELEASE_GATHER_GATHER_FLAG_OFFSET (0) +#define MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_OFFSET (MPIDU_SHM_CACHE_LINE_LEN) +#define MPIDI_POSIX_RELEASE_GATHER_GATHER_FLAG_ADDR(rank) \ + (((zm_atomic_uint_t *)release_gather_info_ptr->flags_addr) + \ + ((rank * MPIDI_POSIX_RELEASE_GATHER_FLAG_SPACE_PER_RANK + MPIDI_POSIX_RELEASE_GATHER_GATHER_FLAG_OFFSET)/(MPIDI_POSIX_RELEASE_GATHER_FLAG_SIZE))) +#define MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_ADDR(rank) \ + (((zm_atomic_uint_t *)release_gather_info_ptr->flags_addr) + \ + ((rank * MPIDI_POSIX_RELEASE_GATHER_FLAG_SPACE_PER_RANK + MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_OFFSET)/(MPIDI_POSIX_RELEASE_GATHER_FLAG_SIZE))) +#define MPIDI_POSIX_RELEASE_GATHER_BCAST_DATA_ADDR(buf) \ + (char *) release_gather_info_ptr->bcast_buf_addr + \ + (buf * MPIDI_POSIX_RELEASE_GATHER_BCAST_CELLSIZE) +#define MPIDI_POSIX_RELEASE_GATHER_REDUCE_DATA_ADDR(rank, buf) \ + (((char *) release_gather_info_ptr->reduce_buf_addr) + \ + (rank * MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE) + \ + (buf * MPIDI_POSIX_RELEASE_GATHER_REDUCE_CELLSIZE)) +#define MPIDI_POSIX_RELEASE_GATHER_BCAST_CELLSIZE \ + (MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE / MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS) +#define MPIDI_POSIX_RELEASE_GATHER_REDUCE_CELLSIZE \ + (MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE / MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS) + +int MPIDI_POSIX_mpi_release_gather_comm_init_null(MPIR_Comm * comm_ptr); +int MPIDI_POSIX_mpi_release_gather_comm_init(MPIR_Comm * comm_ptr, + const MPIDI_POSIX_release_gather_opcode_t operation); +int MPIDI_POSIX_mpi_release_gather_comm_free(MPIR_Comm * comm_ptr); + +/* Release step of the release_gather framework. This is top-down step in the release_tree. + * Parent notifies the children to go, once it arrives. In case of Bcast, root places the data in + * shm bcast buffer before notifying the children. Children copy the data out of shm buffer when + * notified by the parent */ +MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_release_gather_release(void *local_buf, + const int count, + MPI_Datatype datatype, + const int root, + MPIR_Comm * comm_ptr, + MPIR_Errflag_t * errflag, + const + MPIDI_POSIX_release_gather_opcode_t + operation) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_RELEASE_GATHER_RELEASE); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_RELEASE_GATHER_RELEASE); + + int mpi_errno = MPI_SUCCESS; + int mpi_errno_ret = MPI_SUCCESS; + MPIDI_POSIX_release_gather_comm_t *release_gather_info_ptr; + int segment, rank; + void *bcast_data_addr = NULL; + volatile zm_atomic_uint_t *parent_flag_addr; + /* Set the relaxation to 0 because in Bcast, gather step is "relaxed" to make sure multiple + * buffers can be used to pipeline the copying in and out of shared memory, and data is not + * overwritten */ + const int relaxation = + (operation == + MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST) ? 0 : MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS - 1; + + rank = MPIR_Comm_rank(comm_ptr); + release_gather_info_ptr = MPIDI_POSIX_COMM(comm_ptr)->release_gather; + + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST) { + segment = release_gather_info_ptr->release_state % MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS; + bcast_data_addr = MPIDI_POSIX_RELEASE_GATHER_BCAST_DATA_ADDR(segment); + + if (root != 0) { + /* Root sends data to rank 0 */ + if (rank == root) { + mpi_errno = + MPIC_Send(local_buf, count, datatype, 0, MPIR_BCAST_TAG, comm_ptr, errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } else if (rank == 0) { + mpi_errno = + MPIC_Recv(bcast_data_addr, count, datatype, root, MPIR_BCAST_TAG, comm_ptr, + MPI_STATUS_IGNORE, errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } else if (rank == 0) { + mpi_errno = MPIR_Localcopy(local_buf, count, datatype, + bcast_data_addr, count, datatype); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } + + release_gather_info_ptr->release_state++; + + if (rank == 0) { + /* Rank 0 updates its flag when it arrives and data is ready in shm buffer (if bcast) */ + /* zm_memord_release makes sure that the write of data does not get reordered after this + * store */ + zm_atomic_store((release_gather_info_ptr->release_flag_addr), + release_gather_info_ptr->release_state, zm_memord_release); + } else { + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST) { + parent_flag_addr = + MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_ADDR(release_gather_info_ptr-> + bcast_tree.parent); + } else { + parent_flag_addr = + MPIDI_POSIX_RELEASE_GATHER_RELEASE_FLAG_ADDR(release_gather_info_ptr-> + reduce_tree.parent); + } + + /* Wait until the parent has updated its flag */ + MPIDI_POSIX_RELEASE_GATHER_WAIT_WHILE_LESS_THAN(parent_flag_addr, + release_gather_info_ptr->release_state - + relaxation); + /* Update its own flag */ + /* zm_memord_release makes sure that the read of parent's flag does not get reordered after + * this store */ + zm_atomic_store((release_gather_info_ptr->release_flag_addr), + release_gather_info_ptr->release_state, zm_memord_release); + } + + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST) { + /* Non-root ranks copy data from shm buffer to user buffer */ + if (rank != root) { + MPIR_ERR_CHKANDJUMP(!bcast_data_addr, mpi_errno, MPI_ERR_OTHER, "**nomem"); + mpi_errno = MPIR_Localcopy(bcast_data_addr, count, datatype, + local_buf, count, datatype); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_RELEASE_GATHER_RELEASE); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +/* Gather step of the release_gather framework. This is bottom-up step in the gather_tree. + * Children notify the parent when it arrives. In case of Reduce, each rank places its data in shm + * reduce buffer. A parent reduces all its children data with its own before notifying its parent. */ +MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_release_gather_gather(const void *inbuf, void *outbuf, + const int count, + MPI_Datatype datatype, MPI_Op op, + const int root, + MPIR_Comm * comm_ptr, + MPIR_Errflag_t * errflag, + const + MPIDI_POSIX_release_gather_opcode_t + operation) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_RELEASE_GATHER_GATHER); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_RELEASE_GATHER_GATHER); + + MPIDI_POSIX_release_gather_comm_t *release_gather_info_ptr; + int segment, rank, num_children; + volatile void *child_data_addr; + volatile zm_atomic_uint_t child_gather_flag, *child_flag_addr; + volatile void *reduce_data_addr = NULL; + int i, mpi_errno = MPI_SUCCESS, mpi_errno_ret = MPI_SUCCESS; + bool skip_checking = false; + /* Set the relaxation to 0 because in Reduce, release step is "relaxed" to make sure multiple + * buffers can be used to pipeline the copying in and out of shared memory, and data is not + * overwritten */ + const int relaxation = + (operation == + MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) ? 0 : MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS - 1; + zm_atomic_uint_t min_gather; + UT_array *children; + + release_gather_info_ptr = MPIDI_POSIX_COMM(comm_ptr)->release_gather; + segment = release_gather_info_ptr->gather_state % MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS; + children = release_gather_info_ptr->bcast_tree.children; + num_children = release_gather_info_ptr->bcast_tree.num_children; + rank = MPIR_Comm_rank(comm_ptr); + + release_gather_info_ptr->gather_state++; + min_gather = release_gather_info_ptr->gather_state; + + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) { + if (rank == 0) { + /* Rank 0 reduces the data directly in its outbuf. Copy the data from inbuf to outbuf + * if needed */ + if (inbuf != outbuf) { + mpi_errno = MPIR_Localcopy(inbuf, count, datatype, outbuf, count, datatype); + } + reduce_data_addr = outbuf; + } else { + reduce_data_addr = MPIDI_POSIX_RELEASE_GATHER_REDUCE_DATA_ADDR(rank, segment); + /* Copy data from user buffer to shared buffer */ + mpi_errno = + MPIR_Localcopy(inbuf, count, datatype, (void *) reduce_data_addr, count, datatype); + } + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + num_children = release_gather_info_ptr->reduce_tree.num_children; + children = release_gather_info_ptr->reduce_tree.children; + } + + /* Avoid checking for availabilty of next buffer if it is guaranteed to be available */ + /* zm_memord_acquire makes sure no writes/reads are reordered before this load */ + if ((operation != MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) && + (zm_atomic_load(release_gather_info_ptr->gather_flag_addr, zm_memord_acquire)) >= + (release_gather_info_ptr->gather_state - relaxation)) { + skip_checking = true; + } + + /* Leaf nodes never skip checking */ + if (num_children == 0 || !skip_checking) { + for (i = 0; i < num_children; i++) { + MPIR_ERR_CHKANDJUMP(!utarray_eltptr(children, i), mpi_errno, MPI_ERR_OTHER, "**nomem"); + child_flag_addr = + MPIDI_POSIX_RELEASE_GATHER_GATHER_FLAG_ADDR(*utarray_eltptr(children, i)); + /* Wait until the child has arrived */ + MPIDI_POSIX_RELEASE_GATHER_WAIT_WHILE_LESS_THAN(child_flag_addr, + release_gather_info_ptr->gather_state - + relaxation); + + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) { + child_data_addr = + (char *) release_gather_info_ptr->child_reduce_buf_addr[i] + + segment * MPIDI_POSIX_RELEASE_GATHER_REDUCE_CELLSIZE; + /* zm_memord_acquire in MPIDI_POSIX_RELEASE_GATHER_WAIT_WHILE_LESS_THAN makes sure + * that the reduce_local call does not get reordered before read of children's flag + * in MPIDI_POSIX_RELEASE_GATHER_WAIT_WHILE_LESS_THAN */ + mpi_errno = + MPIR_Reduce_local((void *) child_data_addr, (void *) reduce_data_addr, + count, datatype, op); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + /* Read child_flag_addr which 'may' be larger than the strongest waiting condition + * so, it is safe */ + child_gather_flag = *child_flag_addr; + min_gather = MPL_MIN(child_gather_flag, min_gather); + } + /* zm_memord_release makes sure that the write of data (reduce_local) does not get + * reordered after this store */ + zm_atomic_store((release_gather_info_ptr->gather_flag_addr), min_gather, zm_memord_release); + } + + if (operation == MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE) { + if (root != 0) { + /* send-recv between root and rank 0 */ + if (rank == root) { + mpi_errno = + MPIC_Recv(outbuf, count, datatype, 0, MPIR_REDUCE_TAG, comm_ptr, + MPI_STATUS_IGNORE, errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } else if (rank == 0) { + MPIR_ERR_CHKANDJUMP(!reduce_data_addr, mpi_errno, MPI_ERR_OTHER, "**nomem"); + mpi_errno = + MPIC_Send((void *) reduce_data_addr, count, datatype, root, MPIR_REDUCE_TAG, + comm_ptr, errflag); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } + /* No data copy is required if root was rank 0, because it reduced the data directly in its + * outbuf */ + } + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_RELEASE_GATHER_GATHER); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +#endif /* RELEASE_GATHER_H_INCLUDED */ diff --git a/src/mpid/ch4/shm/posix/release_gather/release_gather_types.h b/src/mpid/ch4/shm/posix/release_gather/release_gather_types.h new file mode 100644 index 00000000000..2e722896107 --- /dev/null +++ b/src/mpid/ch4/shm/posix/release_gather/release_gather_types.h @@ -0,0 +1,40 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ +/* + * (C) 2018 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * Portions of this code were written by Intel Corporation. + * Copyright (C) 2011-2017 Corporation. Intel provides this material + * to Argonne National Laboratory subject to Software Grant and Corporate + * Contributor License Agreement dated February 8, 2012. + */ + +#ifndef RELEASE_GATHER_TYPES_H_INCLUDED +#define RELEASE_GATHER_TYPES_H_INCLUDED + +#include "treealgo_types.h" +#include "common/zm_common.h" + +typedef enum { + MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE, +} MPIDI_POSIX_release_gather_opcode_t; + +typedef enum MPIDI_POSIX_release_gather_tree_type_t { + MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KARY, + MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_1, + MPIDI_POSIX_RELEASE_GATHER_TREE_TYPE_KNOMIAL_2, +} MPIDI_POSIX_release_gather_tree_type_t; + +typedef struct MPIDI_POSIX_release_gather_comm_t { + MPIR_Treealgo_tree_t bcast_tree, reduce_tree; + MPL_shm_hnd_t shm_flags_handle, shm_bcast_buf_handle, shm_reduce_buf_handle; + int flags_shm_size; + uint64_t gather_state, release_state; + + volatile void *flags_addr, *bcast_buf_addr, *reduce_buf_addr; + volatile void **child_reduce_buf_addr; + volatile zm_atomic_uint_t *release_flag_addr, *gather_flag_addr; +} MPIDI_POSIX_release_gather_comm_t; + +#endif /* RELEASE_GATHER_TYPES_H_INCLUDED */ From a1ee2b34697337b6e42b14f3e6efbf3b5505281d Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Tue, 31 Jul 2018 13:35:02 -0500 Subject: [PATCH 10/13] posix: Implement shm bcast using release, gather Intra-node bcast is implemented using release step followed by gather step. Data movement takes place in release (top-down step) in the tree. Gather (bottom-up step) is used for acknowledgement. Non-roots notify the root that the data was copied out of shared bcast buffer and root can reuse the buffer for next bcast call. Bcast buffer is split into multiple cells, so that the copying in of the next chunk by root can be overlapped with copying out of previous chunks by non-roots (pipelining). Large messages are split into chunks of cell size each and pipelining is used. Signed-off-by: Yanfei Guo --- src/mpid/ch4/shm/posix/Makefile.mk | 3 + src/mpid/ch4/shm/posix/posix_coll.h | 15 ++ .../ch4/shm/posix/posix_coll_containers.h | 1 + .../shm/posix/posix_coll_globals_default.c | 4 + src/mpid/ch4/shm/posix/posix_coll_init.c | 9 +- src/mpid/ch4/shm/posix/posix_coll_params.h | 9 +- .../ch4/shm/posix/posix_coll_release_gather.h | 155 ++++++++++++++++++ src/mpid/ch4/shm/posix/posix_coll_select.h | 21 ++- 8 files changed, 213 insertions(+), 4 deletions(-) create mode 100644 src/mpid/ch4/shm/posix/posix_coll_release_gather.h diff --git a/src/mpid/ch4/shm/posix/Makefile.mk b/src/mpid/ch4/shm/posix/Makefile.mk index 06ee0ffcd02..39375e9c086 100644 --- a/src/mpid/ch4/shm/posix/Makefile.mk +++ b/src/mpid/ch4/shm/posix/Makefile.mk @@ -35,6 +35,9 @@ noinst_HEADERS += src/mpid/ch4/shm/posix/posix_am.h \ src/mpid/ch4/shm/posix/posix_proc.h \ src/mpid/ch4/shm/posix/posix_types.h +if ENABLE_IZEM_ATOMIC +noinst_HEADERS += src/mpid/ch4/shm/posix/posix_coll_release_gather.h +endif mpi_core_sources += src/mpid/ch4/shm/posix/globals.c \ src/mpid/ch4/shm/posix/posix_comm.c \ src/mpid/ch4/shm/posix/posix_init.c \ diff --git a/src/mpid/ch4/shm/posix/posix_coll.h b/src/mpid/ch4/shm/posix/posix_coll.h index 0e2340d1766..660ddd1fec5 100644 --- a/src/mpid/ch4/shm/posix/posix_coll.h +++ b/src/mpid/ch4/shm/posix/posix_coll.h @@ -16,6 +16,9 @@ #include "ch4_coll_select.h" #include "posix_coll_params.h" #include "posix_coll_select.h" +#ifdef ENABLE_IZEM_ATOMIC +#include "posix_coll_release_gather.h" +#endif static inline int MPIDI_POSIX_mpi_barrier(MPIR_Comm * comm, MPIR_Errflag_t * errflag, const void *ch4_algo_parameters_container_in) @@ -77,6 +80,18 @@ static inline int MPIDI_POSIX_mpi_bcast(void *buffer, int count, MPI_Datatype da MPIR_Bcast_intra_scatter_ring_allgather(buffer, count, datatype, root, comm, errflag); break; + case MPIDI_POSIX_Bcast_intra_release_gather_id: +#ifdef ENABLE_IZEM_ATOMIC + mpi_errno = + MPIDI_POSIX_mpi_bcast_release_gather(buffer, count, datatype, root, comm, errflag); +#else + /* else block is needed to keep the compiler happy */ + /* release_gather based algorithms cannot be used without izem submodule */ + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**noizem"); +#endif + break; + case MPIDI_POSIX_Bcast_intra_invalid_id: + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**noizem"); default: mpi_errno = MPIR_Bcast_impl(buffer, count, datatype, root, comm, errflag); break; diff --git a/src/mpid/ch4/shm/posix/posix_coll_containers.h b/src/mpid/ch4/shm/posix/posix_coll_containers.h index df87fba777a..3a0be269b9a 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_containers.h +++ b/src/mpid/ch4/shm/posix/posix_coll_containers.h @@ -10,6 +10,7 @@ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_scatter_recursive_doubling_allgather_cnt; extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_scatter_ring_allgather_cnt; extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_invalid_cnt; +extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_release_gather_cnt; /* Reduce POSIX containers declaration */ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_binomial_cnt; diff --git a/src/mpid/ch4/shm/posix/posix_coll_globals_default.c b/src/mpid/ch4/shm/posix/posix_coll_globals_default.c index e84a5261aca..c4fe3244134 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_globals_default.c +++ b/src/mpid/ch4/shm/posix/posix_coll_globals_default.c @@ -26,6 +26,10 @@ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_invalid_cnt = { .id = MPIDI_POSIX_Bcast_intra_invalid_id }; +const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_release_gather_cnt = { + .id = MPIDI_POSIX_Bcast_intra_release_gather_id +}; + /* Reduce default POSIX containers initialization*/ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_cnt = { .id = MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_id diff --git a/src/mpid/ch4/shm/posix/posix_coll_init.c b/src/mpid/ch4/shm/posix/posix_coll_init.c index 17287cdaa8b..59c5f944b5c 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_init.c +++ b/src/mpid/ch4/shm/posix/posix_coll_init.c @@ -25,7 +25,9 @@ scope : MPI_T_SCOPE_ALL_EQ description : >- Variable to select algorithm for intra-node bcast - auto - Internal algorithm selection from pt2pt based algorithms + auto - Internal algorithm selection from pt2pt based algorithms + release_gather - Force shm optimized algo using release, gather primitives + (izem submodule should be build and enabled for this) - name : MPIR_CVAR_REDUCE_POSIX_INTRA_ALGORITHM category : COLLECTIVE @@ -51,7 +53,10 @@ int collective_cvars_init(void) int mpi_errno = MPI_SUCCESS; /* Bcast */ - MPIDI_POSIX_Bcast_algo_choice = MPIDI_POSIX_Bcast_intra_auto_id; + if (0 == strcmp(MPIR_CVAR_BCAST_POSIX_INTRA_ALGORITHM, "release_gather")) + MPIDI_POSIX_Bcast_algo_choice = MPIDI_POSIX_Bcast_intra_release_gather_id; + else + MPIDI_POSIX_Bcast_algo_choice = MPIDI_POSIX_Bcast_intra_auto_id; /* Reduce */ MPIDI_POSIX_Reduce_algo_choice = MPIDI_POSIX_Reduce_intra_auto_id; diff --git a/src/mpid/ch4/shm/posix/posix_coll_params.h b/src/mpid/ch4/shm/posix/posix_coll_params.h index ed2fb2503ea..3cbbffdaa97 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_params.h +++ b/src/mpid/ch4/shm/posix/posix_coll_params.h @@ -17,7 +17,8 @@ typedef enum { MPIDI_POSIX_Bcast_intra_scatter_recursive_doubling_allgather_id, MPIDI_POSIX_Bcast_intra_scatter_ring_allgather_id, MPIDI_POSIX_Bcast_intra_auto_id, - MPIDI_POSIX_Bcast_intra_invalid_id + MPIDI_POSIX_Bcast_intra_invalid_id, + MPIDI_POSIX_Bcast_intra_release_gather_id } MPIDI_POSIX_Bcast_id_t; typedef union { @@ -26,6 +27,12 @@ typedef union { int radix; int block_size; } posix_bcast_knomial_parameters; + struct MPIDI_POSIX_Bcast_release_gather_parameters { + int radix; + int tree_type; + int buffer_size; + int num_buffers; + } posix_bcast_release_gather_parameters; struct MPIDI_POSIX_Bcast_empty_parameters { int empty; } posix_bcast_empty_parameters; diff --git a/src/mpid/ch4/shm/posix/posix_coll_release_gather.h b/src/mpid/ch4/shm/posix/posix_coll_release_gather.h new file mode 100644 index 00000000000..ae4a85bea07 --- /dev/null +++ b/src/mpid/ch4/shm/posix/posix_coll_release_gather.h @@ -0,0 +1,155 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ +/* + * + * (C) 2018 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef POSIX_COLL_RELEASE_GATHER_H_INCLUDED +#define POSIX_COLL_RELEASE_GATHER_H_INCLUDED + +#include "mpiimpl.h" +#include "algo_common.h" +#include "release_gather.h" + +/* Intra-node bcast is implemented as a release step followed by gather step in release_gather + * framework. The actual data movement happens in release step. Gather step makes sure that + * the shared bcast buffer can be reused for next bcast call. Release gather framework has + * multitple cells in bcast buffer, so that the copying in next cell can be overlapped with + * copying out of previous cells (pipelining). + */ +MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_bcast_release_gather(void *buffer, + int count, + MPI_Datatype datatype, + int root, MPIR_Comm * comm_ptr, + MPIR_Errflag_t * errflag) +{ + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_BCAST_RELEASE_GATHER); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_BCAST_RELEASE_GATHER); + + int i, my_rank, num_chunks, chunk_count_floor, chunk_count_ceil; + int offset = 0, is_contig, ori_count = count; + int mpi_errno = MPI_SUCCESS, mpi_errno_ret = MPI_SUCCESS; + MPI_Aint position; + MPI_Aint lb, true_extent, extent, type_size; + void *ori_buffer = buffer; + MPI_Datatype ori_datatype = datatype; + + /* If there is only one process or no data, return */ + if (count == 0 || (MPIR_Comm_size(comm_ptr) == 1)) { + goto fn_exit; + } + + /* Lazy initialization of release_gather specific struct */ + mpi_errno = + MPIDI_POSIX_mpi_release_gather_comm_init(comm_ptr, MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST); + if (mpi_errno) { + /* Fall back to other algo as release_gather based bcast cannot be used */ + mpi_errno = MPIR_Bcast_impl(buffer, count, datatype, root, comm_ptr, errflag); + goto fn_exit; + } + + my_rank = MPIR_Comm_rank(comm_ptr); + MPIR_Type_get_extent_impl(datatype, &lb, &extent); + MPIR_Type_get_true_extent_impl(datatype, &lb, &true_extent); + extent = MPL_MAX(extent, true_extent); + + MPIR_Datatype_is_contig(datatype, &is_contig); + + if (is_contig) { + MPIR_Datatype_get_size_macro(datatype, type_size); + } else { + MPIR_Pack_size_impl(1, datatype, &type_size); + } + + if (!is_contig || type_size >= MPIDI_POSIX_RELEASE_GATHER_BCAST_CELLSIZE) { + /* Convert to MPI_BYTE datatype */ + count = type_size * count; + datatype = MPI_BYTE; + type_size = 1; + extent = 1; + + if (!is_contig) { + buffer = MPL_malloc(count, MPL_MEM_COLL); + if (my_rank == root) { + /* Root packs the data before sending, for non contiguous datatypes */ + position = 0; + mpi_errno = + MPIR_Pack_impl(ori_buffer, ori_count, ori_datatype, buffer, count, &position); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + } + } + + /* Calculate chunking information for pipelining */ + MPIR_Algo_calculate_pipeline_chunk_info(MPIDI_POSIX_RELEASE_GATHER_BCAST_CELLSIZE, type_size, + count, &num_chunks, &chunk_count_floor, + &chunk_count_ceil); + /* Print chunking information */ + MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, + "Bcast shmgr pipeline info: segsize=%d count=%d num_chunks=%d chunk_count_floor=%d chunk_count_ceil=%d \n", + MPIDI_POSIX_RELEASE_GATHER_BCAST_CELLSIZE, count, + num_chunks, chunk_count_floor, chunk_count_ceil)); + + /* Do pipelined release-gather */ + for (i = 0; i < num_chunks; i++) { + int chunk_count = (i == 0) ? chunk_count_floor : chunk_count_ceil; + + mpi_errno = + MPIDI_POSIX_mpi_release_gather_release((char *) buffer + offset * extent, + chunk_count, datatype, root, comm_ptr, + errflag, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + mpi_errno = + MPIDI_POSIX_mpi_release_gather_gather(NULL, NULL, 0, MPI_DATATYPE_NULL, + MPI_OP_NULL, root, comm_ptr, errflag, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_BCAST); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + offset += chunk_count; + } + + if (!is_contig) { + if (my_rank != root) { + /* Non-root unpack the data if expecting non-contiguous datatypes */ + position = 0; + mpi_errno = + MPIR_Unpack_impl(buffer, count, &position, ori_buffer, ori_count, ori_datatype); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + } + MPL_free(buffer); + } + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_POSIX_MPI_BCAST_RELEASE_GATHER); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +#endif /* POSIX_COLL_RELEASE_GATHER_H_INCLUDED */ diff --git a/src/mpid/ch4/shm/posix/posix_coll_select.h b/src/mpid/ch4/shm/posix/posix_coll_select.h index e94c1a5499c..ad7cb656f5d 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_select.h +++ b/src/mpid/ch4/shm/posix/posix_coll_select.h @@ -26,8 +26,27 @@ MPIDI_POSIX_coll_algo_container_t *MPIDI_POSIX_Bcast_select(void *buffer, int nbytes = 0; MPI_Aint type_size = 0; - MPIR_Datatype_get_size_macro(datatype, type_size); + if (MPIDI_POSIX_Bcast_algo_choice == MPIDI_POSIX_Bcast_intra_release_gather_id) { + /* release_gather based algorithm can be used only if izem submodule is built (and enabled) + * and MPICH is not multi-threaded */ +#ifdef ENABLE_IZEM_ATOMIC +#ifdef MPICH_IS_THREADED + if (!MPIR_ThreadInfo.isThreaded) { + /* MPICH configured with threading support but not actually used */ + return &MPIDI_POSIX_Bcast_intra_release_gather_cnt; + } +#else + /* MPICH not configured with threading support */ + return &MPIDI_POSIX_Bcast_intra_release_gather_cnt; +#endif /* MPICH_IS_THREADED */ +#else + /* release_gather algo is chosen through CVAR but izem is not built */ + return &MPIDI_POSIX_Bcast_intra_invalid_cnt; +#endif /* ENABLE_IZEM_ATOMIC */ + } + /* Choose from pt2pt based algorithms */ + MPIR_Datatype_get_size_macro(datatype, type_size); nbytes = type_size * count; if ((nbytes < MPIR_CVAR_BCAST_SHORT_MSG_SIZE) || (comm->local_size < MPIR_CVAR_BCAST_MIN_PROCS)) { From c736c97b1d9c8849687dbad2dbe8dc8e1ade862d Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Tue, 12 Jun 2018 11:36:59 -0500 Subject: [PATCH 11/13] posix: Implement shm reduce using release, gather Intra-node reduce is implemented using release step followed by gather step. Data movement takes place in gather (bottom-up step) in the tree. Release (top-down) step is used for acknowledgement. Root notifies the non-roots that the data was reduced and copied out of its reduce buffer. Hence, children ranks can reuse the reduce buffer for next reduce call. There is a reduce shm buffer per rank, as each rank contributes data in reduce. Each buffer is split into multiple cells, so the copying in of the next chunk by children can be overlapped with reduce and copy out by the parent rank for the previous cells (pipelining). Large messages are split into chunks of cell size each and pipelining is used. Signed-off-by: Yanfei Guo --- src/mpid/ch4/shm/posix/posix_coll.h | 13 ++ .../ch4/shm/posix/posix_coll_containers.h | 3 +- .../shm/posix/posix_coll_globals_default.c | 4 + src/mpid/ch4/shm/posix/posix_coll_init.c | 9 +- src/mpid/ch4/shm/posix/posix_coll_params.h | 9 +- .../ch4/shm/posix/posix_coll_release_gather.h | 118 ++++++++++++++++++ src/mpid/ch4/shm/posix/posix_coll_select.h | 21 ++++ 7 files changed, 173 insertions(+), 4 deletions(-) diff --git a/src/mpid/ch4/shm/posix/posix_coll.h b/src/mpid/ch4/shm/posix/posix_coll.h index 660ddd1fec5..4cab059693b 100644 --- a/src/mpid/ch4/shm/posix/posix_coll.h +++ b/src/mpid/ch4/shm/posix/posix_coll.h @@ -592,6 +592,19 @@ static inline int MPIDI_POSIX_mpi_reduce(const void *sendbuf, void *recvbuf, int MPIR_Reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, errflag); break; + case MPIDI_POSIX_Reduce_intra_release_gather_id: +#ifdef ENABLE_IZEM_ATOMIC + mpi_errno = + MPIDI_POSIX_mpi_reduce_release_gather(sendbuf, recvbuf, count, datatype, + op, root, comm, errflag); +#else + /* else block is needed to keep the compiler happy */ + /* release_gather based algorithms cannot be used without izem submodule */ + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**noizem"); +#endif + break; + case MPIDI_POSIX_Reduce_intra_invalid_id: + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**noizem"); default: mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype, op, root, comm, errflag); diff --git a/src/mpid/ch4/shm/posix/posix_coll_containers.h b/src/mpid/ch4/shm/posix/posix_coll_containers.h index 3a0be269b9a..f335566d5c0 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_containers.h +++ b/src/mpid/ch4/shm/posix/posix_coll_containers.h @@ -15,7 +15,8 @@ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Bcast_intra_release_g /* Reduce POSIX containers declaration */ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_binomial_cnt; extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_cnt; -extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_reduce_invalid_cnt; +extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_invalid_cnt; +extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_release_gather_cnt; /* Allreduce POSIX containers declaration */ extern const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Allreduce_intra_recursive_doubling_cnt; diff --git a/src/mpid/ch4/shm/posix/posix_coll_globals_default.c b/src/mpid/ch4/shm/posix/posix_coll_globals_default.c index c4fe3244134..ab6f3ad95cd 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_globals_default.c +++ b/src/mpid/ch4/shm/posix/posix_coll_globals_default.c @@ -43,6 +43,10 @@ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_invalid_cnt = { .id = MPIDI_POSIX_Reduce_intra_invalid_id }; +const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Reduce_intra_release_gather_cnt = { + .id = MPIDI_POSIX_Reduce_intra_release_gather_id +}; + /* Allreduce default POSIX containers initialization*/ const MPIDI_POSIX_coll_algo_container_t MPIDI_POSIX_Allreduce_intra_recursive_doubling_cnt = { .id = MPIDI_POSIX_Allreduce_intra_recursive_doubling_id diff --git a/src/mpid/ch4/shm/posix/posix_coll_init.c b/src/mpid/ch4/shm/posix/posix_coll_init.c index 59c5f944b5c..2e6d61617a6 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_init.c +++ b/src/mpid/ch4/shm/posix/posix_coll_init.c @@ -38,7 +38,9 @@ scope : MPI_T_SCOPE_ALL_EQ description : >- Variable to select algorithm for intra-node reduce - auto - Internal algorithm selection from pt2pt based algorithms + auto - Internal algorithm selection from pt2pt based algorithms + release_gather - Force shm optimized algo using release, gather primitives + (izem submodule should be build and enabled for this) === END_MPI_T_CVAR_INFO_BLOCK === */ @@ -59,7 +61,10 @@ int collective_cvars_init(void) MPIDI_POSIX_Bcast_algo_choice = MPIDI_POSIX_Bcast_intra_auto_id; /* Reduce */ - MPIDI_POSIX_Reduce_algo_choice = MPIDI_POSIX_Reduce_intra_auto_id; + if (0 == strcmp(MPIR_CVAR_REDUCE_POSIX_INTRA_ALGORITHM, "release_gather")) + MPIDI_POSIX_Reduce_algo_choice = MPIDI_POSIX_Reduce_intra_release_gather_id; + else + MPIDI_POSIX_Reduce_algo_choice = MPIDI_POSIX_Reduce_intra_auto_id; fn_exit: return mpi_errno; diff --git a/src/mpid/ch4/shm/posix/posix_coll_params.h b/src/mpid/ch4/shm/posix/posix_coll_params.h index 3cbbffdaa97..bbf08fdd377 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_params.h +++ b/src/mpid/ch4/shm/posix/posix_coll_params.h @@ -44,11 +44,18 @@ typedef enum { MPIDI_POSIX_Reduce_intra_reduce_scatter_gather_id, MPIDI_POSIX_Reduce_intra_binomial_id, MPIDI_POSIX_Reduce_intra_auto_id, - MPIDI_POSIX_Reduce_intra_invalid_id + MPIDI_POSIX_Reduce_intra_invalid_id, + MPIDI_POSIX_Reduce_intra_release_gather_id } MPIDI_POSIX_Reduce_id_t; typedef union { /* reserved for parameters related to SHM specific collectives */ + struct MPIDI_POSIX_Reduce_release_gather_parameters { + int radix; + int tree_type; + int buffer_size; + int num_buffers; + } posix_reduce_release_gather_parameters; struct MPIDI_POSIX_Reduce_empty_parameters { int empty; } posix_reduce_empty_parameters; diff --git a/src/mpid/ch4/shm/posix/posix_coll_release_gather.h b/src/mpid/ch4/shm/posix/posix_coll_release_gather.h index ae4a85bea07..bb49b786409 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_release_gather.h +++ b/src/mpid/ch4/shm/posix/posix_coll_release_gather.h @@ -152,4 +152,122 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_bcast_release_gather(void *buffer, goto fn_exit; } +/* Intra-node reduce is implemented as a release step followed by gather step in release_gather + * framework. The actual data movement happens in gather step. Release step makes sure that + * the shared reduce buffer can be reused for next reduce call. Release gather framework has + * multitple cells in reduce buffer, so that the copying in next cell can be overlapped with + * reduction and copying out of previous cells (pipelining). + */ +MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_reduce_release_gather(const void *sendbuf, + void *recvbuf, int count, + MPI_Datatype datatype, + MPI_Op op, int root, + MPIR_Comm * comm_ptr, + MPIR_Errflag_t * errflag) +{ + int i, num_chunks, chunk_size_floor, chunk_size_ceil; + int offset = 0, is_contig; + int mpi_errno = MPI_SUCCESS, mpi_errno_ret = MPI_SUCCESS; + MPI_Aint lb, true_extent, extent, type_size; + + MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_POSIX_MPI_REDUCE_RELEASE_GATHER); + MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_POSIX_MPI_REDUCE_RELEASE_GATHER); + + /* If there is no data, return */ + if (count == 0) { + goto fn_exit; + } + + if ((MPIR_Comm_size(comm_ptr) == 1) && (sendbuf != MPI_IN_PLACE)) { + /* Simply copy the data from sendbuf to recvbuf if there is only 1 rank and MPI_IN_PLACE + * is not used */ + mpi_errno = MPIR_Localcopy(sendbuf, count, datatype, recvbuf, count, datatype); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + goto fn_exit; + } + + /* Lazy initialization of release_gather specific struct */ + mpi_errno = + MPIDI_POSIX_mpi_release_gather_comm_init(comm_ptr, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE); + if (mpi_errno) { + /* Fall back to other algo as release_gather algo cannot be used */ + mpi_errno = + MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag); + goto fn_exit; + } + + MPIR_Type_get_extent_impl(datatype, &lb, &extent); + MPIR_Type_get_true_extent_impl(datatype, &lb, &true_extent); + extent = MPL_MAX(extent, true_extent); + + MPIR_Datatype_is_contig(datatype, &is_contig); + + if (is_contig) { + MPIR_Datatype_get_size_macro(datatype, type_size); + } else { + MPIR_Pack_size_impl(1, datatype, &type_size); + } + + if (sendbuf == MPI_IN_PLACE) { + sendbuf = recvbuf; + } + + /* Calculate chunking information, taking max(extent, type_size) handles contiguous and non-contiguous datatypes both */ + MPIR_Algo_calculate_pipeline_chunk_info(MPIDI_POSIX_RELEASE_GATHER_REDUCE_CELLSIZE, + MPL_MAX(extent, type_size), count, &num_chunks, + &chunk_size_floor, &chunk_size_ceil); + + /* Print chunking information */ + MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, + "Reduce shmgr pipeline info: segsize=%d count=%d num_chunks=%d chunk_size_floor=%d chunk_size_ceil=%d \n", + MPIDI_POSIX_RELEASE_GATHER_REDUCE_CELLSIZE, count, + num_chunks, chunk_size_floor, chunk_size_ceil)); + + /* Do pipelined release-gather */ + for (i = 0; i < num_chunks; i++) { + int chunk_count = (i == 0) ? chunk_size_floor : chunk_size_ceil; + + mpi_errno = + MPIDI_POSIX_mpi_release_gather_release(NULL, 0, MPI_DATATYPE_NULL, root, + comm_ptr, errflag, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + + mpi_errno = + MPIDI_POSIX_mpi_release_gather_gather((char *) sendbuf + offset * extent, + (char *) recvbuf + offset * extent, + chunk_count, datatype, op, root, comm_ptr, + errflag, + MPIDI_POSIX_RELEASE_GATHER_OPCODE_REDUCE); + if (mpi_errno) { + /* for communication errors, just record the error but continue */ + *errflag = + MPIX_ERR_PROC_FAILED == + MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; + MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); + MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); + } + offset += chunk_count; + } + + fn_exit: + MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_POSIX_MPI_REDUCE_RELEASE_GATHER); + return mpi_errno; + fn_fail: + goto fn_exit; +} + #endif /* POSIX_COLL_RELEASE_GATHER_H_INCLUDED */ diff --git a/src/mpid/ch4/shm/posix/posix_coll_select.h b/src/mpid/ch4/shm/posix/posix_coll_select.h index ad7cb656f5d..d6aa56a4a2b 100644 --- a/src/mpid/ch4/shm/posix/posix_coll_select.h +++ b/src/mpid/ch4/shm/posix/posix_coll_select.h @@ -101,6 +101,27 @@ MPIDI_POSIX_coll_algo_container_t *MPIDI_POSIX_Reduce_select(const void *sendbuf MPI_Aint type_size = 0; int pof2 = 0; + if (MPIDI_POSIX_Reduce_algo_choice == MPIDI_POSIX_Reduce_intra_release_gather_id && + MPIR_Op_is_commutative(op)) { + /* release_gather based algorithm can be used only if izem submodule is built (and enabled) + * and MPICH is not multi-threaded. Also when the op is commutative */ +#ifdef ENABLE_IZEM_ATOMIC +#ifdef MPICH_IS_THREADED + if (!MPIR_ThreadInfo.isThreaded) { + /* MPICH configured with threading support but not actually used */ + return &MPIDI_POSIX_Reduce_intra_release_gather_cnt; + } +#else + /* MPICH not configured with threading support */ + return &MPIDI_POSIX_Reduce_intra_release_gather_cnt; +#endif /* MPICH_IS_THREADED */ +#else + /* release_gather algo is chosen through CVAR but izem is not built */ + return &MPIDI_POSIX_Reduce_intra_invalid_cnt; +#endif /* ENABLE_IZEM_ATOMIC */ + } + + /* Choose from pt2pt based algorithms */ MPIR_Datatype_get_size_macro(datatype, type_size); pof2 = comm->pof2; if ((count * type_size > MPIR_CVAR_REDUCE_SHORT_MSG_SIZE) && From b7b4aaa1a9fefc825f8519cddc9c28ce36a7db1f Mon Sep 17 00:00:00 2001 From: Surabhi Jain Date: Fri, 7 Sep 2018 17:23:32 -0500 Subject: [PATCH 12/13] test: Add bcast, reduce tests for newly added CVARS Run a few bcast and reduce tests by varying the CVARS for multiple buffer sizes and type, radix of trees. Signed-off-by: Yanfei Guo --- test/mpi/coll/test_coll_algos.sh | 58 ++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/test/mpi/coll/test_coll_algos.sh b/test/mpi/coll/test_coll_algos.sh index 8fdf7de38ca..d86bdb4a5bc 100755 --- a/test/mpi/coll/test_coll_algos.sh +++ b/test/mpi/coll/test_coll_algos.sh @@ -443,4 +443,62 @@ for algo_name in ${algo_names}; do coll_algo_tests+="neighb_alltoallv 4 mpiversion=3.0 ${env}${nl}" done +########## Add tests for intra-node bcast algorithms ############ + +#use release gather based intra-node bcast +testing_env="env=MPIR_CVAR_BCAST_POSIX_INTRA_ALGORITHM=release_gather " + +testing_env+="env=MPIR_CVAR_COLL_SHM_LIMIT_PER_NODE=131072 " #128MB +buffer_sizes="16384 32768" +num_cells="2 4" +tree_types="knomial_1 knomial_2" +kvalues="8 64" + +for buf_size in ${buffer_sizes}; do + for num_cell in ${num_cells}; do + for kval in ${kvalues}; do + for tree_type in ${tree_types}; do + #set the environment + env="${testing_env} env=MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE=${buf_size} " + env+="env=MPIR_CVAR_BCAST_INTRANODE_NUM_CELLS=${num_cell} " + env+="env=MPIR_CVAR_BCAST_INTRANODE_TREE_KVAL=${kval} " + env+="env=MPIR_CVAR_BCAST_INTRANODE_TREE_TYPE=${tree_type} " + + coll_algo_tests+="bcasttest 10 ${env}${nl}" + coll_algo_tests+="bcastzerotype 5 ${env}${nl}" + done + done + done +done + +########## Add tests for intra-node reduce algorithms ############ + +#use release gather based intra-node reduce +testing_env="env=MPIR_CVAR_REDUCE_POSIX_INTRA_ALGORITHM=release_gather " + +testing_env+="env=MPIR_CVAR_COLL_SHM_LIMIT_PER_NODE=131072 " #128MB +buffer_sizes="16384 32768" +num_cells="2 4" +tree_types="knomial_1 knomial_2" +kvalues="4 8" + +for buf_size in ${buffer_sizes}; do + for num_cell in ${num_cells}; do + for kval in ${kvalues}; do + for tree_type in ${tree_types}; do + #set the environment + env="${testing_env} env=MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE=${buf_size} " + env+="env=MPIR_CVAR_REDUCE_INTRANODE_NUM_CELLS=${num_cell} " + env+="env=MPIR_CVAR_REDUCE_INTRANODE_TREE_KVAL=${kval} " + env+="env=MPIR_CVAR_REDUCE_INTRANODE_TREE_TYPE=${tree_type} " + + coll_algo_tests+="reduce 5 ${env}${nl}" + coll_algo_tests+="reduce 10 ${env}${nl}" + coll_algo_tests+="red3 10 ${env}${nl}" + coll_algo_tests+="red4 10 ${env}${nl}" + done + done + done +done + export coll_algo_tests From 703b78e6d2f0821edeedf4430e241524296da724 Mon Sep 17 00:00:00 2001 From: Yanfei Guo Date: Tue, 23 Apr 2019 13:03:24 -0500 Subject: [PATCH 13/13] test: xfail coll algo tests using release_gather for intra-node The algorithm is expected to fail since izem is not used by default. This commit is a temporary measure until we decide what to do with enabling izem by default or bring izem functionalities into OPA/MPL. No reviewer. --- test/mpi/maint/jenkins/xfail.conf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/mpi/maint/jenkins/xfail.conf b/test/mpi/maint/jenkins/xfail.conf index ff01044d9f3..6b3d78298eb 100644 --- a/test/mpi/maint/jenkins/xfail.conf +++ b/test/mpi/maint/jenkins/xfail.conf @@ -206,3 +206,6 @@ valgrind * * * * sed -i "s+\(^accfence1 .*\)+\1 xfail=ticket0+g" test/mpi/rma/te * * debug ch4:ucx * sed -i "s+\(^rqfreeb .*\)+\1 xfail=ticket0+g" test/mpi/pt2pt/testlist # Bug - Github Issue https://github.com/pmodels/mpich/issues/3618 * * * ch4:ucx * sed -i "s+\(^darray_pack .*\)+\1 xfail=ticket0+g" test/mpi/datatype/testlist +# release_gather intra-node coll algorithms relies on izem which is not default. +* * * * * sed -i "s+\(env=MPIR_CVAR_BCAST_POSIX_INTRA_ALGORITHM=release_gather .*\)+xfail=ticket0+g" test/mpi/coll/testlist +* * * * * sed -i "s+\(env=MPIR_CVAR_REDUCE_POSIX_INTRA_ALGORITHM=release_gather .*\)+xfail=ticket0+g" test/mpi/coll/testlist