Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delete topo map #19

Open
wants to merge 37 commits into
base: huawei
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
d9ff93b
FEAT: HMPI INITIAL COMMIT
nsosnsos Oct 20, 2020
affeec0
UCG: fix configure to include UCG
alex--m Nov 1, 2020
4980a36
UCG: support for non-contiguous datatypes
alex--m Nov 1, 2020
8b48192
Merge pull request #1 from kunpengcompute/topic/configure_ucg_fix
nsosnsos Nov 2, 2020
3fc5793
Merge pull request #2 from kunpengcompute/topic/non_contig_datatypes
nsosnsos Nov 16, 2020
4b6ac18
fix format error
zheng871026 Nov 16, 2020
698d3cf
Merge pull request #3 from zheng871026/huawei
nsosnsos Nov 16, 2020
16f926b
fix for allreduce non-contiguous datatypes
zheng871026 Nov 20, 2020
3c64389
Merge pull request #4 from zheng871026/huawei
nsosnsos Nov 21, 2020
9b6d056
fix cleancode
zheng871026 Dec 2, 2020
b0538eb
Merge pull request #5 from zheng871026/huawei
nsosnsos Dec 2, 2020
961eda6
FIX: REMOVE TRAILING SPACES
nsosnsos Dec 4, 2020
67e322f
Allreduce support non-contiguous datatype
shizhibao Dec 6, 2020
4481e72
Merge pull request #6 from shizhibao/huawei
nsosnsos Dec 6, 2020
9d68b69
increase the check of data size
RainybIue Dec 8, 2020
386000f
Merge pull request #7 from RainybIue/huawei
nsosnsos Dec 9, 2020
a8fca2d
modify the format and add some notes
RainybIue Dec 9, 2020
9a64c16
Merge pull request #9 from RainybIue/huawei
nsosnsos Dec 9, 2020
3409554
Support allreduce non-contiguous datatype
shizhibao Dec 10, 2020
e5c8471
Merge pull request #10 from shizhibao/huawei
nsosnsos Dec 10, 2020
db86411
Fix allreduce empty datatype bug
shizhibao Dec 16, 2020
37d97bf
Merge pull request #11 from shizhibao/huawei
nsosnsos Dec 17, 2020
3147fac
solve the init topo map fault
zheng871026 Dec 17, 2020
a5e67b5
Merge pull request #12 from zheng871026/huawei
nsosnsos Dec 17, 2020
7bb2bb6
Solve the problem of program jam when one socket is balanced and the …
RainybIue Dec 18, 2020
ede8e63
Merge pull request #13 from RainybIue/huawei
nsosnsos Dec 18, 2020
3120765
Fixed a bug that failed to call allreduce
RainybIue Dec 21, 2020
4767dab
Merge pull request #14 from RainybIue/huawei
nsosnsos Dec 21, 2020
1f81ff8
Fix bug in non-contiguous datatype
RainybIue Dec 28, 2020
d002962
Merge pull request #15 from zheng871026/huawei
nsosnsos Dec 28, 2020
18e9d1c
Fix bug in discontig datatype
RainybIue Dec 30, 2020
e173486
Merge pull request #16 from zheng871026/huawei
nsosnsos Dec 30, 2020
5c6dbec
MCA/COLL/UCX: Change UCG API, Improve code structure
RainybIue Feb 7, 2021
d37e4bc
Merge pull request #17 from RainybIue/huawei
nsosnsos Feb 20, 2021
06b6116
MCA/COLL/UCX: Optimize part of the code format
RainybIue Feb 22, 2021
a8a818e
Merge pull request #18 from RainybIue/huawei
zheng871026 Feb 22, 2021
41098d0
Delete topo_map
shizhibao Mar 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ Copyright (c) 2017-2018 Amazon.com, Inc. or its affiliates. All Rights
reserved.
Copyright (c) 2019 Triad National Security, LLC. All rights
reserved.
Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights
reserved.

$COPYRIGHT$

Expand Down
2 changes: 2 additions & 0 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ Copyright (c) 2017 Research Organization for Information Science
and Technology (RIST). All rights reserved.
Copyright (c) 2019 Triad National Security, LLC. All rights
reserved.
Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights
reserved.

$COPYRIGHT$

Expand Down
34 changes: 30 additions & 4 deletions config/ompi_check_ucx.m4
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
#
# $HEADER$
#
#2020.06.09-Changed process for coll_ucx
# Huawei Technologies Co., Ltd. 2020.
#


# OMPI_CHECK_UCX(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
Expand Down Expand Up @@ -41,6 +45,7 @@ AC_DEFUN([OMPI_CHECK_UCX],[
[ompi_check_ucx_dir=])],
[true])])
ompi_check_ucx_happy="no"
ompi_check_ucg_happy="no"
AS_IF([test -z "$ompi_check_ucx_dir"],
[OPAL_CHECK_PACKAGE([ompi_check_ucx],
[ucp/api/ucp.h],
Expand All @@ -51,6 +56,15 @@ AC_DEFUN([OMPI_CHECK_UCX],[
[],
[ompi_check_ucx_happy="yes"],
[ompi_check_ucx_happy="no"])
OPAL_CHECK_PACKAGE([ompi_check_ucg],
[ucg/api/ucg.h],
[ucg],
[ucg_collective_destroy],
[-lucp -luct -lucm -lucs],
[],
[],
[ompi_check_ucg_happy="yes"],
[ompi_check_ucg_happy="no"])
AS_IF([test "$ompi_check_ucx_happy" = yes],
[AC_MSG_CHECKING(for UCX version compatibility)
AC_REQUIRE_CPP
Expand Down Expand Up @@ -83,6 +97,15 @@ AC_DEFUN([OMPI_CHECK_UCX],[
[$ompi_check_ucx_libdir],
[ompi_check_ucx_happy="yes"],
[ompi_check_ucx_happy="no"])
OPAL_CHECK_PACKAGE([ompi_check_ucg],
[ucg/api/ucg.h],
[ucg],
[ucg_collective_destroy],
[-lucp -luct -lucm -lucs],
[$ompi_check_ucx_dir],
[$ompi_check_ucx_libdir],
[ompi_check_ucg_happy="yes"],
[ompi_check_ucg_happy="no"])

CPPFLAGS="$ompi_check_ucx_$1_save_CPPFLAGS"
LDFLAGS="$ompi_check_ucx_$1_save_LDFLAGS"
Expand Down Expand Up @@ -133,10 +156,13 @@ AC_DEFUN([OMPI_CHECK_UCX],[
OPAL_SUMMARY_ADD([[Transports]],[[Open UCX]],[$1],[$ompi_check_ucx_happy])])])

AS_IF([test "$ompi_check_ucx_happy" = "yes"],
[$1_CPPFLAGS="[$]$1_CPPFLAGS $ompi_check_ucx_CPPFLAGS"
$1_LDFLAGS="[$]$1_LDFLAGS $ompi_check_ucx_LDFLAGS"
$1_LIBS="[$]$1_LIBS $ompi_check_ucx_LIBS"
AC_DEFINE([HAVE_UCX], [1], [have ucx])
[AS_IF([test "$ompi_check_ucg_happy" = "yes"],
[$1_CPPFLAGS="[$]$1_CPPFLAGS $ompi_check_ucg_CPPFLAGS"
$1_LDFLAGS="[$]$1_LDFLAGS $ompi_check_ucg_LDFLAGS"
$1_LIBS="[$]$1_LIBS $ompi_check_ucg_LIBS"],
[$1_CPPFLAGS="[$]$1_CPPFLAGS $ompi_check_ucx_CPPFLAGS"
$1_LDFLAGS="[$]$1_LDFLAGS $ompi_check_ucx_LDFLAGS"
$1_LIBS="[$]$1_LIBS $ompi_check_ucx_LIBS"])
$2],
[AS_IF([test ! -z "$with_ucx" && test "$with_ucx" != "no"],
[AC_MSG_ERROR([UCX support requested but not found. Aborting])])
Expand Down
4 changes: 2 additions & 2 deletions contrib/platform/mellanox/optimized
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ else
enable_picky=no
enable_heterogeneous=no
enable_ft_thread=no
with_mpi_param_check=no
with_mpi_param_check=yes
CXXFLAGS="-O3 -g"
CCASFLAGS="-O3 -g"
FCFLAGS="-O3 -g"
CFLAGS="-O3 -g"
CFLAGS="-O3 -g $CFLAGS"
fi
51 changes: 51 additions & 0 deletions ompi/mca/coll/ucx/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# -*- shell-script -*-
#
#
# Copyright (c) 2011 Mellanox Technologies. All rights reserved.
# Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#

# Preprocessor flags for every source in this component: the coll_ucx_*
# flag variables plus the value of $(coll_ucx_HOME) exposed to the C code
# as the quoted string macro COLL_UCX_HOME.
AM_CPPFLAGS = $(coll_ucx_CPPFLAGS) -DCOLL_UCX_HOME=\"$(coll_ucx_HOME)\" $(coll_ucx_extra_CPPFLAGS)

#dist_ompidata_DATA = help-coll-ucx.txt

# Headers and sources shared by the DSO and the static flavour of the
# component.
coll_ucx_sources = \
        coll_ucx.h \
        coll_ucx_request.h \
        coll_ucx_datatype.h \
        coll_ucx_freelist.h \
        coll_ucx_op.c \
        coll_ucx_module.c \
        coll_ucx_request.c \
        coll_ucx_datatype.c \
        coll_ucx_component.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if MCA_BUILD_ompi_coll_ucx_DSO
component_noinst =
component_install = mca_coll_ucx.la
else
component_noinst = libmca_coll_ucx.la
component_install =
endif

# DSO flavour: installed into $(ompilibdir) and linked against the MPI
# library (its name is substituted by configure through
# @OMPI_LIBMPI_NAME@) and the common UCX support library.
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_coll_ucx_la_SOURCES = $(coll_ucx_sources)
mca_coll_ucx_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la $(coll_ucx_LIBS) \
        $(OPAL_TOP_BUILDDIR)/opal/mca/common/ucx/lib@OPAL_LIB_PREFIX@mca_common_ucx.la
mca_coll_ucx_la_LDFLAGS = -module -avoid-version $(coll_ucx_LDFLAGS)

# Static flavour: built but not installed on its own.
noinst_LTLIBRARIES = $(component_noinst)
libmca_coll_ucx_la_SOURCES = $(coll_ucx_sources)
libmca_coll_ucx_la_LIBADD = $(coll_ucx_LIBS)
libmca_coll_ucx_la_LDFLAGS = -module -avoid-version $(coll_ucx_LDFLAGS)
202 changes: 202 additions & 0 deletions ompi/mca/coll/ucx/coll_ucx.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
/**
Copyright (c) 2011 Mellanox Technologies. All rights reserved.
Copyright (c) 2015 Research Organization for Information Science
and Technology (RIST). All rights reserved.
Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved.
$COPYRIGHT$

Additional copyrights may follow

$HEADER$
*/

#ifndef MCA_COLL_UCX_H
#define MCA_COLL_UCX_H

#include "ompi_config.h"

#include "mpi.h"
#include "ompi/mca/mca.h"
#include "opal/memoryhooks/memory.h"
#include "opal/mca/memory/base/base.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/communicator/communicator.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/attribute/attribute.h"

#include "orte/runtime/orte_globals.h"
#include "ompi/datatype/ompi_datatype_internal.h"
#include "opal/mca/common/ucx/common_ucx.h"

#include "ucg/api/ucg_mpi.h"
#include "ucs/datastruct/list.h"
#include "coll_ucx_freelist.h"

/*
 * Fallback definition, used only when UCX_VERSION is not already defined:
 * combine a (major, minor) pair into a single integer by shifting each part
 * to its UCX_MAJOR_BIT / UCX_MINOR_BIT position and OR-ing the results.
 */
#if !defined(UCX_VERSION)
#  define UCX_VERSION(major, minor) \
       (((major) << UCX_MAJOR_BIT) | ((minor) << UCX_MINOR_BIT))
#endif

/* Component-local spellings of the MCA_COMMON_UCX_* assert/logging macros. */
#define COLL_UCX_ASSERT     MCA_COMMON_UCX_ASSERT
#define COLL_UCX_ERROR      MCA_COMMON_UCX_ERROR
#define COLL_UCX_WARN       MCA_COMMON_UCX_WARN
#define COLL_UCX_VERBOSE    MCA_COMMON_UCX_VERBOSE

BEGIN_C_DECLS

/*
 * Forward declarations: both structures are only named here; their
 * definitions are not part of this header.
 */
typedef struct coll_ucx_convertor     mca_coll_ucx_convertor_t;
typedef struct coll_ucx_persistent_op mca_coll_ucx_persistent_op_t;

/*
 * Topology level selector.  The enumerators are numbered consecutively
 * from 0 in declaration order (ROOT, NODE, SOCKET, L3CACHE).
 */
typedef enum {
    COLL_UCX_TOPO_LEVEL_ROOT    = 0,
    COLL_UCX_TOPO_LEVEL_NODE    = 1,
    COLL_UCX_TOPO_LEVEL_SOCKET  = 2,
    COLL_UCX_TOPO_LEVEL_L3CACHE = 3
} coll_ucx_topo_level_t;

/*
 * One node of the topology tree, stored as a union of two layouts:
 *   - inter: a rank count, a child count and a pointer to child nodes;
 *   - leaf:  a rank count together with a minimum and a maximum rank.
 * rank_nums is the first member of both layouts, so it shares storage.
 */
typedef union coll_ucx_topo_tree {
    /* layout carrying children */
    struct {
        int                       rank_nums;
        int                       child_nums;
        union coll_ucx_topo_tree *child;
    } inter;

    /* layout carrying a rank interval */
    struct {
        int rank_nums;
        int rank_min;
        int rank_max;
    } leaf;
} coll_ucx_topo_tree_t;

/*
 * Location of a rank packed into bit-fields: a 24-bit node identifier
 * followed by an 8-bit socket identifier.
 */
typedef struct {
    uint32_t node_id : 24;   /* node identifier, 24 bits */
    uint32_t sock_id : 8;    /* socket identifier, 8 bits */
} rank_location_t;

/*
 * Topology summary: rank/node/socket counters, a topology level, the root
 * of a topology tree (held by value) and a pointer to rank locations.
 */
typedef struct {
    int                    rank_nums;
    int                    node_nums;
    int                    sock_nums;
    coll_ucx_topo_level_t  level;
    coll_ucx_topo_tree_t   tree;
    rank_location_t       *locs;
} coll_ucx_topo_info_t;

/*
 * State of the UCX collectives component: the base MCA component record,
 * the MCA parameters, the UCX/UCG handles, request bookkeeping, datatype
 * tables and the converter pool.
 */
typedef struct mca_coll_ucx_component {
    /* Base MCA collectives component */
    mca_coll_base_component_t super;

    /* Values of the MCA parameters */
    int                       priority;
    int                       verbose;
    int                       num_disconnect;
    int                       topo_aware_level;

    /* Global UCX objects */
    ucp_context_h             ucp_context;
    ucp_worker_h              ucp_worker;
    ucg_context_h             ucg_context;
    ucg_group_h               ucg_group;
    int                       output;
    ucs_list_link_t           group_head;
    coll_ucx_topo_info_t      topo;

    /* Request handling */
    mca_coll_ucx_freelist_t   persistent_ops;
    ompi_request_t            completed_send_req;
    size_t                    request_size;

    /* Datatype handling */
    int                       datatype_attr_keyval;
    ucp_datatype_t            predefined_types[OMPI_DATATYPE_MPI_MAX_PREDEFINED];

    /* Pool of converters */
    mca_coll_ucx_freelist_t   convs;
} mca_coll_ucx_component_t;

/* The component instance itself is defined outside this header. */
OMPI_MODULE_DECLSPEC extern mca_coll_ucx_component_t mca_coll_ucx_component;

/*
 * Module object of the UCX collectives component: the base collective
 * module followed by a topology tree pointer, a UCG group handle and a
 * list link.
 */
typedef struct mca_coll_ucx_module {
    mca_coll_base_module_t  super;

    /* Topology info and op interface of the communicator */
    coll_ucx_topo_tree_t   *topo_tree;

    /* UCX context of the communicator */
    ucg_group_h             ucg_group;

    /* Link used for membership in the progress list */
    ucs_list_link_t         ucs_list;
} mca_coll_ucx_module_t;
OBJ_CLASS_DECLARATION(mca_coll_ucx_module_t);

/*
 * For testing purposes only: hand out the component's worker.
 * Takes no arguments and returns a ucp_worker_h.
 */
ucp_worker_h
mca_coll_ucx_get_component_worker(void);

/*
 * Start the persistent collectives described by an array of requests.
 *
 * count     number of entries in the array
 * requests  the array of request pointers
 */
int mca_coll_ucx_start(size_t count,
                       ompi_request_t **requests);

/*
 * The collective operations themselves.
 *
 * Allreduce is declared in three flavours sharing the same leading
 * arguments (sbuf, rbuf, count, dtype, op, comm) and trailing module:
 *   - mca_coll_ucx_allreduce:      no extra arguments;
 *   - mca_coll_ucx_iallreduce:     additionally takes a request
 *                                  pointer-to-pointer;
 *   - mca_coll_ucx_allreduce_init: additionally takes an info object and
 *                                  a request pointer-to-pointer.
 */
int mca_coll_ucx_allreduce(const void *sbuf,
                           void *rbuf,
                           int count,
                           struct ompi_datatype_t *dtype,
                           struct ompi_op_t *op,
                           struct ompi_communicator_t *comm,
                           mca_coll_base_module_t *module);

int mca_coll_ucx_iallreduce(const void *sbuf,
                            void *rbuf,
                            int count,
                            struct ompi_datatype_t *dtype,
                            struct ompi_op_t *op,
                            struct ompi_communicator_t *comm,
                            struct ompi_request_t **request,
                            mca_coll_base_module_t *module);

int mca_coll_ucx_allreduce_init(const void *sbuf,
                                void *rbuf,
                                int count,
                                struct ompi_datatype_t *dtype,
                                struct ompi_op_t *op,
                                struct ompi_communicator_t *comm,
                                struct ompi_info_t *info,
                                struct ompi_request_t **request,
                                mca_coll_base_module_t *module);

/*
 * Remaining collective entry points.  Each one returns int and receives the
 * communicator plus the collective module as its last two arguments; the
 * rooted operations (bcast, reduce, scatter, gather) also take a root rank.
 */
int mca_coll_ucx_bcast(void *buff,
                       int count,
                       struct ompi_datatype_t *datatype,
                       int root,
                       struct ompi_communicator_t *comm,
                       mca_coll_base_module_t *module);

int mca_coll_ucx_reduce(const void *sbuf,
                        void *rbuf,
                        int count,
                        struct ompi_datatype_t *dtype,
                        struct ompi_op_t *op,
                        int root,
                        struct ompi_communicator_t *comm,
                        mca_coll_base_module_t *module);

int mca_coll_ucx_scatter(const void *sbuf,
                         int scount,
                         struct ompi_datatype_t *sdtype,
                         void *rbuf,
                         int rcount,
                         struct ompi_datatype_t *rdtype,
                         int root,
                         struct ompi_communicator_t *comm,
                         mca_coll_base_module_t *module);

int mca_coll_ucx_gather(const void *sbuf,
                        int scount,
                        struct ompi_datatype_t *sdtype,
                        void *rbuf,
                        int rcount,
                        struct ompi_datatype_t *rdtype,
                        int root,
                        struct ompi_communicator_t *comm,
                        mca_coll_base_module_t *module);

int mca_coll_ucx_allgather(const void *sbuf,
                           int scount,
                           struct ompi_datatype_t *sdtype,
                           void *rbuf,
                           int rcount,
                           struct ompi_datatype_t *rdtype,
                           struct ompi_communicator_t *comm,
                           mca_coll_base_module_t *module);

int mca_coll_ucx_alltoall(const void *sbuf,
                          int scount,
                          struct ompi_datatype_t *sdtype,
                          void *rbuf,
                          int rcount,
                          struct ompi_datatype_t *rdtype,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module);

int mca_coll_ucx_barrier(struct ompi_communicator_t *comm,
                         mca_coll_base_module_t *module);

/*
 * Neighbor alltoallv entry point taking per-peer count and displacement
 * arrays for both the send and the receive side, plus a request
 * pointer-to-pointer.
 */
int mca_coll_ucx_ineighbor_alltoallv(void *sbuf,
                                     int *scounts,
                                     int *sdisps,
                                     struct ompi_datatype_t *sdtype,
                                     void *rbuf,
                                     int *rcounts,
                                     int *rdisps,
                                     struct ompi_datatype_t *rdtype,
                                     struct ompi_communicator_t *comm,
                                     ompi_request_t **request,
                                     mca_coll_base_module_t *module);

END_C_DECLS

#endif /* MCA_COLL_UCX_H */
Loading