From 8c587460b322a7ef9ca2d1d724f071bb348c475f Mon Sep 17 00:00:00 2001 From: wejoncy Date: Thu, 21 Sep 2023 09:13:02 +0800 Subject: [PATCH] Fix NCCL kernels build: drop USE_MPI guards and call CreateNcclCommunicator --- .../contrib_ops/cuda/collective/nccl_kernels.cc | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/collective/nccl_kernels.cc b/onnxruntime/contrib_ops/cuda/collective/nccl_kernels.cc index 3e0667587d9d1..cfeef6a65797b 100644 --- a/onnxruntime/contrib_ops/cuda/collective/nccl_kernels.cc +++ b/onnxruntime/contrib_ops/cuda/collective/nccl_kernels.cc @@ -42,7 +42,6 @@ static ncclDataType_t GetNcclDataType(onnxruntime::MLDataType type) { } } -#ifndef USE_MPI namespace IPC { #define FLLOG LOGS_DEFAULT(VERBOSE) #define FLLOGERRNO LOGS_DEFAULT(WARNING) << "error:" << strerror(errno) @@ -56,14 +55,14 @@ int WriteOnRank0(ncclUniqueId* nccl_id, int word_size) { return -1; } - int32_t PortNumber = ParseEnvironmentVariableWithDefault("RANK0_PORT", 18888); + int32_t port_number = ParseEnvironmentVariableWithDefault("RANK0_PORT", 18888); /* bind the server's local address in memory */ struct sockaddr_in saddr; memset(&saddr, 0, sizeof(saddr)); /* clear the bytes */ saddr.sin_family = AF_INET; /* versus AF_LOCAL */ saddr.sin_addr.s_addr = htonl(INADDR_ANY); // htonl(INADDR_ANY); /* host-to-network endian */ - saddr.sin_port = htons(PortNumber); /* for listening */ + saddr.sin_port = htons(port_number); /* for listening */ if (bind(fd, (struct sockaddr*)&saddr, sizeof(saddr)) < 0) { FLLOGERRNO << ("bind\n"); /* terminate */ @@ -75,7 +74,7 @@ int WriteOnRank0(ncclUniqueId* nccl_id, int word_size) { return -1; } - FLLOG << "Listening on port " << PortNumber << " for the other GPU processores...\n"; + FLLOG << "Listening on port " << port_number << " for the other GPU processores...\n"; word_size--; // rank 0 is not in word_size while (word_size-- > 0) { int client_fd = accept(fd, nullptr, nullptr); /* accept blocks */ @@ -106,13 +105,13 @@ int ReadFromRank0(ncclUniqueId* 
nccl_id) { /* connect to the server: configure server's address 1st */ std::string rank0_ip = ParseEnvironmentVariableWithDefault("RANK0_IP", "127.0.0.1"); - int32_t PortNumber = ParseEnvironmentVariableWithDefault("RANK0_PORT", 18888); + int32_t port_number = ParseEnvironmentVariableWithDefault("RANK0_PORT", 18888); struct sockaddr_in saddr; memset(&saddr, 0, sizeof(saddr)); saddr.sin_family = AF_INET; saddr.sin_addr.s_addr = inet_addr(rank0_ip.c_str()); - saddr.sin_port = htons(PortNumber); /* port number in big-endian */ + saddr.sin_port = htons(port_number); /* port number in big-endian */ time_t start_time = time(0); int conn_ret = connect(sockfd, (struct sockaddr*)&saddr, sizeof(saddr)); while (time(0) - start_time < 40 && conn_ret < 0) { @@ -147,9 +146,7 @@ int IPC_Bcast(ncclUniqueId* nccl_id, int rank, int world_size) { } } // namespace IPC -#endif -#ifdef USE_MPI static Status CreateNcclCommunicator(int world_size, int rank, ncclComm_t* comm, bool is_launched_by_mpi) { // Create new NCCL communicator ncclUniqueId nccl_id; @@ -169,7 +166,6 @@ static Status CreateNcclCommunicator(int world_size, int rank, ncclComm_t* comm, return Status::OK(); } -#endif NcclContext::NcclContext() { world_size_ = -1; @@ -196,7 +192,7 @@ NcclContext::NcclContext() { } // Initialize global Parallel Group NCCL Communicator - auto ret = CreateNcclCommByMPI(world_size_, rank_, &comm_, is_launched_by_mpi); + auto ret = CreateNcclCommunicator(world_size_, rank_, &comm_, is_launched_by_mpi); ORT_ENFORCE(ret.IsOK()); }