Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
wejoncy committed Sep 21, 2023
1 parent 2f7c953 commit 8c58746
Showing 1 changed file with 6 additions and 10 deletions.
16 changes: 6 additions & 10 deletions onnxruntime/contrib_ops/cuda/collective/nccl_kernels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ static ncclDataType_t GetNcclDataType(onnxruntime::MLDataType type) {
}
}

- #ifndef USE_MPI
namespace IPC {
#define FLLOG LOGS_DEFAULT(VERBOSE)
#define FLLOGERRNO LOGS_DEFAULT(WARNING) << "error:" << strerror(errno)
Expand All @@ -56,14 +55,14 @@ int WriteOnRank0(ncclUniqueId* nccl_id, int word_size) {
return -1;
}

- int32_t PortNumber = ParseEnvironmentVariableWithDefault<int32_t>("RANK0_PORT", 18888);
+ int32_t port_number = ParseEnvironmentVariableWithDefault<int32_t>("RANK0_PORT", 18888);

/* bind the server's local address in memory */
struct sockaddr_in saddr;
memset(&saddr, 0, sizeof(saddr)); /* clear the bytes */
saddr.sin_family = AF_INET; /* versus AF_LOCAL */
saddr.sin_addr.s_addr = htonl(INADDR_ANY); // htonl(INADDR_ANY); /* host-to-network endian */
- saddr.sin_port = htons(PortNumber); /* for listening */
+ saddr.sin_port = htons(port_number); /* for listening */

if (bind(fd, (struct sockaddr*)&saddr, sizeof(saddr)) < 0) {
FLLOGERRNO << ("bind\n"); /* terminate */
Expand All @@ -75,7 +74,7 @@ int WriteOnRank0(ncclUniqueId* nccl_id, int word_size) {
return -1;
}

- FLLOG << "Listening on port " << PortNumber << " for the other GPU processores...\n";
+ FLLOG << "Listening on port " << port_number << " for the other GPU processores...\n";
word_size--; // rank 0 is not in word_size
while (word_size-- > 0) {
int client_fd = accept(fd, nullptr, nullptr); /* accept blocks */
Expand Down Expand Up @@ -106,13 +105,13 @@ int ReadFromRank0(ncclUniqueId* nccl_id) {

/* connect to the server: configure server's address 1st */
std::string rank0_ip = ParseEnvironmentVariableWithDefault<std::string>("RANK0_IP", "127.0.0.1");
- int32_t PortNumber = ParseEnvironmentVariableWithDefault<int32_t>("RANK0_PORT", 18888);
+ int32_t port_number = ParseEnvironmentVariableWithDefault<int32_t>("RANK0_PORT", 18888);

struct sockaddr_in saddr;
memset(&saddr, 0, sizeof(saddr));
saddr.sin_family = AF_INET;
saddr.sin_addr.s_addr = inet_addr(rank0_ip.c_str());
- saddr.sin_port = htons(PortNumber); /* port number in big-endian */
+ saddr.sin_port = htons(port_number); /* port number in big-endian */
time_t start_time = time(0);
int conn_ret = connect(sockfd, (struct sockaddr*)&saddr, sizeof(saddr));
while (time(0) - start_time < 40 && conn_ret < 0) {
Expand Down Expand Up @@ -147,9 +146,7 @@ int IPC_Bcast(ncclUniqueId* nccl_id, int rank, int world_size) {
}
} // namespace IPC

- #endif

- #ifdef USE_MPI
static Status CreateNcclCommunicator(int world_size, int rank, ncclComm_t* comm, bool is_launched_by_mpi) {
// Create new NCCL communicator
ncclUniqueId nccl_id;
Expand All @@ -169,7 +166,6 @@ static Status CreateNcclCommunicator(int world_size, int rank, ncclComm_t* comm,

return Status::OK();
}
- #endif

NcclContext::NcclContext() {
world_size_ = -1;
Expand All @@ -196,7 +192,7 @@ NcclContext::NcclContext() {
}

// Initialize global Parallel Group NCCL Communicator
- auto ret = CreateNcclCommByMPI(world_size_, rank_, &comm_, is_launched_by_mpi);
+ auto ret = CreateNcclCommunicator(world_size_, rank_, &comm_, is_launched_by_mpi);
ORT_ENFORCE(ret.IsOK());
}

Expand Down

0 comments on commit 8c58746

Please sign in to comment.