Skip to content

Commit

Permalink
Merge pull request #7201 from raffenet/issue-7200
Browse files Browse the repository at this point in the history
ch4/{ofi,ucx}: Improved error checking for business card exchange

Approved-by: Hui Zhou <[email protected]>
  • Loading branch information
raffenet authored Nov 8, 2024
2 parents e38d557 + 0afd4f5 commit 6dc849e
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 11 deletions.
14 changes: 9 additions & 5 deletions src/mpid/ch4/netmod/ofi/init_addrxchg.c
Original file line number Diff line number Diff line change
@@ -137,8 +137,10 @@ int MPIDI_OFI_addr_exchange_root_ctx(void)
  }
  MPL_free(mapped_table);
  /* Then, allgather all address names using init_comm */
- MPIDU_bc_allgather(init_comm, MPIDI_OFI_global.addrname, MPIDI_OFI_global.addrnamelen,
- TRUE, &table, &rank_map, &recv_bc_len);
+ mpi_errno =
+ MPIDU_bc_allgather(init_comm, MPIDI_OFI_global.addrname, MPIDI_OFI_global.addrnamelen,
+ TRUE, &table, &rank_map, &recv_bc_len);
+ MPIR_ERR_CHECK(mpi_errno);

  /* Insert the rest of the addresses */
  for (int i = 0; i < MPIR_Process.size; i++) {
@@ -150,7 +152,8 @@ int MPIDI_OFI_addr_exchange_root_ctx(void)
  MPIDI_OFI_AV(&MPIDIU_get_av(0, i)).dest[0][0] = addr;
  }
  }
- MPIDU_bc_table_destroy();
+ mpi_errno = MPIDU_bc_table_destroy();
+ MPIR_ERR_CHECK(mpi_errno);
  } else {
  /* not "ROOTS_ONLY", we already have everyone's address name, insert all of them */
  fi_addr_t *mapped_table;
@@ -163,7 +166,8 @@ int MPIDI_OFI_addr_exchange_root_ctx(void)
  MPIDI_OFI_AV(&MPIDIU_get_av(0, i)).dest[0][0] = mapped_table[i];
  }
  MPL_free(mapped_table);
- MPIDU_bc_table_destroy();
+ mpi_errno = MPIDU_bc_table_destroy();
+ MPIR_ERR_CHECK(mpi_errno);
  }

  /* check */
@@ -175,7 +179,7 @@ int MPIDI_OFI_addr_exchange_root_ctx(void)
  }

  fn_exit:
- if (init_comm) {
+ if (init_comm && !mpi_errno) {
  MPIDI_destroy_init_comm(&init_comm);
  }
  return mpi_errno;
Expand Down
15 changes: 9 additions & 6 deletions src/mpid/ch4/netmod/ucx/ucx_init.c
Original file line number Diff line number Diff line change
@@ -106,9 +106,10 @@ static int initial_address_exchange(void)
  MPIDI_UCX_CHK_STATUS(ucx_status);
  MPIDIU_upidhash_add(ep_params.address, recv_bc_len, 0, node_roots[i]);
  }
- MPIDU_bc_allgather(init_comm, MPIDI_UCX_global.ctx[0].if_address,
- (int) MPIDI_UCX_global.ctx[0].addrname_len, FALSE,
- (void **) &table, &rank_map, &recv_bc_len);
+ mpi_errno = MPIDU_bc_allgather(init_comm, MPIDI_UCX_global.ctx[0].if_address,
+ (int) MPIDI_UCX_global.ctx[0].addrname_len, FALSE,
+ (void **) &table, &rank_map, &recv_bc_len);
+ MPIR_ERR_CHECK(mpi_errno);

  /* insert new addresses, skipping over node roots */
  for (int i = 0; i < MPIR_Process.size; i++) {
@@ -121,7 +122,8 @@ static int initial_address_exchange(void)
  MPIDIU_upidhash_add(ep_params.address, recv_bc_len, 0, i);
  }
  }
- MPIDU_bc_table_destroy();
+ mpi_errno = MPIDU_bc_table_destroy();
+ MPIR_ERR_CHECK(mpi_errno);
  } else {
  for (int i = 0; i < size; i++) {
  ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS;
@@ -132,11 +134,12 @@ static int initial_address_exchange(void)
  MPIDI_UCX_CHK_STATUS(ucx_status);
  MPIDIU_upidhash_add(ep_params.address, recv_bc_len, 0, i);
  }
- MPIDU_bc_table_destroy();
+ mpi_errno = MPIDU_bc_table_destroy();
+ MPIR_ERR_CHECK(mpi_errno);
  }

  fn_exit:
- if (init_comm) {
+ if (init_comm && !mpi_errno) {
  MPIDI_destroy_init_comm(&init_comm);
  }
  return mpi_errno;
Expand Down

0 comments on commit 6dc849e

Please sign in to comment.