From c522de11d61540b263f1fe0f1770828b7bd8688c Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Tue, 21 May 2024 08:03:54 -0600 Subject: [PATCH] BTL/OFI: retry posting receive buffer Under heavy load (at least with the HPE CXI provider), an attempt to post a receive buffer can return -FI_EAGAIN. This PR uses the OFI_RETRY_UNTIL_DONE macro to retry posting the receive buffer when the fi_recv call returns -FI_EAGAIN. Signed-off-by: Howard Pritchard --- opal/mca/btl/ofi/btl_ofi_module.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index 578ac8d019b..e213d5b1865 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -16,7 +16,7 @@ * * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2020 Google, LLC. All rights reserved. - * Copyright (c) 2022-2023 Triad National Security, LLC. All rights + * Copyright (c) 2022-2024 Triad National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -31,6 +31,7 @@ #include "opal/mca/accelerator/accelerator.h" #include "opal/mca/accelerator/base/base.h" #include "opal/mca/btl/btl.h" +#include "opal/mca/common/ofi/common_ofi.h" #include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/mpool.h" #include "opal/util/printf.h" @@ -412,9 +413,8 @@ int mca_btl_ofi_post_recvs(mca_btl_base_module_t *module, mca_btl_ofi_context_t comp = mca_btl_ofi_frag_completion_alloc(module, context, frag, MCA_BTL_OFI_TYPE_RECV); - rc = fi_recv(context->rx_ctx, &frag->hdr, MCA_BTL_OFI_RECV_SIZE, NULL, FI_ADDR_UNSPEC, - &comp->comp_ctx); - + OFI_RETRY_UNTIL_DONE(fi_recv(context->rx_ctx, &frag->hdr, MCA_BTL_OFI_RECV_SIZE, NULL, FI_ADDR_UNSPEC, + &comp->comp_ctx), rc); if (FI_SUCCESS != rc) { BTL_ERROR(("cannot post recvs")); return OPAL_ERROR;