Skip to content

Commit

Permalink
NA: add NA_IO_ERROR return code for generic I/O errors
Browse files Browse the repository at this point in the history
HG: add HG_IO_ERROR return code and reserve space for additional NA codes

Update OFI and UCX plugins to use new code
  • Loading branch information
soumagne committed Oct 24, 2024
1 parent 5ee197c commit decc5ea
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 36 deletions.
6 changes: 4 additions & 2 deletions src/mercury.c
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ hg_core_respond_cb(const struct hg_core_cb_info *callback_info);
/*******************/

/* Return code string table: one entry per X(name, value) row of
 * HG_RETURN_VALUES, in declaration order. Only the name is stringified; the
 * value argument is ignored here, so the table is densely packed even though
 * the enum values themselves are not contiguous. */
#define X(a, b) #a,
static const char *const hg_return_name[] = {HG_RETURN_VALUES};
#undef X

Expand Down Expand Up @@ -1054,7 +1054,9 @@ HG_Version_get(
/*
 * Return the symbolic name of an HG return code.
 *
 * hg_return_name is densely packed while the code values are not: values
 * below NA_RETURN_MAX index the table directly, and values starting at
 * HG_NA_ERRNO_OFFSET are stored right after them, i.e. from index
 * NA_RETURN_MAX onwards.
 *
 * Values that fall in the gap between the two ranges, negative values and
 * values past the end of the table have no name; a fixed placeholder string
 * is returned for them instead of reading outside of the table.
 */
const char *
HG_Error_to_string(hg_return_t errnum)
{
    const int name_count =
        (int) (sizeof(hg_return_name) / sizeof(hg_return_name[0]));
    int idx = (int) errnum;

    if (idx >= HG_NA_ERRNO_OFFSET) {
        /* Codes past the offset are packed right after the first range */
        idx = idx - HG_NA_ERRNO_OFFSET + NA_RETURN_MAX;
    } else if (idx >= NA_RETURN_MAX) {
        /* Gap between the two ranges: no name assigned */
        idx = -1;
    }

    if (idx < 0 || idx >= name_count) {
        return "HG_UNKNOWN";
    }

    return hg_return_name[idx];
}

/*---------------------------------------------------------------------------*/
Expand Down
63 changes: 35 additions & 28 deletions src/mercury_core_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,37 +119,44 @@ struct hg_init_info {
unsigned int multi_recv_copy_threshold;
};

/* Keep offset to keep room for additional NA error codes */
#define HG_NA_ERRNO_OFFSET 64

/* Error return codes:
* Functions return 0 for success or corresponding return code */
#define HG_RETURN_VALUES \
X(HG_SUCCESS) /*!< operation succeeded */ \
X(HG_PERMISSION) /*!< operation not permitted */ \
X(HG_NOENTRY) /*!< no such file or directory */ \
X(HG_INTERRUPT) /*!< operation interrupted */ \
X(HG_AGAIN) /*!< operation must be retried */ \
X(HG_NOMEM) /*!< out of memory */ \
X(HG_ACCESS) /*!< permission denied */ \
X(HG_FAULT) /*!< bad address */ \
X(HG_BUSY) /*!< device or resource busy */ \
X(HG_EXIST) /*!< entry already exists */ \
X(HG_NODEV) /*!< no such device */ \
X(HG_INVALID_ARG) /*!< invalid argument */ \
X(HG_PROTOCOL_ERROR) /*!< protocol error */ \
X(HG_OVERFLOW) /*!< value too large */ \
X(HG_MSGSIZE) /*!< message size too long */ \
X(HG_PROTONOSUPPORT) /*!< protocol not supported */ \
X(HG_OPNOTSUPPORTED) /*!< operation not supported on endpoint */ \
X(HG_ADDRINUSE) /*!< address already in use */ \
X(HG_ADDRNOTAVAIL) /*!< cannot assign requested address */ \
X(HG_HOSTUNREACH) /*!< cannot reach host during operation */ \
X(HG_TIMEOUT) /*!< operation reached timeout */ \
X(HG_CANCELED) /*!< operation canceled */ \
X(HG_CHECKSUM_ERROR) /*!< checksum error */ \
X(HG_NA_ERROR) /*!< generic NA error */ \
X(HG_OTHER_ERROR) /*!< generic HG error */ \
X(HG_RETURN_MAX)

#define X(a) a,
X(HG_SUCCESS, NA_SUCCESS) /*!< operation succeeded */ \
X(HG_PERMISSION, NA_PERMISSION) /*!< operation not permitted */ \
X(HG_NOENTRY, NA_NOENTRY) /*!< no such file or directory */ \
X(HG_INTERRUPT, NA_INTERRUPT) /*!< operation interrupted */ \
X(HG_AGAIN, NA_AGAIN) /*!< operation must be retried */ \
X(HG_NOMEM, NA_NOMEM) /*!< out of memory */ \
X(HG_ACCESS, NA_ACCESS) /*!< permission denied */ \
X(HG_FAULT, NA_FAULT) /*!< bad address */ \
X(HG_BUSY, NA_BUSY) /*!< device or resource busy */ \
X(HG_EXIST, NA_EXIST) /*!< entry already exists */ \
X(HG_NODEV, NA_NODEV) /*!< no such device */ \
X(HG_INVALID_ARG, NA_INVALID_ARG) /*!< invalid argument */ \
X(HG_PROTOCOL_ERROR, NA_PROTOCOL_ERROR) /*!< protocol error */ \
X(HG_OVERFLOW, NA_OVERFLOW) /*!< value too large */ \
X(HG_MSGSIZE, NA_MSGSIZE) /*!< message size too long */ \
X(HG_PROTONOSUPPORT, NA_PROTONOSUPPORT) /*!< protocol not supported */ \
X(HG_OPNOTSUPPORTED, \
NA_OPNOTSUPPORTED) /*!< operation not supported on endpoint */ \
X(HG_ADDRINUSE, NA_ADDRINUSE) /*!< address already in use */ \
X(HG_ADDRNOTAVAIL, \
NA_ADDRNOTAVAIL) /*!< cannot assign requested address */ \
X(HG_HOSTUNREACH, \
NA_HOSTUNREACH) /*!< cannot reach host during operation */ \
X(HG_TIMEOUT, NA_TIMEOUT) /*!< operation reached timeout */ \
X(HG_CANCELED, NA_CANCELED) /*!< operation canceled */ \
X(HG_IO_ERROR, NA_IO_ERROR) /*!< I/O error */ \
X(HG_CHECKSUM_ERROR, HG_NA_ERRNO_OFFSET) /*!< checksum error */ \
X(HG_NA_ERROR, HG_NA_ERRNO_OFFSET + 1) /*!< generic NA error */ \
X(HG_OTHER_ERROR, HG_NA_ERRNO_OFFSET + 2) /*!< generic HG error */ \
X(HG_RETURN_MAX, HG_NA_ERRNO_OFFSET + 3)

#define X(a, b) a = b,
typedef enum hg_return { HG_RETURN_VALUES } hg_return_t;
#undef X

Expand Down
31 changes: 27 additions & 4 deletions src/na/na_ofi.c
Original file line number Diff line number Diff line change
Expand Up @@ -2169,13 +2169,28 @@ na_ofi_errno_to_na(int rc)
case FI_EINTR:
ret = NA_INTERRUPT;
break;
case FI_EIO:
#if !defined(__APPLE__)
case FI_EREMOTEIO:
#endif
ret = NA_IO_ERROR;
break;
case FI_EAGAIN:
#ifdef _WIN32
case FI_EWOULDBLOCK:
#endif
ret = NA_AGAIN;
break;
case FI_ENOMEM:
case FI_EMFILE:
case FI_ENOSPC:
case FI_ENOBUFS:
ret = NA_NOMEM;
break;
case FI_EACCES:
#if !defined(_WIN32) && !defined(__APPLE__)
case FI_EKEYREJECTED:
#endif
ret = NA_ACCESS;
break;
case FI_EFAULT:
Expand All @@ -2187,6 +2202,8 @@ na_ofi_errno_to_na(int rc)
case FI_ENODEV:
ret = NA_NODEV;
break;
case FI_E2BIG:
case FI_EBADF:
case FI_EINVAL:
ret = NA_INVALID_ARG;
break;
Expand All @@ -2197,6 +2214,7 @@ na_ofi_errno_to_na(int rc)
ret = NA_MSGSIZE;
break;
case FI_ENOPROTOOPT:
case FI_ENOSYS:
ret = NA_PROTONOSUPPORT;
break;
case FI_EOPNOTSUPP:
Expand All @@ -2210,14 +2228,12 @@ na_ofi_errno_to_na(int rc)
break;
case FI_ENETDOWN:
case FI_ENETUNREACH:
case FI_ENOTCONN:
case FI_ECONNABORTED:
case FI_ECONNREFUSED:
case FI_ECONNRESET:
#ifndef _WIN32
case FI_ENOTCONN:
case FI_ESHUTDOWN:
case FI_ECONNREFUSED:
case FI_EHOSTDOWN:
#endif
case FI_EHOSTUNREACH:
ret = NA_HOSTUNREACH;
break;
Expand All @@ -2227,6 +2243,13 @@ na_ofi_errno_to_na(int rc)
case FI_ECANCELED:
ret = NA_CANCELED;
break;
case FI_ENOMSG:
case FI_ENODATA:
/* In practice the following codes are not errors, but they are treated as
 * errors in this routine. */
case FI_EISCONN:
case FI_EALREADY:
case FI_EINPROGRESS:
default:
ret = NA_PROTOCOL_ERROR;
break;
Expand Down
1 change: 1 addition & 0 deletions src/na/na_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ struct na_protocol_info {
X(NA_HOSTUNREACH) /*!< cannot reach host during operation */ \
X(NA_TIMEOUT) /*!< operation reached timeout */ \
X(NA_CANCELED) /*!< operation canceled */ \
X(NA_IO_ERROR) /*!< I/O error */ \
X(NA_RETURN_MAX)

#define X(a) a,
Expand Down
7 changes: 5 additions & 2 deletions src/na/na_ucx.c
Original file line number Diff line number Diff line change
Expand Up @@ -1045,7 +1045,6 @@ na_ucs_status_to_na(ucs_status_t status)
ret = NA_ADDRNOTAVAIL;
break;

case UCS_ERR_SOME_CONNECTS_FAILED:
case UCS_ERR_UNREACHABLE:
case UCS_ERR_CONNECTION_RESET:
case UCS_ERR_NOT_CONNECTED:
Expand All @@ -1062,8 +1061,12 @@ na_ucs_status_to_na(ucs_status_t status)
ret = NA_CANCELED;
break;

case UCS_ERR_NO_MESSAGE:
case UCS_ERR_SOME_CONNECTS_FAILED:
case UCS_ERR_IO_ERROR:
ret = NA_IO_ERROR;
break;

case UCS_ERR_NO_MESSAGE:
case UCS_ERR_SHMEM_SEGMENT:
default:
ret = NA_PROTOCOL_ERROR;
Expand Down

0 comments on commit decc5ea

Please sign in to comment.