From 197693466f48b90bdd208c7f8bd14440de319825 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 27 Aug 2023 15:00:52 +0300 Subject: [PATCH 001/169] issue: 3514044 Introducing cq_mgr_regrq and cq_mgr_strq Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 8 ++++---- src/core/dev/cq_mgr_mlx5.inl | 2 +- src/core/dev/{cq_mgr_mlx5.cpp => cq_mgr_regrq.cpp} | 2 +- src/core/dev/{cq_mgr_mlx5.h => cq_mgr_regrq.h} | 0 src/core/dev/{cq_mgr_mlx5_strq.cpp => cq_mgr_strq.cpp} | 2 +- src/core/dev/{cq_mgr_mlx5_strq.h => cq_mgr_strq.h} | 2 +- src/core/dev/qp_mgr_eth_mlx5.cpp | 2 +- src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) rename src/core/dev/{cq_mgr_mlx5.cpp => cq_mgr_regrq.cpp} (99%) rename src/core/dev/{cq_mgr_mlx5.h => cq_mgr_regrq.h} (100%) rename src/core/dev/{cq_mgr_mlx5_strq.cpp => cq_mgr_strq.cpp} (99%) rename src/core/dev/{cq_mgr_mlx5_strq.h => cq_mgr_strq.h} (99%) diff --git a/src/core/Makefile.am b/src/core/Makefile.am index f19605b4d..da76afca0 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -63,8 +63,8 @@ libxlio_la_SOURCES := \ dev/allocator.cpp \ dev/buffer_pool.cpp \ dev/cq_mgr.cpp \ - dev/cq_mgr_mlx5.cpp \ - dev/cq_mgr_mlx5_strq.cpp \ + dev/cq_mgr_regrq.cpp \ + dev/cq_mgr_strq.cpp \ dev/dm_mgr.cpp \ dev/qp_mgr.cpp \ dev/qp_mgr_eth_mlx5.cpp \ @@ -172,8 +172,8 @@ libxlio_la_SOURCES := \ dev/allocator.h \ dev/buffer_pool.h \ dev/cq_mgr.h \ - dev/cq_mgr_mlx5.h \ - dev/cq_mgr_mlx5_strq.h \ + dev/cq_mgr_regrq.h \ + dev/cq_mgr_strq.h \ dev/dm_mgr.h \ dev/gro_mgr.h \ dev/ib_ctx_handler_collection.h \ diff --git a/src/core/dev/cq_mgr_mlx5.inl b/src/core/dev/cq_mgr_mlx5.inl index 39b00a36f..38549ed4d 100644 --- a/src/core/dev/cq_mgr_mlx5.inl +++ b/src/core/dev/cq_mgr_mlx5.inl @@ -33,7 +33,7 @@ #ifndef CQ_MGR_MLX5_INL_H #define CQ_MGR_MLX5_INL_H -#include "dev/cq_mgr_mlx5.h" +#include "dev/cq_mgr_regrq.h" #if defined(DEFINED_DIRECT_VERBS) diff --git a/src/core/dev/cq_mgr_mlx5.cpp b/src/core/dev/cq_mgr_regrq.cpp similarity index 99% rename from src/core/dev/cq_mgr_mlx5.cpp rename to src/core/dev/cq_mgr_regrq.cpp index af825c32d..6319e96d8 100644 --- a/src/core/dev/cq_mgr_mlx5.cpp +++ b/src/core/dev/cq_mgr_regrq.cpp @@ -30,7 +30,7 @@ * SOFTWARE. */ -#include "cq_mgr_mlx5.h" +#include "cq_mgr_regrq.h" #if defined(DEFINED_DIRECT_VERBS) diff --git a/src/core/dev/cq_mgr_mlx5.h b/src/core/dev/cq_mgr_regrq.h similarity index 100% rename from src/core/dev/cq_mgr_mlx5.h rename to src/core/dev/cq_mgr_regrq.h diff --git a/src/core/dev/cq_mgr_mlx5_strq.cpp b/src/core/dev/cq_mgr_strq.cpp similarity index 99% rename from src/core/dev/cq_mgr_mlx5_strq.cpp rename to src/core/dev/cq_mgr_strq.cpp index d18f2b34b..f728cb050 100644 --- a/src/core/dev/cq_mgr_mlx5_strq.cpp +++ b/src/core/dev/cq_mgr_strq.cpp @@ -30,7 +30,7 @@ * SOFTWARE. 
*/ -#include "cq_mgr_mlx5_strq.h" +#include "cq_mgr_strq.h" #if defined(DEFINED_DIRECT_VERBS) diff --git a/src/core/dev/cq_mgr_mlx5_strq.h b/src/core/dev/cq_mgr_strq.h similarity index 99% rename from src/core/dev/cq_mgr_mlx5_strq.h rename to src/core/dev/cq_mgr_strq.h index 8a00de614..b5265b12d 100644 --- a/src/core/dev/cq_mgr_mlx5_strq.h +++ b/src/core/dev/cq_mgr_strq.h @@ -35,7 +35,7 @@ #include #include -#include "cq_mgr_mlx5.h" +#include "cq_mgr_regrq.h" class cq_mgr_mlx5_strq : public cq_mgr_mlx5 { public: diff --git a/src/core/dev/qp_mgr_eth_mlx5.cpp b/src/core/dev/qp_mgr_eth_mlx5.cpp index ddfe5cd35..47f11620e 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5.cpp @@ -35,7 +35,7 @@ #include #include -#include "cq_mgr_mlx5.h" +#include "cq_mgr_regrq.h" #include "proto/tls.h" #include "util/utils.h" #include "vlogger/vlogger.h" diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp index 3c4b2cb5a..e793fb3a7 100644 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp @@ -36,7 +36,7 @@ #include #include "ring_simple.h" #include "rfs_rule_dpcp.h" -#include "cq_mgr_mlx5_strq.h" +#include "cq_mgr_strq.h" #define MODULE_NAME "qp_mgr_eth_mlx5_dpcp" From 1495ac485fcdc5c3fca9c3618ee106b4543bd3df Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 27 Aug 2023 15:07:21 +0300 Subject: [PATCH 002/169] issue: 3514044 Renaming cq_mgr_mlx5 to cq_mgr_regrq Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 1 - src/core/dev/cq_mgr_regrq.cpp | 37 ++++++++++++++++---------------- src/core/dev/cq_mgr_regrq.h | 33 ++++++++++++++++++++++------ src/core/dev/cq_mgr_strq.cpp | 3 +-- src/core/dev/cq_mgr_strq.h | 2 +- src/core/dev/qp_mgr.h | 2 +- src/core/dev/qp_mgr_eth_mlx5.cpp | 8 +++---- src/core/dev/qp_mgr_eth_mlx5.h | 2 +- src/core/dev/ring_simple.h | 2 +- 9 files changed, 53 insertions(+), 37 deletions(-) diff --git a/src/core/Makefile.am b/src/core/Makefile.am index da76afca0..a30ff3a71 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -26,7 +26,6 @@ SUBDIRS = infra netlink EXTRA_DIST = \ dev/cq_mgr.inl \ - dev/cq_mgr_mlx5.inl \ util/libxlio.conf sysconf_DATA = util/libxlio.conf diff --git a/src/core/dev/cq_mgr_regrq.cpp b/src/core/dev/cq_mgr_regrq.cpp index 6319e96d8..e31707fd9 100644 --- a/src/core/dev/cq_mgr_regrq.cpp +++ b/src/core/dev/cq_mgr_regrq.cpp @@ -36,14 +36,13 @@ #include #include "cq_mgr.inl" -#include "cq_mgr_mlx5.inl" #include "qp_mgr.h" #include "qp_mgr_eth_mlx5.h" #include "ring_simple.h" #include -#define MODULE_NAME "cqm_mlx5" +#define MODULE_NAME "cq_mgr_regrq" #define cq_logfunc __log_info_func #define cq_logdbg __log_info_dbg @@ -52,7 +51,7 @@ #define cq_logpanic __log_info_panic #define cq_logfuncall __log_info_funcall -cq_mgr_mlx5::cq_mgr_mlx5(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, +cq_mgr_regrq::cq_mgr_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool call_configure) : cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, is_rx, call_configure) @@ -64,7 +63,7 @@ cq_mgr_mlx5::cq_mgr_mlx5(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, memset(&m_mlx5_cq, 0, sizeof(m_mlx5_cq)); } -uint32_t cq_mgr_mlx5::clean_cq() +uint32_t cq_mgr_regrq::clean_cq() { uint32_t ret_total = 0; uint64_t cq_poll_sn = 0; @@ -106,13 +105,13 @@ uint32_t cq_mgr_mlx5::clean_cq() return ret_total; } -cq_mgr_mlx5::~cq_mgr_mlx5() 
+cq_mgr_regrq::~cq_mgr_regrq() { cq_logfunc(""); cq_logdbg("destroying CQ as %s", (m_b_is_rx ? "Rx" : "Tx")); } -mem_buf_desc_t *cq_mgr_mlx5::poll(enum buff_status_e &status) +mem_buf_desc_t *cq_mgr_regrq::poll(enum buff_status_e &status) { mem_buf_desc_t *buff = NULL; @@ -152,7 +151,7 @@ mem_buf_desc_t *cq_mgr_mlx5::poll(enum buff_status_e &status) return buff; } -void cq_mgr_mlx5::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc, +void cq_mgr_regrq::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc, enum buff_status_e &status) { struct mlx5_err_cqe *ecqe; @@ -229,7 +228,7 @@ void cq_mgr_mlx5::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t } } -int cq_mgr_mlx5::drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e status, +int cq_mgr_regrq::drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e status, uintptr_t *p_recycle_buffers_last_wr_id) { ++m_n_wce_counter; @@ -265,7 +264,7 @@ int cq_mgr_mlx5::drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e s return 1; } -int cq_mgr_mlx5::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*=NULL*/) +int cq_mgr_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*=NULL*/) { cq_logfuncall("cq was %s drained. %d processed wce since last check. %d wce in m_rx_queue", (m_b_was_drained ? "" : "not "), m_n_wce_counter, m_rx_queue.size()); @@ -348,7 +347,7 @@ int cq_mgr_mlx5::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*=N return ret_total; } -mem_buf_desc_t *cq_mgr_mlx5::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, +mem_buf_desc_t *cq_mgr_regrq::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status) { /* Assume locked!!! */ @@ -379,7 +378,7 @@ mem_buf_desc_t *cq_mgr_mlx5::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, return p_mem_buf_desc; } -mem_buf_desc_t *cq_mgr_mlx5::poll_and_process_socketxtreme() +mem_buf_desc_t *cq_mgr_regrq::poll_and_process_socketxtreme() { buff_status_e status = BS_OK; mem_buf_desc_t *buff_wqe = poll(status); @@ -400,7 +399,7 @@ mem_buf_desc_t *cq_mgr_mlx5::poll_and_process_socketxtreme() return nullptr; } -int cq_mgr_mlx5::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) +int cq_mgr_regrq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) { /* Assume locked!!! 
*/ cq_logfuncall(""); @@ -452,7 +451,7 @@ int cq_mgr_mlx5::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd return ret_rx_processed; } -void cq_mgr_mlx5::log_cqe_error(struct xlio_mlx5_cqe *cqe) +void cq_mgr_regrq::log_cqe_error(struct xlio_mlx5_cqe *cqe) { struct mlx5_err_cqe *ecqe = (struct mlx5_err_cqe *)cqe; @@ -470,7 +469,7 @@ void cq_mgr_mlx5::log_cqe_error(struct xlio_mlx5_cqe *cqe) } } -void cq_mgr_mlx5::handle_sq_wqe_prop(unsigned index) +void cq_mgr_regrq::handle_sq_wqe_prop(unsigned index) { sq_wqe_prop *p = &m_qp->m_sq_wqe_idx_to_prop[index]; sq_wqe_prop *prev; @@ -515,7 +514,7 @@ void cq_mgr_mlx5::handle_sq_wqe_prop(unsigned index) m_qp->m_sq_wqe_prop_last_signalled = index; } -int cq_mgr_mlx5::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) +int cq_mgr_regrq::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) { cq_logfuncall(""); @@ -544,7 +543,7 @@ int cq_mgr_mlx5::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) return ret; } -void cq_mgr_mlx5::set_qp_rq(qp_mgr *qp) +void cq_mgr_regrq::set_qp_rq(qp_mgr *qp) { m_qp = static_cast(qp); @@ -559,14 +558,14 @@ void cq_mgr_mlx5::set_qp_rq(qp_mgr *qp) m_mlx5_cq.cq_buf); } -void cq_mgr_mlx5::add_qp_rx(qp_mgr *qp) +void cq_mgr_regrq::add_qp_rx(qp_mgr *qp) { cq_logfunc(""); set_qp_rq(qp); cq_mgr::add_qp_rx(qp); } -void cq_mgr_mlx5::add_qp_tx(qp_mgr *qp) +void cq_mgr_regrq::add_qp_tx(qp_mgr *qp) { // Assume locked! cq_mgr::add_qp_tx(qp); @@ -580,7 +579,7 @@ void cq_mgr_mlx5::add_qp_tx(qp_mgr *qp) m_mlx5_cq.cq_buf); } -void cq_mgr_mlx5::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc) +void cq_mgr_regrq::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc) { struct ethhdr *p_eth_h = (struct ethhdr *)(p_rx_wc_buf_desc->p_buffer); struct tcphdr *p_tcp_h; diff --git a/src/core/dev/cq_mgr_regrq.h b/src/core/dev/cq_mgr_regrq.h index ff656fe0b..3119b4369 100644 --- a/src/core/dev/cq_mgr_regrq.h +++ b/src/core/dev/cq_mgr_regrq.h @@ -30,8 +30,8 @@ * SOFTWARE. */ -#ifndef CQ_MGR_MLX5_H -#define CQ_MGR_MLX5_H +#ifndef CQ_MGR_REGRQ_H +#define CQ_MGR_REGRQ_H #include "cq_mgr.h" #include "qp_mgr_eth_mlx5.h" @@ -46,7 +46,7 @@ class qp_mgr_eth_mlx5; /* Get CQE owner bit. 
*/ #define MLX5_CQE_OWNER(op_own) ((op_own)&MLX5_CQE_OWNER_MASK) -class cq_mgr_mlx5 : public cq_mgr { +class cq_mgr_regrq : public cq_mgr { public: enum buff_status_e { BS_OK, @@ -56,10 +56,10 @@ class cq_mgr_mlx5 : public cq_mgr { BS_GENERAL_ERR }; - cq_mgr_mlx5(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, + cq_mgr_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool call_configure = true); - virtual ~cq_mgr_mlx5(); + virtual ~cq_mgr_regrq(); virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL); virtual mem_buf_desc_t *poll_and_process_socketxtreme(); @@ -97,7 +97,7 @@ class cq_mgr_mlx5 : public cq_mgr { virtual int req_notify_cq() { return xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0); }; }; -inline void cq_mgr_mlx5::update_global_sn(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) +inline void cq_mgr_regrq::update_global_sn(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) { if (num_polled_cqes > 0) { // spoil the global sn if we have packets ready @@ -118,7 +118,7 @@ inline void cq_mgr_mlx5::update_global_sn(uint64_t &cq_poll_sn, uint32_t num_pol cq_poll_sn = m_n_global_sn; } -inline struct xlio_mlx5_cqe *cq_mgr_mlx5::get_cqe_tx(uint32_t &num_polled_cqes) +inline struct xlio_mlx5_cqe *cq_mgr_regrq::get_cqe_tx(uint32_t &num_polled_cqes) { struct xlio_mlx5_cqe *cqe_ret = nullptr; struct xlio_mlx5_cqe *cqe = @@ -150,5 +150,24 @@ inline struct xlio_mlx5_cqe *cq_mgr_mlx5::get_cqe_tx(uint32_t &num_polled_cqes) return cqe_ret; } +inline struct xlio_mlx5_cqe *cq_mgr_regrq::check_cqe(void) +{ + struct xlio_mlx5_cqe *cqe = + (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) + << m_mlx5_cq.cqe_size_log)); + /* + * CQE ownership is defined by Owner bit in the CQE. + * The value indicating SW ownership is flipped every + * time CQ wraps around. 
+ * */ + if (likely((MLX5_CQE_OPCODE(cqe->op_own)) != MLX5_CQE_INVALID) && + !((MLX5_CQE_OWNER(cqe->op_own)) ^ !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count))) { + return cqe; + } + + return NULL; +} + #endif /* DEFINED_DIRECT_VERBS */ #endif // CQ_MGR_MLX5_H diff --git a/src/core/dev/cq_mgr_strq.cpp b/src/core/dev/cq_mgr_strq.cpp index f728cb050..b15684165 100644 --- a/src/core/dev/cq_mgr_strq.cpp +++ b/src/core/dev/cq_mgr_strq.cpp @@ -36,7 +36,6 @@ #include #include "cq_mgr.inl" -#include "cq_mgr_mlx5.inl" #include "qp_mgr.h" #include "qp_mgr_eth_mlx5.h" #include "ring_simple.h" @@ -61,7 +60,7 @@ cq_mgr_mlx5_strq::cq_mgr_mlx5_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx uint32_t strides_num, struct ibv_comp_channel *p_comp_event_channel, bool call_configure) - : cq_mgr_mlx5(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, true, call_configure) + : cq_mgr_regrq(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, true, call_configure) , _owner_ring(p_ring) , _stride_size_bytes(stride_size_bytes) , _strides_num(strides_num) diff --git a/src/core/dev/cq_mgr_strq.h b/src/core/dev/cq_mgr_strq.h index b5265b12d..c5006c1ef 100644 --- a/src/core/dev/cq_mgr_strq.h +++ b/src/core/dev/cq_mgr_strq.h @@ -37,7 +37,7 @@ #include #include "cq_mgr_regrq.h" -class cq_mgr_mlx5_strq : public cq_mgr_mlx5 { +class cq_mgr_mlx5_strq : public cq_mgr_regrq { public: cq_mgr_mlx5_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, uint32_t stride_size_bytes, uint32_t strides_num, diff --git a/src/core/dev/qp_mgr.h b/src/core/dev/qp_mgr.h index 57b816757..e391b9a06 100644 --- a/src/core/dev/qp_mgr.h +++ b/src/core/dev/qp_mgr.h @@ -153,7 +153,7 @@ class xlio_ti { */ class qp_mgr { friend class cq_mgr; - friend class cq_mgr_mlx5; + friend class cq_mgr_regrq; friend class cq_mgr_mlx5_strq; friend class cq_mgr_mp; diff --git a/src/core/dev/qp_mgr_eth_mlx5.cpp b/src/core/dev/qp_mgr_eth_mlx5.cpp index 47f11620e..fa6e222c8 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5.cpp @@ -366,15 +366,15 @@ bool qp_mgr_eth_mlx5::init_rx_cq_mgr_prepare() cq_mgr *qp_mgr_eth_mlx5::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) { return (!init_rx_cq_mgr_prepare() ? 
NULL - : new cq_mgr_mlx5(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, - p_rx_comp_event_channel, true)); + : new cq_mgr_regrq(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, + p_rx_comp_event_channel, true)); } cq_mgr *qp_mgr_eth_mlx5::init_tx_cq_mgr() { m_tx_num_wr = align32pow2(m_tx_num_wr); - return new cq_mgr_mlx5(m_p_ring, m_p_ib_ctx_handler, m_tx_num_wr, - m_p_ring->get_tx_comp_event_channel(), false); + return new cq_mgr_regrq(m_p_ring, m_p_ib_ctx_handler, m_tx_num_wr, + m_p_ring->get_tx_comp_event_channel(), false); } inline void qp_mgr_eth_mlx5::ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top, diff --git a/src/core/dev/qp_mgr_eth_mlx5.h b/src/core/dev/qp_mgr_eth_mlx5.h index 1bc9a20bb..28a500058 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.h +++ b/src/core/dev/qp_mgr_eth_mlx5.h @@ -62,7 +62,7 @@ struct sq_wqe_prop { typedef struct sq_wqe_prop sq_wqe_prop; class qp_mgr_eth_mlx5 : public qp_mgr_eth { - friend class cq_mgr_mlx5; + friend class cq_mgr_regrq; public: qp_mgr_eth_mlx5(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, const uint16_t vlan, diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index cf96e4377..476d635d8 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -287,7 +287,7 @@ class ring_simple : public ring_slave { } friend class cq_mgr; - friend class cq_mgr_mlx5; + friend class cq_mgr_regrq; friend class cq_mgr_mlx5_strq; friend class qp_mgr; friend class qp_mgr_eth_mlx5; From fb6022feacb2c7995d147fa890bc3320be966b9e Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 27 Aug 2023 15:11:06 +0300 Subject: [PATCH 003/169] issue: 3514044 Renaming cq_mgr_mlx5_strq to cq_mgr_strq Signed-off-by: Alexander Grissik --- src/core/dev/cq_mgr_strq.cpp | 46 +++++++++++++-------------- src/core/dev/cq_mgr_strq.h | 14 ++++---- src/core/dev/qp_mgr.h | 2 +- src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp | 10 +++--- src/core/dev/ring_simple.h | 2 +- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/core/dev/cq_mgr_strq.cpp b/src/core/dev/cq_mgr_strq.cpp index b15684165..e0b797140 100644 --- a/src/core/dev/cq_mgr_strq.cpp +++ b/src/core/dev/cq_mgr_strq.cpp @@ -41,7 +41,7 @@ #include "ring_simple.h" #include -#define MODULE_NAME "cq_mgr_mlx5_strq" +#define MODULE_NAME "cq_mgr_strq" #define cq_logfunc __log_info_func #define cq_logdbg __log_info_dbg @@ -55,11 +55,11 @@ ##log_args); \ } while (0) -cq_mgr_mlx5_strq::cq_mgr_mlx5_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, - uint32_t cq_size, uint32_t stride_size_bytes, - uint32_t strides_num, - struct ibv_comp_channel *p_comp_event_channel, - bool call_configure) +cq_mgr_strq::cq_mgr_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, + uint32_t cq_size, uint32_t stride_size_bytes, + uint32_t strides_num, + struct ibv_comp_channel *p_comp_event_channel, + bool call_configure) : cq_mgr_regrq(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, true, call_configure) , _owner_ring(p_ring) , _stride_size_bytes(stride_size_bytes) @@ -73,7 +73,7 @@ cq_mgr_mlx5_strq::cq_mgr_mlx5_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx return_stride(next_stride()); // Fill _stride_cache } -cq_mgr_mlx5_strq::~cq_mgr_mlx5_strq() +cq_mgr_strq::~cq_mgr_strq() { cq_logfunc(""); cq_logdbg("destroying CQ STRQ"); @@ -100,7 +100,7 @@ cq_mgr_mlx5_strq::~cq_mgr_mlx5_strq() g_buffer_pool_rx_stride->put_buffers_thread_safe(&_stride_cache, _stride_cache.size()); } -mem_buf_desc_t *cq_mgr_mlx5_strq::next_stride() +mem_buf_desc_t *cq_mgr_strq::next_stride() { if 
(unlikely(_stride_cache.size() <= 0U)) { if (!g_buffer_pool_rx_stride->get_buffers_thread_safe( @@ -116,7 +116,7 @@ mem_buf_desc_t *cq_mgr_mlx5_strq::next_stride() return _stride_cache.get_and_pop_back(); } -void cq_mgr_mlx5_strq::return_stride(mem_buf_desc_t *desc) +void cq_mgr_strq::return_stride(mem_buf_desc_t *desc) { _stride_cache.push_back(desc); @@ -126,7 +126,7 @@ void cq_mgr_mlx5_strq::return_stride(mem_buf_desc_t *desc) } } -uint32_t cq_mgr_mlx5_strq::clean_cq() +uint32_t cq_mgr_strq::clean_cq() { uint32_t ret_total = 0; uint64_t cq_poll_sn = 0; @@ -155,7 +155,7 @@ uint32_t cq_mgr_mlx5_strq::clean_cq() return ret_total; } -bool cq_mgr_mlx5_strq::set_current_hot_buffer() +bool cq_mgr_strq::set_current_hot_buffer() { if (likely(m_qp->m_mlx5_qp.rq.tail != (m_qp->m_mlx5_qp.rq.head))) { uint32_t index = m_qp->m_mlx5_qp.rq.tail & (m_qp_rec.qp->m_rx_num_wr - 1); @@ -170,7 +170,7 @@ bool cq_mgr_mlx5_strq::set_current_hot_buffer() return false; } -mem_buf_desc_t *cq_mgr_mlx5_strq::poll(enum buff_status_e &status, mem_buf_desc_t *&buff_stride) +mem_buf_desc_t *cq_mgr_strq::poll(enum buff_status_e &status, mem_buf_desc_t *&buff_stride) { mem_buf_desc_t *buff = NULL; @@ -227,7 +227,7 @@ mem_buf_desc_t *cq_mgr_mlx5_strq::poll(enum buff_status_e &status, mem_buf_desc_ return buff; } -inline bool cq_mgr_mlx5_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, +inline bool cq_mgr_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, enum buff_status_e &status, bool &is_filler) { struct mlx5_err_cqe *ecqe; @@ -337,9 +337,9 @@ inline bool cq_mgr_mlx5_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cq return false; } -int cq_mgr_mlx5_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_desc_t *buff_wqe, - buff_status_e status, - uintptr_t *p_recycle_buffers_last_wr_id) +int cq_mgr_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_desc_t *buff_wqe, + buff_status_e status, + uintptr_t *p_recycle_buffers_last_wr_id) { int ret_total = 0; if (buff_wqe && (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) && @@ -377,7 +377,7 @@ int cq_mgr_mlx5_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_de return ret_total; } -int cq_mgr_mlx5_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id) +int cq_mgr_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id) { cq_logfuncall("cq was %s drained. %d processed wce since last check. %d wce in m_rx_queue", (m_b_was_drained ? "" : "not "), m_n_wce_counter, m_rx_queue.size()); @@ -424,7 +424,7 @@ int cq_mgr_mlx5_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id return ret_total; } -mem_buf_desc_t *cq_mgr_mlx5_strq::process_strq_cq_element_rx(mem_buf_desc_t *p_mem_buf_desc, +mem_buf_desc_t *cq_mgr_strq::process_strq_cq_element_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status) { /* Assume locked!!! */ @@ -449,7 +449,7 @@ mem_buf_desc_t *cq_mgr_mlx5_strq::process_strq_cq_element_rx(mem_buf_desc_t *p_m return p_mem_buf_desc; } -mem_buf_desc_t *cq_mgr_mlx5_strq::poll_and_process_socketxtreme() +mem_buf_desc_t *cq_mgr_strq::poll_and_process_socketxtreme() { buff_status_e status = BS_OK; mem_buf_desc_t *buff = nullptr; @@ -462,7 +462,7 @@ mem_buf_desc_t *cq_mgr_mlx5_strq::poll_and_process_socketxtreme() return (buff && cqe_process_rx(buff, status) ? 
buff : nullptr); } -int cq_mgr_mlx5_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) +int cq_mgr_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) { /* Assume locked!!! */ cq_logfuncall(""); @@ -512,7 +512,7 @@ int cq_mgr_mlx5_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void * return ret_rx_processed; } -void cq_mgr_mlx5_strq::add_qp_rx(qp_mgr *qp) +void cq_mgr_strq::add_qp_rx(qp_mgr *qp) { cq_logfunc(""); set_qp_rq(qp); @@ -521,7 +521,7 @@ void cq_mgr_mlx5_strq::add_qp_rx(qp_mgr *qp) cq_mgr::add_qp_rx(qp); } -void cq_mgr_mlx5_strq::statistics_print() +void cq_mgr_strq::statistics_print() { cq_mgr::statistics_print(); cq_logdbg_no_funcname("RWQE consumed: %12" PRIu64, m_p_cq_stat->n_rx_consumed_rwqe_count); @@ -533,7 +533,7 @@ void cq_mgr_mlx5_strq::statistics_print() cq_logdbg_no_funcname("LRO bytes: %12" PRIu64, m_p_cq_stat->n_rx_lro_bytes); } -void cq_mgr_mlx5_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) +void cq_mgr_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) { if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) { if (likely(buff->p_desc_owner == m_p_ring)) { diff --git a/src/core/dev/cq_mgr_strq.h b/src/core/dev/cq_mgr_strq.h index c5006c1ef..9a374e527 100644 --- a/src/core/dev/cq_mgr_strq.h +++ b/src/core/dev/cq_mgr_strq.h @@ -30,20 +30,20 @@ * SOFTWARE. */ -#ifndef CQ_MGR_MLX5_STRQ_H -#define CQ_MGR_MLX5_STRQ_H +#ifndef CQ_MGR_STRQ_H +#define CQ_MGR_STRQ_H #include #include #include "cq_mgr_regrq.h" -class cq_mgr_mlx5_strq : public cq_mgr_regrq { +class cq_mgr_strq : public cq_mgr_regrq { public: - cq_mgr_mlx5_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - uint32_t stride_size_bytes, uint32_t strides_num, - struct ibv_comp_channel *p_comp_event_channel, bool call_configure = true); + cq_mgr_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, + uint32_t stride_size_bytes, uint32_t strides_num, + struct ibv_comp_channel *p_comp_event_channel, bool call_configure = true); - virtual ~cq_mgr_mlx5_strq() override; + virtual ~cq_mgr_strq() override; virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) override; virtual mem_buf_desc_t *poll_and_process_socketxtreme() override; diff --git a/src/core/dev/qp_mgr.h b/src/core/dev/qp_mgr.h index e391b9a06..4cf883ef4 100644 --- a/src/core/dev/qp_mgr.h +++ b/src/core/dev/qp_mgr.h @@ -154,7 +154,7 @@ class xlio_ti { class qp_mgr { friend class cq_mgr; friend class cq_mgr_regrq; - friend class cq_mgr_mlx5_strq; + friend class cq_mgr_strq; friend class cq_mgr_mp; public: diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp index e793fb3a7..a976bdf1c 100644 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp @@ -293,11 +293,11 @@ cq_mgr *qp_mgr_eth_mlx5_dpcp::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_ return (!init_rx_cq_mgr_prepare() ? 
nullptr - : new cq_mgr_mlx5_strq(m_p_ring, m_p_ib_ctx_handler, - safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, - safe_mce_sys().strq_stride_size_bytes, - safe_mce_sys().strq_stride_num_per_rwqe, - p_rx_comp_event_channel, true)); + : new cq_mgr_strq(m_p_ring, m_p_ib_ctx_handler, + safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, + safe_mce_sys().strq_stride_size_bytes, + safe_mce_sys().strq_stride_num_per_rwqe, + p_rx_comp_event_channel, true)); } void qp_mgr_eth_mlx5_dpcp::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index 476d635d8..67d81c90a 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -288,7 +288,7 @@ class ring_simple : public ring_slave { friend class cq_mgr; friend class cq_mgr_regrq; - friend class cq_mgr_mlx5_strq; + friend class cq_mgr_strq; friend class qp_mgr; friend class qp_mgr_eth_mlx5; friend class qp_mgr_eth_mlx5_dpcp; From f21880d6f05b6e2b9cb89b3af73cc5333ab07848 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 27 Aug 2023 15:37:52 +0300 Subject: [PATCH 004/169] issue: 3514044 Moving cq_mgr_regrq tx methods to cq_mgr Signed-off-by: Alexander Grissik --- src/core/dev/cq_mgr.cpp | 123 ++++++++++++++++++++++++++------- src/core/dev/cq_mgr.h | 72 ++++++++++++++++++- src/core/dev/cq_mgr_regrq.cpp | 95 ------------------------- src/core/dev/cq_mgr_regrq.h | 80 +++------------------ src/core/dev/qp_mgr_eth_mlx5.h | 1 + 5 files changed, 176 insertions(+), 195 deletions(-) diff --git a/src/core/dev/cq_mgr.cpp b/src/core/dev/cq_mgr.cpp index baf8b8f90..f36cdb7d7 100644 --- a/src/core/dev/cq_mgr.cpp +++ b/src/core/dev/cq_mgr.cpp @@ -48,6 +48,7 @@ #include "buffer_pool.h" #include "qp_mgr.h" #include "ring_simple.h" +#include "qp_mgr_eth_mlx5.h" #define MODULE_NAME "cqm" @@ -72,7 +73,8 @@ uint64_t cq_mgr::m_n_global_sn = 0; cq_mgr::cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool config) - : m_p_ibv_cq(NULL) + : m_qp(NULL) + , m_p_ibv_cq(NULL) , m_b_is_rx(is_rx) , m_cq_id(0) , m_n_cq_poll_sn(0) @@ -114,6 +116,8 @@ cq_mgr::cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_siz if (config) { configure(cq_size); } + + memset(&m_mlx5_cq, 0, sizeof(m_mlx5_cq)); } void cq_mgr::configure(int cq_size) @@ -677,31 +681,6 @@ int cq_mgr::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_read return ret_rx_processed; } -int cq_mgr::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) -{ - // Assume locked!!! 
- cq_logfuncall(""); - - /* coverity[stack_use_local_overflow] */ - xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - int ret = poll(wce, m_n_sysvar_cq_poll_batch_max, p_cq_poll_sn); - if (ret > 0) { - m_n_wce_counter += ret; - if (ret < (int)m_n_sysvar_cq_poll_batch_max) { - m_b_was_drained = true; - } - - for (int i = 0; i < ret; i++) { - mem_buf_desc_t *buff = cqe_log_and_get_buf_tx((&wce[i])); - if (buff) { - process_tx_buffer_list(buff); - } - } - } - - return ret; -} - bool cq_mgr::reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst) { if (m_rx_buffs_rdy_for_free_head) { @@ -988,3 +967,95 @@ cq_mgr *get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel) return p_cq_mgr; } + +int cq_mgr::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) +{ + cq_logfuncall(""); + + static auto is_error_opcode = [&](uint8_t opcode) { + return opcode == MLX5_CQE_REQ_ERR || opcode == MLX5_CQE_RESP_ERR; + }; + + int ret = 0; + uint32_t num_polled_cqes = 0; + xlio_mlx5_cqe *cqe = get_cqe_tx(num_polled_cqes); + + if (likely(cqe)) { + unsigned index = ntohs(cqe->wqe_counter) & (m_qp->m_tx_num_wr - 1); + + // All error opcodes have the most significant bit set. + if (unlikely(cqe->op_own & 0x80) && is_error_opcode(cqe->op_own >> 4)) { + m_p_cq_stat->n_rx_cqe_error++; + log_cqe_error(cqe); + } + + handle_sq_wqe_prop(index); + ret = 1; + } + update_global_sn(*p_cq_poll_sn, num_polled_cqes); + + return ret; +} + +void cq_mgr::log_cqe_error(struct xlio_mlx5_cqe *cqe) +{ + struct mlx5_err_cqe *ecqe = (struct mlx5_err_cqe *)cqe; + + /* TODO We can also ask qp_mgr to log WQE fields from SQ. But at first, we need to remove + * prefetch and memset of the next WQE there. Credit system will guarantee that we don't + * reuse the WQE at this point. + */ + + if (MLX5_CQE_SYNDROME_WR_FLUSH_ERR != ecqe->syndrome) { + cq_logwarn("cqe: syndrome=0x%x vendor=0x%x hw=0x%x (type=0x%x) wqe_opcode_qpn=0x%x " + "wqe_counter=0x%x", + ecqe->syndrome, ecqe->vendor_err_synd, *((uint8_t *)&ecqe->rsvd1 + 16), + *((uint8_t *)&ecqe->rsvd1 + 17), ntohl(ecqe->s_wqe_opcode_qpn), + ntohs(ecqe->wqe_counter)); + } +} + +void cq_mgr::handle_sq_wqe_prop(unsigned index) +{ + sq_wqe_prop *p = &m_qp->m_sq_wqe_idx_to_prop[index]; + sq_wqe_prop *prev; + unsigned credits = 0; + + /* + * TX completions can be signalled for a set of WQEs as an optimization. + * Therefore, for every TX completion we may need to handle multiple + * WQEs. Since every WQE can have various size and the WQE index is + * wrapped around, we build a linked list to simplify things. Each + * element of the linked list represents properties of a previously + * posted WQE. + * + * We keep index of the last completed WQE and stop processing the list + * when we reach the index. This condition is checked in + * is_sq_wqe_prop_valid(). 
+ */ + + do { + if (p->buf) { + m_p_ring->mem_buf_desc_return_single_locked(p->buf); + } + if (p->ti) { + xlio_ti *ti = p->ti; + if (ti->m_callback) { + ti->m_callback(ti->m_callback_arg); + } + + ti->put(); + if (unlikely(ti->m_released && ti->m_ref == 0)) { + m_qp->ti_released(ti); + } + } + credits += p->credits; + + prev = p; + p = p->next; + } while (p != NULL && m_qp->is_sq_wqe_prop_valid(p, prev)); + + m_p_ring->return_tx_pool_to_global_pool(); + m_qp->credits_return(credits); + m_qp->m_sq_wqe_prop_last_signalled = index; +} diff --git a/src/core/dev/cq_mgr.h b/src/core/dev/cq_mgr.h index 1e10900bc..2fc90e257 100644 --- a/src/core/dev/cq_mgr.h +++ b/src/core/dev/cq_mgr.h @@ -57,6 +57,7 @@ class net_device_mgr; class ring; class qp_mgr; +class qp_mgr_eth_mlx5; class ring_simple; #define LOCAL_IF_INFO_INVALID \ @@ -136,7 +137,7 @@ class cq_mgr { * < 0 error */ virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL); - virtual int poll_and_process_element_tx(uint64_t *p_cq_poll_sn); + int poll_and_process_element_tx(uint64_t *p_cq_poll_sn); virtual mem_buf_desc_t *poll_and_process_socketxtreme() { return nullptr; }; /** @@ -185,7 +186,9 @@ class cq_mgr { int poll(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn); void compensate_qp_poll_failed(); inline void process_recv_buffer(mem_buf_desc_t *buff, void *pv_fd_ready_array = NULL); - + + inline void update_global_sn(uint64_t &cq_poll_sn, uint32_t rettotal); + /* Process a WCE... meaning... * - extract the mem_buf_desc from the wce.wr_id and then loop on all linked mem_buf_desc * and deliver them to their owner for further processing (sockinfo on Tx path and ib_conn_mgr @@ -207,6 +210,8 @@ class cq_mgr { // returns list of buffers to the owner. void process_tx_buffer_list(mem_buf_desc_t *p_mem_buf_desc); + xlio_ib_mlx5_cq_t m_mlx5_cq; + qp_mgr_eth_mlx5 *m_qp; struct ibv_cq *m_p_ibv_cq; bool m_b_is_rx; descq_t m_rx_queue; @@ -247,6 +252,12 @@ class cq_mgr { cq_stats_t m_cq_stat_static; static atomic_t m_n_cq_id_counter; + inline struct xlio_mlx5_cqe *get_cqe_tx(uint32_t &num_polled_cqes); + + void log_cqe_error(struct xlio_mlx5_cqe *cqe); + + void handle_sq_wqe_prop(unsigned index); + void handle_tcp_ctl_packets(uint32_t rx_processed, void *pv_fd_ready_array); // requests safe_mce_sys().qp_compensation_level buffers from global pool @@ -270,4 +281,61 @@ class cq_mgr { // Since we have a single TX CQ comp channel for all cq_mgr's, it might not be the active_cq object cq_mgr *get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel); +#if defined(DEFINED_DIRECT_VERBS) + +inline void cq_mgr::update_global_sn(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) +{ + if (num_polled_cqes > 0) { + // spoil the global sn if we have packets ready + union __attribute__((packed)) { + uint64_t global_sn; + struct { + uint32_t cq_id; + uint32_t cq_sn; + } bundle; + } next_sn; + m_n_cq_poll_sn += num_polled_cqes; + next_sn.bundle.cq_sn = m_n_cq_poll_sn; + next_sn.bundle.cq_id = m_cq_id; + + m_n_global_sn = next_sn.global_sn; + } + + cq_poll_sn = m_n_global_sn; +} + +inline struct xlio_mlx5_cqe *cq_mgr::get_cqe_tx(uint32_t &num_polled_cqes) +{ + struct xlio_mlx5_cqe *cqe_ret = nullptr; + struct xlio_mlx5_cqe *cqe = + (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) + << m_mlx5_cq.cqe_size_log)); + + /* According to PRM, SW ownership bit flips with every CQ overflow. 
Since cqe_count is + * a power of 2, we use it to get cq_ci bit just after the significant bits. The bit changes + * with each CQ overflow and actually equals to the SW ownership bit. + */ + while (((cqe->op_own & MLX5_CQE_OWNER_MASK) == !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count)) && + ((cqe->op_own >> 4) != MLX5_CQE_INVALID)) { + ++m_mlx5_cq.cq_ci; + ++num_polled_cqes; + cqe_ret = cqe; + if (unlikely(cqe->op_own & 0x80)) { + // This is likely an error CQE. Return it explicitly to log the errors. + break; + } + cqe = (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) + << m_mlx5_cq.cqe_size_log)); + } + if (cqe_ret) { + rmb(); + *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci); + } + return cqe_ret; +} + +#endif /* DEFINED_DIRECT_VERBS */ + #endif // CQ_MGR_H diff --git a/src/core/dev/cq_mgr_regrq.cpp b/src/core/dev/cq_mgr_regrq.cpp index e31707fd9..87fc90124 100644 --- a/src/core/dev/cq_mgr_regrq.cpp +++ b/src/core/dev/cq_mgr_regrq.cpp @@ -55,12 +55,9 @@ cq_mgr_regrq::cq_mgr_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool call_configure) : cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, is_rx, call_configure) - , m_qp(NULL) , m_rx_hot_buffer(NULL) { cq_logfunc(""); - - memset(&m_mlx5_cq, 0, sizeof(m_mlx5_cq)); } uint32_t cq_mgr_regrq::clean_cq() @@ -451,98 +448,6 @@ int cq_mgr_regrq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_f return ret_rx_processed; } -void cq_mgr_regrq::log_cqe_error(struct xlio_mlx5_cqe *cqe) -{ - struct mlx5_err_cqe *ecqe = (struct mlx5_err_cqe *)cqe; - - /* TODO We can also ask qp_mgr to log WQE fields from SQ. But at first, we need to remove - * prefetch and memset of the next WQE there. Credit system will guarantee that we don't - * reuse the WQE at this point. - */ - - if (MLX5_CQE_SYNDROME_WR_FLUSH_ERR != ecqe->syndrome) { - cq_logwarn("cqe: syndrome=0x%x vendor=0x%x hw=0x%x (type=0x%x) wqe_opcode_qpn=0x%x " - "wqe_counter=0x%x", - ecqe->syndrome, ecqe->vendor_err_synd, *((uint8_t *)&ecqe->rsvd1 + 16), - *((uint8_t *)&ecqe->rsvd1 + 17), ntohl(ecqe->s_wqe_opcode_qpn), - ntohs(ecqe->wqe_counter)); - } -} - -void cq_mgr_regrq::handle_sq_wqe_prop(unsigned index) -{ - sq_wqe_prop *p = &m_qp->m_sq_wqe_idx_to_prop[index]; - sq_wqe_prop *prev; - unsigned credits = 0; - - /* - * TX completions can be signalled for a set of WQEs as an optimization. - * Therefore, for every TX completion we may need to handle multiple - * WQEs. Since every WQE can have various size and the WQE index is - * wrapped around, we build a linked list to simplify things. Each - * element of the linked list represents properties of a previously - * posted WQE. - * - * We keep index of the last completed WQE and stop processing the list - * when we reach the index. This condition is checked in - * is_sq_wqe_prop_valid(). 
- */ - - do { - if (p->buf) { - m_p_ring->mem_buf_desc_return_single_locked(p->buf); - } - if (p->ti) { - xlio_ti *ti = p->ti; - if (ti->m_callback) { - ti->m_callback(ti->m_callback_arg); - } - - ti->put(); - if (unlikely(ti->m_released && ti->m_ref == 0)) { - m_qp->ti_released(ti); - } - } - credits += p->credits; - - prev = p; - p = p->next; - } while (p != NULL && m_qp->is_sq_wqe_prop_valid(p, prev)); - - m_p_ring->return_tx_pool_to_global_pool(); - m_qp->credits_return(credits); - m_qp->m_sq_wqe_prop_last_signalled = index; -} - -int cq_mgr_regrq::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) -{ - cq_logfuncall(""); - - static auto is_error_opcode = [&](uint8_t opcode) { - return opcode == MLX5_CQE_REQ_ERR || opcode == MLX5_CQE_RESP_ERR; - }; - - int ret = 0; - uint32_t num_polled_cqes = 0; - xlio_mlx5_cqe *cqe = get_cqe_tx(num_polled_cqes); - - if (likely(cqe)) { - unsigned index = ntohs(cqe->wqe_counter) & (m_qp->m_tx_num_wr - 1); - - // All error opcodes have the most significant bit set. - if (unlikely(cqe->op_own & 0x80) && is_error_opcode(cqe->op_own >> 4)) { - m_p_cq_stat->n_rx_cqe_error++; - log_cqe_error(cqe); - } - - handle_sq_wqe_prop(index); - ret = 1; - } - update_global_sn(*p_cq_poll_sn, num_polled_cqes); - - return ret; -} - void cq_mgr_regrq::set_qp_rq(qp_mgr *qp) { m_qp = static_cast(qp); diff --git a/src/core/dev/cq_mgr_regrq.h b/src/core/dev/cq_mgr_regrq.h index 3119b4369..b8021fbaf 100644 --- a/src/core/dev/cq_mgr_regrq.h +++ b/src/core/dev/cq_mgr_regrq.h @@ -34,12 +34,9 @@ #define CQ_MGR_REGRQ_H #include "cq_mgr.h" -#include "qp_mgr_eth_mlx5.h" #if defined(DEFINED_DIRECT_VERBS) -class qp_mgr_eth_mlx5; - /* Get CQE opcode. */ #define MLX5_CQE_OPCODE(op_own) ((op_own) >> 4) @@ -64,92 +61,31 @@ class cq_mgr_regrq : public cq_mgr { virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL); virtual mem_buf_desc_t *poll_and_process_socketxtreme(); virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL); - virtual int poll_and_process_element_tx(uint64_t *p_cq_poll_sn); - mem_buf_desc_t *cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status); + mem_buf_desc_t *cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status); // MOVE virtual void add_qp_rx(qp_mgr *qp); - void set_qp_rq(qp_mgr *qp); - virtual void add_qp_tx(qp_mgr *qp); + void set_qp_rq(qp_mgr *qp); // MOVE + virtual void add_qp_tx(qp_mgr *qp); // MOVE virtual uint32_t clean_cq(); - virtual void get_cq_event(int count = 1) { xlio_ib_mlx5_get_cq_event(&m_mlx5_cq, count); }; + virtual void get_cq_event(int count = 1) { xlio_ib_mlx5_get_cq_event(&m_mlx5_cq, count); }; // MOVE protected: - qp_mgr_eth_mlx5 *m_qp; - xlio_ib_mlx5_cq_t m_mlx5_cq; - mem_buf_desc_t *m_rx_hot_buffer; + mem_buf_desc_t *m_rx_hot_buffer; // MOVE - inline struct xlio_mlx5_cqe *check_cqe(void); + inline struct xlio_mlx5_cqe *check_cqe(void); // MOVE mem_buf_desc_t *poll(enum buff_status_e &status); - inline struct xlio_mlx5_cqe *get_cqe_tx(uint32_t &num_polled_cqes); - void log_cqe_error(struct xlio_mlx5_cqe *cqe); inline void cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc, enum buff_status_e &status); - inline void update_global_sn(uint64_t &cq_poll_sn, uint32_t rettotal); - void lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc); + void lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc); // MOVE private: - void handle_sq_wqe_prop(unsigned index); - int 
drain_and_proccess_socketxtreme(uintptr_t *p_recycle_buffers_last_wr_id); int drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e status, uintptr_t *p_recycle_buffers_last_wr_id); - virtual int req_notify_cq() { return xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0); }; + virtual int req_notify_cq() { return xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0); }; // MOVE }; -inline void cq_mgr_regrq::update_global_sn(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) -{ - if (num_polled_cqes > 0) { - // spoil the global sn if we have packets ready - union __attribute__((packed)) { - uint64_t global_sn; - struct { - uint32_t cq_id; - uint32_t cq_sn; - } bundle; - } next_sn; - m_n_cq_poll_sn += num_polled_cqes; - next_sn.bundle.cq_sn = m_n_cq_poll_sn; - next_sn.bundle.cq_id = m_cq_id; - - m_n_global_sn = next_sn.global_sn; - } - - cq_poll_sn = m_n_global_sn; -} - -inline struct xlio_mlx5_cqe *cq_mgr_regrq::get_cqe_tx(uint32_t &num_polled_cqes) -{ - struct xlio_mlx5_cqe *cqe_ret = nullptr; - struct xlio_mlx5_cqe *cqe = - (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + - ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) - << m_mlx5_cq.cqe_size_log)); - - /* According to PRM, SW ownership bit flips with every CQ overflow. Since cqe_count is - * a power of 2, we use it to get cq_ci bit just after the significant bits. The bit changes - * with each CQ overflow and actually equals to the SW ownership bit. - */ - while (((cqe->op_own & MLX5_CQE_OWNER_MASK) == !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count)) && - ((cqe->op_own >> 4) != MLX5_CQE_INVALID)) { - ++m_mlx5_cq.cq_ci; - ++num_polled_cqes; - cqe_ret = cqe; - if (unlikely(cqe->op_own & 0x80)) { - // This is likely an error CQE. Return it explicitly to log the errors. - break; - } - cqe = (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + - ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) - << m_mlx5_cq.cqe_size_log)); - } - if (cqe_ret) { - rmb(); - *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci); - } - return cqe_ret; -} - inline struct xlio_mlx5_cqe *cq_mgr_regrq::check_cqe(void) { struct xlio_mlx5_cqe *cqe = diff --git a/src/core/dev/qp_mgr_eth_mlx5.h b/src/core/dev/qp_mgr_eth_mlx5.h index 28a500058..4454a4faa 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.h +++ b/src/core/dev/qp_mgr_eth_mlx5.h @@ -62,6 +62,7 @@ struct sq_wqe_prop { typedef struct sq_wqe_prop sq_wqe_prop; class qp_mgr_eth_mlx5 : public qp_mgr_eth { + friend class cq_mgr; friend class cq_mgr_regrq; public: From ea4a4e623797da2658793c2fb7cdbd378636adf2 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 27 Aug 2023 15:40:16 +0300 Subject: [PATCH 005/169] issue: 3514044 Moving cq_mgr_regrq events to cq_mgr Signed-off-by: Alexander Grissik --- src/core/dev/cq_mgr.h | 6 +++--- src/core/dev/cq_mgr_regrq.h | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/core/dev/cq_mgr.h b/src/core/dev/cq_mgr.h index 2fc90e257..415e80e8e 100644 --- a/src/core/dev/cq_mgr.h +++ b/src/core/dev/cq_mgr.h @@ -173,7 +173,7 @@ class cq_mgr { // unmaps the qpn and vlan id void unmap_vlan_and_qpn(int qp_num, uint16_t vlan_id); - virtual void get_cq_event(int count = 1) { NOT_IN_USE(count); }; + void get_cq_event(int count = 1) { xlio_ib_mlx5_get_cq_event(&m_mlx5_cq, count); }; protected: /** @@ -188,7 +188,7 @@ class cq_mgr { inline void process_recv_buffer(mem_buf_desc_t *buff, void *pv_fd_ready_array = NULL); inline void update_global_sn(uint64_t &cq_poll_sn, uint32_t rettotal); - + /* Process a WCE... meaning... 
* - extract the mem_buf_desc from the wce.wr_id and then loop on all linked mem_buf_desc * and deliver them to their owner for further processing (sockinfo on Tx path and ib_conn_mgr @@ -274,7 +274,7 @@ class cq_mgr { void process_cq_element_log_helper(mem_buf_desc_t *p_mem_buf_desc, xlio_ibv_wc *p_wce); - virtual int req_notify_cq() { return ibv_req_notify_cq(m_p_ibv_cq, 0); }; + int req_notify_cq() { return xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0); }; }; // Helper gunction to extract the Tx cq_mgr from the CQ event, diff --git a/src/core/dev/cq_mgr_regrq.h b/src/core/dev/cq_mgr_regrq.h index b8021fbaf..b64404c03 100644 --- a/src/core/dev/cq_mgr_regrq.h +++ b/src/core/dev/cq_mgr_regrq.h @@ -67,7 +67,6 @@ class cq_mgr_regrq : public cq_mgr { void set_qp_rq(qp_mgr *qp); // MOVE virtual void add_qp_tx(qp_mgr *qp); // MOVE virtual uint32_t clean_cq(); - virtual void get_cq_event(int count = 1) { xlio_ib_mlx5_get_cq_event(&m_mlx5_cq, count); }; // MOVE protected: mem_buf_desc_t *m_rx_hot_buffer; // MOVE @@ -82,8 +81,6 @@ class cq_mgr_regrq : public cq_mgr { private: int drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e status, uintptr_t *p_recycle_buffers_last_wr_id); - - virtual int req_notify_cq() { return xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0); }; // MOVE }; inline struct xlio_mlx5_cqe *cq_mgr_regrq::check_cqe(void) From 8f5999ca5cf36740568de3ba7bc8e8bb5ad81c47 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 27 Aug 2023 15:42:59 +0300 Subject: [PATCH 006/169] issue: 3514044 Moving cq_mgr_regrq add_qp_tx to cq_mgr Signed-off-by: Alexander Grissik --- src/core/dev/cq_mgr.cpp | 9 +++++++++ src/core/dev/cq_mgr.h | 2 +- src/core/dev/cq_mgr_regrq.cpp | 14 -------------- src/core/dev/cq_mgr_regrq.h | 1 - 4 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/core/dev/cq_mgr.cpp b/src/core/dev/cq_mgr.cpp index f36cdb7d7..4ab50c9cb 100644 --- a/src/core/dev/cq_mgr.cpp +++ b/src/core/dev/cq_mgr.cpp @@ -331,6 +331,15 @@ void cq_mgr::add_qp_tx(qp_mgr *qp) cq_logdbg("qp_mgr=%p", qp); m_qp_rec.qp = qp; m_qp_rec.debt = 0; + + m_qp = static_cast(qp); + + if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { + cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); + } + + cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, + m_mlx5_cq.cq_buf); } void cq_mgr::del_qp_tx(qp_mgr *qp) diff --git a/src/core/dev/cq_mgr.h b/src/core/dev/cq_mgr.h index 415e80e8e..24f21ed5d 100644 --- a/src/core/dev/cq_mgr.h +++ b/src/core/dev/cq_mgr.h @@ -157,7 +157,7 @@ class cq_mgr { virtual void add_qp_rx(qp_mgr *qp); virtual void del_qp_rx(qp_mgr *qp); - virtual void add_qp_tx(qp_mgr *qp); + void add_qp_tx(qp_mgr *qp); virtual void del_qp_tx(qp_mgr *qp); virtual uint32_t clean_cq(); diff --git a/src/core/dev/cq_mgr_regrq.cpp b/src/core/dev/cq_mgr_regrq.cpp index 87fc90124..dca001e8a 100644 --- a/src/core/dev/cq_mgr_regrq.cpp +++ b/src/core/dev/cq_mgr_regrq.cpp @@ -470,20 +470,6 @@ void cq_mgr_regrq::add_qp_rx(qp_mgr *qp) cq_mgr::add_qp_rx(qp); } -void cq_mgr_regrq::add_qp_tx(qp_mgr *qp) -{ - // Assume locked! 
- cq_mgr::add_qp_tx(qp); - m_qp = static_cast(qp); - - if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { - cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); - } - - cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, - m_mlx5_cq.cq_buf); -} - void cq_mgr_regrq::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc) { struct ethhdr *p_eth_h = (struct ethhdr *)(p_rx_wc_buf_desc->p_buffer); diff --git a/src/core/dev/cq_mgr_regrq.h b/src/core/dev/cq_mgr_regrq.h index b64404c03..047209416 100644 --- a/src/core/dev/cq_mgr_regrq.h +++ b/src/core/dev/cq_mgr_regrq.h @@ -65,7 +65,6 @@ class cq_mgr_regrq : public cq_mgr { mem_buf_desc_t *cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status); // MOVE virtual void add_qp_rx(qp_mgr *qp); void set_qp_rq(qp_mgr *qp); // MOVE - virtual void add_qp_tx(qp_mgr *qp); // MOVE virtual uint32_t clean_cq(); protected: From 16f67fc0de19ce682ee75d27c8b9c92cd1fa1eae Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 27 Aug 2023 16:17:46 +0300 Subject: [PATCH 007/169] issue: 3514044 Moving cq_mgr_regrq RX common to cq_mgr Signed-off-by: Alexander Grissik --- src/core/dev/cq_mgr.cpp | 340 +++++++++------------------------- src/core/dev/cq_mgr.h | 50 ++++- src/core/dev/cq_mgr_regrq.cpp | 148 +++++---------- src/core/dev/cq_mgr_regrq.h | 50 +---- src/core/dev/cq_mgr_strq.cpp | 2 +- src/core/dev/cq_mgr_strq.h | 6 +- src/core/dev/qp_mgr.cpp | 40 ---- src/core/dev/qp_mgr.h | 7 +- 8 files changed, 180 insertions(+), 463 deletions(-) diff --git a/src/core/dev/cq_mgr.cpp b/src/core/dev/cq_mgr.cpp index 4ab50c9cb..4430b3c50 100644 --- a/src/core/dev/cq_mgr.cpp +++ b/src/core/dev/cq_mgr.cpp @@ -74,6 +74,7 @@ uint64_t cq_mgr::m_n_global_sn = 0; cq_mgr::cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool config) : m_qp(NULL) + , m_rx_hot_buffer(NULL) , m_p_ibv_cq(NULL) , m_b_is_rx(is_rx) , m_cq_id(0) @@ -178,31 +179,6 @@ void cq_mgr::prep_ibv_cq(xlio_ibv_cq_init_attr &attr) const } } -uint32_t cq_mgr::clean_cq() -{ - uint32_t ret_total = 0; - int ret = 0; - uint64_t cq_poll_sn = 0; - mem_buf_desc_t *buff = NULL; - /* coverity[stack_use_local_overflow] */ - xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - while ((ret = poll(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn)) > 0) { - for (int i = 0; i < ret; i++) { - if (m_b_is_rx) { - buff = cqe_process_rx(&wce[i]); - } else { - buff = cqe_log_and_get_buf_tx(&wce[i]); - } - if (buff) { - m_rx_queue.push_back(buff); - } - } - ret_total += ret; - } - - return ret_total; -} - cq_mgr::~cq_mgr() { cq_logfunc(""); @@ -262,6 +238,21 @@ int cq_mgr::get_channel_fd() return m_comp_event_channel->fd; } +void cq_mgr::set_qp_rq(qp_mgr *qp) +{ + m_qp = static_cast(qp); + + m_qp->m_rq_wqe_counter = 0; // In case of bonded qp, wqe_counter must be reset to zero + m_rx_hot_buffer = NULL; + + if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { + cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); + } + VALGRIND_MAKE_MEM_DEFINED(&m_mlx5_cq, sizeof(m_mlx5_cq)); + cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, + m_mlx5_cq.cq_buf); +} + void cq_mgr::add_qp_rx(qp_mgr *qp) { cq_logdbg("qp_mgr=%p", qp); @@ -355,6 +346,60 @@ void cq_mgr::del_qp_tx(qp_mgr *qp) memset(&m_qp_rec, 0, sizeof(m_qp_rec)); } +void cq_mgr::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc) +{ + struct ethhdr *p_eth_h = (struct ethhdr 
*)(p_rx_wc_buf_desc->p_buffer); + struct tcphdr *p_tcp_h; + size_t transport_header_len = ETH_HDR_LEN; + + if (p_eth_h->h_proto == htons(ETH_P_8021Q)) { + transport_header_len = ETH_VLAN_HDR_LEN; + } + + if (0x02 == ((cqe->l4_hdr_type_etc >> 2) & 0x3)) { + // CQE indicates IPv4 in the l3_hdr_type field + struct iphdr *p_ip_h = (struct iphdr *)(p_rx_wc_buf_desc->p_buffer + transport_header_len); + + assert(p_ip_h->version == IPV4_VERSION); + assert(p_ip_h->protocol == IPPROTO_TCP); + + p_ip_h->ttl = cqe->lro_min_ttl; + p_ip_h->tot_len = htons(ntohl(cqe->byte_cnt) - transport_header_len); + p_ip_h->check = 0; // Ignore. + + p_tcp_h = (struct tcphdr *)((uint8_t *)p_ip_h + (int)(p_ip_h->ihl) * 4); + } else { + // Assume LRO can happen for either IPv4 or IPv6 L3 protocol. Skip checking l3_hdr_type. + struct ip6_hdr *p_ip6_h = + (struct ip6_hdr *)(p_rx_wc_buf_desc->p_buffer + transport_header_len); + + assert(0x01 == ((cqe->l4_hdr_type_etc >> 2) & 0x3)); // IPv6 L3 header. + assert(ip_header_version(p_ip6_h) == IPV6); + assert(p_ip6_h->ip6_nxt == IPPROTO_TCP); + assert(ntohl(cqe->byte_cnt) >= transport_header_len + IPV6_HLEN); + + p_ip6_h->ip6_hlim = cqe->lro_min_ttl; + // Payload length doesn't include main header. + p_ip6_h->ip6_plen = htons(ntohl(cqe->byte_cnt) - transport_header_len - IPV6_HLEN); + + // LRO doesn't create a session for packets with extension headers, so IPv6 header is 40b. + p_tcp_h = (struct tcphdr *)((uint8_t *)p_ip6_h + IPV6_HLEN); + } + + p_tcp_h->psh = !!(cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_TCP_PUSH_MASK); + + /* TCP packet flag is set, and packet carries no data or + * TCP packet flag is set, and packet carries data + */ + if ((0x03 == ((cqe->l4_hdr_type_etc >> 4) & 0x7)) || + (0x04 == ((cqe->l4_hdr_type_etc >> 4) & 0x7))) { + p_tcp_h->ack = 1; + p_tcp_h->ack_seq = cqe->lro_ack_seq_num; + p_tcp_h->window = cqe->lro_tcp_win; + p_tcp_h->check = 0; // Ignore. + } +} + bool cq_mgr::request_more_buffers() { cq_logfuncall("Allocating additional %d buffers for internal use", @@ -385,48 +430,6 @@ void cq_mgr::return_extra_buffers() m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); } -int cq_mgr::poll(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn) -{ - // Assume locked!!! 
- cq_logfuncall(""); - - int ret = xlio_ibv_poll_cq(m_p_ibv_cq, num_entries, p_wce); - if (ret <= 0) { - // Zero polled wce OR ibv_poll_cq() has driver specific errors - // so we can't really do anything with them - *p_cq_poll_sn = m_n_global_sn; - return 0; - } - - if (unlikely(g_vlogger_level >= VLOG_FUNC_ALL)) { - for (int i = 0; i < ret; i++) { - cq_logfuncall("wce[%d] info wr_id=%x, status=%x, opcode=%x, vendor_err=%x, " - "byte_len=%d, imm_data=%x", - i, p_wce[i].wr_id, p_wce[i].status, xlio_wc_opcode(p_wce[i]), - p_wce[i].vendor_err, p_wce[i].byte_len, p_wce[i].imm_data); - cq_logfuncall("qp_num=%x, src_qp=%x, wc_flags=%x, pkey_index=%x, slid=%x, sl=%x, " - "dlid_path_bits=%x", - p_wce[i].qp_num, p_wce[i].src_qp, xlio_wc_flags(p_wce[i]), - p_wce[i].pkey_index, p_wce[i].slid, p_wce[i].sl, p_wce[i].dlid_path_bits); - } - } - - // spoil the global sn if we have packets ready - union __attribute__((packed)) { - uint64_t global_sn; - struct { - uint32_t cq_id; - uint32_t cq_sn; - } bundle; - } next_sn; - next_sn.bundle.cq_sn = ++m_n_cq_poll_sn; - next_sn.bundle.cq_id = m_cq_id; - - *p_cq_poll_sn = m_n_global_sn = next_sn.global_sn; - - return ret; -} - void cq_mgr::process_cq_element_log_helper(mem_buf_desc_t *p_mem_buf_desc, xlio_ibv_wc *p_wce) { BULLSEYE_EXCLUDE_BLOCK_START @@ -480,74 +483,33 @@ mem_buf_desc_t *cq_mgr::cqe_log_and_get_buf_tx(xlio_ibv_wc *p_wce) return p_mem_buf_desc; } -mem_buf_desc_t *cq_mgr::cqe_process_rx(xlio_ibv_wc *p_wce) +mem_buf_desc_t *cq_mgr::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, + enum buff_status_e status) { - // Assume locked!!! + /* Assume locked!!! */ cq_logfuncall(""); - // Get related mem_buf_desc pointer from the wr_id - mem_buf_desc_t *p_mem_buf_desc = (mem_buf_desc_t *)(uintptr_t)p_wce->wr_id; - - bool bad_wce = p_wce->status != IBV_WC_SUCCESS; - - if (unlikely(bad_wce || p_mem_buf_desc == NULL)) { - if (p_mem_buf_desc == NULL) { - m_p_next_rx_desc_poll = NULL; - cq_logdbg("wce->wr_id = 0!!! When status == IBV_WC_SUCCESS"); - return NULL; - } - - process_cq_element_log_helper(p_mem_buf_desc, p_wce); + /* we use context to verify that on reclaim rx buffer path we return the buffer to the right CQ + */ + p_mem_buf_desc->rx.is_xlio_thr = false; + p_mem_buf_desc->rx.context = NULL; + if (unlikely(status != BS_OK)) { m_p_next_rx_desc_poll = NULL; - - if (p_mem_buf_desc == NULL) { - cq_logdbg("wce->wr_id = 0!!! When status != IBV_WC_SUCCESS"); - return NULL; - } - if (p_mem_buf_desc->p_desc_owner) { - reclaim_recv_buffer_helper(p_mem_buf_desc); - return NULL; - } - // AlexR: can this wce have a valid mem_buf_desc pointer? - // AlexR: are we throwing away a data buffer and a mem_buf_desc element? 
- cq_logdbg("no desc_owner(wr_id=%lu, qp_num=%x)", p_wce->wr_id, p_wce->qp_num); + reclaim_recv_buffer_helper(p_mem_buf_desc); return NULL; } if (m_n_sysvar_rx_prefetch_bytes_before_poll) { - /*for debug: - if (m_p_next_rx_desc_poll && m_p_next_rx_desc_poll != p_mem_buf_desc) { - cq_logerr("prefetched wrong buffer"); - }*/ m_p_next_rx_desc_poll = p_mem_buf_desc->p_prev_desc; p_mem_buf_desc->p_prev_desc = NULL; } - p_mem_buf_desc->rx.is_sw_csum_need = !(m_b_is_rx_hw_csum_on && xlio_wc_rx_hw_csum_ok(*p_wce)); - - if (likely(xlio_wc_opcode(*p_wce) & XLIO_IBV_WC_RECV)) { - // Save recevied total bytes - p_mem_buf_desc->sz_data = p_wce->byte_len; + VALGRIND_MAKE_MEM_DEFINED(p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_data); - // we use context to verify that on reclaim rx buffer path we return the buffer to the right - // CQ - p_mem_buf_desc->rx.is_xlio_thr = false; - p_mem_buf_desc->rx.context = this; - - // this is not a deadcode if timestamping is defined in verbs API - // coverity[dead_error_condition] - if (xlio_wc_flags(*p_wce) & XLIO_IBV_WC_WITH_TIMESTAMP) { - p_mem_buf_desc->rx.timestamps.hw_raw = xlio_wc_timestamp(*p_wce); - } - - VALGRIND_MAKE_MEM_DEFINED(p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_data); - - prefetch_range((uint8_t *)p_mem_buf_desc->p_buffer + m_sz_transport_header, - std::min(p_mem_buf_desc->sz_data - m_sz_transport_header, - (size_t)m_n_sysvar_rx_prefetch_bytes)); - // prefetch((uint8_t*)p_mem_buf_desc->p_buffer + m_sz_transport_header); - } + prefetch_range((uint8_t *)p_mem_buf_desc->p_buffer + m_sz_transport_header, + std::min(p_mem_buf_desc->sz_data - m_sz_transport_header, + (size_t)m_n_sysvar_rx_prefetch_bytes)); return p_mem_buf_desc; } @@ -643,53 +605,6 @@ void cq_mgr::mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, cq_mgr::reclaim_recv_buffer_helper(p_mem_buf_desc); } -int cq_mgr::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) -{ - // Assume locked!!! - cq_logfuncall(""); - - /* coverity[stack_use_local_overflow] */ - xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - - int ret; - uint32_t ret_rx_processed = process_recv_queue(pv_fd_ready_array); - if (unlikely(ret_rx_processed >= m_n_sysvar_cq_poll_batch_max)) { - m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); - return ret_rx_processed; - } - - if (m_p_next_rx_desc_poll) { - prefetch_range((uint8_t *)m_p_next_rx_desc_poll->p_buffer, - m_n_sysvar_rx_prefetch_bytes_before_poll); - } - - ret = poll(wce, m_n_sysvar_cq_poll_batch_max, p_cq_poll_sn); - if (ret > 0) { - m_n_wce_counter += ret; - if (ret < (int)m_n_sysvar_cq_poll_batch_max) { - m_b_was_drained = true; - } - - for (int i = 0; i < ret; i++) { - mem_buf_desc_t *buff = cqe_process_rx((&wce[i])); - if (buff) { - if (xlio_wc_opcode(wce[i]) & XLIO_IBV_WC_RECV) { - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff)) { - process_recv_buffer(buff, pv_fd_ready_array); - } - } - } - } - ret_rx_processed += ret; - m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); - } else { - compensate_qp_poll_failed(); - } - - return ret_rx_processed; -} - bool cq_mgr::reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst) { if (m_rx_buffs_rdy_for_free_head) { @@ -747,95 +662,6 @@ bool cq_mgr::reclaim_recv_buffers(descq_t *rx_reuse) return true; } -// -// @OUT: p_recycle_buffers_last_wr_id Returns the final WR_ID handled. When set, this indicates -// this is a CQE drain flow. 
-// @OUT: returns total number of processes CQE's -// - -int cq_mgr::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*=NULL*/) -{ - cq_logfuncall("cq was %s drained. %d processed wce since last check. %d strides in m_rx_queue", - (m_b_was_drained ? "" : "not "), m_n_wce_counter, m_rx_queue.size()); - - // CQ polling loop until max wce limit is reached for this interval or CQ is drained - uint32_t ret_total = 0; - uint64_t cq_poll_sn = 0; - - /* drain_and_proccess() is mainly called in following cases as - * Internal thread: - * Frequency of real polling can be controlled by - * XLIO_PROGRESS_ENGINE_INTERVAL and XLIO_PROGRESS_ENGINE_WCE_MAX. - * socketxtreme: - * User does socketxtreme_poll() - * Cleanup: - * QP down logic to release rx buffers should force polling to do this. - * Not null argument indicates one. - */ - while (((m_n_sysvar_progress_engine_wce_max > m_n_wce_counter) && (!m_b_was_drained)) || - (p_recycle_buffers_last_wr_id)) { - - /* coverity[stack_use_local_overflow] */ - xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - int ret = poll(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn); - if (ret <= 0) { - m_b_was_drained = true; - m_p_ring->m_gro_mgr.flush_all(NULL); - return ret_total; - } - - m_n_wce_counter += ret; - if (ret < MCE_MAX_CQ_POLL_BATCH) { - m_b_was_drained = true; - } - - for (int i = 0; i < ret; i++) { - mem_buf_desc_t *buff = cqe_process_rx(&wce[i]); - if (buff) { - if (p_recycle_buffers_last_wr_id) { - m_p_cq_stat->n_rx_pkt_drop++; - reclaim_recv_buffer_helper(buff); - } else { - bool procces_now = false; - if (m_transport_type == XLIO_TRANSPORT_ETH) { - procces_now = is_eth_tcp_frame(buff); - } - // We process immediately all non udp/ip traffic.. - if (procces_now) { - buff->rx.is_xlio_thr = true; - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff)) { - process_recv_buffer(buff, NULL); - } - } else { // udp/ip traffic we just put in the cq's rx queue - m_rx_queue.push_back(buff); - mem_buf_desc_t *buff_cur = m_rx_queue.get_and_pop_front(); - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || - !compensate_qp_poll_success(buff_cur)) { - m_rx_queue.push_front(buff_cur); - } - } - } - } - if (p_recycle_buffers_last_wr_id) { - *p_recycle_buffers_last_wr_id = (uintptr_t)wce[i].wr_id; - } - } - ret_total += ret; - } - m_p_ring->m_gro_mgr.flush_all(NULL); - - m_n_wce_counter = 0; - m_b_was_drained = false; - - // Update cq statistics - m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); - m_p_cq_stat->n_rx_drained_at_once_max = - std::max(ret_total, m_p_cq_stat->n_rx_drained_at_once_max); - - return ret_total; -} - // 1 -> busy // 0 -> ok // -1 -> error diff --git a/src/core/dev/cq_mgr.h b/src/core/dev/cq_mgr.h index 24f21ed5d..4d87915d2 100644 --- a/src/core/dev/cq_mgr.h +++ b/src/core/dev/cq_mgr.h @@ -63,6 +63,12 @@ class ring_simple; #define LOCAL_IF_INFO_INVALID \ (local_if_info_t) { 0, 0 } +/* Get CQE opcode. */ +#define MLX5_CQE_OPCODE(op_own) ((op_own) >> 4) + +/* Get CQE owner bit. 
*/ +#define MLX5_CQE_OWNER(op_own) ((op_own)&MLX5_CQE_OWNER_MASK) + struct cq_request_info_t { struct ibv_device *p_ibv_device; struct ibv_context *p_ibv_context; @@ -96,6 +102,14 @@ class cq_mgr { friend class rfs_uc_tcp_gro; // need for stats public: + enum buff_status_e { + BS_OK, + BS_CQE_RESP_WR_IMM_NOT_SUPPORTED, + BS_IBV_WC_WR_FLUSH_ERR, + BS_CQE_INVALID, + BS_GENERAL_ERR + }; + cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool config = true); virtual ~cq_mgr(); @@ -136,7 +150,7 @@ class cq_mgr { * @return >=0 number of wce processed * < 0 error */ - virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL); + virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL) = 0; int poll_and_process_element_tx(uint64_t *p_cq_poll_sn); virtual mem_buf_desc_t *poll_and_process_socketxtreme() { return nullptr; }; @@ -146,7 +160,7 @@ class cq_mgr { * @return >=0 number of wce processed * < 0 error */ - virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL); + virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) = 0; // CQ implements the Rx mem_buf_desc_owner. // These callbacks will be called for each Rx buffer that passed processed completion @@ -160,7 +174,7 @@ class cq_mgr { void add_qp_tx(qp_mgr *qp); virtual void del_qp_tx(qp_mgr *qp); - virtual uint32_t clean_cq(); + virtual uint32_t clean_cq() = 0; bool reclaim_recv_buffers(descq_t *rx_reuse); bool reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst); @@ -183,12 +197,17 @@ class cq_mgr { * @p_cq_poll_sn global unique wce id that maps last wce polled * @return Number of successfully polled wce */ - int poll(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn); void compensate_qp_poll_failed(); + void set_qp_rq(qp_mgr *qp); + void lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc); inline void process_recv_buffer(mem_buf_desc_t *buff, void *pv_fd_ready_array = NULL); inline void update_global_sn(uint64_t &cq_poll_sn, uint32_t rettotal); + inline struct xlio_mlx5_cqe *check_cqe(void); + + mem_buf_desc_t *cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status); + /* Process a WCE... meaning... 
* - extract the mem_buf_desc from the wce.wr_id and then loop on all linked mem_buf_desc * and deliver them to their owner for further processing (sockinfo on Tx path and ib_conn_mgr @@ -197,7 +216,6 @@ class cq_mgr { * are returned */ mem_buf_desc_t *cqe_log_and_get_buf_tx(xlio_ibv_wc *p_wce); - mem_buf_desc_t *cqe_process_rx(xlio_ibv_wc *p_wce); virtual void reclaim_recv_buffer_helper(mem_buf_desc_t *buff); // Returns true if the given buffer was used, @@ -212,6 +230,7 @@ class cq_mgr { xlio_ib_mlx5_cq_t m_mlx5_cq; qp_mgr_eth_mlx5 *m_qp; + mem_buf_desc_t *m_rx_hot_buffer; struct ibv_cq *m_p_ibv_cq; bool m_b_is_rx; descq_t m_rx_queue; @@ -281,8 +300,6 @@ class cq_mgr { // Since we have a single TX CQ comp channel for all cq_mgr's, it might not be the active_cq object cq_mgr *get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel); -#if defined(DEFINED_DIRECT_VERBS) - inline void cq_mgr::update_global_sn(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) { if (num_polled_cqes > 0) { @@ -336,6 +353,23 @@ inline struct xlio_mlx5_cqe *cq_mgr::get_cqe_tx(uint32_t &num_polled_cqes) return cqe_ret; } -#endif /* DEFINED_DIRECT_VERBS */ +inline struct xlio_mlx5_cqe *cq_mgr::check_cqe(void) +{ + struct xlio_mlx5_cqe *cqe = + (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) + << m_mlx5_cq.cqe_size_log)); + /* + * CQE ownership is defined by Owner bit in the CQE. + * The value indicating SW ownership is flipped every + * time CQ wraps around. + * */ + if (likely((MLX5_CQE_OPCODE(cqe->op_own)) != MLX5_CQE_INVALID) && + !((MLX5_CQE_OWNER(cqe->op_own)) ^ !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count))) { + return cqe; + } + + return NULL; +} #endif // CQ_MGR_H diff --git a/src/core/dev/cq_mgr_regrq.cpp b/src/core/dev/cq_mgr_regrq.cpp index dca001e8a..d18b20577 100644 --- a/src/core/dev/cq_mgr_regrq.cpp +++ b/src/core/dev/cq_mgr_regrq.cpp @@ -55,7 +55,6 @@ cq_mgr_regrq::cq_mgr_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool call_configure) : cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, is_rx, call_configure) - , m_rx_hot_buffer(NULL) { cq_logfunc(""); } @@ -88,7 +87,7 @@ uint32_t cq_mgr_regrq::clean_cq() int ret = 0; /* coverity[stack_use_local_overflow] */ xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - while ((ret = cq_mgr::poll(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn)) > 0) { + while ((ret = poll_tx(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn)) > 0) { for (int i = 0; i < ret; i++) { buff = cqe_log_and_get_buf_tx(&wce[i]); if (buff) { @@ -102,10 +101,53 @@ uint32_t cq_mgr_regrq::clean_cq() return ret_total; } +int cq_mgr_regrq::poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn) +{ + // Assume locked!!! 
+ cq_logfuncall(""); + + int ret = xlio_ibv_poll_cq(m_p_ibv_cq, num_entries, p_wce); + if (ret <= 0) { + // Zero polled wce OR ibv_poll_cq() has driver specific errors + // so we can't really do anything with them + *p_cq_poll_sn = m_n_global_sn; + return 0; + } + + if (unlikely(g_vlogger_level >= VLOG_FUNC_ALL)) { + for (int i = 0; i < ret; i++) { + cq_logfuncall("wce[%d] info wr_id=%x, status=%x, opcode=%x, vendor_err=%x, " + "byte_len=%d, imm_data=%x", + i, p_wce[i].wr_id, p_wce[i].status, xlio_wc_opcode(p_wce[i]), + p_wce[i].vendor_err, p_wce[i].byte_len, p_wce[i].imm_data); + cq_logfuncall("qp_num=%x, src_qp=%x, wc_flags=%x, pkey_index=%x, slid=%x, sl=%x, " + "dlid_path_bits=%x", + p_wce[i].qp_num, p_wce[i].src_qp, xlio_wc_flags(p_wce[i]), + p_wce[i].pkey_index, p_wce[i].slid, p_wce[i].sl, p_wce[i].dlid_path_bits); + } + } + + // spoil the global sn if we have packets ready + union __attribute__((packed)) { + uint64_t global_sn; + struct { + uint32_t cq_id; + uint32_t cq_sn; + } bundle; + } next_sn; + next_sn.bundle.cq_sn = ++m_n_cq_poll_sn; + next_sn.bundle.cq_id = m_cq_id; + + *p_cq_poll_sn = m_n_global_sn = next_sn.global_sn; + + return ret; +} + + cq_mgr_regrq::~cq_mgr_regrq() { cq_logfunc(""); - cq_logdbg("destroying CQ as %s", (m_b_is_rx ? "Rx" : "Tx")); + cq_logdbg("destroying CQ REGRQ as %s", (m_b_is_rx ? "Rx" : "Tx")); } mem_buf_desc_t *cq_mgr_regrq::poll(enum buff_status_e &status) @@ -344,37 +386,6 @@ int cq_mgr_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*= return ret_total; } -mem_buf_desc_t *cq_mgr_regrq::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, - enum buff_status_e status) -{ - /* Assume locked!!! */ - cq_logfuncall(""); - - /* we use context to verify that on reclaim rx buffer path we return the buffer to the right CQ - */ - p_mem_buf_desc->rx.is_xlio_thr = false; - p_mem_buf_desc->rx.context = NULL; - - if (unlikely(status != BS_OK)) { - m_p_next_rx_desc_poll = NULL; - reclaim_recv_buffer_helper(p_mem_buf_desc); - return NULL; - } - - if (m_n_sysvar_rx_prefetch_bytes_before_poll) { - m_p_next_rx_desc_poll = p_mem_buf_desc->p_prev_desc; - p_mem_buf_desc->p_prev_desc = NULL; - } - - VALGRIND_MAKE_MEM_DEFINED(p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_data); - - prefetch_range((uint8_t *)p_mem_buf_desc->p_buffer + m_sz_transport_header, - std::min(p_mem_buf_desc->sz_data - m_sz_transport_header, - (size_t)m_n_sysvar_rx_prefetch_bytes)); - - return p_mem_buf_desc; -} - mem_buf_desc_t *cq_mgr_regrq::poll_and_process_socketxtreme() { buff_status_e status = BS_OK; @@ -448,21 +459,6 @@ int cq_mgr_regrq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_f return ret_rx_processed; } -void cq_mgr_regrq::set_qp_rq(qp_mgr *qp) -{ - m_qp = static_cast(qp); - - m_qp->m_rq_wqe_counter = 0; // In case of bonded qp, wqe_counter must be reset to zero - m_rx_hot_buffer = NULL; - - if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { - cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); - } - VALGRIND_MAKE_MEM_DEFINED(&m_mlx5_cq, sizeof(m_mlx5_cq)); - cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, - m_mlx5_cq.cq_buf); -} - void cq_mgr_regrq::add_qp_rx(qp_mgr *qp) { cq_logfunc(""); @@ -470,58 +466,4 @@ void cq_mgr_regrq::add_qp_rx(qp_mgr *qp) cq_mgr::add_qp_rx(qp); } -void cq_mgr_regrq::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc) -{ - struct ethhdr *p_eth_h = (struct ethhdr *)(p_rx_wc_buf_desc->p_buffer); - struct tcphdr *p_tcp_h; - size_t transport_header_len = 
ETH_HDR_LEN; - - if (p_eth_h->h_proto == htons(ETH_P_8021Q)) { - transport_header_len = ETH_VLAN_HDR_LEN; - } - - if (0x02 == ((cqe->l4_hdr_type_etc >> 2) & 0x3)) { - // CQE indicates IPv4 in the l3_hdr_type field - struct iphdr *p_ip_h = (struct iphdr *)(p_rx_wc_buf_desc->p_buffer + transport_header_len); - - assert(p_ip_h->version == IPV4_VERSION); - assert(p_ip_h->protocol == IPPROTO_TCP); - - p_ip_h->ttl = cqe->lro_min_ttl; - p_ip_h->tot_len = htons(ntohl(cqe->byte_cnt) - transport_header_len); - p_ip_h->check = 0; // Ignore. - - p_tcp_h = (struct tcphdr *)((uint8_t *)p_ip_h + (int)(p_ip_h->ihl) * 4); - } else { - // Assume LRO can happen for either IPv4 or IPv6 L3 protocol. Skip checking l3_hdr_type. - struct ip6_hdr *p_ip6_h = - (struct ip6_hdr *)(p_rx_wc_buf_desc->p_buffer + transport_header_len); - - assert(0x01 == ((cqe->l4_hdr_type_etc >> 2) & 0x3)); // IPv6 L3 header. - assert(ip_header_version(p_ip6_h) == IPV6); - assert(p_ip6_h->ip6_nxt == IPPROTO_TCP); - assert(ntohl(cqe->byte_cnt) >= transport_header_len + IPV6_HLEN); - - p_ip6_h->ip6_hlim = cqe->lro_min_ttl; - // Payload length doesn't include main header. - p_ip6_h->ip6_plen = htons(ntohl(cqe->byte_cnt) - transport_header_len - IPV6_HLEN); - - // LRO doesn't create a session for packets with extension headers, so IPv6 header is 40b. - p_tcp_h = (struct tcphdr *)((uint8_t *)p_ip6_h + IPV6_HLEN); - } - - p_tcp_h->psh = !!(cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_TCP_PUSH_MASK); - - /* TCP packet flag is set, and packet carries no data or - * TCP packet flag is set, and packet carries data - */ - if ((0x03 == ((cqe->l4_hdr_type_etc >> 4) & 0x7)) || - (0x04 == ((cqe->l4_hdr_type_etc >> 4) & 0x7))) { - p_tcp_h->ack = 1; - p_tcp_h->ack_seq = cqe->lro_ack_seq_num; - p_tcp_h->window = cqe->lro_tcp_win; - p_tcp_h->check = 0; // Ignore. - } -} - #endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/core/dev/cq_mgr_regrq.h b/src/core/dev/cq_mgr_regrq.h index 047209416..342be2102 100644 --- a/src/core/dev/cq_mgr_regrq.h +++ b/src/core/dev/cq_mgr_regrq.h @@ -35,71 +35,29 @@ #include "cq_mgr.h" -#if defined(DEFINED_DIRECT_VERBS) - -/* Get CQE opcode. */ -#define MLX5_CQE_OPCODE(op_own) ((op_own) >> 4) - -/* Get CQE owner bit. 
*/ -#define MLX5_CQE_OWNER(op_own) ((op_own)&MLX5_CQE_OWNER_MASK) - class cq_mgr_regrq : public cq_mgr { public: - enum buff_status_e { - BS_OK, - BS_CQE_RESP_WR_IMM_NOT_SUPPORTED, - BS_IBV_WC_WR_FLUSH_ERR, - BS_CQE_INVALID, - BS_GENERAL_ERR - }; - cq_mgr_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool call_configure = true); virtual ~cq_mgr_regrq(); - virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL); + virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) override; virtual mem_buf_desc_t *poll_and_process_socketxtreme(); - virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL); + virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL) override; - mem_buf_desc_t *cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status); // MOVE virtual void add_qp_rx(qp_mgr *qp); - void set_qp_rq(qp_mgr *qp); // MOVE - virtual uint32_t clean_cq(); + virtual uint32_t clean_cq() override; protected: - mem_buf_desc_t *m_rx_hot_buffer; // MOVE - - inline struct xlio_mlx5_cqe *check_cqe(void); // MOVE mem_buf_desc_t *poll(enum buff_status_e &status); - + int poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn); inline void cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc, enum buff_status_e &status); - void lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc); // MOVE private: int drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e status, uintptr_t *p_recycle_buffers_last_wr_id); }; -inline struct xlio_mlx5_cqe *cq_mgr_regrq::check_cqe(void) -{ - struct xlio_mlx5_cqe *cqe = - (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + - ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) - << m_mlx5_cq.cqe_size_log)); - /* - * CQE ownership is defined by Owner bit in the CQE. - * The value indicating SW ownership is flipped every - * time CQ wraps around. 
- * */ - if (likely((MLX5_CQE_OPCODE(cqe->op_own)) != MLX5_CQE_INVALID) && - !((MLX5_CQE_OWNER(cqe->op_own)) ^ !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count))) { - return cqe; - } - - return NULL; -} - -#endif /* DEFINED_DIRECT_VERBS */ #endif // CQ_MGR_MLX5_H diff --git a/src/core/dev/cq_mgr_strq.cpp b/src/core/dev/cq_mgr_strq.cpp index e0b797140..653d10f59 100644 --- a/src/core/dev/cq_mgr_strq.cpp +++ b/src/core/dev/cq_mgr_strq.cpp @@ -60,7 +60,7 @@ cq_mgr_strq::cq_mgr_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t strides_num, struct ibv_comp_channel *p_comp_event_channel, bool call_configure) - : cq_mgr_regrq(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, true, call_configure) + : cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, true, call_configure) , _owner_ring(p_ring) , _stride_size_bytes(stride_size_bytes) , _strides_num(strides_num) diff --git a/src/core/dev/cq_mgr_strq.h b/src/core/dev/cq_mgr_strq.h index 9a374e527..061db0cbe 100644 --- a/src/core/dev/cq_mgr_strq.h +++ b/src/core/dev/cq_mgr_strq.h @@ -35,9 +35,9 @@ #include #include -#include "cq_mgr_regrq.h" +#include "cq_mgr.h" -class cq_mgr_strq : public cq_mgr_regrq { +class cq_mgr_strq : public cq_mgr { public: cq_mgr_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, uint32_t stride_size_bytes, uint32_t strides_num, @@ -56,7 +56,7 @@ class cq_mgr_strq : public cq_mgr_regrq { virtual void statistics_print() override; virtual void reclaim_recv_buffer_helper(mem_buf_desc_t *buff) override; - inline mem_buf_desc_t *poll(enum buff_status_e &status, mem_buf_desc_t *&buff_stride); + mem_buf_desc_t *poll(enum buff_status_e &status, mem_buf_desc_t *&buff_stride); private: mem_buf_desc_t *next_stride(); diff --git a/src/core/dev/qp_mgr.cpp b/src/core/dev/qp_mgr.cpp index 21d5d91ca..a79bbfa7f 100644 --- a/src/core/dev/qp_mgr.cpp +++ b/src/core/dev/qp_mgr.cpp @@ -132,46 +132,6 @@ qp_mgr::~qp_mgr() qp_logdbg("delete done"); } -cq_mgr *qp_mgr::handle_cq_initialization(uint32_t *num_wr, - struct ibv_comp_channel *comp_event_channel, bool is_rx) -{ - qp_logfunc(""); - cq_mgr *cq = NULL; - - try { - cq = new cq_mgr(m_p_ring, m_p_ib_ctx_handler, *num_wr, comp_event_channel, is_rx); - } catch (xlio_exception &e) { - // This is a workaround for an issue with cq creation of mlx4 devices on - // upstream-driver VMs over Windows Hypervisor. - if (safe_mce_sys().hypervisor == mce_sys_var::HYPER_MSHV && m_p_ib_ctx_handler->is_mlx4() && - *num_wr > MAX_UPSTREAM_CQ_MSHV_SIZE) { - qp_logdbg("cq creation failed with cq_size of %d. 
retrying with size of %d", *num_wr, - MAX_UPSTREAM_CQ_MSHV_SIZE); - *num_wr = MAX_UPSTREAM_CQ_MSHV_SIZE; - try { - cq = new cq_mgr(m_p_ring, m_p_ib_ctx_handler, *num_wr, comp_event_channel, is_rx); - } catch (xlio_exception &) { - } - } - - if (!cq) { - qp_logerr("%s", e.message); - } - } - - return cq; -} - -cq_mgr *qp_mgr::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) -{ - return handle_cq_initialization(&m_rx_num_wr, p_rx_comp_event_channel, true); -} - -cq_mgr *qp_mgr::init_tx_cq_mgr() -{ - return handle_cq_initialization(&m_tx_num_wr, m_p_ring->get_tx_comp_event_channel(), false); -} - int qp_mgr::configure(struct qp_mgr_desc *desc) { qp_logdbg("Creating QP of transport type '%s' on ibv device '%s' [%p] on port %d", diff --git a/src/core/dev/qp_mgr.h b/src/core/dev/qp_mgr.h index 4cf883ef4..c301829e7 100644 --- a/src/core/dev/qp_mgr.h +++ b/src/core/dev/qp_mgr.h @@ -375,11 +375,8 @@ class qp_mgr { return m_n_unsignaled_count == m_n_sysvar_tx_num_wr_to_signal - 1; } - virtual cq_mgr *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel); - virtual cq_mgr *init_tx_cq_mgr(void); - - cq_mgr *handle_cq_initialization(uint32_t *num_wr, struct ibv_comp_channel *comp_event_channel, - bool is_rx); + virtual cq_mgr *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) = 0; + virtual cq_mgr *init_tx_cq_mgr(void) = 0; virtual int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, xlio_tis *tis, unsigned credits); From 52d0cf5ea35cc7e1303d8e638cafe9e5e6d799a2 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 27 Aug 2023 18:53:30 +0300 Subject: [PATCH 008/169] issue: 3514044 Moving Tx from cq_mgr to cq_mgr_tx Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 2 + src/core/dev/cq_mgr.cpp | 359 ++------------------------ src/core/dev/cq_mgr.h | 159 +++--------- src/core/dev/cq_mgr_regrq.cpp | 109 ++------ src/core/dev/cq_mgr_regrq.h | 4 +- src/core/dev/cq_mgr_strq.cpp | 16 +- src/core/dev/cq_mgr_strq.h | 2 +- src/core/dev/cq_mgr_tx.cpp | 347 +++++++++++++++++++++++++ src/core/dev/cq_mgr_tx.h | 156 +++++++++++ src/core/dev/ib_ctx_handler.cpp | 6 - src/core/dev/ib_ctx_handler.h | 1 - src/core/dev/qp_mgr.h | 9 +- src/core/dev/qp_mgr_eth_mlx5.cpp | 8 +- src/core/dev/qp_mgr_eth_mlx5.h | 3 +- src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp | 2 +- src/core/dev/ring_bond.cpp | 2 +- src/core/dev/ring_simple.cpp | 8 +- src/core/dev/ring_simple.h | 2 +- src/core/ib/base/verbs_extra.h | 4 - 19 files changed, 608 insertions(+), 591 deletions(-) create mode 100644 src/core/dev/cq_mgr_tx.cpp create mode 100644 src/core/dev/cq_mgr_tx.h diff --git a/src/core/Makefile.am b/src/core/Makefile.am index a30ff3a71..873646cb5 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -64,6 +64,7 @@ libxlio_la_SOURCES := \ dev/cq_mgr.cpp \ dev/cq_mgr_regrq.cpp \ dev/cq_mgr_strq.cpp \ + dev/cq_mgr_tx.cpp \ dev/dm_mgr.cpp \ dev/qp_mgr.cpp \ dev/qp_mgr_eth_mlx5.cpp \ @@ -173,6 +174,7 @@ libxlio_la_SOURCES := \ dev/cq_mgr.h \ dev/cq_mgr_regrq.h \ dev/cq_mgr_strq.h \ + dev/cq_mgr_tx.h \ dev/dm_mgr.h \ dev/gro_mgr.h \ dev/ib_ctx_handler_collection.h \ diff --git a/src/core/dev/cq_mgr.cpp b/src/core/dev/cq_mgr.cpp index 4430b3c50..6137a4261 100644 --- a/src/core/dev/cq_mgr.cpp +++ b/src/core/dev/cq_mgr.cpp @@ -50,7 +50,7 @@ #include "ring_simple.h" #include "qp_mgr_eth_mlx5.h" -#define MODULE_NAME "cqm" +#define MODULE_NAME "cq_mgr" #define cq_logpanic __log_info_panic #define cq_logerr __log_info_err @@ -67,41 +67,24 @@ ##log_args); \ } 
while (0) -atomic_t cq_mgr::m_n_cq_id_counter = ATOMIC_INIT(1); +atomic_t cq_mgr::m_n_cq_id_counter_rx = ATOMIC_INIT(1); -uint64_t cq_mgr::m_n_global_sn = 0; +uint64_t cq_mgr::m_n_global_sn_rx = 0; cq_mgr::cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, - struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool config) - : m_qp(NULL) - , m_rx_hot_buffer(NULL) - , m_p_ibv_cq(NULL) - , m_b_is_rx(is_rx) - , m_cq_id(0) - , m_n_cq_poll_sn(0) - , m_p_ring(p_ring) - , m_n_wce_counter(0) - , m_b_was_drained(false) - , m_b_is_rx_hw_csum_on(false) + struct ibv_comp_channel *p_comp_event_channel) + : m_p_ring(p_ring) , m_n_sysvar_cq_poll_batch_max(safe_mce_sys().cq_poll_batch_max) , m_n_sysvar_progress_engine_wce_max(safe_mce_sys().progress_engine_wce_max) - , m_p_cq_stat(&m_cq_stat_static) // use local copy of stats by default (on rx cq get shared - // memory stats) - , m_transport_type(m_p_ring->get_transport_type()) - , m_p_next_rx_desc_poll(NULL) + , m_p_cq_stat(&m_cq_stat_static) // use local copy of stats by default , m_n_sysvar_rx_prefetch_bytes_before_poll(safe_mce_sys().rx_prefetch_bytes_before_poll) , m_n_sysvar_rx_prefetch_bytes(safe_mce_sys().rx_prefetch_bytes) - , m_sz_transport_header(0) , m_p_ib_ctx_handler(p_ib_ctx_handler) , m_n_sysvar_rx_num_wr_to_post_recv(safe_mce_sys().rx_num_wr_to_post_recv) - , m_rx_buffs_rdy_for_free_head(NULL) - , m_rx_buffs_rdy_for_free_tail(NULL) , m_comp_event_channel(p_comp_event_channel) - , m_b_notification_armed(false) , m_n_sysvar_qp_compensation_level(safe_mce_sys().qp_compensation_level) , m_rx_lkey(g_buffer_pool_rx_rwqe->find_lkey_by_ib_ctx_thread_safe(m_p_ib_ctx_handler)) , m_b_sysvar_cq_keep_qp_full(safe_mce_sys().cq_keep_qp_full) - , m_n_out_of_free_bufs_warning(0) { BULLSEYE_EXCLUDE_BLOCK_START if (m_rx_lkey == 0) { @@ -113,10 +96,8 @@ cq_mgr::cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_siz memset(&m_qp_rec, 0, sizeof(m_qp_rec)); m_rx_queue.set_id("cq_mgr (%p) : m_rx_queue", this); m_rx_pool.set_id("cq_mgr (%p) : m_rx_pool", this); - m_cq_id = atomic_fetch_and_inc(&m_n_cq_id_counter); // cq id is nonzero - if (config) { - configure(cq_size); - } + m_cq_id_rx = atomic_fetch_and_inc(&m_n_cq_id_counter_rx); // cq id is nonzero + configure(cq_size); memset(&m_mlx5_cq, 0, sizeof(m_mlx5_cq)); } @@ -126,8 +107,6 @@ void cq_mgr::configure(int cq_size) xlio_ibv_cq_init_attr attr; memset(&attr, 0, sizeof(attr)); - prep_ibv_cq(attr); - struct ibv_context *context = m_p_ib_ctx_handler->get_ibv_context(); int comp_vector = 0; #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) @@ -147,42 +126,21 @@ void cq_mgr::configure(int cq_size) } BULLSEYE_EXCLUDE_BLOCK_END VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_cq, sizeof(ibv_cq)); - switch (m_transport_type) { - case XLIO_TRANSPORT_ETH: - m_sz_transport_header = ETH_HDR_LEN; - break; - BULLSEYE_EXCLUDE_BLOCK_START - default: - cq_logpanic("Unknown transport type: %d", m_transport_type); - break; - BULLSEYE_EXCLUDE_BLOCK_END - } - - if (m_b_is_rx) { - xlio_stats_instance_create_cq_block(m_p_cq_stat); - } - - if (m_b_is_rx) { - m_b_is_rx_hw_csum_on = - xlio_is_rx_hw_csum_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); - cq_logdbg("RX CSUM support = %d", m_b_is_rx_hw_csum_on); - } - - cq_logdbg("Created CQ as %s with fd[%d] and of size %d elements (ibv_cq_hndl=%p)", - (m_b_is_rx ? 
"Rx" : "Tx"), get_channel_fd(), cq_size, m_p_ibv_cq); -} + + xlio_stats_instance_create_cq_block(m_p_cq_stat); + + m_b_is_rx_hw_csum_on = + xlio_is_rx_hw_csum_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); -void cq_mgr::prep_ibv_cq(xlio_ibv_cq_init_attr &attr) const -{ - if (m_p_ib_ctx_handler->get_ctx_time_converter_status()) { - xlio_ibv_cq_init_ts_attr(&attr); - } + cq_logdbg("RX CSUM support = %d", m_b_is_rx_hw_csum_on); + + cq_logdbg("Created CQ as Rx with fd[%d] and of size %d elements (ibv_cq_hndl=%p)", + get_channel_fd(), cq_size, m_p_ibv_cq); } cq_mgr::~cq_mgr() { - cq_logfunc(""); - cq_logdbg("destroying CQ as %s", (m_b_is_rx ? "Rx" : "Tx")); + cq_logdbg("Destroying Rx CQ"); if (m_rx_buffs_rdy_for_free_head) { reclaim_recv_buffers(m_rx_buffs_rdy_for_free_head); @@ -209,11 +167,9 @@ cq_mgr::~cq_mgr() VALGRIND_MAKE_MEM_UNDEFINED(m_p_ibv_cq, sizeof(ibv_cq)); statistics_print(); - if (m_b_is_rx) { - xlio_stats_instance_remove_cq_block(m_p_cq_stat); - } + xlio_stats_instance_remove_cq_block(m_p_cq_stat); - cq_logdbg("done"); + cq_logdbg("Destroying Rx CQ done"); } void cq_mgr::statistics_print() @@ -228,16 +184,6 @@ void cq_mgr::statistics_print() } } -ibv_cq *cq_mgr::get_ibv_cq_hndl() -{ - return m_p_ibv_cq; -} - -int cq_mgr::get_channel_fd() -{ - return m_comp_event_channel->fd; -} - void cq_mgr::set_qp_rq(qp_mgr *qp) { m_qp = static_cast(qp); @@ -316,36 +262,6 @@ void cq_mgr::del_qp_rx(qp_mgr *qp) memset(&m_qp_rec, 0, sizeof(m_qp_rec)); } -void cq_mgr::add_qp_tx(qp_mgr *qp) -{ - // Assume locked! - cq_logdbg("qp_mgr=%p", qp); - m_qp_rec.qp = qp; - m_qp_rec.debt = 0; - - m_qp = static_cast(qp); - - if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { - cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); - } - - cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, - m_mlx5_cq.cq_buf); -} - -void cq_mgr::del_qp_tx(qp_mgr *qp) -{ - BULLSEYE_EXCLUDE_BLOCK_START - if (m_qp_rec.qp != qp) { - cq_logdbg("wrong qp_mgr=%p != m_qp_rec.qp=%p", qp, m_qp_rec.qp); - return; - } - BULLSEYE_EXCLUDE_BLOCK_END - cq_logdbg("qp_mgr=%p", m_qp_rec.qp); - - memset(&m_qp_rec, 0, sizeof(m_qp_rec)); -} - void cq_mgr::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc) { struct ethhdr *p_eth_h = (struct ethhdr *)(p_rx_wc_buf_desc->p_buffer); @@ -430,59 +346,6 @@ void cq_mgr::return_extra_buffers() m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); } -void cq_mgr::process_cq_element_log_helper(mem_buf_desc_t *p_mem_buf_desc, xlio_ibv_wc *p_wce) -{ - BULLSEYE_EXCLUDE_BLOCK_START - // wce with bad status value - if (p_wce->status == IBV_WC_SUCCESS) { - cq_logdbg("wce: wr_id=%#lx, status=%#x, vendor_err=%#x, qp_num=%#x", p_wce->wr_id, - p_wce->status, p_wce->vendor_err, p_wce->qp_num); - if (m_b_is_rx_hw_csum_on && !xlio_wc_rx_hw_csum_ok(*p_wce)) { - cq_logdbg("wce: bad rx_csum"); - } - cq_logdbg("wce: opcode=%#x, byte_len=%u, src_qp=%#x, wc_flags=%#lx", xlio_wc_opcode(*p_wce), - p_wce->byte_len, p_wce->src_qp, (unsigned long)xlio_wc_flags(*p_wce)); - cq_logdbg("wce: pkey_index=%#x, slid=%#x, sl=%#x, dlid_path_bits=%#x, imm_data=%#x", - p_wce->pkey_index, p_wce->slid, p_wce->sl, p_wce->dlid_path_bits, - p_wce->imm_data); - if (p_mem_buf_desc) { - cq_logdbg("mem_buf_desc: lkey=%#x, p_buffer=%p, sz_buffer=%lu", p_mem_buf_desc->lkey, - p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_buffer); - } - } else if (p_wce->status != IBV_WC_WR_FLUSH_ERR) { - cq_logwarn("wce: wr_id=%#lx, status=%#x, vendor_err=%#x, qp_num=%#x", p_wce->wr_id, - 
p_wce->status, p_wce->vendor_err, p_wce->qp_num); - cq_loginfo("wce: opcode=%#x, byte_len=%u, src_qp=%#x, wc_flags=%#lx", - xlio_wc_opcode(*p_wce), p_wce->byte_len, p_wce->src_qp, - (unsigned long)xlio_wc_flags(*p_wce)); - cq_loginfo("wce: pkey_index=%#x, slid=%#x, sl=%#x, dlid_path_bits=%#x, imm_data=%#x", - p_wce->pkey_index, p_wce->slid, p_wce->sl, p_wce->dlid_path_bits, - p_wce->imm_data); - - m_p_cq_stat->n_rx_cqe_error++; - if (p_mem_buf_desc) { - cq_logwarn("mem_buf_desc: lkey=%#x, p_buffer=%p, sz_buffer=%lu", p_mem_buf_desc->lkey, - p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_buffer); - } - } - BULLSEYE_EXCLUDE_BLOCK_END - - cq_logfunc("wce error status '%s' [%d] (wr_id=%p, qp_num=%x)", - priv_ibv_wc_status_str(p_wce->status), p_wce->status, p_wce->wr_id, p_wce->qp_num); -} - -mem_buf_desc_t *cq_mgr::cqe_log_and_get_buf_tx(xlio_ibv_wc *p_wce) -{ - // Assume locked!!! - cq_logfuncall(""); - - mem_buf_desc_t *p_mem_buf_desc = (mem_buf_desc_t *)(uintptr_t)p_wce->wr_id; - if (unlikely(p_wce->status != IBV_WC_SUCCESS)) { - process_cq_element_log_helper(p_mem_buf_desc, p_wce); - } - return p_mem_buf_desc; -} - mem_buf_desc_t *cq_mgr::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status) { @@ -573,29 +436,6 @@ void cq_mgr::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) } } -void cq_mgr::process_tx_buffer_list(mem_buf_desc_t *p_mem_buf_desc) -{ - // Assume locked!!! - BULLSEYE_EXCLUDE_BLOCK_START - if (p_mem_buf_desc && - (p_mem_buf_desc->p_desc_owner == - m_p_ring /*|| m_p_ring->get_parent()->is_member(p_mem_buf_desc->p_desc_owner)*/)) { - m_p_ring->mem_buf_desc_return_to_owner_tx(p_mem_buf_desc); - /* if decided to free buffers of another ring here, need to modify return_to_owner to check - * owner and return to gpool. */ - } else if (p_mem_buf_desc && m_p_ring->get_parent()->is_member(p_mem_buf_desc->p_desc_owner)) { - cq_logerr("got buffer of wrong owner, high-availability event? buf=%p, owner=%p", - p_mem_buf_desc, p_mem_buf_desc ? p_mem_buf_desc->p_desc_owner : NULL); - /* if decided to free buffers here, remember its a list and need to deref members. */ - // p_mem_buf_desc->p_desc_owner->mem_buf_desc_return_to_owner_tx(p_mem_buf_desc); /* this - // can cause a deadlock between rings, use trylock? */ - } else { - cq_logerr("got buffer of wrong owner, buf=%p, owner=%p", p_mem_buf_desc, - p_mem_buf_desc ? p_mem_buf_desc->p_desc_owner : NULL); - } - BULLSEYE_EXCLUDE_BLOCK_END -} - // This method is called when ring release returns unposted buffers. 
void cq_mgr::mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, void *pv_fd_ready_array /*=NULL*/) @@ -662,46 +502,15 @@ bool cq_mgr::reclaim_recv_buffers(descq_t *rx_reuse) return true; } -// 1 -> busy -// 0 -> ok -// -1 -> error -int cq_mgr::ack_and_request_notification() -{ - int res, cq_ev_count = 0; - ibv_cq *ib_cq; - void *cq_context; - do { - res = ibv_get_cq_event(m_comp_event_channel, &ib_cq, &cq_context); - if (res == 0) { - ++cq_ev_count; - } - } while (res == 0); - if (errno != EAGAIN) { - return -1; - } - if (cq_ev_count > 0) { - get_cq_event(cq_ev_count); - ibv_ack_cq_events(m_p_ibv_cq, cq_ev_count); - return 1; - } - IF_VERBS_FAILURE(req_notify_cq()) - { - cq_logerr("Failure arming the qp_mgr notification channel (errno=%d %m)", errno); - return -1; - } - ENDIF_VERBS_FAILURE - return 0; -} - int cq_mgr::request_notification(uint64_t poll_sn) { int ret = -1; cq_logfuncall(""); - if ((m_n_global_sn > 0 && poll_sn != m_n_global_sn)) { + if ((m_n_global_sn_rx > 0 && poll_sn != m_n_global_sn_rx)) { // The cq_mgr's has receive packets pending processing (or got processed since cq_poll_sn) - cq_logfunc("miss matched poll sn (user=0x%lx, cq=0x%lx)", poll_sn, m_n_cq_poll_sn); + cq_logfunc("miss matched poll sn (user=0x%lx, cq=0x%lx)", poll_sn, m_n_cq_poll_sn_rx); return 1; } @@ -710,7 +519,7 @@ int cq_mgr::request_notification(uint64_t poll_sn) cq_logfunc("arming cq_mgr notification channel"); // Arm the CQ notification channel - IF_VERBS_FAILURE(req_notify_cq()) + IF_VERBS_FAILURE(xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0)) { cq_logerr("Failure arming the qp_mgr notification channel (errno=%d %m)", errno); } @@ -764,11 +573,7 @@ int cq_mgr::wait_for_notification_and_process_element(uint64_t *p_cq_poll_sn, m_b_notification_armed = false; // Now try processing the ready element - if (m_b_is_rx) { - ret = poll_and_process_element_rx(p_cq_poll_sn, pv_fd_ready_array); - } else { - ret = poll_and_process_element_tx(p_cq_poll_sn); - } + ret = poll_and_process_element_rx(p_cq_poll_sn, pv_fd_ready_array); } ENDIF_VERBS_FAILURE; } else { @@ -778,119 +583,3 @@ int cq_mgr::wait_for_notification_and_process_element(uint64_t *p_cq_poll_sn, return ret; } - -cq_mgr *get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel) -{ - cq_mgr *p_cq_mgr = NULL; - struct ibv_cq *p_cq_hndl = NULL; - void *p_context; // deal with compiler warnings - - // read & ack the CQ event - IF_VERBS_FAILURE(ibv_get_cq_event(p_cq_channel, &p_cq_hndl, &p_context)) - { - vlog_printf(VLOG_INFO, - MODULE_NAME ":%d: waiting on cq_mgr event returned with error (errno=%d %m)\n", - __LINE__, errno); - } - else - { - p_cq_mgr = (cq_mgr *)p_context; // Save the cq_mgr - p_cq_mgr->get_cq_event(); - ibv_ack_cq_events(p_cq_hndl, 1); // Ack the ibv event - } - ENDIF_VERBS_FAILURE; - - return p_cq_mgr; -} - -int cq_mgr::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) -{ - cq_logfuncall(""); - - static auto is_error_opcode = [&](uint8_t opcode) { - return opcode == MLX5_CQE_REQ_ERR || opcode == MLX5_CQE_RESP_ERR; - }; - - int ret = 0; - uint32_t num_polled_cqes = 0; - xlio_mlx5_cqe *cqe = get_cqe_tx(num_polled_cqes); - - if (likely(cqe)) { - unsigned index = ntohs(cqe->wqe_counter) & (m_qp->m_tx_num_wr - 1); - - // All error opcodes have the most significant bit set. 
- if (unlikely(cqe->op_own & 0x80) && is_error_opcode(cqe->op_own >> 4)) { - m_p_cq_stat->n_rx_cqe_error++; - log_cqe_error(cqe); - } - - handle_sq_wqe_prop(index); - ret = 1; - } - update_global_sn(*p_cq_poll_sn, num_polled_cqes); - - return ret; -} - -void cq_mgr::log_cqe_error(struct xlio_mlx5_cqe *cqe) -{ - struct mlx5_err_cqe *ecqe = (struct mlx5_err_cqe *)cqe; - - /* TODO We can also ask qp_mgr to log WQE fields from SQ. But at first, we need to remove - * prefetch and memset of the next WQE there. Credit system will guarantee that we don't - * reuse the WQE at this point. - */ - - if (MLX5_CQE_SYNDROME_WR_FLUSH_ERR != ecqe->syndrome) { - cq_logwarn("cqe: syndrome=0x%x vendor=0x%x hw=0x%x (type=0x%x) wqe_opcode_qpn=0x%x " - "wqe_counter=0x%x", - ecqe->syndrome, ecqe->vendor_err_synd, *((uint8_t *)&ecqe->rsvd1 + 16), - *((uint8_t *)&ecqe->rsvd1 + 17), ntohl(ecqe->s_wqe_opcode_qpn), - ntohs(ecqe->wqe_counter)); - } -} - -void cq_mgr::handle_sq_wqe_prop(unsigned index) -{ - sq_wqe_prop *p = &m_qp->m_sq_wqe_idx_to_prop[index]; - sq_wqe_prop *prev; - unsigned credits = 0; - - /* - * TX completions can be signalled for a set of WQEs as an optimization. - * Therefore, for every TX completion we may need to handle multiple - * WQEs. Since every WQE can have various size and the WQE index is - * wrapped around, we build a linked list to simplify things. Each - * element of the linked list represents properties of a previously - * posted WQE. - * - * We keep index of the last completed WQE and stop processing the list - * when we reach the index. This condition is checked in - * is_sq_wqe_prop_valid(). - */ - - do { - if (p->buf) { - m_p_ring->mem_buf_desc_return_single_locked(p->buf); - } - if (p->ti) { - xlio_ti *ti = p->ti; - if (ti->m_callback) { - ti->m_callback(ti->m_callback_arg); - } - - ti->put(); - if (unlikely(ti->m_released && ti->m_ref == 0)) { - m_qp->ti_released(ti); - } - } - credits += p->credits; - - prev = p; - p = p->next; - } while (p != NULL && m_qp->is_sq_wqe_prop_valid(p, prev)); - - m_p_ring->return_tx_pool_to_global_pool(); - m_qp->credits_return(credits); - m_qp->m_sq_wqe_prop_last_signalled = index; -} diff --git a/src/core/dev/cq_mgr.h b/src/core/dev/cq_mgr.h index 4d87915d2..d74cc0cb0 100644 --- a/src/core/dev/cq_mgr.h +++ b/src/core/dev/cq_mgr.h @@ -60,34 +60,12 @@ class qp_mgr; class qp_mgr_eth_mlx5; class ring_simple; -#define LOCAL_IF_INFO_INVALID \ - (local_if_info_t) { 0, 0 } - /* Get CQE opcode. */ #define MLX5_CQE_OPCODE(op_own) ((op_own) >> 4) /* Get CQE owner bit. 
*/ #define MLX5_CQE_OWNER(op_own) ((op_own)&MLX5_CQE_OWNER_MASK) -struct cq_request_info_t { - struct ibv_device *p_ibv_device; - struct ibv_context *p_ibv_context; - int n_port; - qp_mgr *p_qp_mgr; -}; - -struct buff_lst_info_t { - mem_buf_desc_t *buff_lst; - uint32_t n_buff_num; -}; - -typedef std::pair local_if_info_key_t; - -typedef struct local_if_info_t { - in_addr_t addr; - uint32_t attached_grp_ref_cnt; -} local_if_info_t; - struct qp_rec { qp_mgr *qp; int debt; @@ -96,9 +74,9 @@ struct qp_rec { // Class cq_mgr // class cq_mgr { - friend class ring; // need to expose the m_n_global_sn only to ring - friend class ring_simple; // need to expose the m_n_global_sn only to ring - friend class ring_bond; // need to expose the m_n_global_sn only to ring + friend class ring; // need to expose the m_n_global_sn_rx only to ring + friend class ring_simple; // need to expose the m_n_global_sn_rx only to ring + friend class ring_bond; // need to expose the m_n_global_sn_rx only to ring friend class rfs_uc_tcp_gro; // need for stats public: @@ -111,15 +89,14 @@ class cq_mgr { }; cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, - struct ibv_comp_channel *p_comp_event_channel, bool is_rx, bool config = true); + struct ibv_comp_channel *p_comp_event_channel); virtual ~cq_mgr(); void configure(int cq_size); - ibv_cq *get_ibv_cq_hndl(); - int get_channel_fd(); - // ack events and rearm CQ - int ack_and_request_notification(); + ibv_cq *get_ibv_cq_hndl() { return m_p_ibv_cq; } + int get_channel_fd() { return m_comp_event_channel->fd; } + /** * Arm the managed CQ's notification channel * Calling this more then once without get_event() will return without @@ -151,7 +128,6 @@ class cq_mgr { * < 0 error */ virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL) = 0; - int poll_and_process_element_tx(uint64_t *p_cq_poll_sn); virtual mem_buf_desc_t *poll_and_process_socketxtreme() { return nullptr; }; /** @@ -171,9 +147,6 @@ class cq_mgr { virtual void add_qp_rx(qp_mgr *qp); virtual void del_qp_rx(qp_mgr *qp); - void add_qp_tx(qp_mgr *qp); - virtual void del_qp_tx(qp_mgr *qp); - virtual uint32_t clean_cq() = 0; bool reclaim_recv_buffers(descq_t *rx_reuse); @@ -181,12 +154,6 @@ class cq_mgr { bool reclaim_recv_buffers_no_lock(mem_buf_desc_t *rx_reuse_lst); int reclaim_recv_single_buffer(mem_buf_desc_t *rx_reuse); - // maps between qpn and vlan id to the local interface - void map_vlan_and_qpn_to_local_if(int qp_num, uint16_t vlan_id, in_addr_t local_if); - - // unmaps the qpn and vlan id - void unmap_vlan_and_qpn(int qp_num, uint16_t vlan_id); - void get_cq_event(int count = 1) { xlio_ib_mlx5_get_cq_event(&m_mlx5_cq, count); }; protected: @@ -202,20 +169,12 @@ class cq_mgr { void lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc); inline void process_recv_buffer(mem_buf_desc_t *buff, void *pv_fd_ready_array = NULL); - inline void update_global_sn(uint64_t &cq_poll_sn, uint32_t rettotal); + inline void update_global_sn_rx(uint64_t &cq_poll_sn, uint32_t rettotal); inline struct xlio_mlx5_cqe *check_cqe(void); mem_buf_desc_t *cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status); - /* Process a WCE... meaning... 
- * - extract the mem_buf_desc from the wce.wr_id and then loop on all linked mem_buf_desc - * and deliver them to their owner for further processing (sockinfo on Tx path and ib_conn_mgr - * on Rx path) - * - for Tx wce the data buffers will be released to the associated ring before the mem_buf_desc - * are returned - */ - mem_buf_desc_t *cqe_log_and_get_buf_tx(xlio_ibv_wc *p_wce); virtual void reclaim_recv_buffer_helper(mem_buf_desc_t *buff); // Returns true if the given buffer was used, @@ -224,32 +183,27 @@ class cq_mgr { inline uint32_t process_recv_queue(void *pv_fd_ready_array = NULL); virtual void statistics_print(); - virtual void prep_ibv_cq(xlio_ibv_cq_init_attr &attr) const; - // returns list of buffers to the owner. - void process_tx_buffer_list(mem_buf_desc_t *p_mem_buf_desc); xlio_ib_mlx5_cq_t m_mlx5_cq; - qp_mgr_eth_mlx5 *m_qp; - mem_buf_desc_t *m_rx_hot_buffer; - struct ibv_cq *m_p_ibv_cq; - bool m_b_is_rx; + qp_mgr_eth_mlx5 *m_qp = nullptr; + mem_buf_desc_t *m_rx_hot_buffer = nullptr; + struct ibv_cq *m_p_ibv_cq = nullptr; descq_t m_rx_queue; - static uint64_t m_n_global_sn; - uint32_t m_cq_id; - uint32_t m_n_cq_poll_sn; + static uint64_t m_n_global_sn_rx; + uint32_t m_cq_id_rx = 0U; + uint32_t m_n_cq_poll_sn_rx = 0U; ring_simple *m_p_ring; - uint32_t m_n_wce_counter; - bool m_b_was_drained; - bool m_b_is_rx_hw_csum_on; + uint32_t m_n_wce_counter = 0U; + bool m_b_was_drained = false; + bool m_b_is_rx_hw_csum_on = false; qp_rec m_qp_rec; const uint32_t m_n_sysvar_cq_poll_batch_max; const uint32_t m_n_sysvar_progress_engine_wce_max; cq_stats_t *m_p_cq_stat; - transport_type_t m_transport_type; - mem_buf_desc_t *m_p_next_rx_desc_poll; + mem_buf_desc_t *m_p_next_rx_desc_poll = nullptr; uint32_t m_n_sysvar_rx_prefetch_bytes_before_poll; const uint32_t m_n_sysvar_rx_prefetch_bytes; - size_t m_sz_transport_header; + size_t m_sz_transport_header = ETH_HDR_LEN; ib_ctx_handler *m_p_ib_ctx_handler; const uint32_t m_n_sysvar_rx_num_wr_to_post_recv; descq_t m_rx_pool; @@ -258,49 +212,26 @@ class cq_mgr { * represented as struct xlio_buff_t * from user application by special XLIO extended API */ - mem_buf_desc_t *m_rx_buffs_rdy_for_free_head; - mem_buf_desc_t *m_rx_buffs_rdy_for_free_tail; + mem_buf_desc_t *m_rx_buffs_rdy_for_free_head = nullptr; + mem_buf_desc_t *m_rx_buffs_rdy_for_free_tail = nullptr; private: struct ibv_comp_channel *m_comp_event_channel; - bool m_b_notification_armed; + bool m_b_notification_armed = false; const uint32_t m_n_sysvar_qp_compensation_level; const uint32_t m_rx_lkey; const bool m_b_sysvar_cq_keep_qp_full; - int32_t m_n_out_of_free_bufs_warning; cq_stats_t m_cq_stat_static; - static atomic_t m_n_cq_id_counter; - - inline struct xlio_mlx5_cqe *get_cqe_tx(uint32_t &num_polled_cqes); - - void log_cqe_error(struct xlio_mlx5_cqe *cqe); - - void handle_sq_wqe_prop(unsigned index); - - void handle_tcp_ctl_packets(uint32_t rx_processed, void *pv_fd_ready_array); + static atomic_t m_n_cq_id_counter_rx; // requests safe_mce_sys().qp_compensation_level buffers from global pool bool request_more_buffers() __attribute__((noinline)); // returns safe_mce_sys().qp_compensation_level buffers to global pool void return_extra_buffers() __attribute__((noinline)); - - // Finds and sets the local if to which the buff is addressed (according to qpn and vlan id). - inline void find_buff_dest_local_if(mem_buf_desc_t *buff); - - // Finds and sets the xlio if to which the buff is addressed (according to qpn). 
- inline void find_buff_dest_xlio_if_ctx(mem_buf_desc_t *buff); - - void process_cq_element_log_helper(mem_buf_desc_t *p_mem_buf_desc, xlio_ibv_wc *p_wce); - - int req_notify_cq() { return xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0); }; }; -// Helper gunction to extract the Tx cq_mgr from the CQ event, -// Since we have a single TX CQ comp channel for all cq_mgr's, it might not be the active_cq object -cq_mgr *get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel); - -inline void cq_mgr::update_global_sn(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) +inline void cq_mgr::update_global_sn_rx(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) { if (num_polled_cqes > 0) { // spoil the global sn if we have packets ready @@ -311,46 +242,14 @@ inline void cq_mgr::update_global_sn(uint64_t &cq_poll_sn, uint32_t num_polled_c uint32_t cq_sn; } bundle; } next_sn; - m_n_cq_poll_sn += num_polled_cqes; - next_sn.bundle.cq_sn = m_n_cq_poll_sn; - next_sn.bundle.cq_id = m_cq_id; + m_n_cq_poll_sn_rx += num_polled_cqes; + next_sn.bundle.cq_sn = m_n_cq_poll_sn_rx; + next_sn.bundle.cq_id = m_cq_id_rx; - m_n_global_sn = next_sn.global_sn; + m_n_global_sn_rx = next_sn.global_sn; } - cq_poll_sn = m_n_global_sn; -} - -inline struct xlio_mlx5_cqe *cq_mgr::get_cqe_tx(uint32_t &num_polled_cqes) -{ - struct xlio_mlx5_cqe *cqe_ret = nullptr; - struct xlio_mlx5_cqe *cqe = - (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + - ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) - << m_mlx5_cq.cqe_size_log)); - - /* According to PRM, SW ownership bit flips with every CQ overflow. Since cqe_count is - * a power of 2, we use it to get cq_ci bit just after the significant bits. The bit changes - * with each CQ overflow and actually equals to the SW ownership bit. - */ - while (((cqe->op_own & MLX5_CQE_OWNER_MASK) == !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count)) && - ((cqe->op_own >> 4) != MLX5_CQE_INVALID)) { - ++m_mlx5_cq.cq_ci; - ++num_polled_cqes; - cqe_ret = cqe; - if (unlikely(cqe->op_own & 0x80)) { - // This is likely an error CQE. Return it explicitly to log the errors. 
- break; - } - cqe = (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + - ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) - << m_mlx5_cq.cqe_size_log)); - } - if (cqe_ret) { - rmb(); - *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci); - } - return cqe_ret; + cq_poll_sn = m_n_global_sn_rx; } inline struct xlio_mlx5_cqe *cq_mgr::check_cqe(void) diff --git a/src/core/dev/cq_mgr_regrq.cpp b/src/core/dev/cq_mgr_regrq.cpp index d18b20577..a33ac9505 100644 --- a/src/core/dev/cq_mgr_regrq.cpp +++ b/src/core/dev/cq_mgr_regrq.cpp @@ -52,9 +52,8 @@ #define cq_logfuncall __log_info_funcall cq_mgr_regrq::cq_mgr_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - struct ibv_comp_channel *p_comp_event_channel, bool is_rx, - bool call_configure) - : cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, is_rx, call_configure) + struct ibv_comp_channel *p_comp_event_channel) + : cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) { cq_logfunc(""); } @@ -65,89 +64,30 @@ uint32_t cq_mgr_regrq::clean_cq() uint64_t cq_poll_sn = 0; mem_buf_desc_t *buff; - if (m_b_is_rx) { - /* Sanity check for cq: initialization of tx and rx cq has difference: - * tx - is done in qp_mgr::configure() - * rx - is done in qp_mgr::up() - * as a result rx cq can be created but not initialized - */ - if (NULL == m_qp) { - return 0; - } - - buff_status_e status = BS_OK; - while ((buff = poll(status))) { - if (cqe_process_rx(buff, status)) { - m_rx_queue.push_back(buff); - } - ++ret_total; - } - update_global_sn(cq_poll_sn, ret_total); - } else { // Tx - int ret = 0; - /* coverity[stack_use_local_overflow] */ - xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - while ((ret = poll_tx(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn)) > 0) { - for (int i = 0; i < ret; i++) { - buff = cqe_log_and_get_buf_tx(&wce[i]); - if (buff) { - m_p_ring->mem_buf_desc_return_single_to_owner_tx(buff); - } - } - ret_total += ret; - } + /* Sanity check for cq: initialization of tx and rx cq has difference: + * tx - is done in qp_mgr::configure() + * rx - is done in qp_mgr::up() + * as a result rx cq can be created but not initialized + */ + if (NULL == m_qp) { + return 0; } - return ret_total; -} - -int cq_mgr_regrq::poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn) -{ - // Assume locked!!! 
- cq_logfuncall(""); - - int ret = xlio_ibv_poll_cq(m_p_ibv_cq, num_entries, p_wce); - if (ret <= 0) { - // Zero polled wce OR ibv_poll_cq() has driver specific errors - // so we can't really do anything with them - *p_cq_poll_sn = m_n_global_sn; - return 0; - } - - if (unlikely(g_vlogger_level >= VLOG_FUNC_ALL)) { - for (int i = 0; i < ret; i++) { - cq_logfuncall("wce[%d] info wr_id=%x, status=%x, opcode=%x, vendor_err=%x, " - "byte_len=%d, imm_data=%x", - i, p_wce[i].wr_id, p_wce[i].status, xlio_wc_opcode(p_wce[i]), - p_wce[i].vendor_err, p_wce[i].byte_len, p_wce[i].imm_data); - cq_logfuncall("qp_num=%x, src_qp=%x, wc_flags=%x, pkey_index=%x, slid=%x, sl=%x, " - "dlid_path_bits=%x", - p_wce[i].qp_num, p_wce[i].src_qp, xlio_wc_flags(p_wce[i]), - p_wce[i].pkey_index, p_wce[i].slid, p_wce[i].sl, p_wce[i].dlid_path_bits); + buff_status_e status = BS_OK; + while ((buff = poll(status))) { + if (cqe_process_rx(buff, status)) { + m_rx_queue.push_back(buff); } + ++ret_total; } + update_global_sn_rx(cq_poll_sn, ret_total); - // spoil the global sn if we have packets ready - union __attribute__((packed)) { - uint64_t global_sn; - struct { - uint32_t cq_id; - uint32_t cq_sn; - } bundle; - } next_sn; - next_sn.bundle.cq_sn = ++m_n_cq_poll_sn; - next_sn.bundle.cq_id = m_cq_id; - - *p_cq_poll_sn = m_n_global_sn = next_sn.global_sn; - - return ret; + return ret_total; } - cq_mgr_regrq::~cq_mgr_regrq() { - cq_logfunc(""); - cq_logdbg("destroying CQ REGRQ as %s", (m_b_is_rx ? "Rx" : "Tx")); + cq_logdbg("Destroying CQ REGRQ"); } mem_buf_desc_t *cq_mgr_regrq::poll(enum buff_status_e &status) @@ -276,8 +216,7 @@ int cq_mgr_regrq::drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e m_p_cq_stat->n_rx_pkt_drop++; reclaim_recv_buffer_helper(buff); } else { - bool procces_now = - (m_transport_type == XLIO_TRANSPORT_ETH ? is_eth_tcp_frame(buff) : false); + bool procces_now = is_eth_tcp_frame(buff); if (procces_now) { // We process immediately all non udp/ip traffic.. buff->rx.is_xlio_thr = true; @@ -328,7 +267,7 @@ int cq_mgr_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*= buff_status_e status = BS_OK; mem_buf_desc_t *buff = poll(status); if (NULL == buff) { - update_global_sn(cq_poll_sn, ret_total); + update_global_sn_rx(cq_poll_sn, ret_total); m_b_was_drained = true; m_p_ring->m_gro_mgr.flush_all(NULL); return ret_total; @@ -341,10 +280,8 @@ int cq_mgr_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*= m_p_cq_stat->n_rx_pkt_drop++; reclaim_recv_buffer_helper(buff); } else { - bool procces_now = false; - if (m_transport_type == XLIO_TRANSPORT_ETH) { - procces_now = is_eth_tcp_frame(buff); - } + bool procces_now = is_eth_tcp_frame(buff); + /* We process immediately all non udp/ip traffic.. 
*/ if (procces_now) { buff->rx.is_xlio_thr = true; @@ -371,7 +308,7 @@ int cq_mgr_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*= ++ret_total; } - update_global_sn(cq_poll_sn, ret_total); + update_global_sn_rx(cq_poll_sn, ret_total); m_p_ring->m_gro_mgr.flush_all(NULL); @@ -446,7 +383,7 @@ int cq_mgr_regrq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_f } } - update_global_sn(*p_cq_poll_sn, ret); + update_global_sn_rx(*p_cq_poll_sn, ret); if (likely(ret > 0)) { ret_rx_processed += ret; diff --git a/src/core/dev/cq_mgr_regrq.h b/src/core/dev/cq_mgr_regrq.h index 342be2102..f2f2781f4 100644 --- a/src/core/dev/cq_mgr_regrq.h +++ b/src/core/dev/cq_mgr_regrq.h @@ -38,8 +38,7 @@ class cq_mgr_regrq : public cq_mgr { public: cq_mgr_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - struct ibv_comp_channel *p_comp_event_channel, bool is_rx, - bool call_configure = true); + struct ibv_comp_channel *p_comp_event_channel); virtual ~cq_mgr_regrq(); virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) override; @@ -51,7 +50,6 @@ class cq_mgr_regrq : public cq_mgr { protected: mem_buf_desc_t *poll(enum buff_status_e &status); - int poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn); inline void cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc, enum buff_status_e &status); diff --git a/src/core/dev/cq_mgr_strq.cpp b/src/core/dev/cq_mgr_strq.cpp index 653d10f59..e245eba84 100644 --- a/src/core/dev/cq_mgr_strq.cpp +++ b/src/core/dev/cq_mgr_strq.cpp @@ -58,9 +58,8 @@ cq_mgr_strq::cq_mgr_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, uint32_t stride_size_bytes, uint32_t strides_num, - struct ibv_comp_channel *p_comp_event_channel, - bool call_configure) - : cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, true, call_configure) + struct ibv_comp_channel *p_comp_event_channel) + : cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) , _owner_ring(p_ring) , _stride_size_bytes(stride_size_bytes) , _strides_num(strides_num) @@ -150,7 +149,7 @@ uint32_t cq_mgr_strq::clean_cq() stride_buf = nullptr; } - update_global_sn(cq_poll_sn, ret_total); + update_global_sn_rx(cq_poll_sn, ret_total); return ret_total; } @@ -356,8 +355,7 @@ int cq_mgr_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_desc_t m_p_cq_stat->n_rx_pkt_drop++; reclaim_recv_buffer_helper(buff); } else { - bool procces_now = - (m_transport_type == XLIO_TRANSPORT_ETH ? is_eth_tcp_frame(buff) : false); + bool procces_now = is_eth_tcp_frame(buff); // We process immediately all non udp/ip traffic.. if (procces_now) { @@ -400,7 +398,7 @@ int cq_mgr_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id) mem_buf_desc_t *buff = nullptr; mem_buf_desc_t *buff_wqe = poll(status, buff); if (!buff && !buff_wqe) { - update_global_sn(cq_poll_sn, ret_total); + update_global_sn_rx(cq_poll_sn, ret_total); m_b_was_drained = true; m_p_ring->m_gro_mgr.flush_all(nullptr); return ret_total; @@ -410,7 +408,7 @@ int cq_mgr_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id) drain_and_proccess_helper(buff, buff_wqe, status, p_recycle_buffers_last_wr_id); } - update_global_sn(cq_poll_sn, ret_total); + update_global_sn_rx(cq_poll_sn, ret_total); m_p_ring->m_gro_mgr.flush_all(nullptr); m_n_wce_counter = 0; // Actually strides count. 
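For reference, below is a minimal standalone sketch of the CQE software-ownership test implemented by check_cqe(), which an earlier patch in this series moved into cq_mgr.h. It is a simplified illustration, not part of the patch: mini_cq and mini_cqe are stand-ins for xlio_ib_mlx5_cq_t and xlio_mlx5_cqe, and the two constants assume the usual mlx5 values. The logic is the same: a CQE belongs to software when its owner bit matches the wrap parity of the consumer index (cq_ci & cqe_count) and its opcode is not INVALID.

#include <cstdint>
#include <cstdio>
#include <vector>

static constexpr uint8_t MINI_CQE_INVALID = 0xf;    // assumed to match MLX5_CQE_INVALID
static constexpr uint8_t MINI_CQE_OWNER_MASK = 0x1; // assumed to match MLX5_CQE_OWNER_MASK

struct mini_cqe {
    uint8_t op_own; // opcode in the high nibble, owner bit in bit 0
};

struct mini_cq {
    std::vector<mini_cqe> buf; // cqe_count entries, cqe_count is a power of two
    uint32_t cq_ci = 0;        // consumer index, increases monotonically
};

// Same test as check_cqe(): return the current CQE only if software owns it.
static mini_cqe *mini_check_cqe(mini_cq &cq)
{
    const uint32_t cqe_count = cq.buf.size();
    mini_cqe *cqe = &cq.buf[cq.cq_ci & (cqe_count - 1)];

    const int owner = cqe->op_own & MINI_CQE_OWNER_MASK;         // owner bit written by HW
    const int sw_parity = !!(cq.cq_ci & cqe_count);               // flips on every CQ wrap-around
    const bool valid = ((cqe->op_own >> 4) != MINI_CQE_INVALID);  // opcode must not be INVALID

    return (valid && owner == sw_parity) ? cqe : nullptr;
}

int main()
{
    mini_cq cq;
    cq.buf.resize(4, mini_cqe {uint8_t(MINI_CQE_INVALID << 4)}); // empty CQ: nothing to poll
    printf("empty: %p\n", (void *)mini_check_cqe(cq));           // prints a null pointer

    cq.buf[0].op_own = 0x00; // pretend HW wrote a CQE whose owner bit matches lap 0
    printf("ready: %p\n", (void *)mini_check_cqe(cq));           // prints the CQE address
    return 0;
}

Once a valid CQE is consumed, the real code advances cq_ci and publishes it through the CQ doorbell record (*m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci) after an rmb(), as get_cqe_tx() does); the sketch leaves that part out.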
@@ -500,7 +498,7 @@ int cq_mgr_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd } } - update_global_sn(*p_cq_poll_sn, ret); + update_global_sn_rx(*p_cq_poll_sn, ret); if (likely(ret > 0)) { m_n_wce_counter += ret; // Actually strides count. diff --git a/src/core/dev/cq_mgr_strq.h b/src/core/dev/cq_mgr_strq.h index 061db0cbe..e9d1468eb 100644 --- a/src/core/dev/cq_mgr_strq.h +++ b/src/core/dev/cq_mgr_strq.h @@ -41,7 +41,7 @@ class cq_mgr_strq : public cq_mgr { public: cq_mgr_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, uint32_t stride_size_bytes, uint32_t strides_num, - struct ibv_comp_channel *p_comp_event_channel, bool call_configure = true); + struct ibv_comp_channel *p_comp_event_channel); virtual ~cq_mgr_strq() override; diff --git a/src/core/dev/cq_mgr_tx.cpp b/src/core/dev/cq_mgr_tx.cpp new file mode 100644 index 000000000..acc118ef3 --- /dev/null +++ b/src/core/dev/cq_mgr_tx.cpp @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "dev/cq_mgr_tx.h" +#include +#include +#include "ring_simple.h" +#include "qp_mgr_eth_mlx5.h" + +#define MODULE_NAME "cq_mgr_tx" + +#define cq_logpanic __log_info_panic +#define cq_logerr __log_info_err +#define cq_logwarn __log_info_warn +#define cq_loginfo __log_info_info +#define cq_logdbg __log_info_dbg +#define cq_logfunc __log_info_func +#define cq_logfuncall __log_info_funcall + +atomic_t cq_mgr_tx::m_n_cq_id_counter_tx = ATOMIC_INIT(1); + +uint64_t cq_mgr_tx::m_n_global_sn_tx = 0U; + + cq_mgr_tx::cq_mgr_tx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, + ibv_comp_channel *p_comp_event_channel) + : m_p_ring(p_ring) + , m_p_ib_ctx_handler(p_ib_ctx_handler) + , m_comp_event_channel(p_comp_event_channel) +{ + m_cq_id_tx = atomic_fetch_and_inc(&m_n_cq_id_counter_tx); // cq id is nonzero + configure(cq_size); + + memset(&m_mlx5_cq, 0, sizeof(m_mlx5_cq)); +} + +cq_mgr_tx::~cq_mgr_tx() +{ + cq_logdbg("Destroying CQ as Tx"); + + IF_VERBS_FAILURE_EX(ibv_destroy_cq(m_p_ibv_cq), EIO) + { + cq_logdbg("destroy cq failed (errno=%d %m)", errno); + } + ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(m_p_ibv_cq, sizeof(ibv_cq)); + cq_logdbg("Destroying CQ as Tx done"); +} + +uint32_t cq_mgr_tx::clean_cq() +{ + uint32_t ret_total = 0; + uint64_t cq_poll_sn = 0; + mem_buf_desc_t *buff; + + int ret = 0; + /* coverity[stack_use_local_overflow] */ + xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; + while ((ret = clean_cq_poll_tx(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn)) > 0) { + for (int i = 0; i < ret; i++) { + buff = (mem_buf_desc_t *)(uintptr_t)(wce[i].wr_id); + if (buff) { + m_p_ring->mem_buf_desc_return_single_to_owner_tx(buff); + } + } + ret_total += ret; + } + + return ret_total; +} + +int cq_mgr_tx::clean_cq_poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn) +{ + // Assume locked!!! + cq_logfuncall(""); + + int ret = xlio_ibv_poll_cq(m_p_ibv_cq, num_entries, p_wce); + if (ret <= 0) { + // Zero polled wce OR ibv_poll_cq() has driver specific errors + // so we can't really do anything with them + *p_cq_poll_sn = m_n_global_sn_tx; + return 0; + } + + if (unlikely(g_vlogger_level >= VLOG_FUNC_ALL)) { + for (int i = 0; i < ret; i++) { + cq_logfuncall("wce[%d] info wr_id=%x, status=%x, opcode=%x, vendor_err=%x, " + "byte_len=%d, imm_data=%x", + i, p_wce[i].wr_id, p_wce[i].status, xlio_wc_opcode(p_wce[i]), + p_wce[i].vendor_err, p_wce[i].byte_len, p_wce[i].imm_data); + cq_logfuncall("qp_num=%x, src_qp=%x, wc_flags=%x, pkey_index=%x, slid=%x, sl=%x, " + "dlid_path_bits=%x", + p_wce[i].qp_num, p_wce[i].src_qp, xlio_wc_flags(p_wce[i]), + p_wce[i].pkey_index, p_wce[i].slid, p_wce[i].sl, p_wce[i].dlid_path_bits); + } + } + + // spoil the global sn if we have packets ready + union __attribute__((packed)) { + uint64_t global_sn; + struct { + uint32_t cq_id; + uint32_t cq_sn; + } bundle; + } next_sn; + next_sn.bundle.cq_sn = ++m_n_cq_poll_sn_tx; + next_sn.bundle.cq_id = m_cq_id_tx; + + *p_cq_poll_sn = m_n_global_sn_tx = next_sn.global_sn; + + return ret; +} + +void cq_mgr_tx::configure(int cq_size) +{ + xlio_ibv_cq_init_attr attr; + memset(&attr, 0, sizeof(attr)); + + struct ibv_context *context = m_p_ib_ctx_handler->get_ibv_context(); + int comp_vector = 0; +#if defined(DEFINED_NGINX) + /* + * For NGINX scenario we may want to distribute CQs across multiple + * CPUs to improve CPS in case of multiple NGINX worker processes. 
+ */ + if (safe_mce_sys().nginx_distribute_cq_interrupts) { + comp_vector = g_worker_index % context->num_comp_vectors; + } +#endif + m_p_ibv_cq = xlio_ibv_create_cq(context, cq_size - 1, (void *)this, m_comp_event_channel, + comp_vector, &attr); + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_p_ibv_cq) { + throw_xlio_exception("ibv_create_cq failed"); + } + BULLSEYE_EXCLUDE_BLOCK_END + VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_cq, sizeof(ibv_cq)); + + cq_logdbg("Created CQ as Tx with fd[%d] and of size %d elements (ibv_cq_hndl=%p)", + get_channel_fd(), cq_size, m_p_ibv_cq); +} + +void cq_mgr_tx::add_qp_tx(qp_mgr *qp) +{ + // Assume locked! + cq_logdbg("qp_mgr=%p", qp); + m_qp = static_cast(qp); + + if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { + cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); + } + + cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, + m_mlx5_cq.cq_buf); +} + +void cq_mgr_tx::del_qp_tx(qp_mgr *qp) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (m_qp != qp) { + cq_logdbg("wrong qp_mgr=%p != m_qp=%p", qp, m_qp); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + cq_logdbg("qp_mgr=%p", m_qp); + m_qp = nullptr; +} + +int cq_mgr_tx::request_notification(uint64_t poll_sn) +{ + int ret = -1; + + cq_logfuncall(""); + + if ((m_n_global_sn_tx > 0 && poll_sn != m_n_global_sn_tx)) { + // The cq_mgr's has receive packets pending processing (or got processed since cq_poll_sn) + cq_logfunc("miss matched poll sn (user=0x%lx, cq=0x%lx)", poll_sn, m_n_cq_poll_sn_tx); + return 1; + } + + if (m_b_notification_armed == false) { + + cq_logfunc("arming cq_mgr notification channel"); + + // Arm the CQ notification channel + IF_VERBS_FAILURE(xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0)) + { + cq_logerr("Failure arming the qp_mgr notification channel (errno=%d %m)", errno); + } + else + { + ret = 0; + m_b_notification_armed = true; + } + ENDIF_VERBS_FAILURE; + } else { + // cq_mgr notification channel already armed + ret = 0; + } + + cq_logfuncall("returning with %d", ret); + return ret; +} + +cq_mgr_tx *cq_mgr_tx::get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel) +{ + cq_mgr_tx *p_cq_mgr = NULL; + struct ibv_cq *p_cq_hndl = NULL; + void *p_context; // deal with compiler warnings + + // read & ack the CQ event + IF_VERBS_FAILURE(ibv_get_cq_event(p_cq_channel, &p_cq_hndl, &p_context)) + { + vlog_printf(VLOG_INFO, + MODULE_NAME ":%d: waiting on cq_mgr event returned with error (errno=%d %m)\n", + __LINE__, errno); + } + else + { + p_cq_mgr = (cq_mgr_tx *)p_context; // Save the cq_mgr + p_cq_mgr->get_cq_event(); + ibv_ack_cq_events(p_cq_hndl, 1); // Ack the ibv event + } + ENDIF_VERBS_FAILURE; + + return p_cq_mgr; +} + +int cq_mgr_tx::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) +{ + cq_logfuncall(""); + + static auto is_error_opcode = [&](uint8_t opcode) { + return opcode == MLX5_CQE_REQ_ERR || opcode == MLX5_CQE_RESP_ERR; + }; + + int ret = 0; + uint32_t num_polled_cqes = 0; + xlio_mlx5_cqe *cqe = get_cqe_tx(num_polled_cqes); + + if (likely(cqe)) { + unsigned index = ntohs(cqe->wqe_counter) & (m_qp->m_tx_num_wr - 1); + + // All error opcodes have the most significant bit set. 
+ if (unlikely(cqe->op_own & 0x80) && is_error_opcode(cqe->op_own >> 4)) { + //m_p_cq_stat->n_tx_cqe_error++; Future counter + log_cqe_error(cqe); + } + + handle_sq_wqe_prop(index); + ret = 1; + } + update_global_sn_tx(*p_cq_poll_sn, num_polled_cqes); + + return ret; +} + +void cq_mgr_tx::log_cqe_error(struct xlio_mlx5_cqe *cqe) +{ + struct mlx5_err_cqe *ecqe = (struct mlx5_err_cqe *)cqe; + + /* TODO We can also ask qp_mgr to log WQE fields from SQ. But at first, we need to remove + * prefetch and memset of the next WQE there. Credit system will guarantee that we don't + * reuse the WQE at this point. + */ + + if (MLX5_CQE_SYNDROME_WR_FLUSH_ERR != ecqe->syndrome) { + cq_logwarn("cqe: syndrome=0x%x vendor=0x%x hw=0x%x (type=0x%x) wqe_opcode_qpn=0x%x " + "wqe_counter=0x%x", + ecqe->syndrome, ecqe->vendor_err_synd, *((uint8_t *)&ecqe->rsvd1 + 16), + *((uint8_t *)&ecqe->rsvd1 + 17), ntohl(ecqe->s_wqe_opcode_qpn), + ntohs(ecqe->wqe_counter)); + } +} + +void cq_mgr_tx::handle_sq_wqe_prop(unsigned index) +{ + sq_wqe_prop *p = &m_qp->m_sq_wqe_idx_to_prop[index]; + sq_wqe_prop *prev; + unsigned credits = 0; + + /* + * TX completions can be signalled for a set of WQEs as an optimization. + * Therefore, for every TX completion we may need to handle multiple + * WQEs. Since every WQE can have various size and the WQE index is + * wrapped around, we build a linked list to simplify things. Each + * element of the linked list represents properties of a previously + * posted WQE. + * + * We keep index of the last completed WQE and stop processing the list + * when we reach the index. This condition is checked in + * is_sq_wqe_prop_valid(). + */ + + do { + if (p->buf) { + m_p_ring->mem_buf_desc_return_single_locked(p->buf); + } + if (p->ti) { + xlio_ti *ti = p->ti; + if (ti->m_callback) { + ti->m_callback(ti->m_callback_arg); + } + + ti->put(); + if (unlikely(ti->m_released && ti->m_ref == 0)) { + m_qp->ti_released(ti); + } + } + credits += p->credits; + + prev = p; + p = p->next; + } while (p != NULL && m_qp->is_sq_wqe_prop_valid(p, prev)); + + m_p_ring->return_tx_pool_to_global_pool(); + m_qp->credits_return(credits); + m_qp->m_sq_wqe_prop_last_signalled = index; +} diff --git a/src/core/dev/cq_mgr_tx.h b/src/core/dev/cq_mgr_tx.h new file mode 100644 index 000000000..b2ed502dc --- /dev/null +++ b/src/core/dev/cq_mgr_tx.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CQ_MGR_TX_H +#define CQ_MGR_TX_H + +#include "dev/ib_ctx_handler.h" + +class qp_mgr; +class qp_mgr_eth_mlx5; +class ring_simple; + +class cq_mgr_tx { +public: + + cq_mgr_tx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, + ibv_comp_channel *p_comp_event_channel); + ~cq_mgr_tx(); + + // Helper gunction to extract the Tx cq_mgr from the CQ event, + // Since we have a single TX CQ comp channel for all cq_mgr's, it might not be the active_cq object + static cq_mgr_tx *get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel); + + ibv_cq *get_ibv_cq_hndl() { return m_p_ibv_cq; } + int get_channel_fd() { return m_comp_event_channel->fd; } + + void configure(int cq_size); + void add_qp_tx(qp_mgr *qp); + void del_qp_tx(qp_mgr *qp); + + uint32_t clean_cq(); + + /** + * Arm the managed CQ's notification channel + * Calling this more then once without get_event() will return without + * doing anything (arm flag is changed to true on first call). + * This call will also check if a wce was processes between the + * last poll and this arm request - if true it will not arm the CQ + * @return ==0 cq is armed + * ==1 cq not armed (cq poll_sn out of sync) + * < 0 on error + */ + int request_notification(uint64_t poll_sn); + + int poll_and_process_element_tx(uint64_t *p_cq_poll_sn); + + void reset_notification_armed() { m_b_notification_armed = false; } + +private: + + void log_cqe_error(struct xlio_mlx5_cqe *cqe); + void handle_sq_wqe_prop(unsigned index); + int clean_cq_poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn); + + void get_cq_event(int count = 1) { xlio_ib_mlx5_get_cq_event(&m_mlx5_cq, count); }; + + inline void update_global_sn_tx(uint64_t &cq_poll_sn, uint32_t rettotal); + inline struct xlio_mlx5_cqe *get_cqe_tx(uint32_t &num_polled_cqes); + + static atomic_t m_n_cq_id_counter_tx; + static uint64_t m_n_global_sn_tx; + + xlio_ib_mlx5_cq_t m_mlx5_cq; + ring_simple *m_p_ring; + ib_ctx_handler *m_p_ib_ctx_handler; + ibv_comp_channel *m_comp_event_channel; + qp_mgr_eth_mlx5 *m_qp = nullptr; + struct ibv_cq *m_p_ibv_cq = nullptr; + uint32_t m_cq_id_tx = 0U; + uint32_t m_n_cq_poll_sn_tx = 0U; + bool m_b_notification_armed = false; +}; + +inline void cq_mgr_tx::update_global_sn_tx(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) +{ + if (num_polled_cqes > 0) { + // spoil the global sn if we have packets ready + union __attribute__((packed)) { + uint64_t global_sn; + struct { + uint32_t cq_id; + uint32_t cq_sn; + } bundle; + } next_sn; + m_n_cq_poll_sn_tx += num_polled_cqes; + next_sn.bundle.cq_sn = m_n_cq_poll_sn_tx; + next_sn.bundle.cq_id = m_cq_id_tx; + + m_n_global_sn_tx = next_sn.global_sn; + } + + cq_poll_sn = m_n_global_sn_tx; +} + +inline struct xlio_mlx5_cqe *cq_mgr_tx::get_cqe_tx(uint32_t &num_polled_cqes) +{ + struct xlio_mlx5_cqe *cqe_ret = nullptr; + struct xlio_mlx5_cqe *cqe = + (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) + << m_mlx5_cq.cqe_size_log)); + + /* According to PRM, SW ownership bit flips with every CQ overflow. Since cqe_count is + * a power of 2, we use it to get cq_ci bit just after the significant bits. 
The bit changes + * with each CQ overflow and actually equals to the SW ownership bit. + */ + while (((cqe->op_own & MLX5_CQE_OWNER_MASK) == !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count)) && + ((cqe->op_own >> 4) != MLX5_CQE_INVALID)) { + ++m_mlx5_cq.cq_ci; + ++num_polled_cqes; + cqe_ret = cqe; + if (unlikely(cqe->op_own & 0x80)) { + // This is likely an error CQE. Return it explicitly to log the errors. + break; + } + cqe = (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) + << m_mlx5_cq.cqe_size_log)); + } + if (cqe_ret) { + rmb(); + *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci); + } + return cqe_ret; +} + +#endif // CQ_MGR_TX_H diff --git a/src/core/dev/ib_ctx_handler.cpp b/src/core/dev/ib_ctx_handler.cpp index bf9bd40a2..cd4889127 100644 --- a/src/core/dev/ib_ctx_handler.cpp +++ b/src/core/dev/ib_ctx_handler.cpp @@ -439,12 +439,6 @@ void ib_ctx_handler::set_ctx_time_converter_status(ts_conversion_mode_t conversi #endif // DEFINED_IBV_CQ_TIMESTAMP } -ts_conversion_mode_t ib_ctx_handler::get_ctx_time_converter_status() -{ - return m_p_ctx_time_converter ? m_p_ctx_time_converter->get_converter_status() - : TS_CONVERSION_MODE_DISABLE; -} - uint32_t ib_ctx_handler::mem_reg(void *addr, size_t length, uint64_t access) { struct ibv_mr *mr = NULL; diff --git a/src/core/dev/ib_ctx_handler.h b/src/core/dev/ib_ctx_handler.h index 8c97ac3bf..7ece4da68 100644 --- a/src/core/dev/ib_ctx_handler.h +++ b/src/core/dev/ib_ctx_handler.h @@ -93,7 +93,6 @@ class ib_ctx_handler : public event_handler_ibverbs { uint32_t user_mem_reg(void *addr, size_t length, uint64_t access); bool is_removed() { return m_removed; } void set_ctx_time_converter_status(ts_conversion_mode_t conversion_mode); - ts_conversion_mode_t get_ctx_time_converter_status(); void set_flow_tag_capability(bool flow_tag_capability); bool get_flow_tag_capability() { return m_flow_tag_enabled; } // m_flow_tag_capability void set_burst_capability(bool burst); diff --git a/src/core/dev/qp_mgr.h b/src/core/dev/qp_mgr.h index c301829e7..d9b8f22c6 100644 --- a/src/core/dev/qp_mgr.h +++ b/src/core/dev/qp_mgr.h @@ -50,6 +50,7 @@ #include "infra/sender.h" #include "dev/ib_ctx_handler.h" #include "dev/cq_mgr.h" +#include "dev/cq_mgr_tx.h" #include "dev/rfs_rule.h" /* Forward declarations */ @@ -155,7 +156,7 @@ class qp_mgr { friend class cq_mgr; friend class cq_mgr_regrq; friend class cq_mgr_strq; - friend class cq_mgr_mp; + friend class cq_mgr_tx; public: qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr); @@ -176,7 +177,7 @@ class qp_mgr { int get_port_num() const { return m_port_num; } virtual uint16_t get_partiton() const { return 0; }; struct ibv_qp *get_ibv_qp() const { return m_qp; }; - class cq_mgr *get_tx_cq_mgr() const { return m_p_cq_mgr_tx; } + class cq_mgr_tx *get_tx_cq_mgr() const { return m_p_cq_mgr_tx; } class cq_mgr *get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } virtual uint32_t get_rx_max_wr_num(); // This function can be replaced with a parameter during ring creation. 
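// ----------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] cq_mgr_tx::get_cqe_tx() above locates
// the next CQE by masking cq_ci with (cqe_count - 1) and validates it with the software
// ownership bit, which flips on every wrap of the ring. A small standalone sketch of
// those two computations; cqe_slot() and cqe_is_ready() are hypothetical names, and the
// owner bit is assumed to be the least significant bit of op_own (MLX5_CQE_OWNER_MASK).
#include <cstddef>
#include <cstdint>

// Byte offset of the CQE indexed by cq_ci in a ring of cqe_count entries,
// each (1 << cqe_size_log) bytes long; cqe_count must be a power of two.
static inline size_t cqe_slot(uint32_t cq_ci, uint32_t cqe_count, uint32_t cqe_size_log)
{
    return static_cast<size_t>(cq_ci & (cqe_count - 1)) << cqe_size_log;
}

// The bit of cq_ci just above the index bits equals the ownership value hardware
// writes for CQEs of the current pass, so equality means the CQE is ready for software.
static inline bool cqe_is_ready(uint8_t op_own, uint32_t cq_ci, uint32_t cqe_count)
{
    const uint8_t owner_mask = 0x1; // assumption: owner bit is op_own's LSB
    return (op_own & owner_mask) == !!(cq_ci & cqe_count);
}
// ----------------------------------------------------------------------------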
@@ -330,7 +331,7 @@ class qp_mgr { uint32_t m_max_qp_wr; cq_mgr *m_p_cq_mgr_rx; - cq_mgr *m_p_cq_mgr_tx; + cq_mgr_tx *m_p_cq_mgr_tx; uint32_t m_rx_num_wr; uint32_t m_tx_num_wr; @@ -376,7 +377,7 @@ class qp_mgr { } virtual cq_mgr *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) = 0; - virtual cq_mgr *init_tx_cq_mgr(void) = 0; + virtual cq_mgr_tx *init_tx_cq_mgr(void) = 0; virtual int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, xlio_tis *tis, unsigned credits); diff --git a/src/core/dev/qp_mgr_eth_mlx5.cpp b/src/core/dev/qp_mgr_eth_mlx5.cpp index fa6e222c8..3c947663b 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5.cpp @@ -367,14 +367,14 @@ cq_mgr *qp_mgr_eth_mlx5::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event { return (!init_rx_cq_mgr_prepare() ? NULL : new cq_mgr_regrq(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, - p_rx_comp_event_channel, true)); + p_rx_comp_event_channel)); } -cq_mgr *qp_mgr_eth_mlx5::init_tx_cq_mgr() +cq_mgr_tx *qp_mgr_eth_mlx5::init_tx_cq_mgr() { m_tx_num_wr = align32pow2(m_tx_num_wr); - return new cq_mgr_regrq(m_p_ring, m_p_ib_ctx_handler, m_tx_num_wr, - m_p_ring->get_tx_comp_event_channel(), false); + return new cq_mgr_tx(m_p_ring, m_p_ib_ctx_handler, m_tx_num_wr, + m_p_ring->get_tx_comp_event_channel()); } inline void qp_mgr_eth_mlx5::ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top, diff --git a/src/core/dev/qp_mgr_eth_mlx5.h b/src/core/dev/qp_mgr_eth_mlx5.h index 4454a4faa..0950292bb 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.h +++ b/src/core/dev/qp_mgr_eth_mlx5.h @@ -64,6 +64,7 @@ typedef struct sq_wqe_prop sq_wqe_prop; class qp_mgr_eth_mlx5 : public qp_mgr_eth { friend class cq_mgr; friend class cq_mgr_regrq; + friend class cq_mgr_tx; public: qp_mgr_eth_mlx5(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, const uint16_t vlan, @@ -139,7 +140,7 @@ class qp_mgr_eth_mlx5 : public qp_mgr_eth { void init_qp(); void init_device_memory(); cq_mgr *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) override; - cq_mgr *init_tx_cq_mgr(void) override; + cq_mgr_tx *init_tx_cq_mgr(void) override; void put_tls_tir_in_cache(xlio_tir *tir); void put_tls_tis_in_cache(xlio_tis *tis); diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp index a976bdf1c..04a1bb681 100644 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp @@ -297,7 +297,7 @@ cq_mgr *qp_mgr_eth_mlx5_dpcp::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_ safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, safe_mce_sys().strq_stride_size_bytes, safe_mce_sys().strq_stride_num_per_rwqe, - p_rx_comp_event_channel, true)); + p_rx_comp_event_channel)); } void qp_mgr_eth_mlx5_dpcp::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) diff --git a/src/core/dev/ring_bond.cpp b/src/core/dev/ring_bond.cpp index 7ba89555c..e4e3d1ada 100644 --- a/src/core/dev/ring_bond.cpp +++ b/src/core/dev/ring_bond.cpp @@ -307,7 +307,7 @@ void ring_bond::restart() popup_xmit_rings(); int ret = 0; - uint64_t poll_sn = cq_mgr::m_n_global_sn; + uint64_t poll_sn = cq_mgr::m_n_global_sn_rx; ret = request_notification(CQT_RX, poll_sn); if (ret < 0) { ring_logdbg("failed arming rx cq_mgr (errno=%d %m)", errno); diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index 334272788..d11c3b436 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -664,11 +664,11 @@ mem_buf_desc_t 
*ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu // Find the correct Tx cq_mgr from the CQ event, // It might not be the active_cq object since we have a single TX CQ comp // channel for all cq_mgr's - cq_mgr *p_cq_mgr_tx = get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); + cq_mgr_tx *p_cq_mgr_tx = cq_mgr_tx::get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); if (p_cq_mgr_tx) { // Allow additional CQ arming now - p_cq_mgr_tx->m_b_notification_armed = false; + p_cq_mgr_tx->reset_notification_armed(); // Perform a non blocking event read, clear the fd channel ret = p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); @@ -854,11 +854,11 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) // Find the correct Tx cq_mgr from the CQ event, // It might not be the active_cq object since we have a single TX CQ comp // channel for all cq_mgr's - cq_mgr *p_cq_mgr_tx = get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); + cq_mgr_tx *p_cq_mgr_tx = cq_mgr_tx::get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); if (p_cq_mgr_tx) { // Allow additional CQ arming now - p_cq_mgr_tx->m_b_notification_armed = false; + p_cq_mgr_tx->reset_notification_armed(); // Perform a non blocking event read, clear the fd channel ret = p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index 67d81c90a..97b0f53d4 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -362,7 +362,7 @@ class ring_simple : public ring_slave { qp_mgr *m_p_qp_mgr; struct cq_moderation_info m_cq_moderation_info; cq_mgr *m_p_cq_mgr_rx; - cq_mgr *m_p_cq_mgr_tx; + cq_mgr_tx *m_p_cq_mgr_tx; std::unordered_map m_user_lkey_map; private: diff --git a/src/core/ib/base/verbs_extra.h b/src/core/ib/base/verbs_extra.h index 7471c8b18..9279f6501 100644 --- a/src/core/ib/base/verbs_extra.h +++ b/src/core/ib/base/verbs_extra.h @@ -171,10 +171,6 @@ typedef int xlio_ibv_cq_init_attr; // rx hw timestamp #define XLIO_IBV_WC_WITH_TIMESTAMP 0 #define xlio_wc_timestamp(wc) 0 -#define xlio_ibv_cq_init_ts_attr(attr) \ - { \ - NOT_IN_USE(attr); \ - } #ifdef DEFINED_IBV_CQ_TIMESTAMP #define XLIO_IBV_DEVICE_ATTR_HCA_CORE_CLOCK 0 From 7eb415693bcd81d0cd9b60b7c1d7f9524981348b Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 28 Aug 2023 15:52:37 +0300 Subject: [PATCH 009/169] issue: 3514044 Rename cq_mgr to cq_mgr_rx Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 14 ++-- src/core/dev/{cq_mgr.cpp => cq_mgr_rx.cpp} | 84 +++++++++---------- src/core/dev/{cq_mgr.h => cq_mgr_rx.h} | 20 ++--- src/core/dev/{cq_mgr.inl => cq_mgr_rx.inl} | 6 +- .../{cq_mgr_regrq.cpp => cq_mgr_rx_regrq.cpp} | 36 ++++---- .../dev/{cq_mgr_regrq.h => cq_mgr_rx_regrq.h} | 8 +- .../{cq_mgr_strq.cpp => cq_mgr_rx_strq.cpp} | 60 ++++++------- .../dev/{cq_mgr_strq.h => cq_mgr_rx_strq.h} | 12 +-- src/core/dev/cq_mgr_tx.cpp | 10 +-- src/core/dev/cq_mgr_tx.h | 4 +- src/core/dev/qp_mgr.cpp | 18 ++-- src/core/dev/qp_mgr.h | 16 ++-- src/core/dev/qp_mgr_eth_mlx5.cpp | 8 +- src/core/dev/qp_mgr_eth_mlx5.h | 6 +- src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp | 14 ++-- src/core/dev/qp_mgr_eth_mlx5_dpcp.h | 2 +- src/core/dev/ring.h | 1 - src/core/dev/ring_bond.cpp | 6 +- src/core/dev/ring_simple.cpp | 36 ++++---- src/core/dev/ring_simple.h | 8 +- src/core/proto/ip_frag.cpp | 4 +- src/core/proto/mem_buf_desc.h | 6 +- src/core/sock/socket_fd_api.h | 4 +- src/core/sock/sockinfo.cpp | 10 +-- src/core/sock/sockinfo_tcp.h | 2 +- src/core/sock/sockinfo_udp.h | 2 +- 26 files 
changed, 197 insertions(+), 200 deletions(-) rename src/core/dev/{cq_mgr.cpp => cq_mgr_rx.cpp} (88%) rename src/core/dev/{cq_mgr.h => cq_mgr_rx.h} (95%) rename src/core/dev/{cq_mgr.inl => cq_mgr_rx.inl} (95%) rename src/core/dev/{cq_mgr_regrq.cpp => cq_mgr_rx_regrq.cpp} (91%) rename src/core/dev/{cq_mgr_regrq.h => cq_mgr_rx_regrq.h} (92%) rename src/core/dev/{cq_mgr_strq.cpp => cq_mgr_rx_strq.cpp} (91%) rename src/core/dev/{cq_mgr_strq.h => cq_mgr_rx_strq.h} (90%) diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 873646cb5..b60115082 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -25,7 +25,7 @@ dist-hook: SUBDIRS = infra netlink EXTRA_DIST = \ - dev/cq_mgr.inl \ + dev/cq_mgr_rx.inl \ util/libxlio.conf sysconf_DATA = util/libxlio.conf @@ -61,9 +61,9 @@ libxlio_la_LIBADD = \ libxlio_la_SOURCES := \ dev/allocator.cpp \ dev/buffer_pool.cpp \ - dev/cq_mgr.cpp \ - dev/cq_mgr_regrq.cpp \ - dev/cq_mgr_strq.cpp \ + dev/cq_mgr_rx.cpp \ + dev/cq_mgr_rx_regrq.cpp \ + dev/cq_mgr_rx_strq.cpp \ dev/cq_mgr_tx.cpp \ dev/dm_mgr.cpp \ dev/qp_mgr.cpp \ @@ -171,9 +171,9 @@ libxlio_la_SOURCES := \ \ dev/allocator.h \ dev/buffer_pool.h \ - dev/cq_mgr.h \ - dev/cq_mgr_regrq.h \ - dev/cq_mgr_strq.h \ + dev/cq_mgr_rx.h \ + dev/cq_mgr_rx_regrq.h \ + dev/cq_mgr_rx_strq.h \ dev/cq_mgr_tx.h \ dev/dm_mgr.h \ dev/gro_mgr.h \ diff --git a/src/core/dev/cq_mgr.cpp b/src/core/dev/cq_mgr_rx.cpp similarity index 88% rename from src/core/dev/cq_mgr.cpp rename to src/core/dev/cq_mgr_rx.cpp index 6137a4261..3172549b4 100644 --- a/src/core/dev/cq_mgr.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -30,8 +30,8 @@ * SOFTWARE. */ -#include "cq_mgr.h" -#include "cq_mgr.inl" +#include "cq_mgr_rx.h" +#include "cq_mgr_rx.inl" #include #include #include @@ -50,7 +50,7 @@ #include "ring_simple.h" #include "qp_mgr_eth_mlx5.h" -#define MODULE_NAME "cq_mgr" +#define MODULE_NAME "cq_mgr_rx" #define cq_logpanic __log_info_panic #define cq_logerr __log_info_err @@ -67,12 +67,12 @@ ##log_args); \ } while (0) -atomic_t cq_mgr::m_n_cq_id_counter_rx = ATOMIC_INIT(1); +atomic_t cq_mgr_rx::m_n_cq_id_counter_rx = ATOMIC_INIT(1); -uint64_t cq_mgr::m_n_global_sn_rx = 0; +uint64_t cq_mgr_rx::m_n_global_sn_rx = 0; -cq_mgr::cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, - struct ibv_comp_channel *p_comp_event_channel) +cq_mgr_rx::cq_mgr_rx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, + struct ibv_comp_channel *p_comp_event_channel) : m_p_ring(p_ring) , m_n_sysvar_cq_poll_batch_max(safe_mce_sys().cq_poll_batch_max) , m_n_sysvar_progress_engine_wce_max(safe_mce_sys().progress_engine_wce_max) @@ -94,15 +94,15 @@ cq_mgr::cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_siz memset(&m_cq_stat_static, 0, sizeof(m_cq_stat_static)); memset(&m_qp_rec, 0, sizeof(m_qp_rec)); - m_rx_queue.set_id("cq_mgr (%p) : m_rx_queue", this); - m_rx_pool.set_id("cq_mgr (%p) : m_rx_pool", this); + m_rx_queue.set_id("cq_mgr_rx (%p) : m_rx_queue", this); + m_rx_pool.set_id("cq_mgr_rx (%p) : m_rx_pool", this); m_cq_id_rx = atomic_fetch_and_inc(&m_n_cq_id_counter_rx); // cq id is nonzero configure(cq_size); memset(&m_mlx5_cq, 0, sizeof(m_mlx5_cq)); } -void cq_mgr::configure(int cq_size) +void cq_mgr_rx::configure(int cq_size) { xlio_ibv_cq_init_attr attr; memset(&attr, 0, sizeof(attr)); @@ -138,7 +138,7 @@ void cq_mgr::configure(int cq_size) get_channel_fd(), cq_size, m_p_ibv_cq); } -cq_mgr::~cq_mgr() +cq_mgr_rx::~cq_mgr_rx() { cq_logdbg("Destroying Rx CQ"); @@ -172,7 +172,7 @@ cq_mgr::~cq_mgr() 
cq_logdbg("Destroying Rx CQ done"); } -void cq_mgr::statistics_print() +void cq_mgr_rx::statistics_print() { if (m_p_cq_stat->n_rx_pkt_drop || m_p_cq_stat->n_rx_sw_queue_len || m_p_cq_stat->n_rx_drained_at_once_max || m_p_cq_stat->n_buffer_pool_len) { @@ -184,7 +184,7 @@ void cq_mgr::statistics_print() } } -void cq_mgr::set_qp_rq(qp_mgr *qp) +void cq_mgr_rx::set_qp_rq(qp_mgr *qp) { m_qp = static_cast(qp); @@ -199,11 +199,11 @@ void cq_mgr::set_qp_rq(qp_mgr *qp) m_mlx5_cq.cq_buf); } -void cq_mgr::add_qp_rx(qp_mgr *qp) +void cq_mgr_rx::add_qp_rx(qp_mgr *qp) { cq_logdbg("qp_mgr=%p", qp); descq_t temp_desc_list; - temp_desc_list.set_id("cq_mgr (%p) : temp_desc_list", this); + temp_desc_list.set_id("cq_mgr_rx (%p) : temp_desc_list", this); m_p_cq_stat->n_rx_drained_at_once_max = 0; @@ -247,7 +247,7 @@ void cq_mgr::add_qp_rx(qp_mgr *qp) m_qp_rec.debt = 0; } -void cq_mgr::del_qp_rx(qp_mgr *qp) +void cq_mgr_rx::del_qp_rx(qp_mgr *qp) { BULLSEYE_EXCLUDE_BLOCK_START if (m_qp_rec.qp != qp) { @@ -262,7 +262,7 @@ void cq_mgr::del_qp_rx(qp_mgr *qp) memset(&m_qp_rec, 0, sizeof(m_qp_rec)); } -void cq_mgr::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc) +void cq_mgr_rx::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc) { struct ethhdr *p_eth_h = (struct ethhdr *)(p_rx_wc_buf_desc->p_buffer); struct tcphdr *p_tcp_h; @@ -316,7 +316,7 @@ void cq_mgr::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_b } } -bool cq_mgr::request_more_buffers() +bool cq_mgr_rx::request_more_buffers() { cq_logfuncall("Allocating additional %d buffers for internal use", m_n_sysvar_qp_compensation_level); @@ -334,7 +334,7 @@ bool cq_mgr::request_more_buffers() return true; } -void cq_mgr::return_extra_buffers() +void cq_mgr_rx::return_extra_buffers() { if (m_rx_pool.size() < m_n_sysvar_qp_compensation_level * 2) { return; @@ -346,8 +346,8 @@ void cq_mgr::return_extra_buffers() m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); } -mem_buf_desc_t *cq_mgr::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, - enum buff_status_e status) +mem_buf_desc_t *cq_mgr_rx::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, + enum buff_status_e status) { /* Assume locked!!! */ cq_logfuncall(""); @@ -377,7 +377,7 @@ mem_buf_desc_t *cq_mgr::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, return p_mem_buf_desc; } -bool cq_mgr::compensate_qp_poll_success(mem_buf_desc_t *buff_cur) +bool cq_mgr_rx::compensate_qp_poll_success(mem_buf_desc_t *buff_cur) { // Assume locked!!! // Compensate QP for all completions that we found @@ -397,7 +397,7 @@ bool cq_mgr::compensate_qp_poll_success(mem_buf_desc_t *buff_cur) return false; } -void cq_mgr::compensate_qp_poll_failed() +void cq_mgr_rx::compensate_qp_poll_failed() { // Assume locked!!! // Compensate QP for all completions debt @@ -411,7 +411,7 @@ void cq_mgr::compensate_qp_poll_failed() } } -void cq_mgr::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) +void cq_mgr_rx::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) { if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) { if (likely(buff->p_desc_owner == m_p_ring)) { @@ -437,15 +437,15 @@ void cq_mgr::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) } // This method is called when ring release returns unposted buffers. 
-void cq_mgr::mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, +void cq_mgr_rx::mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, void *pv_fd_ready_array /*=NULL*/) { cq_logfuncall(""); NOT_IN_USE(pv_fd_ready_array); - cq_mgr::reclaim_recv_buffer_helper(p_mem_buf_desc); + cq_mgr_rx::reclaim_recv_buffer_helper(p_mem_buf_desc); } -bool cq_mgr::reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst) +bool cq_mgr_rx::reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst) { if (m_rx_buffs_rdy_for_free_head) { reclaim_recv_buffer_helper(m_rx_buffs_rdy_for_free_head); @@ -457,7 +457,7 @@ bool cq_mgr::reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst) return true; } -bool cq_mgr::reclaim_recv_buffers_no_lock(mem_buf_desc_t *rx_reuse_lst) +bool cq_mgr_rx::reclaim_recv_buffers_no_lock(mem_buf_desc_t *rx_reuse_lst) { if (likely(rx_reuse_lst)) { reclaim_recv_buffer_helper(rx_reuse_lst); @@ -466,7 +466,7 @@ bool cq_mgr::reclaim_recv_buffers_no_lock(mem_buf_desc_t *rx_reuse_lst) return false; } -int cq_mgr::reclaim_recv_single_buffer(mem_buf_desc_t *rx_reuse) +int cq_mgr_rx::reclaim_recv_single_buffer(mem_buf_desc_t *rx_reuse) { int ret_val = 0; @@ -489,10 +489,10 @@ int cq_mgr::reclaim_recv_single_buffer(mem_buf_desc_t *rx_reuse) return ret_val; } -bool cq_mgr::reclaim_recv_buffers(descq_t *rx_reuse) +bool cq_mgr_rx::reclaim_recv_buffers(descq_t *rx_reuse) { cq_logfuncall(""); - // Called from outside cq_mgr context which is not locked!! + // Called from outside cq_mgr_rx context which is not locked!! while (!rx_reuse->empty()) { mem_buf_desc_t *buff = rx_reuse->get_and_pop_front(); reclaim_recv_buffer_helper(buff); @@ -502,21 +502,21 @@ bool cq_mgr::reclaim_recv_buffers(descq_t *rx_reuse) return true; } -int cq_mgr::request_notification(uint64_t poll_sn) +int cq_mgr_rx::request_notification(uint64_t poll_sn) { int ret = -1; cq_logfuncall(""); if ((m_n_global_sn_rx > 0 && poll_sn != m_n_global_sn_rx)) { - // The cq_mgr's has receive packets pending processing (or got processed since cq_poll_sn) + // The cq_mgr_rx's has receive packets pending processing (or got processed since cq_poll_sn) cq_logfunc("miss matched poll sn (user=0x%lx, cq=0x%lx)", poll_sn, m_n_cq_poll_sn_rx); return 1; } if (m_b_notification_armed == false) { - cq_logfunc("arming cq_mgr notification channel"); + cq_logfunc("arming cq_mgr_rx notification channel"); // Arm the CQ notification channel IF_VERBS_FAILURE(xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0)) @@ -530,7 +530,7 @@ int cq_mgr::request_notification(uint64_t poll_sn) } ENDIF_VERBS_FAILURE; } else { - // cq_mgr notification channel already armed + // cq_mgr_rx notification channel already armed ret = 0; } @@ -538,7 +538,7 @@ int cq_mgr::request_notification(uint64_t poll_sn) return ret; } -int cq_mgr::wait_for_notification_and_process_element(uint64_t *p_cq_poll_sn, +int cq_mgr_rx::wait_for_notification_and_process_element(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) { int ret = -1; @@ -546,24 +546,24 @@ int cq_mgr::wait_for_notification_and_process_element(uint64_t *p_cq_poll_sn, cq_logfunc(""); if (m_b_notification_armed) { - cq_mgr *p_cq_mgr_context = NULL; + cq_mgr_rx *p_cq_mgr_context = NULL; struct ibv_cq *p_cq_hndl = NULL; void *p; // deal with compiler warnings - // Block on the cq_mgr's notification event channel + // Block on the cq_mgr_rx's notification event channel IF_VERBS_FAILURE(ibv_get_cq_event(m_comp_event_channel, &p_cq_hndl, &p)) { - cq_logfunc("waiting on cq_mgr event returned with error (errno=%d %m)", errno); + cq_logfunc("waiting on 
cq_mgr_rx event returned with error (errno=%d %m)", errno); } else { get_cq_event(); - p_cq_mgr_context = (cq_mgr *)p; + p_cq_mgr_context = (cq_mgr_rx *)p; if (p_cq_mgr_context != this) { - cq_logerr("mismatch with cq_mgr returned from new event (event->cq_mgr->%p)", + cq_logerr("mismatch with cq_mgr_rx returned from new event (event->cq_mgr_rx->%p)", p_cq_mgr_context); // this can be if we are using a single channel for several/all cq_mgrs - // in this case we need to deliver the event to the correct cq_mgr + // in this case we need to deliver the event to the correct cq_mgr_rx } // Ack event diff --git a/src/core/dev/cq_mgr.h b/src/core/dev/cq_mgr_rx.h similarity index 95% rename from src/core/dev/cq_mgr.h rename to src/core/dev/cq_mgr_rx.h index d74cc0cb0..a848a19cc 100644 --- a/src/core/dev/cq_mgr.h +++ b/src/core/dev/cq_mgr_rx.h @@ -30,8 +30,8 @@ * SOFTWARE. */ -#ifndef CQ_MGR_H -#define CQ_MGR_H +#ifndef CQ_MGR_RX_H +#define CQ_MGR_RX_H #include "ib/base/verbs_extra.h" #include "utils/atomic.h" @@ -71,9 +71,7 @@ struct qp_rec { int debt; }; -// Class cq_mgr -// -class cq_mgr { +class cq_mgr_rx { friend class ring; // need to expose the m_n_global_sn_rx only to ring friend class ring_simple; // need to expose the m_n_global_sn_rx only to ring friend class ring_bond; // need to expose the m_n_global_sn_rx only to ring @@ -88,9 +86,9 @@ class cq_mgr { BS_GENERAL_ERR }; - cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, - struct ibv_comp_channel *p_comp_event_channel); - virtual ~cq_mgr(); + cq_mgr_rx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, + struct ibv_comp_channel *p_comp_event_channel); + virtual ~cq_mgr_rx(); void configure(int cq_size); @@ -140,7 +138,7 @@ class cq_mgr { // CQ implements the Rx mem_buf_desc_owner. 
// These callbacks will be called for each Rx buffer that passed processed completion - // Rx completion handling at the cq_mgr level is forwarding the packet to the ib_comm_mgr layer + // Rx completion handling at the cq_mgr_rx level is forwarding the packet to the ib_comm_mgr layer void mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, void *pv_fd_ready_array = NULL); @@ -231,7 +229,7 @@ class cq_mgr { void return_extra_buffers() __attribute__((noinline)); }; -inline void cq_mgr::update_global_sn_rx(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) +inline void cq_mgr_rx::update_global_sn_rx(uint64_t &cq_poll_sn, uint32_t num_polled_cqes) { if (num_polled_cqes > 0) { // spoil the global sn if we have packets ready @@ -252,7 +250,7 @@ inline void cq_mgr::update_global_sn_rx(uint64_t &cq_poll_sn, uint32_t num_polle cq_poll_sn = m_n_global_sn_rx; } -inline struct xlio_mlx5_cqe *cq_mgr::check_cqe(void) +inline struct xlio_mlx5_cqe *cq_mgr_rx::check_cqe(void) { struct xlio_mlx5_cqe *cqe = (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + diff --git a/src/core/dev/cq_mgr.inl b/src/core/dev/cq_mgr_rx.inl similarity index 95% rename from src/core/dev/cq_mgr.inl rename to src/core/dev/cq_mgr_rx.inl index dc8670188..7ab65f966 100644 --- a/src/core/dev/cq_mgr.inl +++ b/src/core/dev/cq_mgr_rx.inl @@ -33,7 +33,7 @@ #ifndef CQ_MGR_INL_H #define CQ_MGR_INL_H -#include "cq_mgr.h" +#include "cq_mgr_rx.h" #include "ring_simple.h" #include "util/utils.h" @@ -41,7 +41,7 @@ /** inlining functions can only help if they are implemented before their usage **/ /**/ -inline void cq_mgr::process_recv_buffer(mem_buf_desc_t *p_mem_buf_desc, void *pv_fd_ready_array) +inline void cq_mgr_rx::process_recv_buffer(mem_buf_desc_t *p_mem_buf_desc, void *pv_fd_ready_array) { // Assume locked!!! @@ -52,7 +52,7 @@ inline void cq_mgr::process_recv_buffer(mem_buf_desc_t *p_mem_buf_desc, void *pv } } -inline uint32_t cq_mgr::process_recv_queue(void *pv_fd_ready_array) +inline uint32_t cq_mgr_rx::process_recv_queue(void *pv_fd_ready_array) { // Assume locked!!! // If we have packets in the queue, dequeue one and process it diff --git a/src/core/dev/cq_mgr_regrq.cpp b/src/core/dev/cq_mgr_rx_regrq.cpp similarity index 91% rename from src/core/dev/cq_mgr_regrq.cpp rename to src/core/dev/cq_mgr_rx_regrq.cpp index a33ac9505..8259ccaec 100644 --- a/src/core/dev/cq_mgr_regrq.cpp +++ b/src/core/dev/cq_mgr_rx_regrq.cpp @@ -30,19 +30,19 @@ * SOFTWARE. 
*/ -#include "cq_mgr_regrq.h" +#include "cq_mgr_rx_regrq.h" #if defined(DEFINED_DIRECT_VERBS) #include -#include "cq_mgr.inl" +#include "cq_mgr_rx.inl" #include "qp_mgr.h" #include "qp_mgr_eth_mlx5.h" #include "ring_simple.h" #include -#define MODULE_NAME "cq_mgr_regrq" +#define MODULE_NAME "cq_mgr_rx_regrq" #define cq_logfunc __log_info_func #define cq_logdbg __log_info_dbg @@ -51,14 +51,14 @@ #define cq_logpanic __log_info_panic #define cq_logfuncall __log_info_funcall -cq_mgr_regrq::cq_mgr_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - struct ibv_comp_channel *p_comp_event_channel) - : cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) +cq_mgr_rx_regrq::cq_mgr_rx_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, + struct ibv_comp_channel *p_comp_event_channel) + : cq_mgr_rx(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) { cq_logfunc(""); } -uint32_t cq_mgr_regrq::clean_cq() +uint32_t cq_mgr_rx_regrq::clean_cq() { uint32_t ret_total = 0; uint64_t cq_poll_sn = 0; @@ -85,12 +85,12 @@ uint32_t cq_mgr_regrq::clean_cq() return ret_total; } -cq_mgr_regrq::~cq_mgr_regrq() +cq_mgr_rx_regrq::~cq_mgr_rx_regrq() { cq_logdbg("Destroying CQ REGRQ"); } -mem_buf_desc_t *cq_mgr_regrq::poll(enum buff_status_e &status) +mem_buf_desc_t *cq_mgr_rx_regrq::poll(enum buff_status_e &status) { mem_buf_desc_t *buff = NULL; @@ -130,8 +130,8 @@ mem_buf_desc_t *cq_mgr_regrq::poll(enum buff_status_e &status) return buff; } -void cq_mgr_regrq::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc, - enum buff_status_e &status) +void cq_mgr_rx_regrq::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc, + enum buff_status_e &status) { struct mlx5_err_cqe *ecqe; ecqe = (struct mlx5_err_cqe *)cqe; @@ -207,8 +207,8 @@ void cq_mgr_regrq::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, mem_buf_desc_ } } -int cq_mgr_regrq::drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e status, - uintptr_t *p_recycle_buffers_last_wr_id) +int cq_mgr_rx_regrq::drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e status, + uintptr_t *p_recycle_buffers_last_wr_id) { ++m_n_wce_counter; if (cqe_process_rx(buff, status)) { @@ -242,7 +242,7 @@ int cq_mgr_regrq::drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status_e return 1; } -int cq_mgr_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*=NULL*/) +int cq_mgr_rx_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*=NULL*/) { cq_logfuncall("cq was %s drained. %d processed wce since last check. %d wce in m_rx_queue", (m_b_was_drained ? "" : "not "), m_n_wce_counter, m_rx_queue.size()); @@ -323,7 +323,7 @@ int cq_mgr_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /*= return ret_total; } -mem_buf_desc_t *cq_mgr_regrq::poll_and_process_socketxtreme() +mem_buf_desc_t *cq_mgr_rx_regrq::poll_and_process_socketxtreme() { buff_status_e status = BS_OK; mem_buf_desc_t *buff_wqe = poll(status); @@ -344,7 +344,7 @@ mem_buf_desc_t *cq_mgr_regrq::poll_and_process_socketxtreme() return nullptr; } -int cq_mgr_regrq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) +int cq_mgr_rx_regrq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) { /* Assume locked!!! 
*/ cq_logfuncall(""); @@ -396,11 +396,11 @@ int cq_mgr_regrq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_f return ret_rx_processed; } -void cq_mgr_regrq::add_qp_rx(qp_mgr *qp) +void cq_mgr_rx_regrq::add_qp_rx(qp_mgr *qp) { cq_logfunc(""); set_qp_rq(qp); - cq_mgr::add_qp_rx(qp); + cq_mgr_rx::add_qp_rx(qp); } #endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/core/dev/cq_mgr_regrq.h b/src/core/dev/cq_mgr_rx_regrq.h similarity index 92% rename from src/core/dev/cq_mgr_regrq.h rename to src/core/dev/cq_mgr_rx_regrq.h index f2f2781f4..9e70acda9 100644 --- a/src/core/dev/cq_mgr_regrq.h +++ b/src/core/dev/cq_mgr_rx_regrq.h @@ -33,13 +33,13 @@ #ifndef CQ_MGR_REGRQ_H #define CQ_MGR_REGRQ_H -#include "cq_mgr.h" +#include "cq_mgr_rx.h" -class cq_mgr_regrq : public cq_mgr { +class cq_mgr_rx_regrq : public cq_mgr_rx { public: - cq_mgr_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, + cq_mgr_rx_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, struct ibv_comp_channel *p_comp_event_channel); - virtual ~cq_mgr_regrq(); + virtual ~cq_mgr_rx_regrq(); virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) override; virtual mem_buf_desc_t *poll_and_process_socketxtreme(); diff --git a/src/core/dev/cq_mgr_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp similarity index 91% rename from src/core/dev/cq_mgr_strq.cpp rename to src/core/dev/cq_mgr_rx_strq.cpp index e245eba84..ec8f02789 100644 --- a/src/core/dev/cq_mgr_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ -30,18 +30,18 @@ * SOFTWARE. */ -#include "cq_mgr_strq.h" +#include "cq_mgr_rx_strq.h" #if defined(DEFINED_DIRECT_VERBS) #include -#include "cq_mgr.inl" +#include "cq_mgr_rx.inl" #include "qp_mgr.h" #include "qp_mgr_eth_mlx5.h" #include "ring_simple.h" #include -#define MODULE_NAME "cq_mgr_strq" +#define MODULE_NAME "cq_mgr_rx_strq" #define cq_logfunc __log_info_func #define cq_logdbg __log_info_dbg @@ -55,11 +55,11 @@ ##log_args); \ } while (0) -cq_mgr_strq::cq_mgr_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, - uint32_t cq_size, uint32_t stride_size_bytes, - uint32_t strides_num, - struct ibv_comp_channel *p_comp_event_channel) - : cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) +cq_mgr_rx_strq::cq_mgr_rx_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, + uint32_t cq_size, uint32_t stride_size_bytes, + uint32_t strides_num, + struct ibv_comp_channel *p_comp_event_channel) + : cq_mgr_rx(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) , _owner_ring(p_ring) , _stride_size_bytes(stride_size_bytes) , _strides_num(strides_num) @@ -72,7 +72,7 @@ cq_mgr_strq::cq_mgr_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, return_stride(next_stride()); // Fill _stride_cache } -cq_mgr_strq::~cq_mgr_strq() +cq_mgr_rx_strq::~cq_mgr_rx_strq() { cq_logfunc(""); cq_logdbg("destroying CQ STRQ"); @@ -99,7 +99,7 @@ cq_mgr_strq::~cq_mgr_strq() g_buffer_pool_rx_stride->put_buffers_thread_safe(&_stride_cache, _stride_cache.size()); } -mem_buf_desc_t *cq_mgr_strq::next_stride() +mem_buf_desc_t *cq_mgr_rx_strq::next_stride() { if (unlikely(_stride_cache.size() <= 0U)) { if (!g_buffer_pool_rx_stride->get_buffers_thread_safe( @@ -115,7 +115,7 @@ mem_buf_desc_t *cq_mgr_strq::next_stride() return _stride_cache.get_and_pop_back(); } -void cq_mgr_strq::return_stride(mem_buf_desc_t *desc) +void cq_mgr_rx_strq::return_stride(mem_buf_desc_t *desc) { _stride_cache.push_back(desc); @@ -125,7 +125,7 @@ void 
cq_mgr_strq::return_stride(mem_buf_desc_t *desc) } } -uint32_t cq_mgr_strq::clean_cq() +uint32_t cq_mgr_rx_strq::clean_cq() { uint32_t ret_total = 0; uint64_t cq_poll_sn = 0; @@ -154,7 +154,7 @@ uint32_t cq_mgr_strq::clean_cq() return ret_total; } -bool cq_mgr_strq::set_current_hot_buffer() +bool cq_mgr_rx_strq::set_current_hot_buffer() { if (likely(m_qp->m_mlx5_qp.rq.tail != (m_qp->m_mlx5_qp.rq.head))) { uint32_t index = m_qp->m_mlx5_qp.rq.tail & (m_qp_rec.qp->m_rx_num_wr - 1); @@ -169,7 +169,7 @@ bool cq_mgr_strq::set_current_hot_buffer() return false; } -mem_buf_desc_t *cq_mgr_strq::poll(enum buff_status_e &status, mem_buf_desc_t *&buff_stride) +mem_buf_desc_t *cq_mgr_rx_strq::poll(enum buff_status_e &status, mem_buf_desc_t *&buff_stride) { mem_buf_desc_t *buff = NULL; @@ -226,8 +226,8 @@ mem_buf_desc_t *cq_mgr_strq::poll(enum buff_status_e &status, mem_buf_desc_t *&b return buff; } -inline bool cq_mgr_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, - enum buff_status_e &status, bool &is_filler) +inline bool cq_mgr_rx_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, + enum buff_status_e &status, bool &is_filler) { struct mlx5_err_cqe *ecqe; ecqe = (struct mlx5_err_cqe *)cqe; @@ -336,9 +336,9 @@ inline bool cq_mgr_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, return false; } -int cq_mgr_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_desc_t *buff_wqe, - buff_status_e status, - uintptr_t *p_recycle_buffers_last_wr_id) +int cq_mgr_rx_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_desc_t *buff_wqe, + buff_status_e status, + uintptr_t *p_recycle_buffers_last_wr_id) { int ret_total = 0; if (buff_wqe && (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) && @@ -375,7 +375,7 @@ int cq_mgr_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_desc_t return ret_total; } -int cq_mgr_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id) +int cq_mgr_rx_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id) { cq_logfuncall("cq was %s drained. %d processed wce since last check. %d wce in m_rx_queue", (m_b_was_drained ? "" : "not "), m_n_wce_counter, m_rx_queue.size()); @@ -422,8 +422,8 @@ int cq_mgr_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id) return ret_total; } -mem_buf_desc_t *cq_mgr_strq::process_strq_cq_element_rx(mem_buf_desc_t *p_mem_buf_desc, - enum buff_status_e status) +mem_buf_desc_t *cq_mgr_rx_strq::process_strq_cq_element_rx(mem_buf_desc_t *p_mem_buf_desc, + enum buff_status_e status) { /* Assume locked!!! */ cq_logfuncall(""); @@ -447,7 +447,7 @@ mem_buf_desc_t *cq_mgr_strq::process_strq_cq_element_rx(mem_buf_desc_t *p_mem_bu return p_mem_buf_desc; } -mem_buf_desc_t *cq_mgr_strq::poll_and_process_socketxtreme() +mem_buf_desc_t *cq_mgr_rx_strq::poll_and_process_socketxtreme() { buff_status_e status = BS_OK; mem_buf_desc_t *buff = nullptr; @@ -460,7 +460,7 @@ mem_buf_desc_t *cq_mgr_strq::poll_and_process_socketxtreme() return (buff && cqe_process_rx(buff, status) ? buff : nullptr); } -int cq_mgr_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) +int cq_mgr_rx_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array) { /* Assume locked!!! 
*/ cq_logfuncall(""); @@ -510,18 +510,18 @@ int cq_mgr_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd return ret_rx_processed; } -void cq_mgr_strq::add_qp_rx(qp_mgr *qp) +void cq_mgr_rx_strq::add_qp_rx(qp_mgr *qp) { cq_logfunc(""); set_qp_rq(qp); _hot_buffer_stride = nullptr; _current_wqe_consumed_bytes = 0U; - cq_mgr::add_qp_rx(qp); + cq_mgr_rx::add_qp_rx(qp); } -void cq_mgr_strq::statistics_print() +void cq_mgr_rx_strq::statistics_print() { - cq_mgr::statistics_print(); + cq_mgr_rx::statistics_print(); cq_logdbg_no_funcname("RWQE consumed: %12" PRIu64, m_p_cq_stat->n_rx_consumed_rwqe_count); cq_logdbg_no_funcname("Packets count: %12" PRIu64, m_p_cq_stat->n_rx_packet_count); cq_logdbg_no_funcname("Max Strides per Packet: %12" PRIu16, @@ -531,7 +531,7 @@ void cq_mgr_strq::statistics_print() cq_logdbg_no_funcname("LRO bytes: %12" PRIu64, m_p_cq_stat->n_rx_lro_bytes); } -void cq_mgr_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) +void cq_mgr_rx_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) { if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) { if (likely(buff->p_desc_owner == m_p_ring)) { @@ -548,7 +548,7 @@ void cq_mgr_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) reinterpret_cast(buff->lwip_pbuf.pbuf.desc.mdesc); if (buff->rx.strides_num == rwqe->add_ref_count(-buff->rx.strides_num)) { // Is last stride. - cq_mgr::reclaim_recv_buffer_helper(rwqe); + cq_mgr_rx::reclaim_recv_buffer_helper(rwqe); } VLIST_DEBUG_CQ_MGR_PRINT_ERROR_IS_MEMBER; diff --git a/src/core/dev/cq_mgr_strq.h b/src/core/dev/cq_mgr_rx_strq.h similarity index 90% rename from src/core/dev/cq_mgr_strq.h rename to src/core/dev/cq_mgr_rx_strq.h index e9d1468eb..fbad4d003 100644 --- a/src/core/dev/cq_mgr_strq.h +++ b/src/core/dev/cq_mgr_rx_strq.h @@ -35,15 +35,15 @@ #include #include -#include "cq_mgr.h" +#include "cq_mgr_rx.h" -class cq_mgr_strq : public cq_mgr { +class cq_mgr_rx_strq : public cq_mgr_rx { public: - cq_mgr_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - uint32_t stride_size_bytes, uint32_t strides_num, - struct ibv_comp_channel *p_comp_event_channel); + cq_mgr_rx_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, + uint32_t stride_size_bytes, uint32_t strides_num, + struct ibv_comp_channel *p_comp_event_channel); - virtual ~cq_mgr_strq() override; + virtual ~cq_mgr_rx_strq() override; virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) override; virtual mem_buf_desc_t *poll_and_process_socketxtreme() override; diff --git a/src/core/dev/cq_mgr_tx.cpp b/src/core/dev/cq_mgr_tx.cpp index acc118ef3..fc3f812d5 100644 --- a/src/core/dev/cq_mgr_tx.cpp +++ b/src/core/dev/cq_mgr_tx.cpp @@ -201,14 +201,14 @@ int cq_mgr_tx::request_notification(uint64_t poll_sn) cq_logfuncall(""); if ((m_n_global_sn_tx > 0 && poll_sn != m_n_global_sn_tx)) { - // The cq_mgr's has receive packets pending processing (or got processed since cq_poll_sn) + // The cq_mgr_tx's has receive packets pending processing (or got processed since cq_poll_sn) cq_logfunc("miss matched poll sn (user=0x%lx, cq=0x%lx)", poll_sn, m_n_cq_poll_sn_tx); return 1; } if (m_b_notification_armed == false) { - cq_logfunc("arming cq_mgr notification channel"); + cq_logfunc("arming cq_mgr_tx notification channel"); // Arm the CQ notification channel IF_VERBS_FAILURE(xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0)) @@ -222,7 +222,7 @@ int cq_mgr_tx::request_notification(uint64_t poll_sn) } ENDIF_VERBS_FAILURE; } else { - // cq_mgr 
notification channel already armed + // cq_mgr_tx notification channel already armed ret = 0; } @@ -240,12 +240,12 @@ cq_mgr_tx *cq_mgr_tx::get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_cha IF_VERBS_FAILURE(ibv_get_cq_event(p_cq_channel, &p_cq_hndl, &p_context)) { vlog_printf(VLOG_INFO, - MODULE_NAME ":%d: waiting on cq_mgr event returned with error (errno=%d %m)\n", + MODULE_NAME ":%d: waiting on cq_mgr_tx event returned with error (errno=%d %m)\n", __LINE__, errno); } else { - p_cq_mgr = (cq_mgr_tx *)p_context; // Save the cq_mgr + p_cq_mgr = (cq_mgr_tx *)p_context; // Save the cq_mgr_tx p_cq_mgr->get_cq_event(); ibv_ack_cq_events(p_cq_hndl, 1); // Ack the ibv event } diff --git a/src/core/dev/cq_mgr_tx.h b/src/core/dev/cq_mgr_tx.h index b2ed502dc..47eb2c6e7 100644 --- a/src/core/dev/cq_mgr_tx.h +++ b/src/core/dev/cq_mgr_tx.h @@ -46,8 +46,8 @@ class cq_mgr_tx { ibv_comp_channel *p_comp_event_channel); ~cq_mgr_tx(); - // Helper gunction to extract the Tx cq_mgr from the CQ event, - // Since we have a single TX CQ comp channel for all cq_mgr's, it might not be the active_cq object + // Helper gunction to extract the cq_mgr_tx from the CQ event, + // Since we have a single TX CQ comp channel for all cq_mgr_tx's, it might not be the active_cq object static cq_mgr_tx *get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel); ibv_cq *get_ibv_cq_hndl() { return m_p_ibv_cq; } diff --git a/src/core/dev/qp_mgr.cpp b/src/core/dev/qp_mgr.cpp index a79bbfa7f..cc95ac148 100644 --- a/src/core/dev/qp_mgr.cpp +++ b/src/core/dev/qp_mgr.cpp @@ -37,7 +37,7 @@ #include "util/instrumentation.h" #include "iomux/io_mux_call.h" #include "buffer_pool.h" -#include "cq_mgr.h" +#include "cq_mgr_rx.h" #include "ring_simple.h" #include "util/valgrind.h" #include "dev/rfs_rule_ibv.h" @@ -163,7 +163,7 @@ int qp_mgr::configure(struct qp_mgr_desc *desc) } BULLSEYE_EXCLUDE_BLOCK_END - // Modify the Rx and Tx cq_mgr to use a non-blocking event channel + // Modify the cq_mgr_rx and cq_mgr_tx to use a non-blocking event channel set_fd_block_mode(m_p_cq_mgr_rx->get_channel_fd(), false); set_fd_block_mode(m_p_cq_mgr_tx->get_channel_fd(), false); @@ -340,7 +340,7 @@ void qp_mgr::release_rx_buffers() } } // Wait for all FLUSHed WQE on Rx CQ - qp_logdbg("draining rx cq_mgr %p (last_posted_rx_wr_id = %lu)", m_p_cq_mgr_rx, + qp_logdbg("draining cq_mgr_rx %p (last_posted_rx_wr_id = %lu)", m_p_cq_mgr_rx, m_last_posted_rx_wr_id); uintptr_t last_polled_rx_wr_id = 0; while (m_p_cq_mgr_rx && last_polled_rx_wr_id != m_last_posted_rx_wr_id && errno != EIO && @@ -348,7 +348,7 @@ void qp_mgr::release_rx_buffers() // Process the FLUSH'ed WQE's int ret = m_p_cq_mgr_rx->drain_and_proccess(&last_polled_rx_wr_id); - qp_logdbg("draining completed on rx cq_mgr (%d wce) last_polled_rx_wr_id = %lu", ret, + qp_logdbg("draining completed on cq_mgr_rx (%d wce) last_polled_rx_wr_id = %lu", ret, last_polled_rx_wr_id); total_ret += ret; @@ -364,7 +364,7 @@ void qp_mgr::release_rx_buffers() nanosleep(&short_sleep, NULL); } m_last_posted_rx_wr_id = 0; // Clear the posted WR_ID flag, we just clear the entire RQ - qp_logdbg("draining completed with a total of %d wce's on rx cq_mgr", total_ret); + qp_logdbg("draining completed with a total of %d wce's on cq_mgr_rx", total_ret); NOT_IN_USE(total_ret); // Suppress --enable-opt-log=high warning } @@ -372,11 +372,11 @@ void qp_mgr::release_tx_buffers() { int ret; uint64_t poll_sn = 0; - qp_logdbg("draining tx cq_mgr %p", m_p_cq_mgr_tx); + qp_logdbg("draining cq_mgr_tx %p", m_p_cq_mgr_tx); while 
(m_p_cq_mgr_tx && m_qp && ((ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn)) > 0) && (errno != EIO && !m_p_ib_ctx_handler->is_removed())) { - qp_logdbg("draining completed on tx cq_mgr (%d wce)", ret); + qp_logdbg("draining completed on cq_mgr_tx (%d wce)", ret); } NOT_IN_USE(ret); // Suppress --enable-opt-log=high warning } @@ -494,7 +494,7 @@ void qp_mgr::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) void qp_mgr::post_recv_buffers(descq_t *p_buffers, size_t count) { qp_logfuncall(""); - // Called from cq_mgr context under cq_mgr::LOCK! + // Called from cq_mgr_rx context under cq_mgr_rx::LOCK! while (count--) { post_recv_buffer(p_buffers->get_and_pop_front()); } @@ -568,7 +568,7 @@ int qp_mgr::send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio qp_logerr("error from cq_mgr_tx->process_next_element (ret=%d %m)", ret); } BULLSEYE_EXCLUDE_BLOCK_END - qp_logfunc("polling succeeded on tx cq_mgr (%d wce)", ret); + qp_logfunc("polling succeeded on cq_mgr_tx (%d wce)", ret); } return 0; diff --git a/src/core/dev/qp_mgr.h b/src/core/dev/qp_mgr.h index d9b8f22c6..f289bd420 100644 --- a/src/core/dev/qp_mgr.h +++ b/src/core/dev/qp_mgr.h @@ -49,7 +49,7 @@ #include "proto/mem_buf_desc.h" #include "infra/sender.h" #include "dev/ib_ctx_handler.h" -#include "dev/cq_mgr.h" +#include "dev/cq_mgr_rx.h" #include "dev/cq_mgr_tx.h" #include "dev/rfs_rule.h" @@ -58,7 +58,7 @@ struct xlio_tls_info; class xlio_tis; class xlio_tir; class buffer_pool; -class cq_mgr; +class cq_mgr_rx; struct slave_data; class ring; class ring_simple; @@ -153,9 +153,9 @@ class xlio_ti { * */ class qp_mgr { - friend class cq_mgr; - friend class cq_mgr_regrq; - friend class cq_mgr_strq; + friend class cq_mgr_rx; + friend class cq_mgr_rx_regrq; + friend class cq_mgr_rx_strq; friend class cq_mgr_tx; public: @@ -178,7 +178,7 @@ class qp_mgr { virtual uint16_t get_partiton() const { return 0; }; struct ibv_qp *get_ibv_qp() const { return m_qp; }; class cq_mgr_tx *get_tx_cq_mgr() const { return m_p_cq_mgr_tx; } - class cq_mgr *get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } + class cq_mgr_rx *get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } virtual uint32_t get_rx_max_wr_num(); // This function can be replaced with a parameter during ring creation. // chain of calls may serve as cache warm for dummy send feature. 
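// ----------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] With the Rx/Tx split, Tx completions are
// handled by cq_mgr_tx::handle_sq_wqe_prop() introduced earlier in this series: a single
// signalled CQE completes a chain of previously posted WQEs whose per-WQE properties are
// walked to return buffers and accumulate send-queue credits. A simplified model of that
// walk follows; wqe_prop_sketch and complete_chain() are illustrative names, and the stop
// pointer stands in for the real is_sq_wqe_prop_valid() check against the last signalled
// WQE index.
struct wqe_prop_sketch {
    unsigned credits;        // WQEBB credits consumed by this WQE
    wqe_prop_sketch *next;   // previously posted WQE, towards the last signalled one
};

static unsigned complete_chain(wqe_prop_sketch *completed, const wqe_prop_sketch *stop)
{
    unsigned credits = 0;
    for (wqe_prop_sketch *p = completed; p && p != stop; p = p->next) {
        // The real code also returns p->buf to the ring and releases p->ti here.
        credits += p->credits;
    }
    return credits; // handed back via credits_return() so new WQEs can be posted
}
// ----------------------------------------------------------------------------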
@@ -330,7 +330,7 @@ class qp_mgr { struct ibv_qp_cap m_qp_cap; uint32_t m_max_qp_wr; - cq_mgr *m_p_cq_mgr_rx; + cq_mgr_rx *m_p_cq_mgr_rx; cq_mgr_tx *m_p_cq_mgr_tx; uint32_t m_rx_num_wr; @@ -376,7 +376,7 @@ class qp_mgr { return m_n_unsignaled_count == m_n_sysvar_tx_num_wr_to_signal - 1; } - virtual cq_mgr *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) = 0; + virtual cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) = 0; virtual cq_mgr_tx *init_tx_cq_mgr(void) = 0; virtual int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, diff --git a/src/core/dev/qp_mgr_eth_mlx5.cpp b/src/core/dev/qp_mgr_eth_mlx5.cpp index 3c947663b..52888e529 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5.cpp @@ -35,7 +35,7 @@ #include #include -#include "cq_mgr_regrq.h" +#include "cq_mgr_rx_regrq.h" #include "proto/tls.h" #include "util/utils.h" #include "vlogger/vlogger.h" @@ -363,11 +363,11 @@ bool qp_mgr_eth_mlx5::init_rx_cq_mgr_prepare() return true; } -cq_mgr *qp_mgr_eth_mlx5::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) +cq_mgr_rx *qp_mgr_eth_mlx5::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) { return (!init_rx_cq_mgr_prepare() ? NULL - : new cq_mgr_regrq(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, - p_rx_comp_event_channel)); + : new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, + p_rx_comp_event_channel)); } cq_mgr_tx *qp_mgr_eth_mlx5::init_tx_cq_mgr() diff --git a/src/core/dev/qp_mgr_eth_mlx5.h b/src/core/dev/qp_mgr_eth_mlx5.h index 0950292bb..432cac18a 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.h +++ b/src/core/dev/qp_mgr_eth_mlx5.h @@ -62,8 +62,8 @@ struct sq_wqe_prop { typedef struct sq_wqe_prop sq_wqe_prop; class qp_mgr_eth_mlx5 : public qp_mgr_eth { - friend class cq_mgr; - friend class cq_mgr_regrq; + friend class cq_mgr_rx; + friend class cq_mgr_rx_regrq; friend class cq_mgr_tx; public: @@ -139,7 +139,7 @@ class qp_mgr_eth_mlx5 : public qp_mgr_eth { bool init_rx_cq_mgr_prepare(); void init_qp(); void init_device_memory(); - cq_mgr *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) override; + cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) override; cq_mgr_tx *init_tx_cq_mgr(void) override; void put_tls_tir_in_cache(xlio_tir *tir); diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp index 04a1bb681..77355df5a 100644 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp @@ -36,7 +36,7 @@ #include #include "ring_simple.h" #include "rfs_rule_dpcp.h" -#include "cq_mgr_strq.h" +#include "cq_mgr_rx_strq.h" #define MODULE_NAME "qp_mgr_eth_mlx5_dpcp" @@ -285,7 +285,7 @@ void qp_mgr_eth_mlx5_dpcp::modify_rq_to_ready_state() } } -cq_mgr *qp_mgr_eth_mlx5_dpcp::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) +cq_mgr_rx *qp_mgr_eth_mlx5_dpcp::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) { if (unlikely(!safe_mce_sys().enable_striding_rq)) { return qp_mgr_eth_mlx5::init_rx_cq_mgr(p_rx_comp_event_channel); @@ -293,11 +293,11 @@ cq_mgr *qp_mgr_eth_mlx5_dpcp::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_ return (!init_rx_cq_mgr_prepare() ? 
nullptr - : new cq_mgr_strq(m_p_ring, m_p_ib_ctx_handler, - safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, - safe_mce_sys().strq_stride_size_bytes, - safe_mce_sys().strq_stride_num_per_rwqe, - p_rx_comp_event_channel)); + : new cq_mgr_rx_strq(m_p_ring, m_p_ib_ctx_handler, + safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, + safe_mce_sys().strq_stride_size_bytes, + safe_mce_sys().strq_stride_num_per_rwqe, + p_rx_comp_event_channel)); } void qp_mgr_eth_mlx5_dpcp::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.h b/src/core/dev/qp_mgr_eth_mlx5_dpcp.h index 3fcce4281..899559b1d 100644 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.h +++ b/src/core/dev/qp_mgr_eth_mlx5_dpcp.h @@ -55,7 +55,7 @@ class qp_mgr_eth_mlx5_dpcp : public qp_mgr_eth_mlx5 { virtual void post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) override; protected: - virtual cq_mgr *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) override; + virtual cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) override; private: #ifdef DEFINED_UTLS diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index 4db195b61..cd63e0116 100644 --- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -100,7 +100,6 @@ class ring { virtual int send_lwip_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis) = 0; - // Funcs taken from cq_mgr.h virtual int get_num_resources() const = 0; virtual int *get_rx_channel_fds(size_t &length) const { diff --git a/src/core/dev/ring_bond.cpp b/src/core/dev/ring_bond.cpp index e4e3d1ada..3d95d108e 100644 --- a/src/core/dev/ring_bond.cpp +++ b/src/core/dev/ring_bond.cpp @@ -307,14 +307,14 @@ void ring_bond::restart() popup_xmit_rings(); int ret = 0; - uint64_t poll_sn = cq_mgr::m_n_global_sn_rx; + uint64_t poll_sn = cq_mgr_rx::m_n_global_sn_rx; ret = request_notification(CQT_RX, poll_sn); if (ret < 0) { - ring_logdbg("failed arming rx cq_mgr (errno=%d %m)", errno); + ring_logdbg("failed arming cq_mgr_rx (errno=%d %m)", errno); } ret = request_notification(CQT_TX, poll_sn); if (ret < 0) { - ring_logdbg("failed arming tx cq_mgr (errno=%d %m)", errno); + ring_logdbg("failed arming cq_mgr_tx (errno=%d %m)", errno); } if (m_type == net_device_val::ACTIVE_BACKUP) { diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index d11c3b436..c22472f00 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -392,7 +392,7 @@ void ring_simple::create_resources() } BULLSEYE_EXCLUDE_BLOCK_END - // save cq_mgr pointers + // save pointers m_p_cq_mgr_rx = m_p_qp_mgr->get_rx_cq_mgr(); m_p_cq_mgr_tx = m_p_qp_mgr->get_tx_cq_mgr(); @@ -606,16 +606,16 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu // Try to poll once in the hope that we get a few freed tx mem_buf_desc ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - ring_logdbg("failed polling on tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (ret=%d %m)", + ring_logdbg("failed polling on cq_mgr_tx (qp_mgr=%p, cq_mgr_tx=%p) (ret=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, ret); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); return NULL; } else if (ret > 0) { - ring_logfunc("polling succeeded on tx cq_mgr (%d wce)", ret); + ring_logfunc("polling succeeded on cq_mgr_tx (%d wce)", ret); buff_list = get_tx_buffers(type, n_num_mem_bufs); } else if (b_block) { // (ret == 0) - // Arm & Block on tx cq_mgr notification channel + // Arm & Block on tx 
cq_mgr_tx notification channel // until we get a few freed tx mem_buf_desc & data buffers // Only a single thread should block on next Tx cqe event, hence the dedicated lock! @@ -632,7 +632,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu ret = m_p_cq_mgr_tx->request_notification(poll_sn); if (ret < 0) { // this is most likely due to cq_poll_sn out of sync, need to poll_cq again - ring_logdbg("failed arming tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", + ring_logdbg("failed arming cq_mgr_tx (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, errno); } else if (ret == 0) { @@ -654,16 +654,16 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu buff_list = get_tx_buffers(type, n_num_mem_bufs); continue; } else if (ret < 0) { - ring_logdbg("failed blocking on tx cq_mgr (errno=%d %m)", errno); + ring_logdbg("failed blocking on cq_mgr_tx (errno=%d %m)", errno); m_lock_ring_tx_buf_wait.unlock(); return NULL; } /* coverity[double_lock] TODO: RM#1049980 */ m_lock_ring_tx.lock(); - // Find the correct Tx cq_mgr from the CQ event, + // Find the correct cq_mgr_tx from the CQ event, // It might not be the active_cq object since we have a single TX CQ comp - // channel for all cq_mgr's + // channel for all cq_mgr_tx's cq_mgr_tx *p_cq_mgr_tx = cq_mgr_tx::get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); if (p_cq_mgr_tx) { @@ -673,7 +673,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu // Perform a non blocking event read, clear the fd channel ret = p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - ring_logdbg("failed handling Tx cq_mgr channel (qp_mgr=%p, " + ring_logdbg("failed handling cq_mgr_tx channel (qp_mgr=%p, " "cq_mgr_tx=%p) (errno=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, errno); /* coverity[double_unlock] TODO: RM#1049980 */ @@ -681,7 +681,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu m_lock_ring_tx_buf_wait.unlock(); return NULL; } - ring_logfunc("polling/blocking succeeded on tx cq_mgr (we got %d wce)", + ring_logfunc("polling/blocking succeeded on cq_mgr_tx (we got %d wce)", ret); } } @@ -802,7 +802,7 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) // Try to poll once in the hope that we get space in SQ ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - ring_logdbg("failed polling on tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (ret=%d %m)", + ring_logdbg("failed polling on cq_mgr_tx (qp_mgr=%p, cq_mgr_tx=%p) (ret=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, ret); /* coverity[missing_unlock] */ return false; @@ -813,7 +813,7 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) } if (b_block) { - // Arm & Block on tx cq_mgr notification channel until we get space in SQ + // Arm & Block on cq_mgr_tx notification channel until we get space in SQ // Only a single thread should block on next Tx cqe event, hence the dedicated lock! 
/* coverity[double_unlock] TODO: RM#1049980 */ @@ -826,7 +826,7 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) ret = m_p_cq_mgr_tx->request_notification(poll_sn); if (ret < 0) { // this is most likely due to cq_poll_sn out of sync, need to poll_cq again - ring_logdbg("failed arming tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", + ring_logdbg("failed arming cq_mgr_tx (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, errno); } else if (ret == 0) { // prepare to block @@ -841,7 +841,7 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) ret = orig_os_api.poll(&poll_fd, 1, -1); if (ret <= 0) { - ring_logdbg("failed blocking on tx cq_mgr (errno=%d %m)", errno); + ring_logdbg("failed blocking on cq_mgr_tx (errno=%d %m)", errno); m_lock_ring_tx_buf_wait.unlock(); /* coverity[double_lock] TODO: RM#1049980 */ m_lock_ring_tx.lock(); @@ -851,9 +851,9 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) /* coverity[double_lock] TODO: RM#1049980 */ m_lock_ring_tx.lock(); - // Find the correct Tx cq_mgr from the CQ event, + // Find the correct cq_mgr_tx from the CQ event, // It might not be the active_cq object since we have a single TX CQ comp - // channel for all cq_mgr's + // channel for all cq_mgr_tx's cq_mgr_tx *p_cq_mgr_tx = cq_mgr_tx::get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); if (p_cq_mgr_tx) { @@ -863,7 +863,7 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) // Perform a non blocking event read, clear the fd channel ret = p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - ring_logdbg("failed handling Tx cq_mgr channel (qp_mgr=%p, " + ring_logdbg("failed handling cq_mgr_tx channel (qp_mgr=%p, " "cq_mgr_tx=%p) (errno=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, errno); /* coverity[double_unlock] TODO: RM#1049980 */ @@ -873,7 +873,7 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) m_lock_ring_tx.lock(); return false; } - ring_logfunc("polling/blocking succeeded on tx cq_mgr (we got %d wce)", ret); + ring_logfunc("polling/blocking succeeded on cq_mgr_tx (we got %d wce)", ret); } } diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index 97b0f53d4..999c0cd30 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -286,9 +286,9 @@ class ring_simple : public ring_slave { m_p_qp_mgr->credits_return(credits); } - friend class cq_mgr; - friend class cq_mgr_regrq; - friend class cq_mgr_strq; + friend class cq_mgr_rx; + friend class cq_mgr_rx_regrq; + friend class cq_mgr_rx_strq; friend class qp_mgr; friend class qp_mgr_eth_mlx5; friend class qp_mgr_eth_mlx5_dpcp; @@ -361,7 +361,7 @@ class ring_simple : public ring_slave { ib_ctx_handler *m_p_ib_ctx; qp_mgr *m_p_qp_mgr; struct cq_moderation_info m_cq_moderation_info; - cq_mgr *m_p_cq_mgr_rx; + cq_mgr_rx *m_p_cq_mgr_rx; cq_mgr_tx *m_p_cq_mgr_tx; std::unordered_map m_user_lkey_map; diff --git a/src/core/proto/ip_frag.cpp b/src/core/proto/ip_frag.cpp index b2b3ed96b..7159bc6c2 100644 --- a/src/core/proto/ip_frag.cpp +++ b/src/core/proto/ip_frag.cpp @@ -162,7 +162,7 @@ void ip_frag_manager::free_frag_resources(void) unlock(); - // Must call cq_mgr outside the lock to avoid ABBA deadlock + // Must call cq_mgr_rx outside the lock to avoid ABBA deadlock return_buffers_to_owners(temp_buff_map); delete[] desc_base; @@ -565,7 +565,7 @@ void ip_frag_manager::handle_timer_expired(void *user_data) PRINT_STATISTICS(); unlock(); - // Must call cq_mgr outside the lock to avoid ABBA 
deadlock + // Must call cq_mgr_rx outside the lock to avoid ABBA deadlock return_buffers_to_owners(temp_buff_map); } diff --git a/src/core/proto/mem_buf_desc.h b/src/core/proto/mem_buf_desc.h index a250b0f96..d0330a0d6 100644 --- a/src/core/proto/mem_buf_desc.h +++ b/src/core/proto/mem_buf_desc.h @@ -184,13 +184,13 @@ class mem_buf_desc_t { size_t sz_buffer; // this is the size of the buffer size_t sz_data; // this is the amount of data inside the buffer (sz_data <= sz_buffer) - // Tx: qp_mgr owns the mem_buf_desc and the associated data buffer - // Rx: cq_mgr owns the mem_buf_desc and the associated data buffer + // Tx: cq_mgr_tx owns the mem_buf_desc and the associated data buffer + // Rx: cq_mgr_rx owns the mem_buf_desc and the associated data buffer ring_slave *p_desc_owner; private: atomic_t n_ref_count; // number of interested receivers (sockinfo) [can be modified only in - // cq_mgr context] + // cq_mgr_rx context] public: inline void clear_transport_data(void) diff --git a/src/core/sock/socket_fd_api.h b/src/core/sock/socket_fd_api.h index 34ba49ab2..e37567ac6 100644 --- a/src/core/sock/socket_fd_api.h +++ b/src/core/sock/socket_fd_api.h @@ -37,7 +37,7 @@ #include #include "xlio_extra.h" -#include +#include #include #include @@ -53,7 +53,7 @@ #define IS_DUMMY_PACKET(flags) (flags & XLIO_SND_FLAGS_DUMMY) -class cq_mgr; +class cq_mgr_rx; class epfd_info; class mem_buf_desc_t; diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 855d57db9..cb7ee8f0d 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -1621,7 +1621,7 @@ void sockinfo::rx_add_ring_cb(ring *p_ring) do_wakeup(); // A ready wce can be pending due to the drain logic (cq channel will not wake // up by itself) } else { - // Increase ref count on cq_mgr object + // Increase ref count on cq_mgr_rx object rx_ring_iter->second->refcnt++; } @@ -1645,7 +1645,7 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) bool notify_epoll = false; - // Remove the rx cq_mgr from our rx cq map + // Remove the rx cq_mgr_rx from our rx cq map unlock_rx_q(); m_rx_ring_map_lock.lock(); lock_rx_q(); @@ -1662,13 +1662,13 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) if (rx_ring_iter != m_rx_ring_map.end()) { BULLSEYE_EXCLUDE_BLOCK_END ring_info_t *p_ring_info = rx_ring_iter->second; - // Decrease ref count on cq_mgr object + // Decrease ref count on cq_mgr_rx object p_ring_info->refcnt--; - // Is this the last reference to this cq_mgr? + // Is this the last reference to this cq_mgr_rx? 
if (p_ring_info->refcnt == 0) { - // Move all cq_mgr->rx_reuse buffers to temp reuse queue related to p_rx_cq_mgr + // Move all cq_mgr_rx->rx_reuse buffers to temp reuse queue related to p_rx_cq_mgr move_descs(base_ring, &temp_rx_reuse, &p_ring_info->rx_reuse_info.rx_reuse, true); move_descs(base_ring, &temp_rx_reuse_global, &p_ring_info->rx_reuse_info.rx_reuse, false); diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index e72e8d46d..76df75e83 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -37,7 +37,7 @@ #include "proto/mem_buf_desc.h" #include "sock/socket_fd_api.h" #include "dev/buffer_pool.h" -#include "dev/cq_mgr.h" +#include "dev/cq_mgr_rx.h" #include "xlio_extra.h" // LWIP includes diff --git a/src/core/sock/sockinfo_udp.h b/src/core/sock/sockinfo_udp.h index 39fa29628..b6e298f53 100644 --- a/src/core/sock/sockinfo_udp.h +++ b/src/core/sock/sockinfo_udp.h @@ -132,7 +132,7 @@ class sockinfo_udp : public sockinfo { /** * Arm the event channel(s) assosiated with this sockinfo * Fill the fd_set (p_rxfds) with the correct fd channel values and the p_nfds with the (max_fd - * + 1) Fill the p_cq_mgr_fd_map with the pointer to the cq_mgr asosiated with the fd Return + * + 1) Fill the p_cq_mgr_fd_map with the pointer to the cq_mgr_rx asosiated with the fd Return * count of channels (fds) that where mapped */ int rx_request_notification(uint64_t poll_sn); From 553cc35d804d089b20d1eb81b0541a3e96743c18 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 28 Aug 2023 16:06:16 +0300 Subject: [PATCH 010/169] issue: 3514044 Remove qp_rec struct Signed-off-by: Alexander Grissik --- src/core/dev/cq_mgr_rx.cpp | 43 +++++++++++++++----------------- src/core/dev/cq_mgr_rx.h | 8 +----- src/core/dev/cq_mgr_rx_regrq.cpp | 25 +++++++------------ src/core/dev/cq_mgr_rx_regrq.h | 6 ++--- src/core/dev/cq_mgr_rx_strq.cpp | 9 +++---- 5 files changed, 37 insertions(+), 54 deletions(-) diff --git a/src/core/dev/cq_mgr_rx.cpp b/src/core/dev/cq_mgr_rx.cpp index 3172549b4..8229742fb 100644 --- a/src/core/dev/cq_mgr_rx.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -93,7 +93,7 @@ cq_mgr_rx::cq_mgr_rx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int BULLSEYE_EXCLUDE_BLOCK_END memset(&m_cq_stat_static, 0, sizeof(m_cq_stat_static)); - memset(&m_qp_rec, 0, sizeof(m_qp_rec)); + m_rx_queue.set_id("cq_mgr_rx (%p) : m_rx_queue", this); m_rx_pool.set_id("cq_mgr_rx (%p) : m_rx_pool", this); m_cq_id_rx = atomic_fetch_and_inc(&m_n_cq_id_counter_rx); // cq id is nonzero @@ -184,7 +184,7 @@ void cq_mgr_rx::statistics_print() } } -void cq_mgr_rx::set_qp_rq(qp_mgr *qp) +void cq_mgr_rx::add_qp_rx(qp_mgr *qp) { m_qp = static_cast(qp); @@ -194,14 +194,11 @@ void cq_mgr_rx::set_qp_rq(qp_mgr *qp) if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); } + VALGRIND_MAKE_MEM_DEFINED(&m_mlx5_cq, sizeof(m_mlx5_cq)); cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, m_mlx5_cq.cq_buf); -} -void cq_mgr_rx::add_qp_rx(qp_mgr *qp) -{ - cq_logdbg("qp_mgr=%p", qp); descq_t temp_desc_list; temp_desc_list.set_id("cq_mgr_rx (%p) : temp_desc_list", this); @@ -239,27 +236,27 @@ void cq_mgr_rx::add_qp_rx(qp_mgr *qp) } qp_rx_wr_num -= n_num_mem_bufs; } + cq_logdbg("Successfully post_recv qp with %d new Rx buffers (planned=%d)", qp->get_rx_max_wr_num() - qp_rx_wr_num, qp->get_rx_max_wr_num()); - // Add qp_mgr to map - m_qp_rec.qp = qp; - m_qp_rec.debt = 0; + m_debt = 0; } void 
cq_mgr_rx::del_qp_rx(qp_mgr *qp) { BULLSEYE_EXCLUDE_BLOCK_START - if (m_qp_rec.qp != qp) { - cq_logdbg("wrong qp_mgr=%p != m_qp_rec.qp=%p", qp, m_qp_rec.qp); + if (m_qp != qp) { + cq_logdbg("wrong qp_mgr=%p != m_qp=%p", qp, m_qp); return; } BULLSEYE_EXCLUDE_BLOCK_END - cq_logdbg("qp_mgr=%p", m_qp_rec.qp); + cq_logdbg("qp_mgr=%p", m_qp); return_extra_buffers(); clean_cq(); - memset(&m_qp_rec, 0, sizeof(m_qp_rec)); + m_qp = nullptr; + m_debt = 0; } void cq_mgr_rx::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc) @@ -382,15 +379,15 @@ bool cq_mgr_rx::compensate_qp_poll_success(mem_buf_desc_t *buff_cur) // Assume locked!!! // Compensate QP for all completions that we found if (m_rx_pool.size() || request_more_buffers()) { - size_t buffers = std::min(m_qp_rec.debt, m_rx_pool.size()); - m_qp_rec.qp->post_recv_buffers(&m_rx_pool, buffers); - m_qp_rec.debt -= buffers; + size_t buffers = std::min(m_debt, m_rx_pool.size()); + m_qp->post_recv_buffers(&m_rx_pool, buffers); + m_debt -= buffers; m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); } else if (m_b_sysvar_cq_keep_qp_full || - m_qp_rec.debt + MCE_MAX_CQ_POLL_BATCH > (int)m_qp_rec.qp->m_rx_num_wr) { + m_debt + MCE_MAX_CQ_POLL_BATCH > (int)m_qp->m_rx_num_wr) { m_p_cq_stat->n_rx_pkt_drop++; - m_qp_rec.qp->post_recv_buffer(buff_cur); - --m_qp_rec.debt; + m_qp->post_recv_buffer(buff_cur); + --m_debt; return true; } @@ -401,11 +398,11 @@ void cq_mgr_rx::compensate_qp_poll_failed() { // Assume locked!!! // Compensate QP for all completions debt - if (m_qp_rec.debt) { + if (m_debt) { if (likely(m_rx_pool.size() || request_more_buffers())) { - size_t buffers = std::min(m_qp_rec.debt, m_rx_pool.size()); - m_qp_rec.qp->post_recv_buffers(&m_rx_pool, buffers); - m_qp_rec.debt -= buffers; + size_t buffers = std::min(m_debt, m_rx_pool.size()); + m_qp->post_recv_buffers(&m_rx_pool, buffers); + m_debt -= buffers; m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); } } diff --git a/src/core/dev/cq_mgr_rx.h b/src/core/dev/cq_mgr_rx.h index a848a19cc..2433df871 100644 --- a/src/core/dev/cq_mgr_rx.h +++ b/src/core/dev/cq_mgr_rx.h @@ -66,11 +66,6 @@ class ring_simple; /* Get CQE owner bit. 
*/ #define MLX5_CQE_OWNER(op_own) ((op_own)&MLX5_CQE_OWNER_MASK) -struct qp_rec { - qp_mgr *qp; - int debt; -}; - class cq_mgr_rx { friend class ring; // need to expose the m_n_global_sn_rx only to ring friend class ring_simple; // need to expose the m_n_global_sn_rx only to ring @@ -163,7 +158,6 @@ class cq_mgr_rx { * @return Number of successfully polled wce */ void compensate_qp_poll_failed(); - void set_qp_rq(qp_mgr *qp); void lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc); inline void process_recv_buffer(mem_buf_desc_t *buff, void *pv_fd_ready_array = NULL); @@ -194,7 +188,7 @@ class cq_mgr_rx { uint32_t m_n_wce_counter = 0U; bool m_b_was_drained = false; bool m_b_is_rx_hw_csum_on = false; - qp_rec m_qp_rec; + int m_debt = 0; const uint32_t m_n_sysvar_cq_poll_batch_max; const uint32_t m_n_sysvar_progress_engine_wce_max; cq_stats_t *m_p_cq_stat; diff --git a/src/core/dev/cq_mgr_rx_regrq.cpp b/src/core/dev/cq_mgr_rx_regrq.cpp index 8259ccaec..99c6a7dbe 100644 --- a/src/core/dev/cq_mgr_rx_regrq.cpp +++ b/src/core/dev/cq_mgr_rx_regrq.cpp @@ -96,7 +96,7 @@ mem_buf_desc_t *cq_mgr_rx_regrq::poll(enum buff_status_e &status) if (unlikely(NULL == m_rx_hot_buffer)) { if (likely(m_qp->m_mlx5_qp.rq.tail != (m_qp->m_mlx5_qp.rq.head))) { - uint32_t index = m_qp->m_mlx5_qp.rq.tail & (m_qp_rec.qp->m_rx_num_wr - 1); + uint32_t index = m_qp->m_mlx5_qp.rq.tail & (m_qp->m_rx_num_wr - 1); m_rx_hot_buffer = (mem_buf_desc_t *)m_qp->m_rq_wqe_idx_to_wrid[index]; m_qp->m_rq_wqe_idx_to_wrid[index] = 0; prefetch((void *)m_rx_hot_buffer); @@ -220,14 +220,14 @@ int cq_mgr_rx_regrq::drain_and_proccess_helper(mem_buf_desc_t *buff, buff_status if (procces_now) { // We process immediately all non udp/ip traffic.. buff->rx.is_xlio_thr = true; - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || !compensate_qp_poll_success(buff)) { process_recv_buffer(buff, nullptr); } } else { // udp/ip traffic we just put in the cq's rx queue m_rx_queue.push_back(buff); mem_buf_desc_t *buff_cur = m_rx_queue.get_and_pop_front(); - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || !compensate_qp_poll_success(buff_cur)) { m_rx_queue.push_front(buff_cur); } @@ -285,7 +285,7 @@ int cq_mgr_rx_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id /* We process immediately all non udp/ip traffic.. 
*/ if (procces_now) { buff->rx.is_xlio_thr = true; - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || !compensate_qp_poll_success(buff)) { process_recv_buffer(buff, NULL); } @@ -293,7 +293,7 @@ int cq_mgr_rx_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id m_rx_queue.push_back(buff); mem_buf_desc_t *buff_cur = m_rx_queue.front(); m_rx_queue.pop_front(); - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || !compensate_qp_poll_success(buff_cur)) { m_rx_queue.push_front(buff_cur); } @@ -330,11 +330,11 @@ mem_buf_desc_t *cq_mgr_rx_regrq::poll_and_process_socketxtreme() if (buff_wqe) { if (cqe_process_rx(buff_wqe, status)) { - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || !compensate_qp_poll_success(buff_wqe)) { return buff_wqe; } - } else if (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) { + } else if (++m_debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) { compensate_qp_poll_failed(); } } else { @@ -367,13 +367,13 @@ int cq_mgr_rx_regrq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *p if (buff) { ++ret; if (cqe_process_rx(buff, status)) { - if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + if ((++m_debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || !compensate_qp_poll_success(buff)) { process_recv_buffer(buff, pv_fd_ready_array); } } else { m_p_cq_stat->n_rx_pkt_drop++; - if (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) { + if (++m_debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) { compensate_qp_poll_failed(); } } @@ -396,11 +396,4 @@ int cq_mgr_rx_regrq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *p return ret_rx_processed; } -void cq_mgr_rx_regrq::add_qp_rx(qp_mgr *qp) -{ - cq_logfunc(""); - set_qp_rq(qp); - cq_mgr_rx::add_qp_rx(qp); -} - #endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/core/dev/cq_mgr_rx_regrq.h b/src/core/dev/cq_mgr_rx_regrq.h index 9e70acda9..b36fbf489 100644 --- a/src/core/dev/cq_mgr_rx_regrq.h +++ b/src/core/dev/cq_mgr_rx_regrq.h @@ -39,13 +39,13 @@ class cq_mgr_rx_regrq : public cq_mgr_rx { public: cq_mgr_rx_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, struct ibv_comp_channel *p_comp_event_channel); - virtual ~cq_mgr_rx_regrq(); + + virtual ~cq_mgr_rx_regrq() override; virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) override; - virtual mem_buf_desc_t *poll_and_process_socketxtreme(); + virtual mem_buf_desc_t *poll_and_process_socketxtreme() override; virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL) override; - virtual void add_qp_rx(qp_mgr *qp); virtual uint32_t clean_cq() override; protected: diff --git a/src/core/dev/cq_mgr_rx_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp index ec8f02789..f3b5a0692 100644 --- a/src/core/dev/cq_mgr_rx_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ -157,7 +157,7 @@ uint32_t cq_mgr_rx_strq::clean_cq() bool cq_mgr_rx_strq::set_current_hot_buffer() { if (likely(m_qp->m_mlx5_qp.rq.tail != (m_qp->m_mlx5_qp.rq.head))) { - uint32_t index = m_qp->m_mlx5_qp.rq.tail & (m_qp_rec.qp->m_rx_num_wr - 1); + uint32_t index = m_qp->m_mlx5_qp.rq.tail & (m_qp->m_rx_num_wr - 1); m_rx_hot_buffer = (mem_buf_desc_t *)m_qp->m_rq_wqe_idx_to_wrid[index]; m_rx_hot_buffer->set_ref_count(_strides_num); m_qp->m_rq_wqe_idx_to_wrid[index] = 
0; @@ -341,7 +341,7 @@ int cq_mgr_rx_strq::drain_and_proccess_helper(mem_buf_desc_t *buff, mem_buf_desc uintptr_t *p_recycle_buffers_last_wr_id) { int ret_total = 0; - if (buff_wqe && (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) && + if (buff_wqe && (++m_debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv) && !p_recycle_buffers_last_wr_id) { compensate_qp_poll_failed(); // Reuse this method as success. } @@ -453,7 +453,7 @@ mem_buf_desc_t *cq_mgr_rx_strq::poll_and_process_socketxtreme() mem_buf_desc_t *buff = nullptr; mem_buf_desc_t *buff_wqe = poll(status, buff); - if ((buff_wqe && (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv)) || !buff) { + if ((buff_wqe && (++m_debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv)) || !buff) { compensate_qp_poll_failed(); // Reuse this method as success. } @@ -482,7 +482,7 @@ int cq_mgr_rx_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv mem_buf_desc_t *buff = nullptr; mem_buf_desc_t *buff_wqe = poll(status, buff); - if (buff_wqe && (++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv)) { + if (buff_wqe && (++m_debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv)) { compensate_qp_poll_failed(); // Reuse this method as success. } @@ -513,7 +513,6 @@ int cq_mgr_rx_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv void cq_mgr_rx_strq::add_qp_rx(qp_mgr *qp) { cq_logfunc(""); - set_qp_rq(qp); _hot_buffer_stride = nullptr; _current_wqe_consumed_bytes = 0U; cq_mgr_rx::add_qp_rx(qp); From 797c40c5a22ef57aeaa3d929251388365475c544 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 28 Aug 2023 16:18:23 +0300 Subject: [PATCH 011/169] issue: 3514044 Squash qp_mgr_eth to qp_mgr Signed-off-by: Alexander Grissik --- src/core/dev/qp_mgr.cpp | 7 ++++--- src/core/dev/qp_mgr.h | 33 +++++--------------------------- src/core/dev/qp_mgr_eth_mlx5.cpp | 4 ++-- src/core/dev/qp_mgr_eth_mlx5.h | 2 +- 4 files changed, 12 insertions(+), 34 deletions(-) diff --git a/src/core/dev/qp_mgr.cpp b/src/core/dev/qp_mgr.cpp index cc95ac148..b402aed73 100644 --- a/src/core/dev/qp_mgr.cpp +++ b/src/core/dev/qp_mgr.cpp @@ -64,7 +64,7 @@ #define MAX_UPSTREAM_CQ_MSHV_SIZE 8192 -qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr) +qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan) : m_qp(NULL) , m_rq_wqe_idx_to_wrid(NULL) , m_p_ring((ring_simple *)desc->ring) @@ -85,6 +85,7 @@ qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr) , m_p_prev_rx_desc_pushed(NULL) , m_n_ip_id_base(0) , m_n_ip_id_offset(0) + , m_vlan(vlan) { memset(&m_qp_cap, 0, sizeof(m_qp_cap)); m_qp_cap.max_inline_data = safe_mce_sys().tx_max_inline; @@ -574,7 +575,7 @@ int qp_mgr::send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio return 0; } -void qp_mgr_eth::modify_qp_to_ready_state() +void qp_mgr::modify_qp_to_ready_state() { qp_logdbg(""); int ret = 0; @@ -595,7 +596,7 @@ void qp_mgr_eth::modify_qp_to_ready_state() BULLSEYE_EXCLUDE_BLOCK_END } -int qp_mgr_eth::prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr) +int qp_mgr::prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr) { qp_logdbg(""); int ret = 0; diff --git a/src/core/dev/qp_mgr.h b/src/core/dev/qp_mgr.h index f289bd420..153d367f7 100644 --- a/src/core/dev/qp_mgr.h +++ b/src/core/dev/qp_mgr.h @@ -159,7 +159,7 @@ class qp_mgr { friend class cq_mgr_tx; public: - qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr); + qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan); virtual ~qp_mgr(); virtual void up(); 
@@ -175,7 +175,7 @@ class qp_mgr { inline uint32_t get_max_inline_data() const { return m_qp_cap.max_inline_data; } inline uint32_t get_max_send_sge() const { return m_qp_cap.max_send_sge; } int get_port_num() const { return m_port_num; } - virtual uint16_t get_partiton() const { return 0; }; + uint16_t get_partiton() const { return m_vlan; }; struct ibv_qp *get_ibv_qp() const { return m_qp; }; class cq_mgr_tx *get_tx_cq_mgr() const { return m_p_cq_mgr_tx; } class cq_mgr_rx *get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } @@ -184,7 +184,7 @@ class qp_mgr { // chain of calls may serve as cache warm for dummy send feature. inline bool get_hw_dummy_send_support() { return m_hw_dummy_send_support; } - virtual void modify_qp_to_ready_state() = 0; + virtual void modify_qp_to_ready_state(); virtual void modify_qp_to_error_state(); void release_rx_buffers(); @@ -357,10 +357,11 @@ class qp_mgr { // generating packet IDs uint16_t m_n_ip_id_base; uint16_t m_n_ip_id_offset; + uint16_t m_vlan; struct xlio_rate_limit_t m_rate_limit; int configure(struct qp_mgr_desc *desc); - virtual int prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr) = 0; + int prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr); inline void set_unsignaled_count(void) { m_n_unsignaled_count = m_n_sysvar_tx_num_wr_to_signal - 1; @@ -385,30 +386,6 @@ class qp_mgr { virtual bool is_rq_empty() const { return false; } }; -class qp_mgr_eth : public qp_mgr { -public: - qp_mgr_eth(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, const uint16_t vlan, - bool call_configure = true) - : qp_mgr(desc, tx_num_wr) - , m_vlan(vlan) - { - if (call_configure && configure(desc)) { - throw_xlio_exception("failed creating qp"); - } - }; - - virtual ~qp_mgr_eth() {} - - virtual void modify_qp_to_ready_state(); - virtual uint16_t get_partiton() const { return m_vlan; }; - -protected: - virtual int prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr); - -private: - const uint16_t m_vlan; -}; - #if defined(DEFINED_UTLS) || defined(DEFINED_DPCP) class xlio_tis : public xlio_ti { public: diff --git a/src/core/dev/qp_mgr_eth_mlx5.cpp b/src/core/dev/qp_mgr_eth_mlx5.cpp index 52888e529..67421e318 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5.cpp @@ -120,7 +120,7 @@ static inline uint32_t get_mlx5_opcode(xlio_ibv_wr_opcode verbs_opcode) qp_mgr_eth_mlx5::qp_mgr_eth_mlx5(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, const uint16_t vlan, bool call_configure) - : qp_mgr_eth(desc, tx_num_wr, vlan, false) + : qp_mgr(desc, tx_num_wr, vlan) , m_sq_wqe_idx_to_prop(NULL) , m_sq_wqe_prop_last(NULL) , m_sq_wqe_prop_last_signalled(0) @@ -138,7 +138,7 @@ qp_mgr_eth_mlx5::qp_mgr_eth_mlx5(struct qp_mgr_desc *desc, const uint32_t tx_num m_hw_dummy_send_support = xlio_is_nop_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); if (call_configure && configure(desc)) { - throw_xlio_exception("failed creating qp_mgr_eth"); + throw_xlio_exception("failed creating qp_mgr_eth_mlx5"); } memset(&m_mlx5_qp, 0, sizeof(m_mlx5_qp)); diff --git a/src/core/dev/qp_mgr_eth_mlx5.h b/src/core/dev/qp_mgr_eth_mlx5.h index 432cac18a..f2b304d3c 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.h +++ b/src/core/dev/qp_mgr_eth_mlx5.h @@ -61,7 +61,7 @@ struct sq_wqe_prop { }; typedef struct sq_wqe_prop sq_wqe_prop; -class qp_mgr_eth_mlx5 : public qp_mgr_eth { +class qp_mgr_eth_mlx5 : public qp_mgr { friend class cq_mgr_rx; friend class cq_mgr_rx_regrq; friend class cq_mgr_tx; From 5fb76810d5aaa1d55b52c9e97b3e248def716fc7 Mon Sep 17 00:00:00 2001 From: Alexander Grissik 
Date: Tue, 3 Oct 2023 10:35:20 +0300 Subject: [PATCH 012/169] issue: 3514044 Remove DEFINED_DPCP from qp_mgr and styling fixes Signed-off-by: Alexander Grissik --- src/core/dev/cq_mgr_rx.cpp | 19 +++++++++---------- src/core/dev/cq_mgr_rx.h | 8 +++++--- src/core/dev/cq_mgr_rx_regrq.cpp | 15 ++++++++------- src/core/dev/cq_mgr_rx_regrq.h | 5 +++-- src/core/dev/cq_mgr_rx_strq.cpp | 3 +-- src/core/dev/cq_mgr_tx.cpp | 16 +++++++++------- src/core/dev/cq_mgr_tx.h | 5 ++--- src/core/dev/qp_mgr.h | 15 +++------------ src/core/dev/qp_mgr_eth_mlx5.cpp | 6 ++---- src/core/dev/qp_mgr_eth_mlx5.h | 7 ++----- src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp | 4 ---- src/core/dev/qp_mgr_eth_mlx5_dpcp.h | 3 --- src/core/dev/ring_simple.cpp | 6 ++++-- src/core/main.cpp | 7 ------- src/core/util/sys_vars.cpp | 2 -- src/core/util/sys_vars.h | 7 +------ 16 files changed, 49 insertions(+), 79 deletions(-) diff --git a/src/core/dev/cq_mgr_rx.cpp b/src/core/dev/cq_mgr_rx.cpp index 8229742fb..54d8ea29b 100644 --- a/src/core/dev/cq_mgr_rx.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -126,14 +126,13 @@ void cq_mgr_rx::configure(int cq_size) } BULLSEYE_EXCLUDE_BLOCK_END VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_cq, sizeof(ibv_cq)); - + xlio_stats_instance_create_cq_block(m_p_cq_stat); - - m_b_is_rx_hw_csum_on = - xlio_is_rx_hw_csum_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); + + m_b_is_rx_hw_csum_on = xlio_is_rx_hw_csum_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); cq_logdbg("RX CSUM support = %d", m_b_is_rx_hw_csum_on); - + cq_logdbg("Created CQ as Rx with fd[%d] and of size %d elements (ibv_cq_hndl=%p)", get_channel_fd(), cq_size, m_p_ibv_cq); } @@ -343,8 +342,7 @@ void cq_mgr_rx::return_extra_buffers() m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); } -mem_buf_desc_t *cq_mgr_rx::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, - enum buff_status_e status) +mem_buf_desc_t *cq_mgr_rx::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status) { /* Assume locked!!! */ cq_logfuncall(""); @@ -435,7 +433,7 @@ void cq_mgr_rx::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) // This method is called when ring release returns unposted buffers. 
void cq_mgr_rx::mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, - void *pv_fd_ready_array /*=NULL*/) + void *pv_fd_ready_array /*=NULL*/) { cq_logfuncall(""); NOT_IN_USE(pv_fd_ready_array); @@ -506,7 +504,8 @@ int cq_mgr_rx::request_notification(uint64_t poll_sn) cq_logfuncall(""); if ((m_n_global_sn_rx > 0 && poll_sn != m_n_global_sn_rx)) { - // The cq_mgr_rx's has receive packets pending processing (or got processed since cq_poll_sn) + // The cq_mgr_rx's has receive packets pending processing (or got processed since + // cq_poll_sn) cq_logfunc("miss matched poll sn (user=0x%lx, cq=0x%lx)", poll_sn, m_n_cq_poll_sn_rx); return 1; } @@ -536,7 +535,7 @@ int cq_mgr_rx::request_notification(uint64_t poll_sn) } int cq_mgr_rx::wait_for_notification_and_process_element(uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array) + void *pv_fd_ready_array) { int ret = -1; diff --git a/src/core/dev/cq_mgr_rx.h b/src/core/dev/cq_mgr_rx.h index 2433df871..d306bb6cf 100644 --- a/src/core/dev/cq_mgr_rx.h +++ b/src/core/dev/cq_mgr_rx.h @@ -120,7 +120,8 @@ class cq_mgr_rx { * @return >=0 number of wce processed * < 0 error */ - virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL) = 0; + virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, + void *pv_fd_ready_array = NULL) = 0; virtual mem_buf_desc_t *poll_and_process_socketxtreme() { return nullptr; }; /** @@ -133,7 +134,8 @@ class cq_mgr_rx { // CQ implements the Rx mem_buf_desc_owner. // These callbacks will be called for each Rx buffer that passed processed completion - // Rx completion handling at the cq_mgr_rx level is forwarding the packet to the ib_comm_mgr layer + // Rx completion handling at the cq_mgr_rx level is forwarding the packet to the ib_comm_mgr + // layer void mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, void *pv_fd_ready_array = NULL); @@ -160,7 +162,7 @@ class cq_mgr_rx { void compensate_qp_poll_failed(); void lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc); inline void process_recv_buffer(mem_buf_desc_t *buff, void *pv_fd_ready_array = NULL); - + inline void update_global_sn_rx(uint64_t &cq_poll_sn, uint32_t rettotal); inline struct xlio_mlx5_cqe *check_cqe(void); diff --git a/src/core/dev/cq_mgr_rx_regrq.cpp b/src/core/dev/cq_mgr_rx_regrq.cpp index 99c6a7dbe..ccd57f8d4 100644 --- a/src/core/dev/cq_mgr_rx_regrq.cpp +++ b/src/core/dev/cq_mgr_rx_regrq.cpp @@ -51,8 +51,8 @@ #define cq_logpanic __log_info_panic #define cq_logfuncall __log_info_funcall -cq_mgr_rx_regrq::cq_mgr_rx_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - struct ibv_comp_channel *p_comp_event_channel) +cq_mgr_rx_regrq::cq_mgr_rx_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, + uint32_t cq_size, struct ibv_comp_channel *p_comp_event_channel) : cq_mgr_rx(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) { cq_logfunc(""); @@ -65,10 +65,10 @@ uint32_t cq_mgr_rx_regrq::clean_cq() mem_buf_desc_t *buff; /* Sanity check for cq: initialization of tx and rx cq has difference: - * tx - is done in qp_mgr::configure() - * rx - is done in qp_mgr::up() - * as a result rx cq can be created but not initialized - */ + * tx - is done in qp_mgr::configure() + * rx - is done in qp_mgr::up() + * as a result rx cq can be created but not initialized + */ if (NULL == m_qp) { return 0; } @@ -130,7 +130,8 @@ mem_buf_desc_t *cq_mgr_rx_regrq::poll(enum buff_status_e &status) return buff; } -void cq_mgr_rx_regrq::cqe_to_mem_buff_desc(struct 
xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc, +void cq_mgr_rx_regrq::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, + mem_buf_desc_t *p_rx_wc_buf_desc, enum buff_status_e &status) { struct mlx5_err_cqe *ecqe; diff --git a/src/core/dev/cq_mgr_rx_regrq.h b/src/core/dev/cq_mgr_rx_regrq.h index b36fbf489..c5ab51cf8 100644 --- a/src/core/dev/cq_mgr_rx_regrq.h +++ b/src/core/dev/cq_mgr_rx_regrq.h @@ -38,13 +38,14 @@ class cq_mgr_rx_regrq : public cq_mgr_rx { public: cq_mgr_rx_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - struct ibv_comp_channel *p_comp_event_channel); + struct ibv_comp_channel *p_comp_event_channel); virtual ~cq_mgr_rx_regrq() override; virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) override; virtual mem_buf_desc_t *poll_and_process_socketxtreme() override; - virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL) override; + virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, + void *pv_fd_ready_array = NULL) override; virtual uint32_t clean_cq() override; diff --git a/src/core/dev/cq_mgr_rx_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp index f3b5a0692..e140421a4 100644 --- a/src/core/dev/cq_mgr_rx_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ -56,8 +56,7 @@ } while (0) cq_mgr_rx_strq::cq_mgr_rx_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, - uint32_t cq_size, uint32_t stride_size_bytes, - uint32_t strides_num, + uint32_t cq_size, uint32_t stride_size_bytes, uint32_t strides_num, struct ibv_comp_channel *p_comp_event_channel) : cq_mgr_rx(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) , _owner_ring(p_ring) diff --git a/src/core/dev/cq_mgr_tx.cpp b/src/core/dev/cq_mgr_tx.cpp index fc3f812d5..eec8c5cf0 100644 --- a/src/core/dev/cq_mgr_tx.cpp +++ b/src/core/dev/cq_mgr_tx.cpp @@ -29,7 +29,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ - + #include "dev/cq_mgr_tx.h" #include #include @@ -50,7 +50,7 @@ atomic_t cq_mgr_tx::m_n_cq_id_counter_tx = ATOMIC_INIT(1); uint64_t cq_mgr_tx::m_n_global_sn_tx = 0U; - cq_mgr_tx::cq_mgr_tx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, +cq_mgr_tx::cq_mgr_tx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, ibv_comp_channel *p_comp_event_channel) : m_p_ring(p_ring) , m_p_ib_ctx_handler(p_ib_ctx_handler) @@ -108,7 +108,7 @@ int cq_mgr_tx::clean_cq_poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p // so we can't really do anything with them *p_cq_poll_sn = m_n_global_sn_tx; return 0; - } + } if (unlikely(g_vlogger_level >= VLOG_FUNC_ALL)) { for (int i = 0; i < ret; i++) { @@ -163,7 +163,7 @@ void cq_mgr_tx::configure(int cq_size) } BULLSEYE_EXCLUDE_BLOCK_END VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_cq, sizeof(ibv_cq)); - + cq_logdbg("Created CQ as Tx with fd[%d] and of size %d elements (ibv_cq_hndl=%p)", get_channel_fd(), cq_size, m_p_ibv_cq); } @@ -201,7 +201,8 @@ int cq_mgr_tx::request_notification(uint64_t poll_sn) cq_logfuncall(""); if ((m_n_global_sn_tx > 0 && poll_sn != m_n_global_sn_tx)) { - // The cq_mgr_tx's has receive packets pending processing (or got processed since cq_poll_sn) + // The cq_mgr_tx's has receive packets pending processing (or got processed since + // cq_poll_sn) cq_logfunc("miss matched poll sn (user=0x%lx, cq=0x%lx)", poll_sn, m_n_cq_poll_sn_tx); return 1; } @@ -240,7 +241,8 @@ cq_mgr_tx *cq_mgr_tx::get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_cha IF_VERBS_FAILURE(ibv_get_cq_event(p_cq_channel, &p_cq_hndl, &p_context)) { vlog_printf(VLOG_INFO, - MODULE_NAME ":%d: waiting on cq_mgr_tx event returned with error (errno=%d %m)\n", + MODULE_NAME + ":%d: waiting on cq_mgr_tx event returned with error (errno=%d %m)\n", __LINE__, errno); } else @@ -271,7 +273,7 @@ int cq_mgr_tx::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) // All error opcodes have the most significant bit set. 
if (unlikely(cqe->op_own & 0x80) && is_error_opcode(cqe->op_own >> 4)) { - //m_p_cq_stat->n_tx_cqe_error++; Future counter + // m_p_cq_stat->n_tx_cqe_error++; Future counter log_cqe_error(cqe); } diff --git a/src/core/dev/cq_mgr_tx.h b/src/core/dev/cq_mgr_tx.h index 47eb2c6e7..202ae613c 100644 --- a/src/core/dev/cq_mgr_tx.h +++ b/src/core/dev/cq_mgr_tx.h @@ -41,13 +41,13 @@ class ring_simple; class cq_mgr_tx { public: - cq_mgr_tx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, ibv_comp_channel *p_comp_event_channel); ~cq_mgr_tx(); // Helper gunction to extract the cq_mgr_tx from the CQ event, - // Since we have a single TX CQ comp channel for all cq_mgr_tx's, it might not be the active_cq object + // Since we have a single TX CQ comp channel for all cq_mgr_tx's, it might not be the active_cq + // object static cq_mgr_tx *get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel); ibv_cq *get_ibv_cq_hndl() { return m_p_ibv_cq; } @@ -76,7 +76,6 @@ class cq_mgr_tx { void reset_notification_armed() { m_b_notification_armed = false; } private: - void log_cqe_error(struct xlio_mlx5_cqe *cqe); void handle_sq_wqe_prop(unsigned index); int clean_cq_poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn); diff --git a/src/core/dev/qp_mgr.h b/src/core/dev/qp_mgr.h index 153d367f7..100ed918e 100644 --- a/src/core/dev/qp_mgr.h +++ b/src/core/dev/qp_mgr.h @@ -248,9 +248,9 @@ class qp_mgr { NOT_IN_USE(first); } #endif /* DEFINED_UTLS */ -#if defined(DEFINED_DPCP) + virtual std::unique_ptr create_tis(uint32_t) const { return nullptr; }; -#endif /* defined(DEFINED_DPCP) */ + virtual void nvme_set_static_context(xlio_tis *tis, uint32_t config) { NOT_IN_USE(tis); @@ -386,7 +386,6 @@ class qp_mgr { virtual bool is_rq_empty() const { return false; } }; -#if defined(DEFINED_UTLS) || defined(DEFINED_DPCP) class xlio_tis : public xlio_ti { public: xlio_tis(std::unique_ptr _tis, xlio_ti::ti_type type) @@ -468,13 +467,5 @@ class xlio_tir : public xlio_ti { uint32_t m_tirn; uint32_t m_dek_id; }; -#else /* DEFINED_UTLS or DEFINED_DPCP */ -/* A stub classes to compile without uTLS support. */ -class xlio_tis : public xlio_ti { -public: - inline uint32_t get_tisn(void) noexcept { return 0; } -}; -class xlio_tir : public xlio_ti { -}; -#endif /* DEFINED_UTLS or DEFINED_DPCP */ + #endif diff --git a/src/core/dev/qp_mgr_eth_mlx5.cpp b/src/core/dev/qp_mgr_eth_mlx5.cpp index 67421e318..a5a1b00e7 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5.cpp @@ -366,8 +366,8 @@ bool qp_mgr_eth_mlx5::init_rx_cq_mgr_prepare() cq_mgr_rx *qp_mgr_eth_mlx5::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) { return (!init_rx_cq_mgr_prepare() ? 
NULL - : new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, - p_rx_comp_event_channel)); + : new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, + m_rx_num_wr, p_rx_comp_event_channel)); } cq_mgr_tx *qp_mgr_eth_mlx5::init_tx_cq_mgr() @@ -1203,7 +1203,6 @@ void qp_mgr_eth_mlx5::ti_released(xlio_ti *) {}; void qp_mgr_eth_mlx5::destroy_tis_cache(void) {}; #endif /* DEFINED_UTLS */ -#ifdef DEFINED_DPCP std::unique_ptr qp_mgr_eth_mlx5::create_tis(uint32_t flags) const { dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); @@ -1322,7 +1321,6 @@ void qp_mgr_eth_mlx5::nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqn ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS); update_next_wqe_hot(); } -#endif /* DEFINED_DPCP */ #if defined(DEFINED_UTLS) void qp_mgr_eth_mlx5::ti_released(xlio_ti *ti) diff --git a/src/core/dev/qp_mgr_eth_mlx5.h b/src/core/dev/qp_mgr_eth_mlx5.h index f2b304d3c..1259f5074 100644 --- a/src/core/dev/qp_mgr_eth_mlx5.h +++ b/src/core/dev/qp_mgr_eth_mlx5.h @@ -90,16 +90,13 @@ class qp_mgr_eth_mlx5 : public qp_mgr { void tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first) override; #endif /* DEFINED_UTLS */ -#ifdef DEFINED_DPCP + #define DPCP_TIS_FLAGS (dpcp::TIS_ATTR_TRANSPORT_DOMAIN | dpcp::TIS_ATTR_PD) #define DPCP_TIS_NVME_FLAG (dpcp::TIS_ATTR_NVMEOTCP) std::unique_ptr create_tis(uint32_t flags) const override; void nvme_set_static_context(xlio_tis *tis, uint32_t config) override; void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) override; -#else -#define DPCP_TIS_FLAGS (0U) -#define DPCP_TIS_NVME_FLAG (0U) -#endif /* DEFINED_DPCP */ + /* Get a memory inside a wqebb at a wqebb_num offset from the m_sq_wqe_hot and account for * m_sq_wqe_counter wrap-around. Use offset_in_wqebb to for the internal address. 
Use the * template parameter to cast the resulting address to the required pointer type */ diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp index 77355df5a..c0d63e24a 100644 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp @@ -31,8 +31,6 @@ */ #include "qp_mgr_eth_mlx5_dpcp.h" -#if defined(DEFINED_DPCP) - #include #include "ring_simple.h" #include "rfs_rule_dpcp.h" @@ -344,5 +342,3 @@ dpcp::tir *qp_mgr_eth_mlx5_dpcp::create_tir(bool is_tls /*=false*/) return tir_obj; } - -#endif // defined(DEFINED_DPCP) diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.h b/src/core/dev/qp_mgr_eth_mlx5_dpcp.h index 899559b1d..e4570ead8 100644 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.h +++ b/src/core/dev/qp_mgr_eth_mlx5_dpcp.h @@ -35,7 +35,6 @@ #include -#if defined(DEFINED_DPCP) #include #include #include "dev/qp_mgr_eth_mlx5.h" @@ -75,6 +74,4 @@ class qp_mgr_eth_mlx5_dpcp : public qp_mgr_eth_mlx5 { uint32_t _strq_wqe_reserved_seg = 0U; }; -#endif // defined(DEFINED_DPCP) - #endif diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index c22472f00..ce7373082 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -664,7 +664,8 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu // Find the correct cq_mgr_tx from the CQ event, // It might not be the active_cq object since we have a single TX CQ comp // channel for all cq_mgr_tx's - cq_mgr_tx *p_cq_mgr_tx = cq_mgr_tx::get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); + cq_mgr_tx *p_cq_mgr_tx = + cq_mgr_tx::get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); if (p_cq_mgr_tx) { // Allow additional CQ arming now @@ -854,7 +855,8 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) // Find the correct cq_mgr_tx from the CQ event, // It might not be the active_cq object since we have a single TX CQ comp // channel for all cq_mgr_tx's - cq_mgr_tx *p_cq_mgr_tx = cq_mgr_tx::get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); + cq_mgr_tx *p_cq_mgr_tx = + cq_mgr_tx::get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); if (p_cq_mgr_tx) { // Allow additional CQ arming now diff --git a/src/core/main.cpp b/src/core/main.cpp index 4557d4625..e51b308e5 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -889,13 +889,6 @@ void print_xlio_global_settings() vlog_printf(VLOG_INFO, "---------------------------------------------------------------------------\n"); - -#if !defined(DEFINED_DPCP) - if (safe_mce_sys().mce_spec == MCE_SPEC_NVME_BF2) { - vlog_printf(VLOG_INFO, "XLIO '%s' spec is used without enabled DPCP!\n", - xlio_spec::to_str(MCE_SPEC_NVME_BF2)); - } -#endif } void prepare_fork() diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index ab0d1acba..66657c906 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -920,11 +920,9 @@ void mce_sys_var::get_env_params() progress_engine_interval_msec = MCE_CQ_DRAIN_INTERVAL_DISABLED; } -#if defined(DEFINED_DPCP) if ((env_ptr = getenv(SYS_VAR_STRQ)) != NULL) { enable_strq_env = option_strq::from_str(env_ptr, MCE_DEFAULT_STRQ); } -#endif enable_striding_rq = (enable_strq_env == option_strq::ON || enable_strq_env == option_strq::AUTO); diff --git a/src/core/util/sys_vars.h b/src/core/util/sys_vars.h index 9dd1aaa22..b9d752df4 100644 --- a/src/core/util/sys_vars.h +++ b/src/core/util/sys_vars.h @@ -758,12 +758,7 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_DEFAULT_TX_SEGS_POOL_BATCH_TCP (16384) 
#define MCE_DEFAULT_TX_NUM_SGE (4) -#if defined(DEFINED_DPCP) -#define MCE_DEFAULT_STRQ (option_strq::ON) -#else -#define MCE_DEFAULT_STRQ (option_strq::OFF) -#endif - +#define MCE_DEFAULT_STRQ (option_strq::ON) #define MCE_DEFAULT_STRQ_NUM_STRIDES (16384) #define MCE_DEFAULT_STRQ_STRIDE_SIZE_BYTES (512) #define MCE_DEFAULT_STRQ_NUM_BUFS (64) From 3e1a3bfd77d7a60194294a37263317db0e87d407 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Tue, 3 Oct 2023 16:41:12 +0300 Subject: [PATCH 013/169] issue: 3514044 Squash qp_mgr_eth_mlx5 to qp_mgr Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 2 - src/core/dev/cq_mgr_rx.cpp | 4 +- src/core/dev/cq_mgr_rx.h | 3 +- src/core/dev/cq_mgr_rx_regrq.cpp | 1 - src/core/dev/cq_mgr_rx_strq.cpp | 1 - src/core/dev/cq_mgr_tx.cpp | 4 +- src/core/dev/cq_mgr_tx.h | 3 +- src/core/dev/dm_mgr.h | 7 +- src/core/dev/qp_mgr.cpp | 1650 ++++++++++++++++++++++--- src/core/dev/qp_mgr.h | 417 ++++--- src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp | 51 +- src/core/dev/qp_mgr_eth_mlx5_dpcp.h | 6 +- src/core/dev/ring_simple.cpp | 13 +- src/core/dev/ring_simple.h | 14 +- src/core/ib/mlx5/ib_mlx5.cpp | 47 +- src/core/ib/mlx5/ib_mlx5.h | 5 +- src/core/sock/sockinfo_nvme.h | 2 +- src/core/util/sys_vars.cpp | 1 - src/core/util/sys_vars.h | 1 - tests/gtest/nvme/nvme.cc | 2 +- 20 files changed, 1742 insertions(+), 492 deletions(-) diff --git a/src/core/Makefile.am b/src/core/Makefile.am index b60115082..072d1cab0 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -67,7 +67,6 @@ libxlio_la_SOURCES := \ dev/cq_mgr_tx.cpp \ dev/dm_mgr.cpp \ dev/qp_mgr.cpp \ - dev/qp_mgr_eth_mlx5.cpp \ dev/qp_mgr_eth_mlx5_dpcp.cpp \ dev/gro_mgr.cpp \ dev/rfs.cpp \ @@ -187,7 +186,6 @@ libxlio_la_SOURCES := \ dev/net_device_table_mgr.h \ dev/net_device_val.h \ dev/qp_mgr.h \ - dev/qp_mgr_eth_mlx5.h \ dev/qp_mgr_eth_mlx5_dpcp.h \ dev/rfs.h \ dev/rfs_mc.h \ diff --git a/src/core/dev/cq_mgr_rx.cpp b/src/core/dev/cq_mgr_rx.cpp index 54d8ea29b..bcf8924d2 100644 --- a/src/core/dev/cq_mgr_rx.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -48,7 +48,6 @@ #include "buffer_pool.h" #include "qp_mgr.h" #include "ring_simple.h" -#include "qp_mgr_eth_mlx5.h" #define MODULE_NAME "cq_mgr_rx" @@ -185,8 +184,7 @@ void cq_mgr_rx::statistics_print() void cq_mgr_rx::add_qp_rx(qp_mgr *qp) { - m_qp = static_cast(qp); - + m_qp = qp; m_qp->m_rq_wqe_counter = 0; // In case of bonded qp, wqe_counter must be reset to zero m_rx_hot_buffer = NULL; diff --git a/src/core/dev/cq_mgr_rx.h b/src/core/dev/cq_mgr_rx.h index d306bb6cf..6b9798fd6 100644 --- a/src/core/dev/cq_mgr_rx.h +++ b/src/core/dev/cq_mgr_rx.h @@ -57,7 +57,6 @@ class net_device_mgr; class ring; class qp_mgr; -class qp_mgr_eth_mlx5; class ring_simple; /* Get CQE opcode. 
*/ @@ -179,7 +178,7 @@ class cq_mgr_rx { virtual void statistics_print(); xlio_ib_mlx5_cq_t m_mlx5_cq; - qp_mgr_eth_mlx5 *m_qp = nullptr; + qp_mgr *m_qp = nullptr; mem_buf_desc_t *m_rx_hot_buffer = nullptr; struct ibv_cq *m_p_ibv_cq = nullptr; descq_t m_rx_queue; diff --git a/src/core/dev/cq_mgr_rx_regrq.cpp b/src/core/dev/cq_mgr_rx_regrq.cpp index ccd57f8d4..93a7460e5 100644 --- a/src/core/dev/cq_mgr_rx_regrq.cpp +++ b/src/core/dev/cq_mgr_rx_regrq.cpp @@ -37,7 +37,6 @@ #include #include "cq_mgr_rx.inl" #include "qp_mgr.h" -#include "qp_mgr_eth_mlx5.h" #include "ring_simple.h" #include diff --git a/src/core/dev/cq_mgr_rx_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp index e140421a4..5c8e6ef70 100644 --- a/src/core/dev/cq_mgr_rx_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ -37,7 +37,6 @@ #include #include "cq_mgr_rx.inl" #include "qp_mgr.h" -#include "qp_mgr_eth_mlx5.h" #include "ring_simple.h" #include diff --git a/src/core/dev/cq_mgr_tx.cpp b/src/core/dev/cq_mgr_tx.cpp index eec8c5cf0..5a7919a0d 100644 --- a/src/core/dev/cq_mgr_tx.cpp +++ b/src/core/dev/cq_mgr_tx.cpp @@ -34,7 +34,7 @@ #include #include #include "ring_simple.h" -#include "qp_mgr_eth_mlx5.h" +#include "qp_mgr.h" #define MODULE_NAME "cq_mgr_tx" @@ -172,7 +172,7 @@ void cq_mgr_tx::add_qp_tx(qp_mgr *qp) { // Assume locked! cq_logdbg("qp_mgr=%p", qp); - m_qp = static_cast(qp); + m_qp = qp; if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); diff --git a/src/core/dev/cq_mgr_tx.h b/src/core/dev/cq_mgr_tx.h index 202ae613c..65d12b9d7 100644 --- a/src/core/dev/cq_mgr_tx.h +++ b/src/core/dev/cq_mgr_tx.h @@ -36,7 +36,6 @@ #include "dev/ib_ctx_handler.h" class qp_mgr; -class qp_mgr_eth_mlx5; class ring_simple; class cq_mgr_tx { @@ -92,7 +91,7 @@ class cq_mgr_tx { ring_simple *m_p_ring; ib_ctx_handler *m_p_ib_ctx_handler; ibv_comp_channel *m_comp_event_channel; - qp_mgr_eth_mlx5 *m_qp = nullptr; + qp_mgr *m_qp = nullptr; struct ibv_cq *m_p_ibv_cq = nullptr; uint32_t m_cq_id_tx = 0U; uint32_t m_n_cq_poll_sn_tx = 0U; diff --git a/src/core/dev/dm_mgr.h b/src/core/dev/dm_mgr.h index 924b5cb5b..adaf52eda 100644 --- a/src/core/dev/dm_mgr.h +++ b/src/core/dev/dm_mgr.h @@ -52,7 +52,10 @@ class dm_mgr { bool copy_data(struct mlx5_wqe_data_seg *seg, uint8_t *src, uint32_t length, mem_buf_desc_t *buff); void release_data(mem_buf_desc_t *buff); - inline bool is_completion_need() { return m_allocation - m_used < DM_COMPLETION_THRESHOLD; }; + inline bool is_completion_need() const + { + return m_allocation - m_used < DM_COMPLETION_THRESHOLD; + }; private: struct ibv_mr *m_p_dm_mr; @@ -85,7 +88,7 @@ class dm_mgr { return false; }; inline void release_data(mem_buf_desc_t *buff) { NOT_IN_USE(buff); }; - inline bool is_completion_need() { return false; }; + inline bool is_completion_need() const { return false; }; }; #endif /* DEFINED_IBV_DM */ diff --git a/src/core/dev/qp_mgr.cpp b/src/core/dev/qp_mgr.cpp index b402aed73..85b75c760 100644 --- a/src/core/dev/qp_mgr.cpp +++ b/src/core/dev/qp_mgr.cpp @@ -37,14 +37,15 @@ #include "util/instrumentation.h" #include "iomux/io_mux_call.h" #include "buffer_pool.h" -#include "cq_mgr_rx.h" #include "ring_simple.h" #include "util/valgrind.h" #include "dev/rfs_rule_ibv.h" #include +#include "cq_mgr_rx_regrq.h" +#include "proto/tls.h" #undef MODULE_NAME -#define MODULE_NAME "qpm" +#define MODULE_NAME "qp_mgr" #define qp_logpanic __log_info_panic #define qp_logerr __log_info_err @@ -54,7 +55,7 @@ #define qp_logfunc __log_info_func #define 
qp_logfuncall __log_info_funcall -//#define ALIGN_WR_UP(_num_wr_) (max(32, ((_num_wr_ + 0xf) & ~(0xf)))) +//#define ALIGN_WR_UP(_num_wr_) (max(32, ((_num_wr_ + 0xf) & ~(0xf)))) #define ALIGN_WR_DOWN(_num_wr_) (max(32, ((_num_wr_) & ~(0xf)))) #define FICTIVE_REMOTE_QPN 0x48 @@ -64,65 +65,155 @@ #define MAX_UPSTREAM_CQ_MSHV_SIZE 8192 -qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan) - : m_qp(NULL) - , m_rq_wqe_idx_to_wrid(NULL) - , m_p_ring((ring_simple *)desc->ring) +#if !defined(MLX5_ETH_INLINE_HEADER_SIZE) +#define MLX5_ETH_INLINE_HEADER_SIZE 18 +#endif + +#define OCTOWORD 16 +#define WQEBB 64 + +//#define DBG_DUMP_WQE 1 + +#ifdef DBG_DUMP_WQE +#define dbg_dump_wqe(_addr, _size) \ + { \ + uint32_t *_wqe = _addr; \ + qp_logfunc("Dumping %d bytes from %p", _size, _wqe); \ + for (int i = 0; i < (int)_size / 4; i += 4) { \ + qp_logfunc("%08x %08x %08x %08x", ntohl(_wqe[i + 0]), ntohl(_wqe[i + 1]), \ + ntohl(_wqe[i + 2]), ntohl(_wqe[i + 3])); \ + } \ + } +#else +#define dbg_dump_wqe(_addr, _size) +#endif + +static inline uint64_t align_to_octoword_up(uint64_t val) +{ + return ((val + 16 - 1) >> 4) << 4; +} + +static inline uint64_t align_to_WQEBB_up(uint64_t val) +{ + return ((val + 4 - 1) >> 2) << 2; +} + +static bool is_bf(struct ibv_context *ib_ctx) +{ + char *env; + + /* This limitation is done for RM: 1557652, 1894523, 1914464, 2069198 */ + if (safe_mce_sys().hypervisor != mce_sys_var::HYPER_NONE) { + return false; + } + + env = getenv("MLX5_SHUT_UP_BF"); + if (!env || !strcmp(env, "0")) { + struct mlx5dv_devx_uar *uar = mlx5dv_devx_alloc_uar(ib_ctx, MLX5DV_UAR_ALLOC_TYPE_BF); + if (uar) { + mlx5dv_devx_free_uar(uar); + return true; + } + } + + return false; +} + +// Maps xlio_ibv_wr_opcode to real MLX5 opcode. +static inline uint32_t get_mlx5_opcode(xlio_ibv_wr_opcode verbs_opcode) +{ + switch (verbs_opcode) { + case XLIO_IBV_WR_SEND: + return MLX5_OPCODE_SEND; + case XLIO_IBV_WR_TSO: + return MLX5_OPCODE_TSO; + case XLIO_IBV_WR_NOP: + return MLX5_OPCODE_NOP; + default: + return MLX5_OPCODE_SEND; + } +} + +qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan, + bool call_configure) + : m_p_ring((ring_simple *)desc->ring) , m_port_num((uint8_t)desc->slave->port_num) , m_p_ib_ctx_handler((ib_ctx_handler *)desc->slave->p_ib_ctx) - , m_max_qp_wr(0) - , m_p_cq_mgr_rx(NULL) - , m_p_cq_mgr_tx(NULL) , m_rx_num_wr(safe_mce_sys().rx_num_wr) , m_tx_num_wr(tx_num_wr) - , m_hw_dummy_send_support(false) , m_n_sysvar_rx_num_wr_to_post_recv(safe_mce_sys().rx_num_wr_to_post_recv) , m_n_sysvar_tx_num_wr_to_signal(safe_mce_sys().tx_num_wr_to_signal) , m_n_sysvar_rx_prefetch_bytes_before_poll(safe_mce_sys().rx_prefetch_bytes_before_poll) - , m_curr_rx_wr(0) - , m_last_posted_rx_wr_id(0) - , m_n_unsignaled_count(0) - , m_p_prev_rx_desc_pushed(NULL) - , m_n_ip_id_base(0) - , m_n_ip_id_offset(0) , m_vlan(vlan) { - memset(&m_qp_cap, 0, sizeof(m_qp_cap)); - m_qp_cap.max_inline_data = safe_mce_sys().tx_max_inline; - m_qp_cap.max_send_sge = (m_p_ring->is_tso() ? m_p_ib_ctx_handler->get_ibv_device_attr()->max_sge - : MCE_DEFAULT_TX_NUM_SGE); - m_qp_cap.max_recv_sge = (m_p_ring->is_socketxtreme()) ? 1 : MCE_DEFAULT_RX_NUM_SGE; + qp_logfunc(""); + + memset(&m_mlx5_qp, 0, sizeof(m_mlx5_qp)); + + m_mlx5_qp.cap.max_inline_data = safe_mce_sys().tx_max_inline; + m_mlx5_qp.cap.max_send_sge = + (m_p_ring->is_tso() ? m_p_ib_ctx_handler->get_ibv_device_attr()->max_sge + : MCE_DEFAULT_TX_NUM_SGE); + m_mlx5_qp.cap.max_recv_sge = (m_p_ring->is_socketxtreme()) ? 
1 : MCE_DEFAULT_RX_NUM_SGE; m_ibv_rx_sg_array = new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv]; m_ibv_rx_wr_array = new ibv_recv_wr[m_n_sysvar_rx_num_wr_to_post_recv]; memset(&m_rate_limit, 0, sizeof(struct xlio_rate_limit_t)); - qp_logfunc(""); + // Check device capabilities for dummy send support + m_hw_dummy_send_support = xlio_is_nop_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); + + if (call_configure && configure(desc)) { + throw_xlio_exception("Failed creating qp_mgr"); + } + + m_db_method = + (is_bf(((ib_ctx_handler *)desc->slave->p_ib_ctx)->get_ibv_context()) ? MLX5_DB_METHOD_BF + : MLX5_DB_METHOD_DB); + + qp_logdbg("m_db_method=%d", m_db_method); } qp_mgr::~qp_mgr() { qp_logfunc(""); - qp_logdbg("calling ibv_destroy_qp(qp=%p)", m_qp); - if (m_qp) { - IF_VERBS_FAILURE_EX(ibv_destroy_qp(m_qp), EIO) + if (m_rq_wqe_idx_to_wrid) { + if (0 != munmap(m_rq_wqe_idx_to_wrid, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid))) { + qp_logerr("Failed deallocating memory with munmap m_rq_wqe_idx_to_wrid (errno=%d %m)", + errno); + } + m_rq_wqe_idx_to_wrid = nullptr; + } + if (m_sq_wqe_idx_to_prop) { + if (0 != munmap(m_sq_wqe_idx_to_prop, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop))) { + qp_logerr("Failed deallocating memory with munmap m_sq_wqe_idx_to_prop (errno=%d %m)", + errno); + } + m_sq_wqe_idx_to_prop = nullptr; + } + + destroy_tis_cache(); + + qp_logdbg("calling ibv_destroy_qp(qp=%p)", m_mlx5_qp.qp); + if (m_mlx5_qp.qp) { + IF_VERBS_FAILURE_EX(ibv_destroy_qp(m_mlx5_qp.qp), EIO) { qp_logdbg("QP destroy failure (errno = %d %m)", -errno); } ENDIF_VERBS_FAILURE; - VALGRIND_MAKE_MEM_UNDEFINED(m_qp, sizeof(ibv_qp)); + VALGRIND_MAKE_MEM_UNDEFINED(m_mlx5_qp.qp, sizeof(ibv_qp)); + m_mlx5_qp.qp = nullptr; } - m_qp = NULL; if (m_p_cq_mgr_tx) { delete m_p_cq_mgr_tx; - m_p_cq_mgr_tx = NULL; + m_p_cq_mgr_tx = nullptr; } if (m_p_cq_mgr_rx) { delete m_p_cq_mgr_rx; - m_p_cq_mgr_rx = NULL; + m_p_cq_mgr_rx = nullptr; } delete[] m_ibv_rx_sg_array; @@ -174,12 +265,12 @@ int qp_mgr::configure(struct qp_mgr_desc *desc) xlio_ibv_qp_init_attr qp_init_attr; memset(&qp_init_attr, 0, sizeof(qp_init_attr)); - // TODO: m_tx_num_wr and m_rx_num_wr should be part of m_qp_cap + // TODO: m_tx_num_wr and m_rx_num_wr should be part of m_mlx5_qp.cap // and assigned as a result of ibv_query_qp() - m_qp_cap.max_send_wr = m_tx_num_wr; - m_qp_cap.max_recv_wr = m_rx_num_wr; + m_mlx5_qp.cap.max_send_wr = m_tx_num_wr; + m_mlx5_qp.cap.max_recv_wr = m_rx_num_wr; - memcpy(&qp_init_attr.cap, &m_qp_cap, sizeof(qp_init_attr.cap)); + memcpy(&qp_init_attr.cap, &m_mlx5_qp.cap, sizeof(qp_init_attr.cap)); qp_init_attr.recv_cq = m_p_cq_mgr_rx->get_ibv_cq_hndl(); qp_init_attr.send_cq = m_p_cq_mgr_tx->get_ibv_cq_hndl(); qp_init_attr.sq_sig_all = 0; @@ -222,24 +313,26 @@ int qp_mgr::configure(struct qp_mgr_desc *desc) enum ibv_qp_attr_mask attr_mask = IBV_QP_CAP; struct ibv_qp_attr tmp_ibv_qp_attr; struct ibv_qp_init_attr tmp_ibv_qp_init_attr; - IF_VERBS_FAILURE(ibv_query_qp(m_qp, &tmp_ibv_qp_attr, attr_mask, &tmp_ibv_qp_init_attr)) + IF_VERBS_FAILURE(ibv_query_qp(m_mlx5_qp.qp, &tmp_ibv_qp_attr, attr_mask, &tmp_ibv_qp_init_attr)) { qp_logerr("ibv_query_qp failed (errno=%d %m)", errno); return -1; } ENDIF_VERBS_FAILURE; - m_qp_cap.max_send_wr = min(tmp_ibv_qp_attr.cap.max_send_wr, m_qp_cap.max_send_wr); - m_qp_cap.max_recv_wr = min(tmp_ibv_qp_attr.cap.max_recv_wr, m_qp_cap.max_recv_wr); - m_qp_cap.max_send_sge = min(tmp_ibv_qp_attr.cap.max_send_sge, m_qp_cap.max_send_sge); - m_qp_cap.max_recv_sge = min(tmp_ibv_qp_attr.cap.max_recv_sge, 
m_qp_cap.max_recv_sge); - m_qp_cap.max_inline_data = min(tmp_ibv_qp_attr.cap.max_inline_data, m_qp_cap.max_inline_data); + m_mlx5_qp.cap.max_send_wr = min(tmp_ibv_qp_attr.cap.max_send_wr, m_mlx5_qp.cap.max_send_wr); + m_mlx5_qp.cap.max_recv_wr = min(tmp_ibv_qp_attr.cap.max_recv_wr, m_mlx5_qp.cap.max_recv_wr); + m_mlx5_qp.cap.max_send_sge = min(tmp_ibv_qp_attr.cap.max_send_sge, m_mlx5_qp.cap.max_send_sge); + m_mlx5_qp.cap.max_recv_sge = min(tmp_ibv_qp_attr.cap.max_recv_sge, m_mlx5_qp.cap.max_recv_sge); + m_mlx5_qp.cap.max_inline_data = + min(tmp_ibv_qp_attr.cap.max_inline_data, m_mlx5_qp.cap.max_inline_data); qp_logdbg("Used QP (num=%d) " "wre: tx = %d rx = %d " "sge: tx = %d rx = %d " "inline: %d", - m_qp->qp_num, m_qp_cap.max_send_wr, m_qp_cap.max_recv_wr, m_qp_cap.max_send_sge, - m_qp_cap.max_recv_sge, m_qp_cap.max_inline_data); + m_mlx5_qp.qp->qp_num, m_mlx5_qp.cap.max_send_wr, m_mlx5_qp.cap.max_recv_wr, + m_mlx5_qp.cap.max_send_sge, m_mlx5_qp.cap.max_recv_sge, + m_mlx5_qp.cap.max_inline_data); #if defined(DEFINED_ROCE_LAG) if (desc->slave && desc->slave->lag_tx_port_affinity > 0) { @@ -252,11 +345,11 @@ int qp_mgr::configure(struct qp_mgr_desc *desc) qp_logdbg("QP ROCE LAG port: %d of %d", p_slave->lag_tx_port_affinity, attr_out.num_lag_ports); - if (!mlx5dv_modify_qp_lag_port(m_qp, p_slave->lag_tx_port_affinity)) { + if (!mlx5dv_modify_qp_lag_port(m_mlx5_qp.qp, p_slave->lag_tx_port_affinity)) { uint8_t current_port_num = 0; uint8_t active_port_num = 0; - if (!mlx5dv_query_qp_lag_port(m_qp, ¤t_port_num, &active_port_num)) { + if (!mlx5dv_query_qp_lag_port(m_mlx5_qp.qp, ¤t_port_num, &active_port_num)) { qp_logdbg("QP ROCE LAG port affinity: %d => %d", current_port_num, active_port_num); } @@ -281,8 +374,10 @@ int qp_mgr::configure(struct qp_mgr_desc *desc) void qp_mgr::up() { + init_qp(); + // Add buffers - qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_qp)); + qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_mlx5_qp.qp)); m_p_cq_mgr_tx->add_qp_tx(this); @@ -292,11 +387,17 @@ void qp_mgr::up() modify_qp_to_ready_state(); m_p_cq_mgr_rx->add_qp_rx(this); + + init_device_memory(); } void qp_mgr::down() { - qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_qp)); + if (m_dm_enabled) { + m_dm_mgr.release_resources(); + } + + qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_mlx5_qp.qp)); modify_qp_to_error_state(); // free buffers from current active resource iterator @@ -317,7 +418,7 @@ void qp_mgr::modify_qp_to_error_state() qp_logdbg(""); BULLSEYE_EXCLUDE_BLOCK_START - if (priv_ibv_modify_qp_to_err(m_qp)) { + if (priv_ibv_modify_qp_to_err(m_mlx5_qp.qp)) { qp_logdbg("ibv_modify_qp failure (errno = %d %m)", errno); } BULLSEYE_EXCLUDE_BLOCK_END @@ -374,7 +475,7 @@ void qp_mgr::release_tx_buffers() int ret; uint64_t poll_sn = 0; qp_logdbg("draining cq_mgr_tx %p", m_p_cq_mgr_tx); - while (m_p_cq_mgr_tx && m_qp && + while (m_p_cq_mgr_tx && m_mlx5_qp.qp && ((ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn)) > 0) && (errno != EIO && !m_p_ib_ctx_handler->is_removed())) { qp_logdbg("draining completed on cq_mgr_tx (%d wce)", ret); @@ -382,116 +483,6 @@ void qp_mgr::release_tx_buffers() NOT_IN_USE(ret); // Suppress --enable-opt-log=high warning } -void qp_mgr::trigger_completion_for_all_sent_packets() -{ - xlio_ibv_send_wr send_wr; - ibv_sge sge[1]; - - // Handle releasing of Tx buffers - // Single post send with SIGNAL of a dummy packet - - // NOTE: Since the QP is in ERROR state no packets will be sent on the wire! 
- // So we can post_send anything we want :) - - qp_logdbg("unsignaled count=%d", m_n_unsignaled_count); - if (!is_signal_requested_for_last_wqe()) { - qp_logdbg("Need to send closing tx wr..."); - mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(0, true, PBUF_RAM); - m_p_ring->m_missing_buf_ref_count--; // Align Tx buffer accounting since we will be - // bypassing the normal send calls - if (!p_mem_buf_desc) { - qp_logerr("no buffer in pool"); - return; - } - - // Prepare dummy packet: zeroed payload ('0000'). - // For ETH it replaces the MAC header!! (Nothing is going on the wire, QP in error state) - // For IB it replaces the IPoIB header. - - /* need to send at least eth+ip, since libmlx5 will drop just eth header */ - ethhdr *p_buffer_ethhdr = (ethhdr *)p_mem_buf_desc->p_buffer; - memset(p_buffer_ethhdr, 0, sizeof(*p_buffer_ethhdr)); - p_buffer_ethhdr->h_proto = htons(ETH_P_IP); - iphdr *p_buffer_iphdr = (iphdr *)(p_mem_buf_desc->p_buffer + sizeof(*p_buffer_ethhdr)); - memset(p_buffer_iphdr, 0, sizeof(*p_buffer_iphdr)); - sge[0].length = sizeof(ethhdr) + sizeof(iphdr); - sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer); - sge[0].lkey = m_p_ring->m_tx_lkey; - - // Prepare send wr for (does not care if it is UD/IB or RAW/ETH) - // UD requires AH+qkey, RAW requires minimal payload instead of MAC header. - - memset(&send_wr, 0, sizeof(send_wr)); - send_wr.wr_id = (uintptr_t)p_mem_buf_desc; - send_wr.sg_list = sge; - send_wr.num_sge = 1; - send_wr.next = NULL; - xlio_send_wr_opcode(send_wr) = XLIO_IBV_WR_SEND; - qp_logdbg("IBV_SEND_SIGNALED"); - - // Close the Tx unsignaled send list - set_unsignaled_count(); - - // We don't check for available space in SQ, because this is legacy code. - send_to_wire(&send_wr, - (xlio_wr_tx_packet_attr)(XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM), - true, NULL, 0); - } -} - -uint32_t qp_mgr::get_rx_max_wr_num() -{ - return m_rx_num_wr; -} - -void qp_mgr::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) -{ - if (m_n_sysvar_rx_prefetch_bytes_before_poll) { - if (m_p_prev_rx_desc_pushed) { - m_p_prev_rx_desc_pushed->p_prev_desc = p_mem_buf_desc; - } - m_p_prev_rx_desc_pushed = p_mem_buf_desc; - } - - m_ibv_rx_wr_array[m_curr_rx_wr].wr_id = (uintptr_t)p_mem_buf_desc; - m_ibv_rx_sg_array[m_curr_rx_wr].addr = (uintptr_t)p_mem_buf_desc->p_buffer; - m_ibv_rx_sg_array[m_curr_rx_wr].length = p_mem_buf_desc->sz_buffer; - m_ibv_rx_sg_array[m_curr_rx_wr].lkey = p_mem_buf_desc->lkey; - - if (m_curr_rx_wr == m_n_sysvar_rx_num_wr_to_post_recv - 1) { - - m_last_posted_rx_wr_id = (uintptr_t)p_mem_buf_desc; - - m_p_prev_rx_desc_pushed = NULL; - p_mem_buf_desc->p_prev_desc = NULL; - - m_curr_rx_wr = 0; - struct ibv_recv_wr *bad_wr = NULL; - IF_VERBS_FAILURE(ibv_post_recv(m_qp, &m_ibv_rx_wr_array[0], &bad_wr)) - { - uint32_t n_pos_bad_rx_wr = - ((uint8_t *)bad_wr - (uint8_t *)m_ibv_rx_wr_array) / sizeof(struct ibv_recv_wr); - qp_logerr("failed posting list (errno=%d %m)", errno); - qp_logerr("bad_wr is %d in submitted list (bad_wr=%p, m_ibv_rx_wr_array=%p, size=%zu)", - n_pos_bad_rx_wr, bad_wr, m_ibv_rx_wr_array, sizeof(struct ibv_recv_wr)); - qp_logerr("bad_wr info: wr_id=%#lx, next=%p, addr=%#lx, length=%d, lkey=%#x", - bad_wr[0].wr_id, bad_wr[0].next, bad_wr[0].sg_list[0].addr, - bad_wr[0].sg_list[0].length, bad_wr[0].sg_list[0].lkey); - qp_logerr("QP current state: %d", priv_ibv_query_qp_state(m_qp)); - - // Fix broken linked list of rx_wr - if (n_pos_bad_rx_wr != (m_n_sysvar_rx_num_wr_to_post_recv - 1)) { - m_ibv_rx_wr_array[n_pos_bad_rx_wr].next = 
&m_ibv_rx_wr_array[n_pos_bad_rx_wr + 1]; - } - throw; - } - ENDIF_VERBS_FAILURE; - qp_logfunc("Successful ibv_post_recv"); - } else { - m_curr_rx_wr++; - } -} - void qp_mgr::post_recv_buffers(descq_t *p_buffers, size_t count) { qp_logfuncall(""); @@ -501,44 +492,6 @@ void qp_mgr::post_recv_buffers(descq_t *p_buffers, size_t count) } } -inline int qp_mgr::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, - bool request_comp, xlio_tis *tis, unsigned credits) -{ - NOT_IN_USE(attr); - NOT_IN_USE(tis); - NOT_IN_USE(credits); - int ret = 0; - xlio_ibv_send_wr *bad_wr = NULL; - - if (request_comp) { - xlio_send_wr_send_flags(*p_send_wqe) = - (xlio_ibv_send_flags)(xlio_send_wr_send_flags(*p_send_wqe) | XLIO_IBV_SEND_SIGNALED); - } - - IF_VERBS_FAILURE(xlio_ibv_post_send(m_qp, p_send_wqe, &bad_wr)) - { - qp_logerr( - "failed post_send%s (errno=%d %m)\n", - ((xlio_send_wr_send_flags(*p_send_wqe) & XLIO_IBV_SEND_INLINE) ? "(+inline)" : ""), - errno); - if (bad_wr) { - qp_logerr("bad_wr info: wr_id=%#lx, send_flags=%#lx, addr=%#lx, length=%d, lkey=%#x, " - "max_inline_data=%d", - bad_wr->wr_id, (unsigned long)xlio_send_wr_send_flags(*bad_wr), - bad_wr->sg_list[0].addr, bad_wr->sg_list[0].length, bad_wr->sg_list[0].lkey, - get_max_inline_data()); - } - ret = -1; - } - ENDIF_VERBS_FAILURE; - - // Clear the SINGAL request - xlio_send_wr_send_flags(*p_send_wqe) = - (xlio_ibv_send_flags)(xlio_send_wr_send_flags(*p_send_wqe) & ~XLIO_IBV_SEND_SIGNALED); - - return ret; -} - int qp_mgr::send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, unsigned credits) { @@ -579,17 +532,17 @@ void qp_mgr::modify_qp_to_ready_state() { qp_logdbg(""); int ret = 0; - int qp_state = priv_ibv_query_qp_state(m_qp); + int qp_state = priv_ibv_query_qp_state(m_mlx5_qp.qp); if (qp_state != IBV_QPS_INIT) { BULLSEYE_EXCLUDE_BLOCK_START - if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_qp, m_port_num)) != 0) { + if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_mlx5_qp.qp, m_port_num)) != 0) { qp_logpanic("failed to modify QP from %d to RTS state (ret = %d)", qp_state, ret); } BULLSEYE_EXCLUDE_BLOCK_END } BULLSEYE_EXCLUDE_BLOCK_START - if ((ret = priv_ibv_modify_qp_from_init_to_rts(m_qp)) != 0) { + if ((ret = priv_ibv_modify_qp_from_init_to_rts(m_mlx5_qp.qp)) != 0) { qp_logpanic("failed to modify QP from INIT to RTS state (ret = %d)", ret); } @@ -609,15 +562,15 @@ int qp_mgr::prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr) qp_logdbg("create qp with max_tso_header = %d", m_p_ring->get_max_header_sz()); } - m_qp = xlio_ibv_create_qp(m_p_ib_ctx_handler->get_ibv_pd(), &qp_init_attr); + m_mlx5_qp.qp = xlio_ibv_create_qp(m_p_ib_ctx_handler->get_ibv_pd(), &qp_init_attr); BULLSEYE_EXCLUDE_BLOCK_START - if (!m_qp) { + if (!m_mlx5_qp.qp) { qp_logerr("ibv_create_qp failed (errno=%d %m)", errno); return -1; } - VALGRIND_MAKE_MEM_DEFINED(m_qp, sizeof(ibv_qp)); - if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_qp, m_port_num)) != 0) { + VALGRIND_MAKE_MEM_DEFINED(m_mlx5_qp.qp, sizeof(ibv_qp)); + if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_mlx5_qp.qp, m_port_num)) != 0) { qp_logerr("failed to modify QP from ERR to INIT state (ret = %d)", ret); return ret; } @@ -647,7 +600,7 @@ int qp_mgr::modify_qp_ratelimit(struct xlio_rate_limit_t &rate_limit, uint32_t r { int ret; - ret = priv_ibv_modify_qp_ratelimit(m_qp, rate_limit, rl_changes); + ret = priv_ibv_modify_qp_ratelimit(m_mlx5_qp.qp, rate_limit, rl_changes); if (ret) { qp_logdbg("failed to modify qp ratelimit ret %d (errno=%d 
%m)", ret, errno); return -1; @@ -672,3 +625,1292 @@ rfs_rule *qp_mgr::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext) return nullptr; } + +void qp_mgr::init_qp() +{ + if (0 != xlio_ib_mlx5_get_qp_tx(&m_mlx5_qp)) { + qp_logpanic("xlio_ib_mlx5_get_qp_tx failed (errno=%d %m)", errno); + } + + m_sq_wqes = (struct mlx5_eth_wqe(*)[])(uintptr_t)m_mlx5_qp.sq.buf; + m_sq_wqe_hot = &(*m_sq_wqes)[0]; + m_sq_wqes_end = + (uint8_t *)((uintptr_t)m_mlx5_qp.sq.buf + m_mlx5_qp.sq.wqe_cnt * m_mlx5_qp.sq.stride); + m_sq_wqe_counter = 0; + + m_sq_wqe_hot_index = 0; + + uint32_t old_wr_val = m_tx_num_wr; + m_tx_num_wr = (m_sq_wqes_end - (uint8_t *)m_sq_wqe_hot) / WQEBB; + + // We use the min between CQ size and the QP size (that might be increases by ibv creation). + m_sq_free_credits = std::min(m_tx_num_wr, old_wr_val); + + /* Maximum BF inlining consists of: + * - CTRL: + * - 1st WQEBB is mostly used for CTRL and ETH segment (where ETH header is inlined) + * - 4 bytes for size of inline data + * - DATA: + * - 1 OCTOWORD from 1st WQEBB is used for data inlining, except for + * the 4 bytes used for stating the inline data size + * - 3 WQEBB are fully availabie for data inlining + */ + m_mlx5_qp.cap.max_inline_data = OCTOWORD - 4 + 3 * WQEBB; + + if (m_sq_wqe_idx_to_prop == NULL) { + m_sq_wqe_idx_to_prop = + (sq_wqe_prop *)mmap(NULL, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop), + PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (m_sq_wqe_idx_to_prop == MAP_FAILED) { + qp_logerr("Failed allocating m_sq_wqe_idx_to_prop (errno=%d %m)", errno); + return; + } + m_sq_wqe_prop_last_signalled = m_tx_num_wr - 1; + m_sq_wqe_prop_last = NULL; + } + + qp_logfunc("m_tx_num_wr=%d max_inline_data: %d m_sq_wqe_idx_to_prop=%p", m_tx_num_wr, + get_max_inline_data(), m_sq_wqe_idx_to_prop); + + memset((void *)(uintptr_t)m_sq_wqe_hot, 0, sizeof(struct mlx5_eth_wqe)); + m_sq_wqe_hot->ctrl.data[0] = htonl(MLX5_OPCODE_SEND); + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | 4); + m_sq_wqe_hot->ctrl.data[2] = 0; + m_sq_wqe_hot->eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); + m_sq_wqe_hot->eseg.cs_flags = XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM; + + qp_logfunc("%p allocated for %d QPs sq_wqes:%p sq_wqes_end: %p and configured %d WRs " + "BlueFlame: %p buf_size: %d offset: %d", + m_mlx5_qp.qp, m_mlx5_qp.qpn, m_sq_wqes, m_sq_wqes_end, m_tx_num_wr, m_mlx5_qp.bf.reg, + m_mlx5_qp.bf.size, m_mlx5_qp.bf.offset); +} + +void qp_mgr::init_device_memory() +{ + /* This limitation is done because of a observation + * that dm_copy takes a lot of time on VMs w/o BF (RM:1542628) + */ + if (m_p_ib_ctx_handler->get_on_device_memory_size() > 0) { + if (m_db_method == MLX5_DB_METHOD_BF) { + m_dm_enabled = + m_dm_mgr.allocate_resources(m_p_ib_ctx_handler, m_p_ring->m_p_ring_stat.get()); + + } else { +#if defined(DEFINED_IBV_DM) + VLOG_PRINTF_ONCE_THEN_DEBUG( + VLOG_WARNING, + "Device Memory functionality is not used on devices w/o Blue Flame support\n"); +#endif /* DEFINED_IBV_DM */ + } + } +} + +#if defined(DEFINED_UTLS) +void qp_mgr::destroy_tis_cache(void) +{ + while (!m_tls_tis_cache.empty()) { + xlio_tis *tis = m_tls_tis_cache.back(); + m_tls_tis_cache.pop_back(); + delete tis; + } +} +#endif /* defined(DEFINED_UTLS) */ + +void qp_mgr::update_next_wqe_hot() +{ + // Preparing next WQE as Ethernet send WQE and index: + m_sq_wqe_hot = &(*m_sq_wqes)[m_sq_wqe_counter & (m_tx_num_wr - 1)]; + m_sq_wqe_hot_index = m_sq_wqe_counter & (m_tx_num_wr - 1); + memset(m_sq_wqe_hot, 0, sizeof(mlx5_eth_wqe)); 
+ + // Fill Ethernet segment with header inline: + struct mlx5_wqe_eth_seg *eth_seg = + (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg)); + eth_seg->inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); +} + +void qp_mgr::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) +{ + m_ibv_rx_sg_array[m_curr_rx_wr].addr = (uintptr_t)p_mem_buf_desc->p_buffer; + m_ibv_rx_sg_array[m_curr_rx_wr].length = p_mem_buf_desc->sz_buffer; + m_ibv_rx_sg_array[m_curr_rx_wr].lkey = p_mem_buf_desc->lkey; + + post_recv_buffer_rq(p_mem_buf_desc); +} + +void qp_mgr::post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc) +{ + if (m_n_sysvar_rx_prefetch_bytes_before_poll) { + if (m_p_prev_rx_desc_pushed) { + m_p_prev_rx_desc_pushed->p_prev_desc = p_mem_buf_desc; + } + m_p_prev_rx_desc_pushed = p_mem_buf_desc; + } + + m_ibv_rx_wr_array[m_curr_rx_wr].wr_id = (uintptr_t)p_mem_buf_desc; + + if (m_rq_wqe_idx_to_wrid) { + uint32_t index = m_rq_wqe_counter & (m_rx_num_wr - 1); + m_rq_wqe_idx_to_wrid[index] = (uintptr_t)p_mem_buf_desc; + ++m_rq_wqe_counter; + } + + if (m_curr_rx_wr == m_n_sysvar_rx_num_wr_to_post_recv - 1) { + + m_last_posted_rx_wr_id = (uintptr_t)p_mem_buf_desc; + + m_p_prev_rx_desc_pushed = NULL; + p_mem_buf_desc->p_prev_desc = NULL; + + m_curr_rx_wr = 0; + struct ibv_recv_wr *bad_wr = NULL; + IF_VERBS_FAILURE(xlio_ib_mlx5_post_recv(&m_mlx5_qp, &m_ibv_rx_wr_array[0], &bad_wr)) + { + uint32_t n_pos_bad_rx_wr = + ((uint8_t *)bad_wr - (uint8_t *)m_ibv_rx_wr_array) / sizeof(struct ibv_recv_wr); + qp_logerr("failed posting list (errno=%d %s)", errno, strerror(errno)); + qp_logerr("bad_wr is %d in submitted list (bad_wr=%p, m_ibv_rx_wr_array=%p, size=%zu)", + n_pos_bad_rx_wr, bad_wr, m_ibv_rx_wr_array, sizeof(struct ibv_recv_wr)); + qp_logerr("bad_wr info: wr_id=%#lx, next=%p, addr=%#lx, length=%d, lkey=%#x", + bad_wr[0].wr_id, bad_wr[0].next, bad_wr[0].sg_list[0].addr, + bad_wr[0].sg_list[0].length, bad_wr[0].sg_list[0].lkey); + qp_logerr("QP current state: %d", priv_ibv_query_qp_state(m_mlx5_qp.qp)); + + // Fix broken linked list of rx_wr + if (n_pos_bad_rx_wr != (m_n_sysvar_rx_num_wr_to_post_recv - 1)) { + m_ibv_rx_wr_array[n_pos_bad_rx_wr].next = &m_ibv_rx_wr_array[n_pos_bad_rx_wr + 1]; + } + throw; + } + ENDIF_VERBS_FAILURE; + qp_logfunc("Successful ibv_post_recv"); + } else { + m_curr_rx_wr++; + } +} + +bool qp_mgr::init_rx_cq_mgr_prepare() +{ + m_rx_num_wr = align32pow2(m_rx_num_wr); + + m_rq_wqe_idx_to_wrid = + (uint64_t *)mmap(NULL, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid), PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (m_rq_wqe_idx_to_wrid == MAP_FAILED) { + qp_logerr("Failed allocating m_rq_wqe_idx_to_wrid (errno=%d %m)", errno); + return false; + } + + return true; +} + +cq_mgr_rx *qp_mgr::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) +{ + return (!init_rx_cq_mgr_prepare() ? 
NULL + : new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, + m_rx_num_wr, p_rx_comp_event_channel)); +} + +cq_mgr_tx *qp_mgr::init_tx_cq_mgr() +{ + m_tx_num_wr = align32pow2(m_tx_num_wr); + return new cq_mgr_tx(m_p_ring, m_p_ib_ctx_handler, m_tx_num_wr, + m_p_ring->get_tx_comp_event_channel()); +} + +inline void qp_mgr::ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top, + bool skip_comp /*=false*/) +{ + uint64_t *dst = (uint64_t *)((uint8_t *)m_mlx5_qp.bf.reg + m_mlx5_qp.bf.offset); + uint64_t *src = reinterpret_cast(m_sq_wqe_hot); + struct xlio_mlx5_wqe_ctrl_seg *ctrl = reinterpret_cast(src); + + /* TODO Refactor m_n_unsignedled_count, is_completion_need(), set_unsignaled_count(): + * Some logic is hidden inside the methods and in one branch the field is changed directly. + */ + if (!skip_comp && is_completion_need()) { + ctrl->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; + } + if (ctrl->fm_ce_se & MLX5_WQE_CTRL_CQ_UPDATE) { + set_unsignaled_count(); + } else { + dec_unsignaled_count(); + } + if (unlikely(m_b_fence_needed)) { + ctrl->fm_ce_se |= MLX5_FENCE_MODE_INITIATOR_SMALL; + m_b_fence_needed = false; + } + + m_sq_wqe_counter = (m_sq_wqe_counter + num_wqebb + num_wqebb_top) & 0xFFFF; + + // Make sure that descriptors are written before + // updating doorbell record and ringing the doorbell + wmb(); + *m_mlx5_qp.sq.dbrec = htonl(m_sq_wqe_counter); + + // This wc_wmb ensures ordering between DB record and BF copy + wc_wmb(); + if (likely(db_method == MLX5_DB_METHOD_BF)) { + /* Copying src to BlueFlame register buffer by Write Combining cnt WQEBBs + * Avoid using memcpy() to copy to BlueFlame page, since memcpy() + * implementations may use move-string-buffer assembler instructions, + * which do not guarantee order of copying. + */ + while (num_wqebb--) { + COPY_64B_NT(dst, src); + } + src = (uint64_t *)m_sq_wqes; + while (num_wqebb_top--) { + COPY_64B_NT(dst, src); + } + } else { + *dst = *src; + } + + /* Use wc_wmb() to ensure write combining buffers are flushed out + * of the running CPU. + * sfence instruction affects only the WC buffers of the CPU that executes it + */ + wc_wmb(); + m_mlx5_qp.bf.offset ^= m_mlx5_qp.bf.size; +} + +inline int qp_mgr::fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, + int max_inline_len, int inline_len) +{ + int wqe_inline_size = 0; + while ((data_addr != NULL) && inline_len) { + dbg_dump_wqe((uint32_t *)data_addr, inline_len); + memcpy(cur_seg, data_addr, inline_len); + wqe_inline_size += inline_len; + cur_seg += inline_len; + inline_len = max_inline_len - wqe_inline_size; + data_addr = sga.get_data(&inline_len); + qp_logfunc("data_addr:%p cur_seg: %p inline_len: %d wqe_inline_size: %d", data_addr, + cur_seg, inline_len, wqe_inline_size); + } + return wqe_inline_size; +} + +//! 
Fill WQE dynamically, based on amount of free WQEBB in SQ +inline int qp_mgr::fill_wqe(xlio_ibv_send_wr *pswr) +{ + // control segment is mostly filled by preset after previous packet + // we always inline ETH header + sg_array sga(pswr->sg_list, pswr->num_sge); + uint8_t *cur_seg = (uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg); + int inline_len = MLX5_ETH_INLINE_HEADER_SIZE; + int data_len = sga.length(); + int wqe_size = sizeof(struct mlx5_wqe_ctrl_seg) / OCTOWORD; + int max_inline_len = get_max_inline_data(); + + // assume packet is full inline + if (likely(data_len <= max_inline_len && xlio_send_wr_opcode(*pswr) == XLIO_IBV_WR_SEND)) { + uint8_t *data_addr = sga.get_data(&inline_len); // data for inlining in ETH header + data_len -= inline_len; + qp_logfunc( + "wqe_hot:%p num_sge: %d data_addr: %p data_len: %d max_inline_len: %d inline_len: %d", + m_sq_wqe_hot, pswr->num_sge, data_addr, data_len, max_inline_len, inline_len); + + // Fill Ethernet segment with header inline, static data + // were populated in preset after previous packet send + memcpy(cur_seg + offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start), data_addr, + MLX5_ETH_INLINE_HEADER_SIZE); + cur_seg += sizeof(struct mlx5_wqe_eth_seg); + wqe_size += sizeof(struct mlx5_wqe_eth_seg) / OCTOWORD; + + max_inline_len = data_len; + // Filling inline data segment + // size of BlueFlame buffer is 4*WQEBBs, 3*OCTOWORDS of the first + // was allocated for control and ethernet segment so we have 3*WQEBB+16-4 + int rest_space = std::min((int)(m_sq_wqes_end - cur_seg - 4), (3 * WQEBB + OCTOWORD - 4)); + // Filling till the end of inline WQE segment or + // to end of WQEs + if (likely(max_inline_len <= rest_space)) { + inline_len = max_inline_len; + qp_logfunc("data_addr:%p cur_seg: %p rest_space: %d inline_len: %d wqe_size: %d", + data_addr, cur_seg, rest_space, inline_len, wqe_size); + // bypass inline size and fill inline data segment + data_addr = sga.get_data(&inline_len); + inline_len = fill_inl_segment(sga, cur_seg + 4, data_addr, max_inline_len, inline_len); + + // store inline data size and mark the data as inlined + *(uint32_t *)((uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_eth_seg)) = htonl(0x80000000 | inline_len); + rest_space = align_to_octoword_up(inline_len + 4); // align to OCTOWORDs + wqe_size += rest_space / OCTOWORD; + // assert((data_len-inline_len)==0); + // configuring control + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + rest_space = align_to_WQEBB_up(wqe_size) / 4; + qp_logfunc("data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", data_len - inline_len, + inline_len, wqe_size, rest_space); + ring_doorbell(m_db_method, rest_space); + return rest_space; + } else { + // wrap around case, first filling till the end of m_sq_wqes + int wrap_up_size = max_inline_len - rest_space; + inline_len = rest_space; + qp_logfunc("WRAP_UP_SIZE: %d data_addr:%p cur_seg: %p rest_space: %d inline_len: %d " + "wqe_size: %d", + wrap_up_size, data_addr, cur_seg, rest_space, inline_len, wqe_size); + + data_addr = sga.get_data(&inline_len); + inline_len = fill_inl_segment(sga, cur_seg + 4, data_addr, rest_space, inline_len); + data_len -= inline_len; + rest_space = align_to_octoword_up(inline_len + 4); + wqe_size += rest_space / OCTOWORD; + rest_space = + align_to_WQEBB_up(rest_space / OCTOWORD) / 4; // size of 1st chunk at the end + + qp_logfunc( + "END chunk data_addr: %p data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", + data_addr, data_len, 
inline_len, wqe_size, rest_space); + // Wrap around + // + cur_seg = (uint8_t *)m_sq_wqes; + data_addr = sga.get_data(&wrap_up_size); + + wrap_up_size = fill_inl_segment(sga, cur_seg, data_addr, data_len, wrap_up_size); + inline_len += wrap_up_size; + max_inline_len = align_to_octoword_up(wrap_up_size); + wqe_size += max_inline_len / OCTOWORD; + max_inline_len = align_to_WQEBB_up(max_inline_len / OCTOWORD) / 4; + // store inline data size + *(uint32_t *)((uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_eth_seg)) = htonl(0x80000000 | inline_len); + qp_logfunc("BEGIN_CHUNK data_addr: %p data_len: %d wqe_size: %d inline_len: %d " + "end_wqebbs: %d wqebbs: %d", + data_addr, data_len - wrap_up_size, wqe_size, inline_len + wrap_up_size, + rest_space, max_inline_len); + // assert((data_len-wrap_up_size)==0); + // configuring control + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + + dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, rest_space * 4 * 16); + dbg_dump_wqe((uint32_t *)m_sq_wqes, max_inline_len * 4 * 16); + + ring_doorbell(m_db_method, rest_space, max_inline_len); + return rest_space + max_inline_len; + } + } else { + if (xlio_send_wr_opcode(*pswr) == XLIO_IBV_WR_SEND) { + /* data is bigger than max to inline we inlined only ETH header + uint from IP (18 + * bytes) the rest will be in data pointer segment adding data seg with pointer if there + * still data to transfer + */ + wqe_size = fill_wqe_send(pswr); + return wqe_size; + } else { + /* Support XLIO_IBV_WR_SEND_TSO operation + */ + wqe_size = fill_wqe_lso(pswr); + return wqe_size; + } + } + return 1; +} + +inline int qp_mgr::fill_wqe_send(xlio_ibv_send_wr *pswr) +{ + struct mlx5_wqe_eth_seg *eseg; + struct mlx5_wqe_data_seg *dseg; + int wqe_size = sizeof(mlx5_wqe_ctrl_seg) / OCTOWORD; + + eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(mlx5_wqe_ctrl_seg)); + eseg->inline_hdr_sz = 0; + + /* Unlike Linux kernel, rdma-core defines mlx5_wqe_eth_seg as 32 bytes, because it contains + * 18 bytes of inline header. We don't want to inline partial header to avoid an extra copy + * and code complication. Therefore, we cannot rely on the structure definition and need to + * hardcode 16 bytes here. + */ + wqe_size += 1; + dseg = (struct mlx5_wqe_data_seg *)((uintptr_t)eseg + OCTOWORD); + + for (int i = 0; i < pswr->num_sge; ++i) { + if (unlikely((uintptr_t)dseg >= (uintptr_t)m_sq_wqes_end)) { + dseg = (struct mlx5_wqe_data_seg *)m_sq_wqes; + } + if (likely(pswr->sg_list[i].length)) { + dseg->byte_count = htonl(pswr->sg_list[i].length); + /* Try to copy data to On Device Memory in first */ + if (!(m_dm_enabled && + m_dm_mgr.copy_data(dseg, (uint8_t *)((uintptr_t)pswr->sg_list[i].addr), + pswr->sg_list[i].length, (mem_buf_desc_t *)pswr->wr_id))) { + dseg->lkey = htonl(pswr->sg_list[i].lkey); + dseg->addr = htonll((uintptr_t)pswr->sg_list[i].addr); + } + ++dseg; + wqe_size += sizeof(struct mlx5_wqe_data_seg) / OCTOWORD; + } + } + + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + int wqebbs = align_to_WQEBB_up(wqe_size) / 4; + /* TODO FIXME Split into top and bottom parts */ + ring_doorbell(m_db_method, wqebbs); + + return wqebbs; +} + +//! 
Filling wqe for LSO +inline int qp_mgr::fill_wqe_lso(xlio_ibv_send_wr *pswr) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; + struct mlx5_wqe_eth_seg *eseg = NULL; + struct mlx5_wqe_data_seg *dpseg = NULL; + uint8_t *cur_seg = NULL; + uint8_t *p_hdr = (uint8_t *)pswr->tso.hdr; + int inl_hdr_size = pswr->tso.hdr_sz; + int inl_hdr_copy_size = 0; + int max_inline_len = align_to_octoword_up(sizeof(struct mlx5_wqe_eth_seg) + inl_hdr_size - + MLX5_ETH_INLINE_HEADER_SIZE); + int wqe_size = sizeof(struct mlx5_wqe_ctrl_seg) / OCTOWORD; + int rest = 0; + int i = 0; + + ctrl = (struct mlx5_wqe_ctrl_seg *)m_sq_wqe_hot; + + /* Do usual send operation in case payload less than mss */ + if (0 == pswr->tso.mss) { + ctrl->opmod_idx_opcode = + htonl(((m_sq_wqe_counter & 0xffff) << 8) | (get_mlx5_opcode(XLIO_IBV_WR_SEND) & 0xff)); + } + + eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(*ctrl)); + eseg->mss = htons(pswr->tso.mss); + eseg->inline_hdr_sz = htons(inl_hdr_size); + + rest = (int)((uintptr_t)(void *)m_sq_wqes_end - (uintptr_t)(void *)eseg); + cur_seg = (uint8_t *)eseg; + + if (likely(max_inline_len <= rest)) { + // Fill Ethernet segment with full header inline + inl_hdr_copy_size = inl_hdr_size; + memcpy(eseg->inline_hdr_start, p_hdr, inl_hdr_copy_size); + cur_seg += max_inline_len; + } else { + // wrap around SQ on inline ethernet header + inl_hdr_copy_size = rest - offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start); + memcpy(eseg->inline_hdr_start, p_hdr, inl_hdr_copy_size); + p_hdr += inl_hdr_copy_size; + inl_hdr_copy_size = inl_hdr_size - inl_hdr_copy_size; + memcpy(m_sq_wqes, p_hdr, inl_hdr_copy_size); + max_inline_len = align_to_octoword_up(inl_hdr_copy_size); + cur_seg = (uint8_t *)m_sq_wqes + max_inline_len; + wqe_size += rest / OCTOWORD; + inl_hdr_copy_size = align_to_WQEBB_up(wqe_size) / 4; + } + wqe_size += max_inline_len / OCTOWORD; + qp_logfunc("TSO: num_sge: %d max_inline_len: %d inl_hdr_size: %d rest: %d", pswr->num_sge, + max_inline_len, inl_hdr_size, rest); + // Filling data pointer segments with payload by scatter-gather list elements + dpseg = (struct mlx5_wqe_data_seg *)cur_seg; + for (i = 0; i < pswr->num_sge; i++) { + if (unlikely((uintptr_t)dpseg >= (uintptr_t)m_sq_wqes_end)) { + dpseg = (struct mlx5_wqe_data_seg *)m_sq_wqes; + inl_hdr_copy_size = align_to_WQEBB_up(wqe_size) / 4; + } + dpseg->addr = htonll((uint64_t)pswr->sg_list[i].addr); + dpseg->lkey = htonl(pswr->sg_list[i].lkey); + dpseg->byte_count = htonl(pswr->sg_list[i].length); + + qp_logfunc("DATA_SEG: addr:%llx len: %d lkey: %x dp_seg: %p wqe_size: %d", + pswr->sg_list[i].addr, pswr->sg_list[i].length, dpseg->lkey, dpseg, wqe_size); + + dpseg++; + wqe_size += sizeof(struct mlx5_wqe_data_seg) / OCTOWORD; + } + inl_hdr_size = align_to_WQEBB_up(wqe_size) / 4; + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + + // sending by BlueFlame or DoorBell covering wrap around + // TODO Make a single doorbell call + if (likely(inl_hdr_size <= 4)) { + if (likely(inl_hdr_copy_size == 0)) { + ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_size); + } else { + ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_copy_size, inl_hdr_size - inl_hdr_copy_size); + } + } else { + ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_size); + } + return align_to_WQEBB_up(wqe_size) / 4; +} + +void qp_mgr::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, xlio_ti *ti) +{ + m_sq_wqe_idx_to_prop[m_sq_wqe_hot_index] = sq_wqe_prop { + .buf = buf, + .credits = credits, + .ti = ti, + .next = m_sq_wqe_prop_last, + }; + 
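[Editor's note, illustrative only; not part of the patch] store_current_wqe_prop() records, per SQ producer index, everything needed to release buffers and SQ credits once a signalled TX completion arrives. A simplified, self-contained sketch of this bookkeeping pattern; buf_desc and sq_bookkeeping are hypothetical stand-ins for mem_buf_desc_t and the qp_mgr members:

#include <vector>

struct buf_desc; // hypothetical stand-in for mem_buf_desc_t

struct tx_wqe_prop {
    buf_desc *buf;     // buffer to return to the pool on completion
    unsigned credits;  // SQ credits (WQEBBs) consumed by this WQE
    tx_wqe_prop *next; // newest-to-oldest chain, like m_sq_wqe_prop_last
};

class sq_bookkeeping {
public:
    explicit sq_bookkeeping(unsigned sq_size) : m_props(sq_size) {}

    // Mirror of the pattern above: store the properties at the hot index and
    // remember the most recently posted WQE.
    void store(unsigned hot_index, buf_desc *buf, unsigned credits)
    {
        m_props[hot_index] = tx_wqe_prop {buf, credits, m_last};
        m_last = &m_props[hot_index];
    }

private:
    std::vector<tx_wqe_prop> m_props;
    tx_wqe_prop *m_last = nullptr;
};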
m_sq_wqe_prop_last = &m_sq_wqe_idx_to_prop[m_sq_wqe_hot_index]; + if (ti != NULL) { + ti->get(); + } +} + +//! Send one RAW packet by MLX5 BlueFlame +// +int qp_mgr::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, + bool request_comp, xlio_tis *tis, unsigned credits) +{ + struct xlio_mlx5_wqe_ctrl_seg *ctrl = NULL; + struct mlx5_wqe_eth_seg *eseg = NULL; + uint32_t tisn = tis ? tis->get_tisn() : 0; + + ctrl = (struct xlio_mlx5_wqe_ctrl_seg *)m_sq_wqe_hot; + eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(*ctrl)); + + /* Configure ctrl segment + * qpn_ds or ctrl.data[1] is set inside fill_wqe() + */ + ctrl->opmod_idx_opcode = htonl(((m_sq_wqe_counter & 0xffff) << 8) | + (get_mlx5_opcode(xlio_send_wr_opcode(*p_send_wqe)) & 0xff)); + m_sq_wqe_hot->ctrl.data[2] = 0; + ctrl->fm_ce_se = (request_comp ? (uint8_t)MLX5_WQE_CTRL_CQ_UPDATE : 0); + ctrl->tis_tir_num = htobe32(tisn << 8); + + /* Configure eth segment + * reset rsvd0, cs_flags, rsvd1, mss and rsvd2 fields + * checksum flags are set here + */ + *((uint64_t *)eseg) = 0; + eseg->rsvd2 = 0; + eseg->cs_flags = (uint8_t)(attr & (XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM) & 0xff); + + /* Store buffer descriptor */ + store_current_wqe_prop(reinterpret_cast(p_send_wqe->wr_id), credits, tis); + + /* Complete WQE */ + int wqebbs = fill_wqe(p_send_wqe); + assert(wqebbs > 0 && (unsigned)wqebbs <= credits); + NOT_IN_USE(wqebbs); + + update_next_wqe_hot(); + + qp_logfunc( + "m_sq_wqe_hot: %p m_sq_wqe_hot_index: %d wqe_counter: %d new_hot_index: %d wr_id: %llx", + m_sq_wqe_hot, m_sq_wqe_hot_index, m_sq_wqe_counter, (m_sq_wqe_counter & (m_tx_num_wr - 1)), + p_send_wqe->wr_id); + + return 0; +} + +#ifdef DEFINED_UTLS + +std::unique_ptr qp_mgr::get_new_tls_dek(const void *key, uint32_t key_size_bytes) +{ + dpcp::tls_dek *_dek = nullptr; + dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); + if (likely(adapter)) { + dpcp::status status; + struct dpcp::dek_attr dek_attr; + memset(&dek_attr, 0, sizeof(dek_attr)); + dek_attr.key_blob = (void *)key; + dek_attr.key_blob_size = key_size_bytes; + dek_attr.key_size = key_size_bytes; + dek_attr.pd_id = adapter->get_pd(); + status = adapter->create_tls_dek(dek_attr, _dek); + if (unlikely(status != dpcp::DPCP_OK)) { + qp_logwarn("Failed to create new DEK, status: %d", status); + if (_dek) { + delete _dek; + _dek = nullptr; + } + } + } + + return std::unique_ptr(_dek); +} + +std::unique_ptr qp_mgr::get_tls_dek(const void *key, uint32_t key_size_bytes) +{ + dpcp::status status; + dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); + + if (unlikely(!adapter)) { + return std::unique_ptr(nullptr); + } + + // If the amount of available DEKs in m_dek_put_cache is smaller than + // low-watermark we continue to create new DEKs. This is to avoid situations + // where one DEKs is returned and then fetched in a throttlling manner + // causing too frequent crypto-sync. + // It is also possible that crypto-sync may have higher impact with higher number + // of active connections. + if (unlikely(!m_p_ring->tls_sync_dek_supported()) || + (unlikely(m_tls_dek_get_cache.empty()) && + (m_tls_dek_put_cache.size() <= safe_mce_sys().utls_low_wmark_dek_cache_size))) { + return get_new_tls_dek(key, key_size_bytes); + } + + if (unlikely(m_tls_dek_get_cache.empty())) { + qp_logdbg("Empty DEK get cache. Swapping caches and do Sync-Crypto. 
Put-Cache size: %zu", + m_tls_dek_put_cache.size()); + + status = adapter->sync_crypto_tls(); + if (unlikely(status != dpcp::DPCP_OK)) { + qp_logwarn("Failed to flush DEK HW cache, status: %d", status); + return get_new_tls_dek(key, key_size_bytes); + } + + m_tls_dek_get_cache.swap(m_tls_dek_put_cache); + } + + std::unique_ptr out_dek(std::move(m_tls_dek_get_cache.front())); + m_tls_dek_get_cache.pop_front(); + + struct dpcp::dek_attr dek_attr; + memset(&dek_attr, 0, sizeof(dek_attr)); + dek_attr.key_blob = const_cast(key); + dek_attr.key_blob_size = key_size_bytes; + dek_attr.key_size = key_size_bytes; + dek_attr.pd_id = adapter->get_pd(); + status = out_dek->modify(dek_attr); + if (unlikely(status != dpcp::DPCP_OK)) { + qp_logwarn("Failed to modify DEK, status: %d", status); + out_dek.reset(nullptr); + } + + return out_dek; +} + +void qp_mgr::put_tls_dek(std::unique_ptr &&tls_dek_obj) +{ + if (tls_dek_obj == nullptr) { + return; + } + // We don't allow unlimited DEK cache to avoid system DEK starvation. + if (likely(m_p_ring->tls_sync_dek_supported()) && + m_tls_dek_put_cache.size() < safe_mce_sys().utls_high_wmark_dek_cache_size) { + m_tls_dek_put_cache.emplace_back(std::forward>(tls_dek_obj)); + } +} + +xlio_tis *qp_mgr::tls_context_setup_tx(const xlio_tls_info *info) +{ + std::unique_ptr tis; + if (m_tls_tis_cache.empty()) { + tis = create_tis(DPCP_TIS_FLAGS | dpcp::TIS_ATTR_TLS); + if (unlikely(tis == nullptr)) { + return nullptr; + } + } else { + tis.reset(m_tls_tis_cache.back()); + m_tls_tis_cache.pop_back(); + } + + auto dek_obj = get_tls_dek(info->key, info->key_len); + if (unlikely(!dek_obj)) { + m_tls_tis_cache.push_back(tis.release()); + return nullptr; + } + + tis->assign_dek(std::move(dek_obj)); + uint32_t tisn = tis->get_tisn(); + + tls_post_static_params_wqe(tis.get(), info, tisn, tis->get_dek_id(), 0, false, true); + tls_post_progress_params_wqe(tis.get(), tisn, 0, false, true); + /* The 1st post after TLS configuration must be with fence. 
*/ + m_b_fence_needed = true; + + assert(!tis->m_released); + + return tis.release(); +} + +void qp_mgr::tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) +{ + uint32_t tisn = tis->get_tisn(); + + if (!skip_static) { + tls_post_static_params_wqe(tis, info, tisn, tis->get_dek_id(), 0, true, true); + } + tls_post_progress_params_wqe(tis, tisn, 0, skip_static, true); + m_b_fence_needed = true; +} + +xlio_tir *qp_mgr::tls_create_tir(bool cached) +{ + xlio_tir *tir = NULL; + + if (cached && !m_tls_tir_cache.empty()) { + tir = m_tls_tir_cache.back(); + m_tls_tir_cache.pop_back(); + } else if (!cached) { + dpcp::tir *_tir = create_tir(true); + + if (_tir != NULL) { + tir = new xlio_tir(_tir, xlio_ti::ti_type::TLS_TIR); + } + if (unlikely(tir == NULL && _tir != NULL)) { + delete _tir; + } + } + return tir; +} + +int qp_mgr::tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, + uint32_t next_record_tcp_sn, xlio_comp_cb_t callback, + void *callback_arg) +{ + uint32_t tirn; + dpcp::tls_dek *_dek; + dpcp::status status; + dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); + struct dpcp::dek_attr dek_attr; + + memset(&dek_attr, 0, sizeof(dek_attr)); + dek_attr.key_blob = (void *)info->key; + dek_attr.key_blob_size = info->key_len; + dek_attr.key_size = info->key_len; + dek_attr.pd_id = adapter->get_pd(); + status = adapter->create_tls_dek(dek_attr, _dek); + if (unlikely(status != dpcp::DPCP_OK)) { + qp_logerr("Failed to create DEK, status: %d", status); + return -1; + } + tir->assign_dek(_dek); + tir->assign_callback(callback, callback_arg); + tirn = tir->get_tirn(); + + tls_post_static_params_wqe(NULL, info, tirn, _dek->get_key_id(), 0, false, false); + tls_post_progress_params_wqe(tir, tirn, next_record_tcp_sn, false, false); + + assert(!tir->m_released); + + return 0; +} + +void qp_mgr::tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn) +{ + tls_post_static_params_wqe(tir, info, tir->get_tirn(), tir->get_dek_id(), hw_resync_tcp_sn, + false, false); +} + +void qp_mgr::tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) +{ + /* Address must be aligned by 64. */ + assert((uintptr_t)buf == ((uintptr_t)buf >> 6U << 6U)); + + tls_get_progress_params_wqe(tir, tir->get_tirn(), buf, lkey); +} + +inline void qp_mgr::tls_fill_static_params_wqe(struct mlx5_wqe_tls_static_params_seg *params, + const struct xlio_tls_info *info, uint32_t key_id, + uint32_t resync_tcp_sn) +{ + unsigned char *initial_rn, *iv; + uint8_t tls_version; + uint8_t *ctx; + + ctx = params->ctx; + + iv = DEVX_ADDR_OF(tls_static_params, ctx, gcm_iv); + initial_rn = DEVX_ADDR_OF(tls_static_params, ctx, initial_record_number); + + memcpy(iv, info->salt, TLS_AES_GCM_SALT_LEN); + memcpy(initial_rn, info->rec_seq, TLS_AES_GCM_REC_SEQ_LEN); + if (info->tls_version == TLS_1_3_VERSION) { + iv = DEVX_ADDR_OF(tls_static_params, ctx, implicit_iv); + memcpy(iv, info->iv, TLS_AES_GCM_IV_LEN); + } + + tls_version = (info->tls_version == TLS_1_2_VERSION) ? 
MLX5E_STATIC_PARAMS_CONTEXT_TLS_1_2 + : MLX5E_STATIC_PARAMS_CONTEXT_TLS_1_3; + + DEVX_SET(tls_static_params, ctx, tls_version, tls_version); + DEVX_SET(tls_static_params, ctx, const_1, 1); + DEVX_SET(tls_static_params, ctx, const_2, 2); + DEVX_SET(tls_static_params, ctx, encryption_standard, MLX5E_ENCRYPTION_STANDARD_TLS); + DEVX_SET(tls_static_params, ctx, resync_tcp_sn, resync_tcp_sn); + DEVX_SET(tls_static_params, ctx, dek_index, key_id); +} + +inline void qp_mgr::tls_post_static_params_wqe(xlio_ti *ti, const struct xlio_tls_info *info, + uint32_t tis_tir_number, uint32_t key_id, + uint32_t resync_tcp_sn, bool fence, bool is_tx) +{ + struct mlx5_set_tls_static_params_wqe *wqe = + reinterpret_cast(m_sq_wqe_hot); + struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; + xlio_mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl; + struct mlx5_mkey_seg *mkcseg = &wqe->mkc; + struct mlx5_wqe_tls_static_params_seg *tspseg = &wqe->params; + uint8_t opmod = is_tx ? MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS : MLX5_OPC_MOD_TLS_TIR_STATIC_PARAMS; + +#define STATIC_PARAMS_DS_CNT DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS) + + /* + * SQ wrap around handling information + * + * UMR WQE has the size of 3 WQEBBs. + * The following are segments sizes the WQE contains. + * + * UMR WQE segments sizes: + * sizeof(wqe->ctrl) = 16[B] + * sizeof(wqe->uctrl) = 48[B] + * sizeof(wqe->mkc) = 64[B] + * sizeof(wqe->params) = 64[B] + * + * UMR WQEBBs to segments mapping: + * WQEBB1: [wqe->ctrl(16[B]), wqe->uctrl(48[B])] -> 64[B] + * WQEBB2: [wqe->mkc(64[B])] -> 64[B] + * WQEBB3: [wqe->params(64[B])] -> 64[B] + * + * There are 3 cases: + * 1. There is enough room in the SQ for 3 WQEBBs: + * 3 WQEBBs posted from m_sq_wqe_hot current location. + * 2. There is enough room in the SQ for 2 WQEBBs: + * 2 WQEBBs posted from m_sq_wqe_hot current location till m_sq_wqes_end. + * 1 WQEBB posted from m_sq_wqes beginning. + * 3. There is enough room in the SQ for 1 WQEBB: + * 1 WQEBB posted from m_sq_wqe_hot current location till m_sq_wqes_end. + * 2 WQEBBs posted from m_sq_wqes beginning. + * The case of 0 WQEBBs room left in the SQ shouldn't happen, m_sq_wqe_hot wrap around handling + * done when setting next m_sq_wqe_hot. + * + * In all the 3 cases, no need to change cseg and ucseg pointers, since they fit to + * one WQEBB and will be posted before m_sq_wqes_end. + */ + + // XXX: We set inline_hdr_sz for every new hot wqe. This corrupts UMR WQE without memset(). + memset(m_sq_wqe_hot, 0, sizeof(*m_sq_wqe_hot)); + cseg->opmod_idx_opcode = + htobe32(((m_sq_wqe_counter & 0xffff) << 8) | MLX5_OPCODE_UMR | (opmod << 24)); + cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | STATIC_PARAMS_DS_CNT); + cseg->fm_ce_se = fence ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0; + cseg->tis_tir_num = htobe32(tis_tir_number << 8); + + ucseg->flags = MLX5_UMR_INLINE; + ucseg->bsf_octowords = htobe16(DEVX_ST_SZ_BYTES(tls_static_params) / 16); + + int num_wqebbs = TLS_SET_STATIC_PARAMS_WQEBBS; + int num_wqebbs_top = 0; + int sq_wqebbs_room_left = + (static_cast(m_sq_wqes_end - reinterpret_cast(cseg)) / MLX5_SEND_WQE_BB); + + /* Case 1: + * In this case we don't need to change + * the pointers of the different segments, because there is enough room in the SQ. + * Thus, no need to do special handling. 
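 *
 * [Editor's illustration, not in the original patch] With MLX5_SEND_WQE_BB
 * typically 64 bytes, sq_wqebbs_room_left == (m_sq_wqes_end - (uint8_t *)cseg) /
 * MLX5_SEND_WQE_BB. For example, when the hot WQE starts 128 bytes before
 * m_sq_wqes_end, room_left is 2 and only the 64-byte params segment wraps to
 * m_sq_wqes (case 2); when it starts 64 bytes before the end, room_left is 1
 * and both the mkey and params segments wrap (case 3).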
+ */ + + if (unlikely(sq_wqebbs_room_left == 2)) { // Case 2: Change tspseg pointer: + tspseg = reinterpret_cast(m_sq_wqes); + num_wqebbs = 2; + num_wqebbs_top = 1; + } else if (unlikely(sq_wqebbs_room_left == 1)) { // Case 3: Change mkcseg and tspseg pointers: + mkcseg = reinterpret_cast(m_sq_wqes); + tspseg = reinterpret_cast( + reinterpret_cast(m_sq_wqes) + sizeof(*mkcseg)); + num_wqebbs = 1; + num_wqebbs_top = 2; + } + + memset(mkcseg, 0, sizeof(*mkcseg)); + memset(tspseg, 0, sizeof(*tspseg)); + + tls_fill_static_params_wqe(tspseg, info, key_id, resync_tcp_sn); + store_current_wqe_prop(nullptr, SQ_CREDITS_UMR, ti); + + ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs, num_wqebbs_top, true); + dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, sizeof(mlx5_set_tls_static_params_wqe)); + + update_next_wqe_hot(); +} + +inline void qp_mgr::tls_fill_progress_params_wqe(struct mlx5_wqe_tls_progress_params_seg *params, + uint32_t tis_tir_number, + uint32_t next_record_tcp_sn) +{ + uint8_t *ctx = params->ctx; + + params->tis_tir_num = htobe32(tis_tir_number); + + DEVX_SET(tls_progress_params, ctx, next_record_tcp_sn, next_record_tcp_sn); + DEVX_SET(tls_progress_params, ctx, record_tracker_state, + MLX5E_TLS_PROGRESS_PARAMS_RECORD_TRACKER_STATE_START); + DEVX_SET(tls_progress_params, ctx, auth_state, MLX5E_TLS_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD); +} + +inline void qp_mgr::tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_tir_number, + uint32_t next_record_tcp_sn, bool fence, + bool is_tx) +{ + uint16_t num_wqebbs = TLS_SET_PROGRESS_PARAMS_WQEBBS; + + struct mlx5_set_tls_progress_params_wqe *wqe = + reinterpret_cast(m_sq_wqe_hot); + struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; + uint8_t opmod = + is_tx ? MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS : MLX5_OPC_MOD_TLS_TIR_PROGRESS_PARAMS; + + memset(wqe, 0, sizeof(*wqe)); + +#define PROGRESS_PARAMS_DS_CNT DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS) + + cseg->opmod_idx_opcode = + htobe32(((m_sq_wqe_counter & 0xffff) << 8) | XLIO_MLX5_OPCODE_SET_PSV | (opmod << 24)); + cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | PROGRESS_PARAMS_DS_CNT); + /* Request completion for TLS RX offload to create TLS rule ASAP. */ + cseg->fm_ce_se = + (fence ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0) | (is_tx ? 
0 : MLX5_WQE_CTRL_CQ_UPDATE); + + tls_fill_progress_params_wqe(&wqe->params, tis_tir_number, next_record_tcp_sn); + store_current_wqe_prop(nullptr, SQ_CREDITS_SET_PSV, ti); + + ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs); + dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, sizeof(mlx5_set_tls_progress_params_wqe)); + + update_next_wqe_hot(); +} + +inline void qp_mgr::tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void *buf, + uint32_t lkey) +{ + uint16_t num_wqebbs = TLS_GET_PROGRESS_WQEBBS; + + struct mlx5_get_tls_progress_params_wqe *wqe = + reinterpret_cast(m_sq_wqe_hot); + struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; + struct xlio_mlx5_seg_get_psv *psv = &wqe->psv; + uint8_t opmod = MLX5_OPC_MOD_TLS_TIR_PROGRESS_PARAMS; + + memset(wqe, 0, sizeof(*wqe)); + +#define PROGRESS_PARAMS_DS_CNT DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS) + + cseg->opmod_idx_opcode = + htobe32(((m_sq_wqe_counter & 0xffff) << 8) | XLIO_MLX5_OPCODE_GET_PSV | (opmod << 24)); + cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | PROGRESS_PARAMS_DS_CNT); + cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; + + psv->num_psv = 1U << 4U; + psv->l_key = htobe32(lkey); + psv->psv_index[0] = htobe32(tirn); + psv->va = htobe64((uintptr_t)buf); + + store_current_wqe_prop(nullptr, SQ_CREDITS_GET_PSV, ti); + + ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs); + + update_next_wqe_hot(); +} + +void qp_mgr::tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, + bool first) +{ + post_dump_wqe(tis, addr, len, lkey, first); +} + +void qp_mgr::tls_release_tis(xlio_tis *tis) +{ + assert(tis != nullptr && tis->m_type == xlio_ti::ti_type::TLS_TIS); + tis->m_released = true; + if (tis->m_ref == 0) { + put_tls_tis_in_cache(tis); + } +} + +void qp_mgr::tls_release_tir(xlio_tir *tir) +{ + /* TODO We don't have to lock ring to destroy DEK object (a garbage collector?). */ + + assert(tir != nullptr && tir->m_type == xlio_ti::ti_type::TLS_TIR); + tir->m_released = true; + tir->assign_callback(NULL, NULL); + if (tir->m_ref == 0) { + put_tls_tir_in_cache(tir); + } +} + +dpcp::tir *qp_mgr::xlio_tir_to_dpcp_tir(xlio_tir *tir) +{ + return tir->m_p_tir.get(); +} +#else /* DEFINED_UTLS */ +void qp_mgr::ti_released(xlio_ti *) {}; +void qp_mgr::destroy_tis_cache(void) {}; +#endif /* DEFINED_UTLS */ + +std::unique_ptr qp_mgr::create_tis(uint32_t flags) const +{ + dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); + bool is_tls = flags & dpcp::TIS_ATTR_TLS, is_nvme = flags & dpcp::TIS_ATTR_NVMEOTCP; + if (unlikely(adapter == nullptr || (is_tls && is_nvme))) { + return nullptr; + } + + dpcp::tis::attr tis_attr = { + .flags = flags, + .tls_en = is_tls, + .nvmeotcp = is_nvme, + .transport_domain = adapter->get_td(), + .pd = adapter->get_pd(), + }; + + dpcp::tis *dpcp_tis = nullptr; + if (unlikely(adapter->create_tis(tis_attr, dpcp_tis) != dpcp::DPCP_OK)) { + qp_logerr("Failed to create TIS with NVME enabled"); + return nullptr; + } + + auto tis_type = is_tls ? 
xlio_ti::ti_type::TLS_TIS : xlio_ti::ti_type::NVME_TIS; + return std::make_unique(std::unique_ptr(dpcp_tis), tis_type); +} + +static inline void nvme_fill_static_params_control(xlio_mlx5_wqe_ctrl_seg *cseg, + xlio_mlx5_wqe_umr_ctrl_seg *ucseg, + uint32_t producer_index, uint32_t qpn, + uint32_t tisn, uint8_t fence_flags) +{ + memset(cseg, 0, sizeof(*cseg)); + memset(ucseg, 0, sizeof(*ucseg)); + cseg->opmod_idx_opcode = + htobe32(((producer_index & 0xffff) << 8) | MLX5_OPCODE_UMR | + (MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_STATIC_PARAMS << 24)); + size_t num_wqe_ds = 12U; + cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | num_wqe_ds); + cseg->fm_ce_se = fence_flags; + cseg->tis_tir_num = htobe32(tisn << MLX5_WQE_CTRL_TIR_TIS_INDEX_SHIFT); + + ucseg->flags = MLX5_UMR_INLINE; + ucseg->bsf_octowords = htobe16(MLX5E_TRANSPORT_STATIC_PARAMS_OCTWORD_SIZE); +} + +static inline void nvme_fill_static_params_transport_params( + mlx5_wqe_transport_static_params_seg *params, uint32_t config) + +{ + memset(params, 0, sizeof(*params)); + void *ctx = params->ctx; + + DEVX_SET(transport_static_params, ctx, const_1, 1); + DEVX_SET(transport_static_params, ctx, const_2, 2); + DEVX_SET(transport_static_params, ctx, acc_type, MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_NVMETCP); + DEVX_SET(transport_static_params, ctx, nvme_resync_tcp_sn, 0); + DEVX_SET(transport_static_params, ctx, pda, static_cast(config & XLIO_NVME_PDA_MASK)); + DEVX_SET(transport_static_params, ctx, ddgst_en, bool(config & XLIO_NVME_DDGST_ENABLE)); + DEVX_SET(transport_static_params, ctx, ddgst_offload_en, + bool(config & XLIO_NVME_DDGST_OFFLOAD)); + DEVX_SET(transport_static_params, ctx, hddgst_en, bool(config & XLIO_NVME_HDGST_ENABLE)); + DEVX_SET(transport_static_params, ctx, hdgst_offload_en, + bool(config & XLIO_NVME_HDGST_OFFLOAD)); + DEVX_SET(transport_static_params, ctx, ti, MLX5_TRANSPORT_STATIC_PARAMS_TI_INITIATOR); + DEVX_SET(transport_static_params, ctx, const1, 1); + DEVX_SET(transport_static_params, ctx, zero_copy_en, 0); +} + +static inline void nvme_fill_progress_wqe(mlx5e_set_nvmeotcp_progress_params_wqe *wqe, + uint32_t producer_index, uint32_t qpn, uint32_t tisn, + uint32_t tcp_seqno, uint8_t fence_flags) +{ + memset(wqe, 0, sizeof(*wqe)); + auto cseg = &wqe->ctrl.ctrl; + + size_t progres_params_ds = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS); + cseg->opmod_idx_opcode = + htobe32(((producer_index & 0xffff) << 8) | XLIO_MLX5_OPCODE_SET_PSV | + (MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_PROGRESS_PARAMS << 24)); + cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | progres_params_ds); + cseg->fm_ce_se = fence_flags; + + mlx5_seg_nvmeotcp_progress_params *params = &wqe->params; + params->tir_num = htobe32(tisn); + void *ctx = params->ctx; + + DEVX_SET(nvmeotcp_progress_params, ctx, next_pdu_tcp_sn, tcp_seqno); + DEVX_SET(nvmeotcp_progress_params, ctx, pdu_tracker_state, + MLX5E_NVMEOTCP_PROGRESS_PARAMS_PDU_TRACKER_STATE_START); + /* if (is_tx) offloading state == 0*/ + DEVX_SET(nvmeotcp_progress_params, ctx, offloading_state, 0); +} + +void qp_mgr::nvme_set_static_context(xlio_tis *tis, uint32_t config) +{ + auto *cseg = wqebb_get(0U); + auto *ucseg = wqebb_get(0U, sizeof(*cseg)); + + nvme_fill_static_params_control(cseg, ucseg, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), + 0); + memset(wqebb_get(1U), 0, sizeof(mlx5_mkey_seg)); + + auto *params = wqebb_get(2U); + nvme_fill_static_params_transport_params(params, config); + store_current_wqe_prop(nullptr, SQ_CREDITS_UMR, tis); + 
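    // [Editor's sketch, illustrative only; not part of the patch] A plausible caller
    // sequence for arming NVMe/TCP TX offload with the two helpers defined here,
    // assuming DPCP_TIS_FLAGS carries the base TIS attributes as in the TLS path;
    // 'qp' and 'first_pdu_tcp_seqno' are hypothetical names:
    //
    //   std::unique_ptr<xlio_tis> tis = qp->create_tis(DPCP_TIS_FLAGS | dpcp::TIS_ATTR_NVMEOTCP);
    //   if (tis) {
    //       qp->nvme_set_static_context(tis.get(), XLIO_NVME_DDGST_ENABLE | XLIO_NVME_DDGST_OFFLOAD);
    //       qp->nvme_set_progress_context(tis.get(), first_pdu_tcp_seqno);
    //   }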
ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS); + update_next_wqe_hot(); +} + +void qp_mgr::nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) +{ + auto *wqe = reinterpret_cast(m_sq_wqe_hot); + nvme_fill_progress_wqe(wqe, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), tcp_seqno, + MLX5_FENCE_MODE_INITIATOR_SMALL); + store_current_wqe_prop(nullptr, SQ_CREDITS_SET_PSV, tis); + ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS); + update_next_wqe_hot(); +} + +#if defined(DEFINED_UTLS) +void qp_mgr::ti_released(xlio_ti *ti) +{ + assert(ti->m_released); + assert(ti->m_ref == 0); + if (ti->m_type == xlio_ti::ti_type::TLS_TIS) { + put_tls_tis_in_cache(static_cast(ti)); + } else if (ti->m_type == xlio_ti::ti_type::TLS_TIR) { + put_tls_tir_in_cache(static_cast(ti)); + } +} + +void qp_mgr::put_tls_tis_in_cache(xlio_tis *tis) +{ + std::unique_ptr dek = tis->release_dek(); + assert(dynamic_cast(dek.get()) != nullptr); + + put_tls_dek(std::unique_ptr(dynamic_cast(dek.release()))); + m_tls_tis_cache.push_back(tis); +} + +void qp_mgr::put_tls_tir_in_cache(xlio_tir *tir) +{ + // Because the absense of TIR flush command, reusing a TIR + // may result in undefined behaviour. + // Until a flush command is available the TIR cache is disabled. + // Re-enabling TIR cache should also add destroy_tir_cache on ring cleanup. + // m_tls_tir_cache.push_back(tir); + + delete tir; +} +#endif /* defined(DEFINED_UTLS) */ + +void qp_mgr::post_nop_fence(void) +{ + struct mlx5_wqe *wqe = reinterpret_cast(m_sq_wqe_hot); + struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + + memset(wqe, 0, sizeof(*wqe)); + + cseg->opmod_idx_opcode = htobe32(((m_sq_wqe_counter & 0xffff) << 8) | MLX5_OPCODE_NOP); + cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | 0x01); + cseg->fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL; + + store_current_wqe_prop(nullptr, SQ_CREDITS_NOP, NULL); + + ring_doorbell(MLX5_DB_METHOD_DB, 1); + + update_next_wqe_hot(); +} + +void qp_mgr::post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool is_first) +{ + struct mlx5_dump_wqe *wqe = reinterpret_cast(m_sq_wqe_hot); + struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; + struct mlx5_wqe_data_seg *dseg = &wqe->data; + uint32_t tisn = tis ? tis->get_tisn() : 0; + uint16_t num_wqebbs = XLIO_DUMP_WQEBBS; + uint16_t ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; + + memset(wqe, 0, sizeof(*wqe)); + + cseg->opmod_idx_opcode = htobe32(((m_sq_wqe_counter & 0xffff) << 8) | XLIO_MLX5_OPCODE_DUMP); + cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | ds_cnt); + cseg->fm_ce_se = is_first ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0; + cseg->tis_tir_num = htobe32(tisn << 8); + + dseg->addr = htobe64((uintptr_t)addr); + dseg->lkey = htobe32(lkey); + dseg->byte_count = htobe32(len); + + store_current_wqe_prop(nullptr, SQ_CREDITS_DUMP, tis); + + ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs, 0, true); + + update_next_wqe_hot(); +} + +//! Handle releasing of Tx buffers +// Single post send with SIGNAL of a dummy packet +// NOTE: Since the QP is in ERROR state no packets will be sent on the wire! 
+// So we can post_send anything we want :) +void qp_mgr::trigger_completion_for_all_sent_packets() +{ + qp_logfunc("unsignaled count=%d", m_n_unsignaled_count); + + if (!is_signal_requested_for_last_wqe()) { + // Post a dummy WQE and request a signal to complete all the unsignaled WQEs in SQ + qp_logdbg("Need to send closing tx wr..."); + mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(0, true, PBUF_RAM); + // Align Tx buffer accounting since we will be bypassing the normal send calls + m_p_ring->m_missing_buf_ref_count--; + if (!p_mem_buf_desc) { + qp_logerr("no buffer in pool"); + return; + } + + // Prepare dummy packet: zeroed payload ('0000'). + // For ETH it replaces the MAC header!! (Nothing is going on the wire, QP in error state) + /* need to send at least eth+ip, since libmlx5 will drop just eth header */ + ethhdr *p_buffer_ethhdr = (ethhdr *)p_mem_buf_desc->p_buffer; + memset(p_buffer_ethhdr, 0, sizeof(*p_buffer_ethhdr)); + p_buffer_ethhdr->h_proto = htons(ETH_P_IP); + iphdr *p_buffer_iphdr = (iphdr *)(p_mem_buf_desc->p_buffer + sizeof(*p_buffer_ethhdr)); + memset(p_buffer_iphdr, 0, sizeof(*p_buffer_iphdr)); + + ibv_sge sge[1]; + sge[0].length = sizeof(ethhdr) + sizeof(iphdr); + sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer); + sge[0].lkey = m_p_ring->m_tx_lkey; + + // Prepare send wr for (does not care if it is UD/IB or RAW/ETH) + // UD requires AH+qkey, RAW requires minimal payload instead of MAC header. + xlio_ibv_send_wr send_wr; + + memset(&send_wr, 0, sizeof(send_wr)); + send_wr.wr_id = (uintptr_t)p_mem_buf_desc; + send_wr.wr.ud.ah = NULL; + send_wr.sg_list = sge; + send_wr.num_sge = 1; + send_wr.next = NULL; + xlio_send_wr_opcode(send_wr) = XLIO_IBV_WR_SEND; + + unsigned credits = credits_calculate(&send_wr); + if (!credits_get(credits)) { + // TODO Wait for available space in SQ to post the WQE. This method mustn't fail, + // because we may want to wait until all the WQEs are completed and we need to post + // something and request signal. 
+ qp_logdbg("No space in SQ to trigger completions with a post operation"); + return; + } + + send_to_wire(&send_wr, + (xlio_wr_tx_packet_attr)(XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM), + true, nullptr, credits); + } +} + +void qp_mgr::reset_inflight_zc_buffers_ctx(void *ctx) +{ + sq_wqe_prop *p = m_sq_wqe_prop_last; + sq_wqe_prop *prev; + if (p) { + unsigned p_i = p - m_sq_wqe_idx_to_prop; + if (p_i == m_sq_wqe_prop_last_signalled) { + return; + } + do { + mem_buf_desc_t *desc = p->buf; + if (desc && desc->tx.zc.ctx == ctx) { + desc->tx.zc.ctx = nullptr; + } + prev = p; + p = p->next; + } while (p && is_sq_wqe_prop_valid(p, prev)); + } +} diff --git a/src/core/dev/qp_mgr.h b/src/core/dev/qp_mgr.h index 100ed918e..90e12b479 100644 --- a/src/core/dev/qp_mgr.h +++ b/src/core/dev/qp_mgr.h @@ -52,6 +52,10 @@ #include "dev/cq_mgr_rx.h" #include "dev/cq_mgr_tx.h" #include "dev/rfs_rule.h" +#include "util/sg_array.h" +#include "dev/dm_mgr.h" +#include +#include /* Forward declarations */ struct xlio_tls_info; @@ -135,6 +139,99 @@ class xlio_ti { void *m_callback_arg; }; +class xlio_tis : public xlio_ti { +public: + xlio_tis(std::unique_ptr _tis, xlio_ti::ti_type type) + : xlio_ti(type) + , m_dek() + , m_p_tis(std::move(_tis)) + , m_tisn(0U) + , m_dek_id(0U) + { + dpcp::status ret = m_p_tis->get_tisn(m_tisn); + assert(ret == dpcp::DPCP_OK); + (void)ret; + } + + ~xlio_tis() = default; + + std::unique_ptr release_dek() + { + assert(m_ref == 0); + m_released = false; + return std::move(m_dek); + } + + uint32_t get_tisn() noexcept { return m_tisn; } + + void assign_dek(std::unique_ptr &&dek_ptr) + { + m_dek = std::move(dek_ptr); + m_dek_id = m_dek->get_key_id(); + } + + uint32_t get_dek_id() noexcept { return m_dek_id; } + +private: + std::unique_ptr m_dek; + std::unique_ptr m_p_tis; + uint32_t m_tisn; + uint32_t m_dek_id; +}; + +class xlio_tir : public xlio_ti { +public: + xlio_tir(dpcp::tir *_tir, xlio_ti::ti_type type) + : xlio_ti(type) + { + m_p_tir.reset(_tir); + m_dek = NULL; + m_tirn = 0; + m_dek_id = 0; + + /* Cache the tir number. Mustn't fail for a valid TIR object. */ + m_tirn = m_p_tir->get_tirn(); + assert(m_tirn != 0); + } + + ~xlio_tir() = default; + + std::unique_ptr release_dek() + { + assert(m_ref == 0); + m_released = false; + return std::move(m_dek); + } + + uint32_t get_tirn() { return m_tirn; } + + void assign_dek(void *dek_ptr) + { + m_dek.reset(reinterpret_cast(dek_ptr)); + m_dek_id = m_dek->get_key_id(); + } + + uint32_t get_dek_id() { return m_dek_id; } + + std::unique_ptr m_p_tir; + +private: + std::unique_ptr m_dek; + uint32_t m_tirn; + uint32_t m_dek_id; +}; + +/* WQE properties description. */ +struct sq_wqe_prop { + /* A buffer held by the WQE. This is NULL for control WQEs. */ + mem_buf_desc_t *buf; + /* Number of credits (usually number of WQEBBs). */ + unsigned credits; + /* Transport interface (TIS/TIR) current WQE holds reference to. 
*/ + xlio_ti *ti; + struct sq_wqe_prop *next; +}; + /** * @class qp_mgr * @@ -159,7 +256,7 @@ class qp_mgr { friend class cq_mgr_tx; public: - qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan); + qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan, bool call_configure); virtual ~qp_mgr(); virtual void up(); @@ -167,19 +264,20 @@ class qp_mgr { // Post for receive single mem_buf_desc virtual void post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc); + // Post for receive a list of mem_buf_desc void post_recv_buffers(descq_t *p_buffers, size_t count); int send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, unsigned credits); - inline uint32_t get_max_inline_data() const { return m_qp_cap.max_inline_data; } - inline uint32_t get_max_send_sge() const { return m_qp_cap.max_send_sge; } + inline uint32_t get_max_inline_data() const { return m_mlx5_qp.cap.max_inline_data; } + inline uint32_t get_max_send_sge() const { return m_mlx5_qp.cap.max_send_sge; } int get_port_num() const { return m_port_num; } uint16_t get_partiton() const { return m_vlan; }; - struct ibv_qp *get_ibv_qp() const { return m_qp; }; - class cq_mgr_tx *get_tx_cq_mgr() const { return m_p_cq_mgr_tx; } - class cq_mgr_rx *get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } - virtual uint32_t get_rx_max_wr_num(); + struct ibv_qp *get_ibv_qp() const { return m_mlx5_qp.qp; }; + cq_mgr_tx *get_tx_cq_mgr() const { return m_p_cq_mgr_tx; } + cq_mgr_rx *get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } + uint32_t get_rx_max_wr_num() { return m_rx_num_wr; } // This function can be replaced with a parameter during ring creation. // chain of calls may serve as cache warm for dummy send feature. inline bool get_hw_dummy_send_support() { return m_hw_dummy_send_support; } @@ -189,96 +287,66 @@ class qp_mgr { void release_rx_buffers(); void release_tx_buffers(); - virtual void trigger_completion_for_all_sent_packets(); uint32_t is_ratelimit_change(struct xlio_rate_limit_t &rate_limit); int modify_qp_ratelimit(struct xlio_rate_limit_t &rate_limit, uint32_t rl_changes); - virtual void dm_release_data(mem_buf_desc_t *buff) { NOT_IN_USE(buff); } + void dm_release_data(mem_buf_desc_t *buff) { m_dm_mgr.release_data(buff); } virtual rfs_rule *create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext); #ifdef DEFINED_UTLS - virtual xlio_tis *tls_context_setup_tx(const xlio_tls_info *info) - { - NOT_IN_USE(info); - return NULL; - } - virtual xlio_tir *tls_create_tir(bool cached) - { - NOT_IN_USE(cached); - return NULL; - } - virtual int tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, - uint32_t next_record_tcp_sn, xlio_comp_cb_t callback, - void *callback_arg) - { - NOT_IN_USE(tir); - NOT_IN_USE(info); - NOT_IN_USE(next_record_tcp_sn); - NOT_IN_USE(callback); - NOT_IN_USE(callback_arg); - return -1; - } - virtual void tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) - { - NOT_IN_USE(info); - NOT_IN_USE(tis); - NOT_IN_USE(skip_static); - } - virtual void tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn) - { - NOT_IN_USE(tir); - NOT_IN_USE(info); - NOT_IN_USE(hw_resync_tcp_sn); - } - virtual void tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) - { - NOT_IN_USE(tir); - NOT_IN_USE(buf); - NOT_IN_USE(lkey); - } - virtual void tls_release_tis(xlio_tis *tis) { NOT_IN_USE(tis); } - virtual void tls_release_tir(xlio_tir *tir) { NOT_IN_USE(tir); } - virtual void tls_tx_post_dump_wqe(xlio_tis *tis, void 
*addr, uint32_t len, uint32_t lkey, - bool first) - { - NOT_IN_USE(tis); - NOT_IN_USE(addr); - NOT_IN_USE(len); - NOT_IN_USE(lkey); - NOT_IN_USE(first); - } + xlio_tis *tls_context_setup_tx(const xlio_tls_info *info) override; + xlio_tir *tls_create_tir(bool cached) override; + int tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t next_record_tcp_sn, + xlio_comp_cb_t callback, void *callback_arg); + void tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) override; + void tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn) override; + void tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) override; + void tls_release_tis(xlio_tis *tis) override; + void tls_release_tir(xlio_tir *tir) override; + void tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first) override; #endif /* DEFINED_UTLS */ - virtual std::unique_ptr create_tis(uint32_t) const { return nullptr; }; - - virtual void nvme_set_static_context(xlio_tis *tis, uint32_t config) - { - NOT_IN_USE(tis); - NOT_IN_USE(config); - }; - virtual void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) - { - NOT_IN_USE(tis); - NOT_IN_USE(tcp_seqno); - }; - virtual void post_nop_fence(void) {} - virtual void post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first) +#define DPCP_TIS_FLAGS (dpcp::TIS_ATTR_TRANSPORT_DOMAIN | dpcp::TIS_ATTR_PD) +#define DPCP_TIS_NVME_FLAG (dpcp::TIS_ATTR_NVMEOTCP) + std::unique_ptr create_tis(uint32_t flags) const; + void nvme_set_static_context(xlio_tis *tis, uint32_t config); + void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno); + + /* Get a memory inside a wqebb at a wqebb_num offset from the m_sq_wqe_hot and account for + * m_sq_wqe_counter wrap-around. Use offset_in_wqebb to for the internal address. Use the + * template parameter to cast the resulting address to the required pointer type */ + template + constexpr inline T wqebb_get(size_t wqebb_num, size_t offset_in_wqebb = 0U) { - NOT_IN_USE(tis); - NOT_IN_USE(addr); - NOT_IN_USE(len); - NOT_IN_USE(lkey); - NOT_IN_USE(first); + return reinterpret_cast( + reinterpret_cast( + &(*m_sq_wqes)[(m_sq_wqe_counter + wqebb_num) & (m_tx_num_wr - 1)]) + + offset_in_wqebb); } - virtual void reset_inflight_zc_buffers_ctx(void *ctx) { NOT_IN_USE(ctx); } - virtual bool credits_get(unsigned credits) + void post_nop_fence(); + void post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first); + +#if defined(DEFINED_UTLS) + std::unique_ptr get_new_tls_dek(const void *key, uint32_t key_size_bytes); + std::unique_ptr get_tls_dek(const void *key, uint32_t key_size_bytes); + void put_tls_dek(std::unique_ptr &&dek_obj); +#endif + + void reset_inflight_zc_buffers_ctx(void *ctx); + + void credits_return(unsigned credits) { m_sq_free_credits += credits; } + + bool credits_get(unsigned credits) { - NOT_IN_USE(credits); - return true; + if (m_sq_free_credits >= credits) { + m_sq_free_credits -= credits; + return true; + } + return false; } - virtual void credits_return(unsigned credits) { NOT_IN_USE(credits); } - inline unsigned credits_calculate(xlio_ibv_send_wr *p_send_wqe) + + unsigned credits_calculate(xlio_ibv_send_wr *p_send_wqe) { /* Credit is a logical value which is opaque for users. Only qp_mgr can interpret the * value and currently, one credit equals to one WQEBB in the SQ. 
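/*
 * Illustrative caller-side sketch (assumed usage, not code from this patch): a sender
 * converts a work request into WQEBB credits, reserves them before posting, and hands them
 * back if the post fails; completed WQEs are expected to replenish the pool through
 * credits_return().
 *
 *   unsigned credits = qp->credits_calculate(&send_wr); // WQEBBs this WR will occupy
 *   if (!qp->credits_get(credits)) {
 *       return -EAGAIN; // SQ is full, retry after TX completions free credits
 *   }
 *   if (qp->send(&send_wr, attr, tis, credits) < 0) {
 *       qp->credits_return(credits); // undo the reservation on failure
 *   }
 */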
@@ -320,15 +388,14 @@ class qp_mgr { } protected: - struct ibv_qp *m_qp; - uint64_t *m_rq_wqe_idx_to_wrid; + xlio_ib_mlx5_qp_t m_mlx5_qp; + uint64_t *m_rq_wqe_idx_to_wrid = nullptr; ring_simple *m_p_ring; uint8_t m_port_num; ib_ctx_handler *m_p_ib_ctx_handler; - struct ibv_qp_cap m_qp_cap; - uint32_t m_max_qp_wr; + uint32_t m_max_qp_wr = 0U; cq_mgr_rx *m_p_cq_mgr_rx; cq_mgr_tx *m_p_cq_mgr_tx; @@ -336,7 +403,7 @@ class qp_mgr { uint32_t m_rx_num_wr; uint32_t m_tx_num_wr; - bool m_hw_dummy_send_support; + bool m_hw_dummy_send_support = false; uint32_t m_n_sysvar_rx_num_wr_to_post_recv; const uint32_t m_n_sysvar_tx_num_wr_to_signal; @@ -345,127 +412,127 @@ class qp_mgr { // recv_wr ibv_sge *m_ibv_rx_sg_array; ibv_recv_wr *m_ibv_rx_wr_array; - uint32_t m_curr_rx_wr; - uintptr_t m_last_posted_rx_wr_id; // Remember so in case we flush RQ we know to wait until this - // WR_ID is received + uint32_t m_curr_rx_wr = 0U; + uintptr_t m_last_posted_rx_wr_id = 0U; // Remember so in case we flush RQ we know to wait until + // this WR_ID is received // send wr - uint32_t m_n_unsignaled_count; + uint32_t m_n_unsignaled_count = 0U; - mem_buf_desc_t *m_p_prev_rx_desc_pushed; + mem_buf_desc_t *m_p_prev_rx_desc_pushed = nullptr; - // generating packet IDs - uint16_t m_n_ip_id_base; - uint16_t m_n_ip_id_offset; uint16_t m_vlan; struct xlio_rate_limit_t m_rate_limit; int configure(struct qp_mgr_desc *desc); int prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr); - inline void set_unsignaled_count(void) - { - m_n_unsignaled_count = m_n_sysvar_tx_num_wr_to_signal - 1; - } - inline void dec_unsignaled_count(void) + void init_qp(); + void init_device_memory(); + bool init_rx_cq_mgr_prepare(); + void post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc); + + void set_unsignaled_count(void) { m_n_unsignaled_count = m_n_sysvar_tx_num_wr_to_signal - 1; } + + void dec_unsignaled_count(void) { if (m_n_unsignaled_count > 0) { --m_n_unsignaled_count; } } - inline bool is_signal_requested_for_last_wqe() - { - return m_n_unsignaled_count == m_n_sysvar_tx_num_wr_to_signal - 1; - } - - virtual cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) = 0; - virtual cq_mgr_tx *init_tx_cq_mgr(void) = 0; - - virtual int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, - bool request_comp, xlio_tis *tis, unsigned credits); - virtual bool is_completion_need() { return !m_n_unsignaled_count; } - virtual bool is_rq_empty() const { return false; } -}; -class xlio_tis : public xlio_ti { -public: - xlio_tis(std::unique_ptr _tis, xlio_ti::ti_type type) - : xlio_ti(type) - , m_dek() - , m_p_tis(std::move(_tis)) - , m_tisn(0U) - , m_dek_id(0U) + bool is_signal_requested_for_last_wqe() { - dpcp::status ret = m_p_tis->get_tisn(m_tisn); - assert(ret == dpcp::DPCP_OK); - (void)ret; + return m_n_unsignaled_count == m_n_sysvar_tx_num_wr_to_signal - 1; } - ~xlio_tis() = default; - - inline std::unique_ptr release_dek() - { - assert(m_ref == 0); - m_released = false; - return std::move(m_dek); - } + virtual cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel); + cq_mgr_tx *init_tx_cq_mgr(); - inline uint32_t get_tisn(void) noexcept { return m_tisn; } + int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, + xlio_tis *tis, unsigned credits); - inline void assign_dek(std::unique_ptr &&dek_ptr) +#if defined(DEFINED_UTLS) + dpcp::tir *xlio_tir_to_dpcp_tir(xlio_tir *tir); + virtual dpcp::tir *create_tir(bool is_tls = false) { - m_dek = 
std::move(dek_ptr); - m_dek_id = m_dek->get_key_id(); + NOT_IN_USE(is_tls); + return NULL; } - - inline uint32_t get_dek_id(void) noexcept { return m_dek_id; } +#endif /* DEFINED_UTLS */ private: - std::unique_ptr m_dek; - std::unique_ptr m_p_tis; - uint32_t m_tisn; - uint32_t m_dek_id; -}; - -class xlio_tir : public xlio_ti { -public: - xlio_tir(dpcp::tir *_tir, xlio_ti::ti_type type) - : xlio_ti(type) + void trigger_completion_for_all_sent_packets(); + void update_next_wqe_hot(); + void destroy_tis_cache(); + void ti_released(xlio_ti *ti); + void put_tls_tir_in_cache(xlio_tir *tir); + void put_tls_tis_in_cache(xlio_tis *tis); + bool is_rq_empty() const override { return (m_mlx5_qp.rq.head == m_mlx5_qp.rq.tail); } + bool is_completion_need() const { - m_p_tir.reset(_tir); - m_dek = NULL; - m_tirn = 0; - m_dek_id = 0; - - /* Cache the tir number. Mustn't fail for a valid TIR object. */ - m_tirn = m_p_tir->get_tirn(); - assert(m_tirn != 0); + return !m_n_unsignaled_count || (m_dm_enabled && m_dm_mgr.is_completion_need()); } - ~xlio_tir() = default; +#if defined(DEFINED_UTLS) + inline void tls_fill_static_params_wqe(struct mlx5_wqe_tls_static_params_seg *params, + const struct xlio_tls_info *info, uint32_t key_id, + uint32_t resync_tcp_sn); + inline void tls_post_static_params_wqe(xlio_ti *ti, const struct xlio_tls_info *info, + uint32_t tis_tir_number, uint32_t key_id, + uint32_t resync_tcp_sn, bool fence, bool is_tx); + inline void tls_fill_progress_params_wqe(struct mlx5_wqe_tls_progress_params_seg *params, + uint32_t tis_tir_number, uint32_t next_record_tcp_sn); + inline void tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_tir_number, + uint32_t next_record_tcp_sn, bool fence, bool is_tx); + inline void tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void *buf, uint32_t lkey); +#endif /* DEFINED_UTLS */ + + inline void store_current_wqe_prop(mem_buf_desc_t *wr_id, unsigned credits, xlio_ti *ti); + inline int fill_wqe(xlio_ibv_send_wr *p_send_wqe); + inline int fill_wqe_send(xlio_ibv_send_wr *pswr); + inline int fill_wqe_lso(xlio_ibv_send_wr *pswr); + inline int fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, + int max_inline_len, int inline_len); + inline void ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top = 0, + bool skip_comp = false); - inline std::unique_ptr release_dek() + bool is_sq_wqe_prop_valid(sq_wqe_prop *p, sq_wqe_prop *prev) { - assert(m_ref == 0); - m_released = false; - return std::move(m_dek); + unsigned p_i = p - m_sq_wqe_idx_to_prop; + unsigned prev_i = prev - m_sq_wqe_idx_to_prop; + return (p_i != m_sq_wqe_prop_last_signalled) && + ((m_tx_num_wr + p_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr < + (m_tx_num_wr + prev_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr); } - inline uint32_t get_tirn(void) { return m_tirn; } + sq_wqe_prop *m_sq_wqe_idx_to_prop = nullptr; + sq_wqe_prop *m_sq_wqe_prop_last = nullptr; + unsigned m_sq_wqe_prop_last_signalled = 0U; + unsigned m_sq_free_credits = 0U; + uint64_t m_rq_wqe_counter = 0U; - inline void assign_dek(void *dek_ptr) - { - m_dek.reset(reinterpret_cast(dek_ptr)); - m_dek_id = m_dek->get_key_id(); - } + struct mlx5_eth_wqe (*m_sq_wqes)[] = nullptr; + struct mlx5_eth_wqe *m_sq_wqe_hot = nullptr; + uint8_t *m_sq_wqes_end = nullptr; + enum { MLX5_DB_METHOD_BF, MLX5_DB_METHOD_DB } m_db_method; - inline uint32_t get_dek_id(void) { return m_dek_id; } + int m_sq_wqe_hot_index = 0; + uint16_t m_sq_wqe_counter = 0U; + bool m_b_fence_needed = false; + bool m_dm_enabled = false; + 
dm_mgr m_dm_mgr; - std::unique_ptr m_p_tir; + /* + * TIS cache. Protected by ring tx lock. + * TODO Move to ring. + */ + std::vector m_tls_tis_cache; + std::vector m_tls_tir_cache; -private: - std::unique_ptr m_dek; - uint32_t m_tirn; - uint32_t m_dek_id; +#if defined(DEFINED_UTLS) + std::list> m_tls_dek_get_cache; + std::list> m_tls_dek_put_cache; +#endif }; #endif diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp index c0d63e24a..ae2af8d1d 100644 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp +++ b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp @@ -38,9 +38,17 @@ #define MODULE_NAME "qp_mgr_eth_mlx5_dpcp" +#define qp_logpanic __log_info_panic +#define qp_logerr __log_info_err +#define qp_logwarn __log_info_warn +#define qp_loginfo __log_info_info +#define qp_logdbg __log_info_dbg +#define qp_logfunc __log_info_func +#define qp_logfuncall __log_info_funcall + qp_mgr_eth_mlx5_dpcp::qp_mgr_eth_mlx5_dpcp(struct qp_mgr_desc *desc, uint32_t tx_num_wr, uint16_t vlan) - : qp_mgr_eth_mlx5(desc, tx_num_wr, vlan, false) + : qp_mgr(desc, tx_num_wr, vlan, false) { if (configure(desc)) { throw_xlio_exception("Failed creating qp_mgr_eth_mlx5_dpcp"); @@ -57,10 +65,10 @@ bool qp_mgr_eth_mlx5_dpcp::configure_rq_dpcp() priv_xlio_transport_type_str(m_p_ring->get_transport_type()), m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), m_port_num); - m_qp_cap.max_recv_wr = m_rx_num_wr; + m_mlx5_qp.cap.max_recv_wr = m_rx_num_wr; - qp_logdbg("Requested RQ parameters: wre: rx = %d sge: rx = %d", m_qp_cap.max_recv_wr, - m_qp_cap.max_recv_sge); + qp_logdbg("Requested RQ parameters: wre: rx = %d sge: rx = %d", m_mlx5_qp.cap.max_recv_wr, + m_mlx5_qp.cap.max_recv_sge); xlio_ib_mlx5_cq_t mlx5_cq; memset(&mlx5_cq, 0, sizeof(mlx5_cq)); @@ -70,14 +78,16 @@ bool qp_mgr_eth_mlx5_dpcp::configure_rq_dpcp() static_cast(mlx5_cq.cq_num)); if (safe_mce_sys().enable_striding_rq) { - m_qp_cap.max_recv_sge = 2U; // Striding-RQ needs a reserved segment. + m_mlx5_qp.cap.max_recv_sge = 2U; // Striding-RQ needs a reserved segment. _strq_wqe_reserved_seg = 1U; delete[] m_ibv_rx_sg_array; - m_ibv_rx_sg_array = new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv * m_qp_cap.max_recv_sge]; + m_ibv_rx_sg_array = + new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv * m_mlx5_qp.cap.max_recv_sge]; for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) { - m_ibv_rx_wr_array[wr_idx].sg_list = &m_ibv_rx_sg_array[wr_idx * m_qp_cap.max_recv_sge]; - m_ibv_rx_wr_array[wr_idx].num_sge = m_qp_cap.max_recv_sge; + m_ibv_rx_wr_array[wr_idx].sg_list = + &m_ibv_rx_sg_array[wr_idx * m_mlx5_qp.cap.max_recv_sge]; + m_ibv_rx_wr_array[wr_idx].num_sge = m_mlx5_qp.cap.max_recv_sge; memset(m_ibv_rx_wr_array[wr_idx].sg_list, 0, sizeof(ibv_sge)); m_ibv_rx_wr_array[wr_idx].sg_list[0].length = 1U; // To bypass a check inside xlio_ib_mlx5_post_recv. @@ -106,8 +116,8 @@ bool qp_mgr_eth_mlx5_dpcp::prepare_rq(uint32_t cqn) dpcp::rq_attr rqattrs; memset(&rqattrs, 0, sizeof(rqattrs)); rqattrs.cqn = cqn; - rqattrs.wqe_num = m_qp_cap.max_recv_wr; - rqattrs.wqe_sz = m_qp_cap.max_recv_sge; + rqattrs.wqe_num = m_mlx5_qp.cap.max_recv_wr; + rqattrs.wqe_sz = m_mlx5_qp.cap.max_recv_sge; if (safe_mce_sys().hw_ts_conversion_mode == TS_CONVERSION_MODE_RTC) { qp_logdbg("Enabled RTC timestamp format for RQ"); @@ -123,7 +133,7 @@ bool qp_mgr_eth_mlx5_dpcp::prepare_rq(uint32_t cqn) // Striding-RQ WQE format is as of Shared-RQ (PRM, page 381, wq_type). 
// In this case the WQE minimum size is 2 * 16, and the first segment is reserved. - rqattrs.wqe_sz = m_qp_cap.max_recv_sge * 16U; + rqattrs.wqe_sz = m_mlx5_qp.cap.max_recv_sge * 16U; dpcp::striding_rq *new_rq_ptr = nullptr; rc = dpcp_adapter->create_striding_rq(rqattrs, new_rq_ptr); @@ -139,7 +149,6 @@ bool qp_mgr_eth_mlx5_dpcp::prepare_rq(uint32_t cqn) return false; } - memset(&m_mlx5_qp, 0, sizeof(m_mlx5_qp)); if (!store_rq_mlx5_params(*new_rq)) { qp_logerr( "Failed to retrieve initial DPCP RQ parameters, rc: %d, basic_rq: %p, cqn: %" PRIu32, @@ -192,8 +201,6 @@ bool qp_mgr_eth_mlx5_dpcp::store_rq_mlx5_params(dpcp::basic_rq &new_rq) m_mlx5_qp.rq.wqe_shift = ilog_2(m_mlx5_qp.rq.stride); m_mlx5_qp.rq.head = 0; m_mlx5_qp.rq.tail = 0; - m_mlx5_qp.cap.max_recv_wr = m_qp_cap.max_recv_wr; - m_mlx5_qp.cap.max_recv_sge = m_qp_cap.max_recv_sge; m_mlx5_qp.tirn = 0U; return true; @@ -201,10 +208,6 @@ bool qp_mgr_eth_mlx5_dpcp::store_rq_mlx5_params(dpcp::basic_rq &new_rq) void qp_mgr_eth_mlx5_dpcp::init_tir_rq() { - if (_rq && !store_rq_mlx5_params(*_rq)) { - qp_logpanic("Failed to retrieve DPCP RQ parameters (errno=%d %m)", errno); - } - _tir.reset(create_tir()); if (!_tir) { qp_logpanic("TIR creation for qp_mgr_eth_mlx5_dpcp failed (errno=%d %m)", errno); @@ -213,7 +216,7 @@ void qp_mgr_eth_mlx5_dpcp::init_tir_rq() void qp_mgr_eth_mlx5_dpcp::up() { - qp_mgr_eth_mlx5::init_qp(); + qp_mgr::init_qp(); init_tir_rq(); qp_mgr::up(); init_device_memory(); @@ -223,7 +226,7 @@ void qp_mgr_eth_mlx5_dpcp::down() { _tir.reset(nullptr); - qp_mgr_eth_mlx5::down(); + qp_mgr::down(); } rfs_rule *qp_mgr_eth_mlx5_dpcp::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext) @@ -251,7 +254,7 @@ rfs_rule *qp_mgr_eth_mlx5_dpcp::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_ void qp_mgr_eth_mlx5_dpcp::modify_qp_to_ready_state() { - qp_mgr_eth_mlx5::modify_qp_to_ready_state(); + qp_mgr::modify_qp_to_ready_state(); modify_rq_to_ready_state(); } @@ -259,7 +262,7 @@ void qp_mgr_eth_mlx5_dpcp::modify_qp_to_error_state() { m_p_cq_mgr_rx->clean_cq(); - qp_mgr_eth_mlx5::modify_qp_to_error_state(); + qp_mgr::modify_qp_to_error_state(); dpcp::status rc = _rq->modify_state(dpcp::RQ_ERR); @@ -286,7 +289,7 @@ void qp_mgr_eth_mlx5_dpcp::modify_rq_to_ready_state() cq_mgr_rx *qp_mgr_eth_mlx5_dpcp::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) { if (unlikely(!safe_mce_sys().enable_striding_rq)) { - return qp_mgr_eth_mlx5::init_rx_cq_mgr(p_rx_comp_event_channel); + return qp_mgr::init_rx_cq_mgr(p_rx_comp_event_channel); } return (!init_rx_cq_mgr_prepare() @@ -300,7 +303,7 @@ cq_mgr_rx *qp_mgr_eth_mlx5_dpcp::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_co void qp_mgr_eth_mlx5_dpcp::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) { - uint32_t index = (m_curr_rx_wr * m_qp_cap.max_recv_sge) + _strq_wqe_reserved_seg; + uint32_t index = (m_curr_rx_wr * m_mlx5_qp.cap.max_recv_sge) + _strq_wqe_reserved_seg; m_ibv_rx_sg_array[index].addr = (uintptr_t)p_mem_buf_desc->p_buffer; m_ibv_rx_sg_array[index].length = p_mem_buf_desc->sz_buffer; m_ibv_rx_sg_array[index].lkey = p_mem_buf_desc->lkey; diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.h b/src/core/dev/qp_mgr_eth_mlx5_dpcp.h index e4570ead8..5eed174eb 100644 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.h +++ b/src/core/dev/qp_mgr_eth_mlx5_dpcp.h @@ -37,9 +37,9 @@ #include #include -#include "dev/qp_mgr_eth_mlx5.h" +#include "dev/qp_mgr.h" -class qp_mgr_eth_mlx5_dpcp : public qp_mgr_eth_mlx5 { +class qp_mgr_eth_mlx5_dpcp : public qp_mgr { public: qp_mgr_eth_mlx5_dpcp(struct 
qp_mgr_desc *desc, uint32_t tx_num_wr, uint16_t vlan); @@ -58,7 +58,7 @@ class qp_mgr_eth_mlx5_dpcp : public qp_mgr_eth_mlx5 { private: #ifdef DEFINED_UTLS - // TODO: Move UTLS related code to this class and remove qp_mgr_eth_mlx5::create_tir() + // TODO: Move UTLS related code to this class and remove qp_mgr::create_tir() dpcp::tir *create_tir(bool is_tls = false) override; #else dpcp::tir *create_tir(bool is_tls = false); diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index ce7373082..cf50f1ab7 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -37,7 +37,6 @@ #include "util/sg_array.h" #include "sock/fd_collection.h" #if defined(DEFINED_DIRECT_VERBS) -#include "dev/qp_mgr_eth_mlx5.h" #include "dev/qp_mgr_eth_mlx5_dpcp.h" #endif @@ -88,16 +87,6 @@ inline void ring_simple::send_status_handler(int ret, xlio_ibv_send_wr *p_send_w BULLSEYE_EXCLUDE_BLOCK_END } -qp_mgr *ring_eth::create_qp_mgr(struct qp_mgr_desc *desc) -{ -#if defined(DEFINED_DPCP) - if (safe_mce_sys().enable_dpcp_rq) { - return new qp_mgr_eth_mlx5_dpcp(desc, get_tx_num_wr(), m_partition); - } -#endif - return new qp_mgr_eth_mlx5(desc, get_tx_num_wr(), m_partition); -} - ring_simple::ring_simple(int if_index, ring *parent, ring_type_t type, bool use_locks) : ring_slave(if_index, parent, type, use_locks) , m_p_ib_ctx(NULL) @@ -384,7 +373,7 @@ void ring_simple::create_resources() desc.ring = this; desc.slave = p_slave; desc.rx_comp_event_channel = m_p_rx_comp_event_channel; - m_p_qp_mgr = create_qp_mgr(&desc); + m_p_qp_mgr = new qp_mgr_eth_mlx5_dpcp(&desc, get_tx_num_wr(), m_partition); BULLSEYE_EXCLUDE_BLOCK_START if (m_p_qp_mgr == NULL) { ring_logerr("Failed to allocate qp_mgr!"); diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index 999c0cd30..e4d8005e2 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -290,7 +290,6 @@ class ring_simple : public ring_slave { friend class cq_mgr_rx_regrq; friend class cq_mgr_rx_strq; friend class qp_mgr; - friend class qp_mgr_eth_mlx5; friend class qp_mgr_eth_mlx5_dpcp; friend class rfs; friend class rfs_uc; @@ -299,7 +298,6 @@ class ring_simple : public ring_slave { friend class ring_bond; protected: - virtual qp_mgr *create_qp_mgr(struct qp_mgr_desc *desc) = 0; void create_resources(); virtual void init_tx_buffers(uint32_t count); void inc_cq_moderation_stats(size_t sz_data) override; @@ -361,8 +359,8 @@ class ring_simple : public ring_slave { ib_ctx_handler *m_p_ib_ctx; qp_mgr *m_p_qp_mgr; struct cq_moderation_info m_cq_moderation_info; - cq_mgr_rx *m_p_cq_mgr_rx; - cq_mgr_tx *m_p_cq_mgr_tx; + cq_mgr_rx *m_p_cq_mgr_rx = nullptr; + cq_mgr_tx *m_p_cq_mgr_tx = nullptr; std::unordered_map m_user_lkey_map; private: @@ -449,19 +447,11 @@ class ring_eth : public ring_simple { if (p_ndev) { m_partition = p_ndev->get_vlan(); - /* Do resource initialization for - * ring_eth_direct, ring_eth_cb inside related - * constructors because - * they use own create_qp_mgr() methods - */ if (call_create_res) { create_resources(); } } } - -protected: - qp_mgr *create_qp_mgr(struct qp_mgr_desc *desc) override; }; #endif // RING_SIMPLE_H diff --git a/src/core/ib/mlx5/ib_mlx5.cpp b/src/core/ib/mlx5/ib_mlx5.cpp index 3f38f6c42..d58eeaa9b 100644 --- a/src/core/ib/mlx5/ib_mlx5.cpp +++ b/src/core/ib/mlx5/ib_mlx5.cpp @@ -40,7 +40,7 @@ #include "util/utils.h" #include "ib/mlx5/ib_mlx5.h" -int xlio_ib_mlx5_get_qp(struct ibv_qp *qp, xlio_ib_mlx5_qp_t *mlx5_qp, uint32_t flags) +int xlio_ib_mlx5_get_qp_tx(xlio_ib_mlx5_qp_t 
*mlx5_qp) { int ret = 0; struct mlx5dv_obj obj; @@ -52,7 +52,7 @@ int xlio_ib_mlx5_get_qp(struct ibv_qp *qp, xlio_ib_mlx5_qp_t *mlx5_qp, uint32_t memset(&obj, 0, sizeof(obj)); memset(&dqp, 0, sizeof(dqp)); - obj.qp.in = qp; + obj.qp.in = mlx5_qp->qp; obj.qp.out = &dqp; #if defined(DEFINED_DV_RAW_QP_HANDLES) dqp.comp_mask |= MLX5DV_QP_MASK_RAW_QP_HANDLES; @@ -62,42 +62,28 @@ int xlio_ib_mlx5_get_qp(struct ibv_qp *qp, xlio_ib_mlx5_qp_t *mlx5_qp, uint32_t goto out; } - memset(mlx5_qp, 0, sizeof(*mlx5_qp)); VALGRIND_MAKE_MEM_DEFINED(&dqp, sizeof(dqp)); - mlx5_qp->qp = qp; - mlx5_qp->qpn = qp->qp_num; - mlx5_qp->flags = flags; + mlx5_qp->qpn = mlx5_qp->qp->qp_num; mlx5_qp->sq.dbrec = &dqp.dbrec[MLX5_SND_DBR]; mlx5_qp->sq.buf = dqp.sq.buf; mlx5_qp->sq.wqe_cnt = dqp.sq.wqe_cnt; mlx5_qp->sq.stride = dqp.sq.stride; - mlx5_qp->rq.dbrec = &dqp.dbrec[MLX5_RCV_DBR]; - mlx5_qp->rq.buf = dqp.rq.buf; - mlx5_qp->rq.wqe_cnt = dqp.rq.wqe_cnt; - mlx5_qp->rq.stride = dqp.rq.stride; - mlx5_qp->rq.wqe_shift = ilog_2(dqp.rq.stride); - mlx5_qp->rq.head = 0; - mlx5_qp->rq.tail = 0; mlx5_qp->bf.reg = dqp.bf.reg; mlx5_qp->bf.size = dqp.bf.size; mlx5_qp->bf.offset = 0; #if defined(DEFINED_DV_RAW_QP_HANDLES) - mlx5_qp->tirn = dqp.tirn; mlx5_qp->tisn = dqp.tisn; - mlx5_qp->rqn = dqp.rqn; mlx5_qp->sqn = dqp.sqn; #endif /* DEFINED_DV_RAW_QP_HANDLES */ - ret = ibv_query_qp(qp, &tmp_ibv_qp_attr, attr_mask, &tmp_ibv_qp_init_attr); + ret = ibv_query_qp(mlx5_qp->qp, &tmp_ibv_qp_attr, attr_mask, &tmp_ibv_qp_init_attr); if (ret != 0) { goto out; } VALGRIND_MAKE_MEM_DEFINED(&tmp_ibv_qp_attr, sizeof(tmp_ibv_qp_attr)); mlx5_qp->cap.max_send_wr = tmp_ibv_qp_attr.cap.max_send_wr; - mlx5_qp->cap.max_recv_wr = tmp_ibv_qp_attr.cap.max_recv_wr; mlx5_qp->cap.max_send_sge = tmp_ibv_qp_attr.cap.max_send_sge; - mlx5_qp->cap.max_recv_sge = tmp_ibv_qp_attr.cap.max_recv_sge; mlx5_qp->cap.max_inline_data = tmp_ibv_qp_attr.cap.max_inline_data; out: @@ -204,27 +190,10 @@ int xlio_ib_mlx5_post_recv(xlio_ib_mlx5_qp_t *mlx5_qp, struct ibv_recv_wr *wr, if (likely(nreq)) { mlx5_qp->rq.head += nreq; - /* - * Make sure that descriptors are written before - * doorbell record. - */ - wmb(); - - /* - * For Raw Packet QP, avoid updating the doorbell record - * as long as the QP isn't in RTR state, to avoid receiving - * packets in illegal states. - * This is only for Raw Packet QPs since they are represented - * differently in the hardware. - * For DPCP RQ, the RQ state is switched along with the QP-unused-rq, - * and in such case if RQ.State == RST, doorbells are not processed anyway - * and for RDY state without a TIR incomming messages never reach RQ (PRM 8.14.1). - */ - if (likely(!((mlx5_qp->qp->qp_type == IBV_QPT_RAW_PACKET || - mlx5_qp->flags & XLIO_IB_MLX5_QP_FLAGS_USE_UNDERLAY) && - mlx5_qp->qp->state < IBV_QPS_RTR))) { - *mlx5_qp->rq.dbrec = htonl(mlx5_qp->rq.head & 0xffff); - } + wmb(); // Make sure that descriptors are written before doorbell record. + + // Buffers are posted only after the RQ is in ready state. OK to update doorbell. 
+ *mlx5_qp->rq.dbrec = htonl(mlx5_qp->rq.head & 0xffff); } return err; diff --git a/src/core/ib/mlx5/ib_mlx5.h b/src/core/ib/mlx5/ib_mlx5.h index 948e98517..bb70b7f63 100644 --- a/src/core/ib/mlx5/ib_mlx5.h +++ b/src/core/ib/mlx5/ib_mlx5.h @@ -65,15 +65,12 @@ extern "C" { */ int xlio_ib_mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t type); -enum { XLIO_IB_MLX5_QP_FLAGS_USE_UNDERLAY = 0x01 }; - enum { XLIO_IB_MLX5_CQ_SET_CI = 0, XLIO_IB_MLX5_CQ_ARM_DB = 1 }; /* Queue pair */ typedef struct xlio_ib_mlx5_qp { struct ibv_qp *qp; uint32_t qpn; - uint32_t flags; struct ibv_qp_cap cap; struct { volatile uint32_t *dbrec; @@ -482,7 +479,7 @@ enum { /* * Interfaces */ -int xlio_ib_mlx5_get_qp(struct ibv_qp *qp, xlio_ib_mlx5_qp_t *mlx5_qp, uint32_t flags = 0); +int xlio_ib_mlx5_get_qp_tx(xlio_ib_mlx5_qp_t *mlx5_qp); int xlio_ib_mlx5_post_recv(xlio_ib_mlx5_qp_t *mlx5_qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); diff --git a/src/core/sock/sockinfo_nvme.h b/src/core/sock/sockinfo_nvme.h index e4968ce61..0283dd1a7 100644 --- a/src/core/sock/sockinfo_nvme.h +++ b/src/core/sock/sockinfo_nvme.h @@ -36,7 +36,7 @@ #include #include #include "sockinfo_ulp.h" /* sockinfo_tcp_ops */ -#include "dev/qp_mgr_eth_mlx5.h" +#include "dev/qp_mgr.h" #include "proto/nvme_parse_input_args.h" #include "xlio_extra.h" #include "lwip/err.h" /* err_t */ diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index 66657c906..a4e8d172d 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -926,7 +926,6 @@ void mce_sys_var::get_env_params() enable_striding_rq = (enable_strq_env == option_strq::ON || enable_strq_env == option_strq::AUTO); - enable_dpcp_rq = (enable_striding_rq || (enable_strq_env == option_strq::REGULAR_RQ)); if (enable_striding_rq) { rx_num_bufs = MCE_DEFAULT_STRQ_NUM_BUFS; diff --git a/src/core/util/sys_vars.h b/src/core/util/sys_vars.h index b9d752df4..52f33cf67 100644 --- a/src/core/util/sys_vars.h +++ b/src/core/util/sys_vars.h @@ -415,7 +415,6 @@ struct mce_sys_var { bool disable_flow_tag; bool enable_striding_rq; - bool enable_dpcp_rq; bool tcp_3t_rules; bool udp_3t_rules; bool eth_mc_l2_only_rules; diff --git a/tests/gtest/nvme/nvme.cc b/tests/gtest/nvme/nvme.cc index 49e594de0..36c5b81b4 100644 --- a/tests/gtest/nvme/nvme.cc +++ b/tests/gtest/nvme/nvme.cc @@ -36,7 +36,7 @@ #include #include "common/def.h" #include "common/base.h" -#include "dev/qp_mgr_eth_mlx5.h" +#include "dev/qp_mgr.h" #include "proto/nvme_parse_input_args.h" #include "tcp/tcp_base.h" #include "xlio_extra.h" From 6dfaffcde0e1441d0eda8f9639dd07511168bad7 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 4 Oct 2023 17:27:16 +0300 Subject: [PATCH 014/169] issue: 3514044 Squash qp_mgr_eth_mlx5_dpcp to qp_mgr Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 4 - src/core/dev/qp_mgr.cpp | 315 ++++++++++++++++++++++++++++++----- src/core/dev/qp_mgr.h | 42 ++--- src/core/dev/ring_simple.cpp | 5 +- src/core/dev/ring_simple.h | 1 - 5 files changed, 293 insertions(+), 74 deletions(-) diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 072d1cab0..d59a53db8 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -67,13 +67,11 @@ libxlio_la_SOURCES := \ dev/cq_mgr_tx.cpp \ dev/dm_mgr.cpp \ dev/qp_mgr.cpp \ - dev/qp_mgr_eth_mlx5_dpcp.cpp \ dev/gro_mgr.cpp \ dev/rfs.cpp \ dev/rfs_uc.cpp \ dev/rfs_uc_tcp_gro.cpp \ dev/rfs_mc.cpp \ - dev/rfs_rule_ibv.cpp \ dev/rfs_rule_dpcp.cpp \ dev/time_converter.cpp \ dev/time_converter_ptp.cpp \ @@ -186,13 +184,11 @@ 
libxlio_la_SOURCES := \ dev/net_device_table_mgr.h \ dev/net_device_val.h \ dev/qp_mgr.h \ - dev/qp_mgr_eth_mlx5_dpcp.h \ dev/rfs.h \ dev/rfs_mc.h \ dev/rfs_uc.h \ dev/rfs_uc_tcp_gro.h \ dev/rfs_rule.h \ - dev/rfs_rule_ibv.h \ dev/rfs_rule_dpcp.h \ dev/src_addr_selector.h \ dev/ring.h \ diff --git a/src/core/dev/qp_mgr.cpp b/src/core/dev/qp_mgr.cpp index 85b75c760..e500b9354 100644 --- a/src/core/dev/qp_mgr.cpp +++ b/src/core/dev/qp_mgr.cpp @@ -31,6 +31,7 @@ */ #include "qp_mgr.h" +#include #include "utils/bullseye.h" #include "util/utils.h" #include "util/valgrind.h" @@ -38,11 +39,10 @@ #include "iomux/io_mux_call.h" #include "buffer_pool.h" #include "ring_simple.h" -#include "util/valgrind.h" -#include "dev/rfs_rule_ibv.h" -#include #include "cq_mgr_rx_regrq.h" +#include "cq_mgr_rx_strq.h" #include "proto/tls.h" +#include "rfs_rule_dpcp.h" #undef MODULE_NAME #define MODULE_NAME "qp_mgr" @@ -134,8 +134,7 @@ static inline uint32_t get_mlx5_opcode(xlio_ibv_wr_opcode verbs_opcode) } } -qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan, - bool call_configure) +qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan) : m_p_ring((ring_simple *)desc->ring) , m_port_num((uint8_t)desc->slave->port_num) , m_p_ib_ctx_handler((ib_ctx_handler *)desc->slave->p_ib_ctx) @@ -164,21 +163,26 @@ qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan // Check device capabilities for dummy send support m_hw_dummy_send_support = xlio_is_nop_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); - if (call_configure && configure(desc)) { - throw_xlio_exception("Failed creating qp_mgr"); - } - m_db_method = - (is_bf(((ib_ctx_handler *)desc->slave->p_ib_ctx)->get_ibv_context()) ? MLX5_DB_METHOD_BF - : MLX5_DB_METHOD_DB); + (is_bf((desc->slave->p_ib_ctx)->get_ibv_context()) ? MLX5_DB_METHOD_BF : MLX5_DB_METHOD_DB); qp_logdbg("m_db_method=%d", m_db_method); + + if (configure(desc)) { + throw_xlio_exception("Failed creating qp_mgr"); + } + + if (!configure_rq_dpcp()) { + throw_xlio_exception("Failed to create qp_mgr"); + } } qp_mgr::~qp_mgr() { qp_logfunc(""); + _rq.reset(nullptr); // Must be destroyed before RX CQ. 
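/*
 * Ordering note with an illustrative sketch (types below are hypothetical, not from this
 * patch): the DPCP RQ is created against the RX CQ's CQN, so it must be released before the
 * CQ is torn down later in the teardown sequence; the explicit reset above enforces that
 * instead of relying on member declaration order, since C++ only guarantees that members are
 * destroyed in reverse order of declaration within the same object.
 *
 *   struct owner {
 *       std::unique_ptr<rx_cq> cq;          // torn down through a separate, later path
 *       std::unique_ptr<dpcp::basic_rq> rq; // refers to cq's CQN
 *       ~owner() { rq.reset(); }            // drop the dependent object first
 *   };
 */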
+ if (m_rq_wqe_idx_to_wrid) { if (0 != munmap(m_rq_wqe_idx_to_wrid, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid))) { qp_logerr("Failed deallocating memory with munmap m_rq_wqe_idx_to_wrid (errno=%d %m)", @@ -376,6 +380,11 @@ void qp_mgr::up() { init_qp(); + _tir.reset(create_tir()); + if (!_tir) { + qp_logpanic("TIR creation for qp_mgr failed (errno=%d %m)", errno); + } + // Add buffers qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_mlx5_qp.qp)); @@ -393,6 +402,8 @@ void qp_mgr::up() void qp_mgr::down() { + _tir.reset(nullptr); + if (m_dm_enabled) { m_dm_mgr.release_resources(); } @@ -413,17 +424,6 @@ void qp_mgr::down() m_p_cq_mgr_rx->del_qp_rx(this); } -void qp_mgr::modify_qp_to_error_state() -{ - qp_logdbg(""); - - BULLSEYE_EXCLUDE_BLOCK_START - if (priv_ibv_modify_qp_to_err(m_mlx5_qp.qp)) { - qp_logdbg("ibv_modify_qp failure (errno = %d %m)", errno); - } - BULLSEYE_EXCLUDE_BLOCK_END -} - void qp_mgr::release_rx_buffers() { int total_ret = m_curr_rx_wr; @@ -547,6 +547,33 @@ void qp_mgr::modify_qp_to_ready_state() } BULLSEYE_EXCLUDE_BLOCK_END + + modify_rq_to_ready_state(); +} + +void qp_mgr::modify_qp_to_error_state() +{ + qp_logdbg(""); + + m_p_cq_mgr_rx->clean_cq(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (priv_ibv_modify_qp_to_err(m_mlx5_qp.qp)) { + qp_logdbg("ibv_modify_qp failure (errno = %d %m)", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + + dpcp::status rc = _rq->modify_state(dpcp::RQ_ERR); + + /* During plugout theres is possibility that kernel + * remove device resources before working process complete + * removing process. As a result ibv api function can + * return EIO=5 errno code. + */ + if (dpcp::DPCP_OK != rc && errno != EIO) { + qp_logerr("Failed to modify rq state to ERR, rc: %d, rqn: %" PRIu32, static_cast(rc), + m_mlx5_qp.rqn); + } } int qp_mgr::prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr) @@ -612,15 +639,15 @@ int qp_mgr::modify_qp_ratelimit(struct xlio_rate_limit_t &rate_limit, uint32_t r rfs_rule *qp_mgr::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext) { - if (unlikely(tir_ext != NULL)) { - qp_logwarn("Requested steering rule cannot be created. Consider " - "building XLIO with DPCP support or disabling legacy RQ mode."); - return nullptr; - } + if (m_p_ib_ctx_handler && m_p_ib_ctx_handler->get_dpcp_adapter()) { + // TLS RX uses tir_ext. + dpcp::tir *dpcp_tir = (tir_ext ? 
xlio_tir_to_dpcp_tir(tir_ext) : _tir.get()); - unique_ptr new_rule(new rfs_rule_ibv()); - if (new_rule->create(attrs, this->get_ibv_qp())) { - return new_rule.release(); + std::unique_ptr new_rule(new rfs_rule_dpcp()); + if (dpcp_tir && + new_rule->create(attrs, *dpcp_tir, *m_p_ib_ctx_handler->get_dpcp_adapter())) { + return new_rule.release(); + } } return nullptr; @@ -731,9 +758,10 @@ void qp_mgr::update_next_wqe_hot() void qp_mgr::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) { - m_ibv_rx_sg_array[m_curr_rx_wr].addr = (uintptr_t)p_mem_buf_desc->p_buffer; - m_ibv_rx_sg_array[m_curr_rx_wr].length = p_mem_buf_desc->sz_buffer; - m_ibv_rx_sg_array[m_curr_rx_wr].lkey = p_mem_buf_desc->lkey; + uint32_t index = (m_curr_rx_wr * m_mlx5_qp.cap.max_recv_sge) + _strq_wqe_reserved_seg; + m_ibv_rx_sg_array[index].addr = (uintptr_t)p_mem_buf_desc->p_buffer; + m_ibv_rx_sg_array[index].length = p_mem_buf_desc->sz_buffer; + m_ibv_rx_sg_array[index].lkey = p_mem_buf_desc->lkey; post_recv_buffer_rq(p_mem_buf_desc); } @@ -806,9 +834,18 @@ bool qp_mgr::init_rx_cq_mgr_prepare() cq_mgr_rx *qp_mgr::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) { - return (!init_rx_cq_mgr_prepare() ? NULL - : new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, - m_rx_num_wr, p_rx_comp_event_channel)); + if (!init_rx_cq_mgr_prepare()) { + return nullptr; + } + + if (safe_mce_sys().enable_striding_rq) { + return new cq_mgr_rx_strq(m_p_ring, m_p_ib_ctx_handler, + safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, + safe_mce_sys().strq_stride_size_bytes, + safe_mce_sys().strq_stride_num_per_rwqe, p_rx_comp_event_channel); + } + + return new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, p_rx_comp_event_channel); } cq_mgr_tx *qp_mgr::init_tx_cq_mgr() @@ -1342,13 +1379,13 @@ xlio_tir *qp_mgr::tls_create_tir(bool cached) tir = m_tls_tir_cache.back(); m_tls_tir_cache.pop_back(); } else if (!cached) { - dpcp::tir *_tir = create_tir(true); + dpcp::tir *new_tir = create_tir(true); - if (_tir != NULL) { - tir = new xlio_tir(_tir, xlio_ti::ti_type::TLS_TIR); + if (new_tir != NULL) { + tir = new xlio_tir(new_tir, xlio_ti::ti_type::TLS_TIR); } - if (unlikely(tir == NULL && _tir != NULL)) { - delete _tir; + if (unlikely(tir == NULL && new_tir != NULL)) { + delete new_tir; } } return tir; @@ -1628,11 +1665,6 @@ void qp_mgr::tls_release_tir(xlio_tir *tir) put_tls_tir_in_cache(tir); } } - -dpcp::tir *qp_mgr::xlio_tir_to_dpcp_tir(xlio_tir *tir) -{ - return tir->m_p_tir.get(); -} #else /* DEFINED_UTLS */ void qp_mgr::ti_released(xlio_ti *) {}; void qp_mgr::destroy_tis_cache(void) {}; @@ -1914,3 +1946,194 @@ void qp_mgr::reset_inflight_zc_buffers_ctx(void *ctx) } while (p && is_sq_wqe_prop_valid(p, prev)); } } + +dpcp::tir *qp_mgr::create_tir(bool is_tls /*=false*/) +{ + dpcp::tir *tir_obj = nullptr; + dpcp::status status = dpcp::DPCP_OK; + dpcp::tir::attr tir_attr; + + memset(&tir_attr, 0, sizeof(tir_attr)); + tir_attr.flags = dpcp::TIR_ATTR_INLINE_RQN | dpcp::TIR_ATTR_TRANSPORT_DOMAIN; + tir_attr.inline_rqn = m_mlx5_qp.rqn; + tir_attr.transport_domain = m_p_ib_ctx_handler->get_dpcp_adapter()->get_td(); + + if (m_p_ring->m_lro.cap && m_p_ring->m_lro.max_payload_sz) { + tir_attr.flags |= dpcp::TIR_ATTR_LRO; + tir_attr.lro.timeout_period_usecs = XLIO_MLX5_PARAMS_LRO_TIMEOUT; + tir_attr.lro.enable_mask = 3; // Bitmask for IPv4 and IPv6 support + tir_attr.lro.max_msg_sz = m_p_ring->m_lro.max_payload_sz >> 8; + } + + if (is_tls) { + tir_attr.flags |= dpcp::TIR_ATTR_TLS; + tir_attr.tls_en = 1; + } + + status = 
m_p_ib_ctx_handler->get_dpcp_adapter()->create_tir(tir_attr, tir_obj); + + if (dpcp::DPCP_OK != status) { + qp_logerr("Failed creating dpcp tir with flags=0x%x status=%d", tir_attr.flags, status); + return nullptr; + } + + qp_logdbg("TIR: %p created", tir_obj); + + return tir_obj; +} + +void qp_mgr::modify_rq_to_ready_state() +{ + dpcp::status rc = _rq->modify_state(dpcp::RQ_RDY); + if (dpcp::DPCP_OK != rc) { + qp_logerr("Failed to modify rq state to RDY, rc: %d, rqn: %" PRIu32, static_cast(rc), + m_mlx5_qp.rqn); + } +} + +bool qp_mgr::configure_rq_dpcp() +{ + qp_logdbg("Creating RQ of transport type '%s' on ibv device '%s' [%p] on port %d", + priv_xlio_transport_type_str(m_p_ring->get_transport_type()), + m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), m_port_num); + + m_mlx5_qp.cap.max_recv_wr = m_rx_num_wr; + + qp_logdbg("Requested RQ parameters: wre: rx = %d sge: rx = %d", m_mlx5_qp.cap.max_recv_wr, + m_mlx5_qp.cap.max_recv_sge); + + xlio_ib_mlx5_cq_t mlx5_cq; + memset(&mlx5_cq, 0, sizeof(mlx5_cq)); + xlio_ib_mlx5_get_cq(m_p_cq_mgr_rx->get_ibv_cq_hndl(), &mlx5_cq); + + qp_logdbg("Configuring dpcp RQ, cq-rx: %p, cqn-rx: %u", m_p_cq_mgr_rx, + static_cast(mlx5_cq.cq_num)); + + if (safe_mce_sys().enable_striding_rq) { + m_mlx5_qp.cap.max_recv_sge = 2U; // Striding-RQ needs a reserved segment. + _strq_wqe_reserved_seg = 1U; + + delete[] m_ibv_rx_sg_array; + m_ibv_rx_sg_array = + new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv * m_mlx5_qp.cap.max_recv_sge]; + for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) { + m_ibv_rx_wr_array[wr_idx].sg_list = + &m_ibv_rx_sg_array[wr_idx * m_mlx5_qp.cap.max_recv_sge]; + m_ibv_rx_wr_array[wr_idx].num_sge = m_mlx5_qp.cap.max_recv_sge; + memset(m_ibv_rx_wr_array[wr_idx].sg_list, 0, sizeof(ibv_sge)); + m_ibv_rx_wr_array[wr_idx].sg_list[0].length = + 1U; // To bypass a check inside xlio_ib_mlx5_post_recv. + } + } + + // Create the QP + if (!prepare_rq(mlx5_cq.cq_num)) { + return false; + } + + return true; +} + +bool qp_mgr::prepare_rq(uint32_t cqn) +{ + qp_logdbg(""); + + dpcp::adapter *dpcp_adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); + if (!dpcp_adapter) { + qp_logerr("Failed to get dpcp::adapter for prepare_rq"); + return false; + } + + // user_index Unused. + dpcp::rq_attr rqattrs; + memset(&rqattrs, 0, sizeof(rqattrs)); + rqattrs.cqn = cqn; + rqattrs.wqe_num = m_mlx5_qp.cap.max_recv_wr; + rqattrs.wqe_sz = m_mlx5_qp.cap.max_recv_sge; + + if (safe_mce_sys().hw_ts_conversion_mode == TS_CONVERSION_MODE_RTC) { + qp_logdbg("Enabled RTC timestamp format for RQ"); + rqattrs.ts_format = dpcp::rq_ts_format::RQ_TS_REAL_TIME; + } + + std::unique_ptr new_rq; + dpcp::status rc = dpcp::DPCP_OK; + + if (safe_mce_sys().enable_striding_rq) { + rqattrs.buf_stride_sz = safe_mce_sys().strq_stride_size_bytes; + rqattrs.buf_stride_num = safe_mce_sys().strq_stride_num_per_rwqe; + + // Striding-RQ WQE format is as of Shared-RQ (PRM, page 381, wq_type). + // In this case the WQE minimum size is 2 * 16, and the first segment is reserved. 
+ rqattrs.wqe_sz = m_mlx5_qp.cap.max_recv_sge * 16U; + + dpcp::striding_rq *new_rq_ptr = nullptr; + rc = dpcp_adapter->create_striding_rq(rqattrs, new_rq_ptr); + new_rq.reset(new_rq_ptr); + } else { + dpcp::regular_rq *new_rq_ptr = nullptr; + rc = dpcp_adapter->create_regular_rq(rqattrs, new_rq_ptr); + new_rq.reset(new_rq_ptr); + } + + if (dpcp::DPCP_OK != rc) { + qp_logerr("Failed to create dpcp rq, rc: %d, cqn: %" PRIu32, static_cast(rc), cqn); + return false; + } + + if (!store_rq_mlx5_params(*new_rq)) { + qp_logerr( + "Failed to retrieve initial DPCP RQ parameters, rc: %d, basic_rq: %p, cqn: %" PRIu32, + static_cast(rc), new_rq.get(), cqn); + return false; + } + + _rq = std::move(new_rq); + + // At this stage there is no TIR associated with the RQ, So it mimics QP INIT state. + // At RDY state without a TIR, Work Requests can be submitted to the RQ. + modify_rq_to_ready_state(); + + qp_logdbg("Succeeded to create dpcp rq, rqn: %" PRIu32 ", cqn: %" PRIu32, m_mlx5_qp.rqn, cqn); + + return true; +} + +bool qp_mgr::store_rq_mlx5_params(dpcp::basic_rq &new_rq) +{ + uint32_t *dbrec_tmp = nullptr; + dpcp::status rc = new_rq.get_dbrec(dbrec_tmp); + if (dpcp::DPCP_OK != rc) { + qp_logerr("Failed to retrieve dbrec of dpcp rq, rc: %d, basic_rq: %p", static_cast(rc), + &new_rq); + return false; + } + m_mlx5_qp.rq.dbrec = dbrec_tmp; + + rc = new_rq.get_wq_buf(m_mlx5_qp.rq.buf); + if (dpcp::DPCP_OK != rc) { + qp_logerr("Failed to retrieve wq-buf of dpcp rq, rc: %d, basic_rq: %p", + static_cast(rc), &new_rq); + return false; + } + + rc = new_rq.get_id(m_mlx5_qp.rqn); + if (dpcp::DPCP_OK != rc) { + qp_logerr("Failed to retrieve rqn of dpcp rq, rc: %d, basic_rq: %p", static_cast(rc), + &new_rq); + return false; + } + + new_rq.get_wqe_num(m_mlx5_qp.rq.wqe_cnt); + new_rq.get_wq_stride_sz(m_mlx5_qp.rq.stride); + if (safe_mce_sys().enable_striding_rq) { + m_mlx5_qp.rq.stride /= 16U; + } + + m_mlx5_qp.rq.wqe_shift = ilog_2(m_mlx5_qp.rq.stride); + m_mlx5_qp.rq.head = 0; + m_mlx5_qp.rq.tail = 0; + m_mlx5_qp.tirn = 0U; + + return true; +} diff --git a/src/core/dev/qp_mgr.h b/src/core/dev/qp_mgr.h index 90e12b479..14b880369 100644 --- a/src/core/dev/qp_mgr.h +++ b/src/core/dev/qp_mgr.h @@ -256,14 +256,14 @@ class qp_mgr { friend class cq_mgr_tx; public: - qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan, bool call_configure); - virtual ~qp_mgr(); + qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan); + ~qp_mgr(); - virtual void up(); - virtual void down(); + void up(); + void down(); // Post for receive single mem_buf_desc - virtual void post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc); + void post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc); // Post for receive a list of mem_buf_desc void post_recv_buffers(descq_t *p_buffers, size_t count); @@ -282,8 +282,8 @@ class qp_mgr { // chain of calls may serve as cache warm for dummy send feature. 
inline bool get_hw_dummy_send_support() { return m_hw_dummy_send_support; } - virtual void modify_qp_to_ready_state(); - virtual void modify_qp_to_error_state(); + void modify_qp_to_ready_state(); + void modify_qp_to_error_state(); void release_rx_buffers(); void release_tx_buffers(); @@ -291,7 +291,7 @@ class qp_mgr { int modify_qp_ratelimit(struct xlio_rate_limit_t &rate_limit, uint32_t rl_changes); void dm_release_data(mem_buf_desc_t *buff) { m_dm_mgr.release_data(buff); } - virtual rfs_rule *create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext); + rfs_rule *create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext); #ifdef DEFINED_UTLS xlio_tis *tls_context_setup_tx(const xlio_tls_info *info) override; @@ -445,21 +445,12 @@ class qp_mgr { return m_n_unsignaled_count == m_n_sysvar_tx_num_wr_to_signal - 1; } - virtual cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel); + cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel); cq_mgr_tx *init_tx_cq_mgr(); int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, xlio_tis *tis, unsigned credits); -#if defined(DEFINED_UTLS) - dpcp::tir *xlio_tir_to_dpcp_tir(xlio_tir *tir); - virtual dpcp::tir *create_tir(bool is_tls = false) - { - NOT_IN_USE(is_tls); - return NULL; - } -#endif /* DEFINED_UTLS */ - private: void trigger_completion_for_all_sent_packets(); void update_next_wqe_hot(); @@ -467,13 +458,22 @@ class qp_mgr { void ti_released(xlio_ti *ti); void put_tls_tir_in_cache(xlio_tir *tir); void put_tls_tis_in_cache(xlio_tis *tis); - bool is_rq_empty() const override { return (m_mlx5_qp.rq.head == m_mlx5_qp.rq.tail); } + void modify_rq_to_ready_state(); + bool prepare_rq(uint32_t cqn); + bool configure_rq_dpcp(); + bool store_rq_mlx5_params(dpcp::basic_rq &new_rq); + bool is_rq_empty() const { return (m_mlx5_qp.rq.head == m_mlx5_qp.rq.tail); } bool is_completion_need() const { return !m_n_unsignaled_count || (m_dm_enabled && m_dm_mgr.is_completion_need()); } + dpcp::tir *create_tir(bool is_tls = false); + + dpcp::tir *xlio_tir_to_dpcp_tir(xlio_tir *tir) { return tir->m_p_tir.get(); } + #if defined(DEFINED_UTLS) + inline void tls_fill_static_params_wqe(struct mlx5_wqe_tls_static_params_seg *params, const struct xlio_tls_info *info, uint32_t key_id, uint32_t resync_tcp_sn); @@ -533,6 +533,10 @@ class qp_mgr { std::list> m_tls_dek_get_cache; std::list> m_tls_dek_put_cache; #endif + + std::unique_ptr _tir = {nullptr}; + std::unique_ptr _rq = {nullptr}; + uint32_t _strq_wqe_reserved_seg = 0U; }; #endif diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index cf50f1ab7..06c5f5e56 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -36,9 +36,6 @@ #include "util/valgrind.h" #include "util/sg_array.h" #include "sock/fd_collection.h" -#if defined(DEFINED_DIRECT_VERBS) -#include "dev/qp_mgr_eth_mlx5_dpcp.h" -#endif #undef MODULE_NAME #define MODULE_NAME "ring_simple" @@ -373,7 +370,7 @@ void ring_simple::create_resources() desc.ring = this; desc.slave = p_slave; desc.rx_comp_event_channel = m_p_rx_comp_event_channel; - m_p_qp_mgr = new qp_mgr_eth_mlx5_dpcp(&desc, get_tx_num_wr(), m_partition); + m_p_qp_mgr = new qp_mgr(&desc, get_tx_num_wr(), m_partition); BULLSEYE_EXCLUDE_BLOCK_START if (m_p_qp_mgr == NULL) { ring_logerr("Failed to allocate qp_mgr!"); diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index e4d8005e2..c581dae52 100644 --- a/src/core/dev/ring_simple.h +++ 
b/src/core/dev/ring_simple.h @@ -290,7 +290,6 @@ class ring_simple : public ring_slave { friend class cq_mgr_rx_regrq; friend class cq_mgr_rx_strq; friend class qp_mgr; - friend class qp_mgr_eth_mlx5_dpcp; friend class rfs; friend class rfs_uc; friend class rfs_uc_tcp_gro; From c890d0de285af69bfb4b6061708cfb0803ad0206 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 8 Oct 2023 15:10:35 +0300 Subject: [PATCH 015/169] issue: 3514044 Split qp_mgr to hw_queue_tx and hw_queue_rx Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 6 +- src/core/dev/cq_mgr_rx.cpp | 60 +- src/core/dev/cq_mgr_rx.h | 9 +- src/core/dev/cq_mgr_rx.inl | 2 + src/core/dev/cq_mgr_rx_regrq.cpp | 19 +- src/core/dev/cq_mgr_rx_strq.cpp | 22 +- src/core/dev/cq_mgr_rx_strq.h | 2 +- src/core/dev/cq_mgr_tx.cpp | 36 +- src/core/dev/cq_mgr_tx.h | 8 +- src/core/dev/hw_queue_rx.cpp | 636 ++++++++++ src/core/dev/hw_queue_rx.h | 132 ++ src/core/dev/{qp_mgr.cpp => hw_queue_tx.cpp} | 1184 +++++------------- src/core/dev/{qp_mgr.h => hw_queue_tx.h} | 376 ++---- src/core/dev/net_device_val.h | 8 +- src/core/dev/rfs.cpp | 5 +- src/core/dev/rfs.h | 16 +- src/core/dev/rfs_mc.cpp | 6 +- src/core/dev/rfs_mc.h | 6 +- src/core/dev/rfs_uc.cpp | 6 +- src/core/dev/rfs_uc.h | 6 +- src/core/dev/ring.h | 7 +- src/core/dev/ring_bond.cpp | 6 +- src/core/dev/ring_simple.cpp | 141 ++- src/core/dev/ring_simple.h | 72 +- src/core/dev/ring_slave.cpp | 6 +- src/core/dev/ring_slave.h | 2 +- src/core/dev/xlio_ti.h | 184 +++ src/core/ib/mlx5/ib_mlx5.cpp | 64 - src/core/ib/mlx5/ib_mlx5.h | 11 - src/core/proto/route_table_mgr.cpp | 1 + src/core/sock/sockinfo_nvme.h | 2 +- tests/gtest/nvme/nvme.cc | 2 +- 32 files changed, 1601 insertions(+), 1442 deletions(-) create mode 100644 src/core/dev/hw_queue_rx.cpp create mode 100644 src/core/dev/hw_queue_rx.h rename src/core/dev/{qp_mgr.cpp => hw_queue_tx.cpp} (62%) rename src/core/dev/{qp_mgr.h => hw_queue_tx.h} (62%) create mode 100644 src/core/dev/xlio_ti.h diff --git a/src/core/Makefile.am b/src/core/Makefile.am index d59a53db8..214498bdd 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -66,7 +66,8 @@ libxlio_la_SOURCES := \ dev/cq_mgr_rx_strq.cpp \ dev/cq_mgr_tx.cpp \ dev/dm_mgr.cpp \ - dev/qp_mgr.cpp \ + dev/hw_queue_tx.cpp \ + dev/hw_queue_rx.cpp \ dev/gro_mgr.cpp \ dev/rfs.cpp \ dev/rfs_uc.cpp \ @@ -183,7 +184,8 @@ libxlio_la_SOURCES := \ dev/net_device_entry.h \ dev/net_device_table_mgr.h \ dev/net_device_val.h \ - dev/qp_mgr.h \ + dev/hw_queue_rx.h \ + dev/hw_queue_tx.h \ dev/rfs.h \ dev/rfs_mc.h \ dev/rfs_uc.h \ diff --git a/src/core/dev/cq_mgr_rx.cpp b/src/core/dev/cq_mgr_rx.cpp index bcf8924d2..c0baa098e 100644 --- a/src/core/dev/cq_mgr_rx.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -46,7 +46,7 @@ #include "ib/base/verbs_extra.h" #include "buffer_pool.h" -#include "qp_mgr.h" +#include "hw_queue_rx.h" #include "ring_simple.h" #define MODULE_NAME "cq_mgr_rx" @@ -121,6 +121,8 @@ void cq_mgr_rx::configure(int cq_size) comp_vector, &attr); BULLSEYE_EXCLUDE_BLOCK_START if (!m_p_ibv_cq) { + cq_logerr("Failed to create CQ, this: %p, ctx: %p size: %d compch: %p", this, context, + cq_size - 1, m_comp_event_channel); throw_xlio_exception("ibv_create_cq failed"); } BULLSEYE_EXCLUDE_BLOCK_END @@ -182,10 +184,10 @@ void cq_mgr_rx::statistics_print() } } -void cq_mgr_rx::add_qp_rx(qp_mgr *qp) +void cq_mgr_rx::add_hqrx(hw_queue_rx *hqrx_ptr) { - m_qp = qp; - m_qp->m_rq_wqe_counter = 0; // In case of bonded qp, wqe_counter must be reset to zero + m_hqrx_ptr = hqrx_ptr; + 
m_hqrx_ptr->m_rq_wqe_counter = 0; // In case of bonded hqrx, wqe_counter must be reset to zero m_rx_hot_buffer = NULL; if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { @@ -193,7 +195,7 @@ void cq_mgr_rx::add_qp_rx(qp_mgr *qp) } VALGRIND_MAKE_MEM_DEFINED(&m_mlx5_cq, sizeof(m_mlx5_cq)); - cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, + cq_logfunc("hqrx_ptr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", hqrx_ptr, m_mlx5_cq.dbrec, m_mlx5_cq.cq_buf); descq_t temp_desc_list; @@ -204,55 +206,55 @@ void cq_mgr_rx::add_qp_rx(qp_mgr *qp) /* return_extra_buffers(); */ // todo?? // Initial fill of receiver work requests - uint32_t qp_rx_wr_num = qp->get_rx_max_wr_num(); - cq_logdbg("Trying to push %d WRE to allocated qp (%p)", qp_rx_wr_num, qp); - while (qp_rx_wr_num) { + uint32_t hqrx_wr_num = hqrx_ptr->get_rx_max_wr_num(); + cq_logdbg("Trying to push %d WRE to allocated hqrx (%p)", hqrx_wr_num, hqrx_ptr); + while (hqrx_wr_num) { uint32_t n_num_mem_bufs = m_n_sysvar_rx_num_wr_to_post_recv; - if (n_num_mem_bufs > qp_rx_wr_num) { - n_num_mem_bufs = qp_rx_wr_num; + if (n_num_mem_bufs > hqrx_wr_num) { + n_num_mem_bufs = hqrx_wr_num; } bool res = g_buffer_pool_rx_rwqe->get_buffers_thread_safe(temp_desc_list, m_p_ring, n_num_mem_bufs, m_rx_lkey); if (!res) { VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS( VLOG_WARNING, VLOG_DEBUG, - "WARNING Out of mem_buf_desc from Rx buffer pool for qp_mgr qp_mgr initialization " - "(qp=%p),\n" + "WARNING Out of mem_buf_desc from Rx buffer pool for hqrx initialization " + "(hqrx_ptr=%p),\n" "\tThis might happen due to wrong setting of XLIO_RX_BUFS and XLIO_RX_WRE. Please " "refer to README.txt for more info", - qp); + hqrx_ptr); break; } - qp->post_recv_buffers(&temp_desc_list, temp_desc_list.size()); + hqrx_ptr->post_recv_buffers(&temp_desc_list, temp_desc_list.size()); if (!temp_desc_list.empty()) { - cq_logdbg("qp post recv is already full (push=%d, planned=%d)", - qp->get_rx_max_wr_num() - qp_rx_wr_num, qp->get_rx_max_wr_num()); + cq_logdbg("hqrx_ptr post recv is already full (push=%d, planned=%d)", + hqrx_ptr->get_rx_max_wr_num() - hqrx_wr_num, hqrx_ptr->get_rx_max_wr_num()); g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&temp_desc_list, temp_desc_list.size()); break; } - qp_rx_wr_num -= n_num_mem_bufs; + hqrx_wr_num -= n_num_mem_bufs; } - cq_logdbg("Successfully post_recv qp with %d new Rx buffers (planned=%d)", - qp->get_rx_max_wr_num() - qp_rx_wr_num, qp->get_rx_max_wr_num()); + cq_logdbg("Successfully post_recv hqrx with %d new Rx buffers (planned=%d)", + hqrx_ptr->get_rx_max_wr_num() - hqrx_wr_num, hqrx_ptr->get_rx_max_wr_num()); m_debt = 0; } -void cq_mgr_rx::del_qp_rx(qp_mgr *qp) +void cq_mgr_rx::del_hqrx(hw_queue_rx *hqrx_ptr) { BULLSEYE_EXCLUDE_BLOCK_START - if (m_qp != qp) { - cq_logdbg("wrong qp_mgr=%p != m_qp=%p", qp, m_qp); + if (m_hqrx_ptr != hqrx_ptr) { + cq_logdbg("wrong hqrx_ptr=%p != m_hqrx_ptr=%p", hqrx_ptr, m_hqrx_ptr); return; } BULLSEYE_EXCLUDE_BLOCK_END - cq_logdbg("qp_mgr=%p", m_qp); + cq_logdbg("m_hqrx_ptr=%p", m_hqrx_ptr); return_extra_buffers(); clean_cq(); - m_qp = nullptr; + m_hqrx_ptr = nullptr; m_debt = 0; } @@ -376,13 +378,13 @@ bool cq_mgr_rx::compensate_qp_poll_success(mem_buf_desc_t *buff_cur) // Compensate QP for all completions that we found if (m_rx_pool.size() || request_more_buffers()) { size_t buffers = std::min(m_debt, m_rx_pool.size()); - m_qp->post_recv_buffers(&m_rx_pool, buffers); + m_hqrx_ptr->post_recv_buffers(&m_rx_pool, buffers); m_debt -= buffers; m_p_cq_stat->n_buffer_pool_len = 
m_rx_pool.size(); } else if (m_b_sysvar_cq_keep_qp_full || - m_debt + MCE_MAX_CQ_POLL_BATCH > (int)m_qp->m_rx_num_wr) { + m_debt + MCE_MAX_CQ_POLL_BATCH > (int)m_hqrx_ptr->m_rx_num_wr) { m_p_cq_stat->n_rx_pkt_drop++; - m_qp->post_recv_buffer(buff_cur); + m_hqrx_ptr->post_recv_buffer(buff_cur); --m_debt; return true; } @@ -397,7 +399,7 @@ void cq_mgr_rx::compensate_qp_poll_failed() if (m_debt) { if (likely(m_rx_pool.size() || request_more_buffers())) { size_t buffers = std::min(m_debt, m_rx_pool.size()); - m_qp->post_recv_buffers(&m_rx_pool, buffers); + m_hqrx_ptr->post_recv_buffers(&m_rx_pool, buffers); m_debt -= buffers; m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); } @@ -515,7 +517,7 @@ int cq_mgr_rx::request_notification(uint64_t poll_sn) // Arm the CQ notification channel IF_VERBS_FAILURE(xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0)) { - cq_logerr("Failure arming the qp_mgr notification channel (errno=%d %m)", errno); + cq_logerr("Failure arming the RX notification channel (errno=%d %m)", errno); } else { diff --git a/src/core/dev/cq_mgr_rx.h b/src/core/dev/cq_mgr_rx.h index 6b9798fd6..9de2e64a5 100644 --- a/src/core/dev/cq_mgr_rx.h +++ b/src/core/dev/cq_mgr_rx.h @@ -35,7 +35,6 @@ #include "ib/base/verbs_extra.h" #include "utils/atomic.h" -#include "dev/qp_mgr.h" #include "dev/ib_ctx_handler.h" #include "util/sys_vars.h" #include "util/xlio_stats.h" @@ -56,7 +55,7 @@ class net_device_mgr; class ring; -class qp_mgr; +class hw_queue_rx; class ring_simple; /* Get CQE opcode. */ @@ -138,8 +137,8 @@ class cq_mgr_rx { void mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, void *pv_fd_ready_array = NULL); - virtual void add_qp_rx(qp_mgr *qp); - virtual void del_qp_rx(qp_mgr *qp); + virtual void add_hqrx(hw_queue_rx *hqrx_ptr); + virtual void del_hqrx(hw_queue_rx *hqrx_ptr); virtual uint32_t clean_cq() = 0; @@ -178,7 +177,7 @@ class cq_mgr_rx { virtual void statistics_print(); xlio_ib_mlx5_cq_t m_mlx5_cq; - qp_mgr *m_qp = nullptr; + hw_queue_rx *m_hqrx_ptr = nullptr; mem_buf_desc_t *m_rx_hot_buffer = nullptr; struct ibv_cq *m_p_ibv_cq = nullptr; descq_t m_rx_queue; diff --git a/src/core/dev/cq_mgr_rx.inl b/src/core/dev/cq_mgr_rx.inl index 7ab65f966..c03fe082e 100644 --- a/src/core/dev/cq_mgr_rx.inl +++ b/src/core/dev/cq_mgr_rx.inl @@ -36,6 +36,8 @@ #include "cq_mgr_rx.h" #include "ring_simple.h" #include "util/utils.h" +#include +#include /**/ /** inlining functions can only help if they are implemented before their usage **/ diff --git a/src/core/dev/cq_mgr_rx_regrq.cpp b/src/core/dev/cq_mgr_rx_regrq.cpp index 93a7460e5..91881f64e 100644 --- a/src/core/dev/cq_mgr_rx_regrq.cpp +++ b/src/core/dev/cq_mgr_rx_regrq.cpp @@ -36,7 +36,7 @@ #include #include "cq_mgr_rx.inl" -#include "qp_mgr.h" +#include "hw_queue_rx.h" #include "ring_simple.h" #include @@ -63,12 +63,7 @@ uint32_t cq_mgr_rx_regrq::clean_cq() uint64_t cq_poll_sn = 0; mem_buf_desc_t *buff; - /* Sanity check for cq: initialization of tx and rx cq has difference: - * tx - is done in qp_mgr::configure() - * rx - is done in qp_mgr::up() - * as a result rx cq can be created but not initialized - */ - if (NULL == m_qp) { + if (NULL == m_hqrx_ptr) { // Sanity check return 0; } @@ -94,10 +89,10 @@ mem_buf_desc_t *cq_mgr_rx_regrq::poll(enum buff_status_e &status) mem_buf_desc_t *buff = NULL; if (unlikely(NULL == m_rx_hot_buffer)) { - if (likely(m_qp->m_mlx5_qp.rq.tail != (m_qp->m_mlx5_qp.rq.head))) { - uint32_t index = m_qp->m_mlx5_qp.rq.tail & (m_qp->m_rx_num_wr - 1); - m_rx_hot_buffer = (mem_buf_desc_t 
*)m_qp->m_rq_wqe_idx_to_wrid[index]; - m_qp->m_rq_wqe_idx_to_wrid[index] = 0; + if (likely(m_hqrx_ptr->m_rq_data.tail != (m_hqrx_ptr->m_rq_data.head))) { + uint32_t index = m_hqrx_ptr->m_rq_data.tail & (m_hqrx_ptr->m_rx_num_wr - 1); + m_rx_hot_buffer = (mem_buf_desc_t *)m_hqrx_ptr->m_rq_wqe_idx_to_wrid[index]; + m_hqrx_ptr->m_rq_wqe_idx_to_wrid[index] = 0; prefetch((void *)m_rx_hot_buffer); prefetch((uint8_t *)m_mlx5_cq.cq_buf + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) << m_mlx5_cq.cqe_size_log)); @@ -114,7 +109,7 @@ mem_buf_desc_t *cq_mgr_rx_regrq::poll(enum buff_status_e &status) rmb(); cqe_to_mem_buff_desc(cqe, m_rx_hot_buffer, status); - ++m_qp->m_mlx5_qp.rq.tail; + ++m_hqrx_ptr->m_rq_data.tail; *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci & 0xffffff); buff = m_rx_hot_buffer; diff --git a/src/core/dev/cq_mgr_rx_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp index 5c8e6ef70..647f30d38 100644 --- a/src/core/dev/cq_mgr_rx_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ -36,7 +36,7 @@ #include #include "cq_mgr_rx.inl" -#include "qp_mgr.h" +#include "hw_queue_rx.h" #include "ring_simple.h" #include @@ -128,11 +128,7 @@ uint32_t cq_mgr_rx_strq::clean_cq() uint32_t ret_total = 0; uint64_t cq_poll_sn = 0; - /* Sanity check for cq: initialization of tx and rx cq has difference: - * rx - is done in qp_mgr::up() - * as a result rx cq can be created but not initialized - */ - if (NULL == m_qp) { + if (NULL == m_hqrx_ptr) { // Sanity check return 0; } @@ -154,11 +150,11 @@ uint32_t cq_mgr_rx_strq::clean_cq() bool cq_mgr_rx_strq::set_current_hot_buffer() { - if (likely(m_qp->m_mlx5_qp.rq.tail != (m_qp->m_mlx5_qp.rq.head))) { - uint32_t index = m_qp->m_mlx5_qp.rq.tail & (m_qp->m_rx_num_wr - 1); - m_rx_hot_buffer = (mem_buf_desc_t *)m_qp->m_rq_wqe_idx_to_wrid[index]; + if (likely(m_hqrx_ptr->m_rq_data.tail != (m_hqrx_ptr->m_rq_data.head))) { + uint32_t index = m_hqrx_ptr->m_rq_data.tail & (m_hqrx_ptr->m_rx_num_wr - 1); + m_rx_hot_buffer = (mem_buf_desc_t *)m_hqrx_ptr->m_rq_wqe_idx_to_wrid[index]; m_rx_hot_buffer->set_ref_count(_strides_num); - m_qp->m_rq_wqe_idx_to_wrid[index] = 0; + m_hqrx_ptr->m_rq_wqe_idx_to_wrid[index] = 0; return true; } @@ -195,7 +191,7 @@ mem_buf_desc_t *cq_mgr_rx_strq::poll(enum buff_status_e &status, mem_buf_desc_t bool is_wqe_complete = strq_cqe_to_mem_buff_desc(cqe, status, is_filler); if (is_wqe_complete) { - ++m_qp->m_mlx5_qp.rq.tail; + ++m_hqrx_ptr->m_rq_data.tail; buff = m_rx_hot_buffer; m_rx_hot_buffer = NULL; if (likely(status == BS_OK)) { @@ -508,12 +504,12 @@ int cq_mgr_rx_strq::poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv return ret_rx_processed; } -void cq_mgr_rx_strq::add_qp_rx(qp_mgr *qp) +void cq_mgr_rx_strq::add_hqrx(hw_queue_rx *hqrx) { cq_logfunc(""); _hot_buffer_stride = nullptr; _current_wqe_consumed_bytes = 0U; - cq_mgr_rx::add_qp_rx(qp); + cq_mgr_rx::add_hqrx(hqrx); } void cq_mgr_rx_strq::statistics_print() diff --git a/src/core/dev/cq_mgr_rx_strq.h b/src/core/dev/cq_mgr_rx_strq.h index fbad4d003..3852465f2 100644 --- a/src/core/dev/cq_mgr_rx_strq.h +++ b/src/core/dev/cq_mgr_rx_strq.h @@ -49,7 +49,7 @@ class cq_mgr_rx_strq : public cq_mgr_rx { virtual mem_buf_desc_t *poll_and_process_socketxtreme() override; virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL) override; - virtual void add_qp_rx(qp_mgr *qp) override; + virtual void add_hqrx(hw_queue_rx *qp) override; virtual uint32_t clean_cq() override; protected: diff --git a/src/core/dev/cq_mgr_tx.cpp b/src/core/dev/cq_mgr_tx.cpp index 
5a7919a0d..c144d7dfa 100644 --- a/src/core/dev/cq_mgr_tx.cpp +++ b/src/core/dev/cq_mgr_tx.cpp @@ -34,7 +34,7 @@ #include #include #include "ring_simple.h" -#include "qp_mgr.h" +#include "hw_queue_tx.h" #define MODULE_NAME "cq_mgr_tx" @@ -168,30 +168,30 @@ void cq_mgr_tx::configure(int cq_size) get_channel_fd(), cq_size, m_p_ibv_cq); } -void cq_mgr_tx::add_qp_tx(qp_mgr *qp) +void cq_mgr_tx::add_qp_tx(hw_queue_tx *hqtx_ptr) { // Assume locked! - cq_logdbg("qp_mgr=%p", qp); - m_qp = qp; + cq_logdbg("hqtx_ptr=%p", hqtx_ptr); + m_hqtx_ptr = hqtx_ptr; if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); } - cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, + cq_logfunc("hqtx_ptr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_hqtx_ptr, m_mlx5_cq.dbrec, m_mlx5_cq.cq_buf); } -void cq_mgr_tx::del_qp_tx(qp_mgr *qp) +void cq_mgr_tx::del_qp_tx(hw_queue_tx *hqtx_ptr) { BULLSEYE_EXCLUDE_BLOCK_START - if (m_qp != qp) { - cq_logdbg("wrong qp_mgr=%p != m_qp=%p", qp, m_qp); + if (m_hqtx_ptr != hqtx_ptr) { + cq_logdbg("wrong hqtx_ptr=%p != m_hqtx_ptr=%p", hqtx_ptr, m_hqtx_ptr); return; } BULLSEYE_EXCLUDE_BLOCK_END - cq_logdbg("qp_mgr=%p", m_qp); - m_qp = nullptr; + cq_logdbg("m_hqtx_ptr=%p", m_hqtx_ptr); + m_hqtx_ptr = nullptr; } int cq_mgr_tx::request_notification(uint64_t poll_sn) @@ -214,7 +214,7 @@ int cq_mgr_tx::request_notification(uint64_t poll_sn) // Arm the CQ notification channel IF_VERBS_FAILURE(xlio_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0)) { - cq_logerr("Failure arming the qp_mgr notification channel (errno=%d %m)", errno); + cq_logerr("Failure arming the TX notification channel (errno=%d %m)", errno); } else { @@ -269,7 +269,7 @@ int cq_mgr_tx::poll_and_process_element_tx(uint64_t *p_cq_poll_sn) xlio_mlx5_cqe *cqe = get_cqe_tx(num_polled_cqes); if (likely(cqe)) { - unsigned index = ntohs(cqe->wqe_counter) & (m_qp->m_tx_num_wr - 1); + unsigned index = ntohs(cqe->wqe_counter) & (m_hqtx_ptr->m_tx_num_wr - 1); // All error opcodes have the most significant bit set. if (unlikely(cqe->op_own & 0x80) && is_error_opcode(cqe->op_own >> 4)) { @@ -289,7 +289,7 @@ void cq_mgr_tx::log_cqe_error(struct xlio_mlx5_cqe *cqe) { struct mlx5_err_cqe *ecqe = (struct mlx5_err_cqe *)cqe; - /* TODO We can also ask qp_mgr to log WQE fields from SQ. But at first, we need to remove + /* TODO We can also ask hw_queue_tx to log WQE fields from SQ. But at first, we need to remove * prefetch and memset of the next WQE there. Credit system will guarantee that we don't * reuse the WQE at this point. 
*/ @@ -305,7 +305,7 @@ void cq_mgr_tx::log_cqe_error(struct xlio_mlx5_cqe *cqe) void cq_mgr_tx::handle_sq_wqe_prop(unsigned index) { - sq_wqe_prop *p = &m_qp->m_sq_wqe_idx_to_prop[index]; + sq_wqe_prop *p = &m_hqtx_ptr->m_sq_wqe_idx_to_prop[index]; sq_wqe_prop *prev; unsigned credits = 0; @@ -334,16 +334,16 @@ void cq_mgr_tx::handle_sq_wqe_prop(unsigned index) ti->put(); if (unlikely(ti->m_released && ti->m_ref == 0)) { - m_qp->ti_released(ti); + ti->ti_released(); } } credits += p->credits; prev = p; p = p->next; - } while (p != NULL && m_qp->is_sq_wqe_prop_valid(p, prev)); + } while (p != NULL && m_hqtx_ptr->is_sq_wqe_prop_valid(p, prev)); m_p_ring->return_tx_pool_to_global_pool(); - m_qp->credits_return(credits); - m_qp->m_sq_wqe_prop_last_signalled = index; + m_hqtx_ptr->credits_return(credits); + m_hqtx_ptr->m_sq_wqe_prop_last_signalled = index; } diff --git a/src/core/dev/cq_mgr_tx.h b/src/core/dev/cq_mgr_tx.h index 65d12b9d7..91c17bc52 100644 --- a/src/core/dev/cq_mgr_tx.h +++ b/src/core/dev/cq_mgr_tx.h @@ -35,7 +35,7 @@ #include "dev/ib_ctx_handler.h" -class qp_mgr; +class hw_queue_tx; class ring_simple; class cq_mgr_tx { @@ -53,8 +53,8 @@ class cq_mgr_tx { int get_channel_fd() { return m_comp_event_channel->fd; } void configure(int cq_size); - void add_qp_tx(qp_mgr *qp); - void del_qp_tx(qp_mgr *qp); + void add_qp_tx(hw_queue_tx *hqtx_ptr); + void del_qp_tx(hw_queue_tx *hqtx_ptr); uint32_t clean_cq(); @@ -91,7 +91,7 @@ class cq_mgr_tx { ring_simple *m_p_ring; ib_ctx_handler *m_p_ib_ctx_handler; ibv_comp_channel *m_comp_event_channel; - qp_mgr *m_qp = nullptr; + hw_queue_tx *m_hqtx_ptr = nullptr; struct ibv_cq *m_p_ibv_cq = nullptr; uint32_t m_cq_id_tx = 0U; uint32_t m_n_cq_poll_sn_tx = 0U; diff --git a/src/core/dev/hw_queue_rx.cpp b/src/core/dev/hw_queue_rx.cpp new file mode 100644 index 000000000..5d602dcd5 --- /dev/null +++ b/src/core/dev/hw_queue_rx.cpp @@ -0,0 +1,636 @@ +/* + * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "dev/hw_queue_rx.h" +#include "dev/buffer_pool.h" +#include "dev/ring_simple.h" +#include "dev/rfs_rule_dpcp.h" +#include "dev/cq_mgr_rx_regrq.h" +#include "dev/cq_mgr_rx_strq.h" + +#undef MODULE_NAME +#define MODULE_NAME "hw_queue_rx" + +#define hwqrx_logpanic __log_info_panic +#define hwqrx_logerr __log_info_err +#define hwqrx_logwarn __log_info_warn +#define hwqrx_loginfo __log_info_info +#define hwqrx_logdbg __log_info_dbg +#define hwqrx_logfunc __log_info_func +#define hwqrx_logfuncall __log_info_funcall + +#define ALIGN_WR_DOWN(_num_wr_) (std::max(32, ((_num_wr_) & ~(0xf)))) + +hw_queue_rx::hw_queue_rx(ring_simple *ring, ib_ctx_handler *ib_ctx, + ibv_comp_channel *rx_comp_event_channel, uint16_t vlan) + : m_p_ring(ring) + , m_p_ib_ctx_handler(ib_ctx) + , m_n_sysvar_rx_num_wr_to_post_recv(safe_mce_sys().rx_num_wr_to_post_recv) + , m_rx_num_wr(align32pow2(safe_mce_sys().rx_num_wr)) + , m_n_sysvar_rx_prefetch_bytes_before_poll(safe_mce_sys().rx_prefetch_bytes_before_poll) + , m_vlan(vlan) +{ + hwqrx_logfunc(""); + + if (!configure_rq(rx_comp_event_channel)) { + throw_xlio_exception("Failed to create RQ"); + } +} + +hw_queue_rx::~hw_queue_rx() +{ + hwqrx_logfunc(""); + + m_rq.reset(nullptr); // Must be destroyed before RX CQ. + + if (m_rq_wqe_idx_to_wrid) { + if (0 != munmap(m_rq_wqe_idx_to_wrid, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid))) { + hwqrx_logerr( + "Failed deallocating memory with munmap m_rq_wqe_idx_to_wrid (errno=%d %m)", errno); + } + m_rq_wqe_idx_to_wrid = nullptr; + } + + if (m_p_cq_mgr_rx) { + delete m_p_cq_mgr_rx; + m_p_cq_mgr_rx = nullptr; + } + + delete[] m_ibv_rx_sg_array; + delete[] m_ibv_rx_wr_array; + + hwqrx_logdbg("Rx buffer poll: %ld free global buffers available", + g_buffer_pool_rx_rwqe->get_free_count()); +} + +bool hw_queue_rx::configure_rq(ibv_comp_channel *rx_comp_event_channel) +{ + // Check device capabilities for max QP work requests + /*uint32_t max_qp_wr = ALIGN_WR_DOWN(m_p_ib_ctx_handler->get_ibv_device_attr()->max_qp_wr - 1); + if (m_rx_num_wr > max_qp_wr) { + hwqrx_logwarn("Allocating only %d Rx work requests while user " + "requested %s=%d for RX on <%p>", + max_qp_wr, SYS_VAR_RX_NUM_WRE, m_rx_num_wr, m_p_ib_ctx_handler); + m_rx_num_wr = max_qp_wr; + }*/ + + // Create associated cq_mgr_tx + BULLSEYE_EXCLUDE_BLOCK_START + m_p_cq_mgr_rx = init_rx_cq_mgr(rx_comp_event_channel); + if (!m_p_cq_mgr_rx) { + hwqrx_logerr("Failed allocating m_p_cq_mgr_rx (errno=%d %m)", errno); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + // Modify the cq_mgr_rx to use a non-blocking event channel + set_fd_block_mode(m_p_cq_mgr_rx->get_channel_fd(), false); + + m_curr_rx_wr = 0; + + xlio_ib_mlx5_cq_t mlx5_cq; + memset(&mlx5_cq, 0, sizeof(mlx5_cq)); + xlio_ib_mlx5_get_cq(m_p_cq_mgr_rx->get_ibv_cq_hndl(), &mlx5_cq); + + hwqrx_logdbg( + "Creating RQ of transport type '%s' on ibv device '%s' [%p], cq: %p(%u), wre: %d, sge: %d", + priv_xlio_transport_type_str(m_p_ring->get_transport_type()), + m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), m_p_cq_mgr_rx, + mlx5_cq.cq_num, m_rx_num_wr, m_rx_sge); + + if (safe_mce_sys().enable_striding_rq) { + m_rx_sge = 2U; // Striding-RQ needs a reserved segment. 
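// With a striding RQ each receive WQE carries a reserved leading segment, so the data
// scatter entry for WR i lives at flat index (i * m_rx_sge + m_strq_wqe_reserved_seg) in
// m_ibv_rx_sg_array; post_recv_buffer() further below relies on exactly that layout.
// A minimal sketch of filling that entry (illustrative local names only):
//
//     ibv_sge *sge = &sg_array[wr_idx * rx_sge + reserved_seg]; // rx_sge = 2, reserved_seg = 1
//     sge->addr = (uintptr_t)desc->p_buffer;
//     sge->length = desc->sz_buffer;
//     sge->lkey = desc->lkey;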
+ m_strq_wqe_reserved_seg = 1U; + } + + m_ibv_rx_wr_array = new ibv_recv_wr[m_n_sysvar_rx_num_wr_to_post_recv]; + m_ibv_rx_sg_array = new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv * m_rx_sge]; + + for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) { + m_ibv_rx_wr_array[wr_idx].sg_list = &m_ibv_rx_sg_array[wr_idx * m_rx_sge]; + m_ibv_rx_wr_array[wr_idx].num_sge = m_rx_sge; + m_ibv_rx_wr_array[wr_idx].next = &m_ibv_rx_wr_array[wr_idx + 1]; + } + + m_ibv_rx_wr_array[m_n_sysvar_rx_num_wr_to_post_recv - 1].next = nullptr; + + if (safe_mce_sys().enable_striding_rq) { + for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) { + memset(m_ibv_rx_wr_array[wr_idx].sg_list, 0, sizeof(ibv_sge)); + // To bypass a check inside xlio_ib_mlx5_post_recv. + m_ibv_rx_wr_array[wr_idx].sg_list[0].length = 1U; + } + } + + // Create the QP + if (!prepare_rq(mlx5_cq.cq_num)) { + return false; + } + + return true; +} + +void hw_queue_rx::up() +{ + m_tir.reset(create_tir()); + if (!m_tir) { + hwqrx_logpanic("TIR creation for hw_queue_rx failed (errno=%d %m)", errno); + } + + release_rx_buffers(); // We might have old flushed cqe's in our CQ still from previous HA event + + modify_queue_to_ready_state(); + + m_p_cq_mgr_rx->add_hqrx(this); +} + +void hw_queue_rx::down() +{ + m_tir.reset(nullptr); + + modify_queue_to_error_state(); + + // let the QP drain all wqe's to flushed cqe's now that we moved + // it to error state and post_sent final trigger for completion + usleep(1000); + + release_rx_buffers(); + m_p_cq_mgr_rx->del_hqrx(this); +} + +void hw_queue_rx::release_rx_buffers() +{ + int total_ret = m_curr_rx_wr; + if (m_curr_rx_wr) { + hwqrx_logdbg("Returning %d pending post_recv buffers to CQ owner", m_curr_rx_wr); + while (m_curr_rx_wr) { + // Cleaning unposted buffers. Unposted buffers are not attached to any strides. 
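// ("Unposted" means buffers staged in m_ibv_rx_wr_array that never reached the batch
//  threshold: post_recv_buffer_rq() below only hands the chain to the RQ once
//  m_n_sysvar_rx_num_wr_to_post_recv WRs have accumulated, as in this condensed sketch:
//
//      if (m_curr_rx_wr == m_n_sysvar_rx_num_wr_to_post_recv - 1) {
//          m_curr_rx_wr = 0;
//          xlio_raw_post_recv(&bad_wr); // ring the RQ with the whole chain
//      } else {
//          m_curr_rx_wr++;              // keep staging
//      }
//
//  so anything still counted by m_curr_rx_wr here was never seen by the HW.)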
+            --m_curr_rx_wr;
+            mem_buf_desc_t *p_mem_buf_desc =
+                (mem_buf_desc_t *)(uintptr_t)m_ibv_rx_wr_array[m_curr_rx_wr].wr_id;
+            if (p_mem_buf_desc && p_mem_buf_desc->p_desc_owner) {
+                m_p_ring->mem_buf_desc_return_to_owner_rx(p_mem_buf_desc);
+            } else {
+                g_buffer_pool_rx_rwqe->put_buffers_thread_safe(p_mem_buf_desc);
+            }
+        }
+    }
+    // Wait for all FLUSHed WQE on Rx CQ
+    hwqrx_logdbg("draining cq_mgr_rx %p (last_posted_rx_wr_id = %lu)", m_p_cq_mgr_rx,
+                 m_last_posted_rx_wr_id);
+    uintptr_t last_polled_rx_wr_id = 0;
+    while (m_p_cq_mgr_rx && last_polled_rx_wr_id != m_last_posted_rx_wr_id && errno != EIO &&
+           !is_rq_empty() && !m_p_ib_ctx_handler->is_removed()) {
+
+        // Process the FLUSH'ed WQE's
+        int ret = m_p_cq_mgr_rx->drain_and_proccess(&last_polled_rx_wr_id);
+        hwqrx_logdbg("draining completed on cq_mgr_rx (%d wce) last_polled_rx_wr_id = %lu", ret,
+                     last_polled_rx_wr_id);
+
+        total_ret += ret;
+
+        if (!ret) {
+            // Query context for ib_verbs events (especially for IBV_EVENT_DEVICE_FATAL)
+            g_p_event_handler_manager->query_for_ibverbs_event(
+                m_p_ib_ctx_handler->get_ibv_context()->async_fd);
+        }
+
+        // Add short delay (500 usec) to allow for WQE's to be flushed to CQ every poll cycle
+        const struct timespec short_sleep = {0, 500000}; // 500 usec
+        nanosleep(&short_sleep, NULL);
+    }
+    m_last_posted_rx_wr_id = 0; // Clear the posted WR_ID flag, we just clear the entire RQ
+    hwqrx_logdbg("draining completed with a total of %d wce's on cq_mgr_rx", total_ret);
+    NOT_IN_USE(total_ret); // Suppress --enable-opt-log=high warning
+}
+
+void hw_queue_rx::post_recv_buffers(descq_t *p_buffers, size_t count)
+{
+    hwqrx_logfuncall("");
+    // Called from cq_mgr_rx context under cq_mgr_rx::LOCK!
+    while (count--) {
+        post_recv_buffer(p_buffers->get_and_pop_front());
+    }
+}
+
+void hw_queue_rx::modify_queue_to_ready_state()
+{
+    hwqrx_logdbg("");
+    dpcp::status rc = m_rq->modify_state(dpcp::RQ_RDY);
+    if (dpcp::DPCP_OK != rc) {
+        hwqrx_logerr("Failed to modify rq state to RDY, rc: %d, rqn: %" PRIu32,
+                     static_cast<int>(rc), m_rq_data.rqn);
+    }
+}
+
+void hw_queue_rx::modify_queue_to_error_state()
+{
+    hwqrx_logdbg("");
+
+    m_p_cq_mgr_rx->clean_cq();
+
+    dpcp::status rc = m_rq->modify_state(dpcp::RQ_ERR);
+
+    /* During plugout there is a possibility that the kernel
+     * removes device resources before the working process completes
+     * the removal. As a result ibv api functions can
+     * return the EIO=5 errno code.
+     */
+    if (dpcp::DPCP_OK != rc && errno != EIO) {
+        hwqrx_logerr("Failed to modify rq state to ERR, rc: %d, rqn: %" PRIu32,
+                     static_cast<int>(rc), m_rq_data.rqn);
+    }
+}
+
+rfs_rule *hw_queue_rx::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext)
+{
+    if (m_p_ib_ctx_handler && m_p_ib_ctx_handler->get_dpcp_adapter()) {
+        // TLS RX uses tir_ext.
+        dpcp::tir *dpcp_tir = (tir_ext ? 
xlio_tir_to_dpcp_tir(tir_ext) : m_tir.get()); + + std::unique_ptr new_rule(new rfs_rule_dpcp()); + if (dpcp_tir && + new_rule->create(attrs, *dpcp_tir, *m_p_ib_ctx_handler->get_dpcp_adapter())) { + return new_rule.release(); + } + } + + return nullptr; +} + +void hw_queue_rx::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) +{ + uint32_t index = (m_curr_rx_wr * m_rx_sge) + m_strq_wqe_reserved_seg; + m_ibv_rx_sg_array[index].addr = (uintptr_t)p_mem_buf_desc->p_buffer; + m_ibv_rx_sg_array[index].length = p_mem_buf_desc->sz_buffer; + m_ibv_rx_sg_array[index].lkey = p_mem_buf_desc->lkey; + + post_recv_buffer_rq(p_mem_buf_desc); +} + +void hw_queue_rx::post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc) +{ + if (m_n_sysvar_rx_prefetch_bytes_before_poll) { + if (m_p_prev_rx_desc_pushed) { + m_p_prev_rx_desc_pushed->p_prev_desc = p_mem_buf_desc; + } + m_p_prev_rx_desc_pushed = p_mem_buf_desc; + } + + m_ibv_rx_wr_array[m_curr_rx_wr].wr_id = (uintptr_t)p_mem_buf_desc; + + if (m_rq_wqe_idx_to_wrid) { + uint32_t index = m_rq_wqe_counter & (m_rx_num_wr - 1); + m_rq_wqe_idx_to_wrid[index] = (uintptr_t)p_mem_buf_desc; + ++m_rq_wqe_counter; + } + + if (m_curr_rx_wr == m_n_sysvar_rx_num_wr_to_post_recv - 1) { + + m_last_posted_rx_wr_id = (uintptr_t)p_mem_buf_desc; + + m_p_prev_rx_desc_pushed = NULL; + p_mem_buf_desc->p_prev_desc = NULL; + + m_curr_rx_wr = 0; + struct ibv_recv_wr *bad_wr = nullptr; + IF_VERBS_FAILURE(xlio_raw_post_recv(&bad_wr)) + { + uint32_t n_pos_bad_rx_wr = + ((uint8_t *)bad_wr - (uint8_t *)m_ibv_rx_wr_array) / sizeof(struct ibv_recv_wr); + hwqrx_logerr("failed posting list (errno=%d %s)", errno, strerror(errno)); + hwqrx_logerr( + "bad_wr is %d in submitted list (bad_wr=%p, m_ibv_rx_wr_array=%p, size=%zu)", + n_pos_bad_rx_wr, bad_wr, m_ibv_rx_wr_array, sizeof(struct ibv_recv_wr)); + hwqrx_logerr("bad_wr info: wr_id=%#lx, next=%p, addr=%#lx, length=%d, lkey=%#x", + bad_wr[0].wr_id, bad_wr[0].next, bad_wr[0].sg_list[0].addr, + bad_wr[0].sg_list[0].length, bad_wr[0].sg_list[0].lkey); + + // Fix broken linked list of rx_wr + if (n_pos_bad_rx_wr != (m_n_sysvar_rx_num_wr_to_post_recv - 1)) { + m_ibv_rx_wr_array[n_pos_bad_rx_wr].next = &m_ibv_rx_wr_array[n_pos_bad_rx_wr + 1]; + } + throw; + } + ENDIF_VERBS_FAILURE; + hwqrx_logfunc("Successful ibv_post_recv"); + } else { + m_curr_rx_wr++; + } +} + +int hw_queue_rx::xlio_raw_post_recv(struct ibv_recv_wr **bad_wr) +{ + struct mlx5_wqe_data_seg *scat; + int err = 0; + int nreq = 0; + int i, j; + int ind = m_rq_data.head & (m_rq_data.wqe_cnt - 1); + + struct ibv_recv_wr *wr = m_ibv_rx_wr_array; + for (; wr; ++nreq, wr = wr->next) { + if (unlikely((int)m_rq_data.head - (int)m_rq_data.tail + nreq >= (int)m_rx_num_wr)) { + errno = ENOMEM; + err = -errno; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > (int)m_rx_sge)) { + errno = EINVAL; + err = -errno; + *bad_wr = wr; + goto out; + } + + scat = + (struct mlx5_wqe_data_seg *)((uint8_t *)m_rq_data.buf + (ind << m_rq_data.wqe_shift)); + + for (i = 0, j = 0; i < wr->num_sge; ++i) { + if (unlikely(!wr->sg_list[i].length)) { + continue; + } + + scat[j].byte_count = htonl(wr->sg_list[i].length); + scat[j].lkey = htonl(wr->sg_list[i].lkey); + scat[j].addr = htonll(wr->sg_list[i].addr); + j++; + } + + if (j < (int)m_rx_sge) { + scat[j].byte_count = 0; + scat[j].lkey = htonl(MLX5_INVALID_LKEY); + scat[j].addr = 0; + } + + ind = (ind + 1) & (m_rq_data.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + m_rq_data.head += nreq; + + wmb(); // Make sure that descriptors are written before doorbell 
record. + + // Buffers are posted only after the RQ is in ready state. OK to update doorbell. + *m_rq_data.dbrec = htonl(m_rq_data.head & 0xffff); + } + + return err; +} + +bool hw_queue_rx::init_rx_cq_mgr_prepare() +{ + m_rq_wqe_idx_to_wrid = + (uint64_t *)mmap(NULL, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid), PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (m_rq_wqe_idx_to_wrid == MAP_FAILED) { + hwqrx_logerr("Failed allocating m_rq_wqe_idx_to_wrid (errno=%d %m)", errno); + return false; + } + + return true; +} + +cq_mgr_rx *hw_queue_rx::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) +{ + if (!init_rx_cq_mgr_prepare()) { + return nullptr; + } + + if (safe_mce_sys().enable_striding_rq) { + return new cq_mgr_rx_strq(m_p_ring, m_p_ib_ctx_handler, + safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, + safe_mce_sys().strq_stride_size_bytes, + safe_mce_sys().strq_stride_num_per_rwqe, p_rx_comp_event_channel); + } + + return new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, p_rx_comp_event_channel); +} + +#if defined(DEFINED_UTLS) +xlio_tir *hw_queue_rx::tls_create_tir(bool cached) +{ + xlio_tir *tir = NULL; + + if (cached && !m_tls_tir_cache.empty()) { + tir = m_tls_tir_cache.back(); + m_tls_tir_cache.pop_back(); + } else if (!cached) { + dpcp::tir *new_tir = create_tir(true); + + if (new_tir != NULL) { + tir = new xlio_tir(this, new_tir, xlio_ti::ti_type::TLS_TIR); + } + if (unlikely(tir == NULL && new_tir != NULL)) { + delete new_tir; + } + } + return tir; +} + +void hw_queue_rx::tls_release_tir(xlio_tir *tir) +{ + /* TODO We don't have to lock ring to destroy DEK object (a garbage collector?). */ + + assert(tir != nullptr && tir->m_type == xlio_ti::ti_type::TLS_TIR); + tir->m_released = true; + tir->assign_callback(NULL, NULL); + if (tir->m_ref == 0) { + put_tls_tir_in_cache(tir); + } +} + +void hw_queue_rx::put_tls_tir_in_cache(xlio_tir *tir) +{ + // Because the absense of TIR flush command, reusing a TIR + // may result in undefined behaviour. + // Until a flush command is available the TIR cache is disabled. + // Re-enabling TIR cache should also add destroy_tir_cache on ring cleanup. 
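// A minimal sketch of the destroy_tir_cache() step mentioned above, assuming the cache
// were re-enabled (hypothetical helper, not defined in this patch):
//
//     void hw_queue_rx::destroy_tir_cache()
//     {
//         while (!m_tls_tir_cache.empty()) {
//             delete m_tls_tir_cache.back(); // xlio_tir releases its dpcp::tir
//             m_tls_tir_cache.pop_back();
//         }
//     }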
+ // m_tls_tir_cache.push_back(tir); + + delete tir; +} + +void hw_queue_rx::ti_released(xlio_ti *ti) +{ + assert(ti->m_released); + assert(ti->m_ref == 0); + if (ti->m_type == xlio_ti::ti_type::TLS_TIR) { + put_tls_tir_in_cache(static_cast(ti)); + } +} +#else /* DEFINED_UTLS */ +void hw_queue_rx::ti_released(xlio_ti *) {}; +#endif /* defined(DEFINED_UTLS) */ + +dpcp::tir *hw_queue_rx::create_tir(bool is_tls /*=false*/) +{ + dpcp::tir *tir_obj = nullptr; + dpcp::status status = dpcp::DPCP_OK; + dpcp::tir::attr tir_attr; + + memset(&tir_attr, 0, sizeof(tir_attr)); + tir_attr.flags = dpcp::TIR_ATTR_INLINE_RQN | dpcp::TIR_ATTR_TRANSPORT_DOMAIN; + tir_attr.inline_rqn = m_rq_data.rqn; + tir_attr.transport_domain = m_p_ib_ctx_handler->get_dpcp_adapter()->get_td(); + + if (m_p_ring->m_lro.cap && m_p_ring->m_lro.max_payload_sz) { + tir_attr.flags |= dpcp::TIR_ATTR_LRO; + tir_attr.lro.timeout_period_usecs = XLIO_MLX5_PARAMS_LRO_TIMEOUT; + tir_attr.lro.enable_mask = 3; // Bitmask for IPv4 and IPv6 support + tir_attr.lro.max_msg_sz = m_p_ring->m_lro.max_payload_sz >> 8; + } + + if (is_tls) { + tir_attr.flags |= dpcp::TIR_ATTR_TLS; + tir_attr.tls_en = 1; + } + + status = m_p_ib_ctx_handler->get_dpcp_adapter()->create_tir(tir_attr, tir_obj); + + if (dpcp::DPCP_OK != status) { + hwqrx_logerr("Failed creating dpcp tir with flags=0x%x status=%d", tir_attr.flags, status); + return nullptr; + } + + hwqrx_logdbg("TIR: %p created", tir_obj); + + return tir_obj; +} + +bool hw_queue_rx::prepare_rq(uint32_t cqn) +{ + hwqrx_logdbg(""); + + dpcp::adapter *dpcp_adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); + if (!dpcp_adapter) { + hwqrx_logerr("Failed to get dpcp::adapter for prepare_rq"); + return false; + } + + // user_index Unused. + dpcp::rq_attr rqattrs; + memset(&rqattrs, 0, sizeof(rqattrs)); + rqattrs.cqn = cqn; + rqattrs.wqe_num = m_rx_num_wr; + rqattrs.wqe_sz = m_rx_sge; + + if (safe_mce_sys().hw_ts_conversion_mode == TS_CONVERSION_MODE_RTC) { + hwqrx_logdbg("Enabled RTC timestamp format for RQ"); + rqattrs.ts_format = dpcp::rq_ts_format::RQ_TS_REAL_TIME; + } + + std::unique_ptr new_rq; + dpcp::status rc = dpcp::DPCP_OK; + + if (safe_mce_sys().enable_striding_rq) { + rqattrs.buf_stride_sz = safe_mce_sys().strq_stride_size_bytes; + rqattrs.buf_stride_num = safe_mce_sys().strq_stride_num_per_rwqe; + + // Striding-RQ WQE format is as of Shared-RQ (PRM, page 381, wq_type). + // In this case the WQE minimum size is 2 * 16, and the first segment is reserved. + rqattrs.wqe_sz = m_rx_sge * 16U; + + dpcp::striding_rq *new_rq_ptr = nullptr; + rc = dpcp_adapter->create_striding_rq(rqattrs, new_rq_ptr); + new_rq.reset(new_rq_ptr); + } else { + dpcp::regular_rq *new_rq_ptr = nullptr; + rc = dpcp_adapter->create_regular_rq(rqattrs, new_rq_ptr); + new_rq.reset(new_rq_ptr); + } + + if (dpcp::DPCP_OK != rc) { + hwqrx_logerr("Failed to create dpcp rq, rc: %d, cqn: %" PRIu32, static_cast(rc), cqn); + return false; + } + + if (!store_rq_mlx5_params(*new_rq)) { + hwqrx_logerr( + "Failed to retrieve initial DPCP RQ parameters, rc: %d, basic_rq: %p, cqn: %" PRIu32, + static_cast(rc), new_rq.get(), cqn); + return false; + } + + m_rq = std::move(new_rq); + + // At this stage there is no TIR associated with the RQ, So it mimics QP INIT state. + // At RDY state without a TIR, Work Requests can be submitted to the RQ. 
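// The resulting lifecycle, as driven by up()/down() above (sketch only):
//
//     hw_queue_rx hwq(ring, ib_ctx, rx_channel, vlan); // ctor: create the RQ, move it to RDY
//     hwq.up();   // create the TIR and let cq_mgr_rx::add_hqrx() post the initial buffers
//     ...         // traffic arrives once an rfs_rule steers a flow to the TIR
//     hwq.down(); // drop the TIR, move the RQ to ERR and drain the flushed CQEs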
+ modify_queue_to_ready_state(); + + hwqrx_logdbg("Succeeded to create dpcp rq, rqn: %" PRIu32 ", cqn: %" PRIu32, m_rq_data.rqn, + cqn); + + return true; +} + +bool hw_queue_rx::store_rq_mlx5_params(dpcp::basic_rq &new_rq) +{ + uint32_t *dbrec_tmp = nullptr; + dpcp::status rc = new_rq.get_dbrec(dbrec_tmp); + if (dpcp::DPCP_OK != rc) { + hwqrx_logerr("Failed to retrieve dbrec of dpcp rq, rc: %d, basic_rq: %p", + static_cast(rc), &new_rq); + return false; + } + m_rq_data.dbrec = dbrec_tmp; + + rc = new_rq.get_wq_buf(m_rq_data.buf); + if (dpcp::DPCP_OK != rc) { + hwqrx_logerr("Failed to retrieve wq-buf of dpcp rq, rc: %d, basic_rq: %p", + static_cast(rc), &new_rq); + return false; + } + + rc = new_rq.get_id(m_rq_data.rqn); + if (dpcp::DPCP_OK != rc) { + hwqrx_logerr("Failed to retrieve rqn of dpcp rq, rc: %d, basic_rq: %p", + static_cast(rc), &new_rq); + return false; + } + + new_rq.get_wqe_num(m_rq_data.wqe_cnt); + new_rq.get_wq_stride_sz(m_rq_data.stride); + if (safe_mce_sys().enable_striding_rq) { + m_rq_data.stride /= 16U; + } + + m_rq_data.wqe_shift = ilog_2(m_rq_data.stride); + m_rq_data.head = 0; + m_rq_data.tail = 0; + + return true; +} \ No newline at end of file diff --git a/src/core/dev/hw_queue_rx.h b/src/core/dev/hw_queue_rx.h new file mode 100644 index 000000000..eec7f7abc --- /dev/null +++ b/src/core/dev/hw_queue_rx.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef HW_QUEUE_RX_H +#define HW_QUEUE_RX_H + +#include +#include "dev/xlio_ti.h" +#include "dev/ib_ctx_handler.h" +#include "dev/rfs_rule.h" +#include "dev/cq_mgr_rx.h" +#include "proto/mem_buf_desc.h" +#include "util/sg_array.h" + +class ring_simple; + +// @class hw_queue_rx +// Object to manages the SQ operations. This object is used for Rx. +// Once created it requests from the system a CQ to work with. 
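// (In this split the object owns the DPCP receive queue: the m_rq_data block below caches
//  the RQ's doorbell record, WQ buffer, WQE count/stride and head/tail indices, filled by
//  store_rq_mlx5_params(), so the data path can post receives without extra DPCP calls.)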
+class hw_queue_rx : public xlio_ti_owner { + friend class cq_mgr_rx; + friend class cq_mgr_rx_regrq; + friend class cq_mgr_rx_strq; + +public: + hw_queue_rx(ring_simple *ring, ib_ctx_handler *ib_ctx, ibv_comp_channel *rx_comp_event_channel, + uint16_t vlan); + virtual ~hw_queue_rx(); + + virtual void ti_released(xlio_ti *ti) override; + + void up(); + void down(); + + // Post for receive single mem_buf_desc + void post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc); + + // Post for receive a list of mem_buf_desc + void post_recv_buffers(descq_t *p_buffers, size_t count); + + cq_mgr_rx *get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } + uint32_t get_rx_max_wr_num() const { return m_rx_num_wr; } + uint16_t get_vlan() const { return m_vlan; }; + void modify_queue_to_ready_state(); + void modify_queue_to_error_state(); + void release_rx_buffers(); + + rfs_rule *create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext); + +#ifdef DEFINED_UTLS + xlio_tir *tls_create_tir(bool cached); + void tls_release_tir(xlio_tir *tir); +#endif /* DEFINED_UTLS */ + +private: + cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel); + + bool init_rx_cq_mgr_prepare(); + void post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc); + void put_tls_tir_in_cache(xlio_tir *tir); + bool prepare_rq(uint32_t cqn); + bool configure_rq(ibv_comp_channel *rx_comp_event_channel); + bool store_rq_mlx5_params(dpcp::basic_rq &new_rq); + int xlio_raw_post_recv(struct ibv_recv_wr **bad_wr); + bool is_rq_empty() const { return (m_rq_data.head == m_rq_data.tail); } + + dpcp::tir *create_tir(bool is_tls = false); + dpcp::tir *xlio_tir_to_dpcp_tir(xlio_tir *tir) { return tir->m_p_tir.get(); } + + struct { + volatile uint32_t *dbrec; + void *buf; + uint32_t wqe_cnt; + uint32_t stride; + uint32_t wqe_shift; + uint32_t rqn; + unsigned head; + unsigned tail; + } m_rq_data; + + std::vector m_tls_tir_cache; + std::unique_ptr m_tir = {nullptr}; + std::unique_ptr m_rq = {nullptr}; + ring_simple *m_p_ring; + cq_mgr_rx *m_p_cq_mgr_rx = nullptr; + ib_ctx_handler *m_p_ib_ctx_handler; + ibv_sge *m_ibv_rx_sg_array; + ibv_recv_wr *m_ibv_rx_wr_array; + uintptr_t m_last_posted_rx_wr_id = 0U; // Remember so in case we flush RQ we know to wait until + // this WR_ID is received + mem_buf_desc_t *m_p_prev_rx_desc_pushed = nullptr; + uint64_t *m_rq_wqe_idx_to_wrid = nullptr; + uint64_t m_rq_wqe_counter = 0U; + uint32_t m_curr_rx_wr = 0U; + uint32_t m_strq_wqe_reserved_seg = 0U; + uint32_t m_n_sysvar_rx_num_wr_to_post_recv; + uint32_t m_rx_num_wr; + uint32_t m_rx_sge = MCE_DEFAULT_RX_NUM_SGE; + const uint32_t m_n_sysvar_rx_prefetch_bytes_before_poll; + uint16_t m_vlan; +}; + +#endif // HW_QUEUE_RX_H diff --git a/src/core/dev/qp_mgr.cpp b/src/core/dev/hw_queue_tx.cpp similarity index 62% rename from src/core/dev/qp_mgr.cpp rename to src/core/dev/hw_queue_tx.cpp index e500b9354..24368d738 100644 --- a/src/core/dev/qp_mgr.cpp +++ b/src/core/dev/hw_queue_tx.cpp @@ -30,41 +30,29 @@ * SOFTWARE. 
*/ -#include "qp_mgr.h" -#include -#include "utils/bullseye.h" -#include "util/utils.h" -#include "util/valgrind.h" -#include "util/instrumentation.h" -#include "iomux/io_mux_call.h" -#include "buffer_pool.h" -#include "ring_simple.h" -#include "cq_mgr_rx_regrq.h" -#include "cq_mgr_rx_strq.h" +#include +#include +#include +#include "dev/hw_queue_tx.h" +#include "dev/ring_simple.h" +#include "dev/cq_mgr_rx_regrq.h" #include "proto/tls.h" -#include "rfs_rule_dpcp.h" +#include "util/valgrind.h" #undef MODULE_NAME -#define MODULE_NAME "qp_mgr" +#define MODULE_NAME "hw_queue_tx" -#define qp_logpanic __log_info_panic -#define qp_logerr __log_info_err -#define qp_logwarn __log_info_warn -#define qp_loginfo __log_info_info -#define qp_logdbg __log_info_dbg -#define qp_logfunc __log_info_func -#define qp_logfuncall __log_info_funcall +#define hwqtx_logpanic __log_info_panic +#define hwqtx_logerr __log_info_err +#define hwqtx_logwarn __log_info_warn +#define hwqtx_loginfo __log_info_info +#define hwqtx_logdbg __log_info_dbg +#define hwqtx_logfunc __log_info_func +#define hwqtx_logfuncall __log_info_funcall //#define ALIGN_WR_UP(_num_wr_) (max(32, ((_num_wr_ + 0xf) & ~(0xf)))) #define ALIGN_WR_DOWN(_num_wr_) (max(32, ((_num_wr_) & ~(0xf)))) -#define FICTIVE_REMOTE_QPN 0x48 -#define FICTIVE_REMOTE_QKEY 0x01234567 -#define FICTIVE_AH_SL 5 -#define FICTIVE_AH_DLID 0x3 - -#define MAX_UPSTREAM_CQ_MSHV_SIZE 8192 - #if !defined(MLX5_ETH_INLINE_HEADER_SIZE) #define MLX5_ETH_INLINE_HEADER_SIZE 18 #endif @@ -78,7 +66,7 @@ #define dbg_dump_wqe(_addr, _size) \ { \ uint32_t *_wqe = _addr; \ - qp_logfunc("Dumping %d bytes from %p", _size, _wqe); \ + hwqtx_logfunc("Dumping %d bytes from %p", _size, _wqe); \ for (int i = 0; i < (int)_size / 4; i += 4) { \ qp_logfunc("%08x %08x %08x %08x", ntohl(_wqe[i + 0]), ntohl(_wqe[i + 1]), \ ntohl(_wqe[i + 2]), ntohl(_wqe[i + 3])); \ @@ -134,18 +122,14 @@ static inline uint32_t get_mlx5_opcode(xlio_ibv_wr_opcode verbs_opcode) } } -qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan) - : m_p_ring((ring_simple *)desc->ring) - , m_port_num((uint8_t)desc->slave->port_num) - , m_p_ib_ctx_handler((ib_ctx_handler *)desc->slave->p_ib_ctx) - , m_rx_num_wr(safe_mce_sys().rx_num_wr) - , m_tx_num_wr(tx_num_wr) - , m_n_sysvar_rx_num_wr_to_post_recv(safe_mce_sys().rx_num_wr_to_post_recv) +hw_queue_tx::hw_queue_tx(ring_simple *ring, const slave_data_t *slave, const uint32_t tx_num_wr) + : m_p_ring(ring) + , m_p_ib_ctx_handler(slave->p_ib_ctx) , m_n_sysvar_tx_num_wr_to_signal(safe_mce_sys().tx_num_wr_to_signal) - , m_n_sysvar_rx_prefetch_bytes_before_poll(safe_mce_sys().rx_prefetch_bytes_before_poll) - , m_vlan(vlan) + , m_tx_num_wr(tx_num_wr) + , m_port_num(slave->port_num) { - qp_logfunc(""); + hwqtx_logfunc(""); memset(&m_mlx5_qp, 0, sizeof(m_mlx5_qp)); @@ -153,10 +137,6 @@ qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan m_mlx5_qp.cap.max_send_sge = (m_p_ring->is_tso() ? m_p_ib_ctx_handler->get_ibv_device_attr()->max_sge : MCE_DEFAULT_TX_NUM_SGE); - m_mlx5_qp.cap.max_recv_sge = (m_p_ring->is_socketxtreme()) ? 
1 : MCE_DEFAULT_RX_NUM_SGE; - - m_ibv_rx_sg_array = new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv]; - m_ibv_rx_wr_array = new ibv_recv_wr[m_n_sysvar_rx_num_wr_to_post_recv]; memset(&m_rate_limit, 0, sizeof(struct xlio_rate_limit_t)); @@ -164,47 +144,34 @@ qp_mgr::qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan m_hw_dummy_send_support = xlio_is_nop_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); m_db_method = - (is_bf((desc->slave->p_ib_ctx)->get_ibv_context()) ? MLX5_DB_METHOD_BF : MLX5_DB_METHOD_DB); + (is_bf((slave->p_ib_ctx)->get_ibv_context()) ? MLX5_DB_METHOD_BF : MLX5_DB_METHOD_DB); - qp_logdbg("m_db_method=%d", m_db_method); + hwqtx_logdbg("m_db_method=%d", m_db_method); - if (configure(desc)) { - throw_xlio_exception("Failed creating qp_mgr"); - } - - if (!configure_rq_dpcp()) { - throw_xlio_exception("Failed to create qp_mgr"); + if (configure(slave)) { + throw_xlio_exception("Failed to configure"); } } -qp_mgr::~qp_mgr() +hw_queue_tx::~hw_queue_tx() { - qp_logfunc(""); - - _rq.reset(nullptr); // Must be destroyed before RX CQ. + hwqtx_logfunc(""); - if (m_rq_wqe_idx_to_wrid) { - if (0 != munmap(m_rq_wqe_idx_to_wrid, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid))) { - qp_logerr("Failed deallocating memory with munmap m_rq_wqe_idx_to_wrid (errno=%d %m)", - errno); - } - m_rq_wqe_idx_to_wrid = nullptr; - } if (m_sq_wqe_idx_to_prop) { if (0 != munmap(m_sq_wqe_idx_to_prop, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop))) { - qp_logerr("Failed deallocating memory with munmap m_sq_wqe_idx_to_prop (errno=%d %m)", - errno); + hwqtx_logerr( + "Failed deallocating memory with munmap m_sq_wqe_idx_to_prop (errno=%d %m)", errno); } m_sq_wqe_idx_to_prop = nullptr; } destroy_tis_cache(); - qp_logdbg("calling ibv_destroy_qp(qp=%p)", m_mlx5_qp.qp); + hwqtx_logdbg("calling ibv_destroy_qp(qp=%p)", m_mlx5_qp.qp); if (m_mlx5_qp.qp) { IF_VERBS_FAILURE_EX(ibv_destroy_qp(m_mlx5_qp.qp), EIO) { - qp_logdbg("QP destroy failure (errno = %d %m)", -errno); + hwqtx_logdbg("QP destroy failure (errno = %d %m)", -errno); } ENDIF_VERBS_FAILURE; VALGRIND_MAKE_MEM_UNDEFINED(m_mlx5_qp.qp, sizeof(ibv_qp)); @@ -215,55 +182,40 @@ qp_mgr::~qp_mgr() delete m_p_cq_mgr_tx; m_p_cq_mgr_tx = nullptr; } - if (m_p_cq_mgr_rx) { - delete m_p_cq_mgr_rx; - m_p_cq_mgr_rx = nullptr; - } - delete[] m_ibv_rx_sg_array; - delete[] m_ibv_rx_wr_array; + if (m_p_cq_mgr_rx_unused) { + delete m_p_cq_mgr_rx_unused; + m_p_cq_mgr_rx_unused = nullptr; + } - qp_logdbg("Rx buffer poll: %ld free global buffers available", - g_buffer_pool_rx_rwqe->get_free_count()); - qp_logdbg("delete done"); + hwqtx_logdbg("Destructor hw_queue_tx end"); } -int qp_mgr::configure(struct qp_mgr_desc *desc) +int hw_queue_tx::configure(const slave_data_t *slave) { - qp_logdbg("Creating QP of transport type '%s' on ibv device '%s' [%p] on port %d", - priv_xlio_transport_type_str(m_p_ring->get_transport_type()), - m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), m_port_num); + hwqtx_logdbg("Creating QP of transport type '%s' on ibv device '%s' [%p] on port %d", + priv_xlio_transport_type_str(m_p_ring->get_transport_type()), + m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), + m_port_num); + hwqtx_logdbg("HW Dummy send support for QP = %d", m_hw_dummy_send_support); - // Check device capabilities for max QP work requests - m_max_qp_wr = ALIGN_WR_DOWN(m_p_ib_ctx_handler->get_ibv_device_attr()->max_qp_wr - 1); - if (m_rx_num_wr > m_max_qp_wr) { - qp_logwarn("Allocating only %d Rx QP work requests while 
user " - "requested %s=%d for QP on <%p, %d>", - m_max_qp_wr, SYS_VAR_RX_NUM_WRE, m_rx_num_wr, m_p_ib_ctx_handler, m_port_num); - m_rx_num_wr = m_max_qp_wr; - } - - qp_logdbg("HW Dummy send support for QP = %d", m_hw_dummy_send_support); - - // Create associated Tx & Rx cq_mgrs - m_p_cq_mgr_tx = init_tx_cq_mgr(); + // Create associated cq_mgr_tx and unused cq_mgr_rx_regrq just for QP sake. BULLSEYE_EXCLUDE_BLOCK_START + m_p_cq_mgr_tx = init_tx_cq_mgr(); if (!m_p_cq_mgr_tx) { - qp_logerr("Failed allocating m_p_cq_mgr_tx (errno=%d %m)", errno); + hwqtx_logerr("Failed allocating m_p_cq_mgr_tx (errno=%d %m)", errno); return -1; } - m_p_cq_mgr_rx = init_rx_cq_mgr(desc->rx_comp_event_channel); - if (!m_p_cq_mgr_rx) { - qp_logerr("Failed allocating m_p_cq_mgr_rx (errno=%d %m)", errno); + m_p_cq_mgr_rx_unused = new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, 2, nullptr); + if (!m_p_cq_mgr_rx_unused) { + hwqtx_logerr("Failed allocating m_p_cq_mgr_rx_unused (errno=%d %m)", errno); return -1; } BULLSEYE_EXCLUDE_BLOCK_END - // Modify the cq_mgr_rx and cq_mgr_tx to use a non-blocking event channel - set_fd_block_mode(m_p_cq_mgr_rx->get_channel_fd(), false); + // Modify the cq_mgr_tx to use a non-blocking event channel set_fd_block_mode(m_p_cq_mgr_tx->get_channel_fd(), false); - - qp_logdbg("cq tx: %p rx: %p", m_p_cq_mgr_tx, m_p_cq_mgr_rx); + hwqtx_logdbg("cq tx: %p", m_p_cq_mgr_tx); // Create QP xlio_ibv_qp_init_attr qp_init_attr; @@ -272,10 +224,11 @@ int qp_mgr::configure(struct qp_mgr_desc *desc) // TODO: m_tx_num_wr and m_rx_num_wr should be part of m_mlx5_qp.cap // and assigned as a result of ibv_query_qp() m_mlx5_qp.cap.max_send_wr = m_tx_num_wr; - m_mlx5_qp.cap.max_recv_wr = m_rx_num_wr; + m_mlx5_qp.cap.max_recv_wr = 1; + m_mlx5_qp.cap.max_recv_sge = 1; memcpy(&qp_init_attr.cap, &m_mlx5_qp.cap, sizeof(qp_init_attr.cap)); - qp_init_attr.recv_cq = m_p_cq_mgr_rx->get_ibv_cq_hndl(); + qp_init_attr.recv_cq = m_p_cq_mgr_rx_unused->get_ibv_cq_hndl(); qp_init_attr.send_cq = m_p_cq_mgr_tx->get_ibv_cq_hndl(); qp_init_attr.sq_sig_all = 0; @@ -286,32 +239,24 @@ int qp_mgr::configure(struct qp_mgr_desc *desc) 16 + 14 + 16 * qp_init_attr.cap.max_send_sge + qp_init_attr.cap.max_inline_data + 4; max_wqe_sz += (m_p_ring->is_tso() ? 
m_p_ring->m_tso.max_header_sz : 94); int num_wr = 32678 * 64 / max_wqe_sz; - qp_logdbg("calculated max_wqe_sz=%d num_wr=%d", max_wqe_sz, num_wr); + hwqtx_logdbg("calculated max_wqe_sz=%d num_wr=%d", max_wqe_sz, num_wr); if (num_wr < (signed)m_tx_num_wr) { qp_init_attr.cap.max_send_wr = num_wr; // force min for create_qp or you will have error of memory allocation } - qp_logdbg("Requested QP parameters: " - "wre: tx = %d rx = %d " - "sge: tx = %d rx = %d " - "inline: %d", - qp_init_attr.cap.max_send_wr, qp_init_attr.cap.max_recv_wr, - qp_init_attr.cap.max_send_sge, qp_init_attr.cap.max_recv_sge, - qp_init_attr.cap.max_inline_data); + hwqtx_logdbg("Requested QP parameters: wre: tx = %d sge: tx = %d inline: %d", + qp_init_attr.cap.max_send_wr, qp_init_attr.cap.max_send_sge, + qp_init_attr.cap.max_inline_data); - // Create the QP - if (prepare_ibv_qp(qp_init_attr)) { + // Create the HW Queue + if (prepare_queue(qp_init_attr)) { return -1; } - qp_logdbg("Configured QP parameters: " - "wre: tx = %d rx = %d " - "sge: tx = %d rx = %d " - "inline: %d", - qp_init_attr.cap.max_send_wr, qp_init_attr.cap.max_recv_wr, - qp_init_attr.cap.max_send_sge, qp_init_attr.cap.max_recv_sge, - qp_init_attr.cap.max_inline_data); + hwqtx_logdbg("Configured QP parameters: wre: tx = %d sge: tx = %d inline: %d", + qp_init_attr.cap.max_send_wr, qp_init_attr.cap.max_send_sge, + qp_init_attr.cap.max_inline_data); /* Check initial parameters with actual */ enum ibv_qp_attr_mask attr_mask = IBV_QP_CAP; @@ -319,97 +264,71 @@ int qp_mgr::configure(struct qp_mgr_desc *desc) struct ibv_qp_init_attr tmp_ibv_qp_init_attr; IF_VERBS_FAILURE(ibv_query_qp(m_mlx5_qp.qp, &tmp_ibv_qp_attr, attr_mask, &tmp_ibv_qp_init_attr)) { - qp_logerr("ibv_query_qp failed (errno=%d %m)", errno); + hwqtx_logerr("ibv_query_qp failed (errno=%d %m)", errno); return -1; } ENDIF_VERBS_FAILURE; - m_mlx5_qp.cap.max_send_wr = min(tmp_ibv_qp_attr.cap.max_send_wr, m_mlx5_qp.cap.max_send_wr); - m_mlx5_qp.cap.max_recv_wr = min(tmp_ibv_qp_attr.cap.max_recv_wr, m_mlx5_qp.cap.max_recv_wr); - m_mlx5_qp.cap.max_send_sge = min(tmp_ibv_qp_attr.cap.max_send_sge, m_mlx5_qp.cap.max_send_sge); - m_mlx5_qp.cap.max_recv_sge = min(tmp_ibv_qp_attr.cap.max_recv_sge, m_mlx5_qp.cap.max_recv_sge); + m_mlx5_qp.cap.max_send_wr = + std::min(tmp_ibv_qp_attr.cap.max_send_wr, m_mlx5_qp.cap.max_send_wr); + m_mlx5_qp.cap.max_send_sge = + std::min(tmp_ibv_qp_attr.cap.max_send_sge, m_mlx5_qp.cap.max_send_sge); m_mlx5_qp.cap.max_inline_data = - min(tmp_ibv_qp_attr.cap.max_inline_data, m_mlx5_qp.cap.max_inline_data); + std::min(tmp_ibv_qp_attr.cap.max_inline_data, m_mlx5_qp.cap.max_inline_data); - qp_logdbg("Used QP (num=%d) " - "wre: tx = %d rx = %d " - "sge: tx = %d rx = %d " - "inline: %d", - m_mlx5_qp.qp->qp_num, m_mlx5_qp.cap.max_send_wr, m_mlx5_qp.cap.max_recv_wr, - m_mlx5_qp.cap.max_send_sge, m_mlx5_qp.cap.max_recv_sge, - m_mlx5_qp.cap.max_inline_data); + hwqtx_logdbg("Used QP (num=%d) wre: tx = %d sge: tx = %d inline: %d", m_mlx5_qp.qp->qp_num, + m_mlx5_qp.cap.max_send_wr, m_mlx5_qp.cap.max_send_sge, + m_mlx5_qp.cap.max_inline_data); #if defined(DEFINED_ROCE_LAG) - if (desc->slave && desc->slave->lag_tx_port_affinity > 0) { - const slave_data_t *p_slave = desc->slave; + if (slave && slave->lag_tx_port_affinity > 0) { struct mlx5dv_context attr_out; memset(&attr_out, 0, sizeof(attr_out)); attr_out.comp_mask |= MLX5DV_CONTEXT_MASK_NUM_LAG_PORTS; - if (!mlx5dv_query_device(p_slave->p_ib_ctx->get_ibv_context(), &attr_out)) { - qp_logdbg("QP ROCE LAG port: %d of %d", 
p_slave->lag_tx_port_affinity, - attr_out.num_lag_ports); + if (!mlx5dv_query_device(slave->p_ib_ctx->get_ibv_context(), &attr_out)) { + hwqtx_logdbg("QP ROCE LAG port: %d of %d", slave->lag_tx_port_affinity, + attr_out.num_lag_ports); - if (!mlx5dv_modify_qp_lag_port(m_mlx5_qp.qp, p_slave->lag_tx_port_affinity)) { + if (!mlx5dv_modify_qp_lag_port(m_mlx5_qp.qp, slave->lag_tx_port_affinity)) { uint8_t current_port_num = 0; uint8_t active_port_num = 0; if (!mlx5dv_query_qp_lag_port(m_mlx5_qp.qp, ¤t_port_num, &active_port_num)) { - qp_logdbg("QP ROCE LAG port affinity: %d => %d", current_port_num, - active_port_num); + hwqtx_logdbg("QP ROCE LAG port affinity: %d => %d", current_port_num, + active_port_num); } } } } #endif /* DEFINED_ROCE_LAG */ - // All buffers will be allocated from this qp_mgr buffer pool so we can already set the Rx & Tx - // lkeys - for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) { - m_ibv_rx_wr_array[wr_idx].sg_list = &m_ibv_rx_sg_array[wr_idx]; - m_ibv_rx_wr_array[wr_idx].num_sge = 1; - m_ibv_rx_wr_array[wr_idx].next = - (wr_idx < (m_n_sysvar_rx_num_wr_to_post_recv - 1) ? &m_ibv_rx_wr_array[wr_idx + 1] - : NULL); // pre-define the linked list - } - - m_curr_rx_wr = 0; return 0; } -void qp_mgr::up() +void hw_queue_tx::up() { - init_qp(); - - _tir.reset(create_tir()); - if (!_tir) { - qp_logpanic("TIR creation for qp_mgr failed (errno=%d %m)", errno); - } + init_queue(); // Add buffers - qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_mlx5_qp.qp)); + hwqtx_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_mlx5_qp.qp)); m_p_cq_mgr_tx->add_qp_tx(this); - release_rx_buffers(); // We might have old flushed cqe's in our CQ still from previous HA event release_tx_buffers(); - modify_qp_to_ready_state(); - - m_p_cq_mgr_rx->add_qp_rx(this); + modify_queue_to_ready_state(); init_device_memory(); } -void qp_mgr::down() +void hw_queue_tx::down() { - _tir.reset(nullptr); - if (m_dm_enabled) { m_dm_mgr.release_resources(); } - qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_mlx5_qp.qp)); - modify_qp_to_error_state(); + hwqtx_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_mlx5_qp.qp)); + modify_queue_to_error_state(); // free buffers from current active resource iterator trigger_completion_for_all_sent_packets(); @@ -419,81 +338,24 @@ void qp_mgr::down() usleep(1000); release_tx_buffers(); - release_rx_buffers(); m_p_cq_mgr_tx->del_qp_tx(this); - m_p_cq_mgr_rx->del_qp_rx(this); } -void qp_mgr::release_rx_buffers() -{ - int total_ret = m_curr_rx_wr; - if (m_curr_rx_wr) { - qp_logdbg("Returning %d pending post_recv buffers to CQ owner", m_curr_rx_wr); - while (m_curr_rx_wr) { - // Cleaning unposted buffers. Unposted buffers are not attached to any strides. 
- --m_curr_rx_wr; - mem_buf_desc_t *p_mem_buf_desc = - (mem_buf_desc_t *)(uintptr_t)m_ibv_rx_wr_array[m_curr_rx_wr].wr_id; - if (p_mem_buf_desc && p_mem_buf_desc->p_desc_owner) { - m_p_ring->mem_buf_desc_return_to_owner_rx(p_mem_buf_desc); - } else { - g_buffer_pool_rx_rwqe->put_buffers_thread_safe(p_mem_buf_desc); - } - } - } - // Wait for all FLUSHed WQE on Rx CQ - qp_logdbg("draining cq_mgr_rx %p (last_posted_rx_wr_id = %lu)", m_p_cq_mgr_rx, - m_last_posted_rx_wr_id); - uintptr_t last_polled_rx_wr_id = 0; - while (m_p_cq_mgr_rx && last_polled_rx_wr_id != m_last_posted_rx_wr_id && errno != EIO && - !m_p_ib_ctx_handler->is_removed() && !is_rq_empty() && !g_b_exit) { - - // Process the FLUSH'ed WQE's - int ret = m_p_cq_mgr_rx->drain_and_proccess(&last_polled_rx_wr_id); - qp_logdbg("draining completed on cq_mgr_rx (%d wce) last_polled_rx_wr_id = %lu", ret, - last_polled_rx_wr_id); - - total_ret += ret; - - if (!ret) { - // Query context for ib_verbs events (especially for IBV_EVENT_DEVICE_FATAL) - g_p_event_handler_manager->query_for_ibverbs_event( - m_p_ib_ctx_handler->get_ibv_context()->async_fd); - } - - // Add short delay (500 usec) to allow for WQE's to be flushed to CQ every poll cycle - const struct timespec short_sleep = {0, 500000}; // 500 usec - nanosleep(&short_sleep, NULL); - } - m_last_posted_rx_wr_id = 0; // Clear the posted WR_ID flag, we just clear the entire RQ - qp_logdbg("draining completed with a total of %d wce's on cq_mgr_rx", total_ret); - NOT_IN_USE(total_ret); // Suppress --enable-opt-log=high warning -} - -void qp_mgr::release_tx_buffers() +void hw_queue_tx::release_tx_buffers() { int ret; uint64_t poll_sn = 0; - qp_logdbg("draining cq_mgr_tx %p", m_p_cq_mgr_tx); + hwqtx_logdbg("draining cq_mgr_tx %p", m_p_cq_mgr_tx); while (m_p_cq_mgr_tx && m_mlx5_qp.qp && ((ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn)) > 0) && (errno != EIO && !m_p_ib_ctx_handler->is_removed())) { - qp_logdbg("draining completed on cq_mgr_tx (%d wce)", ret); + hwqtx_logdbg("draining completed on cq_mgr_tx (%d wce)", ret); } NOT_IN_USE(ret); // Suppress --enable-opt-log=high warning } -void qp_mgr::post_recv_buffers(descq_t *p_buffers, size_t count) -{ - qp_logfuncall(""); - // Called from cq_mgr_rx context under cq_mgr_rx::LOCK! 
- while (count--) { - post_recv_buffer(p_buffers->get_and_pop_front()); - } -} - -int qp_mgr::send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, - unsigned credits) +int hw_queue_tx::send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, + unsigned credits) { mem_buf_desc_t *p_mem_buf_desc = (mem_buf_desc_t *)p_send_wqe->wr_id; /* Control tx completions: @@ -507,7 +369,7 @@ int qp_mgr::send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio */ bool request_comp = (p_mem_buf_desc->m_flags & mem_buf_desc_t::ZCOPY); - qp_logfunc("VERBS send, unsignaled_count: %d", m_n_unsignaled_count); + hwqtx_logfunc("VERBS send, unsignaled_count: %d", m_n_unsignaled_count); // TODO send_to_wire() and send() can return void after removing ibverbs support if (send_to_wire(p_send_wqe, attr, request_comp, tis, credits)) { @@ -519,66 +381,50 @@ int qp_mgr::send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio int ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&dummy_poll_sn); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { - qp_logerr("error from cq_mgr_tx->process_next_element (ret=%d %m)", ret); + hwqtx_logerr("error from cq_mgr_tx->process_next_element (ret=%d %m)", ret); } BULLSEYE_EXCLUDE_BLOCK_END - qp_logfunc("polling succeeded on cq_mgr_tx (%d wce)", ret); + hwqtx_logfunc("polling succeeded on cq_mgr_tx (%d wce)", ret); } return 0; } -void qp_mgr::modify_qp_to_ready_state() +void hw_queue_tx::modify_queue_to_ready_state() { - qp_logdbg(""); + hwqtx_logdbg(""); int ret = 0; int qp_state = priv_ibv_query_qp_state(m_mlx5_qp.qp); if (qp_state != IBV_QPS_INIT) { BULLSEYE_EXCLUDE_BLOCK_START if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_mlx5_qp.qp, m_port_num)) != 0) { - qp_logpanic("failed to modify QP from %d to RTS state (ret = %d)", qp_state, ret); + hwqtx_logpanic("failed to modify QP from %d to RTS state (ret = %d)", qp_state, ret); } BULLSEYE_EXCLUDE_BLOCK_END } BULLSEYE_EXCLUDE_BLOCK_START if ((ret = priv_ibv_modify_qp_from_init_to_rts(m_mlx5_qp.qp)) != 0) { - qp_logpanic("failed to modify QP from INIT to RTS state (ret = %d)", ret); + hwqtx_logpanic("failed to modify QP from INIT to RTS state (ret = %d)", ret); } BULLSEYE_EXCLUDE_BLOCK_END - - modify_rq_to_ready_state(); } -void qp_mgr::modify_qp_to_error_state() +void hw_queue_tx::modify_queue_to_error_state() { - qp_logdbg(""); - - m_p_cq_mgr_rx->clean_cq(); + hwqtx_logdbg(""); BULLSEYE_EXCLUDE_BLOCK_START if (priv_ibv_modify_qp_to_err(m_mlx5_qp.qp)) { - qp_logdbg("ibv_modify_qp failure (errno = %d %m)", errno); + hwqtx_logdbg("ibv_modify_qp failure (errno = %d %m)", errno); } BULLSEYE_EXCLUDE_BLOCK_END - - dpcp::status rc = _rq->modify_state(dpcp::RQ_ERR); - - /* During plugout theres is possibility that kernel - * remove device resources before working process complete - * removing process. As a result ibv api function can - * return EIO=5 errno code. 
- */ - if (dpcp::DPCP_OK != rc && errno != EIO) { - qp_logerr("Failed to modify rq state to ERR, rc: %d, rqn: %" PRIu32, static_cast(rc), - m_mlx5_qp.rqn); - } } -int qp_mgr::prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr) +int hw_queue_tx::prepare_queue(xlio_ibv_qp_init_attr &qp_init_attr) { - qp_logdbg(""); + hwqtx_logdbg(""); int ret = 0; qp_init_attr.qp_type = IBV_QPT_RAW_PACKET; @@ -586,19 +432,19 @@ int qp_mgr::prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr) if (m_p_ring->is_tso()) { xlio_ibv_qp_init_attr_tso(qp_init_attr, m_p_ring->get_max_header_sz()); - qp_logdbg("create qp with max_tso_header = %d", m_p_ring->get_max_header_sz()); + hwqtx_logdbg("create qp with max_tso_header = %d", m_p_ring->get_max_header_sz()); } m_mlx5_qp.qp = xlio_ibv_create_qp(m_p_ib_ctx_handler->get_ibv_pd(), &qp_init_attr); BULLSEYE_EXCLUDE_BLOCK_START if (!m_mlx5_qp.qp) { - qp_logerr("ibv_create_qp failed (errno=%d %m)", errno); + hwqtx_logerr("ibv_create_qp failed (errno=%d %m)", errno); return -1; } VALGRIND_MAKE_MEM_DEFINED(m_mlx5_qp.qp, sizeof(ibv_qp)); if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_mlx5_qp.qp, m_port_num)) != 0) { - qp_logerr("failed to modify QP from ERR to INIT state (ret = %d)", ret); + hwqtx_logerr("failed to modify QP from ERR to INIT state (ret = %d)", ret); return ret; } BULLSEYE_EXCLUDE_BLOCK_END @@ -606,57 +452,10 @@ int qp_mgr::prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr) return 0; } -uint32_t qp_mgr::is_ratelimit_change(struct xlio_rate_limit_t &rate_limit) -{ - uint32_t rl_changes = 0; - - if (m_rate_limit.rate != rate_limit.rate) { - rl_changes |= RL_RATE; - } - if (m_rate_limit.max_burst_sz != rate_limit.max_burst_sz) { - rl_changes |= RL_BURST_SIZE; - } - if (m_rate_limit.typical_pkt_sz != rate_limit.typical_pkt_sz) { - rl_changes |= RL_PKT_SIZE; - } - - return rl_changes; -} - -int qp_mgr::modify_qp_ratelimit(struct xlio_rate_limit_t &rate_limit, uint32_t rl_changes) -{ - int ret; - - ret = priv_ibv_modify_qp_ratelimit(m_mlx5_qp.qp, rate_limit, rl_changes); - if (ret) { - qp_logdbg("failed to modify qp ratelimit ret %d (errno=%d %m)", ret, errno); - return -1; - } - - m_rate_limit = rate_limit; - return 0; -} - -rfs_rule *qp_mgr::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext) -{ - if (m_p_ib_ctx_handler && m_p_ib_ctx_handler->get_dpcp_adapter()) { - // TLS RX uses tir_ext. - dpcp::tir *dpcp_tir = (tir_ext ? 
xlio_tir_to_dpcp_tir(tir_ext) : _tir.get()); - - std::unique_ptr new_rule(new rfs_rule_dpcp()); - if (dpcp_tir && - new_rule->create(attrs, *dpcp_tir, *m_p_ib_ctx_handler->get_dpcp_adapter())) { - return new_rule.release(); - } - } - - return nullptr; -} - -void qp_mgr::init_qp() +void hw_queue_tx::init_queue() { if (0 != xlio_ib_mlx5_get_qp_tx(&m_mlx5_qp)) { - qp_logpanic("xlio_ib_mlx5_get_qp_tx failed (errno=%d %m)", errno); + hwqtx_logpanic("xlio_ib_mlx5_get_qp_tx failed (errno=%d %m)", errno); } m_sq_wqes = (struct mlx5_eth_wqe(*)[])(uintptr_t)m_mlx5_qp.sq.buf; @@ -689,15 +488,15 @@ void qp_mgr::init_qp() (sq_wqe_prop *)mmap(NULL, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (m_sq_wqe_idx_to_prop == MAP_FAILED) { - qp_logerr("Failed allocating m_sq_wqe_idx_to_prop (errno=%d %m)", errno); + hwqtx_logerr("Failed allocating m_sq_wqe_idx_to_prop (errno=%d %m)", errno); return; } m_sq_wqe_prop_last_signalled = m_tx_num_wr - 1; m_sq_wqe_prop_last = NULL; } - qp_logfunc("m_tx_num_wr=%d max_inline_data: %d m_sq_wqe_idx_to_prop=%p", m_tx_num_wr, - get_max_inline_data(), m_sq_wqe_idx_to_prop); + hwqtx_logfunc("m_tx_num_wr=%d max_inline_data: %d m_sq_wqe_idx_to_prop=%p", m_tx_num_wr, + get_max_inline_data(), m_sq_wqe_idx_to_prop); memset((void *)(uintptr_t)m_sq_wqe_hot, 0, sizeof(struct mlx5_eth_wqe)); m_sq_wqe_hot->ctrl.data[0] = htonl(MLX5_OPCODE_SEND); @@ -706,13 +505,13 @@ void qp_mgr::init_qp() m_sq_wqe_hot->eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); m_sq_wqe_hot->eseg.cs_flags = XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM; - qp_logfunc("%p allocated for %d QPs sq_wqes:%p sq_wqes_end: %p and configured %d WRs " - "BlueFlame: %p buf_size: %d offset: %d", - m_mlx5_qp.qp, m_mlx5_qp.qpn, m_sq_wqes, m_sq_wqes_end, m_tx_num_wr, m_mlx5_qp.bf.reg, - m_mlx5_qp.bf.size, m_mlx5_qp.bf.offset); + hwqtx_logfunc("%p allocated for %d QPs sq_wqes:%p sq_wqes_end: %p and configured %d WRs " + "BlueFlame: %p buf_size: %d offset: %d", + m_mlx5_qp.qp, m_mlx5_qp.qpn, m_sq_wqes, m_sq_wqes_end, m_tx_num_wr, + m_mlx5_qp.bf.reg, m_mlx5_qp.bf.size, m_mlx5_qp.bf.offset); } -void qp_mgr::init_device_memory() +void hw_queue_tx::init_device_memory() { /* This limitation is done because of a observation * that dm_copy takes a lot of time on VMs w/o BF (RM:1542628) @@ -732,18 +531,7 @@ void qp_mgr::init_device_memory() } } -#if defined(DEFINED_UTLS) -void qp_mgr::destroy_tis_cache(void) -{ - while (!m_tls_tis_cache.empty()) { - xlio_tis *tis = m_tls_tis_cache.back(); - m_tls_tis_cache.pop_back(); - delete tis; - } -} -#endif /* defined(DEFINED_UTLS) */ - -void qp_mgr::update_next_wqe_hot() +void hw_queue_tx::update_next_wqe_hot() { // Preparing next WQE as Ethernet send WQE and index: m_sq_wqe_hot = &(*m_sq_wqes)[m_sq_wqe_counter & (m_tx_num_wr - 1)]; @@ -756,107 +544,15 @@ void qp_mgr::update_next_wqe_hot() eth_seg->inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); } -void qp_mgr::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) -{ - uint32_t index = (m_curr_rx_wr * m_mlx5_qp.cap.max_recv_sge) + _strq_wqe_reserved_seg; - m_ibv_rx_sg_array[index].addr = (uintptr_t)p_mem_buf_desc->p_buffer; - m_ibv_rx_sg_array[index].length = p_mem_buf_desc->sz_buffer; - m_ibv_rx_sg_array[index].lkey = p_mem_buf_desc->lkey; - - post_recv_buffer_rq(p_mem_buf_desc); -} - -void qp_mgr::post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc) -{ - if (m_n_sysvar_rx_prefetch_bytes_before_poll) { - if (m_p_prev_rx_desc_pushed) { - m_p_prev_rx_desc_pushed->p_prev_desc = 
p_mem_buf_desc; - } - m_p_prev_rx_desc_pushed = p_mem_buf_desc; - } - - m_ibv_rx_wr_array[m_curr_rx_wr].wr_id = (uintptr_t)p_mem_buf_desc; - - if (m_rq_wqe_idx_to_wrid) { - uint32_t index = m_rq_wqe_counter & (m_rx_num_wr - 1); - m_rq_wqe_idx_to_wrid[index] = (uintptr_t)p_mem_buf_desc; - ++m_rq_wqe_counter; - } - - if (m_curr_rx_wr == m_n_sysvar_rx_num_wr_to_post_recv - 1) { - - m_last_posted_rx_wr_id = (uintptr_t)p_mem_buf_desc; - - m_p_prev_rx_desc_pushed = NULL; - p_mem_buf_desc->p_prev_desc = NULL; - - m_curr_rx_wr = 0; - struct ibv_recv_wr *bad_wr = NULL; - IF_VERBS_FAILURE(xlio_ib_mlx5_post_recv(&m_mlx5_qp, &m_ibv_rx_wr_array[0], &bad_wr)) - { - uint32_t n_pos_bad_rx_wr = - ((uint8_t *)bad_wr - (uint8_t *)m_ibv_rx_wr_array) / sizeof(struct ibv_recv_wr); - qp_logerr("failed posting list (errno=%d %s)", errno, strerror(errno)); - qp_logerr("bad_wr is %d in submitted list (bad_wr=%p, m_ibv_rx_wr_array=%p, size=%zu)", - n_pos_bad_rx_wr, bad_wr, m_ibv_rx_wr_array, sizeof(struct ibv_recv_wr)); - qp_logerr("bad_wr info: wr_id=%#lx, next=%p, addr=%#lx, length=%d, lkey=%#x", - bad_wr[0].wr_id, bad_wr[0].next, bad_wr[0].sg_list[0].addr, - bad_wr[0].sg_list[0].length, bad_wr[0].sg_list[0].lkey); - qp_logerr("QP current state: %d", priv_ibv_query_qp_state(m_mlx5_qp.qp)); - - // Fix broken linked list of rx_wr - if (n_pos_bad_rx_wr != (m_n_sysvar_rx_num_wr_to_post_recv - 1)) { - m_ibv_rx_wr_array[n_pos_bad_rx_wr].next = &m_ibv_rx_wr_array[n_pos_bad_rx_wr + 1]; - } - throw; - } - ENDIF_VERBS_FAILURE; - qp_logfunc("Successful ibv_post_recv"); - } else { - m_curr_rx_wr++; - } -} - -bool qp_mgr::init_rx_cq_mgr_prepare() -{ - m_rx_num_wr = align32pow2(m_rx_num_wr); - - m_rq_wqe_idx_to_wrid = - (uint64_t *)mmap(NULL, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid), PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (m_rq_wqe_idx_to_wrid == MAP_FAILED) { - qp_logerr("Failed allocating m_rq_wqe_idx_to_wrid (errno=%d %m)", errno); - return false; - } - - return true; -} - -cq_mgr_rx *qp_mgr::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) -{ - if (!init_rx_cq_mgr_prepare()) { - return nullptr; - } - - if (safe_mce_sys().enable_striding_rq) { - return new cq_mgr_rx_strq(m_p_ring, m_p_ib_ctx_handler, - safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, - safe_mce_sys().strq_stride_size_bytes, - safe_mce_sys().strq_stride_num_per_rwqe, p_rx_comp_event_channel); - } - - return new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, p_rx_comp_event_channel); -} - -cq_mgr_tx *qp_mgr::init_tx_cq_mgr() +cq_mgr_tx *hw_queue_tx::init_tx_cq_mgr() { m_tx_num_wr = align32pow2(m_tx_num_wr); return new cq_mgr_tx(m_p_ring, m_p_ib_ctx_handler, m_tx_num_wr, m_p_ring->get_tx_comp_event_channel()); } -inline void qp_mgr::ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top, - bool skip_comp /*=false*/) +inline void hw_queue_tx::ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top, + bool skip_comp /*=false*/) { uint64_t *dst = (uint64_t *)((uint8_t *)m_mlx5_qp.bf.reg + m_mlx5_qp.bf.offset); uint64_t *src = reinterpret_cast(m_sq_wqe_hot); @@ -912,8 +608,8 @@ inline void qp_mgr::ring_doorbell(int db_method, int num_wqebb, int num_wqebb_to m_mlx5_qp.bf.offset ^= m_mlx5_qp.bf.size; } -inline int qp_mgr::fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, - int max_inline_len, int inline_len) +inline int hw_queue_tx::fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, + int max_inline_len, int inline_len) { int wqe_inline_size = 0; 
while ((data_addr != NULL) && inline_len) { @@ -923,14 +619,14 @@ inline int qp_mgr::fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *da cur_seg += inline_len; inline_len = max_inline_len - wqe_inline_size; data_addr = sga.get_data(&inline_len); - qp_logfunc("data_addr:%p cur_seg: %p inline_len: %d wqe_inline_size: %d", data_addr, - cur_seg, inline_len, wqe_inline_size); + hwqtx_logfunc("data_addr:%p cur_seg: %p inline_len: %d wqe_inline_size: %d", data_addr, + cur_seg, inline_len, wqe_inline_size); } return wqe_inline_size; } //! Fill WQE dynamically, based on amount of free WQEBB in SQ -inline int qp_mgr::fill_wqe(xlio_ibv_send_wr *pswr) +inline int hw_queue_tx::fill_wqe(xlio_ibv_send_wr *pswr) { // control segment is mostly filled by preset after previous packet // we always inline ETH header @@ -945,7 +641,7 @@ inline int qp_mgr::fill_wqe(xlio_ibv_send_wr *pswr) if (likely(data_len <= max_inline_len && xlio_send_wr_opcode(*pswr) == XLIO_IBV_WR_SEND)) { uint8_t *data_addr = sga.get_data(&inline_len); // data for inlining in ETH header data_len -= inline_len; - qp_logfunc( + hwqtx_logfunc( "wqe_hot:%p num_sge: %d data_addr: %p data_len: %d max_inline_len: %d inline_len: %d", m_sq_wqe_hot, pswr->num_sge, data_addr, data_len, max_inline_len, inline_len); @@ -965,8 +661,8 @@ inline int qp_mgr::fill_wqe(xlio_ibv_send_wr *pswr) // to end of WQEs if (likely(max_inline_len <= rest_space)) { inline_len = max_inline_len; - qp_logfunc("data_addr:%p cur_seg: %p rest_space: %d inline_len: %d wqe_size: %d", - data_addr, cur_seg, rest_space, inline_len, wqe_size); + hwqtx_logfunc("data_addr:%p cur_seg: %p rest_space: %d inline_len: %d wqe_size: %d", + data_addr, cur_seg, rest_space, inline_len, wqe_size); // bypass inline size and fill inline data segment data_addr = sga.get_data(&inline_len); inline_len = fill_inl_segment(sga, cur_seg + 4, data_addr, max_inline_len, inline_len); @@ -980,17 +676,17 @@ inline int qp_mgr::fill_wqe(xlio_ibv_send_wr *pswr) // configuring control m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); rest_space = align_to_WQEBB_up(wqe_size) / 4; - qp_logfunc("data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", data_len - inline_len, - inline_len, wqe_size, rest_space); + hwqtx_logfunc("data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", + data_len - inline_len, inline_len, wqe_size, rest_space); ring_doorbell(m_db_method, rest_space); return rest_space; } else { // wrap around case, first filling till the end of m_sq_wqes int wrap_up_size = max_inline_len - rest_space; inline_len = rest_space; - qp_logfunc("WRAP_UP_SIZE: %d data_addr:%p cur_seg: %p rest_space: %d inline_len: %d " - "wqe_size: %d", - wrap_up_size, data_addr, cur_seg, rest_space, inline_len, wqe_size); + hwqtx_logfunc("WRAP_UP_SIZE: %d data_addr:%p cur_seg: %p rest_space: %d inline_len: %d " + "wqe_size: %d", + wrap_up_size, data_addr, cur_seg, rest_space, inline_len, wqe_size); data_addr = sga.get_data(&inline_len); inline_len = fill_inl_segment(sga, cur_seg + 4, data_addr, rest_space, inline_len); @@ -1000,7 +696,7 @@ inline int qp_mgr::fill_wqe(xlio_ibv_send_wr *pswr) rest_space = align_to_WQEBB_up(rest_space / OCTOWORD) / 4; // size of 1st chunk at the end - qp_logfunc( + hwqtx_logfunc( "END chunk data_addr: %p data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", data_addr, data_len, inline_len, wqe_size, rest_space); // Wrap around @@ -1016,10 +712,10 @@ inline int qp_mgr::fill_wqe(xlio_ibv_send_wr *pswr) // store inline data size *(uint32_t *)((uint8_t *)m_sq_wqe_hot + 
sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_eth_seg)) = htonl(0x80000000 | inline_len); - qp_logfunc("BEGIN_CHUNK data_addr: %p data_len: %d wqe_size: %d inline_len: %d " - "end_wqebbs: %d wqebbs: %d", - data_addr, data_len - wrap_up_size, wqe_size, inline_len + wrap_up_size, - rest_space, max_inline_len); + hwqtx_logfunc("BEGIN_CHUNK data_addr: %p data_len: %d wqe_size: %d inline_len: %d " + "end_wqebbs: %d wqebbs: %d", + data_addr, data_len - wrap_up_size, wqe_size, inline_len + wrap_up_size, + rest_space, max_inline_len); // assert((data_len-wrap_up_size)==0); // configuring control m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); @@ -1048,7 +744,7 @@ inline int qp_mgr::fill_wqe(xlio_ibv_send_wr *pswr) return 1; } -inline int qp_mgr::fill_wqe_send(xlio_ibv_send_wr *pswr) +inline int hw_queue_tx::fill_wqe_send(xlio_ibv_send_wr *pswr) { struct mlx5_wqe_eth_seg *eseg; struct mlx5_wqe_data_seg *dseg; @@ -1092,7 +788,7 @@ inline int qp_mgr::fill_wqe_send(xlio_ibv_send_wr *pswr) } //! Filling wqe for LSO -inline int qp_mgr::fill_wqe_lso(xlio_ibv_send_wr *pswr) +inline int hw_queue_tx::fill_wqe_lso(xlio_ibv_send_wr *pswr) { struct mlx5_wqe_ctrl_seg *ctrl = NULL; struct mlx5_wqe_eth_seg *eseg = NULL; @@ -1140,8 +836,8 @@ inline int qp_mgr::fill_wqe_lso(xlio_ibv_send_wr *pswr) inl_hdr_copy_size = align_to_WQEBB_up(wqe_size) / 4; } wqe_size += max_inline_len / OCTOWORD; - qp_logfunc("TSO: num_sge: %d max_inline_len: %d inl_hdr_size: %d rest: %d", pswr->num_sge, - max_inline_len, inl_hdr_size, rest); + hwqtx_logfunc("TSO: num_sge: %d max_inline_len: %d inl_hdr_size: %d rest: %d", pswr->num_sge, + max_inline_len, inl_hdr_size, rest); // Filling data pointer segments with payload by scatter-gather list elements dpseg = (struct mlx5_wqe_data_seg *)cur_seg; for (i = 0; i < pswr->num_sge; i++) { @@ -1153,8 +849,8 @@ inline int qp_mgr::fill_wqe_lso(xlio_ibv_send_wr *pswr) dpseg->lkey = htonl(pswr->sg_list[i].lkey); dpseg->byte_count = htonl(pswr->sg_list[i].length); - qp_logfunc("DATA_SEG: addr:%llx len: %d lkey: %x dp_seg: %p wqe_size: %d", - pswr->sg_list[i].addr, pswr->sg_list[i].length, dpseg->lkey, dpseg, wqe_size); + hwqtx_logfunc("DATA_SEG: addr:%llx len: %d lkey: %x dp_seg: %p wqe_size: %d", + pswr->sg_list[i].addr, pswr->sg_list[i].length, dpseg->lkey, dpseg, wqe_size); dpseg++; wqe_size += sizeof(struct mlx5_wqe_data_seg) / OCTOWORD; @@ -1176,7 +872,7 @@ inline int qp_mgr::fill_wqe_lso(xlio_ibv_send_wr *pswr) return align_to_WQEBB_up(wqe_size) / 4; } -void qp_mgr::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, xlio_ti *ti) +void hw_queue_tx::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, xlio_ti *ti) { m_sq_wqe_idx_to_prop[m_sq_wqe_hot_index] = sq_wqe_prop { .buf = buf, @@ -1192,8 +888,8 @@ void qp_mgr::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, xlio_ //! 
Send one RAW packet by MLX5 BlueFlame // -int qp_mgr::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, - bool request_comp, xlio_tis *tis, unsigned credits) +int hw_queue_tx::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, + bool request_comp, xlio_tis *tis, unsigned credits) { struct xlio_mlx5_wqe_ctrl_seg *ctrl = NULL; struct mlx5_wqe_eth_seg *eseg = NULL; @@ -1229,7 +925,7 @@ int qp_mgr::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr at update_next_wqe_hot(); - qp_logfunc( + hwqtx_logfunc( "m_sq_wqe_hot: %p m_sq_wqe_hot_index: %d wqe_counter: %d new_hot_index: %d wr_id: %llx", m_sq_wqe_hot, m_sq_wqe_hot_index, m_sq_wqe_counter, (m_sq_wqe_counter & (m_tx_num_wr - 1)), p_send_wqe->wr_id); @@ -1237,9 +933,127 @@ int qp_mgr::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr at return 0; } -#ifdef DEFINED_UTLS +std::unique_ptr hw_queue_tx::create_tis(uint32_t flags) +{ + dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); + bool is_tls = flags & dpcp::TIS_ATTR_TLS, is_nvme = flags & dpcp::TIS_ATTR_NVMEOTCP; + if (unlikely(adapter == nullptr || (is_tls && is_nvme))) { + return nullptr; + } + + dpcp::tis::attr tis_attr = { + .flags = flags, + .tls_en = is_tls, + .nvmeotcp = is_nvme, + .transport_domain = adapter->get_td(), + .pd = adapter->get_pd(), + }; + + dpcp::tis *dpcp_tis = nullptr; + if (unlikely(adapter->create_tis(tis_attr, dpcp_tis) != dpcp::DPCP_OK)) { + hwqtx_logerr("Failed to create TIS with NVME enabled"); + return nullptr; + } + + auto tis_type = is_tls ? xlio_ti::ti_type::TLS_TIS : xlio_ti::ti_type::NVME_TIS; + return std::make_unique(this, std::unique_ptr(dpcp_tis), tis_type); +} + +static inline void nvme_fill_static_params_control(xlio_mlx5_wqe_ctrl_seg *cseg, + xlio_mlx5_wqe_umr_ctrl_seg *ucseg, + uint32_t producer_index, uint32_t qpn, + uint32_t tisn, uint8_t fence_flags) +{ + memset(cseg, 0, sizeof(*cseg)); + memset(ucseg, 0, sizeof(*ucseg)); + cseg->opmod_idx_opcode = + htobe32(((producer_index & 0xffff) << 8) | MLX5_OPCODE_UMR | + (MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_STATIC_PARAMS << 24)); + size_t num_wqe_ds = 12U; + cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | num_wqe_ds); + cseg->fm_ce_se = fence_flags; + cseg->tis_tir_num = htobe32(tisn << MLX5_WQE_CTRL_TIR_TIS_INDEX_SHIFT); + + ucseg->flags = MLX5_UMR_INLINE; + ucseg->bsf_octowords = htobe16(MLX5E_TRANSPORT_STATIC_PARAMS_OCTWORD_SIZE); +} + +static inline void nvme_fill_static_params_transport_params( + mlx5_wqe_transport_static_params_seg *params, uint32_t config) -std::unique_ptr qp_mgr::get_new_tls_dek(const void *key, uint32_t key_size_bytes) +{ + memset(params, 0, sizeof(*params)); + void *ctx = params->ctx; + + DEVX_SET(transport_static_params, ctx, const_1, 1); + DEVX_SET(transport_static_params, ctx, const_2, 2); + DEVX_SET(transport_static_params, ctx, acc_type, MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_NVMETCP); + DEVX_SET(transport_static_params, ctx, nvme_resync_tcp_sn, 0); + DEVX_SET(transport_static_params, ctx, pda, static_cast(config & XLIO_NVME_PDA_MASK)); + DEVX_SET(transport_static_params, ctx, ddgst_en, bool(config & XLIO_NVME_DDGST_ENABLE)); + DEVX_SET(transport_static_params, ctx, ddgst_offload_en, + bool(config & XLIO_NVME_DDGST_OFFLOAD)); + DEVX_SET(transport_static_params, ctx, hddgst_en, bool(config & XLIO_NVME_HDGST_ENABLE)); + DEVX_SET(transport_static_params, ctx, hdgst_offload_en, + bool(config & XLIO_NVME_HDGST_OFFLOAD)); + DEVX_SET(transport_static_params, ctx, ti, 
MLX5_TRANSPORT_STATIC_PARAMS_TI_INITIATOR); + DEVX_SET(transport_static_params, ctx, const1, 1); + DEVX_SET(transport_static_params, ctx, zero_copy_en, 0); +} + +static inline void nvme_fill_progress_wqe(mlx5e_set_nvmeotcp_progress_params_wqe *wqe, + uint32_t producer_index, uint32_t qpn, uint32_t tisn, + uint32_t tcp_seqno, uint8_t fence_flags) +{ + memset(wqe, 0, sizeof(*wqe)); + auto cseg = &wqe->ctrl.ctrl; + + size_t progres_params_ds = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS); + cseg->opmod_idx_opcode = + htobe32(((producer_index & 0xffff) << 8) | XLIO_MLX5_OPCODE_SET_PSV | + (MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_PROGRESS_PARAMS << 24)); + cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | progres_params_ds); + cseg->fm_ce_se = fence_flags; + + mlx5_seg_nvmeotcp_progress_params *params = &wqe->params; + params->tir_num = htobe32(tisn); + void *ctx = params->ctx; + + DEVX_SET(nvmeotcp_progress_params, ctx, next_pdu_tcp_sn, tcp_seqno); + DEVX_SET(nvmeotcp_progress_params, ctx, pdu_tracker_state, + MLX5E_NVMEOTCP_PROGRESS_PARAMS_PDU_TRACKER_STATE_START); + /* if (is_tx) offloading state == 0*/ + DEVX_SET(nvmeotcp_progress_params, ctx, offloading_state, 0); +} + +void hw_queue_tx::nvme_set_static_context(xlio_tis *tis, uint32_t config) +{ + auto *cseg = wqebb_get(0U); + auto *ucseg = wqebb_get(0U, sizeof(*cseg)); + + nvme_fill_static_params_control(cseg, ucseg, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), + 0); + memset(wqebb_get(1U), 0, sizeof(mlx5_mkey_seg)); + + auto *params = wqebb_get(2U); + nvme_fill_static_params_transport_params(params, config); + store_current_wqe_prop(nullptr, SQ_CREDITS_UMR, tis); + ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS); + update_next_wqe_hot(); +} + +void hw_queue_tx::nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) +{ + auto *wqe = reinterpret_cast(m_sq_wqe_hot); + nvme_fill_progress_wqe(wqe, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), tcp_seqno, + MLX5_FENCE_MODE_INITIATOR_SMALL); + store_current_wqe_prop(nullptr, SQ_CREDITS_SET_PSV, tis); + ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS); + update_next_wqe_hot(); +} + +#if defined(DEFINED_UTLS) +std::unique_ptr hw_queue_tx::get_new_tls_dek(const void *key, uint32_t key_size_bytes) { dpcp::tls_dek *_dek = nullptr; dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); @@ -1253,7 +1067,7 @@ std::unique_ptr qp_mgr::get_new_tls_dek(const void *key, uint32_t dek_attr.pd_id = adapter->get_pd(); status = adapter->create_tls_dek(dek_attr, _dek); if (unlikely(status != dpcp::DPCP_OK)) { - qp_logwarn("Failed to create new DEK, status: %d", status); + hwqtx_logwarn("Failed to create new DEK, status: %d", status); if (_dek) { delete _dek; _dek = nullptr; @@ -1264,7 +1078,7 @@ std::unique_ptr qp_mgr::get_new_tls_dek(const void *key, uint32_t return std::unique_ptr(_dek); } -std::unique_ptr qp_mgr::get_tls_dek(const void *key, uint32_t key_size_bytes) +std::unique_ptr hw_queue_tx::get_tls_dek(const void *key, uint32_t key_size_bytes) { dpcp::status status; dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); @@ -1286,12 +1100,12 @@ std::unique_ptr qp_mgr::get_tls_dek(const void *key, uint32_t key } if (unlikely(m_tls_dek_get_cache.empty())) { - qp_logdbg("Empty DEK get cache. Swapping caches and do Sync-Crypto. Put-Cache size: %zu", - m_tls_dek_put_cache.size()); + hwqtx_logdbg("Empty DEK get cache. Swapping caches and do Sync-Crypto. 
Put-Cache size: %zu", + m_tls_dek_put_cache.size()); status = adapter->sync_crypto_tls(); if (unlikely(status != dpcp::DPCP_OK)) { - qp_logwarn("Failed to flush DEK HW cache, status: %d", status); + hwqtx_logwarn("Failed to flush DEK HW cache, status: %d", status); return get_new_tls_dek(key, key_size_bytes); } @@ -1309,14 +1123,14 @@ std::unique_ptr qp_mgr::get_tls_dek(const void *key, uint32_t key dek_attr.pd_id = adapter->get_pd(); status = out_dek->modify(dek_attr); if (unlikely(status != dpcp::DPCP_OK)) { - qp_logwarn("Failed to modify DEK, status: %d", status); + hwqtx_logwarn("Failed to modify DEK, status: %d", status); out_dek.reset(nullptr); } return out_dek; } -void qp_mgr::put_tls_dek(std::unique_ptr &&tls_dek_obj) +void hw_queue_tx::put_tls_dek(std::unique_ptr &&tls_dek_obj) { if (tls_dek_obj == nullptr) { return; @@ -1328,7 +1142,7 @@ void qp_mgr::put_tls_dek(std::unique_ptr &&tls_dek_obj) } } -xlio_tis *qp_mgr::tls_context_setup_tx(const xlio_tls_info *info) +xlio_tis *hw_queue_tx::tls_context_setup_tx(const xlio_tls_info *info) { std::unique_ptr tis; if (m_tls_tis_cache.empty()) { @@ -1360,7 +1174,7 @@ xlio_tis *qp_mgr::tls_context_setup_tx(const xlio_tls_info *info) return tis.release(); } -void qp_mgr::tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) +void hw_queue_tx::tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) { uint32_t tisn = tis->get_tisn(); @@ -1371,29 +1185,9 @@ void qp_mgr::tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, boo m_b_fence_needed = true; } -xlio_tir *qp_mgr::tls_create_tir(bool cached) -{ - xlio_tir *tir = NULL; - - if (cached && !m_tls_tir_cache.empty()) { - tir = m_tls_tir_cache.back(); - m_tls_tir_cache.pop_back(); - } else if (!cached) { - dpcp::tir *new_tir = create_tir(true); - - if (new_tir != NULL) { - tir = new xlio_tir(new_tir, xlio_ti::ti_type::TLS_TIR); - } - if (unlikely(tir == NULL && new_tir != NULL)) { - delete new_tir; - } - } - return tir; -} - -int qp_mgr::tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, - uint32_t next_record_tcp_sn, xlio_comp_cb_t callback, - void *callback_arg) +int hw_queue_tx::tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, + uint32_t next_record_tcp_sn, xlio_comp_cb_t callback, + void *callback_arg) { uint32_t tirn; dpcp::tls_dek *_dek; @@ -1408,7 +1202,7 @@ int qp_mgr::tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, dek_attr.pd_id = adapter->get_pd(); status = adapter->create_tls_dek(dek_attr, _dek); if (unlikely(status != dpcp::DPCP_OK)) { - qp_logerr("Failed to create DEK, status: %d", status); + hwqtx_logerr("Failed to create DEK, status: %d", status); return -1; } tir->assign_dek(_dek); @@ -1423,13 +1217,13 @@ int qp_mgr::tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, return 0; } -void qp_mgr::tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn) +void hw_queue_tx::tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn) { tls_post_static_params_wqe(tir, info, tir->get_tirn(), tir->get_dek_id(), hw_resync_tcp_sn, false, false); } -void qp_mgr::tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) +void hw_queue_tx::tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) { /* Address must be aligned by 64. 
*/ assert((uintptr_t)buf == ((uintptr_t)buf >> 6U << 6U)); @@ -1437,9 +1231,9 @@ void qp_mgr::tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) tls_get_progress_params_wqe(tir, tir->get_tirn(), buf, lkey); } -inline void qp_mgr::tls_fill_static_params_wqe(struct mlx5_wqe_tls_static_params_seg *params, - const struct xlio_tls_info *info, uint32_t key_id, - uint32_t resync_tcp_sn) +inline void hw_queue_tx::tls_fill_static_params_wqe(struct mlx5_wqe_tls_static_params_seg *params, + const struct xlio_tls_info *info, + uint32_t key_id, uint32_t resync_tcp_sn) { unsigned char *initial_rn, *iv; uint8_t tls_version; @@ -1468,9 +1262,9 @@ inline void qp_mgr::tls_fill_static_params_wqe(struct mlx5_wqe_tls_static_params DEVX_SET(tls_static_params, ctx, dek_index, key_id); } -inline void qp_mgr::tls_post_static_params_wqe(xlio_ti *ti, const struct xlio_tls_info *info, - uint32_t tis_tir_number, uint32_t key_id, - uint32_t resync_tcp_sn, bool fence, bool is_tx) +inline void hw_queue_tx::tls_post_static_params_wqe(xlio_ti *ti, const struct xlio_tls_info *info, + uint32_t tis_tir_number, uint32_t key_id, + uint32_t resync_tcp_sn, bool fence, bool is_tx) { struct mlx5_set_tls_static_params_wqe *wqe = reinterpret_cast(m_sq_wqe_hot); @@ -1561,9 +1355,9 @@ inline void qp_mgr::tls_post_static_params_wqe(xlio_ti *ti, const struct xlio_tl update_next_wqe_hot(); } -inline void qp_mgr::tls_fill_progress_params_wqe(struct mlx5_wqe_tls_progress_params_seg *params, - uint32_t tis_tir_number, - uint32_t next_record_tcp_sn) +inline void hw_queue_tx::tls_fill_progress_params_wqe( + struct mlx5_wqe_tls_progress_params_seg *params, uint32_t tis_tir_number, + uint32_t next_record_tcp_sn) { uint8_t *ctx = params->ctx; @@ -1575,9 +1369,9 @@ inline void qp_mgr::tls_fill_progress_params_wqe(struct mlx5_wqe_tls_progress_pa DEVX_SET(tls_progress_params, ctx, auth_state, MLX5E_TLS_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD); } -inline void qp_mgr::tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_tir_number, - uint32_t next_record_tcp_sn, bool fence, - bool is_tx) +inline void hw_queue_tx::tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_tir_number, + uint32_t next_record_tcp_sn, bool fence, + bool is_tx) { uint16_t num_wqebbs = TLS_SET_PROGRESS_PARAMS_WQEBBS; @@ -1607,8 +1401,8 @@ inline void qp_mgr::tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_tir_n update_next_wqe_hot(); } -inline void qp_mgr::tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void *buf, - uint32_t lkey) +inline void hw_queue_tx::tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void *buf, + uint32_t lkey) { uint16_t num_wqebbs = TLS_GET_PROGRESS_WQEBBS; @@ -1639,13 +1433,13 @@ inline void qp_mgr::tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void update_next_wqe_hot(); } -void qp_mgr::tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, - bool first) +void hw_queue_tx::tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, + bool first) { post_dump_wqe(tis, addr, len, lkey, first); } -void qp_mgr::tls_release_tis(xlio_tis *tis) +void hw_queue_tx::tls_release_tis(xlio_tis *tis) { assert(tis != nullptr && tis->m_type == xlio_ti::ti_type::TLS_TIS); tis->m_released = true; @@ -1654,175 +1448,38 @@ void qp_mgr::tls_release_tis(xlio_tis *tis) } } -void qp_mgr::tls_release_tir(xlio_tir *tir) -{ - /* TODO We don't have to lock ring to destroy DEK object (a garbage collector?). 
*/ - - assert(tir != nullptr && tir->m_type == xlio_ti::ti_type::TLS_TIR); - tir->m_released = true; - tir->assign_callback(NULL, NULL); - if (tir->m_ref == 0) { - put_tls_tir_in_cache(tir); - } -} -#else /* DEFINED_UTLS */ -void qp_mgr::ti_released(xlio_ti *) {}; -void qp_mgr::destroy_tis_cache(void) {}; -#endif /* DEFINED_UTLS */ - -std::unique_ptr qp_mgr::create_tis(uint32_t flags) const -{ - dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); - bool is_tls = flags & dpcp::TIS_ATTR_TLS, is_nvme = flags & dpcp::TIS_ATTR_NVMEOTCP; - if (unlikely(adapter == nullptr || (is_tls && is_nvme))) { - return nullptr; - } - - dpcp::tis::attr tis_attr = { - .flags = flags, - .tls_en = is_tls, - .nvmeotcp = is_nvme, - .transport_domain = adapter->get_td(), - .pd = adapter->get_pd(), - }; - - dpcp::tis *dpcp_tis = nullptr; - if (unlikely(adapter->create_tis(tis_attr, dpcp_tis) != dpcp::DPCP_OK)) { - qp_logerr("Failed to create TIS with NVME enabled"); - return nullptr; - } - - auto tis_type = is_tls ? xlio_ti::ti_type::TLS_TIS : xlio_ti::ti_type::NVME_TIS; - return std::make_unique(std::unique_ptr(dpcp_tis), tis_type); -} - -static inline void nvme_fill_static_params_control(xlio_mlx5_wqe_ctrl_seg *cseg, - xlio_mlx5_wqe_umr_ctrl_seg *ucseg, - uint32_t producer_index, uint32_t qpn, - uint32_t tisn, uint8_t fence_flags) -{ - memset(cseg, 0, sizeof(*cseg)); - memset(ucseg, 0, sizeof(*ucseg)); - cseg->opmod_idx_opcode = - htobe32(((producer_index & 0xffff) << 8) | MLX5_OPCODE_UMR | - (MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_STATIC_PARAMS << 24)); - size_t num_wqe_ds = 12U; - cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | num_wqe_ds); - cseg->fm_ce_se = fence_flags; - cseg->tis_tir_num = htobe32(tisn << MLX5_WQE_CTRL_TIR_TIS_INDEX_SHIFT); - - ucseg->flags = MLX5_UMR_INLINE; - ucseg->bsf_octowords = htobe16(MLX5E_TRANSPORT_STATIC_PARAMS_OCTWORD_SIZE); -} - -static inline void nvme_fill_static_params_transport_params( - mlx5_wqe_transport_static_params_seg *params, uint32_t config) - +void hw_queue_tx::put_tls_tis_in_cache(xlio_tis *tis) { - memset(params, 0, sizeof(*params)); - void *ctx = params->ctx; - - DEVX_SET(transport_static_params, ctx, const_1, 1); - DEVX_SET(transport_static_params, ctx, const_2, 2); - DEVX_SET(transport_static_params, ctx, acc_type, MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_NVMETCP); - DEVX_SET(transport_static_params, ctx, nvme_resync_tcp_sn, 0); - DEVX_SET(transport_static_params, ctx, pda, static_cast(config & XLIO_NVME_PDA_MASK)); - DEVX_SET(transport_static_params, ctx, ddgst_en, bool(config & XLIO_NVME_DDGST_ENABLE)); - DEVX_SET(transport_static_params, ctx, ddgst_offload_en, - bool(config & XLIO_NVME_DDGST_OFFLOAD)); - DEVX_SET(transport_static_params, ctx, hddgst_en, bool(config & XLIO_NVME_HDGST_ENABLE)); - DEVX_SET(transport_static_params, ctx, hdgst_offload_en, - bool(config & XLIO_NVME_HDGST_OFFLOAD)); - DEVX_SET(transport_static_params, ctx, ti, MLX5_TRANSPORT_STATIC_PARAMS_TI_INITIATOR); - DEVX_SET(transport_static_params, ctx, const1, 1); - DEVX_SET(transport_static_params, ctx, zero_copy_en, 0); -} - -static inline void nvme_fill_progress_wqe(mlx5e_set_nvmeotcp_progress_params_wqe *wqe, - uint32_t producer_index, uint32_t qpn, uint32_t tisn, - uint32_t tcp_seqno, uint8_t fence_flags) -{ - memset(wqe, 0, sizeof(*wqe)); - auto cseg = &wqe->ctrl.ctrl; - - size_t progres_params_ds = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS); - cseg->opmod_idx_opcode = - htobe32(((producer_index & 0xffff) << 8) | XLIO_MLX5_OPCODE_SET_PSV | - 
(MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_PROGRESS_PARAMS << 24)); - cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | progres_params_ds); - cseg->fm_ce_se = fence_flags; - - mlx5_seg_nvmeotcp_progress_params *params = &wqe->params; - params->tir_num = htobe32(tisn); - void *ctx = params->ctx; - - DEVX_SET(nvmeotcp_progress_params, ctx, next_pdu_tcp_sn, tcp_seqno); - DEVX_SET(nvmeotcp_progress_params, ctx, pdu_tracker_state, - MLX5E_NVMEOTCP_PROGRESS_PARAMS_PDU_TRACKER_STATE_START); - /* if (is_tx) offloading state == 0*/ - DEVX_SET(nvmeotcp_progress_params, ctx, offloading_state, 0); -} - -void qp_mgr::nvme_set_static_context(xlio_tis *tis, uint32_t config) -{ - auto *cseg = wqebb_get(0U); - auto *ucseg = wqebb_get(0U, sizeof(*cseg)); - - nvme_fill_static_params_control(cseg, ucseg, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), - 0); - memset(wqebb_get(1U), 0, sizeof(mlx5_mkey_seg)); - - auto *params = wqebb_get(2U); - nvme_fill_static_params_transport_params(params, config); - store_current_wqe_prop(nullptr, SQ_CREDITS_UMR, tis); - ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS); - update_next_wqe_hot(); -} + std::unique_ptr dek = tis->release_dek(); + assert(dynamic_cast(dek.get()) != nullptr); -void qp_mgr::nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) -{ - auto *wqe = reinterpret_cast(m_sq_wqe_hot); - nvme_fill_progress_wqe(wqe, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), tcp_seqno, - MLX5_FENCE_MODE_INITIATOR_SMALL); - store_current_wqe_prop(nullptr, SQ_CREDITS_SET_PSV, tis); - ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS); - update_next_wqe_hot(); + put_tls_dek(std::unique_ptr(dynamic_cast(dek.release()))); + m_tls_tis_cache.push_back(tis); } -#if defined(DEFINED_UTLS) -void qp_mgr::ti_released(xlio_ti *ti) +void hw_queue_tx::ti_released(xlio_ti *ti) { assert(ti->m_released); assert(ti->m_ref == 0); if (ti->m_type == xlio_ti::ti_type::TLS_TIS) { put_tls_tis_in_cache(static_cast(ti)); - } else if (ti->m_type == xlio_ti::ti_type::TLS_TIR) { - put_tls_tir_in_cache(static_cast(ti)); } } -void qp_mgr::put_tls_tis_in_cache(xlio_tis *tis) -{ - std::unique_ptr dek = tis->release_dek(); - assert(dynamic_cast(dek.get()) != nullptr); - - put_tls_dek(std::unique_ptr(dynamic_cast(dek.release()))); - m_tls_tis_cache.push_back(tis); -} - -void qp_mgr::put_tls_tir_in_cache(xlio_tir *tir) +void hw_queue_tx::destroy_tis_cache(void) { - // Because the absense of TIR flush command, reusing a TIR - // may result in undefined behaviour. - // Until a flush command is available the TIR cache is disabled. - // Re-enabling TIR cache should also add destroy_tir_cache on ring cleanup. 
- // m_tls_tir_cache.push_back(tir); - - delete tir; + while (!m_tls_tis_cache.empty()) { + xlio_tis *tis = m_tls_tis_cache.back(); + m_tls_tis_cache.pop_back(); + delete tis; + } } +#else /* DEFINED_UTLS */ +void hw_queue_tx::ti_released(xlio_ti *) {}; +void hw_queue_tx::destroy_tis_cache(void) {}; #endif /* defined(DEFINED_UTLS) */ -void qp_mgr::post_nop_fence(void) +void hw_queue_tx::post_nop_fence(void) { struct mlx5_wqe *wqe = reinterpret_cast(m_sq_wqe_hot); struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; @@ -1840,7 +1497,8 @@ void qp_mgr::post_nop_fence(void) update_next_wqe_hot(); } -void qp_mgr::post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool is_first) +void hw_queue_tx::post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, + bool is_first) { struct mlx5_dump_wqe *wqe = reinterpret_cast(m_sq_wqe_hot); struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; @@ -1871,18 +1529,18 @@ void qp_mgr::post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lke // Single post send with SIGNAL of a dummy packet // NOTE: Since the QP is in ERROR state no packets will be sent on the wire! // So we can post_send anything we want :) -void qp_mgr::trigger_completion_for_all_sent_packets() +void hw_queue_tx::trigger_completion_for_all_sent_packets() { - qp_logfunc("unsignaled count=%d", m_n_unsignaled_count); + hwqtx_logfunc("unsignaled count=%d", m_n_unsignaled_count); if (!is_signal_requested_for_last_wqe()) { // Post a dummy WQE and request a signal to complete all the unsignaled WQEs in SQ - qp_logdbg("Need to send closing tx wr..."); + hwqtx_logdbg("Need to send closing tx wr..."); mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(0, true, PBUF_RAM); // Align Tx buffer accounting since we will be bypassing the normal send calls m_p_ring->m_missing_buf_ref_count--; if (!p_mem_buf_desc) { - qp_logerr("no buffer in pool"); + hwqtx_logerr("no buffer in pool"); return; } @@ -1917,7 +1575,7 @@ void qp_mgr::trigger_completion_for_all_sent_packets() // TODO Wait for available space in SQ to post the WQE. This method mustn't fail, // because we may want to wait until all the WQEs are completed and we need to post // something and request signal. 
- qp_logdbg("No space in SQ to trigger completions with a post operation"); + hwqtx_logdbg("No space in SQ to trigger completions with a post operation"); return; } @@ -1927,7 +1585,7 @@ void qp_mgr::trigger_completion_for_all_sent_packets() } } -void qp_mgr::reset_inflight_zc_buffers_ctx(void *ctx) +void hw_queue_tx::reset_inflight_zc_buffers_ctx(void *ctx) { sq_wqe_prop *p = m_sq_wqe_prop_last; sq_wqe_prop *prev; @@ -1947,193 +1605,33 @@ void qp_mgr::reset_inflight_zc_buffers_ctx(void *ctx) } } -dpcp::tir *qp_mgr::create_tir(bool is_tls /*=false*/) +uint32_t hw_queue_tx::is_ratelimit_change(struct xlio_rate_limit_t &rate_limit) { - dpcp::tir *tir_obj = nullptr; - dpcp::status status = dpcp::DPCP_OK; - dpcp::tir::attr tir_attr; - - memset(&tir_attr, 0, sizeof(tir_attr)); - tir_attr.flags = dpcp::TIR_ATTR_INLINE_RQN | dpcp::TIR_ATTR_TRANSPORT_DOMAIN; - tir_attr.inline_rqn = m_mlx5_qp.rqn; - tir_attr.transport_domain = m_p_ib_ctx_handler->get_dpcp_adapter()->get_td(); - - if (m_p_ring->m_lro.cap && m_p_ring->m_lro.max_payload_sz) { - tir_attr.flags |= dpcp::TIR_ATTR_LRO; - tir_attr.lro.timeout_period_usecs = XLIO_MLX5_PARAMS_LRO_TIMEOUT; - tir_attr.lro.enable_mask = 3; // Bitmask for IPv4 and IPv6 support - tir_attr.lro.max_msg_sz = m_p_ring->m_lro.max_payload_sz >> 8; - } - - if (is_tls) { - tir_attr.flags |= dpcp::TIR_ATTR_TLS; - tir_attr.tls_en = 1; - } - - status = m_p_ib_ctx_handler->get_dpcp_adapter()->create_tir(tir_attr, tir_obj); - - if (dpcp::DPCP_OK != status) { - qp_logerr("Failed creating dpcp tir with flags=0x%x status=%d", tir_attr.flags, status); - return nullptr; - } - - qp_logdbg("TIR: %p created", tir_obj); - - return tir_obj; -} - -void qp_mgr::modify_rq_to_ready_state() -{ - dpcp::status rc = _rq->modify_state(dpcp::RQ_RDY); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to modify rq state to RDY, rc: %d, rqn: %" PRIu32, static_cast(rc), - m_mlx5_qp.rqn); - } -} - -bool qp_mgr::configure_rq_dpcp() -{ - qp_logdbg("Creating RQ of transport type '%s' on ibv device '%s' [%p] on port %d", - priv_xlio_transport_type_str(m_p_ring->get_transport_type()), - m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), m_port_num); - - m_mlx5_qp.cap.max_recv_wr = m_rx_num_wr; - - qp_logdbg("Requested RQ parameters: wre: rx = %d sge: rx = %d", m_mlx5_qp.cap.max_recv_wr, - m_mlx5_qp.cap.max_recv_sge); - - xlio_ib_mlx5_cq_t mlx5_cq; - memset(&mlx5_cq, 0, sizeof(mlx5_cq)); - xlio_ib_mlx5_get_cq(m_p_cq_mgr_rx->get_ibv_cq_hndl(), &mlx5_cq); - - qp_logdbg("Configuring dpcp RQ, cq-rx: %p, cqn-rx: %u", m_p_cq_mgr_rx, - static_cast(mlx5_cq.cq_num)); - - if (safe_mce_sys().enable_striding_rq) { - m_mlx5_qp.cap.max_recv_sge = 2U; // Striding-RQ needs a reserved segment. - _strq_wqe_reserved_seg = 1U; - - delete[] m_ibv_rx_sg_array; - m_ibv_rx_sg_array = - new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv * m_mlx5_qp.cap.max_recv_sge]; - for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) { - m_ibv_rx_wr_array[wr_idx].sg_list = - &m_ibv_rx_sg_array[wr_idx * m_mlx5_qp.cap.max_recv_sge]; - m_ibv_rx_wr_array[wr_idx].num_sge = m_mlx5_qp.cap.max_recv_sge; - memset(m_ibv_rx_wr_array[wr_idx].sg_list, 0, sizeof(ibv_sge)); - m_ibv_rx_wr_array[wr_idx].sg_list[0].length = - 1U; // To bypass a check inside xlio_ib_mlx5_post_recv. 
- } - } - - // Create the QP - if (!prepare_rq(mlx5_cq.cq_num)) { - return false; - } - - return true; -} - -bool qp_mgr::prepare_rq(uint32_t cqn) -{ - qp_logdbg(""); - - dpcp::adapter *dpcp_adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); - if (!dpcp_adapter) { - qp_logerr("Failed to get dpcp::adapter for prepare_rq"); - return false; - } - - // user_index Unused. - dpcp::rq_attr rqattrs; - memset(&rqattrs, 0, sizeof(rqattrs)); - rqattrs.cqn = cqn; - rqattrs.wqe_num = m_mlx5_qp.cap.max_recv_wr; - rqattrs.wqe_sz = m_mlx5_qp.cap.max_recv_sge; - - if (safe_mce_sys().hw_ts_conversion_mode == TS_CONVERSION_MODE_RTC) { - qp_logdbg("Enabled RTC timestamp format for RQ"); - rqattrs.ts_format = dpcp::rq_ts_format::RQ_TS_REAL_TIME; - } - - std::unique_ptr new_rq; - dpcp::status rc = dpcp::DPCP_OK; - - if (safe_mce_sys().enable_striding_rq) { - rqattrs.buf_stride_sz = safe_mce_sys().strq_stride_size_bytes; - rqattrs.buf_stride_num = safe_mce_sys().strq_stride_num_per_rwqe; - - // Striding-RQ WQE format is as of Shared-RQ (PRM, page 381, wq_type). - // In this case the WQE minimum size is 2 * 16, and the first segment is reserved. - rqattrs.wqe_sz = m_mlx5_qp.cap.max_recv_sge * 16U; + uint32_t rl_changes = 0; - dpcp::striding_rq *new_rq_ptr = nullptr; - rc = dpcp_adapter->create_striding_rq(rqattrs, new_rq_ptr); - new_rq.reset(new_rq_ptr); - } else { - dpcp::regular_rq *new_rq_ptr = nullptr; - rc = dpcp_adapter->create_regular_rq(rqattrs, new_rq_ptr); - new_rq.reset(new_rq_ptr); + if (m_rate_limit.rate != rate_limit.rate) { + rl_changes |= RL_RATE; } - - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to create dpcp rq, rc: %d, cqn: %" PRIu32, static_cast(rc), cqn); - return false; + if (m_rate_limit.max_burst_sz != rate_limit.max_burst_sz) { + rl_changes |= RL_BURST_SIZE; } - - if (!store_rq_mlx5_params(*new_rq)) { - qp_logerr( - "Failed to retrieve initial DPCP RQ parameters, rc: %d, basic_rq: %p, cqn: %" PRIu32, - static_cast(rc), new_rq.get(), cqn); - return false; + if (m_rate_limit.typical_pkt_sz != rate_limit.typical_pkt_sz) { + rl_changes |= RL_PKT_SIZE; } - _rq = std::move(new_rq); - - // At this stage there is no TIR associated with the RQ, So it mimics QP INIT state. - // At RDY state without a TIR, Work Requests can be submitted to the RQ. 
- modify_rq_to_ready_state(); - - qp_logdbg("Succeeded to create dpcp rq, rqn: %" PRIu32 ", cqn: %" PRIu32, m_mlx5_qp.rqn, cqn); - - return true; + return rl_changes; } -bool qp_mgr::store_rq_mlx5_params(dpcp::basic_rq &new_rq) +int hw_queue_tx::modify_qp_ratelimit(struct xlio_rate_limit_t &rate_limit, uint32_t rl_changes) { - uint32_t *dbrec_tmp = nullptr; - dpcp::status rc = new_rq.get_dbrec(dbrec_tmp); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to retrieve dbrec of dpcp rq, rc: %d, basic_rq: %p", static_cast(rc), - &new_rq); - return false; - } - m_mlx5_qp.rq.dbrec = dbrec_tmp; - - rc = new_rq.get_wq_buf(m_mlx5_qp.rq.buf); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to retrieve wq-buf of dpcp rq, rc: %d, basic_rq: %p", - static_cast(rc), &new_rq); - return false; - } - - rc = new_rq.get_id(m_mlx5_qp.rqn); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to retrieve rqn of dpcp rq, rc: %d, basic_rq: %p", static_cast(rc), - &new_rq); - return false; - } + int ret; - new_rq.get_wqe_num(m_mlx5_qp.rq.wqe_cnt); - new_rq.get_wq_stride_sz(m_mlx5_qp.rq.stride); - if (safe_mce_sys().enable_striding_rq) { - m_mlx5_qp.rq.stride /= 16U; + ret = priv_ibv_modify_qp_ratelimit(m_mlx5_qp.qp, rate_limit, rl_changes); + if (ret) { + hwqtx_logdbg("failed to modify qp ratelimit ret %d (errno=%d %m)", ret, errno); + return -1; } - m_mlx5_qp.rq.wqe_shift = ilog_2(m_mlx5_qp.rq.stride); - m_mlx5_qp.rq.head = 0; - m_mlx5_qp.rq.tail = 0; - m_mlx5_qp.tirn = 0U; - - return true; + m_rate_limit = rate_limit; + return 0; } diff --git a/src/core/dev/qp_mgr.h b/src/core/dev/hw_queue_tx.h similarity index 62% rename from src/core/dev/qp_mgr.h rename to src/core/dev/hw_queue_tx.h index 14b880369..f2ec6675d 100644 --- a/src/core/dev/qp_mgr.h +++ b/src/core/dev/hw_queue_tx.h @@ -30,48 +30,26 @@ * SOFTWARE. */ -#ifndef QP_MGR_H -#define QP_MGR_H +#ifndef HW_QUEUE_TX_H +#define HW_QUEUE_TX_H -#include -#include -#include - -#include "ib/base/verbs_extra.h" -#include "proto/xlio_lwip.h" -#include "vlogger/vlogger.h" -#include "utils/atomic.h" -#include "util/vtypes.h" -#include "util/sys_vars.h" -#include "util/libxlio.h" -#include "util/if.h" -#include "lwip/opt.h" -#include "proto/mem_buf_desc.h" -#include "infra/sender.h" -#include "dev/ib_ctx_handler.h" -#include "dev/cq_mgr_rx.h" -#include "dev/cq_mgr_tx.h" -#include "dev/rfs_rule.h" -#include "util/sg_array.h" -#include "dev/dm_mgr.h" #include #include - -/* Forward declarations */ -struct xlio_tls_info; -class xlio_tis; -class xlio_tir; -class buffer_pool; -class cq_mgr_rx; -struct slave_data; -class ring; -class ring_simple; -class ring_eth_cb; +#include "dev/xlio_ti.h" +#include "dev/cq_mgr_tx.h" +#include "dev/cq_mgr_rx.h" +#include "dev/dm_mgr.h" +#include "proto/mem_buf_desc.h" +#include "proto/xlio_lwip.h" +#include "util/sg_array.h" #ifndef MAX_SUPPORTED_IB_INLINE_SIZE #define MAX_SUPPORTED_IB_INLINE_SIZE 884 #endif +struct slave_data_t; +struct xlio_tls_info; + enum { SQ_CREDITS_UMR = 3U, SQ_CREDITS_SET_PSV = 1U, @@ -84,143 +62,6 @@ enum { SQ_CREDITS_TLS_RX_GET_PSV = SQ_CREDITS_GET_PSV, }; -struct qp_mgr_desc { - ring_simple *ring; - const struct slave_data *slave; - struct ibv_comp_channel *rx_comp_event_channel; -}; - -/* Work request completion callback */ -/* TODO Add argument for completion status to handle errors. 
*/ -typedef void (*xlio_comp_cb_t)(void *); - -class xlio_ti { -public: - enum ti_type : uint8_t { UNKNOWN, TLS_TIS, TLS_TIR, NVME_TIS, NVME_TIR }; - - xlio_ti(ti_type type = UNKNOWN) - : m_type(type) - , m_released(false) - , m_ref(0) - , m_callback(nullptr) - , m_callback_arg(nullptr) - { - } - virtual ~xlio_ti() {}; - - inline void assign_callback(xlio_comp_cb_t callback, void *callback_arg) - { - m_callback = callback; - m_callback_arg = callback_arg; - } - - /* - * Reference counting. m_ref must be protected by ring tx lock. Device - * layer (QP, CQ) is responsible for the reference counting. - */ - - inline void get(void) - { - ++m_ref; - assert(m_ref > 0); - } - - inline uint32_t put(void) - { - assert(m_ref > 0); - return --m_ref; - } - - ti_type m_type; - bool m_released; - uint32_t m_ref; - - xlio_comp_cb_t m_callback; - void *m_callback_arg; -}; - -class xlio_tis : public xlio_ti { -public: - xlio_tis(std::unique_ptr _tis, xlio_ti::ti_type type) - : xlio_ti(type) - , m_dek() - , m_p_tis(std::move(_tis)) - , m_tisn(0U) - , m_dek_id(0U) - { - dpcp::status ret = m_p_tis->get_tisn(m_tisn); - assert(ret == dpcp::DPCP_OK); - (void)ret; - } - - ~xlio_tis() = default; - - std::unique_ptr release_dek() - { - assert(m_ref == 0); - m_released = false; - return std::move(m_dek); - } - - uint32_t get_tisn() noexcept { return m_tisn; } - - void assign_dek(std::unique_ptr &&dek_ptr) - { - m_dek = std::move(dek_ptr); - m_dek_id = m_dek->get_key_id(); - } - - uint32_t get_dek_id() noexcept { return m_dek_id; } - -private: - std::unique_ptr m_dek; - std::unique_ptr m_p_tis; - uint32_t m_tisn; - uint32_t m_dek_id; -}; - -class xlio_tir : public xlio_ti { -public: - xlio_tir(dpcp::tir *_tir, xlio_ti::ti_type type) - : xlio_ti(type) - { - m_p_tir.reset(_tir); - m_dek = NULL; - m_tirn = 0; - m_dek_id = 0; - - /* Cache the tir number. Mustn't fail for a valid TIR object. */ - m_tirn = m_p_tir->get_tirn(); - assert(m_tirn != 0); - } - - ~xlio_tir() = default; - - std::unique_ptr release_dek() - { - assert(m_ref == 0); - m_released = false; - return std::move(m_dek); - } - - uint32_t get_tirn() { return m_tirn; } - - void assign_dek(void *dek_ptr) - { - m_dek.reset(reinterpret_cast(dek_ptr)); - m_dek_id = m_dek->get_key_id(); - } - - uint32_t get_dek_id() { return m_dek_id; } - - std::unique_ptr m_p_tir; - -private: - std::unique_ptr m_dek; - uint32_t m_tirn; - uint32_t m_dek_id; -}; - /* WQE properties description. */ struct sq_wqe_prop { /* A buffer held by the WQE. This is NULL for control WQEs. */ @@ -232,67 +73,40 @@ struct sq_wqe_prop { struct sq_wqe_prop *next; }; -/** - * @class qp_mgr - * - * Object to manages the QP operation - * This object is used for Rx & Tx at the same time - * Once created it requests from the system a CQ to work with (for Rx & Tx separately) - * - * The qp_mgr object will manage the memory data buffers to be used for Rx & Tx. - * A descriptor (mem_buf_desc_t) is used to point to each memory data buffers which is also menaged - * by the qm_mgr. - * - * NOTE: - * The idea here is to use the rdma_cma_id object to manage the QP - * all we need is to rdma_resolve_addr() so we have the correct pkey in the cma_id object - * the rest is a simple transition of the QP states that is hidden inside the rdma_cm - * - */ -class qp_mgr { - friend class cq_mgr_rx; - friend class cq_mgr_rx_regrq; - friend class cq_mgr_rx_strq; +// @class hw_queue_tx +// Object to manages the SQ operations. This object is used for Tx. +// Once created it requests from the system a CQ to work with. 
+class hw_queue_tx : public xlio_ti_owner { friend class cq_mgr_tx; public: - qp_mgr(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, uint16_t vlan); - ~qp_mgr(); + hw_queue_tx(ring_simple *ring, const slave_data_t *slave, const uint32_t tx_num_wr); + virtual ~hw_queue_tx(); + + virtual void ti_released(xlio_ti *ti) override; void up(); void down(); - // Post for receive single mem_buf_desc - void post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc); - - // Post for receive a list of mem_buf_desc - void post_recv_buffers(descq_t *p_buffers, size_t count); int send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, unsigned credits); - inline uint32_t get_max_inline_data() const { return m_mlx5_qp.cap.max_inline_data; } - inline uint32_t get_max_send_sge() const { return m_mlx5_qp.cap.max_send_sge; } - int get_port_num() const { return m_port_num; } - uint16_t get_partiton() const { return m_vlan; }; struct ibv_qp *get_ibv_qp() const { return m_mlx5_qp.qp; }; - cq_mgr_tx *get_tx_cq_mgr() const { return m_p_cq_mgr_tx; } - cq_mgr_rx *get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } - uint32_t get_rx_max_wr_num() { return m_rx_num_wr; } + // This function can be replaced with a parameter during ring creation. // chain of calls may serve as cache warm for dummy send feature. - inline bool get_hw_dummy_send_support() { return m_hw_dummy_send_support; } - - void modify_qp_to_ready_state(); - void modify_qp_to_error_state(); + bool get_hw_dummy_send_support() { return m_hw_dummy_send_support; } + cq_mgr_tx *get_tx_cq_mgr() const { return m_p_cq_mgr_tx; } + uint32_t get_max_inline_data() const { return m_mlx5_qp.cap.max_inline_data; } + uint32_t get_max_send_sge() const { return m_mlx5_qp.cap.max_send_sge; } - void release_rx_buffers(); + void modify_queue_to_ready_state(); + void modify_queue_to_error_state(); void release_tx_buffers(); uint32_t is_ratelimit_change(struct xlio_rate_limit_t &rate_limit); int modify_qp_ratelimit(struct xlio_rate_limit_t &rate_limit, uint32_t rl_changes); void dm_release_data(mem_buf_desc_t *buff) { m_dm_mgr.release_data(buff); } - rfs_rule *create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext); - #ifdef DEFINED_UTLS xlio_tis *tls_context_setup_tx(const xlio_tls_info *info) override; xlio_tir *tls_create_tir(bool cached) override; @@ -308,7 +122,7 @@ class qp_mgr { #define DPCP_TIS_FLAGS (dpcp::TIS_ATTR_TRANSPORT_DOMAIN | dpcp::TIS_ATTR_PD) #define DPCP_TIS_NVME_FLAG (dpcp::TIS_ATTR_NVMEOTCP) - std::unique_ptr create_tis(uint32_t flags) const; + std::unique_ptr create_tis(uint32_t flags); void nvme_set_static_context(xlio_tis *tis, uint32_t config); void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno); @@ -348,7 +162,7 @@ class qp_mgr { unsigned credits_calculate(xlio_ibv_send_wr *p_send_wqe) { - /* Credit is a logical value which is opaque for users. Only qp_mgr can interpret the + /* Credit is a logical value which is opaque for users. Only hw_queue_tx can interpret the * value and currently, one credit equals to one WQEBB in the SQ. 
* * Current method does best effort to predict how many WQEBBs will be used to send @@ -387,57 +201,26 @@ class qp_mgr { } } -protected: - xlio_ib_mlx5_qp_t m_mlx5_qp; - uint64_t *m_rq_wqe_idx_to_wrid = nullptr; - - ring_simple *m_p_ring; - uint8_t m_port_num; - ib_ctx_handler *m_p_ib_ctx_handler; - - uint32_t m_max_qp_wr = 0U; - - cq_mgr_rx *m_p_cq_mgr_rx; - cq_mgr_tx *m_p_cq_mgr_tx; - - uint32_t m_rx_num_wr; - uint32_t m_tx_num_wr; - - bool m_hw_dummy_send_support = false; - - uint32_t m_n_sysvar_rx_num_wr_to_post_recv; - const uint32_t m_n_sysvar_tx_num_wr_to_signal; - const uint32_t m_n_sysvar_rx_prefetch_bytes_before_poll; - - // recv_wr - ibv_sge *m_ibv_rx_sg_array; - ibv_recv_wr *m_ibv_rx_wr_array; - uint32_t m_curr_rx_wr = 0U; - uintptr_t m_last_posted_rx_wr_id = 0U; // Remember so in case we flush RQ we know to wait until - // this WR_ID is received - - // send wr - uint32_t m_n_unsignaled_count = 0U; - - mem_buf_desc_t *m_p_prev_rx_desc_pushed = nullptr; - - uint16_t m_vlan; - struct xlio_rate_limit_t m_rate_limit; +private: + cq_mgr_tx *init_tx_cq_mgr(); - int configure(struct qp_mgr_desc *desc); - int prepare_ibv_qp(xlio_ibv_qp_init_attr &qp_init_attr); - void init_qp(); + int configure(const slave_data_t *slave); + int prepare_queue(xlio_ibv_qp_init_attr &qp_init_attr); + void init_queue(); void init_device_memory(); - bool init_rx_cq_mgr_prepare(); - void post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc); + void trigger_completion_for_all_sent_packets(); + void update_next_wqe_hot(); + void destroy_tis_cache(); + void put_tls_tis_in_cache(xlio_tis *tis); + + int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, + xlio_tis *tis, unsigned credits); void set_unsignaled_count(void) { m_n_unsignaled_count = m_n_sysvar_tx_num_wr_to_signal - 1; } - void dec_unsignaled_count(void) + bool is_completion_need() const { - if (m_n_unsignaled_count > 0) { - --m_n_unsignaled_count; - } + return !m_n_unsignaled_count || (m_dm_enabled && m_dm_mgr.is_completion_need()); } bool is_signal_requested_for_last_wqe() @@ -445,35 +228,23 @@ class qp_mgr { return m_n_unsignaled_count == m_n_sysvar_tx_num_wr_to_signal - 1; } - cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel); - cq_mgr_tx *init_tx_cq_mgr(); - - int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, - xlio_tis *tis, unsigned credits); - -private: - void trigger_completion_for_all_sent_packets(); - void update_next_wqe_hot(); - void destroy_tis_cache(); - void ti_released(xlio_ti *ti); - void put_tls_tir_in_cache(xlio_tir *tir); - void put_tls_tis_in_cache(xlio_tis *tis); - void modify_rq_to_ready_state(); - bool prepare_rq(uint32_t cqn); - bool configure_rq_dpcp(); - bool store_rq_mlx5_params(dpcp::basic_rq &new_rq); - bool is_rq_empty() const { return (m_mlx5_qp.rq.head == m_mlx5_qp.rq.tail); } - bool is_completion_need() const + void dec_unsignaled_count(void) { - return !m_n_unsignaled_count || (m_dm_enabled && m_dm_mgr.is_completion_need()); + if (m_n_unsignaled_count > 0) { + --m_n_unsignaled_count; + } } - dpcp::tir *create_tir(bool is_tls = false); - - dpcp::tir *xlio_tir_to_dpcp_tir(xlio_tir *tir) { return tir->m_p_tir.get(); } + bool is_sq_wqe_prop_valid(sq_wqe_prop *p, sq_wqe_prop *prev) + { + unsigned p_i = p - m_sq_wqe_idx_to_prop; + unsigned prev_i = prev - m_sq_wqe_idx_to_prop; + return (p_i != m_sq_wqe_prop_last_signalled) && + ((m_tx_num_wr + p_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr < + (m_tx_num_wr + 
prev_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr); + } #if defined(DEFINED_UTLS) - inline void tls_fill_static_params_wqe(struct mlx5_wqe_tls_static_params_seg *params, const struct xlio_tls_info *info, uint32_t key_id, uint32_t resync_tcp_sn); @@ -496,47 +267,40 @@ class qp_mgr { inline void ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top = 0, bool skip_comp = false); - bool is_sq_wqe_prop_valid(sq_wqe_prop *p, sq_wqe_prop *prev) - { - unsigned p_i = p - m_sq_wqe_idx_to_prop; - unsigned prev_i = prev - m_sq_wqe_idx_to_prop; - return (p_i != m_sq_wqe_prop_last_signalled) && - ((m_tx_num_wr + p_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr < - (m_tx_num_wr + prev_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr); - } - + struct xlio_rate_limit_t m_rate_limit; + xlio_ib_mlx5_qp_t m_mlx5_qp; + ring_simple *m_p_ring; + cq_mgr_tx *m_p_cq_mgr_tx; + cq_mgr_rx *m_p_cq_mgr_rx_unused; + ib_ctx_handler *m_p_ib_ctx_handler; sq_wqe_prop *m_sq_wqe_idx_to_prop = nullptr; sq_wqe_prop *m_sq_wqe_prop_last = nullptr; - unsigned m_sq_wqe_prop_last_signalled = 0U; - unsigned m_sq_free_credits = 0U; - uint64_t m_rq_wqe_counter = 0U; struct mlx5_eth_wqe (*m_sq_wqes)[] = nullptr; struct mlx5_eth_wqe *m_sq_wqe_hot = nullptr; uint8_t *m_sq_wqes_end = nullptr; enum { MLX5_DB_METHOD_BF, MLX5_DB_METHOD_DB } m_db_method; + const uint32_t m_n_sysvar_tx_num_wr_to_signal; + uint32_t m_tx_num_wr; + unsigned m_sq_wqe_prop_last_signalled = 0U; + unsigned m_sq_free_credits = 0U; + uint32_t m_n_unsignaled_count = 0U; int m_sq_wqe_hot_index = 0; uint16_t m_sq_wqe_counter = 0U; + uint8_t m_port_num; bool m_b_fence_needed = false; bool m_dm_enabled = false; + bool m_hw_dummy_send_support = false; dm_mgr m_dm_mgr; - /* - * TIS cache. Protected by ring tx lock. - * TODO Move to ring. - */ + // TIS cache. Protected by ring tx lock. TODO Move to ring. 
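+ // (Presumed usage, from the declarations above: put_tls_tis_in_cache() parks released
+ // TIS objects here for reuse and destroy_tis_cache() frees them on teardown.)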
std::vector m_tls_tis_cache; - std::vector m_tls_tir_cache; #if defined(DEFINED_UTLS) std::list> m_tls_dek_get_cache; std::list> m_tls_dek_put_cache; #endif - - std::unique_ptr _tir = {nullptr}; - std::unique_ptr _rq = {nullptr}; - uint32_t _strq_wqe_reserved_seg = 0U; }; -#endif +#endif // HW_QUEUE_TX_H diff --git a/src/core/dev/net_device_val.h b/src/core/dev/net_device_val.h index 34d9aae84..8e3ec7d2f 100644 --- a/src/core/dev/net_device_val.h +++ b/src/core/dev/net_device_val.h @@ -129,14 +129,14 @@ typedef std::unordered_map slave_data_vector_t; diff --git a/src/core/dev/rfs.cpp b/src/core/dev/rfs.cpp index c5088a475..b8b9efb2e 100644 --- a/src/core/dev/rfs.cpp +++ b/src/core/dev/rfs.cpp @@ -32,7 +32,6 @@ #include "utils/bullseye.h" #include "dev/rfs.h" -#include "dev/qp_mgr.h" #include "dev/ring_simple.h" #include "sock/sock-redirect.h" #include "sock/sock-app.h" @@ -355,7 +354,7 @@ rfs_rule *create_rule_T(xlio_tir *tir, const flow_tuple &flow_spec, attach_flow_ } // The highest priority to override TCP rule flow_attr.attr.priority = 0; - return iter->p_qp_mgr->create_rfs_rule(flow_attr.attr, tir); + return iter->hqrx_ptr->create_rfs_rule(flow_attr.attr, tir); } rfs_rule *rfs::create_rule(xlio_tir *tir, const flow_tuple &flow_spec) @@ -379,7 +378,7 @@ bool rfs::create_flow() { for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { attach_flow_data_t *iter = m_attach_flow_data_vector[i]; - iter->rfs_flow = iter->p_qp_mgr->create_rfs_rule(iter->ibv_flow_attr, NULL); + iter->rfs_flow = iter->hqrx_ptr->create_rfs_rule(iter->ibv_flow_attr, NULL); if (!iter->rfs_flow) { rfs_logerr("Create RFS flow failed, Tag: %" PRIu32 ", Flow: %s, Priority: %" PRIu16 ", errno: %d - %m", diff --git a/src/core/dev/rfs.h b/src/core/dev/rfs.h index a5cf28efe..39123e430 100644 --- a/src/core/dev/rfs.h +++ b/src/core/dev/rfs.h @@ -43,7 +43,7 @@ #define RFS_SINKS_LIST_DEFAULT_LEN 32 -class qp_mgr; +class hw_queue_rx; class pkt_rcvr_sink; /* @@ -66,20 +66,20 @@ typedef struct ibv_flow_attr_eth { template struct attach_flow_data_eth_ip_tcp_udp_t { rfs_rule *rfs_flow; - qp_mgr *p_qp_mgr; + hw_queue_rx *hqrx_ptr; struct ibv_flow_attr_eth_ip_tcp_udp : public ibv_flow_attr_eth { T ip; xlio_ibv_flow_spec_tcp_udp tcp_udp; xlio_ibv_flow_spec_action_tag flow_tag; // must be the last as struct can be used without it - ibv_flow_attr_eth_ip_tcp_udp(uint8_t port) + ibv_flow_attr_eth_ip_tcp_udp() { memset(this, 0, sizeof(*this)); attr.size = sizeof(T) - sizeof(flow_tag); attr.num_of_specs = 3; attr.type = XLIO_IBV_FLOW_ATTR_NORMAL; attr.priority = 2; // almost highest priority, 1 is used for 5-tuple later - attr.port = port; + attr.port = 0; } inline void add_flow_tag_spec(void) { @@ -87,10 +87,10 @@ template struct attach_flow_data_eth_ip_tcp_udp_t { attr.size += sizeof(flow_tag); } } ibv_flow_attr; - attach_flow_data_eth_ip_tcp_udp_t(qp_mgr *qp_mgr) + attach_flow_data_eth_ip_tcp_udp_t(hw_queue_rx *hqrx) : rfs_flow(NULL) - , p_qp_mgr(qp_mgr) - , ibv_flow_attr(qp_mgr->get_port_num()) + , hqrx_ptr(hqrx) + , ibv_flow_attr() { } }; @@ -102,7 +102,7 @@ typedef attach_flow_data_eth_ip_tcp_udp_t typedef struct attach_flow_data_t { rfs_rule *rfs_flow; - qp_mgr *p_qp_mgr; + hw_queue_rx *hqrx_ptr; xlio_ibv_flow_attr ibv_flow_attr; } attach_flow_data_t; diff --git a/src/core/dev/rfs_mc.cpp b/src/core/dev/rfs_mc.cpp index 96e5a3079..ca7adcff7 100644 --- a/src/core/dev/rfs_mc.cpp +++ b/src/core/dev/rfs_mc.cpp @@ -77,10 +77,10 @@ bool rfs_mc::prepare_flow_spec() bool is_ipv4 = (m_flow_tuple.get_family() == AF_INET); if (is_ipv4) 
{ prepare_flow_spec_by_ip( - p_ring->m_p_qp_mgr, p_attach_flow_data, p_eth, p_tcp_udp); + p_ring->m_hqrx, p_attach_flow_data, p_eth, p_tcp_udp); } else { prepare_flow_spec_by_ip( - p_ring->m_p_qp_mgr, p_attach_flow_data, p_eth, p_tcp_udp); + p_ring->m_hqrx, p_attach_flow_data, p_eth, p_tcp_udp); } if (!p_attach_flow_data) { @@ -89,7 +89,7 @@ bool rfs_mc::prepare_flow_spec() uint8_t dst_mac[6]; create_multicast_mac_from_ip(dst_mac, m_flow_tuple.get_dst_ip(), m_flow_tuple.get_family()); - ibv_flow_spec_eth_set(p_eth, dst_mac, htons(p_ring->m_p_qp_mgr->get_partiton()), is_ipv4); + ibv_flow_spec_eth_set(p_eth, dst_mac, htons(p_ring->m_hqrx->get_vlan()), is_ipv4); if (safe_mce_sys().eth_mc_l2_only_rules) { ibv_flow_spec_tcp_udp_set(p_tcp_udp, 0, 0, 0); diff --git a/src/core/dev/rfs_mc.h b/src/core/dev/rfs_mc.h index 87c514a47..c3a1042f1 100644 --- a/src/core/dev/rfs_mc.h +++ b/src/core/dev/rfs_mc.h @@ -56,17 +56,17 @@ class rfs_mc : public rfs { virtual bool prepare_flow_spec(); template - void prepare_flow_spec_by_ip(qp_mgr *qp_mgr, attach_flow_data_t *&p_attach_flow_data, + void prepare_flow_spec_by_ip(hw_queue_rx *hqrx_ptr, attach_flow_data_t *&p_attach_flow_data, xlio_ibv_flow_spec_eth *&p_eth, xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp); }; template -void rfs_mc::prepare_flow_spec_by_ip(qp_mgr *qp_mgr, attach_flow_data_t *&p_attach_flow_data, +void rfs_mc::prepare_flow_spec_by_ip(hw_queue_rx *hqrx_ptr, attach_flow_data_t *&p_attach_flow_data, xlio_ibv_flow_spec_eth *&p_eth, xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp) { - T *attach_flow_data_eth = new (std::nothrow) T(qp_mgr); + T *attach_flow_data_eth = new (std::nothrow) T(hqrx_ptr); if (!attach_flow_data_eth) { return; } diff --git a/src/core/dev/rfs_uc.cpp b/src/core/dev/rfs_uc.cpp index f7c3565bc..b0640e6bf 100644 --- a/src/core/dev/rfs_uc.cpp +++ b/src/core/dev/rfs_uc.cpp @@ -78,10 +78,10 @@ bool rfs_uc::prepare_flow_spec() bool is_ipv4 = (m_flow_tuple.get_family() == AF_INET); if (is_ipv4) { prepare_flow_spec_by_ip( - p_ring->m_p_qp_mgr, p_attach_flow_data, p_eth, p_tcp_udp); + p_ring->m_hqrx, p_attach_flow_data, p_eth, p_tcp_udp); } else { prepare_flow_spec_by_ip( - p_ring->m_p_qp_mgr, p_attach_flow_data, p_eth, p_tcp_udp); + p_ring->m_hqrx, p_attach_flow_data, p_eth, p_tcp_udp); } if (!p_attach_flow_data) { @@ -89,7 +89,7 @@ bool rfs_uc::prepare_flow_spec() } ibv_flow_spec_eth_set(p_eth, p_ring->m_p_l2_addr->get_address(), - htons(p_ring->m_p_qp_mgr->get_partiton()), is_ipv4); + htons(p_ring->m_hqrx->get_vlan()), is_ipv4); break; } diff --git a/src/core/dev/rfs_uc.h b/src/core/dev/rfs_uc.h index e5937a765..c6cb1825d 100644 --- a/src/core/dev/rfs_uc.h +++ b/src/core/dev/rfs_uc.h @@ -56,17 +56,17 @@ class rfs_uc : public rfs { virtual bool prepare_flow_spec(); template - void prepare_flow_spec_by_ip(qp_mgr *qp_mgr, attach_flow_data_t *&p_attach_flow_data, + void prepare_flow_spec_by_ip(hw_queue_rx *hqrx_ptr, attach_flow_data_t *&p_attach_flow_data, xlio_ibv_flow_spec_eth *&p_eth, xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp); }; template -void rfs_uc::prepare_flow_spec_by_ip(qp_mgr *qp_mgr, attach_flow_data_t *&p_attach_flow_data, +void rfs_uc::prepare_flow_spec_by_ip(hw_queue_rx *hqrx_ptr, attach_flow_data_t *&p_attach_flow_data, xlio_ibv_flow_spec_eth *&p_eth, xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp) { - T *attach_flow_data_eth = new (std::nothrow) T(qp_mgr); + T *attach_flow_data_eth = new (std::nothrow) T(hqrx_ptr); if (!attach_flow_data_eth) { return; } diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index cd63e0116..c0554fa29 100644 
--- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -37,12 +37,14 @@ #include "ib/base/verbs_extra.h" #include "proto/flow_tuple.h" #include "sock/socket_fd_api.h" +#include "sock/tcp_seg_pool.h" +#include "proto/L2_address.h" +#include "dev/xlio_ti.h" /* Forward declarations */ struct xlio_tls_info; -class xlio_tis; class pkt_rcvr_sink; -typedef void (*xlio_comp_cb_t)(void *); // Copied from qp_mgr.h +class rfs_rule; #define ring_logpanic __log_info_panic #define ring_logerr __log_info_err @@ -85,7 +87,6 @@ class ring { virtual void restart() = 0; - // Funcs taken from qp_mgr.h // Get/Release memory buffer descriptor with a linked data memory buffer virtual mem_buf_desc_t *mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, int n_num_mem_bufs = 1) = 0; diff --git a/src/core/dev/ring_bond.cpp b/src/core/dev/ring_bond.cpp index 3d95d108e..4bad26388 100644 --- a/src/core/dev/ring_bond.cpp +++ b/src/core/dev/ring_bond.cpp @@ -291,13 +291,15 @@ void ring_bond::restart() if (slaves[j]->active) { ring_logdbg("ring %d active", i); if (slaves[j]->lag_tx_port_affinity != 1) { - tmp_ring->start_active_qp_mgr(); + tmp_ring->start_active_queue_tx(); + tmp_ring->start_active_queue_rx(); } m_bond_rings[i]->m_active = true; } else { ring_logdbg("ring %d not active", i); if (slaves[j]->lag_tx_port_affinity != 1) { - tmp_ring->stop_active_qp_mgr(); + tmp_ring->stop_active_queue_tx(); + tmp_ring->stop_active_queue_rx(); } m_bond_rings[i]->m_active = false; } diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index 06c5f5e56..717cd9ca2 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -86,21 +86,8 @@ inline void ring_simple::send_status_handler(int ret, xlio_ibv_send_wr *p_send_w ring_simple::ring_simple(int if_index, ring *parent, ring_type_t type, bool use_locks) : ring_slave(if_index, parent, type, use_locks) - , m_p_ib_ctx(NULL) - , m_p_qp_mgr(NULL) - , m_p_cq_mgr_rx(NULL) - , m_p_cq_mgr_tx(NULL) , m_lock_ring_tx_buf_wait("ring:lock_tx_buf_wait") - , m_tx_num_bufs(0) - , m_zc_num_bufs(0) - , m_tx_num_wr(0) - , m_missing_buf_ref_count(0) - , m_tx_lkey(0) , m_gro_mgr(safe_mce_sys().gro_streams_max, MAX_GRO_BUFS) - , m_up(false) - , m_p_rx_comp_event_channel(NULL) - , m_p_tx_comp_event_channel(NULL) - , m_p_l2_addr(NULL) { net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); const slave_data_t *p_slave = p_ndev->get_slave(get_if_index()); @@ -149,12 +136,20 @@ ring_simple::~ring_simple() // Was done in order to allow iperf's FIN packet to be sent. 
usleep(25000); - if (m_p_qp_mgr) { - stop_active_qp_mgr(); + if (m_hqtx) { + stop_active_queue_tx(); // Release QP/CQ resources - delete m_p_qp_mgr; - m_p_qp_mgr = NULL; + delete m_hqtx; + m_hqtx = nullptr; + } + + if (m_hqrx) { + stop_active_queue_rx(); + + // Release QP/CQ resources + delete m_hqrx; + m_hqrx = nullptr; } /* coverity[double_lock] TODO: RM#1049980 */ @@ -365,22 +360,22 @@ void ring_simple::create_resources() g_p_fd_collection->add_cq_channel_fd(m_p_tx_comp_event_channel->fd, this); } - struct qp_mgr_desc desc; - memset(&desc, 0, sizeof(desc)); - desc.ring = this; - desc.slave = p_slave; - desc.rx_comp_event_channel = m_p_rx_comp_event_channel; - m_p_qp_mgr = new qp_mgr(&desc, get_tx_num_wr(), m_partition); + std::unique_ptr temp_hqtx(new hw_queue_tx(this, p_slave, get_tx_num_wr())); + std::unique_ptr temp_hqrx( + new hw_queue_rx(this, p_slave->p_ib_ctx, m_p_rx_comp_event_channel, m_vlan)); BULLSEYE_EXCLUDE_BLOCK_START - if (m_p_qp_mgr == NULL) { - ring_logerr("Failed to allocate qp_mgr!"); - throw_xlio_exception("create qp failed"); + if (!temp_hqtx || !temp_hqrx) { + ring_logerr("Failed to allocate hw_queue_tx/hw_queue_rx!"); + throw_xlio_exception("Create hw_queue_tx/hw_queue_rx failed"); } BULLSEYE_EXCLUDE_BLOCK_END + m_hqtx = temp_hqtx.release(); + m_hqrx = temp_hqrx.release(); + // save pointers - m_p_cq_mgr_rx = m_p_qp_mgr->get_rx_cq_mgr(); - m_p_cq_mgr_tx = m_p_qp_mgr->get_tx_cq_mgr(); + m_p_cq_mgr_rx = m_hqrx->get_rx_cq_mgr(); + m_p_cq_mgr_tx = m_hqtx->get_tx_cq_mgr(); init_tx_buffers(RING_TX_BUFS_COMPENSATE); @@ -394,7 +389,8 @@ void ring_simple::create_resources() * even if slave is not active */ if (p_slave->active || (p_slave->lag_tx_port_affinity == 1)) { - start_active_qp_mgr(); + start_active_queue_tx(); + start_active_queue_rx(); } ring_logdbg("new ring_simple() completed"); @@ -592,8 +588,8 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu // Try to poll once in the hope that we get a few freed tx mem_buf_desc ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - ring_logdbg("failed polling on cq_mgr_tx (qp_mgr=%p, cq_mgr_tx=%p) (ret=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, ret); + ring_logdbg("failed polling on cq_mgr_tx (hqtx=%p, cq_mgr_tx=%p) (ret=%d %m)", m_hqtx, + m_p_cq_mgr_tx, ret); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); return NULL; @@ -618,8 +614,8 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu ret = m_p_cq_mgr_tx->request_notification(poll_sn); if (ret < 0) { // this is most likely due to cq_poll_sn out of sync, need to poll_cq again - ring_logdbg("failed arming cq_mgr_tx (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, errno); + ring_logdbg("failed arming cq_mgr_tx (hqtx=%p, cq_mgr_tx=%p) (errno=%d %m)", + m_hqtx, m_p_cq_mgr_tx, errno); } else if (ret == 0) { // prepare to block @@ -660,9 +656,9 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu // Perform a non blocking event read, clear the fd channel ret = p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - ring_logdbg("failed handling cq_mgr_tx channel (qp_mgr=%p, " + ring_logdbg("failed handling cq_mgr_tx channel (hqtx=%p, " "cq_mgr_tx=%p) (errno=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, errno); + m_hqtx, m_p_cq_mgr_tx, errno); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); m_lock_ring_tx_buf_wait.unlock(); @@ -726,11 +722,11 @@ inline int 
ring_simple::send_buffer(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_pac xlio_tis *tis) { int ret = 0; - unsigned credits = m_p_qp_mgr->credits_calculate(p_send_wqe); + unsigned credits = m_hqtx->credits_calculate(p_send_wqe); - if (likely(m_p_qp_mgr->credits_get(credits)) || + if (likely(m_hqtx->credits_get(credits)) || is_available_qp_wr(is_set(attr, XLIO_TX_PACKET_BLOCK), credits)) { - ret = m_p_qp_mgr->send(p_send_wqe, attr, tis, credits); + ret = m_hqtx->send(p_send_wqe, attr, tis, credits); } else { ring_logdbg("Silent packet drop, SQ is full!"); ret = -1; @@ -745,7 +741,7 @@ bool ring_simple::get_hw_dummy_send_support(ring_user_id_t id, xlio_ibv_send_wr NOT_IN_USE(id); NOT_IN_USE(p_send_wqe); - return m_p_qp_mgr->get_hw_dummy_send_support(); + return m_hqtx->get_hw_dummy_send_support(); } void ring_simple::send_ring_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, @@ -789,12 +785,12 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) // Try to poll once in the hope that we get space in SQ ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - ring_logdbg("failed polling on cq_mgr_tx (qp_mgr=%p, cq_mgr_tx=%p) (ret=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, ret); + ring_logdbg("failed polling on cq_mgr_tx (hqtx=%p, cq_mgr_tx=%p) (ret=%d %m)", m_hqtx, + m_p_cq_mgr_tx, ret); /* coverity[missing_unlock] */ return false; } - granted = m_p_qp_mgr->credits_get(credits); + granted = m_hqtx->credits_get(credits); if (granted) { break; } @@ -813,8 +809,8 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) ret = m_p_cq_mgr_tx->request_notification(poll_sn); if (ret < 0) { // this is most likely due to cq_poll_sn out of sync, need to poll_cq again - ring_logdbg("failed arming cq_mgr_tx (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, errno); + ring_logdbg("failed arming cq_mgr_tx (hqtx=%p, cq_mgr_tx=%p) (errno=%d %m)", m_hqtx, + m_p_cq_mgr_tx, errno); } else if (ret == 0) { // prepare to block // CQ is armed, block on the CQ's Tx event channel (fd) @@ -851,9 +847,9 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) // Perform a non blocking event read, clear the fd channel ret = p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); if (ret < 0) { - ring_logdbg("failed handling cq_mgr_tx channel (qp_mgr=%p, " + ring_logdbg("failed handling cq_mgr_tx channel (hqtx=%p " "cq_mgr_tx=%p) (errno=%d %m)", - m_p_qp_mgr, m_p_cq_mgr_tx, errno); + m_hqtx, m_p_cq_mgr_tx, errno); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); m_lock_ring_tx_buf_wait.unlock(); @@ -959,7 +955,7 @@ void ring_simple::return_tx_pool_to_global_pool() int ring_simple::put_tx_buffer_helper(mem_buf_desc_t *buff) { if (buff->tx.dev_mem_length) { - m_p_qp_mgr->dm_release_data(buff); + m_hqtx->dm_release_data(buff); } // Potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & @@ -1094,37 +1090,56 @@ void ring_simple::adapt_cq_moderation() m_lock_ring_rx.unlock(); } -void ring_simple::start_active_qp_mgr() +void ring_simple::start_active_queue_tx() { - m_lock_ring_rx.lock(); m_lock_ring_tx.lock(); - if (!m_up) { + if (!m_up_tx) { /* TODO: consider avoid using sleep */ /* coverity[sleep] */ - m_p_qp_mgr->up(); - m_up = true; + m_hqtx->up(); + m_up_tx = true; } m_lock_ring_tx.unlock(); - m_lock_ring_rx.unlock(); } -void ring_simple::stop_active_qp_mgr() +void ring_simple::start_active_queue_rx() { m_lock_ring_rx.lock(); + if (!m_up_rx) { + /* TODO: consider avoid using sleep */ + /* 
coverity[sleep] */ + m_hqrx->up(); + m_up_rx = true; + } + m_lock_ring_rx.unlock(); +} + +void ring_simple::stop_active_queue_tx() +{ m_lock_ring_tx.lock(); - if (m_up) { - m_up = false; + if (m_up_tx) { + m_up_tx = false; /* TODO: consider avoid using sleep */ /* coverity[sleep] */ - m_p_qp_mgr->down(); + m_hqtx->down(); } m_lock_ring_tx.unlock(); +} +void ring_simple::stop_active_queue_rx() +{ + m_lock_ring_rx.lock(); + if (m_up_rx) { + m_up_rx = false; + /* TODO: consider avoid using sleep */ + /* coverity[sleep] */ + m_hqrx->down(); + } m_lock_ring_rx.unlock(); } bool ring_simple::is_up() { - return m_up; + return m_up_tx && m_up_rx; } int ring_simple::modify_ratelimit(struct xlio_rate_limit_t &rate_limit) @@ -1140,10 +1155,10 @@ int ring_simple::modify_ratelimit(struct xlio_rate_limit_t &rate_limit) return -1; } - uint32_t rl_changes = m_p_qp_mgr->is_ratelimit_change(rate_limit); + uint32_t rl_changes = m_hqtx->is_ratelimit_change(rate_limit); - if (m_up && rl_changes) { - return m_p_qp_mgr->modify_qp_ratelimit(rate_limit, rl_changes); + if (m_up_tx && rl_changes) { + return m_hqtx->modify_qp_ratelimit(rate_limit, rl_changes); } return 0; @@ -1186,12 +1201,12 @@ uint32_t ring_simple::get_tx_user_lkey(void *addr, size_t length, void *p_mappin uint32_t ring_simple::get_max_inline_data() { - return m_p_qp_mgr->get_max_inline_data(); + return m_hqtx->get_max_inline_data(); } uint32_t ring_simple::get_max_send_sge(void) { - return m_p_qp_mgr->get_max_send_sge(); + return m_hqtx->get_max_send_sge(); } uint32_t ring_simple::get_max_payload_sz(void) diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index c581dae52..7cd0378cc 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -39,7 +39,8 @@ #include #include "dev/gro_mgr.h" -#include "dev/qp_mgr.h" +#include "dev/hw_queue_tx.h" +#include "dev/hw_queue_rx.h" #include "dev/net_device_table_mgr.h" struct cq_moderation_info { @@ -85,8 +86,10 @@ class ring_simple : public ring_slave { inline int send_buffer(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis); bool is_up() override; - void start_active_qp_mgr(); - void stop_active_qp_mgr(); + void start_active_queue_tx(); + void start_active_queue_rx(); + void stop_active_queue_tx(); + void stop_active_queue_rx(); mem_buf_desc_t *mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, int n_num_mem_bufs = 1) override; int mem_buf_tx_release(mem_buf_desc_t *p_mem_buf_desc_list, bool b_accounting, @@ -138,7 +141,7 @@ class ring_simple : public ring_slave { { std::lock_guard lock(m_lock_ring_tx); - xlio_tis *tis = m_p_qp_mgr->tls_context_setup_tx(info); + xlio_tis *tis = m_hqtx->tls_context_setup_tx(info); if (likely(tis != NULL)) { ++m_p_ring_stat->n_tx_tls_contexts; } @@ -156,7 +159,7 @@ class ring_simple : public ring_slave { * Locking is required for TX ring with cached=true. 
*/ std::lock_guard lock(m_lock_ring_tx); - return m_p_qp_mgr->tls_create_tir(cached); + return m_hqrx->tls_create_tir(cached); } int tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t next_record_tcp_sn, xlio_comp_cb_t callback, void *callback_arg) override @@ -165,7 +168,7 @@ class ring_simple : public ring_slave { std::lock_guard lock(m_lock_ring_tx); int rc = - m_p_qp_mgr->tls_context_setup_rx(tir, info, next_record_tcp_sn, callback, callback_arg); + m_hqtx->tls_context_setup_rx(tir, info, next_record_tcp_sn, callback, callback_arg); if (likely(rc == 0)) { ++m_p_ring_stat->n_rx_tls_contexts; } @@ -179,7 +182,7 @@ class ring_simple : public ring_slave { void tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->tls_context_resync_tx(info, tis, skip_static); + m_hqtx->tls_context_resync_tx(info, tis, skip_static); uint64_t dummy_poll_sn = 0; m_p_cq_mgr_tx->poll_and_process_element_tx(&dummy_poll_sn); @@ -187,7 +190,7 @@ class ring_simple : public ring_slave { void tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->tls_resync_rx(tir, info, hw_resync_tcp_sn); + m_hqtx->tls_resync_rx(tir, info, hw_resync_tcp_sn); } void tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) override { @@ -195,7 +198,7 @@ class ring_simple : public ring_slave { if (lkey == LKEY_TX_DEFAULT) { lkey = m_tx_lkey; } - m_p_qp_mgr->tls_get_progress_params_rx(tir, buf, lkey); + m_hqtx->tls_get_progress_params_rx(tir, buf, lkey); /* Do polling to speedup handling of the completion. */ uint64_t dummy_poll_sn = 0; m_p_cq_mgr_tx->poll_and_process_element_tx(&dummy_poll_sn); @@ -203,13 +206,13 @@ class ring_simple : public ring_slave { void tls_release_tis(xlio_tis *tis) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->tls_release_tis(tis); + m_hqtx->tls_release_tis(tis); } void tls_release_tir(xlio_tir *tir) override { /* TIR objects are protected with TX lock */ std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->tls_release_tir(tir); + m_hqrx->tls_release_tir(tir); } void tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first) override @@ -218,14 +221,14 @@ class ring_simple : public ring_slave { if (lkey == LKEY_TX_DEFAULT) { lkey = m_tx_lkey; } - m_p_qp_mgr->tls_tx_post_dump_wqe(tis, addr, len, lkey, first); + m_hqtx->tls_tx_post_dump_wqe(tis, addr, len, lkey, first); } #endif /* DEFINED_UTLS */ #ifdef DEFINED_DPCP std::unique_ptr create_tis(uint32_t flags) const override { std::lock_guard lock(m_lock_ring_tx); - return m_p_qp_mgr->create_tis(flags); + return m_hqtx->create_tis(flags); } int get_supported_nvme_feature_mask() const override { @@ -244,52 +247,53 @@ class ring_simple : public ring_slave { void nvme_set_static_context(xlio_tis *tis, uint32_t config) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->nvme_set_static_context(tis, config); + m_hqtx->nvme_set_static_context(tis, config); } void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->nvme_set_progress_context(tis, tcp_seqno); + m_hqtx->nvme_set_progress_context(tis, tcp_seqno); } #endif /* DEFINED_DPCP */ void post_nop_fence(void) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->post_nop_fence(); + m_hqtx->post_nop_fence(); } void post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, 
uint32_t lkey, bool is_first) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->post_dump_wqe(tis, addr, len, lkey, is_first); + m_hqtx->post_dump_wqe(tis, addr, len, lkey, is_first); } void reset_inflight_zc_buffers_ctx(ring_user_id_t id, void *ctx) override { std::lock_guard lock(m_lock_ring_tx); NOT_IN_USE(id); - m_p_qp_mgr->reset_inflight_zc_buffers_ctx(ctx); + m_hqtx->reset_inflight_zc_buffers_ctx(ctx); } bool credits_get(unsigned credits) override { std::lock_guard lock(m_lock_ring_tx); - return m_p_qp_mgr->credits_get(credits); + return m_hqtx->credits_get(credits); } void credits_return(unsigned credits) override { std::lock_guard lock(m_lock_ring_tx); - m_p_qp_mgr->credits_return(credits); + m_hqtx->credits_return(credits); } friend class cq_mgr_rx; friend class cq_mgr_rx_regrq; friend class cq_mgr_rx_strq; - friend class qp_mgr; + friend class hw_queue_tx; + friend class hw_queue_rx; friend class rfs; friend class rfs_uc; friend class rfs_uc_tcp_gro; @@ -356,7 +360,8 @@ class ring_simple : public ring_slave { protected: ib_ctx_handler *m_p_ib_ctx; - qp_mgr *m_p_qp_mgr; + hw_queue_tx *m_hqtx = nullptr; + hw_queue_rx *m_hqrx = nullptr; struct cq_moderation_info m_cq_moderation_info; cq_mgr_rx *m_p_cq_mgr_rx = nullptr; cq_mgr_tx *m_p_cq_mgr_tx = nullptr; @@ -376,17 +381,18 @@ class ring_simple : public ring_slave { } m_socketxtreme; lock_mutex m_lock_ring_tx_buf_wait; - uint32_t m_tx_num_bufs; - uint32_t m_zc_num_bufs; - uint32_t m_tx_num_wr; - uint32_t m_missing_buf_ref_count; - uint32_t m_tx_lkey; // this is the registered memory lkey for a given specific device for the - // buffer pool use + uint32_t m_tx_num_bufs = 0U; + uint32_t m_zc_num_bufs = 0U; + uint32_t m_tx_num_wr = 0U; + uint32_t m_missing_buf_ref_count = 0U; + uint32_t m_tx_lkey = 0U; // this is the registered memory lkey for a given specific device for + // the buffer pool use gro_mgr m_gro_mgr; - bool m_up; - struct ibv_comp_channel *m_p_rx_comp_event_channel; - struct ibv_comp_channel *m_p_tx_comp_event_channel; - L2_address *m_p_l2_addr; + bool m_up_tx = false; + bool m_up_rx = false; + struct ibv_comp_channel *m_p_rx_comp_event_channel = nullptr; + struct ibv_comp_channel *m_p_tx_comp_event_channel = nullptr; + L2_address *m_p_l2_addr = nullptr; uint32_t m_mtu; struct { @@ -444,7 +450,7 @@ class ring_eth : public ring_simple { net_device_val_eth *p_ndev = dynamic_cast( g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index())); if (p_ndev) { - m_partition = p_ndev->get_vlan(); + m_vlan = p_ndev->get_vlan(); if (call_create_res) { create_resources(); diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index 7e5e598f1..11ac9e1df 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -61,7 +61,7 @@ ring_slave::ring_slave(int if_index, ring *parent, ring_type_t type, bool use_lo , m_lock_ring_rx(get_new_lock("ring_slave:lock_rx", use_locks)) , m_lock_ring_tx(get_new_lock("ring_slave:lock_tx", use_locks)) , m_p_ring_stat(new ring_stats_t) - , m_partition(0) + , m_vlan(0) , m_flow_tag_enabled(false) , m_b_sysvar_eth_mc_l2_only_rules(safe_mce_sys().eth_mc_l2_only_rules) , m_b_sysvar_mc_force_flowtag(safe_mce_sys().mc_force_flowtag) @@ -721,9 +721,9 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd // TODO: Remove this code when handling vlan in flow steering will be available. Change this // code if vlan stripping is performed. 
- if ((m_partition & VLAN_VID_MASK) != packet_vlan) { + if ((m_vlan & VLAN_VID_MASK) != packet_vlan) { ring_logfunc("Rx buffer dropped- Mismatched vlan. Packet vlan = %d, Local vlan = %d", - packet_vlan, m_partition & VLAN_VID_MASK); + packet_vlan, m_vlan & VLAN_VID_MASK); return false; } diff --git a/src/core/dev/ring_slave.h b/src/core/dev/ring_slave.h index 80c802db3..5ada7dfb1 100644 --- a/src/core/dev/ring_slave.h +++ b/src/core/dev/ring_slave.h @@ -328,7 +328,7 @@ class ring_slave : public ring { descq_t m_zc_pool; transport_type_t m_transport_type; /* transport ETH/IB */ std::unique_ptr m_p_ring_stat; - uint16_t m_partition; + uint16_t m_vlan; bool m_flow_tag_enabled; const bool m_b_sysvar_eth_mc_l2_only_rules; const bool m_b_sysvar_mc_force_flowtag; diff --git a/src/core/dev/xlio_ti.h b/src/core/dev/xlio_ti.h new file mode 100644 index 000000000..6ca0fd44b --- /dev/null +++ b/src/core/dev/xlio_ti.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef XLIO_TI_H +#define XLIO_TI_H + +#include +#include +#include +#include + +/* Work request completion callback */ +/* TODO Add argument for completion status to handle errors. */ +typedef void (*xlio_comp_cb_t)(void *); + +class xlio_ti; + +class xlio_ti_owner { +public: + virtual void ti_released(xlio_ti *ti) = 0; +}; + +class xlio_ti { +public: + enum ti_type : uint8_t { UNKNOWN, TLS_TIS, TLS_TIR, NVME_TIS, NVME_TIR }; + + xlio_ti(xlio_ti_owner *ti_owner, ti_type type = UNKNOWN) + : m_ti_owner(ti_owner) + , m_type(type) + , m_released(false) + , m_ref(0) + , m_callback(nullptr) + , m_callback_arg(nullptr) + { + } + + virtual ~xlio_ti() {}; + + void assign_callback(xlio_comp_cb_t callback, void *callback_arg) + { + m_callback = callback; + m_callback_arg = callback_arg; + } + + /* + * Reference counting. m_ref must be protected by ring tx lock. Device + * layer (QP, CQ) is responsible for the reference counting. 
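+ *
+ * Typical lifecycle (sketch based on this interface): the device layer calls get()
+ * when it posts a WQE that references the TIS/TIR and put() when the matching
+ * completion is polled; once the count drops to zero for a released object, the
+ * owner is notified via ti_released() (see xlio_ti_owner, e.g. hw_queue_tx).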
+ */ + + void get() + { + ++m_ref; + assert(m_ref > 0); + } + + uint32_t put() + { + assert(m_ref > 0); + return --m_ref; + } + + void ti_released() { m_ti_owner->ti_released(this); } + + xlio_ti_owner *const m_ti_owner; + ti_type m_type; + bool m_released; + uint32_t m_ref; + + xlio_comp_cb_t m_callback; + void *m_callback_arg; +}; + +class xlio_tis : public xlio_ti { +public: + xlio_tis(xlio_ti_owner *ti_owner, std::unique_ptr _tis, xlio_ti::ti_type type) + : xlio_ti(ti_owner, type) + , m_dek() + , m_p_tis(std::move(_tis)) + , m_tisn(0U) + , m_dek_id(0U) + { + dpcp::status ret = m_p_tis->get_tisn(m_tisn); + assert(ret == dpcp::DPCP_OK); + (void)ret; + } + + ~xlio_tis() = default; + + std::unique_ptr release_dek() + { + assert(m_ref == 0); + m_released = false; + return std::move(m_dek); + } + + uint32_t get_tisn() noexcept { return m_tisn; } + + void assign_dek(std::unique_ptr &&dek_ptr) + { + m_dek = std::move(dek_ptr); + m_dek_id = m_dek->get_key_id(); + } + + uint32_t get_dek_id() noexcept { return m_dek_id; } + +private: + std::unique_ptr m_dek; + std::unique_ptr m_p_tis; + uint32_t m_tisn; + uint32_t m_dek_id; +}; + +class xlio_tir : public xlio_ti { +public: + xlio_tir(xlio_ti_owner *ti_owner, dpcp::tir *dpcp_tir, xlio_ti::ti_type type) + : xlio_ti(ti_owner, type) + { + m_p_tir.reset(dpcp_tir); + m_dek = NULL; + m_tirn = 0; + m_dek_id = 0; + + /* Cache the tir number. Mustn't fail for a valid TIR object. */ + m_tirn = m_p_tir->get_tirn(); + assert(m_tirn != 0); + } + + ~xlio_tir() = default; + + std::unique_ptr release_dek() + { + assert(m_ref == 0); + m_released = false; + return std::move(m_dek); + } + + uint32_t get_tirn() { return m_tirn; } + + void assign_dek(void *dek_ptr) + { + m_dek.reset(reinterpret_cast(dek_ptr)); + m_dek_id = m_dek->get_key_id(); + } + + uint32_t get_dek_id() { return m_dek_id; } + + std::unique_ptr m_p_tir; + +private: + std::unique_ptr m_dek; + uint32_t m_tirn; + uint32_t m_dek_id; +}; + +#endif // XLIO_TI_H diff --git a/src/core/ib/mlx5/ib_mlx5.cpp b/src/core/ib/mlx5/ib_mlx5.cpp index d58eeaa9b..cfc0178ce 100644 --- a/src/core/ib/mlx5/ib_mlx5.cpp +++ b/src/core/ib/mlx5/ib_mlx5.cpp @@ -135,68 +135,4 @@ int xlio_ib_mlx5_get_cq(struct ibv_cq *cq, xlio_ib_mlx5_cq_t *mlx5_cq) return 0; } -int xlio_ib_mlx5_post_recv(xlio_ib_mlx5_qp_t *mlx5_qp, struct ibv_recv_wr *wr, - struct ibv_recv_wr **bad_wr) -{ - struct mlx5_wqe_data_seg *scat; - int err = 0; - int nreq; - int ind; - int i, j; - - ind = mlx5_qp->rq.head & (mlx5_qp->rq.wqe_cnt - 1); - *bad_wr = NULL; - - for (nreq = 0; wr; ++nreq, wr = wr->next) { - if (unlikely((int)mlx5_qp->rq.head - (int)mlx5_qp->rq.tail + nreq >= - (int)mlx5_qp->cap.max_recv_wr)) { - errno = ENOMEM; - err = -errno; - *bad_wr = wr; - goto out; - } - - if (unlikely(wr->num_sge > (int)mlx5_qp->cap.max_recv_sge)) { - errno = EINVAL; - err = -errno; - *bad_wr = wr; - goto out; - } - - scat = (struct mlx5_wqe_data_seg *)((uint8_t *)mlx5_qp->rq.buf + - (ind << mlx5_qp->rq.wqe_shift)); - - for (i = 0, j = 0; i < wr->num_sge; ++i) { - if (unlikely(!wr->sg_list[i].length)) { - continue; - } - - scat[j].byte_count = htonl(wr->sg_list[i].length); - scat[j].lkey = htonl(wr->sg_list[i].lkey); - scat[j].addr = htonll(wr->sg_list[i].addr); - j++; - } - - if (j < (int)mlx5_qp->cap.max_recv_sge) { - scat[j].byte_count = 0; - scat[j].lkey = htonl(MLX5_INVALID_LKEY); - scat[j].addr = 0; - } - - ind = (ind + 1) & (mlx5_qp->rq.wqe_cnt - 1); - } - -out: - if (likely(nreq)) { - mlx5_qp->rq.head += nreq; - - wmb(); // Make sure that descriptors are written 
before doorbell record. - - // Buffers are posted only after the RQ is in ready state. OK to update doorbell. - *mlx5_qp->rq.dbrec = htonl(mlx5_qp->rq.head & 0xffff); - } - - return err; -} - #endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/core/ib/mlx5/ib_mlx5.h b/src/core/ib/mlx5/ib_mlx5.h index bb70b7f63..e85c83681 100644 --- a/src/core/ib/mlx5/ib_mlx5.h +++ b/src/core/ib/mlx5/ib_mlx5.h @@ -78,23 +78,12 @@ typedef struct xlio_ib_mlx5_qp { uint32_t wqe_cnt; uint32_t stride; } sq; - struct { - volatile uint32_t *dbrec; - void *buf; - uint32_t wqe_cnt; - uint32_t stride; - uint32_t wqe_shift; - unsigned head; - unsigned tail; - } rq; struct { void *reg; uint32_t size; uint32_t offset; } bf; - uint32_t tirn; uint32_t tisn; - uint32_t rqn; uint32_t sqn; } xlio_ib_mlx5_qp_t; diff --git a/src/core/proto/route_table_mgr.cpp b/src/core/proto/route_table_mgr.cpp index 94aa606ca..8ad0c2b52 100644 --- a/src/core/proto/route_table_mgr.cpp +++ b/src/core/proto/route_table_mgr.cpp @@ -30,6 +30,7 @@ * SOFTWARE. */ +#include #include #include #include diff --git a/src/core/sock/sockinfo_nvme.h b/src/core/sock/sockinfo_nvme.h index 0283dd1a7..78f647943 100644 --- a/src/core/sock/sockinfo_nvme.h +++ b/src/core/sock/sockinfo_nvme.h @@ -36,7 +36,7 @@ #include #include #include "sockinfo_ulp.h" /* sockinfo_tcp_ops */ -#include "dev/qp_mgr.h" +#include "dev/hw_queue_tx.h" #include "proto/nvme_parse_input_args.h" #include "xlio_extra.h" #include "lwip/err.h" /* err_t */ diff --git a/tests/gtest/nvme/nvme.cc b/tests/gtest/nvme/nvme.cc index 36c5b81b4..df4542a02 100644 --- a/tests/gtest/nvme/nvme.cc +++ b/tests/gtest/nvme/nvme.cc @@ -36,7 +36,7 @@ #include #include "common/def.h" #include "common/base.h" -#include "dev/qp_mgr.h" +#include "dev/hw_queue_tx.h" #include "proto/nvme_parse_input_args.h" #include "tcp/tcp_base.h" #include "xlio_extra.h" From 4bd1df49a842da674840c807d9173b340abe8586 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 8 Oct 2023 18:08:14 +0300 Subject: [PATCH 016/169] issue: 3514044 Squash rfs_rule_dpcp to rfs_rule Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 3 +-- src/core/dev/hw_queue_rx.cpp | 4 ++-- src/core/dev/rfs.cpp | 8 ++++++++ src/core/dev/rfs_mc.cpp | 8 ++++++++ src/core/dev/rfs_mc.h | 4 ---- src/core/dev/rfs_rule.h | 17 +++++++---------- src/core/dev/rfs_uc.cpp | 8 ++++++++ src/core/dev/rfs_uc.h | 4 ---- src/core/dev/rfs_uc_tcp_gro.cpp | 2 ++ 9 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 214498bdd..27fe9ea95 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -73,7 +73,7 @@ libxlio_la_SOURCES := \ dev/rfs_uc.cpp \ dev/rfs_uc_tcp_gro.cpp \ dev/rfs_mc.cpp \ - dev/rfs_rule_dpcp.cpp \ + dev/rfs_rule.cpp \ dev/time_converter.cpp \ dev/time_converter_ptp.cpp \ dev/time_converter_rtc.cpp \ @@ -191,7 +191,6 @@ libxlio_la_SOURCES := \ dev/rfs_uc.h \ dev/rfs_uc_tcp_gro.h \ dev/rfs_rule.h \ - dev/rfs_rule_dpcp.h \ dev/src_addr_selector.h \ dev/ring.h \ dev/ring_bond.h \ diff --git a/src/core/dev/hw_queue_rx.cpp b/src/core/dev/hw_queue_rx.cpp index 5d602dcd5..1af5d5fed 100644 --- a/src/core/dev/hw_queue_rx.cpp +++ b/src/core/dev/hw_queue_rx.cpp @@ -36,7 +36,7 @@ #include "dev/hw_queue_rx.h" #include "dev/buffer_pool.h" #include "dev/ring_simple.h" -#include "dev/rfs_rule_dpcp.h" +#include "dev/rfs_rule.h" #include "dev/cq_mgr_rx_regrq.h" #include "dev/cq_mgr_rx_strq.h" @@ -280,7 +280,7 @@ rfs_rule *hw_queue_rx::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ // TLS RX uses 
tir_ext. dpcp::tir *dpcp_tir = (tir_ext ? xlio_tir_to_dpcp_tir(tir_ext) : m_tir.get()); - std::unique_ptr new_rule(new rfs_rule_dpcp()); + std::unique_ptr new_rule(new rfs_rule()); if (dpcp_tir && new_rule->create(attrs, *dpcp_tir, *m_p_ib_ctx_handler->get_dpcp_adapter())) { return new_rule.release(); diff --git a/src/core/dev/rfs.cpp b/src/core/dev/rfs.cpp index b8b9efb2e..2db30ba8f 100644 --- a/src/core/dev/rfs.cpp +++ b/src/core/dev/rfs.cpp @@ -39,6 +39,14 @@ #define MODULE_NAME "rfs" +#define rfs_logpanic __log_info_panic +#define rfs_logerr __log_info_err +#define rfs_logwarn __log_info_warn +#define rfs_loginfo __log_info_info +#define rfs_logdbg __log_info_dbg +#define rfs_logfunc __log_info_func +#define rfs_logfuncall __log_info_funcall + /**/ /** inlining functions can only help if they are implemented before their usage **/ /**/ diff --git a/src/core/dev/rfs_mc.cpp b/src/core/dev/rfs_mc.cpp index ca7adcff7..83e4428fe 100644 --- a/src/core/dev/rfs_mc.cpp +++ b/src/core/dev/rfs_mc.cpp @@ -37,6 +37,14 @@ #define MODULE_NAME "rfs_mc" +#define rfs_logpanic __log_info_panic +#define rfs_logerr __log_info_err +#define rfs_logwarn __log_info_warn +#define rfs_loginfo __log_info_info +#define rfs_logdbg __log_info_dbg +#define rfs_logfunc __log_info_func +#define rfs_logfuncall __log_info_funcall + rfs_mc::rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter /*= NULL*/, int flow_tag_id /*=0*/) : rfs(flow_spec_5t, p_ring, rule_filter, flow_tag_id) diff --git a/src/core/dev/rfs_mc.h b/src/core/dev/rfs_mc.h index c3a1042f1..3288ada91 100644 --- a/src/core/dev/rfs_mc.h +++ b/src/core/dev/rfs_mc.h @@ -35,8 +35,6 @@ #include "dev/rfs.h" -#define MODULE_NAME "rfs_mc" - /** * @class rfs_mc * @@ -83,8 +81,6 @@ void rfs_mc::prepare_flow_spec_by_ip(hw_queue_rx *hqrx_ptr, attach_flow_data_t * if (m_flow_tag_id) { // Will not attach flow_tag spec to rule for tag_id==0 ibv_flow_spec_flow_tag_set(&(attach_flow_data_eth->ibv_flow_attr.flow_tag), m_flow_tag_id); attach_flow_data_eth->ibv_flow_attr.add_flow_tag_spec(); - rfs_logdbg("Adding flow_tag spec to MC rule, num_of_specs: %d flow_tag_id: %d", - attach_flow_data_eth->ibv_flow_attr.attr.num_of_specs, m_flow_tag_id); } } diff --git a/src/core/dev/rfs_rule.h b/src/core/dev/rfs_rule.h index 9cd2eb813..dbbc481a8 100644 --- a/src/core/dev/rfs_rule.h +++ b/src/core/dev/rfs_rule.h @@ -33,19 +33,16 @@ #ifndef RFS_RULE_H #define RFS_RULE_H -#include - -#define rfs_logpanic __log_info_panic -#define rfs_logerr __log_info_err -#define rfs_logwarn __log_info_warn -#define rfs_loginfo __log_info_info -#define rfs_logdbg __log_info_dbg -#define rfs_logfunc __log_info_func -#define rfs_logfuncall __log_info_funcall +#include +#include "ib/base/verbs_extra.h" +#include class rfs_rule { public: - virtual ~rfs_rule() {} + bool create(const xlio_ibv_flow_attr &attrs, dpcp::tir &in_tir, dpcp::adapter &in_adapter); + +private: + std::unique_ptr _dpcp_flow; }; #endif diff --git a/src/core/dev/rfs_uc.cpp b/src/core/dev/rfs_uc.cpp index b0640e6bf..3115bbcdc 100644 --- a/src/core/dev/rfs_uc.cpp +++ b/src/core/dev/rfs_uc.cpp @@ -40,6 +40,14 @@ #define MODULE_NAME "rfs_uc" +#define rfs_logpanic __log_info_panic +#define rfs_logerr __log_info_err +#define rfs_logwarn __log_info_warn +#define rfs_loginfo __log_info_info +#define rfs_logdbg __log_info_dbg +#define rfs_logfunc __log_info_func +#define rfs_logfuncall __log_info_funcall + rfs_uc::rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter, uint32_t flow_tag_id) : 
rfs(flow_spec_5t, p_ring, rule_filter, flow_tag_id) diff --git a/src/core/dev/rfs_uc.h b/src/core/dev/rfs_uc.h index c6cb1825d..2fcf0400c 100644 --- a/src/core/dev/rfs_uc.h +++ b/src/core/dev/rfs_uc.h @@ -35,8 +35,6 @@ #include "dev/rfs.h" -#define MODULE_NAME "rfs_uc" - /** * @class rfs_uc * @@ -81,8 +79,6 @@ void rfs_uc::prepare_flow_spec_by_ip(hw_queue_rx *hqrx_ptr, attach_flow_data_t * if (m_flow_tag_id) { // Will not attach flow_tag spec to rule for tag_id==0 ibv_flow_spec_flow_tag_set(&(attach_flow_data_eth->ibv_flow_attr.flow_tag), m_flow_tag_id); attach_flow_data_eth->ibv_flow_attr.add_flow_tag_spec(); - rfs_logdbg("Adding flow_tag spec to rule, num_of_specs: %d flow_tag_id: %d", - attach_flow_data_eth->ibv_flow_attr.attr.num_of_specs, m_flow_tag_id); } } diff --git a/src/core/dev/rfs_uc_tcp_gro.cpp b/src/core/dev/rfs_uc_tcp_gro.cpp index 4571007b3..bfba8f94e 100644 --- a/src/core/dev/rfs_uc_tcp_gro.cpp +++ b/src/core/dev/rfs_uc_tcp_gro.cpp @@ -39,6 +39,8 @@ #define MODULE_NAME "rfs_uc_tcp_gro" +#define rfs_logpanic __log_info_panic + #define TCP_H_LEN_NO_OPTIONS 5 #define TCP_H_LEN_TIMESTAMP 8 From 01594f511657ecadb0a0bfe2ba88a1d3a0cac29d Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 9 Oct 2023 13:48:40 +0300 Subject: [PATCH 017/169] issue: 3514044 Removing m_attach_flow_data vector from rfs At most a single element of this vector is always used. Once rfs constructor is complete there must be exactly one attach_flow_data element in case of ring_simple. For ring_tap this element remains null. Signed-off-by: Alexander Grissik --- src/core/dev/rfs.cpp | 98 +++++++++++++++++---------------------- src/core/dev/rfs.h | 4 +- src/core/dev/rfs_mc.cpp | 2 +- src/core/dev/rfs_uc.cpp | 2 +- src/core/dev/ring_slave.h | 2 +- 5 files changed, 47 insertions(+), 61 deletions(-) diff --git a/src/core/dev/rfs.cpp b/src/core/dev/rfs.cpp index 2db30ba8f..db01215bc 100644 --- a/src/core/dev/rfs.cpp +++ b/src/core/dev/rfs.cpp @@ -75,13 +75,13 @@ inline void rfs::filter_keep_attached(rule_filter_map_t::iterator &filter_iter) return; } - // save all ibv_flow rules only for filter - for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { - filter_iter->second.rfs_rule_vec.push_back(m_attach_flow_data_vector[i]->rfs_flow); + // save ibv_flow rule only for filter + if (m_attach_flow_data) { + filter_iter->second.rfs_rule_holder = m_attach_flow_data->rfs_flow; rfs_logdbg("filter_keep_attached copying rfs_flow, Tag: %" PRIu32 - ", Flow: %s, Index: %zu, Ptr: %p, Counter: %d", - m_flow_tag_id, m_flow_tuple.to_str().c_str(), i, - m_attach_flow_data_vector[i]->rfs_flow, filter_iter->second.counter); + ", Flow: %s, Ptr: %p, Counter: %d", + m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_attach_flow_data->rfs_flow, + filter_iter->second.counter); } } @@ -110,32 +110,25 @@ inline void rfs::prepare_filter_detach(int &filter_counter, bool decrease_counte filter_counter = filter_iter->second.counter; // if we do not need to destroy rfs_rule, still mark this rfs as detached m_b_tmp_is_attached = (filter_counter == 0) && m_b_tmp_is_attached; - if (filter_counter != 0 || filter_iter->second.rfs_rule_vec.empty()) { + if (filter_counter != 0) { return; } BULLSEYE_EXCLUDE_BLOCK_START - if (m_attach_flow_data_vector.size() != filter_iter->second.rfs_rule_vec.size()) { - // sanity check for having the same number of qps on all rfs objects - rfs_logerr("all rfs objects in the ring should have the same number of elements"); - } - BULLSEYE_EXCLUDE_BLOCK_END - - for (size_t i = 0; i < 
m_attach_flow_data_vector.size(); i++) { - BULLSEYE_EXCLUDE_BLOCK_START - if (m_attach_flow_data_vector[i]->rfs_flow && - m_attach_flow_data_vector[i]->rfs_flow != filter_iter->second.rfs_rule_vec[i]) { + if (m_attach_flow_data) { + if (m_attach_flow_data->rfs_flow && + m_attach_flow_data->rfs_flow != filter_iter->second.rfs_rule_holder) { rfs_logerr( "our assumption that there should be only one rule for filter group is wrong"); - } else if (filter_iter->second.rfs_rule_vec[i]) { - m_attach_flow_data_vector[i]->rfs_flow = filter_iter->second.rfs_rule_vec[i]; + } else if (filter_iter->second.rfs_rule_holder) { + m_attach_flow_data->rfs_flow = filter_iter->second.rfs_rule_holder; rfs_logdbg("prepare_filter_detach copying rfs_flow, Tag: %" PRIu32 - ", Flow: %s, Index: %zu, Ptr: %p, Counter: %d", - m_flow_tag_id, m_flow_tuple.to_str().c_str(), i, - m_attach_flow_data_vector[i]->rfs_flow, filter_iter->second.counter); + ", Flow: %s, Ptr: %p, Counter: %d", + m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_attach_flow_data->rfs_flow, + filter_iter->second.counter); } - BULLSEYE_EXCLUDE_BLOCK_END } + BULLSEYE_EXCLUDE_BLOCK_END } rfs::rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter /*= NULL*/, @@ -190,15 +183,14 @@ rfs::~rfs() } delete[] m_sinks_list; - while (m_attach_flow_data_vector.size() > 0) { - attach_flow_data_t *flow_data = m_attach_flow_data_vector.back(); - if (reinterpret_cast(&flow_data->ibv_flow_attr)->eth.val.ether_type == - htons(ETH_P_IP)) { - delete reinterpret_cast(flow_data); + if (m_attach_flow_data) { + if (reinterpret_cast(&m_attach_flow_data->ibv_flow_attr) + ->eth.val.ether_type == htons(ETH_P_IP)) { + delete reinterpret_cast(m_attach_flow_data); } else { - delete reinterpret_cast(flow_data); + delete reinterpret_cast(m_attach_flow_data); } - m_attach_flow_data_vector.pop_back(); + m_attach_flow_data = nullptr; } } @@ -296,7 +288,7 @@ bool rfs::attach_flow(pkt_rcvr_sink *sink) } else { rfs_logdbg("rfs: Joining existing flow"); #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - if (g_p_app->type != APP_NONE && g_p_app->add_second_4t_rule) { + if (g_p_app->type != APP_NONE && m_p_ring->is_simple() && g_p_app->add_second_4t_rule) { // This is second 4 tuple rule for the same worker (when number // of workers is not power of two) create_flow(); @@ -367,14 +359,14 @@ rfs_rule *create_rule_T(xlio_tir *tir, const flow_tuple &flow_spec, attach_flow_ rfs_rule *rfs::create_rule(xlio_tir *tir, const flow_tuple &flow_spec) { - if (m_attach_flow_data_vector.size() == 1) { + if (m_attach_flow_data) { if (m_flow_tuple.get_family() == AF_INET) { return create_rule_T( - tir, flow_spec, m_attach_flow_data_vector[0], m_flow_tuple.is_5_tuple()); + tir, flow_spec, m_attach_flow_data, m_flow_tuple.is_5_tuple()); } return create_rule_T( - tir, flow_spec, m_attach_flow_data_vector[0], m_flow_tuple.is_5_tuple()); + tir, flow_spec, m_attach_flow_data, m_flow_tuple.is_5_tuple()); } return nullptr; @@ -384,16 +376,14 @@ rfs_rule *rfs::create_rule(xlio_tir *tir, const flow_tuple &flow_spec) bool rfs::create_flow() { - for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { - attach_flow_data_t *iter = m_attach_flow_data_vector[i]; - iter->rfs_flow = iter->hqrx_ptr->create_rfs_rule(iter->ibv_flow_attr, NULL); - if (!iter->rfs_flow) { - rfs_logerr("Create RFS flow failed, Tag: %" PRIu32 ", Flow: %s, Priority: %" PRIu16 - ", errno: %d - %m", - m_flow_tag_id, m_flow_tuple.to_str().c_str(), iter->ibv_flow_attr.priority, - errno); // TODO ALEXR - Add info about QP, 
spec into log msg - return false; - } + m_attach_flow_data->rfs_flow = + m_attach_flow_data->hqrx_ptr->create_rfs_rule(m_attach_flow_data->ibv_flow_attr, NULL); + if (!m_attach_flow_data->rfs_flow) { + rfs_logerr("Create RFS flow failed, Tag: %" PRIu32 ", Flow: %s, Priority: %" PRIu16 + ", errno: %d - %m", + m_flow_tag_id, m_flow_tuple.to_str().c_str(), + m_attach_flow_data->ibv_flow_attr.priority, errno); + return false; } m_b_tmp_is_attached = true; @@ -405,18 +395,16 @@ bool rfs::create_flow() bool rfs::destroy_flow() { - for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { - attach_flow_data_t *iter = m_attach_flow_data_vector[i]; - if (unlikely(!iter->rfs_flow)) { - rfs_logdbg( - "Destroy RFS flow failed, RFS flow was not created. " - "This is OK for MC same ip diff port scenario. Tag: %" PRIu32 - ", Flow: %s, Priority: %" PRIu16, - m_flow_tag_id, m_flow_tuple.to_str().c_str(), - iter->ibv_flow_attr.priority); // TODO ALEXR - Add info about QP, spec into log msg + if (m_attach_flow_data) { + if (unlikely(!m_attach_flow_data->rfs_flow)) { + rfs_logdbg("Destroy RFS flow failed, RFS flow was not created. " + "This is OK for MC same ip diff port scenario. Tag: %" PRIu32 + ", Flow: %s, Priority: %" PRIu16, + m_flow_tag_id, m_flow_tuple.to_str().c_str(), + m_attach_flow_data->ibv_flow_attr.priority); } else { - delete iter->rfs_flow; - iter->rfs_flow = nullptr; + delete m_attach_flow_data->rfs_flow; + m_attach_flow_data->rfs_flow = nullptr; } } diff --git a/src/core/dev/rfs.h b/src/core/dev/rfs.h index 39123e430..6c3edace2 100644 --- a/src/core/dev/rfs.h +++ b/src/core/dev/rfs.h @@ -106,8 +106,6 @@ typedef struct attach_flow_data_t { xlio_ibv_flow_attr ibv_flow_attr; } attach_flow_data_t; -typedef std::vector attach_flow_data_vector_t; - class rfs_rule_filter { public: rfs_rule_filter(rule_filter_map_t &map, const sock_addr &key, flow_tuple &flow_tuple) @@ -159,7 +157,7 @@ class rfs { flow_tuple m_flow_tuple; ring_slave *m_p_ring; rfs_rule_filter *m_p_rule_filter; - attach_flow_data_vector_t m_attach_flow_data_vector; + attach_flow_data_t *m_attach_flow_data = nullptr; pkt_rcvr_sink **m_sinks_list; uint32_t m_n_sinks_list_entries; // Number of actual sinks in the array (we shrink the array if // a sink is removed) diff --git a/src/core/dev/rfs_mc.cpp b/src/core/dev/rfs_mc.cpp index 83e4428fe..41187db25 100644 --- a/src/core/dev/rfs_mc.cpp +++ b/src/core/dev/rfs_mc.cpp @@ -116,7 +116,7 @@ bool rfs_mc::prepare_flow_spec() BULLSEYE_EXCLUDE_BLOCK_END } - m_attach_flow_data_vector.push_back(p_attach_flow_data); + m_attach_flow_data = p_attach_flow_data; return true; } diff --git a/src/core/dev/rfs_uc.cpp b/src/core/dev/rfs_uc.cpp index 3115bbcdc..9da1bccfc 100644 --- a/src/core/dev/rfs_uc.cpp +++ b/src/core/dev/rfs_uc.cpp @@ -148,7 +148,7 @@ bool rfs_uc::prepare_flow_spec() p_ring->get_transport_type(), p_attach_flow_data->ibv_flow_attr.num_of_specs, m_flow_tag_id); - m_attach_flow_data_vector.push_back(p_attach_flow_data); + m_attach_flow_data = p_attach_flow_data; return true; } diff --git a/src/core/dev/ring_slave.h b/src/core/dev/ring_slave.h index 5ada7dfb1..25d905a9f 100644 --- a/src/core/dev/ring_slave.h +++ b/src/core/dev/ring_slave.h @@ -233,7 +233,7 @@ inline bool operator==(flow_spec_4t_key_ipv6 const &key1, flow_spec_4t_key_ipv6 struct counter_and_ibv_flows { int counter; - std::vector rfs_rule_vec; + rfs_rule *rfs_rule_holder; }; typedef std::unordered_map rule_filter_map_t; From d5bb9e48faeaef80500ba0c473d5af39ca336895 Mon Sep 17 00:00:00 2001 From: Alexander Grissik 
Date: Mon, 9 Oct 2023 14:36:18 +0300 Subject: [PATCH 018/169] issue: 3514044 Removing hqrx from attach_flow_data_t Signed-off-by: Alexander Grissik --- src/core/dev/rfs.cpp | 15 ++++++++------- src/core/dev/rfs.h | 5 +---- src/core/dev/rfs_mc.cpp | 8 ++++---- src/core/dev/rfs_mc.h | 6 +++--- src/core/dev/rfs_uc.cpp | 8 ++++---- src/core/dev/rfs_uc.h | 6 +++--- 6 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/core/dev/rfs.cpp b/src/core/dev/rfs.cpp index db01215bc..97db2798c 100644 --- a/src/core/dev/rfs.cpp +++ b/src/core/dev/rfs.cpp @@ -333,8 +333,8 @@ bool rfs::detach_flow(pkt_rcvr_sink *sink) #ifdef DEFINED_UTLS template -rfs_rule *create_rule_T(xlio_tir *tir, const flow_tuple &flow_spec, attach_flow_data_t *iter, - bool is5T) +rfs_rule *create_rule_T(hw_queue_rx *hqrx, xlio_tir *tir, const flow_tuple &flow_spec, + attach_flow_data_t *iter, bool is5T) { auto *p_attr = reinterpret_cast(&iter->ibv_flow_attr); @@ -354,19 +354,20 @@ rfs_rule *create_rule_T(xlio_tir *tir, const flow_tuple &flow_spec, attach_flow_ } // The highest priority to override TCP rule flow_attr.attr.priority = 0; - return iter->hqrx_ptr->create_rfs_rule(flow_attr.attr, tir); + return hqrx->create_rfs_rule(flow_attr.attr, tir); } rfs_rule *rfs::create_rule(xlio_tir *tir, const flow_tuple &flow_spec) { + auto *hqrx = dynamic_cast(m_p_ring)->m_hqrx; if (m_attach_flow_data) { if (m_flow_tuple.get_family() == AF_INET) { return create_rule_T( - tir, flow_spec, m_attach_flow_data, m_flow_tuple.is_5_tuple()); + hqrx, tir, flow_spec, m_attach_flow_data, m_flow_tuple.is_5_tuple()); } return create_rule_T( - tir, flow_spec, m_attach_flow_data, m_flow_tuple.is_5_tuple()); + hqrx, tir, flow_spec, m_attach_flow_data, m_flow_tuple.is_5_tuple()); } return nullptr; @@ -376,8 +377,8 @@ rfs_rule *rfs::create_rule(xlio_tir *tir, const flow_tuple &flow_spec) bool rfs::create_flow() { - m_attach_flow_data->rfs_flow = - m_attach_flow_data->hqrx_ptr->create_rfs_rule(m_attach_flow_data->ibv_flow_attr, NULL); + m_attach_flow_data->rfs_flow = dynamic_cast(m_p_ring)->m_hqrx->create_rfs_rule( + m_attach_flow_data->ibv_flow_attr, NULL); if (!m_attach_flow_data->rfs_flow) { rfs_logerr("Create RFS flow failed, Tag: %" PRIu32 ", Flow: %s, Priority: %" PRIu16 ", errno: %d - %m", diff --git a/src/core/dev/rfs.h b/src/core/dev/rfs.h index 6c3edace2..d9b899c79 100644 --- a/src/core/dev/rfs.h +++ b/src/core/dev/rfs.h @@ -66,7 +66,6 @@ typedef struct ibv_flow_attr_eth { template struct attach_flow_data_eth_ip_tcp_udp_t { rfs_rule *rfs_flow; - hw_queue_rx *hqrx_ptr; struct ibv_flow_attr_eth_ip_tcp_udp : public ibv_flow_attr_eth { T ip; xlio_ibv_flow_spec_tcp_udp tcp_udp; @@ -87,9 +86,8 @@ template struct attach_flow_data_eth_ip_tcp_udp_t { attr.size += sizeof(flow_tag); } } ibv_flow_attr; - attach_flow_data_eth_ip_tcp_udp_t(hw_queue_rx *hqrx) + attach_flow_data_eth_ip_tcp_udp_t() : rfs_flow(NULL) - , hqrx_ptr(hqrx) , ibv_flow_attr() { } @@ -102,7 +100,6 @@ typedef attach_flow_data_eth_ip_tcp_udp_t typedef struct attach_flow_data_t { rfs_rule *rfs_flow; - hw_queue_rx *hqrx_ptr; xlio_ibv_flow_attr ibv_flow_attr; } attach_flow_data_t; diff --git a/src/core/dev/rfs_mc.cpp b/src/core/dev/rfs_mc.cpp index 41187db25..5df2d1b51 100644 --- a/src/core/dev/rfs_mc.cpp +++ b/src/core/dev/rfs_mc.cpp @@ -84,11 +84,11 @@ bool rfs_mc::prepare_flow_spec() case XLIO_TRANSPORT_ETH: { bool is_ipv4 = (m_flow_tuple.get_family() == AF_INET); if (is_ipv4) { - prepare_flow_spec_by_ip( - p_ring->m_hqrx, p_attach_flow_data, p_eth, p_tcp_udp); + 
prepare_flow_spec_by_ip(p_attach_flow_data, p_eth, + p_tcp_udp); } else { - prepare_flow_spec_by_ip( - p_ring->m_hqrx, p_attach_flow_data, p_eth, p_tcp_udp); + prepare_flow_spec_by_ip(p_attach_flow_data, p_eth, + p_tcp_udp); } if (!p_attach_flow_data) { diff --git a/src/core/dev/rfs_mc.h b/src/core/dev/rfs_mc.h index 3288ada91..868af4f13 100644 --- a/src/core/dev/rfs_mc.h +++ b/src/core/dev/rfs_mc.h @@ -54,17 +54,17 @@ class rfs_mc : public rfs { virtual bool prepare_flow_spec(); template - void prepare_flow_spec_by_ip(hw_queue_rx *hqrx_ptr, attach_flow_data_t *&p_attach_flow_data, + void prepare_flow_spec_by_ip(attach_flow_data_t *&p_attach_flow_data, xlio_ibv_flow_spec_eth *&p_eth, xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp); }; template -void rfs_mc::prepare_flow_spec_by_ip(hw_queue_rx *hqrx_ptr, attach_flow_data_t *&p_attach_flow_data, +void rfs_mc::prepare_flow_spec_by_ip(attach_flow_data_t *&p_attach_flow_data, xlio_ibv_flow_spec_eth *&p_eth, xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp) { - T *attach_flow_data_eth = new (std::nothrow) T(hqrx_ptr); + T *attach_flow_data_eth = new (std::nothrow) T(); if (!attach_flow_data_eth) { return; } diff --git a/src/core/dev/rfs_uc.cpp b/src/core/dev/rfs_uc.cpp index 9da1bccfc..ba464a4b7 100644 --- a/src/core/dev/rfs_uc.cpp +++ b/src/core/dev/rfs_uc.cpp @@ -85,11 +85,11 @@ bool rfs_uc::prepare_flow_spec() case XLIO_TRANSPORT_ETH: { bool is_ipv4 = (m_flow_tuple.get_family() == AF_INET); if (is_ipv4) { - prepare_flow_spec_by_ip( - p_ring->m_hqrx, p_attach_flow_data, p_eth, p_tcp_udp); + prepare_flow_spec_by_ip(p_attach_flow_data, p_eth, + p_tcp_udp); } else { - prepare_flow_spec_by_ip( - p_ring->m_hqrx, p_attach_flow_data, p_eth, p_tcp_udp); + prepare_flow_spec_by_ip(p_attach_flow_data, p_eth, + p_tcp_udp); } if (!p_attach_flow_data) { diff --git a/src/core/dev/rfs_uc.h b/src/core/dev/rfs_uc.h index 2fcf0400c..b916d6879 100644 --- a/src/core/dev/rfs_uc.h +++ b/src/core/dev/rfs_uc.h @@ -54,17 +54,17 @@ class rfs_uc : public rfs { virtual bool prepare_flow_spec(); template - void prepare_flow_spec_by_ip(hw_queue_rx *hqrx_ptr, attach_flow_data_t *&p_attach_flow_data, + void prepare_flow_spec_by_ip(attach_flow_data_t *&p_attach_flow_data, xlio_ibv_flow_spec_eth *&p_eth, xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp); }; template -void rfs_uc::prepare_flow_spec_by_ip(hw_queue_rx *hqrx_ptr, attach_flow_data_t *&p_attach_flow_data, +void rfs_uc::prepare_flow_spec_by_ip(attach_flow_data_t *&p_attach_flow_data, xlio_ibv_flow_spec_eth *&p_eth, xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp) { - T *attach_flow_data_eth = new (std::nothrow) T(hqrx_ptr); + T *attach_flow_data_eth = new (std::nothrow) T(); if (!attach_flow_data_eth) { return; } From 70b1bb3872249f1f7be11f6d6a99ea4e3c1eb6e1 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 11 Oct 2023 18:50:11 +0300 Subject: [PATCH 019/169] issue: 3514044 Removing ibv steering flows Signed-off-by: Alexander Grissik --- config/m4/verbs.m4 | 1 - src/core/dev/cq_mgr_rx_regrq.cpp | 2 +- src/core/dev/cq_mgr_rx_strq.cpp | 2 +- src/core/dev/hw_queue_rx.cpp | 7 +- src/core/dev/hw_queue_rx.h | 3 +- src/core/dev/net_device_val.cpp | 5 +- src/core/dev/rfs.cpp | 166 +++++++++++++++++-------------- src/core/dev/rfs.h | 59 ++--------- src/core/dev/rfs_mc.cpp | 70 ++++--------- src/core/dev/rfs_mc.h | 37 +------ src/core/dev/rfs_rule.cpp | 137 +++++++++++++++++++++++++ src/core/dev/rfs_rule.h | 3 +- src/core/dev/rfs_uc.cpp | 89 +++++------------ src/core/dev/rfs_uc.h | 35 +------ src/core/dev/ring_slave.h | 2 +- 
src/core/ib/base/verbs_extra.cpp | 83 ---------------- src/core/ib/base/verbs_extra.h | 126 +---------------------- 17 files changed, 300 insertions(+), 527 deletions(-) create mode 100644 src/core/dev/rfs_rule.cpp diff --git a/config/m4/verbs.m4 b/config/m4/verbs.m4 index 7c2feb870..62ad5a15a 100644 --- a/config/m4/verbs.m4 +++ b/config/m4/verbs.m4 @@ -139,7 +139,6 @@ CHECK_VERBS_ATTRIBUTE([IBV_QPT_RAW_PACKET], [infiniband/verbs.h]) CHECK_VERBS_ATTRIBUTE([IBV_WC_WITH_VLAN], [infiniband/verbs.h]) CHECK_VERBS_ATTRIBUTE([IBV_DEVICE_RAW_IP_CSUM], [infiniband/verbs.h]) CHECK_VERBS_ATTRIBUTE([IBV_SEND_IP_CSUM], [infiniband/verbs.h]) -CHECK_VERBS_ATTRIBUTE([IBV_FLOW_SPEC_ACTION_TAG], [infiniband/verbs.h], [IBV_FLOW_TAG]) CHECK_VERBS_ATTRIBUTE([IBV_WC_EX_WITH_COMPLETION_TIMESTAMP], [infiniband/verbs.h], [IBV_CQ_TIMESTAMP]) CHECK_VERBS_MEMBER([struct ibv_device_attr_ex.orig_attr], [infiniband/verbs.h], [IBV_DEVICE_ATTR_EX]) CHECK_VERBS_MEMBER([struct ibv_alloc_dm_attr.length], [infiniband/verbs.h], [IBV_DM]) diff --git a/src/core/dev/cq_mgr_rx_regrq.cpp b/src/core/dev/cq_mgr_rx_regrq.cpp index 91881f64e..4f12afd4c 100644 --- a/src/core/dev/cq_mgr_rx_regrq.cpp +++ b/src/core/dev/cq_mgr_rx_regrq.cpp @@ -145,7 +145,7 @@ void cq_mgr_rx_regrq::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, p_rx_wc_buf_desc->rx.tls_decrypted = (cqe->pkt_info >> 3) & 0x3; #endif /* DEFINED_UTLS */ p_rx_wc_buf_desc->rx.timestamps.hw_raw = ntohll(cqe->timestamp); - p_rx_wc_buf_desc->rx.flow_tag_id = xlio_get_flow_tag(cqe); + p_rx_wc_buf_desc->rx.flow_tag_id = ntohl((uint32_t)(cqe->sop_drop_qpn)); p_rx_wc_buf_desc->rx.is_sw_csum_need = !(m_b_is_rx_hw_csum_on && (cqe->hds_ip_ext & MLX5_CQE_L4_OK) && (cqe->hds_ip_ext & MLX5_CQE_L3_OK)); diff --git a/src/core/dev/cq_mgr_rx_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp index 647f30d38..daccd1aa2 100644 --- a/src/core/dev/cq_mgr_rx_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ -249,7 +249,7 @@ inline bool cq_mgr_rx_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, _current_wqe_consumed_bytes += _hot_buffer_stride->sz_buffer; _hot_buffer_stride->rx.timestamps.hw_raw = ntohll(cqe->timestamp); - _hot_buffer_stride->rx.flow_tag_id = xlio_get_flow_tag(cqe); + _hot_buffer_stride->rx.flow_tag_id = ntohl((uint32_t)(cqe->sop_drop_qpn)); _hot_buffer_stride->rx.is_sw_csum_need = !(m_b_is_rx_hw_csum_on && (cqe->hds_ip_ext & MLX5_CQE_L4_OK) && (cqe->hds_ip_ext & MLX5_CQE_L3_OK)); diff --git a/src/core/dev/hw_queue_rx.cpp b/src/core/dev/hw_queue_rx.cpp index 1af5d5fed..baf61cc39 100644 --- a/src/core/dev/hw_queue_rx.cpp +++ b/src/core/dev/hw_queue_rx.cpp @@ -274,7 +274,9 @@ void hw_queue_rx::modify_queue_to_error_state() } } -rfs_rule *hw_queue_rx::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext) +rfs_rule *hw_queue_rx::create_rfs_rule(dpcp::match_params &match_value, + dpcp::match_params &match_mask, uint16_t priority, + uint32_t flow_tag, xlio_tir *tir_ext) { if (m_p_ib_ctx_handler && m_p_ib_ctx_handler->get_dpcp_adapter()) { // TLS RX uses tir_ext. 
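With this change the steering entry point takes an explicit dpcp match value/mask pair instead of an ibv_flow_attr blob. A minimal caller-side sketch of the new signature follows; it is illustrative only, the `hqrx` handle and the concrete address/port values are hypothetical, and the field names mirror the dpcp::match_params usage introduced for rfs later in this patch:

```cpp
// Illustrative sketch only: build an IPv4/TCP match and request a steering rule.
// `hqrx` (a hw_queue_rx*) and the address/port literals are assumptions; numeric
// match fields are kept in host byte order, as rfs::prepare_flow_spec_*() does.
dpcp::match_params value;
dpcp::match_params mask;
memset(&value, 0, sizeof(value));
memset(&mask, 0, sizeof(mask));

mask.ethertype = 0xFFFFU;
value.ethertype = ETH_P_IP;
mask.ip_version = 0xF;
value.ip_version = 4U;
mask.protocol = 0xFF;
value.protocol = IPPROTO_TCP;
mask.dst_port = 0xFFFFU;
value.dst_port = 8080U; // local listener port (example value)
mask.dst.ipv4 = 0xFFFFFFFFU;
value.dst.ipv4 = ntohl(inet_addr("192.168.1.10")); // local IP (example value)

// Priority 2 = regular 3-tuple rule, no flow tag, no TLS TIR override.
rfs_rule *rule = hqrx->create_rfs_rule(value, mask, 2U, 0U, nullptr);
if (!rule) {
    // Rule creation failed; see rfs::create_flow() in this patch for the error path.
}
```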
@@ -282,7 +284,8 @@ rfs_rule *hw_queue_rx::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ std::unique_ptr new_rule(new rfs_rule()); if (dpcp_tir && - new_rule->create(attrs, *dpcp_tir, *m_p_ib_ctx_handler->get_dpcp_adapter())) { + new_rule->create(match_value, match_mask, *dpcp_tir, priority, flow_tag, + *m_p_ib_ctx_handler->get_dpcp_adapter())) { return new_rule.release(); } } diff --git a/src/core/dev/hw_queue_rx.h b/src/core/dev/hw_queue_rx.h index eec7f7abc..da0bc9ad9 100644 --- a/src/core/dev/hw_queue_rx.h +++ b/src/core/dev/hw_queue_rx.h @@ -74,7 +74,8 @@ class hw_queue_rx : public xlio_ti_owner { void modify_queue_to_error_state(); void release_rx_buffers(); - rfs_rule *create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext); + rfs_rule *create_rfs_rule(dpcp::match_params &match_value, dpcp::match_params &match_mask, + uint16_t priority, uint32_t flow_tag, xlio_tir *tir_ext); #ifdef DEFINED_UTLS xlio_tir *tls_create_tir(bool cached); diff --git a/src/core/dev/net_device_val.cpp b/src/core/dev/net_device_val.cpp index d65cb3eea..f5de4a6f0 100644 --- a/src/core/dev/net_device_val.cpp +++ b/src/core/dev/net_device_val.cpp @@ -1647,9 +1647,8 @@ bool net_device_val::verify_qp_creation(const char *ifname, enum ibv_qp_type qp_ if (qp) { success = true; - if (qp_type == IBV_QPT_RAW_PACKET && - !priv_ibv_query_flow_tag_supported(qp, port_num, AF_INET) && - !priv_ibv_query_flow_tag_supported(qp, port_num, AF_INET6)) { + // TODO: Add flow_tag capability check on dpcp::adapter + if (qp_type == IBV_QPT_RAW_PACKET) { p_ib_ctx->set_flow_tag_capability(true); } nd_logdbg("verified interface %s for flow tag capabilities : %s", ifname, diff --git a/src/core/dev/rfs.cpp b/src/core/dev/rfs.cpp index 97db2798c..35e8ba97f 100644 --- a/src/core/dev/rfs.cpp +++ b/src/core/dev/rfs.cpp @@ -76,13 +76,10 @@ inline void rfs::filter_keep_attached(rule_filter_map_t::iterator &filter_iter) } // save ibv_flow rule only for filter - if (m_attach_flow_data) { - filter_iter->second.rfs_rule_holder = m_attach_flow_data->rfs_flow; - rfs_logdbg("filter_keep_attached copying rfs_flow, Tag: %" PRIu32 - ", Flow: %s, Ptr: %p, Counter: %d", - m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_attach_flow_data->rfs_flow, - filter_iter->second.counter); - } + filter_iter->second.rfs_rule_holder = m_rfs_flow; + rfs_logdbg( + "filter_keep_attached copying rfs_flow, Tag: %" PRIu32 ", Flow: %s, Ptr: %p, Counter: %d", + m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_rfs_flow, filter_iter->second.counter); } inline void rfs::prepare_filter_detach(int &filter_counter, bool decrease_counter) @@ -115,18 +112,14 @@ inline void rfs::prepare_filter_detach(int &filter_counter, bool decrease_counte } BULLSEYE_EXCLUDE_BLOCK_START - if (m_attach_flow_data) { - if (m_attach_flow_data->rfs_flow && - m_attach_flow_data->rfs_flow != filter_iter->second.rfs_rule_holder) { - rfs_logerr( - "our assumption that there should be only one rule for filter group is wrong"); - } else if (filter_iter->second.rfs_rule_holder) { - m_attach_flow_data->rfs_flow = filter_iter->second.rfs_rule_holder; - rfs_logdbg("prepare_filter_detach copying rfs_flow, Tag: %" PRIu32 - ", Flow: %s, Ptr: %p, Counter: %d", - m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_attach_flow_data->rfs_flow, - filter_iter->second.counter); - } + if (m_rfs_flow && m_rfs_flow != filter_iter->second.rfs_rule_holder) { + rfs_logerr("our assumption that there should be only one rule for filter group is wrong"); + } else if (filter_iter->second.rfs_rule_holder) { + m_rfs_flow = 
filter_iter->second.rfs_rule_holder; + rfs_logdbg("prepare_filter_detach copying rfs_flow, Tag: %" PRIu32 + ", Flow: %s, Ptr: %p, Counter: %d", + m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_rfs_flow, + filter_iter->second.counter); } BULLSEYE_EXCLUDE_BLOCK_END } @@ -141,6 +134,9 @@ rfs::rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_fil , m_flow_tag_id(flow_tag_id) , m_b_tmp_is_attached(false) { + memset(&m_match_value, 0, sizeof(m_match_value)); + memset(&m_match_mask, 0, sizeof(m_match_mask)); + m_sinks_list = new pkt_rcvr_sink *[m_n_sinks_list_max_length]; #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) @@ -182,16 +178,6 @@ rfs::~rfs() m_p_rule_filter = NULL; } delete[] m_sinks_list; - - if (m_attach_flow_data) { - if (reinterpret_cast(&m_attach_flow_data->ibv_flow_attr) - ->eth.val.ether_type == htons(ETH_P_IP)) { - delete reinterpret_cast(m_attach_flow_data); - } else { - delete reinterpret_cast(m_attach_flow_data); - } - m_attach_flow_data = nullptr; - } } bool rfs::add_sink(pkt_rcvr_sink *p_sink) @@ -332,58 +318,46 @@ bool rfs::detach_flow(pkt_rcvr_sink *sink) #ifdef DEFINED_UTLS -template -rfs_rule *create_rule_T(hw_queue_rx *hqrx, xlio_tir *tir, const flow_tuple &flow_spec, - attach_flow_data_t *iter, bool is5T) +rfs_rule *rfs::create_rule(xlio_tir *tir, const flow_tuple &flow_spec) { - auto *p_attr = - reinterpret_cast(&iter->ibv_flow_attr); + auto *hqrx = dynamic_cast(m_p_ring)->m_hqrx; - if (unlikely(p_attr->eth.type != XLIO_IBV_FLOW_SPEC_ETH)) { - // We support only ETH rules for now - return NULL; - } + dpcp::match_params match_value_tmp; + dpcp::match_params match_mask_tmp; + memcpy(&match_value_tmp, &m_match_value, sizeof(m_match_value)); + memcpy(&match_mask_tmp, &m_match_mask, sizeof(m_match_mask)); - auto flow_attr(*p_attr); - if (!is5T) { + if (!m_flow_tuple.is_5_tuple()) { // For UTLS, We need the most specific 5T rule (in case the current rule is 3T). - ibv_flow_spec_set_single_ip(flow_attr.ip.val.src_ip, flow_attr.ip.mask.src_ip, - flow_spec.get_src_ip()); - flow_attr.tcp_udp.val.src_port = flow_spec.get_src_port(); - flow_attr.tcp_udp.mask.src_port = FS_MASK_ON_16; - } - // The highest priority to override TCP rule - flow_attr.attr.priority = 0; - return hqrx->create_rfs_rule(flow_attr.attr, tir); -} -rfs_rule *rfs::create_rule(xlio_tir *tir, const flow_tuple &flow_spec) -{ - auto *hqrx = dynamic_cast(m_p_ring)->m_hqrx; - if (m_attach_flow_data) { - if (m_flow_tuple.get_family() == AF_INET) { - return create_rule_T( - hqrx, tir, flow_spec, m_attach_flow_data, m_flow_tuple.is_5_tuple()); + if (match_value_tmp.ethertype == ETH_P_IP) { + match_mask_tmp.src.ipv4 = flow_spec.get_src_ip().is_anyaddr() ? 0U : 0xFFFFFFFFU; + match_value_tmp.src.ipv4 = ntohl(flow_spec.get_src_ip().get_in4_addr().s_addr); + } else { + memset(match_mask_tmp.src.ipv6, flow_spec.get_src_ip().is_anyaddr() ? 
0U : 0xFFU, + sizeof(match_mask_tmp.src.ipv6)); + memcpy(match_value_tmp.src.ipv6, &flow_spec.get_src_ip().get_in6_addr(), + sizeof(match_value_tmp.src.ipv6)); } - return create_rule_T( - hqrx, tir, flow_spec, m_attach_flow_data, m_flow_tuple.is_5_tuple()); + match_mask_tmp.src_port = 0xFFFFU; + match_value_tmp.src_port = ntohs(flow_spec.get_src_port()); } - return nullptr; + // The highest priority to override TCP rule + return hqrx->create_rfs_rule(match_value_tmp, match_mask_tmp, 0, m_flow_tag_id, tir); } #endif /* DEFINED_UTLS */ bool rfs::create_flow() { - m_attach_flow_data->rfs_flow = dynamic_cast(m_p_ring)->m_hqrx->create_rfs_rule( - m_attach_flow_data->ibv_flow_attr, NULL); - if (!m_attach_flow_data->rfs_flow) { + m_rfs_flow = dynamic_cast(m_p_ring)->m_hqrx->create_rfs_rule( + m_match_value, m_match_mask, m_priority, m_flow_tag_id, nullptr); + if (!m_rfs_flow) { rfs_logerr("Create RFS flow failed, Tag: %" PRIu32 ", Flow: %s, Priority: %" PRIu16 ", errno: %d - %m", - m_flow_tag_id, m_flow_tuple.to_str().c_str(), - m_attach_flow_data->ibv_flow_attr.priority, errno); + m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_priority, errno); return false; } @@ -396,17 +370,14 @@ bool rfs::create_flow() bool rfs::destroy_flow() { - if (m_attach_flow_data) { - if (unlikely(!m_attach_flow_data->rfs_flow)) { - rfs_logdbg("Destroy RFS flow failed, RFS flow was not created. " - "This is OK for MC same ip diff port scenario. Tag: %" PRIu32 - ", Flow: %s, Priority: %" PRIu16, - m_flow_tag_id, m_flow_tuple.to_str().c_str(), - m_attach_flow_data->ibv_flow_attr.priority); - } else { - delete m_attach_flow_data->rfs_flow; - m_attach_flow_data->rfs_flow = nullptr; - } + if (unlikely(!m_rfs_flow)) { + rfs_logdbg("Destroy RFS flow failed, RFS flow was not created. " + "This is OK for MC same ip diff port scenario. Tag: %" PRIu32 + ", Flow: %s, Priority: %" PRIu16, + m_flow_tag_id, m_flow_tuple.to_str().c_str(), m_priority); + } else { + delete m_rfs_flow; + m_rfs_flow = nullptr; } m_b_tmp_is_attached = false; @@ -415,3 +386,48 @@ bool rfs::destroy_flow() return true; } + +void rfs::prepare_flow_spec_eth_ip(const ip_address &dst_ip, const ip_address &src_ip) +{ + ring_simple *p_ring = dynamic_cast(m_p_ring); + + if (!p_ring) { + rfs_logpanic("Incompatible ring type"); + } + + m_match_value.vlan_id = p_ring->m_hqrx->get_vlan() & VLAN_VID_MASK; + m_match_mask.vlan_id = (p_ring->m_hqrx->get_vlan() ? VLAN_VID_MASK : 0); + + bool is_ipv4 = (m_flow_tuple.get_family() == AF_INET); + if (is_ipv4) { + m_match_mask.dst.ipv4 = dst_ip.is_anyaddr() ? 0U : 0xFFFFFFFFU; + m_match_value.dst.ipv4 = ntohl(dst_ip.get_in4_addr().s_addr); + m_match_mask.src.ipv4 = src_ip.is_anyaddr() ? 0U : 0xFFFFFFFFU; + m_match_value.src.ipv4 = ntohl(src_ip.get_in4_addr().s_addr); + m_match_mask.ip_version = 0xF; + m_match_value.ip_version = 4U; + m_match_mask.ethertype = 0xFFFFU; + m_match_value.ethertype = ETH_P_IP; + } else { + memset(m_match_mask.dst.ipv6, dst_ip.is_anyaddr() ? 0U : 0xFFU, + sizeof(m_match_mask.dst.ipv6)); + memcpy(m_match_value.dst.ipv6, &dst_ip.get_in6_addr(), sizeof(m_match_value.dst.ipv6)); + memset(m_match_mask.src.ipv6, src_ip.is_anyaddr() ? 
0U : 0xFFU, + sizeof(m_match_mask.src.ipv6)); + memcpy(m_match_value.src.ipv6, &src_ip.get_in6_addr(), sizeof(m_match_value.src.ipv6)); + m_match_mask.ip_version = 0xF; + m_match_value.ip_version = 6U; + m_match_mask.ethertype = 0xFFFFU; + m_match_value.ethertype = ETH_P_IPV6; + } +} + +void rfs::prepare_flow_spec_tcp_udp() +{ + m_match_mask.dst_port = (m_flow_tuple.get_dst_port() ? 0xFFFFU : 0U); + m_match_value.dst_port = ntohs(m_flow_tuple.get_dst_port()); + m_match_mask.src_port = (m_flow_tuple.get_src_port() ? 0xFFFFU : 0U); + m_match_value.src_port = ntohs(m_flow_tuple.get_src_port()); + m_match_mask.protocol = 0xFF; + m_match_value.protocol = (m_flow_tuple.get_protocol() == PROTO_TCP ? IPPROTO_TCP : IPPROTO_UDP); +} diff --git a/src/core/dev/rfs.h b/src/core/dev/rfs.h index d9b899c79..7b631ba40 100644 --- a/src/core/dev/rfs.h +++ b/src/core/dev/rfs.h @@ -34,7 +34,7 @@ #define RFS_H #include - +#include #include "ib/base/verbs_extra.h" #include "util/vtypes.h" #include "dev/ring_simple.h" @@ -56,53 +56,6 @@ class pkt_rcvr_sink; * shadow for socket reuse feature. */ -/* ETHERNET - */ - -typedef struct ibv_flow_attr_eth { - xlio_ibv_flow_attr attr; - xlio_ibv_flow_spec_eth eth; -} ibv_flow_attr_eth; - -template struct attach_flow_data_eth_ip_tcp_udp_t { - rfs_rule *rfs_flow; - struct ibv_flow_attr_eth_ip_tcp_udp : public ibv_flow_attr_eth { - T ip; - xlio_ibv_flow_spec_tcp_udp tcp_udp; - xlio_ibv_flow_spec_action_tag flow_tag; // must be the last as struct can be used without it - - ibv_flow_attr_eth_ip_tcp_udp() - { - memset(this, 0, sizeof(*this)); - attr.size = sizeof(T) - sizeof(flow_tag); - attr.num_of_specs = 3; - attr.type = XLIO_IBV_FLOW_ATTR_NORMAL; - attr.priority = 2; // almost highest priority, 1 is used for 5-tuple later - attr.port = 0; - } - inline void add_flow_tag_spec(void) - { - attr.num_of_specs++; - attr.size += sizeof(flow_tag); - } - } ibv_flow_attr; - attach_flow_data_eth_ip_tcp_udp_t() - : rfs_flow(NULL) - , ibv_flow_attr() - { - } -}; - -typedef attach_flow_data_eth_ip_tcp_udp_t - attach_flow_data_eth_ipv4_tcp_udp_t; -typedef attach_flow_data_eth_ip_tcp_udp_t - attach_flow_data_eth_ipv6_tcp_udp_t; - -typedef struct attach_flow_data_t { - rfs_rule *rfs_flow; - xlio_ibv_flow_attr ibv_flow_attr; -} attach_flow_data_t; - class rfs_rule_filter { public: rfs_rule_filter(rule_filter_map_t &map, const sock_addr &key, flow_tuple &flow_tuple) @@ -154,19 +107,25 @@ class rfs { flow_tuple m_flow_tuple; ring_slave *m_p_ring; rfs_rule_filter *m_p_rule_filter; - attach_flow_data_t *m_attach_flow_data = nullptr; + rfs_rule *m_rfs_flow = nullptr; pkt_rcvr_sink **m_sinks_list; uint32_t m_n_sinks_list_entries; // Number of actual sinks in the array (we shrink the array if // a sink is removed) uint32_t m_n_sinks_list_max_length; uint32_t m_flow_tag_id; // Associated with this rule, set by attach_flow() + uint16_t m_priority = 2U; // Almost highest priority, 1 is used for 5-tuple later bool m_b_tmp_is_attached; // Only temporary, while ibcm calls attach_flow with no sinks... 
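+    // Match value/mask for the dpcp steering rule: filled by the prepare_flow_spec*()
+    // helpers (numeric fields in host byte order) and handed to
+    // hw_queue_rx::create_rfs_rule() by create_flow() / create_rule().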
+ dpcp::match_params m_match_value; + dpcp::match_params m_match_mask; + bool create_flow(); // Attach flow to all queues bool destroy_flow(); // Detach flow from all queues bool add_sink(pkt_rcvr_sink *p_sink); bool del_sink(pkt_rcvr_sink *p_sink); - virtual bool prepare_flow_spec() = 0; + void prepare_flow_spec_eth_ip(const ip_address &dst_ip, const ip_address &src_ip); + void prepare_flow_spec_tcp_udp(); + virtual void prepare_flow_spec() = 0; private: rfs(); // I don't want anyone to use the default constructor diff --git a/src/core/dev/rfs_mc.cpp b/src/core/dev/rfs_mc.cpp index 5df2d1b51..6aaa4f251 100644 --- a/src/core/dev/rfs_mc.cpp +++ b/src/core/dev/rfs_mc.cpp @@ -55,69 +55,33 @@ rfs_mc::rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, } BULLSEYE_EXCLUDE_BLOCK_END - if (m_p_ring->is_simple() && !prepare_flow_spec()) { - throw_xlio_exception("IB multicast offload is not supported"); + if (m_p_ring->is_simple()) { + prepare_flow_spec(); } } -bool rfs_mc::prepare_flow_spec() +void rfs_mc::prepare_flow_spec() { - ring_simple *p_ring = dynamic_cast(m_p_ring); + const ip_address &dst_ip = + (safe_mce_sys().eth_mc_l2_only_rules ? ip_address::any_addr() : m_flow_tuple.get_dst_ip()); - if (!p_ring) { - rfs_logpanic("Incompatible ring type"); - } - - transport_type_t type = p_ring->get_transport_type(); + prepare_flow_spec_eth_ip(dst_ip, ip_address::any_addr()); - /* - * todo note that ring is not locked here. - * we touch members that should not change during the ring life. - * the ring will not be deleted as we increased refcnt. - * if one of these assumptions change, we must lock. - */ - attach_flow_data_t *p_attach_flow_data = nullptr; - xlio_ibv_flow_spec_eth *p_eth = nullptr; - xlio_ibv_flow_spec_tcp_udp *p_tcp_udp = nullptr; + uint8_t dst_mac[6]; + create_multicast_mac_from_ip(dst_mac, m_flow_tuple.get_dst_ip(), m_flow_tuple.get_family()); - switch (type) { - case XLIO_TRANSPORT_ETH: { - bool is_ipv4 = (m_flow_tuple.get_family() == AF_INET); - if (is_ipv4) { - prepare_flow_spec_by_ip(p_attach_flow_data, p_eth, - p_tcp_udp); - } else { - prepare_flow_spec_by_ip(p_attach_flow_data, p_eth, - p_tcp_udp); - } - - if (!p_attach_flow_data) { - return false; - } + memset(&m_match_mask.dst_mac, 0xFF, sizeof(m_match_mask.dst_mac)); + memcpy(&m_match_value.dst_mac, dst_mac, sizeof(dst_mac)); - uint8_t dst_mac[6]; - create_multicast_mac_from_ip(dst_mac, m_flow_tuple.get_dst_ip(), m_flow_tuple.get_family()); - ibv_flow_spec_eth_set(p_eth, dst_mac, htons(p_ring->m_hqrx->get_vlan()), is_ipv4); + if (safe_mce_sys().eth_mc_l2_only_rules) { + m_match_mask.dst_port = m_match_value.dst_port = m_match_mask.src_port = + m_match_value.src_port = 0U; - if (safe_mce_sys().eth_mc_l2_only_rules) { - ibv_flow_spec_tcp_udp_set(p_tcp_udp, 0, 0, 0); - } else { - ibv_flow_spec_tcp_udp_set(p_tcp_udp, (m_flow_tuple.get_protocol() == PROTO_TCP), - m_flow_tuple.get_dst_port(), m_flow_tuple.get_src_port()); - } - - break; + m_match_mask.protocol = 0xFF; + m_match_value.protocol = IPPROTO_UDP; + } else { + prepare_flow_spec_tcp_udp(); } - BULLSEYE_EXCLUDE_BLOCK_START - default: - rfs_logpanic("Incompatible transport type = %d", type); - return false; - break; - BULLSEYE_EXCLUDE_BLOCK_END - } - - m_attach_flow_data = p_attach_flow_data; - return true; } bool rfs_mc::rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array) diff --git a/src/core/dev/rfs_mc.h b/src/core/dev/rfs_mc.h index 868af4f13..6a51cff4b 100644 --- a/src/core/dev/rfs_mc.h +++ b/src/core/dev/rfs_mc.h @@ -48,42 +48,11 @@ class 
rfs_mc : public rfs { rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = NULL, int32_t flow_tag_id = 0); - virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array); + virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, + void *pv_fd_ready_array) override; protected: - virtual bool prepare_flow_spec(); - - template - void prepare_flow_spec_by_ip(attach_flow_data_t *&p_attach_flow_data, - xlio_ibv_flow_spec_eth *&p_eth, - xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp); + void prepare_flow_spec() override; }; -template -void rfs_mc::prepare_flow_spec_by_ip(attach_flow_data_t *&p_attach_flow_data, - xlio_ibv_flow_spec_eth *&p_eth, - xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp) -{ - T *attach_flow_data_eth = new (std::nothrow) T(); - if (!attach_flow_data_eth) { - return; - } - - p_eth = &(attach_flow_data_eth->ibv_flow_attr.eth); - p_tcp_udp = &(attach_flow_data_eth->ibv_flow_attr.tcp_udp); - p_attach_flow_data = reinterpret_cast(attach_flow_data_eth); - - const ip_address &dst_ip = - (safe_mce_sys().eth_mc_l2_only_rules ? ip_address::any_addr() : m_flow_tuple.get_dst_ip()); - - ibv_flow_spec_ip_set(&(attach_flow_data_eth->ibv_flow_attr.ip), dst_ip, ip_address::any_addr()); - - if (m_flow_tag_id) { // Will not attach flow_tag spec to rule for tag_id==0 - ibv_flow_spec_flow_tag_set(&(attach_flow_data_eth->ibv_flow_attr.flow_tag), m_flow_tag_id); - attach_flow_data_eth->ibv_flow_attr.add_flow_tag_spec(); - } -} - -#undef MODULE_NAME - #endif /* RFS_MC_H */ diff --git a/src/core/dev/rfs_rule.cpp b/src/core/dev/rfs_rule.cpp new file mode 100644 index 000000000..80b908ead --- /dev/null +++ b/src/core/dev/rfs_rule.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "dev/rfs_rule.h" + +#include +#include "dev/rfs.h" + +#define MODULE_NAME "rfs_rule" + +#define rfs_logpanic __log_info_panic +#define rfs_logerr __log_info_err +#define rfs_logwarn __log_info_warn +#define rfs_loginfo __log_info_info +#define rfs_logdbg __log_info_dbg +#define rfs_logfunc __log_info_func +#define rfs_logfuncall __log_info_funcall + +bool rfs_rule::create(dpcp::match_params &match_value, dpcp::match_params &match_mask, + dpcp::tir &in_tir, uint16_t priority, uint32_t flow_tag, + dpcp::adapter &in_adapter) +{ + rfs_logdbg("Creating flow dpcp_adpater::create_flow_rule(), priority %" PRIu16 + ", flow_tag: %" PRIu32, + priority, flow_tag); + rfs_logdbg("match_mask:\n" + "ethertype: 0x%04" PRIx16 ", vlan_id: 0x%04" PRIx16 ", protocol: 0x%02" PRIx8 + ", ip_version: 0x%02" PRIx8 "\n" + "dst_port: 0x%04" PRIx16 ", src_ports: 0x%04" PRIx16 "\n" + "src_ip: ipv4: 0x%08" PRIx32 ", ipv6: 0x%016" PRIx64 "%016" PRIx64 "\n" + "dst_ip: ipv4: 0x%08" PRIx32 ", ipv6: 0x%016" PRIx64 "%016" PRIx64 "\n" + "dst_mac: 0x%016" PRIx64, + match_mask.ethertype, match_mask.vlan_id, match_mask.protocol, match_mask.ip_version, + match_mask.dst_port, match_mask.src_port, match_mask.src.ipv4, + *reinterpret_cast(match_mask.src.ipv6 + 8), + *reinterpret_cast(match_mask.src.ipv6), match_mask.dst.ipv4, + *reinterpret_cast(match_mask.dst.ipv6 + 8), + *reinterpret_cast(match_mask.dst.ipv6), + *reinterpret_cast(match_mask.dst_mac)); + rfs_logdbg("match_value:\n" + "ethertype: 0x%04" PRIx16 ", vlan_id: %" PRIu16 ", protocol: %" PRIu8 + ", ip_version: %" PRIu8 "\n" + "dst_port: %" PRIu16 ", src_ports: %" PRIu16 "\n" + "src_ip: ipv4: 0x%08" PRIx32 ", ipv6: 0x%016" PRIx64 "%016" PRIx64 "\n" + "dst_ip: ipv4: 0x%08" PRIx32 ", ipv6: 0x%016" PRIx64 "%016" PRIx64 "\n" + "dst_mac: 0x%016" PRIx64, + match_value.ethertype, match_value.vlan_id, match_value.protocol, + match_value.ip_version, match_value.dst_port, match_value.src_port, + match_value.src.ipv4, *reinterpret_cast(match_value.src.ipv6 + 8), + *reinterpret_cast(match_value.src.ipv6), match_value.dst.ipv4, + *reinterpret_cast(match_value.dst.ipv6 + 8), + *reinterpret_cast(match_value.dst.ipv6), + *reinterpret_cast(match_value.dst_mac)); + + dpcp::flow_rule *new_rule = nullptr; + dpcp::status status_out = in_adapter.create_flow_rule(priority, match_mask, new_rule); + if (status_out != dpcp::DPCP_OK) { + rfs_logerr("Failed dpcp_adpater::create_flow_rule(), Priority %" PRIu16 ", Status: %d", + priority, static_cast(status_out)); + return false; + } + + rfs_logdbg("Succeeded dpcp_adpater::create_flow_rule(), Priority %" PRIu16 + ", rfs_rule %p, dpcp_flow: %p", + priority, this, new_rule); + + _dpcp_flow.reset(new_rule); + + status_out = _dpcp_flow->set_match_value(match_value); + if (status_out != dpcp::DPCP_OK) { + rfs_logerr("Failed dpcp_flow_rule::set_match_value(), Status: %d, dpcp_flow: %p", + static_cast(status_out), new_rule); + return false; + } + + status_out = _dpcp_flow->add_dest_tir(&in_tir); + if (status_out != dpcp::DPCP_OK) { + rfs_logerr("Failed dpcp_flow_rule::add_dest_tir(), Status: %d, dpcp_flow: %p", + static_cast(status_out), new_rule); + return false; + } + + uint32_t tirn = 0U; + in_tir.get_id(tirn); + rfs_logdbg("Added dpcp_flow_rule::add_dest_tir() TIR %" PRIu32 ", dpcp_flow: %p", tirn, + new_rule); + + if (flow_tag) { + rfs_logdbg("Setting flow tag dpcp_adpater::set_flow_id(), Tag: %" PRIu32 ", dpcp_flow: %p", + flow_tag, new_rule); + + status_out = _dpcp_flow->set_flow_id(flow_tag); + if (status_out != dpcp::DPCP_OK) { + 
rfs_logerr("Failed dpcp_flow_rule::set_flow_id(), Status: %d, dpcp_flow: %p", + static_cast(status_out), new_rule); + return false; + } + } + + status_out = _dpcp_flow->apply_settings(); + if (status_out != dpcp::DPCP_OK) { + rfs_logerr("Failed dpcp_flow_rule::apply_settings(), Status: %d, dpcp_flow: %p", + static_cast(status_out), new_rule); + return false; + } + + return true; +} diff --git a/src/core/dev/rfs_rule.h b/src/core/dev/rfs_rule.h index dbbc481a8..0ce39394e 100644 --- a/src/core/dev/rfs_rule.h +++ b/src/core/dev/rfs_rule.h @@ -39,7 +39,8 @@ class rfs_rule { public: - bool create(const xlio_ibv_flow_attr &attrs, dpcp::tir &in_tir, dpcp::adapter &in_adapter); + bool create(dpcp::match_params &match_value, dpcp::match_params &match_mask, dpcp::tir &in_tir, + uint16_t priority, uint32_t flow_tag, dpcp::adapter &in_adapter); private: std::unique_ptr _dpcp_flow; diff --git a/src/core/dev/rfs_uc.cpp b/src/core/dev/rfs_uc.cpp index ba464a4b7..67de9a958 100644 --- a/src/core/dev/rfs_uc.cpp +++ b/src/core/dev/rfs_uc.cpp @@ -58,63 +58,25 @@ rfs_uc::rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *ru } BULLSEYE_EXCLUDE_BLOCK_END - if (m_p_ring->is_simple() && !prepare_flow_spec()) { - throw_xlio_exception("rfs_uc: Incompatible transport type"); + if (m_p_ring->is_simple()) { + prepare_flow_spec(); } } -bool rfs_uc::prepare_flow_spec() +void rfs_uc::prepare_flow_spec() { - ring_simple *p_ring = dynamic_cast(m_p_ring); + prepare_flow_spec_eth_ip(m_flow_tuple.get_dst_ip(), m_flow_tuple.get_src_ip()); + prepare_flow_spec_tcp_udp(); - if (!p_ring) { - rfs_logpanic("Incompatible ring type"); - } - - /* - * todo note that ring is not locked here. - * we touch members that should not change during the ring life. - * the ring will not be deleted as we increased refcnt. - * if one of these assumptions change, we must lock. - */ - attach_flow_data_t *p_attach_flow_data = nullptr; - xlio_ibv_flow_spec_eth *p_eth = nullptr; - xlio_ibv_flow_spec_tcp_udp *p_tcp_udp = nullptr; - - switch (p_ring->get_transport_type()) { - case XLIO_TRANSPORT_ETH: { - bool is_ipv4 = (m_flow_tuple.get_family() == AF_INET); - if (is_ipv4) { - prepare_flow_spec_by_ip(p_attach_flow_data, p_eth, - p_tcp_udp); - } else { - prepare_flow_spec_by_ip(p_attach_flow_data, p_eth, - p_tcp_udp); - } - - if (!p_attach_flow_data) { - return false; - } - - ibv_flow_spec_eth_set(p_eth, p_ring->m_p_l2_addr->get_address(), - htons(p_ring->m_hqrx->get_vlan()), is_ipv4); - - break; - } - BULLSEYE_EXCLUDE_BLOCK_START - default: - return false; - break; - BULLSEYE_EXCLUDE_BLOCK_END - } - - ibv_flow_spec_tcp_udp_set(p_tcp_udp, (m_flow_tuple.get_protocol() == PROTO_TCP), - m_flow_tuple.get_dst_port(), m_flow_tuple.get_src_port()); + memset(&m_match_mask.dst_mac, 0xFF, sizeof(m_match_mask.dst_mac)); + memcpy(&m_match_value.dst_mac, + dynamic_cast(m_p_ring)->m_p_l2_addr->get_address(), + sizeof(m_match_value.dst_mac)); if (m_flow_tuple.get_src_port() || !m_flow_tuple.get_src_ip().is_anyaddr()) { - // set priority of 5-tuple to be higher than 3-tuple - // to make sure 5-tuple have higher priority on ConnectX-4 - p_attach_flow_data->ibv_flow_attr.priority = 1; + // Set priority of 5-tuple to be higher than 3-tuple + // to make sure 5-tuple have higher priority. 
+ m_priority = 1; } #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) else if (g_p_app->type != APP_NONE && g_p_app->get_worker_id() >= 0) { @@ -131,25 +93,24 @@ bool rfs_uc::prepare_flow_spec() } else { src_port = g_p_app->get_worker_id(); } - p_tcp_udp->val.src_port = htons((uint16_t)src_port * g_p_app->src_port_stride); - p_tcp_udp->mask.src_port = - htons((uint16_t)((g_p_app->workers_pow2 * g_p_app->src_port_stride) - 2)); - p_attach_flow_data->ibv_flow_attr.priority = 1; - rfs_logdbg("src_port_stride: %d workers_num %d \n", g_p_app->src_port_stride, - g_p_app->workers_num); + + m_match_mask.src_port = static_cast( + (g_p_app->workers_pow2 * g_p_app->src_port_stride) - 2); + m_match_value.src_port = + static_cast(src_port * g_p_app->src_port_stride); + + m_priority = 1; + rfs_logdbg("src_port_stride: %d workers_num %d \n", + g_p_app->src_port_stride, g_p_app->workers_num); rfs_logdbg("sp_tcp_udp->val.src_port: %d p_tcp_udp->mask.src_port %d \n", - ntohs(p_tcp_udp->val.src_port), ntohs(p_tcp_udp->mask.src_port)); - m_flow_tuple.set_src_port(p_tcp_udp->val.src_port); + m_match_value.src_port, m_match_mask.src_port); + + m_flow_tuple.set_src_port(m_match_value.src_port); } } #endif - rfs_logfunc("transport type: %d, num_of_specs: %d flow_tag_id: %d", - p_ring->get_transport_type(), p_attach_flow_data->ibv_flow_attr.num_of_specs, - m_flow_tag_id); - - m_attach_flow_data = p_attach_flow_data; - return true; + rfs_logfunc("Transport type: %d, flow_tag_id: %d", p_ring->get_transport_type(), m_flow_tag_id); } bool rfs_uc::rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array) diff --git a/src/core/dev/rfs_uc.h b/src/core/dev/rfs_uc.h index b916d6879..b59835619 100644 --- a/src/core/dev/rfs_uc.h +++ b/src/core/dev/rfs_uc.h @@ -48,40 +48,11 @@ class rfs_uc : public rfs { rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = NULL, uint32_t flow_tag_id = 0); - virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array); + virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, + void *pv_fd_ready_array) override; protected: - virtual bool prepare_flow_spec(); - - template - void prepare_flow_spec_by_ip(attach_flow_data_t *&p_attach_flow_data, - xlio_ibv_flow_spec_eth *&p_eth, - xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp); + virtual void prepare_flow_spec() override; }; -template -void rfs_uc::prepare_flow_spec_by_ip(attach_flow_data_t *&p_attach_flow_data, - xlio_ibv_flow_spec_eth *&p_eth, - xlio_ibv_flow_spec_tcp_udp *&p_tcp_udp) -{ - T *attach_flow_data_eth = new (std::nothrow) T(); - if (!attach_flow_data_eth) { - return; - } - - decltype(T::ibv_flow_attr_eth_ip_tcp_udp::ip) *p_ip = &(attach_flow_data_eth->ibv_flow_attr.ip); - p_eth = &(attach_flow_data_eth->ibv_flow_attr.eth); - p_tcp_udp = &(attach_flow_data_eth->ibv_flow_attr.tcp_udp); - p_attach_flow_data = reinterpret_cast(attach_flow_data_eth); - - ibv_flow_spec_ip_set(p_ip, m_flow_tuple.get_dst_ip(), m_flow_tuple.get_src_ip()); - - if (m_flow_tag_id) { // Will not attach flow_tag spec to rule for tag_id==0 - ibv_flow_spec_flow_tag_set(&(attach_flow_data_eth->ibv_flow_attr.flow_tag), m_flow_tag_id); - attach_flow_data_eth->ibv_flow_attr.add_flow_tag_spec(); - } -} - -#undef MODULE_NAME - #endif /* RFS_UC_H */ diff --git a/src/core/dev/ring_slave.h b/src/core/dev/ring_slave.h index 25d905a9f..8c7c8d0e2 100644 --- a/src/core/dev/ring_slave.h +++ b/src/core/dev/ring_slave.h @@ -233,7 +233,7 @@ inline bool operator==(flow_spec_4t_key_ipv6 const &key1, 
flow_spec_4t_key_ipv6 struct counter_and_ibv_flows { int counter; - rfs_rule *rfs_rule_holder; + rfs_rule *rfs_rule_holder = nullptr; }; typedef std::unordered_map rule_filter_map_t; diff --git a/src/core/ib/base/verbs_extra.cpp b/src/core/ib/base/verbs_extra.cpp index dc7cf746e..65a661793 100644 --- a/src/core/ib/base/verbs_extra.cpp +++ b/src/core/ib/base/verbs_extra.cpp @@ -261,89 +261,6 @@ int priv_ibv_query_burst_supported(struct ibv_qp *qp, uint8_t port_num) return -1; } -int priv_ibv_query_flow_tag_supported(struct ibv_qp *qp, uint8_t port_num, sa_family_t family) -{ - NOT_IN_USE(qp); - NOT_IN_USE(port_num); - int res = -1; - -#ifdef DEFINED_IBV_FLOW_TAG - - // Create - struct { - xlio_ibv_flow_attr attr; - xlio_ibv_flow_spec_eth eth; - xlio_ibv_flow_spec_ipv4 ipv4; - xlio_ibv_flow_spec_tcp_udp tcp_udp; - xlio_ibv_flow_spec_action_tag flow_tag; - } ft_attr_ipv4; - - struct { - xlio_ibv_flow_attr attr; - xlio_ibv_flow_spec_eth eth; - xlio_ibv_flow_spec_ipv6 ipv6; - xlio_ibv_flow_spec_tcp_udp tcp_udp; - xlio_ibv_flow_spec_action_tag flow_tag; - } ft_attr_ipv6; - - xlio_ibv_flow_attr *p_attr = nullptr; - xlio_ibv_flow_spec_eth *p_eth = nullptr; - xlio_ibv_flow_spec_tcp_udp *p_tcp_udp = nullptr; - xlio_ibv_flow_spec_action_tag *p_flow_tag = nullptr; - - // Initialize - if (family == AF_INET) { - memset(&ft_attr_ipv4, 0, sizeof(ft_attr_ipv4)); - p_attr = &(ft_attr_ipv4.attr); - p_eth = &(ft_attr_ipv4.eth); - p_tcp_udp = &(ft_attr_ipv4.tcp_udp); - p_flow_tag = &(ft_attr_ipv4.flow_tag); - p_attr->size = sizeof(ft_attr_ipv4); - } else { - memset(&ft_attr_ipv6, 0, sizeof(ft_attr_ipv6)); - p_attr = &(ft_attr_ipv6.attr); - p_eth = &(ft_attr_ipv6.eth); - p_tcp_udp = &(ft_attr_ipv6.tcp_udp); - p_flow_tag = &(ft_attr_ipv6.flow_tag); - p_attr->size = sizeof(ft_attr_ipv6); - } - - p_attr->num_of_specs = 4; - p_attr->type = XLIO_IBV_FLOW_ATTR_NORMAL; - p_attr->priority = 2; // almost highest priority, 1 is used for 5-tuple later - p_attr->port = port_num; - - // Set filters - uint8_t mac_0[ETH_ALEN] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; - uint8_t mac_f[ETH_ALEN] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}; - - bool is_ipv4 = (family == AF_INET); - ibv_flow_spec_eth_set(p_eth, mac_0, 0, is_ipv4); // L2 filter - memcpy(p_eth->val.src_mac, mac_f, ETH_ALEN); - memset(p_eth->mask.src_mac, FS_MASK_ON_8, ETH_ALEN); - - if (is_ipv4) { - ibv_flow_spec_ip_set(&ft_attr_ipv4.ipv4, ip_address::loopback4_addr(), - ip_address::loopback4_addr()); // L3 filter - } else { - ibv_flow_spec_ip_set(&ft_attr_ipv6.ipv6, ip_address::loopback6_addr(), - ip_address::loopback6_addr()); // L3 filter - } - - ibv_flow_spec_tcp_udp_set(p_tcp_udp, true, 0, 0); // L4 filter - ibv_flow_spec_flow_tag_set(p_flow_tag, FLOW_TAG_MASK - 1); // enable flow tag - - // Create flow - xlio_ibv_flow *ibv_flow = xlio_ibv_create_flow(qp, p_attr); - if (ibv_flow) { - res = 0; - xlio_ibv_destroy_flow(ibv_flow); - } -#endif // DEFINED_IBV_FLOW_TAG - - return res; -} - int xlio_rdma_lib_reset() { #ifdef HAVE_RDMA_LIB_RESET diff --git a/src/core/ib/base/verbs_extra.h b/src/core/ib/base/verbs_extra.h index 9279f6501..344ae02dd 100644 --- a/src/core/ib/base/verbs_extra.h +++ b/src/core/ib/base/verbs_extra.h @@ -105,7 +105,6 @@ void priv_ibv_modify_cq_moderation(struct ibv_cq *cq, uint32_t period, uint32_t #define FS_MASK_ON_64 (0xffffffffffffffff) #define FLOW_TAG_MASK ((1 << 20) - 1) -int priv_ibv_query_flow_tag_supported(struct ibv_qp *qp, uint8_t port_num, sa_family_t family); int priv_ibv_query_burst_supported(struct ibv_qp *qp, uint8_t port_num); /* 
DEFINED_VERBS_VERSION: @@ -169,9 +168,6 @@ typedef int xlio_ibv_cq_init_attr; ibv_create_cq(context, cqe, cq_context, channel, comp_vector) // rx hw timestamp -#define XLIO_IBV_WC_WITH_TIMESTAMP 0 -#define xlio_wc_timestamp(wc) 0 - #ifdef DEFINED_IBV_CQ_TIMESTAMP #define XLIO_IBV_DEVICE_ATTR_HCA_CORE_CLOCK 0 #define XLIO_IBV_VALUES_MASK_RAW_CLOCK IBV_VALUES_MASK_RAW_CLOCK @@ -181,14 +177,12 @@ typedef struct ibv_values_ex xlio_ts_values; #endif // ibv_post_send -#define XLIO_IBV_SEND_SIGNALED IBV_SEND_SIGNALED -#define XLIO_IBV_SEND_INLINE IBV_SEND_INLINE +#define XLIO_IBV_SEND_INLINE IBV_SEND_INLINE #ifdef DEFINED_IBV_SEND_IP_CSUM #define XLIO_IBV_SEND_IP_CSUM (IBV_SEND_IP_CSUM) #else #define DEFINED_SW_CSUM #endif -#define xlio_ibv_send_flags ibv_send_flags #define xlio_send_wr_send_flags(wr) (wr).send_flags #define XLIO_IBV_WR_SEND IBV_WR_SEND #define xlio_ibv_wr_opcode ibv_wr_opcode @@ -214,38 +208,9 @@ typedef struct ibv_tso_caps xlio_ibv_tso_caps; (xlio_ibv_wr_opcode)(0) // Use 0 as "default" opcode when NOP is not defined. #endif -#define xlio_ibv_post_send(qp, wr, bad_wr) ibv_post_send(qp, wr, bad_wr) typedef struct ibv_send_wr xlio_ibv_send_wr; // ibv_reg_mr #define XLIO_IBV_ACCESS_LOCAL_WRITE IBV_ACCESS_LOCAL_WRITE -// flow steering -#define XLIO_IBV_FLOW_ATTR_NORMAL IBV_FLOW_ATTR_NORMAL -#define XLIO_IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK -#define XLIO_IBV_FLOW_SPEC_ETH IBV_FLOW_SPEC_ETH -#define XLIO_IBV_FLOW_SPEC_IPV4 IBV_FLOW_SPEC_IPV4 -#define XLIO_IBV_FLOW_SPEC_IPV6 IBV_FLOW_SPEC_IPV6 -#define XLIO_IBV_FLOW_SPEC_TCP IBV_FLOW_SPEC_TCP -#define XLIO_IBV_FLOW_SPEC_UDP IBV_FLOW_SPEC_UDP -#define xlio_ibv_create_flow(qp, flow) ibv_create_flow(qp, flow) -#define xlio_ibv_destroy_flow(flow_id) ibv_destroy_flow(flow_id) -typedef struct ibv_flow xlio_ibv_flow; -typedef struct ibv_flow_attr xlio_ibv_flow_attr; -typedef struct ibv_flow_spec_ib xlio_ibv_flow_spec_ib; -typedef struct ibv_flow_spec_eth xlio_ibv_flow_spec_eth; -typedef struct ibv_flow_spec_ipv4 xlio_ibv_flow_spec_ipv4; -typedef struct ibv_flow_spec_ipv6 xlio_ibv_flow_spec_ipv6; -typedef struct ibv_flow_spec_tcp_udp xlio_ibv_flow_spec_tcp_udp; - -// Flow tag -#ifdef DEFINED_IBV_FLOW_TAG -#define XLIO_IBV_FLOW_SPEC_ACTION_TAG IBV_FLOW_SPEC_ACTION_TAG -typedef struct ibv_flow_spec_action_tag xlio_ibv_flow_spec_action_tag; -#define xlio_get_flow_tag(cqe) ntohl((uint32_t)(cqe->sop_drop_qpn)) -#else -typedef struct ibv_flow_spec_action_tag_dummy { -} xlio_ibv_flow_spec_action_tag; -#define xlio_get_flow_tag(cqe) 0 -#endif // DEFINED_IBV_FLOW_TAG #ifdef DEFINED_IBV_CQ_ATTR_MODERATE typedef struct ibv_modify_cq_attr xlio_ibv_cq_attr; @@ -347,93 +312,4 @@ typedef enum { int xlio_rdma_lib_reset(); -static inline void ibv_flow_spec_eth_set(xlio_ibv_flow_spec_eth *eth, uint8_t *dst_mac, - uint16_t vlan_tag, bool is_ipv4) -{ - eth->type = XLIO_IBV_FLOW_SPEC_ETH; - eth->size = sizeof(xlio_ibv_flow_spec_eth); - eth->val.ether_type = ntohs(is_ipv4 ? ETH_P_IP : ETH_P_IPV6); - eth->mask.ether_type = FS_MASK_ON_16; - memcpy(eth->val.dst_mac, dst_mac, ETH_ALEN); - memset(eth->mask.dst_mac, FS_MASK_ON_8, ETH_ALEN); - eth->val.vlan_tag = vlan_tag & htons(VLAN_VID_MASK); - eth->mask.vlan_tag = - eth->val.vlan_tag ? 
htons(VLAN_VID_MASK) : 0; // we do not support vlan options -} - -template -static inline void ibv_flow_spec_set_single_ip(T &spec_ip_val, T &spec_ip_mask, - const ip_address &src_ip) -{ -} - -typedef decltype(ibv_flow_ipv4_filter::src_ip) spec_ipv4_type; -template <> -inline void ibv_flow_spec_set_single_ip(spec_ipv4_type &spec_ip_val, spec_ipv4_type &spec_ip_mask, - const ip_address &in_ip) -{ - memcpy(&spec_ip_val, &in_ip.get_in4_addr(), sizeof(spec_ipv4_type)); - spec_ip_mask = (!in_ip.is_anyaddr() ? FS_MASK_ON_32 : 0U); -} - -typedef decltype(ibv_flow_ipv6_filter::src_ip) spec_ipv6_type; -template <> -inline void ibv_flow_spec_set_single_ip(spec_ipv6_type &spec_ip_val, spec_ipv6_type &spec_ip_mask, - const ip_address &in_ip) -{ - memcpy(&spec_ip_val, &in_ip.get_in6_addr(), sizeof(spec_ipv6_type)); - memset(&spec_ip_mask, in_ip.is_anyaddr() ? 0 : 0xff, sizeof(spec_ipv6_type)); -} - -static inline void ibv_flow_spec_ip_set(xlio_ibv_flow_spec_ipv4 *ipv4, const ip_address &dst_ip, - const ip_address &src_ip) -{ - ipv4->type = XLIO_IBV_FLOW_SPEC_IPV4; - ipv4->size = sizeof(xlio_ibv_flow_spec_ipv4); - ibv_flow_spec_set_single_ip(ipv4->val.src_ip, ipv4->mask.src_ip, src_ip); - ibv_flow_spec_set_single_ip(ipv4->val.dst_ip, ipv4->mask.dst_ip, dst_ip); -} - -static inline void ibv_flow_spec_ip_set(xlio_ibv_flow_spec_ipv6 *ipv6, const ip_address &dst_ip, - const ip_address &src_ip) -{ - ipv6->type = XLIO_IBV_FLOW_SPEC_IPV6; - ipv6->size = sizeof(xlio_ibv_flow_spec_ipv6); - ibv_flow_spec_set_single_ip(ipv6->val.src_ip, ipv6->mask.src_ip, src_ip); - ibv_flow_spec_set_single_ip(ipv6->val.dst_ip, ipv6->mask.dst_ip, dst_ip); - ipv6->val.flow_label = ipv6->mask.flow_label = 0U; - ipv6->val.next_hdr = ipv6->mask.next_hdr = 0U; - ipv6->val.traffic_class = ipv6->mask.traffic_class = 0U; - ipv6->val.hop_limit = ipv6->mask.hop_limit = 0U; -} - -static inline void ibv_flow_spec_tcp_udp_set(xlio_ibv_flow_spec_tcp_udp *tcp_udp, bool is_tcp, - uint16_t dst_port, uint16_t src_port) -{ - tcp_udp->type = is_tcp ? 
XLIO_IBV_FLOW_SPEC_TCP : XLIO_IBV_FLOW_SPEC_UDP; - tcp_udp->size = sizeof(xlio_ibv_flow_spec_tcp_udp); - tcp_udp->val.src_port = src_port; - if (tcp_udp->val.src_port) { - tcp_udp->mask.src_port = FS_MASK_ON_16; - } - tcp_udp->val.dst_port = dst_port; - if (tcp_udp->val.dst_port) { - tcp_udp->mask.dst_port = FS_MASK_ON_16; - } -} - -static inline void ibv_flow_spec_flow_tag_set(xlio_ibv_flow_spec_action_tag *flow_tag, - uint32_t tag_id) -{ - NOT_IN_USE(tag_id); - if (flow_tag == NULL) { - return; - } -#ifdef DEFINED_IBV_FLOW_TAG - flow_tag->type = XLIO_IBV_FLOW_SPEC_ACTION_TAG; - flow_tag->size = sizeof(xlio_ibv_flow_spec_action_tag); - flow_tag->tag_id = tag_id; -#endif // DEFINED_IBV_FLOW_TAG -} - #endif From bc3e7284b98e6f5f1bcb72fd031c19ab47bd5b5d Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 15 Oct 2023 17:24:19 +0300 Subject: [PATCH 020/169] issue: 3514044 Adding flow tag check through dpcp::adapter Signed-off-by: Alexander Grissik --- src/core/dev/ib_ctx_handler.cpp | 39 +++++++++++---------------------- src/core/dev/ib_ctx_handler.h | 3 +-- src/core/dev/net_device_val.cpp | 8 ------- 3 files changed, 14 insertions(+), 36 deletions(-) diff --git a/src/core/dev/ib_ctx_handler.cpp b/src/core/dev/ib_ctx_handler.cpp index cd4889127..60ec40d24 100644 --- a/src/core/dev/ib_ctx_handler.cpp +++ b/src/core/dev/ib_ctx_handler.cpp @@ -73,30 +73,10 @@ ib_ctx_handler::ib_ctx_handler(struct ib_ctx_handler_desc *desc) } m_p_ibv_context = NULL; -#ifdef DEFINED_DPCP m_p_adapter = set_dpcp_adapter(); - if (NULL == m_p_adapter) -#endif /* DEFINED_DPCP */ - { -#if defined(DEFINED_ROCE_LAG) - struct mlx5dv_context_attr dv_attr; - - memset(&dv_attr, 0, sizeof(dv_attr)); - dv_attr.flags |= MLX5DV_CONTEXT_FLAGS_DEVX; - m_p_ibv_context = mlx5dv_open_device(m_p_ibv_device, &dv_attr); -#endif /* DEFINED_ROCE_LAG */ - if (m_p_ibv_context == NULL) { - m_p_ibv_context = ibv_open_device(m_p_ibv_device); - } - if (m_p_ibv_context == NULL) { - ibch_logpanic("m_p_ibv_context is invalid"); - } - // Create pd for this device - m_p_ibv_pd = ibv_alloc_pd(m_p_ibv_context); - if (m_p_ibv_pd == NULL) { - ibch_logpanic("ibv device %p pd allocation failure (ibv context %p) (errno=%d %m)", - m_p_ibv_device, m_p_ibv_context, errno); - } + if (!m_p_adapter) { + ibch_logpanic("ibv device %p adapter allocation failure (errno=%d %m)", + m_p_ibv_device, errno); } VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_pd, sizeof(struct ibv_pd)); @@ -235,8 +215,6 @@ void ib_ctx_handler::print_val() ibch_logdbg("%s", m_str); } -#ifdef DEFINED_DPCP - int parse_dpcp_version(const char *dpcp_ver) { static const std::string s_delimiter("."); @@ -355,6 +333,7 @@ dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() m_p_adapter = adapter; m_p_ibv_context = ctx; m_p_ibv_pd = pd; + check_capabilities(); ibch_logdbg("dpcp adapter: %s is up", adapter->get_name().c_str()); } @@ -369,7 +348,15 @@ dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() return m_p_adapter; } -#endif /* DEFINED_DPCP */ + +void ib_ctx_handler::check_capabilities() { + dpcp::adapter_hca_capabilities caps; + dpcp::status rc = m_p_adapter->get_hca_capabilities(caps); + if (rc == dpcp::DPCP_OK) { + set_flow_tag_capability(caps.flow_table_caps.receive.is_flow_action_tag_supported); + ibch_logerr("Flow Tag Support: %s", get_flow_tag_capability() ? 
"Yes" : "No"); + } +} void ib_ctx_handler::set_ctx_time_converter_status(ts_conversion_mode_t conversion_mode) { diff --git a/src/core/dev/ib_ctx_handler.h b/src/core/dev/ib_ctx_handler.h index 7ece4da68..1b225e1e9 100644 --- a/src/core/dev/ib_ctx_handler.h +++ b/src/core/dev/ib_ctx_handler.h @@ -78,10 +78,9 @@ class ib_ctx_handler : public event_handler_ibverbs { ibv_device *get_ibv_device() { return m_p_ibv_device; } inline char *get_ibname() { return (m_p_ibv_device ? m_p_ibv_device->name : (char *)""); } struct ibv_context *get_ibv_context() { return m_p_ibv_context; } -#ifdef DEFINED_DPCP dpcp::adapter *set_dpcp_adapter(); dpcp::adapter *get_dpcp_adapter() { return m_p_adapter; } -#endif /* DEFINED_DPCP */ + void check_capabilities(); xlio_ibv_device_attr *get_ibv_device_attr() { return xlio_get_device_orig_attr(m_p_ibv_device_attr); diff --git a/src/core/dev/net_device_val.cpp b/src/core/dev/net_device_val.cpp index f5de4a6f0..a184c3054 100644 --- a/src/core/dev/net_device_val.cpp +++ b/src/core/dev/net_device_val.cpp @@ -1646,14 +1646,6 @@ bool net_device_val::verify_qp_creation(const char *ifname, enum ibv_qp_type qp_ qp = xlio_ibv_create_qp(p_ib_ctx->get_ibv_pd(), &qp_init_attr); if (qp) { success = true; - - // TODO: Add flow_tag capability check on dpcp::adapter - if (qp_type == IBV_QPT_RAW_PACKET) { - p_ib_ctx->set_flow_tag_capability(true); - } - nd_logdbg("verified interface %s for flow tag capabilities : %s", ifname, - p_ib_ctx->get_flow_tag_capability() ? "enabled" : "disabled"); - if (qp_type == IBV_QPT_RAW_PACKET && p_ib_ctx->is_packet_pacing_supported() && !priv_ibv_query_burst_supported(qp, port_num)) { p_ib_ctx->set_burst_capability(true); From ea7dfd093afa2d43d469518ee5a8e46eadafdb64 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 15 Oct 2023 18:36:08 +0300 Subject: [PATCH 021/169] issue: 3514044 Require dpcp for configure and CI Signed-off-by: Alexander Grissik --- .ci/matrix_job.yaml | 1 - README.md | 11 ---- config/m4/dpcp.m4 | 89 +++++++++++++++------------------ contrib/test_jenkins.sh | 6 +-- src/core/dev/ib_ctx_handler.cpp | 6 +-- src/core/dev/ib_ctx_handler.h | 5 -- src/core/dev/ring_simple.cpp | 2 - src/core/dev/ring_simple.h | 3 +- tests/gtest/nvme/nvme.cc | 2 - 9 files changed, 45 insertions(+), 80 deletions(-) diff --git a/.ci/matrix_job.yaml b/.ci/matrix_job.yaml index cd7970a70..6e4b29ca1 100644 --- a/.ci/matrix_job.yaml +++ b/.ci/matrix_job.yaml @@ -51,7 +51,6 @@ matrix: axes: flags: - default - - dpcp arch: - x86_64 - aarch64 diff --git a/README.md b/README.md index 5726648da..13a7d6671 100644 --- a/README.md +++ b/README.md @@ -64,17 +64,6 @@ $ make -j $ make install ``` -### Building XLIO without dpcp - -```sh -$ ./autogen.sh -$ ./configure --prefix=/where/to/install -$ make -j -$ make install -``` - -Advanced HW features are not enabled for this build type. 
- ### Usage Examples #### Sockperf diff --git a/config/m4/dpcp.m4 b/config/m4/dpcp.m4 index 2c57680bc..8b85fa27c 100644 --- a/config/m4/dpcp.m4 +++ b/config/m4/dpcp.m4 @@ -60,60 +60,57 @@ get_min_supported_version() } AC_ARG_WITH([dpcp], - AS_HELP_STRING([--with-dpcp(=DIR)], - [Search for dpcp headers and libraries in DIR (default NO)]), + AS_HELP_STRING([--with-dpcp@<:@=DIR@:>@], + [Search for dpcp headers and libraries in DIR @<:@default: /usr@:>@]), [], - [with_dpcp=no] + [] ) -if test "x$prj_cv_directverbs" != x3 && test "x$with_dpcp" != xno; then - AC_MSG_ERROR([dpcp can be used under RDMA-core subsystem only]) +if test "x$prj_cv_directverbs" != x3; then + AC_MSG_ERROR([RDMA-core subsystem required]) fi prj_cv_dpcp=0 -AS_IF([test "x$with_dpcp" == xno], - [], - [ - if test -z "$with_dpcp" || test "$with_dpcp" = "yes"; then - with_dpcp=/usr - fi +if test -z "$with_dpcp" || test "$with_dpcp" = "yes"; then + with_dpcp=/usr +fi - FUNC_CHECK_WITHDIR([dpcp], [$with_dpcp], [include/mellanox/dpcp.h]) +FUNC_CHECK_WITHDIR([dpcp], [$with_dpcp], [include/mellanox/dpcp.h]) - prj_cv_dpcp_save_CPPFLAGS="$CPPFLAGS" - prj_cv_dpcp_save_CXXFLAGS="$CXXFLAGS" - prj_cv_dpcp_save_CFLAGS="$CFLAGS" - prj_cv_dpcp_save_LDFLAGS="$LDFLAGS" - prj_cv_dpcp_save_LIBS="$LIBS" +prj_cv_dpcp_save_CPPFLAGS="$CPPFLAGS" +prj_cv_dpcp_save_CXXFLAGS="$CXXFLAGS" +prj_cv_dpcp_save_CFLAGS="$CFLAGS" +prj_cv_dpcp_save_LDFLAGS="$LDFLAGS" +prj_cv_dpcp_save_LIBS="$LIBS" - prj_cv_dpcp_CPPFLAGS="-I$with_dpcp/include" - prj_cv_dpcp_LIBS="-ldpcp -lmlx5" - prj_cv_dpcp_LDFLAGS="-L$with_dpcp/lib -Wl,--rpath,$with_dpcp/lib" - if test -d "$with_dpcp/lib64"; then - prj_cv_dpcp_LDFLAGS="-L$with_dpcp/lib64 -Wl,--rpath,$with_dpcp/lib64" - fi +prj_cv_dpcp_CPPFLAGS="-I$with_dpcp/include" +prj_cv_dpcp_LIBS="-ldpcp -lmlx5" +prj_cv_dpcp_LDFLAGS="-L$with_dpcp/lib -Wl,--rpath,$with_dpcp/lib" +if test -d "$with_dpcp/lib64"; then + prj_cv_dpcp_LDFLAGS="-L$with_dpcp/lib64 -Wl,--rpath,$with_dpcp/lib64" +fi - CPPFLAGS="$prj_cv_dpcp_CPPFLAGS $CPPFLAGS" - CXXFLAGS="-std=c++11 $CXXFLAGS" - LDFLAGS="$prj_cv_dpcp_LDFLAGS $LDFLAGS" - LIBS="$prj_cv_dpcp_LIBS $LIBS" - - AC_LANG_PUSH([C++]) - AC_CHECK_HEADER( - [mellanox/dpcp.h], - [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], - [[dpcp::provider *provider; - dpcp::provider::get_instance(provider);]])], - [prj_cv_dpcp=1]) - ]) - AC_LANG_POP() - - CPPFLAGS="$prj_cv_dpcp_save_CPPFLAGS" - CXXFLAGS="$prj_cv_dpcp_save_CXXFLAGS" - CFLAGS="$prj_cv_dpcp_save_CFLAGS" - LDFLAGS="$prj_cv_dpcp_save_LDFLAGS" - LIBS="$prj_cv_dpcp_save_LIBS" +CPPFLAGS="$prj_cv_dpcp_CPPFLAGS $CPPFLAGS" +CXXFLAGS="-std=c++11 $CXXFLAGS" +LDFLAGS="$prj_cv_dpcp_LDFLAGS $LDFLAGS" +LIBS="$prj_cv_dpcp_LIBS $LIBS" + +AC_LANG_PUSH([C++]) +AC_CHECK_HEADER( + [mellanox/dpcp.h], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], + [[dpcp::provider *provider; + dpcp::provider::get_instance(provider);]])], + [prj_cv_dpcp=1]) ]) +AC_LANG_POP() + +CPPFLAGS="$prj_cv_dpcp_save_CPPFLAGS" +CXXFLAGS="$prj_cv_dpcp_save_CXXFLAGS" +CFLAGS="$prj_cv_dpcp_save_CFLAGS" +LDFLAGS="$prj_cv_dpcp_save_LDFLAGS" +LIBS="$prj_cv_dpcp_save_LIBS" + AC_MSG_CHECKING([for dpcp support]) if test "$prj_cv_dpcp" -ne 0; then @@ -124,16 +121,12 @@ if test "$prj_cv_dpcp" -ne 0; then min_supported_version=($(get_min_supported_version)) if test "$dpcp_version_number" -ge "$min_supported_version"; then - AC_DEFINE_UNQUOTED([DEFINED_DPCP], [$dpcp_version_number], [Define to DPCP version number (major * 10000 + minor * 100 + patch)]) AC_DEFINE_UNQUOTED([DEFINED_DPCP_MIN], [$min_supported_version], 
[Define to DPCP version number (major * 10000 + minor * 100 + patch)]) AC_MSG_RESULT([yes]) else - AC_MSG_RESULT([no]) AC_MSG_ERROR([found incompatible dpcp version $dpcp_version_number (min supported version $min_supported_version) ]) fi else - AS_IF([test "x$with_dpcp" == xno], - [AC_MSG_RESULT([no])], - [AC_MSG_ERROR([dpcp support requested but not present])]) + AC_MSG_ERROR([dpcp support requested but not present]) fi ]) diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh index ea8209144..9e89b8ae2 100755 --- a/contrib/test_jenkins.sh +++ b/contrib/test_jenkins.sh @@ -103,13 +103,9 @@ do_check_env TARGET=${TARGET:=all} i=0 if [ "$TARGET" == "all" -o "$TARGET" == "default" ]; then - target_list[$i]="default: --disable-nginx" - i=$((i+1)) -fi -if [ "$TARGET" == "all" -o "$TARGET" == "dpcp" ]; then do_check_dpcp opt_value if [ ! -z "${opt_value}" ]; then - target_list[$i]="dpcp: --enable-nginx --with-dpcp=${opt_value}" + target_list[$i]="default: --enable-nginx --with-dpcp=${opt_value}" i=$((i+1)) else echo "Requested dpcp support can not be executed" diff --git a/src/core/dev/ib_ctx_handler.cpp b/src/core/dev/ib_ctx_handler.cpp index 60ec40d24..6e055d2be 100644 --- a/src/core/dev/ib_ctx_handler.cpp +++ b/src/core/dev/ib_ctx_handler.cpp @@ -118,12 +118,11 @@ ib_ctx_handler::ib_ctx_handler(struct ib_ctx_handler_desc *desc) ibv_dealloc_pd(m_p_ibv_pd); } -#ifdef DEFINED_DPCP if (m_p_adapter) { delete m_p_adapter; m_p_ibv_context = NULL; } -#endif /* DEFINED_DPCP */ + if (m_p_ibv_context) { ibv_close_device(m_p_ibv_context); m_p_ibv_context = NULL; @@ -159,12 +158,11 @@ ib_ctx_handler::~ib_ctx_handler() } delete m_p_ibv_device_attr; -#ifdef DEFINED_DPCP if (m_p_adapter) { delete m_p_adapter; m_p_ibv_context = NULL; } -#endif /* DEFINED_DPCP */ + if (m_p_ibv_context) { ibv_close_device(m_p_ibv_context); m_p_ibv_context = NULL; diff --git a/src/core/dev/ib_ctx_handler.h b/src/core/dev/ib_ctx_handler.h index 1b225e1e9..973c11c40 100644 --- a/src/core/dev/ib_ctx_handler.h +++ b/src/core/dev/ib_ctx_handler.h @@ -40,10 +40,7 @@ #include "dev/time_converter.h" #include "ib/base/verbs_extra.h" #include "utils/lock_wrapper.h" - -#ifdef DEFINED_DPCP #include -#endif /* DEFINED_DPCP */ typedef std::unordered_map mr_map_lkey_t; @@ -115,9 +112,7 @@ class ib_ctx_handler : public event_handler_ibverbs { void handle_event_device_fatal(); ibv_device *m_p_ibv_device; // HCA handle struct ibv_context *m_p_ibv_context; -#ifdef DEFINED_DPCP dpcp::adapter *m_p_adapter; -#endif /* DEFINED_DPCP */ xlio_ibv_device_attr_ex *m_p_ibv_device_attr; ibv_pd *m_p_ibv_pd; bool m_flow_tag_enabled; diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index 717cd9ca2..0b1d2383c 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -272,7 +272,6 @@ void ring_simple::create_resources() memset(&m_lro, 0, sizeof(m_lro)); if ((safe_mce_sys().enable_lro == option_3::ON) || ((safe_mce_sys().enable_lro == option_3::AUTO) && (1 == validate_lro(get_if_index())))) { -#if defined(DEFINED_DPCP) dpcp::adapter_hca_capabilities caps; if (m_p_ib_ctx->get_dpcp_adapter() && @@ -298,7 +297,6 @@ void ring_simple::create_resources() m_lro.max_payload_sz = std::min(actual_buf_size, XLIO_MLX5_PARAMS_LRO_PAYLOAD_SIZE) / 256U * 256U; } -#endif /* DEFINED_DPCP */ } ring_logdbg("ring attributes: m_lro = %d", m_lro.cap); ring_logdbg("ring attributes: m_lro:psh_flag = %d", m_lro.psh_flag); diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index 7cd0378cc..91376e17b 100644 --- 
a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -224,7 +224,7 @@ class ring_simple : public ring_slave { m_hqtx->tls_tx_post_dump_wqe(tis, addr, len, lkey, first); } #endif /* DEFINED_UTLS */ -#ifdef DEFINED_DPCP + std::unique_ptr create_tis(uint32_t flags) const override { std::lock_guard lock(m_lock_ring_tx); @@ -255,7 +255,6 @@ class ring_simple : public ring_slave { std::lock_guard lock(m_lock_ring_tx); m_hqtx->nvme_set_progress_context(tis, tcp_seqno); } -#endif /* DEFINED_DPCP */ void post_nop_fence(void) override { diff --git a/tests/gtest/nvme/nvme.cc b/tests/gtest/nvme/nvme.cc index df4542a02..98c2ede98 100644 --- a/tests/gtest/nvme/nvme.cc +++ b/tests/gtest/nvme/nvme.cc @@ -44,7 +44,6 @@ using namespace std; -#ifdef DEFINED_DPCP using test_iovec = vector; static ssize_t total_test_iovec_size(test_iovec &pdus) @@ -642,4 +641,3 @@ TEST_F(nvme_tx, send_multiple_pdus) server_process(pid, rx_iovs); } } -#endif /* DEFINED_DPCP */ From e2acc5122ae19973c22ebfaf230180150b4b0654 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 16 Oct 2023 11:07:24 +0300 Subject: [PATCH 022/169] issue: 3514044 Rebasing changes on top 3.20.5 with coverity fixes Signed-off-by: Alexander Grissik --- src/core/dev/cq_mgr_mlx5.inl | 63 -- src/core/dev/cq_mgr_tx.cpp | 13 +- src/core/dev/cq_mgr_tx.h | 2 +- src/core/dev/hw_queue_rx.cpp | 4 +- src/core/dev/hw_queue_rx.h | 2 +- src/core/dev/hw_queue_tx.cpp | 5 +- src/core/dev/hw_queue_tx.h | 16 +- src/core/dev/ib_ctx_handler.cpp | 9 +- src/core/dev/qp_mgr_eth_mlx5.cpp | 1484 ------------------------- src/core/dev/qp_mgr_eth_mlx5.h | 232 ---- src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp | 347 ------ src/core/dev/qp_mgr_eth_mlx5_dpcp.h | 77 -- src/core/dev/rfs.cpp | 23 +- src/core/dev/rfs.h | 1 + src/core/dev/rfs_rule.cpp | 2 +- src/core/dev/rfs_rule_dpcp.cpp | 160 --- src/core/dev/rfs_rule_dpcp.h | 60 - src/core/dev/rfs_rule_ibv.cpp | 65 -- src/core/dev/rfs_rule_ibv.h | 59 - src/core/dev/rfs_uc.cpp | 21 +- src/core/dev/ring_bond.cpp | 3 + src/core/dev/xlio_ti.h | 2 +- src/core/main.cpp | 14 +- src/core/sock/sockinfo_ulp.cpp | 7 +- 24 files changed, 69 insertions(+), 2602 deletions(-) delete mode 100644 src/core/dev/cq_mgr_mlx5.inl delete mode 100644 src/core/dev/qp_mgr_eth_mlx5.cpp delete mode 100644 src/core/dev/qp_mgr_eth_mlx5.h delete mode 100644 src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp delete mode 100644 src/core/dev/qp_mgr_eth_mlx5_dpcp.h delete mode 100644 src/core/dev/rfs_rule_dpcp.cpp delete mode 100644 src/core/dev/rfs_rule_dpcp.h delete mode 100644 src/core/dev/rfs_rule_ibv.cpp delete mode 100644 src/core/dev/rfs_rule_ibv.h diff --git a/src/core/dev/cq_mgr_mlx5.inl b/src/core/dev/cq_mgr_mlx5.inl deleted file mode 100644 index 38549ed4d..000000000 --- a/src/core/dev/cq_mgr_mlx5.inl +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CQ_MGR_MLX5_INL_H -#define CQ_MGR_MLX5_INL_H - -#include "dev/cq_mgr_regrq.h" - -#if defined(DEFINED_DIRECT_VERBS) - -/**/ -/** inlining functions can only help if they are implemented before their usage **/ -/**/ -inline struct xlio_mlx5_cqe *cq_mgr_mlx5::check_cqe(void) -{ - struct xlio_mlx5_cqe *cqe = - (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + - ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) - << m_mlx5_cq.cqe_size_log)); - /* - * CQE ownership is defined by Owner bit in the CQE. - * The value indicating SW ownership is flipped every - * time CQ wraps around. - * */ - if (likely((MLX5_CQE_OPCODE(cqe->op_own)) != MLX5_CQE_INVALID) && - !((MLX5_CQE_OWNER(cqe->op_own)) ^ !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count))) { - return cqe; - } - - return NULL; -} - -#endif /* DEFINED_DIRECT_VERBS */ -#endif // CQ_MGR_MLX5_INL_H diff --git a/src/core/dev/cq_mgr_tx.cpp b/src/core/dev/cq_mgr_tx.cpp index c144d7dfa..9f2ba0a6a 100644 --- a/src/core/dev/cq_mgr_tx.cpp +++ b/src/core/dev/cq_mgr_tx.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -33,6 +33,7 @@ #include "dev/cq_mgr_tx.h" #include #include +#include #include "ring_simple.h" #include "hw_queue_tx.h" @@ -146,13 +147,13 @@ void cq_mgr_tx::configure(int cq_size) struct ibv_context *context = m_p_ib_ctx_handler->get_ibv_context(); int comp_vector = 0; -#if defined(DEFINED_NGINX) +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) /* - * For NGINX scenario we may want to distribute CQs across multiple - * CPUs to improve CPS in case of multiple NGINX worker processes. + * For some scenario with forking usage we may want to distribute CQs across multiple + * CPUs to improve CPS in case of multiple processes. */ - if (safe_mce_sys().nginx_distribute_cq_interrupts) { - comp_vector = g_worker_index % context->num_comp_vectors; + if (safe_mce_sys().app.distribute_cq_interrupts && g_p_app->get_worker_id() >= 0) { + comp_vector = g_p_app->get_worker_id() % context->num_comp_vectors; } #endif m_p_ibv_cq = xlio_ibv_create_cq(context, cq_size - 1, (void *)this, m_comp_event_channel, diff --git a/src/core/dev/cq_mgr_tx.h b/src/core/dev/cq_mgr_tx.h index 91c17bc52..f5b5b7fec 100644 --- a/src/core/dev/cq_mgr_tx.h +++ b/src/core/dev/cq_mgr_tx.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU diff --git a/src/core/dev/hw_queue_rx.cpp b/src/core/dev/hw_queue_rx.cpp index baf61cc39..55c5d569d 100644 --- a/src/core/dev/hw_queue_rx.cpp +++ b/src/core/dev/hw_queue_rx.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -212,7 +212,7 @@ void hw_queue_rx::release_rx_buffers() m_last_posted_rx_wr_id); uintptr_t last_polled_rx_wr_id = 0; while (m_p_cq_mgr_rx && last_polled_rx_wr_id != m_last_posted_rx_wr_id && errno != EIO && - && !is_rq_empty() && !m_p_ib_ctx_handler->is_removed()) { + !is_rq_empty() && !m_p_ib_ctx_handler->is_removed()) { // Process the FLUSH'ed WQE's int ret = m_p_cq_mgr_rx->drain_and_proccess(&last_polled_rx_wr_id); diff --git a/src/core/dev/hw_queue_rx.h b/src/core/dev/hw_queue_rx.h index da0bc9ad9..7aaec82a2 100644 --- a/src/core/dev/hw_queue_rx.h +++ b/src/core/dev/hw_queue_rx.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/src/core/dev/hw_queue_tx.cpp b/src/core/dev/hw_queue_tx.cpp index 24368d738..9fbb922b0 100644 --- a/src/core/dev/hw_queue_tx.cpp +++ b/src/core/dev/hw_queue_tx.cpp @@ -301,7 +301,7 @@ int hw_queue_tx::configure(const slave_data_t *slave) } } #endif /* DEFINED_ROCE_LAG */ - + NOT_IN_USE(slave); return 0; } @@ -1053,7 +1053,8 @@ void hw_queue_tx::nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) } #if defined(DEFINED_UTLS) -std::unique_ptr hw_queue_tx::get_new_tls_dek(const void *key, uint32_t key_size_bytes) +std::unique_ptr hw_queue_tx::get_new_tls_dek(const void *key, + uint32_t key_size_bytes) { dpcp::tls_dek *_dek = nullptr; dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); diff --git a/src/core/dev/hw_queue_tx.h b/src/core/dev/hw_queue_tx.h index f2ec6675d..e96c3a535 100644 --- a/src/core/dev/hw_queue_tx.h +++ b/src/core/dev/hw_queue_tx.h @@ -108,16 +108,16 @@ class hw_queue_tx : public xlio_ti_owner { void dm_release_data(mem_buf_desc_t *buff) { m_dm_mgr.release_data(buff); } #ifdef DEFINED_UTLS - xlio_tis *tls_context_setup_tx(const xlio_tls_info *info) override; - xlio_tir *tls_create_tir(bool cached) override; + xlio_tis *tls_context_setup_tx(const xlio_tls_info *info); + xlio_tir *tls_create_tir(bool cached); int tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t next_record_tcp_sn, xlio_comp_cb_t callback, void *callback_arg); - void tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) override; - void tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn) override; - void tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) override; - void tls_release_tis(xlio_tis *tis) override; - void tls_release_tir(xlio_tir *tir) override; - void tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first) override; + void tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static); + void tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t hw_resync_tcp_sn); + 
void tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey); + void tls_release_tis(xlio_tis *tis); + void tls_release_tir(xlio_tir *tir); + void tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first); #endif /* DEFINED_UTLS */ #define DPCP_TIS_FLAGS (dpcp::TIS_ATTR_TRANSPORT_DOMAIN | dpcp::TIS_ATTR_PD) diff --git a/src/core/dev/ib_ctx_handler.cpp b/src/core/dev/ib_ctx_handler.cpp index 6e055d2be..b645c22b1 100644 --- a/src/core/dev/ib_ctx_handler.cpp +++ b/src/core/dev/ib_ctx_handler.cpp @@ -75,8 +75,8 @@ ib_ctx_handler::ib_ctx_handler(struct ib_ctx_handler_desc *desc) m_p_ibv_context = NULL; m_p_adapter = set_dpcp_adapter(); if (!m_p_adapter) { - ibch_logpanic("ibv device %p adapter allocation failure (errno=%d %m)", - m_p_ibv_device, errno); + ibch_logpanic("ibv device %p adapter allocation failure (errno=%d %m)", m_p_ibv_device, + errno); } VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_pd, sizeof(struct ibv_pd)); @@ -347,12 +347,13 @@ dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() return m_p_adapter; } -void ib_ctx_handler::check_capabilities() { +void ib_ctx_handler::check_capabilities() +{ dpcp::adapter_hca_capabilities caps; dpcp::status rc = m_p_adapter->get_hca_capabilities(caps); if (rc == dpcp::DPCP_OK) { set_flow_tag_capability(caps.flow_table_caps.receive.is_flow_action_tag_supported); - ibch_logerr("Flow Tag Support: %s", get_flow_tag_capability() ? "Yes" : "No"); + ibch_logdbg("Flow Tag Support: %s", get_flow_tag_capability() ? "Yes" : "No"); } } diff --git a/src/core/dev/qp_mgr_eth_mlx5.cpp b/src/core/dev/qp_mgr_eth_mlx5.cpp deleted file mode 100644 index a5a1b00e7..000000000 --- a/src/core/dev/qp_mgr_eth_mlx5.cpp +++ /dev/null @@ -1,1484 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "qp_mgr_eth_mlx5.h" - -#if defined(DEFINED_DIRECT_VERBS) - -#include -#include -#include "cq_mgr_rx_regrq.h" -#include "proto/tls.h" -#include "util/utils.h" -#include "vlogger/vlogger.h" -#include "ring_simple.h" - -#undef MODULE_NAME -#define MODULE_NAME "qpm_mlx5" - -#if !defined(MLX5_ETH_INLINE_HEADER_SIZE) -#define MLX5_ETH_INLINE_HEADER_SIZE 18 -#endif - -#define OCTOWORD 16 -#define WQEBB 64 - -//#define DBG_DUMP_WQE 1 - -#ifdef DBG_DUMP_WQE -#define dbg_dump_wqe(_addr, _size) \ - { \ - uint32_t *_wqe = _addr; \ - qp_logfunc("Dumping %d bytes from %p", _size, _wqe); \ - for (int i = 0; i < (int)_size / 4; i += 4) { \ - qp_logfunc("%08x %08x %08x %08x", ntohl(_wqe[i + 0]), ntohl(_wqe[i + 1]), \ - ntohl(_wqe[i + 2]), ntohl(_wqe[i + 3])); \ - } \ - } -#else -#define dbg_dump_wqe(_addr, _size) -#endif - -static inline uint64_t align_to_octoword_up(uint64_t val) -{ - return ((val + 16 - 1) >> 4) << 4; -} - -static inline uint64_t align_to_WQEBB_up(uint64_t val) -{ - return ((val + 4 - 1) >> 2) << 2; -} - -static bool is_bf(struct ibv_context *ib_ctx) -{ - char *env; - - /* This limitation is done for RM: 1557652, 1894523, 1914464, 2069198 */ - if (safe_mce_sys().hypervisor != mce_sys_var::HYPER_NONE) { - return false; - } - - env = getenv("MLX5_SHUT_UP_BF"); - if (!env || !strcmp(env, "0")) { -#if defined(DEFINED_DIRECT_VERBS) && (DEFINED_DIRECT_VERBS == 3) && \ - defined(MLX5DV_UAR_ALLOC_TYPE_BF) - struct mlx5dv_devx_uar *uar = mlx5dv_devx_alloc_uar(ib_ctx, MLX5DV_UAR_ALLOC_TYPE_BF); - if (uar) { - mlx5dv_devx_free_uar(uar); - return true; - } -#else - NOT_IN_USE(ib_ctx); -#endif /* DEFINED_DIRECT_VERBS */ - } - return false; -} - -//! Maps xlio_ibv_wr_opcode to real MLX5 opcode. -// -static inline uint32_t get_mlx5_opcode(xlio_ibv_wr_opcode verbs_opcode) -{ - switch (verbs_opcode) { - case XLIO_IBV_WR_SEND: - return MLX5_OPCODE_SEND; - case XLIO_IBV_WR_TSO: - return MLX5_OPCODE_TSO; - case XLIO_IBV_WR_NOP: - return MLX5_OPCODE_NOP; - default: - return MLX5_OPCODE_SEND; - } -} - -qp_mgr_eth_mlx5::qp_mgr_eth_mlx5(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, - const uint16_t vlan, bool call_configure) - : qp_mgr(desc, tx_num_wr, vlan) - , m_sq_wqe_idx_to_prop(NULL) - , m_sq_wqe_prop_last(NULL) - , m_sq_wqe_prop_last_signalled(0) - , m_sq_free_credits(0) - , m_rq_wqe_counter(0) - , m_sq_wqes(NULL) - , m_sq_wqe_hot(NULL) - , m_sq_wqes_end(NULL) - , m_sq_wqe_hot_index(0) - , m_sq_wqe_counter(0) - , m_b_fence_needed(false) - , m_dm_enabled(false) -{ - // Check device capabilities for dummy send support - m_hw_dummy_send_support = xlio_is_nop_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); - - if (call_configure && configure(desc)) { - throw_xlio_exception("failed creating qp_mgr_eth_mlx5"); - } - - memset(&m_mlx5_qp, 0, sizeof(m_mlx5_qp)); - m_db_method = - (is_bf(((ib_ctx_handler *)desc->slave->p_ib_ctx)->get_ibv_context()) ? 
MLX5_DB_METHOD_BF - : MLX5_DB_METHOD_DB); - - qp_logdbg("m_db_method=%d", m_db_method); -} - -void qp_mgr_eth_mlx5::init_qp() -{ - if (0 != xlio_ib_mlx5_get_qp(m_qp, &m_mlx5_qp)) { - qp_logpanic("xlio_ib_mlx5_get_qp failed (errno=%d %m)", errno); - } - - m_sq_wqes = (struct mlx5_eth_wqe(*)[])(uintptr_t)m_mlx5_qp.sq.buf; - m_sq_wqe_hot = &(*m_sq_wqes)[0]; - m_sq_wqes_end = - (uint8_t *)((uintptr_t)m_mlx5_qp.sq.buf + m_mlx5_qp.sq.wqe_cnt * m_mlx5_qp.sq.stride); - m_sq_wqe_counter = 0; - - m_sq_wqe_hot_index = 0; - - uint32_t old_wr_val = m_tx_num_wr; - m_tx_num_wr = (m_sq_wqes_end - (uint8_t *)m_sq_wqe_hot) / WQEBB; - - // We use the min between CQ size and the QP size (that might be increases by ibv creation). - m_sq_free_credits = std::min(m_tx_num_wr, old_wr_val); - - /* Maximum BF inlining consists of: - * - CTRL: - * - 1st WQEBB is mostly used for CTRL and ETH segment (where ETH header is inlined) - * - 4 bytes for size of inline data - * - DATA: - * - 1 OCTOWORD from 1st WQEBB is used for data inlining, except for - * the 4 bytes used for stating the inline data size - * - 3 WQEBB are fully availabie for data inlining - */ - m_qp_cap.max_inline_data = OCTOWORD - 4 + 3 * WQEBB; - - if (m_sq_wqe_idx_to_prop == NULL) { - m_sq_wqe_idx_to_prop = - (sq_wqe_prop *)mmap(NULL, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop), - PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (m_sq_wqe_idx_to_prop == MAP_FAILED) { - qp_logerr("Failed allocating m_sq_wqe_idx_to_prop (errno=%d %m)", errno); - return; - } - m_sq_wqe_prop_last_signalled = m_tx_num_wr - 1; - m_sq_wqe_prop_last = NULL; - } - - qp_logfunc("m_tx_num_wr=%d max_inline_data: %d m_sq_wqe_idx_to_prop=%p", m_tx_num_wr, - get_max_inline_data(), m_sq_wqe_idx_to_prop); - - memset((void *)(uintptr_t)m_sq_wqe_hot, 0, sizeof(struct mlx5_eth_wqe)); - m_sq_wqe_hot->ctrl.data[0] = htonl(MLX5_OPCODE_SEND); - m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | 4); - m_sq_wqe_hot->ctrl.data[2] = 0; - m_sq_wqe_hot->eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); - m_sq_wqe_hot->eseg.cs_flags = XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM; - - qp_logfunc("%p allocated for %d QPs sq_wqes:%p sq_wqes_end: %p and configured %d WRs " - "BlueFlame: %p buf_size: %d offset: %d", - m_qp, m_mlx5_qp.qpn, m_sq_wqes, m_sq_wqes_end, m_tx_num_wr, m_mlx5_qp.bf.reg, - m_mlx5_qp.bf.size, m_mlx5_qp.bf.offset); -} - -void qp_mgr_eth_mlx5::init_device_memory() -{ - /* This limitation is done because of a observation - * that dm_copy takes a lot of time on VMs w/o BF (RM:1542628) - */ - if (m_p_ib_ctx_handler->get_on_device_memory_size() > 0) { - if (m_db_method == MLX5_DB_METHOD_BF) { - m_dm_enabled = - m_dm_mgr.allocate_resources(m_p_ib_ctx_handler, m_p_ring->m_p_ring_stat.get()); - - } else { -#if defined(DEFINED_IBV_DM) - VLOG_PRINTF_ONCE_THEN_DEBUG( - VLOG_WARNING, - "Device Memory functionality is not used on devices w/o Blue Flame support\n"); -#endif /* DEFINED_IBV_DM */ - } - } -} - -void qp_mgr_eth_mlx5::up() -{ - init_qp(); - qp_mgr::up(); - init_device_memory(); -} - -void qp_mgr_eth_mlx5::down() -{ - if (m_dm_enabled) { - m_dm_mgr.release_resources(); - } - - qp_mgr::down(); -} - -#if defined(DEFINED_UTLS) -void qp_mgr_eth_mlx5::destroy_tis_cache(void) -{ - while (!m_tls_tis_cache.empty()) { - xlio_tis *tis = m_tls_tis_cache.back(); - m_tls_tis_cache.pop_back(); - delete tis; - } -} -#endif /* defined(DEFINED_UTLS) */ - -void qp_mgr_eth_mlx5::update_next_wqe_hot() -{ - // Preparing next WQE as Ethernet send WQE and index: - 
m_sq_wqe_hot = &(*m_sq_wqes)[m_sq_wqe_counter & (m_tx_num_wr - 1)]; - m_sq_wqe_hot_index = m_sq_wqe_counter & (m_tx_num_wr - 1); - memset(m_sq_wqe_hot, 0, sizeof(mlx5_eth_wqe)); - - // Fill Ethernet segment with header inline: - struct mlx5_wqe_eth_seg *eth_seg = - (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg)); - eth_seg->inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); -} - -//! Cleanup resources QP itself will be freed by base class DTOR -qp_mgr_eth_mlx5::~qp_mgr_eth_mlx5() -{ - if (m_rq_wqe_idx_to_wrid) { - if (0 != munmap(m_rq_wqe_idx_to_wrid, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid))) { - qp_logerr("Failed deallocating memory with munmap m_rq_wqe_idx_to_wrid (errno=%d %m)", - errno); - } - m_rq_wqe_idx_to_wrid = NULL; - } - if (m_sq_wqe_idx_to_prop) { - if (0 != munmap(m_sq_wqe_idx_to_prop, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop))) { - qp_logerr("Failed deallocating memory with munmap m_sq_wqe_idx_to_prop (errno=%d %m)", - errno); - } - m_sq_wqe_idx_to_prop = NULL; - } - destroy_tis_cache(); -} - -void qp_mgr_eth_mlx5::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) -{ - m_ibv_rx_sg_array[m_curr_rx_wr].addr = (uintptr_t)p_mem_buf_desc->p_buffer; - m_ibv_rx_sg_array[m_curr_rx_wr].length = p_mem_buf_desc->sz_buffer; - m_ibv_rx_sg_array[m_curr_rx_wr].lkey = p_mem_buf_desc->lkey; - - post_recv_buffer_rq(p_mem_buf_desc); -} - -void qp_mgr_eth_mlx5::post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc) -{ - if (m_n_sysvar_rx_prefetch_bytes_before_poll) { - if (m_p_prev_rx_desc_pushed) { - m_p_prev_rx_desc_pushed->p_prev_desc = p_mem_buf_desc; - } - m_p_prev_rx_desc_pushed = p_mem_buf_desc; - } - - m_ibv_rx_wr_array[m_curr_rx_wr].wr_id = (uintptr_t)p_mem_buf_desc; - - if (m_rq_wqe_idx_to_wrid) { - uint32_t index = m_rq_wqe_counter & (m_rx_num_wr - 1); - m_rq_wqe_idx_to_wrid[index] = (uintptr_t)p_mem_buf_desc; - ++m_rq_wqe_counter; - } - - if (m_curr_rx_wr == m_n_sysvar_rx_num_wr_to_post_recv - 1) { - - m_last_posted_rx_wr_id = (uintptr_t)p_mem_buf_desc; - - m_p_prev_rx_desc_pushed = NULL; - p_mem_buf_desc->p_prev_desc = NULL; - - m_curr_rx_wr = 0; - struct ibv_recv_wr *bad_wr = NULL; - IF_VERBS_FAILURE(xlio_ib_mlx5_post_recv(&m_mlx5_qp, &m_ibv_rx_wr_array[0], &bad_wr)) - { - uint32_t n_pos_bad_rx_wr = - ((uint8_t *)bad_wr - (uint8_t *)m_ibv_rx_wr_array) / sizeof(struct ibv_recv_wr); - qp_logerr("failed posting list (errno=%d %s)", errno, strerror(errno)); - qp_logerr("bad_wr is %d in submitted list (bad_wr=%p, m_ibv_rx_wr_array=%p, size=%zu)", - n_pos_bad_rx_wr, bad_wr, m_ibv_rx_wr_array, sizeof(struct ibv_recv_wr)); - qp_logerr("bad_wr info: wr_id=%#lx, next=%p, addr=%#lx, length=%d, lkey=%#x", - bad_wr[0].wr_id, bad_wr[0].next, bad_wr[0].sg_list[0].addr, - bad_wr[0].sg_list[0].length, bad_wr[0].sg_list[0].lkey); - qp_logerr("QP current state: %d", priv_ibv_query_qp_state(m_qp)); - - // Fix broken linked list of rx_wr - if (n_pos_bad_rx_wr != (m_n_sysvar_rx_num_wr_to_post_recv - 1)) { - m_ibv_rx_wr_array[n_pos_bad_rx_wr].next = &m_ibv_rx_wr_array[n_pos_bad_rx_wr + 1]; - } - throw; - } - ENDIF_VERBS_FAILURE; - qp_logfunc("Successful ibv_post_recv"); - } else { - m_curr_rx_wr++; - } -} - -bool qp_mgr_eth_mlx5::init_rx_cq_mgr_prepare() -{ - m_rx_num_wr = align32pow2(m_rx_num_wr); - - m_rq_wqe_idx_to_wrid = - (uint64_t *)mmap(NULL, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid), PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (m_rq_wqe_idx_to_wrid == MAP_FAILED) { - qp_logerr("Failed allocating m_rq_wqe_idx_to_wrid (errno=%d 
%m)", errno); - return false; - } - - return true; -} - -cq_mgr_rx *qp_mgr_eth_mlx5::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) -{ - return (!init_rx_cq_mgr_prepare() ? NULL - : new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, - m_rx_num_wr, p_rx_comp_event_channel)); -} - -cq_mgr_tx *qp_mgr_eth_mlx5::init_tx_cq_mgr() -{ - m_tx_num_wr = align32pow2(m_tx_num_wr); - return new cq_mgr_tx(m_p_ring, m_p_ib_ctx_handler, m_tx_num_wr, - m_p_ring->get_tx_comp_event_channel()); -} - -inline void qp_mgr_eth_mlx5::ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top, - bool skip_comp /*=false*/) -{ - uint64_t *dst = (uint64_t *)((uint8_t *)m_mlx5_qp.bf.reg + m_mlx5_qp.bf.offset); - uint64_t *src = reinterpret_cast(m_sq_wqe_hot); - struct xlio_mlx5_wqe_ctrl_seg *ctrl = reinterpret_cast(src); - - /* TODO Refactor m_n_unsignedled_count, is_completion_need(), set_unsignaled_count(): - * Some logic is hidden inside the methods and in one branch the field is changed directly. - */ - if (!skip_comp && is_completion_need()) { - ctrl->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; - } - if (ctrl->fm_ce_se & MLX5_WQE_CTRL_CQ_UPDATE) { - set_unsignaled_count(); - } else { - dec_unsignaled_count(); - } - if (unlikely(m_b_fence_needed)) { - ctrl->fm_ce_se |= MLX5_FENCE_MODE_INITIATOR_SMALL; - m_b_fence_needed = false; - } - - m_sq_wqe_counter = (m_sq_wqe_counter + num_wqebb + num_wqebb_top) & 0xFFFF; - - // Make sure that descriptors are written before - // updating doorbell record and ringing the doorbell - wmb(); - *m_mlx5_qp.sq.dbrec = htonl(m_sq_wqe_counter); - - // This wc_wmb ensures ordering between DB record and BF copy - wc_wmb(); - if (likely(db_method == MLX5_DB_METHOD_BF)) { - /* Copying src to BlueFlame register buffer by Write Combining cnt WQEBBs - * Avoid using memcpy() to copy to BlueFlame page, since memcpy() - * implementations may use move-string-buffer assembler instructions, - * which do not guarantee order of copying. - */ - while (num_wqebb--) { - COPY_64B_NT(dst, src); - } - src = (uint64_t *)m_sq_wqes; - while (num_wqebb_top--) { - COPY_64B_NT(dst, src); - } - } else { - *dst = *src; - } - - /* Use wc_wmb() to ensure write combining buffers are flushed out - * of the running CPU. - * sfence instruction affects only the WC buffers of the CPU that executes it - */ - wc_wmb(); - m_mlx5_qp.bf.offset ^= m_mlx5_qp.bf.size; -} - -inline int qp_mgr_eth_mlx5::fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, - int max_inline_len, int inline_len) -{ - int wqe_inline_size = 0; - while ((data_addr != NULL) && inline_len) { - dbg_dump_wqe((uint32_t *)data_addr, inline_len); - memcpy(cur_seg, data_addr, inline_len); - wqe_inline_size += inline_len; - cur_seg += inline_len; - inline_len = max_inline_len - wqe_inline_size; - data_addr = sga.get_data(&inline_len); - qp_logfunc("data_addr:%p cur_seg: %p inline_len: %d wqe_inline_size: %d", data_addr, - cur_seg, inline_len, wqe_inline_size); - } - return wqe_inline_size; -} - -//! 
Fill WQE dynamically, based on amount of free WQEBB in SQ -inline int qp_mgr_eth_mlx5::fill_wqe(xlio_ibv_send_wr *pswr) -{ - // control segment is mostly filled by preset after previous packet - // we always inline ETH header - sg_array sga(pswr->sg_list, pswr->num_sge); - uint8_t *cur_seg = (uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg); - int inline_len = MLX5_ETH_INLINE_HEADER_SIZE; - int data_len = sga.length(); - int wqe_size = sizeof(struct mlx5_wqe_ctrl_seg) / OCTOWORD; - int max_inline_len = get_max_inline_data(); - - // assume packet is full inline - if (likely(data_len <= max_inline_len && xlio_send_wr_opcode(*pswr) == XLIO_IBV_WR_SEND)) { - uint8_t *data_addr = sga.get_data(&inline_len); // data for inlining in ETH header - data_len -= inline_len; - qp_logfunc( - "wqe_hot:%p num_sge: %d data_addr: %p data_len: %d max_inline_len: %d inline_len: %d", - m_sq_wqe_hot, pswr->num_sge, data_addr, data_len, max_inline_len, inline_len); - - // Fill Ethernet segment with header inline, static data - // were populated in preset after previous packet send - memcpy(cur_seg + offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start), data_addr, - MLX5_ETH_INLINE_HEADER_SIZE); - cur_seg += sizeof(struct mlx5_wqe_eth_seg); - wqe_size += sizeof(struct mlx5_wqe_eth_seg) / OCTOWORD; - - max_inline_len = data_len; - // Filling inline data segment - // size of BlueFlame buffer is 4*WQEBBs, 3*OCTOWORDS of the first - // was allocated for control and ethernet segment so we have 3*WQEBB+16-4 - int rest_space = std::min((int)(m_sq_wqes_end - cur_seg - 4), (3 * WQEBB + OCTOWORD - 4)); - // Filling till the end of inline WQE segment or - // to end of WQEs - if (likely(max_inline_len <= rest_space)) { - inline_len = max_inline_len; - qp_logfunc("data_addr:%p cur_seg: %p rest_space: %d inline_len: %d wqe_size: %d", - data_addr, cur_seg, rest_space, inline_len, wqe_size); - // bypass inline size and fill inline data segment - data_addr = sga.get_data(&inline_len); - inline_len = fill_inl_segment(sga, cur_seg + 4, data_addr, max_inline_len, inline_len); - - // store inline data size and mark the data as inlined - *(uint32_t *)((uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_eth_seg)) = htonl(0x80000000 | inline_len); - rest_space = align_to_octoword_up(inline_len + 4); // align to OCTOWORDs - wqe_size += rest_space / OCTOWORD; - // assert((data_len-inline_len)==0); - // configuring control - m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); - rest_space = align_to_WQEBB_up(wqe_size) / 4; - qp_logfunc("data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", data_len - inline_len, - inline_len, wqe_size, rest_space); - ring_doorbell(m_db_method, rest_space); - return rest_space; - } else { - // wrap around case, first filling till the end of m_sq_wqes - int wrap_up_size = max_inline_len - rest_space; - inline_len = rest_space; - qp_logfunc("WRAP_UP_SIZE: %d data_addr:%p cur_seg: %p rest_space: %d inline_len: %d " - "wqe_size: %d", - wrap_up_size, data_addr, cur_seg, rest_space, inline_len, wqe_size); - - data_addr = sga.get_data(&inline_len); - inline_len = fill_inl_segment(sga, cur_seg + 4, data_addr, rest_space, inline_len); - data_len -= inline_len; - rest_space = align_to_octoword_up(inline_len + 4); - wqe_size += rest_space / OCTOWORD; - rest_space = - align_to_WQEBB_up(rest_space / OCTOWORD) / 4; // size of 1st chunk at the end - - qp_logfunc( - "END chunk data_addr: %p data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", - data_addr, data_len, 
inline_len, wqe_size, rest_space); - // Wrap around - // - cur_seg = (uint8_t *)m_sq_wqes; - data_addr = sga.get_data(&wrap_up_size); - - wrap_up_size = fill_inl_segment(sga, cur_seg, data_addr, data_len, wrap_up_size); - inline_len += wrap_up_size; - max_inline_len = align_to_octoword_up(wrap_up_size); - wqe_size += max_inline_len / OCTOWORD; - max_inline_len = align_to_WQEBB_up(max_inline_len / OCTOWORD) / 4; - // store inline data size - *(uint32_t *)((uint8_t *)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_eth_seg)) = htonl(0x80000000 | inline_len); - qp_logfunc("BEGIN_CHUNK data_addr: %p data_len: %d wqe_size: %d inline_len: %d " - "end_wqebbs: %d wqebbs: %d", - data_addr, data_len - wrap_up_size, wqe_size, inline_len + wrap_up_size, - rest_space, max_inline_len); - // assert((data_len-wrap_up_size)==0); - // configuring control - m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); - - dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, rest_space * 4 * 16); - dbg_dump_wqe((uint32_t *)m_sq_wqes, max_inline_len * 4 * 16); - - ring_doorbell(m_db_method, rest_space, max_inline_len); - return rest_space + max_inline_len; - } - } else { - if (xlio_send_wr_opcode(*pswr) == XLIO_IBV_WR_SEND) { - /* data is bigger than max to inline we inlined only ETH header + uint from IP (18 - * bytes) the rest will be in data pointer segment adding data seg with pointer if there - * still data to transfer - */ - wqe_size = fill_wqe_send(pswr); - return wqe_size; - } else { - /* Support XLIO_IBV_WR_SEND_TSO operation - */ - wqe_size = fill_wqe_lso(pswr); - return wqe_size; - } - } - return 1; -} - -inline int qp_mgr_eth_mlx5::fill_wqe_send(xlio_ibv_send_wr *pswr) -{ - struct mlx5_wqe_eth_seg *eseg; - struct mlx5_wqe_data_seg *dseg; - int wqe_size = sizeof(mlx5_wqe_ctrl_seg) / OCTOWORD; - - eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(mlx5_wqe_ctrl_seg)); - eseg->inline_hdr_sz = 0; - - /* Unlike Linux kernel, rdma-core defines mlx5_wqe_eth_seg as 32 bytes, because it contains - * 18 bytes of inline header. We don't want to inline partial header to avoid an extra copy - * and code complication. Therefore, we cannot rely on the structure definition and need to - * hardcode 16 bytes here. - */ - wqe_size += 1; - dseg = (struct mlx5_wqe_data_seg *)((uintptr_t)eseg + OCTOWORD); - - for (int i = 0; i < pswr->num_sge; ++i) { - if (unlikely((uintptr_t)dseg >= (uintptr_t)m_sq_wqes_end)) { - dseg = (struct mlx5_wqe_data_seg *)m_sq_wqes; - } - if (likely(pswr->sg_list[i].length)) { - dseg->byte_count = htonl(pswr->sg_list[i].length); - /* Try to copy data to On Device Memory in first */ - if (!(m_dm_enabled && - m_dm_mgr.copy_data(dseg, (uint8_t *)((uintptr_t)pswr->sg_list[i].addr), - pswr->sg_list[i].length, (mem_buf_desc_t *)pswr->wr_id))) { - dseg->lkey = htonl(pswr->sg_list[i].lkey); - dseg->addr = htonll((uintptr_t)pswr->sg_list[i].addr); - } - ++dseg; - wqe_size += sizeof(struct mlx5_wqe_data_seg) / OCTOWORD; - } - } - - m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); - int wqebbs = align_to_WQEBB_up(wqe_size) / 4; - /* TODO FIXME Split into top and bottom parts */ - ring_doorbell(m_db_method, wqebbs); - - return wqebbs; -} - -//! 
Filling wqe for LSO -inline int qp_mgr_eth_mlx5::fill_wqe_lso(xlio_ibv_send_wr *pswr) -{ - struct mlx5_wqe_ctrl_seg *ctrl = NULL; - struct mlx5_wqe_eth_seg *eseg = NULL; - struct mlx5_wqe_data_seg *dpseg = NULL; - uint8_t *cur_seg = NULL; - uint8_t *p_hdr = (uint8_t *)pswr->tso.hdr; - int inl_hdr_size = pswr->tso.hdr_sz; - int inl_hdr_copy_size = 0; - int max_inline_len = align_to_octoword_up(sizeof(struct mlx5_wqe_eth_seg) + inl_hdr_size - - MLX5_ETH_INLINE_HEADER_SIZE); - int wqe_size = sizeof(struct mlx5_wqe_ctrl_seg) / OCTOWORD; - int rest = 0; - int i = 0; - - ctrl = (struct mlx5_wqe_ctrl_seg *)m_sq_wqe_hot; - - /* Do usual send operation in case payload less than mss */ - if (0 == pswr->tso.mss) { - ctrl->opmod_idx_opcode = - htonl(((m_sq_wqe_counter & 0xffff) << 8) | (get_mlx5_opcode(XLIO_IBV_WR_SEND) & 0xff)); - } - - eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(*ctrl)); - eseg->mss = htons(pswr->tso.mss); - eseg->inline_hdr_sz = htons(inl_hdr_size); - - rest = (int)((uintptr_t)(void *)m_sq_wqes_end - (uintptr_t)(void *)eseg); - cur_seg = (uint8_t *)eseg; - - if (likely(max_inline_len <= rest)) { - // Fill Ethernet segment with full header inline - inl_hdr_copy_size = inl_hdr_size; - memcpy(eseg->inline_hdr_start, p_hdr, inl_hdr_copy_size); - cur_seg += max_inline_len; - } else { - // wrap around SQ on inline ethernet header - inl_hdr_copy_size = rest - offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start); - memcpy(eseg->inline_hdr_start, p_hdr, inl_hdr_copy_size); - p_hdr += inl_hdr_copy_size; - inl_hdr_copy_size = inl_hdr_size - inl_hdr_copy_size; - memcpy(m_sq_wqes, p_hdr, inl_hdr_copy_size); - max_inline_len = align_to_octoword_up(inl_hdr_copy_size); - cur_seg = (uint8_t *)m_sq_wqes + max_inline_len; - wqe_size += rest / OCTOWORD; - inl_hdr_copy_size = align_to_WQEBB_up(wqe_size) / 4; - } - wqe_size += max_inline_len / OCTOWORD; - qp_logfunc("TSO: num_sge: %d max_inline_len: %d inl_hdr_size: %d rest: %d", pswr->num_sge, - max_inline_len, inl_hdr_size, rest); - // Filling data pointer segments with payload by scatter-gather list elements - dpseg = (struct mlx5_wqe_data_seg *)cur_seg; - for (i = 0; i < pswr->num_sge; i++) { - if (unlikely((uintptr_t)dpseg >= (uintptr_t)m_sq_wqes_end)) { - dpseg = (struct mlx5_wqe_data_seg *)m_sq_wqes; - inl_hdr_copy_size = align_to_WQEBB_up(wqe_size) / 4; - } - dpseg->addr = htonll((uint64_t)pswr->sg_list[i].addr); - dpseg->lkey = htonl(pswr->sg_list[i].lkey); - dpseg->byte_count = htonl(pswr->sg_list[i].length); - - qp_logfunc("DATA_SEG: addr:%llx len: %d lkey: %x dp_seg: %p wqe_size: %d", - pswr->sg_list[i].addr, pswr->sg_list[i].length, dpseg->lkey, dpseg, wqe_size); - - dpseg++; - wqe_size += sizeof(struct mlx5_wqe_data_seg) / OCTOWORD; - } - inl_hdr_size = align_to_WQEBB_up(wqe_size) / 4; - m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); - - // sending by BlueFlame or DoorBell covering wrap around - // TODO Make a single doorbell call - if (likely(inl_hdr_size <= 4)) { - if (likely(inl_hdr_copy_size == 0)) { - ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_size); - } else { - ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_copy_size, inl_hdr_size - inl_hdr_copy_size); - } - } else { - ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_size); - } - return align_to_WQEBB_up(wqe_size) / 4; -} - -void qp_mgr_eth_mlx5::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, xlio_ti *ti) -{ - m_sq_wqe_idx_to_prop[m_sq_wqe_hot_index] = sq_wqe_prop { - .buf = buf, - .credits = credits, - .ti = ti, - .next = 
m_sq_wqe_prop_last, - }; - m_sq_wqe_prop_last = &m_sq_wqe_idx_to_prop[m_sq_wqe_hot_index]; - if (ti != NULL) { - ti->get(); - } -} - -//! Send one RAW packet by MLX5 BlueFlame -// -int qp_mgr_eth_mlx5::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, - bool request_comp, xlio_tis *tis, unsigned credits) -{ - struct xlio_mlx5_wqe_ctrl_seg *ctrl = NULL; - struct mlx5_wqe_eth_seg *eseg = NULL; - uint32_t tisn = tis ? tis->get_tisn() : 0; - - ctrl = (struct xlio_mlx5_wqe_ctrl_seg *)m_sq_wqe_hot; - eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(*ctrl)); - - /* Configure ctrl segment - * qpn_ds or ctrl.data[1] is set inside fill_wqe() - */ - ctrl->opmod_idx_opcode = htonl(((m_sq_wqe_counter & 0xffff) << 8) | - (get_mlx5_opcode(xlio_send_wr_opcode(*p_send_wqe)) & 0xff)); - m_sq_wqe_hot->ctrl.data[2] = 0; - ctrl->fm_ce_se = (request_comp ? (uint8_t)MLX5_WQE_CTRL_CQ_UPDATE : 0); - ctrl->tis_tir_num = htobe32(tisn << 8); - - /* Configure eth segment - * reset rsvd0, cs_flags, rsvd1, mss and rsvd2 fields - * checksum flags are set here - */ - *((uint64_t *)eseg) = 0; - eseg->rsvd2 = 0; - eseg->cs_flags = (uint8_t)(attr & (XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM) & 0xff); - - /* Store buffer descriptor */ - store_current_wqe_prop(reinterpret_cast(p_send_wqe->wr_id), credits, tis); - - /* Complete WQE */ - int wqebbs = fill_wqe(p_send_wqe); - assert(wqebbs > 0 && (unsigned)wqebbs <= credits); - NOT_IN_USE(wqebbs); - - update_next_wqe_hot(); - - qp_logfunc( - "m_sq_wqe_hot: %p m_sq_wqe_hot_index: %d wqe_counter: %d new_hot_index: %d wr_id: %llx", - m_sq_wqe_hot, m_sq_wqe_hot_index, m_sq_wqe_counter, (m_sq_wqe_counter & (m_tx_num_wr - 1)), - p_send_wqe->wr_id); - - return 0; -} - -#ifdef DEFINED_UTLS - -std::unique_ptr qp_mgr_eth_mlx5::get_new_tls_dek(const void *key, - uint32_t key_size_bytes) -{ - dpcp::tls_dek *_dek = nullptr; - dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); - if (likely(adapter)) { - dpcp::status status; - struct dpcp::dek_attr dek_attr; - memset(&dek_attr, 0, sizeof(dek_attr)); - dek_attr.key_blob = (void *)key; - dek_attr.key_blob_size = key_size_bytes; - dek_attr.key_size = key_size_bytes; - dek_attr.pd_id = adapter->get_pd(); - status = adapter->create_tls_dek(dek_attr, _dek); - if (unlikely(status != dpcp::DPCP_OK)) { - qp_logwarn("Failed to create new DEK, status: %d", status); - if (_dek) { - delete _dek; - _dek = nullptr; - } - } - } - - return std::unique_ptr(_dek); -} - -std::unique_ptr qp_mgr_eth_mlx5::get_tls_dek(const void *key, - uint32_t key_size_bytes) -{ - dpcp::status status; - dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); - - if (unlikely(!adapter)) { - return std::unique_ptr(nullptr); - } - - // If the amount of available DEKs in m_dek_put_cache is smaller than - // low-watermark we continue to create new DEKs. This is to avoid situations - // where one DEKs is returned and then fetched in a throttlling manner - // causing too frequent crypto-sync. - // It is also possible that crypto-sync may have higher impact with higher number - // of active connections. - if (unlikely(!m_p_ring->tls_sync_dek_supported()) || - (unlikely(m_tls_dek_get_cache.empty()) && - (m_tls_dek_put_cache.size() <= safe_mce_sys().utls_low_wmark_dek_cache_size))) { - return get_new_tls_dek(key, key_size_bytes); - } - - if (unlikely(m_tls_dek_get_cache.empty())) { - qp_logdbg("Empty DEK get cache. Swapping caches and do Sync-Crypto. 
Put-Cache size: %zu", - m_tls_dek_put_cache.size()); - - status = adapter->sync_crypto_tls(); - if (unlikely(status != dpcp::DPCP_OK)) { - qp_logwarn("Failed to flush DEK HW cache, status: %d", status); - return get_new_tls_dek(key, key_size_bytes); - } - - m_tls_dek_get_cache.swap(m_tls_dek_put_cache); - } - - std::unique_ptr out_dek(std::move(m_tls_dek_get_cache.front())); - m_tls_dek_get_cache.pop_front(); - - struct dpcp::dek_attr dek_attr; - memset(&dek_attr, 0, sizeof(dek_attr)); - dek_attr.key_blob = const_cast(key); - dek_attr.key_blob_size = key_size_bytes; - dek_attr.key_size = key_size_bytes; - dek_attr.pd_id = adapter->get_pd(); - status = out_dek->modify(dek_attr); - if (unlikely(status != dpcp::DPCP_OK)) { - qp_logwarn("Failed to modify DEK, status: %d", status); - out_dek.reset(nullptr); - } - - return out_dek; -} - -void qp_mgr_eth_mlx5::put_tls_dek(std::unique_ptr &&tls_dek_obj) -{ - if (tls_dek_obj == nullptr) { - return; - } - // We don't allow unlimited DEK cache to avoid system DEK starvation. - if (likely(m_p_ring->tls_sync_dek_supported()) && - m_tls_dek_put_cache.size() < safe_mce_sys().utls_high_wmark_dek_cache_size) { - m_tls_dek_put_cache.emplace_back(std::forward>(tls_dek_obj)); - } -} - -xlio_tis *qp_mgr_eth_mlx5::tls_context_setup_tx(const xlio_tls_info *info) -{ - std::unique_ptr tis; - if (m_tls_tis_cache.empty()) { - tis = create_tis(DPCP_TIS_FLAGS | dpcp::TIS_ATTR_TLS); - if (unlikely(tis == nullptr)) { - return nullptr; - } - } else { - tis.reset(m_tls_tis_cache.back()); - m_tls_tis_cache.pop_back(); - } - - auto dek_obj = get_tls_dek(info->key, info->key_len); - if (unlikely(!dek_obj)) { - m_tls_tis_cache.push_back(tis.release()); - return nullptr; - } - - tis->assign_dek(std::move(dek_obj)); - uint32_t tisn = tis->get_tisn(); - - tls_post_static_params_wqe(tis.get(), info, tisn, tis->get_dek_id(), 0, false, true); - tls_post_progress_params_wqe(tis.get(), tisn, 0, false, true); - /* The 1st post after TLS configuration must be with fence. 
*/ - m_b_fence_needed = true; - - assert(!tis->m_released); - - return tis.release(); -} - -void qp_mgr_eth_mlx5::tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, - bool skip_static) -{ - uint32_t tisn = tis->get_tisn(); - - if (!skip_static) { - tls_post_static_params_wqe(tis, info, tisn, tis->get_dek_id(), 0, true, true); - } - tls_post_progress_params_wqe(tis, tisn, 0, skip_static, true); - m_b_fence_needed = true; -} - -xlio_tir *qp_mgr_eth_mlx5::tls_create_tir(bool cached) -{ - xlio_tir *tir = NULL; - - if (cached && !m_tls_tir_cache.empty()) { - tir = m_tls_tir_cache.back(); - m_tls_tir_cache.pop_back(); - } else if (!cached) { - dpcp::tir *_tir = create_tir(true); - - if (_tir != NULL) { - tir = new xlio_tir(_tir, xlio_ti::ti_type::TLS_TIR); - } - if (unlikely(tir == NULL && _tir != NULL)) { - delete _tir; - } - } - return tir; -} - -int qp_mgr_eth_mlx5::tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, - uint32_t next_record_tcp_sn, xlio_comp_cb_t callback, - void *callback_arg) -{ - uint32_t tirn; - dpcp::tls_dek *_dek; - dpcp::status status; - dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); - struct dpcp::dek_attr dek_attr; - - memset(&dek_attr, 0, sizeof(dek_attr)); - dek_attr.key_blob = (void *)info->key; - dek_attr.key_blob_size = info->key_len; - dek_attr.key_size = info->key_len; - dek_attr.pd_id = adapter->get_pd(); - status = adapter->create_tls_dek(dek_attr, _dek); - if (unlikely(status != dpcp::DPCP_OK)) { - qp_logerr("Failed to create DEK, status: %d", status); - return -1; - } - tir->assign_dek(_dek); - tir->assign_callback(callback, callback_arg); - tirn = tir->get_tirn(); - - tls_post_static_params_wqe(NULL, info, tirn, _dek->get_key_id(), 0, false, false); - tls_post_progress_params_wqe(tir, tirn, next_record_tcp_sn, false, false); - - assert(!tir->m_released); - - return 0; -} - -void qp_mgr_eth_mlx5::tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, - uint32_t hw_resync_tcp_sn) -{ - tls_post_static_params_wqe(tir, info, tir->get_tirn(), tir->get_dek_id(), hw_resync_tcp_sn, - false, false); -} - -void qp_mgr_eth_mlx5::tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) -{ - /* Address must be aligned by 64. */ - assert((uintptr_t)buf == ((uintptr_t)buf >> 6U << 6U)); - - tls_get_progress_params_wqe(tir, tir->get_tirn(), buf, lkey); -} - -inline void qp_mgr_eth_mlx5::tls_fill_static_params_wqe( - struct mlx5_wqe_tls_static_params_seg *params, const struct xlio_tls_info *info, - uint32_t key_id, uint32_t resync_tcp_sn) -{ - unsigned char *initial_rn, *iv; - uint8_t tls_version; - uint8_t *ctx; - - ctx = params->ctx; - - iv = DEVX_ADDR_OF(tls_static_params, ctx, gcm_iv); - initial_rn = DEVX_ADDR_OF(tls_static_params, ctx, initial_record_number); - - memcpy(iv, info->salt, TLS_AES_GCM_SALT_LEN); - memcpy(initial_rn, info->rec_seq, TLS_AES_GCM_REC_SEQ_LEN); - if (info->tls_version == TLS_1_3_VERSION) { - iv = DEVX_ADDR_OF(tls_static_params, ctx, implicit_iv); - memcpy(iv, info->iv, TLS_AES_GCM_IV_LEN); - } - - tls_version = (info->tls_version == TLS_1_2_VERSION) ? 
MLX5E_STATIC_PARAMS_CONTEXT_TLS_1_2 - : MLX5E_STATIC_PARAMS_CONTEXT_TLS_1_3; - - DEVX_SET(tls_static_params, ctx, tls_version, tls_version); - DEVX_SET(tls_static_params, ctx, const_1, 1); - DEVX_SET(tls_static_params, ctx, const_2, 2); - DEVX_SET(tls_static_params, ctx, encryption_standard, MLX5E_ENCRYPTION_STANDARD_TLS); - DEVX_SET(tls_static_params, ctx, resync_tcp_sn, resync_tcp_sn); - DEVX_SET(tls_static_params, ctx, dek_index, key_id); -} - -inline void qp_mgr_eth_mlx5::tls_post_static_params_wqe(xlio_ti *ti, - const struct xlio_tls_info *info, - uint32_t tis_tir_number, uint32_t key_id, - uint32_t resync_tcp_sn, bool fence, - bool is_tx) -{ - struct mlx5_set_tls_static_params_wqe *wqe = - reinterpret_cast(m_sq_wqe_hot); - struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; - xlio_mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl; - struct mlx5_mkey_seg *mkcseg = &wqe->mkc; - struct mlx5_wqe_tls_static_params_seg *tspseg = &wqe->params; - uint8_t opmod = is_tx ? MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS : MLX5_OPC_MOD_TLS_TIR_STATIC_PARAMS; - -#define STATIC_PARAMS_DS_CNT DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS) - - /* - * SQ wrap around handling information - * - * UMR WQE has the size of 3 WQEBBs. - * The following are segments sizes the WQE contains. - * - * UMR WQE segments sizes: - * sizeof(wqe->ctrl) = 16[B] - * sizeof(wqe->uctrl) = 48[B] - * sizeof(wqe->mkc) = 64[B] - * sizeof(wqe->params) = 64[B] - * - * UMR WQEBBs to segments mapping: - * WQEBB1: [wqe->ctrl(16[B]), wqe->uctrl(48[B])] -> 64[B] - * WQEBB2: [wqe->mkc(64[B])] -> 64[B] - * WQEBB3: [wqe->params(64[B])] -> 64[B] - * - * There are 3 cases: - * 1. There is enough room in the SQ for 3 WQEBBs: - * 3 WQEBBs posted from m_sq_wqe_hot current location. - * 2. There is enough room in the SQ for 2 WQEBBs: - * 2 WQEBBs posted from m_sq_wqe_hot current location till m_sq_wqes_end. - * 1 WQEBB posted from m_sq_wqes beginning. - * 3. There is enough room in the SQ for 1 WQEBB: - * 1 WQEBB posted from m_sq_wqe_hot current location till m_sq_wqes_end. - * 2 WQEBBs posted from m_sq_wqes beginning. - * The case of 0 WQEBBs room left in the SQ shouldn't happen, m_sq_wqe_hot wrap around handling - * done when setting next m_sq_wqe_hot. - * - * In all the 3 cases, no need to change cseg and ucseg pointers, since they fit to - * one WQEBB and will be posted before m_sq_wqes_end. - */ - - // XXX: We set inline_hdr_sz for every new hot wqe. This corrupts UMR WQE without memset(). - memset(m_sq_wqe_hot, 0, sizeof(*m_sq_wqe_hot)); - cseg->opmod_idx_opcode = - htobe32(((m_sq_wqe_counter & 0xffff) << 8) | MLX5_OPCODE_UMR | (opmod << 24)); - cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | STATIC_PARAMS_DS_CNT); - cseg->fm_ce_se = fence ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0; - cseg->tis_tir_num = htobe32(tis_tir_number << 8); - - ucseg->flags = MLX5_UMR_INLINE; - ucseg->bsf_octowords = htobe16(DEVX_ST_SZ_BYTES(tls_static_params) / 16); - - int num_wqebbs = TLS_SET_STATIC_PARAMS_WQEBBS; - int num_wqebbs_top = 0; - int sq_wqebbs_room_left = - (static_cast(m_sq_wqes_end - reinterpret_cast(cseg)) / MLX5_SEND_WQE_BB); - - /* Case 1: - * In this case we don't need to change - * the pointers of the different segments, because there is enough room in the SQ. - * Thus, no need to do special handling. 
- */ - - if (unlikely(sq_wqebbs_room_left == 2)) { // Case 2: Change tspseg pointer: - tspseg = reinterpret_cast(m_sq_wqes); - num_wqebbs = 2; - num_wqebbs_top = 1; - } else if (unlikely(sq_wqebbs_room_left == 1)) { // Case 3: Change mkcseg and tspseg pointers: - mkcseg = reinterpret_cast(m_sq_wqes); - tspseg = reinterpret_cast( - reinterpret_cast(m_sq_wqes) + sizeof(*mkcseg)); - num_wqebbs = 1; - num_wqebbs_top = 2; - } - - memset(mkcseg, 0, sizeof(*mkcseg)); - memset(tspseg, 0, sizeof(*tspseg)); - - tls_fill_static_params_wqe(tspseg, info, key_id, resync_tcp_sn); - store_current_wqe_prop(nullptr, SQ_CREDITS_UMR, ti); - - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs, num_wqebbs_top, true); - dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, sizeof(mlx5_set_tls_static_params_wqe)); - - update_next_wqe_hot(); -} - -inline void qp_mgr_eth_mlx5::tls_fill_progress_params_wqe( - struct mlx5_wqe_tls_progress_params_seg *params, uint32_t tis_tir_number, - uint32_t next_record_tcp_sn) -{ - uint8_t *ctx = params->ctx; - - params->tis_tir_num = htobe32(tis_tir_number); - - DEVX_SET(tls_progress_params, ctx, next_record_tcp_sn, next_record_tcp_sn); - DEVX_SET(tls_progress_params, ctx, record_tracker_state, - MLX5E_TLS_PROGRESS_PARAMS_RECORD_TRACKER_STATE_START); - DEVX_SET(tls_progress_params, ctx, auth_state, MLX5E_TLS_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD); -} - -inline void qp_mgr_eth_mlx5::tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_tir_number, - uint32_t next_record_tcp_sn, bool fence, - bool is_tx) -{ - uint16_t num_wqebbs = TLS_SET_PROGRESS_PARAMS_WQEBBS; - - struct mlx5_set_tls_progress_params_wqe *wqe = - reinterpret_cast(m_sq_wqe_hot); - struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; - uint8_t opmod = - is_tx ? MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS : MLX5_OPC_MOD_TLS_TIR_PROGRESS_PARAMS; - - memset(wqe, 0, sizeof(*wqe)); - -#define PROGRESS_PARAMS_DS_CNT DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS) - - cseg->opmod_idx_opcode = - htobe32(((m_sq_wqe_counter & 0xffff) << 8) | XLIO_MLX5_OPCODE_SET_PSV | (opmod << 24)); - cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | PROGRESS_PARAMS_DS_CNT); - /* Request completion for TLS RX offload to create TLS rule ASAP. */ - cseg->fm_ce_se = - (fence ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0) | (is_tx ? 
0 : MLX5_WQE_CTRL_CQ_UPDATE); - - tls_fill_progress_params_wqe(&wqe->params, tis_tir_number, next_record_tcp_sn); - store_current_wqe_prop(nullptr, SQ_CREDITS_SET_PSV, ti); - - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs); - dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, sizeof(mlx5_set_tls_progress_params_wqe)); - - update_next_wqe_hot(); -} - -inline void qp_mgr_eth_mlx5::tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void *buf, - uint32_t lkey) -{ - uint16_t num_wqebbs = TLS_GET_PROGRESS_WQEBBS; - - struct mlx5_get_tls_progress_params_wqe *wqe = - reinterpret_cast(m_sq_wqe_hot); - struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; - struct xlio_mlx5_seg_get_psv *psv = &wqe->psv; - uint8_t opmod = MLX5_OPC_MOD_TLS_TIR_PROGRESS_PARAMS; - - memset(wqe, 0, sizeof(*wqe)); - -#define PROGRESS_PARAMS_DS_CNT DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS) - - cseg->opmod_idx_opcode = - htobe32(((m_sq_wqe_counter & 0xffff) << 8) | XLIO_MLX5_OPCODE_GET_PSV | (opmod << 24)); - cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | PROGRESS_PARAMS_DS_CNT); - cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; - - psv->num_psv = 1U << 4U; - psv->l_key = htobe32(lkey); - psv->psv_index[0] = htobe32(tirn); - psv->va = htobe64((uintptr_t)buf); - - store_current_wqe_prop(nullptr, SQ_CREDITS_GET_PSV, ti); - - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs); - - update_next_wqe_hot(); -} - -void qp_mgr_eth_mlx5::tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, - bool first) -{ - post_dump_wqe(tis, addr, len, lkey, first); -} - -void qp_mgr_eth_mlx5::tls_release_tis(xlio_tis *tis) -{ - assert(tis != nullptr && tis->m_type == xlio_ti::ti_type::TLS_TIS); - tis->m_released = true; - if (tis->m_ref == 0) { - put_tls_tis_in_cache(tis); - } -} - -void qp_mgr_eth_mlx5::tls_release_tir(xlio_tir *tir) -{ - /* TODO We don't have to lock ring to destroy DEK object (a garbage collector?). */ - - assert(tir != nullptr && tir->m_type == xlio_ti::ti_type::TLS_TIR); - tir->m_released = true; - tir->assign_callback(NULL, NULL); - if (tir->m_ref == 0) { - put_tls_tir_in_cache(tir); - } -} - -dpcp::tir *qp_mgr_eth_mlx5::xlio_tir_to_dpcp_tir(xlio_tir *tir) -{ - return tir->m_p_tir.get(); -} -#else /* DEFINED_UTLS */ -void qp_mgr_eth_mlx5::ti_released(xlio_ti *) {}; -void qp_mgr_eth_mlx5::destroy_tis_cache(void) {}; -#endif /* DEFINED_UTLS */ - -std::unique_ptr qp_mgr_eth_mlx5::create_tis(uint32_t flags) const -{ - dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); - bool is_tls = flags & dpcp::TIS_ATTR_TLS, is_nvme = flags & dpcp::TIS_ATTR_NVMEOTCP; - if (unlikely(adapter == nullptr || (is_tls && is_nvme))) { - return nullptr; - } - - dpcp::tis::attr tis_attr = { - .flags = flags, - .tls_en = is_tls, - .nvmeotcp = is_nvme, - .transport_domain = adapter->get_td(), - .pd = adapter->get_pd(), - }; - - dpcp::tis *dpcp_tis = nullptr; - if (unlikely(adapter->create_tis(tis_attr, dpcp_tis) != dpcp::DPCP_OK)) { - qp_logerr("Failed to create TIS with NVME enabled"); - return nullptr; - } - - auto tis_type = is_tls ? 
xlio_ti::ti_type::TLS_TIS : xlio_ti::ti_type::NVME_TIS; - return std::make_unique(std::unique_ptr(dpcp_tis), tis_type); -} - -static inline void nvme_fill_static_params_control(xlio_mlx5_wqe_ctrl_seg *cseg, - xlio_mlx5_wqe_umr_ctrl_seg *ucseg, - uint32_t producer_index, uint32_t qpn, - uint32_t tisn, uint8_t fence_flags) -{ - memset(cseg, 0, sizeof(*cseg)); - memset(ucseg, 0, sizeof(*ucseg)); - cseg->opmod_idx_opcode = - htobe32(((producer_index & 0xffff) << 8) | MLX5_OPCODE_UMR | - (MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_STATIC_PARAMS << 24)); - size_t num_wqe_ds = 12U; - cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | num_wqe_ds); - cseg->fm_ce_se = fence_flags; - cseg->tis_tir_num = htobe32(tisn << MLX5_WQE_CTRL_TIR_TIS_INDEX_SHIFT); - - ucseg->flags = MLX5_UMR_INLINE; - ucseg->bsf_octowords = htobe16(MLX5E_TRANSPORT_STATIC_PARAMS_OCTWORD_SIZE); -} - -static inline void nvme_fill_static_params_transport_params( - mlx5_wqe_transport_static_params_seg *params, uint32_t config) - -{ - memset(params, 0, sizeof(*params)); - void *ctx = params->ctx; - - DEVX_SET(transport_static_params, ctx, const_1, 1); - DEVX_SET(transport_static_params, ctx, const_2, 2); - DEVX_SET(transport_static_params, ctx, acc_type, MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_NVMETCP); - DEVX_SET(transport_static_params, ctx, nvme_resync_tcp_sn, 0); - DEVX_SET(transport_static_params, ctx, pda, static_cast(config & XLIO_NVME_PDA_MASK)); - DEVX_SET(transport_static_params, ctx, ddgst_en, bool(config & XLIO_NVME_DDGST_ENABLE)); - DEVX_SET(transport_static_params, ctx, ddgst_offload_en, - bool(config & XLIO_NVME_DDGST_OFFLOAD)); - DEVX_SET(transport_static_params, ctx, hddgst_en, bool(config & XLIO_NVME_HDGST_ENABLE)); - DEVX_SET(transport_static_params, ctx, hdgst_offload_en, - bool(config & XLIO_NVME_HDGST_OFFLOAD)); - DEVX_SET(transport_static_params, ctx, ti, MLX5_TRANSPORT_STATIC_PARAMS_TI_INITIATOR); - DEVX_SET(transport_static_params, ctx, const1, 1); - DEVX_SET(transport_static_params, ctx, zero_copy_en, 0); -} - -static inline void nvme_fill_progress_wqe(mlx5e_set_nvmeotcp_progress_params_wqe *wqe, - uint32_t producer_index, uint32_t qpn, uint32_t tisn, - uint32_t tcp_seqno, uint8_t fence_flags) -{ - memset(wqe, 0, sizeof(*wqe)); - auto cseg = &wqe->ctrl.ctrl; - - size_t progres_params_ds = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS); - cseg->opmod_idx_opcode = - htobe32(((producer_index & 0xffff) << 8) | XLIO_MLX5_OPCODE_SET_PSV | - (MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIS_PROGRESS_PARAMS << 24)); - cseg->qpn_ds = htobe32((qpn << MLX5_WQE_CTRL_QPN_SHIFT) | progres_params_ds); - cseg->fm_ce_se = fence_flags; - - mlx5_seg_nvmeotcp_progress_params *params = &wqe->params; - params->tir_num = htobe32(tisn); - void *ctx = params->ctx; - - DEVX_SET(nvmeotcp_progress_params, ctx, next_pdu_tcp_sn, tcp_seqno); - DEVX_SET(nvmeotcp_progress_params, ctx, pdu_tracker_state, - MLX5E_NVMEOTCP_PROGRESS_PARAMS_PDU_TRACKER_STATE_START); - /* if (is_tx) offloading state == 0*/ - DEVX_SET(nvmeotcp_progress_params, ctx, offloading_state, 0); -} - -void qp_mgr_eth_mlx5::nvme_set_static_context(xlio_tis *tis, uint32_t config) -{ - auto *cseg = wqebb_get(0U); - auto *ucseg = wqebb_get(0U, sizeof(*cseg)); - - nvme_fill_static_params_control(cseg, ucseg, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), - 0); - memset(wqebb_get(1U), 0, sizeof(mlx5_mkey_seg)); - - auto *params = wqebb_get(2U); - nvme_fill_static_params_transport_params(params, config); - store_current_wqe_prop(nullptr, SQ_CREDITS_UMR, tis); - 
ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS); - update_next_wqe_hot(); -} - -void qp_mgr_eth_mlx5::nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) -{ - auto *wqe = reinterpret_cast(m_sq_wqe_hot); - nvme_fill_progress_wqe(wqe, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), tcp_seqno, - MLX5_FENCE_MODE_INITIATOR_SMALL); - store_current_wqe_prop(nullptr, SQ_CREDITS_SET_PSV, tis); - ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS); - update_next_wqe_hot(); -} - -#if defined(DEFINED_UTLS) -void qp_mgr_eth_mlx5::ti_released(xlio_ti *ti) -{ - assert(ti->m_released); - assert(ti->m_ref == 0); - if (ti->m_type == xlio_ti::ti_type::TLS_TIS) { - put_tls_tis_in_cache(static_cast(ti)); - } else if (ti->m_type == xlio_ti::ti_type::TLS_TIR) { - put_tls_tir_in_cache(static_cast(ti)); - } -} - -void qp_mgr_eth_mlx5::put_tls_tis_in_cache(xlio_tis *tis) -{ - std::unique_ptr dek = tis->release_dek(); - assert(dynamic_cast(dek.get()) != nullptr); - - put_tls_dek(std::unique_ptr(dynamic_cast(dek.release()))); - m_tls_tis_cache.push_back(tis); -} - -void qp_mgr_eth_mlx5::put_tls_tir_in_cache(xlio_tir *tir) -{ - // Because the absense of TIR flush command, reusing a TIR - // may result in undefined behaviour. - // Until a flush command is available the TIR cache is disabled. - // Re-enabling TIR cache should also add destroy_tir_cache on ring cleanup. - // m_tls_tir_cache.push_back(tir); - - delete tir; -} -#endif /* defined(DEFINED_UTLS) */ - -void qp_mgr_eth_mlx5::post_nop_fence(void) -{ - struct mlx5_wqe *wqe = reinterpret_cast(m_sq_wqe_hot); - struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; - - memset(wqe, 0, sizeof(*wqe)); - - cseg->opmod_idx_opcode = htobe32(((m_sq_wqe_counter & 0xffff) << 8) | MLX5_OPCODE_NOP); - cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | 0x01); - cseg->fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL; - - store_current_wqe_prop(nullptr, SQ_CREDITS_NOP, NULL); - - ring_doorbell(MLX5_DB_METHOD_DB, 1); - - update_next_wqe_hot(); -} - -void qp_mgr_eth_mlx5::post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, - bool is_first) -{ - struct mlx5_dump_wqe *wqe = reinterpret_cast(m_sq_wqe_hot); - struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; - struct mlx5_wqe_data_seg *dseg = &wqe->data; - uint32_t tisn = tis ? tis->get_tisn() : 0; - uint16_t num_wqebbs = XLIO_DUMP_WQEBBS; - uint16_t ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; - - memset(wqe, 0, sizeof(*wqe)); - - cseg->opmod_idx_opcode = htobe32(((m_sq_wqe_counter & 0xffff) << 8) | XLIO_MLX5_OPCODE_DUMP); - cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | ds_cnt); - cseg->fm_ce_se = is_first ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0; - cseg->tis_tir_num = htobe32(tisn << 8); - - dseg->addr = htobe64((uintptr_t)addr); - dseg->lkey = htobe32(lkey); - dseg->byte_count = htobe32(len); - - store_current_wqe_prop(nullptr, SQ_CREDITS_DUMP, tis); - - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs, 0, true); - - update_next_wqe_hot(); -} - -//! Handle releasing of Tx buffers -// Single post send with SIGNAL of a dummy packet -// NOTE: Since the QP is in ERROR state no packets will be sent on the wire! 
-// So we can post_send anything we want :) -void qp_mgr_eth_mlx5::trigger_completion_for_all_sent_packets() -{ - qp_logfunc("unsignaled count=%d", m_n_unsignaled_count); - - if (!is_signal_requested_for_last_wqe()) { - // Post a dummy WQE and request a signal to complete all the unsignaled WQEs in SQ - qp_logdbg("Need to send closing tx wr..."); - mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(0, true, PBUF_RAM); - // Align Tx buffer accounting since we will be bypassing the normal send calls - m_p_ring->m_missing_buf_ref_count--; - if (!p_mem_buf_desc) { - qp_logerr("no buffer in pool"); - return; - } - - // Prepare dummy packet: zeroed payload ('0000'). - // For ETH it replaces the MAC header!! (Nothing is going on the wire, QP in error state) - /* need to send at least eth+ip, since libmlx5 will drop just eth header */ - ethhdr *p_buffer_ethhdr = (ethhdr *)p_mem_buf_desc->p_buffer; - memset(p_buffer_ethhdr, 0, sizeof(*p_buffer_ethhdr)); - p_buffer_ethhdr->h_proto = htons(ETH_P_IP); - iphdr *p_buffer_iphdr = (iphdr *)(p_mem_buf_desc->p_buffer + sizeof(*p_buffer_ethhdr)); - memset(p_buffer_iphdr, 0, sizeof(*p_buffer_iphdr)); - - ibv_sge sge[1]; - sge[0].length = sizeof(ethhdr) + sizeof(iphdr); - sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer); - sge[0].lkey = m_p_ring->m_tx_lkey; - - // Prepare send wr for (does not care if it is UD/IB or RAW/ETH) - // UD requires AH+qkey, RAW requires minimal payload instead of MAC header. - xlio_ibv_send_wr send_wr; - - memset(&send_wr, 0, sizeof(send_wr)); - send_wr.wr_id = (uintptr_t)p_mem_buf_desc; - send_wr.wr.ud.ah = NULL; - send_wr.sg_list = sge; - send_wr.num_sge = 1; - send_wr.next = NULL; - xlio_send_wr_opcode(send_wr) = XLIO_IBV_WR_SEND; - - unsigned credits = credits_calculate(&send_wr); - if (!credits_get(credits)) { - // TODO Wait for available space in SQ to post the WQE. This method mustn't fail, - // because we may want to wait until all the WQEs are completed and we need to post - // something and request signal. - qp_logdbg("No space in SQ to trigger completions with a post operation"); - return; - } - - send_to_wire(&send_wr, - (xlio_wr_tx_packet_attr)(XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM), - true, nullptr, credits); - } -} - -void qp_mgr_eth_mlx5::reset_inflight_zc_buffers_ctx(void *ctx) -{ - sq_wqe_prop *p = m_sq_wqe_prop_last; - sq_wqe_prop *prev; - if (p) { - unsigned p_i = p - m_sq_wqe_idx_to_prop; - if (p_i == m_sq_wqe_prop_last_signalled) { - return; - } - do { - mem_buf_desc_t *desc = p->buf; - if (desc && desc->tx.zc.ctx == ctx) { - desc->tx.zc.ctx = nullptr; - } - prev = p; - p = p->next; - } while (p && is_sq_wqe_prop_valid(p, prev)); - } -} - -#endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/core/dev/qp_mgr_eth_mlx5.h b/src/core/dev/qp_mgr_eth_mlx5.h deleted file mode 100644 index 1259f5074..000000000 --- a/src/core/dev/qp_mgr_eth_mlx5.h +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef QP_MGR_ETH_MLX5_H -#define QP_MGR_ETH_MLX5_H - -#include "qp_mgr.h" -#include "util/sg_array.h" -#include "dev/dm_mgr.h" -#include -#include - -#if defined(DEFINED_DIRECT_VERBS) - -#define qp_logpanic __log_info_panic -#define qp_logerr __log_info_err -#define qp_logwarn __log_info_warn -#define qp_loginfo __log_info_info -#define qp_logdbg __log_info_dbg -#define qp_logfunc __log_info_func -#define qp_logfuncall __log_info_funcall - -/* WQE properties description. */ -struct sq_wqe_prop { - /* A buffer held by the WQE. This is NULL for control WQEs. */ - mem_buf_desc_t *buf; - /* Number of credits (usually number of WQEBBs). */ - unsigned credits; - /* Transport interface (TIS/TIR) current WQE holds reference to. 
*/ - xlio_ti *ti; - struct sq_wqe_prop *next; -}; -typedef struct sq_wqe_prop sq_wqe_prop; - -class qp_mgr_eth_mlx5 : public qp_mgr { - friend class cq_mgr_rx; - friend class cq_mgr_rx_regrq; - friend class cq_mgr_tx; - -public: - qp_mgr_eth_mlx5(struct qp_mgr_desc *desc, const uint32_t tx_num_wr, const uint16_t vlan, - bool call_configure = true); - virtual ~qp_mgr_eth_mlx5(); - void up() override; - void down() override; - void post_recv_buffer( - mem_buf_desc_t *p_mem_buf_desc) override; // Post for receive single mem_buf_desc - xlio_ib_mlx5_qp_t m_mlx5_qp; - -#ifdef DEFINED_UTLS - xlio_tis *tls_context_setup_tx(const xlio_tls_info *info) override; - xlio_tir *tls_create_tir(bool cached) override; - int tls_context_setup_rx(xlio_tir *tir, const xlio_tls_info *info, uint32_t next_record_tcp_sn, - xlio_comp_cb_t callback, void *callback_arg) override; - void tls_context_resync_tx(const xlio_tls_info *info, xlio_tis *tis, bool skip_static) override; - void tls_resync_rx(xlio_tir *tir, const xlio_tls_info *info, - uint32_t hw_resync_tcp_sn) override; - void tls_get_progress_params_rx(xlio_tir *tir, void *buf, uint32_t lkey) override; - void tls_release_tis(xlio_tis *tis) override; - void tls_release_tir(xlio_tir *tir) override; - void tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, - bool first) override; -#endif /* DEFINED_UTLS */ - -#define DPCP_TIS_FLAGS (dpcp::TIS_ATTR_TRANSPORT_DOMAIN | dpcp::TIS_ATTR_PD) -#define DPCP_TIS_NVME_FLAG (dpcp::TIS_ATTR_NVMEOTCP) - std::unique_ptr create_tis(uint32_t flags) const override; - void nvme_set_static_context(xlio_tis *tis, uint32_t config) override; - void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) override; - - /* Get a memory inside a wqebb at a wqebb_num offset from the m_sq_wqe_hot and account for - * m_sq_wqe_counter wrap-around. Use offset_in_wqebb to for the internal address. Use the - * template parameter to cast the resulting address to the required pointer type */ - template - constexpr inline T wqebb_get(size_t wqebb_num, size_t offset_in_wqebb = 0U) - { - return reinterpret_cast( - reinterpret_cast( - &(*m_sq_wqes)[(m_sq_wqe_counter + wqebb_num) & (m_tx_num_wr - 1)]) + - offset_in_wqebb); - } - - void post_nop_fence(void) override; - void post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_t lkey, bool first) override; - -#if defined(DEFINED_UTLS) - std::unique_ptr get_new_tls_dek(const void *key, uint32_t key_size_bytes); - std::unique_ptr get_tls_dek(const void *key, uint32_t key_size_bytes); - void put_tls_dek(std::unique_ptr &&dek_obj); -#endif - - void reset_inflight_zc_buffers_ctx(void *ctx) override; - // TODO Make credits API inline. 
- bool credits_get(unsigned credits) override - { - if (m_sq_free_credits >= credits) { - m_sq_free_credits -= credits; - return true; - } - return false; - } - void credits_return(unsigned credits) override { m_sq_free_credits += credits; } - -protected: - void post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc); - void trigger_completion_for_all_sent_packets() override; - bool init_rx_cq_mgr_prepare(); - void init_qp(); - void init_device_memory(); - cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) override; - cq_mgr_tx *init_tx_cq_mgr(void) override; - - void put_tls_tir_in_cache(xlio_tir *tir); - void put_tls_tis_in_cache(xlio_tis *tis); - void ti_released(xlio_ti *ti); - - virtual bool is_rq_empty() const override { return (m_mlx5_qp.rq.head == m_mlx5_qp.rq.tail); } - - inline bool is_sq_wqe_prop_valid(sq_wqe_prop *p, sq_wqe_prop *prev) - { - unsigned p_i = p - m_sq_wqe_idx_to_prop; - unsigned prev_i = prev - m_sq_wqe_idx_to_prop; - return (p_i != m_sq_wqe_prop_last_signalled) && - ((m_tx_num_wr + p_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr < - (m_tx_num_wr + prev_i - m_sq_wqe_prop_last_signalled) % m_tx_num_wr); - } - - sq_wqe_prop *m_sq_wqe_idx_to_prop; - sq_wqe_prop *m_sq_wqe_prop_last; - unsigned m_sq_wqe_prop_last_signalled; - unsigned m_sq_free_credits; - uint64_t m_rq_wqe_counter; - -private: - void update_next_wqe_hot(); - - bool is_completion_need() override - { - return !m_n_unsignaled_count || (m_dm_enabled && m_dm_mgr.is_completion_need()); - }; - void dm_release_data(mem_buf_desc_t *buff) override { m_dm_mgr.release_data(buff); } - - int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, - xlio_tis *tis, unsigned credits) override; - inline int fill_wqe(xlio_ibv_send_wr *p_send_wqe); - inline void store_current_wqe_prop(mem_buf_desc_t *wr_id, unsigned credits, xlio_ti *ti); - void destroy_tis_cache(void); -#if defined(DEFINED_UTLS) - inline void tls_fill_static_params_wqe(struct mlx5_wqe_tls_static_params_seg *params, - const struct xlio_tls_info *info, uint32_t key_id, - uint32_t resync_tcp_sn); - inline void tls_post_static_params_wqe(xlio_ti *ti, const struct xlio_tls_info *info, - uint32_t tis_tir_number, uint32_t key_id, - uint32_t resync_tcp_sn, bool fence, bool is_tx); - inline void tls_fill_progress_params_wqe(struct mlx5_wqe_tls_progress_params_seg *params, - uint32_t tis_tir_number, uint32_t next_record_tcp_sn); - inline void tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_tir_number, - uint32_t next_record_tcp_sn, bool fence, bool is_tx); - inline void tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void *buf, uint32_t lkey); - -protected: - dpcp::tir *xlio_tir_to_dpcp_tir(xlio_tir *tir); - virtual dpcp::tir *create_tir(bool is_tls = false) - { - NOT_IN_USE(is_tls); - return NULL; - } - -private: -#endif /* DEFINED_UTLS */ - inline int fill_wqe_send(xlio_ibv_send_wr *pswr); - inline int fill_wqe_lso(xlio_ibv_send_wr *pswr); - inline void ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top = 0, - bool skip_comp = false); - inline int fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, - int max_inline_len, int inline_len); - - struct mlx5_eth_wqe (*m_sq_wqes)[]; - struct mlx5_eth_wqe *m_sq_wqe_hot; - uint8_t *m_sq_wqes_end; - enum { MLX5_DB_METHOD_BF, MLX5_DB_METHOD_DB } m_db_method; - - int m_sq_wqe_hot_index; - uint16_t m_sq_wqe_counter; - - bool m_b_fence_needed; - - bool m_dm_enabled; - dm_mgr m_dm_mgr; - /* - * TIS cache. 
Protected by ring tx lock. - * TODO Move to ring. - */ - std::vector m_tls_tis_cache; - std::vector m_tls_tir_cache; - -#if defined(DEFINED_UTLS) - std::list> m_tls_dek_get_cache; - std::list> m_tls_dek_put_cache; -#endif -}; -#endif // defined(DEFINED_DIRECT_VERBS) -#endif // QP_MGR_ETH_MLX5_H diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp b/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp deleted file mode 100644 index ae2af8d1d..000000000 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.cpp +++ /dev/null @@ -1,347 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "qp_mgr_eth_mlx5_dpcp.h" - -#include -#include "ring_simple.h" -#include "rfs_rule_dpcp.h" -#include "cq_mgr_rx_strq.h" - -#define MODULE_NAME "qp_mgr_eth_mlx5_dpcp" - -#define qp_logpanic __log_info_panic -#define qp_logerr __log_info_err -#define qp_logwarn __log_info_warn -#define qp_loginfo __log_info_info -#define qp_logdbg __log_info_dbg -#define qp_logfunc __log_info_func -#define qp_logfuncall __log_info_funcall - -qp_mgr_eth_mlx5_dpcp::qp_mgr_eth_mlx5_dpcp(struct qp_mgr_desc *desc, uint32_t tx_num_wr, - uint16_t vlan) - : qp_mgr(desc, tx_num_wr, vlan, false) -{ - if (configure(desc)) { - throw_xlio_exception("Failed creating qp_mgr_eth_mlx5_dpcp"); - } - - if (!configure_rq_dpcp()) { - throw_xlio_exception("Failed to create qp_mgr_eth_mlx5_dpcp"); - } -} - -bool qp_mgr_eth_mlx5_dpcp::configure_rq_dpcp() -{ - qp_logdbg("Creating RQ of transport type '%s' on ibv device '%s' [%p] on port %d", - priv_xlio_transport_type_str(m_p_ring->get_transport_type()), - m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), m_port_num); - - m_mlx5_qp.cap.max_recv_wr = m_rx_num_wr; - - qp_logdbg("Requested RQ parameters: wre: rx = %d sge: rx = %d", m_mlx5_qp.cap.max_recv_wr, - m_mlx5_qp.cap.max_recv_sge); - - xlio_ib_mlx5_cq_t mlx5_cq; - memset(&mlx5_cq, 0, sizeof(mlx5_cq)); - xlio_ib_mlx5_get_cq(m_p_cq_mgr_rx->get_ibv_cq_hndl(), &mlx5_cq); - - qp_logdbg("Configuring dpcp RQ, cq-rx: %p, cqn-rx: %u", m_p_cq_mgr_rx, - static_cast(mlx5_cq.cq_num)); - - if (safe_mce_sys().enable_striding_rq) { - m_mlx5_qp.cap.max_recv_sge = 2U; // Striding-RQ needs a reserved segment. 
- _strq_wqe_reserved_seg = 1U; - - delete[] m_ibv_rx_sg_array; - m_ibv_rx_sg_array = - new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv * m_mlx5_qp.cap.max_recv_sge]; - for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) { - m_ibv_rx_wr_array[wr_idx].sg_list = - &m_ibv_rx_sg_array[wr_idx * m_mlx5_qp.cap.max_recv_sge]; - m_ibv_rx_wr_array[wr_idx].num_sge = m_mlx5_qp.cap.max_recv_sge; - memset(m_ibv_rx_wr_array[wr_idx].sg_list, 0, sizeof(ibv_sge)); - m_ibv_rx_wr_array[wr_idx].sg_list[0].length = - 1U; // To bypass a check inside xlio_ib_mlx5_post_recv. - } - } - - // Create the QP - if (!prepare_rq(mlx5_cq.cq_num)) { - return false; - } - - return true; -} - -bool qp_mgr_eth_mlx5_dpcp::prepare_rq(uint32_t cqn) -{ - qp_logdbg(""); - - dpcp::adapter *dpcp_adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); - if (!dpcp_adapter) { - qp_logerr("Failed to get dpcp::adapter for prepare_rq"); - return false; - } - - // user_index Unused. - dpcp::rq_attr rqattrs; - memset(&rqattrs, 0, sizeof(rqattrs)); - rqattrs.cqn = cqn; - rqattrs.wqe_num = m_mlx5_qp.cap.max_recv_wr; - rqattrs.wqe_sz = m_mlx5_qp.cap.max_recv_sge; - - if (safe_mce_sys().hw_ts_conversion_mode == TS_CONVERSION_MODE_RTC) { - qp_logdbg("Enabled RTC timestamp format for RQ"); - rqattrs.ts_format = dpcp::rq_ts_format::RQ_TS_REAL_TIME; - } - - std::unique_ptr new_rq; - dpcp::status rc = dpcp::DPCP_OK; - - if (safe_mce_sys().enable_striding_rq) { - rqattrs.buf_stride_sz = safe_mce_sys().strq_stride_size_bytes; - rqattrs.buf_stride_num = safe_mce_sys().strq_stride_num_per_rwqe; - - // Striding-RQ WQE format is as of Shared-RQ (PRM, page 381, wq_type). - // In this case the WQE minimum size is 2 * 16, and the first segment is reserved. - rqattrs.wqe_sz = m_mlx5_qp.cap.max_recv_sge * 16U; - - dpcp::striding_rq *new_rq_ptr = nullptr; - rc = dpcp_adapter->create_striding_rq(rqattrs, new_rq_ptr); - new_rq.reset(new_rq_ptr); - } else { - dpcp::regular_rq *new_rq_ptr = nullptr; - rc = dpcp_adapter->create_regular_rq(rqattrs, new_rq_ptr); - new_rq.reset(new_rq_ptr); - } - - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to create dpcp rq, rc: %d, cqn: %" PRIu32, static_cast(rc), cqn); - return false; - } - - if (!store_rq_mlx5_params(*new_rq)) { - qp_logerr( - "Failed to retrieve initial DPCP RQ parameters, rc: %d, basic_rq: %p, cqn: %" PRIu32, - static_cast(rc), new_rq.get(), cqn); - return false; - } - - _rq = std::move(new_rq); - - // At this stage there is no TIR associated with the RQ, So it mimics QP INIT state. - // At RDY state without a TIR, Work Requests can be submitted to the RQ. 
- modify_rq_to_ready_state(); - - qp_logdbg("Succeeded to create dpcp rq, rqn: %" PRIu32 ", cqn: %" PRIu32, m_mlx5_qp.rqn, cqn); - - return true; -} - -bool qp_mgr_eth_mlx5_dpcp::store_rq_mlx5_params(dpcp::basic_rq &new_rq) -{ - uint32_t *dbrec_tmp = nullptr; - dpcp::status rc = new_rq.get_dbrec(dbrec_tmp); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to retrieve dbrec of dpcp rq, rc: %d, basic_rq: %p", static_cast(rc), - &new_rq); - return false; - } - m_mlx5_qp.rq.dbrec = dbrec_tmp; - - rc = new_rq.get_wq_buf(m_mlx5_qp.rq.buf); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to retrieve wq-buf of dpcp rq, rc: %d, basic_rq: %p", - static_cast(rc), &new_rq); - return false; - } - - rc = new_rq.get_id(m_mlx5_qp.rqn); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to retrieve rqn of dpcp rq, rc: %d, basic_rq: %p", static_cast(rc), - &new_rq); - return false; - } - - new_rq.get_wqe_num(m_mlx5_qp.rq.wqe_cnt); - new_rq.get_wq_stride_sz(m_mlx5_qp.rq.stride); - if (safe_mce_sys().enable_striding_rq) { - m_mlx5_qp.rq.stride /= 16U; - } - - m_mlx5_qp.rq.wqe_shift = ilog_2(m_mlx5_qp.rq.stride); - m_mlx5_qp.rq.head = 0; - m_mlx5_qp.rq.tail = 0; - m_mlx5_qp.tirn = 0U; - - return true; -} - -void qp_mgr_eth_mlx5_dpcp::init_tir_rq() -{ - _tir.reset(create_tir()); - if (!_tir) { - qp_logpanic("TIR creation for qp_mgr_eth_mlx5_dpcp failed (errno=%d %m)", errno); - } -} - -void qp_mgr_eth_mlx5_dpcp::up() -{ - qp_mgr::init_qp(); - init_tir_rq(); - qp_mgr::up(); - init_device_memory(); -} - -void qp_mgr_eth_mlx5_dpcp::down() -{ - _tir.reset(nullptr); - - qp_mgr::down(); -} - -rfs_rule *qp_mgr_eth_mlx5_dpcp::create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext) -{ - // TODO Remove copypaste. -#ifdef DEFINED_UTLS - if (tir_ext && m_p_ib_ctx_handler && m_p_ib_ctx_handler->get_dpcp_adapter()) { - std::unique_ptr new_rule(new rfs_rule_dpcp()); - if (new_rule->create(attrs, *xlio_tir_to_dpcp_tir(tir_ext), - *m_p_ib_ctx_handler->get_dpcp_adapter())) { - return new_rule.release(); - } - } else -#endif /* DEFINED_UTLS */ - if (_tir && m_p_ib_ctx_handler && m_p_ib_ctx_handler->get_dpcp_adapter()) { - std::unique_ptr new_rule(new rfs_rule_dpcp()); - if (new_rule->create(attrs, *_tir, *m_p_ib_ctx_handler->get_dpcp_adapter())) { - return new_rule.release(); - } - } - - NOT_IN_USE(tir_ext); - return nullptr; -} - -void qp_mgr_eth_mlx5_dpcp::modify_qp_to_ready_state() -{ - qp_mgr::modify_qp_to_ready_state(); - modify_rq_to_ready_state(); -} - -void qp_mgr_eth_mlx5_dpcp::modify_qp_to_error_state() -{ - m_p_cq_mgr_rx->clean_cq(); - - qp_mgr::modify_qp_to_error_state(); - - dpcp::status rc = _rq->modify_state(dpcp::RQ_ERR); - - /* During plugout theres is possibility that kernel - * remove device resources before working process complete - * removing process. As a result ibv api function can - * return EIO=5 errno code. - */ - if (dpcp::DPCP_OK != rc && errno != EIO) { - qp_logerr("Failed to modify rq state to ERR, rc: %d, rqn: %" PRIu32, static_cast(rc), - m_mlx5_qp.rqn); - } -} - -void qp_mgr_eth_mlx5_dpcp::modify_rq_to_ready_state() -{ - dpcp::status rc = _rq->modify_state(dpcp::RQ_RDY); - if (dpcp::DPCP_OK != rc) { - qp_logerr("Failed to modify rq state to RDY, rc: %d, rqn: %" PRIu32, static_cast(rc), - m_mlx5_qp.rqn); - } -} - -cq_mgr_rx *qp_mgr_eth_mlx5_dpcp::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) -{ - if (unlikely(!safe_mce_sys().enable_striding_rq)) { - return qp_mgr::init_rx_cq_mgr(p_rx_comp_event_channel); - } - - return (!init_rx_cq_mgr_prepare() - ? 
nullptr - : new cq_mgr_rx_strq(m_p_ring, m_p_ib_ctx_handler, - safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, - safe_mce_sys().strq_stride_size_bytes, - safe_mce_sys().strq_stride_num_per_rwqe, - p_rx_comp_event_channel)); -} - -void qp_mgr_eth_mlx5_dpcp::post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) -{ - uint32_t index = (m_curr_rx_wr * m_mlx5_qp.cap.max_recv_sge) + _strq_wqe_reserved_seg; - m_ibv_rx_sg_array[index].addr = (uintptr_t)p_mem_buf_desc->p_buffer; - m_ibv_rx_sg_array[index].length = p_mem_buf_desc->sz_buffer; - m_ibv_rx_sg_array[index].lkey = p_mem_buf_desc->lkey; - - post_recv_buffer_rq(p_mem_buf_desc); -} - -dpcp::tir *qp_mgr_eth_mlx5_dpcp::create_tir(bool is_tls /*=false*/) -{ - dpcp::tir *tir_obj = nullptr; - dpcp::status status = dpcp::DPCP_OK; - dpcp::tir::attr tir_attr; - - memset(&tir_attr, 0, sizeof(tir_attr)); - tir_attr.flags = dpcp::TIR_ATTR_INLINE_RQN | dpcp::TIR_ATTR_TRANSPORT_DOMAIN; - tir_attr.inline_rqn = m_mlx5_qp.rqn; - tir_attr.transport_domain = m_p_ib_ctx_handler->get_dpcp_adapter()->get_td(); - - if (m_p_ring->m_lro.cap && m_p_ring->m_lro.max_payload_sz) { - tir_attr.flags |= dpcp::TIR_ATTR_LRO; - tir_attr.lro.timeout_period_usecs = XLIO_MLX5_PARAMS_LRO_TIMEOUT; - tir_attr.lro.enable_mask = 3; // Bitmask for IPv4 and IPv6 support - tir_attr.lro.max_msg_sz = m_p_ring->m_lro.max_payload_sz >> 8; - } - - if (is_tls) { - tir_attr.flags |= dpcp::TIR_ATTR_TLS; - tir_attr.tls_en = 1; - } - - status = m_p_ib_ctx_handler->get_dpcp_adapter()->create_tir(tir_attr, tir_obj); - - if (dpcp::DPCP_OK != status) { - qp_logerr("Failed creating dpcp tir with flags=0x%x status=%d", tir_attr.flags, status); - return nullptr; - } - - qp_logdbg("TIR: %p created", tir_obj); - - return tir_obj; -} diff --git a/src/core/dev/qp_mgr_eth_mlx5_dpcp.h b/src/core/dev/qp_mgr_eth_mlx5_dpcp.h deleted file mode 100644 index 5eed174eb..000000000 --- a/src/core/dev/qp_mgr_eth_mlx5_dpcp.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef QP_MGR_ETH_MLX5_DPCP_H -#define QP_MGR_ETH_MLX5_DPCP_H - -#include - -#include -#include -#include "dev/qp_mgr.h" - -class qp_mgr_eth_mlx5_dpcp : public qp_mgr { -public: - qp_mgr_eth_mlx5_dpcp(struct qp_mgr_desc *desc, uint32_t tx_num_wr, uint16_t vlan); - - virtual ~qp_mgr_eth_mlx5_dpcp() override {} - - virtual void up() override; - virtual void down() override; - - virtual rfs_rule *create_rfs_rule(xlio_ibv_flow_attr &attrs, xlio_tir *tir_ext) override; - virtual void modify_qp_to_ready_state() override; - virtual void modify_qp_to_error_state() override; - virtual void post_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) override; - -protected: - virtual cq_mgr_rx *init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) override; - -private: -#ifdef DEFINED_UTLS - // TODO: Move UTLS related code to this class and remove qp_mgr::create_tir() - dpcp::tir *create_tir(bool is_tls = false) override; -#else - dpcp::tir *create_tir(bool is_tls = false); -#endif - bool configure_rq_dpcp(); - bool prepare_rq(uint32_t cqn); - bool store_rq_mlx5_params(dpcp::basic_rq &new_rq); - void modify_rq_to_ready_state(); - void init_tir_rq(); - - std::unique_ptr _tir = {nullptr}; - std::unique_ptr _rq = {nullptr}; - uint32_t _strq_wqe_reserved_seg = 0U; -}; - -#endif diff --git a/src/core/dev/rfs.cpp b/src/core/dev/rfs.cpp index 35e8ba97f..e791664d2 100644 --- a/src/core/dev/rfs.cpp +++ b/src/core/dev/rfs.cpp @@ -128,6 +128,7 @@ rfs::rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_fil uint32_t flow_tag_id /*=0*/) : m_flow_tuple(rule_filter ? rule_filter->m_flow_tuple : *flow_spec_5t) , m_p_ring(p_ring) + , m_p_ring_simple(dynamic_cast(p_ring)) , m_p_rule_filter(rule_filter) , m_n_sinks_list_entries(0) , m_n_sinks_list_max_length(RFS_SINKS_LIST_DEFAULT_LEN) @@ -320,7 +321,11 @@ bool rfs::detach_flow(pkt_rcvr_sink *sink) rfs_rule *rfs::create_rule(xlio_tir *tir, const flow_tuple &flow_spec) { - auto *hqrx = dynamic_cast(m_p_ring)->m_hqrx; + if (!m_p_ring_simple) { + rfs_logpanic("Incompatible ring type"); + } + + auto *hqrx = m_p_ring_simple->m_hqrx; dpcp::match_params match_value_tmp; dpcp::match_params match_mask_tmp; @@ -352,8 +357,12 @@ rfs_rule *rfs::create_rule(xlio_tir *tir, const flow_tuple &flow_spec) bool rfs::create_flow() { - m_rfs_flow = dynamic_cast(m_p_ring)->m_hqrx->create_rfs_rule( - m_match_value, m_match_mask, m_priority, m_flow_tag_id, nullptr); + if (!m_p_ring_simple) { + rfs_logpanic("Incompatible ring type"); + } + + m_rfs_flow = m_p_ring_simple->m_hqrx->create_rfs_rule(m_match_value, m_match_mask, m_priority, + m_flow_tag_id, nullptr); if (!m_rfs_flow) { rfs_logerr("Create RFS flow failed, Tag: %" PRIu32 ", Flow: %s, Priority: %" PRIu16 ", errno: %d - %m", @@ -389,14 +398,12 @@ bool rfs::destroy_flow() void rfs::prepare_flow_spec_eth_ip(const ip_address &dst_ip, const ip_address &src_ip) { - ring_simple *p_ring = dynamic_cast(m_p_ring); - - if (!p_ring) { + if (!m_p_ring_simple) { rfs_logpanic("Incompatible ring type"); } - m_match_value.vlan_id = p_ring->m_hqrx->get_vlan() & VLAN_VID_MASK; - m_match_mask.vlan_id = (p_ring->m_hqrx->get_vlan() ? VLAN_VID_MASK : 0); + m_match_value.vlan_id = m_p_ring_simple->m_hqrx->get_vlan() & VLAN_VID_MASK; + m_match_mask.vlan_id = (m_p_ring_simple->m_hqrx->get_vlan() ? 
VLAN_VID_MASK : 0); bool is_ipv4 = (m_flow_tuple.get_family() == AF_INET); if (is_ipv4) { diff --git a/src/core/dev/rfs.h b/src/core/dev/rfs.h index 7b631ba40..1eddd922c 100644 --- a/src/core/dev/rfs.h +++ b/src/core/dev/rfs.h @@ -106,6 +106,7 @@ class rfs { protected: flow_tuple m_flow_tuple; ring_slave *m_p_ring; + ring_simple *m_p_ring_simple; rfs_rule_filter *m_p_rule_filter; rfs_rule *m_rfs_flow = nullptr; pkt_rcvr_sink **m_sinks_list; diff --git a/src/core/dev/rfs_rule.cpp b/src/core/dev/rfs_rule.cpp index 80b908ead..da5199a49 100644 --- a/src/core/dev/rfs_rule.cpp +++ b/src/core/dev/rfs_rule.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/src/core/dev/rfs_rule_dpcp.cpp b/src/core/dev/rfs_rule_dpcp.cpp deleted file mode 100644 index 5abf6f9ba..000000000 --- a/src/core/dev/rfs_rule_dpcp.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "dev/rfs_rule_dpcp.h" - -#if defined(DEFINED_DPCP) - -#include -#include "dev/rfs.h" - -#define MODULE_NAME "rfs_rule_dpcp" - -rfs_rule_dpcp::~rfs_rule_dpcp() -{ -} - -bool rfs_rule_dpcp::create(const xlio_ibv_flow_attr &attrs, dpcp::tir &in_tir, - dpcp::adapter &in_adapter) -{ - const ibv_flow_attr_eth &attrs_eth(reinterpret_cast(attrs)); - dpcp::match_params mp; - dpcp::match_params match_msk; - - memset(&mp, 0, sizeof(mp)); - memset(&match_msk, 0, sizeof(match_msk)); - - memset(&match_msk.dst_mac, 0xFF, sizeof(match_msk.dst_mac)); - memcpy(&mp.dst_mac, attrs_eth.eth.val.dst_mac, - min(sizeof(mp.dst_mac), sizeof(attrs_eth.eth.val.dst_mac))); - - match_msk.ethertype = htons(attrs_eth.eth.mask.ether_type); - mp.ethertype = htons(attrs_eth.eth.val.ether_type); - match_msk.vlan_id = ntohs(attrs_eth.eth.mask.vlan_tag); - mp.vlan_id = ntohs(attrs_eth.eth.val.vlan_tag); - - const xlio_ibv_flow_spec_tcp_udp *p_tcp_udp = nullptr; - const xlio_ibv_flow_spec_action_tag *p_flow_tag = nullptr; - - if (attrs_eth.eth.val.ether_type == htons(ETH_P_IP)) { - const auto &attrs_tcpudp( - reinterpret_cast< - const attach_flow_data_eth_ipv4_tcp_udp_t::ibv_flow_attr_eth_ip_tcp_udp &>(attrs)); - - p_tcp_udp = &(attrs_tcpudp.tcp_udp); - p_flow_tag = &(attrs_tcpudp.flow_tag); - - match_msk.dst.ipv4 = ntohl(attrs_tcpudp.ip.mask.dst_ip); - mp.dst.ipv4 = ntohl(attrs_tcpudp.ip.val.dst_ip); - match_msk.src.ipv4 = ntohl(attrs_tcpudp.ip.mask.src_ip); - mp.src.ipv4 = ntohl(attrs_tcpudp.ip.val.src_ip); - mp.ip_version = 4U; - } else { - const auto &attrs_tcpudp( - reinterpret_cast< - const attach_flow_data_eth_ipv6_tcp_udp_t::ibv_flow_attr_eth_ip_tcp_udp &>(attrs)); - - p_tcp_udp = &(attrs_tcpudp.tcp_udp); - p_flow_tag = &(attrs_tcpudp.flow_tag); - - memcpy(match_msk.dst.ipv6, attrs_tcpudp.ip.mask.dst_ip, sizeof(match_msk.dst.ipv6)); - memcpy(mp.dst.ipv6, attrs_tcpudp.ip.val.dst_ip, sizeof(mp.dst.ipv6)); - memcpy(match_msk.src.ipv6, attrs_tcpudp.ip.mask.src_ip, sizeof(match_msk.src.ipv6)); - memcpy(mp.src.ipv6, attrs_tcpudp.ip.val.src_ip, sizeof(mp.src.ipv6)); - mp.ip_version = 6U; - } - - match_msk.dst_port = ntohs(p_tcp_udp->mask.dst_port); - mp.dst_port = ntohs(p_tcp_udp->val.dst_port); - match_msk.src_port = ntohs(p_tcp_udp->mask.src_port); - mp.src_port = ntohs(p_tcp_udp->val.src_port); - match_msk.protocol = 0xFF; - mp.protocol = (p_tcp_udp->type == XLIO_IBV_FLOW_SPEC_TCP ? 
IPPROTO_TCP : IPPROTO_UDP); - match_msk.ip_version = 0xF; - - dpcp::flow_rule *new_rule = nullptr; - dpcp::status status_out = in_adapter.create_flow_rule(attrs.priority, match_msk, new_rule); - if (status_out != dpcp::DPCP_OK) { - rfs_logerr( - "Failed dpcp_adpater::create_flow_rule(), Type: %u, Priority %" PRIu16 ", Status: %d", - static_cast(attrs.type), attrs.priority, static_cast(status_out)); - return false; - } - - rfs_logdbg("Succeeded dpcp_adpater::create_flow_rule(), Type: %u, Priority %" PRIu16 - ", rfs_rule_dpcp %p, dpcp_flow: %p", - static_cast(attrs.type), attrs.priority, this, new_rule); - - _dpcp_flow.reset(new_rule); - - status_out = _dpcp_flow->set_match_value(mp); - if (status_out != dpcp::DPCP_OK) { - rfs_logerr("Failed dpcp_flow_rule::set_match_value(), Status: %d, dpcp_flow: %p", - static_cast(status_out), new_rule); - return false; - } - - status_out = _dpcp_flow->add_dest_tir(&in_tir); - if (status_out != dpcp::DPCP_OK) { - rfs_logerr("Failed dpcp_flow_rule::add_dest_tir(), Status: %d, dpcp_flow: %p", - static_cast(status_out), new_rule); - return false; - } - - uint32_t tirn = 0U; - in_tir.get_id(tirn); - rfs_logdbg("Added dpcp_flow_rule::add_dest_tir() TIR %" PRIu32 ", dpcp_flow: %p", tirn, - new_rule); - - if (p_flow_tag->type == XLIO_IBV_FLOW_SPEC_ACTION_TAG) { - rfs_logdbg("Setting flow tag dpcp_adpater::set_flow_id(), Tag: %" PRIu32 ", dpcp_flow: %p", - p_flow_tag->tag_id, new_rule); - - status_out = _dpcp_flow->set_flow_id(p_flow_tag->tag_id); - if (status_out != dpcp::DPCP_OK) { - rfs_logerr("Failed dpcp_flow_rule::set_flow_id(), Status: %d, dpcp_flow: %p", - static_cast(status_out), new_rule); - return false; - } - } - - status_out = _dpcp_flow->apply_settings(); - if (status_out != dpcp::DPCP_OK) { - rfs_logerr("Failed dpcp_flow_rule::apply_settings(), Status: %d, dpcp_flow: %p", - static_cast(status_out), new_rule); - return false; - } - - return true; -} - -#endif // defined(DEFINED_DPCP) diff --git a/src/core/dev/rfs_rule_dpcp.h b/src/core/dev/rfs_rule_dpcp.h deleted file mode 100644 index 8443b5bea..000000000 --- a/src/core/dev/rfs_rule_dpcp.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef RFS_RULE_DPCP_H -#define RFS_RULE_DPCP_H - -#include - -#if defined(DEFINED_DPCP) - -#include -#include "util/utils.h" -#include "ib/base/verbs_extra.h" -#include "dev/rfs_rule.h" -#include - -using namespace std; - -class rfs_rule_dpcp : public rfs_rule { -public: - virtual ~rfs_rule_dpcp(); - - bool create(const xlio_ibv_flow_attr &attrs, dpcp::tir &in_tir, dpcp::adapter &in_adapter); - -private: - unique_ptr _dpcp_flow; -}; - -#endif // defined(DEFINED_DPCP) - -#endif diff --git a/src/core/dev/rfs_rule_ibv.cpp b/src/core/dev/rfs_rule_ibv.cpp deleted file mode 100644 index f2f343b8c..000000000 --- a/src/core/dev/rfs_rule_ibv.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include "dev/rfs_rule_ibv.h" - -#define MODULE_NAME "rfs_rule_ibv" - -rfs_rule_ibv::~rfs_rule_ibv() -{ -} - -bool rfs_rule_ibv::create(xlio_ibv_flow_attr &attrs, ibv_qp *qp) -{ - _ibv_flow.reset(xlio_ibv_create_flow(qp, &attrs)); - if (_ibv_flow != nullptr) { - rfs_logdbg("Succeeded xlio_ibv_create_flow, Type: %u, Priority %" PRIu16 - ", rfs_rule_ibv: %p, ibv_flow: %p", - static_cast(attrs.type), attrs.priority, this, _ibv_flow.get()); - return true; - } - - rfs_logerr("Failed xlio_ibv_create_flow, Type: %u, Priority %" PRIu16, - static_cast(attrs.type), attrs.priority); - return false; -} - -void rfs_rule_ibv::destory_ibv_flow(xlio_ibv_flow *flow) -{ - IF_VERBS_FAILURE_EX(xlio_ibv_destroy_flow(flow), EIO) - { - __log_err("Failed xlio_ibv_destroy_flow, ibv_flow: %p", flow); - } - else { __log_dbg("Success xlio_ibv_destroy_flow, ibv_flow: %p", flow); } - ENDIF_VERBS_FAILURE; -} diff --git a/src/core/dev/rfs_rule_ibv.h b/src/core/dev/rfs_rule_ibv.h deleted file mode 100644 index 1be41454a..000000000 --- a/src/core/dev/rfs_rule_ibv.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef RFS_RULE_IBV_H -#define RFS_RULE_IBV_H - -#include -#include "util/utils.h" -#include "ib/base/verbs_extra.h" -#include "dev/rfs_rule.h" - -using namespace std; - -template using deleter_func = void (*)(T *); - -template using unique_ptr_delfunc = std::unique_ptr>; - -class rfs_rule_ibv : public rfs_rule { -public: - virtual ~rfs_rule_ibv(); - - bool create(xlio_ibv_flow_attr &attrs, ibv_qp *qp); - -private: - static void destory_ibv_flow(xlio_ibv_flow *flow); - - unique_ptr_delfunc _ibv_flow {nullptr, destory_ibv_flow}; -}; - -#endif diff --git a/src/core/dev/rfs_uc.cpp b/src/core/dev/rfs_uc.cpp index 67de9a958..8642fd4cd 100644 --- a/src/core/dev/rfs_uc.cpp +++ b/src/core/dev/rfs_uc.cpp @@ -65,12 +65,15 @@ rfs_uc::rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *ru void rfs_uc::prepare_flow_spec() { + if (!m_p_ring_simple) { + rfs_logpanic("Incompatible ring type"); + } + prepare_flow_spec_eth_ip(m_flow_tuple.get_dst_ip(), m_flow_tuple.get_src_ip()); prepare_flow_spec_tcp_udp(); memset(&m_match_mask.dst_mac, 0xFF, sizeof(m_match_mask.dst_mac)); - memcpy(&m_match_value.dst_mac, - dynamic_cast(m_p_ring)->m_p_l2_addr->get_address(), + memcpy(&m_match_value.dst_mac, m_p_ring_simple->m_p_l2_addr->get_address(), sizeof(m_match_value.dst_mac)); if (m_flow_tuple.get_src_port() || !m_flow_tuple.get_src_ip().is_anyaddr()) { @@ -94,14 +97,13 @@ void rfs_uc::prepare_flow_spec() src_port = g_p_app->get_worker_id(); } - m_match_mask.src_port = static_cast( - (g_p_app->workers_pow2 * g_p_app->src_port_stride) - 2); - m_match_value.src_port = - static_cast(src_port * g_p_app->src_port_stride); + m_match_mask.src_port = + static_cast((g_p_app->workers_pow2 * g_p_app->src_port_stride) - 2); + m_match_value.src_port = static_cast(src_port * g_p_app->src_port_stride); m_priority = 1; - rfs_logdbg("src_port_stride: %d workers_num %d \n", - g_p_app->src_port_stride, g_p_app->workers_num); + rfs_logdbg("src_port_stride: %d workers_num %d \n", g_p_app->src_port_stride, + g_p_app->workers_num); rfs_logdbg("sp_tcp_udp->val.src_port: %d p_tcp_udp->mask.src_port %d \n", m_match_value.src_port, m_match_mask.src_port); @@ -110,7 +112,8 @@ void rfs_uc::prepare_flow_spec() } #endif - rfs_logfunc("Transport type: %d, flow_tag_id: %d", p_ring->get_transport_type(), 
m_flow_tag_id); + rfs_logfunc("Transport type: %d, flow_tag_id: %d", m_p_ring_simple->get_transport_type(), + m_flow_tag_id); } bool rfs_uc::rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array) diff --git a/src/core/dev/ring_bond.cpp b/src/core/dev/ring_bond.cpp index 4bad26388..b8fcef275 100644 --- a/src/core/dev/ring_bond.cpp +++ b/src/core/dev/ring_bond.cpp @@ -292,13 +292,16 @@ void ring_bond::restart() ring_logdbg("ring %d active", i); if (slaves[j]->lag_tx_port_affinity != 1) { tmp_ring->start_active_queue_tx(); + /* coverity[sleep] */ tmp_ring->start_active_queue_rx(); } m_bond_rings[i]->m_active = true; } else { ring_logdbg("ring %d not active", i); if (slaves[j]->lag_tx_port_affinity != 1) { + /* coverity[sleep] */ tmp_ring->stop_active_queue_tx(); + /* coverity[sleep] */ tmp_ring->stop_active_queue_rx(); } m_bond_rings[i]->m_active = false; diff --git a/src/core/dev/xlio_ti.h b/src/core/dev/xlio_ti.h index 6ca0fd44b..e977c87de 100644 --- a/src/core/dev/xlio_ti.h +++ b/src/core/dev/xlio_ti.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/src/core/main.cpp b/src/core/main.cpp index e51b308e5..49b9ec099 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -807,12 +807,14 @@ void print_xlio_global_settings() SYS_VAR_UTLS_RX, safe_mce_sys().enable_utls_rx ? "Enabled " : "Disabled"); VLOG_PARAM_STRING("UTLS TX support", safe_mce_sys().enable_utls_tx, MCE_DEFAULT_UTLS_TX, SYS_VAR_UTLS_TX, safe_mce_sys().enable_utls_tx ? "Enabled " : "Disabled"); - VLOG_PARAM_NUMBER( - "UTLS high watermark DEK cache size", safe_mce_sys().utls_high_wmark_dek_cache_size, - MCE_DEFAULT_UTLS_HIGH_WMARK_DEK_CACHE_SIZE, SYS_VAR_UTLS_HIGH_WMARK_DEK_CACHE_SIZE); - VLOG_PARAM_NUMBER( - "UTLS low watermark DEK cache size", safe_mce_sys().utls_low_wmark_dek_cache_size, - MCE_DEFAULT_UTLS_LOW_WMARK_DEK_CACHE_SIZE, SYS_VAR_UTLS_LOW_WMARK_DEK_CACHE_SIZE); + VLOG_PARAM_NUMBER("UTLS high watermark DEK cache size", + static_cast(safe_mce_sys().utls_high_wmark_dek_cache_size), + MCE_DEFAULT_UTLS_HIGH_WMARK_DEK_CACHE_SIZE, + SYS_VAR_UTLS_HIGH_WMARK_DEK_CACHE_SIZE); + VLOG_PARAM_NUMBER("UTLS low watermark DEK cache size", + static_cast(safe_mce_sys().utls_low_wmark_dek_cache_size), + MCE_DEFAULT_UTLS_LOW_WMARK_DEK_CACHE_SIZE, + SYS_VAR_UTLS_LOW_WMARK_DEK_CACHE_SIZE); #endif /* DEFINED_UTLS */ #if defined(DEFINED_NGINX) VLOG_PARAM_NUMBER("Number of Nginx workers", diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp index 0448ca32d..e85f0780f 100644 --- a/src/core/sock/sockinfo_ulp.cpp +++ b/src/core/sock/sockinfo_ulp.cpp @@ -200,7 +200,7 @@ class tls_record : public mem_desc { m_size = TLS_RECORD_HDR_LEN + TLS_RECORD_TAG_LEN; m_p_data = nullptr; tls_sock->get_record_buf(m_p_buf, m_p_data, zc_owner != nullptr); - if (likely(m_p_buf)) { + if (likely(m_p_buf) && likely(m_p_data)) { if (iv) { m_size += TLS_RECORD_IV_LEN; memcpy(&m_p_data[5], iv, TLS_RECORD_IV_LEN); @@ -570,11 +570,6 @@ int sockinfo_tcp_ops_tls::setsockopt(int __level, int __optname, const void *__o return -1; } - if (unlikely(keylen > TLS_AES_GCM_KEY_MAX)) { - errno = EINVAL; - return -1; - } - xlio_tls_info *tls_info = (__optname == TLS_TX) ? 
&m_tls_info_tx : &m_tls_info_rx; tls_info->tls_version = base_info->version; tls_info->tls_cipher = base_info->cipher_type; From 0e91471a78f36b7e2cbc7fc98fb82caded15d9e3 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 16 Oct 2023 19:50:33 +0300 Subject: [PATCH 023/169] issue 3514044 Fixing package test with mandatory dpcp Signed-off-by: Alexander Grissik --- .ci/matrix_job.yaml | 10 ++++----- contrib/build_pkg.sh | 6 ++++- contrib/jenkins_tests/gtest.sh | 6 ++--- contrib/jenkins_tests/rpm.sh | 6 ++--- contrib/test_jenkins.sh | 7 +++++- src/core/Makefile.am | 1 + src/core/dev/cq_mgr_rx.cpp | 1 - tests/gtest/nvme/nvme.cc | 41 +++++++++++++++++++++++++++++++--- tests/gtest/tcp/tcp_sockopt.cc | 2 ++ 9 files changed, 63 insertions(+), 17 deletions(-) diff --git a/.ci/matrix_job.yaml b/.ci/matrix_job.yaml index 6e4b29ca1..506286445 100644 --- a/.ci/matrix_job.yaml +++ b/.ci/matrix_job.yaml @@ -40,7 +40,7 @@ runs_on_dockers: - {name: 'rhel8.6-mofed-x86_64', url: 'harbor.mellanox.com/hpcx/x86_64/rhel8.6/builder:mofed-5.6-0.4.5.0', category: 'base', arch: 'x86_64'} # - {name: 'oracle8.6-mofed-x86_64', url: 'harbor.mellanox.com/rivermax/base_oraclelinux8.6:mofed-5.9-0.3.4.0', category: 'base', arch: 'x86_64'} # tool - - {name: 'toolbox', url: 'harbor.mellanox.com/hpcx/x86_64/rhel8.3/builder:inbox', category: 'tool', arch: 'x86_64'} + - {name: 'toolbox', url: 'harbor.mellanox.com/hpcx/x86_64/rhel8.6/builder:inbox', category: 'tool', arch: 'x86_64'} - {name: 'blackduck', url: 'harbor.mellanox.com/toolbox/ngci-centos:7.9.2009.2', category: 'tool', arch: 'x86_64'} - {name: 'header-check', url: 'harbor.mellanox.com/toolbox/header_check:0.0.14', category: 'tool', arch: 'x86_64', tag: '0.0.14'} @@ -216,7 +216,7 @@ steps: containerSelector: - "{name: 'skip-container'}" agentSelector: - - "{nodeLabel: 'beni09', variant:2}" + - "{nodeLabel: 'beni09'}" run: | [ "x${do_cppcheck}" == "xtrue" ] && action=yes || action=no env WORKSPACE=$PWD TARGET=${flags} jenkins_test_cppcheck=${action} ./contrib/test_jenkins.sh @@ -233,7 +233,7 @@ steps: containerSelector: - "{name: 'skip-container'}" agentSelector: - - "{nodeLabel: 'beni09', variant:2}" + - "{nodeLabel: 'beni09'}" run: | [ "x${do_csbuild}" == "xtrue" ] && action=yes || action=no env WORKSPACE=$PWD TARGET=${flags} jenkins_test_csbuild=${action} ./contrib/test_jenkins.sh @@ -250,7 +250,7 @@ steps: containerSelector: - "{name: 'skip-container'}" agentSelector: - - "{nodeLabel: 'beni09', variant:2}" + - "{nodeLabel: 'beni09'}" run: | [ "x${do_tidy}" == "xtrue" ] && action=yes || action=no env WORKSPACE=$PWD TARGET=${flags} jenkins_test_tidy=${action} ./contrib/test_jenkins.sh @@ -267,7 +267,7 @@ steps: containerSelector: - "{name: 'skip-container'}" agentSelector: - - "{nodeLabel: 'beni09', variant:2}" + - "{nodeLabel: 'beni09'}" run: | [ "x${do_test}" == "xtrue" ] && action=yes || action=no env WORKSPACE=$PWD TARGET=${flags} jenkins_test_run=${action} ./contrib/test_jenkins.sh diff --git a/contrib/build_pkg.sh b/contrib/build_pkg.sh index 4f65a71e1..1d8b26c1c 100755 --- a/contrib/build_pkg.sh +++ b/contrib/build_pkg.sh @@ -31,6 +31,10 @@ while test "$1" != ""; do arg_rpm="${arg_deb/=/ }" opt_exports="$opt_exports :$arg_deb"; opt_defines="$opt_defines --define='$arg_rpm'"; + if [[ $arg_deb =~ ^configure_options[[:blank:]]*= ]]; then + shopt -s extglob + opt_conf_val="${arg_deb##configure_options*([[:blank:]])=}" + fi shift ;; *) @@ -110,7 +114,7 @@ cd ${pkg_dir} if [ "$rc" -eq 0 ]; then echo ${pkg_label} "Running ./configure ..." 
- ${pkg_indir}/configure >> ${pkg_log} 2>&1 + ${pkg_indir}/configure $opt_conf_val >> ${pkg_log} 2>&1 rc=$((rc + $?)) fi diff --git a/contrib/jenkins_tests/gtest.sh b/contrib/jenkins_tests/gtest.sh index 8bf54f5f3..1ac0c3692 100755 --- a/contrib/jenkins_tests/gtest.sh +++ b/contrib/jenkins_tests/gtest.sh @@ -64,8 +64,8 @@ gtest_opt_ipv6="--addr=$(do_get_addrs 'inet6' ${opt2}) -r fdff:ffff:ffff:ffff:ff set +eE if [[ -z "${MANUAL_RUN}" ]]; then - ${WORKSPACE}/configure --prefix=$install_dir - make -C tests/gtest + ${WORKSPACE}/configure --prefix=$install_dir $jenkins_test_custom_configure + make $make_opt -C tests/gtest rc=$(($rc+$?)) fi @@ -90,7 +90,7 @@ rc=$(($rc+$?)) if [[ -z "${MANUAL_RUN}" ]]; then make -C tests/gtest clean - make -C tests/gtest CPPFLAGS="-DEXTRA_API_ENABLED=1" + make $make_opt -C tests/gtest CPPFLAGS="-DEXTRA_API_ENABLED=1" rc=$(($rc+$?)) fi diff --git a/contrib/jenkins_tests/rpm.sh b/contrib/jenkins_tests/rpm.sh index e22952329..9ab251d02 100755 --- a/contrib/jenkins_tests/rpm.sh +++ b/contrib/jenkins_tests/rpm.sh @@ -48,7 +48,7 @@ if [ $opt_tarball -eq 1 ]; then if [ -n "$(automake --version | grep 'automake (GNU automake) 1.10.1')" ]; then test_exec='make $make_opt dist' else - test_exec='make $make_opt dist && make $make_opt distcheck' + test_exec='make $make_opt dist && DISTCHECK_CONFIGURE_FLAGS='"'"$jenkins_test_custom_configure"'"' make $make_opt distcheck' fi do_check_result "$test_exec" "$test_id" "tarball" "$rpm_tap" "${rpm_dir}/rpm-${test_id}" @@ -73,9 +73,9 @@ fi if [ $opt_binrpm -eq 1 ]; then if [ $opt_rpm -eq 1 ]; then - test_exec="env RPM_BUILD_NCPUS=${NPROC} rpmbuild -bb $rpmmacros $rpmopts $rpmspec" + test_exec="env RPM_BUILD_NCPUS=${NPROC} rpmbuild -bb --define='configure_options $jenkins_test_custom_configure' $rpmmacros $rpmopts $rpmspec" else - test_exec="dpkg-buildpackage -us -uc -b" + test_exec="env configure_options=\"$jenkins_test_custom_configure\" dpkg-buildpackage -us -uc -b" fi do_check_result "$test_exec" "$test_id" "binrpm" "$rpm_tap" "${rpm_dir}/rpm-${test_id}" test_id=$((test_id+1)) diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh index 9e89b8ae2..60fd956b3 100755 --- a/contrib/test_jenkins.sh +++ b/contrib/test_jenkins.sh @@ -100,9 +100,12 @@ do_check_env # set predefined configuration settings and extra options # that depend on environment # + TARGET=${TARGET:=all} i=0 if [ "$TARGET" == "all" -o "$TARGET" == "default" ]; then + export jenkins_target="default" + export prefix=${jenkins_test_custom_prefix}/${jenkins_target} do_check_dpcp opt_value if [ ! 
-z "${opt_value}" ]; then target_list[$i]="default: --enable-nginx --with-dpcp=${opt_value}" @@ -126,9 +129,11 @@ for target_v in "${target_list[@]}"; do ret=0 IFS=':' read target_name target_option <<< "$target_v" + export jenkins_target="${target_name}" + export prefix=${jenkins_test_custom_prefix}/${jenkins_target} export jenkins_test_artifacts="${WORKSPACE}/${prefix}/xlio-${BUILD_NUMBER}-${HOSTNAME}-${target_name}" export jenkins_test_custom_configure="${jenkins_test_custom_configure} ${target_option}" - export jenkins_target="${target_name}" + set +x echo "======================================================" echo " Checking for [${jenkins_target}] target" diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 27fe9ea95..9a17c7ce9 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -199,6 +199,7 @@ libxlio_la_SOURCES := \ dev/ring_tap.h \ dev/ring_allocation_logic.h \ dev/wqe_send_handler.h \ + dev/xlio_ti.h \ \ event/command.h \ event/delta_timer.h \ diff --git a/src/core/dev/cq_mgr_rx.cpp b/src/core/dev/cq_mgr_rx.cpp index c0baa098e..2b90ee686 100644 --- a/src/core/dev/cq_mgr_rx.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -286,7 +286,6 @@ void cq_mgr_rx::lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_w (struct ip6_hdr *)(p_rx_wc_buf_desc->p_buffer + transport_header_len); assert(0x01 == ((cqe->l4_hdr_type_etc >> 2) & 0x3)); // IPv6 L3 header. - assert(ip_header_version(p_ip6_h) == IPV6); assert(p_ip6_h->ip6_nxt == IPPROTO_TCP); assert(ntohl(cqe->byte_cnt) >= transport_header_len + IPV6_HLEN); diff --git a/tests/gtest/nvme/nvme.cc b/tests/gtest/nvme/nvme.cc index 98c2ede98..8b0ffac43 100644 --- a/tests/gtest/nvme/nvme.cc +++ b/tests/gtest/nvme/nvme.cc @@ -36,7 +36,6 @@ #include #include "common/def.h" #include "common/base.h" -#include "dev/hw_queue_tx.h" #include "proto/nvme_parse_input_args.h" #include "tcp/tcp_base.h" #include "xlio_extra.h" @@ -352,8 +351,8 @@ class nvme_tx : public tcp_send_zc { vector mrs; int client_fd; bool nvme_supported = true; - msghdr *msg; - uint8_t *cmsg_buffer; + msghdr *msg = nullptr; + uint8_t *cmsg_buffer = nullptr; vector msghdr_iov {}; void TearDown() override @@ -369,6 +368,36 @@ class nvme_tx : public tcp_send_zc { msghdr_iov.clear(); } + bool is_nvme_supported() + { + bool nvme_support = false; + int pid = fork(); + if (0 != pid) { + int cfd = tcp_base::sock_create(); + int rc = bind(cfd, (sockaddr *)&client_addr, sizeof(client_addr)); + barrier_fork(pid, true); + rc |= connect(cfd, (sockaddr *)&server_addr, sizeof(server_addr)); + rc |= setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "nvme", 4); + nvme_support = (rc == 0); + close(cfd); + wait_fork(pid); + } else { // I am the child + int listen_fd = tcp_base::sock_create(); + int reuse_on = 1; + int rc = setsockopt(listen_fd, SOL_SOCKET, SO_REUSEPORT, &reuse_on, sizeof(reuse_on)); + rc |= bind(listen_fd, (sockaddr *)&server_addr, sizeof(server_addr)); + rc |= listen(listen_fd, 5); + barrier_fork(pid, true); + int server_fd = accept(listen_fd, nullptr, nullptr); + peer_wait(server_fd); + close(server_fd); + close(listen_fd); + exit(testing::Test::HasFailure()); + } + + return nvme_support; + } + void client_socket_create() { client_fd = tcp_base::sock_create(); @@ -520,6 +549,9 @@ class nvme_tx : public tcp_send_zc { TEST_F(nvme_tx, send_single_pdu) { + SKIP_TRUE(is_nvme_supported(), "NVME offload not supported"); + SKIP_TRUE(!getenv("XLIO_TCP_CTL_THREAD"), "Skip non default XLIO_TCP_CTL_THREAD"); + int pid = fork(); uint32_t empty_ddgst; @@ -541,6 +573,9 @@ 
TEST_F(nvme_tx, send_single_pdu) TEST_F(nvme_tx, send_multiple_pdus) { + SKIP_TRUE(is_nvme_supported(), "NVME offload not supported"); + SKIP_TRUE(!getenv("XLIO_TCP_CTL_THREAD"), "Skip non default XLIO_TCP_CTL_THREAD"); + int pid = fork(); uint32_t empty_ddgst; diff --git a/tests/gtest/tcp/tcp_sockopt.cc b/tests/gtest/tcp/tcp_sockopt.cc index 0ce724ce4..10ec822ba 100644 --- a/tests/gtest/tcp/tcp_sockopt.cc +++ b/tests/gtest/tcp/tcp_sockopt.cc @@ -855,6 +855,8 @@ class tcp_with_fifo : public testing::TestWithParam { */ TEST_P(tcp_with_fifo, accepted_socket_inherits_the_setsockopt_param) { + SKIP_TRUE(!getenv("XLIO_TCP_CTL_THREAD"), "Skip non default XLIO_TCP_CTL_THREAD"); + int level, optname, value; std::tie(level, optname, value) = GetParam(); pid_t pid = fork(); From d702e3c4e2cc15a34e6d7932283148103abd2e56 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 31 Dec 2023 14:37:06 +0200 Subject: [PATCH 024/169] issue: 3514044 Updating min dpcp version to 1.1.43 Signed-off-by: Alexander Grissik --- config/m4/dpcp.m4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/m4/dpcp.m4 b/config/m4/dpcp.m4 index 8b85fa27c..e474566b2 100644 --- a/config/m4/dpcp.m4 +++ b/config/m4/dpcp.m4 @@ -56,7 +56,7 @@ get_version_number() get_min_supported_version() { - echo 10130 + echo 10143 } AC_ARG_WITH([dpcp], From 67185ae773618d240f1fd238c6e829b1bafb8ece Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Thu, 11 Jan 2024 12:43:01 +0200 Subject: [PATCH 025/169] issue: 3514044 Replacing .inl file with .h Signed-off-by: Alexander Grissik --- contrib/jenkins_tests/copyrights.sh | 2 +- contrib/jenkins_tests/cppcheck.sh | 2 +- contrib/jenkins_tests/style.sh | 6 +++--- contrib/jenkins_tests/tidy.sh | 6 +++--- src/core/Makefile.am | 2 +- src/core/dev/cq_mgr_rx.cpp | 2 +- src/core/dev/cq_mgr_rx.h | 7 ++----- src/core/dev/{cq_mgr_rx.inl => cq_mgr_rx_inl.h} | 0 src/core/dev/cq_mgr_rx_regrq.cpp | 2 +- src/core/dev/cq_mgr_rx_strq.cpp | 2 +- 10 files changed, 14 insertions(+), 17 deletions(-) rename src/core/dev/{cq_mgr_rx.inl => cq_mgr_rx_inl.h} (100%) diff --git a/contrib/jenkins_tests/copyrights.sh b/contrib/jenkins_tests/copyrights.sh index 4e7eba0fd..2ebb0db5f 100755 --- a/contrib/jenkins_tests/copyrights.sh +++ b/contrib/jenkins_tests/copyrights.sh @@ -12,7 +12,7 @@ if [ ! 
-d "$WORKSPACE" ]; then exit 1 fi -cpp_files=' "extensions": [".c", ".cc", ".cpp", "c++", ".h", ".hpp", ".cs", ".inl", ".l", ".y"],' +cpp_files=' "extensions": [".c", ".cc", ".cpp", "c++", ".h", ".hpp", ".cs", ".l", ".y"],' sed -i "s/.*\"extensions\": \[\"\.c\".*/$cpp_files/g" /opt/nvidia/ProjectConfig/header-types.json cat /opt/nvidia/ProjectConfig/header-types.json diff --git a/contrib/jenkins_tests/cppcheck.sh b/contrib/jenkins_tests/cppcheck.sh index f79cba6ab..3e1070500 100755 --- a/contrib/jenkins_tests/cppcheck.sh +++ b/contrib/jenkins_tests/cppcheck.sh @@ -39,7 +39,7 @@ cd $cppcheck_dir ${WORKSPACE}/configure $jenkins_test_custom_configure > "${cppcheck_dir}/cppcheck.log" 2>&1 set +eE -eval "find ${WORKSPACE}/src -name '*.h' -o -name '*.cpp' -o -name '*.c' -o -name '*.hpp' -o -name '*.inl' | \ +eval "find ${WORKSPACE}/src -name '*.h' -o -name '*.cpp' -o -name '*.c' -o -name '*.hpp' | \ ${tool_app} --std=c++11 --language=c++ --force --enable=information \ -I${WORKSPACE}/src \ -I${WORKSPACE}/src/stats \ diff --git a/contrib/jenkins_tests/style.sh b/contrib/jenkins_tests/style.sh index 6905177e2..90fd026f1 100755 --- a/contrib/jenkins_tests/style.sh +++ b/contrib/jenkins_tests/style.sh @@ -24,9 +24,9 @@ rm -rf $style_tap ln -sf $WORKSPACE/contrib/jenkins_tests/style.conf $WORKSPACE/.clang-format -check_files="$(find $WORKSPACE/src/ ! -name 'config_*' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -iname '*.inl' -o -name '*.cc' \))" -check_files+=" $(find $WORKSPACE/tools/daemon/ \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -iname '*.inl' -o -iname '*.cc' \))" -check_files+=" $(find $WORKSPACE/tests/gtest/ \( -path "*/googletest" \) ! -prune -o ! \( -name 'tap.h' -o -name 'gtest.h' -o -name 'gtest-all.cc' \) -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -iname '*.inl' -o -iname '*.cc' \))" +check_files="$(find $WORKSPACE/src/ ! -name 'config_*' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -name '*.cc' \))" +check_files+=" $(find $WORKSPACE/tools/daemon/ \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -iname '*.cc' \))" +check_files+=" $(find $WORKSPACE/tests/gtest/ \( -path "*/googletest" \) ! -prune -o ! \( -name 'tap.h' -o -name 'gtest.h' -o -name 'gtest-all.cc' \) -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.h' -o -iname '*.cc' \))" i=0 nerrors=0 diff --git a/contrib/jenkins_tests/tidy.sh b/contrib/jenkins_tests/tidy.sh index 4eba3f681..4ecbd0b5a 100755 --- a/contrib/jenkins_tests/tidy.sh +++ b/contrib/jenkins_tests/tidy.sh @@ -58,9 +58,9 @@ if [ ! -e $WORKSPACE/.clang-format ]; then ln -sf $WORKSPACE/contrib/jenkins_tests/style.conf $WORKSPACE/.clang-format fi -check_files="$(find $WORKSPACE/src/ ! -name 'config_*' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.hpp' -o -iname '*.h' -o -iname '*.inl' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" -check_files+=" $(find $WORKSPACE/tools/daemon/ \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.hpp' -o -iname '*.h' -o -iname '*.inl' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" -check_files+=" $(find $WORKSPACE/tests/gtest/ \( -path "*/googletest" \) ! -prune -o ! -name 'tap.h' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.cc' -o -iname '*.hpp' -o -iname '*.h' -o -iname '*.inl' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" +check_files="$(find $WORKSPACE/src/ ! 
-name 'config_*' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.hpp' -o -iname '*.h' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" +check_files+=" $(find $WORKSPACE/tools/daemon/ \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.hpp' -o -iname '*.h' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" +check_files+=" $(find $WORKSPACE/tests/gtest/ \( -path "*/googletest" \) ! -prune -o ! -name 'tap.h' -a \( -iname '*.c' -o -iname '*.cpp' -o -iname '*.cc' -o -iname '*.hpp' -o -iname '*.h' \) 2>&1 | tee -a "${tidy_dir}/${test_name}.log")" i=0 nerrors=0 diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 9a17c7ce9..d3714602d 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -25,7 +25,6 @@ dist-hook: SUBDIRS = infra netlink EXTRA_DIST = \ - dev/cq_mgr_rx.inl \ util/libxlio.conf sysconf_DATA = util/libxlio.conf @@ -170,6 +169,7 @@ libxlio_la_SOURCES := \ dev/allocator.h \ dev/buffer_pool.h \ dev/cq_mgr_rx.h \ + dev/cq_mgr_rx_inl.h \ dev/cq_mgr_rx_regrq.h \ dev/cq_mgr_rx_strq.h \ dev/cq_mgr_tx.h \ diff --git a/src/core/dev/cq_mgr_rx.cpp b/src/core/dev/cq_mgr_rx.cpp index 2b90ee686..9c1cbc29c 100644 --- a/src/core/dev/cq_mgr_rx.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -31,7 +31,7 @@ */ #include "cq_mgr_rx.h" -#include "cq_mgr_rx.inl" +#include "cq_mgr_rx_inl.h" #include #include #include diff --git a/src/core/dev/cq_mgr_rx.h b/src/core/dev/cq_mgr_rx.h index 9de2e64a5..f4526c68e 100644 --- a/src/core/dev/cq_mgr_rx.h +++ b/src/core/dev/cq_mgr_rx.h @@ -250,11 +250,8 @@ inline struct xlio_mlx5_cqe *cq_mgr_rx::check_cqe(void) (struct xlio_mlx5_cqe *)(((uint8_t *)m_mlx5_cq.cq_buf) + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) << m_mlx5_cq.cqe_size_log)); - /* - * CQE ownership is defined by Owner bit in the CQE. - * The value indicating SW ownership is flipped every - * time CQ wraps around. - * */ + // CQE ownership is defined by Owner bit in the CQE. + // The value indicating SW ownership is flipped every time CQ wraps around. 
if (likely((MLX5_CQE_OPCODE(cqe->op_own)) != MLX5_CQE_INVALID) && !((MLX5_CQE_OWNER(cqe->op_own)) ^ !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count))) { return cqe; diff --git a/src/core/dev/cq_mgr_rx.inl b/src/core/dev/cq_mgr_rx_inl.h similarity index 100% rename from src/core/dev/cq_mgr_rx.inl rename to src/core/dev/cq_mgr_rx_inl.h diff --git a/src/core/dev/cq_mgr_rx_regrq.cpp b/src/core/dev/cq_mgr_rx_regrq.cpp index 4f12afd4c..1292c0a6d 100644 --- a/src/core/dev/cq_mgr_rx_regrq.cpp +++ b/src/core/dev/cq_mgr_rx_regrq.cpp @@ -35,7 +35,7 @@ #if defined(DEFINED_DIRECT_VERBS) #include -#include "cq_mgr_rx.inl" +#include "cq_mgr_rx_inl.h" #include "hw_queue_rx.h" #include "ring_simple.h" diff --git a/src/core/dev/cq_mgr_rx_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp index daccd1aa2..e54c8d50e 100644 --- a/src/core/dev/cq_mgr_rx_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ -35,7 +35,7 @@ #if defined(DEFINED_DIRECT_VERBS) #include -#include "cq_mgr_rx.inl" +#include "cq_mgr_rx_inl.h" #include "hw_queue_rx.h" #include "ring_simple.h" #include From 3ce76f773604d298c6cfd0b5429cb07ba28d6564 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Thu, 11 Jan 2024 12:47:31 +0200 Subject: [PATCH 026/169] issue: 3514044 Removing option_strq Signed-off-by: Alexander Grissik --- src/core/main.cpp | 6 +++--- src/core/util/sys_vars.cpp | 10 ++-------- src/core/util/sys_vars.h | 9 ++------- 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/src/core/main.cpp b/src/core/main.cpp index 49b9ec099..3f31d9053 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -649,9 +649,9 @@ void print_xlio_global_settings() VLOG_PARAM_STRING("Force Flowtag for MC", safe_mce_sys().mc_force_flowtag, MCE_DEFAULT_MC_FORCE_FLOWTAG, SYS_VAR_MC_FORCE_FLOWTAG, safe_mce_sys().mc_force_flowtag ? 
"Enabled " : "Disabled"); - VLOG_STR_PARAM_STRING("Striding RQ", option_strq::to_str(safe_mce_sys().enable_strq_env), - option_strq::to_str(MCE_DEFAULT_STRQ), SYS_VAR_STRQ, - option_strq::to_str(safe_mce_sys().enable_strq_env)); + VLOG_STR_PARAM_STRING("Striding RQ", option_3::to_str(safe_mce_sys().enable_strq_env), + option_3::to_str(MCE_DEFAULT_STRQ), SYS_VAR_STRQ, + option_3::to_str(safe_mce_sys().enable_strq_env)); VLOG_PARAM_NUMBER("STRQ Strides per RWQE", safe_mce_sys().strq_stride_num_per_rwqe, MCE_DEFAULT_STRQ_NUM_STRIDES, SYS_VAR_STRQ_NUM_STRIDES); VLOG_PARAM_NUMBER("STRQ Stride Size (Bytes)", safe_mce_sys().strq_stride_size_bytes, diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index a4e8d172d..5c20866ba 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -309,12 +309,6 @@ static option_t options[] = {AUTO_ON_OFF_IMPL}; OPTION_FROM_TO_STR_IMPL } // namespace option_3 -namespace option_strq { -static option_t options[] = {AUTO_ON_OFF_IMPL, - {REGULAR_RQ, "Regular RQ", {"regular_rq", NULL, NULL}}}; -OPTION_FROM_TO_STR_IMPL -} // namespace option_strq - namespace option_tcp_ctl_thread { static option_t options[] = { {CTL_THREAD_DISABLE, "Disabled", {"disable", "disabled", NULL}}, @@ -921,11 +915,11 @@ void mce_sys_var::get_env_params() } if ((env_ptr = getenv(SYS_VAR_STRQ)) != NULL) { - enable_strq_env = option_strq::from_str(env_ptr, MCE_DEFAULT_STRQ); + enable_strq_env = option_3::from_str(env_ptr, MCE_DEFAULT_STRQ); } enable_striding_rq = - (enable_strq_env == option_strq::ON || enable_strq_env == option_strq::AUTO); + (enable_strq_env == option_3::ON || enable_strq_env == option_3::AUTO); if (enable_striding_rq) { rx_num_bufs = MCE_DEFAULT_STRQ_NUM_BUFS; diff --git a/src/core/util/sys_vars.h b/src/core/util/sys_vars.h index 52f33cf67..442e8b597 100644 --- a/src/core/util/sys_vars.h +++ b/src/core/util/sys_vars.h @@ -208,11 +208,6 @@ typedef enum { AUTO_ON_OFF_DEF } mode_t; OPTIONS_FROM_TO_STR_DEF; } // namespace option_3 -namespace option_strq { -typedef enum { AUTO_ON_OFF_DEF, REGULAR_RQ = 2 } mode_t; -OPTIONS_FROM_TO_STR_DEF; -} // namespace option_strq - namespace option_tcp_ctl_thread { typedef enum { CTL_THREAD_DISABLE = 0, @@ -476,7 +471,7 @@ struct mce_sys_var { bool enable_socketxtreme; option_3::mode_t enable_tso; option_3::mode_t enable_lro; - option_strq::mode_t enable_strq_env; + option_3::mode_t enable_strq_env; #ifdef DEFINED_UTLS bool enable_utls_rx; bool enable_utls_tx; @@ -757,7 +752,7 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_DEFAULT_TX_SEGS_POOL_BATCH_TCP (16384) #define MCE_DEFAULT_TX_NUM_SGE (4) -#define MCE_DEFAULT_STRQ (option_strq::ON) +#define MCE_DEFAULT_STRQ (option_3::ON) #define MCE_DEFAULT_STRQ_NUM_STRIDES (16384) #define MCE_DEFAULT_STRQ_STRIDE_SIZE_BYTES (512) #define MCE_DEFAULT_STRQ_NUM_BUFS (64) From 6cda7bff3ddc62956426c5f3a99ef4fa7ab6f9ae Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Thu, 11 Jan 2024 13:20:20 +0200 Subject: [PATCH 027/169] issue: 3514044 Removing unnecessary checks Signed-off-by: Alexander Grissik --- src/core/dev/cq_mgr_rx.cpp | 2 +- src/core/dev/hw_queue_rx.cpp | 2 +- src/core/dev/hw_queue_tx.cpp | 17 +++++------------ src/core/dev/hw_queue_tx.h | 8 ++++---- src/core/dev/ib_ctx_handler.cpp | 11 ----------- src/core/dev/ib_ctx_handler.h | 2 +- src/core/dev/ring_simple.cpp | 2 +- src/core/sock/sockinfo_ulp.cpp | 2 +- src/core/util/sys_vars.cpp | 3 +-- 9 files changed, 15 insertions(+), 34 deletions(-) diff --git a/src/core/dev/cq_mgr_rx.cpp 
b/src/core/dev/cq_mgr_rx.cpp index 9c1cbc29c..8b9e268a1 100644 --- a/src/core/dev/cq_mgr_rx.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -86,7 +86,7 @@ cq_mgr_rx::cq_mgr_rx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int , m_b_sysvar_cq_keep_qp_full(safe_mce_sys().cq_keep_qp_full) { BULLSEYE_EXCLUDE_BLOCK_START - if (m_rx_lkey == 0) { + if (m_rx_lkey == LKEY_ERROR) { __log_info_panic("invalid lkey found %u", m_rx_lkey); } BULLSEYE_EXCLUDE_BLOCK_END diff --git a/src/core/dev/hw_queue_rx.cpp b/src/core/dev/hw_queue_rx.cpp index 55c5d569d..5fa3b2d98 100644 --- a/src/core/dev/hw_queue_rx.cpp +++ b/src/core/dev/hw_queue_rx.cpp @@ -636,4 +636,4 @@ bool hw_queue_rx::store_rq_mlx5_params(dpcp::basic_rq &new_rq) m_rq_data.tail = 0; return true; -} \ No newline at end of file +} diff --git a/src/core/dev/hw_queue_tx.cpp b/src/core/dev/hw_queue_tx.cpp index 9fbb922b0..6055e29e7 100644 --- a/src/core/dev/hw_queue_tx.cpp +++ b/src/core/dev/hw_queue_tx.cpp @@ -354,8 +354,8 @@ void hw_queue_tx::release_tx_buffers() NOT_IN_USE(ret); // Suppress --enable-opt-log=high warning } -int hw_queue_tx::send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, - unsigned credits) +void hw_queue_tx::send_wqe(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, + unsigned credits) { mem_buf_desc_t *p_mem_buf_desc = (mem_buf_desc_t *)p_send_wqe->wr_id; /* Control tx completions: @@ -371,10 +371,7 @@ int hw_queue_tx::send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, hwqtx_logfunc("VERBS send, unsignaled_count: %d", m_n_unsignaled_count); - // TODO send_to_wire() and send() can return void after removing ibverbs support - if (send_to_wire(p_send_wqe, attr, request_comp, tis, credits)) { - return -1; - } + send_to_wire(p_send_wqe, attr, request_comp, tis, credits); if (request_comp || is_signal_requested_for_last_wqe()) { uint64_t dummy_poll_sn = 0; @@ -386,8 +383,6 @@ int hw_queue_tx::send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, BULLSEYE_EXCLUDE_BLOCK_END hwqtx_logfunc("polling succeeded on cq_mgr_tx (%d wce)", ret); } - - return 0; } void hw_queue_tx::modify_queue_to_ready_state() @@ -888,8 +883,8 @@ void hw_queue_tx::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, //! 
Send one RAW packet by MLX5 BlueFlame // -int hw_queue_tx::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, - bool request_comp, xlio_tis *tis, unsigned credits) +void hw_queue_tx::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, + bool request_comp, xlio_tis *tis, unsigned credits) { struct xlio_mlx5_wqe_ctrl_seg *ctrl = NULL; struct mlx5_wqe_eth_seg *eseg = NULL; @@ -929,8 +924,6 @@ int hw_queue_tx::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_at "m_sq_wqe_hot: %p m_sq_wqe_hot_index: %d wqe_counter: %d new_hot_index: %d wr_id: %llx", m_sq_wqe_hot, m_sq_wqe_hot_index, m_sq_wqe_counter, (m_sq_wqe_counter & (m_tx_num_wr - 1)), p_send_wqe->wr_id); - - return 0; } std::unique_ptr hw_queue_tx::create_tis(uint32_t flags) diff --git a/src/core/dev/hw_queue_tx.h b/src/core/dev/hw_queue_tx.h index e96c3a535..a09707e5d 100644 --- a/src/core/dev/hw_queue_tx.h +++ b/src/core/dev/hw_queue_tx.h @@ -88,8 +88,8 @@ class hw_queue_tx : public xlio_ti_owner { void up(); void down(); - int send(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, - unsigned credits); + void send_wqe(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis, + unsigned credits); struct ibv_qp *get_ibv_qp() const { return m_mlx5_qp.qp; }; @@ -213,8 +213,8 @@ class hw_queue_tx : public xlio_ti_owner { void destroy_tis_cache(); void put_tls_tis_in_cache(xlio_tis *tis); - int send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, - xlio_tis *tis, unsigned credits); + void send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, + xlio_tis *tis, unsigned credits); void set_unsignaled_count(void) { m_n_unsignaled_count = m_n_sysvar_tx_num_wr_to_signal - 1; } diff --git a/src/core/dev/ib_ctx_handler.cpp b/src/core/dev/ib_ctx_handler.cpp index b645c22b1..089ce3b56 100644 --- a/src/core/dev/ib_ctx_handler.cpp +++ b/src/core/dev/ib_ctx_handler.cpp @@ -72,7 +72,6 @@ ib_ctx_handler::ib_ctx_handler(struct ib_ctx_handler_desc *desc) ibch_logpanic("m_p_ibv_device is invalid"); } - m_p_ibv_context = NULL; m_p_adapter = set_dpcp_adapter(); if (!m_p_adapter) { ibch_logpanic("ibv device %p adapter allocation failure (errno=%d %m)", m_p_ibv_device, @@ -122,11 +121,6 @@ ib_ctx_handler::ib_ctx_handler(struct ib_ctx_handler_desc *desc) delete m_p_adapter; m_p_ibv_context = NULL; } - - if (m_p_ibv_context) { - ibv_close_device(m_p_ibv_context); - m_p_ibv_context = NULL; - } } ib_ctx_handler::~ib_ctx_handler() @@ -163,11 +157,6 @@ ib_ctx_handler::~ib_ctx_handler() m_p_ibv_context = NULL; } - if (m_p_ibv_context) { - ibv_close_device(m_p_ibv_context); - m_p_ibv_context = NULL; - } - BULLSEYE_EXCLUDE_BLOCK_END } diff --git a/src/core/dev/ib_ctx_handler.h b/src/core/dev/ib_ctx_handler.h index 973c11c40..dd9c36d16 100644 --- a/src/core/dev/ib_ctx_handler.h +++ b/src/core/dev/ib_ctx_handler.h @@ -111,7 +111,7 @@ class ib_ctx_handler : public event_handler_ibverbs { private: void handle_event_device_fatal(); ibv_device *m_p_ibv_device; // HCA handle - struct ibv_context *m_p_ibv_context; + struct ibv_context *m_p_ibv_context = nullptr; dpcp::adapter *m_p_adapter; xlio_ibv_device_attr_ex *m_p_ibv_device_attr; ibv_pd *m_p_ibv_pd; diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index 0b1d2383c..b7a15a2fb 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -724,7 +724,7 @@ inline int ring_simple::send_buffer(xlio_ibv_send_wr *p_send_wqe, 
xlio_wr_tx_pac if (likely(m_hqtx->credits_get(credits)) || is_available_qp_wr(is_set(attr, XLIO_TX_PACKET_BLOCK), credits)) { - ret = m_hqtx->send(p_send_wqe, attr, tis, credits); + m_hqtx->send_wqe(p_send_wqe, attr, tis, credits); } else { ring_logdbg("Silent packet drop, SQ is full!"); ret = -1; diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp index e85f0780f..5770fc25c 100644 --- a/src/core/sock/sockinfo_ulp.cpp +++ b/src/core/sock/sockinfo_ulp.cpp @@ -200,7 +200,7 @@ class tls_record : public mem_desc { m_size = TLS_RECORD_HDR_LEN + TLS_RECORD_TAG_LEN; m_p_data = nullptr; tls_sock->get_record_buf(m_p_buf, m_p_data, zc_owner != nullptr); - if (likely(m_p_buf) && likely(m_p_data)) { + if (likely(m_p_buf && m_p_data)) { if (iv) { m_size += TLS_RECORD_IV_LEN; memcpy(&m_p_data[5], iv, TLS_RECORD_IV_LEN); diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index 5c20866ba..a611f9057 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -918,8 +918,7 @@ void mce_sys_var::get_env_params() enable_strq_env = option_3::from_str(env_ptr, MCE_DEFAULT_STRQ); } - enable_striding_rq = - (enable_strq_env == option_3::ON || enable_strq_env == option_3::AUTO); + enable_striding_rq = (enable_strq_env == option_3::ON || enable_strq_env == option_3::AUTO); if (enable_striding_rq) { rx_num_bufs = MCE_DEFAULT_STRQ_NUM_BUFS; From ecb0c8a979227a87a30cc5dc6c176ab8dfd077da Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Wed, 17 Jan 2024 14:01:24 +0200 Subject: [PATCH 028/169] issue: 3745279 Fix artifact generation in CI Signed-off-by: Alex Briskin --- .ci/artifacts.sh | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/.ci/artifacts.sh b/.ci/artifacts.sh index ba256d570..d1ea70a02 100755 --- a/.ci/artifacts.sh +++ b/.ci/artifacts.sh @@ -1,12 +1,9 @@ #!/bin/bash -xl if [ -d jenkins ]; then - gzip -f ./jenkins/*.tar 2>/dev/null || true - cd ./jenkins/ ; - for f in *.tar.gz ; do [ -e "$f" ] && mv "$f" "${flags}/arch-${name}-$f" ; done ; - cd .. - cd ./jenkins/${flags}; - for f in *.tap ; do [ -e "$f" ] && mv "$f" "${flags}-${name}-$f" ; done ; - for f in *.xml ; do [ -e "$f" ] && mv "$f" "${flags}-${name}-$f" ; done ; - cd ../.. + pushd ./jenkins/${flags}; + gzip -f *.tar 2>/dev/null || true + for f in *.tar.gz ; do mv -f "$f" "arch-${name}-$f" ; done; + for f in *.{tap,xml} ; do mv -f "$f" "${flags}-${name}-$f" ; done ; + popd fi From bf0b74482e9d31e275301ad64821f1f8c031cc2a Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Thu, 28 Dec 2023 11:27:39 +0200 Subject: [PATCH 029/169] issue: 3664594 Return ETIMEDOUT err for timed out socket Set ETIMEDOUT errno and return -1 from recv in case a socket was timed out, instead of 0 return value and 0 errno. For instance, in case of TCP keep alive timeout. 
Signed-off-by: Alexander Grissik
---
 src/core/lwip/tcp.c            | 2 +-
 src/core/sock/sockinfo_tcp.cpp | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/core/lwip/tcp.c b/src/core/lwip/tcp.c
index 059cf5cc5..12eb9e3cf 100644
--- a/src/core/lwip/tcp.c
+++ b/src/core/lwip/tcp.c
@@ -709,7 +709,7 @@ void tcp_slowtmr(struct tcp_pcb *pcb)
                          pcb->remote_ip, pcb->is_ipv6);
             ++pcb_remove;
-            err = ERR_ABRT;
+            err = ERR_TIMEOUT;
             ++pcb_reset;
         }
 #if LWIP_TCP_KEEPALIVE
diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp
index ee3f8e359..6354a6a28 100644
--- a/src/core/sock/sockinfo_tcp.cpp
+++ b/src/core/sock/sockinfo_tcp.cpp
@@ -2201,6 +2201,9 @@ int sockinfo_tcp::handle_rx_error(bool blocking)
         si_tcp_logdbg("RX on reseted socket");
         m_conn_state = TCP_CONN_FAILED;
         errno = ECONNRESET;
+    } else if (m_conn_state == TCP_CONN_TIMEOUT) {
+        si_tcp_logdbg("RX on timed out socket");
+        errno = ETIMEDOUT;
     } else {
         si_tcp_logdbg("RX on disconnected socket - EOF");
         ret = 0;

From 18f5d314295aa6d768e208c2bc9241772d569692 Mon Sep 17 00:00:00 2001
From: Daniel Pressler
Date: Mon, 8 Jan 2024 13:03:05 +0200
Subject: [PATCH 030/169] Issue: 3375239 - add email scan in packages

The idea is to scan all rpm/deb packages for personal emails.
We should not be releasing packages with such emails.
The scan is done on both the metadata info and the changelog of a specific
package.

Issue: HPCINFRA-919

Signed-off-by: Daniel Pressler
---
 contrib/jenkins_tests/rpm.sh | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/contrib/jenkins_tests/rpm.sh b/contrib/jenkins_tests/rpm.sh
index 9ab251d02..78014bd12 100755
--- a/contrib/jenkins_tests/rpm.sh
+++ b/contrib/jenkins_tests/rpm.sh
@@ -88,5 +88,40 @@ if [ $opt_checkpkg -eq 1 ]; then
     test_id=$((test_id+1))
 fi
 
+
+# check if we have emails of individual users in the packages' rpm/deb metadata Maintainer field
+pacakges_location="$rpm_dir"/dist-pkg/packages
+email_log_file="$rpm_dir"/dist-pkg/email_scan.log
+
+if [ $opt_rpm -eq 1 ]; then
+    search_filter="*.rpm"
+    test_info_exec="rpm -qpi --changelog"
+else
+    search_filter="*.deb"
+    test_info_exec="apt info"
+fi
+
+# iterate over all packages and extract the metadata to the output file
+find "$pacakges_location" -type f -name "$search_filter" -exec $test_info_exec {} \; | tee -a "$email_log_file"
+
+do_archive "$email_log_file"
+
+set +e
+# grep email strings, excluding the allowed email networking-support@nvidia.com
+test_output=$(grep -E -o "\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b" "$email_log_file" | grep -v "networking-support")
+test_rc=$?
+# check rc - grep will return 0 if it found such mail and 1 if not +if [[ $test_rc -eq 0 ]]; then + # if we found such mail we will get return code 0 + echo "ERROR: found bad email address $test_output" + rc=$((rc + 1)) +elif [[ -n "$test_output" ]]; then + # if we got rc not 0 and we have output it means something else failed + echo "ERROR: could not find bad email but something else failed: $test_output" + rc=$((rc + 1)) +fi + +set -e + echo "[${0##*/}]..................exit code = $rc" exit $rc From b7ac236463a7d735b243405955f953fb8549e7b1 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Mon, 8 Jan 2024 12:57:56 +0200 Subject: [PATCH 031/169] issue: 3724170 Support building as a static library - Enable LTO by default - Add support for PGO - Switch orig_os_api with direct syscalls Signed-off-by: Alex Briskin --- config/m4/compiler.m4 | 109 +- config/m4/dpcp.m4 | 2 +- config/m4/linking_optimization.m4 | 121 ++ configure.ac | 13 + contrib/scripts/libxlio.spec.in | 1 + debian/libxlio-dev.install | 1 + src/core/Makefile.am | 16 +- src/core/dev/net_device_table_mgr.cpp | 33 +- src/core/dev/net_device_val.cpp | 19 +- src/core/dev/ring_bond.cpp | 12 +- src/core/dev/ring_simple.cpp | 4 +- src/core/dev/ring_tap.cpp | 24 +- src/core/event/event_handler_manager.cpp | 22 +- src/core/iomux/epfd_info.cpp | 14 +- src/core/iomux/epoll_wait_call.cpp | 5 +- src/core/iomux/poll_call.cpp | 8 +- src/core/iomux/select_call.cpp | 8 +- src/core/main.cpp | 1 + src/core/proto/mapping.cpp | 10 +- src/core/proto/netlink_socket_mgr.cpp | 8 +- src/core/sock/bind_no_port.cpp | 4 +- src/core/sock/pipeinfo.cpp | 13 +- src/core/sock/sock-app.cpp | 3 +- src/core/sock/sock-extra.cpp | 11 +- src/core/sock/sock-redirect.cpp | 2446 ++++++++++------------ src/core/sock/sock-redirect.h | 30 + src/core/sock/socket_fd_api.cpp | 45 +- src/core/sock/sockinfo.cpp | 25 +- src/core/sock/sockinfo_tcp.cpp | 59 +- src/core/sock/sockinfo_udp.cpp | 32 +- src/core/util/utils.cpp | 68 +- src/core/util/wakeup_pipe.cpp | 12 +- src/core/xlio.h | 142 ++ src/core/xlio_extra.h | 1 - tests/gtest/nvme/nvme.cc | 1 + 35 files changed, 1632 insertions(+), 1691 deletions(-) create mode 100644 config/m4/linking_optimization.m4 create mode 100644 src/core/xlio.h diff --git a/config/m4/compiler.m4 b/config/m4/compiler.m4 index 050ae60b1..0103dd4fe 100644 --- a/config/m4/compiler.m4 +++ b/config/m4/compiler.m4 @@ -71,107 +71,12 @@ AC_DEFUN([CHECK_COMPILER_ATTRIBUTE], [ # Usage: CHECK_COMPILER_CXX([standard], [option], [definition]) # Note: # - [definition] can be omitted if it is equal to attribute -# -AC_DEFUN([CHECK_COMPILER_CXX], [ - case "$1" in - 11) -m4_define([_prj_cv_compiler_body_11], [[ -#ifndef __cplusplus -#error This is not a C++ compiler -#elif __cplusplus < 201103L -#error This is not a C++11 compiler -#else -#include -int main(int argc, char** argv) -{ - (void)argc; - (void)argv; - /* decltype */ - int a = 5; - decltype(a) b = a; - return (b - a); -} -#endif // __cplusplus >= 201103L -]]) - ;; - 14) -m4_define([_prj_cv_compiler_body_14], [[ -#ifndef __cplusplus -#error This is not a C++ compiler -#elif __cplusplus < 201402L -#error This is not a C++14 compiler -#else -#include -int main(int argc, char** argv) -{ - (void)argc; - (void)argv; - /* Binary integer literals */ - constexpr auto i = 0b0000000000101010; - static_assert(i == 42, "wrong value"); - return 0; -} -#endif // __cplusplus >= 201402L -]]) - ;; - 17) -m4_define([_prj_cv_compiler_body_17], [[ -#ifndef __cplusplus -#error This is not a C++ compiler -#elif __cplusplus < 201703L 
-#error This is not a C++17 compiler -#else -int main(int argc, char** argv) -{ - (void)argc; - (void)argv; - // Check constexpr lambda - auto identity = [](int n) constexpr { return n; }; - static_assert(identity(123) == 123); - return 0; -} -#endif // __cplusplus >= 201703L -]]) - ;; - *) - AC_MSG_ERROR([invalid first argument as [$1] to [$0]]) - ;; - esac - case "$2" in - std) - prj_cv_option=-std=c++$1 - ;; - gnu) - prj_cv_option=-std=gnu++$1 - ;; - *) - AC_MSG_ERROR([invalid first argument as [$2] to [$0]]) - ;; - esac - - AC_CACHE_VAL(prj_cv_compiler_cxx_[$1], [ - prj_cv_compiler_save_CXXFLAGS="$CXXFLAGS" - CXXFLAGS="$prj_cv_option $CXXFLAGS" - - # - # Try to compile using the C++ compiler - # - AC_LANG_PUSH(C++) - AC_COMPILE_IFELSE([AC_LANG_SOURCE(_prj_cv_compiler_body_[$1])], - [prj_cv_compiler_cxx_$1=yes], - [prj_cv_compiler_cxx_$1=no]) - AC_LANG_POP(C++) - - CXXFLAGS="$prj_cv_compiler_save_CXXFLAGS" - ]) - AC_MSG_CHECKING([for compiler c++ [$1]]) - AC_MSG_RESULT([$prj_cv_compiler_cxx_$1]) - AS_IF([test "x$prj_cv_compiler_cxx_[$1]" = "xyes"], - [CXXFLAGS="$prj_cv_option $CXXFLAGS"], - [AC_MSG_ERROR([A compiler with support for C++[$1] language features is required])] - ) -]) - +saved_cxxflags="$CXXFLAGS" +CXXFLAGS="-Werror -std=c++14" +AC_MSG_CHECKING([whether CXX supports -std=c++14]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([])], [AC_MSG_RESULT([yes])], + [AC_MSG_ERROR([C++14 is unsupported])]) +CXXFLAGS="-std=c++14 $saved_cxxflags" ########################## # Configure compiler capabilities @@ -256,6 +161,4 @@ else AC_MSG_CHECKING([for symbols visibility]) AC_MSG_RESULT([yes]) fi - -CHECK_COMPILER_CXX([14], [std], []) ]) diff --git a/config/m4/dpcp.m4 b/config/m4/dpcp.m4 index e474566b2..5ac2a7aab 100644 --- a/config/m4/dpcp.m4 +++ b/config/m4/dpcp.m4 @@ -84,7 +84,7 @@ prj_cv_dpcp_save_LDFLAGS="$LDFLAGS" prj_cv_dpcp_save_LIBS="$LIBS" prj_cv_dpcp_CPPFLAGS="-I$with_dpcp/include" -prj_cv_dpcp_LIBS="-ldpcp -lmlx5" +prj_cv_dpcp_LIBS="-ldpcp -lmlx5 -libverbs -lgcov" prj_cv_dpcp_LDFLAGS="-L$with_dpcp/lib -Wl,--rpath,$with_dpcp/lib" if test -d "$with_dpcp/lib64"; then prj_cv_dpcp_LDFLAGS="-L$with_dpcp/lib64 -Wl,--rpath,$with_dpcp/lib64" diff --git a/config/m4/linking_optimization.m4 b/config/m4/linking_optimization.m4 new file mode 100644 index 000000000..2264c8b1f --- /dev/null +++ b/config/m4/linking_optimization.m4 @@ -0,0 +1,121 @@ +# +# Copyright © 2001-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +AC_PROG_CC +AC_PROG_CXX + +AC_MSG_CHECKING([for LTO]) +AC_ARG_ENABLE(lto, AS_HELP_STRING([--enable-lto], [Enable Link Time Optimization]), + [ + enable_lto=$enableval + ], [enable_lto=yes]) + +AS_IF([test "x$enable_lto" = "xyes"], + [ + case $CC in + gcc*|g++*) + AC_SUBST([XLIO_LTO], ["-flto=auto"]) + ;; + clang*|clang++*) + AC_SUBST([XLIO_LTO], ["-flto=thin"]) + ;; + *) + AC_MSG_ERROR([Compiler doesn't support link time optimization]) + ;; + esac + AC_MSG_RESULT([yes]) + ], + [ + AC_SUBST([XLIO_LTO], [""]) + AC_MSG_RESULT([no]) + ] +) + +AC_MSG_CHECKING([for PGO generate]) +AC_ARG_WITH([profile-generate], + [AS_HELP_STRING([--with-profile-generate=DIR], [Path to store profiles for Profile Guided Optimization])], + [ + COMMON_FLAGS="" + case $CC in + gcc*|g++*) + COMMON_FLAGS+="-fprofile-generate -fprofile-correction -Wno-error=missing-profile" + COMMON_FLAGS+=" -fprofile-partial-training -fprofile-dir=$withval" + ;; + clang*|clang++*) + COMMON_FLAGS+="-fprofile-generate=$withval" + ;; + *) + AC_MSG_ERROR([Compiler doesn't support profile guided optimization]) + ;; + esac + AC_CHECK_LIB([gcov], [__gcov_init], [], [AC_MSG_ERROR([libgcov not found])]) + AC_MSG_RESULT([$withval yes]) + profile_generate=yes + AC_SUBST([XLIO_PROFILE], ["$COMMON_FLAGS"]) + AC_SUBST([XLIO_GCOV], ["-lgcov"]) + ], + [ + profile_generate=no + AC_MSG_RESULT([no]) + ] +) + +AC_MSG_CHECKING([for PGO use]) +AC_ARG_WITH([profile-use], + [AS_HELP_STRING([--with-profile-use=DIR], [Path to read profiles for Profile Guided Optimization])], + [ + COMMON_FLAGS="" + case $CC in + gcc*|g++*) + COMMON_FLAGS+="-fprofile-use -fprofile-correction -Wno-error=missing-profile" + COMMON_FLAGS+=" -fprofile-partial-training -fprofile-dir=$withval" + ;; + clang*|clang++*) + COMMON_FLAGS+="-fprofile-use=$withval" + ;; + *) + AC_MSG_ERROR([Compiler doesn't support profile guided optimization]) + ;; + esac + AC_MSG_RESULT([$withval yes]) + profile_use=yes + AC_SUBST([XLIO_PROFILE], ["$COMMON_FLAGS"]) + ], + [ + profile_use=no + AC_MSG_RESULT([no]) + ] +) + +AS_IF([test "x$profile_use" = "xyes" && test "x$profile_generate" = "xyes"], [ + AC_MSG_ERROR([** Cannot use both --with-profile-generate and --with-profile-use]) +]) diff --git a/configure.ac b/configure.ac index 60a87715b..c18e85920 100644 --- a/configure.ac +++ b/configure.ac @@ -105,9 +105,22 @@ show_section_title "Configure build tools" : ${CFLAGS=""} : ${CXXFLAGS=""} +m4_include([config/m4/linking_optimization.m4]) + # Find compiler, libtools, etc # LT_INIT([disable-static]) + +# LT_INIT exposes the ability to configure --enable-static +if test "x$enable_static" = "xyes"; then + AC_SUBST([XLIO_STATIC_BUILD], ["-DXLIO_STATIC_BUILD"]) + if test "x$enable_shared" = "xyes"; then + AC_MSG_ERROR([Please add --disable-shared or --enable-shared=no]) + fi +else + AC_SUBST([XLIO_STATIC_BUILD], [""]) +fi + AC_PROG_CC AC_PROG_CXX diff --git a/contrib/scripts/libxlio.spec.in b/contrib/scripts/libxlio.spec.in index c00ffb42d..fc69e6071 100644 --- a/contrib/scripts/libxlio.spec.in +++ b/contrib/scripts/libxlio.spec.in @@ -178,6 +178,7 @@ fi %files devel %dir %{_includedir}/mellanox %{_includedir}/mellanox/xlio_extra.h +%{_includedir}/mellanox/xlio.h %if %{use_rel} > 0 %{_libdir}/%{name}-debug.so %endif diff --git 
a/debian/libxlio-dev.install b/debian/libxlio-dev.install index 04f2925df..49f9eb771 100644 --- a/debian/libxlio-dev.install +++ b/debian/libxlio-dev.install @@ -1,2 +1,3 @@ usr/include/mellanox/xlio_extra.h +usr/include/mellanox/xlio.h libxlio-debug.so usr/lib diff --git a/src/core/Makefile.am b/src/core/Makefile.am index d3714602d..d0bb5f7df 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -12,7 +12,7 @@ LEX_OUTPUT_ROOT=lex.libxlio_yy # as built) because we don't want it to be created by old version of flex/yacc # on some machines that will generate gcc warmings. # in case you change the *.l or *.y in the future - than change the commenting in the following 3 lines -#----- +# #BUILT_SOURCES += config_scanner.c config_parser.h config_parser.c #libconfig_parser_la_SOURCES += util/config_scanner.l util/config_parser.y libconfig_parser_la_SOURCES += config_scanner.c config_parser.c @@ -29,26 +29,32 @@ EXTRA_DIST = \ sysconf_DATA = util/libxlio.conf otherincludedir = $(includedir)/mellanox -otherinclude_HEADERS = xlio_extra.h +otherinclude_HEADERS = xlio_extra.h \ + xlio.h install-exec-hook: rm -f $(DESTDIR)$(libdir)/libxlio.la - rm -f $(DESTDIR)$(libdir)/libxlio.a rm -f $(DESTDIR)$(bindir)/state_machine_test rm -f $(DESTDIR)$(bindir)/vlogger_test + uninstall-hook: rm -f $(DESTDIR)$(libdir)/libxlio.so* + rm -f $(DESTDIR)$(libdir)/libxlio.a lib_LTLIBRARIES = libxlio.la AM_CPPFLAGS := \ -I$(top_srcdir)/src ${LIBNL_CFLAGS} -libxlio_la_LDFLAGS := -no-undefined -version-number @PRJ_LIBRARY_MAJOR@:@PRJ_LIBRARY_MINOR@:@PRJ_LIBRARY_REVISION@ +libxlio_la_CFLAGS = $(XLIO_STATIC_BUILD) $(XLIO_LTO) $(XLIO_PROFILE) +libxlio_la_CXXFLAGS = $(XLIO_STATIC_BUILD) $(XLIO_LTO) $(XLIO_PROFILE) + +libxlio_la_LDFLAGS := $(XLIO_LTO) $(XLIO_PROFILE) -no-undefined \ + -version-number @PRJ_LIBRARY_MAJOR@:@PRJ_LIBRARY_MINOR@:@PRJ_LIBRARY_REVISION@ libxlio_la_LIBADD = \ - -lrt -ldl -lpthread $(LIBNL_LIBS) $(VERBS_LIBS) $(DPCP_LIBS) \ + -lrt -ldl -lpthread $(LIBNL_LIBS) $(VERBS_LIBS) $(DPCP_LIBS) $(XLIO_GCOV) \ $(top_builddir)/src/utils/libutils.la \ $(top_builddir)/src/vlogger/libvlogger.la \ $(top_builddir)/src/state_machine/libstate_machine.la \ diff --git a/src/core/dev/net_device_table_mgr.cpp b/src/core/dev/net_device_table_mgr.cpp index 2a68a7acd..68743adb6 100644 --- a/src/core/dev/net_device_table_mgr.cpp +++ b/src/core/dev/net_device_table_mgr.cpp @@ -76,7 +76,7 @@ net_device_table_mgr::net_device_table_mgr() ndtm_logdbg(""); - m_global_ring_epfd = orig_os_api.epoll_create(48); + m_global_ring_epfd = SYSCALL(epoll_create, 48); BULLSEYE_EXCLUDE_BLOCK_START if (m_global_ring_epfd == -1) { @@ -85,12 +85,12 @@ net_device_table_mgr::net_device_table_mgr() throw_xlio_exception("epoll_create failed"); } - if (orig_os_api.pipe(m_global_ring_pipe_fds)) { + if (SYSCALL(pipe, m_global_ring_pipe_fds)) { ndtm_logerr("pipe create failed. (errno=%d %m)", errno); free_ndtm_resources(); throw_xlio_exception("pipe create failed"); } - if (orig_os_api.write(m_global_ring_pipe_fds[1], "#", 1) != 1) { + if (SYSCALL(write, m_global_ring_pipe_fds[1], "#", 1) != 1) { ndtm_logerr("pipe write failed. 
(errno=%d %m)", errno); free_ndtm_resources(); throw_xlio_exception("pipe write failed"); @@ -151,12 +151,12 @@ void net_device_table_mgr::free_ndtm_resources() m_lock.lock(); if (m_global_ring_epfd > 0) { - orig_os_api.close(m_global_ring_epfd); + SYSCALL(close, m_global_ring_epfd); m_global_ring_epfd = 0; } - orig_os_api.close(m_global_ring_pipe_fds[1]); - orig_os_api.close(m_global_ring_pipe_fds[0]); + SYSCALL(close, m_global_ring_pipe_fds[1]); + SYSCALL(close, m_global_ring_pipe_fds[0]); net_device_map_index_t::iterator itr; while ((itr = m_net_device_map_index.begin()) != m_net_device_map_index.end()) { @@ -191,7 +191,7 @@ void net_device_table_mgr::update_tbl() net_device_val *p_net_device_val; /* Set up the netlink socket */ - fd = orig_os_api.socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + fd = SYSCALL(socket, AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (fd < 0) { ndtm_logerr("netlink socket() creation"); return; @@ -210,7 +210,7 @@ void net_device_table_mgr::update_tbl() nl_req.infomsg.ifi_change = 0xffffffff; /* Send the netlink request */ - rc = orig_os_api.send(fd, &nl_req, nl_req.hdr.nlmsg_len, 0); + rc = SYSCALL(send, fd, &nl_req, nl_req.hdr.nlmsg_len, 0); if (rc < 0) { ndtm_logerr("netlink send() operation"); goto ret; @@ -220,7 +220,7 @@ void net_device_table_mgr::update_tbl() do { /* Receive the netlink reply */ - rc = orig_os_api.recv(fd, nl_res, sizeof(nl_res), 0); + rc = SYSCALL(recv, fd, nl_res, sizeof(nl_res), 0); if (rc < 0) { ndtm_logerr("netlink recv() operation"); goto ret; @@ -296,7 +296,7 @@ void net_device_table_mgr::update_tbl() ndtm_logdbg("Check completed. Found %ld offload capable network interfaces", m_net_device_map_index.size()); - orig_os_api.close(fd); + SYSCALL(close, fd); } void net_device_table_mgr::print_val_tbl() @@ -362,9 +362,9 @@ net_device_val *net_device_table_mgr::get_net_device_val(int if_index) net_dev->get_ifname()); if (ret > 0 && (size_t)ret < sizeof(sys_path)) { ret = errno; /* to suppress errno */ - int fd = open(sys_path, O_RDONLY); + int fd = SYSCALL(open, sys_path, O_RDONLY); if (fd >= 0) { - close(fd); + SYSCALL(close, fd); goto out; } errno = ret; @@ -479,7 +479,7 @@ int net_device_table_mgr::global_ring_wait_for_notification_and_process_element( int max_fd = 16; struct epoll_event events[max_fd]; - int res = orig_os_api.epoll_wait(global_ring_epfd_get(), events, max_fd, 0); + int res = SYSCALL(epoll_wait, global_ring_epfd_get(), events, max_fd, 0); if (res > 0) { for (int event_idx = 0; event_idx < res; ++event_idx) { int fd = events[event_idx].data.fd; // This is the Rx cq channel fd @@ -512,8 +512,8 @@ int net_device_table_mgr::global_ring_wait_for_notification_and_process_element( } else { ndtm_logdbg("removing wakeup fd from epfd"); BULLSEYE_EXCLUDE_BLOCK_START - if ((orig_os_api.epoll_ctl(m_global_ring_epfd, EPOLL_CTL_DEL, - m_global_ring_pipe_fds[0], NULL)) && + if ((SYSCALL(epoll_ctl, m_global_ring_epfd, EPOLL_CTL_DEL, + m_global_ring_pipe_fds[0], NULL)) && (!(errno == ENOENT || errno == EBADF))) { ndtm_logerr("failed to del pipe channel fd from internal epfd (errno=%d %m)", errno); @@ -588,8 +588,7 @@ void net_device_table_mgr::global_ring_wakeup() ev.data.ptr = NULL; int errno_tmp = errno; // don't let wakeup affect errno, as this can fail with EEXIST BULLSEYE_EXCLUDE_BLOCK_START - if ((orig_os_api.epoll_ctl(m_global_ring_epfd, EPOLL_CTL_ADD, m_global_ring_pipe_fds[0], - &ev)) && + if ((SYSCALL(epoll_ctl, m_global_ring_epfd, EPOLL_CTL_ADD, m_global_ring_pipe_fds[0], &ev)) && (errno != EEXIST)) { ndtm_logerr("failed to add 
pipe channel fd to internal epfd (errno=%d %m)", errno); } diff --git a/src/core/dev/net_device_val.cpp b/src/core/dev/net_device_val.cpp index a184c3054..ea35d9b21 100644 --- a/src/core/dev/net_device_val.cpp +++ b/src/core/dev/net_device_val.cpp @@ -360,7 +360,7 @@ void net_device_val::set_ip_array() static int _seq = 0; /* Set up the netlink socket */ - fd = orig_os_api.socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + fd = SYSCALL(socket, AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (fd < 0) { nd_logerr("netlink socket() creation"); return; @@ -377,7 +377,7 @@ void net_device_val::set_ip_array() nl_req.addrmsg.ifa_index = m_if_idx; /* Send the netlink request */ - rc = orig_os_api.send(fd, &nl_req, nl_req.hdr.nlmsg_len, 0); + rc = SYSCALL(send, fd, &nl_req, nl_req.hdr.nlmsg_len, 0); if (rc < 0) { nd_logerr("netlink send() operation"); goto ret; @@ -385,7 +385,7 @@ void net_device_val::set_ip_array() do { /* Receive the netlink reply */ - rc = orig_os_api.recv(fd, nl_res, sizeof(nl_res), 0); + rc = SYSCALL(recv, fd, nl_res, sizeof(nl_res), 0); if (rc < 0) { nd_logerr("netlink recv() operation"); goto ret; @@ -442,7 +442,7 @@ void net_device_val::set_ip_array() } while (1); ret: - orig_os_api.close(fd); + SYSCALL(close, fd); print_ips(); } @@ -1038,8 +1038,8 @@ ring *net_device_val::reserve_ring(resource_allocation_key *key) int cq_ch_fd = ring_rx_fds_array[i]; ev.data.fd = cq_ch_fd; BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(orig_os_api.epoll_ctl(g_p_net_device_table_mgr->global_ring_epfd_get(), - EPOLL_CTL_ADD, cq_ch_fd, &ev))) { + if (unlikely(SYSCALL(epoll_ctl, g_p_net_device_table_mgr->global_ring_epfd_get(), + EPOLL_CTL_ADD, cq_ch_fd, &ev))) { nd_logerr( "Failed to add RING notification fd to global_table_mgr_epfd (errno=%d %s)", errno, strerror(errno)); @@ -1091,10 +1091,9 @@ int net_device_val::release_ring(resource_allocation_key *key) for (size_t i = 0; i < num_ring_rx_fds; i++) { int cq_ch_fd = ring_rx_fds_array[i]; BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely( - (orig_os_api.epoll_ctl(g_p_net_device_table_mgr->global_ring_epfd_get(), - EPOLL_CTL_DEL, cq_ch_fd, NULL)) && - (!(errno == ENOENT || errno == EBADF)))) { + if (unlikely((SYSCALL(epoll_ctl, g_p_net_device_table_mgr->global_ring_epfd_get(), + EPOLL_CTL_DEL, cq_ch_fd, NULL)) && + (!(errno == ENOENT || errno == EBADF)))) { nd_logerr("Failed to delete RING notification fd to global_table_mgr_epfd " "(errno=%d %s)", errno, strerror(errno)); diff --git a/src/core/dev/ring_bond.cpp b/src/core/dev/ring_bond.cpp index b8fcef275..cb84f931e 100644 --- a/src/core/dev/ring_bond.cpp +++ b/src/core/dev/ring_bond.cpp @@ -175,7 +175,7 @@ void ring_bond::restart() epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); if (epfd > 0) { fd = ring_rx_fds_array[k]; - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_DEL, fd, NULL); ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } @@ -186,14 +186,14 @@ void ring_bond::restart() epfd = si->get_rx_epfd(); if (epfd > 0) { fd = ring_rx_fds_array[k]; - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_DEL, fd, NULL); ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } epfd = si->get_epoll_context_fd(); if (epfd > 0) { fd = ring_rx_fds_array[k]; - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_DEL, fd, NULL); ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } @@ 
-223,7 +223,7 @@ void ring_bond::restart() fd = ring_rx_fds_array[k]; ev.events = EPOLLIN; ev.data.fd = fd; - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_ADD, fd, &ev); ring_logdbg("Add fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } @@ -239,7 +239,7 @@ void ring_bond::restart() fd = ring_rx_fds_array[k]; ev.events = EPOLLIN; ev.data.fd = fd; - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_ADD, fd, &ev); ring_logdbg("Add fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } @@ -250,7 +250,7 @@ void ring_bond::restart() fd = ring_rx_fds_array[k]; ev.events = EPOLLIN | EPOLLPRI; ev.data.u64 = (((uint64_t)CQ_FD_MARK << 32) | fd); - rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_ADD, fd, &ev); ring_logdbg("Add fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index b7a15a2fb..f89d8da20 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -626,7 +626,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu /* coverity[double_unlock] coverity[unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); - ret = orig_os_api.poll(&poll_fd, 1, 100); + ret = SYSCALL(poll, &poll_fd, 1, 100); if (ret == 0) { m_lock_ring_tx_buf_wait.unlock(); /* coverity[double_lock] TODO: RM#1049980 */ @@ -820,7 +820,7 @@ bool ring_simple::is_available_qp_wr(bool b_block, unsigned credits) /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); - ret = orig_os_api.poll(&poll_fd, 1, -1); + ret = SYSCALL(poll, &poll_fd, 1, -1); if (ret <= 0) { ring_logdbg("failed blocking on cq_mgr_tx (errno=%d %m)", errno); m_lock_ring_tx_buf_wait.unlock(); diff --git a/src/core/dev/ring_tap.cpp b/src/core/dev/ring_tap.cpp index 66c395951..b59ac33a4 100644 --- a/src/core/dev/ring_tap.cpp +++ b/src/core/dev/ring_tap.cpp @@ -120,7 +120,7 @@ void ring_tap::tap_create(net_device_val *p_ndev) unsigned char hw_addr[ETH_ALEN]; /* Open TAP device */ - if ((m_tap_fd = orig_os_api.open("/dev/net/tun", O_RDWR)) < 0) { + if ((m_tap_fd = SYSCALL(open, "/dev/net/tun", O_RDWR)) < 0) { ring_logerr("FAILED to open tap %m"); rc = -errno; goto error; @@ -146,14 +146,14 @@ void ring_tap::tap_create(net_device_val *p_ndev) /* Setting TAP attributes */ ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE; - if ((rc = orig_os_api.ioctl(m_tap_fd, TUNSETIFF, (void *)&ifr)) < 0) { + if ((rc = SYSCALL(ioctl, m_tap_fd, TUNSETIFF, (void *)&ifr)) < 0) { ring_logerr("ioctl failed fd = %d, %d %m", m_tap_fd, rc); rc = -errno; goto error; } /* Set TAP fd nonblocking */ - if ((rc = orig_os_api.fcntl(m_tap_fd, F_SETFL, O_NONBLOCK)) < 0) { + if ((rc = SYSCALL(fcntl, m_tap_fd, F_SETFL, O_NONBLOCK)) < 0) { ring_logerr("ioctl failed fd = %d, %d %m", m_tap_fd, rc); rc = -errno; goto error; @@ -168,7 +168,7 @@ void ring_tap::tap_create(net_device_val *p_ndev) } /* Create socket */ - if ((ioctl_sock = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0)) < 0) { + if ((ioctl_sock = SYSCALL(socket, AF_INET, SOCK_DGRAM, 0)) < 0) { ring_logerr("FAILED to open socket"); rc = -errno; goto error; @@ -178,7 +178,7 @@ void ring_tap::tap_create(net_device_val *p_ndev) ifr.ifr_hwaddr.sa_family = AF_LOCAL; get_local_ll_addr(p_ndev->get_ifname_link(), hw_addr, ETH_ALEN, false); memcpy(ifr.ifr_hwaddr.sa_data, hw_addr, ETH_ALEN); - if ((rc = orig_os_api.ioctl(ioctl_sock, 
SIOCSIFHWADDR, &ifr)) < 0) { + if ((rc = SYSCALL(ioctl, ioctl_sock, SIOCSIFHWADDR, &ifr)) < 0) { ring_logerr("ioctl SIOCSIFHWADDR failed %d %m, %s", rc, tap_name); rc = -errno; goto error; @@ -186,7 +186,7 @@ void ring_tap::tap_create(net_device_val *p_ndev) /* Set link UP */ ifr.ifr_flags |= (IFF_UP | IFF_SLAVE); - if ((rc = orig_os_api.ioctl(ioctl_sock, SIOCSIFFLAGS, &ifr)) < 0) { + if ((rc = SYSCALL(ioctl, ioctl_sock, SIOCSIFFLAGS, &ifr)) < 0) { ring_logerr("ioctl SIOCGIFFLAGS failed %d %m, %s", rc, tap_name); rc = -errno; goto error; @@ -203,7 +203,7 @@ void ring_tap::tap_create(net_device_val *p_ndev) /* Update if_index on ring class */ set_if_index(tap_if_index); - orig_os_api.close(ioctl_sock); + SYSCALL(close, ioctl_sock); ring_logdbg("Tap device %d: %s [fd=%d] was created successfully", tap_if_index, ifr.ifr_name, m_tap_fd); @@ -214,11 +214,11 @@ void ring_tap::tap_create(net_device_val *p_ndev) ring_logerr("Tap device creation failed %d, %m", rc); if (ioctl_sock >= 0) { - orig_os_api.close(ioctl_sock); + SYSCALL(close, ioctl_sock); } if (m_tap_fd >= 0) { - orig_os_api.close(m_tap_fd); + SYSCALL(close, m_tap_fd); } m_tap_fd = -1; @@ -227,7 +227,7 @@ void ring_tap::tap_create(net_device_val *p_ndev) void ring_tap::tap_destroy() { if (m_tap_fd >= 0) { - orig_os_api.close(m_tap_fd); + SYSCALL(close, m_tap_fd); m_tap_fd = -1; } @@ -425,7 +425,7 @@ int ring_tap::process_element_rx(void *pv_fd_ready_array) std::lock_guard lock(m_lock_ring_rx); if (m_rx_pool.size() || request_more_rx_buffers()) { mem_buf_desc_t *buff = m_rx_pool.get_and_pop_front(); - ret = orig_os_api.read(m_tap_fd, buff->p_buffer, buff->sz_buffer); + ret = SYSCALL(read, m_tap_fd, buff->p_buffer, buff->sz_buffer); if (ret > 0) { /* Data was read and processed successfully */ buff->sz_data = ret; @@ -607,7 +607,7 @@ int ring_tap::send_buffer(xlio_ibv_send_wr *wr, xlio_wr_tx_packet_attr attr) iovec[i].iov_len = wr->sg_list[i].length; } - ret = orig_os_api.writev(m_tap_fd, iovec, wr->num_sge); + ret = SYSCALL(writev, m_tap_fd, iovec, wr->num_sge); if (ret < 0) { ring_logdbg("writev: tap_fd %d, errno: %d\n", m_tap_fd, errno); } diff --git a/src/core/event/event_handler_manager.cpp b/src/core/event/event_handler_manager.cpp index 71fd86a58..8ea210d16 100644 --- a/src/core/event/event_handler_manager.cpp +++ b/src/core/event/event_handler_manager.cpp @@ -37,7 +37,7 @@ #include #include "core/dev/ring_allocation_logic.h" #include "core/sock/fd_collection.h" -#include "core/sock/sock-redirect.h" // calling orig_os_api.epoll() +#include "core/sock/sock-redirect.h" // calling SYSCALL(epoll)() #include "core/proto/route_table_mgr.h" #include "timer_handler.h" #include "event_handler_ibverbs.h" @@ -249,7 +249,7 @@ event_handler_manager::event_handler_manager(bool internal_thread_mode) return; } - m_epfd = orig_os_api.epoll_create(INITIAL_EVENTS_NUM); + m_epfd = SYSCALL(epoll_create, INITIAL_EVENTS_NUM); BULLSEYE_EXCLUDE_BLOCK_START if (m_epfd == -1) { evh_logdbg("epoll_create failed on ibv device collection (errno=%d %m)", errno); @@ -399,7 +399,7 @@ void event_handler_manager::stop_thread() m_event_handler_tid = 0; // Close main epfd and signaling socket - orig_os_api.close(m_epfd); + SYSCALL(close, m_epfd); m_epfd = -1; } @@ -414,7 +414,7 @@ void event_handler_manager::update_epfd(int fd, int operation, int events) ev.events = events; ev.data.fd = fd; BULLSEYE_EXCLUDE_BLOCK_START - if ((orig_os_api.epoll_ctl(m_epfd, operation, fd, &ev) < 0) && + if ((SYSCALL(epoll_ctl, m_epfd, operation, fd, &ev) < 0) && (!(errno == ENOENT || 
errno == EBADF))) { const char *operation_str[] = {"", "ADD", "DEL", "MOD"}; evh_logerr("epoll_ctl(%d, %s, fd=%d) failed (errno=%d %m)", m_epfd, @@ -525,7 +525,7 @@ void event_handler_manager::priv_prepare_ibverbs_async_event_queue(event_handler set_fd_block_mode(poll_fd.fd, false); // empty the async event queue - while (orig_os_api.poll(&poll_fd, 1, 0) > 0) { + while (SYSCALL(poll, &poll_fd, 1, 0) > 0) { process_ibverbs_event(i); cnt++; } @@ -795,7 +795,7 @@ void event_handler_manager::query_for_ibverbs_event(int async_fd) } // Check for ready events - if (orig_os_api.poll(&poll_fd, 1, 0) <= 0) { + if (SYSCALL(poll, &poll_fd, 1, 0) <= 0) { return; } @@ -968,7 +968,7 @@ void *event_handler_manager::thread_loop() epoll_event evt = {0, {0}}; evt.events = EPOLLIN | EPOLLPRI; evt.data.fd = m_cq_epfd; - orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, m_cq_epfd, &evt); + SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_ADD, m_cq_epfd, &evt); } } @@ -989,13 +989,13 @@ void *event_handler_manager::thread_loop() } } - evh_logfuncall("calling orig_os_api.epoll with %d msec timeout", timeout_msec); - int ret = orig_os_api.epoll_wait(m_epfd, p_events, maxevents, timeout_msec); + evh_logfuncall("calling SYSCALL(epoll) with %d msec timeout", timeout_msec); + int ret = SYSCALL(epoll_wait, m_epfd, p_events, maxevents, timeout_msec); if (ret < 0) { evh_logfunc("epoll returned with error, errno=%d %m)", errno); continue; } - evh_logfuncall("orig_os_api.epoll found %d ready fds", ret); + evh_logfuncall("SYSCALL(epoll) found %d ready fds", ret); // check pipe for (int idx = 0; (idx < ret) && (m_b_continue_running); ++idx) { @@ -1058,7 +1058,7 @@ void *event_handler_manager::thread_loop() case EV_RDMA_CM: int result; poll_fd.fd = fd; - result = orig_os_api.poll(&poll_fd, 1, 0); + result = SYSCALL(poll, &poll_fd, 1, 0); if (result == 0) { evh_logdbg("error in fd %d", fd); break; diff --git a/src/core/iomux/epfd_info.cpp b/src/core/iomux/epfd_info.cpp index a2d21ba5e..13dc50078 100644 --- a/src/core/iomux/epfd_info.cpp +++ b/src/core/iomux/epfd_info.cpp @@ -45,7 +45,7 @@ int epfd_info::remove_fd_from_epoll_os(int fd) { - int ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, fd, NULL); + int ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, fd, NULL); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_dbg("failed to remove fd=%d from os epoll epfd=%d (errno=%d %m)", fd, m_epfd, errno); @@ -152,7 +152,7 @@ int epfd_info::ctl(int op, int fd, epoll_event *event) } // YossiE TODO make "event table" - and add index in that table instead - // of real event (in orig_os_api.epoll_ctl). must have this because fd's can + // of real event (in SYSCALL(epoll_ctl)). must have this because fd's can // be added after the cq. 
lock(); @@ -237,7 +237,7 @@ int epfd_info::add_fd(int fd, epoll_event *event) evt.events = event->events; evt.data.u64 = 0; // zero all data evt.data.fd = fd; - ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, fd, &evt); + ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_ADD, fd, &evt); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_dbg("failed to add fd=%d to epoll epfd=%d (errno=%d %m)", fd, m_epfd, errno); @@ -341,7 +341,7 @@ void epfd_info::increase_ring_ref_count(ring *ring) evt.events = EPOLLIN | EPOLLPRI; int fd = ring_rx_fds_array[i]; evt.data.u64 = (((uint64_t)CQ_FD_MARK << 32) | fd); - int ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, fd, &evt); + int ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_ADD, fd, &evt); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_dbg("failed to add cq fd=%d to epoll epfd=%d (errno=%d %m)", fd, m_epfd, @@ -378,7 +378,7 @@ void epfd_info::decrease_ring_ref_count(ring *ring) int *ring_rx_fds_array = ring->get_rx_channel_fds(num_ring_rx_fds); for (size_t i = 0; i < num_ring_rx_fds; i++) { // delete cq fd from epfd - int ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], NULL); + int ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], NULL); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_dbg("failed to remove cq fd=%d from epfd=%d (errno=%d %m)", @@ -503,7 +503,7 @@ int epfd_info::mod_fd(int fd, epoll_event *event) evt.events = event->events; evt.data.u64 = 0; // zero all data evt.data.fd = fd; - ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_MOD, fd, &evt); + ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_MOD, fd, &evt); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_err("failed to modify fd=%d in epoll epfd=%d (errno=%d %m)", fd, m_epfd, errno); @@ -754,7 +754,7 @@ int epfd_info::ring_wait_for_notification_and_process_element(uint64_t *p_poll_s } else { __log_dbg("failed to find channel fd. removing cq fd=%d from epfd=%d", fd, m_epfd); BULLSEYE_EXCLUDE_BLOCK_START - if ((orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, fd, NULL)) && + if ((SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, fd, NULL)) && (!(errno == ENOENT || errno == EBADF))) { __log_err("failed to del cq channel fd=%d from os epfd=%d (errno=%d %m)", fd, m_epfd, errno); diff --git a/src/core/iomux/epoll_wait_call.cpp b/src/core/iomux/epoll_wait_call.cpp index 2a5765356..b172d79d5 100644 --- a/src/core/iomux/epoll_wait_call.cpp +++ b/src/core/iomux/epoll_wait_call.cpp @@ -200,10 +200,9 @@ bool epoll_wait_call::_wait(int timeout) } if (m_sigmask) { - ready_fds = - orig_os_api.epoll_pwait(m_epfd, m_p_ready_events, m_maxevents, timeout, m_sigmask); + ready_fds = SYSCALL(epoll_pwait, m_epfd, m_p_ready_events, m_maxevents, timeout, m_sigmask); } else { - ready_fds = orig_os_api.epoll_wait(m_epfd, m_p_ready_events, m_maxevents, timeout); + ready_fds = SYSCALL(epoll_wait, m_epfd, m_p_ready_events, m_maxevents, timeout); } if (timeout) { diff --git a/src/core/iomux/poll_call.cpp b/src/core/iomux/poll_call.cpp index 93739b165..f8ef79406 100644 --- a/src/core/iomux/poll_call.cpp +++ b/src/core/iomux/poll_call.cpp @@ -141,9 +141,9 @@ bool poll_call::wait_os(bool zero_timeout) to.tv_nsec = (m_timeout % 1000) * 1000000; pto = &to; } - m_n_all_ready_fds = orig_os_api.ppoll(m_fds, m_nfds, pto, m_sigmask); + m_n_all_ready_fds = SYSCALL(ppoll, m_fds, m_nfds, pto, m_sigmask); } else { - m_n_all_ready_fds = orig_os_api.poll(m_fds, m_nfds, zero_timeout ? 0 : m_timeout); + m_n_all_ready_fds = SYSCALL(poll, m_fds, m_nfds, zero_timeout ? 
0 : m_timeout); } if (m_n_all_ready_fds < 0) { xlio_throw_object(io_mux_call::io_error); @@ -175,9 +175,9 @@ bool poll_call::wait(const timeval &elapsed) to.tv_sec = m_timeout / 1000; to.tv_nsec = (m_timeout % 1000) * 1000000; pto = &to; - m_n_all_ready_fds = orig_os_api.ppoll(m_fds, m_nfds + 1, pto, m_sigmask); + m_n_all_ready_fds = SYSCALL(ppoll, m_fds, m_nfds + 1, pto, m_sigmask); } else { - m_n_all_ready_fds = orig_os_api.poll(m_fds, m_nfds + 1, timeout); + m_n_all_ready_fds = SYSCALL(poll, m_fds, m_nfds + 1, timeout); } if (m_n_all_ready_fds > 0 && m_fds[m_nfds].revents) { diff --git a/src/core/iomux/select_call.cpp b/src/core/iomux/select_call.cpp index e81961047..b3a95c9bc 100644 --- a/src/core/iomux/select_call.cpp +++ b/src/core/iomux/select_call.cpp @@ -219,9 +219,9 @@ bool select_call::wait_os(bool zero_timeout) pto_pselect = &to_pselect; } m_n_all_ready_fds = - orig_os_api.pselect(m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask); + SYSCALL(pselect, m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask); } else { - m_n_all_ready_fds = orig_os_api.select(m_nfds, m_readfds, m_writefds, m_exceptfds, pto); + m_n_all_ready_fds = SYSCALL(select, m_nfds, m_readfds, m_writefds, m_exceptfds, pto); } if (m_n_all_ready_fds < 0) { xlio_throw_object(io_mux_call::io_error); @@ -283,10 +283,10 @@ bool select_call::wait(const timeval &elapsed) pto_pselect = &to_pselect; } m_n_all_ready_fds = - orig_os_api.pselect(m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask); + SYSCALL(pselect, m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask); } else { m_n_all_ready_fds = - orig_os_api.select(m_nfds_with_cq, m_readfds, m_writefds, m_exceptfds, pto); + SYSCALL(select, m_nfds_with_cq, m_readfds, m_writefds, m_exceptfds, pto); } __log_func("done select CQ+OS nfds=%d cqfd=%d pto=%p ready=%d!!!", m_nfds_with_cq, m_cqepfd, pto, m_n_all_ready_fds); diff --git a/src/core/main.cpp b/src/core/main.cpp index 3f31d9053..6b4b89311 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -77,6 +77,7 @@ #include "util/instrumentation.h" #include "util/agent.h" +#include "xlio.h" void check_netperf_flags(); diff --git a/src/core/proto/mapping.cpp b/src/core/proto/mapping.cpp index 854975226..566d71763 100644 --- a/src/core/proto/mapping.cpp +++ b/src/core/proto/mapping.cpp @@ -146,7 +146,7 @@ int mapping_t::map(int fd) failed_unmap: (void)munmap(m_addr, m_size); failed_close_fd: - orig_os_api.close(m_fd); + SYSCALL(close, m_fd); m_addr = NULL; m_size = 0; m_fd = -1; @@ -171,7 +171,7 @@ int mapping_t::unmap(void) map_logerr("munmap() errno=%d (%s)", errno, strerror(errno)); } p_cache->memory_free(m_size); - orig_os_api.close(m_fd); + SYSCALL(close, m_fd); m_fd = -1; m_addr = NULL; m_size = 0; @@ -230,7 +230,7 @@ int mapping_t::duplicate_fd(int fd, bool &rw) len = readlink(link, filename, sizeof(filename) - 1); if (len > 0) { filename[len] = '\0'; - result = orig_os_api.open(filename, O_RDWR); + result = SYSCALL(open, filename, O_RDWR); if (result < 0) { map_logdbg("open() errno=%d (%s)", errno, strerror(errno)); } else { @@ -248,11 +248,11 @@ int mapping_t::duplicate_fd(int fd, bool &rw) if (result < 0) { /* Fallback to dup(2). 
*/ - result = orig_os_api.dup(fd); + result = SYSCALL(dup, fd); if (result < 0) { map_logerr("dup() errno=%d (%s)", errno, strerror(errno)); } else { - int flags = orig_os_api.fcntl(result, F_GETFL); + int flags = SYSCALL(fcntl, result, F_GETFL); rw = (flags > 0) && ((flags & O_RDWR) == O_RDWR); } } diff --git a/src/core/proto/netlink_socket_mgr.cpp b/src/core/proto/netlink_socket_mgr.cpp index 75e0949ab..87f468908 100644 --- a/src/core/proto/netlink_socket_mgr.cpp +++ b/src/core/proto/netlink_socket_mgr.cpp @@ -90,14 +90,14 @@ bool netlink_socket_mgr::query(const struct nlmsghdr *nl_msg, char *buf, int &le uint32_t nl_seq = nl_msg->nlmsg_seq; BULLSEYE_EXCLUDE_BLOCK_START - if ((sockfd = orig_os_api.socket(PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE)) < 0) { + if ((sockfd = SYSCALL(socket, PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE)) < 0) { __log_err("NL socket creation failed, errno = %d", errno); return false; } - if (orig_os_api.fcntl(sockfd, F_SETFD, FD_CLOEXEC) != 0) { + if (SYSCALL(fcntl, sockfd, F_SETFD, FD_CLOEXEC) != 0) { __log_warn("Fail in fcntl, errno = %d", errno); } - if ((len = orig_os_api.send(sockfd, nl_msg, nl_msg->nlmsg_len, 0)) < 0) { + if ((len = SYSCALL(send, sockfd, nl_msg, nl_msg->nlmsg_len, 0)) < 0) { __log_err("Write to NL socket failed, errno = %d", errno); } if (len > 0 && (len = recv_info(sockfd, nl_pid, nl_seq, buf)) < 0) { @@ -126,7 +126,7 @@ int netlink_socket_mgr::recv_info(int sockfd, uint32_t pid, uint32_t seq, char * do { // Receive response from the kernel BULLSEYE_EXCLUDE_BLOCK_START - if ((readLen = orig_os_api.recv(sockfd, buf_ptr, MSG_BUFF_SIZE - msgLen, 0)) < 0) { + if ((readLen = SYSCALL(recv, sockfd, buf_ptr, MSG_BUFF_SIZE - msgLen, 0)) < 0) { __log_err("NL socket read failed, errno = %d", errno); return -1; } diff --git a/src/core/sock/bind_no_port.cpp b/src/core/sock/bind_no_port.cpp index 83e17dca7..32a9f5879 100644 --- a/src/core/sock/bind_no_port.cpp +++ b/src/core/sock/bind_no_port.cpp @@ -52,7 +52,7 @@ int bind_no_port::set_src_port_in_db(int fd, in_port_t port, flow_tuple &tuple) if (INPORT_ANY == port) { sock_addr addr; socklen_t addr_len = sizeof(addr); - if ((ret = orig_os_api.getsockname(fd, addr.get_p_sa(), &addr_len))) { + if ((ret = SYSCALL(getsockname, fd, addr.get_p_sa(), &addr_len))) { return ret; } port = addr.get_in_port(); @@ -92,7 +92,7 @@ int bind_no_port::bind_and_set_port_map(const sock_addr &src, const sock_addr &d in_port_t chosen_port = choose_src_port(tuple); addr.set_in_port(chosen_port); - if ((ret = orig_os_api.bind(fd, addr.get_p_sa(), addr_len))) { + if ((ret = SYSCALL(bind, fd, addr.get_p_sa(), addr_len))) { return ret; } diff --git a/src/core/sock/pipeinfo.cpp b/src/core/sock/pipeinfo.cpp index 64e0f7d23..93c1b20e3 100644 --- a/src/core/sock/pipeinfo.cpp +++ b/src/core/sock/pipeinfo.cpp @@ -213,7 +213,7 @@ int pipeinfo::fcntl(int __cmd, unsigned long int __arg) return ret_val; } - return orig_os_api.fcntl(m_fd, __cmd, __arg); + return SYSCALL(fcntl, m_fd, __cmd, __arg); } int pipeinfo::fcntl64(int __cmd, unsigned long int __arg) @@ -225,7 +225,7 @@ int pipeinfo::fcntl64(int __cmd, unsigned long int __arg) return ret_val; } - return orig_os_api.fcntl64(m_fd, __cmd, __arg); + return SYSCALL(fcntl64, m_fd, __cmd, __arg); } int pipeinfo::ioctl(unsigned long int __request, unsigned long int __arg) @@ -250,7 +250,7 @@ int pipeinfo::ioctl(unsigned long int __request, unsigned long int __arg) break; } - return orig_os_api.ioctl(m_fd, __request, __arg); + return SYSCALL(ioctl, m_fd, __request, __arg); } ssize_t pipeinfo::rx(const 
rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, int *p_flags, @@ -285,7 +285,7 @@ ssize_t pipeinfo::tx(xlio_tx_call_attr_t &tx_arg) m_lock_tx.lock(); switch (tx_arg.opcode) { case TX_WRITE: - ret = orig_os_api.write(m_fd, p_iov[0].iov_base, p_iov[0].iov_len); + ret = SYSCALL(write, m_fd, p_iov[0].iov_base, p_iov[0].iov_len); break; case TX_SEND: case TX_SENDTO: @@ -326,7 +326,10 @@ void pipeinfo::write_lbm_pipe_enhance() // Send the buffered data char buf[10] = "\0"; - orig_os_api.write(m_fd, buf, 1); + auto result = SYSCALL(write, m_fd, buf, 1); + if (result == -1) { + pi_logdbg("write sycall failed"); + } } void pipeinfo::statistics_print(vlog_levels_t log_level) diff --git a/src/core/sock/sock-app.cpp b/src/core/sock/sock-app.cpp index 3f2953d8a..93cc48e5f 100644 --- a/src/core/sock/sock-app.cpp +++ b/src/core/sock/sock-app.cpp @@ -35,6 +35,7 @@ #endif #include +#include #include #include #include @@ -253,7 +254,7 @@ static int init_worker(int worker_id, int listen_fd) if (child_sock_fd_api) { child_sock_fd_api->copy_sockopt_fork(parent_sock_fd_api); - ret = bind(listen_fd, sa.get_p_sa(), sa_len); + ret = bind_internal(child_sock_fd_api, sa.get_p_sa(), sa_len); if (ret < 0) { app_logerr("bind() error"); } diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index 83608ae3c..47dfab67a 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -83,12 +83,7 @@ extern "C" int xlio_recvfrom_zcopy(int __fd, void *__buf, size_t __nbytes, int * *__flags |= MSG_XLIO_ZCOPY; return p_socket_object->rx(RX_RECVFROM, piov, 1, __flags, __from, __fromlen); } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.recvfrom) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.recvfrom(__fd, __buf, __nbytes, *__flags, __from, __fromlen); + return SYSCALL(recvfrom, __fd, __buf, __nbytes, *__flags, __from, __fromlen); } extern "C" int xlio_recvfrom_zcopy_free_packets(int __fd, struct xlio_recvfrom_zcopy_packet_t *pkts, @@ -315,7 +310,7 @@ static inline struct cmsghdr *__cmsg_nxthdr(void *__ctl, size_t __size, struct c return __ptr; } -extern "C" int xlio_ioctl(void *cmsg_hdr, size_t cmsg_len) +extern "C" int xlio_extra_ioctl(void *cmsg_hdr, size_t cmsg_len) { struct cmsghdr *cmsg = (struct cmsghdr *)cmsg_hdr; @@ -386,7 +381,7 @@ struct xlio_api_t *extra_api(void) enable_socketxtreme ? 
xlio_socketxtreme_free_buff : dummy_xlio_socketxtreme_free_buff, XLIO_EXTRA_API_SOCKETXTREME_FREE_XLIO_BUFF); SET_EXTRA_API(dump_fd_stats, xlio_dump_fd_stats, XLIO_EXTRA_API_DUMP_FD_STATS); - SET_EXTRA_API(ioctl, xlio_ioctl, XLIO_EXTRA_API_IOCTL); + SET_EXTRA_API(ioctl, xlio_extra_ioctl, XLIO_EXTRA_API_IOCTL); } return xlio_api; diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index 0b8d08509..5f315b6e6 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -35,8 +35,12 @@ #include "sock-redirect.h" #include "sock-extra.h" #include "sock-app.h" +#include "xlio.h" +#include #include +#include +#include #include #include #include @@ -82,7 +86,6 @@ using namespace std; #define srdr_logfunc_exit __log_exit_func #define EP_MAX_EVENTS (int)((INT_MAX / sizeof(struct epoll_event))) - struct os_api orig_os_api; struct sigaction g_act_prev; sighandler_t g_sighandler = NULL; @@ -177,7 +180,6 @@ void get_orig_funcs() GET_ORIG_FUNC(creat); GET_ORIG_FUNC(dup); GET_ORIG_FUNC(dup2); - GET_ORIG_FUNC(clone); GET_ORIG_FUNC(fork); GET_ORIG_FUNC(vfork); GET_ORIG_FUNC(daemon); @@ -279,15 +281,6 @@ bool handle_close(int fd, bool cleanup, bool passthrough) //----------------------------------------------------------------------------- // replacement functions //----------------------------------------------------------------------------- - -/* Create a new socket of type TYPE in domain DOMAIN, using - protocol PROTOCOL. If PROTOCOL is zero, one is chosen automatically. - Returns a file descriptor for the new socket, or -1 for errors. */ -extern "C" EXPORT_SYMBOL int socket(int __domain, int __type, int __protocol) -{ - return socket_internal(__domain, __type, __protocol, true, true); -} - /* Internal logic of socket() syscall implementation. It can be called from within XLIO, for example, to create a socket for an incoming TCP connection. 
*/ int socket_internal(int __domain, int __type, int __protocol, bool shadow, bool check_offload) @@ -301,11 +294,6 @@ int socket_internal(int __domain, int __type, int __protocol, bool shadow, bool } PROFILE_BLOCK("socket") - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.socket) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END #if defined(DEFINED_NGINX) bool add_to_udp_pool = false; if (g_p_app && g_p_app->type == APP_NGINX && g_p_fd_collection && offload_sockets && @@ -316,7 +304,7 @@ int socket_internal(int __domain, int __type, int __protocol, bool shadow, bool fd = SOCKET_FAKE_FD; if (shadow || !offload_sockets || !g_p_fd_collection) { - fd = orig_os_api.socket(__domain, __type, __protocol); + fd = SYSCALL(socket, __domain, __type, __protocol); vlog_printf(VLOG_DEBUG, "ENTER: %s(domain=%s(%d), type=%s(%d), protocol=%d) = %d\n", __func__, socket_get_domain_str(__domain), __domain, socket_get_type_str(__type), __type, __protocol, fd); @@ -344,207 +332,662 @@ int socket_internal(int __domain, int __type, int __protocol, bool shadow, bool return fd; } -extern "C" EXPORT_SYMBOL int close(int __fd) +int bind_internal(void *sock, const struct sockaddr *addr, socklen_t addrlen) { - PROFILE_FUNC - - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.close) { - get_orig_funcs(); + auto p_socket_object = reinterpret_cast(sock); + int ret = p_socket_object->bind(addr, addrlen); + if (p_socket_object->isPassthrough()) { + int fd = p_socket_object->get_fd(); + handle_close(fd, false, true); + if (ret) { + ret = SYSCALL(bind, fd, addr, addrlen); + } } - BULLSEYE_EXCLUDE_BLOCK_END - - srdr_logdbg_entry("fd=%d", __fd); - - bool toclose = handle_close(__fd); - int rc = toclose ? orig_os_api.close(__fd) : 0; - - return rc; + return ret; } -extern "C" EXPORT_SYMBOL void __res_iclose(res_state statp, bool free_addr) +ssize_t sendmsg_internal(void *sock, __const struct msghdr *__msg, int __flags) { - PROFILE_FUNC - - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__res_iclose) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - /* Current implementation doesn't handle XLIO sockets without a shadow socket or from a socket - pool. If such a socket is present in the nssocks list, system __res_iclose() will close the - fd. This will break the socket functionality. - Assume that resolver doesn't use the above scenarios. */ + auto p_socket_object = reinterpret_cast(sock); + xlio_tx_call_attr_t tx_arg; - srdr_logdbg_entry(""); - for (int ns = 0; ns < statp->_u._ext.nscount; ns++) { - int sock = statp->_u._ext.nssocks[ns]; - if (sock != -1) { - handle_close(sock); + tx_arg.opcode = TX_SENDMSG; + tx_arg.attr.iov = __msg->msg_iov; + tx_arg.attr.sz_iov = (ssize_t)__msg->msg_iovlen; + tx_arg.attr.flags = __flags; + tx_arg.attr.addr = (struct sockaddr *)(__CONST_SOCKADDR_ARG)__msg->msg_name; + tx_arg.attr.len = (socklen_t)__msg->msg_namelen; + tx_arg.attr.hdr = __msg; + tx_arg.priv.attr = PBUF_NONE; + + if (0 < __msg->msg_controllen) { + struct cmsghdr *cmsg = CMSG_FIRSTHDR((struct msghdr *)__msg); + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_XLIO_PD || cmsg->cmsg_type == SCM_XLIO_NVME_PD)) { + if ((tx_arg.attr.flags & MSG_ZEROCOPY) && + (__msg->msg_iovlen == + ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(struct xlio_pd_key)))) { + tx_arg.priv.attr = + (cmsg->cmsg_type == SCM_XLIO_PD) ? 
PBUF_DESC_MKEY : PBUF_DESC_NVME_TX; + tx_arg.priv.map = (void *)CMSG_DATA(cmsg); + } else { + errno = EINVAL; + return -1; + } } } - orig_os_api.__res_iclose(statp, free_addr); + + return p_socket_object->tx(tx_arg); } -/* Shut down all or part of the connection open on socket FD. - HOW determines what to shut down: - SHUT_RD = No more receptions; - SHUT_WR = No more transmissions; - SHUT_RDWR = No more receptions or transmissions. - Returns 0 on success, -1 for errors. */ -extern "C" EXPORT_SYMBOL int shutdown(int __fd, int __how) +static ssize_t sendfile_helper(socket_fd_api *p_socket_object, int in_fd, __off64_t *offset, + size_t count) { - PROFILE_FUNC - - srdr_logdbg_entry("fd=%d, how=%d", __fd, __how); + ssize_t totSent = 0; + struct stat64 stat_buf; + __off64_t orig_offset = 0; + __off64_t cur_offset; + struct iovec piov[1]; + xlio_tx_call_attr_t tx_arg; + sockinfo *s = (sockinfo *)p_socket_object; - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - return p_socket_object->shutdown(__how); + if (p_socket_object->get_type() != FD_TYPE_SOCKET) { + errno = EBADF; + return -1; } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.shutdown) { - get_orig_funcs(); + if (offset == NULL) { + orig_offset = lseek64(in_fd, 0, SEEK_CUR); + if (orig_offset < 0) { + errno = ESPIPE; + return -1; + } + cur_offset = orig_offset; + } else { + cur_offset = *offset; } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.shutdown(__fd, __how); -} - -extern "C" EXPORT_SYMBOL int listen(int __fd, int backlog) -{ - PROFILE_FUNC - srdr_logdbg_entry("fd=%d, backlog=%d", __fd, backlog); + if (PROTO_TCP == s->get_protocol()) { + mapping_t *mapping; + int rc; -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - if (g_p_app && g_p_app->type != APP_NONE) { - /* Envoy: - * Socket handling - * Envoy uses the following procedure for creating sockets and assigning them to workers. - * - * When a listener is created, a socket is pre-created for every worker on the main thread. - * This allows most errors to be caught early on in the listener creation process (e.g., bad - * socket option, unable to bind, etc.). - * - If using reuse_port, a unique socket is created for every worker. - * - If not using reuse_port, a unique socket is created for worker 0, and then that socket - * is duplicated for all other workers. - * a listener can close() its sockets when removed without concern for other listeners. - * - * Implementation: - * - reuse_port(false) : - * Envoy uses dup() call for listen socket on workers_N (N > 0) - * dup() call does not create socket object and does not store fd - * in fd_collection in current implementation - * so as a result duplicated fd is not returned by fd_collection_get_sockfd(__fd) and - * listen() call for duplicated fds are ignored. - * Original listen socket is not ignored by listen() function. - * - reuse_port(true) : - * dup() is not used. Unique socket is created for every worker. - * - * Store all duplicated fd in map_dup_fd with reference to original fd - * Store all listen fd in map_listen_fd with tid - * Identify correct listen fd during epoll_ctl(ADD) call by tid. It should be different. - * Set worker id in map_thread_id basing on tid - * - * Nginx: - * Nginx store all listen fd in map_listen_fd to proceed later in children processes - * after fork() call. - * Set worker id in map_thread_id basing on tid(pid). Nginx has single thread per process so - * tid and pid should be equal. 
- */ - std::lock_guardm_lock)> lock(g_p_app->m_lock); - g_p_app->map_listen_fd[__fd] = gettid(); - } -#endif /* DEFINED_ENVOY */ + /* Get mapping from the cache */ + mapping = g_zc_cache->get_mapping(in_fd); + if (mapping == NULL) { + srdr_logdbg("Couldn't allocate mapping object"); + goto fallback; + } - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); + if ((__off64_t)mapping->m_size < (__off64_t)(cur_offset + count)) { + struct stat st_buf; - if (p_socket_object) { - // for verifying that the socket is really offloaded - int ret = p_socket_object->prepareListen(); - if (ret < 0) { - return ret; // error - } - if (ret > 0) { // Passthrough - handle_close(__fd, false, true); - } else { -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - if (g_p_app && g_p_app->type != APP_NONE) { - p_socket_object->m_back_log = backlog; - } else -#endif - { - return p_socket_object->listen(backlog); + /* + * This is slow path, we check fstat(2) to handle the + * scenario when user changes the file while respective + * mapping exists and the file becomes larger. + * As workaround, fallback to preadv() implementation. + */ + mapping->put(); + rc = fstat(in_fd, &st_buf); + if ((rc == 0) && (st_buf.st_size >= (off_t)(cur_offset + count))) { + s->m_p_socket_stats->counters.n_tx_sendfile_overflows++; + goto fallback; + } else { + errno = EOVERFLOW; + return -1; } } - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.listen) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END + piov[0].iov_base = (char *)mapping->m_addr + cur_offset; + piov[0].iov_len = count; - srdr_logdbg("OS listen fd=%d, backlog=%d", __fd, backlog); - return orig_os_api.listen(__fd, backlog); -} + tx_arg.opcode = TX_FILE; + tx_arg.attr.iov = piov; + tx_arg.attr.sz_iov = 1; + tx_arg.attr.flags = MSG_ZEROCOPY; + tx_arg.priv.attr = PBUF_DESC_MAP; + tx_arg.priv.map = (void *)mapping; + totSent = p_socket_object->tx(tx_arg); -extern "C" EXPORT_SYMBOL int accept(int __fd, struct sockaddr *__addr, socklen_t *__addrlen) -{ - PROFILE_FUNC + mapping->put(); + fallback: + /* Fallback to readv() implementation */ + if (totSent == 0) { + s->m_p_socket_stats->counters.n_tx_sendfile_fallbacks++; + tx_arg.clear(); + tx_arg.opcode = TX_FILE; + tx_arg.attr.iov = piov; + tx_arg.attr.sz_iov = 1; + tx_arg.priv.attr = PBUF_DESC_FD; + tx_arg.priv.fd = in_fd; + piov[0].iov_base = (void *)&cur_offset; + piov[0].iov_len = count; + totSent = p_socket_object->tx(tx_arg); + } + } else { + __off64_t pa_offset = 0; + size_t pa_count = 0; + struct flock64 lock; - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - return p_socket_object->accept(__addr, __addrlen); - } + if ((fstat64(in_fd, &stat_buf) == -1) || + ((__off64_t)stat_buf.st_size < (__off64_t)(cur_offset + count))) { + errno = EOVERFLOW; + return -1; + } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.accept) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END + tx_arg.opcode = TX_WRITE; + tx_arg.attr.iov = piov; + tx_arg.attr.sz_iov = 1; - return orig_os_api.accept(__fd, __addr, __addrlen); -} + /* The off argument of mmap() is constrained to be aligned and + * sized according to the value returned by sysconf() + */ + pa_offset = cur_offset & ~(sysconf(_SC_PAGE_SIZE) - 1); + pa_count = count + cur_offset - pa_offset; -extern "C" EXPORT_SYMBOL int accept4(int __fd, struct sockaddr *__addr, socklen_t *__addrlen, - int __flags) -{ - PROFILE_FUNC + lock.l_type = F_RDLCK; + lock.l_whence = 
SEEK_SET; + lock.l_start = pa_offset; + lock.l_len = pa_count; + lock.l_pid = 0; - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - return p_socket_object->accept4(__addr, __addrlen, __flags); - } + /* try to use mmap() approach */ + if (-1 != (XLIO_CALL(fcntl, in_fd, F_SETLK, &lock))) { + void *addr = NULL; + addr = mmap64(NULL, pa_count, PROT_READ, MAP_SHARED | MAP_NORESERVE, in_fd, pa_offset); + if (MAP_FAILED != addr) { + ssize_t toRead, numSent = 0; - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.accept4) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END + while (count > 0) { + toRead = min(sysconf(_SC_PAGE_SIZE), (ssize_t)count); - return orig_os_api.accept4(__fd, __addr, __addrlen, __flags); -} + piov[0].iov_base = (void *)((uintptr_t)addr + cur_offset - pa_offset + totSent); + piov[0].iov_len = toRead; -/* Give the socket FD the local address ADDR (which is LEN bytes long). */ -extern "C" EXPORT_SYMBOL int bind(int __fd, const struct sockaddr *__addr, socklen_t __addrlen) -{ - int errno_tmp = errno; + numSent = p_socket_object->tx(tx_arg); + if (numSent == -1) { + break; + } + + count -= numSent; + totSent += numSent; + } + (void)munmap(addr, pa_count); + } + lock.l_type = F_UNLCK; + (void)XLIO_CALL(fcntl, in_fd, F_SETLK, &lock); + } + + /* fallback on read() approach */ + if (totSent == 0) { + char buf[sysconf(_SC_PAGE_SIZE)]; + ssize_t toRead, numRead, numSent = 0; + + s->m_p_socket_stats->counters.n_tx_sendfile_fallbacks++; + + while (count > 0) { + toRead = min(sizeof(buf), count); + numRead = pread(in_fd, buf, toRead, cur_offset + totSent); + if (numRead <= 0) { + if (numRead < 0 && totSent == 0) { + totSent = -1; + } + break; + } + + piov[0].iov_base = (void *)buf; + piov[0].iov_len = numRead; + + numSent = p_socket_object->tx(tx_arg); + if (numSent == -1) { + break; + } + + count -= numSent; + totSent += numSent; + } + } + } + + if (totSent > 0) { + if (offset != NULL) { + *offset = *offset + totSent; + } else { + (void)lseek64(in_fd, (orig_offset + totSent), SEEK_SET); + } + } + + return totSent; +} + +// Format a fd_set into a string for logging +// Check nfd to know how many 32 bits hexs do we want to sprintf into user buffer +const char *dbg_sprintf_fdset(char *buf, int buflen, int __nfds, fd_set *__fds) +{ + if (buflen < 1) { + return "(null)"; + } + buf[0] = '\0'; + + if ((__nfds <= 0) || (__fds == NULL)) { + return "(null)"; + } + + int fdsize = 1 + ((__nfds - 1) / (8 * sizeof(uint32_t))); + switch (fdsize) { + case 1: + snprintf(buf, buflen, "%08x", ((uint32_t *)__fds)[0]); + break; + case 2: + snprintf(buf, buflen, "%08x %08x", ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); + break; + case 3: + snprintf(buf, buflen, "%08x %08x %08x", ((uint32_t *)__fds)[2], ((uint32_t *)__fds)[1], + ((uint32_t *)__fds)[0]); + break; + case 4: + snprintf(buf, buflen, "%08x %08x %08x %08x", ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], + ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); + break; + case 5: + snprintf(buf, buflen, "%08x %08x %08x %08x %08x", ((uint32_t *)__fds)[4], + ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], ((uint32_t *)__fds)[1], + ((uint32_t *)__fds)[0]); + break; + case 6: + snprintf(buf, buflen, "%08x %08x %08x %08x %08x %08x", ((uint32_t *)__fds)[5], + ((uint32_t *)__fds)[4], ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], + ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); + break; + default: + buf[0] = '\0'; + } + return buf; +} + +/* Poll the file descriptors described by the 
NFDS structures starting at + FDS. If TIMis nonzero and not -1, allow TIMmilliseconds for + an event to occur; if TIMis -1, block until an event occurs. + Returns the number of file descriptors with events, zero if timed out, + or -1 for errors. */ +static int poll_helper(struct pollfd *__fds, nfds_t __nfds, int __timeout, + const sigset_t *__sigmask = NULL) +{ + int off_rfd_buffer[__nfds]; + io_mux_call::offloaded_mode_t off_modes_buffer[__nfds]; + int lookup_buffer[__nfds]; + pollfd working_fds_arr[__nfds + 1]; + + try { + poll_call pcall(off_rfd_buffer, off_modes_buffer, lookup_buffer, working_fds_arr, __fds, + __nfds, __timeout, __sigmask); + + int rc = pcall.call(); + srdr_logfunc_exit("rc = %d", rc); + return rc; + } catch (io_mux_call::io_error &) { + srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); + return -1; + } +} + +/* Check the first NFDS descriptors each in READFDS (if not NULL) for read + readiness, in WRITEFDS (if not NULL) for write readiness, and in EXCEPTFDS + (if not NULL) for exceptional conditions. If TIMis not NULL, time out + after waiting the interval specified therein. Returns the number of ready + descriptors, or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. */ +static int select_helper(int __nfds, fd_set *__readfds, fd_set *__writefds, fd_set *__exceptfds, + struct timeval *__timeout, const sigset_t *__sigmask = NULL) +{ + int off_rfds_buffer[__nfds]; + io_mux_call::offloaded_mode_t off_modes_buffer[__nfds]; + + if (g_vlogger_level >= VLOG_FUNC) { + const int tmpbufsize = 256; + char tmpbuf[tmpbufsize], tmpbuf2[tmpbufsize]; + NOT_IN_USE(tmpbufsize); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf2); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + srdr_logfunc("readfds: %s, writefds: %s", + dbg_sprintf_fdset(tmpbuf, tmpbufsize, __nfds, __readfds), + dbg_sprintf_fdset(tmpbuf2, tmpbufsize, __nfds, __writefds)); + } + + try { + select_call scall(off_rfds_buffer, off_modes_buffer, __nfds, __readfds, __writefds, + __exceptfds, __timeout, __sigmask); + int rc = scall.call(); + + if (g_vlogger_level >= VLOG_FUNC) { + const int tmpbufsize = 256; + char tmpbuf[tmpbufsize], tmpbuf2[tmpbufsize]; + NOT_IN_USE(tmpbufsize); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf2); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ + srdr_logfunc_exit("readfds: %s, writefds: %s", + dbg_sprintf_fdset(tmpbuf, tmpbufsize, __nfds, __readfds), + dbg_sprintf_fdset(tmpbuf2, tmpbufsize, __nfds, __writefds)); + } + + return rc; + } catch (io_mux_call::io_error &) { + srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); + return -1; + } +} + +static void xlio_epoll_create(int epfd, int size) +{ + if (g_p_fd_collection) { + // Sanity check to remove any old sockinfo object using the same fd!! + handle_close(epfd, true); + + // insert epfd to fd_collection as epfd_info + g_p_fd_collection->addepfd(epfd, size); + } +} + +/* Wait for events on an epoll instance "epfd". Returns the number of + triggered events returned in "events" buffer. Or -1 in case of + error with the "errno" variable set to the specific error code. The + "events" parameter is a buffer that will contain triggered + events. The "maxevents" is the maximum number of events to be + returned ( usually size of "events" ). 
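For reference, dbg_sprintf_fdset() above renders an fd_set as 32-bit hex words, highest word first, and select_helper() uses it only for VLOG_FUNC tracing. A small hedged usage sketch (assuming the usual glibc fd_set bit layout) shows what the traced string looks like:

    /* Requires <stdio.h> and <sys/select.h>; dbg_sprintf_fdset() is the helper above. */
    static void trace_fdset_example(void)
    {
        fd_set fds;
        char buf[64];

        FD_ZERO(&fds);
        FD_SET(3, &fds);  /* bit 3 of word 0 -> 0x00000008 */
        FD_SET(35, &fds); /* bit 3 of word 1 -> 0x00000008 */

        /* nfds=36 spans two 32-bit words, printed highest word first. */
        printf("readfds: %s\n", dbg_sprintf_fdset(buf, sizeof(buf), 36, &fds));
        /* Expected output: "readfds: 00000008 00000008" */
    }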
The "timeout" parameter + specifies the maximum wait time in milliseconds (-1 == infinite). */ +inline int epoll_wait_helper(int __epfd, struct epoll_event *__events, int __maxevents, + int __timeout, const sigset_t *__sigmask = NULL) +{ + if (__maxevents <= 0 || __maxevents > EP_MAX_EVENTS) { + srdr_logdbg("invalid value for maxevents: %d", __maxevents); + errno = EINVAL; + return -1; + } + + if (safe_mce_sys().tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { + g_thread_local_event_handler.do_tasks(); + } + + epoll_event extra_events_buffer[__maxevents]; + + try { + epoll_wait_call epcall(extra_events_buffer, NULL, __epfd, __events, __maxevents, __timeout, + __sigmask); + + int rc = epcall.get_current_events(); // returns ready nfds + if (rc <= 0) { + // if no ready nfds available then check all lower level queues (XLIO ring's and OS + // queues) + epcall.init_offloaded_fds(); + rc = epcall.call(); + } + + srdr_logfunc_exit("rc = %d", rc); + return rc; + } catch (io_mux_call::io_error &) { + srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); + return -1; + } +} + +static void handler_intr(int sig) +{ + switch (sig) { + case SIGINT: + g_b_exit = true; + srdr_logdbg("Catch Signal: SIGINT (%d)", sig); + break; + default: + srdr_logdbg("Catch Signal: %d", sig); + break; + } + + if (g_act_prev.sa_handler) { + g_act_prev.sa_handler(sig); + } +} + +static void handle_signal(int signum) +{ + srdr_logdbg_entry("Caught signal! signum=%d", signum); + + if (signum == SIGINT) { + g_b_exit = true; + } + + if (g_sighandler) { + g_sighandler(signum); + } +} + +int sigaction_internal(int signum, const struct sigaction *act, struct sigaction *oldact) +{ + int ret = 0; + + PROFILE_FUNC + + if (safe_mce_sys().handle_sigintr) { + srdr_logdbg_entry("signum=%d, act=%p, oldact=%p", signum, act, oldact); + + switch (signum) { + case SIGINT: + if (oldact && g_act_prev.sa_handler) { + *oldact = g_act_prev; + } + if (act) { + struct sigaction xlio_action; + xlio_action.sa_handler = handler_intr; + xlio_action.sa_flags = 0; + sigemptyset(&xlio_action.sa_mask); + + ret = SYSCALL(sigaction, SIGINT, &xlio_action, NULL); + + if (ret < 0) { + srdr_logdbg("Failed to register SIGINT handler, calling to original sigaction " + "handler"); + break; + } + srdr_logdbg("Registered SIGINT handler"); + g_act_prev = *act; + } + if (ret >= 0) { + srdr_logdbg_exit("returned with %d", ret); + } else { + srdr_logdbg_exit("failed (errno=%d %m)", errno); + } + + return ret; + break; + default: + break; + } + } + ret = SYSCALL(sigaction, signum, act, oldact); + + if (safe_mce_sys().handle_sigintr) { + if (ret >= 0) { + srdr_logdbg_exit("returned with %d", ret); + } else { + srdr_logdbg_exit("failed (errno=%d %m)", errno); + } + } + return ret; +} + +extern "C" { +/* Create a new socket of type TYPE in domain DOMAIN, using + protocol PROTOCOL. If PROTOCOL is zero, one is chosen automatically. + Returns a file descriptor for the new socket, or -1 for errors. */ +EXPORT_SYMBOL int XLIO_SYMBOL(socket)(int __domain, int __type, int __protocol) +{ + return socket_internal(__domain, __type, __protocol, true, true); +} + +EXPORT_SYMBOL int XLIO_SYMBOL(close)(int __fd) +{ + PROFILE_FUNC + + srdr_logdbg_entry("fd=%d", __fd); + + bool toclose = handle_close(__fd); + int rc = toclose ? 
SYSCALL(close, __fd) : 0; + + return rc; +} + +#ifdef XLIO_STATIC_BUILD +extern void __res_iclose(res_state statp, bool free_addr); +#endif + +EXPORT_SYMBOL void XLIO_SYMBOL(__res_iclose)(res_state statp, bool free_addr) +{ + PROFILE_FUNC + + /* Current implementation doesn't handle XLIO sockets without a shadow socket or from a socket + pool. If such a socket is present in the nssocks list, system __res_iclose() will close the + fd. This will break the socket functionality. + Assume that resolver doesn't use the above scenarios. */ + + srdr_logdbg_entry(""); + for (int ns = 0; ns < statp->_u._ext.nscount; ns++) { + int sock = statp->_u._ext.nssocks[ns]; + if (sock != -1) { + handle_close(sock); + } + } + SYSCALL(__res_iclose, statp, free_addr); +} + +/* Shut down all or part of the connection open on socket FD. + HOW determines what to shut down: + SHUT_RD = No more receptions; + SHUT_WR = No more transmissions; + SHUT_RDWR = No more receptions or transmissions. + Returns 0 on success, -1 for errors. */ +EXPORT_SYMBOL int XLIO_SYMBOL(shutdown)(int __fd, int __how) +{ + PROFILE_FUNC + + srdr_logdbg_entry("fd=%d, how=%d", __fd, __how); + + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + return p_socket_object->shutdown(__how); + } + + return SYSCALL(shutdown, __fd, __how); +} + +EXPORT_SYMBOL int XLIO_SYMBOL(listen)(int __fd, int backlog) +{ + PROFILE_FUNC + + srdr_logdbg_entry("fd=%d, backlog=%d", __fd, backlog); + +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) + if (g_p_app && g_p_app->type != APP_NONE) { + /* Envoy: + * Socket handling + * Envoy uses the following procedure for creating sockets and assigning them to workers. + * + * When a listener is created, a socket is pre-created for every worker on the main thread. + * This allows most errors to be caught early on in the listener creation process (e.g., bad + * socket option, unable to bind, etc.). + * - If using reuse_port, a unique socket is created for every worker. + * - If not using reuse_port, a unique socket is created for worker 0, and then that socket + * is duplicated for all other workers. + * a listener can close() its sockets when removed without concern for other listeners. + * + * Implementation: + * - reuse_port(false) : + * Envoy uses dup() call for listen socket on workers_N (N > 0) + * dup() call does not create socket object and does not store fd + * in fd_collection in current implementation + * so as a result duplicated fd is not returned by fd_collection_get_sockfd(__fd) and + * listen() call for duplicated fds are ignored. + * Original listen socket is not ignored by listen() function. + * - reuse_port(true) : + * dup() is not used. Unique socket is created for every worker. + * + * Store all duplicated fd in map_dup_fd with reference to original fd + * Store all listen fd in map_listen_fd with tid + * Identify correct listen fd during epoll_ctl(ADD) call by tid. It should be different. + * Set worker id in map_thread_id basing on tid + * + * Nginx: + * Nginx store all listen fd in map_listen_fd to proceed later in children processes + * after fork() call. + * Set worker id in map_thread_id basing on tid(pid). Nginx has single thread per process so + * tid and pid should be equal. 
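From this point on, the wrappers are declared as EXPORT_SYMBOL ... XLIO_SYMBOL(name)(...) inside a single extern "C" block instead of individual extern "C" definitions. Neither macro is defined in these hunks; the following is only a sketch of the assumed intent, where a preload build keeps the libc names so the dynamic linker interposes them, and a static build (XLIO_STATIC_BUILD) can prefix them:

    /* Illustrative assumption only -- the real definitions live elsewhere. */
    #ifdef XLIO_STATIC_BUILD
    #define XLIO_SYMBOL(__name) xlio_##__name /* e.g. xlio_socket, xlio_close */
    #else
    #define XLIO_SYMBOL(__name) __name /* interposes the libc symbol */
    #endif

    #define EXPORT_SYMBOL __attribute__((visibility("default")))

    /* Under this assumption, in a preload build the declaration above expands
     * to roughly:
     *   __attribute__((visibility("default"))) int listen(int __fd, int backlog)
     */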
+ */ + std::lock_guardm_lock)> lock(g_p_app->m_lock); + g_p_app->map_listen_fd[__fd] = gettid(); + } +#endif /* DEFINED_ENVOY */ + + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + + if (p_socket_object) { + // for verifying that the socket is really offloaded + int ret = p_socket_object->prepareListen(); + if (ret < 0) { + return ret; // error + } + if (ret > 0) { // Passthrough + handle_close(__fd, false, true); + } else { +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) + if (g_p_app && g_p_app->type != APP_NONE) { + p_socket_object->m_back_log = backlog; + } else +#endif + { + return p_socket_object->listen(backlog); + } + } + } + + srdr_logdbg("OS listen fd=%d, backlog=%d", __fd, backlog); + return SYSCALL(listen, __fd, backlog); +} +EXPORT_SYMBOL int XLIO_SYMBOL(accept)(int __fd, struct sockaddr *__addr, socklen_t *__addrlen) +{ PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.bind) { - get_orig_funcs(); + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + return p_socket_object->accept(__addr, __addrlen); } - BULLSEYE_EXCLUDE_BLOCK_END + + return SYSCALL(accept, __fd, __addr, __addrlen); +} + +EXPORT_SYMBOL int XLIO_SYMBOL(accept4)(int __fd, struct sockaddr *__addr, socklen_t *__addrlen, + int __flags) +{ + PROFILE_FUNC + + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + return p_socket_object->accept4(__addr, __addrlen, __flags); + } + + return SYSCALL(accept4, __fd, __addr, __addrlen, __flags); +} + +/* Give the socket FD the local address ADDR (which is LEN bytes long). */ +EXPORT_SYMBOL int XLIO_SYMBOL(bind)(int __fd, const struct sockaddr *__addr, socklen_t __addrlen) +{ + int errno_tmp = errno; + + PROFILE_FUNC char buf[256]; NOT_IN_USE(buf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ @@ -554,15 +997,9 @@ extern "C" EXPORT_SYMBOL int bind(int __fd, const struct sockaddr *__addr, sockl socket_fd_api *p_socket_object = NULL; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - ret = p_socket_object->bind(__addr, __addrlen); - if (p_socket_object->isPassthrough()) { - handle_close(__fd, false, true); - if (ret) { - ret = orig_os_api.bind(__fd, __addr, __addrlen); - } - } + ret = bind_internal(p_socket_object, __addr, __addrlen); } else { - ret = orig_os_api.bind(__fd, __addr, __addrlen); + ret = SYSCALL(bind, __fd, __addr, __addrlen); } if (ret >= 0) { @@ -583,18 +1020,12 @@ extern "C" EXPORT_SYMBOL int bind(int __fd, const struct sockaddr *__addr, sockl This function is a cancellation point and therefore not marked with __THROW. 
*/ -extern "C" EXPORT_SYMBOL int connect(int __fd, const struct sockaddr *__to, socklen_t __tolen) +EXPORT_SYMBOL int XLIO_SYMBOL(connect)(int __fd, const struct sockaddr *__to, socklen_t __tolen) { int errno_tmp = errno; PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.connect) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - char buf[256]; NOT_IN_USE(buf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ srdr_logdbg_entry("fd=%d, %s", __fd, sprintf_sockaddr(buf, 256, __to, __tolen)); @@ -603,17 +1034,17 @@ extern "C" EXPORT_SYMBOL int connect(int __fd, const struct sockaddr *__to, sock socket_fd_api *p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object == nullptr) { srdr_logdbg_exit("Unable to get sock_fd_api"); - ret = orig_os_api.connect(__fd, __to, __tolen); + ret = SYSCALL(connect, __fd, __to, __tolen); } else if (__to == nullptr || (get_sa_family(__to) != AF_INET && (get_sa_family(__to) != AF_INET6))) { p_socket_object->setPassthrough(); - ret = orig_os_api.connect(__fd, __to, __tolen); + ret = SYSCALL(connect, __fd, __to, __tolen); } else { ret = p_socket_object->connect(__to, __tolen); if (p_socket_object->isPassthrough()) { handle_close(__fd, false, true); if (ret) { - ret = orig_os_api.connect(__fd, __to, __tolen); + ret = SYSCALL(connect, __fd, __to, __tolen); } } } @@ -631,8 +1062,8 @@ extern "C" EXPORT_SYMBOL int connect(int __fd, const struct sockaddr *__to, sock /* Set socket FD's option OPTNAME at protocol level LEVEL to *OPTVAL (which is OPTLEN bytes long). Returns 0 on success, -1 for errors. */ -extern "C" EXPORT_SYMBOL int setsockopt(int __fd, int __level, int __optname, - __const void *__optval, socklen_t __optlen) +EXPORT_SYMBOL int XLIO_SYMBOL(setsockopt)(int __fd, int __level, int __optname, + __const void *__optval, socklen_t __optlen) { srdr_logdbg_entry("fd=%d, level=%d, optname=%d", __fd, __level, __optname); @@ -651,12 +1082,7 @@ extern "C" EXPORT_SYMBOL int setsockopt(int __fd, int __level, int __optname, VERIFY_PASSTROUGH_CHANGED( ret, p_socket_object->setsockopt(__level, __optname, __optval, __optlen)); } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.setsockopt) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - ret = orig_os_api.setsockopt(__fd, __level, __optname, __optval, __optlen); + ret = SYSCALL(setsockopt, __fd, __level, __optname, __optval, __optlen); } if (ret >= 0) { @@ -670,8 +1096,8 @@ extern "C" EXPORT_SYMBOL int setsockopt(int __fd, int __level, int __optname, /* Get socket FD's option OPTNAME at protocol level LEVEL to *OPTVAL (which is OPTLEN bytes long). Returns 0 on success, -1 for errors. 
*/ -extern "C" EXPORT_SYMBOL int getsockopt(int __fd, int __level, int __optname, void *__optval, - socklen_t *__optlen) +EXPORT_SYMBOL int XLIO_SYMBOL(getsockopt)(int __fd, int __level, int __optname, void *__optval, + socklen_t *__optlen) { PROFILE_FUNC @@ -679,9 +1105,7 @@ extern "C" EXPORT_SYMBOL int getsockopt(int __fd, int __level, int __optname, vo if (__fd == -2 && __level == SOL_SOCKET && __optname == SO_XLIO_GET_API && __optlen && *__optlen >= sizeof(struct xlio_api_t *)) { - struct xlio_api_t *xlio_api = extra_api(); - - *((xlio_api_t **)__optval) = xlio_api; + *((xlio_api_t **)__optval) = extra_api(); *__optlen = sizeof(struct xlio_api_t *); return 0; } @@ -693,12 +1117,7 @@ extern "C" EXPORT_SYMBOL int getsockopt(int __fd, int __level, int __optname, vo VERIFY_PASSTROUGH_CHANGED( ret, p_socket_object->getsockopt(__level, __optname, __optval, __optlen)); } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.getsockopt) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - ret = orig_os_api.getsockopt(__fd, __level, __optname, __optval, __optlen); + ret = SYSCALL(getsockopt, __fd, __level, __optname, __optval, __optlen); } if (ret >= 0) { @@ -719,7 +1138,7 @@ extern "C" EXPORT_SYMBOL int getsockopt(int __fd, int __level, int __optname, vo user requested explicitly that XLIO will throw an exception in such a case by setting XLIO_EXCEPTION_HANDLING accordingly (see README.txt) */ -extern "C" EXPORT_SYMBOL int fcntl(int __fd, int __cmd, ...) +EXPORT_SYMBOL int XLIO_SYMBOL(fcntl)(int __fd, int __cmd, ...) { PROFILE_FUNC @@ -737,12 +1156,7 @@ extern "C" EXPORT_SYMBOL int fcntl(int __fd, int __cmd, ...) if (p_socket_object) { VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->fcntl(__cmd, arg)); } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.fcntl) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - res = orig_os_api.fcntl(__fd, __cmd, arg); + res = SYSCALL(fcntl, __fd, __cmd, arg); } if (__cmd == F_DUPFD) { @@ -768,7 +1182,7 @@ extern "C" EXPORT_SYMBOL int fcntl(int __fd, int __cmd, ...) by setting XLIO_EXCEPTION_HANDLING accordingly (see README.txt) */ -extern "C" EXPORT_SYMBOL int fcntl64(int __fd, int __cmd, ...) +EXPORT_SYMBOL int XLIO_SYMBOL(fcntl64)(int __fd, int __cmd, ...) { PROFILE_FUNC @@ -783,25 +1197,10 @@ extern "C" EXPORT_SYMBOL int fcntl64(int __fd, int __cmd, ...) int ret = 0; socket_fd_api *p_socket_object = NULL; p_socket_object = fd_collection_get_sockfd(__fd); - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.fcntl64) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - if (p_socket_object && orig_os_api.fcntl64) { + if (p_socket_object && VALID_SYSCALL(fcntl64)) { VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->fcntl64(__cmd, arg)); } else { - if (!orig_os_api.fcntl64) { - srdr_logfunc_exit("failed (errno=%d %m)", errno); - VLOG_PRINTF_ONCE_THEN_ALWAYS(VLOG_ERROR, VLOG_DEBUG, - "fcntl64 was not found during runtime. Set %s to " - "appripriate debug level to see datails. Ignoring...", - SYS_VAR_LOG_LEVEL); - errno = EOPNOTSUPP; - return -1; - } else { - res = orig_os_api.fcntl64(__fd, __cmd, arg); - } + res = SYSCALL_ERRNO_UNSUPPORTED(fcntl64, __fd, __cmd, arg); } if (__cmd == F_DUPFD) { @@ -819,7 +1218,7 @@ extern "C" EXPORT_SYMBOL int fcntl64(int __fd, int __cmd, ...) /* Perform the I/O control operation specified by REQUEST on FD. One argument may follow; its presence and type depend on REQUEST. Return value depends on REQUEST. Usually -1 indicates error. 
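The getsockopt() wrapper above keeps the special probe path: calling it with fd == -2, level SOL_SOCKET and optname SO_XLIO_GET_API returns the extra-API table instead of touching a real socket. A hedged caller-side sketch follows; SO_XLIO_GET_API and struct xlio_api_t are assumed to come from the public xlio_extra.h header, and <sys/socket.h> is required.

    static struct xlio_api_t *get_xlio_extra_api(void)
    {
        struct xlio_api_t *api = NULL;
        socklen_t len = sizeof(api);

        /* When the library is not loaded, the call simply fails (e.g. EBADF)
         * and NULL is returned, so the application can fall back to plain
         * socket calls. */
        if (getsockopt(-2, SOL_SOCKET, SO_XLIO_GET_API, &api, &len) != 0) {
            return NULL;
        }
        return api;
    }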
*/ -extern "C" EXPORT_SYMBOL int ioctl(int __fd, unsigned long int __request, ...) +EXPORT_SYMBOL int XLIO_SYMBOL(ioctl)(int __fd, unsigned long int __request, ...) { PROFILE_FUNC @@ -838,387 +1237,79 @@ extern "C" EXPORT_SYMBOL int ioctl(int __fd, unsigned long int __request, ...) if (p_socket_object && arg) { VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->ioctl(__request, arg)); } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.ioctl) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - res = orig_os_api.ioctl(__fd, __request, arg); + res = SYSCALL(ioctl, __fd, __request, arg); } if (ret >= 0) { srdr_logfunc_exit("returned with %d", ret); } else { srdr_logfunc_exit("failed (errno=%d %m)", errno); - } - return res; -} - -extern "C" EXPORT_SYMBOL int getsockname(int __fd, struct sockaddr *__name, socklen_t *__namelen) -{ - PROFILE_FUNC - - srdr_logdbg_entry("fd=%d", __fd); - - int ret = 0; - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - ret = p_socket_object->getsockname(__name, __namelen); - - if (safe_mce_sys().trigger_dummy_send_getsockname) { - char buf[264] = {0}; - struct iovec msg_iov = {&buf, sizeof(buf)}; - struct msghdr msg = {NULL, 0, &msg_iov, 1, NULL, 0, 0}; - int ret_send = sendmsg(__fd, &msg, XLIO_SND_FLAGS_DUMMY); - srdr_logdbg("Triggered dummy message for socket fd=%d (ret_send=%d)", __fd, ret_send); - NOT_IN_USE(ret_send); - } - } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.getsockname) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - ret = orig_os_api.getsockname(__fd, __name, __namelen); - } - - if (ret >= 0) { - srdr_logdbg_exit("returned with %d", ret); - } else { - srdr_logdbg_exit("failed (errno=%d %m)", errno); - } - return ret; -} - -extern "C" EXPORT_SYMBOL int getpeername(int __fd, struct sockaddr *__name, socklen_t *__namelen) -{ - PROFILE_FUNC - - srdr_logdbg_entry("fd=%d", __fd); - - int ret = 0; - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - ret = p_socket_object->getpeername(__name, __namelen); - } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.getpeername) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - ret = orig_os_api.getpeername(__fd, __name, __namelen); - } - - if (ret >= 0) { - srdr_logdbg_exit("returned with %d", ret); - } else { - srdr_logdbg_exit("failed (errno=%d %m)", errno); - } - return ret; -} - -/* Read NBYTES into BUF from FD. Return the - number read, -1 for errors or 0 for EOF. - - This function is a cancellation point and therefore not marked with - __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t read(int __fd, void *__buf, size_t __nbytes) -{ - PROFILE_FUNC - - srdr_logfuncall_entry("fd=%d", __fd); - - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - struct iovec piov[1]; - piov[0].iov_base = __buf; - piov[0].iov_len = __nbytes; - int dummy_flags = 0; - return p_socket_object->rx(RX_READ, piov, 1, &dummy_flags); - } - - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.read) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.read(__fd, __buf, __nbytes); -} - -#if defined HAVE___READ_CHK -/* Checks that the buffer is big enough to contain the number of bytes - * the user requests to read. If the buffer is too small, aborts, - * else read NBYTES into BUF from FD. Return the - number read, -1 for errors or 0 for EOF. 
- - This function is a cancellation point and therefore not marked with - __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t __read_chk(int __fd, void *__buf, size_t __nbytes, size_t __buflen) -{ - PROFILE_FUNC - - srdr_logfuncall_entry("fd=%d", __fd); - - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - BULLSEYE_EXCLUDE_BLOCK_START - if (__nbytes > __buflen) { - srdr_logpanic("buffer overflow detected"); - } - BULLSEYE_EXCLUDE_BLOCK_END - - struct iovec piov[1]; - piov[0].iov_base = __buf; - piov[0].iov_len = __nbytes; - int dummy_flags = 0; - return p_socket_object->rx(RX_READ, piov, 1, &dummy_flags); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__read_chk) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.__read_chk(__fd, __buf, __nbytes, __buflen); -} -#endif - -/* Read COUNT blocks into VECTOR from FD. Return the - number of bytes read, -1 for errors or 0 for EOF. - - This function is a cancellation point and therefore not marked with - __THROW. */ - -extern "C" EXPORT_SYMBOL ssize_t readv(int __fd, const struct iovec *iov, int iovcnt) -{ - PROFILE_FUNC - - srdr_logfuncall_entry("fd=%d", __fd); - - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - struct iovec *piov = (struct iovec *)iov; - int dummy_flags = 0; - return p_socket_object->rx(RX_READV, piov, iovcnt, &dummy_flags); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.readv) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.readv(__fd, iov, iovcnt); -} - -/* Read N bytes into BUF from socket FD. - Returns the number read or -1 for errors. - - This function is a cancellation point and therefore not marked with - __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t recv(int __fd, void *__buf, size_t __nbytes, int __flags) -{ - PROFILE_FUNC - - srdr_logfuncall_entry("fd=%d", __fd); - - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - struct iovec piov[1]; - piov[0].iov_base = __buf; - piov[0].iov_len = __nbytes; - return p_socket_object->rx(RX_RECV, piov, 1, &__flags); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.recv) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.recv(__fd, __buf, __nbytes, __flags); -} - -#if defined HAVE___RECV_CHK -/* Checks that the buffer is big enough to contain the number of bytes - the user requests to read. If the buffer is too small, aborts, - else read N bytes into BUF from socket FD. - Returns the number read or -1 for errors. - - This function is a cancellation point and therefore not marked with - __THROW. 
*/ -extern "C" EXPORT_SYMBOL ssize_t __recv_chk(int __fd, void *__buf, size_t __nbytes, size_t __buflen, - int __flags) -{ - PROFILE_FUNC - - srdr_logfuncall_entry("fd=%d", __fd); - - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - BULLSEYE_EXCLUDE_BLOCK_START - if (__nbytes > __buflen) { - srdr_logpanic("buffer overflow detected"); - } - BULLSEYE_EXCLUDE_BLOCK_END - - struct iovec piov[1]; - piov[0].iov_base = __buf; - piov[0].iov_len = __nbytes; - return p_socket_object->rx(RX_RECV, piov, 1, &__flags); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__recv_chk) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.__recv_chk(__fd, __buf, __nbytes, __buflen, __flags); + } + return res; } -#endif - -/* Receive a message as described by MESSAGE from socket FD. - Returns the number of bytes read or -1 for errors. - This function is a cancellation point and therefore not marked with - __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t recvmsg(int __fd, struct msghdr *__msg, int __flags) +EXPORT_SYMBOL int XLIO_SYMBOL(getsockname)(int __fd, struct sockaddr *__name, socklen_t *__namelen) { PROFILE_FUNC - srdr_logfuncall_entry("fd=%d", __fd); - - if (__msg == NULL) { - srdr_logdbg("NULL msghdr"); - errno = EINVAL; - return -1; - } + srdr_logdbg_entry("fd=%d", __fd); + int ret = 0; socket_fd_api *p_socket_object = NULL; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - __msg->msg_flags = 0; - return p_socket_object->rx(RX_RECVMSG, __msg->msg_iov, __msg->msg_iovlen, &__flags, - (__SOCKADDR_ARG)__msg->msg_name, - (socklen_t *)&__msg->msg_namelen, __msg); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.recvmsg) { - get_orig_funcs(); + ret = p_socket_object->getsockname(__name, __namelen); + + if (safe_mce_sys().trigger_dummy_send_getsockname) { + char buf[264] = {0}; + struct iovec msg_iov = {&buf, sizeof(buf)}; + struct msghdr msg = {NULL, 0, &msg_iov, 1, NULL, 0, 0}; + int ret_send = sendmsg(__fd, &msg, XLIO_SND_FLAGS_DUMMY); + srdr_logdbg("Triggered dummy message for socket fd=%d (ret_send=%d)", __fd, ret_send); + NOT_IN_USE(ret_send); + } + } else { + ret = SYSCALL(getsockname, __fd, __name, __namelen); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.recvmsg(__fd, __msg, __flags); + if (ret >= 0) { + srdr_logdbg_exit("returned with %d", ret); + } else { + srdr_logdbg_exit("failed (errno=%d %m)", errno); + } + return ret; } -/* The following definitions are for kernels previous to 2.6.32 which dont support recvmmsg */ -#ifndef HAVE_STRUCT_MMSGHDR -#ifndef __INTEL_COMPILER -struct mmsghdr { - struct msghdr msg_hdr; // Message header - unsigned int msg_len; // Number of received bytes for header -}; -#endif -#endif - -#ifndef MSG_WAITFORONE -#define MSG_WAITFORONE 0x10000 // recvmmsg(): block until 1+ packets avail -#endif - -/* Receive multiple messages as described by MESSAGE from socket FD. - Returns the number of messages received or -1 for errors. - - This function is a cancellation point and therefore not marked with - __THROW. 
*/ -extern "C" EXPORT_SYMBOL -#ifdef RECVMMSG_WITH_CONST_TIMESPEC - int - recvmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags, - const struct timespec *__timeout) -#else - int - recvmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags, - struct timespec *__timeout) -#endif +EXPORT_SYMBOL int XLIO_SYMBOL(getpeername)(int __fd, struct sockaddr *__name, socklen_t *__namelen) { PROFILE_FUNC - int num_of_msg = 0; - struct timespec start_time = TIMESPEC_INITIALIZER, current_time = TIMESPEC_INITIALIZER, - delta_time = TIMESPEC_INITIALIZER; - - srdr_logfuncall_entry("fd=%d, mmsghdr length=%d flags=%x", __fd, __vlen, __flags); - - if (__mmsghdr == NULL) { - srdr_logdbg("NULL mmsghdr"); - errno = EINVAL; - return -1; - } + srdr_logdbg_entry("fd=%d", __fd); - if (__timeout) { - gettime(&start_time); - } + int ret = 0; socket_fd_api *p_socket_object = NULL; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - int ret = 0; - for (unsigned int i = 0; i < __vlen; i++) { - int flags = __flags; - __mmsghdr[i].msg_hdr.msg_flags = 0; - ret = p_socket_object->rx( - RX_RECVMSG, __mmsghdr[i].msg_hdr.msg_iov, __mmsghdr[i].msg_hdr.msg_iovlen, &flags, - (__SOCKADDR_ARG)__mmsghdr[i].msg_hdr.msg_name, - (socklen_t *)&__mmsghdr[i].msg_hdr.msg_namelen, &__mmsghdr[i].msg_hdr); - if (ret < 0) { - break; - } - num_of_msg++; - __mmsghdr[i].msg_len = ret; - if ((i == 0) && (flags & MSG_WAITFORONE)) { - __flags |= MSG_DONTWAIT; - } - if (__timeout) { - gettime(¤t_time); - ts_sub(¤t_time, &start_time, &delta_time); - if (ts_cmp(&delta_time, __timeout, >)) { - break; - } - } - } - if (num_of_msg || ret == 0) { - // todo save ret for so_error if ret != 0(see kernel) - return num_of_msg; - } else { - return ret; - } - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.recvmmsg) { - get_orig_funcs(); + ret = p_socket_object->getpeername(__name, __namelen); + } else { + ret = SYSCALL(getpeername, __fd, __name, __namelen); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.recvmmsg(__fd, __mmsghdr, __vlen, __flags, __timeout); + if (ret >= 0) { + srdr_logdbg_exit("returned with %d", ret); + } else { + srdr_logdbg_exit("failed (errno=%d %m)", errno); + } + return ret; } -/* Read N bytes into BUF through socket FD. - If ADDR is not NULL, fill in *ADDR_LEN bytes of it with tha address of - the sender, and store the actual size of the address in *ADDR_LEN. - Returns the number of bytes read or -1 for errors. +/* Read NBYTES into BUF from FD. Return the + number read, -1 for errors or 0 for EOF. This function is a cancellation point and therefore not marked with __THROW. 
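The recvmmsg() loop above implements MSG_WAITFORONE by OR-ing MSG_DONTWAIT into the flags once the first message has been received, and checks the optional timeout between messages. A short caller-side sketch of those semantics, using only the standard Linux API (nothing XLIO-specific assumed):

    #define _GNU_SOURCE /* recvmmsg() and MSG_WAITFORONE */
    #include <sys/socket.h>

    /* Block until at least one datagram arrives, then drain whatever is
     * already queued without blocking again; returns the number received. */
    static int drain_datagrams(int fd, struct mmsghdr *msgs, unsigned int vlen)
    {
        return recvmmsg(fd, msgs, vlen, MSG_WAITFORONE, NULL);
    }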
*/ -extern "C" EXPORT_SYMBOL ssize_t recvfrom(int __fd, void *__buf, size_t __nbytes, int __flags, - struct sockaddr *__from, socklen_t *__fromlen) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(read)(int __fd, void *__buf, size_t __nbytes) { - ssize_t ret_val = 0; - PROFILE_FUNC srdr_logfuncall_entry("fd=%d", __fd); @@ -1229,31 +1320,23 @@ extern "C" EXPORT_SYMBOL ssize_t recvfrom(int __fd, void *__buf, size_t __nbytes struct iovec piov[1]; piov[0].iov_base = __buf; piov[0].iov_len = __nbytes; - ret_val = p_socket_object->rx(RX_RECVFROM, piov, 1, &__flags, __from, __fromlen); - } else { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.recvfrom) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - ret_val = orig_os_api.recvfrom(__fd, __buf, __nbytes, __flags, __from, __fromlen); + int dummy_flags = 0; + return p_socket_object->rx(RX_READ, piov, 1, &dummy_flags); } - return ret_val; + + return SYSCALL(read, __fd, __buf, __nbytes); } -#if defined HAVE___RECVFROM_CHK +#if defined HAVE___READ_CHK /* Checks that the buffer is big enough to contain the number of bytes - the user requests to read. If the buffer is too small, aborts, - else read N bytes into BUF through socket FD. - If ADDR is not NULL, fill in *ADDR_LEN bytes of it with tha address of - the sender, and store the actual size of the address in *ADDR_LEN. - Returns the number of bytes read or -1 for errors. + * the user requests to read. If the buffer is too small, aborts, + * else read NBYTES into BUF from FD. Return the + number read, -1 for errors or 0 for EOF. This function is a cancellation point and therefore not marked with __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t __recvfrom_chk(int __fd, void *__buf, size_t __nbytes, - size_t __buflen, int __flags, - struct sockaddr *__from, socklen_t *__fromlen) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__read_chk)(int __fd, void *__buf, size_t __nbytes, + size_t __buflen) { PROFILE_FUNC @@ -1271,188 +1354,157 @@ extern "C" EXPORT_SYMBOL ssize_t __recvfrom_chk(int __fd, void *__buf, size_t __ struct iovec piov[1]; piov[0].iov_base = __buf; piov[0].iov_len = __nbytes; - return p_socket_object->rx(RX_RECVFROM, piov, 1, &__flags, __from, __fromlen); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__recvfrom_chk) { - get_orig_funcs(); + int dummy_flags = 0; + return p_socket_object->rx(RX_READ, piov, 1, &dummy_flags); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.__recvfrom_chk(__fd, __buf, __nbytes, __buflen, __flags, __from, __fromlen); + return SYSCALL(__read_chk, __fd, __buf, __nbytes, __buflen); } #endif -/* Write N bytes of BUF to FD. Return the number written, or -1. +/* Read COUNT blocks into VECTOR from FD. Return the + number of bytes read, -1 for errors or 0 for EOF. This function is a cancellation point and therefore not marked with __THROW. 
*/ -extern "C" EXPORT_SYMBOL ssize_t write(int __fd, __const void *__buf, size_t __nbytes) + +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(readv)(int __fd, const struct iovec *iov, int iovcnt) { PROFILE_FUNC - srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); + srdr_logfuncall_entry("fd=%d", __fd); socket_fd_api *p_socket_object = NULL; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - struct iovec piov[1] = {{(void *)__buf, __nbytes}}; - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_WRITE; - tx_arg.attr.iov = piov; - tx_arg.attr.sz_iov = 1; - - return p_socket_object->tx(tx_arg); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.write) { - get_orig_funcs(); + struct iovec *piov = (struct iovec *)iov; + int dummy_flags = 0; + return p_socket_object->rx(RX_READV, piov, iovcnt, &dummy_flags); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.write(__fd, __buf, __nbytes); + return SYSCALL(readv, __fd, iov, iovcnt); } -/* Write IOCNT blocks from IOVEC to FD. Return the number written, or -1. +/* Read N bytes into BUF from socket FD. + Returns the number read or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t writev(int __fd, const struct iovec *iov, int iovcnt) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recv)(int __fd, void *__buf, size_t __nbytes, int __flags) { PROFILE_FUNC - srdr_logfuncall_entry("fd=%d, %d iov blocks", __fd, iovcnt); + srdr_logfuncall_entry("fd=%d", __fd); socket_fd_api *p_socket_object = NULL; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_WRITEV; - tx_arg.attr.iov = (struct iovec *)iov; - tx_arg.attr.sz_iov = iovcnt; - - return p_socket_object->tx(tx_arg); - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.writev) { - get_orig_funcs(); + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + return p_socket_object->rx(RX_RECV, piov, 1, &__flags); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.writev(__fd, iov, iovcnt); + return SYSCALL(recv, __fd, __buf, __nbytes, __flags); } -/* Send N bytes of BUF to socket FD. Returns the number sent or -1. +#if defined HAVE___RECV_CHK +/* Checks that the buffer is big enough to contain the number of bytes + the user requests to read. If the buffer is too small, aborts, + else read N bytes into BUF from socket FD. + Returns the number read or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. 
*/ -extern "C" EXPORT_SYMBOL ssize_t send(int __fd, __const void *__buf, size_t __nbytes, int __flags) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__recv_chk)(int __fd, void *__buf, size_t __nbytes, + size_t __buflen, int __flags) { PROFILE_FUNC - srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); + srdr_logfuncall_entry("fd=%d", __fd); socket_fd_api *p_socket_object = NULL; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - struct iovec piov[1] = {{(void *)__buf, __nbytes}}; - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_SEND; - tx_arg.attr.iov = piov; - tx_arg.attr.sz_iov = 1; - tx_arg.attr.flags = __flags; - - return p_socket_object->tx(tx_arg); - } + BULLSEYE_EXCLUDE_BLOCK_START + if (__nbytes > __buflen) { + srdr_logpanic("buffer overflow detected"); + } + BULLSEYE_EXCLUDE_BLOCK_END - // Ignore dummy messages for OS - if (unlikely(IS_DUMMY_PACKET(__flags))) { - errno = EINVAL; - return -1; - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.send) { - get_orig_funcs(); + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + return p_socket_object->rx(RX_RECV, piov, 1, &__flags); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.send(__fd, __buf, __nbytes, __flags); + return SYSCALL(__recv_chk, __fd, __buf, __nbytes, __buflen, __flags); } +#endif -/* Sends a message as described by MESSAGE to socket FD. +/* Receive a message as described by MESSAGE from socket FD. Returns the number of bytes read or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t sendmsg(int __fd, __const struct msghdr *__msg, int __flags) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recvmsg)(int __fd, struct msghdr *__msg, int __flags) { PROFILE_FUNC srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; - p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object) { - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_SENDMSG; - tx_arg.attr.iov = __msg->msg_iov; - tx_arg.attr.sz_iov = (ssize_t)__msg->msg_iovlen; - tx_arg.attr.flags = __flags; - tx_arg.attr.addr = (struct sockaddr *)(__CONST_SOCKADDR_ARG)__msg->msg_name; - tx_arg.attr.len = (socklen_t)__msg->msg_namelen; - tx_arg.attr.hdr = __msg; - tx_arg.priv.attr = PBUF_NONE; - - if (0 < __msg->msg_controllen) { - struct cmsghdr *cmsg = CMSG_FIRSTHDR((struct msghdr *)__msg); - if ((cmsg->cmsg_level == SOL_SOCKET) && - (cmsg->cmsg_type == SCM_XLIO_PD || cmsg->cmsg_type == SCM_XLIO_NVME_PD)) { - if ((tx_arg.attr.flags & MSG_ZEROCOPY) && - (__msg->msg_iovlen == - ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(struct xlio_pd_key)))) { - tx_arg.priv.attr = - (cmsg->cmsg_type == SCM_XLIO_PD) ? 
PBUF_DESC_MKEY : PBUF_DESC_NVME_TX; - tx_arg.priv.map = (void *)CMSG_DATA(cmsg); - } else { - errno = EINVAL; - return -1; - } - } - } - - return p_socket_object->tx(tx_arg); - } - - // Ignore dummy messages for OS - if (unlikely(IS_DUMMY_PACKET(__flags))) { + if (__msg == NULL) { + srdr_logdbg("NULL msghdr"); errno = EINVAL; return -1; } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.sendmsg) { - get_orig_funcs(); + + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + __msg->msg_flags = 0; + return p_socket_object->rx(RX_RECVMSG, __msg->msg_iov, __msg->msg_iovlen, &__flags, + (__SOCKADDR_ARG)__msg->msg_name, + (socklen_t *)&__msg->msg_namelen, __msg); } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.sendmsg(__fd, __msg, __flags); + return SYSCALL(recvmsg, __fd, __msg, __flags); } -/* Send multiple messages as described by MESSAGE from socket FD. - Returns the number of messages sent or -1 for errors. +/* The following definitions are for kernels previous to 2.6.32 which dont support recvmmsg */ +#ifndef HAVE_STRUCT_MMSGHDR +#ifndef __INTEL_COMPILER +struct mmsghdr { + struct msghdr msg_hdr; // Message header + unsigned int msg_len; // Number of received bytes for header +}; +#endif +#endif + +#ifndef MSG_WAITFORONE +#define MSG_WAITFORONE 0x10000 // recvmmsg(): block until 1+ packets avail +#endif + +/* Receive multiple messages as described by MESSAGE from socket FD. + Returns the number of messages received or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. */ -extern "C" EXPORT_SYMBOL int sendmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, - int __flags) -{ - int num_of_msg = 0; +#if defined(RECVMMSG_WITH_CONST_TIMESPEC) || defined(XLIO_STATIC_BUILD) +EXPORT_SYMBOL int XLIO_SYMBOL(recvmmsg)(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, + int __flags, const struct timespec *__timeout) +#else +EXPORT_SYMBOL int XLIO_SYMBOL(recvmmsg)(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, + int __flags, struct timespec *__timeout) +#endif +{ PROFILE_FUNC + int num_of_msg = 0; + struct timespec start_time = TIMESPEC_INITIALIZER, current_time = TIMESPEC_INITIALIZER, + delta_time = TIMESPEC_INITIALIZER; + srdr_logfuncall_entry("fd=%d, mmsghdr length=%d flags=%x", __fd, __vlen, __flags); if (__mmsghdr == NULL) { @@ -1461,411 +1513,360 @@ extern "C" EXPORT_SYMBOL int sendmmsg(int __fd, struct mmsghdr *__mmsghdr, unsig return -1; } + if (__timeout) { + gettime(&start_time); + } socket_fd_api *p_socket_object = NULL; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { + int ret = 0; for (unsigned int i = 0; i < __vlen; i++) { - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_SENDMSG; - tx_arg.attr.iov = __mmsghdr[i].msg_hdr.msg_iov; - tx_arg.attr.sz_iov = (ssize_t)__mmsghdr[i].msg_hdr.msg_iovlen; - tx_arg.attr.flags = __flags; - tx_arg.attr.addr = (struct sockaddr *)(__SOCKADDR_ARG)__mmsghdr[i].msg_hdr.msg_name; - tx_arg.attr.len = (socklen_t)__mmsghdr[i].msg_hdr.msg_namelen; - tx_arg.attr.hdr = &__mmsghdr[i].msg_hdr; - - int ret = p_socket_object->tx(tx_arg); + int flags = __flags; + __mmsghdr[i].msg_hdr.msg_flags = 0; + ret = p_socket_object->rx( + RX_RECVMSG, __mmsghdr[i].msg_hdr.msg_iov, __mmsghdr[i].msg_hdr.msg_iovlen, &flags, + (__SOCKADDR_ARG)__mmsghdr[i].msg_hdr.msg_name, + (socklen_t *)&__mmsghdr[i].msg_hdr.msg_namelen, &__mmsghdr[i].msg_hdr); if (ret < 0) { - if (num_of_msg) { - return num_of_msg; - } else { - return 
ret; - } + break; } num_of_msg++; __mmsghdr[i].msg_len = ret; + if ((i == 0) && (flags & MSG_WAITFORONE)) { + __flags |= MSG_DONTWAIT; + } + if (__timeout) { + gettime(¤t_time); + ts_sub(¤t_time, &start_time, &delta_time); + if (ts_cmp(&delta_time, __timeout, >)) { + break; + } + } + } + if (num_of_msg || ret == 0) { + // todo save ret for so_error if ret != 0(see kernel) + return num_of_msg; + } else { + return ret; } - return num_of_msg; } - // Ignore dummy messages for OS - if (unlikely(IS_DUMMY_PACKET(__flags))) { - errno = EINVAL; - return -1; - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.sendmmsg) { - get_orig_funcs(); + struct timespec timeout = TIMESPEC_INITIALIZER; + if (__timeout) { + memcpy(&timeout, __timeout, sizeof(timeout)); } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.sendmmsg(__fd, __mmsghdr, __vlen, __flags); + return SYSCALL(recvmmsg, __fd, __mmsghdr, __vlen, __flags, &timeout); } -/* Send N bytes of BUF on socket FD to peer at address ADDR (which is - ADDR_LEN bytes long). Returns the number sent, or -1 for errors. +/* Read N bytes into BUF through socket FD. + If ADDR is not NULL, fill in *ADDR_LEN bytes of it with tha address of + the sender, and store the actual size of the address in *ADDR_LEN. + Returns the number of bytes read or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. */ -extern "C" EXPORT_SYMBOL ssize_t sendto(int __fd, __const void *__buf, size_t __nbytes, int __flags, - const struct sockaddr *__to, socklen_t __tolen) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recvfrom)(int __fd, void *__buf, size_t __nbytes, int __flags, + struct sockaddr *__from, socklen_t *__fromlen) { + ssize_t ret_val = 0; + PROFILE_FUNC - srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); + srdr_logfuncall_entry("fd=%d", __fd); socket_fd_api *p_socket_object = NULL; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { - struct iovec piov[1] = {{(void *)__buf, __nbytes}}; - xlio_tx_call_attr_t tx_arg; - - tx_arg.opcode = TX_SENDTO; - tx_arg.attr.iov = piov; - tx_arg.attr.sz_iov = 1; - tx_arg.attr.flags = __flags; - tx_arg.attr.addr = (struct sockaddr *)__to; - tx_arg.attr.len = __tolen; - - return p_socket_object->tx(tx_arg); - } - - // Ignore dummy messages for OS - if (unlikely(IS_DUMMY_PACKET(__flags))) { - errno = EINVAL; - return -1; - } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.sendto) { - get_orig_funcs(); + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + ret_val = p_socket_object->rx(RX_RECVFROM, piov, 1, &__flags, __from, __fromlen); + } else { + ret_val = SYSCALL(recvfrom, __fd, __buf, __nbytes, __flags, __from, __fromlen); } - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.sendto(__fd, __buf, __nbytes, __flags, __to, __tolen); + return ret_val; } -static ssize_t sendfile_helper(socket_fd_api *p_socket_object, int in_fd, __off64_t *offset, - size_t count) +#if defined HAVE___RECVFROM_CHK +/* Checks that the buffer is big enough to contain the number of bytes + the user requests to read. If the buffer is too small, aborts, + else read N bytes into BUF through socket FD. + If ADDR is not NULL, fill in *ADDR_LEN bytes of it with tha address of + the sender, and store the actual size of the address in *ADDR_LEN. + Returns the number of bytes read or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. 
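The recvmmsg() body above (unchanged by this patch apart from the SYSCALL()/XLIO_SYMBOL() conversion) batches per-message receives and enforces the caller's timeout between messages. The following self-contained sketch shows those semantics using plain recvmsg() and clock_gettime() instead of the offloaded rx() path and the library's gettime()/ts_sub()/ts_cmp() helpers; demo_recv_many() is an illustrative name.

#include <sys/socket.h>
#include <time.h>

// Illustrative sketch: receive up to vlen messages, stop early once the elapsed time
// exceeds *timeout, and honour MSG_WAITFORONE by going non-blocking after the first
// message. Error/EOF accounting is simplified compared to the wrapper above.
static int demo_recv_many(int fd, struct mmsghdr *msgs, unsigned int vlen, int flags,
                          const struct timespec *timeout)
{
    struct timespec start;
    if (timeout) {
        clock_gettime(CLOCK_MONOTONIC, &start);
    }

    int received = 0;
    for (unsigned int i = 0; i < vlen; i++) {
        ssize_t ret = recvmsg(fd, &msgs[i].msg_hdr, flags);
        if (ret < 0) {
            break; // report whatever was already received
        }
        msgs[i].msg_len = (unsigned int)ret;
        received++;

        if ((i == 0) && (flags & MSG_WAITFORONE)) {
            flags |= MSG_DONTWAIT; // later messages must not block
        }
        if (timeout) {
            struct timespec now;
            clock_gettime(CLOCK_MONOTONIC, &now);
            long sec = now.tv_sec - start.tv_sec;
            long nsec = now.tv_nsec - start.tv_nsec;
            if (nsec < 0) {
                sec--;
                nsec += 1000000000L;
            }
            if (sec > timeout->tv_sec ||
                (sec == timeout->tv_sec && nsec >= timeout->tv_nsec)) {
                break; // caller's time budget exhausted
            }
        }
    }
    return (received > 0) ? received : -1;
}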
*/ +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__recvfrom_chk)(int __fd, void *__buf, size_t __nbytes, + size_t __buflen, int __flags, + struct sockaddr *__from, socklen_t *__fromlen) { - ssize_t totSent = 0; - struct stat64 stat_buf; - __off64_t orig_offset = 0; - __off64_t cur_offset; - struct iovec piov[1]; - xlio_tx_call_attr_t tx_arg; - sockinfo *s = (sockinfo *)p_socket_object; + PROFILE_FUNC - if (p_socket_object->get_type() != FD_TYPE_SOCKET) { - errno = EBADF; - return -1; - } + srdr_logfuncall_entry("fd=%d", __fd); - if (offset == NULL) { - orig_offset = lseek64(in_fd, 0, SEEK_CUR); - if (orig_offset < 0) { - errno = ESPIPE; - return -1; + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + BULLSEYE_EXCLUDE_BLOCK_START + if (__nbytes > __buflen) { + srdr_logpanic("buffer overflow detected"); } - cur_offset = orig_offset; - } else { - cur_offset = *offset; + BULLSEYE_EXCLUDE_BLOCK_END + + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + return p_socket_object->rx(RX_RECVFROM, piov, 1, &__flags, __from, __fromlen); } - if (PROTO_TCP == s->get_protocol()) { - mapping_t *mapping; - int rc; + return SYSCALL(__recvfrom_chk, __fd, __buf, __nbytes, __buflen, __flags, __from, __fromlen); +} +#endif - /* Get mapping from the cache */ - mapping = g_zc_cache->get_mapping(in_fd); - if (mapping == NULL) { - srdr_logdbg("Couldn't allocate mapping object"); - goto fallback; - } +/* Write N bytes of BUF to FD. Return the number written, or -1. - if ((__off64_t)mapping->m_size < (__off64_t)(cur_offset + count)) { - struct stat st_buf; + This function is a cancellation point and therefore not marked with + __THROW. */ +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(write)(int __fd, __const void *__buf, size_t __nbytes) +{ + PROFILE_FUNC - /* - * This is slow path, we check fstat(2) to handle the - * scenario when user changes the file while respective - * mapping exists and the file becomes larger. - * As workaround, fallback to preadv() implementation. 
- */ - mapping->put(); - rc = fstat(in_fd, &st_buf); - if ((rc == 0) && (st_buf.st_size >= (off_t)(cur_offset + count))) { - s->m_p_socket_stats->counters.n_tx_sendfile_overflows++; - goto fallback; - } else { - errno = EOVERFLOW; - return -1; - } - } + srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); - piov[0].iov_base = (char *)mapping->m_addr + cur_offset; - piov[0].iov_len = count; + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec piov[1] = {{(void *)__buf, __nbytes}}; + xlio_tx_call_attr_t tx_arg; - tx_arg.opcode = TX_FILE; + tx_arg.opcode = TX_WRITE; tx_arg.attr.iov = piov; tx_arg.attr.sz_iov = 1; - tx_arg.attr.flags = MSG_ZEROCOPY; - tx_arg.priv.attr = PBUF_DESC_MAP; - tx_arg.priv.map = (void *)mapping; - totSent = p_socket_object->tx(tx_arg); - - mapping->put(); - fallback: - /* Fallback to readv() implementation */ - if (totSent == 0) { - s->m_p_socket_stats->counters.n_tx_sendfile_fallbacks++; - tx_arg.clear(); - tx_arg.opcode = TX_FILE; - tx_arg.attr.iov = piov; - tx_arg.attr.sz_iov = 1; - tx_arg.priv.attr = PBUF_DESC_FD; - tx_arg.priv.fd = in_fd; - piov[0].iov_base = (void *)&cur_offset; - piov[0].iov_len = count; - totSent = p_socket_object->tx(tx_arg); - } - } else { - __off64_t pa_offset = 0; - size_t pa_count = 0; - struct flock64 lock; - if ((fstat64(in_fd, &stat_buf) == -1) || - ((__off64_t)stat_buf.st_size < (__off64_t)(cur_offset + count))) { - errno = EOVERFLOW; - return -1; - } + return p_socket_object->tx(tx_arg); + } - tx_arg.opcode = TX_WRITE; - tx_arg.attr.iov = piov; - tx_arg.attr.sz_iov = 1; + return SYSCALL(write, __fd, __buf, __nbytes); +} - /* The off argument of mmap() is constrained to be aligned and - * sized according to the value returned by sysconf() - */ - pa_offset = cur_offset & ~(sysconf(_SC_PAGE_SIZE) - 1); - pa_count = count + cur_offset - pa_offset; +/* Write IOCNT blocks from IOVEC to FD. Return the number written, or -1. - lock.l_type = F_RDLCK; - lock.l_whence = SEEK_SET; - lock.l_start = pa_offset; - lock.l_len = pa_count; - lock.l_pid = 0; + This function is a cancellation point and therefore not marked with + __THROW. */ +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(writev)(int __fd, const struct iovec *iov, int iovcnt) +{ + PROFILE_FUNC - /* try to use mmap() approach */ - if (-1 != (fcntl(in_fd, F_SETLK, &lock))) { - void *addr = NULL; - addr = mmap64(NULL, pa_count, PROT_READ, MAP_SHARED | MAP_NORESERVE, in_fd, pa_offset); - if (MAP_FAILED != addr) { - ssize_t toRead, numSent = 0; + srdr_logfuncall_entry("fd=%d, %d iov blocks", __fd, iovcnt); - while (count > 0) { - toRead = min(sysconf(_SC_PAGE_SIZE), (ssize_t)count); + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + xlio_tx_call_attr_t tx_arg; - piov[0].iov_base = (void *)((uintptr_t)addr + cur_offset - pa_offset + totSent); - piov[0].iov_len = toRead; + tx_arg.opcode = TX_WRITEV; + tx_arg.attr.iov = (struct iovec *)iov; + tx_arg.attr.sz_iov = iovcnt; - numSent = p_socket_object->tx(tx_arg); - if (numSent == -1) { - break; - } + return p_socket_object->tx(tx_arg); + } - count -= numSent; - totSent += numSent; - } - (void)munmap(addr, pa_count); - } - lock.l_type = F_UNLCK; - (void)fcntl(in_fd, F_SETLK, &lock); - } + return SYSCALL(writev, __fd, iov, iovcnt); +} - /* fallback on read() approach */ - if (totSent == 0) { - char buf[sysconf(_SC_PAGE_SIZE)]; - ssize_t toRead, numRead, numSent = 0; +/* Send N bytes of BUF to socket FD. 
Returns the number sent or -1. - s->m_p_socket_stats->counters.n_tx_sendfile_fallbacks++; + This function is a cancellation point and therefore not marked with + __THROW. */ +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(send)(int __fd, __const void *__buf, size_t __nbytes, int __flags) +{ + PROFILE_FUNC - while (count > 0) { - toRead = min(sizeof(buf), count); - numRead = pread(in_fd, buf, toRead, cur_offset + totSent); - if (numRead <= 0) { - if (numRead < 0 && totSent == 0) { - totSent = -1; - } - break; - } + srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); - piov[0].iov_base = (void *)buf; - piov[0].iov_len = numRead; + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec piov[1] = {{(void *)__buf, __nbytes}}; + xlio_tx_call_attr_t tx_arg; - numSent = p_socket_object->tx(tx_arg); - if (numSent == -1) { - break; - } + tx_arg.opcode = TX_SEND; + tx_arg.attr.iov = piov; + tx_arg.attr.sz_iov = 1; + tx_arg.attr.flags = __flags; - count -= numSent; - totSent += numSent; - } - } + return p_socket_object->tx(tx_arg); } - if (totSent > 0) { - if (offset != NULL) { - *offset = *offset + totSent; - } else { - (void)lseek64(in_fd, (orig_offset + totSent), SEEK_SET); - } + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; } - return totSent; + return SYSCALL(send, __fd, __buf, __nbytes, __flags); } -extern "C" EXPORT_SYMBOL ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +/* Sends a message as described by MESSAGE to socket FD. + Returns the number of bytes read or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. */ +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendmsg)(int __fd, __const struct msghdr *__msg, int __flags) { PROFILE_FUNC - srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, - offset, offset ? *offset : 0, count); + srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = fd_collection_get_sockfd(out_fd); - if (!p_socket_object) { - if (!orig_os_api.sendfile) { - get_orig_funcs(); - } - return orig_os_api.sendfile(out_fd, in_fd, offset, count); + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + return sendmsg_internal(p_socket_object, __msg, __flags); } - return sendfile_helper(p_socket_object, in_fd, offset, count); + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; + } + + return SYSCALL(sendmsg, __fd, __msg, __flags); } -extern "C" EXPORT_SYMBOL ssize_t sendfile64(int out_fd, int in_fd, __off64_t *offset, size_t count) +/* Send multiple messages as described by MESSAGE from socket FD. + Returns the number of messages sent or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. */ +EXPORT_SYMBOL int XLIO_SYMBOL(sendmmsg)(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, + int __flags) { + int num_of_msg = 0; + PROFILE_FUNC - srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, - offset, offset ? 
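For context on the sendfile_helper() body shown as removed lines above: for non-TCP sockets it emulates sendfile() by mmap()ing the source region at a page-aligned offset and transmitting from the mapping, keeping a pread() loop as the last resort. A simplified, compilable sketch of just the mmap leg follows; demo_send_file_region() is an illustrative name, and the real helper additionally takes a read lock on the region, pushes it page by page through the socket's tx() path and updates the caller's offset.

#include <sys/mman.h>
#include <sys/socket.h>
#include <unistd.h>

// Illustrative: mmap() requires a page-aligned file offset, so the requested offset
// is rounded down and the mapping grown by the difference.
static ssize_t demo_send_file_region(int sock_fd, int file_fd, off_t offset, size_t count)
{
    const long page = sysconf(_SC_PAGE_SIZE);
    off_t pa_offset = offset & ~((off_t)page - 1); // round down to a page boundary
    size_t pa_count = count + (size_t)(offset - pa_offset);

    void *addr = mmap(NULL, pa_count, PROT_READ, MAP_SHARED, file_fd, pa_offset);
    if (addr == MAP_FAILED) {
        return -1;
    }
    ssize_t sent = send(sock_fd, (char *)addr + (offset - pa_offset), count, 0);
    (void)munmap(addr, pa_count);
    return sent;
}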
*offset : 0, count); + srdr_logfuncall_entry("fd=%d, mmsghdr length=%d flags=%x", __fd, __vlen, __flags); - socket_fd_api *p_socket_object = fd_collection_get_sockfd(out_fd); - if (!p_socket_object) { - if (!orig_os_api.sendfile64) { - get_orig_funcs(); - } - return orig_os_api.sendfile64(out_fd, in_fd, offset, count); + if (__mmsghdr == NULL) { + srdr_logdbg("NULL mmsghdr"); + errno = EINVAL; + return -1; } - return sendfile_helper(p_socket_object, in_fd, offset, count); -} + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + for (unsigned int i = 0; i < __vlen; i++) { + xlio_tx_call_attr_t tx_arg; -// Format a fd_set into a string for logging -// Check nfd to know how many 32 bits hexs do we want to sprintf into user buffer -const char *dbg_sprintf_fdset(char *buf, int buflen, int __nfds, fd_set *__fds) -{ - if (buflen < 1) { - return "(null)"; - } - buf[0] = '\0'; + tx_arg.opcode = TX_SENDMSG; + tx_arg.attr.iov = __mmsghdr[i].msg_hdr.msg_iov; + tx_arg.attr.sz_iov = (ssize_t)__mmsghdr[i].msg_hdr.msg_iovlen; + tx_arg.attr.flags = __flags; + tx_arg.attr.addr = (struct sockaddr *)(__SOCKADDR_ARG)__mmsghdr[i].msg_hdr.msg_name; + tx_arg.attr.len = (socklen_t)__mmsghdr[i].msg_hdr.msg_namelen; + tx_arg.attr.hdr = &__mmsghdr[i].msg_hdr; - if ((__nfds <= 0) || (__fds == NULL)) { - return "(null)"; + int ret = p_socket_object->tx(tx_arg); + if (ret < 0) { + if (num_of_msg) { + return num_of_msg; + } else { + return ret; + } + } + num_of_msg++; + __mmsghdr[i].msg_len = ret; + } + return num_of_msg; } - int fdsize = 1 + ((__nfds - 1) / (8 * sizeof(uint32_t))); - switch (fdsize) { - case 1: - snprintf(buf, buflen, "%08x", ((uint32_t *)__fds)[0]); - break; - case 2: - snprintf(buf, buflen, "%08x %08x", ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); - break; - case 3: - snprintf(buf, buflen, "%08x %08x %08x", ((uint32_t *)__fds)[2], ((uint32_t *)__fds)[1], - ((uint32_t *)__fds)[0]); - break; - case 4: - snprintf(buf, buflen, "%08x %08x %08x %08x", ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], - ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); - break; - case 5: - snprintf(buf, buflen, "%08x %08x %08x %08x %08x", ((uint32_t *)__fds)[4], - ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], ((uint32_t *)__fds)[1], - ((uint32_t *)__fds)[0]); - break; - case 6: - snprintf(buf, buflen, "%08x %08x %08x %08x %08x %08x", ((uint32_t *)__fds)[5], - ((uint32_t *)__fds)[4], ((uint32_t *)__fds)[3], ((uint32_t *)__fds)[2], - ((uint32_t *)__fds)[1], ((uint32_t *)__fds)[0]); - break; - default: - buf[0] = '\0'; + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; } - return buf; + + return SYSCALL(sendmmsg, __fd, __mmsghdr, __vlen, __flags); } -/* Check the first NFDS descriptors each in READFDS (if not NULL) for read - readiness, in WRITEFDS (if not NULL) for write readiness, and in EXCEPTFDS - (if not NULL) for exceptional conditions. If TIMis not NULL, time out - after waiting the interval specified therein. Returns the number of ready - descriptors, or -1 for errors. +/* Send N bytes of BUF on socket FD to peer at address ADDR (which is + ADDR_LEN bytes long). Returns the number sent, or -1 for errors. This function is a cancellation point and therefore not marked with __THROW. 
*/ -static int select_helper(int __nfds, fd_set *__readfds, fd_set *__writefds, fd_set *__exceptfds, - struct timeval *__timeout, const sigset_t *__sigmask = NULL) +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendto)(int __fd, __const void *__buf, size_t __nbytes, + int __flags, const struct sockaddr *__to, + socklen_t __tolen) { - int off_rfds_buffer[__nfds]; - io_mux_call::offloaded_mode_t off_modes_buffer[__nfds]; + PROFILE_FUNC - if (g_vlogger_level >= VLOG_FUNC) { - const int tmpbufsize = 256; - char tmpbuf[tmpbufsize], tmpbuf2[tmpbufsize]; - NOT_IN_USE(tmpbufsize); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - NOT_IN_USE(tmpbuf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - NOT_IN_USE(tmpbuf2); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - srdr_logfunc("readfds: %s, writefds: %s", - dbg_sprintf_fdset(tmpbuf, tmpbufsize, __nfds, __readfds), - dbg_sprintf_fdset(tmpbuf2, tmpbufsize, __nfds, __writefds)); + srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); + + socket_fd_api *p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec piov[1] = {{(void *)__buf, __nbytes}}; + xlio_tx_call_attr_t tx_arg; + + tx_arg.opcode = TX_SENDTO; + tx_arg.attr.iov = piov; + tx_arg.attr.sz_iov = 1; + tx_arg.attr.flags = __flags; + tx_arg.attr.addr = (struct sockaddr *)__to; + tx_arg.attr.len = __tolen; + + return p_socket_object->tx(tx_arg); } - try { - select_call scall(off_rfds_buffer, off_modes_buffer, __nfds, __readfds, __writefds, - __exceptfds, __timeout, __sigmask); - int rc = scall.call(); + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; + } - if (g_vlogger_level >= VLOG_FUNC) { - const int tmpbufsize = 256; - char tmpbuf[tmpbufsize], tmpbuf2[tmpbufsize]; - NOT_IN_USE(tmpbufsize); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - NOT_IN_USE(tmpbuf); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - NOT_IN_USE(tmpbuf2); /* to suppress warning in case MAX_DEFINED_LOG_LEVEL */ - srdr_logfunc_exit("readfds: %s, writefds: %s", - dbg_sprintf_fdset(tmpbuf, tmpbufsize, __nfds, __readfds), - dbg_sprintf_fdset(tmpbuf2, tmpbufsize, __nfds, __writefds)); - } + return SYSCALL(sendto, __fd, __buf, __nbytes, __flags, __to, __tolen); +} + +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendfile)(int out_fd, int in_fd, off_t *offset, size_t count) +{ + PROFILE_FUNC + + srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, + offset, offset ? *offset : 0, count); + + socket_fd_api *p_socket_object = fd_collection_get_sockfd(out_fd); + if (!p_socket_object) { + return SYSCALL(sendfile, out_fd, in_fd, offset, count); + } - return rc; - } catch (io_mux_call::io_error &) { - srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); - return -1; + return sendfile_helper(p_socket_object, in_fd, offset, count); +} + +EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendfile64)(int out_fd, int in_fd, __off64_t *offset, + size_t count) +{ + PROFILE_FUNC + + srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, + offset, offset ? 
*offset : 0, count); + + socket_fd_api *p_socket_object = fd_collection_get_sockfd(out_fd); + if (!p_socket_object) { + return SYSCALL(sendfile64, out_fd, in_fd, offset, count); } + + return sendfile_helper(p_socket_object, in_fd, offset, count); } -extern "C" EXPORT_SYMBOL int select(int __nfds, fd_set *__readfds, fd_set *__writefds, - fd_set *__exceptfds, struct timeval *__timeout) +EXPORT_SYMBOL int XLIO_SYMBOL(select)(int __nfds, fd_set *__readfds, fd_set *__writefds, + fd_set *__exceptfds, struct timeval *__timeout) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.select) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.select(__nfds, __readfds, __writefds, __exceptfds, __timeout); + return SYSCALL(select, __nfds, __readfds, __writefds, __exceptfds, __timeout); } if (__timeout) { @@ -1878,19 +1879,14 @@ extern "C" EXPORT_SYMBOL int select(int __nfds, fd_set *__readfds, fd_set *__wri return select_helper(__nfds, __readfds, __writefds, __exceptfds, __timeout); } -extern "C" EXPORT_SYMBOL int pselect(int __nfds, fd_set *__readfds, fd_set *__writefds, - fd_set *__errorfds, const struct timespec *__timeout, - const sigset_t *__sigmask) +EXPORT_SYMBOL int XLIO_SYMBOL(pselect)(int __nfds, fd_set *__readfds, fd_set *__writefds, + fd_set *__errorfds, const struct timespec *__timeout, + const sigset_t *__sigmask) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.pselect) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.pselect(__nfds, __readfds, __writefds, __errorfds, __timeout, __sigmask); + return SYSCALL(pselect, __nfds, __readfds, __writefds, __errorfds, __timeout, __sigmask); } struct timeval select_time; @@ -1907,43 +1903,12 @@ extern "C" EXPORT_SYMBOL int pselect(int __nfds, fd_set *__readfds, fd_set *__wr __sigmask); } -/* Poll the file descriptors described by the NFDS structures starting at - FDS. If TIMis nonzero and not -1, allow TIMmilliseconds for - an event to occur; if TIMis -1, block until an event occurs. - Returns the number of file descriptors with events, zero if timed out, - or -1 for errors. 
*/ -static int poll_helper(struct pollfd *__fds, nfds_t __nfds, int __timeout, - const sigset_t *__sigmask = NULL) -{ - int off_rfd_buffer[__nfds]; - io_mux_call::offloaded_mode_t off_modes_buffer[__nfds]; - int lookup_buffer[__nfds]; - pollfd working_fds_arr[__nfds + 1]; - - try { - poll_call pcall(off_rfd_buffer, off_modes_buffer, lookup_buffer, working_fds_arr, __fds, - __nfds, __timeout, __sigmask); - - int rc = pcall.call(); - srdr_logfunc_exit("rc = %d", rc); - return rc; - } catch (io_mux_call::io_error &) { - srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); - return -1; - } -} - -extern "C" EXPORT_SYMBOL int poll(struct pollfd *__fds, nfds_t __nfds, int __timeout) +EXPORT_SYMBOL int XLIO_SYMBOL(poll)(struct pollfd *__fds, nfds_t __nfds, int __timeout) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.poll) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.poll(__fds, __nfds, __timeout); + return SYSCALL(poll, __fds, __nfds, __timeout); } srdr_logfunc_entry("nfds=%d, timeout=(%d milli-sec)", __nfds, __timeout); @@ -1952,18 +1917,13 @@ extern "C" EXPORT_SYMBOL int poll(struct pollfd *__fds, nfds_t __nfds, int __tim } #if defined HAVE___POLL_CHK -extern "C" EXPORT_SYMBOL int __poll_chk(struct pollfd *__fds, nfds_t __nfds, int __timeout, - size_t __fdslen) +EXPORT_SYMBOL int XLIO_SYMBOL(__poll_chk)(struct pollfd *__fds, nfds_t __nfds, int __timeout, + size_t __fdslen) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__poll_chk) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.__poll_chk(__fds, __nfds, __timeout, __fdslen); + return SYSCALL(__poll_chk, __fds, __nfds, __timeout, __fdslen); } BULLSEYE_EXCLUDE_BLOCK_START @@ -1978,18 +1938,13 @@ extern "C" EXPORT_SYMBOL int __poll_chk(struct pollfd *__fds, nfds_t __nfds, int } #endif -extern "C" EXPORT_SYMBOL int ppoll(struct pollfd *__fds, nfds_t __nfds, - const struct timespec *__timeout, const sigset_t *__sigmask) +EXPORT_SYMBOL int XLIO_SYMBOL(ppoll)(struct pollfd *__fds, nfds_t __nfds, + const struct timespec *__timeout, const sigset_t *__sigmask) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.ppoll) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.ppoll(__fds, __nfds, __timeout, __sigmask); + return SYSCALL(ppoll, __fds, __nfds, __timeout, __sigmask); } int timeout = @@ -2001,19 +1956,14 @@ extern "C" EXPORT_SYMBOL int ppoll(struct pollfd *__fds, nfds_t __nfds, } #if defined HAVE___PPOLL_CHK -extern "C" EXPORT_SYMBOL int __ppoll_chk(struct pollfd *__fds, nfds_t __nfds, - const struct timespec *__timeout, - const sigset_t *__sigmask, size_t __fdslen) +EXPORT_SYMBOL int XLIO_SYMBOL(__ppoll_chk)(struct pollfd *__fds, nfds_t __nfds, + const struct timespec *__timeout, + const sigset_t *__sigmask, size_t __fdslen) { PROFILE_FUNC if (!g_p_fd_collection) { - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.__ppoll_chk) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - return orig_os_api.__ppoll_chk(__fds, __nfds, __timeout, __sigmask, __fdslen); + return SYSCALL(__ppoll_chk, __fds, __nfds, __timeout, __sigmask, __fdslen); } BULLSEYE_EXCLUDE_BLOCK_START @@ -2032,22 +1982,11 @@ extern "C" EXPORT_SYMBOL int __ppoll_chk(struct pollfd *__fds, nfds_t __nfds, } #endif -static void xlio_epoll_create(int epfd, int size) -{ - if (g_p_fd_collection) { - // Sanity check to remove any old sockinfo object using the same fd!! 
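ppoll() and __ppoll_chk() above accept a struct timespec (NULL meaning wait forever), while poll_helper() works in plain milliseconds like poll(), so the wrappers reduce the timespec first; pselect() does the analogous timespec-to-timeval conversion before calling select_helper(). Roughly, and with an illustrative helper name (the exact rounding used by the wrappers may differ):

#include <poll.h>
#include <time.h>

// Illustrative only: collapse a ppoll()-style timeout into a poll()-style one.
static int demo_timespec_to_poll_timeout(const struct timespec *ts)
{
    if (ts == NULL) {
        return -1; // no timeout: block until an event arrives
    }
    return (int)(ts->tv_sec * 1000 + ts->tv_nsec / 1000000);
}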
- handle_close(epfd, true); - - // insert epfd to fd_collection as epfd_info - g_p_fd_collection->addepfd(epfd, size); - } -} - /* Creates an epoll instance. Returns fd for the new instance. The "size" parameter is a hint specifying the number of file descriptors to be associated with the new instance. The fd returned by epoll_create() should be closed with close(). */ -extern "C" EXPORT_SYMBOL int epoll_create(int __size) +EXPORT_SYMBOL int XLIO_SYMBOL(epoll_create)(int __size) { DO_GLOBAL_CTORS(); @@ -2059,13 +1998,7 @@ extern "C" EXPORT_SYMBOL int epoll_create(int __size) return -1; } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.epoll_create) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int epfd = orig_os_api.epoll_create(__size + 1); // +1 for the cq epfd + int epfd = SYSCALL(epoll_create, __size + 1); // +1 for the cq epfd srdr_logdbg("ENTER: (size=%d) = %d", __size, epfd); if (epfd <= 0) { @@ -2077,19 +2010,13 @@ extern "C" EXPORT_SYMBOL int epoll_create(int __size) return epfd; } -extern "C" EXPORT_SYMBOL int epoll_create1(int __flags) +EXPORT_SYMBOL int XLIO_SYMBOL(epoll_create1)(int __flags) { DO_GLOBAL_CTORS(); PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.epoll_create1) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int epfd = orig_os_api.epoll_create1(__flags); + int epfd = SYSCALL(epoll_create1, __flags); srdr_logdbg("ENTER: (flags=%d) = %d", __flags, epfd); if (epfd <= 0) { @@ -2107,7 +2034,8 @@ extern "C" EXPORT_SYMBOL int epoll_create1(int __flags) constants defined above. The "fd" parameter is the target of the operation. The "event" parameter describes which events the caller is interested in and any associated user data. */ -extern "C" EXPORT_SYMBOL int epoll_ctl(int __epfd, int __op, int __fd, struct epoll_event *__event) +EXPORT_SYMBOL int XLIO_SYMBOL(epoll_ctl)(int __epfd, int __op, int __fd, + struct epoll_event *__event) { PROFILE_FUNC @@ -2143,50 +2071,8 @@ extern "C" EXPORT_SYMBOL int epoll_ctl(int __epfd, int __op, int __fd, struct ep return rc; } -/* Wait for events on an epoll instance "epfd". Returns the number of - triggered events returned in "events" buffer. Or -1 in case of - error with the "errno" variable set to the specific error code. The - "events" parameter is a buffer that will contain triggered - events. The "maxevents" is the maximum number of events to be - returned ( usually size of "events" ). The "timeout" parameter - specifies the maximum wait time in milliseconds (-1 == infinite). 
*/ -inline int epoll_wait_helper(int __epfd, struct epoll_event *__events, int __maxevents, - int __timeout, const sigset_t *__sigmask = NULL) -{ - if (__maxevents <= 0 || __maxevents > EP_MAX_EVENTS) { - srdr_logdbg("invalid value for maxevents: %d", __maxevents); - errno = EINVAL; - return -1; - } - - if (safe_mce_sys().tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { - g_thread_local_event_handler.do_tasks(); - } - - epoll_event extra_events_buffer[__maxevents]; - - try { - epoll_wait_call epcall(extra_events_buffer, NULL, __epfd, __events, __maxevents, __timeout, - __sigmask); - - int rc = epcall.get_current_events(); // returns ready nfds - if (rc <= 0) { - // if no ready nfds available then check all lower level queues (XLIO ring's and OS - // queues) - epcall.init_offloaded_fds(); - rc = epcall.call(); - } - - srdr_logfunc_exit("rc = %d", rc); - return rc; - } catch (io_mux_call::io_error &) { - srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); - return -1; - } -} - -extern "C" EXPORT_SYMBOL int epoll_wait(int __epfd, struct epoll_event *__events, int __maxevents, - int __timeout) +EXPORT_SYMBOL int XLIO_SYMBOL(epoll_wait)(int __epfd, struct epoll_event *__events, int __maxevents, + int __timeout) { PROFILE_FUNC @@ -2196,8 +2082,9 @@ extern "C" EXPORT_SYMBOL int epoll_wait(int __epfd, struct epoll_event *__events return epoll_wait_helper(__epfd, __events, __maxevents, __timeout); } -extern "C" EXPORT_SYMBOL int epoll_pwait(int __epfd, struct epoll_event *__events, int __maxevents, - int __timeout, const sigset_t *__sigmask) +EXPORT_SYMBOL int XLIO_SYMBOL(epoll_pwait)(int __epfd, struct epoll_event *__events, + int __maxevents, int __timeout, + const sigset_t *__sigmask) { PROFILE_FUNC @@ -2211,17 +2098,11 @@ extern "C" EXPORT_SYMBOL int epoll_pwait(int __epfd, struct epoll_event *__event protocol PROTOCOL, which are connected to each other, and put file descriptors for them in FDS[0] and FDS[1]. If PROTOCOL is zero, one will be chosen automatically. Returns 0 on success, -1 for errors. */ -extern "C" EXPORT_SYMBOL int socketpair(int __domain, int __type, int __protocol, int __sv[2]) +EXPORT_SYMBOL int XLIO_SYMBOL(socketpair)(int __domain, int __type, int __protocol, int __sv[2]) { PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.socketpair) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int ret = orig_os_api.socketpair(__domain, __type, __protocol, __sv); + int ret = SYSCALL(socketpair, __domain, __type, __protocol, __sv); srdr_logdbg("(domain=%s(%d) type=%s(%d) protocol=%d, fd[%d,%d]) = %d", socket_get_domain_str(__domain), __domain, socket_get_type_str(__type), __type, @@ -2240,17 +2121,11 @@ extern "C" EXPORT_SYMBOL int socketpair(int __domain, int __type, int __protocol If successful, two file descriptors are stored in PIPEDES; bytes written on PIPEDES[1] can be read from PIPEDES[0]. Returns 0 if successful, -1 if not. */ -extern "C" EXPORT_SYMBOL int pipe(int __filedes[2]) +EXPORT_SYMBOL int XLIO_SYMBOL(pipe)(int __filedes[2]) { PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.pipe) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int ret = orig_os_api.pipe(__filedes); + int ret = SYSCALL(pipe, __filedes); srdr_logdbg("(fd[%d,%d]) = %d", __filedes[0], __filedes[1], ret); if (ret == 0 && g_p_fd_collection) { @@ -2264,7 +2139,7 @@ extern "C" EXPORT_SYMBOL int pipe(int __filedes[2]) return ret; } -extern "C" EXPORT_SYMBOL int open(__const char *__file, int __oflag, ...) 
+EXPORT_SYMBOL int XLIO_SYMBOL(open)(__const char *__file, int __oflag, ...) { va_list va; va_start(va, __oflag); @@ -2272,13 +2147,7 @@ extern "C" EXPORT_SYMBOL int open(__const char *__file, int __oflag, ...) PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.open) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int fd = orig_os_api.open(__file, __oflag, mode); + int fd = SYSCALL(open, __file, __oflag, mode); va_end(va); srdr_logdbg("(file=%s, flags=%#x, mode=%#x) = %d", __file, __oflag, mode, fd); @@ -2289,17 +2158,11 @@ extern "C" EXPORT_SYMBOL int open(__const char *__file, int __oflag, ...) return fd; } -extern "C" EXPORT_SYMBOL int creat(const char *__pathname, mode_t __mode) +EXPORT_SYMBOL int XLIO_SYMBOL(creat)(const char *__pathname, mode_t __mode) { PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.creat) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int fd = orig_os_api.creat(__pathname, __mode); + int fd = SYSCALL(creat, __pathname, __mode); srdr_logdbg("(pathname=%s, mode=%#x) = %d", __pathname, __mode, fd); @@ -2310,17 +2173,11 @@ extern "C" EXPORT_SYMBOL int creat(const char *__pathname, mode_t __mode) } /* Duplicate FD, returning a new file descriptor on the same file. */ -extern "C" EXPORT_SYMBOL int dup(int __fd) +EXPORT_SYMBOL int XLIO_SYMBOL(dup)(int __fd) { PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.dup) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int fid = orig_os_api.dup(__fd); + int fid = SYSCALL(dup, __fd); srdr_logdbg("(fd=%d) = %d", __fd, fid); @@ -2337,7 +2194,7 @@ extern "C" EXPORT_SYMBOL int dup(int __fd) } /* Duplicate FD to FD2, closing FD2 and making it open on the same file. */ -extern "C" EXPORT_SYMBOL int dup2(int __fd, int __fd2) +EXPORT_SYMBOL int XLIO_SYMBOL(dup2)(int __fd, int __fd2) { PROFILE_FUNC @@ -2346,13 +2203,7 @@ extern "C" EXPORT_SYMBOL int dup2(int __fd, int __fd2) handle_close(__fd2); } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.dup2) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int fid = orig_os_api.dup2(__fd, __fd2); + int fid = SYSCALL(dup2, __fd, __fd2); srdr_logdbg("(fd=%d, fd2=%d) = %d", __fd, __fd2, fid); @@ -2362,28 +2213,11 @@ extern "C" EXPORT_SYMBOL int dup2(int __fd, int __fd2) return fid; } -#ifdef _CHANGE_CLONE_PROTO_IN_SLES_10_ -extern "C" EXPORT_SYMBOL int clone(int (*__fn)(void *), void *__child_stack, int __flags, - void *__arg) -{ - PROFILE_FUNC - - srdr_logfunc_entry("flags=%#x", __flags); - - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.clone) - get_orig_funcs(); - BULLSEYE_EXCLUDE_BLOCK_END - - return orig_os_api.clone(__fn, __child_stack, __flags, __arg); -} -#endif - /* Clone the calling process, creating an exact copy. Return -1 for errors, 0 to the new process, and the process ID of the new process to the old process. 
*/ -extern "C" EXPORT_SYMBOL pid_t fork(void) +EXPORT_SYMBOL pid_t XLIO_SYMBOL(fork)(void) { PROFILE_FUNC @@ -2399,12 +2233,6 @@ extern "C" EXPORT_SYMBOL pid_t fork(void) "undefined!!"); } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.fork) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - #if defined(DEFINED_NGINX) static int worker_index = -1; if (g_p_app && g_p_app->type == APP_NGINX && (g_p_app->get_worker_id() == -1)) { @@ -2428,7 +2256,7 @@ extern "C" EXPORT_SYMBOL pid_t fork(void) } #endif - pid_t pid = orig_os_api.fork(); + pid_t pid = SYSCALL(fork); if (pid == 0) { #if defined(DEFINED_NGINX) void *p_fd_collection_temp = g_p_fd_collection; @@ -2491,17 +2319,17 @@ extern "C" EXPORT_SYMBOL pid_t fork(void) } /* Redirect vfork to fork */ -extern "C" EXPORT_SYMBOL pid_t vfork(void) +EXPORT_SYMBOL pid_t XLIO_SYMBOL(vfork)(void) { PROFILE_FUNC - return fork(); + return XLIO_CALL(fork); } /* Put the program in the background, and dissociate from the controlling terminal. If NOCHDIR is zero, do `chdir ("/")'. If NOCLOSE is zero, redirects stdin, stdout, and stderr to /dev/null. */ -extern "C" EXPORT_SYMBOL int daemon(int __nochdir, int __noclose) +EXPORT_SYMBOL int XLIO_SYMBOL(daemon)(int __nochdir, int __noclose) { PROFILE_FUNC @@ -2512,13 +2340,7 @@ extern "C" EXPORT_SYMBOL int daemon(int __nochdir, int __noclose) prepare_fork(); } - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.daemon) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - int ret = orig_os_api.daemon(__nochdir, __noclose); + int ret = SYSCALL(daemon, __nochdir, __noclose); if (ret == 0) { g_is_forked_child = true; srdr_logdbg_exit("returned with %d", ret); @@ -2547,105 +2369,16 @@ extern "C" EXPORT_SYMBOL int daemon(int __nochdir, int __noclose) return ret; } -static void handler_intr(int sig) -{ - switch (sig) { - case SIGINT: - g_b_exit = true; - srdr_logdbg("Catch Signal: SIGINT (%d)", sig); - break; - default: - srdr_logdbg("Catch Signal: %d", sig); - break; - } - - if (g_act_prev.sa_handler) { - g_act_prev.sa_handler(sig); - } -} - -extern "C" EXPORT_SYMBOL int sigaction(int signum, const struct sigaction *act, - struct sigaction *oldact) -{ - int ret = 0; - - PROFILE_FUNC - - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.sigaction) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - if (safe_mce_sys().handle_sigintr) { - srdr_logdbg_entry("signum=%d, act=%p, oldact=%p", signum, act, oldact); - - switch (signum) { - case SIGINT: - if (oldact && g_act_prev.sa_handler) { - *oldact = g_act_prev; - } - if (act) { - struct sigaction xlio_action; - xlio_action.sa_handler = handler_intr; - xlio_action.sa_flags = 0; - sigemptyset(&xlio_action.sa_mask); - - ret = orig_os_api.sigaction(SIGINT, &xlio_action, NULL); - - if (ret < 0) { - srdr_logdbg("Failed to register SIGINT handler, calling to original sigaction " - "handler"); - break; - } - srdr_logdbg("Registered SIGINT handler"); - g_act_prev = *act; - } - if (ret >= 0) { - srdr_logdbg_exit("returned with %d", ret); - } else { - srdr_logdbg_exit("failed (errno=%d %m)", errno); - } - - return ret; - break; - default: - break; - } - } - ret = orig_os_api.sigaction(signum, act, oldact); - - if (safe_mce_sys().handle_sigintr) { - if (ret >= 0) { - srdr_logdbg_exit("returned with %d", ret); - } else { - srdr_logdbg_exit("failed (errno=%d %m)", errno); - } - } - return ret; -} - -static void handle_signal(int signum) +EXPORT_SYMBOL int XLIO_SYMBOL(sigaction)(int signum, const struct sigaction *act, + struct sigaction *oldact) { - 
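The removed handler_intr()/sigaction() code above (now delegated to sigaction_internal()) and the signal() wrapper that follows keep the application's SIGINT handler and chain to it, so Ctrl-C still behaves as the application expects while also flagging the library to exit. A compilable sketch of that chaining, assuming only <signal.h>; names prefixed demo_ are illustrative, and the real code keeps the previous handler in g_act_prev / g_sighandler and raises g_b_exit.

#include <signal.h>

typedef void (*demo_handler_t)(int);
static demo_handler_t demo_app_handler = NULL;
static volatile sig_atomic_t demo_exit_requested = 0;

static void demo_sigint_handler(int sig)
{
    demo_exit_requested = 1; // tell the library to begin winding down
    if (demo_app_handler) {
        demo_app_handler(sig); // then preserve the application's own behaviour
    }
}

static demo_handler_t demo_install_sigint(demo_handler_t app_handler)
{
    demo_app_handler = app_handler;
    return signal(SIGINT, demo_sigint_handler);
}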
srdr_logdbg_entry("Caught signal! signum=%d", signum); - - if (signum == SIGINT) { - g_b_exit = true; - } - - if (g_sighandler) { - g_sighandler(signum); - } + return sigaction_internal(signum, act, oldact); } -extern "C" EXPORT_SYMBOL sighandler_t signal(int signum, sighandler_t handler) +EXPORT_SYMBOL sighandler_t XLIO_SYMBOL(signal)(int signum, sighandler_t handler) { PROFILE_FUNC - if (!orig_os_api.signal) { - get_orig_funcs(); - } - if (safe_mce_sys().handle_sigintr) { srdr_logdbg_entry("signum=%d, handler=%p", signum, handler); @@ -2653,28 +2386,22 @@ extern "C" EXPORT_SYMBOL sighandler_t signal(int signum, sighandler_t handler) // Only SIGINT is supported for now if (signum == SIGINT) { g_sighandler = handler; - return orig_os_api.signal(SIGINT, &handle_signal); + return SYSCALL(signal, SIGINT, &handle_signal); } } } - return orig_os_api.signal(signum, handler); + return SYSCALL(signal, signum, handler); } #if defined(DEFINED_NGINX) -extern "C" EXPORT_SYMBOL int setuid(uid_t uid) +EXPORT_SYMBOL int XLIO_SYMBOL(setuid)(uid_t uid) { PROFILE_FUNC - BULLSEYE_EXCLUDE_BLOCK_START - if (!orig_os_api.setuid) { - get_orig_funcs(); - } - BULLSEYE_EXCLUDE_BLOCK_END - uid_t previous_uid = geteuid(); - int orig_rc = orig_os_api.setuid(uid); + int orig_rc = SYSCALL(setuid, uid); if (orig_rc < 0) { srdr_logdbg_exit("failed (errno=%d %m)", errno); } @@ -2687,11 +2414,11 @@ extern "C" EXPORT_SYMBOL int setuid(uid_t uid) return orig_rc; } -extern "C" EXPORT_SYMBOL pid_t waitpid(pid_t pid, int *wstatus, int options) +EXPORT_SYMBOL pid_t XLIO_SYMBOL(waitpid)(pid_t pid, int *wstatus, int options) { PROFILE_FUNC - pid_t child_pid = orig_os_api.waitpid(pid, wstatus, options); + pid_t child_pid = SYSCALL(waitpid, pid, wstatus, options); /* This segment is used as part of NGINX worker termination recovery mechanism. The mechanism * marks the worker PID slot as vacant with -1 later to reuse it in the fork system call.The * implicit assumptions here are that: @@ -2705,7 +2432,8 @@ extern "C" EXPORT_SYMBOL pid_t waitpid(pid_t pid, int *wstatus, int options) g_p_app->unused_worker_id.insert(g_p_app->get_worker_id()); g_p_app->map_thread_id.erase(getpid()); } + return child_pid; } - #endif // DEFINED_NGINX +} diff --git a/src/core/sock/sock-redirect.h b/src/core/sock/sock-redirect.h index 055648756..a429d7e33 100644 --- a/src/core/sock/sock-redirect.h +++ b/src/core/sock/sock-redirect.h @@ -33,6 +33,10 @@ #ifndef SOCK_REDIRECT_H #define SOCK_REDIRECT_H +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + // if you need select with more than 1024 sockets - enable this #ifndef SELECT_BIG_SETSIZE #define SELECT_BIG_SETSIZE 0 @@ -85,6 +89,26 @@ #include #include +#ifdef XLIO_STATIC_BUILD +#define XLIO_SYMBOL(_func) xlio_##_func +#define SYSCALL(_func, ...) ::_func(__VA_ARGS__) +#define XLIO_CALL(_func, ...) xlio_##_func(__VA_ARGS__) +#define SYSCALL_ERRNO_UNSUPPORTED(_func, ...) SYSCALL(_func, __VA_ARGS__) +#define VALID_SYSCALL(_func) (true) +#else +#define XLIO_SYMBOL(_func) _func +#if defined(__GNUC__) && !defined(__clang__) +#define VALID_SYSCALL(_func) (__builtin_addressof(orig_os_api._func) != nullptr) +#else +#define VALID_SYSCALL(_func) ((orig_os_api._func) != nullptr) +#endif +#define SYSCALL(_func, ...) \ + ((VALID_SYSCALL(_func) ? (void)0 : get_orig_funcs()), orig_os_api._func(__VA_ARGS__)) +#define SYSCALL_ERRNO_UNSUPPORTED(_func, ...) \ + (VALID_SYSCALL(_func) ? orig_os_api._func(__VA_ARGS__) : ((errno = EOPNOTSUPP), -1)) +#define XLIO_CALL(_func, ...) 
_func(__VA_ARGS__) +#endif /* XLIO_STATIC_BUILD */ + struct mmsghdr; /** @@ -221,4 +245,10 @@ bool handle_close(int fd, bool cleanup = false, bool passthrough = false); // TODO: look for additional such functions/calls int socket_internal(int __domain, int __type, int __protocol, bool shadow, bool check_offload); +// allow calling our sendmsg(...) implementation safely from within libxlio.so +ssize_t sendmsg_internal(void *sock, __const struct msghdr *__msg, int __flags); + +// allow calling our bind(...) implementation safely from within libxlio.so +int bind_internal(void *sock, const struct sockaddr *addr, socklen_t addrlen); + #endif // SOCK_REDIRECT_H diff --git a/src/core/sock/socket_fd_api.cpp b/src/core/sock/socket_fd_api.cpp index d0dc9c9a8..794a09524 100644 --- a/src/core/sock/socket_fd_api.cpp +++ b/src/core/sock/socket_fd_api.cpp @@ -70,7 +70,7 @@ socket_fd_api::~socket_fd_api() #endif if (toclose) { - orig_os_api.close(m_fd); + SYSCALL(close, m_fd); } } @@ -81,7 +81,7 @@ void socket_fd_api::destructor_helper() int socket_fd_api::shutdown(int __how) { __log_info_func(""); - int ret = orig_os_api.shutdown(m_fd, __how); + int ret = SYSCALL(shutdown, m_fd, __how); if (ret) { __log_info_dbg("shutdown failed (ret=%d %m)", ret); } @@ -91,7 +91,7 @@ int socket_fd_api::shutdown(int __how) int socket_fd_api::bind(const sockaddr *__addr, socklen_t __addrlen) { __log_info_func(""); - int ret = orig_os_api.bind(m_fd, __addr, __addrlen); + int ret = SYSCALL(bind, m_fd, __addr, __addrlen); if (ret) { __log_info_dbg("bind failed (ret=%d %m)", ret); } @@ -101,7 +101,7 @@ int socket_fd_api::bind(const sockaddr *__addr, socklen_t __addrlen) int socket_fd_api::connect(const sockaddr *__to, socklen_t __tolen) { __log_info_func(""); - int ret = orig_os_api.connect(m_fd, __to, __tolen); + int ret = SYSCALL(connect, m_fd, __to, __tolen); if (ret) { __log_info_dbg("connect failed (ret=%d %m)", ret); } @@ -111,7 +111,7 @@ int socket_fd_api::connect(const sockaddr *__to, socklen_t __tolen) int socket_fd_api::accept(struct sockaddr *__addr, socklen_t *__addrlen) { __log_info_func(""); - int ret = orig_os_api.accept(m_fd, __addr, __addrlen); + int ret = SYSCALL(accept, m_fd, __addr, __addrlen); if (ret < 0) { __log_info_dbg("accept failed (ret=%d %m)", ret); } @@ -121,7 +121,7 @@ int socket_fd_api::accept(struct sockaddr *__addr, socklen_t *__addrlen) int socket_fd_api::accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) { __log_info_func(""); - int ret = orig_os_api.accept4(m_fd, __addr, __addrlen, __flags); + int ret = SYSCALL(accept4, m_fd, __addr, __addrlen, __flags); if (ret < 0) { __log_info_dbg("accept4 failed (ret=%d %m)", ret); } @@ -131,7 +131,7 @@ int socket_fd_api::accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __ int socket_fd_api::listen(int backlog) { __log_info_func(""); - int ret = orig_os_api.listen(m_fd, backlog); + int ret = SYSCALL(listen, m_fd, backlog); if (ret < 0) { __log_info_dbg("listen failed (ret=%d %m)", ret); } @@ -141,7 +141,7 @@ int socket_fd_api::listen(int backlog) int socket_fd_api::getsockname(sockaddr *__name, socklen_t *__namelen) { __log_info_func(""); - int ret = orig_os_api.getsockname(m_fd, __name, __namelen); + int ret = SYSCALL(getsockname, m_fd, __name, __namelen); if (ret) { __log_info_dbg("getsockname failed (ret=%d %m)", ret); } @@ -151,7 +151,7 @@ int socket_fd_api::getsockname(sockaddr *__name, socklen_t *__namelen) int socket_fd_api::getpeername(sockaddr *__name, socklen_t *__namelen) { __log_info_func(""); - int ret = 
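The SYSCALL()/XLIO_SYMBOL()/XLIO_CALL() macros added to sock-redirect.h above are what every hunk in this series is converted to: in a static build (XLIO_STATIC_BUILD) the wrappers are exported under xlio_-prefixed names and SYSCALL() becomes a direct ::call, while the preloaded build keeps the old lazy get_orig_funcs() resolution. Below is a reduced, compilable model of the preloaded-build behaviour; demo_os_api, demo_get_orig_funcs() and DEMO_SYSCALL() are illustrative, and the real table is filled with dlsym(RTLD_NEXT, ...) lookups.

#include <unistd.h>

// One-entry stand-in for orig_os_api and its lazy loader (illustrative only).
struct demo_os_api_t {
    int (*close)(int);
};
static demo_os_api_t demo_os_api = {NULL};
static void demo_get_orig_funcs()
{
    demo_os_api.close = ::close;
}

// Same shape as the preloaded-build SYSCALL(): resolve the table on first use, then
// call the original entry point.
#define DEMO_SYSCALL(_func, ...) \
    ((demo_os_api._func ? (void)0 : demo_get_orig_funcs()), demo_os_api._func(__VA_ARGS__))

static int demo_close_passthrough(int fd)
{
    return DEMO_SYSCALL(close, fd); // what an intercepted close() does for a non-offloaded fd
}

SYSCALL_ERRNO_UNSUPPORTED() follows the same pattern but fails with errno = EOPNOTSUPP instead of calling a symbol that was never resolved; XLIO_CALL() is how code inside the library reaches its own wrappers regardless of build flavour.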
orig_os_api.getpeername(m_fd, __name, __namelen); + int ret = SYSCALL(getpeername, m_fd, __name, __namelen); if (ret) { __log_info_dbg("getpeername failed (ret=%d %m)", ret); } @@ -162,7 +162,7 @@ int socket_fd_api::setsockopt(int __level, int __optname, __const void *__optval socklen_t __optlen) { __log_info_func(""); - int ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + int ret = SYSCALL(setsockopt, m_fd, __level, __optname, __optval, __optlen); if (ret) { __log_info_dbg("setsockopt failed (ret=%d %m)", ret); } @@ -172,7 +172,7 @@ int socket_fd_api::setsockopt(int __level, int __optname, __const void *__optval int socket_fd_api::getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) { __log_info_func(""); - int ret = orig_os_api.getsockopt(m_fd, __level, __optname, __optval, __optlen); + int ret = SYSCALL(getsockopt, m_fd, __level, __optname, __optval, __optlen); if (ret) { __log_info_dbg("getsockopt failed (ret=%d %m)", ret); } @@ -232,24 +232,24 @@ ssize_t socket_fd_api::rx_os(const rx_call_t call_type, iovec *p_iov, ssize_t sz switch (call_type) { case RX_READ: __log_info_func("calling os receive with orig read"); - return orig_os_api.read(m_fd, p_iov[0].iov_base, p_iov[0].iov_len); + return SYSCALL(read, m_fd, p_iov[0].iov_base, p_iov[0].iov_len); case RX_READV: __log_info_func("calling os receive with orig readv"); - return orig_os_api.readv(m_fd, p_iov, sz_iov); + return SYSCALL(readv, m_fd, p_iov, sz_iov); case RX_RECV: __log_info_func("calling os receive with orig recv"); - return orig_os_api.recv(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags); + return SYSCALL(recv, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags); case RX_RECVFROM: __log_info_func("calling os receive with orig recvfrom"); - return orig_os_api.recvfrom(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags, __from, - __fromlen); + return SYSCALL(recvfrom, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags, __from, + __fromlen); case RX_RECVMSG: { __log_info_func("calling os receive with orig recvmsg"); - return orig_os_api.recvmsg(m_fd, __msg, flags); + return SYSCALL(recvmsg, m_fd, __msg, flags); } } return (ssize_t)-1; @@ -269,20 +269,19 @@ ssize_t socket_fd_api::tx_os(const tx_call_t call_type, const iovec *p_iov, cons switch (call_type) { case TX_WRITE: __log_info_func("calling os transmit with orig write"); - return orig_os_api.write(m_fd, p_iov[0].iov_base, p_iov[0].iov_len); + return SYSCALL(write, m_fd, p_iov[0].iov_base, p_iov[0].iov_len); case TX_WRITEV: __log_info_func("calling os transmit with orig writev"); - return orig_os_api.writev(m_fd, p_iov, sz_iov); + return SYSCALL(writev, m_fd, p_iov, sz_iov); case TX_SEND: __log_info_func("calling os transmit with orig send"); - return orig_os_api.send(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags); + return SYSCALL(send, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags); case TX_SENDTO: __log_info_func("calling os transmit with orig sendto"); - return orig_os_api.sendto(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags, __to, - __tolen); + return SYSCALL(sendto, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags, __to, __tolen); case TX_SENDMSG: { msghdr __message; @@ -293,7 +292,7 @@ ssize_t socket_fd_api::tx_os(const tx_call_t call_type, const iovec *p_iov, cons __message.msg_namelen = __tolen; __log_info_func("calling os transmit with orig sendmsg"); - return orig_os_api.sendmsg(m_fd, &__message, __flags); + return SYSCALL(sendmsg, m_fd, &__message, __flags); } default: __log_info_func("calling 
undefined os call type!"); diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index cb7ee8f0d..4b9844cf8 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -103,7 +103,7 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) , m_is_ipv6only(safe_mce_sys().sysctl_reader.get_ipv6_bindv6only()) , m_p_rings_fds(NULL) { - m_rx_epfd = orig_os_api.epoll_create(128); + m_rx_epfd = SYSCALL(epoll_create, 128); if (unlikely(m_rx_epfd == -1)) { throw_xlio_exception("create internal epoll"); } @@ -147,8 +147,8 @@ sockinfo::~sockinfo() // Change to non-blocking socket so calling threads can exit m_b_blocking = false; - // This will wake up any blocked thread in rx() call to orig_os_api.epoll_wait() - orig_os_api.close(m_rx_epfd); + // This will wake up any blocked thread in rx() call to SYSCALL(epoll_wait, ) + SYSCALL(close, m_rx_epfd); if (m_p_rings_fds) { delete[] m_p_rings_fds; @@ -251,7 +251,7 @@ int sockinfo::fcntl(int __cmd, unsigned long int __arg) } si_logdbg("going to OS for fcntl cmd=%d, arg=%#lx", __cmd, __arg); - return orig_os_api.fcntl(m_fd, __cmd, __arg); + return SYSCALL(fcntl, m_fd, __cmd, __arg); } int sockinfo::fcntl64(int __cmd, unsigned long int __arg) @@ -263,7 +263,7 @@ int sockinfo::fcntl64(int __cmd, unsigned long int __arg) } si_logdbg("going to OS for fcntl64 cmd=%d, arg=%#lx", __cmd, __arg); - return orig_os_api.fcntl64(m_fd, __cmd, __arg); + return SYSCALL(fcntl64, m_fd, __cmd, __arg); } int sockinfo::set_ring_attr(xlio_ring_alloc_logic_attr *attr) @@ -376,7 +376,7 @@ int sockinfo::ioctl(unsigned long int __request, unsigned long int __arg) } si_logdbg("going to OS for ioctl request=%lu, flags=%#lx", __request, __arg); - return orig_os_api.ioctl(m_fd, __request, __arg); + return SYSCALL(ioctl, m_fd, __request, __arg); } int sockinfo::setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) @@ -1527,7 +1527,7 @@ int sockinfo::os_wait_sock_rx_epfd(epoll_event *ep_events, int maxevents) if (unlikely(m_rx_cq_wait_ctrl)) { add_cqfd_to_sock_rx_epfd(m_p_rx_ring); int ret = - orig_os_api.epoll_wait(m_rx_epfd, ep_events, maxevents, m_loops_timer.time_left_msec()); + SYSCALL(epoll_wait, m_rx_epfd, ep_events, maxevents, m_loops_timer.time_left_msec()); remove_cqfd_from_sock_rx_epfd(m_p_rx_ring); return ret; } @@ -1537,7 +1537,7 @@ int sockinfo::os_wait_sock_rx_epfd(epoll_event *ep_events, int maxevents) int sockinfo::os_epoll_wait(epoll_event *ep_events, int maxevents) { - return orig_os_api.epoll_wait(m_rx_epfd, ep_events, maxevents, m_loops_timer.time_left_msec()); + return SYSCALL(epoll_wait, m_rx_epfd, ep_events, maxevents, m_loops_timer.time_left_msec()); } // Add this new CQ channel fd to the rx epfd handle (no need to wake up any sleeping thread about @@ -1553,7 +1553,7 @@ void sockinfo::add_cqfd_to_sock_rx_epfd(ring *p_ring) ev.data.fd = ring_rx_fds_array[i]; BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev))) { + if (unlikely(SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev))) { si_logerr("failed to add cq channel fd to internal epfd errno=%d (%m)", errno); } BULLSEYE_EXCLUDE_BLOCK_END @@ -1567,9 +1567,8 @@ void sockinfo::remove_cqfd_from_sock_rx_epfd(ring *base_ring) for (size_t i = 0; i < num_ring_rx_fds; i++) { BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely( - (orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], NULL)) && - (!(errno == ENOENT || errno == EBADF)))) { + if (unlikely((SYSCALL(epoll_ctl, m_rx_epfd, 
EPOLL_CTL_DEL, ring_rx_fds_array[i], NULL)) && + (!(errno == ENOENT || errno == EBADF)))) { si_logerr("failed to delete cq channel fd from internal epfd (errno=%d %s)", errno, strerror(errno)); } @@ -2075,7 +2074,7 @@ int sockinfo::setsockopt_kernel(int __level, int __optname, const void *__optval } si_logdbg("going to OS for setsockopt level %d optname %d", __level, __optname); - int ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + int ret = SYSCALL(setsockopt, m_fd, __level, __optname, __optval, __optlen); BULLSEYE_EXCLUDE_BLOCK_START if (ret) { if (EPERM == errno && allow_privileged) { diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 6354a6a28..3bcbbf525 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -56,6 +56,7 @@ #include "sockinfo_tcp.h" #include "tcp_seg_pool.h" #include "bind_no_port.h" +#include "xlio.h" #define UNLOCK_RET(_ret) \ unlock_tcp_con(); \ @@ -1425,7 +1426,7 @@ void sockinfo_tcp::err_lwip_cb(void *pcb_container, err_t err) // terminating stage, in which case we don't expect to handle packets. // Calling close() under lock will prevent internal thread to delete the object before // we finish with the current processing. - close(delete_fd); + XLIO_CALL(close, delete_fd); return; } } @@ -1925,7 +1926,7 @@ inline err_t sockinfo_tcp::handle_fin(struct tcp_pcb *pcb, err_t err) // terminating stage, in which case we don't expect to handle packets. // Calling close() under lock will prevent internal thread to delete the object before // we finish with the current processing. - close(delete_fd); + XLIO_CALL(close, delete_fd); return ERR_ABRT; } } @@ -2699,7 +2700,7 @@ int sockinfo_tcp::bind(const sockaddr *__addr, socklen_t __addrlen) if (INPORT_ANY == in_port && (m_pcb.so_options & SOF_REUSEADDR)) { int reuse = 0; - ret = orig_os_api.setsockopt(m_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); + ret = SYSCALL(setsockopt, m_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); BULLSEYE_EXCLUDE_BLOCK_START if (ret) { si_tcp_logerr("Failed to disable SO_REUSEADDR option (ret=%d %m), connection will be " @@ -2709,9 +2710,9 @@ int sockinfo_tcp::bind(const sockaddr *__addr, socklen_t __addrlen) return ret; } BULLSEYE_EXCLUDE_BLOCK_END - ret = orig_os_api.bind(m_fd, __addr, __addrlen); + ret = SYSCALL(bind, m_fd, __addr, __addrlen); reuse = 1; - int rv = orig_os_api.setsockopt(m_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); + int rv = SYSCALL(setsockopt, m_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); BULLSEYE_EXCLUDE_BLOCK_START if (rv) { si_tcp_logerr("Failed to enable SO_REUSEADDR option (ret=%d %m)", rv); @@ -2723,7 +2724,7 @@ int sockinfo_tcp::bind(const sockaddr *__addr, socklen_t __addrlen) } } else { si_tcp_logdbg("OS bind to %s", sockaddr2str(__addr, __addrlen, true).c_str()); - ret = orig_os_api.bind(m_fd, __addr, __addrlen); + ret = SYSCALL(bind, m_fd, __addr, __addrlen); } #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) @@ -2741,7 +2742,7 @@ int sockinfo_tcp::bind(const sockaddr *__addr, socklen_t __addrlen) socklen_t addr_len = sizeof(addr); BULLSEYE_EXCLUDE_BLOCK_START - if (orig_os_api.getsockname(m_fd, addr.get_p_sa(), &addr_len)) { + if (SYSCALL(getsockname, m_fd, addr.get_p_sa(), &addr_len)) { si_tcp_logerr("get sockname failed"); UNLOCK_RET(-1); } @@ -2922,13 +2923,13 @@ int sockinfo_tcp::listen(int backlog) if (!success) { /* we will get here if attach_as_uc_receiver failed */ passthrough_unlock("Fallback the connection to os"); - return 
orig_os_api.listen(m_fd, orig_backlog); + return SYSCALL(listen, m_fd, orig_backlog); } // Calling to orig_listen() by default to monitor connection requests for not offloaded // sockets BULLSEYE_EXCLUDE_BLOCK_START - if (orig_os_api.listen(m_fd, orig_backlog)) { + if (SYSCALL(listen, m_fd, orig_backlog)) { // NOTE: The attach_as_uc_receiver at this stage already created steering rules. // Packets may arrive into the queues and the application may theoreticaly // call accept() with success. @@ -2942,7 +2943,7 @@ int sockinfo_tcp::listen(int backlog) epoll_event ev = {0, {0}}; ev.events = EPOLLIN; ev.data.fd = m_fd; - int ret = orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev); + int ret = SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev); BULLSEYE_EXCLUDE_BLOCK_START if (unlikely(ret)) { if (errno == EEXIST) { @@ -2994,9 +2995,9 @@ int sockinfo_tcp::accept_helper(struct sockaddr *__addr, socklen_t *__addrlen, if (m_sock_offload == TCP_SOCK_PASSTHROUGH) { si_tcp_logdbg("passthrough - go to OS accept()"); if (__flags) { - return orig_os_api.accept4(m_fd, __addr, __addrlen, __flags); + return SYSCALL(accept4, m_fd, __addr, __addrlen, __flags); } else { - return orig_os_api.accept(m_fd, __addr, __addrlen); + return SYSCALL(accept, m_fd, __addr, __addrlen); } } @@ -3029,20 +3030,20 @@ int sockinfo_tcp::accept_helper(struct sockaddr *__addr, socklen_t *__addrlen, pollfd os_fd[1]; os_fd[0].fd = m_fd; os_fd[0].events = POLLIN; - ret = orig_os_api.poll(os_fd, 1, 0); // Zero timeout - just poll and return quickly + ret = SYSCALL(poll, os_fd, 1, 0); // Zero timeout - just poll and return quickly if (unlikely(ret == -1)) { m_p_socket_stats->counters.n_rx_os_errors++; - si_tcp_logdbg("orig_os_api.poll returned with error (errno=%d %m)", errno); + si_tcp_logdbg("SYSCALL(poll) returned with error (errno=%d %m)", errno); unlock_tcp_con(); return -1; } if (ret == 1) { - si_tcp_logdbg("orig_os_api.poll returned with packet"); + si_tcp_logdbg("SYSCALL(poll) returned with packet"); unlock_tcp_con(); if (__flags) { - return orig_os_api.accept4(m_fd, __addr, __addrlen, __flags); + return SYSCALL(accept4, m_fd, __addr, __addrlen, __flags); } else { - return orig_os_api.accept(m_fd, __addr, __addrlen); + return SYSCALL(accept, m_fd, __addr, __addrlen); } } @@ -3174,7 +3175,7 @@ sockinfo_tcp *sockinfo_tcp::accept_clone() if (!si) { si_tcp_logwarn("can not get accept socket from FD collection"); - close(fd); + XLIO_CALL(close, fd); return 0; } @@ -3777,10 +3778,10 @@ int sockinfo_tcp::wait_for_conn_ready_blocking() int sockinfo_tcp::os_epoll_wait(epoll_event *ep_events, int maxevents) { - return (likely(m_sysvar_tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) - ? orig_os_api.epoll_wait(m_rx_epfd, ep_events, maxevents, - m_loops_timer.time_left_msec()) - : os_epoll_wait_with_tcp_timers(ep_events, maxevents)); + return ( + likely(m_sysvar_tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) + ? SYSCALL(epoll_wait, m_rx_epfd, ep_events, maxevents, m_loops_timer.time_left_msec()) + : os_epoll_wait_with_tcp_timers(ep_events, maxevents)); } int sockinfo_tcp::os_epoll_wait_with_tcp_timers(epoll_event *ep_events, int maxevents) @@ -3793,7 +3794,7 @@ int sockinfo_tcp::os_epoll_wait_with_tcp_timers(epoll_event *ep_events, int maxe ? 
sys_timer_resolution_msec : std::min(m_loops_timer.time_left_msec(), sys_timer_resolution_msec)); - rc = orig_os_api.epoll_wait(m_rx_epfd, ep_events, maxevents, next_timeout); + rc = SYSCALL(epoll_wait, m_rx_epfd, ep_events, maxevents, next_timeout); if (rc != 0 || m_loops_timer.time_left_msec() == 0) { break; @@ -3955,7 +3956,7 @@ int sockinfo_tcp::shutdown(int __how) // if in os pathrough just redirect to os if (m_sock_offload == TCP_SOCK_PASSTHROUGH) { si_tcp_logdbg("passthrough - go to OS shutdown()"); - return orig_os_api.shutdown(m_fd, __how); + return SYSCALL(shutdown, m_fd, __how); } lock_tcp_con(); @@ -4943,7 +4944,7 @@ int sockinfo_tcp::getsockopt(int __level, int __optname, void *__optval, socklen return -1; } - ret = orig_os_api.getsockopt(m_fd, __level, __optname, __optval, __optlen); + ret = SYSCALL(getsockopt, m_fd, __level, __optname, __optval, __optlen); BULLSEYE_EXCLUDE_BLOCK_START if (ret) { @@ -4959,7 +4960,7 @@ int sockinfo_tcp::getsockname(sockaddr *__name, socklen_t *__namelen) if (m_sock_offload == TCP_SOCK_PASSTHROUGH) { si_tcp_logdbg("passthrough - go to OS getsockname"); - return orig_os_api.getsockname(m_fd, __name, __namelen); + return SYSCALL(getsockname, m_fd, __name, __namelen); } // according to man address should be truncated if given struct is too small @@ -4982,7 +4983,7 @@ int sockinfo_tcp::getpeername(sockaddr *__name, socklen_t *__namelen) if (m_sock_offload == TCP_SOCK_PASSTHROUGH) { si_tcp_logdbg("passthrough - go to OS getpeername"); - return orig_os_api.getpeername(m_fd, __name, __namelen); + return SYSCALL(getpeername, m_fd, __name, __namelen); } if (m_conn_state < TCP_CONN_CONNECTED) { @@ -6034,8 +6035,8 @@ inline bool sockinfo_tcp::handle_bind_no_port(int &bind_ret, in_port_t in_port, // first bind call with port 0, we set SO_REUSEPORT so we will be able to bind to a // specific port later when we reuse port int so_reuseport = 1; - if ((bind_ret = orig_os_api.setsockopt(m_fd, SOL_SOCKET, SO_REUSEPORT, &so_reuseport, - sizeof(so_reuseport)))) { + if ((bind_ret = SYSCALL(setsockopt, m_fd, SOL_SOCKET, SO_REUSEPORT, &so_reuseport, + sizeof(so_reuseport)))) { return RETURN_FROM_BIND; } m_bound.set_sockaddr(__addr, __addrlen); diff --git a/src/core/sock/sockinfo_udp.cpp b/src/core/sock/sockinfo_udp.cpp index 8f50bd30c..402e4e02d 100644 --- a/src/core/sock/sockinfo_udp.cpp +++ b/src/core/sock/sockinfo_udp.cpp @@ -101,10 +101,10 @@ inline int sockinfo_udp::poll_os() uint64_t pending_data = 0; m_rx_udp_poll_os_ratio_counter = 0; - ret = orig_os_api.ioctl(m_fd, FIONREAD, &pending_data); + ret = SYSCALL(ioctl, m_fd, FIONREAD, &pending_data); if (unlikely(ret == -1)) { m_p_socket_stats->counters.n_rx_os_errors++; - si_udp_logdbg("orig_os_api.ioctl returned with error in polling loop (errno=%d %m)", errno); + si_udp_logdbg("SYSCALL(ioctl) returned with error in polling loop (errno=%d %m)", errno); return -1; } if (pending_data > 0) { @@ -409,7 +409,7 @@ sockinfo_udp::sockinfo_udp(int fd, int domain) socklen_t option_len = sizeof(n_so_rcvbuf_bytes); BULLSEYE_EXCLUDE_BLOCK_START if (unlikely( - orig_os_api.getsockopt(m_fd, SOL_SOCKET, SO_RCVBUF, &n_so_rcvbuf_bytes, &option_len))) { + SYSCALL(getsockopt, m_fd, SOL_SOCKET, SO_RCVBUF, &n_so_rcvbuf_bytes, &option_len))) { si_udp_logdbg("Failure in getsockopt (errno=%d %m)", errno); } BULLSEYE_EXCLUDE_BLOCK_END @@ -424,7 +424,7 @@ sockinfo_udp::sockinfo_udp(int fd, int domain) ev.data.fd = m_fd; BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev))) { + if 
(unlikely(SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev))) { si_udp_logpanic("failed to add user's fd to internal epfd errno=%d (%m)", errno); } BULLSEYE_EXCLUDE_BLOCK_END @@ -454,7 +454,7 @@ sockinfo_udp::~sockinfo_udp() /* AlexR: We don't have to be nice and delete the fd. close() will do that any way. This save us the problem when closing in the clean-up case - if we get closed be the - nameserver socket 53. if (unlikely( orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_DEL, m_fd, + nameserver socket 53. if (unlikely( SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_DEL, m_fd, NULL))) { if (errno == ENOENT) si_logfunc("failed to del users fd from internal epfd - probably clean up case (errno=%d %m)", errno); else si_logerr("failed to del users fd from internal epfd (errno=%d %m)", errno); @@ -517,7 +517,7 @@ int sockinfo_udp::bind(const struct sockaddr *__addr, socklen_t __addrlen) // We always call the orig_bind which will check sanity of the user socket api // and the OS will also allocate a specific port that we can also use - int ret = orig_os_api.bind(m_fd, __addr, __addrlen); + int ret = SYSCALL(bind, m_fd, __addr, __addrlen); if (ret) { si_udp_logdbg("orig bind failed (ret=%d %m)", ret); // TODO: Should we set errno again (maybe log write modified the orig.bind() errno)? @@ -549,7 +549,7 @@ int sockinfo_udp::connect(const struct sockaddr *__to, socklen_t __tolen) // We always call the orig_connect which will check sanity of the user socket api // and the OS will also allocate a specific bound port that we can also use - int ret = orig_os_api.connect(m_fd, __to, __tolen); + int ret = SYSCALL(connect, m_fd, __to, __tolen); if (ret) { si_udp_logdbg("orig connect failed (ret=%d, errno=%d %m)", ret, errno); return ret; @@ -657,7 +657,7 @@ int sockinfo_udp::getsockname(struct sockaddr *__name, socklen_t *__namelen) return -1; } - return orig_os_api.getsockname(m_fd, __name, __namelen); + return SYSCALL(getsockname, m_fd, __name, __namelen); } int sockinfo_udp::on_sockname_change(struct sockaddr *__name, socklen_t __namelen) @@ -732,7 +732,7 @@ int sockinfo_udp::setsockopt(int __level, int __optname, __const void *__optval, si_udp_logfunc("level=%d, optname=%d", __level, __optname); if (unlikely(m_state == SOCKINFO_DESTROYING) || unlikely(g_b_exit)) { - return orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + return SYSCALL(setsockopt, m_fd, __level, __optname, __optval, __optlen); } std::lock_guard lock_tx(m_lock_snd); @@ -1136,7 +1136,7 @@ int sockinfo_udp::setsockopt(int __level, int __optname, __const void *__optval, // offloaded, check if need to pend else if (m_bound.is_anyport()) { // Delay attaching to this MC group until we have bound UDP port - ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + ret = SYSCALL(setsockopt, m_fd, __level, __optname, __optval, __optlen); if (ret) { return ret; } @@ -1149,7 +1149,7 @@ int sockinfo_udp::setsockopt(int __level, int __optname, __const void *__optval, } if (goto_os) { - ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + ret = SYSCALL(setsockopt, m_fd, __level, __optname, __optval, __optlen); if (ret) { return ret; } @@ -1463,7 +1463,7 @@ int sockinfo_udp::multicast_membership_setsockopt_ip6(int optname, const void *o // offloaded, check if need to pend else if (m_bound.is_anyport()) { // Delay attaching to this MC group until we have bound UDP port - ret = orig_os_api.setsockopt(m_fd, IPPROTO_IPV6, optname, optval, optlen); + ret = SYSCALL(setsockopt, m_fd, 
IPPROTO_IPV6, optname, optval, optlen); if (ret) { return ret; } @@ -1477,7 +1477,7 @@ int sockinfo_udp::multicast_membership_setsockopt_ip6(int optname, const void *o } if (goto_os) { - ret = orig_os_api.setsockopt(m_fd, IPPROTO_IPV6, optname, optval, optlen); + ret = SYSCALL(setsockopt, m_fd, IPPROTO_IPV6, optname, optval, optlen); if (ret) { return ret; } @@ -1593,7 +1593,7 @@ int sockinfo_udp::getsockopt(int __level, int __optname, void *__optval, socklen { si_udp_logfunc("level=%d, optname=%d", __level, __optname); - int ret = orig_os_api.getsockopt(m_fd, __level, __optname, __optval, __optlen); + int ret = SYSCALL(getsockopt, m_fd, __level, __optname, __optval, __optlen); if (unlikely(m_state == SOCKINFO_DESTROYING) || unlikely(g_b_exit)) { return ret; @@ -2231,7 +2231,7 @@ int sockinfo_udp::rx_verify_available_data() } else if (ret == 1) { // Got 1, means we have a ready packet in OS uint64_t pending_data = 0; - ret = orig_os_api.ioctl(m_fd, FIONREAD, &pending_data); + ret = SYSCALL(ioctl, m_fd, FIONREAD, &pending_data); if (ret >= 0) { // This will cause the next non-blocked read to check the OS again. // We do this only after a successful read. @@ -3058,7 +3058,7 @@ void sockinfo_udp::original_os_setsockopt_helper(const void *pram, int pram_size { si_udp_logdbg("calling orig_setsockopt(%s) for igmp support by OS", setsockopt_ip_opt_to_str(optname)); - if (orig_os_api.setsockopt(m_fd, level, optname, pram, pram_size)) { + if (SYSCALL(setsockopt, m_fd, level, optname, pram, pram_size)) { si_udp_logdbg("orig setsockopt(%s) failed (errno=%d %m)", setsockopt_ip_opt_to_str(optname), errno); } diff --git a/src/core/util/utils.cpp b/src/core/util/utils.cpp index a4016d104..3f050ae74 100644 --- a/src/core/util/utils.cpp +++ b/src/core/util/utils.cpp @@ -495,7 +495,7 @@ void set_fd_block_mode(int fd, bool b_block) { __log_dbg("fd[%d]: setting to %sblocking mode (%d)", fd, b_block ? 
"" : "non-", b_block); - int flags = orig_os_api.fcntl(fd, F_GETFL); + int flags = SYSCALL(fcntl, fd, F_GETFL); BULLSEYE_EXCLUDE_BLOCK_START if (flags < 0) { __log_err("failed reading fd[%d] flag (rc=%d errno=%d %m)", fd, flags, errno); @@ -509,7 +509,7 @@ void set_fd_block_mode(int fd, bool b_block) flags |= O_NONBLOCK; } - int ret = orig_os_api.fcntl(fd, F_SETFL, flags); + int ret = SYSCALL(fcntl, fd, F_SETFL, flags); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_err("failed changing fd[%d] to %sblocking mode (rc=%d errno=%d %s)", fd, @@ -544,20 +544,20 @@ int priv_read_file(const char *path, char *buf, size_t size, vlog_levels_t log_level /*= VLOG_ERROR*/) { int len = -1; - int fd = open(path, O_RDONLY); + int fd = SYSCALL(open, path, O_RDONLY); BULLSEYE_EXCLUDE_BLOCK_START if (fd < 0) { VLOG_PRINTF(log_level, "ERROR while opening file %s (errno %d %m)", path, errno); return -1; } BULLSEYE_EXCLUDE_BLOCK_END - len = read(fd, buf, size); + len = SYSCALL(read, fd, buf, size); BULLSEYE_EXCLUDE_BLOCK_START if (len < 0) { VLOG_PRINTF(log_level, "ERROR while reading from file %s (errno %d %m)", path, errno); } BULLSEYE_EXCLUDE_BLOCK_END - close(fd); + SYSCALL(close, fd); return len; } @@ -674,20 +674,20 @@ class socket_context_manager { .tv_usec = 10, }; - m_fd = orig_os_api.socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + m_fd = SYSCALL(socket, AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (m_fd < 0) { throw std::runtime_error("Open netlink socket failed"); } - if (orig_os_api.setsockopt(m_fd, SOL_SOCKET, SO_RCVTIMEO, (const char *)&tv, sizeof tv)) { - close(m_fd); + if (SYSCALL(setsockopt, m_fd, SOL_SOCKET, SO_RCVTIMEO, (const char *)&tv, sizeof tv)) { + SYSCALL(close, m_fd); throw std::runtime_error("Setsockopt non-blocking failed"); } } socket_context_manager(int fd) noexcept : m_fd(fd) {}; - ~socket_context_manager() { close(m_fd); }; + ~socket_context_manager() { SYSCALL(close, m_fd); }; void send_getaddr_request(uint8_t family) { @@ -709,7 +709,7 @@ class socket_context_manager { iovec iov = {&msg_buf, msg_buf.nl.nlmsg_len}; msghdr msg = {&sa, sizeof(sa), &iov, 1, nullptr, 0, 0}; - if (orig_os_api.sendmsg(m_fd, &msg, 0) < 0) { + if (SYSCALL(sendmsg, m_fd, &msg, 0) < 0) { throw std::runtime_error("Send RTM_GETADDR request failed"); } } @@ -720,7 +720,7 @@ class socket_context_manager { iovec iov = {&m_buf, m_buf.size()}; msghdr msg = {&sa, sizeof(sa), &iov, 1, nullptr, 0, 0}; - return orig_os_api.recvmsg(m_fd, &msg, 0); + return SYSCALL(recvmsg, m_fd, &msg, 0); } nlmsghdr *get_nlmsghdr() { return reinterpret_cast(&m_buf); } @@ -837,7 +837,7 @@ uint16_t get_vlan_id_from_ifname(const char *ifname) { // find vlan id from interface name struct vlan_ioctl_args ifr; - int fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + int fd = SYSCALL(socket, AF_INET, SOCK_DGRAM, 0); if (fd < 0) { __log_err("ERROR from socket() (errno=%d %m)", errno); @@ -847,15 +847,15 @@ uint16_t get_vlan_id_from_ifname(const char *ifname) ifr.cmd = GET_VLAN_VID_CMD; strncpy(ifr.device1, ifname, sizeof(ifr.device1) - 1); - if (orig_os_api.ioctl(fd, SIOCGIFVLAN, &ifr) < 0) { + if (SYSCALL(ioctl, fd, SIOCGIFVLAN, &ifr) < 0) { __log_dbg( "Failure in ioctl(SIOCGIFVLAN, cmd=GET_VLAN_VID_CMD) for interface '%s' (errno=%d %m)", ifname, errno); - orig_os_api.close(fd); + SYSCALL(close, fd); return 0; } - orig_os_api.close(fd); + SYSCALL(close, fd); __log_dbg("found vlan id '%d' for interface '%s'", ifr.u.VID, ifname); @@ -866,7 +866,7 @@ size_t get_vlan_base_name_from_ifname(const char *ifname, char *base_ifname, siz { // find vlan 
base name from interface name struct vlan_ioctl_args ifr; - int fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + int fd = SYSCALL(socket, AF_INET, SOCK_DGRAM, 0); if (fd < 0) { __log_err("ERROR from socket() (errno=%d %m)", errno); return -1; @@ -875,15 +875,15 @@ size_t get_vlan_base_name_from_ifname(const char *ifname, char *base_ifname, siz ifr.cmd = GET_VLAN_REALDEV_NAME_CMD; strncpy(ifr.device1, ifname, sizeof(ifr.device1) - 1); - if (orig_os_api.ioctl(fd, SIOCGIFVLAN, &ifr) < 0) { + if (SYSCALL(ioctl, fd, SIOCGIFVLAN, &ifr) < 0) { __log_dbg("Failure in ioctl(SIOCGIFVLAN, cmd=GET_VLAN_REALDEV_NAME_CMD) for interface '%s' " "(errno=%d %m)", ifname, errno); - orig_os_api.close(fd); + SYSCALL(close, fd); return 0; } - orig_os_api.close(fd); + SYSCALL(close, fd); size_t name_len = strlen(ifr.u.device2); if (base_ifname && name_len > 0) { @@ -924,7 +924,7 @@ int run_and_retreive_system_command(const char *cmd_line, char *return_str, int if (file) { int fd = fileno(file); if (fd > 0) { - int actual_len = read(fd, return_str, return_str_len - 1); + int actual_len = SYSCALL(read, fd, return_str, return_str_len - 1); if (actual_len > 0) { return_str[actual_len] = '\0'; } else { @@ -1046,9 +1046,9 @@ bool get_bond_name(IN const char *ifname, OUT char *bond_name, IN int sz) for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { snprintf(upper_path, sizeof(upper_path), NETVSC_DEVICE_UPPER_FILE, base_ifname, ifa->ifa_name); - int fd = open(upper_path, O_RDONLY); + int fd = SYSCALL(open, upper_path, O_RDONLY); if (fd >= 0) { - close(fd); + SYSCALL(close, fd); if (IFNAMSIZ <= sz) { memcpy(bond_name, ifa->ifa_name, IFNAMSIZ); } @@ -1109,9 +1109,9 @@ bool get_netvsc_slave(IN const char *ifname, OUT char *slave_name, OUT unsigned for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { snprintf(netvsc_path, sizeof(netvsc_path), NETVSC_DEVICE_LOWER_FILE, base_ifname, ifa->ifa_name); - int fd = open(netvsc_path, O_RDONLY); + int fd = SYSCALL(open, netvsc_path, O_RDONLY); if (fd >= 0) { - close(fd); + SYSCALL(close, fd); memcpy(slave_name, ifa->ifa_name, IFNAMSIZ); slave_flags = ifa->ifa_flags; __log_dbg("Found slave_name = %s, slave_flags = %u", slave_name, slave_flags); @@ -1186,9 +1186,9 @@ bool check_device_exist(const char *ifname, const char *path) n = snprintf(device_path, sizeof(device_path), path, ifname); if (likely((0 < n) && (n < (int)sizeof(device_path)))) { - fd = orig_os_api.open(device_path, O_RDONLY); + fd = SYSCALL(open, device_path, O_RDONLY); if (fd >= 0) { - orig_os_api.close(fd); + SYSCALL(close, fd); } if (fd < 0 && errno == EMFILE) { __log_warn("There are no free fds in the system. 
This may cause unexpected behavior"); @@ -1212,9 +1212,9 @@ bool check_device_name_ib_name(const char *ifname, const char *ibname) n = snprintf(ib_path, sizeof(ib_path), "/sys/class/infiniband/%s/device/net/%s/ifindex", ibname, str_ifname); if (likely((0 < n) && (n < (int)sizeof(ib_path)))) { - fd = open(ib_path, O_RDONLY); + fd = SYSCALL(open, ib_path, O_RDONLY); if (fd >= 0) { - close(fd); + SYSCALL(close, fd); return true; } } @@ -1303,7 +1303,7 @@ int validate_tso(int if_index) struct ifreq req; struct ethtool_value eval; - fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + fd = SYSCALL(socket, AF_INET, SOCK_DGRAM, 0); if (fd < 0) { __log_err("ERROR from socket() (errno=%d %m)", errno); return -1; @@ -1313,13 +1313,13 @@ int validate_tso(int if_index) req.ifr_ifindex = if_index; if_indextoname(if_index, req.ifr_name); req.ifr_data = (char *)&eval; - ret = orig_os_api.ioctl(fd, SIOCETHTOOL, &req); + ret = SYSCALL(ioctl, fd, SIOCETHTOOL, &req); if (ret < 0) { __log_dbg("ioctl(SIOCETHTOOL) cmd=ETHTOOL_GTSO (errno=%d %m)", errno); } else { ret = eval.data; } - orig_os_api.close(fd); + SYSCALL(close, fd); return ret; #else NOT_IN_USE(if_index); @@ -1335,7 +1335,7 @@ int validate_lro(int if_index) struct ifreq req; struct ethtool_value eval; - fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + fd = SYSCALL(socket, AF_INET, SOCK_DGRAM, 0); if (fd < 0) { __log_err("ERROR from socket() (errno=%d %m)", errno); return -1; @@ -1345,13 +1345,13 @@ int validate_lro(int if_index) req.ifr_ifindex = if_index; if_indextoname(if_index, req.ifr_name); req.ifr_data = (char *)&eval; - ret = orig_os_api.ioctl(fd, SIOCETHTOOL, &req); + ret = SYSCALL(ioctl, fd, SIOCETHTOOL, &req); if (ret < 0) { __log_dbg("ioctl(SIOCETHTOOL) cmd=ETHTOOL_GFLAGS (errno=%d %m)", errno); } else { ret = (eval.data & ETH_FLAG_LRO ? 
1 : 0); } - orig_os_api.close(fd); + SYSCALL(close, fd); return ret; #else NOT_IN_USE(if_index); diff --git a/src/core/util/wakeup_pipe.cpp b/src/core/util/wakeup_pipe.cpp index 32afa180c..bea47a743 100644 --- a/src/core/util/wakeup_pipe.cpp +++ b/src/core/util/wakeup_pipe.cpp @@ -60,18 +60,18 @@ wakeup_pipe::wakeup_pipe() int ref = atomic_fetch_and_inc(&ref_count); if (ref == 0) { BULLSEYE_EXCLUDE_BLOCK_START - if (orig_os_api.pipe(g_wakeup_pipes)) { + if (SYSCALL(pipe, g_wakeup_pipes)) { wkup_logpanic("wakeup pipe create failed (errno=%d %m)", errno); } - if (orig_os_api.write(g_wakeup_pipes[1], "^", 1) != 1) { + if (SYSCALL(write, g_wakeup_pipes[1], "^", 1) != 1) { wkup_logpanic("wakeup pipe write failed(errno=%d %m)", errno); } BULLSEYE_EXCLUDE_BLOCK_END wkup_logdbg("created wakeup pipe [RD=%d, WR=%d]", g_wakeup_pipes[0], g_wakeup_pipes[1]); // ToDo - these pipe should be closed at some point - // orig_os_api.close(g_si_wakeup_pipes[1]); - // orig_os_api.close(g_si_wakeup_pipes[0]); + // SYSCALL(close, g_si_wakeup_pipes[1]); + // SYSCALL(close, g_si_wakeup_pipes[0]); } m_ev.events = EPOLLIN; @@ -96,7 +96,7 @@ void wakeup_pipe::do_wakeup() int errno_tmp = errno; // don't let wakeup affect errno, as this can fail with EEXIST BULLSEYE_EXCLUDE_BLOCK_START - if ((orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, g_wakeup_pipes[0], &m_ev)) && + if ((SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_ADD, g_wakeup_pipes[0], &m_ev)) && (errno != EEXIST)) { wkup_logerr("Failed to add wakeup fd to internal epfd (errno=%d %m)", errno); } @@ -114,7 +114,7 @@ void wakeup_pipe::remove_wakeup_fd() } wkup_entry_dbg(""); int tmp_errno = errno; - if (orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, g_wakeup_pipes[0], NULL)) { + if (SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, g_wakeup_pipes[0], NULL)) { BULLSEYE_EXCLUDE_BLOCK_START if (errno == ENOENT) { wkup_logdbg("Failed to delete global pipe from internal epfd it was already deleted"); diff --git a/src/core/xlio.h b/src/core/xlio.h new file mode 100644 index 000000000..be092a0a5 --- /dev/null +++ b/src/core/xlio.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef XLIO_H +#define XLIO_H + +#include +#include + +extern "C" { +int xlio_socket(int __domain, int __type, int __protocol); + +int xlio_close(int __fd); + +int xlio_shutdown(int __fd, int __how); + +int xlio_listen(int __fd, int backlog); + +int xlio_accept(int __fd, struct sockaddr *__addr, socklen_t *__addrlen); + +int xlio_accept4(int __fd, struct sockaddr *__addr, socklen_t *__addrlen, int __flags); + +int xlio_bind(int __fd, const struct sockaddr *__addr, socklen_t __addrlen); + +int xlio_connect(int __fd, const struct sockaddr *__to, socklen_t __tolen); + +int xlio_setsockopt(int __fd, int __level, int __optname, __const void *__optval, + socklen_t __optlen); + +int xlio_getsockopt(int __fd, int __level, int __optname, void *__optval, socklen_t *__optlen); + +int xlio_fcntl(int __fd, int __cmd, ...); + +int xlio_fcntl64(int __fd, int __cmd, ...); + +int xlio_ioctl(int __fd, unsigned long int __request, ...); + +int xlio_getsockname(int __fd, struct sockaddr *__name, socklen_t *__namelen); + +int xlio_getpeername(int __fd, struct sockaddr *__name, socklen_t *__namelen); + +ssize_t xlio_read(int __fd, void *__buf, size_t __nbytes); + +ssize_t xlio_readv(int __fd, const struct iovec *iov, int iovcnt); + +ssize_t xlio_recv(int __fd, void *__buf, size_t __nbytes, int __flags); + +ssize_t xlio_recvmsg(int __fd, struct msghdr *__msg, int __flags); + +struct mmsghdr; + +int xlio_recvmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags, + const struct timespec *__timeout); + +ssize_t xlio_recvfrom(int __fd, void *__buf, size_t __nbytes, int __flags, struct sockaddr *__from, + socklen_t *__fromlen); + +ssize_t xlio_write(int __fd, __const void *__buf, size_t __nbytes); + +ssize_t xlio_writev(int __fd, const struct iovec *iov, int iovcnt); + +ssize_t xlio_send(int __fd, __const void *__buf, size_t __nbytes, int __flags); + +ssize_t xlio_sendmsg(int __fd, __const struct msghdr *__msg, int __flags); + +int xlio_sendmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags); + +ssize_t xlio_sendto(int __fd, __const void *__buf, size_t __nbytes, int __flags, + const struct sockaddr *__to, socklen_t __tolen); + +ssize_t xlio_sendfile(int out_fd, int in_fd, off_t *offset, size_t count); + +ssize_t xlio_sendfile64(int out_fd, int in_fd, __off64_t *offset, size_t count); + +int xlio_select(int __nfds, fd_set *__readfds, fd_set *__writefds, fd_set *__exceptfds, + struct timeval *__timeout); + +int xlio_pselect(int __nfds, fd_set *__readfds, fd_set *__writefds, fd_set *__errorfds, + const struct timespec *__timeout, const sigset_t *__sigmask); +int xlio_poll(struct pollfd *__fds, nfds_t __nfds, int __timeout); + +int xlio_ppoll(struct pollfd *__fds, nfds_t __nfds, const struct timespec *__timeout, + const sigset_t *__sigmask); + +int xlio_epoll_create(int __size); + +int xlio_epoll_create1(int __flags); + +int xlio_epoll_ctl(int __epfd, int __op, int __fd, struct epoll_event *__event); + +int xlio_epoll_wait(int __epfd, struct epoll_event *__events, int __maxevents, int __timeout); + +int xlio_epoll_pwait(int __epfd, struct epoll_event *__events, int __maxevents, int __timeout, + const sigset_t *__sigmask); +int xlio_socketpair(int __domain, int __type, int __protocol, int __sv[2]); + +int xlio_pipe(int __filedes[2]); + +int xlio_open(__const char *__file, int __oflag, ...); + +int xlio_creat(const char *__pathname, mode_t __mode); + +int xlio_dup(int __fd); + +int xlio_dup2(int __fd, int __fd2); + +/* Before using XLIO static interface call xlio_init; 
*/ +int xlio_init(void); + +/* After finishing workling with XLIO interface call xlio_exit */ +int xlio_exit(void); +} +#endif /* XLIO_EXTRA_H */ diff --git a/src/core/xlio_extra.h b/src/core/xlio_extra.h index c027349b9..600465f7d 100644 --- a/src/core/xlio_extra.h +++ b/src/core/xlio_extra.h @@ -640,5 +640,4 @@ static inline struct xlio_api_t *xlio_get_api() } return api_ptr; } - #endif /* XLIO_EXTRA_H */ diff --git a/tests/gtest/nvme/nvme.cc b/tests/gtest/nvme/nvme.cc index 8b0ffac43..1a15691e7 100644 --- a/tests/gtest/nvme/nvme.cc +++ b/tests/gtest/nvme/nvme.cc @@ -32,6 +32,7 @@ #include #include +#include #include #include #include "common/def.h" From 2e5c9c661c57d4e99cde1b138aee1c8febcf03f8 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Tue, 16 Jan 2024 12:36:00 +0200 Subject: [PATCH 032/169] issue: 3724170 disable LTO in Jenkins compiler tests Signed-off-by: Alex Briskin --- contrib/jenkins_tests/compiler.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/jenkins_tests/compiler.sh b/contrib/jenkins_tests/compiler.sh index c37503141..f5d66bcff 100755 --- a/contrib/jenkins_tests/compiler.sh +++ b/contrib/jenkins_tests/compiler.sh @@ -25,7 +25,7 @@ for compiler in $compiler_list; do echo "======================================================" $cc --version echo - test_exec='${WORKSPACE}/configure --prefix=$compiler_dir-$cc CC=$cc CXX=$cxx $jenkins_test_custom_configure && make $make_opt all' + test_exec='${WORKSPACE}/configure --prefix=$compiler_dir-$cc CC=$cc CXX=$cxx --disable-lto $jenkins_test_custom_configure && make $make_opt all' do_check_result "$test_exec" "$test_id" "$test_name" "$compiler_tap" "${compiler_dir}/compiler-${test_id}" [ ! -z "$module" ] && module unload "$module" cd ${compiler_dir} From 5e8fca0f9ca693b994e89af09489cd2de5f22c39 Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Mon, 22 Jan 2024 18:06:34 +0200 Subject: [PATCH 033/169] issue: 3704820 Fix strides in WQE for NGINX master PRM allows minimum of 512 RX strides per WQE. Adjust minimum value for NGINX master process from 32 to 512. Using also minimum stride size allowed - 64 instead of 512. Signed-off-by: Iftah Levi --- src/core/util/sys_vars.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index a611f9057..b6f8dfdea 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -691,8 +691,8 @@ void mce_sys_var::update_multi_process_params() tx_segs_pool_batch_tcp = 256; rx_num_wr = 1; strq_strides_compensation_level = 32; - strq_stride_size_bytes = 512; - strq_stride_num_per_rwqe = 32; + strq_stride_size_bytes = STRQ_MIN_STRIDE_SIZE_BYTES; + strq_stride_num_per_rwqe = STRQ_MIN_STRIDES_NUM; tx_buf_size = 0; rx_buf_size = 0; } From ff3415678338f1a2b16f19a7266810cbde770d74 Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Wed, 22 Nov 2023 12:18:05 +0000 Subject: [PATCH 034/169] issue: 3678579 Update last_unacked on ACK recv If all unacked queue is acked - update last_unacked to null. 
Signed-off-by: Iftah Levi --- src/core/lwip/tcp_in.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/lwip/tcp_in.c b/src/core/lwip/tcp_in.c index 19c9f74f7..c4784c698 100644 --- a/src/core/lwip/tcp_in.c +++ b/src/core/lwip/tcp_in.c @@ -1221,6 +1221,7 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) /* If there's nothing left to acknowledge, stop the retransmit timer, otherwise reset it to start again */ if (pcb->unacked == NULL) { + pcb->last_unacked = NULL; if (persist) { /* start persist timer */ pcb->persist_cnt = 0; From 4ff44ecf0297443c31a4a93780d3f1af32486909 Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Wed, 22 Nov 2023 12:21:54 +0000 Subject: [PATCH 035/169] issue: 3678579 Fix last_unsent on retransmission In case of pcb->is_last_seg_dropped - we might end up with wrong pcb->last_unsent. Signed-off-by: Iftah Levi --- src/core/lwip/tcp_out.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index aa8cc00ef..0ca605d66 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -1625,6 +1625,9 @@ err_t tcp_output(struct tcp_pcb *pcb) pcb->unacked->next = pcb->unsent; pcb->unsent = pcb->unacked; pcb->unacked = NULL; + if (NULL == pcb->last_unsent) { + pcb->last_unsent = pcb->last_unacked; + } pcb->last_unacked = NULL; } seg = pcb->unsent; From 2bf7224d35ccb2679421677f8e81823fb4db438a Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Wed, 22 Nov 2023 12:30:37 +0000 Subject: [PATCH 036/169] issue: 3678579 Fix last_unacked in tcp_rexmit_rto Update last_unacked to null after RTO. In this case we move all unacked queue to unsent. Signed-off-by: Iftah Levi --- src/core/lwip/tcp_out.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 0ca605d66..085b50df6 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -2034,6 +2034,7 @@ void tcp_rexmit_rto(struct tcp_pcb *pcb) pcb->unsent = pcb->unacked; /* unacked queue is now empty */ pcb->unacked = NULL; + pcb->last_unacked = NULL; /* increment number of retransmissions */ ++pcb->nrtx; From 43fe6e5d0cd8b1ebf6d46f30fc535edaa02a3de2 Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Wed, 22 Nov 2023 12:34:23 +0000 Subject: [PATCH 037/169] issue: 3678579 Update last_unacked in tcp_rexmit In tcp_rexmit we take the first segment from unacked and put it in unsent. In case its the only segment in unacked, we need to update last_unacked. Signed-off-by: Iftah Levi --- src/core/lwip/tcp_out.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 085b50df6..269c28c44 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -2065,7 +2065,11 @@ void tcp_rexmit(struct tcp_pcb *pcb) /* Move the first unacked segment to the unsent queue */ /* Keep the unsent queue sorted. */ seg = pcb->unacked; - pcb->unacked = seg->next; + + pcb->unacked = pcb->unacked->next; + if (NULL == pcb->unacked) { + pcb->last_unacked = NULL; + } cur_seg = &(pcb->unsent); while (*cur_seg && TCP_SEQ_LT((*cur_seg)->seqno, seg->seqno)) { From 95845c3f78dcfadcb96aa732fbdb39544df807f3 Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Wed, 22 Nov 2023 12:38:51 +0000 Subject: [PATCH 038/169] issue: 3678579 Remove iterating lists to find last Remove iterating over unsent / unacked to find last in lists. 
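The change replaces the O(n) tail walks with the tail pointers the pcb already maintains; condensed from the tcp_enqueue_flags() hunk below, purely for illustration:

    /* Before: walk pcb->unsent to find the tail, then append. */
    struct tcp_seg *useg;
    for (useg = pcb->unsent; useg->next != NULL; useg = useg->next) {
        ;
    }
    useg->next = seg;

    /* After: O(1) append through the maintained tail pointer. */
    pcb->last_unsent->next = seg;
    pcb->last_unsent = seg;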
Signed-off-by: Iftah Levi --- src/core/lwip/tcp.c | 2 +- src/core/lwip/tcp_out.c | 51 ++++++++++------------------------------- 2 files changed, 13 insertions(+), 40 deletions(-) diff --git a/src/core/lwip/tcp.c b/src/core/lwip/tcp.c index 12eb9e3cf..09c121c8b 100644 --- a/src/core/lwip/tcp.c +++ b/src/core/lwip/tcp.c @@ -1271,7 +1271,7 @@ void tcp_pcb_purge(struct tcp_pcb *pcb) tcp_tx_segs_free(pcb, pcb->unsent); tcp_tx_segs_free(pcb, pcb->unacked); pcb->unacked = pcb->unsent = NULL; - pcb->last_unsent = NULL; + pcb->last_unacked = pcb->last_unsent = NULL; #if TCP_OVERSIZE pcb->unsent_oversize = 0; #endif /* TCP_OVERSIZE */ diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 269c28c44..ff9315a65 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -136,17 +136,11 @@ static struct pbuf *tcp_output_alloc_header(struct tcp_pcb *pcb, u16_t optlen, u err_t tcp_send_fin(struct tcp_pcb *pcb) { /* first, try to add the fin to the last unsent segment */ - if (pcb->unsent != NULL) { - struct tcp_seg *last_unsent; - for (last_unsent = pcb->unsent; last_unsent->next != NULL; - last_unsent = last_unsent->next) { - ; - } - - if ((TCPH_FLAGS(last_unsent->tcphdr) & (TCP_SYN | TCP_FIN | TCP_RST)) == 0) { + if (pcb->last_unsent != NULL) { + if ((TCPH_FLAGS(pcb->last_unsent->tcphdr) & (TCP_SYN | TCP_FIN | TCP_RST)) == 0) { /* no SYN/FIN/RST flag in the header, we can add the FIN flag */ - TCPH_SET_FLAG(last_unsent->tcphdr, TCP_FIN); - last_unsent->tcp_flags |= TCP_FIN; + TCPH_SET_FLAG(pcb->last_unsent->tcphdr, TCP_FIN); + pcb->last_unsent->tcp_flags |= TCP_FIN; pcb->flags |= TF_FIN; return ERR_OK; } @@ -460,21 +454,10 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, * pos records progress as data is segmented. */ - /* Find the tail of the unsent queue. 
*/ - if (pcb->unsent != NULL) { + if (pcb->last_unsent != NULL) { u16_t space; - u16_t unsent_optlen; - - if (!pcb->last_unsent || pcb->last_unsent->next) { - /* @todo: this could be sped up by keeping last_unsent in the pcb */ - for (pcb->last_unsent = pcb->unsent; pcb->last_unsent->next != NULL; - pcb->last_unsent = pcb->last_unsent->next) { - ; - } - } - /* Usable space at the end of the last unsent segment */ - unsent_optlen = LWIP_TCP_OPT_LENGTH(pcb->last_unsent->flags); + u16_t unsent_optlen = LWIP_TCP_OPT_LENGTH(pcb->last_unsent->flags); if ((pcb->last_unsent->p->type == type) && (mss_local > pcb->last_unsent->len + unsent_optlen) && (TCP_SEQ_GEQ(pcb->last_unsent->seqno, pcb->snd_nxt)) && @@ -550,7 +533,6 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, } } else { #if TCP_OVERSIZE - pcb->last_unsent = NULL; LWIP_ASSERT("unsent_oversize mismatch (pcb->unsent is NULL)", pcb->unsent_oversize == 0); #endif /* TCP_OVERSIZE */ } @@ -830,11 +812,7 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) if (pcb->unsent == NULL) { pcb->unsent = seg; } else { - struct tcp_seg *useg; - for (useg = pcb->unsent; useg->next != NULL; useg = useg->next) { - ; - } - useg->next = seg; + pcb->last_unsent->next = seg; } pcb->last_unsent = seg; #if TCP_OVERSIZE @@ -2011,23 +1989,18 @@ void tcp_rst(u32_t seqno, u32_t ackno, u16_t local_port, u16_t remote_port, stru */ void tcp_rexmit_rto(struct tcp_pcb *pcb) { - struct tcp_seg *seg; - if (pcb->unacked == NULL) { return; } /* Move all unacked segments to the head of the unsent queue */ - for (seg = pcb->unacked; seg->next != NULL; seg = seg->next) { - ; - } - /* concatenate unsent queue after unacked queue */ - seg->next = pcb->unsent; - if (pcb->unsent == NULL) { + if (pcb->unsent) { + pcb->last_unacked->next = pcb->unsent; + } else { /* If there are no unsent segments, update last_unsent to the last unacked */ - pcb->last_unsent = seg; + pcb->last_unsent = pcb->last_unacked; #if TCP_OVERSIZE && TCP_OVERSIZE_DBGCHECK - pcb->unsent_oversize = seg->oversize_left; + pcb->unsent_oversize = pcb->last_unacked->oversize_left; #endif /* TCP_OVERSIZE && TCP_OVERSIZE_DBGCHECK*/ } /* unsent queue is the concatenated queue (of unacked, unsent) */ From 8a9d5b27ec9b5c4f4957dced7cba8ba23bae1bdf Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Thu, 14 Dec 2023 12:59:05 +0200 Subject: [PATCH 039/169] issue: 3678579 Coverity Checking for last_unsent and unsent for NULL is the same. Signed-off-by: Iftah Levi --- src/core/lwip/tcp_out.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index ff9315a65..44e291b08 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -136,7 +136,7 @@ static struct pbuf *tcp_output_alloc_header(struct tcp_pcb *pcb, u16_t optlen, u err_t tcp_send_fin(struct tcp_pcb *pcb) { /* first, try to add the fin to the last unsent segment */ - if (pcb->last_unsent != NULL) { + if (pcb->unsent != NULL) { if ((TCPH_FLAGS(pcb->last_unsent->tcphdr) & (TCP_SYN | TCP_FIN | TCP_RST)) == 0) { /* no SYN/FIN/RST flag in the header, we can add the FIN flag */ TCPH_SET_FLAG(pcb->last_unsent->tcphdr, TCP_FIN); From 07203a25ca0346cf611df341c0beb0a9d74ec024 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sun, 3 Dec 2023 14:26:38 +0200 Subject: [PATCH 040/169] issue: 3690535 Remove SO_XLIO_RING_USER_MEMORY This is leftover from the Multi Packet RQ removal. In current code, this option doesn't have effect and only wastes 96 bytes per socket. 
Signed-off-by: Dmytro Podgornyi --- src/core/dev/net_device_val.cpp | 24 ++++-------------------- src/core/dev/net_device_val.h | 15 ++++----------- src/core/sock/sockinfo.cpp | 25 ------------------------- src/core/xlio_extra.h | 1 - 4 files changed, 8 insertions(+), 57 deletions(-) diff --git a/src/core/dev/net_device_val.cpp b/src/core/dev/net_device_val.cpp index ea35d9b21..ebf2bfbf7 100644 --- a/src/core/dev/net_device_val.cpp +++ b/src/core/dev/net_device_val.cpp @@ -72,30 +72,25 @@ ring_alloc_logic_attr::ring_alloc_logic_attr() : m_ring_alloc_logic(RING_LOGIC_PER_INTERFACE) - , m_user_id_key(0) , m_use_locks(true) + , m_user_id_key(0) { - m_mem_desc.iov_base = NULL; - m_mem_desc.iov_len = 0; init(); } ring_alloc_logic_attr::ring_alloc_logic_attr(ring_logic_t ring_logic, bool use_locks) : m_ring_alloc_logic(ring_logic) - , m_user_id_key(0) , m_use_locks(use_locks) + , m_user_id_key(0) { - m_mem_desc.iov_base = NULL; - m_mem_desc.iov_len = 0; init(); } ring_alloc_logic_attr::ring_alloc_logic_attr(const ring_alloc_logic_attr &other) : m_hash(other.m_hash) , m_ring_alloc_logic(other.m_ring_alloc_logic) - , m_user_id_key(other.m_user_id_key) - , m_mem_desc(other.m_mem_desc) , m_use_locks(other.m_use_locks) + , m_user_id_key(other.m_user_id_key) { } @@ -118,8 +113,6 @@ void ring_alloc_logic_attr::init() HASH_ITER(m_ring_alloc_logic, size_t); HASH_ITER(m_user_id_key, uint64_t); - HASH_ITER(m_mem_desc.iov_base, uintptr_t); - HASH_ITER(m_mem_desc.iov_len, size_t); HASH_ITER(m_use_locks, bool); m_hash = h; @@ -134,14 +127,6 @@ void ring_alloc_logic_attr::set_ring_alloc_logic(ring_logic_t logic) } } -void ring_alloc_logic_attr::set_memory_descriptor(iovec &mem_desc) -{ - if (m_mem_desc.iov_base != mem_desc.iov_base || m_mem_desc.iov_len != mem_desc.iov_len) { - m_mem_desc = mem_desc; - init(); - } -} - void ring_alloc_logic_attr::set_user_id_key(uint64_t user_id_key) { if (m_user_id_key != user_id_key) { @@ -162,8 +147,7 @@ const std::string ring_alloc_logic_attr::to_str() const { std::stringstream ss; - ss << "allocation logic " << m_ring_alloc_logic << " key " << m_user_id_key << " user address " - << m_mem_desc.iov_base << " user length " << m_mem_desc.iov_len << " use locks " + ss << "allocation logic " << m_ring_alloc_logic << " key " << m_user_id_key << " use locks " << !!m_use_locks; return ss.str(); diff --git a/src/core/dev/net_device_val.h b/src/core/dev/net_device_val.h index 8e3ec7d2f..5e3b94f29 100644 --- a/src/core/dev/net_device_val.h +++ b/src/core/dev/net_device_val.h @@ -60,21 +60,17 @@ class ring_alloc_logic_attr { ring_alloc_logic_attr(ring_logic_t ring_logic, bool use_locks); ring_alloc_logic_attr(const ring_alloc_logic_attr &other); void set_ring_alloc_logic(ring_logic_t logic); - void set_memory_descriptor(iovec &mem_desc); void set_user_id_key(uint64_t user_id_key); void set_use_locks(bool use_locks); const std::string to_str() const; inline ring_logic_t get_ring_alloc_logic() { return m_ring_alloc_logic; } - inline iovec *get_memory_descriptor() { return &m_mem_desc; } inline uint64_t get_user_id_key() { return m_user_id_key; } inline bool get_use_locks() { return m_use_locks; } bool operator==(const ring_alloc_logic_attr &other) const { return (m_ring_alloc_logic == other.m_ring_alloc_logic && - m_user_id_key == other.m_user_id_key && - m_mem_desc.iov_base == other.m_mem_desc.iov_base && - m_mem_desc.iov_len == other.m_mem_desc.iov_len && m_use_locks == other.m_use_locks); + m_user_id_key == other.m_user_id_key && m_use_locks == other.m_use_locks); } bool 
operator!=(const ring_alloc_logic_attr &other) const { return !(*this == other); } @@ -85,8 +81,6 @@ class ring_alloc_logic_attr { m_ring_alloc_logic = other.m_ring_alloc_logic; m_user_id_key = other.m_user_id_key; m_hash = other.m_hash; - m_mem_desc.iov_base = other.m_mem_desc.iov_base; - m_mem_desc.iov_len = other.m_mem_desc.iov_len; m_use_locks = other.m_use_locks; } return *this; @@ -101,12 +95,11 @@ class ring_alloc_logic_attr { private: size_t m_hash; - /* ring allocation logic , per thread per fd ... */ + /* Ring allocation logic: per thread, per interface, etc */ ring_logic_t m_ring_alloc_logic; - /* either user_idx or key as defined in ring_logic_t */ - uint64_t m_user_id_key; - iovec m_mem_desc; bool m_use_locks; + /* Either user_idx or key as defined in ring_logic_t */ + uint64_t m_user_id_key; void init(); }; diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 4b9844cf8..c78e881c0 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -395,31 +395,6 @@ int sockinfo::setsockopt(int __level, int __optname, const void *__optval, sockl errno = EINVAL; } break; - case SO_XLIO_RING_USER_MEMORY: - if (__optval) { - if (__optlen == sizeof(iovec)) { - iovec *attr = (iovec *)__optval; - m_ring_alloc_log_rx.set_memory_descriptor(*attr); - m_ring_alloc_logic_rx = - ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this); - if (m_p_rx_ring || m_rx_ring_map.size()) { - si_logwarn("user asked to assign memory for " - "RX ring but ring already exists"); - } - ret = SOCKOPT_INTERNAL_XLIO_SUPPORT; - } else { - ret = SOCKOPT_NO_XLIO_SUPPORT; - errno = EINVAL; - si_logdbg("SOL_SOCKET, SO_XLIO_RING_USER_MEMORY - " - "bad length expected %zu got %d", - sizeof(iovec), __optlen); - } - } else { - ret = SOCKOPT_NO_XLIO_SUPPORT; - errno = EINVAL; - si_logdbg("SOL_SOCKET, SO_XLIO_RING_USER_MEMORY - NOT HANDLED, optval == NULL"); - } - break; case SO_XLIO_FLOW_TAG: if (__optval) { if (__optlen == sizeof(uint32_t)) { diff --git a/src/core/xlio_extra.h b/src/core/xlio_extra.h index 600465f7d..f34f6e254 100644 --- a/src/core/xlio_extra.h +++ b/src/core/xlio_extra.h @@ -50,7 +50,6 @@ #define SO_XLIO_GET_API 2800 #define SO_XLIO_USER_DATA 2801 #define SO_XLIO_RING_ALLOC_LOGIC 2810 -#define SO_XLIO_RING_USER_MEMORY 2811 #define SO_XLIO_FLOW_TAG 2820 #define SO_XLIO_SHUTDOWN_RX 2821 #define SO_XLIO_PD 2822 From b12b93afdf3b5c24a1a30673da2a2b1c14edd00a Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sun, 3 Dec 2023 19:00:04 +0200 Subject: [PATCH 041/169] issue: 3690535 Reduce ring_allocation_logic size Remove multiple fields from the ring_allocation_logic class: - m_active is redundant. - m_type / m_owner are needed only for debug logs. 16 bytes per object is a high cost taking into account that ring migration is disabled by default. Also compact the structure. This patch saves 64 bytes per socket (sockinfo and dst_entry). 
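The "compact the structure" part relies on member ordering versus alignment padding; the stand-alone sketch below illustrates the principle only, with made-up member names rather than the actual class layout:

    #include <stdint.h>

    struct loose {            /* 4-byte and 8-byte members interleaved   */
        int      ratio;       /* 4 bytes + 4 bytes of padding            */
        uint64_t candidate;
        int      source;      /* 4 bytes + 4 bytes of padding            */
        uint64_t key;
    };                        /* typically sizeof == 32 on 64-bit Linux  */

    struct compact {          /* same members, same-size members grouped */
        int      ratio;
        int      source;
        uint64_t candidate;
        uint64_t key;
    };                        /* typically sizeof == 24 on 64-bit Linux  */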
Signed-off-by: Dmytro Podgornyi --- src/core/dev/ring_allocation_logic.cpp | 36 +++++--------------------- src/core/dev/ring_allocation_logic.h | 20 +++----------- src/core/proto/dst_entry.cpp | 4 +-- src/core/proto/neighbour.cpp | 2 +- src/core/sock/sockinfo.cpp | 18 ++++++------- 5 files changed, 22 insertions(+), 58 deletions(-) diff --git a/src/core/dev/ring_allocation_logic.cpp b/src/core/dev/ring_allocation_logic.cpp index 0546f064d..be8fce013 100644 --- a/src/core/dev/ring_allocation_logic.cpp +++ b/src/core/dev/ring_allocation_logic.cpp @@ -34,11 +34,6 @@ #define MODULE_NAME "ral" -#undef MODULE_HDR_INFO -#define MODULE_HDR_INFO MODULE_NAME "%s:%d:%s() " -#undef __INFO__ -#define __INFO__ to_str().c_str() - #define ral_logpanic __log_info_panic #define ral_logerr __log_info_err #define ral_logwarn __log_info_warn @@ -48,35 +43,27 @@ #define ral_logfuncall __log_info_funcall ring_allocation_logic::ring_allocation_logic() - : m_owner(NULL) - , m_ring_migration_ratio(0) - , m_source(-1) + : m_ring_migration_ratio(-1) , m_migration_try_count(0) + , m_source(-1) , m_migration_candidate(0) - , m_active(true) , m_res_key() { - m_type = ""; } ring_allocation_logic::ring_allocation_logic(ring_logic_t allocation_logic, int ring_migration_ratio, source_t source, resource_allocation_key &ring_profile) - : m_owner(NULL) - , m_ring_migration_ratio(ring_migration_ratio) - , m_source(source) + : m_ring_migration_ratio(ring_migration_ratio) , m_migration_try_count(ring_migration_ratio) + , m_source(source) { - m_type = ""; - if (ring_profile.get_ring_alloc_logic() == RING_LOGIC_PER_INTERFACE) { ring_profile.set_ring_alloc_logic(allocation_logic); } m_res_key = resource_allocation_key(ring_profile); m_migration_candidate = 0; m_res_key.set_user_id_key(calc_res_key_by_logic()); - - m_active = true; } /** @@ -152,7 +139,8 @@ bool ring_allocation_logic::should_migrate_ring() { ral_logfuncall("currently accessed from thread=%lu, cpu=%d", pthread_self(), sched_getcpu()); - if (false == m_active) { + if (m_ring_migration_ratio < 0) { + // Ring migration is disabled return false; } @@ -175,10 +163,7 @@ bool ring_allocation_logic::should_migrate_ring() m_migration_try_count = 0; if (!m_migration_candidate) { - // save current used allocation key - // no need to save profile, and allocation logic uint64_t curr_id = m_res_key.get_user_id_key(); - // calc new key uint64_t new_id = calc_res_key_by_logic(); if (new_id == curr_id || g_n_internal_thread_id == curr_id) { return false; @@ -194,15 +179,6 @@ bool ring_allocation_logic::should_migrate_ring() return true; } -const std::string ring_allocation_logic::to_str() const -{ - std::stringstream ss; - - ss << '[' << m_type << '=' << m_owner << ']'; - - return ss.str(); -} - cpu_manager g_cpu_manager; __thread int g_n_thread_cpu_core = NO_CPU; diff --git a/src/core/dev/ring_allocation_logic.h b/src/core/dev/ring_allocation_logic.h index 203b632aa..87f2899d4 100644 --- a/src/core/dev/ring_allocation_logic.h +++ b/src/core/dev/ring_allocation_logic.h @@ -98,19 +98,13 @@ class ring_allocation_logic { } uint64_t calc_res_key_by_logic(); inline ring_logic_t get_alloc_logic_type() { return m_res_key.get_ring_alloc_logic(); } - inline void enable_migration(bool active) { m_active = active; } - const std::string to_str() const; - -protected: - const char *m_type; - const void *m_owner; + inline void disable_migration() { m_ring_migration_ratio = -1; } private: int m_ring_migration_ratio; - source_t m_source; int m_migration_try_count; + source_t m_source; uint64_t 
m_migration_candidate; - bool m_active; resource_allocation_key m_res_key; }; @@ -120,13 +114,10 @@ class ring_allocation_logic_rx : public ring_allocation_logic { : ring_allocation_logic() { } - ring_allocation_logic_rx(source_t source, resource_allocation_key &ring_profile, - const void *owner) + ring_allocation_logic_rx(source_t source, resource_allocation_key &ring_profile) : ring_allocation_logic(safe_mce_sys().ring_allocation_logic_rx, safe_mce_sys().ring_migration_ratio_rx, source, ring_profile) { - m_type = "Rx"; - m_owner = owner; } }; @@ -136,13 +127,10 @@ class ring_allocation_logic_tx : public ring_allocation_logic { : ring_allocation_logic() { } - ring_allocation_logic_tx(source_t source, resource_allocation_key &ring_profile, - const void *owner) + ring_allocation_logic_tx(source_t source, resource_allocation_key &ring_profile) : ring_allocation_logic(safe_mce_sys().ring_allocation_logic_tx, safe_mce_sys().ring_migration_ratio_tx, source, ring_profile) { - m_type = "Tx"; - m_owner = owner; } }; diff --git a/src/core/proto/dst_entry.cpp b/src/core/proto/dst_entry.cpp index 1ead1adf4..8fe855171 100644 --- a/src/core/proto/dst_entry.cpp +++ b/src/core/proto/dst_entry.cpp @@ -60,7 +60,7 @@ dst_entry::dst_entry(const sock_addr &dst, uint16_t src_port, socket_data &sock_ , m_so_bindtodevice_ip(in6addr_any) , m_route_src_ip(in6addr_any) , m_pkt_src_ip(in6addr_any) - , m_ring_alloc_logic_tx(sock_data.fd, ring_alloc_logic, this) + , m_ring_alloc_logic_tx(sock_data.fd, ring_alloc_logic) , m_p_tx_mem_buf_desc_list(NULL) , m_p_zc_mem_buf_desc_list(NULL) , m_b_tx_mem_buf_desc_list_pending(false) @@ -833,7 +833,7 @@ bool dst_entry::update_ring_alloc_logic(int fd, lock_base &socket_lock, { resource_allocation_key old_key(*m_ring_alloc_logic_tx.get_key()); - m_ring_alloc_logic_tx = ring_allocation_logic_tx(fd, ring_alloc_logic, this); + m_ring_alloc_logic_tx = ring_allocation_logic_tx(fd, ring_alloc_logic); if (*m_ring_alloc_logic_tx.get_key() != old_key) { std::lock_guard locker(m_tx_migration_lock); diff --git a/src/core/proto/neighbour.cpp b/src/core/proto/neighbour.cpp index 20668892b..5bcb9e662 100644 --- a/src/core/proto/neighbour.cpp +++ b/src/core/proto/neighbour.cpp @@ -221,7 +221,7 @@ neigh_entry::neigh_entry(neigh_key key, transport_type_t _type, bool is_init_res // Allocate one ring for g_p_neigh_table_mgr. All eigh_entry objects will share the same ring. 
ring_alloc_logic_attr ring_attr(RING_LOGIC_PER_OBJECT, true); - m_ring_allocation_logic = ring_allocation_logic_tx(g_p_neigh_table_mgr, ring_attr, this); + m_ring_allocation_logic = ring_allocation_logic_tx(g_p_neigh_table_mgr, ring_attr); if (is_init_resources) { m_p_ring = m_p_dev->reserve_ring(m_ring_allocation_logic.get_key()); diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index c78e881c0..050772c42 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -113,7 +113,7 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) m_fd_context = (void *)((uintptr_t)m_fd); } - m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this); + m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx); m_p_socket_stats = &m_socket_stats; // Save stats as local copy and allow state publisher to // copy from this location @@ -180,7 +180,7 @@ void sockinfo::socket_stats_init(void) m_p_socket_stats->ring_alloc_logic_tx = m_ring_alloc_log_tx.get_ring_alloc_logic(); m_p_socket_stats->ring_user_id_rx = m_ring_alloc_logic_rx.calc_res_key_by_logic(); m_p_socket_stats->ring_user_id_tx = - ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx, this).calc_res_key_by_logic(); + ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx).calc_res_key_by_logic(); m_p_socket_stats->sa_family = m_family; } @@ -276,7 +276,7 @@ int sockinfo::set_ring_attr(xlio_ring_alloc_logic_attr *attr) update_header_field(&du); m_p_socket_stats->ring_alloc_logic_tx = m_ring_alloc_log_tx.get_ring_alloc_logic(); m_p_socket_stats->ring_user_id_tx = - ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx, this).calc_res_key_by_logic(); + ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx).calc_res_key_by_logic(); } if ((attr->comp_mask & XLIO_RING_ALLOC_MASK_RING_INGRESS) && attr->ingress) { ring_alloc_logic_attr old_key(*m_ring_alloc_logic_rx.get_key()); @@ -284,7 +284,7 @@ int sockinfo::set_ring_attr(xlio_ring_alloc_logic_attr *attr) if (set_ring_attr_helper(&m_ring_alloc_log_rx, attr)) { return SOCKOPT_NO_XLIO_SUPPORT; } - m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this); + m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx); if (m_rx_nd_map.size()) { std::lock_guard locker(m_rx_migration_lock); @@ -314,7 +314,7 @@ void sockinfo::set_ring_logic_rx(ring_alloc_logic_attr ral) { if (m_rx_ring_map.empty()) { m_ring_alloc_log_rx = ral; - m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this); + m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx); m_p_socket_stats->ring_alloc_logic_rx = m_ring_alloc_log_rx.get_ring_alloc_logic(); m_p_socket_stats->ring_user_id_rx = m_ring_alloc_logic_rx.calc_res_key_by_logic(); } @@ -326,7 +326,7 @@ void sockinfo::set_ring_logic_tx(ring_alloc_logic_attr ral) m_ring_alloc_log_tx = ral; m_p_socket_stats->ring_alloc_logic_tx = m_ring_alloc_log_tx.get_ring_alloc_logic(); m_p_socket_stats->ring_user_id_tx = - ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx, this).calc_res_key_by_logic(); + ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx).calc_res_key_by_logic(); } } @@ -1162,7 +1162,7 @@ void sockinfo::do_rings_migration_rx(resource_allocation_key &old_key) if (rc < 0) { si_logerr("Failed to release ring for allocation key %s", old_key.to_str().c_str()); new_key->set_user_id_key(old_calc_id); - m_ring_alloc_logic_rx.enable_migration(false); + 
m_ring_alloc_logic_rx.disable_migration();
             si_logwarn("Migration is disabled due to failure");
         }
         lock_rx_q();
@@ -1175,7 +1175,7 @@ void sockinfo::do_rings_migration_rx(resource_allocation_key &old_key)
             si_logerr("Failed to reserve ring for allocation key %s on lip %s",
                       new_key->to_str().c_str(), ip_local.to_str().c_str());
             new_key->set_user_id_key(old_calc_id);
-            m_ring_alloc_logic_rx.enable_migration(false);
+            m_ring_alloc_logic_rx.disable_migration();
             si_logwarn("Migration is disabled due to failure");
             lock_rx_q();
             rx_nd_iter++;
@@ -1233,7 +1233,7 @@ void sockinfo::do_rings_migration_rx(resource_allocation_key &old_key)
                 si_logerr("Failed to reserve ring for allocation key %s on lip %s",
                           new_key->to_str().c_str(), ip_local.to_str(m_family).c_str());
                 new_key->set_user_id_key(old_calc_id);
-                m_ring_alloc_logic_rx.enable_migration(false);
+                m_ring_alloc_logic_rx.disable_migration();
                 si_logwarn("Migration is disabled due to failure");
                 lock_rx_q();
                 rx_nd_iter++;

From 8ec6c52b44b95334f6e0c90e52dc48180103d085 Mon Sep 17 00:00:00 2001
From: Dmytro Podgornyi
Date: Fri, 8 Dec 2023 17:09:39 +0200
Subject: [PATCH 042/169] issue: 3690535 Improve condition of ring migration support

Ring migration is disabled in most scenarios, so improve
is_logic_support_migration(): check first whether migration is disabled.

Signed-off-by: Dmytro Podgornyi
---
 src/core/dev/ring_allocation_logic.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/core/dev/ring_allocation_logic.h b/src/core/dev/ring_allocation_logic.h
index 87f2899d4..2a505a6b6 100644
--- a/src/core/dev/ring_allocation_logic.h
+++ b/src/core/dev/ring_allocation_logic.h
@@ -93,8 +93,9 @@ class ring_allocation_logic {
     bool should_migrate_ring();
     bool is_logic_support_migration()
     {
-        return m_res_key.get_ring_alloc_logic() >= RING_LOGIC_PER_THREAD &&
-            m_res_key.get_ring_alloc_logic() < RING_LOGIC_PER_OBJECT && m_ring_migration_ratio > 0;
+        return m_ring_migration_ratio > 0 &&
+            m_res_key.get_ring_alloc_logic() >= RING_LOGIC_PER_THREAD &&
+            m_res_key.get_ring_alloc_logic() < RING_LOGIC_PER_OBJECT;
     }
     uint64_t calc_res_key_by_logic();
     inline ring_logic_t get_alloc_logic_type() { return m_res_key.get_ring_alloc_logic(); }

From 1e92b28860bb118e298c0493948ae0eeb18b266f Mon Sep 17 00:00:00 2001
From: Dmytro Podgornyi
Date: Fri, 8 Dec 2023 17:57:08 +0200
Subject: [PATCH 043/169] issue: 3690535 Print ring allocation logic type in logs

Add to_str() which returns a string containing the "this" pointer. Also,
print the type in the constructor, so each further debug message can be
identified by the pointer.
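As a rough illustration of the intent (this sketch is not part of the patch
and uses made-up names instead of the XLIO logging macros), tagging every
message with the "[this]" string lets Rx and Tx instances be told apart in a
mixed debug trace:

    // Standalone, hypothetical sketch only: mirrors the "[<this>]" tag and the
    // "Type Rx"/"Type Tx" print added by this patch, with printf standing in
    // for ral_logdbg() and the vlogger header.
    #include <cstdio>
    #include <sstream>
    #include <string>

    struct ral_sketch {
        std::string to_str() const
        {
            std::stringstream ss;
            ss << '[' << this << ']'; // same pattern as the new to_str()
            return ss.str();
        }
        void debug_print_type(const char *type)
        {
            std::printf("ral %s: Type %s\n", to_str().c_str(), type);
        }
    };

    int main()
    {
        ral_sketch rx, tx;
        rx.debug_print_type("Rx"); // e.g. "ral [0x7ffc...]: Type Rx"
        tx.debug_print_type("Tx"); // different address, so lines are attributable
        return 0;
    }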
Signed-off-by: Dmytro Podgornyi --- src/core/dev/ring_allocation_logic.cpp | 30 +++++++++++++++++++++----- src/core/dev/ring_allocation_logic.h | 8 +++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/core/dev/ring_allocation_logic.cpp b/src/core/dev/ring_allocation_logic.cpp index be8fce013..2a5660f6c 100644 --- a/src/core/dev/ring_allocation_logic.cpp +++ b/src/core/dev/ring_allocation_logic.cpp @@ -34,6 +34,11 @@ #define MODULE_NAME "ral" +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "%s:%d:%s() " +#undef __INFO__ +#define __INFO__ to_str().c_str() + #define ral_logpanic __log_info_panic #define ral_logerr __log_info_err #define ral_logwarn __log_info_warn @@ -105,7 +110,7 @@ uint64_t ring_allocation_logic::calc_res_key_by_logic() break; default: // not suppose to get here - ral_logdbg("non-valid ring logic = %d", m_res_key.get_ring_alloc_logic()); + ral_logdbg("Non-valid ring logic = %d", m_res_key.get_ring_alloc_logic()); break; BULLSEYE_EXCLUDE_BLOCK_END } @@ -137,7 +142,7 @@ resource_allocation_key *ring_allocation_logic::create_new_key(const ip_address */ bool ring_allocation_logic::should_migrate_ring() { - ral_logfuncall("currently accessed from thread=%lu, cpu=%d", pthread_self(), sched_getcpu()); + ral_logfuncall("Currently accessed from thread=%lu, cpu=%d", pthread_self(), sched_getcpu()); if (m_ring_migration_ratio < 0) { // Ring migration is disabled @@ -172,13 +177,28 @@ bool ring_allocation_logic::should_migrate_ring() return false; } - ral_logdbg("migrating from ring of id=%s to ring of id=%lu", m_res_key.to_str().c_str(), + ral_logdbg("Migrating from ring of id=%s to ring of id=%lu", m_res_key.to_str().c_str(), m_migration_candidate); m_migration_candidate = 0; return true; } +const std::string ring_allocation_logic::to_str() const +{ + std::stringstream ss; + + ss << '[' << this << ']'; + + return ss.str(); +} + +void ring_allocation_logic::debug_print_type(const char *type) +{ + ral_logdbg("Type %s", type); + NOT_IN_USE(type); // Suppress --enable-opt-log=high warning +} + cpu_manager g_cpu_manager; __thread int g_n_thread_cpu_core = NO_CPU; @@ -215,7 +235,7 @@ int cpu_manager::reserve_cpu_for_thread(pthread_t tid, int suggested_cpu /* = NO int avail_cpus = CPU_COUNT(&cpu_set); if (avail_cpus == 0) { unlock(); - __log_err("no cpu available for tid=%lu", tid); + __log_err("No cpu available for tid=%lu", tid); return -1; } @@ -240,7 +260,7 @@ int cpu_manager::reserve_cpu_for_thread(pthread_t tid, int suggested_cpu /* = NO } CPU_ZERO(&cpu_set); CPU_SET(cpu, &cpu_set); - __log_dbg("attach tid=%lu running on cpu=%d to cpu=%d", tid, sched_getcpu(), cpu); + __log_dbg("Attach tid=%lu running on cpu=%d to cpu=%d", tid, sched_getcpu(), cpu); ret = pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpu_set); if (ret) { unlock(); diff --git a/src/core/dev/ring_allocation_logic.h b/src/core/dev/ring_allocation_logic.h index 2a505a6b6..af15f1cb7 100644 --- a/src/core/dev/ring_allocation_logic.h +++ b/src/core/dev/ring_allocation_logic.h @@ -84,6 +84,8 @@ class ring_allocation_logic { ring_allocation_logic(ring_logic_t ring_allocation_logic, int ring_migration_ratio, source_t source, resource_allocation_key &ring_profile); + void debug_print_type(const char *type); + public: /* careful, you'll lose the previous key !! 
*/ resource_allocation_key *create_new_key(const ip_address &addr, int suggested_cpu = NO_CPU); @@ -101,6 +103,8 @@ class ring_allocation_logic { inline ring_logic_t get_alloc_logic_type() { return m_res_key.get_ring_alloc_logic(); } inline void disable_migration() { m_ring_migration_ratio = -1; } + const std::string to_str() const; + private: int m_ring_migration_ratio; int m_migration_try_count; @@ -114,11 +118,13 @@ class ring_allocation_logic_rx : public ring_allocation_logic { ring_allocation_logic_rx() : ring_allocation_logic() { + debug_print_type("Rx"); } ring_allocation_logic_rx(source_t source, resource_allocation_key &ring_profile) : ring_allocation_logic(safe_mce_sys().ring_allocation_logic_rx, safe_mce_sys().ring_migration_ratio_rx, source, ring_profile) { + debug_print_type("Rx"); } }; @@ -127,11 +133,13 @@ class ring_allocation_logic_tx : public ring_allocation_logic { ring_allocation_logic_tx() : ring_allocation_logic() { + debug_print_type("Tx"); } ring_allocation_logic_tx(source_t source, resource_allocation_key &ring_profile) : ring_allocation_logic(safe_mce_sys().ring_allocation_logic_tx, safe_mce_sys().ring_migration_ratio_tx, source, ring_profile) { + debug_print_type("Tx"); } }; From 871ff340342b9f76d877bf7ebf81eadf24484010 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Tue, 30 Jan 2024 08:09:03 +0200 Subject: [PATCH 044/169] issue: 3690535 Remove unused fields in sockinfo_tcp Fields m_n_pbufs_rcvd and m_n_pbufs_freed are unused. Signed-off-by: Dmytro Podgornyi --- src/core/sock/sockinfo_tcp.cpp | 2 -- src/core/sock/sockinfo_tcp.h | 4 ---- 2 files changed, 6 deletions(-) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 3bcbbf525..1a7f1482b 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -337,8 +337,6 @@ sockinfo_tcp::sockinfo_tcp(int fd, int domain) tcp_err(&m_pcb, sockinfo_tcp::err_lwip_cb); tcp_sent(&m_pcb, sockinfo_tcp::ack_recvd_lwip_cb); - m_n_pbufs_rcvd = m_n_pbufs_freed = 0; - m_parent = NULL; m_iomux_ready_fd_array = NULL; diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 76df75e83..5f21d62ab 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -588,10 +588,6 @@ class sockinfo_tcp : public sockinfo, public timer_handler { uint64_t m_user_huge_page_mask; unsigned m_required_send_block; uint16_t m_external_vlan_tag = 0U; - - // stats - uint64_t m_n_pbufs_rcvd; - uint64_t m_n_pbufs_freed; }; typedef struct tcp_seg tcp_seg; From 46d417252ca481ae25c32279e25856cc823203f1 Mon Sep 17 00:00:00 2001 From: Viacheslav Login Date: Wed, 24 Jan 2024 22:16:08 +0200 Subject: [PATCH 045/169] [CI] Coverity: add snapshot action Jira task: HPCINFRA-1430 Signed-off-by: Viacheslav Login --- .ci/matrix_job.yaml | 4 ++++ .ci/opensource_jjb.yaml | 4 ++++ contrib/jenkins_tests/cov.sh | 10 ++++++++++ 3 files changed, 18 insertions(+) diff --git a/.ci/matrix_job.yaml b/.ci/matrix_job.yaml index 506286445..e70562c22 100644 --- a/.ci/matrix_job.yaml +++ b/.ci/matrix_job.yaml @@ -14,6 +14,9 @@ kubernetes: limits: '{memory: 8Gi, cpu: 7000m}' requests: '{memory: 8Gi, cpu: 7000m}' +credentials: + - {credentialsId: '925b0900-e273-4042-bc7c-facaefae0727', usernameVariable: 'XLIO_COV_USER', passwordVariable: 'XLIO_COV_PASSWORD'} + volumes: - {mountPath: /hpc/local/bin, hostPath: /hpc/local/bin} - {mountPath: /hpc/local/oss, hostPath: /hpc/local/oss} @@ -195,6 +198,7 @@ steps: - name: Coverity enable: ${do_coverity} + credentialsId: 
'925b0900-e273-4042-bc7c-facaefae0727' containerSelector: - "{name: 'toolbox', category: 'tool'}" agentSelector: diff --git a/.ci/opensource_jjb.yaml b/.ci/opensource_jjb.yaml index de5934f8e..c6989764b 100644 --- a/.ci/opensource_jjb.yaml +++ b/.ci/opensource_jjb.yaml @@ -74,6 +74,10 @@ name: "do_coverity" default: true description: "Launch coverity verification." + - bool: + name: "do_coverity_snapshot" + default: false + description: "Submit Coverity Static Analysis as a snapshot (normally it should be checked only for master branch after proper defects review)" - bool: name: "do_test" default: true diff --git a/contrib/jenkins_tests/cov.sh b/contrib/jenkins_tests/cov.sh index 30bc72778..95c792539 100755 --- a/contrib/jenkins_tests/cov.sh +++ b/contrib/jenkins_tests/cov.sh @@ -46,6 +46,16 @@ eval "cov-analyze --config ${cov_dir}/coverity_config.xml \ --dir ${cov_build}" rc=$(($rc+$?)) +if [[ "${do_coverity_snapshot}" == true ]]; then + cov-commit-defects --ssl --on-new-cert trust \ + --url https://coverity.mellanox.com:8443 \ + --user "${XLIO_COV_USER}" --password "${XLIO_COV_PASSWORD}" \ + --dir "${cov_build}" \ + --stream libxlio-main \ + --strip-path "${WORKSPACE}" + rc=$(($rc+$?)) +fi + set -eE # Excluded files for the local report generated by command "cov-format-errors": From 70467ebedbde15e8f37e38a0f7007a32c62bb02d Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Mon, 13 Nov 2023 12:21:40 +0200 Subject: [PATCH 046/169] issue: 3668182 Add tcp_write_zc/tcp_prealloc_zc Signed-off-by: Alex Briskin --- src/core/lwip/tcp_impl.h | 1 + src/core/lwip/tcp_in.c | 1 + src/core/lwip/tcp_out.c | 146 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+) diff --git a/src/core/lwip/tcp_impl.h b/src/core/lwip/tcp_impl.h index fad6e9322..f76f0c22c 100644 --- a/src/core/lwip/tcp_impl.h +++ b/src/core/lwip/tcp_impl.h @@ -296,6 +296,7 @@ struct tcp_seg { #define TF_SEG_OPTS_ZEROCOPY (u8_t) TCP_WRITE_ZEROCOPY /* Use zerocopy send mode */ u8_t tcp_flags; /* Cached TCP flags for outgoing segments */ + u8_t bufs; /* To avoid pbuf_clen() */ /* L2+L3+TCP header for zerocopy segments, it must have enough room for options This should have enough space for L2 (ETH+vLAN), L3 (IPv4/6), L4 (TCP) diff --git a/src/core/lwip/tcp_in.c b/src/core/lwip/tcp_in.c index c4784c698..e93d469b5 100644 --- a/src/core/lwip/tcp_in.c +++ b/src/core/lwip/tcp_in.c @@ -176,6 +176,7 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) in_data.inseg.seqno = in_data.seqno; in_data.inseg.flags = 0; in_data.inseg.tcp_flags = in_data.flags; + in_data.inseg.bufs = 0; in_data.recv_data = NULL; in_data.recv_flags = 0; diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 44e291b08..b61314e1a 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -196,6 +196,7 @@ static struct tcp_seg *tcp_create_segment(struct tcp_pcb *pcb, struct pbuf *p, u seg->flags = optflags; seg->tcp_flags = flags; + seg->bufs = 0; seg->p = p; seg->len = p->tot_len - optlen; seg->seqno = seqno; @@ -228,6 +229,33 @@ static struct tcp_seg *tcp_create_segment(struct tcp_pcb *pcb, struct pbuf *p, u return seg; } +/** + * Allocate a PBUF_RAM pbuf, perhaps with oversize space at the end. + * + * This function is like pbuf_alloc(layer, length, PBUF_RAM) except + * there may be extra bytes available at the end. + * + * @param length size of the pbuf's payload. + * @param oversize pointer to a u16_t that will receive the number of usable tail bytes. + * @param pcb The TCP connection that willo enqueue the pbuf. 
+ * @param + */ +static struct pbuf *tcp_pbuf_prealloc_zc(u16_t length, u16_t *oversize, struct tcp_pcb *pcb, + pbuf_type type, pbuf_desc *desc, struct pbuf *p_buff) +{ + struct pbuf *p; + + p = tcp_tx_pbuf_alloc(pcb, 0, type, desc, p_buff); + if (p == NULL) { + return NULL; + } + LWIP_ASSERT("need unchained pbuf", p->next == NULL); + *oversize = p->len - length; + /* trim p->len to the currently used size */ + p->len = p->tot_len = length; + return p; +} + /** * Allocate a PBUF_RAM pbuf, perhaps with extra space at the end. * @@ -720,6 +748,124 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, return ERR_MEM; } +err_t tcp_write_zc(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *desc) +{ + struct pbuf *p; + struct tcp_seg *seg = NULL, *prev_seg = NULL, *queue = NULL; + u32_t pos = 0; /* position in 'arg' data */ + u8_t optflags = TF_SEG_OPTS_ZEROCOPY; + u16_t oversize = 0; + const u16_t mss_local = lwip_zc_tx_size; + u16_t seglen; + u16_t queuelen = 0; + + if (len < pcb->mss) { + const int byte_queued = pcb->snd_nxt - pcb->lastack; + pcb->snd_sml_add = (pcb->unacked ? pcb->unacked->len : 0) + byte_queued; + } + + /* + * TCP segmentation is done in two phases with increasing complexity: + * + * 1. Chain a new pbuf to the end of pcb->unsent. + * 2. Create new segments. + * + * We may run out of memory at any point. In that case we must + * return ERR_MEM and not change anything in pcb. Therefore, all + * changes are recorded in local variables and committed at the end + * of the function. Some pcb fields are maintained in local copies: + * + * queuelen = pcb->snd_queuelen + * oversize = pcb->unsent_oversize + * + * These variables are set consistently by the phases: + * + * seg points to the last segment tampered with. + * + * pos records progress as data is segmented. + */ + + /* Find the tail of the unsent queue. */ + if (pcb->unsent != NULL && (pcb->last_unsent->flags & TF_SEG_OPTS_ZEROCOPY)) { + u16_t space = mss_local - pcb->last_unsent->len; + seg = pcb->last_unsent; + + if (space > 0 && (space >= len || len > mss_local) && + (pcb->last_unsent->bufs < pcb->tso.max_send_sge)) { + seglen = space < len ? space : len; + + if ((p = tcp_pbuf_prealloc_zc(seglen, &oversize, pcb, PBUF_ZEROCOPY, desc, NULL)) == + NULL) { + goto memerr; + } + p->payload = (u8_t *)arg; + pbuf_cat(pcb->last_unsent->p, p); + pcb->last_unsent->len += p->tot_len; + pcb->last_unsent->bufs += 1U; + pos += seglen; + queuelen += 1U; + } + } else { + pcb->last_unsent = NULL; + } + + while (pos < len) { + u32_t left = len - pos; + seglen = left > mss_local ? mss_local : left; + + if ((p = tcp_pbuf_prealloc_zc(seglen, &oversize, pcb, PBUF_ZEROCOPY, desc, NULL)) == NULL) { + goto memerr; + } + p->payload = (u8_t *)arg + pos; + queuelen += 1; + + if ((seg = tcp_create_segment(pcb, p, 0, pcb->snd_lbb + pos, optflags)) == NULL) { + tcp_tx_pbuf_free(pcb, p); + goto memerr; + } + + if (queue == NULL) { + queue = seg; + } else { + prev_seg->next = seg; + } + prev_seg = seg; + + pos += seglen; + } + +#if TCP_OVERSIZE + pcb->unsent_oversize = 0; +#endif /* TCP_OVERSIZE */ + + if (pcb->last_unsent == NULL) { + pcb->unsent = queue; + } else { + pcb->last_unsent->next = queue; + } + pcb->last_unsent = seg; + + /* + * Finally update the pcb state. + */ + pcb->snd_lbb += len; + pcb->snd_buf -= len; + pcb->snd_queuelen += queuelen; + + /* Set the PSH flag in the last segment that we enqueued. 
*/ + if (enable_push_flag && seg != NULL && seg->tcphdr != NULL) { + TCPH_SET_FLAG(seg->tcphdr, TCP_PSH); + } + + return ERR_OK; +memerr: + pcb->flags |= TF_NAGLEMEMERR; + if (queue != NULL) { + tcp_tx_segs_free(pcb, queue); + } + return ERR_MEM; +} + /** * Enqueue TCP options for transmission. * From 4713505252ab343d2f3bf0f373bcb315058542b1 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Mon, 13 Nov 2023 12:59:24 +0200 Subject: [PATCH 047/169] issue: 3668182 Connect tcp_write_zc to sockinfo_tcp::tcp_tx Signed-off-by: Alex Briskin --- src/core/lwip/pbuf.h | 1 + src/core/lwip/tcp.h | 1 + src/core/sock/sockinfo_tcp.cpp | 6 +++++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/core/lwip/pbuf.h b/src/core/lwip/pbuf.h index 4a4cbd6fb..5f6a69d9a 100644 --- a/src/core/lwip/pbuf.h +++ b/src/core/lwip/pbuf.h @@ -60,6 +60,7 @@ enum { PBUF_DESC_STRIDE, PBUF_DESC_TLS_RX, PBUF_DESC_NVME_TX, + PBUF_DESC_EXPRESS, }; typedef struct { diff --git a/src/core/lwip/tcp.h b/src/core/lwip/tcp.h index bd231378c..42ee10c35 100644 --- a/src/core/lwip/tcp.h +++ b/src/core/lwip/tcp.h @@ -480,6 +480,7 @@ err_t tcp_shutdown(struct tcp_pcb *pcb, int shut_rx, int shut_tx); err_t tcp_write(struct tcp_pcb *pcb, const void *dataptr, u32_t len, u16_t apiflags, pbuf_desc *desc); +err_t tcp_write_zc(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *desc); #define TCP_PRIO_MIN 1 #define TCP_PRIO_NORMAL 64 diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 1a7f1482b..b689ab926 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -1114,7 +1114,11 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) } } - err = tcp_write(&m_pcb, tx_ptr, tx_size, apiflags, &tx_arg.priv); + if (apiflags & XLIO_TX_PACKET_ZEROCOPY) { + err = tcp_write_zc(&m_pcb, tx_ptr, tx_size, &tx_arg.priv); + } else { + err = tcp_write(&m_pcb, tx_ptr, tx_size, apiflags, &tx_arg.priv); + } if (unlikely(err != ERR_OK)) { if (unlikely(err == ERR_CONN)) { // happens when remote drops during big write si_tcp_logdbg("connection closed: tx'ed = %d", total_tx); From 670058ed98173176cbea2f12abe8196bb561b981 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Mon, 13 Nov 2023 16:23:00 +0200 Subject: [PATCH 048/169] issue: 3668182 Remove PBUF_DESC_MAP for send zerocopy Signed-off-by: Alex Briskin --- src/core/dev/buffer_pool.h | 4 ---- src/core/dev/ring.h | 2 +- src/core/dev/ring_bond.h | 4 ++-- src/core/dev/ring_simple.cpp | 34 +++++++++++--------------------- src/core/dev/ring_simple.h | 2 +- src/core/dev/ring_tap.h | 3 +-- src/core/lwip/pbuf.h | 2 +- src/core/proto/dst_entry_tcp.cpp | 14 +++++-------- src/core/sock/sock-redirect.cpp | 2 +- 9 files changed, 24 insertions(+), 43 deletions(-) diff --git a/src/core/dev/buffer_pool.h b/src/core/dev/buffer_pool.h index eba576d7d..0c26f15c0 100644 --- a/src/core/dev/buffer_pool.h +++ b/src/core/dev/buffer_pool.h @@ -57,10 +57,6 @@ inline static void free_lwip_pbuf(struct pbuf_custom *pbuf_custom) pbuf_custom->pbuf.desc.attr == PBUF_DESC_NVME_TX) { mem_desc *mdesc = (mem_desc *)pbuf_custom->pbuf.desc.mdesc; mdesc->put(); - } else if ((pbuf_custom->pbuf.type == PBUF_ZEROCOPY) && - (pbuf_custom->pbuf.desc.attr == PBUF_DESC_MAP)) { - mapping_t *mapping = (mapping_t *)pbuf_custom->pbuf.desc.map; - mapping->put(); } if (p_desc->m_flags & mem_buf_desc_t::ZCOPY) { diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index c0554fa29..8251ef306 100644 --- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -134,7 +134,7 @@ class ring { const 
ip_address &src_ip, const ip_address &dst_ip, uint16_t src_port, uint16_t dst_port) = 0; virtual int modify_ratelimit(struct xlio_rate_limit_t &rate_limit) = 0; - virtual uint32_t get_tx_user_lkey(void *addr, size_t length, void *p_mapping = NULL) = 0; + virtual uint32_t get_tx_user_lkey(void *addr, size_t length) = 0; virtual uint32_t get_max_inline_data() = 0; virtual uint32_t get_max_send_sge(void) = 0; virtual uint32_t get_max_payload_sz(void) = 0; diff --git a/src/core/dev/ring_bond.h b/src/core/dev/ring_bond.h index e38bbe025..680317abb 100644 --- a/src/core/dev/ring_bond.h +++ b/src/core/dev/ring_bond.h @@ -92,9 +92,9 @@ class ring_bond : public ring { virtual bool get_hw_dummy_send_support(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe); virtual int modify_ratelimit(struct xlio_rate_limit_t &rate_limit); /* XXX TODO We have to support ring_bond for zerocopy. */ - virtual uint32_t get_tx_user_lkey(void *addr, size_t length, void *p_mapping = NULL) + virtual uint32_t get_tx_user_lkey(void *addr, size_t length) { - NOT_IN_USE(p_mapping), NOT_IN_USE(addr); + NOT_IN_USE(addr); NOT_IN_USE(length); return LKEY_ERROR; } diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index f89d8da20..a086a9ae1 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -1162,38 +1162,28 @@ int ring_simple::modify_ratelimit(struct xlio_rate_limit_t &rate_limit) return 0; } -uint32_t ring_simple::get_tx_user_lkey(void *addr, size_t length, void *p_mapping /*=NULL*/) +uint32_t ring_simple::get_tx_user_lkey(void *addr, size_t length) { uint32_t lkey; /* - * Current implementation supports 2 modes: - * 1. Per ring registration cache where addr is the key - * 2. Proxy query to an external mapping object + * Current implementation supports a ring registration cache where addr is the key. * - * The 1st mode is used for send zerocopy and the 2nd made is used for - * sendfile offload. These 2 modes are differentiated by p_mapping - * value. It is NULL in the 1st case. + * The mode is used for send zerocopy. * - * TODO In the 1st mode we don't support memory deregistration. + * TODO The mode doesnn't support memory deregistration. */ - if (p_mapping == NULL) { - auto iter = m_user_lkey_map.find(addr); - if (iter != m_user_lkey_map.end()) { - lkey = iter->second; + auto iter = m_user_lkey_map.find(addr); + if (iter != m_user_lkey_map.end()) { + lkey = iter->second; + } else { + lkey = m_p_ib_ctx->user_mem_reg(addr, length, XLIO_IBV_ACCESS_LOCAL_WRITE); + if (lkey == LKEY_ERROR) { + ring_logerr("Can't register user memory addr %p len %lx", addr, length); } else { - lkey = m_p_ib_ctx->user_mem_reg(addr, length, XLIO_IBV_ACCESS_LOCAL_WRITE); - if (lkey == LKEY_ERROR) { - ring_logerr("Can't register user memory addr %p len %lx", addr, length); - } else { - m_user_lkey_map[addr] = lkey; - } + m_user_lkey_map[addr] = lkey; } - } else { - mapping_t *mapping = (mapping_t *)p_mapping; - lkey = mapping->get_lkey(NULL, m_p_ib_ctx, addr, length); } - return lkey; } diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index 91376e17b..08b211769 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -113,7 +113,7 @@ class ring_simple : public ring_slave { { return m_p_tx_comp_event_channel ? 
m_p_tx_comp_event_channel->fd : -1; } - uint32_t get_tx_user_lkey(void *addr, size_t length, void *p_mapping = NULL) override; + uint32_t get_tx_user_lkey(void *addr, size_t length) override; uint32_t get_max_inline_data() override; ib_ctx_handler *get_ctx(ring_user_id_t id) override { diff --git a/src/core/dev/ring_tap.h b/src/core/dev/ring_tap.h index 9077ce26a..013ac8922 100644 --- a/src/core/dev/ring_tap.h +++ b/src/core/dev/ring_tap.h @@ -99,9 +99,8 @@ class ring_tap : public ring_slave { return 0; } void inc_cq_moderation_stats(size_t sz_data) { NOT_IN_USE(sz_data); } - virtual uint32_t get_tx_user_lkey(void *addr, size_t length, void *p_mapping = NULL) + virtual uint32_t get_tx_user_lkey(void *addr, size_t length) { - NOT_IN_USE(p_mapping); NOT_IN_USE(addr); NOT_IN_USE(length); return LKEY_ERROR; diff --git a/src/core/lwip/pbuf.h b/src/core/lwip/pbuf.h index 5f6a69d9a..11ddc528a 100644 --- a/src/core/lwip/pbuf.h +++ b/src/core/lwip/pbuf.h @@ -55,7 +55,6 @@ enum { PBUF_DESC_NONE = 0, PBUF_DESC_MDESC, PBUF_DESC_FD, - PBUF_DESC_MAP, PBUF_DESC_MKEY, PBUF_DESC_STRIDE, PBUF_DESC_TLS_RX, @@ -65,6 +64,7 @@ enum { typedef struct { int attr; + u32_t express_mkey; union { void *map; void *mdesc; diff --git a/src/core/proto/dst_entry_tcp.cpp b/src/core/proto/dst_entry_tcp.cpp index 1e82730a0..b921070d0 100644 --- a/src/core/proto/dst_entry_tcp.cpp +++ b/src/core/proto/dst_entry_tcp.cpp @@ -222,7 +222,9 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ m_sge[i].addr = (uintptr_t)p_tcp_iov[i].iovec.iov_base; m_sge[i].length = p_tcp_iov[i].iovec.iov_len; if (is_zerocopy) { - if (PBUF_DESC_MKEY == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr) { + if (PBUF_DESC_EXPRESS == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr) { + m_sge[i].lkey = p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.express_mkey; + } else if (PBUF_DESC_MKEY == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr) { /* PBUF_DESC_MKEY - value is provided by user */ m_sge[i].lkey = p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.mkey; } else if (PBUF_DESC_MDESC == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr || @@ -237,13 +239,11 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ /* Do not check desc.attr for specific type because * PBUF_DESC_FD - is not possible for XLIO_TX_PACKET_ZEROCOPY * PBUF_DESC_NONE - map should be initialized to NULL in - * dst_entry_tcp::get_buffer() PBUF_DESC_MAP - map should point on mapping - * object + * dst_entry_tcp::get_buffer() object */ masked_addr = (void *)((uint64_t)m_sge[i].addr & m_user_huge_page_mask); m_sge[i].lkey = - m_p_ring->get_tx_user_lkey(masked_addr, m_n_sysvar_user_huge_page_size, - p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.map); + m_p_ring->get_tx_user_lkey(masked_addr, m_n_sysvar_user_huge_page_size); } } else { m_sge[i].lkey = (i == 0 ? 
m_p_ring->get_tx_lkey(m_id) : m_sge[0].lkey); @@ -413,10 +413,6 @@ mem_buf_desc_t *dst_entry_tcp::get_buffer(pbuf_type type, pbuf_desc *desc, p_mem_buf_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_NVME_TX) { mem_desc *mdesc = (mem_desc *)p_mem_buf_desc->lwip_pbuf.pbuf.desc.mdesc; mdesc->get(); - } else if (p_mem_buf_desc->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY && - (p_mem_buf_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_MAP)) { - mapping_t *mapping = (mapping_t *)p_mem_buf_desc->lwip_pbuf.pbuf.desc.map; - mapping->get(); } } } diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index 5f315b6e6..86118892e 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -445,7 +445,7 @@ static ssize_t sendfile_helper(socket_fd_api *p_socket_object, int in_fd, __off6 tx_arg.attr.iov = piov; tx_arg.attr.sz_iov = 1; tx_arg.attr.flags = MSG_ZEROCOPY; - tx_arg.priv.attr = PBUF_DESC_MAP; + tx_arg.priv.attr = PBUF_DESC_MDESC; tx_arg.priv.map = (void *)mapping; totSent = p_socket_object->tx(tx_arg); From 9ebbfeffd46a5b618721aa0cd4488e05213b3106 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Tue, 14 Nov 2023 15:24:56 +0200 Subject: [PATCH 049/169] issue: 3668182 Allow snd_buf drop below 0 in zero-copy path Signed-off-by: Alex Briskin --- src/core/lwip/tcp.h | 4 +-- src/core/lwip/tcp_out.c | 21 +++++++++----- src/core/sock/sockinfo_tcp.cpp | 53 ++++++++++++++-------------------- src/core/sock/sockinfo_tcp.h | 5 +++- 4 files changed, 41 insertions(+), 42 deletions(-) diff --git a/src/core/lwip/tcp.h b/src/core/lwip/tcp.h index 42ee10c35..d829cfd05 100644 --- a/src/core/lwip/tcp.h +++ b/src/core/lwip/tcp.h @@ -333,7 +333,7 @@ struct tcp_pcb { u32_t acked; - u32_t snd_buf; /* Available buffer space for sending (in bytes). */ + s32_t snd_buf; /* Available buffer space for sending (in bytes). */ u32_t max_snd_buff; u32_t snd_sml_snt; /* maintain state for minshall's algorithm */ @@ -480,7 +480,7 @@ err_t tcp_shutdown(struct tcp_pcb *pcb, int shut_rx, int shut_tx); err_t tcp_write(struct tcp_pcb *pcb, const void *dataptr, u32_t len, u16_t apiflags, pbuf_desc *desc); -err_t tcp_write_zc(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *desc); +err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *desc); #define TCP_PRIO_MIN 1 #define TCP_PRIO_NORMAL 64 diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index b61314e1a..b7a05e72d 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -240,8 +240,8 @@ static struct tcp_seg *tcp_create_segment(struct tcp_pcb *pcb, struct pbuf *p, u * @param pcb The TCP connection that willo enqueue the pbuf. 
* @param */ -static struct pbuf *tcp_pbuf_prealloc_zc(u16_t length, u16_t *oversize, struct tcp_pcb *pcb, - pbuf_type type, pbuf_desc *desc, struct pbuf *p_buff) +static struct pbuf *tcp_pbuf_prealloc_express(u16_t length, u16_t *oversize, struct tcp_pcb *pcb, + pbuf_type type, pbuf_desc *desc, struct pbuf *p_buff) { struct pbuf *p; @@ -323,9 +323,9 @@ static err_t tcp_write_checks(struct tcp_pcb *pcb, u32_t len) } /* fail on too much data */ - if (len > pcb->snd_buf) { + if ((s32_t)len > pcb->snd_buf) { LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, - ("tcp_write: too much data (len=%" U32_F " > snd_buf=%" U32_F ")\n", len, + ("tcp_write: too much data (len=%" U32_F " > snd_buf=%" S32_F ")\n", len, pcb->snd_buf)); pcb->flags |= TF_NAGLEMEMERR; return ERR_MEM; @@ -748,7 +748,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, return ERR_MEM; } -err_t tcp_write_zc(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *desc) +err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *desc) { struct pbuf *p; struct tcp_seg *seg = NULL, *prev_seg = NULL, *queue = NULL; @@ -759,6 +759,10 @@ err_t tcp_write_zc(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *d u16_t seglen; u16_t queuelen = 0; + if (pcb->snd_buf < 0) { + goto memerr; + } + if (len < pcb->mss) { const int byte_queued = pcb->snd_nxt - pcb->lastack; pcb->snd_sml_add = (pcb->unacked ? pcb->unacked->len : 0) + byte_queued; @@ -794,8 +798,8 @@ err_t tcp_write_zc(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *d (pcb->last_unsent->bufs < pcb->tso.max_send_sge)) { seglen = space < len ? space : len; - if ((p = tcp_pbuf_prealloc_zc(seglen, &oversize, pcb, PBUF_ZEROCOPY, desc, NULL)) == - NULL) { + if ((p = tcp_pbuf_prealloc_express(seglen, &oversize, pcb, PBUF_ZEROCOPY, desc, + NULL)) == NULL) { goto memerr; } p->payload = (u8_t *)arg; @@ -813,7 +817,8 @@ err_t tcp_write_zc(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *d u32_t left = len - pos; seglen = left > mss_local ? 
mss_local : left; - if ((p = tcp_pbuf_prealloc_zc(seglen, &oversize, pcb, PBUF_ZEROCOPY, desc, NULL)) == NULL) { + if ((p = tcp_pbuf_prealloc_express(seglen, &oversize, pcb, PBUF_ZEROCOPY, desc, NULL)) == + NULL) { goto memerr; } p->payload = (u8_t *)arg + pos; diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index b689ab926..889c59121 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -776,11 +776,11 @@ bool sockinfo_tcp::prepare_dst_to_send(bool is_accepted_socket /* = false */) unsigned sockinfo_tcp::tx_wait(int &err, bool blocking) { - unsigned sz = tcp_sndbuf(&m_pcb); + auto sz = sndbuf_available(); int poll_count = 0; si_tcp_logfunc("sz = %d rx_count=%d", sz, m_n_rx_pkt_ready_list_count); err = 0; - while (is_rts() && (sz = tcp_sndbuf(&m_pcb)) == 0) { + while (is_rts() && (sz = sndbuf_available()) == 0) { err = rx_wait(poll_count, blocking); // AlexV:Avoid from going to sleep, for the blocked socket of course, since // progress engine may consume an arrived credit and it will not wakeup the @@ -893,13 +893,13 @@ ssize_t sockinfo_tcp::tx(xlio_tx_call_attr_t &tx_arg) return m_ops->tx(tx_arg); } -static inline bool cannot_do_requested_partial_write(const tcp_pcb &pcb, +static inline bool cannot_do_requested_partial_write(size_t sndbuf_available, const xlio_tx_call_attr_t &tx_arg, bool is_blocking, size_t total_iov_len) { return !BLOCK_THIS_RUN(is_blocking, tx_arg.attr.flags) && (tx_arg.xlio_flags & TX_FLAG_NO_PARTIAL_WRITE) && - unlikely(tcp_sndbuf(&pcb) < total_iov_len); + unlikely(sndbuf_available < total_iov_len); } static inline bool tcp_wnd_unavalable(const tcp_pcb &pcb, size_t total_iov_len) @@ -990,7 +990,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) /* To force zcopy flow there are two possible ways * - send() MSG_ZEROCOPY flag should be passed by user application * and SO_ZEROCOPY activated - * - sendfile() MSG_SEROCOPY flag set internally with opcode TX_FILE + * - sendfile() MSG_ZEROCOPY flag set internally with opcode TX_FILE */ if ((__flags & MSG_ZEROCOPY) && ((m_b_zc) || (tx_arg.opcode == TX_FILE))) { apiflags |= XLIO_TX_PACKET_ZEROCOPY; @@ -1007,7 +1007,8 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) lock_tcp_con(); if (cannot_do_requested_dummy_send(m_pcb, tx_arg) || - cannot_do_requested_partial_write(m_pcb, tx_arg, m_b_blocking, total_iov_len) || + cannot_do_requested_partial_write(sndbuf_available(), tx_arg, m_b_blocking, + total_iov_len) || tcp_wnd_unavalable(m_pcb, total_iov_len)) { unlock_tcp_con(); errno = EAGAIN; @@ -1034,7 +1035,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) } unsigned pos = 0; while (pos < p_iov[i].iov_len) { - unsigned tx_size = tcp_sndbuf(&m_pcb); + auto tx_size = sndbuf_available(); /* Process a case when space is not available at the sending socket * to hold the message to be transmitted @@ -1081,9 +1082,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) tx_size = tx_wait(ret, true); } - if (tx_size > p_iov[i].iov_len - pos) { - tx_size = p_iov[i].iov_len - pos; - } + tx_size = std::min(p_iov[i].iov_len - pos, tx_size); if (is_send_zerocopy) { /* * For send zerocopy we don't support pbufs which @@ -1094,9 +1093,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) */ unsigned remainder = ~m_user_huge_page_mask + 1 - ((uint64_t)tx_ptr & ~m_user_huge_page_mask); - if (tx_size > remainder) { - tx_size = remainder; - } + tx_size = std::min(remainder, tx_size); } retry_write: if (unlikely(!is_rts())) { @@ -1115,7 
+1112,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) } if (apiflags & XLIO_TX_PACKET_ZEROCOPY) { - err = tcp_write_zc(&m_pcb, tx_ptr, tx_size, &tx_arg.priv); + err = tcp_write_express(&m_pcb, tx_ptr, tx_size, &tx_arg.priv); } else { err = tcp_write(&m_pcb, tx_ptr, tx_size, apiflags, &tx_arg.priv); } @@ -3912,7 +3909,7 @@ bool sockinfo_tcp::is_writeable() goto noblock; } - if (tcp_sndbuf(&m_pcb) > m_required_send_block) { + if (sndbuf_available() > m_required_send_block) { goto noblock; } @@ -3926,7 +3923,11 @@ bool sockinfo_tcp::is_writeable() p_fd_array->fd_count++; } */ - __log_funcall("--->>> tcp_sndbuf(&m_pcb)=%d", tcp_sndbuf(&m_pcb)); +<<<<<<< Updated upstream + __log_funcall("--->>> tcp_sndbuf(&m_pcb)=%d", sndbuf(&m_pcb)); +======= + __log_funcall("--->>> tcp_sndbuf(&m_pcb)=%d", sndbuf_available()); +>>>>>>> Stashed changes return true; } @@ -4130,21 +4131,11 @@ void sockinfo_tcp::fit_rcv_wnd(bool force_fit) void sockinfo_tcp::fit_snd_bufs(unsigned int new_max_snd_buff) { - uint32_t sent_buffs_num = 0; + m_pcb.snd_buf += (new_max_snd_buff - m_pcb.max_snd_buff); + m_pcb.max_snd_buff = new_max_snd_buff; - sent_buffs_num = m_pcb.max_snd_buff - m_pcb.snd_buf; - if (sent_buffs_num <= new_max_snd_buff) { - m_pcb.max_snd_buff = new_max_snd_buff; - if (m_pcb.mss) { - m_pcb.max_unsent_len = (16 * (m_pcb.max_snd_buff) / m_pcb.mss); - } else { - m_pcb.max_unsent_len = - (16 * (m_pcb.max_snd_buff) / 536); /* should MSS be 0 use a const...very unlikely */ - } - /* make sure max_unsent_len is not 0 */ - m_pcb.max_unsent_len = std::max(m_pcb.max_unsent_len, 1U); - m_pcb.snd_buf = m_pcb.max_snd_buff - sent_buffs_num; - } + auto mss = m_pcb.mss ?: 536; + m_pcb.max_unsent_len = (mss - 1 + m_pcb.max_snd_buff * 16) / mss; } void sockinfo_tcp::fit_snd_bufs_to_nagle(bool disable_nagle) @@ -5425,7 +5416,7 @@ void sockinfo_tcp::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) pcb.snd_wl1, pcb.snd_wl2); // Send buffer - vlog_printf(log_level, "Send buffer : snd_buf %u, max_snd_buff %u\n", pcb.snd_buf, + vlog_printf(log_level, "Send buffer : snd_buf %d, max_snd_buff %u\n", pcb.snd_buf, pcb.max_snd_buff); // Retransmission diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 5f21d62ab..e8a850116 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -186,7 +186,10 @@ class sockinfo_tcp : public sockinfo, public timer_handler { inline struct tcp_pcb *get_pcb(void) { return &m_pcb; } - inline unsigned sndbuf_available(void) { return tcp_sndbuf(&m_pcb); } + inline unsigned sndbuf_available(void) + { + return static_cast(std::max(tcp_sndbuf(&m_pcb), 0)); + } inline unsigned get_mss(void) { return m_pcb.mss; } From 249369deb8c3fa38ae43b2037a3ece9eb93651f1 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Mon, 20 Nov 2023 15:09:39 +0200 Subject: [PATCH 050/169] issue: 3668182 Add sockinfo_tcp::tcp_tx_express Signed-off-by: Alex Briskin --- src/core/lwip/pbuf.h | 1 + src/core/sock/sockinfo_tcp.cpp | 63 +++++++++++++++++++++++++++++++--- src/core/sock/sockinfo_tcp.h | 4 ++- src/core/xlio_extra.h | 9 +++++ 4 files changed, 71 insertions(+), 6 deletions(-) diff --git a/src/core/lwip/pbuf.h b/src/core/lwip/pbuf.h index 11ddc528a..708800b11 100644 --- a/src/core/lwip/pbuf.h +++ b/src/core/lwip/pbuf.h @@ -70,6 +70,7 @@ typedef struct { void *mdesc; int fd; u32_t mkey; + void *opaque; }; } pbuf_desc; diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 889c59121..a23645629 100644 --- a/src/core/sock/sockinfo_tcp.cpp 
+++ b/src/core/sock/sockinfo_tcp.cpp @@ -3923,11 +3923,7 @@ bool sockinfo_tcp::is_writeable() p_fd_array->fd_count++; } */ -<<<<<<< Updated upstream - __log_funcall("--->>> tcp_sndbuf(&m_pcb)=%d", sndbuf(&m_pcb)); -======= - __log_funcall("--->>> tcp_sndbuf(&m_pcb)=%d", sndbuf_available()); ->>>>>>> Stashed changes + __log_funcall("--->>> tcp_sndbuf(&m_pcb)=%ld", sndbuf_available()); return true; } @@ -6039,3 +6035,60 @@ inline bool sockinfo_tcp::handle_bind_no_port(int &bind_ret, in_port_t in_port, return CONTINUE_WITH_BIND; } + +int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint32_t mkey, + xlio_express_flags flags, void *opaque_op) +{ + err_t err; + pbuf_desc mdesc; + + switch (flags & XLIO_EXPRESS_OP_TYPE_MASK) { + case XLIO_EXPRESS_OP_TYPE_DESC: + mdesc.attr = PBUF_DESC_EXPRESS; + break; + case XLIO_EXPRESS_OP_TYPE_FILE_ZEROCOPY: + mdesc.attr = PBUF_DESC_MDESC; + /* Increase the refcount by 1 */ + /* reinterpret_cast(opaque_op)->get(); */ + break; + default: + return -1; + }; + mdesc.express_mkey = mkey; + mdesc.opaque = nullptr; + + int bytes_written = 0; + + lock_tcp_con(); + for (unsigned i = 0; i < iov_len - 1; ++i) { + err = tcp_write_express(&m_pcb, iov[i].iov_base, iov[i].iov_len, &mdesc); + if (err != ERR_OK) { + return -1; + } + bytes_written += iov[i].iov_len; + } + + /* Assign opaque only to the last chunk. So, only the last pbuf will generate zerocopy + * completion. */ + mdesc.opaque = opaque_op; + err = tcp_write_express(&m_pcb, iov[iov_len - 1].iov_base, iov[iov_len - 1].iov_len, &mdesc); + if (err != ERR_OK) { + return -1; + } + + bytes_written += iov[iov_len - 1].iov_len; + + if (!(flags & XLIO_EXPRESS_MSG_MORE)) { + err = tcp_output(&m_pcb); + if (err != ERR_OK) { + return -1; + } + /* if (!express_dirty) { */ + /* express_dirty = true; */ + /* express_dirty_sockets.push_back(this); */ + /* } */ + } + unlock_tcp_con(); + + return bytes_written; +} diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index e8a850116..10f0288c9 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -324,7 +324,6 @@ class sockinfo_tcp : public sockinfo, public timer_handler { inline void unlock_tcp_con(void) { m_tcp_con_lock.unlock(); } inline void set_reguired_send_block(unsigned sz) { m_required_send_block = sz; } - static err_t rx_lwip_cb(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); static err_t rx_lwip_cb_socketxtreme(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); @@ -339,6 +338,9 @@ class sockinfo_tcp : public sockinfo, public timer_handler { return sockinfo::register_callback(callback, context); } + int tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint32_t mkey, + xlio_express_flags flags, void *opaque_op); + protected: virtual void lock_rx_q(); virtual void unlock_rx_q(); diff --git a/src/core/xlio_extra.h b/src/core/xlio_extra.h index f34f6e254..f51a3b70f 100644 --- a/src/core/xlio_extra.h +++ b/src/core/xlio_extra.h @@ -639,4 +639,13 @@ static inline struct xlio_api_t *xlio_get_api() } return api_ptr; } + +enum xlio_express_flags : uint32_t { + XLIO_EXPRESS_OP_TYPE_DESC, + XLIO_EXPRESS_OP_TYPE_FILE_ZEROCOPY, + XLIO_EXPRESS_OP_TYPE_MASK = 0x000fu, + XLIO_EXPRESS_MSG_MORE, + XLIO_EXPRESS_MSG_MASK = 0x00f0u, +}; + #endif /* XLIO_EXTRA_H */ From 3c63b73c76bef2d9af8224af0e9bfc5872748f23 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Tue, 5 Dec 2023 15:05:55 +0200 Subject: [PATCH 051/169] issue: 3668182 Use tcp_tx_express in TLS tx zerocopy Add tcp_tx_express to 
sockinfo_tcp_ops_tls::tx. I was unable to perform any refactoring, since:
- tx_arg is the method argument
- tls_arg is initialized at the beginning of the method and used in the
  nested loops.
- tls_record class needs constant reassurance whether it is zerocopy or not.
- tls_record class needs constant reassurance whether it is TLS1.2 or TLS1.3
  version.
- The blocking, non-blocking, zerocopy, non-zerocopy, TLS1.2, and TLS1.3
  parts are tightly-coupled and no clean separation is possible without
  rewriting the code.
- Need to allow blocking on small sndbuf in TLS

Signed-off-by: Alex Briskin
---
 src/core/sock/sockinfo_ulp.cpp | 59 +++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp
index 5770fc25c..f2d4303b5 100644
--- a/src/core/sock/sockinfo_ulp.cpp
+++ b/src/core/sock/sockinfo_ulp.cpp
@@ -171,9 +171,10 @@ static inline uint8_t get_alert_level(uint8_t alert_type)
  * tls_record
  */

-enum {
+enum : size_t {
     TLS_RECORD_HDR_LEN = 5U,
     TLS_RECORD_IV_LEN = TLS_AES_GCM_IV_LEN,
+    TLS_13_RECORD_IV_LEN = 0U,
     TLS_RECORD_TAG_LEN = 16U,
     TLS_RECORD_NONCE_LEN = 12U, /* SALT + IV */
     /* TLS 1.2 record overhead. */
@@ -305,7 +306,8 @@ class tls_record : public mem_desc {
         assert(iov_max >= 3);
         (void)iov_max;
         iov[0].iov_base = m_p_data;
-        iov[0].iov_len = TLS_RECORD_HDR_LEN + (is_tls13 ? 0 : TLS_RECORD_IV_LEN);
+        iov[0].iov_len =
+            TLS_RECORD_HDR_LEN + (is_tls13 ? TLS_13_RECORD_IV_LEN : TLS_RECORD_IV_LEN);
         iov[1].iov_base = m_p_zc_data;
         iov[1].iov_len = m_size - (is_tls13 ? TLS_13_RECORD_OVERHEAD : TLS_12_RECORD_OVERHEAD);
         iov[2].iov_base = m_p_data + iov[0].iov_len;
@@ -747,21 +749,16 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg)
         }
     }

+    uint8_t *iv = is_tx_tls13() ? nullptr : m_tls_info_tx.iv;
+    mem_desc *zc_owner = is_zerocopy ? reinterpret_cast(tx_arg.priv.mdesc) : nullptr;
     for (ssize_t i = 0; i < tx_arg.attr.sz_iov; ++i) {
         pos = 0;
         while (pos < p_iov[i].iov_len) {
             tls_record *rec;
             ssize_t ret2;
-            size_t sndbuf = m_p_sock->sndbuf_available();
-            size_t tosend = p_iov[i].iov_len - pos;
+            size_t tosend = std::min(p_iov[i].iov_len - pos, TLS_RECORD_MAX);

-            /*
-             * XXX This approach can lead to issue with epoll()
-             * since such a socket will always be ready for write
-             */
-            if (!block_this_run && sndbuf < TLS_RECORD_SMALLEST &&
-                (sndbuf < m_tls_rec_overhead || (sndbuf - m_tls_rec_overhead) < tosend)) {
-                /* We don't want to create too small TLS records when we do partial write. */
+            if (m_p_sock->sndbuf_available() == 0U && !block_this_run) {
                 if (ret == 0) {
                     errno = EAGAIN;
                     ret = -1;
@@ -769,10 +766,8 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg)
                 goto done;
             }

-            rec = new tls_record(
-                this, m_p_sock->get_next_tcp_seqno(), m_next_recno_tx,
-                is_tx_tls13() ? nullptr : m_tls_info_tx.iv,
-                is_zerocopy ? reinterpret_cast(tx_arg.priv.mdesc) : nullptr);
+            rec =
+                new tls_record(this, m_p_sock->get_next_tcp_seqno(), m_next_recno_tx, iv, zc_owner);
             if (unlikely(!rec || !rec->m_p_buf)) {
                 if (ret == 0) {
                     errno = ENOMEM;
@@ -787,6 +782,14 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg)
                 }
                 goto done;
             }
+
+            tosend = rec->append_data((uint8_t *)p_iov[i].iov_base + pos, tosend, is_tx_tls13());
+            /* Set type after all data, because for TLS1.3 it is in the tail.
*/ + rec->set_type(tls_type, is_tx_tls13()); + rec->fill_iov(tls_arg.attr.iov, ARRAY_SIZE(tls_iov), is_tx_tls13()); + tls_arg.priv.mdesc = reinterpret_cast(rec); + pos += tosend; + ++m_next_recno_tx; /* * Prepare unique explicit_nonce for the next TLS1.2 record. @@ -795,20 +798,15 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg) if (!is_tx_tls13()) { ++m_tls_info_tx.iv64; } - + retry: if (!block_this_run) { - /* sndbuf overflow is not possible since we have a check above. */ - tosend = std::min(tosend, sndbuf - m_tls_rec_overhead); - } - tosend = rec->append_data((uint8_t *)p_iov[i].iov_base + pos, tosend, is_tx_tls13()); - /* Set type after all data, because for TLS1.3 it is in the tail. */ - rec->set_type(tls_type, is_tx_tls13()); - rec->fill_iov(tls_arg.attr.iov, ARRAY_SIZE(tls_iov), is_tx_tls13()); - tls_arg.priv.mdesc = reinterpret_cast(rec); - pos += tosend; + ret2 = m_p_sock->tcp_tx_express(tls_arg.attr.iov, tls_arg.attr.sz_iov, 0, + XLIO_EXPRESS_OP_TYPE_FILE_ZEROCOPY, + reinterpret_cast(rec)); - retry: - ret2 = m_p_sock->tcp_tx(tls_arg); + } else { + ret2 = m_p_sock->tcp_tx(tls_arg); + } if (block_this_run && (ret2 != (ssize_t)tls_arg.attr.iov[0].iov_len)) { if ((ret2 >= 0) || (errno == EINTR && !g_b_exit)) { ret2 = ret2 < 0 ? 0 : ret2; @@ -894,7 +892,9 @@ int sockinfo_tcp_ops_tls::postrouting(struct pbuf *p, struct tcp_seg *seg, xlio_ if (is_zerocopy) { hdrlen = std::min( - TLS_RECORD_HDR_LEN + (is_tx_tls13() ? 0 : TLS_RECORD_IV_LEN), totlen); + TLS_RECORD_HDR_LEN + + (is_tx_tls13() ? TLS_13_RECORD_IV_LEN : TLS_RECORD_IV_LEN), + totlen); taillen = TLS_RECORD_TAG_LEN + !!is_tx_tls13(); /* Determine the trailer portion to resend. */ taillen = std::max(totlen + taillen, rec->m_size) - rec->m_size; @@ -1338,7 +1338,8 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) struct pbuf *pi; struct pbuf *pres = nullptr; struct pbuf *ptmp = nullptr; - uint32_t offset = m_rx_offset + TLS_RECORD_HDR_LEN + (is_rx_tls13() ? 0 : TLS_RECORD_IV_LEN); + uint32_t offset = m_rx_offset + TLS_RECORD_HDR_LEN + + (is_rx_tls13() ? TLS_13_RECORD_IV_LEN : TLS_RECORD_IV_LEN); uint32_t remain = m_rx_rec_len - m_tls_rec_overhead; unsigned bufs_nr = 0; unsigned decrypted_nr = 0; From cf7ff5757782759e9695d1c3e9484d74a70efc40 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Sun, 31 Dec 2023 17:26:42 +0200 Subject: [PATCH 052/169] issue: 3668182 Remove zerocopy flow from tcp_write Signed-off-by: Alex Briskin --- src/core/lwip/tcp_out.c | 37 ++++++------------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index b7a05e72d..7094215e7 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -415,9 +415,8 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, off_t offset = 0; off_t offset_next = 0; - bool is_zerocopy = !!(apiflags & TCP_WRITE_ZEROCOPY); - bool is_file = (apiflags & (TCP_WRITE_FILE | TCP_WRITE_ZEROCOPY)) == TCP_WRITE_FILE; - pbuf_type type = (apiflags & TCP_WRITE_ZEROCOPY) ? 
PBUF_ZEROCOPY : PBUF_RAM; + bool is_file = (apiflags & TCP_WRITE_FILE) == TCP_WRITE_FILE; + pbuf_type type = PBUF_RAM; int byte_queued = pcb->snd_nxt - pcb->lastack; if (len < pcb->mss && !(apiflags & TCP_WRITE_DUMMY)) { @@ -435,14 +434,9 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, } queuelen = pcb->snd_queuelen; - if (is_zerocopy) { - mss_local = lwip_zc_tx_size; - } else { - mss_local = tcp_xmit_size_goal(pcb, 1); - } + mss_local = tcp_xmit_size_goal(pcb, 1); optflags |= (apiflags & TCP_WRITE_DUMMY) ? TF_SEG_OPTS_DUMMY_MSG : 0; - optflags |= (apiflags & TCP_WRITE_ZEROCOPY) ? TF_SEG_OPTS_ZEROCOPY : 0; #if LWIP_TCP_TIMESTAMPS if (pcb->flags & TF_TIMESTAMP) { @@ -452,10 +446,6 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, optlen = LWIP_TCP_OPT_LENGTH(optflags); mss_local_minus_opts = mss_local - optlen; - if (is_zerocopy) { - /* TCP options will reside in seg->l2_l3_tcphdr_zc */ - optlen = 0; - } if (is_file) { offset = offset_next = *(__off64_t *)arg; } @@ -515,7 +505,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, #endif /* TCP_OVERSIZE_DBGCHECK */ if (pcb->unsent_oversize > 0) { - if (!(apiflags & (TCP_WRITE_FILE | TCP_WRITE_ZEROCOPY))) { + if (!(apiflags & TCP_WRITE_FILE)) { oversize = pcb->unsent_oversize; LWIP_ASSERT("inconsistent oversize vs. space", oversize_used <= space); oversize_used = oversize < len ? oversize : len; @@ -550,11 +540,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, #if TCP_OVERSIZE_DBGCHECK pcb->last_unsent->oversize_left += oversize; #endif /* TCP_OVERSIZE_DBGCHECK */ - if (is_zerocopy) { - concat_p->payload = (u8_t *)arg + pos; - } else { - memcpy(concat_p->payload, (u8_t *)arg + pos, seglen); - } + memcpy(concat_p->payload, (u8_t *)arg + pos, seglen); pos += seglen; queuelen += pbuf_clen(concat_p); @@ -577,11 +563,6 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, u16_t max_len = mss_local_minus_opts; u16_t seglen = left > max_len ? max_len : left; - /* create pbuf of the exact size needed now, to later avoid the p1 (oversize) flow */ - if (is_zerocopy) { - max_len = seglen; - } - /* If copy is set, memory should be allocated and data copied * into pbuf */ if ((p = tcp_pbuf_prealloc(seglen + optlen, max_len, &oversize, pcb, type, @@ -593,9 +574,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, } LWIP_ASSERT("tcp_write: check that first pbuf can hold the complete seglen", (p->len >= seglen)); - if (is_zerocopy) { - p->payload = (u8_t *)arg + pos; - } else if (is_file) { + if (is_file) { piov[piov_cur_index].iov_base = (void *)((char *)p->payload + optlen); piov[piov_cur_index].iov_len = seglen; @@ -759,10 +738,6 @@ err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_de u16_t seglen; u16_t queuelen = 0; - if (pcb->snd_buf < 0) { - goto memerr; - } - if (len < pcb->mss) { const int byte_queued = pcb->snd_nxt - pcb->lastack; pcb->snd_sml_add = (pcb->unacked ? 
pcb->unacked->len : 0) + byte_queued; From 94f83929321c93d597b58b1e3eec8745f7c48951 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Sun, 31 Dec 2023 17:27:20 +0200 Subject: [PATCH 053/169] issue: 3668182 Refactor sockinfo_tcp - Refactor cannot_do_requested_partial_write - Extract static function is_invalid_iovec - Extract method is_connected_and_ready_to_send - Extract method tcp_tx_slow_path - Extract method tcp_tx_handle_errno_and_unlock - Extract method tcp_tx_handle_done_and_unlock - Extract method tcp_tx_handle_partial_send_and_unlock - Check ready-to-send socket in express path Signed-off-by: Alex Briskin --- src/core/sock/sockinfo_tcp.cpp | 509 +++++++++++++++++++++------------ src/core/sock/sockinfo_tcp.h | 8 + 2 files changed, 331 insertions(+), 186 deletions(-) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index a23645629..97b6c1eb7 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -895,78 +895,193 @@ ssize_t sockinfo_tcp::tx(xlio_tx_call_attr_t &tx_arg) static inline bool cannot_do_requested_partial_write(size_t sndbuf_available, const xlio_tx_call_attr_t &tx_arg, - bool is_blocking, size_t total_iov_len) + size_t total_iov_len) { - return !BLOCK_THIS_RUN(is_blocking, tx_arg.attr.flags) && - (tx_arg.xlio_flags & TX_FLAG_NO_PARTIAL_WRITE) && + return (tx_arg.xlio_flags & TX_FLAG_NO_PARTIAL_WRITE) && unlikely(sndbuf_available < total_iov_len); } -static inline bool tcp_wnd_unavalable(const tcp_pcb &pcb, size_t total_iov_len) -{ #ifdef DEFINED_TCP_TX_WND_AVAILABILITY - return !tcp_is_wnd_available(&pcb, total_iov_len); +#define TCP_WND_UNAVALABLE(pcb, total_iov_len) !tcp_is_wnd_available(&pcb, total_iov_len) #else - NOT_IN_USE(pcb); - NOT_IN_USE(total_iov_len); - return false; +#define TCP_WND_UNAVALABLE(pcb, total_iov_len) false #endif + +static inline bool is_invalid_iovec(const iovec *iov, size_t sz_iov) +{ + return iov == nullptr || sz_iov == 0; } ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) { iovec *p_iov = tx_arg.attr.iov; size_t sz_iov = tx_arg.attr.sz_iov; - struct sockaddr *__dst = tx_arg.attr.addr; - socklen_t __dstlen = tx_arg.attr.len; - int __flags = tx_arg.attr.flags; + int flags = tx_arg.attr.flags; int errno_tmp = errno; int ret = 0; int poll_count = 0; - uint16_t apiflags = 0; err_t err; - bool is_send_zerocopy = false; void *tx_ptr = NULL; struct xlio_pd_key *pd_key_array = NULL; /* Let allow OS to process all invalid scenarios to avoid any * inconsistencies in setting errno values */ - if (unlikely(m_sock_offload != TCP_SOCK_LWIP) || unlikely(!p_iov) || unlikely(0 == sz_iov)) { - ret = socket_fd_api::tx_os(tx_arg.opcode, p_iov, sz_iov, __flags, __dst, __dstlen); + if (unlikely(m_sock_offload != TCP_SOCK_LWIP) || unlikely(is_invalid_iovec(p_iov, sz_iov))) { + struct sockaddr *dst = tx_arg.attr.addr; + socklen_t dstlen = tx_arg.attr.len; + ret = socket_fd_api::tx_os(tx_arg.opcode, p_iov, sz_iov, flags, dst, dstlen); save_stats_tx_os(ret); return ret; } -retry_is_ready: + if (unlikely(!is_connected_and_ready_to_send())) { + return -1; + } - if (unlikely(!is_rts())) { + si_tcp_logfunc("tx: iov=%p niovs=%d", p_iov, sz_iov); - if (m_conn_state == TCP_CONN_TIMEOUT) { - si_tcp_logdbg("TX timed out"); - errno = ETIMEDOUT; - } else if (m_conn_state == TCP_CONN_CONNECTING) { - si_tcp_logdbg("TX while async-connect on socket go to poll"); - rx_wait_helper(poll_count, false); - if (m_conn_state == TCP_CONN_CONNECTED) { - goto retry_is_ready; - } - si_tcp_logdbg("TX while async-connect on socket 
return EAGAIN"); - errno = EAGAIN; - } else if (m_conn_state == TCP_CONN_RESETED) { - si_tcp_logdbg("TX on reseted socket"); - errno = ECONNRESET; - } else if (m_conn_state == TCP_CONN_ERROR) { - si_tcp_logdbg("TX on connection failed socket"); - errno = ECONNREFUSED; - } else { - si_tcp_logdbg("TX on disconnected socket"); - errno = EPIPE; + if (m_sysvar_rx_poll_on_tx_tcp) { + rx_wait_helper(poll_count, false); + } + + bool is_dummy = IS_DUMMY_PACKET(flags); + bool is_blocking = BLOCK_THIS_RUN(m_b_blocking, flags); + bool is_packet_zerocopy = (flags & MSG_ZEROCOPY) && ((m_b_zc) || (tx_arg.opcode == TX_FILE)); + if (unlikely(is_dummy) || unlikely(!is_packet_zerocopy) || unlikely(is_blocking)) { + return tcp_tx_slow_path(tx_arg); + } + + bool is_send_zerocopy = tx_arg.opcode != TX_FILE; + pd_key_array = + (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.map : NULL); + + si_tcp_logfunc("tx: iov=%p niovs=%zu", p_iov, sz_iov); + + size_t total_iov_len = + std::accumulate(&p_iov[0], &p_iov[sz_iov], 0U, + [](size_t sum, const iovec &curr) { return sum + curr.iov_len; }); + lock_tcp_con(); + + if (cannot_do_requested_partial_write(sndbuf_available(), tx_arg, total_iov_len) || + TCP_WND_UNAVALABLE(m_pcb, total_iov_len)) { + return tcp_tx_handle_errno_and_unlock(EAGAIN); + } + + int total_tx = 0; + for (size_t i = 0; i < sz_iov; i++) { + si_tcp_logfunc("iov:%d base=%p len=%d", i, p_iov[i].iov_base, p_iov[i].iov_len); + if (unlikely(!p_iov[i].iov_base)) { + continue; } - return -1; + tx_ptr = p_iov[i].iov_base; + if ((tx_arg.priv.attr == PBUF_DESC_MKEY) && pd_key_array) { + tx_arg.priv.mkey = pd_key_array[i].mkey; + } + unsigned pos = 0; + while (pos < p_iov[i].iov_len) { + auto tx_size = sndbuf_available(); + + /* Process a case when space is not available at the sending socket + * to hold the message to be transmitted + * Nonblocking socket: + * - no data is buffered: return (-1) and EAGAIN + * - some data is buffered: return number of bytes ready to be sent + */ + if (tx_size == 0) { + if (unlikely(!is_rts())) { + si_tcp_logdbg("TX on disconnected socket"); + return tcp_tx_handle_errno_and_unlock(ECONNRESET); + } + // force out TCP data before going on wait() + tcp_output(&m_pcb); + + // non blocking socket should return in order not to tx_wait() + if (total_tx > 0) { + m_tx_consecutive_eagain_count = 0; + return tcp_tx_handle_done_and_unlock(total_tx, errno_tmp, is_dummy, + is_send_zerocopy); + } else { + m_tx_consecutive_eagain_count++; + if (m_tx_consecutive_eagain_count >= TX_CONSECUTIVE_EAGAIN_THREASHOLD) { + if (safe_mce_sys().tcp_ctl_thread == + option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { + // Slow path. We must attempt TCP timers here for applications that + // do not check for EV_OUT. + g_thread_local_event_handler.do_tasks(); + } + // in case of zero sndbuf and non-blocking just try once polling CQ for + // ACK + rx_wait(poll_count, false); + m_tx_consecutive_eagain_count = 0; + } + return tcp_tx_handle_errno_and_unlock(EAGAIN); + } + } + + tx_size = std::min(p_iov[i].iov_len - pos, tx_size); + if (is_send_zerocopy) { + /* + * For send zerocopy we don't support pbufs which + * cross huge page boundaries. To avoid forming + * such a pbuf, we have to adjust tx_size, so + * tcp_write receives a buffer which doesn't cross + * the boundary. 
+ */ + unsigned remainder = + ~m_user_huge_page_mask + 1 - ((uint64_t)tx_ptr & ~m_user_huge_page_mask); + tx_size = std::min(remainder, tx_size); + } + + if (unlikely(!is_rts())) { + si_tcp_logdbg("TX on disconnected socket"); + return tcp_tx_handle_errno_and_unlock(ECONNRESET); + } + if (unlikely(g_b_exit)) { + return tcp_tx_handle_partial_send_and_unlock(total_tx, EINTR, is_dummy, + is_send_zerocopy, errno_tmp); + } + + err = tcp_write_express(&m_pcb, tx_ptr, tx_size, &tx_arg.priv); + if (unlikely(err != ERR_OK)) { + if (unlikely(err == ERR_CONN)) { // happens when remote drops during big write + si_tcp_logdbg("connection closed: tx'ed = %d", total_tx); + shutdown(SHUT_WR); + return tcp_tx_handle_partial_send_and_unlock(total_tx, EPIPE, is_dummy, + is_send_zerocopy, errno_tmp); + } + if (unlikely(err != ERR_MEM)) { + // we should not get here... + BULLSEYE_EXCLUDE_BLOCK_START + si_tcp_logpanic("tcp_write return: %d", err); + BULLSEYE_EXCLUDE_BLOCK_END + } + return tcp_tx_handle_partial_send_and_unlock(total_tx, EAGAIN, is_dummy, + is_send_zerocopy, errno_tmp); + } + tx_ptr = (void *)((char *)tx_ptr + tx_size); + pos += tx_size; + total_tx += tx_size; + } } - si_tcp_logfunc("tx: iov=%p niovs=%d", p_iov, sz_iov); + + return tcp_tx_handle_done_and_unlock(total_tx, errno_tmp, is_dummy, is_send_zerocopy); +} + +ssize_t sockinfo_tcp::tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg) +{ + iovec *p_iov = tx_arg.attr.iov; + size_t sz_iov = tx_arg.attr.sz_iov; + int flags = tx_arg.attr.flags; + int errno_tmp = errno; + int ret = 0; + int poll_count = 0; + uint16_t apiflags = 0; + err_t err; + bool is_send_zerocopy = false; + void *tx_ptr = NULL; + struct xlio_pd_key *pd_key_array = NULL; if (m_sysvar_rx_poll_on_tx_tcp) { rx_wait_helper(poll_count, false); @@ -982,7 +1097,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) apiflags |= XLIO_TX_FILE; } - bool is_dummy = IS_DUMMY_PACKET(__flags); + bool is_dummy = IS_DUMMY_PACKET(flags); if (unlikely(is_dummy)) { apiflags |= XLIO_TX_PACKET_DUMMY; } @@ -992,7 +1107,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) * and SO_ZEROCOPY activated * - sendfile() MSG_ZEROCOPY flag set internally with opcode TX_FILE */ - if ((__flags & MSG_ZEROCOPY) && ((m_b_zc) || (tx_arg.opcode == TX_FILE))) { + if ((flags & MSG_ZEROCOPY) && ((m_b_zc) || (tx_arg.opcode == TX_FILE))) { apiflags |= XLIO_TX_PACKET_ZEROCOPY; is_send_zerocopy = tx_arg.opcode != TX_FILE; pd_key_array = @@ -1001,23 +1116,14 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) si_tcp_logfunc("tx: iov=%p niovs=%zu", p_iov, sz_iov); - size_t total_iov_len = - std::accumulate(&p_iov[0], &p_iov[sz_iov], 0U, - [](size_t sum, const iovec &curr) { return sum + curr.iov_len; }); lock_tcp_con(); - if (cannot_do_requested_dummy_send(m_pcb, tx_arg) || - cannot_do_requested_partial_write(sndbuf_available(), tx_arg, m_b_blocking, - total_iov_len) || - tcp_wnd_unavalable(m_pcb, total_iov_len)) { - unlock_tcp_con(); - errno = EAGAIN; - return -1; + if (cannot_do_requested_dummy_send(m_pcb, tx_arg) || TCP_WND_UNAVALABLE(m_pcb, total_iov_len)) { + return tcp_tx_handle_errno_and_unlock(EAGAIN); } int total_tx = 0; - __off64_t file_offset = 0; - bool block_this_run = BLOCK_THIS_RUN(m_b_blocking, __flags); + off64_t file_offset = 0; for (size_t i = 0; i < sz_iov; i++) { si_tcp_logfunc("iov:%d base=%p len=%d", i, p_iov[i].iov_base, p_iov[i].iov_len); if (unlikely(!p_iov[i].iov_base)) { @@ -1025,7 +1131,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) } if ((tx_arg.opcode == 
TX_FILE) && !(apiflags & XLIO_TX_PACKET_ZEROCOPY)) { - file_offset = *(__off64_t *)p_iov[i].iov_base; + file_offset = *(off64_t *)p_iov[i].iov_base; tx_ptr = &file_offset; } else { tx_ptr = p_iov[i].iov_base; @@ -1048,36 +1154,12 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) if (tx_size == 0) { if (unlikely(!is_rts())) { si_tcp_logdbg("TX on disconnected socket"); - errno = ECONNRESET; - goto err; + return tcp_tx_handle_errno_and_unlock(ECONNRESET); } // force out TCP data before going on wait() tcp_output(&m_pcb); /* Set return values for nonblocking socket and finish processing */ - if (!block_this_run) { - // non blocking socket should return in order not to tx_wait() - if (total_tx > 0) { - m_tx_consecutive_eagain_count = 0; - goto done; - } else { - m_tx_consecutive_eagain_count++; - if (m_tx_consecutive_eagain_count >= TX_CONSECUTIVE_EAGAIN_THREASHOLD) { - if (safe_mce_sys().tcp_ctl_thread == - option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { - // Slow path. We must attempt TCP timers here for applications that - // do not check for EV_OUT. - g_thread_local_event_handler.do_tasks(); - } - // in case of zero sndbuf and non-blocking just try once polling CQ for - // ACK - rx_wait(poll_count, false); - m_tx_consecutive_eagain_count = 0; - } - errno = EAGAIN; - goto err; - } - } tx_size = tx_wait(ret, true); } @@ -1095,63 +1177,46 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) ~m_user_huge_page_mask + 1 - ((uint64_t)tx_ptr & ~m_user_huge_page_mask); tx_size = std::min(remainder, tx_size); } - retry_write: - if (unlikely(!is_rts())) { - si_tcp_logdbg("TX on disconnected socket"); - errno = ECONNRESET; - goto err; - } - if (unlikely(g_b_exit)) { - if (total_tx > 0) { - goto done; - } else { - errno = EINTR; - si_tcp_logdbg("returning with: EINTR"); - goto err; + do { + if (unlikely(!is_rts())) { + si_tcp_logdbg("TX on disconnected socket"); + return tcp_tx_handle_errno_and_unlock(ECONNRESET); } - } - - if (apiflags & XLIO_TX_PACKET_ZEROCOPY) { - err = tcp_write_express(&m_pcb, tx_ptr, tx_size, &tx_arg.priv); - } else { - err = tcp_write(&m_pcb, tx_ptr, tx_size, apiflags, &tx_arg.priv); - } - if (unlikely(err != ERR_OK)) { - if (unlikely(err == ERR_CONN)) { // happens when remote drops during big write - si_tcp_logdbg("connection closed: tx'ed = %d", total_tx); - shutdown(SHUT_WR); - if (total_tx > 0) { - goto done; - } - errno = EPIPE; - unlock_tcp_con(); - return -1; + if (unlikely(g_b_exit)) { + return tcp_tx_handle_partial_send_and_unlock(total_tx, EINTR, is_dummy, + is_send_zerocopy, errno_tmp); } - if (unlikely(err != ERR_MEM)) { - // we should not get here... - BULLSEYE_EXCLUDE_BLOCK_START - si_tcp_logpanic("tcp_write return: %d", err); - BULLSEYE_EXCLUDE_BLOCK_END + + if (apiflags & XLIO_TX_PACKET_ZEROCOPY) { + err = tcp_write_express(&m_pcb, tx_ptr, tx_size, &tx_arg.priv); + } else { + err = tcp_write(&m_pcb, tx_ptr, tx_size, apiflags, &tx_arg.priv); } - /* Set return values for nonblocking socket and finish processing */ - if (!block_this_run) { - if (total_tx > 0) { - goto done; - } else { - errno = EAGAIN; - goto err; + if (unlikely(err != ERR_OK)) { + if (unlikely(err == ERR_CONN)) { // happens when remote drops during big write + si_tcp_logdbg("connection closed: tx'ed = %d", total_tx); + shutdown(SHUT_WR); + return tcp_tx_handle_partial_send_and_unlock(total_tx, EPIPE, is_dummy, + is_send_zerocopy, errno_tmp); + } + if (unlikely(err != ERR_MEM)) { + // we should not get here... 
+ BULLSEYE_EXCLUDE_BLOCK_START + si_tcp_logpanic("tcp_write return: %d", err); + BULLSEYE_EXCLUDE_BLOCK_END } - } - rx_wait(poll_count, true); + rx_wait(poll_count, true); - // AlexV:Avoid from going to sleep, for the blocked socket of course, since - // progress engine may consume an arrived credit and it will not wakeup the - // transmit thread. - poll_count = 0; + // AlexV:Avoid from going to sleep, for the blocked socket of course, since + // progress engine may consume an arrived credit and it will not wakeup the + // transmit thread. + poll_count = 0; - goto retry_write; - } + continue; + } + break; + } while (true); if (tx_arg.opcode == TX_FILE && !(apiflags & XLIO_TX_PACKET_ZEROCOPY)) { file_offset += tx_size; } else { @@ -1161,45 +1226,8 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) total_tx += tx_size; } } -done: - tcp_output(&m_pcb); // force data out - if (unlikely(is_dummy)) { - m_p_socket_stats->counters.n_tx_dummy++; - } else if (total_tx) { - m_p_socket_stats->counters.n_tx_sent_byte_count += total_tx; - m_p_socket_stats->counters.n_tx_sent_pkt_count++; - m_p_socket_stats->n_tx_ready_byte_count += total_tx; - } - - /* Each send call with MSG_ZEROCOPY that successfully sends - * data increments the counter. - * The counter is not incremented on failure or if called with length zero. - */ - if (is_send_zerocopy && (total_tx > 0)) { - if (m_last_zcdesc->tx.zc.id != (uint32_t)atomic_read(&m_zckey)) { - si_tcp_logerr("Invalid tx zcopy operation"); - } else { - atomic_fetch_and_inc(&m_zckey); - } - } - - unlock_tcp_con(); - - /* Restore errno on function entry in case success */ - errno = errno_tmp; - - return total_tx; - -err: - // nothing send nb mode or got some other error - if (errno == EAGAIN) { - m_p_socket_stats->counters.n_tx_eagain++; - } else { - m_p_socket_stats->counters.n_tx_errors++; - } - unlock_tcp_con(); - return -1; + return tcp_tx_handle_done_and_unlock(total_tx, errno_tmp, is_dummy, is_send_zerocopy); } /* @@ -6039,28 +6067,61 @@ inline bool sockinfo_tcp::handle_bind_no_port(int &bind_ret, in_port_t in_port, int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint32_t mkey, xlio_express_flags flags, void *opaque_op) { + if (unlikely(!is_rts())) { + if (m_conn_state == TCP_CONN_TIMEOUT) { + si_tcp_logdbg("TX timed out"); + errno = ETIMEDOUT; + } else if (m_conn_state == TCP_CONN_RESETED) { + si_tcp_logdbg("TX on reseted socket"); + errno = ECONNRESET; + } else if (m_conn_state == TCP_CONN_ERROR) { + si_tcp_logdbg("TX on connection failed socket"); + errno = ECONNREFUSED; + } else { + si_tcp_logdbg("TX on disconnected socket"); + errno = EPIPE; + } + return -1; + } + err_t err; pbuf_desc mdesc; + if (unlikely(!is_rts())) { + if (m_conn_state == TCP_CONN_TIMEOUT) { + si_tcp_logdbg("TX timed out"); + errno = ETIMEDOUT; + } else if (m_conn_state == TCP_CONN_RESETED) { + si_tcp_logdbg("TX on reseted socket"); + errno = ECONNRESET; + } else if (m_conn_state == TCP_CONN_ERROR) { + si_tcp_logdbg("TX on connection failed socket"); + errno = ECONNREFUSED; + } else { + si_tcp_logdbg("TX on disconnected socket"); + errno = EPIPE; + } + return -1; + } + switch (flags & XLIO_EXPRESS_OP_TYPE_MASK) { case XLIO_EXPRESS_OP_TYPE_DESC: mdesc.attr = PBUF_DESC_EXPRESS; break; case XLIO_EXPRESS_OP_TYPE_FILE_ZEROCOPY: mdesc.attr = PBUF_DESC_MDESC; - /* Increase the refcount by 1 */ - /* reinterpret_cast(opaque_op)->get(); */ break; default: return -1; }; mdesc.express_mkey = mkey; - mdesc.opaque = nullptr; + mdesc.opaque = opaque_op; int 
bytes_written = 0; lock_tcp_con(); - for (unsigned i = 0; i < iov_len - 1; ++i) { + + for (unsigned i = 0; i < iov_len; ++i) { err = tcp_write_express(&m_pcb, iov[i].iov_base, iov[i].iov_len, &mdesc); if (err != ERR_OK) { return -1; @@ -6068,27 +6129,103 @@ int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint bytes_written += iov[i].iov_len; } - /* Assign opaque only to the last chunk. So, only the last pbuf will generate zerocopy - * completion. */ - mdesc.opaque = opaque_op; - err = tcp_write_express(&m_pcb, iov[iov_len - 1].iov_base, iov[iov_len - 1].iov_len, &mdesc); - if (err != ERR_OK) { - return -1; - } - - bytes_written += iov[iov_len - 1].iov_len; - if (!(flags & XLIO_EXPRESS_MSG_MORE)) { err = tcp_output(&m_pcb); if (err != ERR_OK) { return -1; } - /* if (!express_dirty) { */ - /* express_dirty = true; */ - /* express_dirty_sockets.push_back(this); */ - /* } */ } unlock_tcp_con(); return bytes_written; } + +ssize_t sockinfo_tcp::tcp_tx_handle_done_and_unlock(ssize_t total_tx, int errno_tmp, bool is_dummy, + bool is_send_zerocopy) +{ + tcp_output(&m_pcb); // force data out + + if (unlikely(is_dummy)) { + m_p_socket_stats->counters.n_tx_dummy++; + } else if (total_tx) { + m_p_socket_stats->counters.n_tx_sent_byte_count += total_tx; + m_p_socket_stats->counters.n_tx_sent_pkt_count++; + m_p_socket_stats->n_tx_ready_byte_count += total_tx; + } + + /* Each send call with MSG_ZEROCOPY that successfully sends + * data increments the counter. + * The counter is not incremented on failure or if called with length zero. + */ + if (is_send_zerocopy && (total_tx > 0)) { + if (m_last_zcdesc->tx.zc.id != (uint32_t)atomic_read(&m_zckey)) { + /* si_tcp_logerr("Invalid tx zcopy operation"); */ + } else { + atomic_fetch_and_inc(&m_zckey); + } + } + + unlock_tcp_con(); + + /* Restore errno on function entry in case success */ + errno = errno_tmp; + + return total_tx; +} + +ssize_t sockinfo_tcp::tcp_tx_handle_errno_and_unlock(int error_number) +{ + errno = error_number; + + // nothing send nb mode or got some other error + if (errno == EAGAIN) { + m_p_socket_stats->counters.n_tx_eagain++; + } else { + m_p_socket_stats->counters.n_tx_errors++; + } + unlock_tcp_con(); + return -1; +} + +ssize_t sockinfo_tcp::tcp_tx_handle_partial_send_and_unlock(ssize_t total_tx, int errno_to_report, + bool is_dummy, bool is_send_zerocopy, + int errno_to_restore) +{ + if (total_tx > 0) { + return tcp_tx_handle_done_and_unlock(total_tx, errno_to_restore, is_dummy, + is_send_zerocopy); + } + si_tcp_logdbg("Returning with: %d", errno_to_report); + return tcp_tx_handle_errno_and_unlock(errno_to_report); +} + +bool sockinfo_tcp::is_connected_and_ready_to_send() +{ + int poll_count = 0; + /* TODO should we add !g_b_exit here? 
*/ + while (unlikely(!is_rts())) { + if (m_conn_state == TCP_CONN_TIMEOUT) { + si_tcp_logdbg("TX timed out"); + errno = ETIMEDOUT; + } else if (m_conn_state == TCP_CONN_CONNECTING) { + si_tcp_logdbg("TX while async-connect on socket go to poll"); + rx_wait_helper(poll_count, false); + if (m_conn_state == TCP_CONN_CONNECTED) { + continue; + } + si_tcp_logdbg("TX while async-connect on socket return EAGAIN"); + errno = EAGAIN; + } else if (m_conn_state == TCP_CONN_RESETED) { + si_tcp_logdbg("TX on reseted socket"); + errno = ECONNRESET; + } else if (m_conn_state == TCP_CONN_ERROR) { + si_tcp_logdbg("TX on connection failed socket"); + errno = ECONNREFUSED; + } else { + si_tcp_logdbg("TX on disconnected socket"); + errno = EPIPE; + } + return false; + } + return true; +} diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 10f0288c9..315392543 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -395,6 +395,13 @@ class sockinfo_tcp : public sockinfo, public timer_handler { // int rx_wait(int &poll_count, bool blocking = true); static err_t ack_recvd_lwip_cb(void *arg, struct tcp_pcb *tpcb, u16_t space); + ssize_t tcp_tx_handle_done_and_unlock(ssize_t total_tx, int errno_tmp, bool is_dummy, + bool is_send_zerocopy); + ssize_t tcp_tx_handle_errno_and_unlock(int error_number); + ssize_t tcp_tx_handle_partial_send_and_unlock(ssize_t total_tx, int errno_to_report, + bool is_dummy, bool is_send_zerocopy, + int errno_to_restore); + ssize_t tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg); inline err_t handle_fin(struct tcp_pcb *pcb, err_t err); inline void handle_rx_lwip_cb_error(pbuf *p); inline void rx_lwip_cb_error(pbuf *p); @@ -510,6 +517,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { void process_reuse_ctl_packets(); void process_rx_ctl_packets(); static void put_agent_msg(void *arg); + bool is_connected_and_ready_to_send(); public: static const int CONNECT_DEFAULT_TIMEOUT_MS = 10000; From ff55e60892cd4aaf2d91d124740d96c51b4ca890 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Wed, 15 Nov 2023 09:54:59 +0200 Subject: [PATCH 054/169] issue: 3668182 Refactor LwIP + sockinfo_tcp + dst_entry_tcp - Remove oversize from pbuf_prealloc_express - Extract tcp_tx_handle_sndbuf_unavailable method - Promote pbuf.len checks to u32_t - Combine express_mkey and mkey into mkey - Remove pbuf_clen Signed-off-by: Alex Briskin --- src/core/lwip/pbuf.c | 47 +++++------ src/core/lwip/pbuf.h | 6 +- src/core/lwip/tcp.c | 2 +- src/core/lwip/tcp_impl.h | 4 +- src/core/lwip/tcp_in.c | 16 ++-- src/core/lwip/tcp_out.c | 135 +++++++++++++++++-------------- src/core/proto/dst_entry_tcp.cpp | 2 +- src/core/sock/sockinfo_tcp.cpp | 111 +++++++++++++------------ src/core/sock/sockinfo_tcp.h | 2 + 9 files changed, 166 insertions(+), 159 deletions(-) diff --git a/src/core/lwip/pbuf.c b/src/core/lwip/pbuf.c index de6152816..e27929061 100644 --- a/src/core/lwip/pbuf.c +++ b/src/core/lwip/pbuf.c @@ -162,16 +162,26 @@ u8_t pbuf_header(struct pbuf *p, s32_t header_size_increment) return 1; } - /* Check that we aren't going to move off the end of the pbuf */ - if (header_size_increment < 0 && (-header_size_increment) > (s32_t)p->len) { - return 1; - } + if (header_size_increment >= 0) { + u32_t header_increment = (u32_t)header_size_increment; + /* set new payload pointer */ + p->payload = (u8_t *)p->payload - header_increment; + /* modify pbuf length fields */ + p->len += header_increment; + p->tot_len += header_increment; + } else { + u32_t header_decrement = 
(u32_t)(-header_size_increment); + /* Check that we aren't going to move off the end of the pbuf */ + if (header_decrement > p->len) { + return 1; + } - /* set new payload pointer */ - p->payload = (u8_t *)p->payload - header_size_increment; - /* modify pbuf length fields */ - p->len += header_size_increment; - p->tot_len += header_size_increment; + /* set new payload pointer */ + p->payload = (u8_t *)p->payload + header_decrement; + /* modify pbuf length fields */ + p->len -= header_decrement; + p->tot_len -= header_decrement; + } LWIP_DEBUGF(PBUF_DEBUG | LWIP_DBG_TRACE, ("pbuf_header: new %p (%" S32_F ")\n", (void *)p->payload, header_size_increment)); @@ -246,25 +256,6 @@ u8_t pbuf_free(struct pbuf *p) return count; } -/** - * Count number of pbufs in a chain - * - * @param p first pbuf of chain - * @return the number of pbufs in a chain - */ - -u8_t pbuf_clen(struct pbuf *p) -{ - u8_t len; - - len = 0; - while (p != NULL) { - ++len; - p = p->next; - } - return len; -} - /** * Increment the reference count of the pbuf. * diff --git a/src/core/lwip/pbuf.h b/src/core/lwip/pbuf.h index 708800b11..99672d043 100644 --- a/src/core/lwip/pbuf.h +++ b/src/core/lwip/pbuf.h @@ -64,12 +64,11 @@ enum { typedef struct { int attr; - u32_t express_mkey; + u32_t mkey; union { void *map; void *mdesc; int fd; - u32_t mkey; void *opaque; }; } pbuf_desc; @@ -82,7 +81,7 @@ struct pbuf { void *payload; /** length of this buffer */ - u16_t len; + u32_t len; u8_t gro; @@ -129,7 +128,6 @@ void pbuf_realloc(struct pbuf *p, u32_t size); u8_t pbuf_header(struct pbuf *p, s32_t header_size); void pbuf_ref(struct pbuf *p); u8_t pbuf_free(struct pbuf *p); -u8_t pbuf_clen(struct pbuf *p); void pbuf_cat(struct pbuf *head, struct pbuf *tail); void pbuf_split_64k(struct pbuf *p, struct pbuf **rest); // windows scale needs large pbuf diff --git a/src/core/lwip/tcp.c b/src/core/lwip/tcp.c index 09c121c8b..d90d86d18 100644 --- a/src/core/lwip/tcp.c +++ b/src/core/lwip/tcp.c @@ -1063,7 +1063,7 @@ void tcp_pcb_recycle(struct tcp_pcb *pcb) } } -struct pbuf *tcp_tx_pbuf_alloc(struct tcp_pcb *pcb, u16_t length, pbuf_type type, pbuf_desc *desc, +struct pbuf *tcp_tx_pbuf_alloc(struct tcp_pcb *pcb, u32_t length, pbuf_type type, pbuf_desc *desc, struct pbuf *p_buff) { struct pbuf *p; diff --git a/src/core/lwip/tcp_impl.h b/src/core/lwip/tcp_impl.h index f76f0c22c..9f40c5bfd 100644 --- a/src/core/lwip/tcp_impl.h +++ b/src/core/lwip/tcp_impl.h @@ -57,7 +57,7 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb); /* Used within the TCP code only: */ struct tcp_pcb *tcp_alloc(u8_t prio); -struct pbuf *tcp_tx_pbuf_alloc(struct tcp_pcb *pcb, u16_t length, pbuf_type type, pbuf_desc *desc, +struct pbuf *tcp_tx_pbuf_alloc(struct tcp_pcb *pcb, u32_t length, pbuf_type type, pbuf_desc *desc, struct pbuf *p_buff); void tcp_tx_preallocted_buffers_free(struct tcp_pcb *pcb); void tcp_tx_pbuf_free(struct tcp_pcb *pcb, struct pbuf *pbuf); @@ -296,7 +296,7 @@ struct tcp_seg { #define TF_SEG_OPTS_ZEROCOPY (u8_t) TCP_WRITE_ZEROCOPY /* Use zerocopy send mode */ u8_t tcp_flags; /* Cached TCP flags for outgoing segments */ - u8_t bufs; /* To avoid pbuf_clen() */ + u8_t bufs; /* L2+L3+TCP header for zerocopy segments, it must have enough room for options This should have enough space for L2 (ETH+vLAN), L3 (IPv4/6), L4 (TCP) diff --git a/src/core/lwip/tcp_in.c b/src/core/lwip/tcp_in.c index e93d469b5..b60662a11 100644 --- a/src/core/lwip/tcp_in.c +++ b/src/core/lwip/tcp_in.c @@ -941,6 +941,7 @@ static u32_t tcp_shrink_segment(struct tcp_pcb *pcb, 
struct tcp_seg *seg, u32_t ("tcp_shrink: count: %-5d unsent %s\n", count, _dump_seg(pcb->unsent))); #endif /* TCP_TSO_DEBUG */ + seg->bufs -= count; return count; } @@ -987,6 +988,7 @@ static u32_t tcp_shrink_zc_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32 } seg->tcphdr->seqno = htonl(seg->seqno); + seg->bufs -= count; return count; } @@ -1009,7 +1011,6 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) struct tcp_seg *prev, *cseg; #endif /* TCP_QUEUE_OOSEQ */ struct pbuf *p; - s32_t off; s16_t m; u32_t right_wnd_edge; u32_t new_tot_len; @@ -1017,6 +1018,9 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) s8_t persist = 0; if (in_data->flags & TCP_ACK) { + if (pcb->unacked) { + __builtin_prefetch(pcb->unacked->p); + } right_wnd_edge = pcb->snd_wnd + pcb->snd_wl2; /* Update window. */ @@ -1206,14 +1210,12 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) pcb->unacked = pcb->unacked->next; LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_receive: queuelen %" U32_F " ... ", (u32_t)pcb->snd_queuelen)); - LWIP_ASSERT("pcb->snd_queuelen >= pbuf_clen(next->p)", - (pcb->snd_queuelen >= pbuf_clen(next->p))); /* Prevent ACK for FIN to generate a sent event */ if ((pcb->acked != 0) && ((next->tcp_flags & TCP_FIN) != 0)) { pcb->acked--; } - pcb->snd_queuelen -= pbuf_clen(next->p); + pcb->snd_queuelen -= next->bufs; tcp_tx_seg_free(pcb, next); LWIP_DEBUGF(TCP_QLEN_DEBUG, ("%" U32_F " (after freeing unacked)\n", (u32_t)pcb->snd_queuelen)); @@ -1259,13 +1261,11 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) pcb->unsent = pcb->unsent->next; LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_receive: queuelen %" U32_F " ... ", (u32_t)pcb->snd_queuelen)); - LWIP_ASSERT("pcb->snd_queuelen >= pbuf_clen(next->p)", - (pcb->snd_queuelen >= pbuf_clen(next->p))); /* Prevent ACK for FIN to generate a sent event */ if ((pcb->acked != 0) && ((next->tcp_flags & TCP_FIN) != 0)) { pcb->acked--; } - pcb->snd_queuelen -= pbuf_clen(next->p); + pcb->snd_queuelen -= next->bufs; tcp_tx_seg_free(pcb, next); LWIP_DEBUGF(TCP_QLEN_DEBUG, ("%" U16_F " (after freeing unsent)\n", (u32_t)pcb->snd_queuelen)); @@ -1377,7 +1377,7 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) adjust the ->data pointer in the seg and the segment length.*/ - off = pcb->rcv_nxt - in_data->seqno; + u32_t off = pcb->rcv_nxt - in_data->seqno; p = in_data->inseg.p; LWIP_ASSERT("inseg.p != NULL", in_data->inseg.p); if (in_data->inseg.p->len < off) { diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 7094215e7..09c6ff59c 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -196,10 +196,10 @@ static struct tcp_seg *tcp_create_segment(struct tcp_pcb *pcb, struct pbuf *p, u seg->flags = optflags; seg->tcp_flags = flags; - seg->bufs = 0; seg->p = p; seg->len = p->tot_len - optlen; seg->seqno = seqno; + seg->bufs = 1; if (seg->flags & TF_SEG_OPTS_ZEROCOPY) { /* XXX Don't hardcode size/offset */ @@ -230,29 +230,25 @@ static struct tcp_seg *tcp_create_segment(struct tcp_pcb *pcb, struct pbuf *p, u } /** - * Allocate a PBUF_RAM pbuf, perhaps with oversize space at the end. + * Allocate a PBUF_RAM pbuf * * This function is like pbuf_alloc(layer, length, PBUF_RAM) except * there may be extra bytes available at the end. * * @param length size of the pbuf's payload. - * @param oversize pointer to a u16_t that will receive the number of usable tail bytes. - * @param pcb The TCP connection that willo enqueue the pbuf. 
+ * @param pcb The TCP connection that will enqueue the pbuf. * @param */ -static struct pbuf *tcp_pbuf_prealloc_express(u16_t length, u16_t *oversize, struct tcp_pcb *pcb, - pbuf_type type, pbuf_desc *desc, struct pbuf *p_buff) +static struct pbuf *tcp_pbuf_prealloc_express(u32_t length, struct tcp_pcb *pcb, pbuf_type type, + pbuf_desc *desc, struct pbuf *p_buff) { struct pbuf *p; p = tcp_tx_pbuf_alloc(pcb, 0, type, desc, p_buff); - if (p == NULL) { - return NULL; + if (p != NULL) { + LWIP_ASSERT("need unchained pbuf", p->next == NULL); + p->len = p->tot_len = length; } - LWIP_ASSERT("need unchained pbuf", p->next == NULL); - *oversize = p->len - length; - /* trim p->len to the currently used size */ - p->len = p->tot_len = length; return p; } @@ -270,12 +266,12 @@ static struct pbuf *tcp_pbuf_prealloc_express(u16_t length, u16_t *oversize, str * @param first_seg true when this pbuf will be used in the first enqueued segment. * @param */ -static struct pbuf *tcp_pbuf_prealloc(u16_t length, u16_t max_length, u16_t *oversize, +static struct pbuf *tcp_pbuf_prealloc(u32_t length, u32_t max_length, u16_t *oversize, struct tcp_pcb *pcb, pbuf_type type, u8_t tcp_write_flag_more, u8_t first_seg, pbuf_desc *desc, struct pbuf *p_buff) { struct pbuf *p; - u16_t alloc = length; + u32_t alloc = length; if (length < max_length) { /* Should we allocate an oversized pbuf, or just the minimum @@ -381,14 +377,17 @@ static inline u16_t tcp_xmit_size_goal(struct tcp_pcb *pcb, int use_max) * To prompt the system to send data now, call tcp_output() after * calling tcp_write(). * + * The function will copy the data from arg to a new pbuf. + * * @param pcb Protocol control block for the TCP connection to enqueue data for. * @param arg Pointer to the data to be enqueued for sending. * @param len Data length in bytes - * @param apiflags combination of following flags : + * @param apiflags combination of following flags: * - TCP_WRITE_FLAG_COPY (0x01) data will be copied into memory belonging to the stack * - TCP_WRITE_FLAG_MORE (0x02) for TCP connection, PSH flag will be set on last segment sent * - TCP_WRITE_DUMMY (0x10) indicates if the packet is a dummy packet * - TCP_WRITE_FILE (0x40) data should be taken from file + * @param desc Additional metadata that allows later to check the data mkey/lkey. * @return ERR_OK if enqueued, another err_t on error */ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, pbuf_desc *desc) @@ -404,9 +403,8 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, u16_t oversize_used = 0; #endif /* TCP_OVERSIZE */ err_t err; - u16_t mss_local = 0; - u16_t mss_local_minus_opts; - int tot_p = 0; + u32_t mss_local = 0; + u32_t mss_local_minus_opts; const int piov_max_size = 512; const int piov_max_len = 65536; struct iovec piov[piov_max_size]; @@ -473,7 +471,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, */ if (pcb->last_unsent != NULL) { - u16_t space; + u32_t space; /* Usable space at the end of the last unsent segment */ u16_t unsent_optlen = LWIP_TCP_OPT_LENGTH(pcb->last_unsent->flags); if ((pcb->last_unsent->p->type == type) && @@ -488,7 +486,6 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, #endif /* TCP_OVERSIZE */ } seg = pcb->last_unsent; - tot_p = pbuf_clen(seg->p); /* * Phase 1: Copy data directly into an oversized pbuf. @@ -526,9 +523,9 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, * the end. 
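* In other words, seglen below is min(space, len - pos): the chained
* pbuf never exceeds the room left in the last unsent segment. E.g.
* space == 1000 with len - pos == 4000 gives seglen == 1000, and the
* rest is queued when new segments are created for the remaining data.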
*/ if (!is_file && (pos < len) && (space > 0) && (pcb->last_unsent->len > 0) && - (tot_p < (int)pcb->tso.max_send_sge)) { + (pcb->last_unsent->bufs < pcb->tso.max_send_sge)) { - u16_t seglen = space < len - pos ? space : len - pos; + u32_t seglen = space < len - pos ? space : len - pos; if ((concat_p = tcp_pbuf_prealloc(seglen, space, &oversize, pcb, type, TCP_WRITE_FLAG_MORE, 1, desc, NULL)) == NULL) { LWIP_DEBUGF( @@ -543,7 +540,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, memcpy(concat_p->payload, (u8_t *)arg + pos, seglen); pos += seglen; - queuelen += pbuf_clen(concat_p); + queuelen++; } } else { #if TCP_OVERSIZE @@ -601,7 +598,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, memcpy((char *)p->payload + optlen, (u8_t *)arg + pos, seglen); } - queuelen += pbuf_clen(p); + queuelen++; /* Now that there are more segments queued, we check again if the * length of the queue exceeds the configured maximum or @@ -676,6 +673,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, (pcb->last_unsent != NULL)); pbuf_cat(pcb->last_unsent->p, concat_p); pcb->last_unsent->len += concat_p->tot_len; + pcb->last_unsent->bufs++; } /* @@ -727,15 +725,31 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, return ERR_MEM; } +/** + * Write data for sending (but does not send it immediately). + * + * It waits in the expectation of more data being sent soon (as + * it can send them more efficiently by combining them together). + * To prompt the system to send data now, call tcp_output() after + * calling tcp_write_express(). + * + * The function will zero-copy the data into the payload, i.e. the data pointer, instead of the + * data, will be copied. + * + * @param pcb Protocol control block for the TCP connection to enqueue data for. + * @param arg Pointer to the data to be enqueued for sending. + * @param len Data length in bytes + * @param desc Additional metadata that allows later to check the data mkey/lkey. + * @return ERR_OK if enqueued, another err_t on error + */ err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *desc) { struct pbuf *p; struct tcp_seg *seg = NULL, *prev_seg = NULL, *queue = NULL; u32_t pos = 0; /* position in 'arg' data */ u8_t optflags = TF_SEG_OPTS_ZEROCOPY; - u16_t oversize = 0; - const u16_t mss_local = lwip_zc_tx_size; - u16_t seglen; + const u32_t mss_local = lwip_zc_tx_size; + u32_t seglen; u16_t queuelen = 0; if (len < pcb->mss) { @@ -744,10 +758,7 @@ err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_de } /* - * TCP segmentation is done in two phases with increasing complexity: - * - * 1. Chain a new pbuf to the end of pcb->unsent. - * 2. Create new segments. + * Chain a new pbuf to the end of pcb->unsent if there is enough space. * * We may run out of memory at any point. In that case we must * return ERR_MEM and not change anything in pcb. Therefore, all @@ -755,49 +766,39 @@ err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_de * of the function. Some pcb fields are maintained in local copies: * * queuelen = pcb->snd_queuelen - * oversize = pcb->unsent_oversize - * - * These variables are set consistently by the phases: * + * These variables are set consistently by the phases. * seg points to the last segment tampered with. - * * pos records progress as data is segmented. */ - - /* Find the tail of the unsent queue. 
*/ - if (pcb->unsent != NULL && (pcb->last_unsent->flags & TF_SEG_OPTS_ZEROCOPY)) { - u16_t space = mss_local - pcb->last_unsent->len; + if (pcb->unsent != NULL) { seg = pcb->last_unsent; + u32_t space = LWIP_MAX(mss_local, pcb->tso.max_payload_sz) - seg->len; - if (space > 0 && (space >= len || len > mss_local) && - (pcb->last_unsent->bufs < pcb->tso.max_send_sge)) { + if (space > 0 && (seg->flags & TF_SEG_OPTS_ZEROCOPY) && seg->bufs < pcb->tso.max_send_sge) { seglen = space < len ? space : len; - if ((p = tcp_pbuf_prealloc_express(seglen, &oversize, pcb, PBUF_ZEROCOPY, desc, - NULL)) == NULL) { + if ((p = tcp_pbuf_prealloc_express(seglen, pcb, PBUF_ZEROCOPY, desc, NULL)) == NULL) { goto memerr; } p->payload = (u8_t *)arg; - pbuf_cat(pcb->last_unsent->p, p); - pcb->last_unsent->len += p->tot_len; - pcb->last_unsent->bufs += 1U; + pbuf_cat(seg->p, p); + seg->len += p->tot_len; + seg->bufs++; pos += seglen; - queuelen += 1U; + queuelen++; } - } else { - pcb->last_unsent = NULL; } while (pos < len) { u32_t left = len - pos; seglen = left > mss_local ? mss_local : left; - if ((p = tcp_pbuf_prealloc_express(seglen, &oversize, pcb, PBUF_ZEROCOPY, desc, NULL)) == - NULL) { + if ((p = tcp_pbuf_prealloc_express(seglen, pcb, PBUF_ZEROCOPY, desc, NULL)) == NULL) { goto memerr; } p->payload = (u8_t *)arg + pos; - queuelen += 1; + queuelen++; if ((seg = tcp_create_segment(pcb, p, 0, pcb->snd_lbb + pos, optflags)) == NULL) { tcp_tx_pbuf_free(pcb, p); @@ -809,8 +810,8 @@ err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_de } else { prev_seg->next = seg; } - prev_seg = seg; + prev_seg = seg; pos += seglen; } @@ -957,7 +958,7 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) } /* update number of segments on the queues */ - pcb->snd_queuelen += pbuf_clen(seg->p); + pcb->snd_queuelen += seg->bufs; LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue_flags: %" S16_F " (after enqueued)\n", pcb->snd_queuelen)); if (pcb->snd_queuelen != 0) { @@ -1057,7 +1058,7 @@ static void tcp_tso_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) u32_t max_payload_sz = LWIP_MIN(pcb->tso.max_payload_sz, (wnd - (seg->seqno - pcb->lastack))); u32_t tot_len = 0; u8_t flags = seg->flags; - int tot_p = 0; + u8_t tot_p = 0; /* Ignore retransmitted segments and special segments */ @@ -1075,8 +1076,8 @@ static void tcp_tso_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) goto err; } - tot_p += pbuf_clen(cur_seg->p); - if (tot_p > (int)pcb->max_send_sge) { + tot_p += cur_seg->bufs; + if (tot_p > pcb->max_send_sge) { goto err; } @@ -1089,6 +1090,7 @@ static void tcp_tso_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) /* Update the original segment with current segment details */ seg->next = cur_seg->next; seg->len += cur_seg->len; + seg->bufs += cur_seg->bufs; /* Update the first pbuf of current segment, unless this is a zerocopy segment */ if (!(cur_seg->flags & TF_SEG_OPTS_ZEROCOPY)) { @@ -1138,7 +1140,7 @@ static struct tcp_seg *tcp_split_one_segment(struct tcp_pcb *pcb, struct tcp_seg struct tcp_seg *result = NULL; struct pbuf *cur_p = NULL; int tcp_hlen_delta; - u16_t max_length = 0; + u32_t max_length = 0; u16_t oversize = 0; pbuf_type type = PBUF_RAM; @@ -1456,12 +1458,14 @@ void tcp_split_rexmit(struct tcp_pcb *pcb, struct tcp_seg *seg) /* New segment update */ new_seg->next = cur_seg->next; new_seg->flags = cur_seg->flags; + new_seg->bufs = cur_seg->bufs - 1; /* Original segment update */ cur_seg->next = new_seg; cur_seg->len = cur_seg->p->len - tcp_hlen_delta - 
optlen; cur_seg->p->tot_len = cur_seg->p->len; cur_seg->p->next = NULL; + cur_seg->bufs = 1; if (pcb->last_unsent == cur_seg) { /* We have split the last unsent segment, update last_unsent */ @@ -1554,11 +1558,13 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) tcp_tx_pbuf_free(pcb, p); return; } + newseg->bufs = seg->bufs; /* Update original buffer */ seg->p->next = NULL; seg->p->len = seg->p->len - lentoqueue; seg->p->tot_len = seg->p->len; + seg->bufs = 1; /* New segment update */ newseg->next = seg->next; @@ -1590,6 +1596,7 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) struct pbuf *pnewtail = seg->p; struct pbuf *ptmp = seg->p; u32_t headchainlen = seg->p->len; + oversize = 1; // count bufs in the left seg while ((headchainlen + pnewhead->len - (tcp_hlen_delta + optlen)) <= lentosend) { if (pnewtail->ref > 1) { @@ -1599,6 +1606,7 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) headchainlen += pnewhead->len; pnewtail = pnewhead; pnewhead = pnewhead->next; + oversize++; if (NULL == pnewhead) { LWIP_ASSERT("tcp_split_segment: We should not be here", 0); @@ -1616,6 +1624,9 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) return; } + newseg->bufs = seg->bufs - oversize; + seg->bufs = oversize; + /* Update new tail */ pnewtail->next = NULL; @@ -1861,7 +1872,7 @@ err_t tcp_output(struct tcp_pcb *pcb) if (LWIP_IS_DUMMY_SEGMENT(seg)) { pcb->snd_lbb -= seg->len; pcb->snd_buf += seg->len; - pcb->snd_queuelen -= pbuf_clen(seg->p); + pcb->snd_queuelen -= seg->bufs; tcp_tx_seg_free(pcb, seg); } else { /* unacked list is empty? */ @@ -1968,14 +1979,14 @@ static err_t tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb) if (seg->flags & TF_SEG_OPTS_MSS) { /* coverity[result_independent_of_operands] */ TCP_BUILD_MSS_OPTION(*opts, pcb->advtsd_mss); - opts += 1; // Move to the next line (meaning next 32 bit) as this option is 4 bytes long + opts++; // Move to the next line (meaning next 32 bit) as this option is 4 bytes long } /* If RCV_SCALE is set then prepare segment for window scaling option */ if (seg->flags & TF_SEG_OPTS_WNDSCALE) { TCP_BUILD_WNDSCALE_OPTION(*opts, rcv_wnd_scale); - opts += 1; // Move to the next line (meaning next 32 bit) as this option is 3 bytes long + - // we added 1 byte NOOP padding => total 4 bytes + opts++; // Move to the next line (meaning next 32 bit) as this option is 3 bytes long + + // we added 1 byte NOOP padding => total 4 bytes } #if LWIP_TCP_TIMESTAMPS diff --git a/src/core/proto/dst_entry_tcp.cpp b/src/core/proto/dst_entry_tcp.cpp index b921070d0..b031b8cdf 100644 --- a/src/core/proto/dst_entry_tcp.cpp +++ b/src/core/proto/dst_entry_tcp.cpp @@ -223,7 +223,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ m_sge[i].length = p_tcp_iov[i].iovec.iov_len; if (is_zerocopy) { if (PBUF_DESC_EXPRESS == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr) { - m_sge[i].lkey = p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.express_mkey; + m_sge[i].lkey = p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.mkey; } else if (PBUF_DESC_MKEY == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr) { /* PBUF_DESC_MKEY - value is provided by user */ m_sge[i].lkey = p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.mkey; diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 97b6c1eb7..97b36f6df 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -749,22 +749,17 @@ bool 
sockinfo_tcp::prepare_dst_to_send(bool is_accepted_socket /* = false */) bool ret_val = false; if (m_p_connected_dst_entry) { - if (is_accepted_socket) { - ret_val = m_p_connected_dst_entry->prepare_to_send(m_so_ratelimit, true, false); - } else { - ret_val = m_p_connected_dst_entry->prepare_to_send(m_so_ratelimit, false, true); - } - + bool skip_rules = is_accepted_socket, is_connect = !is_accepted_socket; + ret_val = m_p_connected_dst_entry->prepare_to_send(m_so_ratelimit, skip_rules, is_connect); if (ret_val) { /* dst_entry has resolved tx ring, * so it is a time to provide TSO information to PCB */ - m_pcb.tso.max_buf_sz = - std::min(safe_mce_sys().tx_buf_size, - m_p_connected_dst_entry->get_ring()->get_max_payload_sz()); - m_pcb.tso.max_payload_sz = m_p_connected_dst_entry->get_ring()->get_max_payload_sz(); - m_pcb.tso.max_header_sz = m_p_connected_dst_entry->get_ring()->get_max_header_sz(); - m_pcb.tso.max_send_sge = m_p_connected_dst_entry->get_ring()->get_max_send_sge(); + auto *ring = m_p_connected_dst_entry->get_ring(); + m_pcb.tso.max_buf_sz = std::min(safe_mce_sys().tx_buf_size, ring->get_max_payload_sz()); + m_pcb.tso.max_payload_sz = ring->get_max_payload_sz(); + m_pcb.tso.max_header_sz = ring->get_max_header_sz(); + m_pcb.tso.max_send_sge = ring->get_max_send_sge(); /* reserve one slot for network headers of zerocopy segments */ m_pcb.max_send_sge = m_pcb.tso.max_send_sge - 1; safe_mce_sys().zc_tx_size = @@ -776,9 +771,9 @@ bool sockinfo_tcp::prepare_dst_to_send(bool is_accepted_socket /* = false */) unsigned sockinfo_tcp::tx_wait(int &err, bool blocking) { - auto sz = sndbuf_available(); + unsigned sz = sndbuf_available(); int poll_count = 0; - si_tcp_logfunc("sz = %d rx_count=%d", sz, m_n_rx_pkt_ready_list_count); + si_tcp_logfunc("sz = %u rx_count=%d", sz, m_n_rx_pkt_ready_list_count); err = 0; while (is_rts() && (sz = sndbuf_available()) == 0) { err = rx_wait(poll_count, blocking); @@ -981,43 +976,11 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) } unsigned pos = 0; while (pos < p_iov[i].iov_len) { - auto tx_size = sndbuf_available(); + unsigned tx_size = sndbuf_available(); - /* Process a case when space is not available at the sending socket - * to hold the message to be transmitted - * Nonblocking socket: - * - no data is buffered: return (-1) and EAGAIN - * - some data is buffered: return number of bytes ready to be sent - */ if (tx_size == 0) { - if (unlikely(!is_rts())) { - si_tcp_logdbg("TX on disconnected socket"); - return tcp_tx_handle_errno_and_unlock(ECONNRESET); - } - // force out TCP data before going on wait() - tcp_output(&m_pcb); - - // non blocking socket should return in order not to tx_wait() - if (total_tx > 0) { - m_tx_consecutive_eagain_count = 0; - return tcp_tx_handle_done_and_unlock(total_tx, errno_tmp, is_dummy, - is_send_zerocopy); - } else { - m_tx_consecutive_eagain_count++; - if (m_tx_consecutive_eagain_count >= TX_CONSECUTIVE_EAGAIN_THREASHOLD) { - if (safe_mce_sys().tcp_ctl_thread == - option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { - // Slow path. We must attempt TCP timers here for applications that - // do not check for EV_OUT. 
- g_thread_local_event_handler.do_tasks(); - } - // in case of zero sndbuf and non-blocking just try once polling CQ for - // ACK - rx_wait(poll_count, false); - m_tx_consecutive_eagain_count = 0; - } - return tcp_tx_handle_errno_and_unlock(EAGAIN); - } + return tcp_tx_handle_sndbuf_unavailable(total_tx, is_dummy, is_send_zerocopy, + errno_tmp); } tx_size = std::min(p_iov[i].iov_len - pos, tx_size); @@ -5339,7 +5302,7 @@ void sockinfo_tcp::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) tcp_conn_state_e conn_state; u32_t last_unsent_seqno = 0, last_unacked_seqno = 0, first_unsent_seqno = 0, first_unacked_seqno = 0; - u16_t last_unsent_len = 0, last_unacked_len = 0, first_unsent_len = 0, first_unacked_len = 0; + u32_t last_unsent_len = 0, last_unacked_len = 0, first_unsent_len = 0, first_unacked_len = 0; int rcvbuff_max, rcvbuff_current, rcvbuff_non_tcp_recved, rx_pkt_ready_list_size, rx_ctl_packets_list_size, rx_ctl_reuse_list_size; @@ -6114,7 +6077,7 @@ int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint default: return -1; }; - mdesc.express_mkey = mkey; + mdesc.mkey = mkey; mdesc.opaque = opaque_op; int bytes_written = 0; @@ -6124,7 +6087,8 @@ int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint for (unsigned i = 0; i < iov_len; ++i) { err = tcp_write_express(&m_pcb, iov[i].iov_base, iov[i].iov_len, &mdesc); if (err != ERR_OK) { - return -1; + /* The only error in tcp_write_express is a memory error */ + return tcp_tx_handle_errno_and_unlock(ENOMEM); } bytes_written += iov[i].iov_len; } @@ -6132,7 +6096,8 @@ int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint if (!(flags & XLIO_EXPRESS_MSG_MORE)) { err = tcp_output(&m_pcb); if (err != ERR_OK) { - return -1; + /* The error very likely to be recoverable */ + si_tcp_logdbg("tcp_tx_express - tcp_output failed"); } } unlock_tcp_con(); @@ -6229,3 +6194,43 @@ bool sockinfo_tcp::is_connected_and_ready_to_send() } return true; } + +/* Process a case when space is not available at the sending socket + * to hold the message to be transmitted + * Nonblocking socket: + * - no data is buffered: return (-1) and EAGAIN + * - some data is buffered: return number of bytes ready to be sent + */ +ssize_t sockinfo_tcp::tcp_tx_handle_sndbuf_unavailable(ssize_t total_tx, bool is_dummy, + bool is_send_zerocopy, int errno_to_restore) +{ + if (unlikely(!is_rts())) { + si_tcp_logdbg("TX on disconnected socket"); + return tcp_tx_handle_errno_and_unlock(ECONNRESET); + } + // force out TCP data before going on wait() + tcp_output(&m_pcb); + + // non blocking socket should return in order not to tx_wait() + if (total_tx > 0) { + m_tx_consecutive_eagain_count = 0; + return tcp_tx_handle_done_and_unlock(total_tx, errno_to_restore, is_dummy, + is_send_zerocopy); + } else { + m_tx_consecutive_eagain_count++; + if (m_tx_consecutive_eagain_count >= TX_CONSECUTIVE_EAGAIN_THREASHOLD) { + if (safe_mce_sys().tcp_ctl_thread == + option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { + // Slow path. We must attempt TCP timers here for applications that + // do not check for EV_OUT. 
+ g_thread_local_event_handler.do_tasks(); + } + // in case of zero sndbuf and non-blocking just try once polling CQ for + // ACK + int poll_count = 0; + rx_wait(poll_count, false); + m_tx_consecutive_eagain_count = 0; + } + return tcp_tx_handle_errno_and_unlock(EAGAIN); + } +} diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 315392543..fc07d04f3 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -401,6 +401,8 @@ class sockinfo_tcp : public sockinfo, public timer_handler { ssize_t tcp_tx_handle_partial_send_and_unlock(ssize_t total_tx, int errno_to_report, bool is_dummy, bool is_send_zerocopy, int errno_to_restore); + ssize_t tcp_tx_handle_sndbuf_unavailable(ssize_t total_tx, bool is_dummy, bool is_send_zerocopy, + int errno_to_restore); ssize_t tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg); inline err_t handle_fin(struct tcp_pcb *pcb, err_t err); inline void handle_rx_lwip_cb_error(pbuf *p); From 1e4e73035a82d88b14e50570a1e5ac85446dacef Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Tue, 23 Jan 2024 18:22:06 +0200 Subject: [PATCH 055/169] issue: 3668182 Fix PR comments Signed-off-by: Alex Briskin --- src/core/lwip/tcp_impl.h | 2 +- src/core/proto/dst_entry_tcp.cpp | 20 +++-- src/core/sock/sockinfo_tcp.cpp | 148 +++++++++++++++---------------- src/core/sock/sockinfo_tcp.h | 2 +- 4 files changed, 84 insertions(+), 88 deletions(-) diff --git a/src/core/lwip/tcp_impl.h b/src/core/lwip/tcp_impl.h index 9f40c5bfd..4a29d287e 100644 --- a/src/core/lwip/tcp_impl.h +++ b/src/core/lwip/tcp_impl.h @@ -296,7 +296,7 @@ struct tcp_seg { #define TF_SEG_OPTS_ZEROCOPY (u8_t) TCP_WRITE_ZEROCOPY /* Use zerocopy send mode */ u8_t tcp_flags; /* Cached TCP flags for outgoing segments */ - u8_t bufs; + u8_t bufs; /* Number of buffers int the pbuf linked list */ /* L2+L3+TCP header for zerocopy segments, it must have enough room for options This should have enough space for L2 (ETH+vLAN), L3 (IPv4/6), L4 (TCP) diff --git a/src/core/proto/dst_entry_tcp.cpp b/src/core/proto/dst_entry_tcp.cpp index b031b8cdf..8f01fea41 100644 --- a/src/core/proto/dst_entry_tcp.cpp +++ b/src/core/proto/dst_entry_tcp.cpp @@ -222,16 +222,18 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ m_sge[i].addr = (uintptr_t)p_tcp_iov[i].iovec.iov_base; m_sge[i].length = p_tcp_iov[i].iovec.iov_len; if (is_zerocopy) { - if (PBUF_DESC_EXPRESS == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr) { - m_sge[i].lkey = p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.mkey; - } else if (PBUF_DESC_MKEY == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr) { + auto *p_desc = p_tcp_iov[i].p_desc; + auto &pbuf_descriptor = p_desc->lwip_pbuf.pbuf.desc; + if (PBUF_DESC_EXPRESS == pbuf_descriptor.attr) { + m_sge[i].lkey = pbuf_descriptor.mkey; + } else if (PBUF_DESC_MKEY == pbuf_descriptor.attr) { /* PBUF_DESC_MKEY - value is provided by user */ - m_sge[i].lkey = p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.mkey; - } else if (PBUF_DESC_MDESC == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr || - PBUF_DESC_NVME_TX == p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.attr) { - mem_desc *mdesc = (mem_desc *)p_tcp_iov[i].p_desc->lwip_pbuf.pbuf.desc.mdesc; - m_sge[i].lkey = mdesc->get_lkey(p_tcp_iov[i].p_desc, ib_ctx, - (void *)m_sge[i].addr, m_sge[i].length); + m_sge[i].lkey = pbuf_descriptor.mkey; + } else if (PBUF_DESC_MDESC == pbuf_descriptor.attr || + PBUF_DESC_NVME_TX == pbuf_descriptor.attr) { + mem_desc *mdesc = (mem_desc *)pbuf_descriptor.mdesc; + m_sge[i].lkey = + mdesc->get_lkey(p_desc, 
ib_ctx, (void *)m_sge[i].addr, m_sge[i].length); if (m_sge[i].lkey == LKEY_TX_DEFAULT) { m_sge[i].lkey = m_p_ring->get_tx_lkey(m_id); } diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 97b36f6df..38f04fa3f 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -769,12 +769,12 @@ bool sockinfo_tcp::prepare_dst_to_send(bool is_accepted_socket /* = false */) return ret_val; } -unsigned sockinfo_tcp::tx_wait(int &err, bool blocking) +unsigned sockinfo_tcp::tx_wait(bool blocking) { unsigned sz = sndbuf_available(); int poll_count = 0; si_tcp_logfunc("sz = %u rx_count=%d", sz, m_n_rx_pkt_ready_list_count); - err = 0; + int err = 0; while (is_rts() && (sz = sndbuf_available()) == 0) { err = rx_wait(poll_count, blocking); // AlexV:Avoid from going to sleep, for the blocked socket of course, since @@ -795,7 +795,7 @@ unsigned sockinfo_tcp::tx_wait(int &err, bool blocking) poll_count = 0; } } - si_tcp_logfunc("end sz=%d rx_count=%d", sz, m_n_rx_pkt_ready_list_count); + si_tcp_logfunc("end sz=%u rx_count=%d", sz, m_n_rx_pkt_ready_list_count); return sz; } @@ -907,6 +907,16 @@ static inline bool is_invalid_iovec(const iovec *iov, size_t sz_iov) return iov == nullptr || sz_iov == 0; } +/** + * Handles transmission operations on a TCP socket, supporting various user actions such as + * write, send, sendv, sendmsg, and sendfile. This function operates on both blocking and + * non-blocking sockets, providing options for zero-copy send operations. When the socket is + * configured for zero-copy send, it executes a fast-path send for non-blocking operations; + * otherwise, it falls back to the tcp_tx_slow_path function. + * + * @param tx_arg The TCP transmission arguments and parameters. + * @return Returns the number of bytes transmitted, or -1 on error with the errno set. + */ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) { iovec *p_iov = tx_arg.attr.iov; @@ -933,7 +943,6 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) if (unlikely(!is_connected_and_ready_to_send())) { return -1; } - si_tcp_logfunc("tx: iov=%p niovs=%d", p_iov, sz_iov); if (m_sysvar_rx_poll_on_tx_tcp) { @@ -947,7 +956,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) return tcp_tx_slow_path(tx_arg); } - bool is_send_zerocopy = tx_arg.opcode != TX_FILE; + bool is_non_file_zerocopy = tx_arg.opcode != TX_FILE; pd_key_array = (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.map : NULL); @@ -979,12 +988,19 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) unsigned tx_size = sndbuf_available(); if (tx_size == 0) { - return tcp_tx_handle_sndbuf_unavailable(total_tx, is_dummy, is_send_zerocopy, + if (unlikely(!is_rts())) { + si_tcp_logdbg("TX on disconnected socket"); + return tcp_tx_handle_errno_and_unlock(ECONNRESET); + } + // force out TCP data before going on wait() + tcp_output(&m_pcb); + + return tcp_tx_handle_sndbuf_unavailable(total_tx, is_dummy, is_non_file_zerocopy, errno_tmp); } tx_size = std::min(p_iov[i].iov_len - pos, tx_size); - if (is_send_zerocopy) { + if (is_non_file_zerocopy) { /* * For send zerocopy we don't support pbufs which * cross huge page boundaries. 
To avoid forming @@ -1003,7 +1019,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) } if (unlikely(g_b_exit)) { return tcp_tx_handle_partial_send_and_unlock(total_tx, EINTR, is_dummy, - is_send_zerocopy, errno_tmp); + is_non_file_zerocopy, errno_tmp); } err = tcp_write_express(&m_pcb, tx_ptr, tx_size, &tx_arg.priv); @@ -1012,7 +1028,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) si_tcp_logdbg("connection closed: tx'ed = %d", total_tx); shutdown(SHUT_WR); return tcp_tx_handle_partial_send_and_unlock(total_tx, EPIPE, is_dummy, - is_send_zerocopy, errno_tmp); + is_non_file_zerocopy, errno_tmp); } if (unlikely(err != ERR_MEM)) { // we should not get here... @@ -1021,7 +1037,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) BULLSEYE_EXCLUDE_BLOCK_END } return tcp_tx_handle_partial_send_and_unlock(total_tx, EAGAIN, is_dummy, - is_send_zerocopy, errno_tmp); + is_non_file_zerocopy, errno_tmp); } tx_ptr = (void *)((char *)tx_ptr + tx_size); pos += tx_size; @@ -1029,27 +1045,29 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) } } - return tcp_tx_handle_done_and_unlock(total_tx, errno_tmp, is_dummy, is_send_zerocopy); + return tcp_tx_handle_done_and_unlock(total_tx, errno_tmp, is_dummy, is_non_file_zerocopy); } +/** + * Handles transmission operations on a TCP socket similar to tcp_tx. + * This is a fallback function when the operation is either blocking, not zero-copy, or the socket + * wasn't configured for zero-copy operations. + * + * @param tx_arg The TCP transmission arguments and parameters. + * @return Returns the number of bytes transmitted, or -1 on error with the errno set. + */ ssize_t sockinfo_tcp::tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg) { iovec *p_iov = tx_arg.attr.iov; size_t sz_iov = tx_arg.attr.sz_iov; int flags = tx_arg.attr.flags; int errno_tmp = errno; - int ret = 0; int poll_count = 0; uint16_t apiflags = 0; - err_t err; bool is_send_zerocopy = false; void *tx_ptr = NULL; struct xlio_pd_key *pd_key_array = NULL; - if (m_sysvar_rx_poll_on_tx_tcp) { - rx_wait_helper(poll_count, false); - } - if (tx_arg.opcode == TX_FILE) { /* * TX_FILE is a special operation which reads a single file. 
@@ -1081,12 +1099,13 @@ ssize_t sockinfo_tcp::tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg) lock_tcp_con(); - if (cannot_do_requested_dummy_send(m_pcb, tx_arg) || TCP_WND_UNAVALABLE(m_pcb, total_iov_len)) { + if (cannot_do_requested_dummy_send(m_pcb, tx_arg)) { return tcp_tx_handle_errno_and_unlock(EAGAIN); } int total_tx = 0; off64_t file_offset = 0; + bool block_this_run = BLOCK_THIS_RUN(m_b_blocking, flags); for (size_t i = 0; i < sz_iov; i++) { si_tcp_logfunc("iov:%d base=%p len=%d", i, p_iov[i].iov_base, p_iov[i].iov_len); if (unlikely(!p_iov[i].iov_base)) { @@ -1122,9 +1141,13 @@ ssize_t sockinfo_tcp::tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg) // force out TCP data before going on wait() tcp_output(&m_pcb); - /* Set return values for nonblocking socket and finish processing */ + // non blocking socket should return in order not to tx_wait() + if (!block_this_run) { + return tcp_tx_handle_sndbuf_unavailable(total_tx, is_dummy, is_send_zerocopy, + errno_tmp); + } - tx_size = tx_wait(ret, true); + tx_size = tx_wait(block_this_run); } tx_size = std::min(p_iov[i].iov_len - pos, tx_size); @@ -1150,11 +1173,9 @@ ssize_t sockinfo_tcp::tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg) is_send_zerocopy, errno_tmp); } - if (apiflags & XLIO_TX_PACKET_ZEROCOPY) { - err = tcp_write_express(&m_pcb, tx_ptr, tx_size, &tx_arg.priv); - } else { - err = tcp_write(&m_pcb, tx_ptr, tx_size, apiflags, &tx_arg.priv); - } + err_t err = (apiflags & XLIO_TX_PACKET_ZEROCOPY) + ? tcp_write_express(&m_pcb, tx_ptr, tx_size, &tx_arg.priv) + : tcp_write(&m_pcb, tx_ptr, tx_size, apiflags, &tx_arg.priv); if (unlikely(err != ERR_OK)) { if (unlikely(err == ERR_CONN)) { // happens when remote drops during big write si_tcp_logdbg("connection closed: tx'ed = %d", total_tx); @@ -1168,6 +1189,15 @@ ssize_t sockinfo_tcp::tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg) si_tcp_logpanic("tcp_write return: %d", err); BULLSEYE_EXCLUDE_BLOCK_END } + /* Set return values for nonblocking socket and finish processing */ + if (!block_this_run) { + if (total_tx > 0) { + return tcp_tx_handle_done_and_unlock(total_tx, errno_tmp, is_dummy, + is_send_zerocopy); + } else { + return tcp_tx_handle_errno_and_unlock(EAGAIN); + } + } rx_wait(poll_count, true); @@ -1278,12 +1308,9 @@ err_t sockinfo_tcp::ip_output(struct pbuf *p, struct tcp_seg *seg, void *v_p_con return ERR_OK; } - ssize_t ret = 0; - if (likely((p_dst->is_valid()))) { - ret = p_dst->fast_send((struct iovec *)lwip_iovec, count, attr); - } else { - ret = p_dst->slow_send((struct iovec *)lwip_iovec, count, attr, p_si_tcp->m_so_ratelimit); - } + ssize_t ret = likely((p_dst->is_valid())) + ? 
p_dst->fast_send((struct iovec *)lwip_iovec, count, attr) + : p_dst->slow_send((struct iovec *)lwip_iovec, count, attr, p_si_tcp->m_so_ratelimit); rc = p_si_tcp->m_ops->handle_send_ret(ret, seg); @@ -6030,43 +6057,12 @@ inline bool sockinfo_tcp::handle_bind_no_port(int &bind_ret, in_port_t in_port, int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint32_t mkey, xlio_express_flags flags, void *opaque_op) { - if (unlikely(!is_rts())) { - if (m_conn_state == TCP_CONN_TIMEOUT) { - si_tcp_logdbg("TX timed out"); - errno = ETIMEDOUT; - } else if (m_conn_state == TCP_CONN_RESETED) { - si_tcp_logdbg("TX on reseted socket"); - errno = ECONNRESET; - } else if (m_conn_state == TCP_CONN_ERROR) { - si_tcp_logdbg("TX on connection failed socket"); - errno = ECONNREFUSED; - } else { - si_tcp_logdbg("TX on disconnected socket"); - errno = EPIPE; - } + if (unlikely(!is_connected_and_ready_to_send())) { return -1; } - err_t err; pbuf_desc mdesc; - if (unlikely(!is_rts())) { - if (m_conn_state == TCP_CONN_TIMEOUT) { - si_tcp_logdbg("TX timed out"); - errno = ETIMEDOUT; - } else if (m_conn_state == TCP_CONN_RESETED) { - si_tcp_logdbg("TX on reseted socket"); - errno = ECONNRESET; - } else if (m_conn_state == TCP_CONN_ERROR) { - si_tcp_logdbg("TX on connection failed socket"); - errno = ECONNREFUSED; - } else { - si_tcp_logdbg("TX on disconnected socket"); - errno = EPIPE; - } - return -1; - } - switch (flags & XLIO_EXPRESS_OP_TYPE_MASK) { case XLIO_EXPRESS_OP_TYPE_DESC: mdesc.attr = PBUF_DESC_EXPRESS; @@ -6084,21 +6080,26 @@ int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint lock_tcp_con(); + err_t err; for (unsigned i = 0; i < iov_len; ++i) { err = tcp_write_express(&m_pcb, iov[i].iov_base, iov[i].iov_len, &mdesc); if (err != ERR_OK) { - /* The only error in tcp_write_express is a memory error */ + /* The only error in tcp_write_express is a memory error + * In this version we don't implement any error recovery or avoidance + * mechanism and an error at this stage is irrecoverable. + * The considered alternatives are: + * - Setting the socket an error state (this is the one we chose here) + * - Rolling back any written buffers, i.e. 
recovering + * - Reserving the pbuf(s)/tcp_seg(s) before calling for tcp_write_express */ + m_conn_state = TCP_CONN_ERROR; + m_error_status = ENOMEM; return tcp_tx_handle_errno_and_unlock(ENOMEM); } bytes_written += iov[i].iov_len; } if (!(flags & XLIO_EXPRESS_MSG_MORE)) { - err = tcp_output(&m_pcb); - if (err != ERR_OK) { - /* The error very likely to be recoverable */ - si_tcp_logdbg("tcp_tx_express - tcp_output failed"); - } + tcp_output(&m_pcb); } unlock_tcp_con(); @@ -6187,7 +6188,7 @@ bool sockinfo_tcp::is_connected_and_ready_to_send() si_tcp_logdbg("TX on connection failed socket"); errno = ECONNREFUSED; } else { - si_tcp_logdbg("TX on disconnected socket"); + si_tcp_logdbg("TX on unconnected socket"); errno = EPIPE; } return false; @@ -6204,13 +6205,6 @@ bool sockinfo_tcp::is_connected_and_ready_to_send() ssize_t sockinfo_tcp::tcp_tx_handle_sndbuf_unavailable(ssize_t total_tx, bool is_dummy, bool is_send_zerocopy, int errno_to_restore) { - if (unlikely(!is_rts())) { - si_tcp_logdbg("TX on disconnected socket"); - return tcp_tx_handle_errno_and_unlock(ECONNRESET); - } - // force out TCP data before going on wait() - tcp_output(&m_pcb); - // non blocking socket should return in order not to tx_wait() if (total_tx > 0) { m_tx_consecutive_eagain_count = 0; diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index fc07d04f3..444167ce7 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -387,7 +387,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { int wait_for_conn_ready_blocking(); static err_t connect_lwip_cb(void *arg, struct tcp_pcb *tpcb, err_t err); // tx - unsigned tx_wait(int &err, bool blocking); + unsigned tx_wait(bool blocking); int os_epoll_wait_with_tcp_timers(epoll_event *ep_events, int maxevents); int handle_child_FIN(sockinfo_tcp *child_conn); From 00db3d71fa5596292eb785f1fedb5d4856fed565 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Tue, 30 Jan 2024 11:45:51 +0200 Subject: [PATCH 056/169] issue: 3668182 Revert tcp_seg::bufs to pbuf_clen() Signed-off-by: Alex Briskin --- src/core/lwip/pbuf.c | 19 +++++++++++++++++++ src/core/lwip/pbuf.h | 1 + src/core/lwip/tcp_impl.h | 1 - src/core/lwip/tcp_in.c | 11 ++++++----- src/core/lwip/tcp_out.c | 26 ++++++++------------------ 5 files changed, 34 insertions(+), 24 deletions(-) diff --git a/src/core/lwip/pbuf.c b/src/core/lwip/pbuf.c index e27929061..5af160bd6 100644 --- a/src/core/lwip/pbuf.c +++ b/src/core/lwip/pbuf.c @@ -256,6 +256,25 @@ u8_t pbuf_free(struct pbuf *p) return count; } +/** + * Count number of pbufs in a chain + * + * @param p first pbuf of chain + * @return the number of pbufs in a chain + */ + +u8_t pbuf_clen(struct pbuf *p) +{ + u8_t len; + + len = 0; + while (p != NULL) { + ++len; + p = p->next; + } + return len; +} + /** * Increment the reference count of the pbuf. 
* diff --git a/src/core/lwip/pbuf.h b/src/core/lwip/pbuf.h index 99672d043..4d4608e16 100644 --- a/src/core/lwip/pbuf.h +++ b/src/core/lwip/pbuf.h @@ -128,6 +128,7 @@ void pbuf_realloc(struct pbuf *p, u32_t size); u8_t pbuf_header(struct pbuf *p, s32_t header_size); void pbuf_ref(struct pbuf *p); u8_t pbuf_free(struct pbuf *p); +u8_t pbuf_clen(struct pbuf *p); void pbuf_cat(struct pbuf *head, struct pbuf *tail); void pbuf_split_64k(struct pbuf *p, struct pbuf **rest); // windows scale needs large pbuf diff --git a/src/core/lwip/tcp_impl.h b/src/core/lwip/tcp_impl.h index 4a29d287e..a78fa7a44 100644 --- a/src/core/lwip/tcp_impl.h +++ b/src/core/lwip/tcp_impl.h @@ -296,7 +296,6 @@ struct tcp_seg { #define TF_SEG_OPTS_ZEROCOPY (u8_t) TCP_WRITE_ZEROCOPY /* Use zerocopy send mode */ u8_t tcp_flags; /* Cached TCP flags for outgoing segments */ - u8_t bufs; /* Number of buffers int the pbuf linked list */ /* L2+L3+TCP header for zerocopy segments, it must have enough room for options This should have enough space for L2 (ETH+vLAN), L3 (IPv4/6), L4 (TCP) diff --git a/src/core/lwip/tcp_in.c b/src/core/lwip/tcp_in.c index b60662a11..89074ae3a 100644 --- a/src/core/lwip/tcp_in.c +++ b/src/core/lwip/tcp_in.c @@ -176,7 +176,6 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) in_data.inseg.seqno = in_data.seqno; in_data.inseg.flags = 0; in_data.inseg.tcp_flags = in_data.flags; - in_data.inseg.bufs = 0; in_data.recv_data = NULL; in_data.recv_flags = 0; @@ -941,7 +940,6 @@ static u32_t tcp_shrink_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t ("tcp_shrink: count: %-5d unsent %s\n", count, _dump_seg(pcb->unsent))); #endif /* TCP_TSO_DEBUG */ - seg->bufs -= count; return count; } @@ -988,7 +986,6 @@ static u32_t tcp_shrink_zc_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32 } seg->tcphdr->seqno = htonl(seg->seqno); - seg->bufs -= count; return count; } @@ -1210,12 +1207,14 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) pcb->unacked = pcb->unacked->next; LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_receive: queuelen %" U32_F " ... ", (u32_t)pcb->snd_queuelen)); + LWIP_ASSERT("pcb->snd_queuelen >= pbuf_clen(next->p)", + (pcb->snd_queuelen >= pbuf_clen(next->p))); /* Prevent ACK for FIN to generate a sent event */ if ((pcb->acked != 0) && ((next->tcp_flags & TCP_FIN) != 0)) { pcb->acked--; } - pcb->snd_queuelen -= next->bufs; + pcb->snd_queuelen -= pbuf_clen(next->p); tcp_tx_seg_free(pcb, next); LWIP_DEBUGF(TCP_QLEN_DEBUG, ("%" U32_F " (after freeing unacked)\n", (u32_t)pcb->snd_queuelen)); @@ -1261,11 +1260,13 @@ static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data *in_data) pcb->unsent = pcb->unsent->next; LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_receive: queuelen %" U32_F " ... 
", (u32_t)pcb->snd_queuelen)); + LWIP_ASSERT("pcb->snd_queuelen >= pbuf_clen(next->p)", + (pcb->snd_queuelen >= pbuf_clen(next->p))); /* Prevent ACK for FIN to generate a sent event */ if ((pcb->acked != 0) && ((next->tcp_flags & TCP_FIN) != 0)) { pcb->acked--; } - pcb->snd_queuelen -= next->bufs; + pcb->snd_queuelen -= pbuf_clen(next->p); tcp_tx_seg_free(pcb, next); LWIP_DEBUGF(TCP_QLEN_DEBUG, ("%" U16_F " (after freeing unsent)\n", (u32_t)pcb->snd_queuelen)); diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 09c6ff59c..13655b487 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -199,7 +199,6 @@ static struct tcp_seg *tcp_create_segment(struct tcp_pcb *pcb, struct pbuf *p, u seg->p = p; seg->len = p->tot_len - optlen; seg->seqno = seqno; - seg->bufs = 1; if (seg->flags & TF_SEG_OPTS_ZEROCOPY) { /* XXX Don't hardcode size/offset */ @@ -523,7 +522,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, * the end. */ if (!is_file && (pos < len) && (space > 0) && (pcb->last_unsent->len > 0) && - (pcb->last_unsent->bufs < pcb->tso.max_send_sge)) { + (pbuf_clen(seg->p) < pcb->tso.max_send_sge)) { u32_t seglen = space < len - pos ? space : len - pos; if ((concat_p = tcp_pbuf_prealloc(seglen, space, &oversize, pcb, type, @@ -540,7 +539,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, memcpy(concat_p->payload, (u8_t *)arg + pos, seglen); pos += seglen; - queuelen++; + queuelen++; /* There is only one pbuf in the list */ } } else { #if TCP_OVERSIZE @@ -598,7 +597,7 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, memcpy((char *)p->payload + optlen, (u8_t *)arg + pos, seglen); } - queuelen++; + queuelen++; /* There is only one pbuf in the list */ /* Now that there are more segments queued, we check again if the * length of the queue exceeds the configured maximum or @@ -673,7 +672,6 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, (pcb->last_unsent != NULL)); pbuf_cat(pcb->last_unsent->p, concat_p); pcb->last_unsent->len += concat_p->tot_len; - pcb->last_unsent->bufs++; } /* @@ -775,7 +773,8 @@ err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_de seg = pcb->last_unsent; u32_t space = LWIP_MAX(mss_local, pcb->tso.max_payload_sz) - seg->len; - if (space > 0 && (seg->flags & TF_SEG_OPTS_ZEROCOPY) && seg->bufs < pcb->tso.max_send_sge) { + if (space > 0 && (seg->flags & TF_SEG_OPTS_ZEROCOPY) && + pbuf_clen(seg->p) < pcb->tso.max_send_sge) { seglen = space < len ? 
space : len; if ((p = tcp_pbuf_prealloc_express(seglen, pcb, PBUF_ZEROCOPY, desc, NULL)) == NULL) { @@ -784,7 +783,6 @@ err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_de p->payload = (u8_t *)arg; pbuf_cat(seg->p, p); seg->len += p->tot_len; - seg->bufs++; pos += seglen; queuelen++; } @@ -958,7 +956,7 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) } /* update number of segments on the queues */ - pcb->snd_queuelen += seg->bufs; + pcb->snd_queuelen += pbuf_clen(seg->p); LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue_flags: %" S16_F " (after enqueued)\n", pcb->snd_queuelen)); if (pcb->snd_queuelen != 0) { @@ -1076,7 +1074,7 @@ static void tcp_tso_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) goto err; } - tot_p += cur_seg->bufs; + tot_p += pbuf_clen(cur_seg->p); if (tot_p > pcb->max_send_sge) { goto err; } @@ -1090,7 +1088,6 @@ static void tcp_tso_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) /* Update the original segment with current segment details */ seg->next = cur_seg->next; seg->len += cur_seg->len; - seg->bufs += cur_seg->bufs; /* Update the first pbuf of current segment, unless this is a zerocopy segment */ if (!(cur_seg->flags & TF_SEG_OPTS_ZEROCOPY)) { @@ -1458,14 +1455,12 @@ void tcp_split_rexmit(struct tcp_pcb *pcb, struct tcp_seg *seg) /* New segment update */ new_seg->next = cur_seg->next; new_seg->flags = cur_seg->flags; - new_seg->bufs = cur_seg->bufs - 1; /* Original segment update */ cur_seg->next = new_seg; cur_seg->len = cur_seg->p->len - tcp_hlen_delta - optlen; cur_seg->p->tot_len = cur_seg->p->len; cur_seg->p->next = NULL; - cur_seg->bufs = 1; if (pcb->last_unsent == cur_seg) { /* We have split the last unsent segment, update last_unsent */ @@ -1558,13 +1553,11 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) tcp_tx_pbuf_free(pcb, p); return; } - newseg->bufs = seg->bufs; /* Update original buffer */ seg->p->next = NULL; seg->p->len = seg->p->len - lentoqueue; seg->p->tot_len = seg->p->len; - seg->bufs = 1; /* New segment update */ newseg->next = seg->next; @@ -1624,9 +1617,6 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) return; } - newseg->bufs = seg->bufs - oversize; - seg->bufs = oversize; - /* Update new tail */ pnewtail->next = NULL; @@ -1872,7 +1862,7 @@ err_t tcp_output(struct tcp_pcb *pcb) if (LWIP_IS_DUMMY_SEGMENT(seg)) { pcb->snd_lbb -= seg->len; pcb->snd_buf += seg->len; - pcb->snd_queuelen -= seg->bufs; + pcb->snd_queuelen -= pbuf_clen(seg->p); tcp_tx_seg_free(pcb, seg); } else { /* unacked list is empty? 
*/ From 1bc79a766173c2cba763a7500b2b0ed404fd20f0 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Sun, 28 Jan 2024 11:26:32 +0200 Subject: [PATCH 057/169] issue: 3724170 Add missing ifdef __cplusplus Signed-off-by: Alex Briskin --- src/core/xlio.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/xlio.h b/src/core/xlio.h index be092a0a5..95e0aa8fc 100644 --- a/src/core/xlio.h +++ b/src/core/xlio.h @@ -36,6 +36,7 @@ #include #include +#ifdef __cplusplus extern "C" { int xlio_socket(int __domain, int __type, int __protocol); @@ -139,4 +140,5 @@ int xlio_init(void); /* After finishing workling with XLIO interface call xlio_exit */ int xlio_exit(void); } +#endif /* __cplusplus */ #endif /* XLIO_EXTRA_H */ From 181f582aa413b28cc96c77e300e57d9fd8dd8a89 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Mon, 29 Jan 2024 20:06:40 +0200 Subject: [PATCH 058/169] issue: 3724170 Remove references to os_api Signed-off-by: Alex Briskin --- src/core/sock/sock-redirect.cpp | 22 ++++++++++++---------- src/core/sock/sock-redirect.h | 12 ++++++------ 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index 86118892e..7a79f6440 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -86,7 +86,6 @@ using namespace std; #define srdr_logfunc_exit __log_exit_func #define EP_MAX_EVENTS (int)((INT_MAX / sizeof(struct epoll_event))) -struct os_api orig_os_api; struct sigaction g_act_prev; sighandler_t g_sighandler = NULL; class ring_simple; @@ -99,6 +98,8 @@ template void assign_dlsym(T &ptr, const char *name) #define FD_MAP_SIZE (g_p_fd_collection ? g_p_fd_collection->get_fd_map_size() : 1024) +#ifndef XLIO_STATIC_BUILD +struct os_api orig_os_api; #define GET_ORIG_FUNC(__name) \ if (!orig_os_api.__name) { \ dlerror(); \ @@ -118,15 +119,6 @@ template void assign_dlsym(T &ptr, const char *name) } \ } -#define VERIFY_PASSTROUGH_CHANGED(__ret, __func_and_params__) \ - do { \ - bool passthrough = p_socket_object->isPassthrough(); \ - __ret = __func_and_params__; \ - if (!passthrough && p_socket_object->isPassthrough()) { \ - handle_close(__fd, false, true); \ - } \ - } while (0); - void get_orig_funcs() { // Save pointer to original functions @@ -190,6 +182,16 @@ void get_orig_funcs() GET_ORIG_FUNC(waitpid); #endif // DEFINED_NGINX } +#endif /* XLIO_STATIC_BUILD */ + +#define VERIFY_PASSTROUGH_CHANGED(__ret, __func_and_params__) \ + do { \ + bool passthrough = p_socket_object->isPassthrough(); \ + __ret = __func_and_params__; \ + if (!passthrough && p_socket_object->isPassthrough()) { \ + handle_close(__fd, false, true); \ + } \ + } while (0); const char *socket_get_domain_str(int domain) { diff --git a/src/core/sock/sock-redirect.h b/src/core/sock/sock-redirect.h index a429d7e33..27785d671 100644 --- a/src/core/sock/sock-redirect.h +++ b/src/core/sock/sock-redirect.h @@ -116,7 +116,7 @@ struct mmsghdr; * variables to hold the function-pointers to original functions *----------------------------------------------------------------------------- */ - +#ifndef XLIO_STATIC_BUILD struct os_api { int (*creat)(const char *__pathname, mode_t __mode); int (*open)(__const char *__file, int __oflag, ...); @@ -207,8 +207,12 @@ struct os_api { #if defined(DEFINED_NGINX) int (*setuid)(uid_t uid); pid_t (*waitpid)(pid_t pid, int *wstatus, int options); -#endif // DEFINED_NGINX }; +#endif /* XLIO_STATIC_BUILD */ +extern os_api orig_os_api; + +extern void get_orig_funcs(); +#endif // DEFINED_NGINX /** 
*----------------------------------------------------------------------------- @@ -228,10 +232,6 @@ struct os_api { } \ } while (0) -extern os_api orig_os_api; - -extern void get_orig_funcs(); - extern iomux_stats_t *g_p_select_stats; extern iomux_stats_t *g_p_poll_stats; extern iomux_stats_t *g_p_epoll_stats; From 72f513d44b9dde3a498e58ad7e82a239d809e704 Mon Sep 17 00:00:00 2001 From: Ben Walker Date: Fri, 26 Jan 2024 18:44:37 +0000 Subject: [PATCH 059/169] issue: 3724170 Make xlio.h C standard compliant Signed-off-by: Ben Walker Signed-off-by: Alex Briskin --- src/core/xlio.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/core/xlio.h b/src/core/xlio.h index 95e0aa8fc..c7adb16b3 100644 --- a/src/core/xlio.h +++ b/src/core/xlio.h @@ -38,6 +38,8 @@ #ifdef __cplusplus extern "C" { +#endif + int xlio_socket(int __domain, int __type, int __protocol); int xlio_close(int __fd); @@ -139,6 +141,8 @@ int xlio_init(void); /* After finishing workling with XLIO interface call xlio_exit */ int xlio_exit(void); + +#ifdef __cplusplus } -#endif /* __cplusplus */ +#endif #endif /* XLIO_EXTRA_H */ From 65bbb00f0b84a8a6604e2393e850158c9da83270 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Mon, 29 Jan 2024 20:37:34 +0200 Subject: [PATCH 060/169] issue: 3724170 Disable the constructor/destructor in static build Signed-off-by: Alex Briskin --- src/core/libxlio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/libxlio.c b/src/core/libxlio.c index 33467d8f9..26e48f232 100644 --- a/src/core/libxlio.c +++ b/src/core/libxlio.c @@ -33,6 +33,7 @@ extern int xlio_init(void); extern int xlio_exit(void); +#ifndef XLIO_STATIC_BUILD int __attribute__((constructor)) sock_redirect_lib_load_constructor(void) { return xlio_init(); @@ -42,3 +43,4 @@ int __attribute__((destructor)) sock_redirect_lib_load_destructor(void) { return xlio_exit(); } +#endif /* XLIO_STATIC_BUILD */ From 24e427f88ba438a517358ea5e57e4528d45ba065 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Tue, 30 Jan 2024 09:20:12 +0200 Subject: [PATCH 061/169] issue: 3724170 Make socketxtreme API regular function declarations Signed-off-by: Alex Briskin --- src/core/sock/sock-redirect.cpp | 2 + src/core/sock/sock-redirect.h | 4 +- src/core/xlio.h | 242 +++++++++++++++++++++++++++++++- 3 files changed, 245 insertions(+), 3 deletions(-) diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index 7a79f6440..12c12b6ba 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -1105,12 +1105,14 @@ EXPORT_SYMBOL int XLIO_SYMBOL(getsockopt)(int __fd, int __level, int __optname, srdr_logdbg_entry("fd=%d, level=%d, optname=%d", __fd, __level, __optname); +#ifndef XLIO_STATIC_BUILD if (__fd == -2 && __level == SOL_SOCKET && __optname == SO_XLIO_GET_API && __optlen && *__optlen >= sizeof(struct xlio_api_t *)) { *((xlio_api_t **)__optval) = extra_api(); *__optlen = sizeof(struct xlio_api_t *); return 0; } +#endif /* XLIO_STATIC_BUILD */ int ret = 0; socket_fd_api *p_socket_object = NULL; diff --git a/src/core/sock/sock-redirect.h b/src/core/sock/sock-redirect.h index 27785d671..0f5c4deb6 100644 --- a/src/core/sock/sock-redirect.h +++ b/src/core/sock/sock-redirect.h @@ -207,12 +207,12 @@ struct os_api { #if defined(DEFINED_NGINX) int (*setuid)(uid_t uid); pid_t (*waitpid)(pid_t pid, int *wstatus, int options); +#endif // DEFINED_NGINX }; -#endif /* XLIO_STATIC_BUILD */ extern os_api orig_os_api; extern void get_orig_funcs(); -#endif // DEFINED_NGINX +#endif /* XLIO_STATIC_BUILD */ 
/** *----------------------------------------------------------------------------- diff --git a/src/core/xlio.h b/src/core/xlio.h index c7adb16b3..9a50f0d02 100644 --- a/src/core/xlio.h +++ b/src/core/xlio.h @@ -36,6 +36,10 @@ #include #include +#include +#include +#include + #ifdef __cplusplus extern "C" { #endif @@ -142,7 +146,243 @@ int xlio_init(void); /* After finishing workling with XLIO interface call xlio_exit */ int xlio_exit(void); +/** + * Zero-copy revcfrom implementation. + * + * @param s Socket file descriptor. + * @param buf Buffer to fill with received data or pointers to data (see below). + * @param flags Pointer to flags (see below). + * @param from If not NULL, will be filled with source address (same as recvfrom). + * @param fromlen If not NULL, will be filled with source address size (same as recvfrom). + * + * This function attempts to receive a packet without doing data copy. + * The flags argument can contain the usual flags of recvmsg(), and also the + * MSG_XLIO_ZCOPY_FORCE flag. If the latter is set, the function will not + * fall back to data copy. Otherwise, the function falls back to data copy + * if zero-copy cannot be performed. If zero-copy is done then MSG_XLIO_ZCOPY + * flag is set upon exit. + * + * If zero copy is performed (MSG_XLIO_ZCOPY flag is returned), the buffer + * is filled with a xlio_recvfrom_zcopy_packets_t structure, holding as much fragments + * as `len' allows. The total size of all fragments is returned. + * Otherwise the MSG_XLIO_ZCOPY flag is not set and the buffer is filled + * with actual data and it's size is returned (same as recvfrom()) + * If no data was received the return value is zero. + * + * NOTE: The returned packet must be freed with free_packet() after + * the application finished using it. + */ +int xlio_recvfrom_zcopy(int s, void *buf, size_t len, int *flags, struct sockaddr *from, + socklen_t *fromlen); + +/** + * Frees a packet received by recvfrom_zcopy() or held by receive callback. + * + * @param s Socket from which the packet was received. + * @param pkts Array of packet. + * @param count Number of packets in the array. + * @return 0 on success, -1 on failure + * + * errno is set to: EINVAL - not a offloaded socket + * ENOENT - the packet was not received from `s'. + */ +int xlio_recvfrom_zcopy_free_packets(int s, struct xlio_recvfrom_zcopy_packet_t *pkts, + size_t count); + +/* + * Add a libxlio.conf rule to the top of the list. + * This rule will not apply to existing sockets which already considered the conf rules. + * (around connect/listen/send/recv ..) + * @param config_line A char buffer with the exact format as defined in libxlio.conf, and should + * end with '\0'. + * @return 0 on success, or error code on failure. + */ +int xlio_add_conf_rule(const char *config_line); + +/* + * Create sockets on pthread tid as offloaded/not-offloaded. + * This does not affect existing sockets. + * Offloaded sockets are still subject to libxlio.conf rules. + * @param offload 1 for offloaded, 0 for not-offloaded. + * @return 0 on success, or error code on failure. + */ +int xlio_thread_offload(int offload, pthread_t tid); + +/** + * Returns the amount of rings that are associated with socket. + * + * @param fd File Descriptor number of the socket. + * @return On success, return the amount of rings. + * On error, -1 is returned. + * + * errno is set to: EINVAL - not a offloaded fd + */ +int xlio_get_socket_rings_num(int fd); + +/** + * Returns FDs of the RX rings that are associated with the socket. 
+ * + * This function gets socket FD + int array + array size and populates + * the array with FD numbers of the rings that are associated + * with the socket. + * + * @param fd File Descriptor number. + * @param ring_fds Array of ring fds + * @param ring_fds_sz Size of the array + * @return On success, return the number populated array entries. + * On error, -1 is returned. + * + * errno is set to: EINVAL - not a offloaded fd + TBD + */ +int xlio_get_socket_rings_fds(int fd, int *ring_fds, int ring_fds_sz); + +/* + * Dump fd statistics using the library logger. + * @param fd to dump, 0 for all open fds. + * @param log_level dumping level corresponding vlog_levels_t enum (vlogger.h). + * @return 0 on success, or error code on failure. + * + * errno is set to: EOPNOTSUPP - Function is not supported when socketXtreme is enabled. + */ +int xlio_dump_fd_stats(int fd, int log_level); + +/** + * This function allows to communicate with library using extendable protocol + * based on struct cmshdr. + * + * Ancillary data is a sequence of cmsghdr structures with appended data. + * The sequence of cmsghdr structures should never be accessed directly. + * Instead, use only the following macros: CMSG_ALIGN, CMSG_SPACE, CMSG_DATA, + * CMSG_LEN. + * + * @param cmsg_hdr - point to control message + * @param cmsg_len - the byte count of the ancillary data, + * which contains the size of the structure header. + * + * @return -1 on failure and 0 on success + */ +int xlio_extra_ioctl(void *cmsg_hdr, size_t cmsg_len); + +/** + * Register a received packet notification callback. + * + * @param s Socket file descriptor. + * @param callback Callback function. + * @param context user contex for callback function. + * @return 0 - success, -1 - error + * + * errno is set to: EINVAL - not offloaded socket + */ +int xlio_register_recv_callback(int s, xlio_recv_callback_t callback, void *context); + +/** + * socketxtreme_poll() polls for completions + * + * @param fd File descriptor. + * @param completions Array of completions. + * @param ncompletions Maximum number of completion to return. + * @param flags Flags. + * SOCKETXTREME_POLL_TX - poll tx completions + * @return On success, return the number of ready completions. + * On error, -1 is returned, and TBD:errno is set?. + * + * This function polls the `fd` for completions and returns maximum `ncompletions` ready + * completions via `completions` array. + * The `fd` can represent a ring, socket or epoll file descriptor. + * + * Completions are indicated for incoming packets and/or for other events. + * If XLIO_SOCKETXTREME_PACKET flag is enabled in xlio_socketxtreme_completion_t.events field + * the completion points to incoming packet descriptor that can be accesses + * via xlio_socketxtreme_completion_t.packet field. + * Packet descriptor points to library specific buffers that contain data scattered + * by HW, so the data is deliver to application with zero copy. + * Notice: after application finished using the returned packets + * and their buffers it must free them using socketxtreme_free_packets(), + * socketxtreme_free_buff() functions. + * + * If XLIO_SOCKETXTREME_PACKET flag is disabled xlio_socketxtreme_completion_t.packet field is + * reserved. + * + * In addition to packet arrival event (indicated by XLIO_SOCKETXTREME_PACKET flag) + * The library also reports XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event and standard + * epoll events via xlio_socketxtreme_completion_t.events field. 
+ * XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event is reported when new connection is + * accepted by the server. + * When working with socketxtreme_poll() new connections are accepted + * automatically and accept(listen_socket) must not be called. + * XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event is reported for the new + * connected/child socket (xlio_socketxtreme_completion_t.user_data refers to child socket) + * and EPOLLIN event is not generated for the listen socket. + * For events other than packet arrival and new connection acceptance + * xlio_socketxtreme_completion_t.events bitmask composed using standard epoll API + * events types. + * Notice: the same completion can report multiple events, for example + * XLIO_SOCKETXTREME_PACKET flag can be enabled together with EPOLLOUT event, + * etc... + * + * * errno is set to: EOPNOTSUPP - socketXtreme was not enabled during configuration time. + */ +int xlio_socketxtreme_poll(int fd, struct xlio_socketxtreme_completion_t *completions, + unsigned int ncompletions, int flags); + +/** + * Frees packets received by socketxtreme_poll(). + * + * @param packets Packets to free. + * @param num Number of packets in `packets` array + * @return 0 on success, -1 on failure + * + * For each packet in `packet` array this function: + * - Updates receive queue size and the advertised TCP + * window size, if needed, for the socket that received + * the packet. + * - Frees the library specific buffer list that is associated with the packet. + * Notice: for each buffer in buffer list the library decreases buffer's + * reference count and only buffers with reference count zero are deallocated. + * Notice: + * - Application can increase buffer reference count, + * in order to hold the buffer even after socketxtreme_free_packets() + * was called for the buffer, using socketxtreme_ref_buff(). + * - Application is responsible to free buffers, that + * couldn't be deallocated during socketxtreme_free_packets() due to + * non zero reference count, using socketxtreme_free_buff() function. + * + * errno is set to: EINVAL - NULL pointer is provided. + * EOPNOTSUPP - socketXtreme was not enabled during configuration time. + */ +int xlio_socketxtreme_free_packets(struct xlio_socketxtreme_packet_desc_t *packets, int num); + +/* This function increments the reference count of the buffer. + * This function should be used in order to hold the buffer + * even after socketxtreme_free_packets() call. + * When buffer is not needed any more it should be freed via + * socketxtreme_free_buff(). + * + * @param buff Buffer to update. + * @return On success, return buffer's reference count after the change + * On errors -1 is returned + * + * errno is set to: EINVAL - NULL pointer is provided. + * EOPNOTSUPP - socketXtreme was not enabled during configuration time. + */ +int xlio_socketxtreme_ref_buff(struct xlio_buff_t *buff); + +/* This function decrements the buff reference count. + * When buff's reference count reaches zero, the buff is + * deallocated. + * + * @param buff Buffer to free. + * @return On success, return buffer's reference count after the change + * On error -1 is returned + * + * Notice: return value zero means that buffer was deallocated. + * + * errno is set to: EINVAL - NULL pointer is provided. + * EOPNOTSUPP - socketXtreme was not enabled during configuration time. 
+ */ +int xlio_socketxtreme_free_buff(struct xlio_buff_t *buff); + #ifdef __cplusplus } #endif -#endif /* XLIO_EXTRA_H */ +#endif /* XLIO_H */ From 569a5510786f090d303b795eaa9524139d523e04 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Sun, 4 Feb 2024 10:43:38 +0200 Subject: [PATCH 062/169] issue: 3724170 Fix compilation for static build Signed-off-by: Alex Briskin --- contrib/jenkins_tests/build.sh | 19 ++++++++++--------- src/core/main.cpp | 2 ++ src/core/util/agent.cpp | 7 +++++++ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/contrib/jenkins_tests/build.sh b/contrib/jenkins_tests/build.sh index 24c4e153e..0182761ae 100755 --- a/contrib/jenkins_tests/build.sh +++ b/contrib/jenkins_tests/build.sh @@ -13,25 +13,26 @@ cd ${build_dir} # Set symbolic links to default build and install ln -s "${build_dir}/0/install" "${install_dir}" -build_list="\ -debug:--enable-opt-log=no --enable-debug \ -nginx-off:--enable-nginx=no \ -envoy-on:--enable-nginx=yes \ -default: " - +declare -A build_list +build_list['debug']="--enable-opt-log=no --enable-debug" +build_list['nginx-off']="--enable-nginx=no" +build_list['envoy-on']="--enable-nginx=yes" +build_list['static-on']="--enable-static --disable-shared" +build_list['default']="" build_tap=${WORKSPACE}/${prefix}/build.tap echo "1..$(echo $build_list | tr " " "\n" | wc -l)" > $build_tap test_id=0 -for build in $build_list; do - IFS=':' read build_name build_option <<< "$build" + +for build_name in "${!build_list[@]}"; do + build_option="${build_list[$build_name]}" mkdir -p ${build_dir}/${test_id} cd ${build_dir}/${test_id} test_exec='${WORKSPACE}/configure --prefix=${build_dir}/${test_id}/install $build_option $jenkins_test_custom_configure && make $make_opt install' do_check_result "$test_exec" "$test_id" "$build_name" "$build_tap" "${build_dir}/build-${test_id}" cd ${build_dir} - test_id=$((test_id+1)) + ((test_id++)) done diff --git a/src/core/main.cpp b/src/core/main.cpp index 6b4b89311..2b89de3f4 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -1277,7 +1277,9 @@ extern "C" int xlio_init(void) { PROFILE_FUNC +#ifndef XLIO_STATIC_BUILD get_orig_funcs(); +#endif /* XLIO_STATIC_BUILD */ safe_mce_sys(); g_init_global_ctors_done = false; diff --git a/src/core/util/agent.cpp b/src/core/util/agent.cpp index 90c362938..5f043a50f 100644 --- a/src/core/util/agent.cpp +++ b/src/core/util/agent.cpp @@ -60,6 +60,12 @@ #define AGENT_DEFAULT_ALIVE (1) /* periodic time for alive check (in sec) */ /* Force system call */ +#ifdef XLIO_STATIC_BUILD +#define sys_call(_result, _func, ...) \ + do { \ + _result = ::_func(__VA_ARGS__); \ + } while (0) +#else /* XLIO_STATIC_BUILD */ #define sys_call(_result, _func, ...) 
\ do { \ if (orig_os_api._func) \ @@ -67,6 +73,7 @@ else \ _result = ::_func(__VA_ARGS__); \ } while (0) +#endif /* XLIO_STATIC_BUILD */ /* Print user notification */ #define output_fatal() \ From 4534754fdb4cdcfa8d3139703291edb4924f7692 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Sun, 4 Feb 2024 12:04:20 +0200 Subject: [PATCH 063/169] issue: 3724170 Disable the *_check functions for the static build Signed-off-by: Alex Briskin --- contrib/jenkins_tests/build.sh | 4 ++-- src/core/sock/sock-redirect.cpp | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/contrib/jenkins_tests/build.sh b/contrib/jenkins_tests/build.sh index 0182761ae..0fa64f4e2 100755 --- a/contrib/jenkins_tests/build.sh +++ b/contrib/jenkins_tests/build.sh @@ -21,7 +21,7 @@ build_list['static-on']="--enable-static --disable-shared" build_list['default']="" build_tap=${WORKSPACE}/${prefix}/build.tap -echo "1..$(echo $build_list | tr " " "\n" | wc -l)" > $build_tap +echo "1..${#build_list[@]}" > $build_tap test_id=0 @@ -32,7 +32,7 @@ for build_name in "${!build_list[@]}"; do test_exec='${WORKSPACE}/configure --prefix=${build_dir}/${test_id}/install $build_option $jenkins_test_custom_configure && make $make_opt install' do_check_result "$test_exec" "$test_id" "$build_name" "$build_tap" "${build_dir}/build-${test_id}" cd ${build_dir} - ((test_id++)) + test_id=$((test_id+1)) done diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index 12c12b6ba..be5eea431 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -1331,7 +1331,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(read)(int __fd, void *__buf, size_t __nbytes) return SYSCALL(read, __fd, __buf, __nbytes); } -#if defined HAVE___READ_CHK +#if defined HAVE___READ_CHK && !defined(XLIO_STATIC_BUILD) /* Checks that the buffer is big enough to contain the number of bytes * the user requests to read. If the buffer is too small, aborts, * else read NBYTES into BUF from FD. Return the @@ -1412,7 +1412,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recv)(int __fd, void *__buf, size_t __nbytes, return SYSCALL(recv, __fd, __buf, __nbytes, __flags); } -#if defined HAVE___RECV_CHK +#if defined HAVE___RECV_CHK && !defined(XLIO_STATIC_BUILD) /* Checks that the buffer is big enough to contain the number of bytes the user requests to read. If the buffer is too small, aborts, else read N bytes into BUF from socket FD. @@ -1591,7 +1591,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recvfrom)(int __fd, void *__buf, size_t __nbyt return ret_val; } -#if defined HAVE___RECVFROM_CHK +#if defined HAVE___RECVFROM_CHK && !defined(XLIO_STATIC_BUILD) /* Checks that the buffer is big enough to contain the number of bytes the user requests to read. If the buffer is too small, aborts, else read N bytes into BUF through socket FD. 
@@ -1920,7 +1920,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(poll)(struct pollfd *__fds, nfds_t __nfds, int __t
     return poll_helper(__fds, __nfds, __timeout);
 }
 
-#if defined HAVE___POLL_CHK
+#if defined HAVE___POLL_CHK && !defined(XLIO_STATIC_BUILD)
 EXPORT_SYMBOL int XLIO_SYMBOL(__poll_chk)(struct pollfd *__fds, nfds_t __nfds, int __timeout,
                                           size_t __fdslen)
 {
@@ -1959,7 +1959,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(ppoll)(struct pollfd *__fds, nfds_t __nfds,
     return poll_helper(__fds, __nfds, timeout, __sigmask);
 }
 
-#if defined HAVE___PPOLL_CHK
+#if defined HAVE___PPOLL_CHK && !defined(XLIO_STATIC_BUILD)
 EXPORT_SYMBOL int XLIO_SYMBOL(__ppoll_chk)(struct pollfd *__fds, nfds_t __nfds,
                                            const struct timespec *__timeout,
                                            const sigset_t *__sigmask, size_t __fdslen)

From 660afb57aeedc9816ec96516a7ec50bab14a9e98 Mon Sep 17 00:00:00 2001
From: Alex Briskin
Date: Wed, 7 Feb 2024 12:02:13 +0200
Subject: [PATCH 064/169] issue: 3771283 Fix function pointer check

The deleted define always returned true because it checked the address of
orig_os_api._func instead of checking its value.

Signed-off-by: Alex Briskin
---
 src/core/sock/sock-redirect.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/core/sock/sock-redirect.h b/src/core/sock/sock-redirect.h
index 0f5c4deb6..6da6527c5 100644
--- a/src/core/sock/sock-redirect.h
+++ b/src/core/sock/sock-redirect.h
@@ -96,12 +96,8 @@
 #define SYSCALL_ERRNO_UNSUPPORTED(_func, ...) SYSCALL(_func, __VA_ARGS__)
 #define VALID_SYSCALL(_func) (true)
 #else
-#define XLIO_SYMBOL(_func) _func
-#if defined(__GNUC__) && !defined(__clang__)
-#define VALID_SYSCALL(_func) (__builtin_addressof(orig_os_api._func) != nullptr)
-#else
+#define XLIO_SYMBOL(_func) _func
 #define VALID_SYSCALL(_func) ((orig_os_api._func) != nullptr)
-#endif
 #define SYSCALL(_func, ...) \
     ((VALID_SYSCALL(_func) ? (void)0 : get_orig_funcs()), orig_os_api._func(__VA_ARGS__))
 #define SYSCALL_ERRNO_UNSUPPORTED(_func, ...)
\ From 0fe1ee1eddc7e4332ca7c2bc549bc2e28348fc36 Mon Sep 17 00:00:00 2001 From: Gal Noam Date: Mon, 12 Feb 2024 12:35:06 +0200 Subject: [PATCH 065/169] version: 3.30.0 Signed-off-by: Gal Noam --- CHANGES | 15 +++++++++++++++ configure.ac | 4 ++-- contrib/scripts/libxlio.spec.in | 4 ++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index 25a1862b4..900750049 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,18 @@ +Version 3.30.0-1: +Date + Time 2024-02-12 +============================================================= +Added: + - RM #3724170 Static compilation with LTO and PGO support + - RM #3668182 productize LwIP express data path + - RM #3664594 Backport TCP_KEEPALIVE from VMA + - RM #3514044 XLIO DPCP Only - Remove legacy code and legacy flows + +Fixed: + - RM #3771283 Fix function pointer check + - RM #3690535 Remove leftover after Multi Packet RQ removal + - RM #3678579 Fix last_unacked and last_unsent + - RM #3704820 XLIO error when enabling UDP listen socket + Version 3.21.2-1: Date + Time 2024-01-11 ============================================================= diff --git a/configure.ac b/configure.ac index c18e85920..e68bce533 100644 --- a/configure.ac +++ b/configure.ac @@ -13,8 +13,8 @@ dnl===-----------------------------------------------------------------------=== # Update version number here: # define([prj_ver_major], 3) -define([prj_ver_minor], 21) -define([prj_ver_revision], 2) +define([prj_ver_minor], 30) +define([prj_ver_revision], 0) define([prj_ver_release], esyscmd([echo ${PRJ_RELEASE:=0}])) diff --git a/contrib/scripts/libxlio.spec.in b/contrib/scripts/libxlio.spec.in index fc69e6071..af84647ab 100644 --- a/contrib/scripts/libxlio.spec.in +++ b/contrib/scripts/libxlio.spec.in @@ -188,7 +188,7 @@ fi %{_mandir}/man8/xlio_stats.* %changelog -* Thu Jan 11 2024 NVIDIA CORPORATION 3.21.2-1 -- Bump version to 3.21.2 +* Mon Feb 12 2024 NVIDIA CORPORATION 3.30.0-1 +- Bump version to 3.30.0 - Please refer to CHANGES for full changelog. From ae058d1581bdbc49d75c6acba666cc62d22d2f4d Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 19 Feb 2024 20:26:23 +0200 Subject: [PATCH 066/169] issue: 3786434 Remove C23 feature from public xlio_extra.h Enum with underlying type is C++ or C23 feature. Move the enum definition to an internal header, because currently, it's unused in public API. 
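For illustration only (a minimal sketch; the tag and value below are made up and
not taken from xlio_extra.h): a fixed underlying type on an enum is accepted by
any C++ compiler and by C23, but a plain C compiler such as gcc with -std=c17
rejects the ':' in the declaration, which is why the enum cannot stay in a public
header consumed from C code.

    #include <stdint.h>

    /* Compiles as C++ or C23; a C17 (or older) compiler fails to parse it. */
    enum example_flags : uint32_t {
        EXAMPLE_FLAG_A = 0x1u,
    };
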
Signed-off-by: Dmytro Podgornyi --- src/core/sock/sockinfo_tcp.h | 8 ++++++++ src/core/xlio_extra.h | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 444167ce7..9f2d5ae2d 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -89,6 +89,14 @@ enum tcp_conn_state_e { TCP_CONN_RESETED }; +enum xlio_express_flags : uint32_t { + XLIO_EXPRESS_OP_TYPE_DESC, + XLIO_EXPRESS_OP_TYPE_FILE_ZEROCOPY, + XLIO_EXPRESS_OP_TYPE_MASK = 0x000fu, + XLIO_EXPRESS_MSG_MORE, + XLIO_EXPRESS_MSG_MASK = 0x00f0u, +}; + struct socket_option_t { const int level; const int optname; diff --git a/src/core/xlio_extra.h b/src/core/xlio_extra.h index f51a3b70f..e1b64f0ce 100644 --- a/src/core/xlio_extra.h +++ b/src/core/xlio_extra.h @@ -640,12 +640,4 @@ static inline struct xlio_api_t *xlio_get_api() return api_ptr; } -enum xlio_express_flags : uint32_t { - XLIO_EXPRESS_OP_TYPE_DESC, - XLIO_EXPRESS_OP_TYPE_FILE_ZEROCOPY, - XLIO_EXPRESS_OP_TYPE_MASK = 0x000fu, - XLIO_EXPRESS_MSG_MORE, - XLIO_EXPRESS_MSG_MASK = 0x00f0u, -}; - #endif /* XLIO_EXTRA_H */ From 2cdbc84cf339440a7a0517fa8a1a284b9fe00423 Mon Sep 17 00:00:00 2001 From: Gal Noam Date: Thu, 22 Feb 2024 10:46:48 +0200 Subject: [PATCH 067/169] version: 3.30.1 Signed-off-by: Gal Noam --- CHANGES | 6 ++++++ configure.ac | 2 +- contrib/scripts/libxlio.spec.in | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/CHANGES b/CHANGES index 900750049..174e569f7 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,9 @@ +Version 3.30.1-1: +Date + Time 2024-02-22 +============================================================= +Fixed: + - RM #3786434 C++ or C23 feature in xlio_extra.h breaks compilation of some C programs + Version 3.30.0-1: Date + Time 2024-02-12 ============================================================= diff --git a/configure.ac b/configure.ac index e68bce533..04268f9e6 100644 --- a/configure.ac +++ b/configure.ac @@ -14,7 +14,7 @@ dnl===-----------------------------------------------------------------------=== # define([prj_ver_major], 3) define([prj_ver_minor], 30) -define([prj_ver_revision], 0) +define([prj_ver_revision], 1) define([prj_ver_release], esyscmd([echo ${PRJ_RELEASE:=0}])) diff --git a/contrib/scripts/libxlio.spec.in b/contrib/scripts/libxlio.spec.in index af84647ab..ee50afee3 100644 --- a/contrib/scripts/libxlio.spec.in +++ b/contrib/scripts/libxlio.spec.in @@ -188,7 +188,7 @@ fi %{_mandir}/man8/xlio_stats.* %changelog -* Mon Feb 12 2024 NVIDIA CORPORATION 3.30.0-1 -- Bump version to 3.30.0 +* Thu Feb 22 2024 NVIDIA CORPORATION 3.30.1-1 +- Bump version to 3.30.1 - Please refer to CHANGES for full changelog. From 9ef53fc46c4b92da04590e2509bee99607989df2 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Thu, 22 Feb 2024 16:08:23 +0200 Subject: [PATCH 068/169] issue: 3792731 Fix -Walloc-size-larger-than warning The false positive warning is generated during linking phase. Cast array size to 32bit type to suppress the warning. The warning appears in --enable-lto configuration likely due to inlined new[] operator and generated to the underlying malloc() call. 
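As a sketch of the pattern only (hypothetical names, and the exact cast target
type is an assumption rather than a copy of the change below): narrowing the
signed element count before it reaches operator new[] gives the optimizer a
bounded size expression.

    #include <new>

    struct adapter_stub {
        int id;
    };

    adapter_stub *alloc_list(int adapters_num)
    {
        /* Casting the count bounds the allocation size seen by LTO, which is
         * what suppresses the false-positive -Walloc-size-larger-than report. */
        return new (std::nothrow) adapter_stub[static_cast<unsigned>(adapters_num)];
    }
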
Signed-off-by: Dmytro Podgornyi --- src/core/dev/ib_ctx_handler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/dev/ib_ctx_handler.cpp b/src/core/dev/ib_ctx_handler.cpp index 089ce3b56..28509d1f9 100644 --- a/src/core/dev/ib_ctx_handler.cpp +++ b/src/core/dev/ib_ctx_handler.cpp @@ -257,7 +257,7 @@ dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() goto err; } - dpcp_lst = new (std::nothrow) dpcp::adapter_info[adapters_num]; + dpcp_lst = new (std::nothrow) dpcp::adapter_info[static_cast(adapters_num)]; if (!dpcp_lst) { ibch_logerr("failed allocating memory for devices"); goto err; From 077d5b262fb4c95032e84ba3ea8cff0a89ec5657 Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Tue, 13 Feb 2024 13:31:07 +0200 Subject: [PATCH 069/169] issue: 3514044 Fix null pointer dereference Accessing completion event channel for dummy RQ, that was created automatically by ibv when we create QP. Signed-off-by: Iftah Levi --- src/core/dev/cq_mgr_rx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/dev/cq_mgr_rx.h b/src/core/dev/cq_mgr_rx.h index f4526c68e..85d9cdf2d 100644 --- a/src/core/dev/cq_mgr_rx.h +++ b/src/core/dev/cq_mgr_rx.h @@ -86,7 +86,7 @@ class cq_mgr_rx { void configure(int cq_size); ibv_cq *get_ibv_cq_hndl() { return m_p_ibv_cq; } - int get_channel_fd() { return m_comp_event_channel->fd; } + int get_channel_fd() { return m_comp_event_channel ? m_comp_event_channel->fd : 0; } /** * Arm the managed CQ's notification channel From e82e642fc8ec55822d2759600de3a77475d4747a Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Fri, 2 Feb 2024 06:48:14 +0200 Subject: [PATCH 070/169] issue: 3795922 Remove pbuf_split_64k() XLIO receives any incoming TCP payload in pbuf chains less than 64k. There is no point to do the splits on each receive. Sockinfo_tcp has no 64k limitation and this avoid extra operations. To clean the code and achieve possible minor performance improvement, remove pbuf_split_64k() and accept entire pbuf chain as is. Signed-off-by: Dmytro Podgornyi --- src/core/lwip/pbuf.c | 39 --------------------------------------- src/core/lwip/pbuf.h | 2 -- src/core/lwip/tcp.c | 20 +++++--------------- src/core/lwip/tcp_in.c | 26 +++----------------------- 4 files changed, 8 insertions(+), 79 deletions(-) diff --git a/src/core/lwip/pbuf.c b/src/core/lwip/pbuf.c index 5af160bd6..cfa96b6dc 100644 --- a/src/core/lwip/pbuf.c +++ b/src/core/lwip/pbuf.c @@ -322,42 +322,3 @@ void pbuf_cat(struct pbuf *h, struct pbuf *t) * so netto there is no change to the reference count of t. */ } - -// windows scale needs large pbuf -/** - * This method modifies a 'pbuf chain', so that its total length is - * smaller than 64K. The remainder of the original pbuf chain is stored - * in *rest. - * This function never creates new pbufs, but splits an existing chain - * in two parts. The tot_len of the modified packet queue will likely be - * smaller than 64K. - * 'packet queues' are not supported by this function. - */ -void pbuf_split_64k(struct pbuf *p, struct pbuf **rest) -{ - if (p == NULL || p->tot_len < 0xffff) { - // pbuf is smaller than 64K - *rest = NULL; - } else { - u32_t tot_len_front = 0; - struct pbuf *i = NULL; - - *rest = p; - while (*rest != NULL && tot_len_front + (*rest)->len <= 0xffff) { - tot_len_front += (*rest)->len; - i = *rest; - *rest = (*rest)->next; - } - /* i now points to last packet of the first segment. 
Set next - * pointer to NULL */ - i->next = NULL; - - /* Update the tot_len field in the first part */ - for (i = p; i && i->next != *rest && *rest; i = i->next) { - i->tot_len -= (*rest)->tot_len; - } - - /* tot_len field in rest does not need modifications */ - /* reference counters do not need modifications */ - } -} diff --git a/src/core/lwip/pbuf.h b/src/core/lwip/pbuf.h index 4d4608e16..800551ed4 100644 --- a/src/core/lwip/pbuf.h +++ b/src/core/lwip/pbuf.h @@ -131,8 +131,6 @@ u8_t pbuf_free(struct pbuf *p); u8_t pbuf_clen(struct pbuf *p); void pbuf_cat(struct pbuf *head, struct pbuf *tail); -void pbuf_split_64k(struct pbuf *p, struct pbuf **rest); // windows scale needs large pbuf - #ifdef __cplusplus } #endif diff --git a/src/core/lwip/tcp.c b/src/core/lwip/tcp.c index d90d86d18..017d1fc27 100644 --- a/src/core/lwip/tcp.c +++ b/src/core/lwip/tcp.c @@ -798,25 +798,15 @@ void tcp_fasttmr(struct tcp_pcb *pcb) { if (pcb != NULL && PCB_IN_ACTIVE_STATE(pcb)) { /* If there is data which was previously "refused" by upper layer */ - while (pcb->refused_data != - NULL) { // 'while' instead of 'if' because windows scale uses large pbuf - struct pbuf *rest; - /* Notify again application with data previously received. */ + if (pcb->refused_data) { err_t err; - pbuf_split_64k(pcb->refused_data, &rest); LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_fasttmr: notify kept packet\n")); TCP_EVENT_RECV(pcb, pcb->refused_data, ERR_OK, err); if (err == ERR_OK) { - pcb->refused_data = rest; - } else { - if (rest) { - pbuf_cat(pcb->refused_data, rest); /* undo splitting */ - } - if (err == ERR_ABRT) { - /* if err == ERR_ABRT, 'pcb' is already deallocated */ - pcb = NULL; - } - break; + pcb->refused_data = NULL; + } else if (err == ERR_ABRT) { + /* if err == ERR_ABRT, 'pcb' is already deallocated */ + pcb = NULL; } } diff --git a/src/core/lwip/tcp_in.c b/src/core/lwip/tcp_in.c index 89074ae3a..199cd996b 100644 --- a/src/core/lwip/tcp_in.c +++ b/src/core/lwip/tcp_in.c @@ -181,20 +181,12 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) in_data.recv_flags = 0; /* If there is data which was previously "refused" by upper layer */ - /* 'while' instead of 'if' because windows scale uses large pbuf */ - while (pcb->refused_data != NULL) { - struct pbuf *rest; - pbuf_split_64k(pcb->refused_data, &rest); - - /* Notify again application with data previously received. */ + if (pcb->refused_data) { LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: notify kept packet\n")); TCP_EVENT_RECV(pcb, pcb->refused_data, ERR_OK, err); if (err == ERR_OK) { - pcb->refused_data = rest; + pcb->refused_data = NULL; } else { - if (rest) { - pbuf_cat(pcb->refused_data, rest); /* undo splitting */ - } /* if err == ERR_ABRT, 'pcb' is already deallocated */ /* drop incoming packets, because pcb is "full" */ LWIP_DEBUGF(TCP_INPUT_DEBUG, @@ -230,9 +222,7 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) } } - while (in_data.recv_data != - NULL) { // 'while' instead of 'if' because windows scale uses large pbuf - struct pbuf *rest = NULL; + if (in_data.recv_data) { if (pcb->flags & TF_RXCLOSED) { /* received data although already closed -> abort (send RST) to notify the remote host that not all data has been @@ -241,30 +231,20 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) tcp_abort(pcb); goto aborted; } - pbuf_split_64k(in_data.recv_data, &rest); if (in_data.flags & TCP_PSH) { in_data.recv_data->flags |= PBUF_FLAG_PUSH; } /* Notify application that data has been received. 
*/ TCP_EVENT_RECV(pcb, in_data.recv_data, ERR_OK, err); if (err == ERR_ABRT) { - if (rest) { - pbuf_cat(in_data.recv_data, rest); /* undo splitting */ - } goto aborted; } /* If the upper layer can't receive this data, store it */ if (err != ERR_OK) { - if (rest) { - pbuf_cat(in_data.recv_data, rest); /* undo splitting */ - } pcb->refused_data = in_data.recv_data; LWIP_DEBUGF( TCP_INPUT_DEBUG, ("tcp_input: keep incoming packet, because pcb is \"full\"\n")); - break; - } else { - in_data.recv_data = rest; } } From 83fc3f5129672e92484813b96d5f72850ef3e13b Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Fri, 2 Feb 2024 07:28:10 +0200 Subject: [PATCH 071/169] issue: 3795922 Remove refused_data in lwip refused_data was used to keep TCP payload which couldn't be received by sockinfo_tcp object and retry receive operation later. Only a single receive operation could be postponed in such a way and all the subsequent failures led to packet drops. sockinfo_tcp::rx_lwip_cb() doesn't produce an error by itself. All the possible errors are propagated by L3_tcp_input(). And all the errors lead to connection reset. So, there is no point to keep refused_data in current implementation. Remove refused_data to clean the code and save extra cache line read in fast TCP timer. Signed-off-by: Dmytro Podgornyi --- src/core/lwip/tcp.c | 35 +++-------------------------------- src/core/lwip/tcp.h | 1 - src/core/lwip/tcp_in.c | 28 +++------------------------- 3 files changed, 6 insertions(+), 58 deletions(-) diff --git a/src/core/lwip/tcp.c b/src/core/lwip/tcp.c index 017d1fc27..2ed3ba28f 100644 --- a/src/core/lwip/tcp.c +++ b/src/core/lwip/tcp.c @@ -151,7 +151,7 @@ static err_t tcp_close_shutdown(struct tcp_pcb *pcb, u8_t rst_on_unacked_data) if (rst_on_unacked_data && ((get_tcp_state(pcb) == ESTABLISHED) || (get_tcp_state(pcb) == CLOSE_WAIT))) { - if ((pcb->refused_data != NULL) || (pcb->rcv_wnd != pcb->rcv_wnd_max)) { + if (pcb->rcv_wnd != pcb->rcv_wnd_max) { /* Not all data received by application, send RST to tell the remote side about this. */ LWIP_ASSERT("pcb->flags & TF_RXCLOSED", pcb->flags & TF_RXCLOSED); @@ -285,11 +285,6 @@ err_t tcp_shutdown(struct tcp_pcb *pcb, int shut_rx, int shut_tx) /* shutting down the tx AND rx side is the same as closing for the raw API */ return tcp_close_shutdown(pcb, 1); } - /* ... and free buffered data */ - if (pcb->refused_data != NULL) { - pbuf_free(pcb->refused_data); - pcb->refused_data = NULL; - } } if (shut_tx) { /* This can't happen twice since if it succeeds, the pcb's state is changed. @@ -789,29 +784,14 @@ void tcp_slowtmr(struct tcp_pcb *pcb) } /** - * Is called every slow_tmr_interval and process data previously - * "refused" by upper layer (application) and sends delayed ACKs. - * + * Is called every slow_tmr_interval/2 and process data previously * and sends delayed ACKs. * Automatically called from tcp_tmr(). 
*/ void tcp_fasttmr(struct tcp_pcb *pcb) { if (pcb != NULL && PCB_IN_ACTIVE_STATE(pcb)) { - /* If there is data which was previously "refused" by upper layer */ - if (pcb->refused_data) { - err_t err; - LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_fasttmr: notify kept packet\n")); - TCP_EVENT_RECV(pcb, pcb->refused_data, ERR_OK, err); - if (err == ERR_OK) { - pcb->refused_data = NULL; - } else if (err == ERR_ABRT) { - /* if err == ERR_ABRT, 'pcb' is already deallocated */ - pcb = NULL; - } - } - /* send delayed ACKs */ - if (pcb && (pcb->flags & TF_ACK_DELAY)) { + if (pcb->flags & TF_ACK_DELAY) { LWIP_DEBUGF(TCP_DEBUG, ("tcp_fasttmr: delayed ACK\n")); tcp_ack_now(pcb); tcp_output(pcb); @@ -1047,10 +1027,6 @@ void tcp_pcb_recycle(struct tcp_pcb *pcb) tcp_tx_pbuf_free(pcb, pcb->pbuf_alloc); pcb->pbuf_alloc = NULL; } - if (pcb->refused_data) { - pbuf_free(pcb->refused_data); - pcb->refused_data = NULL; - } } struct pbuf *tcp_tx_pbuf_alloc(struct tcp_pcb *pcb, u32_t length, pbuf_type type, pbuf_desc *desc, @@ -1235,11 +1211,6 @@ void tcp_pcb_purge(struct tcp_pcb *pcb) LWIP_DEBUGF(TCP_DEBUG, ("tcp_pcb_purge\n")); - if (pcb->refused_data != NULL) { - LWIP_DEBUGF(TCP_DEBUG, ("tcp_pcb_purge: data left on ->refused_data\n")); - pbuf_free(pcb->refused_data); - pcb->refused_data = NULL; - } if (pcb->unsent != NULL) { LWIP_DEBUGF(TCP_DEBUG, ("tcp_pcb_purge: not all data sent\n")); } diff --git a/src/core/lwip/tcp.h b/src/core/lwip/tcp.h index d829cfd05..92cabc79c 100644 --- a/src/core/lwip/tcp.h +++ b/src/core/lwip/tcp.h @@ -359,7 +359,6 @@ struct tcp_pcb { struct tcp_seg *ooseq; /* Received out of sequence segments. */ #endif /* TCP_QUEUE_OOSEQ */ - struct pbuf *refused_data; /* Data previously received but not yet taken by upper layer */ struct tcp_seg *seg_alloc; /* Available tcp_seg element for use */ struct pbuf *pbuf_alloc; /* Available pbuf element for use */ diff --git a/src/core/lwip/tcp_in.c b/src/core/lwip/tcp_in.c index 199cd996b..75c234f3a 100644 --- a/src/core/lwip/tcp_in.c +++ b/src/core/lwip/tcp_in.c @@ -180,21 +180,6 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) in_data.recv_data = NULL; in_data.recv_flags = 0; - /* If there is data which was previously "refused" by upper layer */ - if (pcb->refused_data) { - LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: notify kept packet\n")); - TCP_EVENT_RECV(pcb, pcb->refused_data, ERR_OK, err); - if (err == ERR_OK) { - pcb->refused_data = NULL; - } else { - /* if err == ERR_ABRT, 'pcb' is already deallocated */ - /* drop incoming packets, because pcb is "full" */ - LWIP_DEBUGF(TCP_INPUT_DEBUG, - ("tcp_input: drop incoming packets, because pcb is \"full\"\n")); - pbuf_free(p); - return; - } - } pcb->is_in_input = 1; err = tcp_process(pcb, &in_data); /* A return value of ERR_ABRT means that tcp_abort() was called @@ -241,10 +226,8 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) } /* If the upper layer can't receive this data, store it */ if (err != ERR_OK) { - pcb->refused_data = in_data.recv_data; - LWIP_DEBUGF( - TCP_INPUT_DEBUG, - ("tcp_input: keep incoming packet, because pcb is \"full\"\n")); + pcb->rcv_wnd += in_data.recv_data->tot_len; + pbuf_free(in_data.recv_data); } } @@ -265,11 +248,6 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) pcb->is_in_input = 0; /* Try to send something out. 
*/ tcp_output(pcb); -#if TCP_INPUT_DEBUG -#if TCP_DEBUG - tcp_debug_print_state(get_tcp_state(pcb)); -#endif /* TCP_DEBUG */ -#endif /* TCP_INPUT_DEBUG */ } } /* Jump target if pcb has been aborted in a callback (by calling tcp_abort()). @@ -278,7 +256,7 @@ void L3_level_tcp_input(struct pbuf *p, struct tcp_pcb *pcb) pcb->is_in_input = 0; in_data.recv_data = NULL; - /* give up our reference to inseg.p */ + /* tcp_receive() sets in_data.inseg.p to NULL in case of recv_data */ if (in_data.inseg.p != NULL) { pbuf_free(in_data.inseg.p); in_data.inseg.p = NULL; From 74c38c2de3fb300fd90f5cb862c505cc13163dc7 Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Sun, 18 Feb 2024 17:54:07 +0200 Subject: [PATCH 072/169] issue: 3781322 Fix for 100% CPU load Split poll_sn to poll_sn_rx and poll_sn_tx. Required after splitting cq to cq_rx and cq_tx. Signed-off-by: Iftah Levi --- src/core/dev/net_device_table_mgr.cpp | 11 ++++++----- src/core/dev/net_device_table_mgr.h | 5 +++-- src/core/dev/net_device_val.cpp | 21 +++++++++++---------- src/core/dev/net_device_val.h | 5 +++-- src/core/event/event_handler_manager.cpp | 14 +++++++++----- src/core/iomux/epfd_info.cpp | 20 ++++++++++---------- src/core/iomux/epfd_info.h | 5 +++-- src/core/iomux/epoll_wait_call.cpp | 6 +++--- src/core/iomux/io_mux_call.cpp | 14 ++++++++------ src/core/iomux/io_mux_call.h | 3 ++- 10 files changed, 58 insertions(+), 46 deletions(-) diff --git a/src/core/dev/net_device_table_mgr.cpp b/src/core/dev/net_device_table_mgr.cpp index 68743adb6..97d3aa8f9 100644 --- a/src/core/dev/net_device_table_mgr.cpp +++ b/src/core/dev/net_device_table_mgr.cpp @@ -420,7 +420,8 @@ void net_device_table_mgr::get_ip_list(local_ip_list_t &ip_list, sa_family_t fam m_lock.unlock(); } -int net_device_table_mgr::global_ring_poll_and_process_element(uint64_t *p_poll_sn, +int net_device_table_mgr::global_ring_poll_and_process_element(uint64_t *p_poll_sn_rx, + uint64_t *p_poll_sn_tx, void *pv_fd_ready_array /*= NULL*/) { ndtm_logfunc(""); @@ -429,8 +430,8 @@ int net_device_table_mgr::global_ring_poll_and_process_element(uint64_t *p_poll_ net_device_map_index_t::iterator net_dev_iter; for (net_dev_iter = m_net_device_map_index.begin(); net_dev_iter != m_net_device_map_index.end(); net_dev_iter++) { - int ret = net_dev_iter->second->global_ring_poll_and_process_element(p_poll_sn, - pv_fd_ready_array); + int ret = net_dev_iter->second->global_ring_poll_and_process_element( + p_poll_sn_rx, p_poll_sn_tx, pv_fd_ready_array); if (ret < 0) { ndtm_logdbg("Error in net_device_val[%p]->poll_and_process_element() (errno=%d %m)", net_dev_iter->second, errno); @@ -446,14 +447,14 @@ int net_device_table_mgr::global_ring_poll_and_process_element(uint64_t *p_poll_ return ret_total; } -int net_device_table_mgr::global_ring_request_notification(uint64_t poll_sn) +int net_device_table_mgr::global_ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx) { ndtm_logfunc(""); int ret_total = 0; net_device_map_index_t::iterator net_dev_iter; for (net_dev_iter = m_net_device_map_index.begin(); m_net_device_map_index.end() != net_dev_iter; net_dev_iter++) { - int ret = net_dev_iter->second->global_ring_request_notification(poll_sn); + int ret = net_dev_iter->second->global_ring_request_notification(poll_sn_rx, poll_sn_tx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { ndtm_logerr("Error in net_device_val[%p]->request_notification() (errno=%d %m)", diff --git a/src/core/dev/net_device_table_mgr.h b/src/core/dev/net_device_table_mgr.h index b0a0d1425..53d6fe9ad 100644 --- 
a/src/core/dev/net_device_table_mgr.h +++ b/src/core/dev/net_device_table_mgr.h @@ -74,7 +74,8 @@ class net_device_table_mgr : public cache_table_mgr, publ * channel. If race condition case occures then that CQ is polled and processed (and the CQ * notification is armed) Returns >=0 the total number of wce processed < 0 on error */ - int global_ring_poll_and_process_element(uint64_t *p_poll_sn, void *pv_fd_ready_array = NULL); + int global_ring_poll_and_process_element(uint64_t *p_poll_sn_rx, uint64_t *p_poll_sn_tx, + void *pv_fd_ready_array = NULL); /** * This will poll one time on the ALL the managed CQ's @@ -85,7 +86,7 @@ class net_device_table_mgr : public cache_table_mgr, publ int global_ring_wait_for_notification_and_process_element(uint64_t *p_poll_sn, void *pv_fd_ready_array = NULL); - int global_ring_request_notification(uint64_t poll_sn); + int global_ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx); /** * This will poll one time on the ALL the managed CQ's diff --git a/src/core/dev/net_device_val.cpp b/src/core/dev/net_device_val.cpp index ebf2bfbf7..d41286bc8 100644 --- a/src/core/dev/net_device_val.cpp +++ b/src/core/dev/net_device_val.cpp @@ -1173,7 +1173,8 @@ void net_device_val::ring_key_redirection_release(resource_allocation_key *key) } } -int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, +int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn_rx, + uint64_t *p_poll_sn_tx, void *pv_fd_ready_array /*=NULL*/) { nd_logfuncall(""); @@ -1181,7 +1182,7 @@ int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, std::lock_guard lock(m_lock); rings_hash_map_t::iterator ring_iter; for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { - int ret = THE_RING->poll_and_process_element_rx(p_poll_sn, pv_fd_ready_array); + int ret = THE_RING->poll_and_process_element_rx(p_poll_sn_rx, pv_fd_ready_array); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0 && errno != EAGAIN) { nd_logerr("Error in RX ring->poll_and_process_element() of %p (errno=%d %s)", THE_RING, @@ -1190,11 +1191,11 @@ int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, } BULLSEYE_EXCLUDE_BLOCK_END if (ret > 0) { - nd_logfunc("ring[%p] RX Returned with: %d (sn=%d)", THE_RING, ret, *p_poll_sn); + nd_logfunc("ring[%p] RX Returned with: %d (sn=%d)", THE_RING, ret, *p_poll_sn_rx); ret_total += ret; } #if defined(DEFINED_FORCE_TX_POLLING) - ret = THE_RING->poll_and_process_element_tx(p_poll_sn); + ret = THE_RING->poll_and_process_element_tx(p_poll_sn_tx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0 && errno != EAGAIN) { nd_logerr("Error in TX ring->poll_and_process_element() of %p (errno=%d %m)", THE_RING, @@ -1203,7 +1204,7 @@ int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, } BULLSEYE_EXCLUDE_BLOCK_END if (ret > 0) { - nd_logfunc("ring[%p] TX Returned with: %d (sn=%d)", THE_RING, ret, *p_poll_sn); + nd_logfunc("ring[%p] TX Returned with: %d (sn=%d)", THE_RING, ret, *p_poll_sn_tx); ret_total += ret; } #endif /* DEFINED_FORCE_TX_POLLING */ @@ -1211,13 +1212,13 @@ int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, return ret_total; } -int net_device_val::global_ring_request_notification(uint64_t poll_sn) +int net_device_val::global_ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx) { int ret_total = 0; std::lock_guard lock(m_lock); rings_hash_map_t::iterator ring_iter; for (ring_iter = m_h_ring_map.begin(); ring_iter != 
m_h_ring_map.end(); ring_iter++) { - int ret = THE_RING->request_notification(CQT_RX, poll_sn); + int ret = THE_RING->request_notification(CQT_RX, poll_sn_rx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { nd_logerr("Error RX ring[%p]->request_notification() (errno=%d %s)", THE_RING, errno, @@ -1225,17 +1226,17 @@ int net_device_val::global_ring_request_notification(uint64_t poll_sn) return ret; } BULLSEYE_EXCLUDE_BLOCK_END - nd_logfunc("ring[%p] RX Returned with: %d (sn=%d)", THE_RING, ret, poll_sn); + nd_logfunc("ring[%p] RX Returned with: %d (sn=%d)", THE_RING, ret, poll_sn_rx); ret_total += ret; #if defined(DEFINED_FORCE_TX_POLLING) - ret = THE_RING->request_notification(CQT_TX, poll_sn); + ret = THE_RING->request_notification(CQT_TX, poll_sn_tx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { nd_logerr("Error TX ring[%p]->request_notification() (errno=%d %m)", THE_RING, errno); return ret; } BULLSEYE_EXCLUDE_BLOCK_END - nd_logfunc("ring[%p] TX Returned with: %d (sn=%d)", THE_RING, ret, poll_sn); + nd_logfunc("ring[%p] TX Returned with: %d (sn=%d)", THE_RING, ret, poll_sn_tx); ret_total += ret; #endif /* DEFINED_FORCE_TX_POLLING */ } diff --git a/src/core/dev/net_device_val.h b/src/core/dev/net_device_val.h index 5e3b94f29..658f7ba0e 100644 --- a/src/core/dev/net_device_val.h +++ b/src/core/dev/net_device_val.h @@ -240,8 +240,9 @@ class net_device_val { transport_type_t get_transport_type() const { return m_transport_type; } bool update_active_backup_slaves(); - int global_ring_poll_and_process_element(uint64_t *p_poll_sn, void *pv_fd_ready_array = NULL); - int global_ring_request_notification(uint64_t poll_sn); + int global_ring_poll_and_process_element(uint64_t *p_poll_sn_rx, uint64_t *p_poll_sn_tx, + void *pv_fd_ready_array = NULL); + int global_ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx); int ring_drain_and_proccess(); void ring_adapt_cq_moderation(); L2_address *get_l2_address() { return m_p_L2_addr; }; diff --git a/src/core/event/event_handler_manager.cpp b/src/core/event/event_handler_manager.cpp index 8ea210d16..3149b08e6 100644 --- a/src/core/event/event_handler_manager.cpp +++ b/src/core/event/event_handler_manager.cpp @@ -972,13 +972,17 @@ void *event_handler_manager::thread_loop() } } - uint64_t poll_sn = 0; + uint64_t poll_sn_rx = 0; + uint64_t poll_sn_tx = 0; if (m_b_sysvar_internal_thread_arm_cq_enabled && m_cq_epfd > 0 && g_p_net_device_table_mgr) { - g_p_net_device_table_mgr->global_ring_poll_and_process_element(&poll_sn, NULL); - int ret = g_p_net_device_table_mgr->global_ring_request_notification(poll_sn); + g_p_net_device_table_mgr->global_ring_poll_and_process_element(&poll_sn_rx, &poll_sn_tx, + NULL); + int ret = + g_p_net_device_table_mgr->global_ring_request_notification(poll_sn_rx, poll_sn_tx); if (ret > 0) { - g_p_net_device_table_mgr->global_ring_poll_and_process_element(&poll_sn, NULL); + g_p_net_device_table_mgr->global_ring_poll_and_process_element(&poll_sn_rx, + &poll_sn_tx, NULL); } } @@ -1002,7 +1006,7 @@ void *event_handler_manager::thread_loop() if (m_b_sysvar_internal_thread_arm_cq_enabled && p_events[idx].data.fd == m_cq_epfd && g_p_net_device_table_mgr) { g_p_net_device_table_mgr->global_ring_wait_for_notification_and_process_element( - &poll_sn, NULL); + &poll_sn_rx, NULL); } else if (is_wakeup_fd(p_events[idx].data.fd)) { // a request for registration was sent m_reg_action_q_lock.lock(); diff --git a/src/core/iomux/epfd_info.cpp b/src/core/iomux/epfd_info.cpp index 13dc50078..e44688dc2 100644 --- a/src/core/iomux/epfd_info.cpp 
+++ b/src/core/iomux/epfd_info.cpp @@ -611,7 +611,7 @@ epoll_stats_t *epfd_info::stats() return m_stats; } -int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn, +int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn_rx, uint64_t *p_poll_sn_tx, void *pv_fd_ready_array /* = NULL*/) { __log_func(""); @@ -625,7 +625,7 @@ int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn, m_ring_map_lock.lock(); for (ring_map_t::iterator iter = m_ring_map.begin(); iter != m_ring_map.end(); iter++) { - int ret = iter->first->poll_and_process_element_rx(p_poll_sn, pv_fd_ready_array); + int ret = iter->first->poll_and_process_element_rx(p_poll_sn_rx, pv_fd_ready_array); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0 && errno != EAGAIN) { __log_err("Error in RX ring->poll_and_process_element() of %p (errno=%d %m)", @@ -635,11 +635,11 @@ int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn, } BULLSEYE_EXCLUDE_BLOCK_END if (ret > 0) { - __log_func("ring[%p] RX Returned with: %d (sn=%d)", iter->first, ret, *p_poll_sn); + __log_func("ring[%p] RX Returned with: %d (sn=%d)", iter->first, ret, *p_poll_sn_rx); ret_total += ret; } #if defined(DEFINED_FORCE_TX_POLLING) - ret = iter->first->poll_and_process_element_tx(p_poll_sn); + ret = iter->first->poll_and_process_element_tx(p_poll_sn_tx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0 && errno != EAGAIN) { __log_err("Error in TX ring->poll_and_process_element() of %p (errno=%d %m)", @@ -649,7 +649,7 @@ int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn, } BULLSEYE_EXCLUDE_BLOCK_END if (ret > 0) { - __log_func("ring[%p] TX Returned with: %d (sn=%d)", iter->first, ret, *p_poll_sn); + __log_func("ring[%p] TX Returned with: %d (sn=%d)", iter->first, ret, *p_poll_sn_tx); ret_total += ret; } #endif /* DEFINED_FORCE_TX_POLLING */ @@ -669,7 +669,7 @@ int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn, return ret_total; } -int epfd_info::ring_request_notification(uint64_t poll_sn) +int epfd_info::ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx) { __log_func(""); int ret_total = 0; @@ -681,7 +681,7 @@ int epfd_info::ring_request_notification(uint64_t poll_sn) m_ring_map_lock.lock(); for (ring_map_t::iterator iter = m_ring_map.begin(); iter != m_ring_map.end(); iter++) { - int ret = iter->first->request_notification(CQT_RX, poll_sn); + int ret = iter->first->request_notification(CQT_RX, poll_sn_rx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_err("Error RX ring[%p]->request_notification() (errno=%d %m)", iter->first, @@ -690,10 +690,10 @@ int epfd_info::ring_request_notification(uint64_t poll_sn) return ret; } BULLSEYE_EXCLUDE_BLOCK_END - __log_func("ring[%p] RX Returned with: %d (sn=%d)", iter->first, ret, poll_sn); + __log_func("ring[%p] RX Returned with: %d (sn=%d)", iter->first, ret, poll_sn_rx); ret_total += ret; #if defined(DEFINED_FORCE_TX_POLLING) - ret = iter->first->request_notification(CQT_TX, poll_sn); + ret = iter->first->request_notification(CQT_TX, poll_sn_tx); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_err("Error TX ring[%p]->request_notification() (errno=%d %m)", iter->first, @@ -702,7 +702,7 @@ int epfd_info::ring_request_notification(uint64_t poll_sn) return ret; } BULLSEYE_EXCLUDE_BLOCK_END - __log_func("ring[%p] TX Returned with: %d (sn=%d)", iter->first, ret, poll_sn); + __log_func("ring[%p] TX Returned with: %d (sn=%d)", iter->first, ret, poll_sn_tx); ret_total += ret; #endif /* DEFINED_FORCE_TX_POLLING */ } diff --git a/src/core/iomux/epfd_info.h 
b/src/core/iomux/epfd_info.h index a58d092ea..be004d193 100644 --- a/src/core/iomux/epfd_info.h +++ b/src/core/iomux/epfd_info.h @@ -87,9 +87,10 @@ class epfd_info : public lock_mutex_recursive, public cleanable_obj, public wake */ epoll_stats_t *stats(); - int ring_poll_and_process_element(uint64_t *p_poll_sn, void *pv_fd_ready_array = NULL); + int ring_poll_and_process_element(uint64_t *p_poll_sn_rx, uint64_t *p_poll_sn_tx, + void *pv_fd_ready_array = NULL); - int ring_request_notification(uint64_t poll_sn); + int ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx); int ring_wait_for_notification_and_process_element(uint64_t *p_poll_sn, void *pv_fd_ready_array = NULL); diff --git a/src/core/iomux/epoll_wait_call.cpp b/src/core/iomux/epoll_wait_call.cpp index b172d79d5..dcac5aaa1 100644 --- a/src/core/iomux/epoll_wait_call.cpp +++ b/src/core/iomux/epoll_wait_call.cpp @@ -404,16 +404,16 @@ bool epoll_wait_call::handle_os_countdown(int &poll_os_countdown) int epoll_wait_call::ring_poll_and_process_element() { - return m_epfd_info->ring_poll_and_process_element(&m_poll_sn, NULL); + return m_epfd_info->ring_poll_and_process_element(&m_poll_sn_rx, &m_poll_sn_tx, NULL); } int epoll_wait_call::ring_request_notification() { - return m_epfd_info->ring_request_notification(m_poll_sn); + return m_epfd_info->ring_request_notification(m_poll_sn_rx, m_poll_sn_tx); } int epoll_wait_call::ring_wait_for_notification_and_process_element(void *pv_fd_ready_array) { - return m_epfd_info->ring_wait_for_notification_and_process_element(&m_poll_sn, + return m_epfd_info->ring_wait_for_notification_and_process_element(&m_poll_sn_rx, pv_fd_ready_array); } diff --git a/src/core/iomux/io_mux_call.cpp b/src/core/iomux/io_mux_call.cpp index e8e6ff4c4..2b63b6254 100644 --- a/src/core/iomux/io_mux_call.cpp +++ b/src/core/iomux/io_mux_call.cpp @@ -177,7 +177,8 @@ io_mux_call::io_mux_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer , m_p_offloaded_modes(off_modes_buffer) , m_num_all_offloaded_fds(0) , m_cqepfd(-1) - , m_poll_sn(0) + , m_poll_sn_rx(0) + , m_poll_sn_tx(0) , m_p_stats(NULL) , m_n_all_ready_fds(0) , m_n_ready_rfds(0) @@ -229,7 +230,7 @@ void io_mux_call::check_offloaded_rsockets() fd_ready_array.fd_count = 0; // Poll the socket object - if (p_socket_object->is_readable(&m_poll_sn, &fd_ready_array)) { + if (p_socket_object->is_readable(&m_poll_sn_rx, &fd_ready_array)) { set_offloaded_rfd_ready(offloaded_index); // We have offloaded traffic. 
Don't sample the OS immediately p_socket_object->unset_immediate_os_sample(); @@ -423,7 +424,7 @@ void io_mux_call::blocking_loops() woke_up_non_valid = false; ret = ring_request_notification(); - __log_func("arming cq with poll_sn=%lx ret=%d", m_poll_sn, ret); + __log_func("arming cq with poll_sn=%lx ret=%d", m_poll_sn_rx, ret); if (ret < 0) { xlio_throw_object(io_mux_call::io_error); } else if (ret > 0) { @@ -549,18 +550,19 @@ bool io_mux_call::immidiate_return(int &poll_os_countdown) int io_mux_call::ring_poll_and_process_element() { // TODO: (select, poll) this access all CQs, it is better to check only relevant ones - return g_p_net_device_table_mgr->global_ring_poll_and_process_element(&m_poll_sn, NULL); + return g_p_net_device_table_mgr->global_ring_poll_and_process_element(&m_poll_sn_rx, + &m_poll_sn_tx, NULL); } int io_mux_call::ring_request_notification() { - return g_p_net_device_table_mgr->global_ring_request_notification(m_poll_sn); + return g_p_net_device_table_mgr->global_ring_request_notification(m_poll_sn_rx, m_poll_sn_tx); } int io_mux_call::ring_wait_for_notification_and_process_element(void *pv_fd_ready_array) { return g_p_net_device_table_mgr->global_ring_wait_for_notification_and_process_element( - &m_poll_sn, pv_fd_ready_array); + &m_poll_sn_rx, pv_fd_ready_array); } bool io_mux_call::is_sig_pending() diff --git a/src/core/iomux/io_mux_call.h b/src/core/iomux/io_mux_call.h index b26daccad..b9b76067f 100644 --- a/src/core/iomux/io_mux_call.h +++ b/src/core/iomux/io_mux_call.h @@ -264,7 +264,8 @@ class io_mux_call { int m_cqepfd; /// poll sn - uint64_t m_poll_sn; + uint64_t m_poll_sn_rx; + uint64_t m_poll_sn_tx; /// xlio statistics. each implementation must initialize this. iomux_func_stats_t *m_p_stats; From cbdbfec6f5685e7f5e58f14c81057bd218092a4d Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Wed, 6 Mar 2024 23:41:47 +0200 Subject: [PATCH 073/169] issue: 3813802 Terminate process instead of 'throw' on panic Panic level of logs terminate the process and don't return. Call std::terminate() to terminate the process unconditionally. Also, this fixes cppcheck warning that an exception is thrown without an object. Signed-off-by: Dmytro Podgornyi --- src/core/proto/ip_frag.cpp | 16 +++++----------- src/vlogger/vlogger.h | 4 ++-- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/core/proto/ip_frag.cpp b/src/core/proto/ip_frag.cpp index 7159bc6c2..ca68b1557 100644 --- a/src/core/proto/ip_frag.cpp +++ b/src/core/proto/ip_frag.cpp @@ -38,23 +38,17 @@ #include "core/event/event_handler_manager.h" #include "mem_buf_desc.h" +#undef MODULE_NAME +#define MODULE_NAME "ip_frag" + //#define IP_FRAG_DEBUG 1 #ifdef IP_FRAG_DEBUG -#define frag_dbg(fmt, args...) \ - vlog_printf(VLOG_WARNING, "%s:%d : " fmt "\n", __FUNCTION__, __LINE__, ##args) +#define frag_dbg __log_info_dbg #else #define frag_dbg(fmt, args...) #endif - -#define frag_err(fmt, args...) \ - vlog_printf(VLOG_ERROR, "%s:%d : " fmt "\n", __FUNCTION__, __LINE__, ##args) - -#define frag_panic(fmt, args...) \ - do { \ - vlog_printf(VLOG_PANIC, "%s:%d : " fmt "\n", __FUNCTION__, __LINE__, ##args); \ - throw; \ - } while (0) +#define frag_panic __log_info_panic #ifdef IP_FRAG_DEBUG static int debug_drop_every_n_pkt = 0; // 0 - Disabled, 1/N is the number of packet dropped diff --git a/src/vlogger/vlogger.h b/src/vlogger/vlogger.h index 0c23f9a88..e1c6a23fd 100644 --- a/src/vlogger/vlogger.h +++ b/src/vlogger/vlogger.h @@ -107,7 +107,7 @@ #define __log_panic(log_fmt, log_args...) 
\ do { \ VLOG_PRINTF(VLOG_PANIC, log_fmt, ##log_args); \ - throw; \ + std::terminate(); \ } while (0) #define __log_err(log_fmt, log_args...) \ do { \ @@ -165,7 +165,7 @@ #define __log_info_panic(log_fmt, log_args...) \ do { \ VLOG_PRINTF_INFO(VLOG_PANIC, log_fmt, ##log_args); \ - throw; \ + std::terminate(); \ } while (0) #define __log_info_err(log_fmt, log_args...) \ do { \ From 8b076f92c6d6a338bb5761709f6f1218b5633b6d Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Wed, 6 Mar 2024 23:47:00 +0200 Subject: [PATCH 074/169] issue: 3813802 Don't wrap xlio_raw_post_recv() with IF_VERBS_FAILURE IF_VERBS_FAILURE if supposed to be used with ibverbs API as a workaround for inconsistent API. However, using it for an internal method make the code dirty. Replace 'throw' with 'throw_xlio_exception' to suppress a cppcheck warning. Signed-off-by: Dmytro Podgornyi --- src/core/dev/hw_queue_rx.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/core/dev/hw_queue_rx.cpp b/src/core/dev/hw_queue_rx.cpp index 5fa3b2d98..0275070f3 100644 --- a/src/core/dev/hw_queue_rx.cpp +++ b/src/core/dev/hw_queue_rx.cpp @@ -329,8 +329,7 @@ void hw_queue_rx::post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc) m_curr_rx_wr = 0; struct ibv_recv_wr *bad_wr = nullptr; - IF_VERBS_FAILURE(xlio_raw_post_recv(&bad_wr)) - { + if (xlio_raw_post_recv(&bad_wr) < 0) { uint32_t n_pos_bad_rx_wr = ((uint8_t *)bad_wr - (uint8_t *)m_ibv_rx_wr_array) / sizeof(struct ibv_recv_wr); hwqrx_logerr("failed posting list (errno=%d %s)", errno, strerror(errno)); @@ -345,10 +344,9 @@ void hw_queue_rx::post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc) if (n_pos_bad_rx_wr != (m_n_sysvar_rx_num_wr_to_post_recv - 1)) { m_ibv_rx_wr_array[n_pos_bad_rx_wr].next = &m_ibv_rx_wr_array[n_pos_bad_rx_wr + 1]; } - throw; + throw_xlio_exception("Failed to post a WQE to RQ"); } - ENDIF_VERBS_FAILURE; - hwqrx_logfunc("Successful ibv_post_recv"); + hwqrx_logfunc("Successful buffer post to RQ"); } else { m_curr_rx_wr++; } @@ -366,14 +364,14 @@ int hw_queue_rx::xlio_raw_post_recv(struct ibv_recv_wr **bad_wr) for (; wr; ++nreq, wr = wr->next) { if (unlikely((int)m_rq_data.head - (int)m_rq_data.tail + nreq >= (int)m_rx_num_wr)) { errno = ENOMEM; - err = -errno; + err = -1; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > (int)m_rx_sge)) { errno = EINVAL; - err = -errno; + err = -1; *bad_wr = wr; goto out; } From b2c8589f57a8f22b4c31362c30b3750f7ecc4961 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Wed, 6 Mar 2024 23:50:28 +0200 Subject: [PATCH 075/169] issue: 3813802 Avoid partial initialization of an event_data_t object cppcheck generates a warning about uninitialized fields even if they're not used in fact. 
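For context on the fix below: with a trivially-constructible struct, a plain declaration leaves every scalar member indeterminate, while copy-list-initialization from empty braces ("= {}") value-initializes, i.e. zero-fills, all of them. A minimal standalone sketch of the difference — the struct and its members are illustrative, not the real event_data_t layout:

    #include <cstdint>

    // Illustrative POD in the spirit of an event descriptor.
    struct event_data {
        int type;
        int fd;
        uint64_t user_data;
    };

    int main()
    {
        event_data a;      // default-initialized: scalar members hold indeterminate values
        event_data b = {}; // value-initialized: every member is zeroed
        (void)a;           // reading a.type at this point would be undefined behavior
        return b.type;     // well-defined: returns 0
    }

This is why the hunks below switch to "event_data_t v = {};" (and "map_value = {}") before filling in only the fields relevant to the given event type.
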
Signed-off-by: Dmytro Podgornyi --- src/core/event/event_handler_manager.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/core/event/event_handler_manager.cpp b/src/core/event/event_handler_manager.cpp index 3149b08e6..5719c714d 100644 --- a/src/core/event/event_handler_manager.cpp +++ b/src/core/event/event_handler_manager.cpp @@ -538,7 +538,7 @@ void event_handler_manager::priv_register_ibverbs_events(ibverbs_reg_info_t &inf event_handler_map_t::iterator i; i = m_event_handler_map.find(info.fd); if (i == m_event_handler_map.end()) { - event_data_t v; + event_data_t v = {}; v.type = EV_IBVERBS; v.ibverbs_ev.fd = info.fd; @@ -626,7 +626,7 @@ void event_handler_manager::priv_register_rdma_cm_events(rdma_cm_reg_info_t &inf event_handler_map_t::iterator iter_fd = m_event_handler_map.find(info.fd); if (iter_fd == m_event_handler_map.end()) { evh_logdbg("Adding new channel (fd %d, id %p, handler %p)", info.fd, info.id, info.handler); - event_data_t map_value; + event_data_t map_value = {}; map_value.type = EV_RDMA_CM; map_value.rdma_cm_ev.n_ref_count = 1; @@ -703,7 +703,7 @@ void event_handler_manager::priv_register_command_events(command_reg_info_t &inf event_handler_map_t::iterator iter_fd = m_event_handler_map.find(info.fd); if (iter_fd == m_event_handler_map.end()) { evh_logdbg("Adding new channel (fd %d)", info.fd); - event_data_t map_value; + event_data_t map_value = {}; map_value.type = EV_COMMAND; map_value.command_ev.cmd = info.cmd; From fb1a8ea25f91ae9a2a5bd311f160eb115f5d746e Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Wed, 6 Mar 2024 23:52:39 +0200 Subject: [PATCH 076/169] issue: 3813802 Remove dst_entry::m_p_send_wqe This field is used as a local variable in 3 places. Allocate a variable on stack in the respective places instead of keeping it per socket. This fixes cppcheck warning about lifecycles of local and non-local variables. 
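The lifecycle warning mentioned above concerns a non-local object holding the address of a stack variable: the stored pointer dangles the moment the function returns. Keeping the pointer itself as a local removes the hazard. A minimal standalone sketch of the pattern — the names are illustrative and unrelated to dst_entry:

    struct widget {
        int *m_scratch = nullptr; // member (non-local) storage

        void risky()
        {
            int local = 42;
            m_scratch = &local; // address of a local escapes into the object...
        }                       // ...and dangles as soon as risky() returns

        void safe()
        {
            int local = 42;
            int *scratch = &local; // pointer lives no longer than what it points to
            *scratch += 1;
        }
    };

In the hunks below this corresponds to declaring xlio_ibv_send_wr *p_send_wqe inside fast_send() instead of parking the pointer in the m_p_send_wqe member.
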
Signed-off-by: Dmytro Podgornyi --- src/core/proto/dst_entry.cpp | 1 - src/core/proto/dst_entry.h | 1 - src/core/proto/dst_entry_tcp.cpp | 17 +++++++++-------- src/core/proto/dst_entry_udp.cpp | 15 ++++++++------- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/core/proto/dst_entry.cpp b/src/core/proto/dst_entry.cpp index 8fe855171..1fed48573 100644 --- a/src/core/proto/dst_entry.cpp +++ b/src/core/proto/dst_entry.cpp @@ -154,7 +154,6 @@ void dst_entry::init_members() m_sge = NULL; m_b_is_offloaded = true; m_b_is_initialized = false; - m_p_send_wqe = NULL; m_max_inline = 0; m_max_ip_payload_size = 0; m_max_udp_payload_size = 0; diff --git a/src/core/proto/dst_entry.h b/src/core/proto/dst_entry.h index b559023d1..46231afbc 100644 --- a/src/core/proto/dst_entry.h +++ b/src/core/proto/dst_entry.h @@ -167,7 +167,6 @@ class dst_entry : public cache_observer, public tostr { uint8_t m_pcp; bool m_b_is_initialized; - xlio_ibv_send_wr *m_p_send_wqe; uint32_t m_max_inline; ring_user_id_t m_id; uint16_t m_max_ip_payload_size; diff --git a/src/core/proto/dst_entry_tcp.cpp b/src/core/proto/dst_entry_tcp.cpp index 8f01fea41..ad4cd966b 100644 --- a/src/core/proto/dst_entry_tcp.cpp +++ b/src/core/proto/dst_entry_tcp.cpp @@ -72,6 +72,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ void *p_ip_hdr; void *p_tcp_hdr; tcp_iovec *p_tcp_iov = NULL; + xlio_ibv_send_wr *p_send_wqe; size_t hdr_alignment_diff = 0; bool is_zerocopy = is_set(attr.flags, XLIO_TX_PACKET_ZEROCOPY); @@ -151,7 +152,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ tcp_hdr_len = (static_cast(p_tcp_hdr))->doff * 4; if (!is_zerocopy && (total_packet_len < m_max_inline) && (1 == sz_iov)) { - m_p_send_wqe = &m_inline_send_wqe; + p_send_wqe = &m_inline_send_wqe; p_tcp_iov[0].iovec.iov_base = (uint8_t *)p_pkt + hdr_alignment_diff; p_tcp_iov[0].iovec.iov_len = total_packet_len; } else if (is_set(attr.flags, (xlio_wr_tx_packet_attr)(XLIO_TX_PACKET_TSO))) { @@ -165,13 +166,13 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ send_wqe_h.enable_tso(send_wqe, (void *)((uint8_t *)p_pkt + hdr_alignment_diff), m_header->m_total_hdr_len + tcp_hdr_len, 0); } - m_p_send_wqe = &send_wqe; + p_send_wqe = &send_wqe; if (!is_zerocopy) { p_tcp_iov[0].iovec.iov_base = (uint8_t *)p_tcp_hdr + tcp_hdr_len; p_tcp_iov[0].iovec.iov_len -= tcp_hdr_len; } } else { - m_p_send_wqe = &m_not_inline_send_wqe; + p_send_wqe = &m_not_inline_send_wqe; p_tcp_iov[0].iovec.iov_base = (uint8_t *)p_pkt + hdr_alignment_diff; p_tcp_iov[0].iovec.iov_len = total_packet_len; } @@ -211,7 +212,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ p_tcp_iov[0].p_desc->tx.p_tcp_h = static_cast(p_tcp_hdr); /* set wr_id as a pointer to memory descriptor */ - m_p_send_wqe->wr_id = (uintptr_t)p_tcp_iov[0].p_desc; + p_send_wqe->wr_id = (uintptr_t)p_tcp_iov[0].p_desc; /* Update scatter gather element list * ref counter is incremented (above) for the first memory descriptor only because it is @@ -252,7 +253,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ } } - ret = send_lwip_buffer(m_id, m_p_send_wqe, attr.flags, attr.tis); + ret = send_lwip_buffer(m_id, p_send_wqe, attr.flags, attr.tis); } else { // We don'nt support inline in this case, since we believe that this a very rare case mem_buf_desc_t *p_mem_buf_desc; size_t total_packet_len = 0; @@ -293,10 +294,10 @@ ssize_t dst_entry_tcp::fast_send(const iovec 
*p_iov, const ssize_t sz_iov, xlio_ p_mem_buf_desc->tx.p_ip_h = p_ip_hdr; p_mem_buf_desc->tx.p_tcp_h = static_cast(p_tcp_hdr); - m_p_send_wqe = &m_not_inline_send_wqe; - m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; + p_send_wqe = &m_not_inline_send_wqe; + p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; - send_ring_buffer(m_id, m_p_send_wqe, attr.flags); + send_ring_buffer(m_id, p_send_wqe, attr.flags); } if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { diff --git a/src/core/proto/dst_entry_udp.cpp b/src/core/proto/dst_entry_udp.cpp index 817815713..d70e4f64b 100644 --- a/src/core/proto/dst_entry_udp.cpp +++ b/src/core/proto/dst_entry_udp.cpp @@ -202,6 +202,7 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const ssize_t sz_data_payload) { mem_buf_desc_t *p_mem_buf_desc; + xlio_ibv_send_wr *p_send_wqe; bool b_blocked = is_set(attr, XLIO_TX_PACKET_BLOCK); // Get a bunch of tx buf descriptor and data buffers @@ -234,7 +235,7 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const // Skip inlining in case of L4 SW checksum because headers and data are not contiguous in memory if (sz_iov == 1 && ((sz_data_payload + m_header->m_total_hdr_len) < m_max_inline) && !is_set(attr, XLIO_TX_SW_L4_CSUM)) { - m_p_send_wqe = &m_inline_send_wqe; + p_send_wqe = &m_inline_send_wqe; m_header->get_udp_hdr()->len = htons((uint16_t)sz_udp_payload); m_header->set_ip_len(m_header->m_ip_header_len + sz_udp_payload); @@ -248,7 +249,7 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const m_sge[1].addr = (uintptr_t)p_iov[0].iov_base; m_sge[1].lkey = m_p_ring->get_tx_lkey(m_id); } else { - m_p_send_wqe = &m_not_inline_send_wqe; + p_send_wqe = &m_not_inline_send_wqe; void *p_pkt = p_mem_buf_desc->p_buffer; void *p_ip_hdr; @@ -302,8 +303,8 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const BULLSEYE_EXCLUDE_BLOCK_END } - m_p_send_wqe->wr_id = reinterpret_cast(p_mem_buf_desc); - send_ring_buffer(m_id, m_p_send_wqe, attr); + p_send_wqe->wr_id = reinterpret_cast(p_mem_buf_desc); + send_ring_buffer(m_id, p_send_wqe, attr); // request tx buffers for the next packets if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { @@ -324,7 +325,7 @@ inline bool dst_entry_udp::fast_send_fragmented_ipv4(mem_buf_desc_t *p_mem_buf_d void *p_ip_hdr; void *p_udp_hdr; mem_buf_desc_t *tmp; - m_p_send_wqe = &m_fragmented_send_wqe; + xlio_ibv_send_wr *p_send_wqe = &m_fragmented_send_wqe; uint16_t packet_id = gen_packet_id_ip4(); // Int for counting offset inside the ip datagram payload @@ -397,7 +398,7 @@ inline bool dst_entry_udp::fast_send_fragmented_ipv4(mem_buf_desc_t *p_mem_buf_d (uintptr_t)(p_mem_buf_desc->p_buffer + (uint8_t)m_header->m_transport_header_tx_offset); m_sge[1].length = sz_user_data_to_copy + hdr_len; m_sge[1].lkey = m_p_ring->get_tx_lkey(m_id); - m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; + p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; dst_udp_logfunc("packet_sz=%d, payload_sz=%d, ip_offset=%d id=%d", m_sge[1].length - m_header->m_transport_header_len, sz_user_data_to_copy, @@ -408,7 +409,7 @@ inline bool dst_entry_udp::fast_send_fragmented_ipv4(mem_buf_desc_t *p_mem_buf_d // We don't check the return valuse of post send when we reach the HW we consider that we // completed our job - send_ring_buffer(m_id, m_p_send_wqe, attr); + send_ring_buffer(m_id, p_send_wqe, attr); p_mem_buf_desc = tmp; From 5a7881e5541e2a037eb8ab4a8c20f01aaf04a86d Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Thu, 7 Mar 
2024 13:13:55 +0200 Subject: [PATCH 077/169] issue: 3813802 Fix type overflow warning in time_converter_rtc cppcheck complains about 'int' type overflow in 0x3 << 30. Fix this with explicit unsigned type. Signed-off-by: Dmytro Podgornyi --- src/core/dev/time_converter_rtc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/dev/time_converter_rtc.cpp b/src/core/dev/time_converter_rtc.cpp index c0f873dc7..45edd95e2 100644 --- a/src/core/dev/time_converter_rtc.cpp +++ b/src/core/dev/time_converter_rtc.cpp @@ -58,7 +58,7 @@ void time_converter_rtc::handle_timer_expired(void *) void time_converter_rtc::convert_hw_time_to_system_time(uint64_t hwtime, struct timespec *systime) { hwtime &= 0x7FFFFFFFFFFFFFFF; - systime->tv_nsec = (uint32_t)(hwtime & ~(0x3 << 30)); + systime->tv_nsec = (uint32_t)(hwtime & ~(0x3UL << 30)); systime->tv_sec = (uint32_t)(hwtime >> 32); ibchtc_logfine("hwtime: %09ld", hwtime); From fa7aadd0e307b2584486c65a6d8baa0e85419e2f Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Thu, 7 Mar 2024 14:23:12 +0200 Subject: [PATCH 078/169] issue: 3813802 Fix IP_FRAG_DEBUG=1 build Debug code is disabled by default in ip_frag and has become broken. It assumes that mem_buf_desc_t::n_ref_count has a non atomic type. Signed-off-by: Dmytro Podgornyi --- src/core/proto/ip_frag.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core/proto/ip_frag.cpp b/src/core/proto/ip_frag.cpp index ca68b1557..dc151dabf 100644 --- a/src/core/proto/ip_frag.cpp +++ b/src/core/proto/ip_frag.cpp @@ -58,9 +58,9 @@ static int g_ip_frag_count_check = 0; #define MEMBUF_DEBUG_REF_INC(__p_desc__) \ { \ g_ip_frag_count_check++; \ - if (__p_desc__->n_ref_count != 0) \ + if (__p_desc__->inc_ref_count() != 0) { \ frag_panic("REF_INC: p=%p\n", __p_desc__); \ - __p_desc__->n_ref_count++; \ + } \ } #define MEMBUF_DEBUG_REF_DEC(__p_desc__) \ { \ @@ -73,9 +73,9 @@ static int g_ip_frag_count_check = 0; #define MEMBUF_DEBUG_REF_DEC_1(__p_desc__) \ { \ g_ip_frag_count_check--; \ - __p_desc__->n_ref_count--; \ - if (__p_desc__->n_ref_count != 0) \ + if (__p_desc__->dec_ref_count() != 1) { \ frag_panic("REF_DEC: p=%p\n", __p_desc__); \ + } \ } #define PRINT_STATISTICS() \ { \ From 4b205fbc6cabecad8f01328bfbaffaa630ddfd9e Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Fri, 8 Mar 2024 01:02:56 +0200 Subject: [PATCH 079/169] issue: 3813802 Include system headers in the right way Use <> for the system headers. This suppresses "information" level warning of cppcheck. 
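As a reminder of the distinction this change relies on: the quoted include form typically searches the directory of the including file before falling back to the system search paths, while the angle-bracket form goes straight to the system/implementation paths, so system headers belong in angle brackets. A minimal illustration (both lines compile; the contrast is only the search order):

    /* May accidentally resolve to a local file named errno.h next to the source: */
    #include "errno.h"

    /* Always resolves against the compiler's system include paths: */
    #include <errno.h>

Switching the system headers to the angle-bracket form is what silences the information-level cppcheck report mentioned above.
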
Signed-off-by: Dmytro Podgornyi --- src/core/event/netlink_event.cpp | 3 ++- src/core/lwip/cc_cubic.c | 3 ++- src/core/netlink/test_main.cpp | 7 ++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/core/event/netlink_event.cpp b/src/core/event/netlink_event.cpp index 198078194..093757e83 100644 --- a/src/core/event/netlink_event.cpp +++ b/src/core/event/netlink_event.cpp @@ -32,9 +32,10 @@ #include "netlink_event.h" #include "vlogger/vlogger.h" + #include #include -#include "stdio.h" +#include #define TOSTR_MAX_SIZE 4096 diff --git a/src/core/lwip/cc_cubic.c b/src/core/lwip/cc_cubic.c index 044fa84ac..9071cf228 100644 --- a/src/core/lwip/cc_cubic.c +++ b/src/core/lwip/cc_cubic.c @@ -81,7 +81,8 @@ */ #include "cc_cubic.h" -#include "errno.h" + +#include #include #if TCP_CC_ALGO_MOD diff --git a/src/core/netlink/test_main.cpp b/src/core/netlink/test_main.cpp index 9317a5b03..9216916bc 100644 --- a/src/core/netlink/test_main.cpp +++ b/src/core/netlink/test_main.cpp @@ -30,12 +30,13 @@ * SOFTWARE. */ +#include +#include +#include + #include "core/infra/subject_observer.h" #include "netlink_wrapper.h" #include "neigh_info.h" -#include -#include "errno.h" -#include #include "vlogger/vlogger.h" #include "core/event/netlink_event.h" From eecf0a59d4d825ebdefc3b00b1fb83243ea727f7 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Fri, 8 Mar 2024 01:04:51 +0200 Subject: [PATCH 080/169] issue: 3813802 Remove unneeded cppcheck suppressions Signed-off-by: Dmytro Podgornyi --- src/core/event/event_handler_manager.cpp | 7 ------- src/core/infra/DemoSubject.h | 1 - src/core/sock/sockinfo_tcp.cpp | 1 - src/stats/stats_printer.cpp | 6 ------ src/stats/stats_reader.cpp | 1 - 5 files changed, 16 deletions(-) diff --git a/src/core/event/event_handler_manager.cpp b/src/core/event/event_handler_manager.cpp index 5719c714d..b5ad964fe 100644 --- a/src/core/event/event_handler_manager.cpp +++ b/src/core/event/event_handler_manager.cpp @@ -313,7 +313,6 @@ void *event_handler_thread(void *_p_tgtObject) } else { evh_logdbg("Internal thread affinity not set."); } - /* cppcheck-suppress resourceLeak */ } void *ret = p_tgtObject->thread_loop(); @@ -544,8 +543,6 @@ void event_handler_manager::priv_register_ibverbs_events(ibverbs_reg_info_t &inf v.ibverbs_ev.fd = info.fd; v.ibverbs_ev.channel = info.channel; - /* coverity[uninit_use_in_call] */ - /* cppcheck-suppress uninitStructMember */ m_event_handler_map[info.fd] = v; i = m_event_handler_map.find(info.fd); @@ -633,8 +630,6 @@ void event_handler_manager::priv_register_rdma_cm_events(rdma_cm_reg_info_t &inf map_value.rdma_cm_ev.map_rdma_cm_id[info.id] = info.handler; map_value.rdma_cm_ev.cma_channel = info.cma_channel; - /* coverity[uninit_use_in_call] */ - /* cppcheck-suppress uninitStructMember */ m_event_handler_map[info.fd] = map_value; update_epfd(info.fd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI); @@ -708,8 +703,6 @@ void event_handler_manager::priv_register_command_events(command_reg_info_t &inf map_value.type = EV_COMMAND; map_value.command_ev.cmd = info.cmd; - /* coverity[uninit_use_in_call] */ - /* cppcheck-suppress uninitStructMember */ m_event_handler_map[info.fd] = map_value; update_epfd(info.fd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI); } diff --git a/src/core/infra/DemoSubject.h b/src/core/infra/DemoSubject.h index fb9d17c37..1320c7726 100644 --- a/src/core/infra/DemoSubject.h +++ b/src/core/infra/DemoSubject.h @@ -48,7 +48,6 @@ template class key_class : public tostr { const std::string to_str() const { char s[20]; - /* cppcheck-suppress 
wrongPrintfScanfArgNum */ snprintf(s, sizeof(s), "%d.%d.%d.%d", NIPQUAD(m_key)); return (std::string(s)); } diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 38f04fa3f..79c4147f8 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -3471,7 +3471,6 @@ err_t sockinfo_tcp::clone_conn_cb(void *arg, struct tcp_pcb **newpcb) new_sock = conn->accept_clone(); if (new_sock) { - /* cppcheck-suppress autoVariables */ *newpcb = (struct tcp_pcb *)(&new_sock->m_pcb); new_sock->m_pcb.my_container = (void *)new_sock; /* XXX We have to search for correct listen socket every time, diff --git a/src/stats/stats_printer.cpp b/src/stats/stats_printer.cpp index 3abb7c286..aca16c25d 100644 --- a/src/stats/stats_printer.cpp +++ b/src/stats/stats_printer.cpp @@ -106,7 +106,6 @@ void print_full_stats(socket_stats_t *p_si_stats, mc_grp_info_t *p_mc_grp_info, if (p_si_stats->socket_type == SOCK_DGRAM) { fprintf(filename, ", MC Loop %s", p_si_stats->b_mc_loop ? "Enabled " : "Disabled"); if (!p_si_stats->mc_tx_if.is_anyaddr()) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ fprintf(filename, ", MC IF = [%s]", p_si_stats->mc_tx_if.to_str(p_si_stats->sa_family).c_str()); } @@ -117,13 +116,11 @@ void print_full_stats(socket_stats_t *p_si_stats, mc_grp_info_t *p_mc_grp_info, // Bounded + Connected information // if (!p_si_stats->bound_if.is_anyaddr() || p_si_stats->bound_port) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ fprintf(filename, "- Local Address = [%s:%d]\n", p_si_stats->bound_if.to_str(p_si_stats->sa_family).c_str(), ntohs(p_si_stats->bound_port)); } if (!p_si_stats->connected_ip.is_anyaddr() || p_si_stats->connected_port) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ fprintf(filename, "- Foreign Address = [%s:%d]\n", p_si_stats->connected_ip.to_str(p_si_stats->sa_family).c_str(), ntohs(p_si_stats->connected_port)); @@ -131,7 +128,6 @@ void print_full_stats(socket_stats_t *p_si_stats, mc_grp_info_t *p_mc_grp_info, if (p_mc_grp_info) { for (int grp_idx = 0; grp_idx < p_mc_grp_info->max_grp_num; grp_idx++) { if (p_si_stats->mc_grp_map.test(grp_idx)) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ fprintf(filename, "- Member of = [%s]\n", p_mc_grp_info->mc_grp_tbl[grp_idx].mc_grp.to_str().c_str()); } @@ -345,7 +341,6 @@ void print_netstat_like(socket_stats_t *p_si_stats, mc_grp_info_t *, FILE *file, // int len = 0; if (!p_si_stats->bound_if.is_anyaddr() || p_si_stats->bound_port) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ len = fprintf(file, "%s:%-5d", p_si_stats->bound_if.to_str(p_si_stats->sa_family).c_str(), ntohs(p_si_stats->bound_port)); @@ -360,7 +355,6 @@ void print_netstat_like(socket_stats_t *p_si_stats, mc_grp_info_t *, FILE *file, fprintf(file, " "); if (!p_si_stats->connected_ip.is_anyaddr() || p_si_stats->connected_port) { - /* cppcheck-suppress wrongPrintfScanfArgNum */ len = fprintf(file, "%s:%-5d", p_si_stats->connected_ip.to_str(p_si_stats->sa_family).c_str(), ntohs(p_si_stats->connected_port)); diff --git a/src/stats/stats_reader.cpp b/src/stats/stats_reader.cpp index 16d5e5e0c..d4991280b 100644 --- a/src/stats/stats_reader.cpp +++ b/src/stats/stats_reader.cpp @@ -1214,7 +1214,6 @@ void print_mc_group_fds(mc_group_fds_t *mc_group_fds, int array_size) printf("------------------------------\n"); for (int i = 0; i < array_size; i++) { char mcg_str[256]; - /* cppcheck-suppress wrongPrintfScanfArgNum */ sprintf(mcg_str, "[%s]", mc_group_fds[i].mc_grp.to_str().c_str()); printf("%-22s", mcg_str); for (const auto &fd : 
mc_group_fds[i].fd_list) { From 6803858224893d04b621d0831635ef2510740a8c Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Mon, 5 Feb 2024 18:45:53 +0200 Subject: [PATCH 081/169] issue: 3770816 Use override instead virtual Signed-off-by: Alex Briskin --- src/core/sock/fd_collection.h | 4 +- src/core/sock/pipeinfo.h | 22 +++---- src/core/sock/socket_fd_api.h | 2 +- src/core/sock/sockinfo.h | 38 +++++------ src/core/sock/sockinfo_nvme.h | 2 +- src/core/sock/sockinfo_tcp.h | 117 +++++++++++++++++---------------- src/core/sock/sockinfo_udp.h | 81 ++++++++++++----------- src/core/sock/sockinfo_ulp.cpp | 9 +-- src/core/sock/sockinfo_ulp.h | 2 +- src/core/sock/tcp_seg_pool.h | 2 +- 10 files changed, 141 insertions(+), 138 deletions(-) diff --git a/src/core/sock/fd_collection.h b/src/core/sock/fd_collection.h index 5918398c4..1acf35862 100644 --- a/src/core/sock/fd_collection.h +++ b/src/core/sock/fd_collection.h @@ -67,7 +67,7 @@ class cq_channel_info : public cleanable_obj { public: cq_channel_info(ring *p_ring) : m_p_ring(p_ring) {}; - ~cq_channel_info() {}; + ~cq_channel_info() override {}; ring *get_ring() const noexcept { return m_p_ring; }; protected: @@ -77,7 +77,7 @@ class cq_channel_info : public cleanable_obj { class fd_collection : private lock_mutex_recursive { public: fd_collection(); - ~fd_collection(); + ~fd_collection() override; /** * Create and add a sockinfo. Use get_sock() to get it. diff --git a/src/core/sock/pipeinfo.h b/src/core/sock/pipeinfo.h index 59218ff27..ce29495c6 100644 --- a/src/core/sock/pipeinfo.h +++ b/src/core/sock/pipeinfo.h @@ -41,32 +41,32 @@ class pipeinfo : public socket_fd_api, public timer_handler { public: pipeinfo(int fd); - ~pipeinfo(); + ~pipeinfo() override; - virtual void clean_obj(); + void clean_obj() override; #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - virtual void copy_sockopt_fork(const socket_fd_api *copy_from) { NOT_IN_USE(copy_from); } + void copy_sockopt_fork(const socket_fd_api *copy_from) override { NOT_IN_USE(copy_from); } #endif - int fcntl(int __cmd, unsigned long int __arg); - int fcntl64(int __cmd, unsigned long int __arg); - int ioctl(unsigned long int __request, unsigned long int __arg); + int fcntl(int __cmd, unsigned long int __arg) override; + int fcntl64(int __cmd, unsigned long int __arg) override; + int ioctl(unsigned long int __request, unsigned long int __arg) override; // Process a Rx request, we might have a ready packet, or we might block until // we have one (if sockinfo::m_b_blocking == true) ssize_t rx(const rx_call_t call_type, struct iovec *p_iov, ssize_t sz_iov, int *p_flags, struct sockaddr *__from = NULL, socklen_t *__fromlen = NULL, - struct msghdr *__msg = NULL); + struct msghdr *__msg = NULL) override; // Process a Tx request, handle all that is needed to send the packet, we might block // until the connection info is ready or a tx buffer is releast (if sockinfo::m_b_blocking == // true) - ssize_t tx(xlio_tx_call_attr_t &tx_arg); + ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; - void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); + void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) override; - virtual inline fd_type_t get_type() { return FD_TYPE_PIPE; } + inline fd_type_t get_type() override { return FD_TYPE_PIPE; } private: bool m_b_blocking; @@ -87,7 +87,7 @@ class pipeinfo : public socket_fd_api, public timer_handler { int m_write_count_no_change_count; bool m_b_lbm_event_q_pipe_timer_on; - void handle_timer_expired(void *user_data); + void handle_timer_expired(void 
*user_data) override; void write_lbm_pipe_enhance(); diff --git a/src/core/sock/socket_fd_api.h b/src/core/sock/socket_fd_api.h index e37567ac6..70c2a789d 100644 --- a/src/core/sock/socket_fd_api.h +++ b/src/core/sock/socket_fd_api.h @@ -144,7 +144,7 @@ typedef xlio_list_t xlio_des class socket_fd_api : public cleanable_obj { public: socket_fd_api(int fd); - virtual ~socket_fd_api(); + ~socket_fd_api() override; virtual void setPassthrough() {} virtual bool isPassthrough() { return false; } diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 1037990ad..75d77305d 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -161,7 +161,7 @@ class sockinfo : public socket_fd_api, public wakeup_pipe { public: sockinfo(int fd, int domain, bool use_ring_locks); - virtual ~sockinfo(); + ~sockinfo() override; enum sockinfo_state { SOCKINFO_UNDEFINED, @@ -172,15 +172,15 @@ class sockinfo : public socket_fd_api, }; #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - virtual void copy_sockopt_fork(const socket_fd_api *copy_from); + void copy_sockopt_fork(const socket_fd_api *copy_from) override; #endif #if defined(DEFINED_NGINX) void set_m_n_sysvar_rx_num_buffs_reuse(int val) { m_n_sysvar_rx_num_buffs_reuse = val; } #endif - virtual void consider_rings_migration_rx(); - virtual int add_epoll_context(epfd_info *epfd); - virtual void remove_epoll_context(epfd_info *epfd); + void consider_rings_migration_rx() override; + int add_epoll_context(epfd_info *epfd) override; + void remove_epoll_context(epfd_info *epfd) override; inline bool set_flow_tag(uint32_t flow_tag_id) { @@ -199,10 +199,10 @@ class sockinfo : public socket_fd_api, inline bool is_blocking(void) { return m_b_blocking; } bool flow_in_reuse(void) { return m_reuseaddr | m_reuseport; } - virtual int *get_rings_fds(int &res_length); - virtual int get_rings_num(); - virtual bool check_rings() { return m_p_rx_ring ? true : false; } - virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); + int *get_rings_fds(int &res_length) override; + int get_rings_num() override; + bool check_rings() override { return m_p_rx_ring ? 
true : false; } + void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) override; uint32_t get_flow_tag_val() { return m_flow_tag_id; } inline in_protocol_t get_protocol(void) { return m_protocol; } @@ -237,13 +237,13 @@ class sockinfo : public socket_fd_api, } virtual void set_blocking(bool is_blocked); - virtual int fcntl(int __cmd, unsigned long int __arg); - virtual int fcntl64(int __cmd, unsigned long int __arg); - virtual int ioctl(unsigned long int __request, unsigned long int __arg); - virtual int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); + int fcntl(int __cmd, unsigned long int __arg) override; + int fcntl64(int __cmd, unsigned long int __arg) override; + int ioctl(unsigned long int __request, unsigned long int __arg) override; + int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) override; int setsockopt_kernel(int __level, int __optname, const void *__optval, socklen_t __optlen, int supported, bool allow_priv); - virtual int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen); + int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) override; virtual mem_buf_desc_t *get_front_m_rx_pkt_ready_list() = 0; virtual size_t get_size_m_rx_pkt_ready_list() = 0; @@ -264,7 +264,7 @@ class sockinfo : public socket_fd_api, virtual void post_deqeue(bool release_buff) = 0; virtual int os_epoll_wait(epoll_event *ep_events, int maxevents); virtual int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags) = 0; - virtual int register_callback(xlio_recv_callback_t callback, void *context); + int register_callback(xlio_recv_callback_t callback, void *context) override; virtual size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, int *p_out_flags); @@ -288,14 +288,14 @@ class sockinfo : public socket_fd_api, const struct sockaddr *sock_addr_second = NULL); // This callback will notify that socket is ready to receive and map the cq. 
- virtual void rx_add_ring_cb(ring *p_ring); - virtual void rx_del_ring_cb(ring *p_ring); + void rx_add_ring_cb(ring *p_ring) override; + void rx_del_ring_cb(ring *p_ring) override; virtual void lock_rx_q() { m_lock_rcv.lock(); } virtual void unlock_rx_q() { m_lock_rcv.unlock(); } void shutdown_rx(); - void destructor_helper(); + void destructor_helper() override; int modify_ratelimit(dst_entry *p_dst_entry, struct xlio_rate_limit_t &rate_limit); void move_descs(ring *p_ring, descq_t *toq, descq_t *fromq, bool own); @@ -317,7 +317,7 @@ class sockinfo : public socket_fd_api, int os_wait_sock_rx_epfd(epoll_event *ep_events, int maxevents); virtual bool try_un_offloading(); // un-offload the socket if possible - bool is_shadow_socket_present() { return m_fd >= 0 && m_fd != m_rx_epfd; } + bool is_shadow_socket_present() override { return m_fd >= 0 && m_fd != m_rx_epfd; } inline bool is_socketxtreme() { return safe_mce_sys().enable_socketxtreme; } inline void set_events_socketxtreme(uint64_t events) diff --git a/src/core/sock/sockinfo_nvme.h b/src/core/sock/sockinfo_nvme.h index 78f647943..6bf9ac4b0 100644 --- a/src/core/sock/sockinfo_nvme.h +++ b/src/core/sock/sockinfo_nvme.h @@ -56,7 +56,7 @@ class sockinfo_tcp_ops_nvme : public sockinfo_tcp_ops { , m_is_ddgs_on(false) { } - ~sockinfo_tcp_ops_nvme() + ~sockinfo_tcp_ops_nvme() override { if (m_pdu_mdesc) { m_pdu_mdesc->put(); diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 9f2d5ae2d..32b2b947f 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -143,54 +143,54 @@ class sockinfo_tcp : public sockinfo, public timer_handler { } typedef xlio_list_t sock_list_t; sockinfo_tcp(int fd, int domain); - virtual ~sockinfo_tcp(); + ~sockinfo_tcp() override; - virtual void clean_obj(); + void clean_obj() override; void setPassthrough(bool _isPassthrough) { m_sock_offload = _isPassthrough ? TCP_SOCK_PASSTHROUGH : TCP_SOCK_LWIP; m_p_socket_stats->b_is_offloaded = !_isPassthrough; } - void setPassthrough() { setPassthrough(true); } - bool isPassthrough() { return m_sock_offload == TCP_SOCK_PASSTHROUGH; } + void setPassthrough() override { setPassthrough(true); } + bool isPassthrough() override { return m_sock_offload == TCP_SOCK_PASSTHROUGH; } - int prepareListen(); - int shutdown(int __how); + int prepareListen() override; + int shutdown(int __how) override; // Not always we can close immediately TCP socket: we can do that only after the TCP connection // in closed. In this method we just kikstarting the TCP connection termination (empty the // unsent/unacked, senf FIN...) 
Return val: true is the socket is already closable and false // otherwise - virtual bool prepare_to_close(bool process_shutdown = false); + bool prepare_to_close(bool process_shutdown = false) override; void create_dst_entry(); bool prepare_dst_to_send(bool is_accepted_socket = false); - virtual int fcntl(int __cmd, unsigned long int __arg); - virtual int fcntl64(int __cmd, unsigned long int __arg); - virtual int ioctl(unsigned long int __request, unsigned long int __arg); - virtual int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); + int fcntl(int __cmd, unsigned long int __arg) override; + int fcntl64(int __cmd, unsigned long int __arg) override; + int ioctl(unsigned long int __request, unsigned long int __arg) override; + int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) override; virtual int tcp_setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); - virtual int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen); + int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) override; int getsockopt_offload(int __level, int __optname, void *__optval, socklen_t *__optlen); - virtual int connect(const sockaddr *, socklen_t); - virtual int bind(const sockaddr *__addr, socklen_t __addrlen); - virtual int listen(int backlog); - virtual int accept(struct sockaddr *__addr, socklen_t *__addrlen); - virtual int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags); - virtual int getsockname(sockaddr *__name, socklen_t *__namelen); - virtual int getpeername(sockaddr *__name, socklen_t *__namelen); + int connect(const sockaddr *, socklen_t) override; + int bind(const sockaddr *__addr, socklen_t __addrlen) override; + int listen(int backlog) override; + int accept(struct sockaddr *__addr, socklen_t *__addrlen) override; + int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) override; + int getsockname(sockaddr *__name, socklen_t *__namelen) override; + int getpeername(sockaddr *__name, socklen_t *__namelen) override; inline bool handle_bind_no_port(int &bind_ret, in_port_t in_port, const sockaddr *__addr, socklen_t __addrlen); inline void non_tcp_recved(int rx_len); - virtual int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, - size_t count); + int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, + size_t count) override; void socketxtreme_recv_buffs_tcp(mem_buf_desc_t *desc, uint16_t len); - virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); + void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) override; inline struct tcp_pcb *get_pcb(void) { return &m_pcb; } @@ -201,18 +201,19 @@ class sockinfo_tcp : public sockinfo, public timer_handler { inline unsigned get_mss(void) { return m_pcb.mss; } - ssize_t tx(xlio_tx_call_attr_t &tx_arg); + ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; ssize_t tcp_tx(xlio_tx_call_attr_t &tx_arg); ssize_t rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, int *p_flags, - sockaddr *__from = NULL, socklen_t *__fromlen = NULL, struct msghdr *__msg = NULL); + sockaddr *__from = NULL, socklen_t *__fromlen = NULL, + struct msghdr *__msg = NULL) override; static err_t ip_output(struct pbuf *p, struct tcp_seg *seg, void *v_p_conn, uint16_t flags); static err_t ip_output_syn_ack(struct pbuf *p, struct tcp_seg *seg, void *v_p_conn, uint16_t flags); static void tcp_state_observer(void *pcb_container, enum tcp_state 
new_state); static uint16_t get_route_mtu(struct tcp_pcb *pcb); - virtual void update_header_field(data_updater *updater); - virtual bool rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void *pv_fd_ready_array); + void update_header_field(data_updater *updater) override; + bool rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void *pv_fd_ready_array) override; void abort_connection(); void tcp_shutdown_rx(void); @@ -233,10 +234,10 @@ class sockinfo_tcp : public sockinfo, public timer_handler { static void tcp_tx_zc_callback(mem_buf_desc_t *p_desc); void tcp_tx_zc_handle(mem_buf_desc_t *p_desc); - bool inline is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL); - bool inline is_writeable(); - bool inline is_errorable(int *errors); - bool is_closable() + bool inline is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL) override; + bool inline is_writeable() override; + bool inline is_errorable(int *errors) override; + bool is_closable() override { return get_tcp_state(&m_pcb) == CLOSED && m_syn_received.empty() && m_accepted_conns.empty(); @@ -253,21 +254,21 @@ class sockinfo_tcp : public sockinfo, public timer_handler { { return get_tcp_state(&m_pcb) == CLOSED && m_state == SOCKINFO_CLOSING; } - bool skip_os_select() + bool skip_os_select() override { // calling os select on offloaded TCP sockets makes no sense unless it's a listen socket // to make things worse, it returns that os fd is ready... return (m_sock_offload == TCP_SOCK_LWIP && !is_server() && m_conn_state != TCP_CONN_INIT); } - bool is_outgoing() + bool is_outgoing() override { const bool is_listen_socket = is_server() || get_tcp_state(&m_pcb) == LISTEN; // Excluding incoming and listen sockets we can determine outgoing sockets. return !m_b_incoming && !is_listen_socket; } - bool is_incoming() { return m_b_incoming; } + bool is_incoming() override { return m_b_incoming; } bool is_connected() { return m_sock_state == TCP_SOCK_CONNECTED_RDWR; } @@ -288,11 +289,11 @@ class sockinfo_tcp : public sockinfo, public timer_handler { return m_sock_state == TCP_SOCK_ACCEPT_READY || m_sock_state == TCP_SOCK_ACCEPT_SHUT; } - virtual void update_socket_timestamps(timestamps_t *ts) { m_rx_timestamps = *ts; } + void update_socket_timestamps(timestamps_t *ts) override { m_rx_timestamps = *ts; } - virtual inline fd_type_t get_type() { return FD_TYPE_SOCKET; } + inline fd_type_t get_type() override { return FD_TYPE_SOCKET; } - void handle_timer_expired(void *user_data); + void handle_timer_expired(void *user_data) override; inline ib_ctx_handler *get_ctx(void) { @@ -340,7 +341,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { static err_t rx_drop_lwip_cb(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); inline void rx_lwip_cb_socketxtreme_helper(pbuf *p); - virtual int register_callback(xlio_recv_callback_t callback, void *context) + int register_callback(xlio_recv_callback_t callback, void *context) override { tcp_recv(&m_pcb, sockinfo_tcp::rx_lwip_cb_recv_callback); return sockinfo::register_callback(callback, context); @@ -350,10 +351,10 @@ class sockinfo_tcp : public sockinfo, public timer_handler { xlio_express_flags flags, void *opaque_op); protected: - virtual void lock_rx_q(); - virtual void unlock_rx_q(); - virtual bool try_un_offloading(); // un-offload the socket if possible - virtual int os_epoll_wait(epoll_event *ep_events, int maxevents); + void lock_rx_q() override; + void unlock_rx_q() override; + bool try_un_offloading() override; // un-offload the socket if possible 
+ int os_epoll_wait(epoll_event *ep_events, int maxevents) override; private: int fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit); @@ -430,7 +431,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { /* * Supported only for UDP */ - virtual void handle_ip_pktinfo(struct cmsg_state *) {}; + void handle_ip_pktinfo(struct cmsg_state *) override {}; int handle_rx_error(bool blocking); @@ -451,9 +452,9 @@ class sockinfo_tcp : public sockinfo, public timer_handler { inline void return_pending_rx_buffs(); inline void return_pending_tx_buffs(); inline void reuse_buffer(mem_buf_desc_t *buff); - virtual mem_buf_desc_t *get_next_desc(mem_buf_desc_t *p_desc); - virtual mem_buf_desc_t *get_next_desc_peek(mem_buf_desc_t *p_desc, int &rx_pkt_ready_list_idx); - virtual timestamps_t *get_socket_timestamps(); + mem_buf_desc_t *get_next_desc(mem_buf_desc_t *p_desc) override; + mem_buf_desc_t *get_next_desc_peek(mem_buf_desc_t *p_desc, int &rx_pkt_ready_list_idx) override; + timestamps_t *get_socket_timestamps() override; inline void return_reuse_buffers_postponed() { @@ -491,22 +492,22 @@ class sockinfo_tcp : public sockinfo, public timer_handler { } } - virtual void post_deqeue(bool release_buff); - virtual int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags); + void post_deqeue(bool release_buff) override; + int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags) override; // Returns the connected pcb, with 5 tuple which matches the input arguments, // in state "SYN Received" or NULL if pcb wasn't found struct tcp_pcb *get_syn_received_pcb(const flow_tuple &key) const; struct tcp_pcb *get_syn_received_pcb(const sock_addr &src, const sock_addr &dst); - virtual mem_buf_desc_t *get_front_m_rx_pkt_ready_list(); - virtual size_t get_size_m_rx_pkt_ready_list(); - virtual void pop_front_m_rx_pkt_ready_list(); - virtual void push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff); + mem_buf_desc_t *get_front_m_rx_pkt_ready_list() override; + size_t get_size_m_rx_pkt_ready_list() override; + void pop_front_m_rx_pkt_ready_list() override; + void push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff) override; // lock_spin_recursive m_rx_cq_lck; /* pick all cqs that match given address */ - virtual int rx_verify_available_data(); + int rx_verify_available_data() override; inline int rx_wait(int &poll_count, bool blocking); inline int rx_wait_lockless(int &poll_count, bool blocking); int rx_wait_helper(int &poll_count, bool blocking); @@ -617,19 +618,19 @@ typedef struct tcp_seg tcp_seg; class tcp_timers_collection : public timers_group, public cleanable_obj { public: tcp_timers_collection(int period, int resolution); - virtual ~tcp_timers_collection(); + ~tcp_timers_collection() override; - void clean_obj(); + void clean_obj() override; - virtual void handle_timer_expired(void *user_data); + void handle_timer_expired(void *user_data) override; protected: // add a new timer - void add_new_timer(timer_node_t *node, timer_handler *handler, void *user_data); + void add_new_timer(timer_node_t *node, timer_handler *handler, void *user_data) override; // remove timer from list and free it. 
// called for stopping (unregistering) a timer - void remove_timer(timer_node_t *node); + void remove_timer(timer_node_t *node) override; void *m_timer_handle; @@ -649,7 +650,7 @@ class tcp_timers_collection : public timers_group, public cleanable_obj { class thread_local_tcp_timers : public tcp_timers_collection { public: thread_local_tcp_timers(); - ~thread_local_tcp_timers(); + ~thread_local_tcp_timers() override; }; extern tcp_timers_collection *g_tcp_timers_collection; diff --git a/src/core/sock/sockinfo_udp.h b/src/core/sock/sockinfo_udp.h index b6e298f53..603f916fe 100644 --- a/src/core/sock/sockinfo_udp.h +++ b/src/core/sock/sockinfo_udp.h @@ -87,19 +87,19 @@ typedef std::unordered_map> mc_m class sockinfo_udp : public sockinfo { public: sockinfo_udp(int fd, int domain); - virtual ~sockinfo_udp(); + ~sockinfo_udp() override; - void setPassthrough() { m_p_socket_stats->b_is_offloaded = m_sock_offload = false; } - bool isPassthrough() { return !m_sock_offload; } + void setPassthrough() override { m_p_socket_stats->b_is_offloaded = m_sock_offload = false; } + bool isPassthrough() override { return !m_sock_offload; } int prepare_to_connect(const sockaddr *__to, socklen_t __tolen); int bind_no_os(); - int bind(const struct sockaddr *__addr, socklen_t __addrlen); - int connect(const struct sockaddr *__to, socklen_t __tolen); - virtual int getsockname(sockaddr *__name, socklen_t *__namelen); - int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); - int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen); + int bind(const struct sockaddr *__addr, socklen_t __addrlen) override; + int connect(const struct sockaddr *__to, socklen_t __tolen) override; + int getsockname(sockaddr *__name, socklen_t *__namelen) override; + int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) override; + int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) override; int resolve_if_ip(const int if_index, const ip_address &ip, ip_address &resolved_ip); int fill_mc_structs_ip6(int optname, const void *optval, mc_pending_pram *mcpram); @@ -110,17 +110,18 @@ class sockinfo_udp : public sockinfo { * Sampling the OS immediately by matching the rx_skip_os counter * (m_rx_udp_poll_os_ratio_counter) to the limit (safe_mce_sys().rx_udp_poll_os_ratio) */ - void set_immediate_os_sample(); + void set_immediate_os_sample() override; /** * Reseting rx_skip_os counter to prevent sampling OS immediately */ - void unset_immediate_os_sample(); + void unset_immediate_os_sample() override; /** * Process a Rx request, we might have a ready packet, or we might block until * we have one (if sockinfo::m_b_blocking == true) */ ssize_t rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, int *p_flags, - sockaddr *__from = NULL, socklen_t *__fromlen = NULL, struct msghdr *__msg = NULL); + sockaddr *__from = NULL, socklen_t *__fromlen = NULL, + struct msghdr *__msg = NULL) override; /** * Check that a call to this sockinfo rx() will not block * -> meaning, we got an offloaded ready rx datagram @@ -128,7 +129,7 @@ class sockinfo_udp : public sockinfo { * * While polling CQ, the fd_array is filled with a list of newly queued packets FD's */ - bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL); + bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL) override; /** * Arm the event channel(s) assosiated with this sockinfo * Fill the fd_set (p_rxfds) with the correct fd channel values and the 
p_nfds with the (max_fd @@ -141,14 +142,14 @@ class sockinfo_udp : public sockinfo { * until the connection info is ready or a tx buffer is releast (if sockinfo::m_b_blocking == * true) */ - ssize_t tx(xlio_tx_call_attr_t &tx_arg); + ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; /** * Check that a call to this sockinof rx() will not block * -> meaning, we got a ready rx packet */ - void rx_add_ring_cb(ring *p_ring); - void rx_del_ring_cb(ring *p_ring); - virtual int rx_verify_available_data(); + void rx_add_ring_cb(ring *p_ring) override; + void rx_del_ring_cb(ring *p_ring) override; + int rx_verify_available_data() override; /** * This callback will handle ready rx packet notification, @@ -158,25 +159,25 @@ class sockinfo_udp : public sockinfo { * incremented and method returns false. * Normally it is single point from sockinfo to be called from ring level. */ - bool rx_input_cb(mem_buf_desc_t *p_desc, void *pv_fd_ready_array); + bool rx_input_cb(mem_buf_desc_t *p_desc, void *pv_fd_ready_array) override; // This call will handle all rdma related events (bind->listen->connect_req->accept) - virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); - virtual int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, - size_t count); - virtual inline fd_type_t get_type() { return FD_TYPE_SOCKET; } + void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) override; + int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, + size_t count) override; + inline fd_type_t get_type() override { return FD_TYPE_SOCKET; } - virtual bool prepare_to_close(bool process_shutdown = false); - virtual void update_header_field(data_updater *updater); + bool prepare_to_close(bool process_shutdown = false) override; + void update_header_field(data_updater *updater) override; #if defined(DEFINED_NGINX) - virtual void prepare_to_close_socket_pool(bool _push_pop); - virtual void set_params_for_socket_pool() + void prepare_to_close_socket_pool(bool _push_pop) override; + void set_params_for_socket_pool() override { m_is_for_socket_pool = true; set_m_n_sysvar_rx_num_buffs_reuse(safe_mce_sys().nginx_udp_socket_pool_rx_num_buffs_reuse); } - bool is_closable() { return !m_is_for_socket_pool; } + bool is_closable() override { return !m_is_for_socket_pool; } #endif private: @@ -195,7 +196,7 @@ class sockinfo_udp : public sockinfo { void handle_pending_mreq(); void original_os_setsockopt_helper(const void *pram, int pram_size, int optname, int level); /* helper functions */ - void set_blocking(bool is_blocked); + void set_blocking(bool is_blocked) override; void rx_ready_byte_count_limit_update( size_t n_rx_ready_bytes_limit); // Drop rx ready packets from head of queue @@ -212,10 +213,10 @@ class sockinfo_udp : public sockinfo { inline int poll_os(); virtual inline void reuse_buffer(mem_buf_desc_t *buff); - virtual mem_buf_desc_t *get_next_desc(mem_buf_desc_t *p_desc); - virtual mem_buf_desc_t *get_next_desc_peek(mem_buf_desc_t *p_desc, int &rx_pkt_ready_list_idx); - virtual timestamps_t *get_socket_timestamps(); - virtual void update_socket_timestamps(timestamps_t *) {}; + mem_buf_desc_t *get_next_desc(mem_buf_desc_t *p_desc) override; + mem_buf_desc_t *get_next_desc_peek(mem_buf_desc_t *p_desc, int &rx_pkt_ready_list_idx) override; + timestamps_t *get_socket_timestamps() override; + void update_socket_timestamps(timestamps_t *) override {}; inline void return_reuse_buffers_postponed() { @@ -248,16 +249,16 @@ class sockinfo_udp : public sockinfo { inline void 
update_ready(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array, xlio_recv_callback_retval_t cb_ret); - virtual void post_deqeue(bool release_buff); - virtual int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags); - virtual size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, - int *p_out_flags); - virtual void handle_ip_pktinfo(struct cmsg_state *cm_state); + void post_deqeue(bool release_buff) override; + int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags) override; + size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, + int *p_out_flags) override; + void handle_ip_pktinfo(struct cmsg_state *cm_state) override; - virtual mem_buf_desc_t *get_front_m_rx_pkt_ready_list(); - virtual size_t get_size_m_rx_pkt_ready_list(); - virtual void pop_front_m_rx_pkt_ready_list(); - virtual void push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff); + mem_buf_desc_t *get_front_m_rx_pkt_ready_list() override; + size_t get_size_m_rx_pkt_ready_list() override; + void pop_front_m_rx_pkt_ready_list() override; + void push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff) override; private: struct port_socket_t { diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp index f2d4303b5..71f088df3 100644 --- a/src/core/sock/sockinfo_ulp.cpp +++ b/src/core/sock/sockinfo_ulp.cpp @@ -222,7 +222,7 @@ class tls_record : public mem_desc { m_p_zc_data = nullptr; } - ~tls_record() + ~tls_record() override { /* * Because of batching, buffers can be freed after their socket @@ -236,9 +236,9 @@ class tls_record : public mem_desc { } } - void get(void) { (void)atomic_fetch_and_inc(&m_ref); } + void get(void) override { (void)atomic_fetch_and_inc(&m_ref); } - void put(void) + void put(void) override { int ref = atomic_fetch_and_dec(&m_ref); @@ -247,7 +247,8 @@ class tls_record : public mem_desc { } } - uint32_t get_lkey(mem_buf_desc_t *desc, ib_ctx_handler *ib_ctx, const void *addr, size_t len) + uint32_t get_lkey(mem_buf_desc_t *desc, ib_ctx_handler *ib_ctx, const void *addr, + size_t len) override { const uintptr_t uaddr = (uintptr_t)addr; const uintptr_t ubuf = (uintptr_t)m_p_buf->p_buffer; diff --git a/src/core/sock/sockinfo_ulp.h b/src/core/sock/sockinfo_ulp.h index 7a2ce6c70..acca65a0e 100644 --- a/src/core/sock/sockinfo_ulp.h +++ b/src/core/sock/sockinfo_ulp.h @@ -84,7 +84,7 @@ void xlio_tls_api_setup(void); class sockinfo_tcp_ops_tls : public sockinfo_tcp_ops { public: sockinfo_tcp_ops_tls(sockinfo_tcp *sock); - ~sockinfo_tcp_ops_tls(); + ~sockinfo_tcp_ops_tls() override; int setsockopt(int, int, const void *, socklen_t) override; ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; diff --git a/src/core/sock/tcp_seg_pool.h b/src/core/sock/tcp_seg_pool.h index b7c9852eb..cfe355467 100644 --- a/src/core/sock/tcp_seg_pool.h +++ b/src/core/sock/tcp_seg_pool.h @@ -41,7 +41,7 @@ class tcp_seg_pool : lock_spin { public: tcp_seg_pool(); - virtual ~tcp_seg_pool(); + ~tcp_seg_pool() override; std::pair get_tcp_seg_list(uint32_t amount); tcp_seg *get_tcp_segs(uint32_t amount); From a095c8b4308a91b00f72d4a15df465d1d73d8866 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Tue, 6 Feb 2024 10:04:58 +0200 Subject: [PATCH 082/169] issue: 3770816 Use nullptr instead of NULL Signed-off-by: Alex Briskin --- src/core/dev/buffer_pool.cpp | 12 +- src/core/dev/cq_mgr_rx.cpp | 24 +- src/core/dev/cq_mgr_rx.h | 14 +- src/core/dev/cq_mgr_rx_inl.h | 2 +- src/core/dev/cq_mgr_rx_regrq.cpp | 16 +- src/core/dev/cq_mgr_rx_regrq.h | 4 +- 
src/core/dev/cq_mgr_rx_strq.cpp | 8 +- src/core/dev/cq_mgr_tx.cpp | 6 +- src/core/dev/dm_mgr.cpp | 14 +- src/core/dev/hw_queue_rx.cpp | 12 +- src/core/dev/hw_queue_tx.cpp | 38 +-- src/core/dev/ib_ctx_handler.cpp | 40 +-- src/core/dev/ib_ctx_handler_collection.cpp | 19 +- src/core/dev/ib_ctx_handler_collection.h | 2 +- src/core/dev/net_device_entry.cpp | 4 +- src/core/dev/net_device_table_mgr.cpp | 26 +- src/core/dev/net_device_table_mgr.h | 4 +- src/core/dev/net_device_val.cpp | 68 ++--- src/core/dev/rfs.cpp | 8 +- src/core/dev/rfs.h | 2 +- src/core/dev/rfs_mc.h | 2 +- src/core/dev/rfs_uc.h | 2 +- src/core/dev/rfs_uc_tcp_gro.cpp | 4 +- src/core/dev/rfs_uc_tcp_gro.h | 2 +- src/core/dev/ring.cpp | 6 +- src/core/dev/ring.h | 6 +- src/core/dev/ring_bond.cpp | 46 ++-- src/core/dev/ring_bond.h | 11 +- src/core/dev/ring_simple.cpp | 34 +-- src/core/dev/ring_simple.h | 14 +- src/core/dev/ring_slave.cpp | 33 +-- src/core/dev/ring_tap.cpp | 16 +- src/core/dev/ring_tap.h | 7 +- src/core/dev/time_converter.cpp | 2 +- src/core/dev/time_converter_ib_ctx.cpp | 6 +- src/core/dev/time_converter_ptp.cpp | 2 +- src/core/dev/wqe_send_handler.cpp | 2 +- src/core/event/delta_timer.cpp | 24 +- src/core/event/event.h | 2 +- src/core/event/event_handler_manager.cpp | 28 +- src/core/event/event_handler_manager.h | 2 +- src/core/event/netlink_event.cpp | 4 +- src/core/event/vlogger_timer_handler.cpp | 8 +- src/core/ib/mlx5/ib_mlx5.cpp | 2 +- src/core/infra/DemoObserver.cpp | 4 +- src/core/infra/cache_subject_observer.h | 6 +- src/core/infra/subject_observer.h | 2 +- src/core/iomux/epfd_info.cpp | 18 +- src/core/iomux/epfd_info.h | 4 +- src/core/iomux/epoll_wait_call.cpp | 8 +- src/core/iomux/epoll_wait_call.h | 2 +- src/core/iomux/io_mux_call.cpp | 6 +- src/core/iomux/io_mux_call.h | 2 +- src/core/iomux/poll_call.cpp | 8 +- src/core/iomux/poll_call.h | 2 +- src/core/iomux/select_call.cpp | 10 +- src/core/iomux/select_call.h | 2 +- src/core/main.cpp | 114 ++++---- src/core/netlink/neigh_info.cpp | 4 +- src/core/netlink/neigh_info.h | 4 +- src/core/netlink/netlink_wrapper.cpp | 32 +-- src/core/netlink/test_main.cpp | 2 +- src/core/proto/L2_address.cpp | 2 +- src/core/proto/dst_entry.cpp | 86 +++--- src/core/proto/dst_entry.h | 2 +- src/core/proto/dst_entry_tcp.cpp | 20 +- src/core/proto/dst_entry_tcp.h | 4 +- src/core/proto/dst_entry_udp.cpp | 14 +- src/core/proto/dst_entry_udp.h | 4 +- src/core/proto/dst_entry_udp_mc.cpp | 4 +- src/core/proto/ip_frag.cpp | 38 +-- src/core/proto/mapping.cpp | 28 +- src/core/proto/mapping.h | 4 +- src/core/proto/mem_buf_desc.h | 6 +- src/core/proto/mem_desc.h | 2 +- src/core/proto/neighbour.cpp | 106 ++++---- src/core/proto/neighbour.h | 12 +- src/core/proto/neighbour_table_mgr.cpp | 6 +- src/core/proto/netlink_socket_mgr.cpp | 2 +- src/core/proto/nvme_parse_input_args.h | 4 +- src/core/proto/route_entry.cpp | 14 +- src/core/proto/route_table_mgr.cpp | 10 +- src/core/proto/rule_table_mgr.cpp | 2 +- src/core/proto/xlio_lwip.cpp | 4 +- src/core/sock/fd_collection.cpp | 38 +-- src/core/sock/fd_collection.h | 4 +- src/core/sock/pipeinfo.cpp | 12 +- src/core/sock/pipeinfo.h | 4 +- src/core/sock/sock-app.cpp | 2 +- src/core/sock/sock-app.h | 2 +- src/core/sock/sock-extra.cpp | 26 +- src/core/sock/sock-redirect.cpp | 106 ++++---- src/core/sock/socket_fd_api.cpp | 4 +- src/core/sock/socket_fd_api.h | 8 +- src/core/sock/sockinfo.cpp | 47 ++-- src/core/sock/sockinfo.h | 6 +- src/core/sock/sockinfo_nvme.cpp | 21 +- src/core/sock/sockinfo_tcp.cpp | 142 +++++----- src/core/sock/sockinfo_tcp.h | 6 
+- src/core/sock/sockinfo_udp.cpp | 22 +- src/core/sock/sockinfo_udp.h | 6 +- src/core/sock/sockinfo_ulp.cpp | 18 +- src/core/sock/tcp_seg_pool.cpp | 6 +- src/core/util/agent.cpp | 32 +-- src/core/util/hugepage_mgr.cpp | 2 +- src/core/util/match.cpp | 29 +- src/core/util/sg_array.h | 8 +- src/core/util/sys_vars.cpp | 296 ++++++++++----------- src/core/util/sysctl_reader.h | 2 +- src/core/util/utils.cpp | 18 +- src/core/util/utils.h | 10 +- src/core/util/wakeup_pipe.cpp | 2 +- src/core/xlio.h | 1 + src/stats/stats_data_reader.h | 8 +- 114 files changed, 1041 insertions(+), 1039 deletions(-) diff --git a/src/core/dev/buffer_pool.cpp b/src/core/dev/buffer_pool.cpp index 5b276bd32..074f028f3 100644 --- a/src/core/dev/buffer_pool.cpp +++ b/src/core/dev/buffer_pool.cpp @@ -47,22 +47,22 @@ // When Striding RQ is on, it points to g_buffer_pool_rx_stride since the upper layers work with // strides. When Striding RQ is off, it points to g_buffer_pool_rx_rwqe since the upper layers work // with RWQEs buffers themselves. -buffer_pool *g_buffer_pool_rx_ptr = NULL; +buffer_pool *g_buffer_pool_rx_ptr = nullptr; // This buffer-pool holds buffer descriptors which represent strides in strided RWQEs. // These buffers descriptos do not actually own a buffer. // Each such descriptor points into a portion of a buffer of a g_buffer_pool_rx_rwqe descriptor. -buffer_pool *g_buffer_pool_rx_stride = NULL; +buffer_pool *g_buffer_pool_rx_stride = nullptr; // This buffer-pool holds the actual buffers for receive WQEs. -buffer_pool *g_buffer_pool_rx_rwqe = NULL; +buffer_pool *g_buffer_pool_rx_rwqe = nullptr; // This buffer-pool holds the actual buffers for send WQEs. -buffer_pool *g_buffer_pool_tx = NULL; +buffer_pool *g_buffer_pool_tx = nullptr; // This buffer-pool holds buffer descriptors for zero copy TX. // These buffer descriptors do not actually own a buffer. -buffer_pool *g_buffer_pool_zc = NULL; +buffer_pool *g_buffer_pool_zc = nullptr; // inlining a function only help in case it come before using it... 
inline void buffer_pool::put_buffer_helper(mem_buf_desc_t *buff) @@ -281,7 +281,7 @@ bool buffer_pool::get_buffers_thread_safe(descq_t &pDeque, ring_slave *desc_owne // Remove from list head = m_p_head; m_p_head = m_p_head->p_next_desc; - head->p_next_desc = NULL; + head->p_next_desc = nullptr; // Init head->lkey = lkey; diff --git a/src/core/dev/cq_mgr_rx.cpp b/src/core/dev/cq_mgr_rx.cpp index 8b9e268a1..e2986782a 100644 --- a/src/core/dev/cq_mgr_rx.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -188,7 +188,7 @@ void cq_mgr_rx::add_hqrx(hw_queue_rx *hqrx_ptr) { m_hqrx_ptr = hqrx_ptr; m_hqrx_ptr->m_rq_wqe_counter = 0; // In case of bonded hqrx, wqe_counter must be reset to zero - m_rx_hot_buffer = NULL; + m_rx_hot_buffer = nullptr; if (0 != xlio_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { cq_logpanic("xlio_ib_mlx5_get_cq failed (errno=%d %m)", errno); @@ -349,17 +349,17 @@ mem_buf_desc_t *cq_mgr_rx::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum b /* we use context to verify that on reclaim rx buffer path we return the buffer to the right CQ */ p_mem_buf_desc->rx.is_xlio_thr = false; - p_mem_buf_desc->rx.context = NULL; + p_mem_buf_desc->rx.context = nullptr; if (unlikely(status != BS_OK)) { - m_p_next_rx_desc_poll = NULL; + m_p_next_rx_desc_poll = nullptr; reclaim_recv_buffer_helper(p_mem_buf_desc); - return NULL; + return nullptr; } if (m_n_sysvar_rx_prefetch_bytes_before_poll) { m_p_next_rx_desc_poll = p_mem_buf_desc->p_prev_desc; - p_mem_buf_desc->p_prev_desc = NULL; + p_mem_buf_desc->p_prev_desc = nullptr; } VALGRIND_MAKE_MEM_DEFINED(p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_data); @@ -409,15 +409,15 @@ void cq_mgr_rx::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) { if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) { if (likely(buff->p_desc_owner == m_p_ring)) { - mem_buf_desc_t *temp = NULL; + mem_buf_desc_t *temp = nullptr; while (buff) { VLIST_DEBUG_CQ_MGR_PRINT_ERROR_IS_MEMBER; temp = buff; assert(temp->lwip_pbuf.pbuf.type != PBUF_ZEROCOPY); buff = temp->p_next_desc; temp->clear_transport_data(); - temp->p_next_desc = NULL; - temp->p_prev_desc = NULL; + temp->p_next_desc = nullptr; + temp->p_prev_desc = nullptr; temp->reset_ref_count(); free_lwip_pbuf(&temp->lwip_pbuf); m_rx_pool.push_back(temp); @@ -443,7 +443,7 @@ bool cq_mgr_rx::reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst) { if (m_rx_buffs_rdy_for_free_head) { reclaim_recv_buffer_helper(m_rx_buffs_rdy_for_free_head); - m_rx_buffs_rdy_for_free_head = m_rx_buffs_rdy_for_free_tail = NULL; + m_rx_buffs_rdy_for_free_head = m_rx_buffs_rdy_for_free_tail = nullptr; } reclaim_recv_buffer_helper(rx_reuse_lst); return_extra_buffers(); @@ -475,7 +475,7 @@ int cq_mgr_rx::reclaim_recv_single_buffer(mem_buf_desc_t *rx_reuse) m_rx_buffs_rdy_for_free_tail->p_next_desc = rx_reuse; m_rx_buffs_rdy_for_free_tail = rx_reuse; } - m_rx_buffs_rdy_for_free_tail->p_next_desc = NULL; + m_rx_buffs_rdy_for_free_tail->p_next_desc = nullptr; /*if ((safe_mce_sys().thread_mode > THREAD_MODE_SINGLE)) { m_lock_ring_rx.lock(); }*/ @@ -541,8 +541,8 @@ int cq_mgr_rx::wait_for_notification_and_process_element(uint64_t *p_cq_poll_sn, cq_logfunc(""); if (m_b_notification_armed) { - cq_mgr_rx *p_cq_mgr_context = NULL; - struct ibv_cq *p_cq_hndl = NULL; + cq_mgr_rx *p_cq_mgr_context = nullptr; + struct ibv_cq *p_cq_hndl = nullptr; void *p; // deal with compiler warnings // Block on the cq_mgr_rx's notification event channel diff --git a/src/core/dev/cq_mgr_rx.h b/src/core/dev/cq_mgr_rx.h index 85d9cdf2d..0d23ae750 100644 --- 
a/src/core/dev/cq_mgr_rx.h +++ b/src/core/dev/cq_mgr_rx.h @@ -109,7 +109,7 @@ class cq_mgr_rx { * (on non-blocked channel) (some other thread beat you to it) */ int wait_for_notification_and_process_element(uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); /** * This will poll n_num_poll time on the cq or stop early if it gets @@ -119,7 +119,7 @@ class cq_mgr_rx { * < 0 error */ virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL) = 0; + void *pv_fd_ready_array = nullptr) = 0; virtual mem_buf_desc_t *poll_and_process_socketxtreme() { return nullptr; }; /** @@ -128,14 +128,14 @@ class cq_mgr_rx { * @return >=0 number of wce processed * < 0 error */ - virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) = 0; + virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = nullptr) = 0; // CQ implements the Rx mem_buf_desc_owner. // These callbacks will be called for each Rx buffer that passed processed completion // Rx completion handling at the cq_mgr_rx level is forwarding the packet to the ib_comm_mgr // layer void mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); virtual void add_hqrx(hw_queue_rx *hqrx_ptr); virtual void del_hqrx(hw_queue_rx *hqrx_ptr); @@ -159,7 +159,7 @@ class cq_mgr_rx { */ void compensate_qp_poll_failed(); void lro_update_hdr(struct xlio_mlx5_cqe *cqe, mem_buf_desc_t *p_rx_wc_buf_desc); - inline void process_recv_buffer(mem_buf_desc_t *buff, void *pv_fd_ready_array = NULL); + inline void process_recv_buffer(mem_buf_desc_t *buff, void *pv_fd_ready_array = nullptr); inline void update_global_sn_rx(uint64_t &cq_poll_sn, uint32_t rettotal); @@ -172,7 +172,7 @@ class cq_mgr_rx { // Returns true if the given buffer was used, // false if the given buffer was not used. 
bool compensate_qp_poll_success(mem_buf_desc_t *buff); - inline uint32_t process_recv_queue(void *pv_fd_ready_array = NULL); + inline uint32_t process_recv_queue(void *pv_fd_ready_array = nullptr); virtual void statistics_print(); @@ -257,7 +257,7 @@ inline struct xlio_mlx5_cqe *cq_mgr_rx::check_cqe(void) return cqe; } - return NULL; + return nullptr; } #endif // CQ_MGR_H diff --git a/src/core/dev/cq_mgr_rx_inl.h b/src/core/dev/cq_mgr_rx_inl.h index c03fe082e..a27ef15be 100644 --- a/src/core/dev/cq_mgr_rx_inl.h +++ b/src/core/dev/cq_mgr_rx_inl.h @@ -80,7 +80,7 @@ inline bool is_eth_tcp_frame(mem_buf_desc_t *buff) uint16_t h_proto = p_eth_h->h_proto; size_t transport_header_len = ETH_HDR_LEN; - struct vlanhdr *p_vlan_hdr = NULL; + struct vlanhdr *p_vlan_hdr = nullptr; if (h_proto == htons(ETH_P_8021Q)) { p_vlan_hdr = (struct vlanhdr *)((uint8_t *)p_eth_h + transport_header_len); transport_header_len = ETH_VLAN_HDR_LEN; diff --git a/src/core/dev/cq_mgr_rx_regrq.cpp b/src/core/dev/cq_mgr_rx_regrq.cpp index 1292c0a6d..85d6e2fc5 100644 --- a/src/core/dev/cq_mgr_rx_regrq.cpp +++ b/src/core/dev/cq_mgr_rx_regrq.cpp @@ -63,7 +63,7 @@ uint32_t cq_mgr_rx_regrq::clean_cq() uint64_t cq_poll_sn = 0; mem_buf_desc_t *buff; - if (NULL == m_hqrx_ptr) { // Sanity check + if (!m_hqrx_ptr) { // Sanity check return 0; } @@ -86,9 +86,9 @@ cq_mgr_rx_regrq::~cq_mgr_rx_regrq() mem_buf_desc_t *cq_mgr_rx_regrq::poll(enum buff_status_e &status) { - mem_buf_desc_t *buff = NULL; + mem_buf_desc_t *buff = nullptr; - if (unlikely(NULL == m_rx_hot_buffer)) { + if (unlikely(!m_rx_hot_buffer)) { if (likely(m_hqrx_ptr->m_rq_data.tail != (m_hqrx_ptr->m_rq_data.head))) { uint32_t index = m_hqrx_ptr->m_rq_data.tail & (m_hqrx_ptr->m_rx_num_wr - 1); m_rx_hot_buffer = (mem_buf_desc_t *)m_hqrx_ptr->m_rq_wqe_idx_to_wrid[index]; @@ -99,7 +99,7 @@ mem_buf_desc_t *cq_mgr_rx_regrq::poll(enum buff_status_e &status) } else { /* If rq_tail and rq_head are pointing to the same wqe, * the wq is empty and there is no cqe to be received */ - return NULL; + return nullptr; } } xlio_mlx5_cqe *cqe = check_cqe(); @@ -113,7 +113,7 @@ mem_buf_desc_t *cq_mgr_rx_regrq::poll(enum buff_status_e &status) *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci & 0xffffff); buff = m_rx_hot_buffer; - m_rx_hot_buffer = NULL; + m_rx_hot_buffer = nullptr; } else { prefetch((void *)m_rx_hot_buffer); } @@ -261,10 +261,10 @@ int cq_mgr_rx_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id (p_recycle_buffers_last_wr_id)) { buff_status_e status = BS_OK; mem_buf_desc_t *buff = poll(status); - if (NULL == buff) { + if (!buff) { update_global_sn_rx(cq_poll_sn, ret_total); m_b_was_drained = true; - m_p_ring->m_gro_mgr.flush_all(NULL); + m_p_ring->m_gro_mgr.flush_all(nullptr); return ret_total; } @@ -305,7 +305,7 @@ int cq_mgr_rx_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id update_global_sn_rx(cq_poll_sn, ret_total); - m_p_ring->m_gro_mgr.flush_all(NULL); + m_p_ring->m_gro_mgr.flush_all(nullptr); m_n_wce_counter = 0; m_b_was_drained = false; diff --git a/src/core/dev/cq_mgr_rx_regrq.h b/src/core/dev/cq_mgr_rx_regrq.h index c5ab51cf8..8a02f77a0 100644 --- a/src/core/dev/cq_mgr_rx_regrq.h +++ b/src/core/dev/cq_mgr_rx_regrq.h @@ -42,10 +42,10 @@ class cq_mgr_rx_regrq : public cq_mgr_rx { virtual ~cq_mgr_rx_regrq() override; - virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) override; + virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = nullptr) override; virtual mem_buf_desc_t 
*poll_and_process_socketxtreme() override; virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL) override; + void *pv_fd_ready_array = nullptr) override; virtual uint32_t clean_cq() override; diff --git a/src/core/dev/cq_mgr_rx_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp index e54c8d50e..ceab8e9f7 100644 --- a/src/core/dev/cq_mgr_rx_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ -128,7 +128,7 @@ uint32_t cq_mgr_rx_strq::clean_cq() uint32_t ret_total = 0; uint64_t cq_poll_sn = 0; - if (NULL == m_hqrx_ptr) { // Sanity check + if (!m_hqrx_ptr) { // Sanity check return 0; } @@ -165,11 +165,11 @@ bool cq_mgr_rx_strq::set_current_hot_buffer() mem_buf_desc_t *cq_mgr_rx_strq::poll(enum buff_status_e &status, mem_buf_desc_t *&buff_stride) { - mem_buf_desc_t *buff = NULL; + mem_buf_desc_t *buff = nullptr; if (unlikely(!m_rx_hot_buffer)) { if (!set_current_hot_buffer()) { - return NULL; + return nullptr; } } @@ -193,7 +193,7 @@ mem_buf_desc_t *cq_mgr_rx_strq::poll(enum buff_status_e &status, mem_buf_desc_t if (is_wqe_complete) { ++m_hqrx_ptr->m_rq_data.tail; buff = m_rx_hot_buffer; - m_rx_hot_buffer = NULL; + m_rx_hot_buffer = nullptr; if (likely(status == BS_OK)) { ++m_p_cq_stat->n_rx_consumed_rwqe_count; } diff --git a/src/core/dev/cq_mgr_tx.cpp b/src/core/dev/cq_mgr_tx.cpp index 9f2ba0a6a..17fceb582 100644 --- a/src/core/dev/cq_mgr_tx.cpp +++ b/src/core/dev/cq_mgr_tx.cpp @@ -234,8 +234,8 @@ int cq_mgr_tx::request_notification(uint64_t poll_sn) cq_mgr_tx *cq_mgr_tx::get_cq_mgr_from_cq_event(struct ibv_comp_channel *p_cq_channel) { - cq_mgr_tx *p_cq_mgr = NULL; - struct ibv_cq *p_cq_hndl = NULL; + cq_mgr_tx *p_cq_mgr = nullptr; + struct ibv_cq *p_cq_hndl = nullptr; void *p_context; // deal with compiler warnings // read & ack the CQ event @@ -342,7 +342,7 @@ void cq_mgr_tx::handle_sq_wqe_prop(unsigned index) prev = p; p = p->next; - } while (p != NULL && m_hqtx_ptr->is_sq_wqe_prop_valid(p, prev)); + } while (p && m_hqtx_ptr->is_sq_wqe_prop_valid(p, prev)); m_p_ring->return_tx_pool_to_global_pool(); m_hqtx_ptr->credits_return(credits); diff --git a/src/core/dev/dm_mgr.cpp b/src/core/dev/dm_mgr.cpp index 826d6b89a..3afb547b8 100644 --- a/src/core/dev/dm_mgr.cpp +++ b/src/core/dev/dm_mgr.cpp @@ -53,9 +53,9 @@ #define dm_logfunc __log_info_func dm_mgr::dm_mgr() - : m_p_dm_mr(NULL) - , m_p_ibv_dm(NULL) - , m_p_ring_stat(NULL) + : m_p_dm_mr(nullptr) + , m_p_ibv_dm(nullptr) + , m_p_ring_stat(nullptr) , m_allocation(0) , m_used(0) , m_head(0) {}; @@ -106,7 +106,7 @@ bool dm_mgr::allocate_resources(ib_ctx_handler *ib_ctx, ring_stats_t *ring_stats m_p_dm_mr = xlio_ibv_reg_dm_mr(&mr_in); if (!m_p_dm_mr) { xlio_ibv_free_dm(m_p_ibv_dm); - m_p_ibv_dm = NULL; + m_p_ibv_dm = nullptr; dm_logerr("ibv_free_dm error - dm_mr registration failed, %d %m", errno); return false; } @@ -132,7 +132,7 @@ void dm_mgr::release_resources() } else { dm_logdbg("ibv_dereg_mr success"); } - m_p_dm_mr = NULL; + m_p_dm_mr = nullptr; } if (m_p_ibv_dm) { @@ -141,10 +141,10 @@ void dm_mgr::release_resources() } else { dm_logdbg("ibv_free_dm success"); } - m_p_ibv_dm = NULL; + m_p_ibv_dm = nullptr; } - m_p_ring_stat = NULL; + m_p_ring_stat = nullptr; dm_logdbg("Device memory release completed!"); } diff --git a/src/core/dev/hw_queue_rx.cpp b/src/core/dev/hw_queue_rx.cpp index 0275070f3..6ed6856bf 100644 --- a/src/core/dev/hw_queue_rx.cpp +++ b/src/core/dev/hw_queue_rx.cpp @@ -229,7 +229,7 @@ void hw_queue_rx::release_rx_buffers() // Add short delay (500 usec) to allow for WQE's to be flushed to 
CQ every poll cycle const struct timespec short_sleep = {0, 500000}; // 500 usec - nanosleep(&short_sleep, NULL); + nanosleep(&short_sleep, nullptr); } m_last_posted_rx_wr_id = 0; // Clear the posted WR_ID flag, we just clear the entire RQ hwqrx_logdbg("draining completed with a total of %d wce's on cq_mgr_rx", total_ret); @@ -324,8 +324,8 @@ void hw_queue_rx::post_recv_buffer_rq(mem_buf_desc_t *p_mem_buf_desc) m_last_posted_rx_wr_id = (uintptr_t)p_mem_buf_desc; - m_p_prev_rx_desc_pushed = NULL; - p_mem_buf_desc->p_prev_desc = NULL; + m_p_prev_rx_desc_pushed = nullptr; + p_mem_buf_desc->p_prev_desc = nullptr; m_curr_rx_wr = 0; struct ibv_recv_wr *bad_wr = nullptr; @@ -415,8 +415,8 @@ int hw_queue_rx::xlio_raw_post_recv(struct ibv_recv_wr **bad_wr) bool hw_queue_rx::init_rx_cq_mgr_prepare() { m_rq_wqe_idx_to_wrid = - (uint64_t *)mmap(NULL, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid), PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + (uint64_t *)mmap(nullptr, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid), + PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (m_rq_wqe_idx_to_wrid == MAP_FAILED) { hwqrx_logerr("Failed allocating m_rq_wqe_idx_to_wrid (errno=%d %m)", errno); return false; @@ -466,7 +466,7 @@ void hw_queue_rx::tls_release_tir(xlio_tir *tir) { /* TODO We don't have to lock ring to destroy DEK object (a garbage collector?). */ - assert(tir != nullptr && tir->m_type == xlio_ti::ti_type::TLS_TIR); + assert(tir && tir->m_type == xlio_ti::ti_type::TLS_TIR); tir->m_released = true; tir->assign_callback(NULL, NULL); if (tir->m_ref == 0) { diff --git a/src/core/dev/hw_queue_tx.cpp b/src/core/dev/hw_queue_tx.cpp index 6055e29e7..293a64a06 100644 --- a/src/core/dev/hw_queue_tx.cpp +++ b/src/core/dev/hw_queue_tx.cpp @@ -478,16 +478,16 @@ void hw_queue_tx::init_queue() */ m_mlx5_qp.cap.max_inline_data = OCTOWORD - 4 + 3 * WQEBB; - if (m_sq_wqe_idx_to_prop == NULL) { + if (!m_sq_wqe_idx_to_prop) { m_sq_wqe_idx_to_prop = - (sq_wqe_prop *)mmap(NULL, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop), + (sq_wqe_prop *)mmap(nullptr, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_prop), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (m_sq_wqe_idx_to_prop == MAP_FAILED) { hwqtx_logerr("Failed allocating m_sq_wqe_idx_to_prop (errno=%d %m)", errno); return; } m_sq_wqe_prop_last_signalled = m_tx_num_wr - 1; - m_sq_wqe_prop_last = NULL; + m_sq_wqe_prop_last = nullptr; } hwqtx_logfunc("m_tx_num_wr=%d max_inline_data: %d m_sq_wqe_idx_to_prop=%p", m_tx_num_wr, @@ -607,7 +607,7 @@ inline int hw_queue_tx::fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_ int max_inline_len, int inline_len) { int wqe_inline_size = 0; - while ((data_addr != NULL) && inline_len) { + while ((data_addr) && inline_len) { dbg_dump_wqe((uint32_t *)data_addr, inline_len); memcpy(cur_seg, data_addr, inline_len); wqe_inline_size += inline_len; @@ -785,10 +785,10 @@ inline int hw_queue_tx::fill_wqe_send(xlio_ibv_send_wr *pswr) //! 
Filling wqe for LSO inline int hw_queue_tx::fill_wqe_lso(xlio_ibv_send_wr *pswr) { - struct mlx5_wqe_ctrl_seg *ctrl = NULL; - struct mlx5_wqe_eth_seg *eseg = NULL; - struct mlx5_wqe_data_seg *dpseg = NULL; - uint8_t *cur_seg = NULL; + struct mlx5_wqe_ctrl_seg *ctrl = nullptr; + struct mlx5_wqe_eth_seg *eseg = nullptr; + struct mlx5_wqe_data_seg *dpseg = nullptr; + uint8_t *cur_seg = nullptr; uint8_t *p_hdr = (uint8_t *)pswr->tso.hdr; int inl_hdr_size = pswr->tso.hdr_sz; int inl_hdr_copy_size = 0; @@ -876,7 +876,7 @@ void hw_queue_tx::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, .next = m_sq_wqe_prop_last, }; m_sq_wqe_prop_last = &m_sq_wqe_idx_to_prop[m_sq_wqe_hot_index]; - if (ti != NULL) { + if (ti) { ti->get(); } } @@ -886,8 +886,8 @@ void hw_queue_tx::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, void hw_queue_tx::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, xlio_tis *tis, unsigned credits) { - struct xlio_mlx5_wqe_ctrl_seg *ctrl = NULL; - struct mlx5_wqe_eth_seg *eseg = NULL; + struct xlio_mlx5_wqe_ctrl_seg *ctrl = nullptr; + struct mlx5_wqe_eth_seg *eseg = nullptr; uint32_t tisn = tis ? tis->get_tisn() : 0; ctrl = (struct xlio_mlx5_wqe_ctrl_seg *)m_sq_wqe_hot; @@ -930,7 +930,7 @@ std::unique_ptr hw_queue_tx::create_tis(uint32_t flags) { dpcp::adapter *adapter = m_p_ib_ctx_handler->get_dpcp_adapter(); bool is_tls = flags & dpcp::TIS_ATTR_TLS, is_nvme = flags & dpcp::TIS_ATTR_NVMEOTCP; - if (unlikely(adapter == nullptr || (is_tls && is_nvme))) { + if (unlikely(!adapter || (is_tls && is_nvme))) { return nullptr; } @@ -1126,7 +1126,7 @@ std::unique_ptr hw_queue_tx::get_tls_dek(const void *key, uint32_ void hw_queue_tx::put_tls_dek(std::unique_ptr &&tls_dek_obj) { - if (tls_dek_obj == nullptr) { + if (!tls_dek_obj) { return; } // We don't allow unlimited DEK cache to avoid system DEK starvation. 
@@ -1141,7 +1141,7 @@ xlio_tis *hw_queue_tx::tls_context_setup_tx(const xlio_tls_info *info) std::unique_ptr tis; if (m_tls_tis_cache.empty()) { tis = create_tis(DPCP_TIS_FLAGS | dpcp::TIS_ATTR_TLS); - if (unlikely(tis == nullptr)) { + if (unlikely(!tis)) { return nullptr; } } else { @@ -1435,7 +1435,7 @@ void hw_queue_tx::tls_tx_post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, void hw_queue_tx::tls_release_tis(xlio_tis *tis) { - assert(tis != nullptr && tis->m_type == xlio_ti::ti_type::TLS_TIS); + assert(tis && tis->m_type == xlio_ti::ti_type::TLS_TIS); tis->m_released = true; if (tis->m_ref == 0) { put_tls_tis_in_cache(tis); @@ -1445,7 +1445,7 @@ void hw_queue_tx::tls_release_tis(xlio_tis *tis) void hw_queue_tx::put_tls_tis_in_cache(xlio_tis *tis) { std::unique_ptr dek = tis->release_dek(); - assert(dynamic_cast(dek.get()) != nullptr); + assert(dynamic_cast(dek.get())); put_tls_dek(std::unique_ptr(dynamic_cast(dek.release()))); m_tls_tis_cache.push_back(tis); @@ -1484,7 +1484,7 @@ void hw_queue_tx::post_nop_fence(void) cseg->qpn_ds = htobe32((m_mlx5_qp.qpn << MLX5_WQE_CTRL_QPN_SHIFT) | 0x01); cseg->fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL; - store_current_wqe_prop(nullptr, SQ_CREDITS_NOP, NULL); + store_current_wqe_prop(nullptr, SQ_CREDITS_NOP, nullptr); ring_doorbell(MLX5_DB_METHOD_DB, 1); @@ -1558,10 +1558,10 @@ void hw_queue_tx::trigger_completion_for_all_sent_packets() memset(&send_wr, 0, sizeof(send_wr)); send_wr.wr_id = (uintptr_t)p_mem_buf_desc; - send_wr.wr.ud.ah = NULL; + send_wr.wr.ud.ah = nullptr; send_wr.sg_list = sge; send_wr.num_sge = 1; - send_wr.next = NULL; + send_wr.next = nullptr; xlio_send_wr_opcode(send_wr) = XLIO_IBV_WR_SEND; unsigned credits = credits_calculate(&send_wr); diff --git a/src/core/dev/ib_ctx_handler.cpp b/src/core/dev/ib_ctx_handler.cpp index 28509d1f9..cced43b2c 100644 --- a/src/core/dev/ib_ctx_handler.cpp +++ b/src/core/dev/ib_ctx_handler.cpp @@ -60,15 +60,15 @@ ib_ctx_handler::ib_ctx_handler(struct ib_ctx_handler_desc *desc) , m_on_device_memory(0) , m_removed(false) , m_lock_umr("spin_lock_umr") - , m_p_ctx_time_converter(NULL) + , m_p_ctx_time_converter(nullptr) { - if (NULL == desc) { + if (!desc) { ibch_logpanic("Invalid ib_ctx_handler"); } m_p_ibv_device = desc->device; - if (m_p_ibv_device == NULL) { + if (!m_p_ibv_device) { ibch_logpanic("m_p_ibv_device is invalid"); } @@ -80,7 +80,7 @@ ib_ctx_handler::ib_ctx_handler(struct ib_ctx_handler_desc *desc) VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_pd, sizeof(struct ibv_pd)); m_p_ibv_device_attr = new xlio_ibv_device_attr_ex(); - if (m_p_ibv_device_attr == NULL) { + if (!m_p_ibv_device_attr) { ibch_logpanic("ibv device %p attr allocation failure (ibv context %p) (errno=%d %m)", m_p_ibv_device, m_p_ibv_context, errno); } @@ -119,7 +119,7 @@ ib_ctx_handler::ib_ctx_handler(struct ib_ctx_handler_desc *desc) if (m_p_adapter) { delete m_p_adapter; - m_p_ibv_context = NULL; + m_p_ibv_context = nullptr; } } @@ -144,7 +144,7 @@ ib_ctx_handler::~ib_ctx_handler() } ENDIF_VERBS_FAILURE; VALGRIND_MAKE_MEM_UNDEFINED(m_p_ibv_pd, sizeof(struct ibv_pd)); - m_p_ibv_pd = NULL; + m_p_ibv_pd = nullptr; } if (m_p_ctx_time_converter) { @@ -154,7 +154,7 @@ ib_ctx_handler::~ib_ctx_handler() if (m_p_adapter) { delete m_p_adapter; - m_p_ibv_context = NULL; + m_p_ibv_context = nullptr; } BULLSEYE_EXCLUDE_BLOCK_END @@ -222,15 +222,15 @@ int parse_dpcp_version(const char *dpcp_ver) dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() { dpcp::status status = dpcp::DPCP_ERR_NO_SUPPORT; - dpcp::provider *p_provider = NULL; - 
dpcp::adapter_info *dpcp_lst = NULL; + dpcp::provider *p_provider = nullptr; + dpcp::adapter_info *dpcp_lst = nullptr; size_t adapters_num = 0; size_t i = 0; int dpcp_ver = 0; - m_p_adapter = NULL; + m_p_adapter = nullptr; if (!m_p_ibv_device) { - return NULL; + return nullptr; } status = dpcp::provider::get_instance(p_provider); @@ -251,7 +251,7 @@ dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() * 0 arguments along with DPCP_ERR_OUT_OF_RANGE error. On success, the * number of actual adapters is not set, so we need a separate call here. */ - status = p_provider->get_adapter_info_lst(NULL, adapters_num); + status = p_provider->get_adapter_info_lst(nullptr, adapters_num); if (dpcp::DPCP_ERR_OUT_OF_RANGE != status || 0 == adapters_num) { ibch_logdbg("found no adapters status = %d", status); goto err; @@ -271,13 +271,13 @@ dpcp::adapter *ib_ctx_handler::set_dpcp_adapter() for (i = 0; i < adapters_num; i++) { if (dpcp_lst[i].name == m_p_ibv_device->name) { - dpcp::adapter *adapter = NULL; + dpcp::adapter *adapter = nullptr; status = p_provider->open_adapter(dpcp_lst[i].name, adapter); if ((dpcp::DPCP_OK == status) && (adapter)) { int ret = 0; - struct ibv_context *ctx = NULL; - struct ibv_pd *pd = NULL; + struct ibv_context *ctx = nullptr; + struct ibv_pd *pd = nullptr; mlx5dv_obj mlx5_obj; ctx = (ibv_context *)adapter->get_ibv_context(); @@ -348,7 +348,7 @@ void ib_ctx_handler::check_capabilities() void ib_ctx_handler::set_ctx_time_converter_status(ts_conversion_mode_t conversion_mode) { - if (m_p_ctx_time_converter != NULL) { + if (m_p_ctx_time_converter) { /* * Don't override time_converter object. Current method may be * called more than once if multiple slaves point to the same @@ -416,12 +416,12 @@ void ib_ctx_handler::set_ctx_time_converter_status(ts_conversion_mode_t conversi uint32_t ib_ctx_handler::mem_reg(void *addr, size_t length, uint64_t access) { - struct ibv_mr *mr = NULL; + struct ibv_mr *mr = nullptr; uint32_t lkey = LKEY_ERROR; mr = ibv_reg_mr(m_p_ibv_pd, addr, length, access); VALGRIND_MAKE_MEM_DEFINED(mr, sizeof(ibv_mr)); - if (NULL == mr) { + if (!mr) { print_warning_rlimit_memlock(length, errno); } else { m_mr_map_lkey[mr->lkey] = mr; @@ -460,7 +460,7 @@ struct ibv_mr *ib_ctx_handler::get_mem_reg(uint32_t lkey) return iter->second; } - return NULL; + return nullptr; } uint32_t ib_ctx_handler::user_mem_reg(void *addr, size_t length, uint64_t access) @@ -547,6 +547,6 @@ void ib_ctx_handler::handle_event_device_fatal() g_p_event_handler_manager->unregister_ibverbs_event(m_p_ibv_context->async_fd, this); if (m_p_ctx_time_converter) { m_p_ctx_time_converter->clean_obj(); - m_p_ctx_time_converter = NULL; + m_p_ctx_time_converter = nullptr; } } diff --git a/src/core/dev/ib_ctx_handler_collection.cpp b/src/core/dev/ib_ctx_handler_collection.cpp index 7d11a4561..467483e59 100644 --- a/src/core/dev/ib_ctx_handler_collection.cpp +++ b/src/core/dev/ib_ctx_handler_collection.cpp @@ -50,7 +50,7 @@ #define ibchc_logfunc __log_info_func #define ibchc_logfuncall __log_info_funcall -ib_ctx_handler_collection *g_p_ib_ctx_handler_collection = NULL; +ib_ctx_handler_collection *g_p_ib_ctx_handler_collection = nullptr; void check_flow_steering_log_num_mgm_entry_size() { @@ -66,7 +66,8 @@ void check_flow_steering_log_num_mgm_entry_size() vlog_printf( VLOG_DEBUG, "Flow steering option for mlx4 driver does not exist in current OFED version\n"); - } else if (flow_steering_val[0] != '-' || (strtol(&flow_steering_val[1], NULL, 0) % 2) == 0) { + } else if (flow_steering_val[0] != '-' || + 
(strtol(&flow_steering_val[1], nullptr, 0) % 2) == 0) { char module_info[3] = {0}; if (!run_and_retreive_system_command("modinfo mlx4_core > /dev/null 2>&1 ; echo $?", module_info, sizeof(module_info)) && @@ -146,8 +147,8 @@ ib_ctx_handler_collection::~ib_ctx_handler_collection() void ib_ctx_handler_collection::update_tbl(const char *ifa_name) { - struct ibv_device **dev_list = NULL; - ib_ctx_handler *p_ib_ctx_handler = NULL; + struct ibv_device **dev_list = nullptr; + ib_ctx_handler *p_ib_ctx_handler = nullptr; int num_devices = 0; int i; @@ -216,7 +217,7 @@ ib_ctx_handler *ib_ctx_handler_collection::get_ib_ctx(const char *ifa_name) if (check_netvsc_device_exist(ifa_name)) { if (!get_netvsc_slave(ifa_name, active_slave, slave_flags)) { - return NULL; + return nullptr; } ifa_name = (const char *)active_slave; } else if (check_bond_device_exist(ifa_name)) { @@ -228,11 +229,11 @@ ib_ctx_handler *ib_ctx_handler_collection::get_ib_ctx(const char *ifa_name) /* active/active: return the first slave */ if (!get_bond_slaves_name_list(ifa_name, slaves, sizeof(slaves))) { - return NULL; + return nullptr; } slave_name = strtok_r(slaves, " ", &save_ptr); - if (NULL == slave_name) { - return NULL; + if (!slave_name) { + return nullptr; } save_ptr = strchr(slave_name, '\n'); if (save_ptr) { @@ -248,7 +249,7 @@ ib_ctx_handler *ib_ctx_handler_collection::get_ib_ctx(const char *ifa_name) } } - return NULL; + return nullptr; } void ib_ctx_handler_collection::del_ib_ctx(ib_ctx_handler *ib_ctx) diff --git a/src/core/dev/ib_ctx_handler_collection.h b/src/core/dev/ib_ctx_handler_collection.h index 9958c18eb..c60415b23 100644 --- a/src/core/dev/ib_ctx_handler_collection.h +++ b/src/core/dev/ib_ctx_handler_collection.h @@ -45,7 +45,7 @@ class ib_ctx_handler_collection { ib_ctx_handler_collection(); ~ib_ctx_handler_collection(); - void update_tbl(const char *ifa_name = NULL); + void update_tbl(const char *ifa_name = nullptr); void print_val_tbl(); inline ib_context_map_t *get_ib_cxt_list() diff --git a/src/core/dev/net_device_entry.cpp b/src/core/dev/net_device_entry.cpp index 49724a1d6..0167707a6 100644 --- a/src/core/dev/net_device_entry.cpp +++ b/src/core/dev/net_device_entry.cpp @@ -51,7 +51,7 @@ net_device_entry::net_device_entry(int if_index, net_device_val *ndv) m_val = ndv; m_is_valid = false; m_cma_id_bind_trial_count = 0; - m_timer_handle = NULL; + m_timer_handle = nullptr; timer_count = -1; m_bond = net_device_val::NO_BOND; @@ -78,7 +78,7 @@ net_device_entry::~net_device_entry() { if (m_timer_handle) { g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; + m_timer_handle = nullptr; } net_device_val *p_ndv = dynamic_cast(m_val); if (p_ndv && p_ndv->get_is_bond() == net_device_val::LAG_8023ad) { diff --git a/src/core/dev/net_device_table_mgr.cpp b/src/core/dev/net_device_table_mgr.cpp index 97d3aa8f9..ba0debaae 100644 --- a/src/core/dev/net_device_table_mgr.cpp +++ b/src/core/dev/net_device_table_mgr.cpp @@ -61,7 +61,7 @@ #define ndtm_logfunc __log_info_func #define ndtm_logfuncall __log_info_funcall -net_device_table_mgr *g_p_net_device_table_mgr = NULL; +net_device_table_mgr *g_p_net_device_table_mgr = nullptr; enum net_device_table_mgr_timers { RING_PROGRESS_ENGINE_TIMER, RING_ADAPT_CQ_MODERATION_TIMER }; @@ -103,7 +103,7 @@ net_device_table_mgr::net_device_table_mgr() /* throw exception if there are no supported devices. 
*/ if (m_net_device_map_index.empty()) { int num_devices = 0; - struct ibv_device **dev_list = NULL; + struct ibv_device **dev_list = nullptr; dev_list = xlio_ibv_get_device_list(&num_devices); if (dev_list && num_devices == 0) { ibv_free_device_list(dev_list); @@ -323,18 +323,18 @@ net_device_val *net_device_table_mgr::get_net_device_val(const ip_addr &if_addr) ndtm_logdbg("Found %s for addr: %s", net_dev->to_str().c_str(), if_addr.to_str().c_str()); if (net_dev->get_state() == net_device_val::INVALID) { ndtm_logdbg("invalid net_device %s", net_dev->to_str().c_str()); - return NULL; + return nullptr; } return iter->second; } ndtm_logdbg("Can't find net_device for addr: %s", if_addr.to_str().c_str()); - return NULL; + return nullptr; } net_device_val *net_device_table_mgr::get_net_device_val(int if_index) { net_device_map_index_t::iterator iter; - net_device_val *net_dev = NULL; + net_device_val *net_dev = nullptr; std::lock_guard lock(m_lock); @@ -374,14 +374,14 @@ net_device_val *net_device_table_mgr::get_net_device_val(int if_index) } ndtm_logdbg("Can't find net_device for index: %d", if_index); - return NULL; + return nullptr; out: ndtm_logdbg("Found %s for index: %d", net_dev->to_str().c_str(), if_index); if (net_dev->get_state() == net_device_val::INVALID) { ndtm_logdbg("invalid net_device %s", net_dev->to_str().c_str()); - return NULL; + return nullptr; } return net_dev; } @@ -396,7 +396,7 @@ net_device_entry *net_device_table_mgr::create_new_entry(int if_index, const obs if (p_ndv) { return new net_device_entry(if_index, p_ndv); } - return NULL; + return nullptr; } void net_device_table_mgr::get_ip_list(local_ip_list_t &ip_list, sa_family_t family, int if_index) @@ -514,7 +514,7 @@ int net_device_table_mgr::global_ring_wait_for_notification_and_process_element( ndtm_logdbg("removing wakeup fd from epfd"); BULLSEYE_EXCLUDE_BLOCK_START if ((SYSCALL(epoll_ctl, m_global_ring_epfd, EPOLL_CTL_DEL, - m_global_ring_pipe_fds[0], NULL)) && + m_global_ring_pipe_fds[0], nullptr)) && (!(errno == ENOENT || errno == EBADF))) { ndtm_logerr("failed to del pipe channel fd from internal epfd (errno=%d %m)", errno); @@ -583,10 +583,10 @@ void net_device_table_mgr::handle_timer_expired(void *user_data) void net_device_table_mgr::global_ring_wakeup() { ndtm_logdbg(""); - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; ev.events = EPOLLIN; - ev.data.ptr = NULL; + ev.data.ptr = nullptr; int errno_tmp = errno; // don't let wakeup affect errno, as this can fail with EEXIST BULLSEYE_EXCLUDE_BLOCK_START if ((SYSCALL(epoll_ctl, m_global_ring_epfd, EPOLL_CTL_ADD, m_global_ring_pipe_fds[0], &ev)) && @@ -617,7 +617,7 @@ void net_device_table_mgr::del_link_event(const netlink_link_info *info) * resources correctly. */ if (info->flags & IFF_SLAVE) { - net_device_val *net_dev = NULL; + net_device_val *net_dev = nullptr; int if_index = info->ifindex; ndtm_logdbg("netlink event: if_index: %d state: %s", info->ifindex, @@ -642,7 +642,7 @@ void net_device_table_mgr::new_link_event(const netlink_link_info *info) * DOWN state (see RTM_DELLINK). 
*/ if (info->flags & IFF_SLAVE) { - net_device_val *net_dev = NULL; + net_device_val *net_dev = nullptr; int if_index = info->ifindex; ndtm_logdbg("netlink event: if_index: %d state: %s", info->ifindex, diff --git a/src/core/dev/net_device_table_mgr.h b/src/core/dev/net_device_table_mgr.h index 53d6fe9ad..9c5614f38 100644 --- a/src/core/dev/net_device_table_mgr.h +++ b/src/core/dev/net_device_table_mgr.h @@ -75,7 +75,7 @@ class net_device_table_mgr : public cache_table_mgr, publ * notification is armed) Returns >=0 the total number of wce processed < 0 on error */ int global_ring_poll_and_process_element(uint64_t *p_poll_sn_rx, uint64_t *p_poll_sn_tx, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); /** * This will poll one time on the ALL the managed CQ's @@ -84,7 +84,7 @@ class net_device_table_mgr : public cache_table_mgr, publ * < 0 error */ int global_ring_wait_for_notification_and_process_element(uint64_t *p_poll_sn, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); int global_ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx); diff --git a/src/core/dev/net_device_val.cpp b/src/core/dev/net_device_val.cpp index d41286bc8..d59b48a2f 100644 --- a/src/core/dev/net_device_val.cpp +++ b/src/core/dev/net_device_val.cpp @@ -158,8 +158,8 @@ net_device_val::net_device_val(struct net_device_val_desc *desc) { bool valid = false; ib_ctx_handler *ib_ctx; - struct nlmsghdr *nl_msg = NULL; - struct ifinfomsg *nl_msgdata = NULL; + struct nlmsghdr *nl_msg = nullptr; + struct ifinfomsg *nl_msgdata = nullptr; int nl_attrlen; struct rtattr *nl_attr; @@ -169,15 +169,15 @@ net_device_val::net_device_val(struct net_device_val_desc *desc) m_flags = 0; m_mtu = 0; m_state = INVALID; - m_p_L2_addr = NULL; - m_p_br_addr = NULL; + m_p_L2_addr = nullptr; + m_p_br_addr = nullptr; m_bond = NO_BOND; m_if_active = 0; m_bond_xmit_hash_policy = XHP_LAYER_2; m_bond_fail_over_mac = 0; m_transport_type = XLIO_TRANSPORT_UNKNOWN; - if (NULL == desc) { + if (!desc) { nd_logerr("Invalid net_device_val name=%s", "NA"); m_state = INVALID; return; @@ -316,12 +316,12 @@ net_device_val::~net_device_val() } if (m_p_br_addr) { delete m_p_br_addr; - m_p_br_addr = NULL; + m_p_br_addr = nullptr; } if (m_p_L2_addr) { delete m_p_L2_addr; - m_p_L2_addr = NULL; + m_p_L2_addr = nullptr; } slave_data_vector_t::iterator slave = m_slaves.begin(); @@ -578,7 +578,7 @@ void net_device_val::set_slave_array() nd_logdbg(""); if (m_bond == NETVSC) { - slave_data_t *s = NULL; + slave_data_t *s = nullptr; unsigned int slave_flags = 0; if (get_netvsc_slave(get_ifname_link(), active_slave, slave_flags)) { if ((slave_flags & IFF_UP) && verify_qp_creation(active_slave, IBV_QPT_RAW_PACKET)) { @@ -604,7 +604,7 @@ void net_device_val::set_slave_array() slave_data_t *s = new slave_data_t(if_nametoindex(slave)); m_slaves.push_back(s); - slave = strtok(NULL, " "); + slave = strtok(nullptr, " "); } } @@ -692,7 +692,7 @@ const slave_data_t *net_device_val::get_slave(int if_index) return cur_slave; } } - return NULL; + return nullptr; } void net_device_val::verify_bonding_mode() @@ -710,7 +710,7 @@ void net_device_val::verify_bonding_mode() sprintf(bond_failover_mac_param_file, BONDING_FAILOVER_MAC_PARAM_FILE, get_ifname_link()); if (priv_safe_read_file(bond_mode_param_file, bond_mode_file_content, FILENAME_MAX) > 0) { - char *bond_mode = NULL; + char *bond_mode = nullptr; bond_mode = strtok(bond_mode_file_content, " "); if (bond_mode) { if (!strcmp(bond_mode, "active-backup")) { @@ -736,16 +736,16 @@ 
void net_device_val::verify_bonding_mode() get_ifname_link()); if (priv_safe_try_read_file(bond_xmit_hash_policy_param_file, bond_xmit_hash_policy_file_content, FILENAME_MAX) > 0) { - char *bond_xhp = NULL; - char *saveptr = NULL; + char *bond_xhp = nullptr; + char *saveptr = nullptr; bond_xhp = strtok_r(bond_xmit_hash_policy_file_content, " ", &saveptr); - if (NULL == bond_xhp) { + if (!bond_xhp) { nd_logdbg("could not parse bond xmit hash policy, staying with default (L2)\n"); } else { - bond_xhp = strtok_r(NULL, " ", &saveptr); + bond_xhp = strtok_r(nullptr, " ", &saveptr); if (bond_xhp) { - m_bond_xmit_hash_policy = (bond_xmit_hash_policy)strtol(bond_xhp, NULL, 10); + m_bond_xmit_hash_policy = (bond_xmit_hash_policy)strtol(bond_xhp, nullptr, 10); if (m_bond_xmit_hash_policy < XHP_LAYER_2 || m_bond_xmit_hash_policy > XHP_ENCAP_3_4) { vlog_printf(VLOG_WARNING, @@ -933,9 +933,9 @@ bool net_device_val::update_active_slaves() void net_device_val::update_netvsc_slaves(int if_index, int if_flags) { - slave_data_t *s = NULL; + slave_data_t *s = nullptr; bool found = false; - ib_ctx_handler *ib_ctx = NULL, *up_ib_ctx = NULL; + ib_ctx_handler *ib_ctx = nullptr, *up_ib_ctx = nullptr; char if_name[IFNAMSIZ] = {0}; m_lock.lock(); @@ -1001,7 +1001,7 @@ ring *net_device_val::reserve_ring(resource_allocation_key *key) nd_logfunc(""); std::lock_guard lock(m_lock); key = ring_key_redirection_reserve(key); - ring *the_ring = NULL; + ring *the_ring = nullptr; rings_hash_map_t::iterator ring_iter = m_h_ring_map.find(key); if (m_h_ring_map.end() == ring_iter) { @@ -1010,11 +1010,11 @@ ring *net_device_val::reserve_ring(resource_allocation_key *key) resource_allocation_key *new_key = new resource_allocation_key(*key); the_ring = create_ring(new_key); if (!the_ring) { - return NULL; + return nullptr; } m_h_ring_map[new_key] = std::make_pair(the_ring, 0); // each ring is born with ref_count = 0 ring_iter = m_h_ring_map.find(new_key); - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; size_t num_ring_rx_fds; int *ring_rx_fds_array = the_ring->get_rx_channel_fds(num_ring_rx_fds); ev.events = EPOLLIN; @@ -1055,7 +1055,7 @@ int net_device_val::release_ring(resource_allocation_key *key) std::lock_guard lock(m_lock); red_key = get_ring_key_redirection(key); - ring *the_ring = NULL; + ring *the_ring = nullptr; rings_hash_map_t::iterator ring_iter = m_h_ring_map.find(red_key); if (m_h_ring_map.end() != ring_iter) { @@ -1076,7 +1076,7 @@ int net_device_val::release_ring(resource_allocation_key *key) int cq_ch_fd = ring_rx_fds_array[i]; BULLSEYE_EXCLUDE_BLOCK_START if (unlikely((SYSCALL(epoll_ctl, g_p_net_device_table_mgr->global_ring_epfd_get(), - EPOLL_CTL_DEL, cq_ch_fd, NULL)) && + EPOLL_CTL_DEL, cq_ch_fd, nullptr)) && (!(errno == ENOENT || errno == EBADF)))) { nd_logerr("Failed to delete RING notification fd to global_table_mgr_epfd " "(errno=%d %s)", @@ -1320,7 +1320,7 @@ void net_device_val_eth::configure() m_p_L2_addr = create_L2_address(get_ifname()); BULLSEYE_EXCLUDE_BLOCK_START - if (m_p_L2_addr == NULL) { + if (!m_p_L2_addr) { nd_logpanic("m_p_L2_addr allocation error"); } BULLSEYE_EXCLUDE_BLOCK_END @@ -1364,7 +1364,7 @@ uint32_t net_device_val::get_priority_by_tc_class(uint32_t tc_class) void net_device_val_eth::parse_prio_egress_map() { int len, ret; - nl_cache *cache = NULL; + nl_cache *cache = nullptr; rtnl_link *link; vlan_map *map; @@ -1408,7 +1408,7 @@ void net_device_val_eth::parse_prio_egress_map() ring *net_device_val_eth::create_ring(resource_allocation_key *key) { - ring *ring = NULL; 
+ ring *ring = nullptr; try { switch (m_bond) { @@ -1438,7 +1438,7 @@ L2_address *net_device_val_eth::create_L2_address(const char *ifname) { if (m_p_L2_addr) { delete m_p_L2_addr; - m_p_L2_addr = NULL; + m_p_L2_addr = nullptr; } unsigned char hw_addr[ETH_ALEN]; get_local_ll_addr(ifname, hw_addr, ETH_ALEN, false); @@ -1449,14 +1449,14 @@ void net_device_val_eth::create_br_address(const char *ifname) { if (m_p_br_addr) { delete m_p_br_addr; - m_p_br_addr = NULL; + m_p_br_addr = nullptr; } uint8_t hw_addr[ETH_ALEN]; get_local_ll_addr(ifname, hw_addr, ETH_ALEN, true); m_p_br_addr = new ETH_addr(hw_addr); BULLSEYE_EXCLUDE_BLOCK_START - if (m_p_br_addr == NULL) { + if (!m_p_br_addr) { nd_logpanic("m_p_br_addr allocation error"); } BULLSEYE_EXCLUDE_BLOCK_END @@ -1488,7 +1488,7 @@ bool net_device_val::verify_bond_or_eth_qp_creation() char *slave_name; char *save_ptr; slave_name = strtok_r(slaves, " ", &save_ptr); - while (slave_name != NULL) { + while (slave_name) { char *p = strchr(slave_name, '\n'); if (p) { *p = '\0'; // Remove the tailing 'new line" char @@ -1497,7 +1497,7 @@ bool net_device_val::verify_bond_or_eth_qp_creation() // check all slaves but print only once for bond bond_ok = false; } - slave_name = strtok_r(NULL, " ", &save_ptr); + slave_name = strtok_r(nullptr, " ", &save_ptr); } if (!bond_ok) { vlog_printf(VLOG_WARNING, @@ -1555,9 +1555,9 @@ bool net_device_val::verify_qp_creation(const char *ifname, enum ibv_qp_type qp_ { bool success = false; char bond_roce_lag_path[256] = {0}; - struct ibv_cq *cq = NULL; - struct ibv_comp_channel *channel = NULL; - struct ibv_qp *qp = NULL; + struct ibv_cq *cq = nullptr; + struct ibv_comp_channel *channel = nullptr; + struct ibv_qp *qp = nullptr; struct ibv_context *context; int comp_vector = 0; diff --git a/src/core/dev/rfs.cpp b/src/core/dev/rfs.cpp index e791664d2..80059ae05 100644 --- a/src/core/dev/rfs.cpp +++ b/src/core/dev/rfs.cpp @@ -147,7 +147,7 @@ rfs::rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_fil #endif BULLSEYE_EXCLUDE_BLOCK_START - if (m_sinks_list == NULL) { + if (!m_sinks_list) { rfs_logpanic("sinks list allocation failed!"); } BULLSEYE_EXCLUDE_BLOCK_END @@ -176,7 +176,7 @@ rfs::~rfs() if (m_p_rule_filter) { delete m_p_rule_filter; - m_p_rule_filter = NULL; + m_p_rule_filter = nullptr; } delete[] m_sinks_list; } @@ -208,7 +208,7 @@ bool rfs::add_sink(pkt_rcvr_sink *p_sink) pkt_rcvr_sink **tmp_sinks_list = new pkt_rcvr_sink *[tmp_sinks_list_length]; BULLSEYE_EXCLUDE_BLOCK_START - if (tmp_sinks_list == NULL) { + if (!tmp_sinks_list) { rfs_logerr("sinks list allocation failed!"); return false; } @@ -242,7 +242,7 @@ bool rfs::del_sink(pkt_rcvr_sink *p_sink) for (/*continue i*/; i < (m_n_sinks_list_entries - 1); ++i) { m_sinks_list[i] = m_sinks_list[i + 1]; } - m_sinks_list[i] = NULL; + m_sinks_list[i] = nullptr; m_n_sinks_list_entries--; rfs_logdbg("Removed sink (%p), num of sinks is now: %d", p_sink, diff --git a/src/core/dev/rfs.h b/src/core/dev/rfs.h index 1eddd922c..c9f1834e7 100644 --- a/src/core/dev/rfs.h +++ b/src/core/dev/rfs.h @@ -79,7 +79,7 @@ class rfs_rule_filter { class rfs { public: - rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = NULL, + rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = nullptr, uint32_t flow_tag_id = 0); virtual ~rfs(); diff --git a/src/core/dev/rfs_mc.h b/src/core/dev/rfs_mc.h index 6a51cff4b..a708467cd 100644 --- a/src/core/dev/rfs_mc.h +++ b/src/core/dev/rfs_mc.h @@ -45,7 +45,7 @@ class rfs_mc : public 
rfs { public: - rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = NULL, + rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = nullptr, int32_t flow_tag_id = 0); virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, diff --git a/src/core/dev/rfs_uc.h b/src/core/dev/rfs_uc.h index b59835619..b6d3ff529 100644 --- a/src/core/dev/rfs_uc.h +++ b/src/core/dev/rfs_uc.h @@ -45,7 +45,7 @@ class rfs_uc : public rfs { public: - rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = NULL, + rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_filter = nullptr, uint32_t flow_tag_id = 0); virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, diff --git a/src/core/dev/rfs_uc_tcp_gro.cpp b/src/core/dev/rfs_uc_tcp_gro.cpp index bfba8f94e..cdf824480 100644 --- a/src/core/dev/rfs_uc_tcp_gro.cpp +++ b/src/core/dev/rfs_uc_tcp_gro.cpp @@ -190,11 +190,11 @@ bool rfs_uc_tcp_gro::add_packet(mem_buf_desc_t *mem_buf_desc, void *payload_ptr, mem_buf_desc->lwip_pbuf.pbuf.len = mem_buf_desc->lwip_pbuf.pbuf.tot_len = mem_buf_desc->rx.sz_payload; mem_buf_desc->lwip_pbuf.pbuf.ref = 1; - mem_buf_desc->lwip_pbuf.pbuf.next = NULL; + mem_buf_desc->lwip_pbuf.pbuf.next = nullptr; mem_buf_desc->lwip_pbuf.pbuf.payload = payload_ptr; m_gro_desc.p_last->lwip_pbuf.pbuf.next = &(mem_buf_desc->lwip_pbuf.pbuf); - m_gro_desc.p_last->p_next_desc = NULL; + m_gro_desc.p_last->p_next_desc = nullptr; mem_buf_desc->p_prev_desc = m_gro_desc.p_last; m_gro_desc.p_last = mem_buf_desc; diff --git a/src/core/dev/rfs_uc_tcp_gro.h b/src/core/dev/rfs_uc_tcp_gro.h index 4df456d31..7150b8c9a 100644 --- a/src/core/dev/rfs_uc_tcp_gro.h +++ b/src/core/dev/rfs_uc_tcp_gro.h @@ -68,7 +68,7 @@ class gro_mgr; class rfs_uc_tcp_gro : public rfs_uc { public: rfs_uc_tcp_gro(flow_tuple *flow_spec_5t, ring_slave *p_ring, - rfs_rule_filter *rule_filter = NULL, uint32_t flow_tag_id = 0); + rfs_rule_filter *rule_filter = nullptr, uint32_t flow_tag_id = 0); virtual bool rx_dispatch_packet(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array); diff --git a/src/core/dev/ring.cpp b/src/core/dev/ring.cpp index 9f8870990..706bd3e86 100644 --- a/src/core/dev/ring.cpp +++ b/src/core/dev/ring.cpp @@ -40,8 +40,8 @@ #define MODULE_HDR MODULE_NAME "%d:%s() " ring::ring() - : m_p_n_rx_channel_fds(NULL) - , m_parent(NULL) + : m_p_n_rx_channel_fds(nullptr) + , m_parent(nullptr) , m_tcp_seg_list(nullptr) , m_tcp_seg_count(0U) { @@ -117,5 +117,5 @@ void ring::put_tcp_segs(tcp_seg *seg) void ring::print_val() { ring_logdbg("%d: %p: parent %p", m_if_index, this, - ((uintptr_t)this == (uintptr_t)m_parent ? 0 : m_parent)); + ((uintptr_t)this == (uintptr_t)m_parent ? 
nullptr : m_parent)); } diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index 8251ef306..b462b9194 100644 --- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -69,7 +69,7 @@ struct ring_ec { { INIT_LIST_HEAD(&list); memset(&completion, 0, sizeof(completion)); - last_buff_lst = NULL; + last_buff_lst = nullptr; } }; @@ -115,9 +115,9 @@ class ring { virtual bool reclaim_recv_buffers_no_lock(mem_buf_desc_t *) { return false; }; virtual int drain_and_proccess() = 0; virtual int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL) = 0; + void *pv_fd_ready_array = nullptr) = 0; virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL) = 0; + void *pv_fd_ready_array = nullptr) = 0; virtual int poll_and_process_element_tx(uint64_t *p_cq_poll_sn) = 0; virtual void adapt_cq_moderation() = 0; virtual void mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t *p_mem_buf_desc) = 0; diff --git a/src/core/dev/ring_bond.cpp b/src/core/dev/ring_bond.cpp index cb84f931e..fa7623db5 100644 --- a/src/core/dev/ring_bond.cpp +++ b/src/core/dev/ring_bond.cpp @@ -49,7 +49,7 @@ ring_bond::ring_bond(int if_index) , m_lock_ring_rx("ring_bond:lock_rx") , m_lock_ring_tx("ring_bond:lock_tx") { - net_device_val *p_ndev = NULL; + net_device_val *p_ndev = nullptr; /* Configure ring() fields */ set_parent(this); @@ -57,7 +57,7 @@ ring_bond::ring_bond(int if_index) /* Sanity check */ p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - if (NULL == p_ndev) { + if (!p_ndev) { ring_logpanic("Invalid if_index = %d", if_index); } @@ -89,14 +89,14 @@ ring_bond::~ring_bond() if (m_p_n_rx_channel_fds) { delete[] m_p_n_rx_channel_fds; - m_p_n_rx_channel_fds = NULL; + m_p_n_rx_channel_fds = nullptr; } } void ring_bond::print_val() { ring_logdbg("%d: %p: parent %p type %s", m_if_index, this, - ((uintptr_t)this == (uintptr_t)m_parent ? 0 : m_parent), "bond"); + ((uintptr_t)this == (uintptr_t)m_parent ? 
nullptr : m_parent), "bond"); } bool ring_bond::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t) @@ -145,7 +145,7 @@ void ring_bond::restart() { net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - if (NULL == p_ndev) { + if (!p_ndev) { return; } const slave_data_vector_t &slaves = p_ndev->get_slave_array(); @@ -161,7 +161,7 @@ void ring_bond::restart() ring_tap *p_ring_tap = dynamic_cast(p_ring_bond_netvsc->m_tap_ring); if (p_ring_tap) { size_t num_ring_rx_fds = 0; - int *ring_rx_fds_array = NULL; + int *ring_rx_fds_array = nullptr; int epfd = -1; int fd = -1; int rc = 0; @@ -175,7 +175,7 @@ void ring_bond::restart() epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); if (epfd > 0) { fd = ring_rx_fds_array[k]; - rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_DEL, fd, NULL); + rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_DEL, fd, nullptr); ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); } @@ -204,8 +204,8 @@ void ring_bond::restart() p_ring_tap->inc_vf_plugouts(); p_ring_bond_netvsc->slave_destroy( p_ring_bond_netvsc->m_vf_ring->get_if_index()); - p_ring_bond_netvsc->m_vf_ring = NULL; - p_ring_tap->set_vf_ring(NULL); + p_ring_bond_netvsc->m_vf_ring = nullptr; + p_ring_tap->set_vf_ring(nullptr); } else { for (i = 0; i < slaves.size(); i++) { if (slaves[i]->if_index != p_ring_tap->get_if_index()) { @@ -219,7 +219,7 @@ void ring_bond::restart() for (k = 0; k < num_ring_rx_fds; k++) { epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); if (epfd > 0) { - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; fd = ring_rx_fds_array[k]; ev.events = EPOLLIN; ev.data.fd = fd; @@ -367,7 +367,7 @@ void ring_bond::adapt_cq_moderation() mem_buf_desc_t *ring_bond::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, int n_num_mem_bufs /* default = 1 */) { - mem_buf_desc_t *ret = NULL; + mem_buf_desc_t *ret = nullptr; std::lock_guard lock(m_lock_ring_tx); ret = m_xmit_rings[id]->mem_buf_tx_get(id, b_block, type, n_num_mem_bufs); @@ -432,7 +432,7 @@ void ring_bond::send_ring_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe } else { ring_logfunc("active ring=%p, silent packet drop (%p), (HA event?)", m_xmit_rings[id], p_mem_buf_desc); - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; if (likely(p_mem_buf_desc->p_desc_owner == m_bond_rings[id])) { m_bond_rings[id]->mem_buf_tx_release(p_mem_buf_desc, true); } else { @@ -454,7 +454,7 @@ int ring_bond::send_lwip_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, ring_logfunc("active ring=%p, silent packet drop (%p), (HA event?)", m_xmit_rings[id], p_mem_buf_desc); - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; /* no need to free the buffer here, as for lwip buffers we have 2 ref counts, */ /* one for caller, and one for completion. for completion, we ref count in */ /* send_lwip_buffer(). 
Since we are not going in, the caller will free the */ @@ -675,7 +675,7 @@ bool ring_bond::reclaim_recv_buffers(mem_buf_desc_t *) void ring_bond::update_cap(ring_slave *slave) { - if (NULL == slave) { + if (!slave) { m_max_inline_data = (uint32_t)(-1); m_max_send_sge = (uint32_t)(-1); return; @@ -748,7 +748,7 @@ int ring_bond::devide_buffers_helper(mem_buf_desc_t *p_mem_buf_desc_list, } } temp = head->p_next_desc; - head->p_next_desc = NULL; + head->p_next_desc = nullptr; if (i == m_bond_rings.size()) { // handle no owner ring_logdbg("No matching ring %p to return buffer", current->p_desc_owner); @@ -764,7 +764,7 @@ int ring_bond::devide_buffers_helper(mem_buf_desc_t *p_mem_buf_desc_list, void ring_bond::popup_xmit_rings() { - ring_slave *cur_slave = NULL; + ring_slave *cur_slave = nullptr; size_t i, j; m_xmit_rings.clear(); @@ -797,7 +797,7 @@ void ring_bond::popup_recv_rings() net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); m_recv_rings.clear(); - if (NULL == p_ndev) { + if (!p_ndev) { return; } const slave_data_vector_t &slaves = p_ndev->get_slave_array(); @@ -830,7 +830,7 @@ void ring_bond::update_rx_channel_fds() { if (m_p_n_rx_channel_fds) { delete[] m_p_n_rx_channel_fds; - m_p_n_rx_channel_fds = NULL; + m_p_n_rx_channel_fds = nullptr; } if (m_recv_rings.size() == 0) { return; @@ -960,7 +960,7 @@ int ring_bond::socketxtreme_poll(struct xlio_socketxtreme_completion_t *, unsign void ring_bond::slave_destroy(int if_index) { - ring_slave *cur_slave = NULL; + ring_slave *cur_slave = nullptr; ring_slave_vector_t::iterator iter; for (iter = m_bond_rings.begin(); iter != m_bond_rings.end(); iter++) { @@ -981,7 +981,7 @@ void ring_bond_eth::slave_create(int if_index) ring_slave *cur_slave; cur_slave = new ring_eth(if_index, this); - if (cur_slave == NULL) { + if (!cur_slave) { ring_logpanic("Error creating bond ring: memory allocation error"); } @@ -1000,11 +1000,11 @@ void ring_bond_eth::slave_create(int if_index) void ring_bond_netvsc::slave_create(int if_index) { - ring_slave *cur_slave = NULL; - net_device_val *p_ndev = NULL; + ring_slave *cur_slave = nullptr; + net_device_val *p_ndev = nullptr; p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - if (NULL == p_ndev) { + if (!p_ndev) { ring_logpanic("Error creating bond ring"); } diff --git a/src/core/dev/ring_bond.h b/src/core/dev/ring_bond.h index 680317abb..b7455b588 100644 --- a/src/core/dev/ring_bond.h +++ b/src/core/dev/ring_bond.h @@ -59,7 +59,8 @@ class ring_bond : public ring { return m_p_n_rx_channel_fds; }; virtual int request_notification(cq_type_t cq_type, uint64_t poll_sn); - virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL); + virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, + void *pv_fd_ready_array = nullptr); virtual int poll_and_process_element_tx(uint64_t *p_cq_poll_sn); virtual void adapt_cq_moderation(); virtual bool reclaim_recv_buffers(descq_t *rx_reuse); @@ -67,7 +68,7 @@ class ring_bond : public ring { virtual void mem_buf_rx_release(mem_buf_desc_t *p_mem_buf_desc); virtual int drain_and_proccess(); virtual int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); virtual int get_num_resources() const { return m_bond_rings.size(); }; virtual bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t = false); virtual bool detach_flow(flow_tuple 
&flow_spec_5t, pkt_rcvr_sink *sink); @@ -116,7 +117,7 @@ class ring_bond : public ring { } protected: - void update_cap(ring_slave *slave = NULL); + void update_cap(ring_slave *slave = nullptr); void update_rx_channel_fds(); /* Fill m_xmit_rings array */ @@ -196,8 +197,8 @@ class ring_bond_netvsc : public ring_bond { net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - m_vf_ring = NULL; - m_tap_ring = NULL; + m_vf_ring = nullptr; + m_tap_ring = nullptr; if (p_ndev) { const slave_data_vector_t &slaves = p_ndev->get_slave_array(); update_cap(); diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index a086a9ae1..f59bbacdb 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -99,7 +99,7 @@ ring_simple::ring_simple(int if_index, ring *parent, ring_type_t type, bool use_ */ BULLSEYE_EXCLUDE_BLOCK_START m_p_ib_ctx = p_slave->p_ib_ctx; - if (m_p_ib_ctx == NULL) { + if (!m_p_ib_ctx) { ring_logpanic("m_p_ib_ctx = NULL. It can be related to wrong bonding configuration"); } @@ -198,7 +198,7 @@ ring_simple::~ring_simple() } ENDIF_VERBS_FAILURE; VALGRIND_MAKE_MEM_UNDEFINED(m_p_tx_comp_event_channel, sizeof(struct ibv_comp_channel)); - m_p_tx_comp_event_channel = NULL; + m_p_tx_comp_event_channel = nullptr; } /* coverity[double_unlock] TODO: RM#1049980 */ @@ -208,7 +208,7 @@ ring_simple::~ring_simple() ring_logdbg("queue of event completion elements is %s", (list_empty(&m_socketxtreme.ec_list) ? "empty" : "not empty")); while (!list_empty(&m_socketxtreme.ec_list)) { - struct ring_ec *ec = NULL; + struct ring_ec *ec = nullptr; ec = get_ec(); if (ec) { del_ec(ec); @@ -225,7 +225,7 @@ void ring_simple::create_resources() save_l2_address(p_slave->p_L2_addr); m_p_tx_comp_event_channel = ibv_create_comp_channel(m_p_ib_ctx->get_ibv_context()); - if (m_p_tx_comp_event_channel == NULL) { + if (!m_p_tx_comp_event_channel) { VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS( VLOG_ERROR, VLOG_DEBUG, "ibv_create_comp_channel for tx failed. m_p_tx_comp_event_channel = %p (errno=%d %m)", @@ -335,7 +335,7 @@ void ring_simple::create_resources() m_p_ib_ctx->get_ibv_context()); // ODED TODO: Adjust the ibv_context to be the exact one in // case of different devices BULLSEYE_EXCLUDE_BLOCK_START - if (m_p_rx_comp_event_channel == NULL) { + if (!m_p_rx_comp_event_channel) { VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS( VLOG_ERROR, VLOG_DEBUG, "ibv_create_comp_channel for rx failed. p_rx_comp_event_channel = %p (errno=%d %m)", @@ -461,11 +461,11 @@ int ring_simple::socketxtreme_poll(struct xlio_socketxtreme_completion_t *xlio_c // completions than ncompletions, what is not optimal for performance. // Not each packet results in a real completion but this check is good enough. 
if (++pkts >= ncompletions) { - m_gro_mgr.flush_all(NULL); + m_gro_mgr.flush_all(nullptr); pkts = 0U; } } else { - m_gro_mgr.flush_all(NULL); + m_gro_mgr.flush_all(nullptr); do_poll = false; } } else { @@ -485,7 +485,7 @@ int ring_simple::wait_for_notification_and_process_element(int cq_channel_fd, void *pv_fd_ready_array /*NULL*/) { int ret = -1; - if (m_p_cq_mgr_rx != NULL) { + if (m_p_cq_mgr_rx) { RING_TRY_LOCK_RUN_AND_UPDATE_RET(m_lock_ring_rx, m_p_cq_mgr_rx->wait_for_notification_and_process_element( p_cq_poll_sn, pv_fd_ready_array); @@ -574,7 +574,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu { NOT_IN_USE(id); int ret = 0; - mem_buf_desc_t *buff_list = NULL; + mem_buf_desc_t *buff_list = nullptr; uint64_t poll_sn = 0; ring_logfuncall("n_num_mem_bufs=%d", n_num_mem_bufs); @@ -590,7 +590,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu m_p_cq_mgr_tx, ret); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); - return NULL; + return nullptr; } else if (ret > 0) { ring_logfunc("polling succeeded on cq_mgr_tx (%d wce)", ret); buff_list = get_tx_buffers(type, n_num_mem_bufs); @@ -636,7 +636,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu } else if (ret < 0) { ring_logdbg("failed blocking on cq_mgr_tx (errno=%d %m)", errno); m_lock_ring_tx_buf_wait.unlock(); - return NULL; + return nullptr; } /* coverity[double_lock] TODO: RM#1049980 */ m_lock_ring_tx.lock(); @@ -660,7 +660,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_ring_tx.unlock(); m_lock_ring_tx_buf_wait.unlock(); - return NULL; + return nullptr; } ring_logfunc("polling/blocking succeeded on cq_mgr_tx (we got %d wce)", ret); @@ -676,7 +676,7 @@ mem_buf_desc_t *ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbu } else { // get out on non blocked socket m_lock_ring_tx.unlock(); - return NULL; + return nullptr; } } @@ -711,7 +711,7 @@ int ring_simple::mem_buf_tx_release(mem_buf_desc_t *p_mem_buf_desc_list, bool b_ void ring_simple::mem_buf_rx_release(mem_buf_desc_t *p_mem_buf_desc) { - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; reclaim_recv_buffers(p_mem_buf_desc); } @@ -754,7 +754,7 @@ void ring_simple::send_ring_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_w } std::lock_guard lock(m_lock_ring_tx); - int ret = send_buffer(p_send_wqe, attr, 0); + int ret = send_buffer(p_send_wqe, attr, nullptr); send_status_handler(ret, p_send_wqe); } @@ -905,7 +905,7 @@ mem_buf_desc_t *ring_simple::get_tx_buffers(pbuf_type type, uint32_t n_num_mem_b } if (unlikely(pool.size() < n_num_mem_bufs)) { - return NULL; + return nullptr; } } @@ -924,7 +924,7 @@ mem_buf_desc_t *ring_simple::get_tx_buffers(pbuf_type type, uint32_t n_num_mem_b next->lwip_pbuf.pbuf.type = type; n_num_mem_bufs--; } - next->p_next_desc = NULL; + next->p_next_desc = nullptr; return head; } diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index 08b211769..b302165b9 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -67,7 +67,7 @@ class ring_simple : public ring_slave { int request_notification(cq_type_t cq_type, uint64_t poll_sn) override; int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL) override; + void *pv_fd_ready_array = nullptr) override; int poll_and_process_element_tx(uint64_t *p_cq_poll_sn) override; void 
adapt_cq_moderation() override; bool reclaim_recv_buffers(descq_t *rx_reuse) override; @@ -79,10 +79,10 @@ class ring_simple : public ring_slave { unsigned int ncompletions, int flags) override; int drain_and_proccess() override; int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL) override; + void *pv_fd_ready_array = nullptr) override; void mem_buf_desc_return_to_owner_tx(mem_buf_desc_t *p_mem_buf_desc); void mem_buf_desc_return_to_owner_rx(mem_buf_desc_t *p_mem_buf_desc, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); inline int send_buffer(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, xlio_tis *tis); bool is_up() override; @@ -235,7 +235,7 @@ class ring_simple : public ring_slave { dpcp::adapter_hca_capabilities caps {}; auto adapter = m_p_ib_ctx->get_dpcp_adapter(); - if (adapter == nullptr || (dpcp::DPCP_OK != adapter->get_hca_capabilities(caps)) || + if (!adapter || (dpcp::DPCP_OK != adapter->get_hca_capabilities(caps)) || !caps.nvmeotcp_caps.enabled) { return 0; } @@ -327,7 +327,7 @@ class ring_simple : public ring_slave { inline ring_ec *get_ec(void) { - struct ring_ec *ec = NULL; + struct ring_ec *ec = nullptr; m_socketxtreme.lock_ec_list.lock(); if (!list_empty(&m_socketxtreme.ec_list)) { @@ -354,7 +354,7 @@ class ring_simple : public ring_slave { if (m_p_l2_addr) { delete m_p_l2_addr; } - m_p_l2_addr = NULL; + m_p_l2_addr = nullptr; }; protected: @@ -442,7 +442,7 @@ class ring_simple : public ring_slave { class ring_eth : public ring_simple { public: - ring_eth(int if_index, ring *parent = NULL, ring_type_t type = RING_ETH, + ring_eth(int if_index, ring *parent = nullptr, ring_type_t type = RING_ETH, bool call_create_res = true, bool use_locks = true) : ring_simple(if_index, parent, type, use_locks) { diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index 11ac9e1df..f90edaa51 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -67,8 +67,8 @@ ring_slave::ring_slave(int if_index, ring *parent, ring_type_t type, bool use_lo , m_b_sysvar_mc_force_flowtag(safe_mce_sys().mc_force_flowtag) , m_type(type) { - net_device_val *p_ndev = NULL; - const slave_data_t *p_slave = NULL; + net_device_val *p_ndev = nullptr; + const slave_data_t *p_slave = nullptr; /* Configure ring() fields */ set_parent(parent); @@ -76,7 +76,7 @@ ring_slave::ring_slave(int if_index, ring *parent, ring_type_t type, bool use_lo /* Sanity check */ p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - if (NULL == p_ndev) { + if (!p_ndev) { ring_logpanic("Invalid if_index = %d", if_index); } @@ -122,7 +122,8 @@ ring_slave::~ring_slave() void ring_slave::print_val() { ring_logdbg("%d: %p: parent %p type %s", m_if_index, this, - ((uintptr_t)this == (uintptr_t)m_parent ? 0 : m_parent), ring_type_str[m_type]); + ((uintptr_t)this == (uintptr_t)m_parent ? 
nullptr : m_parent), + ring_type_str[m_type]); } void ring_slave::restart() @@ -161,10 +162,10 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, bool force_5t) { rfs *p_rfs; - rfs *p_tmp_rfs = NULL; + rfs *p_tmp_rfs = nullptr; sockinfo *si = static_cast(sink); - if (si == NULL) { + if (!si) { return false; } @@ -184,7 +185,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, flow_spec_5t.get_dst_port(), flow_spec_5t.get_src_port()); sock_addr rule_key(flow_spec_5t.get_family(), &flow_spec_5t.get_dst_ip(), flow_spec_5t.get_dst_port()); - rfs_rule_filter *dst_port_filter = NULL; + rfs_rule_filter *dst_port_filter = nullptr; if (safe_mce_sys().udp_3t_rules) { auto dst_port_iter = m_ring.m_udp_uc_dst_port_attach_map.find(rule_key); if (dst_port_iter == m_ring.m_udp_uc_dst_port_attach_map.end()) { @@ -220,7 +221,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, return false; } BULLSEYE_EXCLUDE_BLOCK_START - if (p_tmp_rfs == NULL) { + if (!p_tmp_rfs) { ring_logerr("Failed to allocate rfs!"); return false; } @@ -257,7 +258,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, // It means that for every MC group, even if we have sockets with different ports - only one // rule in the HW. So the hash map below keeps track of the number of sockets per rule so we // know when to call ibv_attach and ibv_detach - rfs_rule_filter *l2_mc_ip_filter = NULL; + rfs_rule_filter *l2_mc_ip_filter = nullptr; if (m_ring.m_b_sysvar_eth_mc_l2_only_rules) { auto l2_mc_iter = m_ring.m_l2_mc_ip_attach_map.find(rule_key); // It means that this is the first time attach called with this MC ip @@ -297,7 +298,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, flow_spec_5t.get_dst_port(), flow_spec_5t.get_src_port()); sock_addr rule_key(flow_spec_5t.get_family(), &flow_spec_5t.get_dst_ip(), flow_spec_5t.get_dst_port()); - rfs_rule_filter *dst_port_filter = NULL; + rfs_rule_filter *dst_port_filter = nullptr; if (safe_mce_sys().tcp_3t_rules) { auto dst_port_iter = m_ring.m_tcp_dst_port_attach_map.find(rule_key); if (dst_port_iter == m_ring.m_tcp_dst_port_attach_map.end()) { @@ -339,7 +340,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, return false; } BULLSEYE_EXCLUDE_BLOCK_START - if (p_tmp_rfs == NULL) { + if (!p_tmp_rfs) { ring_logerr("Failed to allocate rfs!"); return false; } @@ -398,7 +399,7 @@ bool ring_slave::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool template bool steering_handler::detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink) { - rfs *p_rfs = NULL; + rfs *p_rfs = nullptr; ring_logdbg("flow: %s, with sink (%p)", flow_spec_5t.to_str().c_str(), sink); @@ -592,14 +593,14 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd if (likely(m_flow_tag_enabled && p_rx_wc_buf_desc->rx.flow_tag_id && p_rx_wc_buf_desc->rx.flow_tag_id != FLOW_TAG_MASK && !p_rx_wc_buf_desc->rx.is_sw_csum_need)) { - sockinfo *si = NULL; + sockinfo *si = nullptr; // trying to get sockinfo per flow_tag_id-1 as it was incremented at attach // to allow mapping sockfd=0 assert(g_p_fd_collection); si = static_cast( g_p_fd_collection->get_sockfd(p_rx_wc_buf_desc->rx.flow_tag_id - 1)); - if (likely((si != NULL) && si->flow_tag_enabled())) { + if (likely((si) && si->flow_tag_enabled())) { // will process packets with set flow_tag_id and enabled for the socket if (p_eth_h->h_proto == NET_ETH_P_8021Q) { // Handle VLAN header as next protocol @@ -706,7 +707,7 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t 
*p_rx_wc_buf_desc, void *pv_fd ETH_HW_ADDR_PRINT_ADDR(p_eth_h->h_source), htons(h_proto)); // Handle VLAN header as next protocol - struct vlanhdr *p_vlan_hdr = NULL; + struct vlanhdr *p_vlan_hdr = nullptr; uint16_t packet_vlan = 0; if (h_proto == NET_ETH_P_8021Q) { p_vlan_hdr = (struct vlanhdr *)((uint8_t *)p_eth_h + ETH_HDR_LEN); @@ -965,7 +966,7 @@ bool steering_handler::rx_process_buffer_no_flow_id( p_rx_wc_buf_desc->rx.frag.iov_len = ip_tot_len - hdr_data.ip_hdr_len; // Add ip fragment packet to out fragment manager - mem_buf_desc_t *new_buf = NULL; + mem_buf_desc_t *new_buf = nullptr; int ret = -1; if (g_p_ip_frag_manager) { ret = g_p_ip_frag_manager->add_frag(p_ip_h, p_rx_wc_buf_desc, &new_buf); diff --git a/src/core/dev/ring_tap.cpp b/src/core/dev/ring_tap.cpp index b59ac33a4..aebf474b7 100644 --- a/src/core/dev/ring_tap.cpp +++ b/src/core/dev/ring_tap.cpp @@ -45,7 +45,7 @@ ring_tap::ring_tap(int if_index, ring *parent) : ring_slave(if_index, parent, RING_TAP, true) , m_tap_fd(-1) - , m_vf_ring(NULL) + , m_vf_ring(nullptr) , m_sysvar_qp_compensation_level(safe_mce_sys().qp_compensation_level) , m_tap_data_available(false) { @@ -286,7 +286,7 @@ int ring_tap::wait_for_notification_and_process_element(int, uint64_t *, void *p int ring_tap::drain_and_proccess() { - return process_element_rx(NULL); + return process_element_rx(nullptr); } bool ring_tap::reclaim_recv_buffers(descq_t *rx_reuse) @@ -309,14 +309,14 @@ bool ring_tap::reclaim_recv_buffers(descq_t *rx_reuse) bool ring_tap::reclaim_recv_buffers(mem_buf_desc_t *buff) { if (buff && (buff->dec_ref_count() <= 1)) { - mem_buf_desc_t *temp = NULL; + mem_buf_desc_t *temp = nullptr; while (buff) { if (buff->lwip_pbuf_dec_ref_count() <= 0) { temp = buff; buff = temp->p_next_desc; temp->clear_transport_data(); - temp->p_next_desc = NULL; - temp->p_prev_desc = NULL; + temp->p_next_desc = nullptr; + temp->p_prev_desc = nullptr; temp->reset_ref_count(); free_lwip_pbuf(&temp->lwip_pbuf); m_rx_pool.push_back(temp); @@ -469,7 +469,7 @@ bool ring_tap::request_more_rx_buffers() mem_buf_desc_t *ring_tap::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, int n_num_mem_bufs) { - mem_buf_desc_t *head = NULL; + mem_buf_desc_t *head = nullptr; NOT_IN_USE(id); NOT_IN_USE(b_block); @@ -527,7 +527,7 @@ void ring_tap::mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t *p_mem_buf_ } if (p_mem_buf_desc->lwip_pbuf.pbuf.ref == 0) { - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; if (unlikely(p_mem_buf_desc->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY)) { g_buffer_pool_zc->put_buffers_thread_safe(p_mem_buf_desc); return; @@ -568,7 +568,7 @@ int ring_tap::mem_buf_tx_release(mem_buf_desc_t *buff_list, bool b_accounting, b while (buff_list) { next = buff_list->p_next_desc; - buff_list->p_next_desc = NULL; + buff_list->p_next_desc = nullptr; // potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & // sockinfo_tcp by tcp lock diff --git a/src/core/dev/ring_tap.h b/src/core/dev/ring_tap.h index 013ac8922..4ef935c52 100644 --- a/src/core/dev/ring_tap.h +++ b/src/core/dev/ring_tap.h @@ -44,14 +44,15 @@ class ring_tap : public ring_slave { virtual bool is_up() { return (m_vf_ring || m_active); } virtual bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t = false); virtual bool detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink); - virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL); + virtual int 
poll_and_process_element_rx(uint64_t *p_cq_poll_sn, + void *pv_fd_ready_array = nullptr); virtual int poll_and_process_element_tx(uint64_t *p_cq_poll_sn) { NOT_IN_USE(p_cq_poll_sn); return 0; } virtual int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); virtual int drain_and_proccess(); virtual bool reclaim_recv_buffers(descq_t *rx_reuse); virtual bool reclaim_recv_buffers(mem_buf_desc_t *buff); @@ -109,7 +110,7 @@ class ring_tap : public ring_slave { ib_ctx_handler *get_ctx(ring_user_id_t id) { NOT_IN_USE(id); - return NULL; + return nullptr; } virtual uint32_t get_max_send_sge(void) { return 1; } virtual uint32_t get_max_payload_sz(void) { return 0; } diff --git a/src/core/dev/time_converter.cpp b/src/core/dev/time_converter.cpp index e2d3991a2..3d1da4ca1 100644 --- a/src/core/dev/time_converter.cpp +++ b/src/core/dev/time_converter.cpp @@ -188,7 +188,7 @@ void time_converter::clean_obj() } set_cleaned(); - m_timer_handle = NULL; + m_timer_handle = nullptr; if (g_p_event_handler_manager->is_running()) { g_p_event_handler_manager->unregister_timers_event_and_delete(this); } else { diff --git a/src/core/dev/time_converter_ib_ctx.cpp b/src/core/dev/time_converter_ib_ctx.cpp index 378e95a45..1a72eaf41 100644 --- a/src/core/dev/time_converter_ib_ctx.cpp +++ b/src/core/dev/time_converter_ib_ctx.cpp @@ -71,11 +71,11 @@ time_converter_ib_ctx::time_converter_ib_ctx(struct ibv_context *ctx, m_converter_status = TS_CONVERSION_MODE_SYNC; g_p_event_handler_manager->register_timer_event(UPDATE_HW_TIMER_FIRST_ONESHOT_MS, - this, ONE_SHOT_TIMER, 0); + this, ONE_SHOT_TIMER, nullptr); g_p_event_handler_manager->register_timer_event(UPDATE_HW_TIMER_SECOND_ONESHOT_MS, - this, ONE_SHOT_TIMER, 0); + this, ONE_SHOT_TIMER, nullptr); m_timer_handle = g_p_event_handler_manager->register_timer_event( - UPDATE_HW_TIMER_PERIOD_MS, this, PERIODIC_TIMER, 0); + UPDATE_HW_TIMER_PERIOD_MS, this, PERIODIC_TIMER, nullptr); } } } diff --git a/src/core/dev/time_converter_ptp.cpp b/src/core/dev/time_converter_ptp.cpp index fdb1de1c3..517ff8377 100644 --- a/src/core/dev/time_converter_ptp.cpp +++ b/src/core/dev/time_converter_ptp.cpp @@ -65,7 +65,7 @@ time_converter_ptp::time_converter_ptp(struct ibv_context *ctx) } m_timer_handle = g_p_event_handler_manager->register_timer_event(UPDATE_HW_TIMER_PTP_PERIOD_MS, - this, PERIODIC_TIMER, 0); + this, PERIODIC_TIMER, nullptr); m_converter_status = TS_CONVERSION_MODE_PTP; } diff --git a/src/core/dev/wqe_send_handler.cpp b/src/core/dev/wqe_send_handler.cpp index be06efc30..e30ecce3d 100644 --- a/src/core/dev/wqe_send_handler.cpp +++ b/src/core/dev/wqe_send_handler.cpp @@ -60,7 +60,7 @@ void wqe_send_handler::init_wqe(xlio_ibv_send_wr &wqe_to_init, struct ibv_sge *s wqe_to_init.num_sge = num_sge; xlio_send_wr_opcode(wqe_to_init) = XLIO_IBV_WR_SEND; - wqe_to_init.next = NULL; + wqe_to_init.next = nullptr; wqe_to_init.sg_list = sge_list; wqe_to_init.wr_id = 0; } diff --git a/src/core/event/delta_timer.cpp b/src/core/event/delta_timer.cpp index ecf8b6097..ffec21826 100644 --- a/src/core/event/delta_timer.cpp +++ b/src/core/event/delta_timer.cpp @@ -56,16 +56,16 @@ timer::timer() { - m_list_head = NULL; + m_list_head = nullptr; gettime(&m_ts_last); } timer::~timer() { timer_node_t *iter = m_list_head; - timer_node_t *to_free = NULL; + timer_node_t *to_free = nullptr; tmr_logfunc(""); - m_list_head = NULL; + m_list_head = nullptr; // free all the list while (iter) { to_free = iter; 
@@ -133,7 +133,7 @@ void timer::remove_timer(timer_node_t *node, timer_handler *handler) BULLSEYE_EXCLUDE_BLOCK_END // Invalidate node before freeing it - node->handler = NULL; + node->handler = nullptr; node->req_type = INVALID_TIMER; // Remove & Free node @@ -145,7 +145,7 @@ void timer::remove_timer(timer_node_t *node, timer_handler *handler) void timer::remove_all_timers(timer_handler *handler) { timer_node_t *node = m_list_head; - timer_node_t *node_tmp = NULL; + timer_node_t *node_tmp = nullptr; // Look for handler in the list if node wasen't indicated while (node) { @@ -160,12 +160,12 @@ void timer::remove_all_timers(timer_handler *handler) } BULLSEYE_EXCLUDE_BLOCK_END // Invalidate node before freeing it - node_tmp->handler = NULL; + node_tmp->handler = nullptr; node_tmp->req_type = INVALID_TIMER; remove_from_list(node_tmp); // Remove & Free node free(node_tmp); - node_tmp = NULL; + node_tmp = nullptr; } else { node = node->next; } @@ -177,7 +177,7 @@ void timer::remove_all_timers(timer_handler *handler) int timer::update_timeout() { int ret = 0, delta_msec = 0; - timer_node_t *list_tmp = NULL; + timer_node_t *list_tmp = nullptr; struct timespec ts_now, ts_delta; ret = gettime(&ts_now); @@ -248,7 +248,7 @@ void timer::process_registered_timers() case PERIODIC_TIMER: // re-insert remove_from_list(iter); - iter->prev = iter->next = NULL; + iter->prev = iter->next = nullptr; insert_to_list(iter); break; @@ -308,8 +308,8 @@ void timer::insert_to_list(timer_node_t *new_node) if (!m_list_head) { // first node in the list new_node->delta_time_msec = new_node->orig_time_msec; // time from now - new_node->next = NULL; - new_node->prev = NULL; + new_node->next = nullptr; + new_node->prev = nullptr; m_list_head = new_node; tmr_logfuncall("insert first node to list (handler %p, timer %d, delta time %d)", new_node->handler, new_node->orig_time_msec, new_node->delta_time_msec); @@ -318,7 +318,7 @@ void timer::insert_to_list(timer_node_t *new_node) // else: need to find the correct place in the list tmp_delta = new_node->orig_time_msec; iter = m_list_head; - prev = NULL; + prev = nullptr; while (iter && tmp_delta >= iter->delta_time_msec) { tmp_delta = tmp_delta - iter->delta_time_msec; diff --git a/src/core/event/event.h b/src/core/event/event.h index 74a4d20f0..856b7cbff 100644 --- a/src/core/event/event.h +++ b/src/core/event/event.h @@ -40,7 +40,7 @@ class event { public: - event(void *notifier = NULL) + event(void *notifier = nullptr) : m_notifier(notifier) { } diff --git a/src/core/event/event_handler_manager.cpp b/src/core/event/event_handler_manager.cpp index b5ad964fe..edf4fe946 100644 --- a/src/core/event/event_handler_manager.cpp +++ b/src/core/event/event_handler_manager.cpp @@ -81,7 +81,7 @@ #define INITIAL_EVENTS_NUM 64 -event_handler_manager *g_p_event_handler_manager = NULL; +event_handler_manager *g_p_event_handler_manager = nullptr; pthread_t g_n_internal_thread_id = 0; @@ -94,7 +94,7 @@ void *event_handler_manager::register_timer_event(int timeout_msec, timer_handle BULLSEYE_EXCLUDE_BLOCK_START if (!handler || (req_type < 0 || req_type >= INVALID_TIMER)) { evh_logwarn("bad timer type (%d) or handler (%p)", req_type, handler); - return NULL; + return nullptr; } BULLSEYE_EXCLUDE_BLOCK_END @@ -290,7 +290,7 @@ void *event_handler_thread(void *_p_tgtObject) tasks_file += "/tasks"; FILE *fp = fopen(tasks_file.c_str(), "w"); BULLSEYE_EXCLUDE_BLOCK_START - if (fp == NULL) { + if (!fp) { evh_logpanic("Failed to open %s for writing", tasks_file.c_str()); } if (fprintf(fp, "%d", gettid()) 
<= 0) { @@ -389,7 +389,7 @@ void event_handler_manager::stop_thread() // Wait for thread exit if (m_event_handler_tid) { - pthread_join(m_event_handler_tid, 0); + pthread_join(m_event_handler_tid, nullptr); evh_logdbg("event handler thread stopped"); } else { evh_logdbg("event handler thread not running"); @@ -404,7 +404,7 @@ void event_handler_manager::stop_thread() void event_handler_manager::update_epfd(int fd, int operation, int events) { - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; if (m_epfd < 0) { return; @@ -760,7 +760,7 @@ void event_handler_manager::handle_registration_action(reg_action_t ®_action) case UNREGISTER_TIMERS_AND_DELETE: priv_unregister_all_handler_timers(reg_action.info.timer); delete reg_action.info.timer.handler; - reg_action.info.timer.handler = NULL; + reg_action.info.timer.handler = nullptr; break; BULLSEYE_EXCLUDE_BLOCK_START default: @@ -867,7 +867,7 @@ void event_handler_manager::process_rdma_cm_event(event_handler_map_t::iterator // Read the notification event channel struct rdma_event_channel *cma_channel = (struct rdma_event_channel *)iter_fd->second.rdma_cm_ev.cma_channel; - struct rdma_cm_event *p_tmp_cm_event = NULL; + struct rdma_cm_event *p_tmp_cm_event = nullptr; struct rdma_cm_event cma_event; evh_logfunc_entry("cma_channel %p (fd = %d)", cma_channel, cma_channel->fd); @@ -900,7 +900,7 @@ void event_handler_manager::process_rdma_cm_event(event_handler_map_t::iterator } // Find registered event handler - if (cma_id != NULL) { + if (cma_id) { event_handler_rdma_cm_map_t::iterator iter_id = iter_fd->second.rdma_cm_ev.map_rdma_cm_id.find(cma_id); if (iter_id != iter_fd->second.rdma_cm_ev.map_rdma_cm_id.end()) { @@ -958,7 +958,7 @@ void *event_handler_manager::thread_loop() g_p_net_device_table_mgr) { m_cq_epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); if (m_cq_epfd > 0) { - epoll_event evt = {0, {0}}; + epoll_event evt = {0, {nullptr}}; evt.events = EPOLLIN | EPOLLPRI; evt.data.fd = m_cq_epfd; SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_ADD, m_cq_epfd, &evt); @@ -970,12 +970,12 @@ void *event_handler_manager::thread_loop() if (m_b_sysvar_internal_thread_arm_cq_enabled && m_cq_epfd > 0 && g_p_net_device_table_mgr) { g_p_net_device_table_mgr->global_ring_poll_and_process_element(&poll_sn_rx, &poll_sn_tx, - NULL); + nullptr); int ret = g_p_net_device_table_mgr->global_ring_request_notification(poll_sn_rx, poll_sn_tx); if (ret > 0) { - g_p_net_device_table_mgr->global_ring_poll_and_process_element(&poll_sn_rx, - &poll_sn_tx, NULL); + g_p_net_device_table_mgr->global_ring_poll_and_process_element( + &poll_sn_rx, &poll_sn_tx, nullptr); } } @@ -999,7 +999,7 @@ void *event_handler_manager::thread_loop() if (m_b_sysvar_internal_thread_arm_cq_enabled && p_events[idx].data.fd == m_cq_epfd && g_p_net_device_table_mgr) { g_p_net_device_table_mgr->global_ring_wait_for_notification_and_process_element( - &poll_sn_rx, NULL); + &poll_sn_rx, nullptr); } else if (is_wakeup_fd(p_events[idx].data.fd)) { // a request for registration was sent m_reg_action_q_lock.lock(); @@ -1093,5 +1093,5 @@ void *event_handler_manager::thread_loop() free(p_events); - return 0; + return nullptr; } diff --git a/src/core/event/event_handler_manager.h b/src/core/event/event_handler_manager.h index 2904bcc19..43b094bbe 100644 --- a/src/core/event/event_handler_manager.h +++ b/src/core/event/event_handler_manager.h @@ -158,7 +158,7 @@ class event_handler_manager : public wakeup_pipe { ~event_handler_manager(); void *register_timer_event(int timeout_msec, timer_handler 
*handler, timer_req_type_t req_type, - void *user_data, timers_group *group = NULL); + void *user_data, timers_group *group = nullptr); void wakeup_timer_event(timer_handler *handler, void *node); void unregister_timer_event(timer_handler *handler, void *node); void unregister_timers_event_and_delete(timer_handler *handler); diff --git a/src/core/event/netlink_event.cpp b/src/core/event/netlink_event.cpp index 093757e83..aa6ab0502 100644 --- a/src/core/event/netlink_event.cpp +++ b/src/core/event/netlink_event.cpp @@ -92,7 +92,7 @@ const std::string route_nl_event::to_str() const neigh_nl_event::neigh_nl_event(struct nlmsghdr *hdr, struct rtnl_neigh *neigh, void *notifier) : netlink_event(hdr, notifier) - , m_neigh_info(NULL) + , m_neigh_info(nullptr) { m_neigh_info = new netlink_neigh_info(neigh); if ((!hdr) && (neigh)) { @@ -109,7 +109,7 @@ neigh_nl_event::~neigh_nl_event() route_nl_event::route_nl_event(struct nlmsghdr *hdr, struct rtnl_route *route, void *notifier) : netlink_event(hdr, notifier) - , m_route_info(NULL) + , m_route_info(nullptr) { m_route_info = new netlink_route_info(route); } diff --git a/src/core/event/vlogger_timer_handler.cpp b/src/core/event/vlogger_timer_handler.cpp index c515b9686..9107a39c6 100644 --- a/src/core/event/vlogger_timer_handler.cpp +++ b/src/core/event/vlogger_timer_handler.cpp @@ -37,16 +37,16 @@ #include "timer_handler.h" #include "event_handler_manager.h" -vlogger_timer_handler *g_p_vlogger_timer_handler = NULL; +vlogger_timer_handler *g_p_vlogger_timer_handler = nullptr; vlogger_timer_handler::vlogger_timer_handler() - : m_timer_handle(NULL) + : m_timer_handle(nullptr) { if (g_p_event_handler_manager) { /* failure in allocating m_timer_handle will result in throwing an exception by called * methods */ m_timer_handle = g_p_event_handler_manager->register_timer_event( - UPDATE_VLOGGER_LEVELS_INTERVAL, this, PERIODIC_TIMER, 0); + UPDATE_VLOGGER_LEVELS_INTERVAL, this, PERIODIC_TIMER, nullptr); } } @@ -54,7 +54,7 @@ vlogger_timer_handler::~vlogger_timer_handler() { if (m_timer_handle) { g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; + m_timer_handle = nullptr; } } diff --git a/src/core/ib/mlx5/ib_mlx5.cpp b/src/core/ib/mlx5/ib_mlx5.cpp index cfc0178ce..ec20b89f4 100644 --- a/src/core/ib/mlx5/ib_mlx5.cpp +++ b/src/core/ib/mlx5/ib_mlx5.cpp @@ -103,7 +103,7 @@ int xlio_ib_mlx5_get_cq(struct ibv_cq *cq, xlio_ib_mlx5_cq_t *mlx5_cq) * from ERROR state to RESET so cq_ci or cq_sn should not be * updated */ - if (mlx5_cq == NULL || mlx5_cq->cq == cq) { + if (!mlx5_cq || mlx5_cq->cq == cq) { return 0; } diff --git a/src/core/infra/DemoObserver.cpp b/src/core/infra/DemoObserver.cpp index bc92f02e8..e5178e06b 100644 --- a/src/core/infra/DemoObserver.cpp +++ b/src/core/infra/DemoObserver.cpp @@ -51,8 +51,8 @@ void Demo_Observer::notify_cb() void Demo_Observer::register_to_subjects(Demo_Coll_Mgr1 *coll_for_subjects_1, Demo_Coll_Mgr2 *coll_for_subjects_2) { - Demo_Subject1 *s1 = NULL; - Demo_Subject2 *s2 = NULL; + Demo_Subject1 *s1 = nullptr; + Demo_Subject2 *s2 = nullptr; key_class c('a'); key_class i(1); char ch = 'a'; diff --git a/src/core/infra/cache_subject_observer.h b/src/core/infra/cache_subject_observer.h index 5d504babe..ed5397000 100644 --- a/src/core/infra/cache_subject_observer.h +++ b/src/core/infra/cache_subject_observer.h @@ -115,7 +115,7 @@ template class cache_table_mgr : public tostr, publ public: cache_table_mgr(const char *lock_name = "lock(cache_table_mgr)") : m_lock(lock_name) - , 
m_timer_handle(NULL) {}; + , m_timer_handle(nullptr) {}; virtual ~cache_table_mgr(); /* Returns pointer to the subject */ @@ -215,7 +215,7 @@ void cache_table_mgr::start_garbage_collector(int timeout_msec) m_timer_handle = g_p_event_handler_manager->register_timer_event(timeout_msec, this, PERIODIC_TIMER, NULL); - if (m_timer_handle == NULL) { + if (!m_timer_handle) { __log_warn("Failed to start garbage_collector"); } } @@ -224,7 +224,7 @@ template void cache_table_mgr::stop_garba { if (m_timer_handle) { g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; + m_timer_handle = nullptr; } } diff --git a/src/core/infra/subject_observer.h b/src/core/infra/subject_observer.h index 44d333c9b..5d14930b6 100644 --- a/src/core/infra/subject_observer.h +++ b/src/core/infra/subject_observer.h @@ -60,7 +60,7 @@ class subject { virtual ~subject() {}; virtual bool register_observer(IN const observer *const new_observer); bool unregister_observer(IN const observer *const old_observer); - void notify_observers(event *ev = NULL); + void notify_observers(event *ev = nullptr); protected: lock_mutex_recursive m_lock; diff --git a/src/core/iomux/epfd_info.cpp b/src/core/iomux/epfd_info.cpp index e44688dc2..8c87ac128 100644 --- a/src/core/iomux/epfd_info.cpp +++ b/src/core/iomux/epfd_info.cpp @@ -45,7 +45,7 @@ int epfd_info::remove_fd_from_epoll_os(int fd) { - int ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, fd, NULL); + int ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, fd, nullptr); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_dbg("failed to remove fd=%d from os epoll epfd=%d (errno=%d %m)", fd, m_epfd, errno); @@ -146,7 +146,7 @@ int epfd_info::ctl(int op, int fd, epoll_event *event) { int ret; epoll_event event_dummy; - if (event == NULL) { + if (!event) { memset(&event_dummy, 0, sizeof(event_dummy)); event = &event_dummy; } @@ -201,7 +201,7 @@ int epfd_info::add_fd(int fd, epoll_event *event) { int ret; epoll_fd_rec fd_rec; - epoll_event evt = {0, {0}}; + epoll_event evt = {0, {nullptr}}; bool is_offloaded = false; @@ -294,7 +294,7 @@ int epfd_info::add_fd(int fd, epoll_event *event) // if the socket is ready, add it to ready events uint32_t events = 0; int errors; - if ((event->events & EPOLLIN) && temp_sock_fd_api->is_readable(NULL, NULL)) { + if ((event->events & EPOLLIN) && temp_sock_fd_api->is_readable(nullptr, nullptr)) { events |= EPOLLIN; } if ((event->events & EPOLLOUT) && temp_sock_fd_api->is_writeable()) { @@ -337,7 +337,7 @@ void epfd_info::increase_ring_ref_count(ring *ring) size_t num_ring_rx_fds; int *ring_rx_fds_array = ring->get_rx_channel_fds(num_ring_rx_fds); for (size_t i = 0; i < num_ring_rx_fds; i++) { - epoll_event evt = {0, {0}}; + epoll_event evt = {0, {nullptr}}; evt.events = EPOLLIN | EPOLLPRI; int fd = ring_rx_fds_array[i]; evt.data.u64 = (((uint64_t)CQ_FD_MARK << 32) | fd); @@ -378,7 +378,7 @@ void epfd_info::decrease_ring_ref_count(ring *ring) int *ring_rx_fds_array = ring->get_rx_channel_fds(num_ring_rx_fds); for (size_t i = 0; i < num_ring_rx_fds; i++) { // delete cq fd from epfd - int ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], NULL); + int ret = SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], nullptr); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0) { __log_dbg("failed to remove cq fd=%d from epfd=%d (errno=%d %m)", @@ -521,7 +521,7 @@ int epfd_info::mod_fd(int fd, epoll_event *event) uint32_t events = 0; if (is_offloaded) { // if the socket is ready, add it to ready events - if 
((event->events & EPOLLIN) && temp_sock_fd_api->is_readable(NULL, NULL)) { + if ((event->events & EPOLLIN) && temp_sock_fd_api->is_readable(nullptr, nullptr)) { events |= EPOLLIN; } if ((event->events & EPOLLOUT) && temp_sock_fd_api->is_writeable()) { @@ -549,7 +549,7 @@ int epfd_info::mod_fd(int fd, epoll_event *event) epoll_fd_rec *epfd_info::get_fd_rec(int fd) { - epoll_fd_rec *fd_rec = NULL; + epoll_fd_rec *fd_rec = nullptr; socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); lock(); @@ -754,7 +754,7 @@ int epfd_info::ring_wait_for_notification_and_process_element(uint64_t *p_poll_s } else { __log_dbg("failed to find channel fd. removing cq fd=%d from epfd=%d", fd, m_epfd); BULLSEYE_EXCLUDE_BLOCK_START - if ((SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, fd, NULL)) && + if ((SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, fd, nullptr)) && (!(errno == ENOENT || errno == EBADF))) { __log_err("failed to del cq channel fd=%d from os epfd=%d (errno=%d %m)", fd, m_epfd, errno); diff --git a/src/core/iomux/epfd_info.h b/src/core/iomux/epfd_info.h index be004d193..c1ff1da0a 100644 --- a/src/core/iomux/epfd_info.h +++ b/src/core/iomux/epfd_info.h @@ -88,12 +88,12 @@ class epfd_info : public lock_mutex_recursive, public cleanable_obj, public wake epoll_stats_t *stats(); int ring_poll_and_process_element(uint64_t *p_poll_sn_rx, uint64_t *p_poll_sn_tx, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); int ring_request_notification(uint64_t poll_sn_rx, uint64_t poll_sn_tx); int ring_wait_for_notification_and_process_element(uint64_t *p_poll_sn, - void *pv_fd_ready_array = NULL); + void *pv_fd_ready_array = nullptr); virtual void clean_obj(); diff --git a/src/core/iomux/epoll_wait_call.cpp b/src/core/iomux/epoll_wait_call.cpp index dcac5aaa1..0e2429756 100644 --- a/src/core/iomux/epoll_wait_call.cpp +++ b/src/core/iomux/epoll_wait_call.cpp @@ -46,7 +46,7 @@ epoll_wait_call::epoll_wait_call(epoll_event *extra_events_buffer, offloaded_mode_t *off_modes_buffer, int epfd, epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask /* = NULL */) - : io_mux_call(NULL, off_modes_buffer, 0, sigmask) + : io_mux_call(nullptr, off_modes_buffer, 0, sigmask) , // TODO: rethink on these arguments m_epfd(epfd) , m_events(events) @@ -111,7 +111,7 @@ int epoll_wait_call::get_current_events() } if (mutual_events & EPOLLIN) { - if (handle_epoll_event(p_socket_object->is_readable(NULL), EPOLLIN, p_socket_object, + if (handle_epoll_event(p_socket_object->is_readable(nullptr), EPOLLIN, p_socket_object, i)) { ready_rfds++; got_event = true; @@ -384,7 +384,7 @@ bool epoll_wait_call::handle_os_countdown(int &poll_os_countdown) if (cq_ready) { // This will empty the cqepfd // (most likely in case of a wakeup and probably only under epoll_wait (Not select/poll)) - ring_wait_for_notification_and_process_element(NULL); + ring_wait_for_notification_and_process_element(nullptr); } /* Before we exit with ready OS fd's we'll check the CQs once more and exit * below after calling check_all_offloaded_sockets(); @@ -404,7 +404,7 @@ bool epoll_wait_call::handle_os_countdown(int &poll_os_countdown) int epoll_wait_call::ring_poll_and_process_element() { - return m_epfd_info->ring_poll_and_process_element(&m_poll_sn_rx, &m_poll_sn_tx, NULL); + return m_epfd_info->ring_poll_and_process_element(&m_poll_sn_rx, &m_poll_sn_tx, nullptr); } int epoll_wait_call::ring_request_notification() diff --git a/src/core/iomux/epoll_wait_call.h b/src/core/iomux/epoll_wait_call.h index be4505648..71ba2eb0a 100644 --- 
a/src/core/iomux/epoll_wait_call.h +++ b/src/core/iomux/epoll_wait_call.h @@ -56,7 +56,7 @@ class epoll_wait_call : public io_mux_call { */ epoll_wait_call(epoll_event *extra_events_buffer, offloaded_mode_t *off_modes_buffer, int epfd, epoll_event *events, int maxevents, int timeout, - const sigset_t *sigmask = NULL); + const sigset_t *sigmask = nullptr); virtual ~epoll_wait_call(); /// @override diff --git a/src/core/iomux/io_mux_call.cpp b/src/core/iomux/io_mux_call.cpp index 2b63b6254..24c142f98 100644 --- a/src/core/iomux/io_mux_call.cpp +++ b/src/core/iomux/io_mux_call.cpp @@ -179,7 +179,7 @@ io_mux_call::io_mux_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer , m_cqepfd(-1) , m_poll_sn_rx(0) , m_poll_sn_tx(0) - , m_p_stats(NULL) + , m_p_stats(nullptr) , m_n_all_ready_fds(0) , m_n_ready_rfds(0) , m_n_ready_wfds(0) @@ -260,7 +260,7 @@ bool io_mux_call::handle_os_countdown(int &poll_os_countdown) // This will empty the cqepfd // (most likely in case of a wakeup and probably only under epoll_wait (Not // select/poll)) - ring_wait_for_notification_and_process_element(NULL); + ring_wait_for_notification_and_process_element(nullptr); } /* Before we exit with ready OS fd's we'll check the CQs once more and exit * below after calling check_all_offloaded_sockets(); @@ -551,7 +551,7 @@ int io_mux_call::ring_poll_and_process_element() { // TODO: (select, poll) this access all CQs, it is better to check only relevant ones return g_p_net_device_table_mgr->global_ring_poll_and_process_element(&m_poll_sn_rx, - &m_poll_sn_tx, NULL); + &m_poll_sn_tx, nullptr); } int io_mux_call::ring_request_notification() diff --git a/src/core/iomux/io_mux_call.h b/src/core/iomux/io_mux_call.h index b9b76067f..596a6847e 100644 --- a/src/core/iomux/io_mux_call.h +++ b/src/core/iomux/io_mux_call.h @@ -77,7 +77,7 @@ class io_mux_call { * @param fds_buffer Pointer to a buffer large enough to hold all fds. 
*/ io_mux_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer, int num_fds = 0, - const sigset_t *sigmask = NULL); // = 0 is only temp + const sigset_t *sigmask = nullptr); // = 0 is only temp virtual ~io_mux_call() {}; /** diff --git a/src/core/iomux/poll_call.cpp b/src/core/iomux/poll_call.cpp index f8ef79406..ad8a60412 100644 --- a/src/core/iomux/poll_call.cpp +++ b/src/core/iomux/poll_call.cpp @@ -54,7 +54,7 @@ poll_call::poll_call(int *off_rfds_buffer, offloaded_mode_t *off_modes_buffer, i { nfds_t i; int fd; - m_fds = NULL; + m_fds = nullptr; // create stats m_p_stats = &g_poll_stats; @@ -97,7 +97,7 @@ poll_call::poll_call(int *off_rfds_buffer, offloaded_mode_t *off_modes_buffer, i __log_func("fd=%d must be skipped from os r poll()", fd); m_fds[i].fd = -1; } else if (m_orig_fds[i].events & POLLIN) { - if (temp_sock_fd_api->is_readable(NULL)) { + if (temp_sock_fd_api->is_readable(nullptr)) { io_mux_call::update_fd_array(&m_fd_ready_array, fd); m_n_ready_rfds++; m_n_all_ready_fds++; @@ -132,7 +132,7 @@ bool poll_call::wait_os(bool zero_timeout) { __log_func("calling os poll: %d", m_nfds); if (m_sigmask) { - struct timespec to, *pto = NULL; + struct timespec to, *pto = nullptr; if (zero_timeout) { to.tv_sec = to.tv_nsec = 0; pto = &to; @@ -160,7 +160,7 @@ bool poll_call::wait(const timeval &elapsed) { // poll fds and cq int timeout; - struct timespec to, *pto = NULL; + struct timespec to, *pto = nullptr; if (m_timeout < 0) { timeout = m_timeout; diff --git a/src/core/iomux/poll_call.h b/src/core/iomux/poll_call.h index b4f592dcb..1005c59e3 100644 --- a/src/core/iomux/poll_call.h +++ b/src/core/iomux/poll_call.h @@ -54,7 +54,7 @@ class poll_call : public io_mux_call { */ poll_call(int *off_rfds_buffer, offloaded_mode_t *off_modes_buffer, int *lookup_buffer, pollfd *working_fds_arr, pollfd *fds, nfds_t nfds, int timeout, - const sigset_t *__sigmask = NULL); + const sigset_t *__sigmask = nullptr); /// @override virtual void set_offloaded_rfd_ready(int fd_index); diff --git a/src/core/iomux/select_call.cpp b/src/core/iomux/select_call.cpp index b3a95c9bc..e255f7f24 100644 --- a/src/core/iomux/select_call.cpp +++ b/src/core/iomux/select_call.cpp @@ -112,7 +112,7 @@ select_call::select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer if (!psock->skip_os_select()) { if (check_read) { FD_SET(fd, &m_os_rfds); - if (psock->is_readable(NULL)) { + if (psock->is_readable(nullptr)) { io_mux_call::update_fd_array(&m_fd_ready_array, fd); m_n_ready_rfds++; m_n_all_ready_fds++; @@ -181,8 +181,8 @@ void select_call::prepare_to_block() bool select_call::wait_os(bool zero_timeout) { - timeval to, *pto = NULL; - timespec to_pselect, *pto_pselect = NULL; + timeval to, *pto = nullptr; + timespec to_pselect, *pto_pselect = nullptr; /* Avner: I put it in comment, because this logic is wrong @@ -234,8 +234,8 @@ bool select_call::wait_os(bool zero_timeout) bool select_call::wait(const timeval &elapsed) { - timeval timeout, *pto = NULL; - timespec to_pselect, *pto_pselect = NULL; + timeval timeout, *pto = nullptr; + timespec to_pselect, *pto_pselect = nullptr; BULLSEYE_EXCLUDE_BLOCK_START if (m_n_all_ready_fds > 0) { diff --git a/src/core/iomux/select_call.h b/src/core/iomux/select_call.h index c935c746e..f856455c0 100644 --- a/src/core/iomux/select_call.h +++ b/src/core/iomux/select_call.h @@ -52,7 +52,7 @@ class select_call : public io_mux_call { */ select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer, int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, timeval 
*timeout, - const sigset_t *__sigmask = NULL); + const sigset_t *__sigmask = nullptr); /// @override virtual void set_offloaded_rfd_ready(int fd_index); diff --git a/src/core/main.cpp b/src/core/main.cpp index 2b89de3f4..588cfe484 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -106,7 +106,7 @@ bool g_b_exit = false; bool g_init_ibv_fork_done = false; bool g_is_forked_child = false; bool g_init_global_ctors_done = true; -static command_netlink *s_cmd_nl = NULL; +static command_netlink *s_cmd_nl = nullptr; #define MAX_VERSION_STR_LEN 128 global_stats_t g_global_stat_static; @@ -135,11 +135,11 @@ static int free_libxlio_resources() if (g_tcp_timers_collection) { g_tcp_timers_collection->clean_obj(); } - g_tcp_timers_collection = NULL; + g_tcp_timers_collection = nullptr; // Block all sock-redicrt API calls into our offloading core fd_collection *g_p_fd_collection_temp = g_p_fd_collection; - g_p_fd_collection = NULL; + g_p_fd_collection = nullptr; if (g_p_fd_collection_temp) { delete g_p_fd_collection_temp; } @@ -147,30 +147,30 @@ static int free_libxlio_resources() if (g_p_lwip) { delete g_p_lwip; } - g_p_lwip = NULL; + g_p_lwip = nullptr; if (g_p_route_table_mgr) { delete g_p_route_table_mgr; } - g_p_route_table_mgr = NULL; + g_p_route_table_mgr = nullptr; if (g_bind_no_port) { delete g_bind_no_port; } - g_bind_no_port = NULL; + g_bind_no_port = nullptr; if (g_p_rule_table_mgr) { delete g_p_rule_table_mgr; } - g_p_rule_table_mgr = NULL; + g_p_rule_table_mgr = nullptr; if (g_p_net_device_table_mgr) { delete g_p_net_device_table_mgr; } - g_p_net_device_table_mgr = NULL; + g_p_net_device_table_mgr = nullptr; ip_frag_manager *g_p_ip_frag_manager_temp = g_p_ip_frag_manager; - g_p_ip_frag_manager = NULL; + g_p_ip_frag_manager = nullptr; if (g_p_ip_frag_manager_temp) { delete g_p_ip_frag_manager_temp; } @@ -178,12 +178,12 @@ static int free_libxlio_resources() if (g_p_neigh_table_mgr) { delete g_p_neigh_table_mgr; } - g_p_neigh_table_mgr = NULL; + g_p_neigh_table_mgr = nullptr; if (g_tcp_seg_pool) { delete g_tcp_seg_pool; } - g_tcp_seg_pool = NULL; + g_tcp_seg_pool = nullptr; if (safe_mce_sys().print_report) { buffer_pool::print_report_on_errors(VLOG_INFO); @@ -192,64 +192,64 @@ static int free_libxlio_resources() if (g_buffer_pool_zc) { delete g_buffer_pool_zc; } - g_buffer_pool_zc = NULL; + g_buffer_pool_zc = nullptr; if (g_buffer_pool_tx) { delete g_buffer_pool_tx; } - g_buffer_pool_tx = NULL; + g_buffer_pool_tx = nullptr; if (g_buffer_pool_rx_stride) { delete g_buffer_pool_rx_stride; } - g_buffer_pool_rx_stride = NULL; + g_buffer_pool_rx_stride = nullptr; if (g_buffer_pool_rx_rwqe) { delete g_buffer_pool_rx_rwqe; } - g_buffer_pool_rx_rwqe = NULL; + g_buffer_pool_rx_rwqe = nullptr; if (g_zc_cache) { delete g_zc_cache; } - g_zc_cache = NULL; + g_zc_cache = nullptr; xlio_heap::finalize(); if (s_cmd_nl) { delete s_cmd_nl; } - s_cmd_nl = NULL; + s_cmd_nl = nullptr; if (g_p_netlink_handler) { delete g_p_netlink_handler; } - g_p_netlink_handler = NULL; + g_p_netlink_handler = nullptr; if (g_p_ib_ctx_handler_collection) { delete g_p_ib_ctx_handler_collection; } - g_p_ib_ctx_handler_collection = NULL; + g_p_ib_ctx_handler_collection = nullptr; if (g_p_vlogger_timer_handler) { delete g_p_vlogger_timer_handler; } - g_p_vlogger_timer_handler = NULL; + g_p_vlogger_timer_handler = nullptr; if (g_p_event_handler_manager) { delete g_p_event_handler_manager; } - g_p_event_handler_manager = NULL; + g_p_event_handler_manager = nullptr; if (g_p_agent) { delete g_p_agent; } - g_p_agent = NULL; + 
g_p_agent = nullptr; if (safe_mce_sys().app_name) { free(safe_mce_sys().app_name); } - safe_mce_sys().app_name = NULL; + safe_mce_sys().app_name = nullptr; vlog_printf(VLOG_DEBUG, "Stopping logger module\n"); @@ -261,14 +261,14 @@ static int free_libxlio_resources() // cosmetics - remove when adding iomux block fprintf(g_stats_file, "======================================================\n"); fclose(g_stats_file); - g_stats_file = NULL; + g_stats_file = nullptr; } #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) if (g_p_app) { delete g_p_app; } - g_p_app = NULL; + g_p_app = nullptr; #endif return 0; @@ -431,7 +431,7 @@ int get_ofed_version_info(char *ofed_version_str, int len) void print_xlio_global_settings() { struct utsname sys_info; - time_t clock = time(NULL); + time_t clock = time(nullptr); char ofed_version_info[MAX_VERSION_STR_LEN]; vlog_printf(VLOG_INFO, @@ -929,7 +929,7 @@ void register_handler_segv() act.sa_handler = handle_segfault; act.sa_flags = 0; sigemptyset(&act.sa_mask); - sigaction(SIGSEGV, &act, NULL); + sigaction(SIGSEGV, &act, nullptr); vlog_printf(VLOG_INFO, "Registered a SIGSEGV handler\n"); } @@ -1093,10 +1093,10 @@ static void do_global_ctors_helper() safe_mce_sys().lwip_mss)), (safe_mce_sys().m_ioctl.user_alloc.flags & IOCTL_USER_ALLOC_TX ? safe_mce_sys().m_ioctl.user_alloc.memalloc - : NULL), + : nullptr), (safe_mce_sys().m_ioctl.user_alloc.flags & IOCTL_USER_ALLOC_TX ? safe_mce_sys().m_ioctl.user_alloc.memfree - : NULL))); + : nullptr))); NEW_CTOR(g_buffer_pool_zc, buffer_pool(BUFFER_POOL_TX, 0)); @@ -1145,13 +1145,13 @@ static void do_global_ctors_helper() // Register netlink fd to the event_manager s_cmd_nl = new command_netlink(g_p_netlink_handler); - if (s_cmd_nl == NULL) { + if (!s_cmd_nl) { throw_xlio_exception("Failed allocating command_netlink\n"); } BULLSEYE_EXCLUDE_BLOCK_END g_p_event_handler_manager->register_command_event(fd, s_cmd_nl); g_p_event_handler_manager->register_timer_event(safe_mce_sys().timer_netlink_update_msec, - s_cmd_nl, PERIODIC_TIMER, NULL); + s_cmd_nl, PERIODIC_TIMER, nullptr); } #ifdef DEFINED_UTLS @@ -1178,29 +1178,29 @@ int do_global_ctors() void reset_globals() { - g_p_fd_collection = NULL; - g_p_ip_frag_manager = NULL; - g_zc_cache = NULL; - g_buffer_pool_rx_ptr = NULL; - g_buffer_pool_rx_stride = NULL; - g_buffer_pool_rx_rwqe = NULL; - g_buffer_pool_tx = NULL; - g_buffer_pool_zc = NULL; - g_tcp_seg_pool = NULL; - g_tcp_timers_collection = NULL; - g_p_vlogger_timer_handler = NULL; - g_p_event_handler_manager = NULL; - g_p_agent = NULL; - g_p_route_table_mgr = NULL; - g_bind_no_port = NULL; - g_p_rule_table_mgr = NULL; - g_stats_file = NULL; - g_p_net_device_table_mgr = NULL; - g_p_neigh_table_mgr = NULL; - g_p_lwip = NULL; - g_p_netlink_handler = NULL; - g_p_ib_ctx_handler_collection = NULL; - s_cmd_nl = NULL; + g_p_fd_collection = nullptr; + g_p_ip_frag_manager = nullptr; + g_zc_cache = nullptr; + g_buffer_pool_rx_ptr = nullptr; + g_buffer_pool_rx_stride = nullptr; + g_buffer_pool_rx_rwqe = nullptr; + g_buffer_pool_tx = nullptr; + g_buffer_pool_zc = nullptr; + g_tcp_seg_pool = nullptr; + g_tcp_timers_collection = nullptr; + g_p_vlogger_timer_handler = nullptr; + g_p_event_handler_manager = nullptr; + g_p_agent = nullptr; + g_p_route_table_mgr = nullptr; + g_bind_no_port = nullptr; + g_p_rule_table_mgr = nullptr; + g_stats_file = nullptr; + g_p_net_device_table_mgr = nullptr; + g_p_neigh_table_mgr = nullptr; + g_p_lwip = nullptr; + g_p_netlink_handler = nullptr; + g_p_ib_ctx_handler_collection = nullptr; + s_cmd_nl = 
nullptr; g_cpu_manager.reset(); } @@ -1222,9 +1222,9 @@ void check_netperf_flags() if (strcmp(command, "netserver")) { return; } - pch = strtok(NULL, " "); + pch = strtok(nullptr, " "); - while (pch != NULL) { + while (pch) { if (*pch == '-') { if (strchr(pch, 'D')) { b_D_flag = true; @@ -1236,7 +1236,7 @@ void check_netperf_flags() if (b_f_flag && b_D_flag) { break; } - pch = strtok(NULL, " "); + pch = strtok(nullptr, " "); } if (!b_D_flag || !b_f_flag) { vlog_printf(VLOG_WARNING, "Running netserver without flags: -D, -f can cause failure\n"); diff --git a/src/core/netlink/neigh_info.cpp b/src/core/netlink/neigh_info.cpp index 816e72b63..d9becb130 100644 --- a/src/core/netlink/neigh_info.cpp +++ b/src/core/netlink/neigh_info.cpp @@ -37,13 +37,13 @@ netlink_neigh_info::netlink_neigh_info(struct rtnl_neigh *neigh) : dst_addr_str("") - , dst_addr(NULL) + , dst_addr(nullptr) , dst_addr_len(0) , addr_family(0) , flags(0) , ifindex(0) , lladdr_str("") - , lladdr(NULL) + , lladdr(nullptr) , lladdr_len(0) , state(0) , type(0) diff --git a/src/core/netlink/neigh_info.h b/src/core/netlink/neigh_info.h index b18c8f019..90b0f695e 100644 --- a/src/core/netlink/neigh_info.h +++ b/src/core/netlink/neigh_info.h @@ -41,13 +41,13 @@ class netlink_neigh_info { public: netlink_neigh_info() : dst_addr_str("") - , dst_addr(NULL) + , dst_addr(nullptr) , dst_addr_len(0) , addr_family(0) , flags(0) , ifindex(0) , lladdr_str("") - , lladdr(NULL) + , lladdr(nullptr) , lladdr_len(0) , state(0) , type(0) diff --git a/src/core/netlink/netlink_wrapper.cpp b/src/core/netlink/netlink_wrapper.cpp index 4c864b8ce..7155d27db 100644 --- a/src/core/netlink/netlink_wrapper.cpp +++ b/src/core/netlink/netlink_wrapper.cpp @@ -55,7 +55,7 @@ #define nl_logdbg __log_dbg #define nl_logfine __log_fine -netlink_wrapper *g_p_netlink_handler = NULL; +netlink_wrapper *g_p_netlink_handler = nullptr; // structure to pass arguments on internal netlink callbacks handling typedef struct rcv_msg_arg { @@ -167,7 +167,7 @@ void netlink_wrapper::neigh_cache_callback(nl_object *obj) nl_logdbg("notify on neigh event: %s", new_event.to_str().c_str()); netlink_wrapper::notify_observers(&new_event, nlgrpNEIGH); - g_nl_rcv_arg.msghdr = NULL; + g_nl_rcv_arg.msghdr = nullptr; nl_logfine("<--- neigh_cache_callback"); } @@ -180,7 +180,7 @@ void netlink_wrapper::link_cache_callback(nl_object *obj) nl_logdbg("notify on link event: %s", new_event.to_str().c_str()); netlink_wrapper::notify_observers(&new_event, nlgrpLINK); - g_nl_rcv_arg.msghdr = NULL; + g_nl_rcv_arg.msghdr = nullptr; nl_logfine("<--- link_cache_callback"); } @@ -202,21 +202,21 @@ void netlink_wrapper::route_cache_callback(nl_object *obj) } else { nl_logdbg("Received invalid route event"); } - g_nl_rcv_arg.msghdr = NULL; + g_nl_rcv_arg.msghdr = nullptr; nl_logfine("<--- route_cache_callback"); } netlink_wrapper::netlink_wrapper() - : m_socket_handle(NULL) - , m_mngr(NULL) - , m_cache_link(NULL) - , m_cache_neigh(NULL) - , m_cache_route(NULL) + : m_socket_handle(nullptr) + , m_mngr(nullptr) + , m_cache_link(nullptr) + , m_cache_neigh(nullptr) + , m_cache_route(nullptr) { nl_logfine("---> netlink_route_listener CTOR"); g_nl_rcv_arg.subjects_map = &m_subjects_map; g_nl_rcv_arg.netlink = this; - g_nl_rcv_arg.msghdr = NULL; + g_nl_rcv_arg.msghdr = nullptr; nl_logfine("<--- netlink_route_listener CTOR"); } @@ -307,18 +307,18 @@ int netlink_wrapper::open_channel() nl_logdbg("netlink socket is open"); - if (nl_cache_mngr_add_ext(m_mngr, "route/link", link_callback, NULL, &m_cache_link)) { + if 
(nl_cache_mngr_add_ext(m_mngr, "route/link", link_callback, nullptr, &m_cache_link)) { return -1; } - if (nl_cache_mngr_add_ext(m_mngr, "route/route", route_callback, NULL, &m_cache_route)) { + if (nl_cache_mngr_add_ext(m_mngr, "route/route", route_callback, nullptr, &m_cache_route)) { return -1; } - if (nl_cache_mngr_add_ext(m_mngr, "route/neigh", neigh_callback, NULL, &m_cache_neigh)) { + if (nl_cache_mngr_add_ext(m_mngr, "route/neigh", neigh_callback, nullptr, &m_cache_neigh)) { return -1; } // set custom callback for every message to update message - nl_socket_modify_cb(m_socket_handle, NL_CB_MSG_IN, NL_CB_CUSTOM, nl_msg_rcv_cb, NULL); + nl_socket_modify_cb(m_socket_handle, NL_CB_MSG_IN, NL_CB_CUSTOM, nl_msg_rcv_cb, nullptr); // set the socket non-blocking BULLSEYE_EXCLUDE_BLOCK_START @@ -386,7 +386,7 @@ bool netlink_wrapper::register_event(e_netlink_event_type type, const observer * bool netlink_wrapper::unregister(e_netlink_event_type type, const observer *obs) { std::lock_guard lock(m_subj_map_lock); - if (obs == NULL) { + if (!obs) { return false; } @@ -452,7 +452,7 @@ void netlink_wrapper::neigh_timer_expired() void netlink_wrapper::notify_neigh_cache_entries() { nl_logfine("--->netlink_wrapper::notify_cache_entries"); - g_nl_rcv_arg.msghdr = NULL; + g_nl_rcv_arg.msghdr = nullptr; nl_object *obj = nl_cache_get_first(m_cache_neigh); while (obj) { nl_object_get(obj); diff --git a/src/core/netlink/test_main.cpp b/src/core/netlink/test_main.cpp index 9216916bc..758593214 100644 --- a/src/core/netlink/test_main.cpp +++ b/src/core/netlink/test_main.cpp @@ -115,7 +115,7 @@ void netlink_test() struct epoll_event *e = new struct epoll_event(); e->data.fd = fd; - e->data.ptr = NULL; + e->data.ptr = nullptr; e->events = EPOLLIN | EPOLLET; epoll_ctl(epfd, EPOLL_CTL_ADD, fd, e); diff --git a/src/core/proto/L2_address.cpp b/src/core/proto/L2_address.cpp index 3d2f315a5..74ce51a5d 100644 --- a/src/core/proto/L2_address.cpp +++ b/src/core/proto/L2_address.cpp @@ -56,7 +56,7 @@ void L2_address::set(address_t const address, addrlen_t const len) L2_panic("len = %lu", len); } - if (address == NULL) { + if (!address) { L2_panic("address == NULL"); } BULLSEYE_EXCLUDE_BLOCK_END diff --git a/src/core/proto/dst_entry.cpp b/src/core/proto/dst_entry.cpp index 1fed48573..fe81310f4 100644 --- a/src/core/proto/dst_entry.cpp +++ b/src/core/proto/dst_entry.cpp @@ -61,8 +61,8 @@ dst_entry::dst_entry(const sock_addr &dst, uint16_t src_port, socket_data &sock_ , m_route_src_ip(in6addr_any) , m_pkt_src_ip(in6addr_any) , m_ring_alloc_logic_tx(sock_data.fd, ring_alloc_logic) - , m_p_tx_mem_buf_desc_list(NULL) - , m_p_zc_mem_buf_desc_list(NULL) + , m_p_tx_mem_buf_desc_list(nullptr) + , m_p_zc_mem_buf_desc_list(nullptr) , m_b_tx_mem_buf_desc_list_pending(false) , m_ttl_hop_limit(sock_data.ttl_hop_limit) , m_tos(sock_data.tos) @@ -92,46 +92,46 @@ dst_entry::~dst_entry() if (m_p_rt_entry) { g_p_route_table_mgr->unregister_observer( route_rule_table_key(m_dst_ip, m_route_src_ip, m_family, m_tos), this); - m_p_rt_entry = NULL; + m_p_rt_entry = nullptr; } if (m_p_ring) { if (m_sge) { delete[] m_sge; - m_sge = NULL; + m_sge = nullptr; } if (m_p_tx_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true); - m_p_tx_mem_buf_desc_list = NULL; + m_p_tx_mem_buf_desc_list = nullptr; } if (m_p_zc_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_zc_mem_buf_desc_list, true); - m_p_zc_mem_buf_desc_list = NULL; + m_p_zc_mem_buf_desc_list = nullptr; } 
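Note on the conversion itself: replacing NULL with nullptr is more than style. NULL expands to an implementation-defined, integer-style null pointer constant, so it participates in integer overload resolution and template deduction, whereas nullptr has the dedicated type std::nullptr_t and converts only to pointer (and pointer-to-member) types. A minimal sketch of the difference; the overloads f(int) and f(char *) are hypothetical and not taken from this codebase:

    #include <cstddef>

    static void f(int) {}     // hypothetical overload taking an integer
    static void f(char *) {}  // hypothetical overload taking a pointer

    static void nullptr_motivation()
    {
        f(0);       // always selects f(int)
        // f(NULL); // outcome depends on how NULL is defined: may select f(int), be ambiguous, or merely warn
        f(nullptr); // unambiguous: std::nullptr_t converts only to pointer types, so f(char *) is selected
    }

The same reasoning applies to the sentinel assignments and default arguments converted throughout the surrounding hunks.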
m_p_net_dev_val->release_ring(m_ring_alloc_logic_tx.get_key()); - m_p_ring = NULL; + m_p_ring = nullptr; } if (m_p_send_wqe_handler) { delete m_p_send_wqe_handler; - m_p_send_wqe_handler = NULL; + m_p_send_wqe_handler = nullptr; } if (m_p_neigh_val) { delete m_p_neigh_val; - m_p_neigh_val = NULL; + m_p_neigh_val = nullptr; } if (m_header) { delete m_header; - m_header = NULL; + m_header = nullptr; } if (m_header_neigh) { delete m_header_neigh; - m_header_neigh = NULL; + m_header_neigh = nullptr; } dst_logdbg("Done %s", to_str().c_str()); @@ -140,18 +140,18 @@ dst_entry::~dst_entry() void dst_entry::init_members() { set_state(false); - m_p_rt_val = NULL; - m_p_net_dev_val = NULL; - m_p_ring = NULL; - m_p_net_dev_entry = NULL; - m_p_neigh_entry = NULL; - m_p_neigh_val = NULL; - m_p_rt_entry = NULL; + m_p_rt_val = nullptr; + m_p_net_dev_val = nullptr; + m_p_ring = nullptr; + m_p_net_dev_entry = nullptr; + m_p_neigh_entry = nullptr; + m_p_neigh_val = nullptr; + m_p_rt_entry = nullptr; memset(&m_inline_send_wqe, 0, sizeof(m_inline_send_wqe)); memset(&m_not_inline_send_wqe, 0, sizeof(m_not_inline_send_wqe)); memset(&m_fragmented_send_wqe, 0, sizeof(m_not_inline_send_wqe)); - m_p_send_wqe_handler = NULL; - m_sge = NULL; + m_p_send_wqe_handler = nullptr; + m_sge = nullptr; m_b_is_offloaded = true; m_b_is_initialized = false; m_max_inline = 0; @@ -227,7 +227,7 @@ bool dst_entry::update_net_dev_val() } g_p_neigh_table_mgr->unregister_observer( neigh_key(ip_addr(dst_addr, m_family), m_p_net_dev_val), this); - m_p_neigh_entry = NULL; + m_p_neigh_entry = nullptr; } // Change the net_device, clean old resources... @@ -258,7 +258,7 @@ bool dst_entry::update_net_dev_val() bool dst_entry::update_rt_val() { bool ret_val = true; - route_val *p_rt_val = NULL; + route_val *p_rt_val = nullptr; if (m_p_rt_entry && m_p_rt_entry->get_val(p_rt_val)) { if (m_p_rt_val == p_rt_val) { @@ -279,7 +279,7 @@ bool dst_entry::resolve_net_dev(bool is_connect) { bool ret_val = false; - cache_entry_subject *p_ces = NULL; + cache_entry_subject *p_ces = nullptr; if (m_dst_ip.is_anyaddr()) { dst_logdbg(PRODUCT_NAME " does not offload zero net IP address"); @@ -307,7 +307,7 @@ bool dst_entry::resolve_net_dev(bool is_connect) // set src addr by XLIO. We keep this logic for IPv4 only for backward compliancy. 
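Note: the destructor above and free_libxlio_resources() repeat the same delete-then-clear idiom for every owned pointer. A compact way to express that idiom is sketched below; delete_and_null is a hypothetical helper, not something this patch introduces, and array members such as m_sge would need a delete[] counterpart:

    template <typename T> static inline void delete_and_null(T *&p)
    {
        delete p;    // deleting a null pointer is a harmless no-op
        p = nullptr; // clear the pointer so later checks and repeated teardown stay safe
    }

    // Usage sketch, mirroring ~dst_entry():
    //   delete_and_null(m_p_send_wqe_handler);
    //   delete_and_null(m_p_neigh_val);
    //   delete_and_null(m_header);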
if (m_family == AF_INET && is_connect && m_route_src_ip.is_anyaddr()) { dst_logfunc("Checking rt_entry src addr"); - route_val *p_rt_val = NULL; + route_val *p_rt_val = nullptr; if (m_p_rt_entry && m_p_rt_entry->get_val(p_rt_val) && !p_rt_val->get_src_addr().is_anyaddr()) { g_p_route_table_mgr->unregister_observer(rtk, this); @@ -346,11 +346,11 @@ bool dst_entry::resolve_neigh() if (m_p_rt_val && !m_p_rt_val->get_gw_addr().is_anyaddr() && !dst_addr.is_mc(m_family)) { dst_addr = m_p_rt_val->get_gw_addr(); } - cache_entry_subject *p_ces = NULL; + cache_entry_subject *p_ces = nullptr; if (m_p_neigh_entry || g_p_neigh_table_mgr->register_observer( neigh_key(ip_addr(dst_addr, m_family), m_p_net_dev_val), this, &p_ces)) { - if (m_p_neigh_entry == NULL) { + if (!m_p_neigh_entry) { m_p_neigh_entry = dynamic_cast(p_ces); } if (m_p_neigh_entry) { @@ -378,7 +378,7 @@ bool dst_entry::resolve_ring() if (m_p_ring) { if (m_sge) { delete[] m_sge; - m_sge = NULL; + m_sge = nullptr; } m_sge = new (std::nothrow) struct ibv_sge[m_p_ring->get_max_send_sge()]; if (!m_sge) { @@ -400,15 +400,15 @@ bool dst_entry::release_ring() if (m_p_ring) { if (m_p_tx_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true); - m_p_tx_mem_buf_desc_list = NULL; + m_p_tx_mem_buf_desc_list = nullptr; } if (m_p_zc_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_zc_mem_buf_desc_list, true); - m_p_zc_mem_buf_desc_list = NULL; + m_p_zc_mem_buf_desc_list = nullptr; } dst_logdbg("releasing a ring"); m_p_net_dev_val->release_ring(m_ring_alloc_logic_tx.get_key()); - m_p_ring = NULL; + m_p_ring = nullptr; } ret_val = true; } @@ -455,7 +455,7 @@ bool dst_entry::conf_l2_hdr_and_snd_wqe_eth() // scratch if (m_p_send_wqe_handler) { delete m_p_send_wqe_handler; - m_p_send_wqe_handler = NULL; + m_p_send_wqe_handler = nullptr; } m_p_send_wqe_handler = new wqe_send_handler(); @@ -583,11 +583,11 @@ bool dst_entry::prepare_to_send(struct xlio_rate_limit_t &rate_limit, bool skip_ m_src_port, m_dst_port); if (m_p_tx_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true); - m_p_tx_mem_buf_desc_list = NULL; + m_p_tx_mem_buf_desc_list = nullptr; } if (m_p_zc_mem_buf_desc_list) { m_p_ring->mem_buf_tx_release(m_p_zc_mem_buf_desc_list, true); - m_p_zc_mem_buf_desc_list = NULL; + m_p_zc_mem_buf_desc_list = nullptr; } resolved = true; } @@ -680,7 +680,7 @@ void dst_entry::do_ring_migration_tx(lock_base &socket_lock, resource_allocation m_p_ring = new_ring; if (m_sge) { delete[] m_sge; - m_sge = NULL; + m_sge = nullptr; } m_sge = new (std::nothrow) struct ibv_sge[m_p_ring->get_max_send_sge()]; if (!m_sge) { @@ -691,9 +691,9 @@ void dst_entry::do_ring_migration_tx(lock_base &socket_lock, resource_allocation get_route_mtu() + (uint32_t)m_header->m_transport_header_len); mem_buf_desc_t *tmp_list = m_p_tx_mem_buf_desc_list; - m_p_tx_mem_buf_desc_list = NULL; + m_p_tx_mem_buf_desc_list = nullptr; mem_buf_desc_t *tmp_list_zc = m_p_zc_mem_buf_desc_list; - m_p_zc_mem_buf_desc_list = NULL; + m_p_zc_mem_buf_desc_list = nullptr; m_slow_path_lock.unlock(); socket_lock.unlock(); @@ -770,7 +770,7 @@ bool dst_entry::alloc_neigh_val(transport_type_t tranport) if (m_p_neigh_val) { delete m_p_neigh_val; - m_p_neigh_val = NULL; + m_p_neigh_val = nullptr; } switch (tranport) { @@ -789,25 +789,25 @@ void dst_entry::return_buffers_pool() { int count; - if (m_p_tx_mem_buf_desc_list == NULL && m_p_zc_mem_buf_desc_list == NULL) { + if (!m_p_tx_mem_buf_desc_list && !m_p_zc_mem_buf_desc_list) { return; } if 
(m_b_tx_mem_buf_desc_list_pending && m_p_ring) { - if (m_p_tx_mem_buf_desc_list != NULL) { + if (m_p_tx_mem_buf_desc_list) { count = m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true, true); if (count) { - m_p_tx_mem_buf_desc_list = NULL; + m_p_tx_mem_buf_desc_list = nullptr; } } - if (m_p_zc_mem_buf_desc_list != NULL) { + if (m_p_zc_mem_buf_desc_list) { count = m_p_ring->mem_buf_tx_release(m_p_zc_mem_buf_desc_list, true, true); if (count) { - m_p_zc_mem_buf_desc_list = NULL; + m_p_zc_mem_buf_desc_list = nullptr; } } } - set_tx_buff_list_pending(m_p_tx_mem_buf_desc_list != NULL || m_p_zc_mem_buf_desc_list != NULL); + set_tx_buff_list_pending(m_p_tx_mem_buf_desc_list || m_p_zc_mem_buf_desc_list); } int dst_entry::modify_ratelimit(struct xlio_rate_limit_t &rate_limit) diff --git a/src/core/proto/dst_entry.h b/src/core/proto/dst_entry.h index 46231afbc..5a214505f 100644 --- a/src/core/proto/dst_entry.h +++ b/src/core/proto/dst_entry.h @@ -85,7 +85,7 @@ class dst_entry : public cache_observer, public tostr { virtual ssize_t fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr) = 0; virtual ssize_t slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, struct xlio_rate_limit_t &rate_limit, int flags = 0, - socket_fd_api *sock = 0, tx_call_t call_type = TX_UNDEF) = 0; + socket_fd_api *sock = nullptr, tx_call_t call_type = TX_UNDEF) = 0; bool try_migrate_ring_tx(lock_base &socket_lock); diff --git a/src/core/proto/dst_entry_tcp.cpp b/src/core/proto/dst_entry_tcp.cpp index ad4cd966b..a1df9fb3f 100644 --- a/src/core/proto/dst_entry_tcp.cpp +++ b/src/core/proto/dst_entry_tcp.cpp @@ -71,7 +71,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ void *p_pkt; void *p_ip_hdr; void *p_tcp_hdr; - tcp_iovec *p_tcp_iov = NULL; + tcp_iovec *p_tcp_iov = nullptr; xlio_ibv_send_wr *p_send_wqe; size_t hdr_alignment_diff = 0; @@ -258,8 +258,8 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ mem_buf_desc_t *p_mem_buf_desc; size_t total_packet_len = 0; - p_mem_buf_desc = get_buffer(PBUF_RAM, NULL, is_set(attr.flags, XLIO_TX_PACKET_BLOCK)); - if (p_mem_buf_desc == NULL) { + p_mem_buf_desc = get_buffer(PBUF_RAM, nullptr, is_set(attr.flags, XLIO_TX_PACKET_BLOCK)); + if (!p_mem_buf_desc) { ret = -1; goto out; } @@ -300,7 +300,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ send_ring_buffer(m_id, p_send_wqe, attr.flags); } - if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + if (unlikely(!m_p_tx_mem_buf_desc_list)) { m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get( m_id, is_set(attr.flags, XLIO_TX_PACKET_BLOCK), PBUF_RAM, m_n_sysvar_tx_bufs_batch_tcp); } @@ -385,24 +385,24 @@ mem_buf_desc_t *dst_entry_tcp::get_buffer(pbuf_type type, pbuf_desc *desc, p_desc_list = type == PBUF_ZEROCOPY ? &m_p_zc_mem_buf_desc_list : &m_p_tx_mem_buf_desc_list; // Get a bunch of tx buf descriptor and data buffers - if (unlikely(*p_desc_list == NULL)) { + if (unlikely(!*p_desc_list)) { *p_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, type, m_n_sysvar_tx_bufs_batch_tcp); } mem_buf_desc_t *p_mem_buf_desc = *p_desc_list; - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { dst_tcp_logfunc("silent packet drop, no buffers!"); } else { *p_desc_list = (*p_desc_list)->p_next_desc; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; // for TX, set lwip payload to the data segment. 
// lwip will send it with payload pointing to the tcp header. if (p_mem_buf_desc->p_buffer) { p_mem_buf_desc->lwip_pbuf.pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + m_header->m_aligned_l2_l3_len + sizeof(struct tcphdr); } else { - p_mem_buf_desc->lwip_pbuf.pbuf.payload = NULL; + p_mem_buf_desc->lwip_pbuf.pbuf.payload = nullptr; } /* Initialize pbuf description */ @@ -430,7 +430,7 @@ void dst_entry_tcp::put_buffer(mem_buf_desc_t *p_desc) { // todo accumulate buffers? - if (unlikely(p_desc == NULL)) { + if (unlikely(!p_desc)) { return; } @@ -446,7 +446,7 @@ void dst_entry_tcp::put_buffer(mem_buf_desc_t *p_desc) } if (p_desc->lwip_pbuf.pbuf.ref == 0) { - p_desc->p_next_desc = NULL; + p_desc->p_next_desc = nullptr; buffer_pool::free_tx_lwip_pbuf_custom(&p_desc->lwip_pbuf.pbuf); } } diff --git a/src/core/proto/dst_entry_tcp.h b/src/core/proto/dst_entry_tcp.h index d4d2aaac2..2615c2abe 100644 --- a/src/core/proto/dst_entry_tcp.h +++ b/src/core/proto/dst_entry_tcp.h @@ -50,8 +50,8 @@ class dst_entry_tcp : public dst_entry { ssize_t fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr); ssize_t slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, - struct xlio_rate_limit_t &rate_limit, int flags = 0, socket_fd_api *sock = 0, - tx_call_t call_type = TX_UNDEF); + struct xlio_rate_limit_t &rate_limit, int flags = 0, + socket_fd_api *sock = nullptr, tx_call_t call_type = TX_UNDEF); ssize_t slow_send_neigh(const iovec *p_iov, size_t sz_iov, struct xlio_rate_limit_t &rate_limit); diff --git a/src/core/proto/dst_entry_udp.cpp b/src/core/proto/dst_entry_udp.cpp index d70e4f64b..a45e2069a 100644 --- a/src/core/proto/dst_entry_udp.cpp +++ b/src/core/proto/dst_entry_udp.cpp @@ -176,7 +176,7 @@ bool dst_entry_udp::fast_send_fragmented_ipv6(mem_buf_desc_t *p_mem_buf_desc, co n_ip_frag_offset, ntohl(packet_id)); tmp = p_mem_buf_desc->p_next_desc; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; // We don't check the return valuse of post send when we reach the HW we consider that we // completed our job @@ -206,11 +206,11 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const bool b_blocked = is_set(attr, XLIO_TX_PACKET_BLOCK); // Get a bunch of tx buf descriptor and data buffers - if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + if (unlikely(!m_p_tx_mem_buf_desc_list)) { m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, PBUF_RAM, m_n_sysvar_tx_bufs_batch_udp); - if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + if (unlikely(!m_p_tx_mem_buf_desc_list)) { if (b_blocked) { dst_udp_logdbg("Error when blocking for next tx buffer (errno=%d %m)", errno); } else { @@ -227,7 +227,7 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const // Disconnect the first buffer from the list p_mem_buf_desc = m_p_tx_mem_buf_desc_list; m_p_tx_mem_buf_desc_list = m_p_tx_mem_buf_desc_list->p_next_desc; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; set_tx_buff_list_pending(false); @@ -307,7 +307,7 @@ inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec *p_iov, const send_ring_buffer(m_id, p_send_wqe, attr); // request tx buffers for the next packets - if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + if (unlikely(!m_p_tx_mem_buf_desc_list)) { m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, PBUF_RAM, m_n_sysvar_tx_bufs_batch_udp); } @@ -405,7 +405,7 @@ inline bool 
dst_entry_udp::fast_send_fragmented_ipv4(mem_buf_desc_t *p_mem_buf_d n_ip_frag_offset, ntohs(packet_id)); tmp = p_mem_buf_desc->p_next_desc; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; // We don't check the return valuse of post send when we reach the HW we consider that we // completed our job @@ -444,7 +444,7 @@ ssize_t dst_entry_udp::fast_send_fragmented(const iovec *p_iov, const ssize_t sz mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, b_blocked, PBUF_RAM, n_num_frags); - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { if (b_blocked) { dst_udp_logdbg("Error when blocking for next tx buffer (errno=%d %m)", errno); } else { diff --git a/src/core/proto/dst_entry_udp.h b/src/core/proto/dst_entry_udp.h index 9b9c7724f..0fc183745 100644 --- a/src/core/proto/dst_entry_udp.h +++ b/src/core/proto/dst_entry_udp.h @@ -43,8 +43,8 @@ class dst_entry_udp : public dst_entry { ssize_t fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr); ssize_t slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, - struct xlio_rate_limit_t &rate_limit, int flags = 0, socket_fd_api *sock = 0, - tx_call_t call_type = TX_UNDEF); + struct xlio_rate_limit_t &rate_limit, int flags = 0, + socket_fd_api *sock = nullptr, tx_call_t call_type = TX_UNDEF); static bool fast_send_fragmented_ipv6(mem_buf_desc_t *p_mem_buf_desc, const iovec *p_iov, const ssize_t sz_iov, xlio_wr_tx_packet_attr attr, size_t sz_udp_payload, int n_num_frags, diff --git a/src/core/proto/dst_entry_udp_mc.cpp b/src/core/proto/dst_entry_udp_mc.cpp index 71aee9fb7..396dce28c 100644 --- a/src/core/proto/dst_entry_udp_mc.cpp +++ b/src/core/proto/dst_entry_udp_mc.cpp @@ -84,10 +84,10 @@ bool dst_entry_udp_mc::resolve_net_dev(bool is_connect) { NOT_IN_USE(is_connect); bool ret_val = false; - cache_entry_subject *net_dev_entry = NULL; + cache_entry_subject *net_dev_entry = nullptr; if (!m_mc_tx_src_ip.is_anyaddr() && !m_mc_tx_src_ip.is_mc(m_family)) { - if (m_p_net_dev_entry == NULL) { + if (!m_p_net_dev_entry) { net_device_val *mc_net_dev = g_p_net_device_table_mgr->get_net_device_val(ip_addr(m_mc_tx_src_ip, m_family)); if (mc_net_dev) { diff --git a/src/core/proto/ip_frag.cpp b/src/core/proto/ip_frag.cpp index dc151dabf..5fda4b391 100644 --- a/src/core/proto/ip_frag.cpp +++ b/src/core/proto/ip_frag.cpp @@ -87,14 +87,14 @@ static int g_ip_frag_count_check = 0; #define PRINT_STATISTICS() #endif -ip_frag_manager *g_p_ip_frag_manager = NULL; +ip_frag_manager *g_p_ip_frag_manager = nullptr; -ip_frag_hole_desc *hole_base = NULL; -ip_frag_hole_desc *hole_free_list_head = NULL; +ip_frag_hole_desc *hole_base = nullptr; +ip_frag_hole_desc *hole_free_list_head = nullptr; int hole_free_list_count = 0; -ip_frag_desc *desc_base = NULL; -ip_frag_desc *desc_free_list_head = NULL; +ip_frag_desc *desc_base = nullptr; +ip_frag_desc *desc_free_list_head = nullptr; int desc_free_list_count = 0; ip_frag_manager::ip_frag_manager() @@ -209,7 +209,7 @@ ip_frag_hole_desc *ip_frag_manager::alloc_hole_desc() struct ip_frag_hole_desc *ret; ret = hole_free_list_head; if (!ret) { - return NULL; + return nullptr; } // unlink from hole's free list @@ -217,9 +217,9 @@ ip_frag_hole_desc *ip_frag_manager::alloc_hole_desc() hole_free_list_count--; // clear hole struct - ret->data_first = 0; - ret->data_last = 0; - ret->next = 0; + ret->data_first = nullptr; + ret->data_last = nullptr; + ret->next = nullptr; return ret; } @@ -236,14 +236,14 @@ ip_frag_desc_t 
*ip_frag_manager::alloc_frag_desc() ip_frag_desc_t *ret; ret = desc_free_list_head; if (!ret) { - return NULL; + return nullptr; } // unlink from hole's free list desc_free_list_head = ret->next; --desc_free_list_count; - ret->next = 0; + ret->next = nullptr; return ret; } @@ -276,13 +276,13 @@ void ip_frag_manager::destroy_frag_desc(ip_frag_desc_t *desc) */ ip_frag_desc_t *ip_frag_manager::new_frag_desc(ip_frag_key_t &key) { - ip_frag_desc_t *desc = NULL; - struct ip_frag_hole_desc *hole = NULL; + ip_frag_desc_t *desc = nullptr; + struct ip_frag_hole_desc *hole = nullptr; hole = alloc_hole_desc(); if (!hole) { frag_dbg("NULL hole"); - return NULL; + return nullptr; } hole->first = IP_FRAG_NINF; hole->last = IP_FRAG_INF; @@ -291,10 +291,10 @@ ip_frag_desc_t *ip_frag_manager::new_frag_desc(ip_frag_key_t &key) if (!desc) { frag_dbg("NULL desc"); free_hole_desc(hole); - return NULL; + return nullptr; } desc->ttl = IP_FRAG_TTL; - desc->frag_list = 0; + desc->frag_list = nullptr; desc->hole_list = hole; desc->frag_counter = m_frag_counter; @@ -382,7 +382,7 @@ int ip_frag_manager::add_frag(iphdr *hdr, mem_buf_desc_t *frag, mem_buf_desc_t * frag_dbg("> old fragmented packet"); } } - if (desc == NULL) { + if (!desc) { MEMBUF_DEBUG_REF_DEC(frag); PRINT_STATISTICS(); unlock(); @@ -393,7 +393,7 @@ int ip_frag_manager::add_frag(iphdr *hdr, mem_buf_desc_t *frag, mem_buf_desc_t * /* 8 step reassembly algorithm as described in RFC 815 */ // step 1 - phole_prev = 0; + phole_prev = nullptr; phole = desc->hole_list; while (phole) { // step 2 and step 3 @@ -501,7 +501,7 @@ int ip_frag_manager::add_frag(iphdr *hdr, mem_buf_desc_t *frag, mem_buf_desc_t * } frag_dbg("> need more packets"); - *ret = NULL; + *ret = nullptr; PRINT_STATISTICS(); unlock(); return 0; diff --git a/src/core/proto/mapping.cpp b/src/core/proto/mapping.cpp index 566d71763..68d36f417 100644 --- a/src/core/proto/mapping.cpp +++ b/src/core/proto/mapping.cpp @@ -56,7 +56,7 @@ #define map_logdbg_exit __log_exit_dbg #define map_logfunc_exit __log_exit_func -mapping_cache *g_zc_cache = NULL; +mapping_cache *g_zc_cache = nullptr; mapping_t::mapping_t(file_uid_t &uid, mapping_cache *cache, ib_ctx_handler *p_ib_ctx) : m_registrator() @@ -64,7 +64,7 @@ mapping_t::mapping_t(file_uid_t &uid, mapping_cache *cache, ib_ctx_handler *p_ib m_state = MAPPING_STATE_UNMAPPED; m_fd = -1; m_uid = uid; - m_addr = NULL; + m_addr = nullptr; m_size = 0; m_ref = 0; m_owners = 0; @@ -125,8 +125,8 @@ int mapping_t::map(int fd) * performance results. For now, use only MAP_PRIVATE mappings. */ flags = /* rw ? 
MAP_SHARED :*/ MAP_PRIVATE; - m_addr = - mmap64(NULL, m_size, PROT_WRITE | PROT_READ, flags | MAP_NORESERVE | MAP_POPULATE, m_fd, 0); + m_addr = mmap64(nullptr, m_size, PROT_WRITE | PROT_READ, flags | MAP_NORESERVE | MAP_POPULATE, + m_fd, 0); if (MAP_FAILED == m_addr) { map_logerr("mmap64() errno=%d (%s)", errno, strerror(errno)); goto failed_close_fd; @@ -147,7 +147,7 @@ int mapping_t::map(int fd) (void)munmap(m_addr, m_size); failed_close_fd: SYSCALL(close, m_fd); - m_addr = NULL; + m_addr = nullptr; m_size = 0; m_fd = -1; failed: @@ -173,7 +173,7 @@ int mapping_t::unmap(void) p_cache->memory_free(m_size); SYSCALL(close, m_fd); m_fd = -1; - m_addr = NULL; + m_addr = nullptr; m_size = 0; m_state = MAPPING_STATE_UNMAPPED; @@ -295,7 +295,7 @@ mapping_cache::~mapping_cache() mapping_t *mapping_cache::get_mapping(int local_fd, void *p_ctx) { - mapping_t *mapping = NULL; + mapping_t *mapping = nullptr; mapping_fd_map_iter_t iter; file_uid_t uid; struct stat st; @@ -311,7 +311,7 @@ mapping_t *mapping_cache::get_mapping(int local_fd, void *p_ctx) } } - if (mapping == NULL) { + if (!mapping) { if (fstat(local_fd, &st) != 0) { map_logerr("fstat() errno=%d (%s)", errno, strerror(errno)); goto quit; @@ -324,7 +324,7 @@ mapping_t *mapping_cache::get_mapping(int local_fd, void *p_ctx) } quit: - if (mapping != NULL) { + if (mapping) { mapping->get(); /* Mapping object may be unmapped, call mmap() in this case */ @@ -335,9 +335,9 @@ mapping_t *mapping_cache::get_mapping(int local_fd, void *p_ctx) unlock(); - if (mapping != NULL && mapping->m_state == MAPPING_STATE_FAILED) { + if (mapping && mapping->m_state == MAPPING_STATE_FAILED) { mapping->put(); - mapping = NULL; + mapping = nullptr; } return mapping; } @@ -403,7 +403,7 @@ void mapping_cache::memory_free(size_t size) mapping_t *mapping_cache::get_mapping_by_uid_unlocked(file_uid_t &uid, ib_ctx_handler *p_ib_ctx) { - mapping_t *mapping = NULL; + mapping_t *mapping = nullptr; mapping_uid_map_iter_t iter; iter = m_cache_uid.find(uid); @@ -414,9 +414,9 @@ mapping_t *mapping_cache::get_mapping_by_uid_unlocked(file_uid_t &uid, ib_ctx_ha } } - if (mapping == NULL) { + if (!mapping) { mapping = new (std::nothrow) mapping_t(uid, this, p_ib_ctx); - if (mapping != NULL) { + if (mapping) { m_cache_uid[uid] = mapping; } } diff --git a/src/core/proto/mapping.h b/src/core/proto/mapping.h index e99dd18f7..14ede8caa 100644 --- a/src/core/proto/mapping.h +++ b/src/core/proto/mapping.h @@ -127,7 +127,7 @@ class mapping_cache : public lock_spin { mapping_cache(size_t threshold); ~mapping_cache(); - mapping_t *get_mapping(int local_fd, void *p_ctx = NULL); + mapping_t *get_mapping(int local_fd, void *p_ctx = nullptr); void release_mapping(mapping_t *mapping); void handle_close(int local_fd); @@ -137,7 +137,7 @@ class mapping_cache : public lock_spin { struct mapping_cache_stats m_stats; private: - mapping_t *get_mapping_by_uid_unlocked(file_uid_t &uid, ib_ctx_handler *p_ib_ctx = NULL); + mapping_t *get_mapping_by_uid_unlocked(file_uid_t &uid, ib_ctx_handler *p_ib_ctx = nullptr); void evict_mapping_unlocked(mapping_t *mapping); bool cache_evict_unlocked(size_t toFree); diff --git a/src/core/proto/mem_buf_desc.h b/src/core/proto/mem_buf_desc.h index d0330a0d6..87d47fc87 100644 --- a/src/core/proto/mem_buf_desc.h +++ b/src/core/proto/mem_buf_desc.h @@ -72,11 +72,11 @@ class mem_buf_desc_t { : p_buffer(buffer) , m_flags(mem_buf_desc_t::TYPICAL) , lkey(0) - , p_next_desc(0) - , p_prev_desc(0) + , p_next_desc(nullptr) + , p_prev_desc(nullptr) , sz_buffer(size) , sz_data(0) - 
, p_desc_owner(0) + , p_desc_owner(nullptr) { memset(&lwip_pbuf, 0, sizeof(lwip_pbuf)); diff --git a/src/core/proto/mem_desc.h b/src/core/proto/mem_desc.h index 682705c4c..bbc16ecc2 100644 --- a/src/core/proto/mem_desc.h +++ b/src/core/proto/mem_desc.h @@ -194,7 +194,7 @@ class zcopy_hugepage_mgr : public lock_spin { page = iter->second; } else { page = new zcopy_hugepage(page_addr, m_hugepage_size); - if (likely(page != NULL)) { + if (likely(page)) { m_hugepage_map[page_addr] = page; } } diff --git a/src/core/proto/neighbour.cpp b/src/core/proto/neighbour.cpp index 5bcb9e662..d152aec92 100644 --- a/src/core/proto/neighbour.cpp +++ b/src/core/proto/neighbour.cpp @@ -159,18 +159,18 @@ inline int neigh_eth::build_uc_neigh_val() neigh_entry::neigh_entry(neigh_key key, transport_type_t _type, bool is_init_resources) : cache_entry_subject(key) - , m_cma_id(NULL) + , m_cma_id(nullptr) , m_src_addr(in6addr_any) , m_rdma_port_space((enum rdma_port_space)0) - , m_state_machine(NULL) + , m_state_machine(nullptr) , m_type(UNKNOWN) , m_trans_type(_type) , m_state(false) , m_err_counter(0) - , m_timer_handle(NULL) + , m_timer_handle(nullptr) , m_arp_counter(0) , m_p_dev(key.get_net_device_val()) - , m_p_ring(NULL) + , m_p_ring(nullptr) , m_is_loopback(false) , m_to_str(std::string(priv_xlio_transport_type_str(m_trans_type)) + ":" + get_key().to_str()) , m_id(0) @@ -183,7 +183,7 @@ neigh_entry::neigh_entry(neigh_key key, transport_type_t _type, bool is_init_res m_val = NULL; BULLSEYE_EXCLUDE_BLOCK_START - if (m_p_dev == NULL) { + if (!m_p_dev) { neigh_logpanic("get_net_dev return NULL"); } @@ -225,7 +225,7 @@ neigh_entry::neigh_entry(neigh_key key, transport_type_t _type, bool is_init_res if (is_init_resources) { m_p_ring = m_p_dev->reserve_ring(m_ring_allocation_logic.get_key()); - if (m_p_ring == NULL) { + if (!m_p_ring) { neigh_logpanic("reserve_ring return NULL"); } m_id = m_p_ring->generate_id(); @@ -244,11 +244,11 @@ neigh_entry::~neigh_entry() if (m_state_machine) { delete m_state_machine; - m_state_machine = NULL; + m_state_machine = nullptr; } if (m_p_dev && m_p_ring) { m_p_dev->release_ring(m_ring_allocation_logic.get_key()); - m_p_ring = NULL; + m_p_ring = nullptr; } if (m_val) { delete m_val; @@ -261,7 +261,7 @@ neigh_entry::~neigh_entry() bool neigh_entry::is_deletable() { - if (m_state_machine == NULL) { + if (!m_state_machine) { return true; } @@ -282,7 +282,7 @@ void neigh_entry::clean_obj() m_lock.lock(); set_cleaned(); - m_timer_handle = NULL; + m_timer_handle = nullptr; if (g_p_event_handler_manager->is_running()) { g_p_event_handler_manager->unregister_timers_event_and_delete(this); m_lock.unlock(); @@ -335,7 +335,7 @@ void neigh_entry::handle_timer_expired(void *ctx) neigh_logdbg("Timeout expired!"); // Clear Timer Handler - m_timer_handle = NULL; + m_timer_handle = nullptr; m_sm_lock.lock(); int sm_state = m_state_machine->get_curr_state(); @@ -447,7 +447,7 @@ bool neigh_entry::post_send_udp_ipv4(neigh_send_data *n_send_data) // Find number of ip fragments (-> packets, buffers, buffer descs...) 
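Note: the fragment-count computation in the UDP send paths that follow uses plain ceiling division, as in the IPv6 fragmented path:

    // Mirrors the expression used in post_send_udp_ipv6_fragmented()
    int n_num_frags = (sz_udp_payload + max_payload_size_per_packet - 1) / max_payload_size_per_packet;

For example, a 9000-byte UDP payload with 1440 bytes of payload per fragment gives (9000 + 1439) / 1440 = 7 fragments; these numbers are illustrative only, since the real per-fragment payload depends on the path MTU and header lengths.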
neigh_logdbg("ENTER post_send_udp_ipv4"); int n_num_frags = 1; - mem_buf_desc_t *p_mem_buf_desc, *tmp = NULL; + mem_buf_desc_t *p_mem_buf_desc, *tmp = nullptr; void *p_pkt; void *p_ip_hdr; void *p_udp_hdr; @@ -475,7 +475,7 @@ bool neigh_entry::post_send_udp_ipv4(neigh_send_data *n_send_data) // Get all needed tx buf descriptor and data buffers p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM, n_num_frags); - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { neigh_logdbg("Packet dropped. not enough tx buffers"); return false; } @@ -553,7 +553,7 @@ bool neigh_entry::post_send_udp_ipv4(neigh_send_data *n_send_data) NOT_IN_USE(id); // Fix unused-but-set error when bebug logs are disabled tmp = p_mem_buf_desc->p_next_desc; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; // We don't check the return value of post send when we reach the HW we consider that we // completed our job @@ -582,7 +582,7 @@ bool neigh_entry::post_send_udp_ipv6_fragmented(neigh_send_data *n_send_data, si (sz_udp_payload + max_payload_size_per_packet - 1) / max_payload_size_per_packet; mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM, n_num_frags); - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { neigh_logdbg("Packet dropped. not enough tx buffers"); return false; } @@ -597,7 +597,7 @@ bool neigh_entry::post_send_udp_ipv6_not_fragmented(neigh_send_data *n_send_data { neigh_logdbg("ENTER post_send_udp_ipv6_not_fragmented"); mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM); - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { neigh_logdbg("Packet dropped. not enough tx buffers"); return false; } @@ -672,14 +672,14 @@ bool neigh_entry::post_send_tcp(neigh_send_data *p_data) p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM, 1); BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { neigh_logdbg("Packet dropped. not enough tx buffers"); return false; } BULLSEYE_EXCLUDE_BLOCK_END p_mem_buf_desc->lwip_pbuf.pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + h->m_total_hdr_len; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; // copy L4 neigh buffer to tx buffer memcpy((void *)(p_mem_buf_desc->p_buffer + h->m_aligned_l2_l3_len), p_data->m_iov.iov_base, @@ -753,7 +753,7 @@ bool neigh_entry::get_peer_info(neigh_val *p_val) { neigh_logfunc("calling neigh_entry get_peer_info. 
state = %d", m_state); BULLSEYE_EXCLUDE_BLOCK_START - if (p_val == NULL) { + if (!p_val) { neigh_logdbg("p_val is NULL, return false"); return false; } @@ -814,7 +814,7 @@ void neigh_entry::handle_neigh_event(neigh_nl_event *nl_ev) case NUD_REACHABLE: case NUD_PERMANENT: { BULLSEYE_EXCLUDE_BLOCK_START - if (m_state_machine == NULL) { + if (!m_state_machine) { neigh_logerr("m_state_machine: not a valid case"); break; } @@ -844,7 +844,7 @@ void neigh_entry::handle_neigh_event(neigh_nl_event *nl_ev) case NUD_STALE: { BULLSEYE_EXCLUDE_BLOCK_START - if (m_state_machine == NULL) { + if (!m_state_machine) { neigh_logerr("m_state_machine: not a valid case"); break; } @@ -961,7 +961,7 @@ neigh_entry::event_t neigh_entry::rdma_event_mapping(struct rdma_cm_event *p_rdm { // General check of cma_id BULLSEYE_EXCLUDE_BLOCK_START - if (m_cma_id != NULL && m_cma_id != p_rdma_cm_event->id) { + if (m_cma_id && m_cma_id != p_rdma_cm_event->id) { neigh_logerr("cma_id %p != event->cma_id %p", m_cma_id, p_rdma_cm_event->id); return EV_UNHANDLED; } @@ -1165,7 +1165,7 @@ int neigh_entry::priv_enter_init_resolution() sock_addr src_sa(get_family(), &m_src_addr, 0); /* we had issues passing unicast src addr, let it find the correct one itself */ - sockaddr *src_p_sa = get_dst_addr().is_mc() ? src_sa.get_p_sa() : NULL; + sockaddr *src_p_sa = get_dst_addr().is_mc() ? src_sa.get_p_sa() : nullptr; int timeout_ms = RESOLVE_TIMEOUT_MS; if (get_family() == AF_INET6 && @@ -1378,14 +1378,14 @@ void neigh_entry::priv_destroy_cma_id() neigh_logdbg("Failed in rdma_destroy_id (errno=%d %m)", errno); } ENDIF_RDMACM_FAILURE; - m_cma_id = NULL; + m_cma_id = nullptr; } } void *neigh_entry::priv_register_timer_event(int timeout_msec, timer_handler *handler, timer_req_type_t req_type, void *user_data) { - void *_timer_handler = NULL; + void *_timer_handler = nullptr; std::lock_guard lock(m_lock); if (!is_cleaned()) { _timer_handler = g_p_event_handler_manager->register_timer_event(timeout_msec, handler, @@ -1402,7 +1402,7 @@ void neigh_entry::priv_unregister_timer() // as ONESHOT timer free itself after it run. // TODO: unregister all timers? is there just one or more? 
// g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; + m_timer_handle = nullptr; } } //============================================================== neigh_eth @@ -1426,22 +1426,22 @@ neigh_eth::neigh_eth(neigh_key key) sm_short_table_line_t short_sm_table[] = { // {curr state, event, next state, action func } - {ST_NOT_ACTIVE, EV_KICK_START, ST_INIT, NULL}, - {ST_NOT_ACTIVE, EV_ARP_RESOLVED, ST_READY, NULL}, - {ST_ERROR, EV_KICK_START, ST_INIT, NULL}, - {ST_INIT, EV_ARP_RESOLVED, ST_READY, NULL}, - {ST_INIT, EV_START_RESOLUTION, ST_INIT_RESOLUTION, NULL}, - {ST_INIT_RESOLUTION, EV_RDMA_RESOLVE_FAILED, ST_SOLICIT_SEND, NULL}, - {ST_INIT_RESOLUTION, EV_ADDR_RESOLVED, ST_ADDR_RESOLVED, NULL}, - {ST_INIT_RESOLUTION, EV_ARP_RESOLVED, ST_READY, NULL}, - {ST_ADDR_RESOLVED, EV_ARP_RESOLVED, ST_READY, NULL}, - {ST_SOLICIT_SEND, EV_ARP_RESOLVED, ST_READY, NULL}, - {ST_SOLICIT_SEND, EV_TIMEOUT_EXPIRED, ST_ERROR, NULL}, - {ST_SOLICIT_SEND, EV_ERROR, ST_ERROR, NULL}, - {ST_READY, EV_ERROR, ST_ERROR, NULL}, - {ST_INIT, EV_ERROR, ST_ERROR, NULL}, - {ST_INIT_RESOLUTION, EV_ERROR, ST_ERROR, NULL}, - {ST_ERROR, EV_ERROR, ST_NOT_ACTIVE, NULL}, + {ST_NOT_ACTIVE, EV_KICK_START, ST_INIT, nullptr}, + {ST_NOT_ACTIVE, EV_ARP_RESOLVED, ST_READY, nullptr}, + {ST_ERROR, EV_KICK_START, ST_INIT, nullptr}, + {ST_INIT, EV_ARP_RESOLVED, ST_READY, nullptr}, + {ST_INIT, EV_START_RESOLUTION, ST_INIT_RESOLUTION, nullptr}, + {ST_INIT_RESOLUTION, EV_RDMA_RESOLVE_FAILED, ST_SOLICIT_SEND, nullptr}, + {ST_INIT_RESOLUTION, EV_ADDR_RESOLVED, ST_ADDR_RESOLVED, nullptr}, + {ST_INIT_RESOLUTION, EV_ARP_RESOLVED, ST_READY, nullptr}, + {ST_ADDR_RESOLVED, EV_ARP_RESOLVED, ST_READY, nullptr}, + {ST_SOLICIT_SEND, EV_ARP_RESOLVED, ST_READY, nullptr}, + {ST_SOLICIT_SEND, EV_TIMEOUT_EXPIRED, ST_ERROR, nullptr}, + {ST_SOLICIT_SEND, EV_ERROR, ST_ERROR, nullptr}, + {ST_READY, EV_ERROR, ST_ERROR, nullptr}, + {ST_INIT, EV_ERROR, ST_ERROR, nullptr}, + {ST_INIT_RESOLUTION, EV_ERROR, ST_ERROR, nullptr}, + {ST_ERROR, EV_ERROR, ST_NOT_ACTIVE, nullptr}, // Entry functions {ST_INIT, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_init}, {ST_INIT_RESOLUTION, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_init_resolution}, @@ -1459,13 +1459,13 @@ neigh_eth::neigh_eth(neigh_key key) EV_LAST, // max events short_sm_table, // short table general_st_entry, // default entry function - NULL, // default leave function - NULL, // default func + nullptr, // default leave function + nullptr, // default func print_event_info // debug function ); BULLSEYE_EXCLUDE_BLOCK_START - if (m_state_machine == NULL) { + if (!m_state_machine) { neigh_logpanic("Failed allocating state_machine"); } BULLSEYE_EXCLUDE_BLOCK_END @@ -1601,7 +1601,7 @@ bool neigh_eth::send_arp_request(bool is_broadcast) net_device_val_eth *netdevice_eth = dynamic_cast(m_p_dev); BULLSEYE_EXCLUDE_BLOCK_START - if (netdevice_eth == NULL) { + if (!netdevice_eth) { neigh_logdbg("Net dev is NULL not sending ARP"); return false; } @@ -1616,7 +1616,7 @@ bool neigh_eth::send_arp_request(bool is_broadcast) const unsigned char *peer_mac = dst->get_address(); BULLSEYE_EXCLUDE_BLOCK_START - if (src == NULL || dst == NULL) { + if (!src || !dst) { neigh_logdbg("src or dst is NULL not sending ARP"); return false; } @@ -1628,7 +1628,7 @@ bool neigh_eth::send_arp_request(bool is_broadcast) 0, 0); mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM, 1); BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(p_mem_buf_desc == NULL)) { + if 
(unlikely(!p_mem_buf_desc)) { neigh_logdbg("No free TX buffer, not sending ARP"); return false; } @@ -1655,7 +1655,7 @@ bool neigh_eth::send_arp_request(bool is_broadcast) m_sge.addr = (uintptr_t)(p_mem_buf_desc->p_buffer + (uint8_t)h.m_transport_header_tx_offset); m_sge.length = sizeof(eth_arp_hdr) + h.m_total_hdr_len; m_sge.lkey = p_mem_buf_desc->lkey; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; m_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc; m_p_ring->send_ring_buffer(m_id, &m_send_wqe, (xlio_wr_tx_packet_attr)0); @@ -1671,7 +1671,7 @@ bool neigh_eth::send_neighbor_solicitation() net_device_val_eth *net_dev = dynamic_cast(m_p_dev); BULLSEYE_EXCLUDE_BLOCK_START - if (net_dev == nullptr) { + if (!net_dev) { neigh_logdbg("Net device is unavailable - not sending NS"); return false; } @@ -1679,7 +1679,7 @@ bool neigh_eth::send_neighbor_solicitation() const L2_address *src_mac = m_p_dev->get_l2_address(); BULLSEYE_EXCLUDE_BLOCK_START - if (src_mac == nullptr) { + if (!src_mac) { neigh_logdbg("Source MAC address is unavailable - not sending NS"); return false; } @@ -1715,7 +1715,7 @@ bool neigh_eth::send_neighbor_solicitation() htons(ETH_P_IPV6), m_src_addr, dst_snm, 0, 0); mem_buf_desc_t *p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, PBUF_RAM, 1); BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely(p_mem_buf_desc == NULL)) { + if (unlikely(!p_mem_buf_desc)) { neigh_logdbg("No free TX buffer - not sending NS"); return false; } @@ -1787,7 +1787,7 @@ bool neigh_eth::send_neighbor_solicitation() m_sge.addr = reinterpret_cast(head); m_sge.length = static_cast(tail - head); m_sge.lkey = p_mem_buf_desc->lkey; - p_mem_buf_desc->p_next_desc = NULL; + p_mem_buf_desc->p_next_desc = nullptr; m_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc; neigh_logdbg("NS request: base=%p addr=%p length=%" PRIu32, p_mem_buf_desc->p_buffer, (void *)m_sge.addr, m_sge.length); diff --git a/src/core/proto/neighbour.h b/src/core/proto/neighbour.h index b50fb79f0..6e9353de2 100644 --- a/src/core/proto/neighbour.h +++ b/src/core/proto/neighbour.h @@ -108,7 +108,7 @@ class neigh_val { public: neigh_val() : m_trans_type(XLIO_TRANSPORT_UNKNOWN) - , m_l2_address(NULL) {}; + , m_l2_address(nullptr) {}; virtual ~neigh_val() {}; virtual void zero_all_members() @@ -116,7 +116,7 @@ class neigh_val { if (m_l2_address) { delete m_l2_address; } - m_l2_address = NULL; + m_l2_address = nullptr; }; const L2_address *get_l2_address() const { return m_l2_address; }; @@ -275,8 +275,8 @@ class neigh_entry : public cache_entry_subject, bool priv_is_reachable(int state) { return state & (NUD_REACHABLE | NUD_PERMANENT); } bool priv_is_failed(int state) { return state & (NUD_FAILED | NUD_INCOMPLETE); } - void event_handler(event_t event, void *p_event_info = NULL); - void priv_event_handler_no_locks(event_t event, void *p_event_info = NULL); + void event_handler(event_t event, void *p_event_info = nullptr); + void priv_event_handler_no_locks(event_t event, void *p_event_info = nullptr); virtual bool priv_handle_neigh_is_l2_changed(address_t) { return false; }; void priv_handle_neigh_reachable_event(); @@ -291,7 +291,7 @@ class neigh_entry : public cache_entry_subject, virtual bool prepare_to_send_packet(neigh_send_data *) { return true; }; void handle_timer_expired(void *user_data) override; - virtual ring_user_id_t generate_ring_user_id(header *h = NULL) + virtual ring_user_id_t generate_ring_user_id(header *h = nullptr) { NOT_IN_USE(h); return m_p_ring->generate_id(); @@ -328,7 +328,7 @@ class neigh_eth : public 
neigh_entry { bool is_deletable() override; protected: - ring_user_id_t generate_ring_user_id(header *h = NULL) override; + ring_user_id_t generate_ring_user_id(header *h = nullptr) override; private: int build_mc_neigh_val(); diff --git a/src/core/proto/neighbour_table_mgr.cpp b/src/core/proto/neighbour_table_mgr.cpp index e592cf019..73d1697c1 100644 --- a/src/core/proto/neighbour_table_mgr.cpp +++ b/src/core/proto/neighbour_table_mgr.cpp @@ -50,7 +50,7 @@ #define neigh_mgr_logfunc __log_func #define neigh_mgr_logfuncall __log_funcall -neigh_table_mgr *g_p_neigh_table_mgr = NULL; +neigh_table_mgr *g_p_neigh_table_mgr = nullptr; #define DEFAULT_GARBAGE_COLLECTOR_TIME 100000 @@ -110,7 +110,7 @@ neigh_entry *neigh_table_mgr::create_new_entry(neigh_key neigh_key, const observ return (new neigh_eth(neigh_key)); } else { neigh_mgr_logdbg("Cannot create new entry, transport type is UNKNOWN"); - return NULL; + return nullptr; } } @@ -121,7 +121,7 @@ void neigh_table_mgr::notify_cb(event *ev) neigh_nl_event *nl_ev = dynamic_cast(ev); BULLSEYE_EXCLUDE_BLOCK_START - if (nl_ev == NULL) { + if (!nl_ev) { neigh_mgr_logdbg("Non neigh_nl_event type"); return; } diff --git a/src/core/proto/netlink_socket_mgr.cpp b/src/core/proto/netlink_socket_mgr.cpp index 87f468908..e5adc5a57 100644 --- a/src/core/proto/netlink_socket_mgr.cpp +++ b/src/core/proto/netlink_socket_mgr.cpp @@ -161,7 +161,7 @@ int netlink_socket_mgr::recv_info(int sockfd, uint32_t pid, uint32_t seq, char * // Update data in a table void netlink_socket_mgr::update_tbl(nl_data_t data_type) { - struct nlmsghdr *nl_msg = NULL; + struct nlmsghdr *nl_msg = nullptr; char *buf; int len = 0; diff --git a/src/core/proto/nvme_parse_input_args.h b/src/core/proto/nvme_parse_input_args.h index f43aa37d0..7f2e30c28 100644 --- a/src/core/proto/nvme_parse_input_args.h +++ b/src/core/proto/nvme_parse_input_args.h @@ -63,7 +63,7 @@ class nvme_pdu_mdesc : public mem_desc { auto this_addr = reinterpret_cast(aligned_alloc( alignof(nvme_pdu_mdesc), num_segments * (sizeof(iovec) + sizeof(xlio_pd_key)) + sizeof(nvme_pdu_mdesc))); - if (this_addr == nullptr) { + if (!this_addr) { return nullptr; } auto container = std::unique_ptr(this_addr); @@ -135,7 +135,7 @@ class nvme_pdu_mdesc : public mem_desc { : chunk(nullptr, 0U, LKEY_TX_DEFAULT) {}; inline bool is_valid() { - return iov.iov_base != nullptr && iov.iov_len != 0U && mkey != LKEY_TX_DEFAULT; + return iov.iov_base && iov.iov_len != 0U && mkey != LKEY_TX_DEFAULT; } }; diff --git a/src/core/proto/route_entry.cpp b/src/core/proto/route_entry.cpp index 670cdf002..80a4b80b1 100644 --- a/src/core/proto/route_entry.cpp +++ b/src/core/proto/route_entry.cpp @@ -53,8 +53,8 @@ route_entry::route_entry(route_rule_table_key rtk) , cache_observer() , m_b_offloaded_net_dev(false) , m_is_valid(false) - , m_p_net_dev_entry(NULL) - , m_p_net_dev_val(NULL) + , m_p_net_dev_entry(nullptr) + , m_p_net_dev_val(nullptr) { m_val = NULL; cache_entry_subject *> *rr_entry = NULL; @@ -67,7 +67,7 @@ route_entry::~route_entry() unregister_to_net_device(); if (m_p_rr_entry) { g_p_rule_table_mgr->unregister_observer(get_key(), this); - m_p_rr_entry = NULL; + m_p_rr_entry = nullptr; } } @@ -90,7 +90,7 @@ void route_entry::set_val(IN route_val *&val) void route_entry::register_to_net_device() { - cache_entry_subject *net_dev_entry = NULL; + cache_entry_subject *net_dev_entry = nullptr; if (g_p_net_device_table_mgr->register_observer(m_val->get_if_index(), this, &net_dev_entry)) { rt_entry_logdbg("route_entry [%p] is registered to an offloaded 
device", this); m_p_net_dev_entry = (net_device_entry *)net_dev_entry; @@ -120,8 +120,8 @@ void route_entry::unregister_to_net_device() } } - m_p_net_dev_entry = NULL; - m_p_net_dev_val = NULL; + m_p_net_dev_entry = nullptr; + m_p_net_dev_val = nullptr; } void route_entry::notify_cb() @@ -131,7 +131,7 @@ void route_entry::notify_cb() if (m_p_net_dev_entry->is_valid()) { m_p_net_dev_entry->get_val(m_p_net_dev_val); } else { - m_p_net_dev_val = NULL; + m_p_net_dev_val = nullptr; } notify_observers(); } diff --git a/src/core/proto/route_table_mgr.cpp b/src/core/proto/route_table_mgr.cpp index 8ad0c2b52..4610c9a0e 100644 --- a/src/core/proto/route_table_mgr.cpp +++ b/src/core/proto/route_table_mgr.cpp @@ -71,7 +71,7 @@ static inline route_val *find_route_val(route_table_t &table, const ip_address &dst, uint32_t table_id); -route_table_mgr *g_p_route_table_mgr = NULL; +route_table_mgr *g_p_route_table_mgr = nullptr; route_table_mgr::route_table_mgr() : netlink_socket_mgr() @@ -218,7 +218,7 @@ void route_table_mgr::rt_mgr_update_source_ip(route_table_t &table) if (!val.get_gw_addr().is_anyaddr() && val.get_src_addr().is_anyaddr()) { route_val *p_val_dst; uint32_t table_id = val.get_table_id(); - if ((p_val_dst = ::find_route_val(table, val.get_gw_addr(), table_id)) != nullptr) { + if ((p_val_dst = ::find_route_val(table, val.get_gw_addr(), table_id))) { if (!p_val_dst->get_src_addr().is_anyaddr()) { val.set_src_addr(p_val_dst->get_src_addr()); } else if (&val == p_val_dst) { // gateway of the entry lead to same entry @@ -423,7 +423,7 @@ bool route_table_mgr::route_resolve(IN route_rule_table_key key, OUT route_resul const sa_family_t family = key.get_family(); route_table_t &rt = family == AF_INET ? m_table_in4 : m_table_in6; - route_val *p_val = NULL; + route_val *p_val = nullptr; auto table_id_list = g_p_rule_table_mgr->rule_resolve(key); @@ -477,12 +477,12 @@ void route_table_mgr::update_entry(INOUT route_entry *p_ent, bool b_register_to_ rule_entry *p_rr_entry = p_ent->get_rule_entry(); std::deque *p_rr_val; if (p_rr_entry && p_rr_entry->get_val(p_rr_val)) { - route_val *p_val = NULL; + route_val *p_val = nullptr; const ip_address &peer_ip = p_ent->get_key().get_dst_ip(); for (const auto &p_rule_val : *p_rr_val) { uint32_t table_id = p_rule_val->get_table_id(); - if ((p_val = ::find_route_val(rt, peer_ip, table_id)) != nullptr) { + if ((p_val = ::find_route_val(rt, peer_ip, table_id))) { p_ent->set_val(p_val); if (b_register_to_net_dev) { // Check if broadcast IPv4 which is NOT supported diff --git a/src/core/proto/rule_table_mgr.cpp b/src/core/proto/rule_table_mgr.cpp index 774cd82dc..73f059cc7 100644 --- a/src/core/proto/rule_table_mgr.cpp +++ b/src/core/proto/rule_table_mgr.cpp @@ -66,7 +66,7 @@ #define DEFAULT_RULE_TABLE_SIZE 64 -rule_table_mgr *g_p_rule_table_mgr = NULL; +rule_table_mgr *g_p_rule_table_mgr = nullptr; static inline bool is_matching_rule(const route_rule_table_key &key, const rule_val &val); rule_table_mgr::rule_table_mgr() diff --git a/src/core/proto/xlio_lwip.cpp b/src/core/proto/xlio_lwip.cpp index f8aea1b0c..97d077fc6 100644 --- a/src/core/proto/xlio_lwip.cpp +++ b/src/core/proto/xlio_lwip.cpp @@ -82,7 +82,7 @@ u8_t xlio_lwip::read_tcp_timestamp_option(void) return res; } -xlio_lwip *g_p_lwip = 0; +xlio_lwip *g_p_lwip = nullptr; /** * LWIP "network" driver code @@ -142,7 +142,7 @@ xlio_lwip::xlio_lwip() set_tmr_resolution(safe_mce_sys().tcp_timer_resolution_msec); // tcp_ticks increases in the rate of tcp slow_timer void *node = 
g_p_event_handler_manager->register_timer_event( - safe_mce_sys().tcp_timer_resolution_msec * 2, this, PERIODIC_TIMER, 0); + safe_mce_sys().tcp_timer_resolution_msec * 2, this, PERIODIC_TIMER, nullptr); if (!node) { lwip_logdbg("LWIP: failed to register timer event"); free_lwip_resources(); diff --git a/src/core/sock/fd_collection.cpp b/src/core/sock/fd_collection.cpp index 0a288a29c..1111b655c 100644 --- a/src/core/sock/fd_collection.cpp +++ b/src/core/sock/fd_collection.cpp @@ -55,7 +55,7 @@ #define fdcoll_logdbg __log_dbg #define fdcoll_logfunc __log_func -fd_collection *g_p_fd_collection = NULL; +fd_collection *g_p_fd_collection = nullptr; fd_collection::fd_collection() : lock_mutex_recursive("fd_collection") @@ -97,16 +97,16 @@ fd_collection::~fd_collection() m_n_fd_map_size = -1; delete[] m_p_sockfd_map; - m_p_sockfd_map = NULL; + m_p_sockfd_map = nullptr; delete[] m_p_epfd_map; - m_p_epfd_map = NULL; + m_p_epfd_map = nullptr; delete[] m_p_cq_channel_map; - m_p_cq_channel_map = NULL; + m_p_cq_channel_map = nullptr; delete[] m_p_tap_map; - m_p_tap_map = NULL; + m_p_tap_map = nullptr; m_epfd_lst.clear_without_cleanup(); m_pending_to_remove_lst.clear_without_cleanup(); @@ -165,7 +165,7 @@ void fd_collection::clear() } } - m_p_sockfd_map[fd] = NULL; + m_p_sockfd_map[fd] = nullptr; fdcoll_logdbg("destroyed fd=%d", fd); } @@ -174,7 +174,7 @@ void fd_collection::clear() if (p_epfd) { delete p_epfd; } - m_p_epfd_map[fd] = NULL; + m_p_epfd_map[fd] = nullptr; fdcoll_logdbg("destroyed epfd=%d", fd); } @@ -183,12 +183,12 @@ void fd_collection::clear() if (p_cq_ch_info) { delete p_cq_ch_info; } - m_p_cq_channel_map[fd] = NULL; + m_p_cq_channel_map[fd] = nullptr; fdcoll_logdbg("destroyed cq_channel_fd=%d", fd); } if (m_p_tap_map[fd]) { - m_p_tap_map[fd] = NULL; + m_p_tap_map[fd] = nullptr; fdcoll_logdbg("destroyed tapfd=%d", fd); } } @@ -255,7 +255,7 @@ int fd_collection::addsocket(int fd, int domain, int type, bool check_offload /* lock(); BULLSEYE_EXCLUDE_BLOCK_START - if (p_sfd_api_obj == NULL) { + if (!p_sfd_api_obj) { fdcoll_logpanic("[fd=%d] Failed creating new sockinfo (%m)", fd); } BULLSEYE_EXCLUDE_BLOCK_END @@ -384,10 +384,10 @@ int fd_collection::addpipe(int fdrd, int fdwr) lock(); BULLSEYE_EXCLUDE_BLOCK_START - if (p_fdrd_api_obj == NULL) { + if (!p_fdrd_api_obj) { fdcoll_logpanic("[fd=%d] Failed creating new pipeinfo (%m)", fdrd); } - if (p_fdwr_api_obj == NULL) { + if (!p_fdwr_api_obj) { fdcoll_logpanic("[fd=%d] Failed creating new pipeinfo (%m)", fdwr); } BULLSEYE_EXCLUDE_BLOCK_END @@ -424,7 +424,7 @@ int fd_collection::addepfd(int epfd, int size) lock(); BULLSEYE_EXCLUDE_BLOCK_START - if (p_fd_info == NULL) { + if (!p_fd_info) { fdcoll_logpanic("[fd=%d] Failed creating new sockinfo (%m)", epfd); } BULLSEYE_EXCLUDE_BLOCK_END @@ -495,9 +495,9 @@ int fd_collection::add_cq_channel_fd(int cq_ch_fd, ring *p_ring) BULLSEYE_EXCLUDE_BLOCK_START if (p_cq_ch_info) { fdcoll_logwarn("cq channel fd already exists in fd_collection"); - m_p_cq_channel_map[cq_ch_fd] = NULL; + m_p_cq_channel_map[cq_ch_fd] = nullptr; delete p_cq_ch_info; - p_cq_ch_info = NULL; + p_cq_ch_info = nullptr; } BULLSEYE_EXCLUDE_BLOCK_END @@ -506,7 +506,7 @@ int fd_collection::add_cq_channel_fd(int cq_ch_fd, ring *p_ring) lock(); BULLSEYE_EXCLUDE_BLOCK_START - if (p_cq_ch_info == NULL) { + if (!p_cq_ch_info) { fdcoll_logpanic("[fd=%d] Failed creating new cq_channel_info (%m)", cq_ch_fd); } BULLSEYE_EXCLUDE_BLOCK_END @@ -545,7 +545,7 @@ int fd_collection::del_sockfd(int fd, bool b_cleanup /*=false*/, bool is_for_udp if 
(!is_for_udp_pool) { ++g_global_stat_static.n_pending_sockets; } - m_p_sockfd_map[fd] = NULL; + m_p_sockfd_map[fd] = nullptr; m_pending_to_remove_lst.push_front(p_sfd_api); } @@ -581,7 +581,7 @@ void fd_collection::del_tapfd(int fd) } lock(); - m_p_tap_map[fd] = NULL; + m_p_tap_map[fd] = nullptr; unlock(); } @@ -648,7 +648,7 @@ bool fd_collection::pop_socket_pool(int &fd, bool &add_to_udp_pool, int type) // use fd from pool - will skip creation of new fd by os socket_fd_api *sockfd = m_socket_pool.top(); fd = sockfd->get_fd(); - if (m_p_sockfd_map[fd] == NULL) { + if (!m_p_sockfd_map[fd]) { m_p_sockfd_map[fd] = sockfd; m_pending_to_remove_lst.erase(sockfd); } diff --git a/src/core/sock/fd_collection.h b/src/core/sock/fd_collection.h index 1acf35862..c72a1730a 100644 --- a/src/core/sock/fd_collection.h +++ b/src/core/sock/fd_collection.h @@ -327,7 +327,7 @@ inline socket_fd_api *fd_collection_get_sockfd(int fd) if (g_p_fd_collection) { return g_p_fd_collection->get_sockfd(fd); } - return NULL; + return nullptr; } inline epfd_info *fd_collection_get_epfd(int fd) @@ -335,7 +335,7 @@ inline epfd_info *fd_collection_get_epfd(int fd) if (g_p_fd_collection) { return g_p_fd_collection->get_epfd(fd); } - return NULL; + return nullptr; } #endif diff --git a/src/core/sock/pipeinfo.cpp b/src/core/sock/pipeinfo.cpp index 93c1b20e3..7caf3dcb7 100644 --- a/src/core/sock/pipeinfo.cpp +++ b/src/core/sock/pipeinfo.cpp @@ -98,12 +98,12 @@ pipeinfo::pipeinfo(int fd) pi_logfunc(""); m_b_closed = true; - m_timer_handle = NULL; + m_timer_handle = nullptr; m_b_blocking = true; - m_p_socket_stats = NULL; // mce_stats_instance_create_socket_block(); - if (m_p_socket_stats == NULL) { + m_p_socket_stats = nullptr; // mce_stats_instance_create_socket_block(); + if (!m_p_socket_stats) { // pi_logdbg("Got NULL from mce_stats_instance_create_socket_block, using local member"); m_p_socket_stats = &m_socket_stats; } @@ -140,7 +140,7 @@ pipeinfo::~pipeinfo() if (m_timer_handle) { g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; + m_timer_handle = nullptr; } statistics_print(); @@ -159,7 +159,7 @@ void pipeinfo::clean_obj() } set_cleaned(); - m_timer_handle = NULL; + m_timer_handle = nullptr; if (g_p_event_handler_manager->is_running()) { g_p_event_handler_manager->unregister_timers_event_and_delete(this); } else { @@ -312,7 +312,7 @@ void pipeinfo::write_lbm_pipe_enhance() if (m_write_count_no_change_count >= 2 && m_b_lbm_event_q_pipe_timer_on) { if (m_timer_handle) { g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; + m_timer_handle = nullptr; } m_b_lbm_event_q_pipe_timer_on = false; diff --git a/src/core/sock/pipeinfo.h b/src/core/sock/pipeinfo.h index ce29495c6..83456cb6c 100644 --- a/src/core/sock/pipeinfo.h +++ b/src/core/sock/pipeinfo.h @@ -56,8 +56,8 @@ class pipeinfo : public socket_fd_api, public timer_handler { // Process a Rx request, we might have a ready packet, or we might block until // we have one (if sockinfo::m_b_blocking == true) ssize_t rx(const rx_call_t call_type, struct iovec *p_iov, ssize_t sz_iov, int *p_flags, - struct sockaddr *__from = NULL, socklen_t *__fromlen = NULL, - struct msghdr *__msg = NULL) override; + struct sockaddr *__from = nullptr, socklen_t *__fromlen = nullptr, + struct msghdr *__msg = nullptr) override; // Process a Tx request, handle all that is needed to send the packet, we might block // until the connection info is ready or a tx buffer is releast (if sockinfo::m_b_blocking == 
diff --git a/src/core/sock/sock-app.cpp b/src/core/sock/sock-app.cpp index 93cc48e5f..52e10b708 100644 --- a/src/core/sock/sock-app.cpp +++ b/src/core/sock/sock-app.cpp @@ -62,7 +62,7 @@ map_udp_bounded_port_t g_map_udp_bounded_port; static int init_worker(int worker_id, int listen_fd); -struct app_conf *g_p_app = NULL; +struct app_conf *g_p_app = nullptr; #if defined(DEFINED_NGINX) int app_conf::proc_nginx(void) diff --git a/src/core/sock/sock-app.h b/src/core/sock/sock-app.h index 9dc3611e8..7a1879880 100644 --- a/src/core/sock/sock-app.h +++ b/src/core/sock/sock-app.h @@ -78,7 +78,7 @@ struct app_conf { map_thread_id.clear(); map_dup_fd.clear(); unused_worker_id.clear(); - context = NULL; + context = nullptr; setup(); } diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index 47dfab67a..954d9033c 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -61,7 +61,7 @@ extern "C" int xlio_register_recv_callback(int __fd, xlio_recv_callback_t __callback, void *__context) { - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object && !safe_mce_sys().enable_socketxtreme) { p_socket_object->register_callback(__callback, __context); @@ -74,7 +74,7 @@ extern "C" int xlio_register_recv_callback(int __fd, xlio_recv_callback_t __call extern "C" int xlio_recvfrom_zcopy(int __fd, void *__buf, size_t __nbytes, int *__flags, struct sockaddr *__from, socklen_t *__fromlen) { - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1]; @@ -89,7 +89,7 @@ extern "C" int xlio_recvfrom_zcopy(int __fd, void *__buf, size_t __nbytes, int * extern "C" int xlio_recvfrom_zcopy_free_packets(int __fd, struct xlio_recvfrom_zcopy_packet_t *pkts, size_t count) { - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { return p_socket_object->recvfrom_zcopy_free_packets(pkts, count); @@ -118,7 +118,7 @@ extern "C" int xlio_socketxtreme_poll(int fd, struct xlio_socketxtreme_completio unsigned int ncompletions, int flags) { int ret_val = -1; - cq_channel_info *cq_ch_info = NULL; + cq_channel_info *cq_ch_info = nullptr; cq_ch_info = g_p_fd_collection->get_cq_channel_fd(fd); @@ -153,8 +153,8 @@ static int dummy_xlio_socketxtreme_free_packets(struct xlio_socketxtreme_packet_ extern "C" int xlio_socketxtreme_free_packets(struct xlio_socketxtreme_packet_desc_t *packets, int num) { - mem_buf_desc_t *desc = NULL; - sockinfo_tcp *p_socket_object = NULL; + mem_buf_desc_t *desc = nullptr; + sockinfo_tcp *p_socket_object = nullptr; if (likely(packets)) { for (int i = 0; i < num; i++) { @@ -198,7 +198,7 @@ static int dummy_xlio_socketxtreme_ref_buff(xlio_buff_t *buff) extern "C" int xlio_socketxtreme_ref_buff(xlio_buff_t *buff) { int ret_val = 0; - mem_buf_desc_t *desc = NULL; + mem_buf_desc_t *desc = nullptr; if (likely(buff)) { desc = (mem_buf_desc_t *)buff; @@ -224,7 +224,7 @@ static int dummy_xlio_socketxtreme_free_buff(xlio_buff_t *buff) extern "C" int xlio_socketxtreme_free_buff(xlio_buff_t *buff) { int ret_val = 0; - mem_buf_desc_t *desc = NULL; + mem_buf_desc_t *desc = nullptr; if (likely(buff)) { desc = (mem_buf_desc_t *)buff; @@ -239,7 +239,7 @@ extern "C" int xlio_socketxtreme_free_buff(xlio_buff_t *buff) extern "C" int xlio_get_socket_rings_num(int fd) { - 
socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(fd); if (p_socket_object && p_socket_object->check_rings()) { return p_socket_object->get_rings_num(); @@ -250,7 +250,7 @@ extern "C" int xlio_get_socket_rings_num(int fd) extern "C" int xlio_get_socket_rings_fds(int fd, int *ring_fds, int ring_fds_sz) { - if (ring_fds_sz <= 0 || ring_fds == NULL) { + if (ring_fds_sz <= 0 || !ring_fds) { errno = EINVAL; return -1; } @@ -304,7 +304,7 @@ static inline struct cmsghdr *__cmsg_nxthdr(void *__ctl, size_t __size, struct c __ptr = (struct cmsghdr *)(((unsigned char *)__cmsg) + CMSG_ALIGN(__cmsg->cmsg_len)); if ((unsigned long)((char *)(__ptr + 1) - (char *)__ctl) > __size) { - return NULL; + return nullptr; } return __ptr; @@ -344,9 +344,9 @@ extern "C" int xlio_extra_ioctl(void *cmsg_hdr, size_t cmsg_len) struct xlio_api_t *extra_api(void) { - static struct xlio_api_t *xlio_api = NULL; + static struct xlio_api_t *xlio_api = nullptr; - if (NULL == xlio_api) { + if (!xlio_api) { bool enable_socketxtreme = safe_mce_sys().enable_socketxtreme; xlio_api = new struct xlio_api_t(); diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index be5eea431..97b247a6a 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -87,7 +87,7 @@ using namespace std; #define EP_MAX_EVENTS (int)((INT_MAX / sizeof(struct epoll_event))) struct sigaction g_act_prev; -sighandler_t g_sighandler = NULL; +sighandler_t g_sighandler = nullptr; class ring_simple; class ring_eth_direct; @@ -398,7 +398,7 @@ static ssize_t sendfile_helper(socket_fd_api *p_socket_object, int in_fd, __off6 return -1; } - if (offset == NULL) { + if (!offset) { orig_offset = lseek64(in_fd, 0, SEEK_CUR); if (orig_offset < 0) { errno = ESPIPE; @@ -415,7 +415,7 @@ static ssize_t sendfile_helper(socket_fd_api *p_socket_object, int in_fd, __off6 /* Get mapping from the cache */ mapping = g_zc_cache->get_mapping(in_fd); - if (mapping == NULL) { + if (!mapping) { srdr_logdbg("Couldn't allocate mapping object"); goto fallback; } @@ -495,8 +495,9 @@ static ssize_t sendfile_helper(socket_fd_api *p_socket_object, int in_fd, __off6 /* try to use mmap() approach */ if (-1 != (XLIO_CALL(fcntl, in_fd, F_SETLK, &lock))) { - void *addr = NULL; - addr = mmap64(NULL, pa_count, PROT_READ, MAP_SHARED | MAP_NORESERVE, in_fd, pa_offset); + void *addr = nullptr; + addr = + mmap64(nullptr, pa_count, PROT_READ, MAP_SHARED | MAP_NORESERVE, in_fd, pa_offset); if (MAP_FAILED != addr) { ssize_t toRead, numSent = 0; @@ -552,7 +553,7 @@ static ssize_t sendfile_helper(socket_fd_api *p_socket_object, int in_fd, __off6 } if (totSent > 0) { - if (offset != NULL) { + if (offset) { *offset = *offset + totSent; } else { (void)lseek64(in_fd, (orig_offset + totSent), SEEK_SET); @@ -571,7 +572,7 @@ const char *dbg_sprintf_fdset(char *buf, int buflen, int __nfds, fd_set *__fds) } buf[0] = '\0'; - if ((__nfds <= 0) || (__fds == NULL)) { + if ((__nfds <= 0) || (!__fds)) { return "(null)"; } @@ -613,7 +614,7 @@ const char *dbg_sprintf_fdset(char *buf, int buflen, int __nfds, fd_set *__fds) Returns the number of file descriptors with events, zero if timed out, or -1 for errors. 
*/ static int poll_helper(struct pollfd *__fds, nfds_t __nfds, int __timeout, - const sigset_t *__sigmask = NULL) + const sigset_t *__sigmask = nullptr) { int off_rfd_buffer[__nfds]; io_mux_call::offloaded_mode_t off_modes_buffer[__nfds]; @@ -642,7 +643,7 @@ static int poll_helper(struct pollfd *__fds, nfds_t __nfds, int __timeout, This function is a cancellation point and therefore not marked with __THROW. */ static int select_helper(int __nfds, fd_set *__readfds, fd_set *__writefds, fd_set *__exceptfds, - struct timeval *__timeout, const sigset_t *__sigmask = NULL) + struct timeval *__timeout, const sigset_t *__sigmask = nullptr) { int off_rfds_buffer[__nfds]; io_mux_call::offloaded_mode_t off_modes_buffer[__nfds]; @@ -700,7 +701,7 @@ static void xlio_epoll_create(int epfd, int size) returned ( usually size of "events" ). The "timeout" parameter specifies the maximum wait time in milliseconds (-1 == infinite). */ inline int epoll_wait_helper(int __epfd, struct epoll_event *__events, int __maxevents, - int __timeout, const sigset_t *__sigmask = NULL) + int __timeout, const sigset_t *__sigmask = nullptr) { if (__maxevents <= 0 || __maxevents > EP_MAX_EVENTS) { srdr_logdbg("invalid value for maxevents: %d", __maxevents); @@ -715,8 +716,8 @@ inline int epoll_wait_helper(int __epfd, struct epoll_event *__events, int __max epoll_event extra_events_buffer[__maxevents]; try { - epoll_wait_call epcall(extra_events_buffer, NULL, __epfd, __events, __maxevents, __timeout, - __sigmask); + epoll_wait_call epcall(extra_events_buffer, nullptr, __epfd, __events, __maxevents, + __timeout, __sigmask); int rc = epcall.get_current_events(); // returns ready nfds if (rc <= 0) { @@ -784,7 +785,7 @@ int sigaction_internal(int signum, const struct sigaction *act, struct sigaction xlio_action.sa_flags = 0; sigemptyset(&xlio_action.sa_mask); - ret = SYSCALL(sigaction, SIGINT, &xlio_action, NULL); + ret = SYSCALL(sigaction, SIGINT, &xlio_action, nullptr); if (ret < 0) { srdr_logdbg("Failed to register SIGINT handler, calling to original sigaction " @@ -874,7 +875,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(shutdown)(int __fd, int __how) srdr_logdbg_entry("fd=%d, how=%d", __fd, __how); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { return p_socket_object->shutdown(__how); @@ -930,7 +931,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(listen)(int __fd, int backlog) } #endif /* DEFINED_ENVOY */ - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { @@ -961,7 +962,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(accept)(int __fd, struct sockaddr *__addr, socklen { PROFILE_FUNC - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { return p_socket_object->accept(__addr, __addrlen); @@ -975,7 +976,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(accept4)(int __fd, struct sockaddr *__addr, sockle { PROFILE_FUNC - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { return p_socket_object->accept4(__addr, __addrlen, __flags); @@ -996,7 +997,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(bind)(int __fd, const struct sockaddr *__addr, soc srdr_logdbg_entry("fd=%d, %s", __fd, sprintf_sockaddr(buf, 256, __addr, __addrlen)); int ret = 0; - socket_fd_api *p_socket_object = 
NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { ret = bind_internal(p_socket_object, __addr, __addrlen); @@ -1034,11 +1035,10 @@ EXPORT_SYMBOL int XLIO_SYMBOL(connect)(int __fd, const struct sockaddr *__to, so int ret = 0; socket_fd_api *p_socket_object = fd_collection_get_sockfd(__fd); - if (p_socket_object == nullptr) { + if (!p_socket_object) { srdr_logdbg_exit("Unable to get sock_fd_api"); ret = SYSCALL(connect, __fd, __to, __tolen); - } else if (__to == nullptr || - (get_sa_family(__to) != AF_INET && (get_sa_family(__to) != AF_INET6))) { + } else if (!__to || (get_sa_family(__to) != AF_INET && (get_sa_family(__to) != AF_INET6))) { p_socket_object->setPassthrough(); ret = SYSCALL(connect, __fd, __to, __tolen); } else { @@ -1069,7 +1069,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(setsockopt)(int __fd, int __level, int __optname, { srdr_logdbg_entry("fd=%d, level=%d, optname=%d", __fd, __level, __optname); - if (NULL == __optval) { + if (!__optval) { errno = EFAULT; return -1; } @@ -1077,7 +1077,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(setsockopt)(int __fd, int __level, int __optname, PROFILE_FUNC int ret = 0; - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { @@ -1115,7 +1115,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(getsockopt)(int __fd, int __level, int __optname, #endif /* XLIO_STATIC_BUILD */ int ret = 0; - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { VERIFY_PASSTROUGH_CHANGED( @@ -1155,7 +1155,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(fcntl)(int __fd, int __cmd, ...) va_end(va); int ret = 0; - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->fcntl(__cmd, arg)); @@ -1199,7 +1199,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(fcntl64)(int __fd, int __cmd, ...) va_end(va); int ret = 0; - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object && VALID_SYSCALL(fcntl64)) { VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->fcntl64(__cmd, arg)); @@ -1236,7 +1236,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(ioctl)(int __fd, unsigned long int __request, ...) 
int ret = 0; - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object && arg) { VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->ioctl(__request, arg)); @@ -1259,7 +1259,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(getsockname)(int __fd, struct sockaddr *__name, so srdr_logdbg_entry("fd=%d", __fd); int ret = 0; - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { ret = p_socket_object->getsockname(__name, __namelen); @@ -1267,7 +1267,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(getsockname)(int __fd, struct sockaddr *__name, so if (safe_mce_sys().trigger_dummy_send_getsockname) { char buf[264] = {0}; struct iovec msg_iov = {&buf, sizeof(buf)}; - struct msghdr msg = {NULL, 0, &msg_iov, 1, NULL, 0, 0}; + struct msghdr msg = {nullptr, 0, &msg_iov, 1, nullptr, 0, 0}; int ret_send = sendmsg(__fd, &msg, XLIO_SND_FLAGS_DUMMY); srdr_logdbg("Triggered dummy message for socket fd=%d (ret_send=%d)", __fd, ret_send); NOT_IN_USE(ret_send); @@ -1291,7 +1291,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(getpeername)(int __fd, struct sockaddr *__name, so srdr_logdbg_entry("fd=%d", __fd); int ret = 0; - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { ret = p_socket_object->getpeername(__name, __namelen); @@ -1318,7 +1318,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(read)(int __fd, void *__buf, size_t __nbytes) srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1]; @@ -1346,7 +1346,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__read_chk)(int __fd, void *__buf, size_t __nb srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { BULLSEYE_EXCLUDE_BLOCK_START @@ -1378,7 +1378,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(readv)(int __fd, const struct iovec *iov, int srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec *piov = (struct iovec *)iov; @@ -1400,7 +1400,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recv)(int __fd, void *__buf, size_t __nbytes, srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1]; @@ -1427,7 +1427,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__recv_chk)(int __fd, void *__buf, size_t __nb srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { BULLSEYE_EXCLUDE_BLOCK_START @@ -1457,13 +1457,13 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recvmsg)(int __fd, struct msghdr *__msg, int _ srdr_logfuncall_entry("fd=%d", __fd); - if (__msg == NULL) { + if (!__msg) { srdr_logdbg("NULL msghdr"); errno = EINVAL; return -1; } - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { __msg->msg_flags = 0; @@ -1511,7 
+1511,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(recvmmsg)(int __fd, struct mmsghdr *__mmsghdr, uns srdr_logfuncall_entry("fd=%d, mmsghdr length=%d flags=%x", __fd, __vlen, __flags); - if (__mmsghdr == NULL) { + if (!__mmsghdr) { srdr_logdbg("NULL mmsghdr"); errno = EINVAL; return -1; @@ -1520,7 +1520,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(recvmmsg)(int __fd, struct mmsghdr *__mmsghdr, uns if (__timeout) { gettime(&start_time); } - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { int ret = 0; @@ -1578,7 +1578,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recvfrom)(int __fd, void *__buf, size_t __nbyt srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1]; @@ -1609,7 +1609,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__recvfrom_chk)(int __fd, void *__buf, size_t srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { BULLSEYE_EXCLUDE_BLOCK_START @@ -1638,7 +1638,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(write)(int __fd, __const void *__buf, size_t _ srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1] = {{(void *)__buf, __nbytes}}; @@ -1664,7 +1664,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(writev)(int __fd, const struct iovec *iov, int srdr_logfuncall_entry("fd=%d, %d iov blocks", __fd, iovcnt); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { xlio_tx_call_attr_t tx_arg; @@ -1689,7 +1689,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(send)(int __fd, __const void *__buf, size_t __ srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1] = {{(void *)__buf, __nbytes}}; @@ -1723,7 +1723,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendmsg)(int __fd, __const struct msghdr *__ms srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { return sendmsg_internal(p_socket_object, __msg, __flags); @@ -1752,13 +1752,13 @@ EXPORT_SYMBOL int XLIO_SYMBOL(sendmmsg)(int __fd, struct mmsghdr *__mmsghdr, uns srdr_logfuncall_entry("fd=%d, mmsghdr length=%d flags=%x", __fd, __vlen, __flags); - if (__mmsghdr == NULL) { + if (!__mmsghdr) { srdr_logdbg("NULL mmsghdr"); errno = EINVAL; return -1; } - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { for (unsigned int i = 0; i < __vlen; i++) { @@ -1808,7 +1808,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendto)(int __fd, __const void *__buf, size_t srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); - socket_fd_api *p_socket_object = NULL; + socket_fd_api *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1] = {{(void *)__buf, 
__nbytes}}; @@ -1903,8 +1903,8 @@ EXPORT_SYMBOL int XLIO_SYMBOL(pselect)(int __nfds, fd_set *__readfds, fd_set *__ srdr_logfunc_entry("nfds=%d, timeout=(infinite)", __nfds); } - return select_helper(__nfds, __readfds, __writefds, __errorfds, __timeout ? &select_time : NULL, - __sigmask); + return select_helper(__nfds, __readfds, __writefds, __errorfds, + __timeout ? &select_time : nullptr, __sigmask); } EXPORT_SYMBOL int XLIO_SYMBOL(poll)(struct pollfd *__fds, nfds_t __nfds, int __timeout) @@ -1951,8 +1951,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(ppoll)(struct pollfd *__fds, nfds_t __nfds, return SYSCALL(ppoll, __fds, __nfds, __timeout, __sigmask); } - int timeout = - (__timeout == NULL) ? -1 : (__timeout->tv_sec * 1000 + __timeout->tv_nsec / 1000000); + int timeout = (!__timeout) ? -1 : (__timeout->tv_sec * 1000 + __timeout->tv_nsec / 1000000); srdr_logfunc_entry("nfds=%d, timeout=(%d milli-sec)", __nfds, timeout); @@ -1977,8 +1976,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(__ppoll_chk)(struct pollfd *__fds, nfds_t __nfds, BULLSEYE_EXCLUDE_BLOCK_END - int timeout = - (__timeout == NULL) ? -1 : (__timeout->tv_sec * 1000 + __timeout->tv_nsec / 1000000); + int timeout = (!__timeout) ? -1 : (__timeout->tv_sec * 1000 + __timeout->tv_nsec / 1000000); srdr_logfunc_entry("nfds=%d, timeout=(%d milli-sec)", __nfds, timeout); diff --git a/src/core/sock/socket_fd_api.cpp b/src/core/sock/socket_fd_api.cpp index 794a09524..2714e0781 100644 --- a/src/core/sock/socket_fd_api.cpp +++ b/src/core/sock/socket_fd_api.cpp @@ -50,7 +50,7 @@ socket_fd_api::socket_fd_api(int fd) : m_epoll_event_flags(0) , m_fd(fd) , m_n_sysvar_select_poll_os_ratio(safe_mce_sys().select_poll_os_ratio) - , m_econtext(NULL) + , m_econtext(nullptr) #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) , m_is_for_socket_pool(false) , m_back_log(0) @@ -332,7 +332,7 @@ int socket_fd_api::add_epoll_context(epfd_info *epfd) void socket_fd_api::remove_epoll_context(epfd_info *epfd) { if (m_econtext == epfd) { - m_econtext = NULL; + m_econtext = nullptr; } } diff --git a/src/core/sock/socket_fd_api.h b/src/core/sock/socket_fd_api.h index 70c2a789d..0c51629f4 100644 --- a/src/core/sock/socket_fd_api.h +++ b/src/core/sock/socket_fd_api.h @@ -177,10 +177,10 @@ class socket_fd_api : public cleanable_obj { virtual int ioctl(unsigned long int __request, unsigned long int __arg) = 0; virtual ssize_t rx(const rx_call_t call_type, iovec *iov, const ssize_t iovlen, - int *p_flags = 0, sockaddr *__from = NULL, socklen_t *__fromlen = NULL, - struct msghdr *__msg = NULL) = 0; + int *p_flags = nullptr, sockaddr *__from = nullptr, + socklen_t *__fromlen = nullptr, struct msghdr *__msg = nullptr) = 0; - virtual bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL); + virtual bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = nullptr); virtual bool is_writeable(); @@ -270,7 +270,7 @@ class socket_fd_api : public cleanable_obj { virtual int *get_rings_fds(int &res_length) { res_length = 0; - return NULL; + return nullptr; } protected: diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 050772c42..8b6aa5d8e 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -77,9 +77,9 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) , m_lock_snd(MODULE_NAME "::m_lock_snd") , m_state(SOCKINFO_OPENED) , m_family(domain) - , m_p_connected_dst_entry(NULL) + , m_p_connected_dst_entry(nullptr) , m_so_bindtodevice_ip(ip_address::any_addr(), domain) - , m_p_rx_ring(0) + , m_p_rx_ring(nullptr) , 
m_rx_reuse_buf_pending(false) , m_rx_reuse_buf_postponed(false) , m_rx_ring_map_lock(MODULE_NAME "::m_rx_ring_map_lock") @@ -91,8 +91,8 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) , m_ring_alloc_log_rx(safe_mce_sys().ring_allocation_logic_rx, use_ring_locks) , m_ring_alloc_log_tx(safe_mce_sys().ring_allocation_logic_tx, use_ring_locks) , m_pcp(0) - , m_rx_callback(NULL) - , m_rx_callback_context(NULL) + , m_rx_callback(nullptr) + , m_rx_callback_context(nullptr) , m_fd_context((void *)((uintptr_t)m_fd)) , m_flow_tag_id(0) , m_rx_cq_wait_ctrl(safe_mce_sys().rx_cq_wait_ctrl) @@ -101,7 +101,7 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) : safe_mce_sys().sysctl_reader.get_net_ipv6_hop_limit()) , m_bind_no_port(false) , m_is_ipv6only(safe_mce_sys().sysctl_reader.get_ipv6_bindv6only()) - , m_p_rings_fds(NULL) + , m_p_rings_fds(nullptr) { m_rx_epfd = SYSCALL(epoll_create, 128); if (unlikely(m_rx_epfd == -1)) { @@ -124,7 +124,7 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) set_flow_tag(m_fd + 1); atomic_set(&m_zckey, 0); - m_last_zcdesc = NULL; + m_last_zcdesc = nullptr; m_socketxtreme.ec_cache.clear(); struct ring_ec ec; @@ -152,7 +152,7 @@ sockinfo::~sockinfo() if (m_p_rings_fds) { delete[] m_p_rings_fds; - m_p_rings_fds = NULL; + m_p_rings_fds = nullptr; } while (!m_error_queue.empty()) { @@ -885,7 +885,7 @@ bool sockinfo::attach_receiver(flow_tuple_with_local_if &flow_key) // Allocate resources on specific interface (create ring) net_device_resources_t *p_nd_resources = create_nd_resources(ip_addr(flow_key.get_local_if(), flow_key.get_family())); - if (NULL == p_nd_resources) { + if (!p_nd_resources) { // any error which occurred inside create_nd_resources() was already printed. No need to // reprint errors here return false; @@ -996,7 +996,7 @@ bool sockinfo::detach_receiver(flow_tuple_with_local_if &flow_key) net_device_resources_t *sockinfo::create_nd_resources(const ip_addr &ip_local) { - net_device_resources_t *p_nd_resources = NULL; + net_device_resources_t *p_nd_resources = nullptr; // Check if we are already registered to net_device with the local ip as observers rx_net_device_map_t::iterator rx_nd_iter = m_rx_nd_map.find(ip_local); @@ -1005,12 +1005,12 @@ net_device_resources_t *sockinfo::create_nd_resources(const ip_addr &ip_local) // Need to register as observer to net_device net_device_resources_t nd_resources; nd_resources.refcnt = 0; - nd_resources.p_nde = NULL; - nd_resources.p_ndv = NULL; - nd_resources.p_ring = NULL; + nd_resources.p_nde = nullptr; + nd_resources.p_ndv = nullptr; + nd_resources.p_ring = nullptr; BULLSEYE_EXCLUDE_BLOCK_START - cache_entry_subject *net_dev_entry = NULL; + cache_entry_subject *net_dev_entry = nullptr; net_device_val *net_dev = g_p_net_device_table_mgr->get_net_device_val(ip_local); if (!net_dev || !g_p_net_device_table_mgr->register_observer(net_dev->get_if_idx(), &m_rx_nd_observer, @@ -1077,12 +1077,12 @@ net_device_resources_t *sockinfo::create_nd_resources(const ip_addr &ip_local) return p_nd_resources; err: - return NULL; + return nullptr; } bool sockinfo::destroy_nd_resources(const ip_addr &ip_local) { - net_device_resources_t *p_nd_resources = NULL; + net_device_resources_t *p_nd_resources = nullptr; rx_net_device_map_t::iterator rx_nd_iter = m_rx_nd_map.find(ip_local); BULLSEYE_EXCLUDE_BLOCK_START @@ -1207,7 +1207,7 @@ void sockinfo::do_rings_migration_rx(resource_allocation_key &old_key) si_logerr("Failed to release ring for allocation key %s", new_key->to_str().c_str()); } - new_ring 
= NULL; + new_ring = nullptr; break; } lock_rx_q(); @@ -1519,7 +1519,7 @@ int sockinfo::os_epoll_wait(epoll_event *ep_events, int maxevents) // this new fd) void sockinfo::add_cqfd_to_sock_rx_epfd(ring *p_ring) { - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; ev.events = EPOLLIN; size_t num_ring_rx_fds; int *ring_rx_fds_array = p_ring->get_rx_channel_fds(num_ring_rx_fds); @@ -1542,8 +1542,9 @@ void sockinfo::remove_cqfd_from_sock_rx_epfd(ring *base_ring) for (size_t i = 0; i < num_ring_rx_fds; i++) { BULLSEYE_EXCLUDE_BLOCK_START - if (unlikely((SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], NULL)) && - (!(errno == ENOENT || errno == EBADF)))) { + if (unlikely( + (SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], nullptr)) && + (!(errno == ENOENT || errno == EBADF)))) { si_logerr("failed to delete cq channel fd from internal epfd (errno=%d %s)", errno, strerror(errno)); } @@ -1673,7 +1674,7 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) if (m_rx_ring_map.size() == 1) { m_p_rx_ring = m_rx_ring_map.begin()->first; } else { - m_p_rx_ring = NULL; + m_p_rx_ring = nullptr; } move_descs(base_ring, &temp_rx_reuse, &m_rx_reuse_buff.rx_reuse, true); @@ -1933,7 +1934,7 @@ void sockinfo::destructor_helper() if (m_p_connected_dst_entry) { delete m_p_connected_dst_entry; } - m_p_connected_dst_entry = NULL; + m_p_connected_dst_entry = nullptr; } int sockinfo::register_callback(xlio_recv_callback_t callback, void *context) @@ -2148,7 +2149,7 @@ void sockinfo::handle_recv_timestamping(struct cmsg_state *cm_state) void sockinfo::handle_recv_errqueue(struct cmsg_state *cm_state) { - mem_buf_desc_t *buff = NULL; + mem_buf_desc_t *buff = nullptr; if (m_error_queue.empty()) { return; @@ -2198,7 +2199,7 @@ void sockinfo::insert_cmsg(struct cmsg_state *cm_state, int level, int type, voi (struct cmsghdr *)((char *)cm_state->cmhdr + CMSG_ALIGN(cm_state->cmhdr->cmsg_len)); if ((char *)(next + 1) > ((char *)cm_state->mhdr->msg_control + cm_state->mhdr->msg_controllen)) { - cm_state->cmhdr = NULL; + cm_state->cmhdr = nullptr; } else { cm_state->cmhdr = next; } diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 75d77305d..e826eb0e5 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -285,7 +285,7 @@ class sockinfo : public socket_fd_api, // connected_ip is routed to bool attach_as_uc_receiver(role_t role, bool skip_rules = false); transport_t find_target_family(role_t role, const struct sockaddr *sock_addr_first, - const struct sockaddr *sock_addr_second = NULL); + const struct sockaddr *sock_addr_second = nullptr); // This callback will notify that socket is ready to receive and map the cq. 
void rx_add_ring_cb(ring *p_ring) override; @@ -299,9 +299,9 @@ class sockinfo : public socket_fd_api, int modify_ratelimit(dst_entry *p_dst_entry, struct xlio_rate_limit_t &rate_limit); void move_descs(ring *p_ring, descq_t *toq, descq_t *fromq, bool own); - void pop_descs_rx_ready(descq_t *cache, ring *p_ring = NULL); + void pop_descs_rx_ready(descq_t *cache, ring *p_ring = nullptr); void push_descs_rx_ready(descq_t *cache); - void reuse_descs(descq_t *reuseq, ring *p_ring = NULL); + void reuse_descs(descq_t *reuseq, ring *p_ring = nullptr); int set_sockopt_prio(__const void *__optval, socklen_t __optlen); bool ipv6_set_addr_sel_pref(int val); int ipv6_get_addr_sel_pref(); diff --git a/src/core/sock/sockinfo_nvme.cpp b/src/core/sock/sockinfo_nvme.cpp index 7e529523f..31d16655c 100644 --- a/src/core/sock/sockinfo_nvme.cpp +++ b/src/core/sock/sockinfo_nvme.cpp @@ -92,8 +92,7 @@ ssize_t sockinfo_tcp_ops_nvme::tx(xlio_tx_call_attr_t &tx_arg) auto aux_data = reinterpret_cast(tx_arg.priv.map); auto msg = tx_arg.attr.hdr; - if (msg->msg_iov == nullptr || aux_data == nullptr || msg->msg_iovlen == 0U || - aux_data[0].message_length == 0U) { + if (!msg->msg_iov || !aux_data || msg->msg_iovlen == 0U || aux_data[0].message_length == 0U) { si_nvme_logerr("Invalid msg_iov, msg_iovlen, or auxiliary data"); errno = EINVAL; return -1; @@ -136,7 +135,7 @@ ssize_t sockinfo_tcp_ops_nvme::tx(xlio_tx_call_attr_t &tx_arg) /* Update tx_arg before sending to TCP */ auto *desc = nvme_pdu_mdesc::create(num_iovecs, msg->msg_iov, aux_data, m_p_sock->get_next_tcp_seqno(), total_tx_length); - if (desc == nullptr) { + if (!desc) { si_nvme_logerr("Unable to allocate nvme_mdesc"); errno = ENOMEM; return -1; @@ -165,27 +164,27 @@ static inline bool request_credits_for_resync(ring *p_ring, size_t datalen, size int sockinfo_tcp_ops_nvme::postrouting(pbuf *p, tcp_seg *seg, xlio_send_attr &attr) { - if (!m_is_ddgs_on || p == nullptr || seg == nullptr || seg->len == 0U) { + if (!m_is_ddgs_on || !p || !seg || seg->len == 0U) { return ERR_OK; } - assert(m_p_tis != nullptr); + assert(m_p_tis); attr.tis = m_p_tis.get(); if (likely(seg->seqno == m_expected_seqno)) { m_expected_seqno += seg->len; return ERR_OK; } - assert(p->next != nullptr); + assert(p->next); assert(p->next->desc.attr == PBUF_DESC_NVME_TX); ring *p_ring = m_p_sock->get_tx_ring(); - if (p_ring == nullptr) { + if (!p_ring) { si_nvme_logerr("No ring"); return ERR_RTE; } auto nvme_mdesc = dynamic_cast(static_cast(p->next->desc.mdesc)); - if (unlikely(nvme_mdesc == nullptr)) { + if (unlikely(!nvme_mdesc)) { si_nvme_logerr("NVME momory descriptor not found"); return ERR_RTE; } @@ -241,18 +240,18 @@ bool sockinfo_tcp_ops_nvme::handle_send_ret(ssize_t ret, tcp_seg *seg) err_t sockinfo_tcp_ops_nvme::recv(pbuf *p) { - return p != nullptr ? ERR_OK : ERR_ARG; + return p ? 
ERR_OK : ERR_ARG; } int sockinfo_tcp_ops_nvme::setsockopt_tx(const uint32_t &config) { ring *p_ring = m_p_sock->get_tx_ring(); - if (p_ring == nullptr) { + if (!p_ring) { errno = ENOTSUP; return -1; } m_p_tis = p_ring->create_tis(DPCP_TIS_FLAGS | DPCP_TIS_NVME_FLAG); - if (m_p_tis == nullptr) { + if (!m_p_tis) { errno = ENOTSUP; return -1; } diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 79c4147f8..07a109394 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -80,9 +80,9 @@ extern global_stats_t g_global_stat_static; -tcp_timers_collection *g_tcp_timers_collection = NULL; +tcp_timers_collection *g_tcp_timers_collection = nullptr; thread_local thread_local_tcp_timers g_thread_local_tcp_timers; -bind_no_port *g_bind_no_port = NULL; +bind_no_port *g_bind_no_port = nullptr; /* * The following socket options are inherited by a connected TCP socket from the listening socket: @@ -161,7 +161,7 @@ inline void sockinfo_tcp::lwip_pbuf_init_custom(mem_buf_desc_t *p_desc) p_desc->lwip_pbuf.pbuf.len = p_desc->lwip_pbuf.pbuf.tot_len = (p_desc->sz_data - p_desc->rx.n_transport_header_len); p_desc->lwip_pbuf.pbuf.ref = 1; - p_desc->lwip_pbuf.pbuf.next = NULL; + p_desc->lwip_pbuf.pbuf.next = nullptr; p_desc->lwip_pbuf.pbuf.payload = (u8_t *)p_desc->p_buffer + p_desc->rx.n_transport_header_len; } @@ -223,7 +223,7 @@ inline void sockinfo_tcp::reuse_buffer(mem_buf_desc_t *buff) mem_buf_desc_t *underlying = reinterpret_cast(buff->lwip_pbuf.pbuf.desc.mdesc); - buff->lwip_pbuf.pbuf.desc.mdesc = NULL; + buff->lwip_pbuf.pbuf.desc.mdesc = nullptr; if (likely(p_dst)) { p_dst->put_zc_buffer(buff); } else { @@ -237,8 +237,8 @@ inline void sockinfo_tcp::reuse_buffer(mem_buf_desc_t *buff) /* Continue and release the underlying buffer. 
*/ buff = underlying; buff->lwip_pbuf.pbuf.ref = 1; - buff->lwip_pbuf.pbuf.next = NULL; - buff->p_next_desc = NULL; + buff->lwip_pbuf.pbuf.next = nullptr; + buff->p_next_desc = nullptr; } if (safe_mce_sys().buffer_batching_mode == BUFFER_BATCHING_NONE) { @@ -279,7 +279,7 @@ static inline bool use_socket_ring_locks() sockinfo_tcp::sockinfo_tcp(int fd, int domain) : sockinfo(fd, domain, use_socket_ring_locks()) - , m_timer_handle(NULL) + , m_timer_handle(nullptr) , m_tcp_con_lock(get_new_tcp_lock()) , m_sysvar_buffer_batching_mode(safe_mce_sys().buffer_batching_mode) , m_sysvar_tx_segs_batch_tcp(safe_mce_sys().tx_segs_batch_tcp) @@ -337,8 +337,8 @@ sockinfo_tcp::sockinfo_tcp(int fd, int domain) tcp_err(&m_pcb, sockinfo_tcp::err_lwip_cb); tcp_sent(&m_pcb, sockinfo_tcp::ack_recvd_lwip_cb); - m_parent = NULL; - m_iomux_ready_fd_array = NULL; + m_parent = nullptr; + m_iomux_ready_fd_array = nullptr; /* SNDBUF accounting */ m_sndbuff_max = 0; @@ -380,7 +380,7 @@ sockinfo_tcp::sockinfo_tcp(int fd, int domain) } } - if (g_p_agent != NULL) { + if (g_p_agent) { g_p_agent->register_cb((agent_cb_t)&sockinfo_tcp::put_agent_msg, (void *)this); } si_tcp_logdbg("TCP PCB FLAGS: 0x%x", m_pcb.flags); @@ -407,7 +407,7 @@ sockinfo_tcp::~sockinfo_tcp() delete m_ops_tcp; } delete m_ops; - m_ops = NULL; + m_ops = nullptr; // Return buffers released in the TLS layer destructor m_rx_reuse_buf_postponed = m_rx_reuse_buff.n_buff_num > 0; @@ -453,7 +453,7 @@ sockinfo_tcp::~sockinfo_tcp() m_rx_ctl_reuse_list.size()); } - if (g_p_agent != NULL) { + if (g_p_agent) { g_p_agent->unregister_cb((agent_cb_t)&sockinfo_tcp::put_agent_msg, (void *)this); } si_tcp_logdbg("sock closed"); @@ -478,7 +478,7 @@ void sockinfo_tcp::clean_obj() p_event_mgr->unregister_timer_event(this, m_timer_handle); } - m_timer_handle = NULL; + m_timer_handle = nullptr; unlock_tcp_con(); if (p_event_mgr->is_running() && !delegated_timers_exit) { @@ -501,7 +501,7 @@ bool sockinfo_tcp::prepare_listen_to_close() m_syn_received.erase(key); m_ready_conn_cnt--; new_sock->lock_tcp_con(); - new_sock->m_parent = NULL; + new_sock->m_parent = nullptr; new_sock->abort_connection(); new_sock->unlock_tcp_con(); close(new_sock->get_fd()); @@ -622,14 +622,14 @@ bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) tcp_close(&m_pcb); if (is_listen_socket) { - tcp_accept(&m_pcb, 0); - tcp_syn_handled(&m_pcb, 0); - tcp_clone_conn(&m_pcb, 0); - tcp_accepted_pcb(&m_pcb, 0); + tcp_accept(&m_pcb, nullptr); + tcp_syn_handled(&m_pcb, nullptr); + tcp_clone_conn(&m_pcb, nullptr); + tcp_accepted_pcb(&m_pcb, nullptr); prepare_listen_to_close(); // close pending to accept sockets } else { tcp_recv(&m_pcb, sockinfo_tcp::rx_drop_lwip_cb); - tcp_sent(&m_pcb, 0); + tcp_sent(&m_pcb, nullptr); } // todo should we do this each time we get into prepare_to_close ? 
@@ -851,7 +851,7 @@ void sockinfo_tcp::put_agent_msg(void *arg) if (p_si_tcp->is_server() || get_tcp_state(&p_si_tcp->m_pcb) == LISTEN) { return; } - if (unlikely(g_p_agent == NULL)) { + if (unlikely(!g_p_agent)) { return; } @@ -904,7 +904,7 @@ static inline bool cannot_do_requested_partial_write(size_t sndbuf_available, static inline bool is_invalid_iovec(const iovec *iov, size_t sz_iov) { - return iov == nullptr || sz_iov == 0; + return !iov || sz_iov == 0; } /** @@ -926,8 +926,8 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) int ret = 0; int poll_count = 0; err_t err; - void *tx_ptr = NULL; - struct xlio_pd_key *pd_key_array = NULL; + void *tx_ptr = nullptr; + struct xlio_pd_key *pd_key_array = nullptr; /* Let allow OS to process all invalid scenarios to avoid any * inconsistencies in setting errno values @@ -958,7 +958,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) bool is_non_file_zerocopy = tx_arg.opcode != TX_FILE; pd_key_array = - (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.map : NULL); + (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.map : nullptr); si_tcp_logfunc("tx: iov=%p niovs=%zu", p_iov, sz_iov); @@ -1065,8 +1065,8 @@ ssize_t sockinfo_tcp::tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg) int poll_count = 0; uint16_t apiflags = 0; bool is_send_zerocopy = false; - void *tx_ptr = NULL; - struct xlio_pd_key *pd_key_array = NULL; + void *tx_ptr = nullptr; + struct xlio_pd_key *pd_key_array = nullptr; if (tx_arg.opcode == TX_FILE) { /* @@ -1092,7 +1092,7 @@ ssize_t sockinfo_tcp::tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg) apiflags |= XLIO_TX_PACKET_ZEROCOPY; is_send_zerocopy = tx_arg.opcode != TX_FILE; pd_key_array = - (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.map : NULL); + (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.map : nullptr); } si_tcp_logfunc("tx: iov=%p niovs=%zu", p_iov, sz_iov); @@ -1242,7 +1242,7 @@ err_t sockinfo_tcp::ip_output(struct pbuf *p, struct tcp_seg *seg, void *v_p_con dst_entry *p_dst = p_si_tcp->m_p_connected_dst_entry; int max_count = p_si_tcp->m_pcb.tso.max_send_sge; tcp_iovec lwip_iovec[max_count]; - xlio_send_attr attr = {(xlio_wr_tx_packet_attr)flags, p_si_tcp->m_pcb.mss, 0, 0}; + xlio_send_attr attr = {(xlio_wr_tx_packet_attr)flags, p_si_tcp->m_pcb.mss, 0, nullptr}; int count = 0; void *cur_end; @@ -1385,7 +1385,7 @@ err_t sockinfo_tcp::ip_output_syn_ack(struct pbuf *p, struct tcp_seg *seg, void } /* Update daemon about actual state for offloaded connection */ - if (g_p_agent != NULL && likely(p_si_tcp->m_sock_offload == TCP_SOCK_LWIP)) { + if (g_p_agent && likely(p_si_tcp->m_sock_offload == TCP_SOCK_LWIP)) { p_si_tcp->put_agent_msg((void *)p_si_tcp); } } @@ -1432,7 +1432,7 @@ void sockinfo_tcp::err_lwip_cb(void *pcb_container, err_t err) return; } - if (conn->m_parent != NULL) { + if (conn->m_parent) { // In case we got RST or abandon() before we accepted the connection conn->unlock_tcp_con(); int delete_fd = conn->m_parent->handle_child_FIN(conn); @@ -1753,7 +1753,7 @@ int sockinfo_tcp::handle_child_FIN(sockinfo_tcp *child_conn) m_received_syn_num--; m_p_socket_stats->listen_counters.n_rx_fin++; m_p_socket_stats->listen_counters.n_conn_dropped++; - child_conn->m_parent = NULL; + child_conn->m_parent = nullptr; unlock_tcp_con(); child_conn->lock_tcp_con(); child_conn->abort_connection(); @@ -1875,7 +1875,7 @@ static inline void _rx_lwip_cb_socketxtreme_helper(pbuf *p, // Is IPv4 only. 
assert(current_desc->rx.src.get_sa_family() == AF_INET); - if (buff_list_tail == nullptr) { + if (!buff_list_tail) { // New completion completion->packet.buff_lst = reinterpret_cast(p); completion->packet.total_len = p->tot_len; @@ -1930,7 +1930,7 @@ inline err_t sockinfo_tcp::handle_fin(struct tcp_pcb *pcb, err_t err) __log_dbg("[fd=%d] null pbuf sock(%p %p) err=%d", m_fd, &(m_pcb), pcb, err); tcp_shutdown_rx(); - if (m_parent != nullptr) { + if (m_parent) { // in case we got FIN before we accepted the connection /* TODO need to add some refcount inside parent in case parent and child are closed * together*/ @@ -1981,7 +1981,7 @@ inline void sockinfo_tcp::rx_lwip_process_chained_pbufs(pbuf *p) // To avoid reset ref count for first mem_buf_desc, save it and set after the while int head_ref = p_first_desc->get_ref_count(); - for (auto *p_curr_desc = p_first_desc; p_curr_desc != nullptr; + for (auto *p_curr_desc = p_first_desc; p_curr_desc; p = p->next, p_curr_desc = p_curr_desc->p_next_desc) { /* Here we reset ref count for all mem_buf_desc except for the head (p_first_desc). Chain of pbufs can contain some pbufs with ref count >=1 like in ooo or flow tag flows. @@ -2389,7 +2389,7 @@ ssize_t sockinfo_tcp::rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov void sockinfo_tcp::register_timer() { - if (m_timer_handle == NULL) { + if (!m_timer_handle) { si_tcp_logdbg("Registering TCP socket timer: socket: %p, thread-col: %p, global-col: %p", this, get_tcp_timer_collection(), g_tcp_timers_collection); @@ -2428,7 +2428,7 @@ void sockinfo_tcp::queue_rx_ctl_packet(struct tcp_pcb *pcb, mem_buf_desc_t *p_de bool sockinfo_tcp::rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void *pv_fd_ready_array) { - struct tcp_pcb *pcb = NULL; + struct tcp_pcb *pcb = nullptr; int dropped_count = 0; lock_tcp_con(); @@ -2503,7 +2503,7 @@ bool sockinfo_tcp::rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void sock->m_tcp_con_lock.unlock(); } - m_iomux_ready_fd_array = NULL; + m_iomux_ready_fd_array = nullptr; while (dropped_count--) { mem_buf_desc_t *p_rx_pkt_desc = m_rx_cb_dropped_list.get_and_pop_front(); @@ -2957,7 +2957,7 @@ int sockinfo_tcp::listen(int backlog) BULLSEYE_EXCLUDE_BLOCK_END // Add the user's orig fd to the rx epfd handle - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; ev.events = EPOLLIN; ev.data.fd = m_fd; int ret = SYSCALL(epoll_ctl, m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev); @@ -3185,7 +3185,7 @@ sockinfo_tcp *sockinfo_tcp::accept_clone() fd = socket_internal(m_family, SOCK_STREAM, 0, false, false); if (fd < 0) { m_p_socket_stats->listen_counters.n_conn_dropped++; - return 0; + return nullptr; } si = dynamic_cast(fd_collection_get_sockfd(fd)); @@ -3193,7 +3193,7 @@ sockinfo_tcp *sockinfo_tcp::accept_clone() if (!si) { si_tcp_logwarn("can not get accept socket from FD collection"); XLIO_CALL(close, fd); - return 0; + return nullptr; } // This method is called from a flow which assumes that the socket is locked @@ -3393,7 +3393,7 @@ err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t e conn->unlock_tcp_con(); /* Do this after auto_accept_connection() call */ - new_sock->m_parent = NULL; + new_sock->m_parent = nullptr; new_sock->lock_tcp_con(); @@ -3436,7 +3436,7 @@ void sockinfo_tcp::push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff) struct tcp_pcb *sockinfo_tcp::get_syn_received_pcb(const flow_tuple &key) const { - struct tcp_pcb *ret_val = NULL; + struct tcp_pcb *ret_val = nullptr; syn_received_map_t::const_iterator itr; itr = 
m_syn_received.find(key); @@ -3519,12 +3519,12 @@ err_t sockinfo_tcp::syn_received_timewait_cb(void *arg, struct tcp_pcb *newpcb) new_sock->m_b_blocking = true; /* Dump statistics of the previous incarnation of the socket. */ - print_full_stats(new_sock->m_p_socket_stats, NULL, safe_mce_sys().stats_file); + print_full_stats(new_sock->m_p_socket_stats, nullptr, safe_mce_sys().stats_file); new_sock->socket_stats_init(); /* Reset zerocopy state */ atomic_set(&new_sock->m_zckey, 0); - new_sock->m_last_zcdesc = NULL; + new_sock->m_last_zcdesc = nullptr; new_sock->m_b_zc = false; new_sock->m_state = SOCKINFO_OPENED; @@ -4027,7 +4027,7 @@ int sockinfo_tcp::shutdown(int __how) if (is_server()) { if (shut_rx) { - tcp_accept(&m_pcb, 0); + tcp_accept(&m_pcb, nullptr); tcp_syn_handled(&m_pcb, sockinfo_tcp::syn_received_drop_lwip_cb); } } else { @@ -4189,7 +4189,7 @@ int sockinfo_tcp::tcp_setsockopt(int __level, int __optname, __const void *__opt SOCKOPT_PASS_TO_OS) { if (!is_incoming() && (ret_opt == SOCKOPT_INTERNAL_XLIO_SUPPORT || ret_opt == SOCKOPT_HANDLE_BY_OS) && - m_sock_state <= TCP_SOCK_ACCEPT_READY && __optval != NULL && + m_sock_state <= TCP_SOCK_ACCEPT_READY && __optval && is_inherited_option(__level, __optname)) { socket_option_t *opt_curr = new socket_option_t(__level, __optname, __optval, __optlen); if (opt_curr) { @@ -4630,7 +4630,7 @@ int sockinfo_tcp::tcp_setsockopt(int __level, int __optname, __const void *__opt return ret; } - if (!is_incoming() && m_sock_state <= TCP_SOCK_ACCEPT_READY && __optval != NULL && + if (!is_incoming() && m_sock_state <= TCP_SOCK_ACCEPT_READY && __optval && is_inherited_option(__level, __optname)) { m_socket_options_list.push_back( new socket_option_t(__level, __optname, __optval, __optlen)); @@ -5183,8 +5183,8 @@ mem_buf_desc_t *sockinfo_tcp::get_next_desc(mem_buf_desc_t *p_desc) m_rx_pkt_ready_list.push_front(p_desc); m_n_rx_pkt_ready_list_count++; m_p_socket_stats->n_rx_ready_pkt_count++; - prev->lwip_pbuf.pbuf.next = NULL; - prev->p_next_desc = NULL; + prev->lwip_pbuf.pbuf.next = nullptr; + prev->p_next_desc = nullptr; prev->rx.n_frags = 1; reuse_buffer(prev); } else { @@ -5193,7 +5193,7 @@ mem_buf_desc_t *sockinfo_tcp::get_next_desc(mem_buf_desc_t *p_desc) if (m_n_rx_pkt_ready_list_count) { return m_rx_pkt_ready_list.front(); } else { - return NULL; + return nullptr; } } @@ -5206,7 +5206,7 @@ mem_buf_desc_t *sockinfo_tcp::get_next_desc_peek(mem_buf_desc_t *pdesc, int &rx_ pdesc = m_rx_pkt_ready_list[rx_pkt_ready_list_idx]; rx_pkt_ready_list_idx++; } else { - pdesc = NULL; + pdesc = nullptr; } return pdesc; @@ -5283,8 +5283,8 @@ int sockinfo_tcp::zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags p_desc_head->rx.n_frags = p_pkts->sz_iov; p_desc_iter->rx.src = prev->rx.src; p_desc_iter->inc_ref_count(); - prev->lwip_pbuf.pbuf.next = NULL; - prev->p_next_desc = NULL; + prev->lwip_pbuf.pbuf.next = nullptr; + prev->p_next_desc = nullptr; m_rx_pkt_ready_list.push_front(p_desc_iter); break; @@ -5542,12 +5542,12 @@ void sockinfo_tcp::socketxtreme_recv_buffs_tcp(mem_buf_desc_t *desc, uint16_t le mem_buf_desc_t *sockinfo_tcp::tcp_tx_mem_buf_alloc(pbuf_type type) { dst_entry_tcp *p_dst = (dst_entry_tcp *)(m_p_connected_dst_entry); - mem_buf_desc_t *desc = NULL; + mem_buf_desc_t *desc = nullptr; if (likely(p_dst)) { /* Currently this method is called from TLS layer without locks */ m_tcp_con_lock.lock(); - desc = p_dst->get_buffer(type, NULL); + desc = p_dst->get_buffer(type, nullptr); m_tcp_con_lock.unlock(); } return desc; @@ -5563,7 +5563,7 @@ 
struct pbuf *sockinfo_tcp::tcp_tx_pbuf_alloc(void *p_conn, pbuf_type type, pbuf_ { sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)(((struct tcp_pcb *)p_conn)->my_container); dst_entry_tcp *p_dst = (dst_entry_tcp *)(p_si_tcp->m_p_connected_dst_entry); - mem_buf_desc_t *p_desc = NULL; + mem_buf_desc_t *p_desc = nullptr; if (likely(p_dst)) { p_desc = p_dst->get_buffer(type, desc); @@ -5599,7 +5599,7 @@ void sockinfo_tcp::tcp_rx_pbuf_free(struct pbuf *p_buff) { mem_buf_desc_t *desc = (mem_buf_desc_t *)p_buff; - if (desc->p_desc_owner != NULL && p_buff->type != PBUF_ZEROCOPY) { + if (desc->p_desc_owner && p_buff->type != PBUF_ZEROCOPY) { desc->p_desc_owner->mem_buf_rx_release(desc); } else { buffer_pool::free_rx_lwip_pbuf_custom(p_buff); @@ -5625,7 +5625,7 @@ void sockinfo_tcp::tcp_tx_pbuf_free(void *p_conn, struct pbuf *p_buff) } if (p_desc->lwip_pbuf.pbuf.ref == 0) { - p_desc->p_next_desc = NULL; + p_desc->p_next_desc = nullptr; buffer_pool::free_tx_lwip_pbuf_custom(p_buff); } } @@ -5652,7 +5652,7 @@ mem_buf_desc_t *sockinfo_tcp::tcp_tx_zc_alloc(mem_buf_desc_t *p_desc) void sockinfo_tcp::tcp_tx_zc_callback(mem_buf_desc_t *p_desc) { - sockinfo_tcp *sock = NULL; + sockinfo_tcp *sock = nullptr; if (!p_desc) { return; @@ -5684,7 +5684,7 @@ void sockinfo_tcp::tcp_tx_zc_handle(mem_buf_desc_t *p_desc) uint32_t lo, hi; uint16_t count; uint32_t prev_lo, prev_hi; - mem_buf_desc_t *err_queue = NULL; + mem_buf_desc_t *err_queue = nullptr; sockinfo_tcp *sock = this; count = p_desc->tx.zc.count; @@ -5714,7 +5714,7 @@ void sockinfo_tcp::tcp_tx_zc_handle(mem_buf_desc_t *p_desc) err_queue->ee.ee_data = hi; } } else if ((sum_count >= (1ULL << 32)) || (lo != prev_hi + 1)) { - err_queue = NULL; + err_queue = nullptr; } else { err_queue->ee.ee_data += count; } @@ -5816,7 +5816,7 @@ tcp_timers_collection::tcp_timers_collection(int period, int resolution) m_n_period = period; m_n_resolution = resolution; m_n_intervals_size = period / resolution; - m_timer_handle = NULL; + m_timer_handle = nullptr; m_p_intervals = new timer_node_t *[m_n_intervals_size]; BULLSEYE_EXCLUDE_BLOCK_START if (!m_p_intervals) { @@ -5861,7 +5861,7 @@ void tcp_timers_collection::clean_obj() } set_cleaned(); - m_timer_handle = NULL; + m_timer_handle = nullptr; event_handler_manager *p_event_mgr = get_event_mgr(); if (p_event_mgr->is_running()) { @@ -5903,7 +5903,7 @@ void tcp_timers_collection::handle_timer_expired(void *user_data) } /* Processing all messages for the daemon */ - if (g_p_agent != NULL) { + if (g_p_agent) { g_p_agent->progress(); } } @@ -5914,9 +5914,9 @@ void tcp_timers_collection::add_new_timer(timer_node_t *node, timer_handler *han node->handler = handler; node->user_data = user_data; node->group = this; - node->next = NULL; - node->prev = NULL; - if (m_p_intervals[m_n_next_insert_bucket] != NULL) { + node->next = nullptr; + node->prev = nullptr; + if (m_p_intervals[m_n_next_insert_bucket]) { m_p_intervals[m_n_next_insert_bucket]->prev = node; node->next = m_p_intervals[m_n_next_insert_bucket]; } @@ -5925,7 +5925,7 @@ void tcp_timers_collection::add_new_timer(timer_node_t *node, timer_handler *han if (m_n_count == 0) { m_timer_handle = - get_event_mgr()->register_timer_event(m_n_resolution, this, PERIODIC_TIMER, NULL); + get_event_mgr()->register_timer_event(m_n_resolution, this, PERIODIC_TIMER, nullptr); } m_n_count++; @@ -5938,7 +5938,7 @@ void tcp_timers_collection::remove_timer(timer_node_t *node) return; } - node->group = NULL; + node->group = nullptr; if (node->prev) { node->prev->next = node->next; @@ -5959,7 +5959,7 @@ 
void tcp_timers_collection::remove_timer(timer_node_t *node) if (m_n_count == 0) { if (m_timer_handle) { get_event_mgr()->unregister_timer_event(this, m_timer_handle); - m_timer_handle = NULL; + m_timer_handle = nullptr; } } @@ -6016,7 +6016,7 @@ bool sockinfo_tcp::is_utls_supported(int direction) const int sockinfo_tcp::get_supported_nvme_feature_mask() const { ring *p_ring = get_tx_ring(); - if (p_ring == nullptr) { + if (!p_ring) { return false; } return p_ring->get_supported_nvme_feature_mask(); diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 32b2b947f..86093c9a7 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -204,8 +204,8 @@ class sockinfo_tcp : public sockinfo, public timer_handler { ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; ssize_t tcp_tx(xlio_tx_call_attr_t &tx_arg); ssize_t rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, int *p_flags, - sockaddr *__from = NULL, socklen_t *__fromlen = NULL, - struct msghdr *__msg = NULL) override; + sockaddr *__from = nullptr, socklen_t *__fromlen = nullptr, + struct msghdr *__msg = nullptr) override; static err_t ip_output(struct pbuf *p, struct tcp_seg *seg, void *v_p_conn, uint16_t flags); static err_t ip_output_syn_ack(struct pbuf *p, struct tcp_seg *seg, void *v_p_conn, uint16_t flags); @@ -234,7 +234,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { static void tcp_tx_zc_callback(mem_buf_desc_t *p_desc); void tcp_tx_zc_handle(mem_buf_desc_t *p_desc); - bool inline is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL) override; + bool inline is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = nullptr) override; bool inline is_writeable() override; bool inline is_errorable(int *errors) override; bool is_closable() override diff --git a/src/core/sock/sockinfo_udp.cpp b/src/core/sock/sockinfo_udp.cpp index 402e4e02d..efdfd8835 100644 --- a/src/core/sock/sockinfo_udp.cpp +++ b/src/core/sock/sockinfo_udp.cpp @@ -193,7 +193,7 @@ inline int sockinfo_udp::rx_wait(bool blocking) //(can happen if another thread was polling & processing the wce) // and update is_sleeping flag under the same lock to synchronize between // this code and wakeup mechanism. 
- if (is_readable(NULL)) { + if (is_readable(nullptr)) { return 0; } } @@ -246,7 +246,7 @@ inline int sockinfo_udp::rx_wait(bool blocking) * This is the classical case of wakeup, but we don't want to * waist time on removing wakeup fd, it will be done next time */ - if (is_readable(NULL)) { + if (is_readable(nullptr)) { return 0; } @@ -383,7 +383,7 @@ sockinfo_udp::sockinfo_udp(int fd, int domain) , m_mc_num_grp_with_src_filter(0) , m_port_map_lock("sockinfo_udp::m_ports_map_lock") , m_port_map_index(0) - , m_p_last_dst_entry(NULL) + , m_p_last_dst_entry(nullptr) , m_tos(0) , m_n_sysvar_rx_poll_yield_loops(safe_mce_sys().rx_poll_yield_loops) , m_n_sysvar_rx_udp_poll_os_ratio(safe_mce_sys().rx_udp_poll_os_ratio) @@ -416,7 +416,7 @@ sockinfo_udp::sockinfo_udp(int fd, int domain) si_udp_logdbg("Sockets RCVBUF = %d bytes", n_so_rcvbuf_bytes); rx_ready_byte_count_limit_update(n_so_rcvbuf_bytes); - epoll_event ev = {0, {0}}; + epoll_event ev = {0, {nullptr}}; ev.events = EPOLLIN; @@ -609,7 +609,7 @@ int sockinfo_udp::connect(const struct sockaddr *__to, socklen_t __tolen) // Create the new dst_entry, delete if one already exists if (m_p_connected_dst_entry) { delete m_p_connected_dst_entry; - m_p_connected_dst_entry = NULL; + m_p_connected_dst_entry = nullptr; } if (dst_ipaddr.is_mc(m_family)) { @@ -663,7 +663,7 @@ int sockinfo_udp::getsockname(struct sockaddr *__name, socklen_t *__namelen) int sockinfo_udp::on_sockname_change(struct sockaddr *__name, socklen_t __namelen) { BULLSEYE_EXCLUDE_BLOCK_START - if (__name == NULL) { + if (!__name) { si_udp_logerr("invalid NULL __name"); errno = EFAULT; return -1; @@ -994,7 +994,7 @@ int sockinfo_udp::setsockopt(int __level, int __optname, __const void *__optval, break; } - if (NULL == __optval) { + if (!__optval) { si_udp_logdbg("IPPROTO_IP, %s; Bad optval! 
calling OS setsockopt()", setsockopt_ip_opt_to_str(__optname)); break; @@ -1412,7 +1412,7 @@ int sockinfo_udp::setsockopt(int __level, int __optname, __const void *__optval, return 0; } break; case IPV6_RECVPKTINFO: - m_b_pktinfo = __optval != nullptr && *(int *)__optval != 0; + m_b_pktinfo = __optval && *(int *)__optval != 0; break; } break; // case IPPROTO_IPV6 @@ -2041,7 +2041,7 @@ ssize_t sockinfo_udp::tx(xlio_tx_call_attr_t &tx_arg) si_udp_logdbg("MSG_OOB not supported in UDP (tx-ing to os)"); goto tx_packet_to_os; } - if (__dst != NULL) { + if (__dst) { sock_addr dst(__dst, __dstlen); if (!validate_and_convert_mapped_ipv4(dst)) { si_udp_logdbg("Mapped IPv4 on IPv6-Only socket"); @@ -2130,7 +2130,7 @@ ssize_t sockinfo_udp::tx(xlio_tx_call_attr_t &tx_arg) } { - xlio_send_attr attr = {(xlio_wr_tx_packet_attr)0, 0, 0, 0}; + xlio_send_attr attr = {(xlio_wr_tx_packet_attr)0, 0, 0, nullptr}; bool b_blocking = m_b_blocking; if (unlikely(__flags & MSG_DONTWAIT)) { b_blocking = false; @@ -3145,7 +3145,7 @@ timestamps_t *sockinfo_udp::get_socket_timestamps() { if (unlikely(m_rx_pkt_ready_list.empty())) { si_udp_logdbg("m_rx_pkt_ready_list empty"); - return NULL; + return nullptr; } return &m_rx_pkt_ready_list.front()->rx.timestamps; } diff --git a/src/core/sock/sockinfo_udp.h b/src/core/sock/sockinfo_udp.h index 603f916fe..ad22b23a9 100644 --- a/src/core/sock/sockinfo_udp.h +++ b/src/core/sock/sockinfo_udp.h @@ -120,8 +120,8 @@ class sockinfo_udp : public sockinfo { * we have one (if sockinfo::m_b_blocking == true) */ ssize_t rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, int *p_flags, - sockaddr *__from = NULL, socklen_t *__fromlen = NULL, - struct msghdr *__msg = NULL) override; + sockaddr *__from = nullptr, socklen_t *__fromlen = nullptr, + struct msghdr *__msg = nullptr) override; /** * Check that a call to this sockinfo rx() will not block * -> meaning, we got an offloaded ready rx datagram @@ -129,7 +129,7 @@ class sockinfo_udp : public sockinfo { * * While polling CQ, the fd_array is filled with a list of newly queued packets FD's */ - bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL) override; + bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = nullptr) override; /** * Arm the event channel(s) assosiated with this sockinfo * Fill the fd_set (p_rxfds) with the correct fd channel values and the p_nfds with the (max_fd diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp index 71f088df3..ff8384bc9 100644 --- a/src/core/sock/sockinfo_ulp.cpp +++ b/src/core/sock/sockinfo_ulp.cpp @@ -200,7 +200,7 @@ class tls_record : public mem_desc { m_record_number = record_number; m_size = TLS_RECORD_HDR_LEN + TLS_RECORD_TAG_LEN; m_p_data = nullptr; - tls_sock->get_record_buf(m_p_buf, m_p_data, zc_owner != nullptr); + tls_sock->get_record_buf(m_p_buf, m_p_data, zc_owner); if (likely(m_p_buf && m_p_data)) { if (iv) { m_size += TLS_RECORD_IV_LEN; @@ -884,7 +884,7 @@ int sockinfo_tcp_ops_tls::postrouting(struct pbuf *p, struct tcp_seg *seg, xlio_ uint64_t recno_be64 = htobe64(rec->m_record_number); bool skip_static = !memcmp(m_tls_info_tx.rec_seq, &recno_be64, TLS_AES_GCM_REC_SEQ_LEN); - bool is_zerocopy = rec->m_p_zc_owner != nullptr; + bool is_zerocopy = rec->m_p_zc_owner; unsigned mss = m_p_sock->get_mss(); uint32_t totlen = seg->seqno - rec->m_seqno; uint32_t lkey = LKEY_TX_DEFAULT; @@ -1127,7 +1127,7 @@ int sockinfo_tcp_ops_tls::tls_rx_decrypt(struct pbuf *plist) copy_by_offset(&buf[TLS_AES_GCM_SALT_LEN], m_rx_offset + 
TLS_RECORD_HDR_LEN, TLS_RECORD_IV_LEN); } - ret = g_tls_api->EVP_DecryptInit_ex(tls_ctx, (EVP_CIPHER *)m_p_evp_cipher, NULL, + ret = g_tls_api->EVP_DecryptInit_ex(tls_ctx, (EVP_CIPHER *)m_p_evp_cipher, nullptr, m_tls_info_rx.key, buf); if (unlikely(!ret)) { return TLS_DECRYPT_INTERNAL; @@ -1146,20 +1146,20 @@ int sockinfo_tcp_ops_tls::tls_rx_decrypt(struct pbuf *plist) copy_by_offset(buf, m_rx_offset, 3); buf[3] = rec_len >> 8U; buf[4] = rec_len & 0xFFU; - ret = g_tls_api->EVP_DecryptUpdate(tls_ctx, NULL, &len, buf, 5); + ret = g_tls_api->EVP_DecryptUpdate(tls_ctx, nullptr, &len, buf, 5); } else { uint16_t rec_len = m_rx_rec_len - m_tls_rec_overhead; *((uint64_t *)buf) = htobe64(m_next_recno_rx); copy_by_offset(buf + 8, m_rx_offset, 3); buf[11] = rec_len >> 8U; buf[12] = rec_len & 0xFFU; - ret = g_tls_api->EVP_DecryptUpdate(tls_ctx, NULL, &len, buf, 13); + ret = g_tls_api->EVP_DecryptUpdate(tls_ctx, nullptr, &len, buf, 13); } if (unlikely(!ret)) { return TLS_DECRYPT_INTERNAL; } - for (p = plist; p != NULL; p = p->next) { + for (p = plist; p; p = p->next) { if (((mem_buf_desc_t *)p)->rx.tls_decrypted == TLS_RX_DECRYPTED) { /* * This is partially decrypted record, stop here @@ -1210,7 +1210,7 @@ int sockinfo_tcp_ops_tls::tls_rx_encrypt(struct pbuf *plist) copy_by_offset(&buf[TLS_AES_GCM_SALT_LEN], m_rx_offset + TLS_RECORD_HDR_LEN, TLS_RECORD_IV_LEN); } - ret = g_tls_api->EVP_EncryptInit_ex(tls_ctx, (EVP_CIPHER *)m_p_evp_cipher, NULL, + ret = g_tls_api->EVP_EncryptInit_ex(tls_ctx, (EVP_CIPHER *)m_p_evp_cipher, nullptr, m_tls_info_rx.key, buf); if (unlikely(!ret)) { return TLS_DECRYPT_INTERNAL; @@ -1228,13 +1228,13 @@ int sockinfo_tcp_ops_tls::tls_rx_encrypt(struct pbuf *plist) copy_by_offset(buf, m_rx_offset, 3); buf[3] = rec_len >> 8U; buf[4] = rec_len & 0xFFU; - ret = g_tls_api->EVP_EncryptUpdate(tls_ctx, NULL, &len, buf, 5); + ret = g_tls_api->EVP_EncryptUpdate(tls_ctx, nullptr, &len, buf, 5); } else { *((uint64_t *)buf) = htobe64(m_next_recno_rx); copy_by_offset(buf + 8, m_rx_offset, 3); buf[11] = rec_len >> 8U; buf[12] = rec_len & 0xFFU; - ret = g_tls_api->EVP_EncryptUpdate(tls_ctx, NULL, &len, buf, 13); + ret = g_tls_api->EVP_EncryptUpdate(tls_ctx, nullptr, &len, buf, 13); } if (unlikely(!ret)) { return TLS_DECRYPT_INTERNAL; diff --git a/src/core/sock/tcp_seg_pool.cpp b/src/core/sock/tcp_seg_pool.cpp index 396bd6172..05c54cfd8 100644 --- a/src/core/sock/tcp_seg_pool.cpp +++ b/src/core/sock/tcp_seg_pool.cpp @@ -38,7 +38,7 @@ extern global_stats_t g_global_stat_static; -tcp_seg_pool *g_tcp_seg_pool = NULL; +tcp_seg_pool *g_tcp_seg_pool = nullptr; tcp_seg_pool::tcp_seg_pool() : m_p_head(nullptr) @@ -69,7 +69,7 @@ std::pair tcp_seg_pool::get_tcp_seg_list(uint32_t amount) repeat: count = amount; head = next = m_p_head; - prev = NULL; + prev = nullptr; while (count > 0 && next) { prev = next; next = next->next; @@ -84,7 +84,7 @@ std::pair tcp_seg_pool::get_tcp_seg_list(uint32_t amount) unlock(); return std::make_pair(nullptr, nullptr); } - prev->next = NULL; + prev->next = nullptr; m_p_head = next; m_stats.allocations++; g_global_stat_static.n_tcp_seg_pool_size -= amount; diff --git a/src/core/util/agent.cpp b/src/core/util/agent.cpp index 5f043a50f..0cc47f493 100644 --- a/src/core/util/agent.cpp +++ b/src/core/util/agent.cpp @@ -91,7 +91,7 @@ vlog_printf(_level, "*************************************************************\n"); \ } while (0) -agent *g_p_agent = NULL; +agent *g_p_agent = nullptr; agent::agent() : m_state(AGENT_CLOSED) @@ -100,7 +100,7 @@ agent::agent() , 
m_msg_num(AGENT_DEFAULT_MSG_NUM) { int rc = 0; - agent_msg_t *msg = NULL; + agent_msg_t *msg = nullptr; int i = 0; INIT_LIST_HEAD(&m_cb_queue); @@ -114,7 +114,7 @@ agent::agent() while (i--) { /* coverity[overwrite_var] */ msg = (agent_msg_t *)calloc(1, sizeof(*msg)); - if (NULL == msg) { + if (!msg) { rc = -ENOMEM; __log_dbg("failed queue creation (rc = %d)", rc); goto err; @@ -214,8 +214,8 @@ agent::agent() agent::~agent() { - agent_msg_t *msg = NULL; - agent_callback_t *cb = NULL; + agent_msg_t *msg = nullptr; + agent_callback_t *cb = nullptr; if (AGENT_CLOSED == m_state) { return; @@ -260,14 +260,14 @@ agent::~agent() void agent::register_cb(agent_cb_t fn, void *arg) { - agent_callback_t *cb = NULL; - struct list_head *entry = NULL; + agent_callback_t *cb = nullptr; + struct list_head *entry = nullptr; if (AGENT_CLOSED == m_state) { return; } - if (NULL == fn) { + if (!fn) { return; } @@ -294,8 +294,8 @@ void agent::register_cb(agent_cb_t fn, void *arg) void agent::unregister_cb(agent_cb_t fn, void *arg) { - agent_callback_t *cb = NULL; - struct list_head *entry = NULL; + agent_callback_t *cb = nullptr; + struct list_head *entry = nullptr; if (AGENT_CLOSED == m_state) { return; @@ -318,7 +318,7 @@ void agent::unregister_cb(agent_cb_t fn, void *arg) int agent::put(const void *data, size_t length, intptr_t tag) { - agent_msg_t *msg = NULL; + agent_msg_t *msg = nullptr; int i = 0; if (AGENT_CLOSED == m_state) { @@ -345,7 +345,7 @@ int agent::put(const void *data, size_t length, intptr_t tag) for (i = 0; i < AGENT_DEFAULT_MSG_GROW; i++) { /* coverity[overwrite_var] */ msg = (agent_msg_t *)malloc(sizeof(*msg)); - if (NULL == msg) { + if (!msg) { break; } msg->length = 0; @@ -377,7 +377,7 @@ int agent::put(const void *data, size_t length, intptr_t tag) void agent::progress(void) { - agent_msg_t *msg = NULL; + agent_msg_t *msg = nullptr; struct timeval tv_now = TIMEVAL_INITIALIZER; static struct timeval tv_inactive_elapsed = TIMEVAL_INITIALIZER; static struct timeval tv_alive_elapsed = TIMEVAL_INITIALIZER; @@ -431,8 +431,8 @@ void agent::progress(void) void agent::progress_cb(void) { - agent_callback_t *cb = NULL; - struct list_head *entry = NULL; + agent_callback_t *cb = nullptr; + struct list_head *entry = nullptr; m_cb_lock.lock(); list_for_each(entry, &m_cb_queue) @@ -455,7 +455,7 @@ int agent::send(agent_msg_t *msg) return -EBADF; } - if (NULL == msg) { + if (!msg) { return -EINVAL; } diff --git a/src/core/util/hugepage_mgr.cpp b/src/core/util/hugepage_mgr.cpp index 73d84f817..60d1768b2 100644 --- a/src/core/util/hugepage_mgr.cpp +++ b/src/core/util/hugepage_mgr.cpp @@ -95,7 +95,7 @@ void *hugepage_mgr::alloc_hugepages_helper(size_t &size, size_t hugepage) map_flags = (int)log2(hugepage) << MAP_HUGE_SHIFT; } - ptr = mmap(NULL, actual_size, PROT_READ | PROT_WRITE, + ptr = mmap(nullptr, actual_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB | map_flags, -1, 0); if (ptr == MAP_FAILED) { ptr = nullptr; diff --git a/src/core/util/match.cpp b/src/core/util/match.cpp index 3f8f47c80..c17804428 100644 --- a/src/core/util/match.cpp +++ b/src/core/util/match.cpp @@ -85,8 +85,8 @@ static void free_dbl_lst(struct dbl_lst *dbl_lst) free(node); node = tmp; } - dbl_lst->head = NULL; - dbl_lst->tail = NULL; + dbl_lst->head = nullptr; + dbl_lst->tail = nullptr; } static void free_instance_content(struct instance *instance) @@ -124,8 +124,8 @@ void __xlio_free_resources(void) free(node); node = tmp; } - __instance_list.head = NULL; - __instance_list.tail = NULL; + 
__instance_list.head = nullptr; + __instance_list.tail = nullptr; } void get_address_port_rule_str(char *addr_buf, char *ports_buf, struct address_port_rule *rule) @@ -276,7 +276,7 @@ static inline int match_ipv4_addr(struct address_port_rule *rule, const struct s static int match_ip_addr_and_port(transport_t my_transport, struct use_family_rule *rule, const struct sockaddr *addr_in_first, const socklen_t addrlen_first, - const struct sockaddr *addr_in_second = NULL, + const struct sockaddr *addr_in_second = nullptr, const socklen_t addrlen_second = 0) { const struct sockaddr_in *sin_first = (const struct sockaddr_in *)addr_in_first; @@ -308,7 +308,7 @@ static int match_ip_addr_and_port(transport_t my_transport, struct use_family_ru MAX_ADDR_STR_LEN); port_first = ntohs(sin_first->sin_port); } - if (addr_str_first == NULL) { + if (!addr_str_first) { addr_str_first = "INVALID_ADDR"; } @@ -322,7 +322,7 @@ static int match_ip_addr_and_port(transport_t my_transport, struct use_family_ru addr_buf_second, MAX_ADDR_STR_LEN); port_second = ntohs(sin_second->sin_port); } - if (addr_str_second == NULL) { + if (!addr_str_second) { addr_str_second = "INVALID_ADDR"; } @@ -350,7 +350,7 @@ static int match_ip_addr_and_port(transport_t my_transport, struct use_family_ru } if (match && rule->first.match_by_addr) { - if (__xlio_sockaddr_to_xlio(addr_in_first, addrlen_first, &tmp_sin_first, NULL) || + if (__xlio_sockaddr_to_xlio(addr_in_first, addrlen_first, &tmp_sin_first, nullptr) || match_ipv4_addr(&(rule->first), &tmp_sin_first)) { match_logdbg("NEGATIVE MATCH by address"); match = 0; @@ -372,7 +372,7 @@ static int match_ip_addr_and_port(transport_t my_transport, struct use_family_ru } if (match && rule->second.match_by_addr) { - if (__xlio_sockaddr_to_xlio(addr_in_second, addrlen_second, &tmp_sin_second, NULL) || + if (__xlio_sockaddr_to_xlio(addr_in_second, addrlen_second, &tmp_sin_second, nullptr) || match_ipv4_addr(&(rule->second), &tmp_sin_second)) { match_logdbg("NEGATIVE MATCH by address"); match = 0; @@ -425,12 +425,12 @@ static transport_t get_family_by_first_matching_rule(transport_t my_transport, struct dbl_lst rules_lst, const struct sockaddr *sin_first, const socklen_t addrlen_first, - const struct sockaddr *sin_second = NULL, + const struct sockaddr *sin_second = nullptr, const socklen_t addrlen_second = 0) { struct dbl_lst_node *node; - for (node = rules_lst.head; node != NULL; node = node->next) { + for (node = rules_lst.head; node; node = node->next) { /* first rule wins */ struct use_family_rule *rule = (struct use_family_rule *)node->data; if (rule) { @@ -447,7 +447,7 @@ static transport_t get_family_by_first_matching_rule(transport_t my_transport, static transport_t get_family_by_instance_first_matching_rule( transport_t my_transport, role_t role, const char *app_id, const struct sockaddr *sin_first, - const socklen_t addrlen_first, const struct sockaddr *sin_second = NULL, + const socklen_t addrlen_first, const struct sockaddr *sin_second = nullptr, const socklen_t addrlen_second = 0) { transport_t target_family = TRANS_DEFAULT; @@ -593,8 +593,7 @@ static transport_t match_by_all_rules_program(in_protocol_t my_protocol, struct struct dbl_lst_node *node; struct use_family_rule *rule; - for (node = rules_lst.head; (node != NULL) && (target_family == TRANS_DEFAULT); - node = node->next) { + for (node = rules_lst.head; (node) && (target_family == TRANS_DEFAULT); node = node->next) { /* * to declare a dont care we either have a dont care address and port * or the previous non global rules 
use the same target family as the @@ -771,7 +770,7 @@ int __xlio_sockaddr_to_xlio(const struct sockaddr *addr_in, socklen_t addrlen, addr_out->sin_port = sin6->sin6_port; if (inet_ntop(addr_out->sin_family, (void *)&(addr_out->sin_addr), buf, MAX_ADDR_STR_LEN) == - NULL) { + nullptr) { match_logdbg("__xlio_sockaddr_to_xlio: Converted IPv4 address is illegal"); } else { match_logdbg("__xlio_sockaddr_to_xlio: Converted IPv4 is:%s", buf); diff --git a/src/core/util/sg_array.h b/src/core/util/sg_array.h index 8f500c89c..b4210c9ee 100644 --- a/src/core/util/sg_array.h +++ b/src/core/util/sg_array.h @@ -84,14 +84,14 @@ class sg_array { uint8_t *old_p = (uint8_t *)m_sg[m_index].addr + m_pos; m_pos += *get_len; if (unlikely(m_pos < 0)) { - return NULL; + return nullptr; } return old_p; } else { *get_len = m_current->length - m_pos; if (unlikely(m_pos < 0)) { - return NULL; + return nullptr; } uint8_t *old_p = (uint8_t *)m_sg[m_index++].addr + m_pos; // moving to next sge @@ -99,13 +99,13 @@ class sg_array { return old_p; } } - return NULL; + return nullptr; } inline int get_num_sge(void) { return m_sg ? m_num_sge : -1; } inline int length(void) { - if (unlikely(m_sg == NULL || m_num_sge == 0)) { + if (unlikely(!m_sg || m_num_sge == 0)) { return 0; } for (int i = 0; i < m_num_sge; i++) { diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index b6f8dfdea..28540bea5 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -105,13 +105,13 @@ typedef struct { const char **input_names; } xlio_spec_names; -static const char *names_none[] = {"none", NULL}; -static const char *spec_names_ulatency[] = {"ultra-latency", NULL}; -static const char *spec_names_latency[] = {"latency", NULL}; -static const char *spec_names_multi_ring[] = {"multi_ring_latency", NULL}; -static const char *spec_names_nginx[] = {"nginx", NULL}; -static const char *spec_names_nginx_dpu[] = {"nginx_dpu", NULL}; -static const char *spec_names_nvme_bf2[] = {"nvme_bf2", NULL}; +static const char *names_none[] = {"none", nullptr}; +static const char *spec_names_ulatency[] = {"ultra-latency", nullptr}; +static const char *spec_names_latency[] = {"latency", nullptr}; +static const char *spec_names_multi_ring[] = {"multi_ring_latency", nullptr}; +static const char *spec_names_nginx[] = {"nginx", nullptr}; +static const char *spec_names_nginx_dpu[] = {"nginx_dpu", nullptr}; +static const char *spec_names_nvme_bf2[] = {"nvme_bf2", nullptr}; // must be by order because "to_str" relies on that! static const xlio_spec_names specs[] = { @@ -277,7 +277,7 @@ const char *to_str(MODE option, const OPT (&options)[N]) } } - return NULL; + return nullptr; } } // namespace option_x @@ -368,14 +368,14 @@ int mce_sys_var::list_to_cpuset(char *cpulist, cpu_set_t *cpu_set) * Here we assume that if we get a second subtoken * then we must be processing a range. 
*/ - subtoken = strtok_r(NULL, dash, &dash_saveptr); + subtoken = strtok_r(nullptr, dash, &dash_saveptr); if (subtoken) { errno = 0; range_end = strtol(subtoken, &endptr, 10); if ((!range_end && *endptr) || errno) { return -1; } - subtoken = NULL; + subtoken = nullptr; } else { range_end = range_start; } @@ -389,7 +389,7 @@ int mce_sys_var::list_to_cpuset(char *cpulist, cpu_set_t *cpu_set) } } - token = strtok_r(NULL, comma, &comma_saveptr); + token = strtok_r(nullptr, comma, &comma_saveptr); } return 0; @@ -419,7 +419,7 @@ int mce_sys_var::hex_to_cpuset(char *start, cpu_set_t *cpu_set) return -1; } - digit = strtol(hexc, NULL, 16); + digit = strtol(hexc, nullptr, 16); /* * Each hex digit is 4 bits. For each bit set per @@ -487,9 +487,9 @@ void mce_sys_var::read_env_variable_with_pid(char *mce_sys_name, size_t mce_sys_ char *env_ptr) { int n = -1; - char *d_pos = NULL; + char *d_pos = nullptr; - if (NULL == env_ptr || NULL == mce_sys_name || mce_sys_max_size < 2) { + if (!env_ptr || !mce_sys_name || mce_sys_max_size < 2) { return; } @@ -578,7 +578,7 @@ const char *mce_sys_var::cpuid_hv_vendor() static __thread char vendor[13] = {0}; if (!cpuid_hv()) { - return NULL; + return nullptr; } #if defined(__x86_64__) uint32_t _ebx = 0, _ecx = 0, _edx = 0; @@ -593,7 +593,7 @@ const char *mce_sys_var::cpuid_hv_vendor() void mce_sys_var::read_hv() { - const char *hyper_vendor_id = NULL; + const char *hyper_vendor_id = nullptr; hypervisor = mce_sys_var::HYPER_NONE; hyper_vendor_id = cpuid_hv_vendor(); @@ -704,7 +704,7 @@ void mce_sys_var::get_env_params() { int c = 0, len = 0; char *env_ptr; - FILE *fp = NULL; + FILE *fp = nullptr; int app_name_size = MAX_CMD_LINE; // Large buffer size to avoid need for realloc @@ -905,7 +905,7 @@ void mce_sys_var::get_env_params() /* Configure enable_socketxtreme as first because * this mode has some special predefined parameter limitations */ - if ((env_ptr = getenv(SYS_VAR_SOCKETXTREME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SOCKETXTREME))) { enable_socketxtreme = atoi(env_ptr) ? true : false; } if (enable_socketxtreme) { @@ -914,7 +914,7 @@ void mce_sys_var::get_env_params() progress_engine_interval_msec = MCE_CQ_DRAIN_INTERVAL_DISABLED; } - if ((env_ptr = getenv(SYS_VAR_STRQ)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_STRQ))) { enable_strq_env = option_3::from_str(env_ptr, MCE_DEFAULT_STRQ); } @@ -927,7 +927,7 @@ void mce_sys_var::get_env_params() qp_compensation_level = MCE_DEFAULT_STRQ_COMPENSATION_LEVEL; } - if ((env_ptr = getenv(SYS_VAR_SPEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SPEC))) { mce_spec = (uint32_t)xlio_spec::from_str(env_ptr, MCE_SPEC_NONE); } @@ -936,7 +936,7 @@ void mce_sys_var::get_env_params() * based on number of workers or application type further. */ #if defined(DEFINED_NGINX) - if ((env_ptr = getenv(SYS_VAR_NGINX_WORKERS_NUM)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NGINX_WORKERS_NUM))) { app.workers_num = (uint32_t)atoi(env_ptr); if (app.workers_num > 0) { app.type = APP_NGINX; @@ -1185,31 +1185,31 @@ void mce_sys_var::get_env_params() break; } - if ((env_ptr = getenv(SYS_VAR_PRINT_REPORT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_PRINT_REPORT))) { print_report = atoi(env_ptr) ? 
true : false; } - if ((env_ptr = getenv(SYS_VAR_LOG_FILENAME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_LOG_FILENAME))) { read_env_variable_with_pid(log_filename, sizeof(log_filename), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_STATS_FILENAME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_STATS_FILENAME))) { read_env_variable_with_pid(stats_filename, sizeof(stats_filename), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_STATS_SHMEM_DIRNAME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_STATS_SHMEM_DIRNAME))) { read_env_variable_with_pid(stats_shmem_dirname, sizeof(stats_shmem_dirname), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_CONF_FILENAME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CONF_FILENAME))) { read_env_variable_with_pid(conf_filename, sizeof(conf_filename), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_SERVICE_DIR)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SERVICE_DIR))) { read_env_variable_with_pid(service_notify_dir, sizeof(service_notify_dir), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_SERVICE_ENABLE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SERVICE_ENABLE))) { service_enable = atoi(env_ptr) ? true : false; } if (HYPER_MSHV == hypervisor && !service_enable) { @@ -1218,7 +1218,7 @@ void mce_sys_var::get_env_params() SYS_VAR_SERVICE_ENABLE); } - if ((env_ptr = getenv(SYS_VAR_LOG_LEVEL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_LOG_LEVEL))) { log_level = log_level::from_str(env_ptr, VLOG_DEFAULT); } @@ -1226,27 +1226,27 @@ void mce_sys_var::get_env_params() log_details = 2; } - if ((env_ptr = getenv(SYS_VAR_LOG_DETAILS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_LOG_DETAILS))) { log_details = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_LOG_COLORS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_LOG_COLORS))) { log_colors = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_APPLICATION_ID)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_APPLICATION_ID))) { read_env_variable_with_pid(app_id, sizeof(app_id), env_ptr); } - if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGINTR)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGINTR))) { handle_sigintr = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGSEGV)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGSEGV))) { handle_segfault = atoi(env_ptr) ? 
true : false; } - if ((env_ptr = getenv(SYS_VAR_STATS_FD_NUM)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_STATS_FD_NUM))) { stats_fd_num_max = (uint32_t)atoi(env_ptr); if (stats_fd_num_max > MAX_STATS_FD_NUM) { vlog_printf(VLOG_WARNING, " Can only monitor maximum %d sockets in statistics \n", @@ -1258,29 +1258,29 @@ void mce_sys_var::get_env_params() read_strq_strides_num(); read_strq_stride_size_bytes(); - if ((env_ptr = getenv(SYS_VAR_STRQ_STRIDES_NUM_BUFS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_STRQ_STRIDES_NUM_BUFS))) { strq_strides_num_bufs = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_STRQ_STRIDES_COMPENSATION_LEVEL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_STRQ_STRIDES_COMPENSATION_LEVEL))) { strq_strides_compensation_level = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_ZC_CACHE_THRESHOLD)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_ZC_CACHE_THRESHOLD))) { zc_cache_threshold = option_size::from_str(env_ptr); } bool tx_num_bufs_set = false; - if ((env_ptr = getenv(SYS_VAR_TX_NUM_BUFS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_NUM_BUFS))) { tx_num_bufs = (uint32_t)atoi(env_ptr); tx_num_bufs_set = true; } - if ((env_ptr = getenv(SYS_VAR_TX_BUF_SIZE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_BUF_SIZE))) { tx_buf_size = (uint32_t)option_size::from_str(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_ZC_TX_SIZE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_ZC_TX_SIZE))) { zc_tx_size = (uint32_t)option_size::from_str(env_ptr); if (zc_tx_size > MCE_MAX_ZC_TX_SIZE) { vlog_printf(VLOG_WARNING, @@ -1290,15 +1290,15 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_TCP_NODELAY_TRESHOLD)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_NODELAY_TRESHOLD))) { tcp_nodelay_treshold = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE))) { tx_num_wr = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE_TO_SIGNAL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE_TO_SIGNAL))) { tx_num_wr_to_signal = std::min(NUM_TX_WRE_TO_SIGNAL_MAX, std::max(1, atoi(env_ptr))); } @@ -1306,7 +1306,7 @@ void mce_sys_var::get_env_params() tx_num_wr = tx_num_wr_to_signal * 2; } - if ((env_ptr = getenv(SYS_VAR_TX_MAX_INLINE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_MAX_INLINE))) { tx_max_inline = (uint32_t)atoi(env_ptr); } if (tx_max_inline > MAX_SUPPORTED_IB_INLINE_SIZE) { @@ -1315,35 +1315,35 @@ void mce_sys_var::get_env_params() tx_max_inline = MAX_SUPPORTED_IB_INLINE_SIZE; } - if ((env_ptr = getenv(SYS_VAR_TX_MC_LOOPBACK)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_MC_LOOPBACK))) { tx_mc_loopback_default = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TX_NONBLOCKED_EAGAINS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_NONBLOCKED_EAGAINS))) { tx_nonblocked_eagains = atoi(env_ptr) ? 
true : false; } - if ((env_ptr = getenv(SYS_VAR_TX_PREFETCH_BYTES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_PREFETCH_BYTES))) { tx_prefetch_bytes = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TX_BUFS_BATCH_TCP)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_BUFS_BATCH_TCP))) { tx_bufs_batch_tcp = (uint32_t)std::max(atoi(env_ptr), 1); } - if ((env_ptr = getenv(SYS_VAR_TX_SEGS_BATCH_TCP)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_SEGS_BATCH_TCP))) { tx_segs_batch_tcp = (uint32_t)std::max(atoi(env_ptr), 1); } - if ((env_ptr = getenv(SYS_VAR_TX_SEGS_RING_BATCH_TCP)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_SEGS_RING_BATCH_TCP))) { tx_segs_ring_batch_tcp = (uint32_t)std::max(atoi(env_ptr), 1); } - if ((env_ptr = getenv(SYS_VAR_TX_SEGS_POOL_BATCH_TCP)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TX_SEGS_POOL_BATCH_TCP))) { tx_segs_pool_batch_tcp = (uint32_t)std::max(atoi(env_ptr), 1); } - if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_TX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_TX))) { ring_allocation_logic_tx = (ring_logic_t)atoi(env_ptr); if (!is_ring_logic_valid(ring_allocation_logic_tx)) { vlog_printf(VLOG_WARNING, "%s = %d is not valid, setting logic to default = %d\n", @@ -1353,7 +1353,7 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_RX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_RX))) { ring_allocation_logic_rx = (ring_logic_t)atoi(env_ptr); if (!is_ring_logic_valid(ring_allocation_logic_rx)) { vlog_printf(VLOG_WARNING, "%s = %d is not valid, setting logic to default = %d\n", @@ -1363,41 +1363,41 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_TX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_TX))) { ring_migration_ratio_tx = atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_RX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_RX))) { ring_migration_ratio_rx = atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_RING_LIMIT_PER_INTERFACE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_LIMIT_PER_INTERFACE))) { ring_limit_per_interface = std::max(0, atoi(env_ptr)); } - if ((env_ptr = getenv(SYS_VAR_RING_DEV_MEM_TX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RING_DEV_MEM_TX))) { ring_dev_mem_tx = std::max(0, atoi(env_ptr)); } - if ((env_ptr = getenv(SYS_VAR_TCP_MAX_SYN_RATE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_MAX_SYN_RATE))) { tcp_max_syn_rate = std::min(TCP_MAX_SYN_RATE_TOP_LIMIT, std::max(0, atoi(env_ptr))); } bool rx_num_bufs_set = false; - if ((env_ptr = getenv(SYS_VAR_RX_NUM_BUFS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_NUM_BUFS))) { rx_num_bufs = (uint32_t)atoi(env_ptr); rx_num_bufs_set = true; } - if ((env_ptr = getenv(SYS_VAR_RX_BUF_SIZE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_BUF_SIZE))) { rx_buf_size = (uint32_t)option_size::from_str(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE_TO_POST_RECV)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE_TO_POST_RECV))) { rx_num_wr_to_post_recv = std::min(NUM_RX_WRE_TO_POST_RECV_MAX, std::max(1, atoi(env_ptr))); } - if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE))) { rx_num_wr = (uint32_t)atoi(env_ptr); } @@ -1415,7 +1415,7 @@ void mce_sys_var::get_env_params() rx_num_wr = rx_num_wr_to_post_recv * 2; } - if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS))) { rx_poll_num = 
atoi(env_ptr); } if (rx_poll_num < MCE_MIN_RX_NUM_POLLS || rx_poll_num > MCE_MAX_RX_NUM_POLLS) { @@ -1423,7 +1423,7 @@ void mce_sys_var::get_env_params() MCE_MIN_RX_NUM_POLLS, MCE_MAX_RX_NUM_POLLS, rx_poll_num); rx_poll_num = MCE_DEFAULT_RX_NUM_POLLS; } - if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS_INIT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS_INIT))) { rx_poll_num_init = atoi(env_ptr); } if (rx_poll_num_init < MCE_MIN_RX_NUM_POLLS || rx_poll_num_init > MCE_MAX_RX_NUM_POLLS) { @@ -1435,11 +1435,11 @@ void mce_sys_var::get_env_params() rx_poll_num = 1; // Force at least one good polling loop } - if ((env_ptr = getenv(SYS_VAR_RX_UDP_POLL_OS_RATIO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_UDP_POLL_OS_RATIO))) { rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_HW_TS_CONVERSION_MODE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_HW_TS_CONVERSION_MODE))) { hw_ts_conversion_mode = (ts_conversion_mode_t)atoi(env_ptr); if ((uint32_t)hw_ts_conversion_mode >= TS_CONVERSION_MODE_LAST) { vlog_printf( @@ -1452,32 +1452,32 @@ void mce_sys_var::get_env_params() } // The following 2 params were replaced by SYS_VAR_RX_UDP_POLL_OS_RATIO - if ((env_ptr = getenv(SYS_VAR_RX_POLL_OS_RATIO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_POLL_OS_RATIO))) { rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr); vlog_printf(VLOG_WARNING, "The parameter %s is no longer in use. Parameter %s was set to %d instead\n", SYS_VAR_RX_POLL_OS_RATIO, SYS_VAR_RX_UDP_POLL_OS_RATIO, rx_udp_poll_os_ratio); } - if ((env_ptr = getenv(SYS_VAR_RX_SKIP_OS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_SKIP_OS))) { rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr); vlog_printf(VLOG_WARNING, "The parameter %s is no longer in use. Parameter %s was set to %d instead\n", SYS_VAR_RX_SKIP_OS, SYS_VAR_RX_UDP_POLL_OS_RATIO, rx_udp_poll_os_ratio); } - if ((env_ptr = getenv(SYS_VAR_RX_POLL_YIELD)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_POLL_YIELD))) { rx_poll_yield_loops = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_SELECT_CPU_USAGE_STATS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SELECT_CPU_USAGE_STATS))) { select_handle_cpu_usage_stats = atoi(env_ptr) ? 
true : false; } - if ((env_ptr = getenv(SYS_VAR_RX_BYTE_MIN_LIMIT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_BYTE_MIN_LIMIT))) { rx_ready_byte_min_limit = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES))) { rx_prefetch_bytes = (uint32_t)atoi(env_ptr); } if (rx_prefetch_bytes < MCE_MIN_RX_PREFETCH_BYTES || @@ -1487,7 +1487,7 @@ void mce_sys_var::get_env_params() rx_prefetch_bytes = MCE_DEFAULT_RX_PREFETCH_BYTES; } - if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES_BEFORE_POLL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES_BEFORE_POLL))) { rx_prefetch_bytes_before_poll = (uint32_t)atoi(env_ptr); } if (rx_prefetch_bytes_before_poll != 0 && @@ -1500,34 +1500,34 @@ void mce_sys_var::get_env_params() rx_prefetch_bytes_before_poll = MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL; } - if ((env_ptr = getenv(SYS_VAR_RX_CQ_DRAIN_RATE_NSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_CQ_DRAIN_RATE_NSEC))) { rx_cq_drain_rate_nsec = atoi(env_ptr); } // Update the rx cq polling rate for draining logic tscval_t tsc_per_second = get_tsc_rate_per_second(); rx_delta_tsc_between_cq_polls = tsc_per_second * rx_cq_drain_rate_nsec / NSEC_PER_SEC; - if ((env_ptr = getenv(SYS_VAR_GRO_STREAMS_MAX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_GRO_STREAMS_MAX))) { gro_streams_max = std::max(atoi(env_ptr), 0); } - if ((env_ptr = getenv(SYS_VAR_TCP_3T_RULES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_3T_RULES))) { tcp_3t_rules = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_UDP_3T_RULES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_UDP_3T_RULES))) { udp_3t_rules = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_ETH_MC_L2_ONLY_RULES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_ETH_MC_L2_ONLY_RULES))) { eth_mc_l2_only_rules = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_DISABLE_FLOW_TAG)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_DISABLE_FLOW_TAG))) { disable_flow_tag = std::max(atoi(env_ptr), 0) ? true : false; } // mc_force_flowtag must be after disable_flow_tag - if ((env_ptr = getenv(SYS_VAR_MC_FORCE_FLOWTAG)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MC_FORCE_FLOWTAG))) { mc_force_flowtag = atoi(env_ptr) ? true : false; if (disable_flow_tag) { vlog_printf(VLOG_WARNING, "%s and %s can't be set together. 
Disabling %s\n", @@ -1537,7 +1537,7 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_SELECT_NUM_POLLS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SELECT_NUM_POLLS))) { select_poll_num = atoi(env_ptr); } @@ -1547,7 +1547,7 @@ void mce_sys_var::get_env_params() select_poll_num = MCE_DEFAULT_SELECT_NUM_POLLS; } - if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_FORCE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_FORCE))) { select_poll_os_force = (uint32_t)atoi(env_ptr); } @@ -1556,11 +1556,11 @@ void mce_sys_var::get_env_params() select_skip_os_fd_check = 1; } - if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_RATIO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_RATIO))) { select_poll_os_ratio = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_SELECT_SKIP_OS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SELECT_SKIP_OS))) { select_skip_os_fd_check = (uint32_t)atoi(env_ptr); } @@ -1568,10 +1568,10 @@ void mce_sys_var::get_env_params() if ((mce_spec != MCE_SPEC_NVME_BF2) && (rx_poll_num < 0 || select_poll_num < 0)) { cq_moderation_enable = false; } - if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_ENABLE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_ENABLE))) { cq_moderation_enable = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_COUNT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_COUNT))) { cq_moderation_count = (uint32_t)atoi(env_ptr); } @@ -1581,11 +1581,11 @@ void mce_sys_var::get_env_params() cq_moderation_count = max_cq_moderation_count; } - if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_PERIOD_USEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_PERIOD_USEC))) { cq_moderation_period_usec = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_COUNT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_COUNT))) { cq_aim_max_count = (uint32_t)atoi(env_ptr); } @@ -1595,11 +1595,11 @@ void mce_sys_var::get_env_params() cq_aim_max_count = max_cq_aim_max_count; } - if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_PERIOD_USEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_PERIOD_USEC))) { cq_aim_max_period_usec = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERVAL_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERVAL_MSEC))) { cq_aim_interval_msec = (uint32_t)atoi(env_ptr); } @@ -1607,7 +1607,7 @@ void mce_sys_var::get_env_params() cq_aim_interval_msec = MCE_CQ_ADAPTIVE_MODERATION_DISABLED; } - if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC))) { cq_aim_interrupts_rate_per_sec = (uint32_t)atoi(env_ptr); } #else @@ -1641,7 +1641,7 @@ void mce_sys_var::get_env_params() } #endif /* DEFINED_IBV_CQ_ATTR_MODERATE */ - if ((env_ptr = getenv(SYS_VAR_CQ_POLL_BATCH_MAX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_POLL_BATCH_MAX))) { cq_poll_batch_max = (uint32_t)atoi(env_ptr); } if (cq_poll_batch_max < MCE_MIN_CQ_POLL_BATCH || cq_poll_batch_max > MCE_MAX_CQ_POLL_BATCH) { @@ -1650,7 +1650,7 @@ void mce_sys_var::get_env_params() cq_poll_batch_max = MCE_DEFAULT_CQ_POLL_BATCH; } - if ((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL))) { progress_engine_interval_msec = (uint32_t)atoi(env_ptr); } if (enable_socketxtreme && (progress_engine_interval_msec != MCE_CQ_DRAIN_INTERVAL_DISABLED)) { @@ -1660,41 +1660,41 @@ void mce_sys_var::get_env_params() SYS_VAR_SOCKETXTREME); } - if 
((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_WCE_MAX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_WCE_MAX))) { progress_engine_wce_max = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_CQ_KEEP_QP_FULL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CQ_KEEP_QP_FULL))) { cq_keep_qp_full = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_QP_COMPENSATION_LEVEL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_QP_COMPENSATION_LEVEL))) { qp_compensation_level = (uint32_t)atoi(env_ptr); } if (qp_compensation_level < rx_num_wr_to_post_recv) { qp_compensation_level = rx_num_wr_to_post_recv; } - if ((env_ptr = getenv(SYS_VAR_USER_HUGE_PAGE_SIZE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_USER_HUGE_PAGE_SIZE))) { user_huge_page_size = option_size::from_str(env_ptr); if (user_huge_page_size == 0) { user_huge_page_size = g_hugepage_mgr.get_default_hugepage(); } } - if ((env_ptr = getenv(SYS_VAR_OFFLOADED_SOCKETS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_OFFLOADED_SOCKETS))) { offloaded_sockets = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TIMER_RESOLUTION_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TIMER_RESOLUTION_MSEC))) { timer_resolution_msec = atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TCP_TIMER_RESOLUTION_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_TIMER_RESOLUTION_MSEC))) { tcp_timer_resolution_msec = atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TCP_CTL_THREAD)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_CTL_THREAD))) { tcp_ctl_thread = option_tcp_ctl_thread::from_str(env_ptr, MCE_DEFAULT_TCP_CTL_THREAD); if (tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { if (progress_engine_interval_msec != MCE_CQ_DRAIN_INTERVAL_DISABLED) { @@ -1717,7 +1717,7 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_TCP_TIMESTAMP_OPTION)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_TIMESTAMP_OPTION))) { tcp_ts_opt = (tcp_ts_opt_t)atoi(env_ptr); if ((uint32_t)tcp_ts_opt >= TCP_TS_OPTION_LAST) { vlog_printf(VLOG_WARNING, @@ -1729,30 +1729,30 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_TCP_NODELAY)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_NODELAY))) { tcp_nodelay = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TCP_QUICKACK)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_QUICKACK))) { tcp_quickack = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TCP_PUSH_FLAG)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_PUSH_FLAG))) { tcp_push_flag = atoi(env_ptr) ? true : false; } // TODO: this should be replaced by calling "exception_handling.init()" that will be called from // init() - if ((env_ptr = getenv(xlio_exception_handling::getSysVar())) != NULL) { - exception_handling = xlio_exception_handling( - strtol(env_ptr, NULL, 10)); // xlio_exception_handling is responsible for its invariant + if ((env_ptr = getenv(xlio_exception_handling::getSysVar()))) { + exception_handling = xlio_exception_handling(strtol( + env_ptr, nullptr, 10)); // xlio_exception_handling is responsible for its invariant } - if ((env_ptr = getenv(SYS_VAR_AVOID_SYS_CALLS_ON_TCP_FD)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_AVOID_SYS_CALLS_ON_TCP_FD))) { avoid_sys_calls_on_tcp_fd = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_ALLOW_PRIVILEGED_SOCK_OPT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_ALLOW_PRIVILEGED_SOCK_OPT))) { allow_privileged_sock_opt = atoi(env_ptr) ? 
true : false; } @@ -1765,16 +1765,16 @@ void mce_sys_var::get_env_params() tcp_timer_resolution_msec = timer_resolution_msec; } - if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_ARM_CQ)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_ARM_CQ))) { internal_thread_arm_cq_enabled = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_CPUSET)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_CPUSET))) { snprintf(internal_thread_cpuset, FILENAME_MAX, "%s", env_ptr); } // handle internal thread affinity - default is CPU-0 - if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_AFFINITY)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_AFFINITY))) { int n = snprintf(internal_thread_affinity_str, sizeof(internal_thread_affinity_str), "%s", env_ptr); if (unlikely(((int)sizeof(internal_thread_affinity_str) < n) || (n < 0))) { @@ -1787,18 +1787,18 @@ void mce_sys_var::get_env_params() internal_thread_affinity_str); } - if ((env_ptr = getenv(SYS_VAR_WAIT_AFTER_JOIN_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_WAIT_AFTER_JOIN_MSEC))) { wait_after_join_msec = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_THREAD_MODE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_THREAD_MODE))) { thread_mode = (thread_mode_t)atoi(env_ptr); if (thread_mode < 0 || thread_mode >= THREAD_MODE_LAST) { thread_mode = MCE_DEFAULT_THREAD_MODE; } } - if ((env_ptr = getenv(SYS_VAR_BUFFER_BATCHING_MODE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_BUFFER_BATCHING_MODE))) { buffer_batching_mode = (buffer_batching_mode_t)atoi(env_ptr); if (buffer_batching_mode < 0 || buffer_batching_mode >= BUFFER_BATCHING_LAST) { buffer_batching_mode = MCE_DEFAULT_BUFFER_BATCHING_MODE; @@ -1811,24 +1811,24 @@ void mce_sys_var::get_env_params() rx_bufs_batch = 1; } - if ((env_ptr = getenv(SYS_VAR_NETLINK_TIMER_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NETLINK_TIMER_MSEC))) { timer_netlink_update_msec = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_NEIGH_NUM_ERR_RETRIES)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NEIGH_NUM_ERR_RETRIES))) { neigh_num_err_retries = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_DELAY_MSEC)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_DELAY_MSEC))) { neigh_wait_till_send_arp_msec = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_QUATA)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_QUATA))) { neigh_uc_arp_quata = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_MEM_ALLOC_TYPE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MEM_ALLOC_TYPE))) { mem_alloc_type = option_alloc_type::from_str(env_ptr, MCE_DEFAULT_MEM_ALLOC_TYPE); } - if ((env_ptr = getenv(SYS_VAR_MEMORY_LIMIT)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MEMORY_LIMIT))) { memory_limit = option_size::from_str(env_ptr) ?: MCE_DEFAULT_MEMORY_LIMIT; } else { /* @@ -1870,13 +1870,13 @@ void mce_sys_var::get_env_params() memory_limit = std::max(memory_limit, memory_limit_est); } } - if ((env_ptr = getenv(SYS_VAR_MEMORY_LIMIT_USER)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MEMORY_LIMIT_USER))) { memory_limit_user = option_size::from_str(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_HEAP_METADATA_BLOCK)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_HEAP_METADATA_BLOCK))) { heap_metadata_block = option_size::from_str(env_ptr) ?: MCE_DEFAULT_HEAP_METADATA_BLOCK; } - if ((env_ptr = getenv(SYS_VAR_HUGEPAGE_LOG2)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_HUGEPAGE_LOG2))) { unsigned val = (unsigned)atoi(env_ptr); 
// mmap() uses 6 bits for the hugepage size log2 @@ -1899,15 +1899,15 @@ void mce_sys_var::get_env_params() } } - if ((env_ptr = getenv(SYS_VAR_BF)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_BF))) { handle_bf = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_FORK)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_FORK))) { handle_fork = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TSO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TSO))) { enable_tso = option_3::from_str(env_ptr, MCE_DEFAULT_TSO); } @@ -1940,67 +1940,67 @@ void mce_sys_var::get_env_params() } #endif /* DEFINED_UTLS */ - if ((env_ptr = getenv(SYS_VAR_LRO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_LRO))) { enable_lro = option_3::from_str(env_ptr, MCE_DEFAULT_LRO); } - if ((env_ptr = getenv(SYS_VAR_CLOSE_ON_DUP2)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_CLOSE_ON_DUP2))) { close_on_dup2 = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_MTU)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MTU))) { mtu = (uint32_t)atoi(env_ptr); } #if defined(DEFINED_NGINX) - if ((env_ptr = getenv(SYS_VAR_NGINX_UDP_POOL_SIZE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NGINX_UDP_POOL_SIZE))) { nginx_udp_socket_pool_size = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_NGINX_UDP_POOL_RX_NUM_BUFFS_REUSE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_NGINX_UDP_POOL_RX_NUM_BUFFS_REUSE))) { nginx_udp_socket_pool_rx_num_buffs_reuse = (uint32_t)atoi(env_ptr); } #endif // DEFINED_NGINX #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - if ((env_ptr = getenv(SYS_VAR_SRC_PORT_STRIDE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SRC_PORT_STRIDE))) { app.src_port_stride = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_DISTRIBUTE_CQ)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_DISTRIBUTE_CQ))) { app.distribute_cq_interrupts = atoi(env_ptr) ? true : false; } #endif - if ((env_ptr = getenv(SYS_VAR_MSS)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MSS))) { lwip_mss = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_TCP_CC_ALGO)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_CC_ALGO))) { lwip_cc_algo_mod = (uint32_t)atoi(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_DEFERRED_CLOSE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_DEFERRED_CLOSE))) { deferred_close = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TCP_ABORT_ON_CLOSE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_ABORT_ON_CLOSE))) { tcp_abort_on_close = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_RX_POLL_ON_TX_TCP)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_POLL_ON_TX_TCP))) { rx_poll_on_tx_tcp = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_RX_CQ_WAIT_CTRL)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_RX_CQ_WAIT_CTRL))) { rx_cq_wait_ctrl = atoi(env_ptr) ? true : false; } - if ((env_ptr = getenv(SYS_VAR_TRIGGER_DUMMY_SEND_GETSOCKNAME)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TRIGGER_DUMMY_SEND_GETSOCKNAME))) { trigger_dummy_send_getsockname = atoi(env_ptr) ? 
true : false; } - if ((env_ptr = getenv(SYS_VAR_TCP_SEND_BUFFER_SIZE)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_TCP_SEND_BUFFER_SIZE))) { tcp_send_buffer_size = (uint32_t)option_size::from_str(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_SKIP_POLL_IN_RX)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_SKIP_POLL_IN_RX))) { int temp = atoi(env_ptr); if (temp < 0 || temp > SKIP_POLL_IN_RX_EPOLL_ONLY) { temp = 0; @@ -2008,7 +2008,7 @@ void mce_sys_var::get_env_params() skip_poll_in_rx = (skip_poll_in_rx_t)temp; } - if ((env_ptr = getenv(SYS_VAR_MULTILOCK)) != NULL) { + if ((env_ptr = getenv(SYS_VAR_MULTILOCK))) { int temp = atoi(env_ptr); if (temp < 0 || temp > MULTILOCK_MUTEX) { temp = 0; @@ -2059,10 +2059,10 @@ void set_env_params() } // Don't override user defined values. - if (getenv("MLX_QP_ALLOC_TYPE") == nullptr) { + if (!getenv("MLX_QP_ALLOC_TYPE")) { setenv("MLX_QP_ALLOC_TYPE", ibv_alloc_type, 0); } - if (getenv("MLX_CQ_ALLOC_TYPE") == nullptr) { + if (!getenv("MLX_CQ_ALLOC_TYPE")) { setenv("MLX_CQ_ALLOC_TYPE", ibv_alloc_type, 0); } } diff --git a/src/core/util/sysctl_reader.h b/src/core/util/sysctl_reader.h index 86c814036..475e1917b 100644 --- a/src/core/util/sysctl_reader.h +++ b/src/core/util/sysctl_reader.h @@ -57,7 +57,7 @@ class sysctl_reader_t { FILE *pfile = fopen(path, "r"); int ans; - if (pfile == NULL) { + if (!pfile) { return -1; } diff --git a/src/core/util/utils.cpp b/src/core/util/utils.cpp index 3f050ae74..601113572 100644 --- a/src/core/util/utils.cpp +++ b/src/core/util/utils.cpp @@ -133,7 +133,7 @@ int get_base_interface_name(const char *if_name, char *base_ifname, size_t sz_ba } BULLSEYE_EXCLUDE_BLOCK_END - for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { + for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { if (!strcmp(ifa->ifa_name, if_name)) { continue; } @@ -583,13 +583,13 @@ int get_port_from_ifname(const char *ifname) snprintf(dev_path, sizeof(dev_path), VERBS_DEVICE_PORT_PARAM_FILE, ifname); if (priv_safe_try_read_file(dev_path, num_buf, sizeof(num_buf)) > 0) { dev_port = - strtol(num_buf, NULL, 0); // base=0 means strtol() can parse hexadecimal and decimal + strtol(num_buf, nullptr, 0); // base=0 means strtol() can parse hexadecimal and decimal __log_dbg("dev_port file=%s dev_port str=%s dev_port val=%d", dev_path, num_buf, dev_port); } snprintf(dev_path, sizeof(dev_path), VERBS_DEVICE_ID_PARAM_FILE, ifname); if (priv_safe_try_read_file(dev_path, num_buf, sizeof(num_buf)) > 0) { dev_id = - strtol(num_buf, NULL, 0); // base=0 means strtol() can parse hexadecimal and decimal + strtol(num_buf, nullptr, 0); // base=0 means strtol() can parse hexadecimal and decimal __log_dbg("dev_id file= %s dev_id str=%s dev_id val=%d", dev_path, num_buf, dev_id); } @@ -991,9 +991,9 @@ size_t get_local_ll_addr(IN const char *ifname, OUT unsigned char *addr, IN int bool check_bond_device_exist(const char *ifname) { int ret = 0; - struct nl_cache *cache = NULL; - struct rtnl_link *link = NULL; - char *link_type = NULL; + struct nl_cache *cache = nullptr; + struct rtnl_link *link = nullptr; + char *link_type = nullptr; struct nl_sock *nl_socket = nl_socket_alloc(); if (!nl_socket) { @@ -1014,7 +1014,7 @@ bool check_bond_device_exist(const char *ifname) } link_type = rtnl_link_get_type(link); if (link_type && (strcmp(link_type, "bond") != 0)) { - link_type = NULL; + link_type = nullptr; } out: if (link) { @@ -1043,7 +1043,7 @@ bool get_bond_name(IN const char *ifname, OUT char *bond_name, IN int sz) return ret; } - for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { + for (ifa = 
ifaddr; ifa; ifa = ifa->ifa_next) { snprintf(upper_path, sizeof(upper_path), NETVSC_DEVICE_UPPER_FILE, base_ifname, ifa->ifa_name); int fd = SYSCALL(open, upper_path, O_RDONLY); @@ -1106,7 +1106,7 @@ bool get_netvsc_slave(IN const char *ifname, OUT char *slave_name, OUT unsigned return ret; } - for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { + for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { snprintf(netvsc_path, sizeof(netvsc_path), NETVSC_DEVICE_LOWER_FILE, base_ifname, ifa->ifa_name); int fd = SYSCALL(open, netvsc_path, O_RDONLY); diff --git a/src/core/util/utils.h b/src/core/util/utils.h index 0293436cb..af11bbbe0 100644 --- a/src/core/util/utils.h +++ b/src/core/util/utils.h @@ -279,8 +279,8 @@ size_t get_vlan_base_name_from_ifname(const char *ifname, char *base_ifname, siz size_t get_local_ll_addr(const char *ifname, unsigned char *addr, int addr_len, bool is_broadcast); /* Print warning while RoCE Lag is enabled */ -void print_roce_lag_warnings(const char *interface, char *disable_path = NULL, - const char *port1 = NULL, const char *port2 = NULL); +void print_roce_lag_warnings(const char *interface, char *disable_path = nullptr, + const char *port1 = nullptr, const char *port2 = nullptr); /*Print a warning to the user when there was an error registering memory*/ void print_warning_rlimit_memlock(size_t length, int error); @@ -323,10 +323,10 @@ static inline int get_procname(int pid, char *proc, size_t size) { char app_full_name[PATH_MAX] = {0}; char proccess_proc_dir[FILE_NAME_MAX_SIZE] = {0}; - char *app_base_name = NULL; + char *app_base_name = nullptr; int n = -1; - if (NULL == proc) { + if (!proc) { return -1; } @@ -351,7 +351,7 @@ static inline int get_procname(int pid, char *proc, size_t size) inline void create_multicast_mac_from_ip(unsigned char *mc_mac, const ip_address &addr, sa_family_t family) { - if (mc_mac == NULL) { + if (!mc_mac) { return; } diff --git a/src/core/util/wakeup_pipe.cpp b/src/core/util/wakeup_pipe.cpp index bea47a743..322e1f0b7 100644 --- a/src/core/util/wakeup_pipe.cpp +++ b/src/core/util/wakeup_pipe.cpp @@ -114,7 +114,7 @@ void wakeup_pipe::remove_wakeup_fd() } wkup_entry_dbg(""); int tmp_errno = errno; - if (SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, g_wakeup_pipes[0], NULL)) { + if (SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, g_wakeup_pipes[0], nullptr)) { BULLSEYE_EXCLUDE_BLOCK_START if (errno == ENOENT) { wkup_logdbg("Failed to delete global pipe from internal epfd it was already deleted"); diff --git a/src/core/xlio.h b/src/core/xlio.h index 9a50f0d02..98ff0a56e 100644 --- a/src/core/xlio.h +++ b/src/core/xlio.h @@ -35,6 +35,7 @@ #include #include +#include #include #include diff --git a/src/stats/stats_data_reader.h b/src/stats/stats_data_reader.h index 08a7cd17c..46407070c 100644 --- a/src/stats/stats_data_reader.h +++ b/src/stats/stats_data_reader.h @@ -79,7 +79,7 @@ struct tls_context_counters_show { tls_context_counters_show &update(const sh_mem_t *mem) { - return (mem != nullptr) ? update(mem->ring_inst_arr) : *this; + return (mem) ? update(mem->ring_inst_arr) : *this; } #ifdef DEFINED_UTLS @@ -120,7 +120,7 @@ struct global_counters_show { global_counters_show &update(const sh_mem_t *mem) { - return (mem != nullptr) ? update(mem->global_inst_arr) : *this; + return (mem) ? update(mem->global_inst_arr) : *this; } global_counters_show &update(const global_instance_block_t (&globals)[NUM_OF_SUPPORTED_GLOBALS]) @@ -234,7 +234,7 @@ struct ring_packet_aggregate { ring_packet_aggregate &update(const sh_mem_t *mem) { - return (mem != nullptr) ? 
update(mem->ring_inst_arr) : *this; + return (mem) ? update(mem->ring_inst_arr) : *this; } ring_packet_aggregate &update(const ring_instance_block_t (&rings)[NUM_OF_SUPPORTED_RINGS]) @@ -294,7 +294,7 @@ struct socket_listen_counter_aggregate { socket_listen_counter_aggregate &update(const sh_mem_t *mem) { - if (mem != nullptr) { + if (mem) { std::swap(curr, prev); curr = summarize_listen_counters(*mem); } From b66af636472ba80e400b5e3356eeb60a521fae0a Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Tue, 6 Feb 2024 09:42:03 +0200 Subject: [PATCH 083/169] issue: 3770816 Remove redundant void argument lists Signed-off-by: Alex Briskin --- src/core/sock/sock-app.cpp | 2 +- src/core/sock/sock-app.h | 2 +- src/core/sock/sock-extra.cpp | 2 +- src/core/sock/sock-extra.h | 2 +- src/core/sock/sock-redirect.h | 4 ++-- src/core/sock/socket_fd_api.h | 10 +++++----- src/core/sock/sockinfo.cpp | 2 +- src/core/sock/sockinfo.h | 2 +- src/core/sock/sockinfo_tcp.cpp | 4 ++-- src/core/sock/sockinfo_tcp.h | 36 +++++++++++++++++----------------- src/core/sock/sockinfo_ulp.cpp | 20 +++++++++---------- src/core/sock/sockinfo_ulp.h | 10 +++++----- 12 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/core/sock/sock-app.cpp b/src/core/sock/sock-app.cpp index 52e10b708..d2141c26f 100644 --- a/src/core/sock/sock-app.cpp +++ b/src/core/sock/sock-app.cpp @@ -65,7 +65,7 @@ static int init_worker(int worker_id, int listen_fd); struct app_conf *g_p_app = nullptr; #if defined(DEFINED_NGINX) -int app_conf::proc_nginx(void) +int app_conf::proc_nginx() { int rc = 0; diff --git a/src/core/sock/sock-app.h b/src/core/sock/sock-app.h index 7a1879880..0f2e8be9a 100644 --- a/src/core/sock/sock-app.h +++ b/src/core/sock/sock-app.h @@ -112,7 +112,7 @@ struct app_conf { } #if defined(DEFINED_NGINX) - int proc_nginx(void); + int proc_nginx(); #endif /* DEFINED_NGINX */ #if defined(DEFINED_ENVOY) diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index 954d9033c..fcb438560 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -342,7 +342,7 @@ extern "C" int xlio_extra_ioctl(void *cmsg_hdr, size_t cmsg_len) return 0; } -struct xlio_api_t *extra_api(void) +struct xlio_api_t *extra_api() { static struct xlio_api_t *xlio_api = nullptr; diff --git a/src/core/sock/sock-extra.h b/src/core/sock/sock-extra.h index 71c03bd90..9a5fa49d4 100644 --- a/src/core/sock/sock-extra.h +++ b/src/core/sock/sock-extra.h @@ -35,6 +35,6 @@ #include "xlio_extra.h" -struct xlio_api_t *extra_api(void); +struct xlio_api_t *extra_api(); #endif /* _SOCK_EXTRA_H_ */ diff --git a/src/core/sock/sock-redirect.h b/src/core/sock/sock-redirect.h index 6da6527c5..5487f1e5f 100644 --- a/src/core/sock/sock-redirect.h +++ b/src/core/sock/sock-redirect.h @@ -194,8 +194,8 @@ struct os_api { const sigset_t *sigmask); int (*clone)(int (*__fn)(void *), void *__child_stack, int __flags, void *__arg); - pid_t (*fork)(void); - pid_t (*vfork)(void); + pid_t (*fork)(); + pid_t (*vfork)(); int (*daemon)(int __nochdir, int __noclose); int (*sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact); diff --git a/src/core/sock/socket_fd_api.h b/src/core/sock/socket_fd_api.h index 0c51629f4..a474dff0d 100644 --- a/src/core/sock/socket_fd_api.h +++ b/src/core/sock/socket_fd_api.h @@ -105,7 +105,7 @@ typedef struct xlio_tx_call_attr { pbuf_desc priv; ~xlio_tx_call_attr() {}; - void clear(void) + void clear() { opcode = TX_UNDEF; memset(&attr, 0, sizeof(attr)); @@ -245,22 +245,22 @@ class socket_fd_api : public 
cleanable_obj { ssize_t tx_os(const tx_call_t call_type, const iovec *p_iov, const ssize_t sz_iov, const int __flags, const sockaddr *__to, const socklen_t __tolen); - static inline size_t pendig_to_remove_node_offset(void) + static inline size_t pendig_to_remove_node_offset() { return NODE_OFFSET(socket_fd_api, pendig_to_remove_node); } - static inline size_t socket_fd_list_node_offset(void) + static inline size_t socket_fd_list_node_offset() { return NODE_OFFSET(socket_fd_api, socket_fd_list_node); } - static inline size_t ep_ready_fd_node_offset(void) + static inline size_t ep_ready_fd_node_offset() { return NODE_OFFSET(socket_fd_api, ep_ready_fd_node); } - static inline size_t ep_info_fd_node_offset(void) + static inline size_t ep_info_fd_node_offset() { return NODE_OFFSET(socket_fd_api, ep_info_fd_node); } diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 8b6aa5d8e..7b73d987b 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -170,7 +170,7 @@ sockinfo::~sockinfo() m_socketxtreme.ec_cache.clear(); } -void sockinfo::socket_stats_init(void) +void sockinfo::socket_stats_init() { m_p_socket_stats->reset(); m_p_socket_stats->fd = m_fd; diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index e826eb0e5..fc4ef8de9 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -207,7 +207,7 @@ class sockinfo : public socket_fd_api, inline in_protocol_t get_protocol(void) { return m_protocol; } bool validate_and_convert_mapped_ipv4(sock_addr &sock) const; - void socket_stats_init(void); + void socket_stats_init(); void sock_pop_descs_rx_ready(descq_t *cache) { diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 07a109394..1aeb770bc 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -1785,7 +1785,7 @@ err_t sockinfo_tcp::ack_recvd_lwip_cb(void *arg, struct tcp_pcb *tpcb, u16_t ack return ERR_OK; } -void sockinfo_tcp::tcp_shutdown_rx(void) +void sockinfo_tcp::tcp_shutdown_rx() { /* Call this method under connection lock */ @@ -5837,7 +5837,7 @@ tcp_timers_collection::~tcp_timers_collection() free_tta_resources(); } -void tcp_timers_collection::free_tta_resources(void) +void tcp_timers_collection::free_tta_resources() { if (m_n_count) { for (int i = 0; i < m_n_intervals_size; i++) { diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 86093c9a7..ea43b9753 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -137,7 +137,7 @@ enum inet_ecns { class sockinfo_tcp : public sockinfo, public timer_handler { public: - static inline size_t accepted_conns_node_offset(void) + static inline size_t accepted_conns_node_offset() { return NODE_OFFSET(sockinfo_tcp, accepted_conns_node); } @@ -192,14 +192,14 @@ class sockinfo_tcp : public sockinfo, public timer_handler { void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) override; - inline struct tcp_pcb *get_pcb(void) { return &m_pcb; } + inline struct tcp_pcb *get_pcb() { return &m_pcb; } - inline unsigned sndbuf_available(void) + inline unsigned sndbuf_available() { return static_cast(std::max(tcp_sndbuf(&m_pcb), 0)); } - inline unsigned get_mss(void) { return m_pcb.mss; } + inline unsigned get_mss() { return m_pcb.mss; } ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; ssize_t tcp_tx(xlio_tx_call_attr_t &tx_arg); @@ -215,7 +215,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { void update_header_field(data_updater *updater) override; bool 
rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void *pv_fd_ready_array) override; void abort_connection(); - void tcp_shutdown_rx(void); + void tcp_shutdown_rx(); mem_buf_desc_t *tcp_tx_mem_buf_alloc(pbuf_type type); void tcp_rx_mem_buf_free(mem_buf_desc_t *p_desc); @@ -227,8 +227,8 @@ class sockinfo_tcp : public sockinfo, public timer_handler { static struct tcp_seg *tcp_seg_alloc_cached(void *p_conn); static void tcp_seg_free_direct(void *p_conn, struct tcp_seg *seg); static void tcp_seg_free_cached(void *p_conn, struct tcp_seg *seg); - uint32_t get_next_tcp_seqno(void) { return m_pcb.snd_lbb; } - uint32_t get_next_tcp_seqno_rx(void) { return m_pcb.rcv_nxt; } + uint32_t get_next_tcp_seqno() { return m_pcb.snd_lbb; } + uint32_t get_next_tcp_seqno_rx() { return m_pcb.rcv_nxt; } mem_buf_desc_t *tcp_tx_zc_alloc(mem_buf_desc_t *p_desc); static void tcp_tx_zc_callback(mem_buf_desc_t *p_desc); @@ -242,7 +242,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { return get_tcp_state(&m_pcb) == CLOSED && m_syn_received.empty() && m_accepted_conns.empty(); } - bool inline is_destroyable_lock(void) + bool inline is_destroyable_lock() { bool state; m_tcp_con_lock.lock(); @@ -250,7 +250,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { m_tcp_con_lock.unlock(); return state; } - bool inline is_destroyable_no_lock(void) + bool inline is_destroyable_no_lock() { return get_tcp_state(&m_pcb) == CLOSED && m_state == SOCKINFO_CLOSING; } @@ -295,18 +295,18 @@ class sockinfo_tcp : public sockinfo, public timer_handler { void handle_timer_expired(void *user_data) override; - inline ib_ctx_handler *get_ctx(void) + inline ib_ctx_handler *get_ctx() { return m_p_connected_dst_entry ? m_p_connected_dst_entry->get_ctx() : nullptr; } - inline ring *get_tx_ring(void) const noexcept + inline ring *get_tx_ring() const noexcept { return m_p_connected_dst_entry ? m_p_connected_dst_entry->get_ring() : nullptr; } - inline ring *get_rx_ring(void) { return m_p_rx_ring; } - const flow_tuple_with_local_if &get_flow_tuple(void) + inline ring *get_rx_ring() { return m_p_rx_ring; } + const flow_tuple_with_local_if &get_flow_tuple() { /* XXX Dosn't handle empty map and a map with multiple elements. */ auto rx_flow_iter = m_rx_flow_map.begin(); @@ -314,7 +314,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { } /* Proxy to support ULP. TODO Refactor. 
*/ - inline sockinfo_tcp_ops *get_ops(void) { return m_ops; } + inline sockinfo_tcp_ops *get_ops() { return m_ops; } inline void set_ops(sockinfo_tcp_ops *ops) noexcept { std::swap(ops, m_ops); @@ -322,15 +322,15 @@ class sockinfo_tcp : public sockinfo, public timer_handler { delete ops; } } - inline void reset_ops(void) noexcept { set_ops(m_ops_tcp); } + inline void reset_ops() noexcept { set_ops(m_ops_tcp); } bool is_utls_supported(int direction) const; int get_supported_nvme_feature_mask() const; - inline int trylock_tcp_con(void) { return m_tcp_con_lock.trylock(); } - inline void lock_tcp_con(void) { m_tcp_con_lock.lock(); } - inline void unlock_tcp_con(void) { m_tcp_con_lock.unlock(); } + inline int trylock_tcp_con() { return m_tcp_con_lock.trylock(); } + inline void lock_tcp_con() { m_tcp_con_lock.lock(); } + inline void unlock_tcp_con() { m_tcp_con_lock.unlock(); } inline void set_reguired_send_block(unsigned sz) { m_required_send_block = sz; } static err_t rx_lwip_cb(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp index ff8384bc9..90ae74286 100644 --- a/src/core/sock/sockinfo_ulp.cpp +++ b/src/core/sock/sockinfo_ulp.cpp @@ -46,7 +46,7 @@ #define si_ulp_logerr __log_info_err /*inline*/ -ring *sockinfo_tcp_ops::get_tx_ring(void) +ring *sockinfo_tcp_ops::get_tx_ring() { return m_p_sock->get_tx_ring(); } @@ -86,11 +86,11 @@ bool sockinfo_tcp_ops::handle_send_ret(ssize_t ret, struct tcp_seg *seg) #include struct xlio_tls_api { - EVP_CIPHER_CTX *(*EVP_CIPHER_CTX_new)(void); + EVP_CIPHER_CTX *(*EVP_CIPHER_CTX_new)(); void (*EVP_CIPHER_CTX_free)(EVP_CIPHER_CTX *); int (*EVP_CIPHER_CTX_reset)(EVP_CIPHER_CTX *); - const EVP_CIPHER *(*EVP_aes_128_gcm)(void); - const EVP_CIPHER *(*EVP_aes_256_gcm)(void); + const EVP_CIPHER *(*EVP_aes_128_gcm)(); + const EVP_CIPHER *(*EVP_aes_256_gcm)(); int (*EVP_DecryptInit_ex)(EVP_CIPHER_CTX *, const EVP_CIPHER *, ENGINE *, const unsigned char *, const unsigned char *); int (*EVP_DecryptUpdate)(EVP_CIPHER_CTX *, unsigned char *, int *, const unsigned char *, int); @@ -116,7 +116,7 @@ template static void dlsym_default(T &ptr, const char *name) #define XLIO_TLS_API_FIND(__name) dlsym_default(s_tls_api.__name, #__name); -void xlio_tls_api_setup(void) +void xlio_tls_api_setup() { XLIO_TLS_API_FIND(EVP_CIPHER_CTX_new); XLIO_TLS_API_FIND(EVP_CIPHER_CTX_free); @@ -236,9 +236,9 @@ class tls_record : public mem_desc { } } - void get(void) override { (void)atomic_fetch_and_inc(&m_ref); } + void get() override { (void)atomic_fetch_and_inc(&m_ref); } - void put(void) override + void put() override { int ref = atomic_fetch_and_dec(&m_ref); @@ -275,7 +275,7 @@ class tls_record : public mem_desc { return len; } - inline size_t avail_space(void) + inline size_t avail_space() { /* Don't produce records larger than 16KB according to the protocol. */ size_t max_len = m_p_zc_owner ? 
(size_t)TLS_RECORD_MAX @@ -320,7 +320,7 @@ class tls_record : public mem_desc { } private: - inline void set_length(void) + inline void set_length() { uint16_t len = m_size - TLS_RECORD_HDR_LEN; @@ -669,7 +669,7 @@ int sockinfo_tcp_ops_tls::setsockopt(int __level, int __optname, const void *__o return 0; } -err_t sockinfo_tcp_ops_tls::tls_rx_consume_ready_packets(void) +err_t sockinfo_tcp_ops_tls::tls_rx_consume_ready_packets() { err_t ret = ERR_OK; diff --git a/src/core/sock/sockinfo_ulp.h b/src/core/sock/sockinfo_ulp.h index acca65a0e..d5af9722c 100644 --- a/src/core/sock/sockinfo_ulp.h +++ b/src/core/sock/sockinfo_ulp.h @@ -55,7 +55,7 @@ class sockinfo_tcp_ops { : m_p_sock(sock) {}; virtual ~sockinfo_tcp_ops() {} - inline ring *get_tx_ring(void); + inline ring *get_tx_ring(); virtual int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); virtual ssize_t tx(xlio_tx_call_attr_t &tx_arg); @@ -79,7 +79,7 @@ enum xlio_utls_mode { UTLS_MODE_RX = 1 << 1, }; -void xlio_tls_api_setup(void); +void xlio_tls_api_setup(); class sockinfo_tcp_ops_tls : public sockinfo_tcp_ops { public: @@ -94,13 +94,13 @@ class sockinfo_tcp_ops_tls : public sockinfo_tcp_ops { void get_record_buf(mem_buf_desc_t *&buf, uint8_t *&data, bool is_zerocopy); private: - inline bool is_tx_tls13(void) { return m_tls_info_tx.tls_version == TLS_1_3_VERSION; } - inline bool is_rx_tls13(void) { return m_tls_info_rx.tls_version == TLS_1_3_VERSION; } + inline bool is_tx_tls13() { return m_tls_info_tx.tls_version == TLS_1_3_VERSION; } + inline bool is_rx_tls13() { return m_tls_info_rx.tls_version == TLS_1_3_VERSION; } int send_alert(uint8_t alert_type); void terminate_session_fatal(uint8_t alert_type); - err_t tls_rx_consume_ready_packets(void); + err_t tls_rx_consume_ready_packets(); err_t recv(struct pbuf *p) override; void copy_by_offset(uint8_t *dst, uint32_t offset, uint32_t len); uint16_t offset_to_host16(uint32_t offset); From be5a30760c7910dda0dbfad8773a99eb52154288 Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Tue, 6 Feb 2024 09:32:50 +0200 Subject: [PATCH 084/169] issue: 3770816 Replace empty destructor with default Signed-off-by: Alex Briskin --- src/core/sock/cleanable_obj.h | 2 +- src/core/sock/fd_collection.h | 2 +- src/core/sock/pkt_rcvr_sink.h | 2 +- src/core/sock/pkt_sndr_source.h | 2 +- src/core/sock/sock-app.h | 2 +- src/core/sock/socket_fd_api.h | 3 ++- src/core/sock/sockinfo_ulp.h | 2 +- 7 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/core/sock/cleanable_obj.h b/src/core/sock/cleanable_obj.h index d8fe73649..415a00f02 100644 --- a/src/core/sock/cleanable_obj.h +++ b/src/core/sock/cleanable_obj.h @@ -41,7 +41,7 @@ class cleanable_obj { public: cleanable_obj() { m_b_cleaned = false; }; - virtual ~cleanable_obj() {}; + virtual ~cleanable_obj() = default; /* This function should be used just for objects that * was allocated via new() (not by new[], nor by placement new, nor a local object on the stack, diff --git a/src/core/sock/fd_collection.h b/src/core/sock/fd_collection.h index c72a1730a..9e4f34215 100644 --- a/src/core/sock/fd_collection.h +++ b/src/core/sock/fd_collection.h @@ -67,7 +67,7 @@ class cq_channel_info : public cleanable_obj { public: cq_channel_info(ring *p_ring) : m_p_ring(p_ring) {}; - ~cq_channel_info() override {}; + ~cq_channel_info() override = default; ring *get_ring() const noexcept { return m_p_ring; }; protected: diff --git a/src/core/sock/pkt_rcvr_sink.h b/src/core/sock/pkt_rcvr_sink.h index af59859cf..bfc71630d 100644 --- 
a/src/core/sock/pkt_rcvr_sink.h +++ b/src/core/sock/pkt_rcvr_sink.h @@ -49,7 +49,7 @@ class ring; */ class pkt_rcvr_sink { public: - virtual ~pkt_rcvr_sink() {}; + virtual ~pkt_rcvr_sink() = default; // Callback from lower layer notifying new receive packets // Return: 'true' if object queuing this receive packet diff --git a/src/core/sock/pkt_sndr_source.h b/src/core/sock/pkt_sndr_source.h index 8198afcfb..96f4b795c 100644 --- a/src/core/sock/pkt_sndr_source.h +++ b/src/core/sock/pkt_sndr_source.h @@ -41,7 +41,7 @@ */ class pkt_sndr_source { public: - virtual ~pkt_sndr_source() {}; + virtual ~pkt_sndr_source() = default; }; #endif diff --git a/src/core/sock/sock-app.h b/src/core/sock/sock-app.h index 0f2e8be9a..9f7e66397 100644 --- a/src/core/sock/sock-app.h +++ b/src/core/sock/sock-app.h @@ -83,7 +83,7 @@ struct app_conf { setup(); } - ~app_conf() {} + ~app_conf() = default; void setup() { diff --git a/src/core/sock/socket_fd_api.h b/src/core/sock/socket_fd_api.h index a474dff0d..69793f29a 100644 --- a/src/core/sock/socket_fd_api.h +++ b/src/core/sock/socket_fd_api.h @@ -104,7 +104,8 @@ typedef struct xlio_tx_call_attr { unsigned xlio_flags; pbuf_desc priv; - ~xlio_tx_call_attr() {}; + ~xlio_tx_call_attr() = default; + void clear() { opcode = TX_UNDEF; diff --git a/src/core/sock/sockinfo_ulp.h b/src/core/sock/sockinfo_ulp.h index d5af9722c..74941ebc2 100644 --- a/src/core/sock/sockinfo_ulp.h +++ b/src/core/sock/sockinfo_ulp.h @@ -53,7 +53,7 @@ class sockinfo_tcp_ops { public: sockinfo_tcp_ops(sockinfo_tcp *sock) : m_p_sock(sock) {}; - virtual ~sockinfo_tcp_ops() {} + virtual ~sockinfo_tcp_ops() = default; inline ring *get_tx_ring(); From e5f3ead3d0da5aab003e0014e328ac2a52cefb13 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Wed, 21 Feb 2024 12:25:28 +0200 Subject: [PATCH 085/169] issue: 3788369 Rename thread_local_event_handler It's not thread_local specific anymore. This event handler will also be used within poll_group to provide TCP timers for the respective sockets. 
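For context, the renamed class keeps the thread-local usage model: every data-path
thread owns its own instance (the object is still declared thread_local), and the
polling entry points updated below drive it directly. The call pattern they share
looks roughly like this (the wrapper name is illustrative only, not part of the
patch):

    #include "event/event_handler_manager_local.h"

    // Run delegated TCP timers in the calling thread's own context.
    static inline void run_delegated_tcp_timers()
    {
        if (safe_mce_sys().tcp_ctl_thread ==
            option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) {
            g_event_handler_manager_local.do_tasks();
        }
    }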
Signed-off-by: Dmytro Podgornyi --- src/core/Makefile.am | 4 ++-- ...er.cpp => event_handler_manager_local.cpp} | 21 ++++++++++--------- ...andler.h => event_handler_manager_local.h} | 10 ++++----- src/core/sock/sock-extra.cpp | 6 +++--- src/core/sock/sock-redirect.cpp | 4 ++-- src/core/sock/sockinfo_tcp.cpp | 10 ++++----- 6 files changed, 27 insertions(+), 28 deletions(-) rename src/core/event/{thread_local_event_handler.cpp => event_handler_manager_local.cpp} (75%) rename src/core/event/{thread_local_event_handler.h => event_handler_manager_local.h} (87%) diff --git a/src/core/Makefile.am b/src/core/Makefile.am index d0bb5f7df..a15bc4d6a 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -99,7 +99,7 @@ libxlio_la_SOURCES := \ \ event/delta_timer.cpp \ event/event_handler_manager.cpp \ - event/thread_local_event_handler.cpp \ + event/event_handler_manager_local.cpp \ event/vlogger_timer_handler.cpp \ event/netlink_event.cpp \ \ @@ -212,7 +212,7 @@ libxlio_la_SOURCES := \ event/event.h \ event/event_handler_ibverbs.h \ event/event_handler_manager.h \ - event/thread_local_event_handler.h \ + event/event_handler_manager_local.h \ event/event_handler_rdma_cm.h \ event/netlink_event.h \ event/timer_handler.h \ diff --git a/src/core/event/thread_local_event_handler.cpp b/src/core/event/event_handler_manager_local.cpp similarity index 75% rename from src/core/event/thread_local_event_handler.cpp rename to src/core/event/event_handler_manager_local.cpp index 5100f4bb9..1595d5b54 100644 --- a/src/core/event/thread_local_event_handler.cpp +++ b/src/core/event/event_handler_manager_local.cpp @@ -30,28 +30,29 @@ * SOFTWARE. */ -#include "thread_local_event_handler.h" +#include "event_handler_manager_local.h" #include "util/sys_vars.h" -thread_local thread_local_event_handler g_thread_local_event_handler; +using namespace std::chrono; -thread_local_event_handler::thread_local_event_handler() +thread_local event_handler_manager_local g_event_handler_manager_local; + +event_handler_manager_local::event_handler_manager_local() : event_handler_manager(false) { } -void thread_local_event_handler::post_new_reg_action(reg_action_t ®_action) +void event_handler_manager_local::post_new_reg_action(reg_action_t ®_action) { // For thread local event handler registration can be immediate. 
handle_registration_action(reg_action); } -void thread_local_event_handler::do_tasks() +void event_handler_manager_local::do_tasks() { - auto curr_time = chrono::steady_clock::now(); - if (likely( - safe_mce_sys().tcp_timer_resolution_msec > - chrono::duration_cast(curr_time - _last_run_time).count())) { + auto curr_time = steady_clock::now(); + if (likely(safe_mce_sys().tcp_timer_resolution_msec > + duration_cast(curr_time - _last_run_time).count())) { return; } @@ -60,7 +61,7 @@ void thread_local_event_handler::do_tasks() do_tasks_for_thread_local(); } -void thread_local_event_handler::do_tasks_for_thread_local() +void event_handler_manager_local::do_tasks_for_thread_local() { m_timer.process_registered_timers_uncond(); } diff --git a/src/core/event/thread_local_event_handler.h b/src/core/event/event_handler_manager_local.h similarity index 87% rename from src/core/event/thread_local_event_handler.h rename to src/core/event/event_handler_manager_local.h index 1db1b7bd9..40199caef 100644 --- a/src/core/event/thread_local_event_handler.h +++ b/src/core/event/event_handler_manager_local.h @@ -37,11 +37,9 @@ #include "event_handler_manager.h" -using namespace std; - -class thread_local_event_handler : public event_handler_manager { +class event_handler_manager_local : public event_handler_manager { public: - thread_local_event_handler(); + event_handler_manager_local(); void do_tasks(); @@ -51,9 +49,9 @@ class thread_local_event_handler : public event_handler_manager { private: void do_tasks_for_thread_local(); - chrono::steady_clock::time_point _last_run_time; + std::chrono::steady_clock::time_point _last_run_time; }; -extern thread_local thread_local_event_handler g_thread_local_event_handler; +extern thread_local event_handler_manager_local g_event_handler_manager_local; #endif diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index fcb438560..b7a9959bf 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include @@ -123,7 +123,7 @@ extern "C" int xlio_socketxtreme_poll(int fd, struct xlio_socketxtreme_completio cq_ch_info = g_p_fd_collection->get_cq_channel_fd(fd); if (safe_mce_sys().tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { - g_thread_local_event_handler.do_tasks(); + g_event_handler_manager_local.do_tasks(); } if (likely(cq_ch_info)) { @@ -259,7 +259,7 @@ extern "C" int xlio_get_socket_rings_fds(int fd, int *ring_fds, int ring_fds_sz) if (p_socket_object && p_socket_object->check_rings()) { int rings_num = 0; int *p_rings_fds = p_socket_object->get_rings_fds(rings_num); - int num_rings_to_copy = min(ring_fds_sz, rings_num); + int num_rings_to_copy = std::min(ring_fds_sz, rings_num); std::copy(&p_rings_fds[0], &p_rings_fds[num_rings_to_copy], ring_fds); return num_rings_to_copy; } diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index 97b247a6a..b550fdbc6 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -49,7 +49,7 @@ #include "utils/lock_wrapper.h" #include #include -#include +#include #include #include #include @@ -710,7 +710,7 @@ inline int epoll_wait_helper(int __epfd, struct epoll_event *__events, int __max } if (safe_mce_sys().tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { - g_thread_local_event_handler.do_tasks(); + g_event_handler_manager_local.do_tasks(); } epoll_event extra_events_buffer[__maxevents]; diff --git 
a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 1aeb770bc..5a0d07d34 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -46,11 +46,11 @@ #include "util/list.h" #include "util/agent.h" #include "event/event_handler_manager.h" +#include "event/event_handler_manager_local.h" #include "proto/route_table_mgr.h" #include "proto/xlio_lwip.h" #include "proto/dst_entry_tcp.h" #include "iomux/io_mux_call.h" -#include "event/thread_local_event_handler.h" #include "sock-redirect.h" #include "fd_collection.h" #include "sockinfo_tcp.h" @@ -137,7 +137,7 @@ static event_handler_manager *get_event_mgr() { return (safe_mce_sys().tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS ? g_p_event_handler_manager - : &g_thread_local_event_handler); + : &g_event_handler_manager_local); } static tcp_timers_collection *get_tcp_timer_collection() @@ -3819,7 +3819,7 @@ int sockinfo_tcp::os_epoll_wait_with_tcp_timers(epoll_event *ep_events, int maxe // epol_wait timeout // We must run here TCP timers because we are in a mode when TCP timers are // handled by the context threads instead of the internal thread. - g_thread_local_event_handler.do_tasks(); + g_event_handler_manager_local.do_tasks(); } while (1); return rc; @@ -5060,7 +5060,7 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) // There are scenarios when rx_wait_helper is called in an infinite loop but exits before // OS epoll_wait. Delegated TCP timers must be attempted in such case. // This is a slow path. So calling chrono::now(), even with every iteration, is OK here. - g_thread_local_event_handler.do_tasks(); + g_event_handler_manager_local.do_tasks(); } // if in blocking accept state skip poll phase and go to sleep directly @@ -6216,7 +6216,7 @@ ssize_t sockinfo_tcp::tcp_tx_handle_sndbuf_unavailable(ssize_t total_tx, bool is option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { // Slow path. We must attempt TCP timers here for applications that // do not check for EV_OUT. - g_thread_local_event_handler.do_tasks(); + g_event_handler_manager_local.do_tasks(); } // in case of zero sndbuf and non-blocking just try once polling CQ for // ACK From 8561e115b757b1ca17e76b20583d81cbb8d63f46 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Thu, 22 Feb 2024 20:28:17 +0200 Subject: [PATCH 086/169] issue: 3788369 Fix subsequent xlio_get_socket_rings_fds() calls After the 1st call to xlio_get_socket_rings_fds(), XLIO remembers the resulting array within the socket object. However, subsequent calls return the array immediately without setting number of elements in the array. As result, the function returns 0 rings by mistake. Always set the number of elements to resolve the issue. 
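From the extra-API caller's point of view, the broken behaviour looked roughly like
this (illustrative snippet; fd is any offloaded socket):

    int fds[4];
    int n1 = xlio_get_socket_rings_fds(fd, fds, 4); /* 1st call: returns the ring fds */
    int n2 = xlio_get_socket_rings_fds(fd, fds, 4); /* 2nd call: wrongly returned 0 */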
Signed-off-by: Dmytro Podgornyi --- src/core/sock/sockinfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 7b73d987b..690185a23 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -1996,10 +1996,10 @@ int *sockinfo::get_rings_fds(int &res_length) return m_p_rx_ring->get_rx_channel_fds(num_rx_channel_fds); } + res_length = get_rings_num(); if (m_p_rings_fds) { return m_p_rings_fds; } - res_length = get_rings_num(); m_p_rings_fds = new int[res_length]; rx_ring_map_t::iterator it = m_rx_ring_map.begin(); From e583d424bf0ed030d9b1567403b842cd5ed50f87 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Fri, 23 Feb 2024 21:25:40 +0200 Subject: [PATCH 087/169] issue: 3788369 Return TX ring by xlio_get_socket_rings_fds() xlio_get_socket_rings_fds() and xlio_get_socket_rings_num() return the TX ring now. This can be useful in case of TX ring != RX ring when extra API is used. Also don't remember resulting array and construct it on the fly. Usually, this API is called only once for a socket, so remembering the array is a waste of memory. Remove the check_rings() and hardcoded number of rings for SockeXtreme flow. Return actual rings to make more precise result. Signed-off-by: Dmytro Podgornyi --- src/core/sock/sock-extra.cpp | 19 +---- src/core/sock/socket_fd_api.h | 8 +-- src/core/sock/sockinfo.cpp | 74 +++++++++---------- src/core/sock/sockinfo.h | 4 +- tests/gtest/extra_api/extra_poll.cc | 107 +++++++++++++++------------- tests/gtest/tcp/tcp_sockopt.cc | 86 +++++++++++++++------- 6 files changed, 162 insertions(+), 136 deletions(-) diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index b7a9959bf..8c5fe21d6 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -239,13 +239,8 @@ extern "C" int xlio_socketxtreme_free_buff(xlio_buff_t *buff) extern "C" int xlio_get_socket_rings_num(int fd) { - socket_fd_api *p_socket_object = nullptr; - p_socket_object = fd_collection_get_sockfd(fd); - if (p_socket_object && p_socket_object->check_rings()) { - return p_socket_object->get_rings_num(); - } - - return 0; + socket_fd_api *p_socket_object = fd_collection_get_sockfd(fd); + return p_socket_object ? p_socket_object->get_rings_num() : 0; } extern "C" int xlio_get_socket_rings_fds(int fd, int *ring_fds, int ring_fds_sz) @@ -256,15 +251,7 @@ extern "C" int xlio_get_socket_rings_fds(int fd, int *ring_fds, int ring_fds_sz) } socket_fd_api *p_socket_object = fd_collection_get_sockfd(fd); - if (p_socket_object && p_socket_object->check_rings()) { - int rings_num = 0; - int *p_rings_fds = p_socket_object->get_rings_fds(rings_num); - int num_rings_to_copy = std::min(ring_fds_sz, rings_num); - std::copy(&p_rings_fds[0], &p_rings_fds[num_rings_to_copy], ring_fds); - return num_rings_to_copy; - } - - return 0; + return p_socket_object ? 
p_socket_object->get_rings_fds(ring_fds, ring_fds_sz) : 0; } extern "C" int xlio_add_conf_rule(const char *config_line) diff --git a/src/core/sock/socket_fd_api.h b/src/core/sock/socket_fd_api.h index 69793f29a..7328b63d5 100644 --- a/src/core/sock/socket_fd_api.h +++ b/src/core/sock/socket_fd_api.h @@ -267,11 +267,11 @@ class socket_fd_api : public cleanable_obj { } virtual int get_rings_num() { return 0; } - virtual bool check_rings() { return false; } - virtual int *get_rings_fds(int &res_length) + virtual int get_rings_fds(int *ring_fds, int ring_fds_sz) { - res_length = 0; - return nullptr; + NOT_IN_USE(ring_fds); + NOT_IN_USE(ring_fds_sz); + return 0; } protected: diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 690185a23..196f89ff8 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -101,7 +101,6 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) : safe_mce_sys().sysctl_reader.get_net_ipv6_hop_limit()) , m_bind_no_port(false) , m_is_ipv6only(safe_mce_sys().sysctl_reader.get_ipv6_bindv6only()) - , m_p_rings_fds(nullptr) { m_rx_epfd = SYSCALL(epoll_create, 128); if (unlikely(m_rx_epfd == -1)) { @@ -150,11 +149,6 @@ sockinfo::~sockinfo() // This will wake up any blocked thread in rx() call to SYSCALL(epoll_wait, ) SYSCALL(close, m_rx_epfd); - if (m_p_rings_fds) { - delete[] m_p_rings_fds; - m_p_rings_fds = nullptr; - } - while (!m_error_queue.empty()) { mem_buf_desc_t *buff = m_error_queue.get_and_pop_front(); if (buff->m_flags & mem_buf_desc_t::CLONED) { @@ -1969,53 +1963,53 @@ int sockinfo::modify_ratelimit(dst_entry *p_dst_entry, struct xlio_rate_limit_t int sockinfo::get_rings_num() { - int count = 0; + size_t count = 0; size_t num_rx_channel_fds; - if (is_socketxtreme()) { - /* socketXtreme mode support just single ring */ - return 1; + + ring *tx_ring = m_p_connected_dst_entry ? m_p_connected_dst_entry->get_ring() : nullptr; + if (tx_ring) { + (void)tx_ring->get_rx_channel_fds(count); } - rx_ring_map_t::iterator it = m_rx_ring_map.begin(); - for (; it != m_rx_ring_map.end(); ++it) { - (void)it->first->get_rx_channel_fds(num_rx_channel_fds); - count += (int)num_rx_channel_fds; + for (auto pair : m_rx_ring_map) { + if (tx_ring == pair.first) { + continue; + } + (void)pair.first->get_rx_channel_fds(num_rx_channel_fds); + count += num_rx_channel_fds; } - return count; + return static_cast(count); } -int *sockinfo::get_rings_fds(int &res_length) +int sockinfo::get_rings_fds(int *ring_fds, int ring_fds_sz) { - res_length = 0; - int index = 0; size_t num_rx_channel_fds; + int *channel_fds; + int index = 0; - if (is_socketxtreme()) { - /* socketXtreme mode support just single ring */ - res_length = 1; - return m_p_rx_ring->get_rx_channel_fds(num_rx_channel_fds); - } - - res_length = get_rings_num(); - if (m_p_rings_fds) { - return m_p_rings_fds; + /* + * We return RX channels for the TX ring to make it consistent and comparable with the RX + * rings. The channels are used only as indirect pointers to the rings, therefore, this + * doesn't introduce any functionality issues. + */ + ring *tx_ring = m_p_connected_dst_entry ? 
m_p_connected_dst_entry->get_ring() : nullptr; + if (tx_ring) { + channel_fds = tx_ring->get_rx_channel_fds(num_rx_channel_fds); + for (size_t i = 0; i < num_rx_channel_fds && index < ring_fds_sz; ++i) { + ring_fds[index++] = channel_fds[i]; + } } - m_p_rings_fds = new int[res_length]; - rx_ring_map_t::iterator it = m_rx_ring_map.begin(); - for (; it != m_rx_ring_map.end(); ++it) { - int *p_n_rx_channel_fds = it->first->get_rx_channel_fds(num_rx_channel_fds); - for (size_t j = 0; j < num_rx_channel_fds; ++j) { - int fd = p_n_rx_channel_fds[j]; - if (fd != -1) { - m_p_rings_fds[index] = fd; - ++index; - } else { - si_logdbg("got ring with fd -1"); - } + for (auto pair : m_rx_ring_map) { + if (tx_ring == pair.first) { + continue; + } + channel_fds = pair.first->get_rx_channel_fds(num_rx_channel_fds); + for (size_t i = 0; i < num_rx_channel_fds && index < ring_fds_sz; ++i) { + ring_fds[index++] = channel_fds[i]; } } - return m_p_rings_fds; + return index; } int sockinfo::setsockopt_kernel(int __level, int __optname, const void *__optval, diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index fc4ef8de9..2d8bd09ff 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -199,9 +199,8 @@ class sockinfo : public socket_fd_api, inline bool is_blocking(void) { return m_b_blocking; } bool flow_in_reuse(void) { return m_reuseaddr | m_reuseport; } - int *get_rings_fds(int &res_length) override; + int get_rings_fds(int *ring_fds, int ring_fds_sz) override; int get_rings_num() override; - bool check_rings() override { return m_p_rx_ring ? true : false; } void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) override; uint32_t get_flow_tag_val() { return m_flow_tag_id; } inline in_protocol_t get_protocol(void) { return m_protocol; } @@ -647,7 +646,6 @@ class sockinfo : public socket_fd_api, uint8_t m_n_uc_ttl_hop_lim; bool m_bind_no_port; bool m_is_ipv6only; - int *m_p_rings_fds; }; #endif /* BASE_SOCKINFO_H */ diff --git a/tests/gtest/extra_api/extra_poll.cc b/tests/gtest/extra_api/extra_poll.cc index 64481f861..d9b096293 100644 --- a/tests/gtest/extra_api/extra_poll.cc +++ b/tests/gtest/extra_api/extra_poll.cc @@ -379,48 +379,55 @@ TEST_F(socketxtreme_poll, ti_4_socket_isolation) bool received_data = false; char msg[] = "Hello"; - int _xlio_ring_fd = -1; - int _xlio_peer_ring_fd = -1; + int ring_fd[3] = {-1, -1, -1}; + int peer_ring_fd[3] = {-1, -1, -1}; + int ring_fd_nr; + int peer_ring_fd_nr = 0; struct xlio_socketxtreme_completion_t xlio_comps; int fd_peer = -1; struct sockaddr peer_addr; - auto poll_single_ring = [&](int ring_fd) { - rc = xlio_api->socketxtreme_poll(ring_fd, &xlio_comps, 1, SOCKETXTREME_POLL_TX); - if (rc == 0) { - return; - } - if ((xlio_comps.events & EPOLLERR) || (xlio_comps.events & EPOLLHUP) || - (xlio_comps.events & EPOLLRDHUP)) { - log_trace("Close connection: event: 0x%lx\n", xlio_comps.events); - rc = -1; - return; - } - if (xlio_comps.events & XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED) { - EXPECT_EQ(fd, (int)xlio_comps.listen_fd); - fd_peer = (int)xlio_comps.user_data; - memcpy(&peer_addr, &xlio_comps.src, sizeof(peer_addr)); - log_trace("Accepted connection: fd: %d from %s\n", fd_peer, - sys_addr2str((struct sockaddr *)&peer_addr)); - - rc = xlio_api->get_socket_rings_fds(fd_peer, &_xlio_peer_ring_fd, 1); - ASSERT_EQ(1, rc); - ASSERT_LE(0, _xlio_peer_ring_fd); - - rc = send(fd_peer, (void *)msg, sizeof(msg), 0); - EXPECT_EQ(static_cast(sizeof(msg)), rc); - } - if (xlio_comps.events & XLIO_SOCKETXTREME_PACKET) { - EXPECT_EQ(1U, 
xlio_comps.packet.num_bufs); - EXPECT_EQ(sizeof(msg), xlio_comps.packet.total_len); - EXPECT_TRUE(xlio_comps.packet.buff_lst->payload); - EXPECT_EQ(0, - strncmp(msg, (const char *)xlio_comps.packet.buff_lst->payload, - xlio_comps.packet.total_len)); - log_trace("Received data: user_data: %p data: %s\n", - (void *)((uintptr_t)xlio_comps.user_data), - (char *)xlio_comps.packet.buff_lst->payload); - received_data = true; + auto poll_rings = [&](int *rings, int rings_nr) { + for (int i = 0; i < rings_nr; ++i) { + rc = xlio_api->socketxtreme_poll(rings[i], &xlio_comps, 1, SOCKETXTREME_POLL_TX); + if (rc == 0) { + continue; + } + if ((xlio_comps.events & EPOLLERR) || (xlio_comps.events & EPOLLHUP) || + (xlio_comps.events & EPOLLRDHUP)) { + log_trace("Close connection: event: 0x%lx\n", xlio_comps.events); + rc = -1; + return; + } + if (xlio_comps.events & XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED) { + EXPECT_EQ(fd, (int)xlio_comps.listen_fd); + fd_peer = (int)xlio_comps.user_data; + memcpy(&peer_addr, &xlio_comps.src, sizeof(peer_addr)); + log_trace("Accepted connection: fd: %d from %s\n", fd_peer, + sys_addr2str((struct sockaddr *)&peer_addr)); + + rc = xlio_api->get_socket_rings_num(fd); + ASSERT_GE((int)ARRAY_SIZE(peer_ring_fd), rc); + + peer_ring_fd_nr = + xlio_api->get_socket_rings_fds(fd_peer, peer_ring_fd, ARRAY_SIZE(peer_ring_fd)); + ASSERT_LT(0, peer_ring_fd_nr); + + rc = send(fd_peer, (void *)msg, sizeof(msg), 0); + EXPECT_EQ(static_cast(sizeof(msg)), rc); + } + if (xlio_comps.events & XLIO_SOCKETXTREME_PACKET) { + EXPECT_EQ(1U, xlio_comps.packet.num_bufs); + EXPECT_EQ(sizeof(msg), xlio_comps.packet.total_len); + EXPECT_TRUE(xlio_comps.packet.buff_lst->payload); + EXPECT_EQ(0, + strncmp(msg, (const char *)xlio_comps.packet.buff_lst->payload, + xlio_comps.packet.total_len)); + log_trace("Received data: user_data: %p data: %s\n", + (void *)((uintptr_t)xlio_comps.user_data), + (char *)xlio_comps.packet.buff_lst->payload); + received_data = true; + } } rc = 0; }; @@ -453,15 +460,17 @@ TEST_F(socketxtreme_poll, ti_4_socket_isolation) rc = sock_noblock(fd); ASSERT_EQ(0, rc); - rc = xlio_api->get_socket_rings_fds(fd, &_xlio_ring_fd, 1); - ASSERT_EQ(1, rc); - ASSERT_LE(0, _xlio_ring_fd); + rc = xlio_api->get_socket_rings_num(fd); + ASSERT_GE((int)ARRAY_SIZE(ring_fd), rc); + + ring_fd_nr = xlio_api->get_socket_rings_fds(fd, ring_fd, ARRAY_SIZE(ring_fd)); + ASSERT_LT(0, ring_fd_nr); uint64_t ts = timestamp_ms(); ASSERT_NE(0LU, ts); rc = 0; while (rc == 0 && !received_data && !testing::Test::HasFailure()) { - poll_single_ring(_xlio_ring_fd); + poll_rings(ring_fd, ring_fd_nr); if (timestamp_ms_elapsed(ts, 500UL)) { log_trace("No data received by client within time limit\n"); break; @@ -490,17 +499,19 @@ TEST_F(socketxtreme_poll, ti_4_socket_isolation) rc = listen(fd, 5); CHECK_ERR_OK(rc); - rc = xlio_api->get_socket_rings_fds(fd, &_xlio_ring_fd, 1); - ASSERT_EQ(1, rc); - ASSERT_LE(0, _xlio_ring_fd); + rc = xlio_api->get_socket_rings_num(fd); + ASSERT_GE((int)ARRAY_SIZE(ring_fd), rc); + + ring_fd_nr = xlio_api->get_socket_rings_fds(fd, ring_fd, ARRAY_SIZE(ring_fd)); + ASSERT_LT(0, ring_fd_nr); barrier_fork(pid); rc = 0; while (rc == 0 && !child_fork_exit() && !testing::Test::HasFailure()) { - poll_single_ring(_xlio_ring_fd); - if (_xlio_peer_ring_fd >= 0 && _xlio_peer_ring_fd != _xlio_ring_fd && rc == 0) { - poll_single_ring(_xlio_peer_ring_fd); + poll_rings(ring_fd, ring_fd_nr); + if (peer_ring_fd_nr > 0 && rc == 0 && !testing::Test::HasFailure()) { + poll_rings(peer_ring_fd, peer_ring_fd_nr); } } 
diff --git a/tests/gtest/tcp/tcp_sockopt.cc b/tests/gtest/tcp/tcp_sockopt.cc index 10ec822ba..5dbc0cc76 100644 --- a/tests/gtest/tcp/tcp_sockopt.cc +++ b/tests/gtest/tcp/tcp_sockopt.cc @@ -258,6 +258,25 @@ TEST_F(tcp_sockopt, ti_3_setsockopt_isolate) SKIP_TRUE(server_addr.addr.sa_family == AF_INET && client_addr.addr.sa_family == AF_INET, "This test supports only IPv4"); + auto compare_rings_ne = [&](int *arr1, int arr1_nr, int *arr2, int arr2_nr) { + // Whether arr1 and arr2 don't overlap (contain different rings) + for (int i = 0; i < arr1_nr; ++i) { + for (int j = 0; j < arr2_nr; ++j) { + ASSERT_NE(arr1[i], arr2[j]); + } + } + }; + auto compare_rings_contains = [&](int *arr1, int arr1_nr, int *arr2, int arr2_nr) { + // Whether arr1 contains all arr2 + for (int i = 0; i < arr2_nr; ++i) { + bool contains = false; + for (int j = 0; j < arr1_nr; ++j) { + contains = contains || (arr2[i] == arr1[j]); + } + ASSERT_TRUE(contains); + } + }; + auto test_client = [&]() { char buf[64]; sockaddr_store_t addr; @@ -300,13 +319,20 @@ TEST_F(tcp_sockopt, ti_3_setsockopt_isolate) ASSERT_EQ(-1, rc); ASSERT_EQ(EINVAL, errno); - int xlio_ring_fds[3]; - int xlio_ring_fds2[3]; - rc = xlio_api->get_socket_rings_fds(sock, xlio_ring_fds, ARRAY_SIZE(xlio_ring_fds)); - ASSERT_EQ(1, rc); - rc = xlio_api->get_socket_rings_fds(sock2, xlio_ring_fds2, ARRAY_SIZE(xlio_ring_fds2)); - ASSERT_EQ(1, rc); - ASSERT_NE(xlio_ring_fds[0], xlio_ring_fds2[0]); + int ring_fds[3]; + int ring_fds2[3]; + int ring_fds_nr; + int ring_fds2_nr; + rc = xlio_api->get_socket_rings_num(sock); + ASSERT_LT(0, rc); + ASSERT_GE((int)ARRAY_SIZE(ring_fds), rc); + rc = xlio_api->get_socket_rings_num(sock2); + ASSERT_LT(0, rc); + ASSERT_GE((int)ARRAY_SIZE(ring_fds2), rc); + ring_fds_nr = xlio_api->get_socket_rings_fds(sock, ring_fds, ARRAY_SIZE(ring_fds)); + ring_fds2_nr = xlio_api->get_socket_rings_fds(sock2, ring_fds2, ARRAY_SIZE(ring_fds2)); + compare_rings_ne(ring_fds, ring_fds_nr, ring_fds2, ring_fds2_nr); + ASSERT_TRUE(!testing::Test::HasFailure()); len = write(sock, HELLO_STR, sizeof(HELLO_STR)); ASSERT_LT(0, len); @@ -393,20 +419,23 @@ TEST_F(tcp_sockopt, ti_3_setsockopt_isolate) ASSERT_EQ(EINVAL, errno); /* - * Check rings + * Check rings for listen sockets */ - int xlio_ring_fds[3]; - int xlio_ring_fds2[3]; - int xlio_ring_fds3[3]; - rc = xlio_api->get_socket_rings_fds(sock, xlio_ring_fds, ARRAY_SIZE(xlio_ring_fds)); - ASSERT_EQ(1, rc); - rc = xlio_api->get_socket_rings_fds(sock2, xlio_ring_fds2, ARRAY_SIZE(xlio_ring_fds2)); - ASSERT_EQ(1, rc); - rc = xlio_api->get_socket_rings_fds(sock3, xlio_ring_fds3, ARRAY_SIZE(xlio_ring_fds3)); - ASSERT_EQ(1, rc); - ASSERT_EQ(xlio_ring_fds[0], xlio_ring_fds2[0]); - ASSERT_NE(xlio_ring_fds[0], xlio_ring_fds3[0]); + int ring_fds[3]; + int ring_fds2[3]; + int ring_fds3[3]; + int ring_fds_nr; + int ring_fds2_nr; + int ring_fds3_nr; + ring_fds_nr = xlio_api->get_socket_rings_fds(sock, ring_fds, ARRAY_SIZE(ring_fds)); + ASSERT_EQ(1, ring_fds_nr); + ring_fds2_nr = xlio_api->get_socket_rings_fds(sock2, ring_fds2, ARRAY_SIZE(ring_fds2)); + ASSERT_EQ(1, ring_fds2_nr); + ring_fds3_nr = xlio_api->get_socket_rings_fds(sock3, ring_fds3, ARRAY_SIZE(ring_fds3)); + ASSERT_EQ(1, ring_fds3_nr); + ASSERT_EQ(ring_fds[0], ring_fds2[0]); + ASSERT_NE(ring_fds[0], ring_fds3[0]); // Notify client to proceed with connect() barrier_fork(pid); @@ -431,13 +460,20 @@ TEST_F(tcp_sockopt, ti_3_setsockopt_isolate) log_trace("Accepted connection: fd=%d from %s\n", sock_in2, sys_addr2str((struct sockaddr *)&peer_addr)); - rc = 
xlio_api->get_socket_rings_fds(sock_in, xlio_ring_fds2, ARRAY_SIZE(xlio_ring_fds2)); - ASSERT_EQ(1, rc); - rc = xlio_api->get_socket_rings_fds(sock_in2, xlio_ring_fds3, ARRAY_SIZE(xlio_ring_fds3)); - ASSERT_EQ(1, rc); + rc = xlio_api->get_socket_rings_num(sock_in); + ASSERT_LT(0, rc); + ASSERT_GE((int)ARRAY_SIZE(ring_fds2), rc); + rc = xlio_api->get_socket_rings_num(sock_in2); + ASSERT_LT(0, rc); + ASSERT_GE((int)ARRAY_SIZE(ring_fds3), rc); + + ring_fds2_nr = xlio_api->get_socket_rings_fds(sock_in, ring_fds2, ARRAY_SIZE(ring_fds2)); + ring_fds3_nr = xlio_api->get_socket_rings_fds(sock_in2, ring_fds3, ARRAY_SIZE(ring_fds3)); // Incoming TCP sockets inherit ring allocation logic from their parents - ASSERT_EQ(xlio_ring_fds[0], xlio_ring_fds2[0]); - ASSERT_EQ(xlio_ring_fds[0], xlio_ring_fds3[0]); + compare_rings_contains(ring_fds2, ring_fds2_nr, ring_fds, ring_fds_nr); + ASSERT_TRUE(!testing::Test::HasFailure()); + compare_rings_contains(ring_fds3, ring_fds3_nr, ring_fds, ring_fds_nr); + ASSERT_TRUE(!testing::Test::HasFailure()); /* * Socket read / write From 757e0a1891af67d5f72a09b6ed90fa741614d47b Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Fri, 23 Feb 2024 17:39:00 +0200 Subject: [PATCH 088/169] issue: 3788369 Don't reset TCP connection twice XLIO_TCP_ABORT_ON_CLOSE option forces XLIO to send RST instead of FIN handshake. In this case, XLIO reset each connection twice. Fix this and check only once whether connection needs to send RST in prepare_to_close(). Signed-off-by: Dmytro Podgornyi --- src/core/sock/sockinfo_tcp.cpp | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 5a0d07d34..f83c12f51 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -528,15 +528,15 @@ bool sockinfo_tcp::prepare_listen_to_close() bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) { - bool do_abort = safe_mce_sys().tcp_abort_on_close; - bool state; + si_tcp_logdbg(""); lock_tcp_con(); - si_tcp_logdbg(""); - + bool do_abort = safe_mce_sys().tcp_abort_on_close || m_n_rx_pkt_ready_list_count; bool is_listen_socket = is_server() || get_tcp_state(&m_pcb) == LISTEN; + m_state = SOCKINFO_CLOSING; + /* * consider process_shutdown: * workaround for LBM which does not close the listen sockets properly on process shutdown. @@ -549,9 +549,6 @@ bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) m_sock_state == TCP_SOCK_CONNECTED_WR || m_sock_state == TCP_SOCK_CONNECTED_RDWR) { m_sock_state = TCP_SOCK_BOUND; } - if (!is_listen_socket && (do_abort || m_n_rx_pkt_ready_list_count)) { - abort_connection(); - } m_rx_ready_byte_count += m_rx_pkt_ready_offset; m_p_socket_stats->n_rx_ready_byte_count += m_rx_pkt_ready_offset; @@ -615,7 +612,7 @@ bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) * termination sequence * If process_shutdown is set as True do abort() with setting tcp state as CLOSED */ - if (get_tcp_state(&m_pcb) != LISTEN && + if (!is_listen_socket && (do_abort || process_shutdown || (m_linger.l_onoff && !m_linger.l_linger))) { abort_connection(); } else { @@ -630,25 +627,22 @@ bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) } else { tcp_recv(&m_pcb, sockinfo_tcp::rx_drop_lwip_cb); tcp_sent(&m_pcb, nullptr); - } - - // todo should we do this each time we get into prepare_to_close ? 
- if (get_tcp_state(&m_pcb) != LISTEN) { - handle_socket_linger(); + if (m_linger.l_onoff && m_linger.l_linger) { + // TODO Should we do this each time we get into prepare_to_close? + handle_socket_linger(); + } } } - m_state = SOCKINFO_CLOSING; NOTIFY_ON_EVENTS(this, EPOLLHUP); - do_wakeup(); if (m_econtext) { m_econtext->fd_closed(m_fd); } - state = is_closable(); - if (state) { + bool is_closable_state = is_closable(); + if (is_closable_state) { m_state = SOCKINFO_CLOSED; reset_ops(); } else if (!is_listen_socket) { @@ -663,7 +657,7 @@ bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) unlock_tcp_con(); - return state; + return is_closable_state; } void sockinfo_tcp::handle_socket_linger() From 621535993d6255bc87589c43605cc58fe9908186 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Tue, 27 Feb 2024 17:59:44 +0200 Subject: [PATCH 089/169] issue: 3788369 Don't hardcode TCP send buffer for TCP_NODELAY Once TCP_NODELAY option is set, XLIO disables Nagle's algorithm and sets TCP send buffer to a hardcoded value. This applies to setsockopt() and XLIO_TCP_NODELAY configuration. Avoid changing the send buffer and continues using either SO_SNDBUF or XLIO_TCP_SEND_BUFFER_SIZE value. Also, remove duplicated send buffer size from sockinfo_tcp object which was used only by getsockopt(). Signed-off-by: Dmytro Podgornyi --- src/core/lwip/opt.h | 6 ------ src/core/lwip/tcp.c | 4 ++-- src/core/sock/sockinfo_tcp.cpp | 30 +++++++----------------------- src/core/sock/sockinfo_tcp.h | 3 --- 4 files changed, 9 insertions(+), 34 deletions(-) diff --git a/src/core/lwip/opt.h b/src/core/lwip/opt.h index 466606f83..618a8a080 100644 --- a/src/core/lwip/opt.h +++ b/src/core/lwip/opt.h @@ -91,12 +91,6 @@ #define CONST_TCP_MSS 1460 #define LWIP_TCP_MSS (lwip_tcp_mss) -/** - * TCP_SND_BUF: TCP sender buffer space (bytes). - */ -#define TCP_SND_BUF (lwip_tcp_snd_buf) -#define TCP_SND_BUF_NO_NAGLE 256000 - /* Misc */ // replace lwip byte swapping to optimized one diff --git a/src/core/lwip/tcp.c b/src/core/lwip/tcp.c index 2ed3ba28f..4656da55a 100644 --- a/src/core/lwip/tcp.c +++ b/src/core/lwip/tcp.c @@ -907,9 +907,9 @@ void tcp_pcb_init(struct tcp_pcb *pcb, u8_t prio, void *container) memset(pcb, 0, sizeof(*pcb)); pcb->my_container = container; - pcb->max_snd_buff = TCP_SND_BUF; pcb->is_last_seg_dropped = false; pcb->prio = prio; + pcb->max_snd_buff = lwip_tcp_snd_buf; pcb->snd_buf = pcb->max_snd_buff; pcb->snd_queuelen = 0; pcb->snd_scale = 0; @@ -984,7 +984,7 @@ void tcp_pcb_recycle(struct tcp_pcb *pcb) u32_t iss; pcb->flags = 0; - pcb->max_snd_buff = TCP_SND_BUF; + pcb->max_snd_buff = lwip_tcp_snd_buf; pcb->snd_buf = pcb->max_snd_buff; pcb->user_timeout_ms = 0; pcb->ticks_since_data_sent = -1; diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index f83c12f51..ea85cfeab 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -340,8 +340,6 @@ sockinfo_tcp::sockinfo_tcp(int fd, int domain) m_parent = nullptr; m_iomux_ready_fd_array = nullptr; - /* SNDBUF accounting */ - m_sndbuff_max = 0; /* RCVBUF accounting */ m_rcvbuff_max = safe_mce_sys().sysctl_reader.get_tcp_rmem()->default_value; @@ -3324,7 +3322,6 @@ err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t e tcp_nagle_disabled(&new_sock->m_pcb)) { conn_nagle_disabled ? 
tcp_nagle_disable(&new_sock->m_pcb) : tcp_nagle_enable(&new_sock->m_pcb); - new_sock->fit_snd_bufs_to_nagle(conn_nagle_disabled); } if (new_sock->m_conn_state == TCP_CONN_INIT) { @@ -4138,26 +4135,14 @@ void sockinfo_tcp::fit_rcv_wnd(bool force_fit) void sockinfo_tcp::fit_snd_bufs(unsigned int new_max_snd_buff) { - m_pcb.snd_buf += (new_max_snd_buff - m_pcb.max_snd_buff); + // snd_buf can become negative + m_pcb.snd_buf += ((int)new_max_snd_buff - m_pcb.max_snd_buff); m_pcb.max_snd_buff = new_max_snd_buff; auto mss = m_pcb.mss ?: 536; m_pcb.max_unsent_len = (mss - 1 + m_pcb.max_snd_buff * 16) / mss; } -void sockinfo_tcp::fit_snd_bufs_to_nagle(bool disable_nagle) -{ - if (m_sndbuff_max) { - return; - } - - if (disable_nagle) { - fit_snd_bufs(TCP_SND_BUF_NO_NAGLE); - } else { - fit_snd_bufs(TCP_SND_BUF); - } -} - //////////////////////////////////////////////////////////////////////////////// bool sockinfo_tcp::try_un_offloading() // un-offload the socket if possible { @@ -4248,7 +4233,6 @@ int sockinfo_tcp::tcp_setsockopt(int __level, int __optname, __const void *__opt } else { tcp_nagle_enable(&m_pcb); } - fit_snd_bufs_to_nagle(val); unlock_tcp_con(); si_tcp_logdbg("(TCP_NODELAY) nagle: %d", val); break; @@ -4423,10 +4407,10 @@ int sockinfo_tcp::tcp_setsockopt(int __level, int __optname, __const void *__opt lock_tcp_con(); // OS allocates double the size of memory requested by the application - not sure we // need it. - m_sndbuff_max = std::max(2 * m_pcb.mss, 2 * val); - fit_snd_bufs(m_sndbuff_max); + val = std::max(2 * m_pcb.mss, 2 * val); + fit_snd_bufs(val); unlock_tcp_con(); - si_tcp_logdbg("setsockopt SO_SNDBUF: %d", m_sndbuff_max); + si_tcp_logdbg("setsockopt SO_SNDBUF: requested %d, set %d", *(int *)__optval, val); break; case SO_LINGER: if (__optlen < sizeof(struct linger)) { @@ -4823,8 +4807,8 @@ int sockinfo_tcp::getsockopt_offload(int __level, int __optname, void *__optval, break; case SO_SNDBUF: if (*__optlen >= sizeof(int)) { - *(int *)__optval = m_sndbuff_max; - si_tcp_logdbg("(SO_SNDBUF) sndbuf=%d", m_sndbuff_max); + *(int *)__optval = m_pcb.max_snd_buff; + si_tcp_logdbg("(SO_SNDBUF) sndbuf=%d", *(int *)__optval); ret = 0; } else { errno = EINVAL; diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index ea43b9753..611007ab2 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -513,7 +513,6 @@ class sockinfo_tcp : public sockinfo, public timer_handler { int rx_wait_helper(int &poll_count, bool blocking); void fit_rcv_wnd(bool force_fit); void fit_snd_bufs(unsigned int new_max); - void fit_snd_bufs_to_nagle(bool disable_nagle); inline struct tcp_seg *get_tcp_seg_cached(); inline struct tcp_seg *get_tcp_seg_direct(); @@ -552,8 +551,6 @@ class sockinfo_tcp : public sockinfo, public timer_handler { bool m_b_attached; /* connection state machine */ int m_conn_timeout; - /* SNDBUF acconting */ - int m_sndbuff_max; /* RCVBUF acconting */ int m_rcvbuff_max; int m_rcvbuff_current; From a750ddc0a96591a72bc1a2bb6001a5f48cbb37f5 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Thu, 29 Feb 2024 11:01:17 +0200 Subject: [PATCH 090/169] issue: 3788369 Disable MSG_ZEROCOPY tests in gtests The tests fail to receive notifications via the error queue and disabled in CI as result. However, running the tests manually is inconvenient, because of the known failing test cases. Disable the failing tests in the code instead of CI script. 
Signed-off-by: Dmytro Podgornyi --- contrib/jenkins_tests/gtest.sh | 8 ++++---- tests/gtest/core/xlio_send_zc.cc | 4 ++-- tests/gtest/tcp/tcp_send_zc.cc | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/contrib/jenkins_tests/gtest.sh b/contrib/jenkins_tests/gtest.sh index 1ac0c3692..31bcf41eb 100755 --- a/contrib/jenkins_tests/gtest.sh +++ b/contrib/jenkins_tests/gtest.sh @@ -73,19 +73,19 @@ eval "${sudo_cmd} pkill -9 ${prj_service} 2>/dev/null || true" eval "${sudo_cmd} ${install_dir}/sbin/${prj_service} --console -v5 &" # Exclude EXTRA API tests -eval "${sudo_cmd} $timeout_exe env GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt --gtest_filter=-xlio_*:tcp_send_zc* --gtest_output=xml:${WORKSPACE}/${prefix}/test-basic.xml" +eval "${sudo_cmd} $timeout_exe env GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt --gtest_filter=-xlio_* --gtest_output=xml:${WORKSPACE}/${prefix}/test-basic.xml" rc=$(($rc+$?)) # Exclude EXTRA API tests IPv6 -eval "${sudo_cmd} $timeout_exe env GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt_ipv6 --gtest_filter=-xlio_*:tcp_send_zc* --gtest_output=xml:${WORKSPACE}/${prefix}/test-basic-ipv6.xml" +eval "${sudo_cmd} $timeout_exe env GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt_ipv6 --gtest_filter=-xlio_* --gtest_output=xml:${WORKSPACE}/${prefix}/test-basic-ipv6.xml" rc=$(($rc+$?)) # Verify Delegated TCP Timers tests -eval "${sudo_cmd} $timeout_exe env XLIO_RX_POLL_ON_TX_TCP=1 XLIO_TCP_ABORT_ON_CLOSE=1 XLIO_TCP_CTL_THREAD=delegate GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt --gtest_filter=-xlio*:tcp_send_zc* --gtest_output=xml:${WORKSPACE}/${prefix}/test-delegate.xml" +eval "${sudo_cmd} $timeout_exe env XLIO_RX_POLL_ON_TX_TCP=1 XLIO_TCP_ABORT_ON_CLOSE=1 XLIO_TCP_CTL_THREAD=delegate GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt --gtest_filter=-xlio* --gtest_output=xml:${WORKSPACE}/${prefix}/test-delegate.xml" rc=$(($rc+$?)) # Verify Delegated TCP Timers tests IPv6 -eval "${sudo_cmd} $timeout_exe env XLIO_RX_POLL_ON_TX_TCP=1 XLIO_TCP_ABORT_ON_CLOSE=1 XLIO_TCP_CTL_THREAD=delegate GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt_ipv6 --gtest_filter=-xlio*:tcp_send_zc* --gtest_output=xml:${WORKSPACE}/${prefix}/test-delegate-ipv6.xml" +eval "${sudo_cmd} $timeout_exe env XLIO_RX_POLL_ON_TX_TCP=1 XLIO_TCP_ABORT_ON_CLOSE=1 XLIO_TCP_CTL_THREAD=delegate GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt_ipv6 --gtest_filter=-xlio* --gtest_output=xml:${WORKSPACE}/${prefix}/test-delegate-ipv6.xml" rc=$(($rc+$?)) if [[ -z "${MANUAL_RUN}" ]]; then diff --git a/tests/gtest/core/xlio_send_zc.cc b/tests/gtest/core/xlio_send_zc.cc index cf30a4bc5..a4ba5e5a7 100644 --- a/tests/gtest/core/xlio_send_zc.cc +++ b/tests/gtest/core/xlio_send_zc.cc @@ -421,7 +421,7 @@ TEST_F(xlio_send_zc, ti_2) * Send data using few sendmsg(MSG_ZEROCOPY) * @details */ -TEST_F(xlio_send_zc, ti_3_few_send) +TEST_F(xlio_send_zc, DISABLED_ti_3_few_send) { int rc = EOK; int test_iter = 3; @@ -576,7 +576,7 @@ TEST_F(xlio_send_zc, ti_3_few_send) * single call * @details */ -TEST_F(xlio_send_zc, ti_4_large_send) +TEST_F(xlio_send_zc, DISABLED_ti_4_large_send) { int rc = EOK; diff --git a/tests/gtest/tcp/tcp_send_zc.cc b/tests/gtest/tcp/tcp_send_zc.cc index e9a8e502d..663b2db2b 100644 --- a/tests/gtest/tcp/tcp_send_zc.cc +++ b/tests/gtest/tcp/tcp_send_zc.cc @@ -50,7 +50,7 @@ * Send data using single send(MSG_ZEROCOPY) * @details */ -TEST_F(tcp_send_zc, ti_1_send_once) +TEST_F(tcp_send_zc, DISABLED_ti_1_send_once) { int rc = EOK; 
char test_msg[] = "Hello test"; @@ -150,7 +150,7 @@ TEST_F(tcp_send_zc, ti_1_send_once) * Send data using few sendmsg(MSG_ZEROCOPY) * @details */ -TEST_F(tcp_send_zc, ti_2_few_send) +TEST_F(tcp_send_zc, DISABLED_ti_2_few_send) { int rc = EOK; int test_iter = 3; @@ -276,7 +276,7 @@ TEST_F(tcp_send_zc, ti_2_few_send) * single call * @details */ -TEST_F(tcp_send_zc, ti_3_large_send) +TEST_F(tcp_send_zc, DISABLED_ti_3_large_send) { int rc = EOK; @@ -403,7 +403,7 @@ TEST_F(tcp_send_zc, ti_3_large_send) * notification after every call * @details */ -TEST_F(tcp_send_zc, ti_4_mass_send_check_every_call) +TEST_F(tcp_send_zc, DISABLED_ti_4_mass_send_check_every_call) { int rc = EOK; struct { @@ -549,7 +549,7 @@ TEST_F(tcp_send_zc, ti_4_mass_send_check_every_call) * notification after last call * @details */ -TEST_F(tcp_send_zc, ti_5_mass_send_check_last_call) +TEST_F(tcp_send_zc, DISABLED_ti_5_mass_send_check_last_call) { int rc = EOK; struct { @@ -684,7 +684,7 @@ TEST_F(tcp_send_zc, ti_5_mass_send_check_last_call) * Verify epoll notification * @details */ -TEST_F(tcp_send_zc, ti_6_epoll_notification) +TEST_F(tcp_send_zc, DISABLED_ti_6_epoll_notification) { int rc = EOK; char test_msg[] = "Hello test"; From 7a4f2dab46746de57fb79a995563b76a659cd753 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sat, 2 Mar 2024 03:18:16 +0200 Subject: [PATCH 091/169] issue: 3788369 Remove XLIO_ZC_TX_SIZE XLIO_ZC_TX_SIZE limited zerocopy segment size to the specified 16bit value. By default, this was 32KB. However, pbuf len is not limited to 16bit anymore and TCP segment has TSO limitation instead. Remove the option and use TSO capability to limit zerocopy TCP segments. Signed-off-by: Dmytro Podgornyi --- src/core/lwip/tcp.c | 1 - src/core/lwip/tcp.h | 1 - src/core/lwip/tcp_out.c | 2 +- src/core/main.cpp | 2 -- src/core/proto/xlio_lwip.cpp | 1 - src/core/sock/sockinfo_tcp.cpp | 2 -- src/core/util/sys_vars.cpp | 11 ----------- src/core/util/sys_vars.h | 4 ---- 8 files changed, 1 insertion(+), 23 deletions(-) diff --git a/src/core/lwip/tcp.c b/src/core/lwip/tcp.c index 4656da55a..ed8a707ed 100644 --- a/src/core/lwip/tcp.c +++ b/src/core/lwip/tcp.c @@ -94,7 +94,6 @@ u16_t lwip_tcp_mss = CONST_TCP_MSS; u8_t enable_push_flag = 1; u8_t enable_ts_option = 0; u32_t lwip_tcp_snd_buf = 0; -u32_t lwip_zc_tx_size = 0; u32_t lwip_tcp_nodelay_treshold = 0; /* slow timer value */ diff --git a/src/core/lwip/tcp.h b/src/core/lwip/tcp.h index 92cabc79c..72867fe44 100644 --- a/src/core/lwip/tcp.h +++ b/src/core/lwip/tcp.h @@ -50,7 +50,6 @@ void register_sys_now(sys_now_fn fn); extern u16_t lwip_tcp_mss; extern u32_t lwip_tcp_snd_buf; -extern u32_t lwip_zc_tx_size; extern u32_t lwip_tcp_nodelay_treshold; struct tcp_seg; diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 13655b487..37f1c1602 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -746,7 +746,7 @@ err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_de struct tcp_seg *seg = NULL, *prev_seg = NULL, *queue = NULL; u32_t pos = 0; /* position in 'arg' data */ u8_t optflags = TF_SEG_OPTS_ZEROCOPY; - const u32_t mss_local = lwip_zc_tx_size; + const u32_t mss_local = tcp_tso(pcb) ? 
pcb->tso.max_payload_sz : pcb->mss; u32_t seglen; u16_t queuelen = 0; diff --git a/src/core/main.cpp b/src/core/main.cpp index 588cfe484..738e95506 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -558,8 +558,6 @@ void print_xlio_global_settings() SYS_VAR_TX_NUM_BUFS); VLOG_PARAM_STRING("Tx Mem Buf size", safe_mce_sys().tx_buf_size, MCE_DEFAULT_TX_BUF_SIZE, SYS_VAR_TX_BUF_SIZE, option_size::to_str(safe_mce_sys().tx_buf_size)); - VLOG_PARAM_STRING("ZC TX size", safe_mce_sys().zc_tx_size, MCE_DEFAULT_ZC_TX_SIZE, - SYS_VAR_ZC_TX_SIZE, option_size::to_str(safe_mce_sys().zc_tx_size)); VLOG_PARAM_NUMBER("Tx QP WRE", safe_mce_sys().tx_num_wr, MCE_DEFAULT_TX_NUM_WRE, SYS_VAR_TX_NUM_WRE); VLOG_PARAM_NUMBER("Tx QP WRE Batching", safe_mce_sys().tx_num_wr_to_signal, diff --git a/src/core/proto/xlio_lwip.cpp b/src/core/proto/xlio_lwip.cpp index 97d077fc6..ef2b86d23 100644 --- a/src/core/proto/xlio_lwip.cpp +++ b/src/core/proto/xlio_lwip.cpp @@ -102,7 +102,6 @@ xlio_lwip::xlio_lwip() lwip_tcp_mss = get_lwip_tcp_mss(safe_mce_sys().mtu, safe_mce_sys().lwip_mss); lwip_tcp_snd_buf = safe_mce_sys().tcp_send_buffer_size; - lwip_zc_tx_size = safe_mce_sys().zc_tx_size; lwip_tcp_nodelay_treshold = safe_mce_sys().tcp_nodelay_treshold; BULLSEYE_EXCLUDE_BLOCK_END diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index ea85cfeab..09c82aa61 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -754,8 +754,6 @@ bool sockinfo_tcp::prepare_dst_to_send(bool is_accepted_socket /* = false */) m_pcb.tso.max_send_sge = ring->get_max_send_sge(); /* reserve one slot for network headers of zerocopy segments */ m_pcb.max_send_sge = m_pcb.tso.max_send_sge - 1; - safe_mce_sys().zc_tx_size = - std::min(safe_mce_sys().zc_tx_size, m_pcb.tso.max_payload_sz); } } return ret_val; diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index 28540bea5..717140cb9 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -775,7 +775,6 @@ void mce_sys_var::get_env_params() zc_cache_threshold = MCE_DEFAULT_ZC_CACHE_THRESHOLD; tx_num_bufs = MCE_DEFAULT_TX_NUM_BUFS; tx_buf_size = MCE_DEFAULT_TX_BUF_SIZE; - zc_tx_size = MCE_DEFAULT_ZC_TX_SIZE; tcp_nodelay_treshold = MCE_DEFAULT_TCP_NODELAY_TRESHOLD; tx_num_wr = MCE_DEFAULT_TX_NUM_WRE; tx_num_wr_to_signal = MCE_DEFAULT_TX_NUM_WRE_TO_SIGNAL; @@ -1280,16 +1279,6 @@ void mce_sys_var::get_env_params() tx_buf_size = (uint32_t)option_size::from_str(env_ptr); } - if ((env_ptr = getenv(SYS_VAR_ZC_TX_SIZE))) { - zc_tx_size = (uint32_t)option_size::from_str(env_ptr); - if (zc_tx_size > MCE_MAX_ZC_TX_SIZE) { - vlog_printf(VLOG_WARNING, - "ZC TX size [%u] exceeds the maximum (max=%u), setting to default.\n", - zc_tx_size, MCE_MAX_ZC_TX_SIZE); - zc_tx_size = MCE_DEFAULT_ZC_TX_SIZE; - } - } - if ((env_ptr = getenv(SYS_VAR_TCP_NODELAY_TRESHOLD))) { tcp_nodelay_treshold = (uint32_t)atoi(env_ptr); } diff --git a/src/core/util/sys_vars.h b/src/core/util/sys_vars.h index 442e8b597..eaee8e793 100644 --- a/src/core/util/sys_vars.h +++ b/src/core/util/sys_vars.h @@ -371,7 +371,6 @@ struct mce_sys_var { size_t zc_cache_threshold; uint32_t tx_num_bufs; uint32_t tx_buf_size; - uint32_t zc_tx_size; uint32_t tcp_nodelay_treshold; uint32_t tx_num_wr; uint32_t tx_num_wr_to_signal; @@ -577,7 +576,6 @@ extern mce_sys_var &safe_mce_sys(); #define SYS_VAR_ZC_CACHE_THRESHOLD "XLIO_ZC_CACHE_THRESHOLD" #define SYS_VAR_TX_NUM_BUFS "XLIO_TX_BUFS" #define SYS_VAR_TX_BUF_SIZE "XLIO_TX_BUF_SIZE" -#define SYS_VAR_ZC_TX_SIZE "XLIO_ZC_TX_SIZE" 
#define SYS_VAR_TCP_NODELAY_TRESHOLD "XLIO_TCP_NODELAY_TRESHOLD" #define SYS_VAR_TX_NUM_WRE "XLIO_TX_WRE" #define SYS_VAR_TX_NUM_WRE_TO_SIGNAL "XLIO_TX_WRE_BATCHING" @@ -733,7 +731,6 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_DEFAULT_RING_LIMIT_PER_INTERFACE (0) #define MCE_DEFAULT_RING_DEV_MEM_TX (0) #define MCE_DEFAULT_TCP_MAX_SYN_RATE (0) -#define MCE_DEFAULT_ZC_TX_SIZE (32768) #define MCE_DEFAULT_TCP_NODELAY_TRESHOLD (0) #define MCE_DEFAULT_ZC_CACHE_THRESHOLD (10LU * 1024 * 1024 * 1024) // 10GB #define MCE_DEFAULT_TX_NUM_BUFS (200000) @@ -855,7 +852,6 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_MAX_RX_NUM_POLLS (100000000) #define MCE_MIN_RX_PREFETCH_BYTES (32) /* Just enough for headers (IPoIB+IP+UDP)*/ #define MCE_MAX_RX_PREFETCH_BYTES (2044) -#define MCE_MAX_ZC_TX_SIZE (65535) #define MCE_RX_CQ_DRAIN_RATE_DISABLED (0) #define MCE_CQ_DRAIN_INTERVAL_DISABLED (0) #define MCE_CQ_ADAPTIVE_MODERATION_DISABLED (0) From 9c38f688b92b5d2c87fc8c5f5e9a0c53defc560b Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sat, 2 Mar 2024 03:30:30 +0200 Subject: [PATCH 092/169] issue: 3788369 Remove redundant max_send_sge field tcp_pcb::max_send_sge is a copy of tcp_pcb::tso.max_send_sge, but reduced by 1. It was used in zerocopy POC implementation, because zerocopy segments were TSO segments with the TCP header in the 1st sge. Current implementation uses tcp_seg::l2_l3_tcphdr_zc[] to hold the header and doesn't require the reduced limit. Signed-off-by: Dmytro Podgornyi --- src/core/lwip/tcp.h | 2 -- src/core/lwip/tcp_out.c | 2 +- src/core/sock/sockinfo_tcp.cpp | 2 -- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/core/lwip/tcp.h b/src/core/lwip/tcp.h index 72867fe44..772fdf9b7 100644 --- a/src/core/lwip/tcp.h +++ b/src/core/lwip/tcp.h @@ -418,8 +418,6 @@ struct tcp_pcb { /* Maximum number of SGE */ u32_t max_send_sge; } tso; - - u32_t max_send_sge; }; typedef u16_t (*ip_route_mtu_fn)(struct tcp_pcb *pcb); diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 37f1c1602..4177a7a49 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -1075,7 +1075,7 @@ static void tcp_tso_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) } tot_p += pbuf_clen(cur_seg->p); - if (tot_p > pcb->max_send_sge) { + if (tot_p > pcb->tso.max_send_sge) { goto err; } diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 09c82aa61..013210229 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -752,8 +752,6 @@ bool sockinfo_tcp::prepare_dst_to_send(bool is_accepted_socket /* = false */) m_pcb.tso.max_payload_sz = ring->get_max_payload_sz(); m_pcb.tso.max_header_sz = ring->get_max_header_sz(); m_pcb.tso.max_send_sge = ring->get_max_send_sge(); - /* reserve one slot for network headers of zerocopy segments */ - m_pcb.max_send_sge = m_pcb.tso.max_send_sge - 1; } } return ret_val; From cee8ca70f9a076740729d8536ac58444212baa3d Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sat, 2 Mar 2024 03:42:07 +0200 Subject: [PATCH 093/169] issue: 3788369 Pass iovec to tcp_write_express() tcp_write_express() is a zerocopy version of tcp_write() and not limited by the sndbuf or queue length. Therefore, it's unlikely to fail. However, it still can fail to allocate pbuf or TCP segment in theory. Move the loop over iovec from higher level methods to tcp_write_express(), so we keep consistent TCP state if the memory allocation error happens in the middle of a complex send operation. 
Additionally, improve tcp_write_express() robustness and don't allow to append data to last segment if it's a retransmit. Signed-off-by: Dmytro Podgornyi --- src/core/lwip/tcp.h | 3 +- src/core/lwip/tcp_out.c | 181 ++++++++++++++++++++------------- src/core/sock/sockinfo_tcp.cpp | 53 ++++------ 3 files changed, 131 insertions(+), 106 deletions(-) diff --git a/src/core/lwip/tcp.h b/src/core/lwip/tcp.h index 772fdf9b7..5b818ebb8 100644 --- a/src/core/lwip/tcp.h +++ b/src/core/lwip/tcp.h @@ -476,7 +476,8 @@ err_t tcp_shutdown(struct tcp_pcb *pcb, int shut_rx, int shut_tx); err_t tcp_write(struct tcp_pcb *pcb, const void *dataptr, u32_t len, u16_t apiflags, pbuf_desc *desc); -err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *desc); +err_t tcp_write_express(struct tcp_pcb *pcb, const struct iovec *iov, u32_t iovcnt, + pbuf_desc *desc); #define TCP_PRIO_MIN 1 #define TCP_PRIO_NORMAL 64 diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 4177a7a49..5aaf71d2d 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -726,122 +726,159 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, /** * Write data for sending (but does not send it immediately). * - * It waits in the expectation of more data being sent soon (as - * it can send them more efficiently by combining them together). - * To prompt the system to send data now, call tcp_output() after - * calling tcp_write_express(). - * * The function will zero-copy the data into the payload, i.e. the data pointer, instead of the - * data, will be copied. + * data, will be set. * * @param pcb Protocol control block for the TCP connection to enqueue data for. - * @param arg Pointer to the data to be enqueued for sending. - * @param len Data length in bytes + * @param iov Vector of the data buffers to be enqueued for sending. + * @param iovcnt Number of the iov elements. * @param desc Additional metadata that allows later to check the data mkey/lkey. * @return ERR_OK if enqueued, another err_t on error */ -err_t tcp_write_express(struct tcp_pcb *pcb, const void *arg, u32_t len, pbuf_desc *desc) +err_t tcp_write_express(struct tcp_pcb *pcb, const struct iovec *iov, u32_t iovcnt, pbuf_desc *desc) { struct pbuf *p; - struct tcp_seg *seg = NULL, *prev_seg = NULL, *queue = NULL; - u32_t pos = 0; /* position in 'arg' data */ - u8_t optflags = TF_SEG_OPTS_ZEROCOPY; - const u32_t mss_local = tcp_tso(pcb) ? pcb->tso.max_payload_sz : pcb->mss; + struct tcp_seg *seg = NULL; + struct tcp_seg *queue = NULL; + struct tcp_seg *last; + const u32_t seglen_max = tcp_tso(pcb) ? pcb->tso.max_payload_sz : pcb->mss; + u32_t pos; u32_t seglen; + u32_t last_seglen; + u32_t total_len = 0; u16_t queuelen = 0; - - if (len < pcb->mss) { - const int byte_queued = pcb->snd_nxt - pcb->lastack; - pcb->snd_sml_add = (pcb->unacked ? pcb->unacked->len : 0) + byte_queued; - } + u8_t optflags = TF_SEG_OPTS_ZEROCOPY; /* - * Chain a new pbuf to the end of pcb->unsent if there is enough space. - * - * We may run out of memory at any point. In that case we must - * return ERR_MEM and not change anything in pcb. Therefore, all - * changes are recorded in local variables and committed at the end - * of the function. Some pcb fields are maintained in local copies: - * - * queuelen = pcb->snd_queuelen - * - * These variables are set consistently by the phases. - * seg points to the last segment tampered with. - * pos records progress as data is segmented. + * We may run out of memory at any point. 
In that case we must return ERR_MEM and not change + * anything in pcb. Therefore, all changes are recorded in local variables and committed at + * the end of the function. Some pcb fields are maintained in local copies. */ - if (pcb->unsent != NULL) { - seg = pcb->last_unsent; - u32_t space = LWIP_MAX(mss_local, pcb->tso.max_payload_sz) - seg->len; - if (space > 0 && (seg->flags & TF_SEG_OPTS_ZEROCOPY) && - pbuf_clen(seg->p) < pcb->tso.max_send_sge) { - seglen = space < len ? space : len; + last = pcb->last_unsent; + const bool can_merge = + last && (last->flags & TF_SEG_OPTS_ZEROCOPY) && TCP_SEQ_GEQ(last->seqno, pcb->snd_nxt); + if (!can_merge) { + /* We cannot append data to a segment of different type or a retransmitted segment. */ + last = NULL; + } + last_seglen = last ? last->len : 0; + + for (unsigned i = 0; i < iovcnt; ++i) { + u8_t *data = (u8_t *)iov[i].iov_base; + const u32_t len = iov[i].iov_len; + pos = 0; - if ((p = tcp_pbuf_prealloc_express(seglen, pcb, PBUF_ZEROCOPY, desc, NULL)) == NULL) { - goto memerr; + /* Chain a new pbuf to the last segment if there is enough space. */ + if (last) { + seg = last; + const u32_t space = seglen_max - seg->len; + + if (space > 0 && pbuf_clen(seg->p) < pcb->tso.max_send_sge) { + seglen = space < len ? space : len; + + p = tcp_pbuf_prealloc_express(seglen, pcb, PBUF_ZEROCOPY, desc, NULL); + if (!p) { + goto memerr; + } + p->payload = data; + pbuf_cat(seg->p, p); + seg->len += p->tot_len; + pos += seglen; + queuelen++; } - p->payload = (u8_t *)arg; - pbuf_cat(seg->p, p); - seg->len += p->tot_len; - pos += seglen; - queuelen++; } - } - while (pos < len) { - u32_t left = len - pos; - seglen = left > mss_local ? mss_local : left; + while (pos < len) { + u32_t left = len - pos; + seglen = left > seglen_max ? seglen_max : left; - if ((p = tcp_pbuf_prealloc_express(seglen, pcb, PBUF_ZEROCOPY, desc, NULL)) == NULL) { - goto memerr; - } - p->payload = (u8_t *)arg + pos; - queuelen++; + p = tcp_pbuf_prealloc_express(seglen, pcb, PBUF_ZEROCOPY, desc, NULL); + if (!p) { + goto memerr; + } + p->payload = data + pos; - if ((seg = tcp_create_segment(pcb, p, 0, pcb->snd_lbb + pos, optflags)) == NULL) { - tcp_tx_pbuf_free(pcb, p); - goto memerr; - } + seg = tcp_create_segment(pcb, p, 0, pcb->snd_lbb + total_len + pos, optflags); + if (!seg) { + tcp_tx_pbuf_free(pcb, p); + goto memerr; + } - if (queue == NULL) { - queue = seg; - } else { - prev_seg->next = seg; + if (!queue) { + queue = seg; + } + if (last) { + last->next = seg; + } + last = seg; + + pos += seglen; + queuelen++; } - prev_seg = seg; - pos += seglen; + total_len += len; + } + + /* Set the PSH flag in the last segment that we enqueued. */ + if (enable_push_flag && seg != NULL && seg->tcphdr != NULL) { + TCPH_SET_FLAG(seg->tcphdr, TCP_PSH); } #if TCP_OVERSIZE pcb->unsent_oversize = 0; #endif /* TCP_OVERSIZE */ - if (pcb->last_unsent == NULL) { + if (!pcb->last_unsent) { pcb->unsent = queue; } else { + /* The next field is either NULL or equals to queue, so we can overwrite. */ pcb->last_unsent->next = queue; } - pcb->last_unsent = seg; + if (last) { + pcb->last_unsent = last; + } - /* - * Finally update the pcb state. - */ - pcb->snd_lbb += len; - pcb->snd_buf -= len; + /* Update the pcb state. */ + pcb->snd_lbb += total_len; + pcb->snd_buf -= total_len; pcb->snd_queuelen += queuelen; - /* Set the PSH flag in the last segment that we enqueued. 
*/ - if (enable_push_flag && seg != NULL && seg->tcphdr != NULL) { - TCPH_SET_FLAG(seg->tcphdr, TCP_PSH); + /* TODO Move Minshall's logic to tcp_output(). */ + if (total_len < pcb->mss) { + const u32_t byte_queued = pcb->snd_nxt - pcb->lastack; + pcb->snd_sml_add = (pcb->unacked ? pcb->unacked->len : 0) + byte_queued; } return ERR_OK; + memerr: + /* Error path - restore unsent queue. */ pcb->flags |= TF_NAGLEMEMERR; if (queue != NULL) { tcp_tx_segs_free(pcb, queue); } + if (pcb->last_unsent && last_seglen > 0) { + pcb->last_unsent->next = NULL; + p = pcb->last_unsent->p; + while (last_seglen > 0) { + last_seglen -= p->len; + p = p->next; + } + if (p) { + pcb->last_unsent->len -= p->tot_len; + struct pbuf *ptmp = pcb->last_unsent->p; + while (ptmp) { + ptmp->tot_len -= p->tot_len; + if (ptmp->next == p) { + ptmp->next = NULL; + } + ptmp = ptmp->next; + } + assert(pcb->last_unsent->len == last_seglen); + assert(pcb->last_unsent->p->tot_len == last_seglen); + } + } return ERR_MEM; } diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 013210229..6d3a297b8 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -1010,20 +1010,10 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) is_non_file_zerocopy, errno_tmp); } - err = tcp_write_express(&m_pcb, tx_ptr, tx_size, &tx_arg.priv); + const struct iovec iov = {.iov_base = tx_ptr, .iov_len = tx_size}; + err = tcp_write_express(&m_pcb, &iov, 1, &tx_arg.priv); if (unlikely(err != ERR_OK)) { - if (unlikely(err == ERR_CONN)) { // happens when remote drops during big write - si_tcp_logdbg("connection closed: tx'ed = %d", total_tx); - shutdown(SHUT_WR); - return tcp_tx_handle_partial_send_and_unlock(total_tx, EPIPE, is_dummy, - is_non_file_zerocopy, errno_tmp); - } - if (unlikely(err != ERR_MEM)) { - // we should not get here... - BULLSEYE_EXCLUDE_BLOCK_START - si_tcp_logpanic("tcp_write return: %d", err); - BULLSEYE_EXCLUDE_BLOCK_END - } + // tcp_write_express() can return only ERR_MEM error. return tcp_tx_handle_partial_send_and_unlock(total_tx, EAGAIN, is_dummy, is_non_file_zerocopy, errno_tmp); } @@ -1161,9 +1151,13 @@ ssize_t sockinfo_tcp::tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg) is_send_zerocopy, errno_tmp); } - err_t err = (apiflags & XLIO_TX_PACKET_ZEROCOPY) - ? tcp_write_express(&m_pcb, tx_ptr, tx_size, &tx_arg.priv) - : tcp_write(&m_pcb, tx_ptr, tx_size, apiflags, &tx_arg.priv); + err_t err; + if (apiflags & XLIO_TX_PACKET_ZEROCOPY) { + const struct iovec iov = {.iov_base = tx_ptr, .iov_len = tx_size}; + err = tcp_write_express(&m_pcb, &iov, 1, &tx_arg.priv); + } else { + err = tcp_write(&m_pcb, tx_ptr, tx_size, apiflags, &tx_arg.priv); + } if (unlikely(err != ERR_OK)) { if (unlikely(err == ERR_CONN)) { // happens when remote drops during big write si_tcp_logdbg("connection closed: tx'ed = %d", total_tx); @@ -6050,30 +6044,23 @@ int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint mdesc.opaque = opaque_op; int bytes_written = 0; - - lock_tcp_con(); - - err_t err; for (unsigned i = 0; i < iov_len; ++i) { - err = tcp_write_express(&m_pcb, iov[i].iov_base, iov[i].iov_len, &mdesc); - if (err != ERR_OK) { - /* The only error in tcp_write_express is a memory error - * In this version we don't implement any error recovery or avoidance - * mechanism and an error at this stage is irrecoverable. - * The considered alternatives are: - * - Setting the socket an error state (this is the one we chose here) - * - Rolling back any written buffers, i.e. 
recovering - * - Reserving the pbuf(s)/tcp_seg(s) before calling for tcp_write_express */ - m_conn_state = TCP_CONN_ERROR; - m_error_status = ENOMEM; - return tcp_tx_handle_errno_and_unlock(ENOMEM); - } bytes_written += iov[i].iov_len; } + lock_tcp_con(); + + err_t err = tcp_write_express(&m_pcb, iov, iov_len, &mdesc); + if (unlikely(err != ERR_OK)) { + // The only error in tcp_write_express() is a memory error. + m_conn_state = TCP_CONN_ERROR; + m_error_status = ENOMEM; + return tcp_tx_handle_errno_and_unlock(ENOMEM); + } if (!(flags & XLIO_EXPRESS_MSG_MORE)) { tcp_output(&m_pcb); } + unlock_tcp_con(); return bytes_written; From 98e3ada35b3d3a0897b67921644585a68aed97d0 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sat, 2 Mar 2024 04:03:25 +0200 Subject: [PATCH 094/169] issue: 3788369 Don't poll RX while checking is_rst() Condition is_connected_and_ready_to_send() does RX polling in case of TCP_CONN_CONNECTING state. The idea is to speedup the connected state, however, this has few drawbacks: - This is unlikely path, because there are mechanisms to async connect() to finish; - RX polling requires to be unlocked, so there is a race window between TX thread and concurrent RX which may change the state; - The RX polling doesn't guarantee that there is the last handshake packet in the RQ. Remove the polling, so checking for connected state is more lightweight and allowed under the socket lock. Signed-off-by: Dmytro Podgornyi --- src/core/sock/sockinfo_tcp.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 6d3a297b8..73730deea 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -6127,18 +6127,12 @@ ssize_t sockinfo_tcp::tcp_tx_handle_partial_send_and_unlock(ssize_t total_tx, in bool sockinfo_tcp::is_connected_and_ready_to_send() { - int poll_count = 0; /* TODO should we add !g_b_exit here? */ - while (unlikely(!is_rts())) { + if (unlikely(!is_rts())) { if (m_conn_state == TCP_CONN_TIMEOUT) { si_tcp_logdbg("TX timed out"); errno = ETIMEDOUT; } else if (m_conn_state == TCP_CONN_CONNECTING) { - si_tcp_logdbg("TX while async-connect on socket go to poll"); - rx_wait_helper(poll_count, false); - if (m_conn_state == TCP_CONN_CONNECTED) { - continue; - } si_tcp_logdbg("TX while async-connect on socket return EAGAIN"); errno = EAGAIN; } else if (m_conn_state == TCP_CONN_RESETED) { From a42d50d33ff63908b86d7d556055ace3455caa7e Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sat, 2 Mar 2024 04:59:16 +0200 Subject: [PATCH 095/169] issue: 3788369 Fix LwIP type length related to segment/pbuf size Replace u16_t with u32_t to avoid type overflow. 
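
For illustration (an editorial aside, not part of the patch): a minimal standalone C program showing the wrap-around that the wider type avoids. With TSO enabled, segment sizes can be bounded by pcb->tso.max_payload_sz rather than the MSS, and that limit may exceed 65535 bytes; the 256 KB value below is an arbitrary example, real limits come from the device capabilities.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint16_t u16_t; /* LwIP-style type aliases, for the example only */
    typedef uint32_t u32_t;

    int main(void)
    {
        /* Hypothetical TSO payload limit above 64 KB (example value only). */
        u32_t tso_max_payload_sz = 256 * 1024;

        u16_t goal_16 = (u16_t)tso_max_payload_sz; /* narrows modulo 65536 -> 0 */
        u32_t goal_32 = tso_max_payload_sz;        /* keeps the full value */

        printf("u16_t goal=%u, u32_t goal=%u\n", (unsigned)goal_16, (unsigned)goal_32);
        return 0;
    }

A length that silently wraps like this is the overflow the commit refers to, hence the segment/pbuf size variables are widened to u32_t.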
Signed-off-by: Dmytro Podgornyi --- src/core/lwip/tcp_out.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 5aaf71d2d..45432a8b2 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -346,9 +346,9 @@ static err_t tcp_write_checks(struct tcp_pcb *pcb, u32_t len) return ERR_OK; } -static inline u16_t tcp_xmit_size_goal(struct tcp_pcb *pcb, int use_max) +static inline u32_t tcp_xmit_size_goal(struct tcp_pcb *pcb, int use_max) { - u16_t size = pcb->mss; + u32_t size = pcb->mss; #if LWIP_TCP_TIMESTAMPS if ((pcb->flags & TF_TIMESTAMP)) { @@ -556,8 +556,8 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, while (pos < len) { struct pbuf *p; u32_t left = len - pos; - u16_t max_len = mss_local_minus_opts; - u16_t seglen = left > max_len ? max_len : left; + u32_t max_len = mss_local_minus_opts; + u32_t seglen = left > max_len ? max_len : left; /* If copy is set, memory should be allocated and data copied * into pbuf */ @@ -1300,8 +1300,8 @@ __attribute__((unused)) static struct tcp_seg *tcp_rexmit_segment(struct tcp_pcb struct tcp_seg *new_seg = NULL; struct pbuf *cur_p = NULL; int tcp_hlen_delta; - u16_t mss_local = 0; - u16_t mss_local_minus_opts; + u32_t mss_local = 0; + u32_t mss_local_minus_opts; u8_t optflags = 0; u8_t optlen = 0; u32_t seqno = 0; @@ -1522,8 +1522,8 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) u16_t oversize = 0; u8_t optlen = 0; u8_t optflags = 0; - u16_t mss_local = 0; - u16_t max_length; + u32_t mss_local = 0; + u32_t max_length; pbuf_type type = PBUF_RAM; int is_zerocopy = 0; @@ -1980,7 +1980,6 @@ static err_t tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb) /* zc_buf is only used to pass pointer to TCP header to ip_output(). */ struct pbuf zc_pbuf; struct pbuf *p; - u16_t len; u32_t *opts; /* The TCP header has already been constructed, but the ackno and @@ -2070,7 +2069,7 @@ static err_t tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb) p->next = seg->p; p->len = p->tot_len = LWIP_TCP_HDRLEN(seg->tcphdr); } else { - len = (u16_t)((u8_t *)seg->tcphdr - (u8_t *)seg->p->payload); + u32_t len = (u32_t)((u8_t *)seg->tcphdr - (u8_t *)seg->p->payload); seg->p->len -= len; seg->p->tot_len -= len; @@ -2342,8 +2341,8 @@ void tcp_zero_window_probe(struct tcp_pcb *pcb) struct tcp_seg *seg; u16_t len; u8_t is_fin; - u32_t snd_nxt; u8_t optlen = 0; + u32_t snd_nxt; u32_t *opts; LWIP_DEBUGF_IP_ADDR(TCP_DEBUG, "tcp_zero_window_probe: sending ZERO WINDOW probe to ", From 75221e01292deb7a463fd26f6eb49f01f930b4c0 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sat, 2 Mar 2024 05:23:05 +0200 Subject: [PATCH 096/169] issue: 3788369 Fix Nagle's algorithm for negative snd_buf pcb::snd_buf can become negative in some flows. Signed-off-by: Dmytro Podgornyi --- src/core/lwip/tcp_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/lwip/tcp_impl.h b/src/core/lwip/tcp_impl.h index a78fa7a44..19f9c354e 100644 --- a/src/core/lwip/tcp_impl.h +++ b/src/core/lwip/tcp_impl.h @@ -92,7 +92,7 @@ void set_tmr_resolution(u32_t v); ((tpcb)->flags & TF_INFR) || \ (((tpcb)->unsent != NULL) && \ (((tpcb)->unsent->next != NULL) || ((tpcb)->unsent->len >= (tpcb)->mss))) || \ - ((tcp_sndbuf(tpcb) == 0) || (tcp_sndqueuelen(tpcb) >= (tpcb)->max_tcp_snd_queuelen))) \ + ((tcp_sndbuf(tpcb) <= 0) || (tcp_sndqueuelen(tpcb) >= (tpcb)->max_tcp_snd_queuelen))) \ ? 
1 \ : 0) #define tcp_output_nagle(tpcb) (tcp_do_output_nagle(tpcb) ? tcp_output(tpcb) : ERR_OK) From 140401b4966d4acf836c67cbc99c7ade01d842eb Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sat, 2 Mar 2024 05:25:15 +0200 Subject: [PATCH 097/169] issue: 3788369 Remove redundant snd_buf check in LwIP Higher level logic checks for sndbuf, so no need to check it again in tcp_write(). Also account SYN/FIN segments in snd_buf. Previously, this code was disabled, because of the corner case when snd_buf is 0. But now it can become negative. Signed-off-by: Dmytro Podgornyi --- src/core/lwip/tcp_out.c | 36 ++---------------------------------- 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 45432a8b2..42448e4d7 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -317,19 +317,8 @@ static err_t tcp_write_checks(struct tcp_pcb *pcb, u32_t len) return ERR_OK; } - /* fail on too much data */ - if ((s32_t)len > pcb->snd_buf) { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, - ("tcp_write: too much data (len=%" U32_F " > snd_buf=%" S32_F ")\n", len, - pcb->snd_buf)); - pcb->flags |= TF_NAGLEMEMERR; - return ERR_MEM; - } - LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_write: queuelen: %" U32_F "\n", (u32_t)pcb->snd_queuelen)); - /* If total number of pbufs on the unsent/unacked queues exceeds the * configured maximum, return an error */ - /* check for configured max queuelen and possible overflow */ if ((pcb->snd_queuelen >= pcb->max_unsent_len) || (pcb->snd_queuelen > TCP_SNDQUEUELEN_OVERFLOW)) { LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, @@ -338,11 +327,6 @@ static err_t tcp_write_checks(struct tcp_pcb *pcb, u32_t len) pcb->flags |= TF_NAGLEMEMERR; return ERR_MEM; } - if (pcb->snd_queuelen != 0) { - } else { - LWIP_ASSERT("tcp_write: no pbufs on queue => both queues empty", - pcb->unacked == NULL && pcb->unsent == NULL); - } return ERR_OK; } @@ -940,22 +924,11 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) #endif /* LWIP_TCP_TIMESTAMPS */ optlen = LWIP_TCP_OPT_LENGTH(optflags); - /* tcp_enqueue_flags is always called with either SYN or FIN in flags. - * We need one available snd_buf byte to do that. - * This means we can't send FIN while snd_buf==0. A better fix would be to - * not include SYN and FIN sequence numbers in the snd_buf count. */ - - /*if (pcb->snd_buf == 0) { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, ("tcp_enqueue_flags: no send buffer available\n")); - return ERR_MEM; - }*/ //to consider snd_buf for syn or fin, unmarked sections with SND_BUF_FOR_SYN_FIN - /* Allocate pbuf with room for TCP header + options */ if ((p = tcp_tx_pbuf_alloc(pcb, optlen, PBUF_RAM, NULL, NULL)) == NULL) { pcb->flags |= TF_NAGLEMEMERR; return ERR_MEM; } - LWIP_ASSERT("tcp_enqueue_flags: check that first pbuf can hold optlen", (p->len >= optlen)); /* Allocate memory for tcp_seg, and fill in fields. 
*/ if ((seg = tcp_create_segment(pcb, p, flags, pcb->snd_lbb, optflags)) == NULL) { @@ -963,7 +936,6 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) tcp_tx_pbuf_free(pcb, p); return ERR_MEM; } - LWIP_ASSERT("tcp_enqueue_flags: invalid segment length", seg->len == 0); LWIP_DEBUGF( TCP_OUTPUT_DEBUG | LWIP_DBG_TRACE, @@ -983,10 +955,10 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) #endif /* TCP_OVERSIZE */ /* SYN and FIN bump the sequence number */ - if ((flags & TCP_SYN) || (flags & TCP_FIN)) { + if (flags & (TCP_SYN | TCP_FIN)) { pcb->snd_lbb++; /* optlen does not influence snd_buf */ - // pcb->snd_buf--; SND_BUF_FOR_SYN_FIN + pcb->snd_buf--; } if (flags & TCP_FIN) { pcb->flags |= TF_FIN; @@ -996,10 +968,6 @@ err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) pcb->snd_queuelen += pbuf_clen(seg->p); LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue_flags: %" S16_F " (after enqueued)\n", pcb->snd_queuelen)); - if (pcb->snd_queuelen != 0) { - LWIP_ASSERT("tcp_enqueue_flags: invalid queue length", - pcb->unacked != NULL || pcb->unsent != NULL); - } return ERR_OK; } From 236d43f2b9dda90be03f66042ca5ccfa770b3553 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sat, 2 Mar 2024 14:49:36 +0200 Subject: [PATCH 098/169] issue: 3788369 Remove pbuf_desc::map This field was used to hold mapping_t pointer, but the mapping is a mem_desc object now. Remove the leftover. Signed-off-by: Dmytro Podgornyi --- src/core/lwip/pbuf.h | 3 +-- src/core/sock/sock-redirect.cpp | 4 ++-- src/core/sock/sockinfo_nvme.cpp | 2 +- src/core/sock/sockinfo_tcp.cpp | 5 +++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/core/lwip/pbuf.h b/src/core/lwip/pbuf.h index 800551ed4..92c77fbb7 100644 --- a/src/core/lwip/pbuf.h +++ b/src/core/lwip/pbuf.h @@ -66,9 +66,8 @@ typedef struct { int attr; u32_t mkey; union { - void *map; - void *mdesc; int fd; + void *mdesc; void *opaque; }; } pbuf_desc; diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index b550fdbc6..aa56a2dda 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -371,7 +371,7 @@ ssize_t sendmsg_internal(void *sock, __const struct msghdr *__msg, int __flags) ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(struct xlio_pd_key)))) { tx_arg.priv.attr = (cmsg->cmsg_type == SCM_XLIO_PD) ? 
PBUF_DESC_MKEY : PBUF_DESC_NVME_TX; - tx_arg.priv.map = (void *)CMSG_DATA(cmsg); + tx_arg.priv.opaque = (void *)CMSG_DATA(cmsg); } else { errno = EINVAL; return -1; @@ -448,7 +448,7 @@ static ssize_t sendfile_helper(socket_fd_api *p_socket_object, int in_fd, __off6 tx_arg.attr.sz_iov = 1; tx_arg.attr.flags = MSG_ZEROCOPY; tx_arg.priv.attr = PBUF_DESC_MDESC; - tx_arg.priv.map = (void *)mapping; + tx_arg.priv.mdesc = (void *)mapping; totSent = p_socket_object->tx(tx_arg); mapping->put(); diff --git a/src/core/sock/sockinfo_nvme.cpp b/src/core/sock/sockinfo_nvme.cpp index 31d16655c..01a0fa785 100644 --- a/src/core/sock/sockinfo_nvme.cpp +++ b/src/core/sock/sockinfo_nvme.cpp @@ -89,7 +89,7 @@ ssize_t sockinfo_tcp_ops_nvme::tx(xlio_tx_call_attr_t &tx_arg) errno = EINVAL; return -1; } - auto aux_data = reinterpret_cast(tx_arg.priv.map); + auto aux_data = reinterpret_cast(tx_arg.priv.opaque); auto msg = tx_arg.attr.hdr; if (!msg->msg_iov || !aux_data || msg->msg_iovlen == 0U || aux_data[0].message_length == 0U) { diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 73730deea..674f1781d 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -946,7 +946,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) bool is_non_file_zerocopy = tx_arg.opcode != TX_FILE; pd_key_array = - (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.map : nullptr); + (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.opaque : nullptr); si_tcp_logfunc("tx: iov=%p niovs=%zu", p_iov, sz_iov); @@ -1070,7 +1070,8 @@ ssize_t sockinfo_tcp::tcp_tx_slow_path(xlio_tx_call_attr_t &tx_arg) apiflags |= XLIO_TX_PACKET_ZEROCOPY; is_send_zerocopy = tx_arg.opcode != TX_FILE; pd_key_array = - (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.map : nullptr); + (tx_arg.priv.attr == PBUF_DESC_MKEY ? (struct xlio_pd_key *)tx_arg.priv.opaque + : nullptr); } si_tcp_logfunc("tx: iov=%p niovs=%zu", p_iov, sz_iov); From 877988254445e2feab6d2813912a557921c979a0 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sun, 4 Feb 2024 09:40:37 +0200 Subject: [PATCH 099/169] issue: 3788369 Introduce XLIO socket API This is performance oriented and event based API for TCP sockets. It introduces a polling group (xlio_poll_group_t) which is a collection of sockets (xlio_socket_t) and a serialization context. 
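
For orientation, a rough usage sketch of the new API (an editorial illustration, not part of the patch; a complete example is added in tests/extra_api/xlio_socket_api.c below). The callback prototypes and the zero-initialized attribute structs are assumptions inferred from the call sites in this patch; the authoritative definitions are in the updated xlio.h.

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include "xlio.h" /* header path as in this patch; the installed name may differ */

    /* Assumed callback signatures - see the note above. */
    static void on_socket_event(xlio_socket_t sock, uintptr_t userdata, int event, int value)
    {
        (void)sock; (void)userdata; (void)value;
        if (event == XLIO_SOCKET_EVENT_ESTABLISHED) {
            printf("connection established\n");
        }
    }

    static void on_socket_rx(xlio_socket_t sock, uintptr_t userdata, void *data, size_t len,
                             struct xlio_buf *buf)
    {
        (void)userdata; (void)data;
        printf("received %zu bytes\n", len);
        xlio_socket_buf_free(sock, buf); /* return the RX buffer to XLIO when done */
    }

    int run_client(const struct sockaddr *to, socklen_t tolen)
    {
        struct xlio_init_attr iattr = {0};       /* memory_cb left unset for brevity */
        struct xlio_poll_group_attr gattr = {0};
        struct xlio_socket_attr sattr = {0};
        struct xlio_socket_send_attr send_attr = {0};
        xlio_poll_group_t group;
        xlio_socket_t sock;

        if (xlio_init_ex(&iattr) != 0) {
            return -1;
        }

        gattr.socket_event_cb = on_socket_event; /* mandatory per xlio_poll_group_create() */
        gattr.socket_rx_cb = on_socket_rx;
        if (xlio_poll_group_create(&gattr, &group) != 0) {
            return -1;
        }

        sattr.domain = AF_INET;
        sattr.group = group;
        if (xlio_socket_create(&sattr, &sock) != 0) {
            return -1;
        }
        xlio_socket_connect(sock, to, tolen);

        send_attr.flags = XLIO_SOCKET_SEND_FLAG_FLUSH; /* push the data out immediately */
        xlio_socket_send(sock, "ping", 4, &send_attr);

        for (;;) {
            xlio_poll_group_poll(group); /* drives completions, RX callbacks and timers */
        }
    }

The point this sketch illustrates: events, RX data and send completions are all delivered from within xlio_poll_group_poll() on the calling thread, so the polling group acts as the serialization context mentioned above.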
Signed-off-by: Dmytro Podgornyi --- src/core/Makefile.am | 2 + src/core/dev/allocator.cpp | 10 +- src/core/dev/buffer_pool.h | 1 + src/core/dev/ring.cpp | 7 +- src/core/dev/ring.h | 5 + src/core/event/poll_group.cpp | 120 ++++++++++ src/core/event/poll_group.h | 80 +++++++ src/core/lwip/pbuf.h | 1 - src/core/lwip/tcp_out.c | 34 ++- src/core/proto/dst_entry.h | 2 +- src/core/proto/mem_buf_desc.h | 76 +++--- src/core/sock/sock-extra.cpp | 191 +++++++++++++++ src/core/sock/sockinfo_tcp.cpp | 253 ++++++++++++++++++-- src/core/sock/sockinfo_tcp.h | 118 ++++++---- src/core/xlio.h | 229 +++++++++++++++++- tests/extra_api/xlio_socket_api.c | 379 ++++++++++++++++++++++++++++++ 16 files changed, 1398 insertions(+), 110 deletions(-) create mode 100644 src/core/event/poll_group.cpp create mode 100644 src/core/event/poll_group.h create mode 100644 tests/extra_api/xlio_socket_api.c diff --git a/src/core/Makefile.am b/src/core/Makefile.am index a15bc4d6a..867c70c70 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -100,6 +100,7 @@ libxlio_la_SOURCES := \ event/delta_timer.cpp \ event/event_handler_manager.cpp \ event/event_handler_manager_local.cpp \ + event/poll_group.cpp \ event/vlogger_timer_handler.cpp \ event/netlink_event.cpp \ \ @@ -215,6 +216,7 @@ libxlio_la_SOURCES := \ event/event_handler_manager_local.h \ event/event_handler_rdma_cm.h \ event/netlink_event.h \ + event/poll_group.h \ event/timer_handler.h \ event/timers_group.h \ event/vlogger_timer_handler.h \ diff --git a/src/core/dev/allocator.cpp b/src/core/dev/allocator.cpp index 5696464b3..ffd1db1b7 100644 --- a/src/core/dev/allocator.cpp +++ b/src/core/dev/allocator.cpp @@ -40,9 +40,13 @@ #include "ib_ctx_handler_collection.h" #include "util/hugepage_mgr.h" #include "util/vtypes.h" +#include "xlio.h" #define MODULE_NAME "allocator" +// See description at the xlio_memory_cb_t definition. 
+xlio_memory_cb_t g_user_memory_cb = nullptr; + xlio_allocator::xlio_allocator() : xlio_allocator(nullptr, nullptr) { @@ -286,7 +290,7 @@ bool xlio_registrator::register_memory(void *data, size_t size, ib_ctx_handler * return lkey != LKEY_ERROR; } - // Path for all ib contextes + // Path for all ib contexts ib_context_map_t *ib_ctx_map = g_p_ib_ctx_handler_collection->get_ib_cxt_list(); if (likely(ib_ctx_map)) { for (const auto &ib_ctx_key_val : *ib_ctx_map) { @@ -500,6 +504,10 @@ bool xlio_heap::expand(size_t size /*=0*/) m_blocks.push_back(block); m_latest_offset = 0; + if (m_b_hw && g_user_memory_cb) { + g_user_memory_cb(data, size, 0); + } + return true; error: diff --git a/src/core/dev/buffer_pool.h b/src/core/dev/buffer_pool.h index 0c26f15c0..8902cd21e 100644 --- a/src/core/dev/buffer_pool.h +++ b/src/core/dev/buffer_pool.h @@ -62,6 +62,7 @@ inline static void free_lwip_pbuf(struct pbuf_custom *pbuf_custom) if (p_desc->m_flags & mem_buf_desc_t::ZCOPY) { p_desc->tx.zc.callback(p_desc); } + p_desc->m_flags = 0; pbuf_custom->pbuf.flags = 0; pbuf_custom->pbuf.ref = 0; pbuf_custom->pbuf.desc.attr = PBUF_DESC_NONE; diff --git a/src/core/dev/ring.cpp b/src/core/dev/ring.cpp index 706bd3e86..00a86e634 100644 --- a/src/core/dev/ring.cpp +++ b/src/core/dev/ring.cpp @@ -31,6 +31,7 @@ */ #include "ring.h" +#include "event/poll_group.h" #include "proto/route_table_mgr.h" #include "sock/tcp_seg_pool.h" @@ -40,7 +41,8 @@ #define MODULE_HDR MODULE_NAME "%d:%s() " ring::ring() - : m_p_n_rx_channel_fds(nullptr) + : m_p_group(nullptr) + , m_p_n_rx_channel_fds(nullptr) , m_parent(nullptr) , m_tcp_seg_list(nullptr) , m_tcp_seg_count(0U) @@ -51,6 +53,9 @@ ring::ring() ring::~ring() { + if (m_p_group) { + m_p_group->del_ring(this); + } if (m_tcp_seg_list) { g_tcp_seg_pool->put_tcp_segs(m_tcp_seg_list); } diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index b462b9194..4c863a2b7 100644 --- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -45,6 +45,7 @@ struct xlio_tls_info; class pkt_rcvr_sink; class rfs_rule; +class poll_group; #define ring_logpanic __log_info_panic #define ring_logerr __log_info_err @@ -262,10 +263,14 @@ class ring { struct tcp_seg *get_tcp_segs(uint32_t num); void put_tcp_segs(struct tcp_seg *seg); + void set_group(poll_group *grp) { m_p_group = grp; } + poll_group *get_group() const { return m_p_group; } + protected: inline void set_parent(ring *parent) { m_parent = (parent ? parent : this); } inline void set_if_index(int if_index) { m_if_index = if_index; } + poll_group *m_p_group; int *m_p_n_rx_channel_fds; ring *m_parent; diff --git a/src/core/event/poll_group.cpp b/src/core/event/poll_group.cpp new file mode 100644 index 000000000..66f60b714 --- /dev/null +++ b/src/core/event/poll_group.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" +#include "poll_group.h" + +#include "dev/ring.h" +#include "event/event_handler_manager_local.h" +#include "sock/sockinfo_tcp.h" + +#define MODULE_NAME "group:" + +#define grp_logpanic __log_panic +#define grp_logerr __log_err +#define grp_logwarn __log_warn +#define grp_loginfo __log_info +#define grp_logdbg __log_dbg + +poll_group::poll_group(const struct xlio_poll_group_attr *attr) + : m_socket_event_cb(attr->socket_event_cb) + , m_socket_comp_cb(attr->socket_comp_cb) + , m_socket_rx_cb(attr->socket_rx_cb) + , m_group_flags(attr->flags) +{ + /* + * In the best case, we expect a single ring per group. Reserve two elements for a scenario + * with two network interfaces and when the both interfaces are used by the sockets. + * More complex scenarios will be covered with re-allocation. + */ + m_rings.reserve(2); + + m_event_handler = std::make_unique(); + m_tcp_timers = std::make_unique( + safe_mce_sys().tcp_timer_resolution_msec, safe_mce_sys().tcp_timer_resolution_msec); + m_tcp_timers->set_group(this); +} + +poll_group::~poll_group() +{ +} + +void poll_group::poll() +{ + for (ring *rng : m_rings) { + uint64_t sn; + rng->poll_and_process_element_tx(&sn); + sn = 0; + rng->poll_and_process_element_rx(&sn); + } + m_event_handler->do_tasks(); +} + +void poll_group::add_dirty_socket(sockinfo_tcp *si) +{ + if (m_group_flags & XLIO_GROUP_FLAG_DIRTY) { + m_dirty_sockets.push_back(si); + } +} + +void poll_group::flush() +{ + for (auto si : m_dirty_sockets) { + si->flush(); + } + m_dirty_sockets.clear(); + // TODO Ring doorbell and request TX completion. +} + +void poll_group::add_ring(ring *rng) +{ + if (std::find(m_rings.begin(), m_rings.end(), rng) == std::end(m_rings)) { + grp_logdbg("New ring %p in group %p", rng, this); + if (rng->get_group()) { + grp_logwarn("Ring belongs to a group %p (current group %p)", rng->get_group(), this); + } + rng->set_group(this); + m_rings.push_back(rng); + // TODO Increase ref count for the ring and keep it until the group is destroyed. + // In this way we don't have to implement del_ring() and there won't be a race between + // socket destruction and xlio_group_buf_free(). + } +} + +void poll_group::del_ring(ring *rng) +{ + auto iter = std::find(m_rings.begin(), m_rings.end(), rng); + if (iter != std::end(m_rings)) { + grp_logdbg("Removed ring %p from group %p", rng, this); + m_rings.erase(iter); + } +} diff --git a/src/core/event/poll_group.h b/src/core/event/poll_group.h new file mode 100644 index 000000000..c13e01c96 --- /dev/null +++ b/src/core/event/poll_group.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef XLIO_GROUP_H +#define XLIO_GROUP_H + +#include +#include + +#include "xlio.h" + +/* Forward declarations */ +struct xlio_poll_group_attr; +class ring; +class event_handler_manager_local; +class tcp_timers_collection; +class sockinfo_tcp; + +class poll_group { +public: + poll_group(const struct xlio_poll_group_attr *attr); + ~poll_group(); + + void poll(); + + void add_dirty_socket(sockinfo_tcp *si); + void flush(); + + void add_ring(ring *); + void del_ring(ring *); + + unsigned get_flags() const { return m_group_flags; } + event_handler_manager_local *get_event_handler() const { return m_event_handler.get(); } + tcp_timers_collection *get_tcp_timers() const { return m_tcp_timers.get(); } + +public: + xlio_socket_event_cb_t m_socket_event_cb; + xlio_socket_comp_cb_t m_socket_comp_cb; + xlio_socket_rx_cb_t m_socket_rx_cb; + +private: + std::vector m_rings; + std::unique_ptr m_event_handler; + std::unique_ptr m_tcp_timers; + + std::vector m_dirty_sockets; + + unsigned m_group_flags; +}; + +#endif /* XLIO_GROUP_H */ diff --git a/src/core/lwip/pbuf.h b/src/core/lwip/pbuf.h index 92c77fbb7..e25923548 100644 --- a/src/core/lwip/pbuf.h +++ b/src/core/lwip/pbuf.h @@ -117,7 +117,6 @@ typedef void (*pbuf_free_custom_fn)(struct pbuf *p); struct pbuf_custom { /** The actual pbuf */ struct pbuf pbuf; - u64_t padding; /* TODO Remove and optimize mem_buf_desc alignment. */ }; /* Initializes the pbuf module. This call is empty for now, but may not be in future. */ diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index 42448e4d7..e5e15f01d 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -721,10 +721,11 @@ err_t tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u16_t apiflags, */ err_t tcp_write_express(struct tcp_pcb *pcb, const struct iovec *iov, u32_t iovcnt, pbuf_desc *desc) { - struct pbuf *p; + struct pbuf *p = NULL; struct tcp_seg *seg = NULL; struct tcp_seg *queue = NULL; struct tcp_seg *last; + void *opaque = NULL; const u32_t seglen_max = tcp_tso(pcb) ? pcb->tso.max_payload_sz : pcb->mss; u32_t pos; u32_t seglen; @@ -748,6 +749,23 @@ err_t tcp_write_express(struct tcp_pcb *pcb, const struct iovec *iov, u32_t iovc } last_seglen = last ? 
last->len : 0; + if (desc->attr == PBUF_DESC_EXPRESS) { + /* + * Keep opaque value only in the right most pbuf for each send operation. + * + * Express path needs to call the completion callback only after the send operation + * is completed and all the related buffers are not used by XLIO. + * Current implementation keeps the opaque in the last pbuf and calls the callback + * when the opaque is set. + * This implementation can call the callback while a buffer is still in SQ in a specific + * case of spurious retransmission. However, without HW offloads and user memory + * deregistration, the buffer in the SQ won't lead to a functional issue. + * This is a place for improvements. + */ + opaque = desc->opaque; + desc->opaque = NULL; + } + for (unsigned i = 0; i < iovcnt; ++i) { u8_t *data = (u8_t *)iov[i].iov_base; const u32_t len = iov[i].iov_len; @@ -823,6 +841,15 @@ err_t tcp_write_express(struct tcp_pcb *pcb, const struct iovec *iov, u32_t iovc pcb->last_unsent = last; } + if (desc->attr == PBUF_DESC_EXPRESS) { + /* See description above. */ + if (p) { + /* 'p' is the last allocated pbuf. */ + p->desc.opaque = opaque; + } + desc->opaque = opaque; + } + /* Update the pcb state. */ pcb->snd_lbb += total_len; pcb->snd_buf -= total_len; @@ -1537,6 +1564,11 @@ void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) return; } + if (seg->p->desc.attr == PBUF_DESC_EXPRESS) { + /* Keep opaque value only in the right most pbuf for each send operation. */ + seg->p->desc.opaque = NULL; + } + /* Copy the data from the original buffer */ if (is_zerocopy) { p->payload = (char *)seg->p->payload + lentosend; diff --git a/src/core/proto/dst_entry.h b/src/core/proto/dst_entry.h index 5a214505f..9d30dd5ff 100644 --- a/src/core/proto/dst_entry.h +++ b/src/core/proto/dst_entry.h @@ -112,7 +112,7 @@ class dst_entry : public cache_observer, public tostr { } inline void set_src_sel_prefs(uint8_t sel_flags) { m_src_sel_prefs = sel_flags; } inline ring *get_ring() { return m_p_ring; } - inline ib_ctx_handler *get_ctx() { return m_p_ring->get_ctx(m_id); } + inline ib_ctx_handler *get_ctx() { return m_p_ring ? m_p_ring->get_ctx(m_id) : nullptr; } inline sa_family_t get_sa_family() { return m_family; } uint8_t get_tos() const { return m_tos; } uint8_t get_ttl_hop_limit() const { return m_ttl_hop_limit; } diff --git a/src/core/proto/mem_buf_desc.h b/src/core/proto/mem_buf_desc.h index 87d47fc87..db274b9b1 100644 --- a/src/core/proto/mem_buf_desc.h +++ b/src/core/proto/mem_buf_desc.h @@ -77,6 +77,7 @@ class mem_buf_desc_t { , sz_buffer(size) , sz_data(0) , p_desc_owner(nullptr) + , unused_padding(0) { memset(&lwip_pbuf, 0, sizeof(lwip_pbuf)); @@ -94,13 +95,41 @@ class mem_buf_desc_t { memcpy((void *)this, &ref, sizeof(mem_buf_desc_t)); } + inline mem_buf_desc_t *clone() + { + mem_buf_desc_t *p_desc = new mem_buf_desc_t(*this); + INIT_LIST_HEAD(&p_desc->buffer_node.head); + p_desc->m_flags |= mem_buf_desc_t::CLONED; + return p_desc; + } + // Destructor specifically for cloned buffers. ~mem_buf_desc_t() {} - /* This field must be first in this class - * It encapsulates pbuf structure from lwip - * and extra fields to proceed customer specific requirements - */ + inline void clear_transport_data(void) + { + // rx field is the largest in the union, this clears tx as well. 
+ memset((void *)&rx, 0, sizeof(rx)); + } + + inline int get_ref_count() const { return atomic_read(&n_ref_count); } + inline void reset_ref_count() { atomic_set(&n_ref_count, 0); } + inline void set_ref_count(int x) { atomic_set(&n_ref_count, x); } + inline int inc_ref_count() { return atomic_fetch_and_inc(&n_ref_count); } + inline int dec_ref_count() { return atomic_fetch_and_dec(&n_ref_count); } + inline int add_ref_count(int x) { return atomic_fetch_add_relaxed(x, &n_ref_count); } + inline unsigned int lwip_pbuf_get_ref_count() const { return lwip_pbuf.pbuf.ref; } + inline unsigned int lwip_pbuf_inc_ref_count() { return ++lwip_pbuf.pbuf.ref; } + inline unsigned int lwip_pbuf_dec_ref_count() + { + if (likely(lwip_pbuf.pbuf.ref)) { + --lwip_pbuf.pbuf.ref; + } + return lwip_pbuf.pbuf.ref; + } + +public: + /* This field must be first in this class. It encapsulates pbuf structure from lwip */ struct pbuf_custom lwip_pbuf; uint8_t *p_buffer; @@ -191,45 +220,8 @@ class mem_buf_desc_t { private: atomic_t n_ref_count; // number of interested receivers (sockinfo) [can be modified only in // cq_mgr_rx context] - public: - inline void clear_transport_data(void) - { - // rx field is the largest in the union, this clears tx as well. - memset((void *)&rx, 0, sizeof(rx)); - } - - inline mem_buf_desc_t *clone() - { - mem_buf_desc_t *p_desc = new mem_buf_desc_t(*this); - INIT_LIST_HEAD(&p_desc->buffer_node.head); - p_desc->m_flags |= mem_buf_desc_t::CLONED; - return p_desc; - } - - inline int get_ref_count() const { return atomic_read(&n_ref_count); } - - inline void reset_ref_count() { atomic_set(&n_ref_count, 0); } - - inline void set_ref_count(int x) { atomic_set(&n_ref_count, x); } - - inline int inc_ref_count() { return atomic_fetch_and_inc(&n_ref_count); } - - inline int dec_ref_count() { return atomic_fetch_and_dec(&n_ref_count); } - - inline int add_ref_count(int x) { return atomic_fetch_add_relaxed(x, &n_ref_count); } - - inline unsigned int lwip_pbuf_inc_ref_count() { return ++lwip_pbuf.pbuf.ref; } - - inline unsigned int lwip_pbuf_dec_ref_count() - { - if (likely(lwip_pbuf.pbuf.ref)) { - --lwip_pbuf.pbuf.ref; - } - return lwip_pbuf.pbuf.ref; - } - - inline unsigned int lwip_pbuf_get_ref_count() const { return lwip_pbuf.pbuf.ref; } + uint64_t unused_padding; // Align the structure to the cache line boundary }; typedef xlio_list_t descq_t; diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index 8c5fe21d6..ee50836f5 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -39,12 +39,14 @@ #include #include #include +#include #include #include #include #include #include "sock/sock-extra.h" +#include "xlio.h" #define MODULE_NAME "extra:" @@ -373,3 +375,192 @@ struct xlio_api_t *extra_api() return xlio_api; } + +/* + * Storage API + */ + +extern "C" int xlio_init_ex(const struct xlio_init_attr *attr) +{ + // Set XLIO socket API specific parameter unless user sets them explicitly + if (!getenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL)) { + setenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL, "0", 1); + } + if (!getenv(SYS_VAR_TCP_ABORT_ON_CLOSE)) { + setenv(SYS_VAR_TCP_ABORT_ON_CLOSE, "1", 1); + } + + extern xlio_memory_cb_t g_user_memory_cb; + g_user_memory_cb = attr->memory_cb; + + xlio_init(); + DO_GLOBAL_CTORS(); + + return 0; +} + +extern "C" int xlio_poll_group_create(const struct xlio_poll_group_attr *attr, + xlio_poll_group_t *group_out) +{ + // Validate input arguments + if (!group_out || !attr || !attr->socket_event_cb) { + errno = EINVAL; + return -1; + } + + 
poll_group *grp = new poll_group(attr); + if (!grp) { + errno = ENOMEM; + return -1; + } + + *group_out = reinterpret_cast(grp); + return 0; +} + +extern "C" int xlio_poll_group_destroy(xlio_poll_group_t group) +{ + poll_group *grp = reinterpret_cast(group); + + delete grp; + return 0; +} + +extern "C" void xlio_poll_group_poll(xlio_poll_group_t group) +{ + poll_group *grp = reinterpret_cast(group); + + grp->poll(); +} + +extern "C" int xlio_socket_create(const struct xlio_socket_attr *attr, xlio_socket_t *sock_out) +{ + // Validate input arguments + if (!sock_out || !attr || !attr->group || + !(attr->domain == AF_INET || attr->domain == AF_INET6)) { + errno = EINVAL; + return -1; + } + + int sockfd = socket_internal(attr->domain, SOCK_STREAM, 0, true, false); + if (sockfd < 0) { + return -1; + } + + sockinfo_tcp *si = dynamic_cast(g_p_fd_collection->get_sockfd(sockfd)); + if (!si) { + errno = EBADF; + return -1; + } + + si->set_xlio_socket(attr); + + *sock_out = reinterpret_cast(si); + return 0; +} + +extern "C" int xlio_socket_destroy(xlio_socket_t sock) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + + return XLIO_CALL(close, si->get_fd()); +} + +extern "C" int xlio_socket_setsockopt(xlio_socket_t sock, int level, int optname, + const void *optval, socklen_t optlen) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + + return XLIO_CALL(setsockopt, si->get_fd(), level, optname, optval, optlen); +} + +extern "C" int xlio_socket_bind(xlio_socket_t sock, const struct sockaddr *addr, socklen_t addrlen) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + + return XLIO_CALL(bind, si->get_fd(), addr, addrlen); +} + +extern "C" int xlio_socket_connect(xlio_socket_t sock, const struct sockaddr *to, socklen_t tolen) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + int errno_save = errno; + + int rc = XLIO_CALL(connect, si->get_fd(), to, tolen); + rc = (rc == -1 && (errno == EINPROGRESS || errno == EAGAIN)) ? 0 : rc; + if (rc == 0) { + si->add_tx_ring_to_group(); + errno = errno_save; + } + return rc; +} + +extern "C" struct ibv_pd *xlio_socket_get_pd(xlio_socket_t sock) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + ib_ctx_handler *ctx = si->get_ctx(); + + return ctx ? ctx->get_ibv_pd() : nullptr; +} + +extern "C" int xlio_socket_fd(xlio_socket_t sock) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + return si->get_fd(); +} + +static void xlio_buf_free(struct xlio_buf *buf) +{ + // TODO Use mem_buf_desc_t field as xlio_buf + mem_buf_desc_t *desc = reinterpret_cast(buf); + ring_slave *rng = desc->p_desc_owner; + + (void)rng->reclaim_recv_single_buffer(desc); +} + +extern "C" void xlio_socket_buf_free(xlio_socket_t sock, struct xlio_buf *buf) +{ + NOT_IN_USE(sock); + xlio_buf_free(buf); +} + +extern "C" void xlio_poll_group_buf_free(xlio_poll_group_t group, struct xlio_buf *buf) +{ + NOT_IN_USE(group); + xlio_buf_free(buf); +} + +extern "C" int xlio_socket_send(xlio_socket_t sock, const void *data, size_t len, + const struct xlio_socket_send_attr *attr) +{ + const struct iovec iov = {.iov_base = const_cast(data), .iov_len = len}; + + return xlio_socket_sendv(sock, &iov, 1, attr); +} + +extern "C" int xlio_socket_sendv(xlio_socket_t sock, const struct iovec *iov, unsigned iovcnt, + const struct xlio_socket_send_attr *attr) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + + unsigned flags = XLIO_EXPRESS_OP_TYPE_DESC; + flags |= !(attr->flags & XLIO_SOCKET_SEND_FLAG_FLUSH) * XLIO_EXPRESS_MSG_MORE; + + int rc = (attr->flags & XLIO_SOCKET_SEND_FLAG_INLINE) + ? 
si->tcp_tx_express_inline(iov, iovcnt, flags) + : si->tcp_tx_express(iov, iovcnt, attr->mkey, flags, + reinterpret_cast(attr->userdata_op)); + return rc < 0 ? rc : 0; +} + +extern "C" void xlio_poll_group_flush(xlio_poll_group_t group) +{ + poll_group *grp = reinterpret_cast(group); + grp->flush(); +} + +extern "C" void xlio_socket_flush(xlio_socket_t sock) +{ + sockinfo_tcp *si = reinterpret_cast(sock); + si->flush(); +} diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 674f1781d..5990dc2c8 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -47,6 +47,7 @@ #include "util/agent.h" #include "event/event_handler_manager.h" #include "event/event_handler_manager_local.h" +#include "event/poll_group.h" #include "proto/route_table_mgr.h" #include "proto/xlio_lwip.h" #include "proto/dst_entry_tcp.h" @@ -133,18 +134,28 @@ static bool is_inherited_option(int __level, int __optname) return ret; } -static event_handler_manager *get_event_mgr() +event_handler_manager *sockinfo_tcp::get_event_mgr() { - return (safe_mce_sys().tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS - ? g_p_event_handler_manager - : &g_event_handler_manager_local); + if (is_xlio_socket()) { + return m_p_group->get_event_handler(); + } else if (safe_mce_sys().tcp_ctl_thread == + option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { + return &g_event_handler_manager_local; + } else { + return g_p_event_handler_manager; + } } -static tcp_timers_collection *get_tcp_timer_collection() +tcp_timers_collection *sockinfo_tcp::get_tcp_timer_collection() { - return (safe_mce_sys().tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS - ? g_tcp_timers_collection - : &g_thread_local_tcp_timers); + if (is_xlio_socket()) { + return m_p_group->get_tcp_timers(); + } else if (safe_mce_sys().tcp_ctl_thread == + option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { + return &g_thread_local_tcp_timers; + } else { + return g_tcp_timers_collection; + } } static lock_base *get_new_tcp_lock() @@ -385,6 +396,127 @@ sockinfo_tcp::sockinfo_tcp(int fd, int domain) si_tcp_logfunc("done"); } +void sockinfo_tcp::rx_add_ring_cb(ring *p_ring) +{ + if (m_p_group) { + m_p_group->add_ring(p_ring); + } + sockinfo::rx_add_ring_cb(p_ring); +} + +void sockinfo_tcp::set_xlio_socket(const struct xlio_socket_attr *attr) +{ + m_xlio_socket_userdata = attr->userdata_sq; + m_p_group = reinterpret_cast(attr->group); + + bool current_locks = m_ring_alloc_log_rx.get_use_locks(); + + m_ring_alloc_log_rx.set_ring_alloc_logic(RING_LOGIC_PER_USER_ID); + m_ring_alloc_log_rx.set_user_id_key(reinterpret_cast(m_p_group)); + m_ring_alloc_log_rx.set_use_locks(current_locks || + (m_p_group->get_flags() & XLIO_GROUP_FLAG_SAFE)); + m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx); + + m_ring_alloc_log_tx.set_ring_alloc_logic(RING_LOGIC_PER_USER_ID); + m_ring_alloc_log_tx.set_user_id_key(reinterpret_cast(m_p_group)); + m_ring_alloc_log_tx.set_use_locks(current_locks || + (m_p_group->get_flags() & XLIO_GROUP_FLAG_SAFE)); + + if (!current_locks && (m_p_group->get_flags() & XLIO_GROUP_FLAG_SAFE)) { + m_tcp_con_lock = multilock::create_new_lock(MULTILOCK_RECURSIVE, "tcp_con"); + } + + tcp_recv(&m_pcb, sockinfo_tcp::rx_lwip_cb_xlio_socket); + tcp_err(&m_pcb, sockinfo_tcp::err_lwip_cb_xlio_socket); + set_blocking(false); +} + +void sockinfo_tcp::add_tx_ring_to_group() +{ + ring *rng = get_tx_ring(); + if (m_p_group && rng) { + m_p_group->add_ring(rng); + } 
+} + +void sockinfo_tcp::xlio_socket_event(int event, int value) +{ + if (is_xlio_socket()) { + /* poll_group::m_socket_event_cb must be always set. */ + m_p_group->m_socket_event_cb(reinterpret_cast(this), m_xlio_socket_userdata, + event, value); + } +} + +/*static*/ +err_t sockinfo_tcp::rx_lwip_cb_xlio_socket(void *arg, struct tcp_pcb *pcb, struct pbuf *p, + err_t err) +{ + sockinfo_tcp *conn = (sockinfo_tcp *)arg; + + NOT_IN_USE(pcb); + assert((uintptr_t)pcb->my_container == (uintptr_t)arg); + + // if is FIN + if (unlikely(!p)) { + return conn->handle_fin(pcb, err); + } + + if (unlikely(err != ERR_OK)) { + conn->handle_rx_lwip_cb_error(p); + return err; + } + + tcp_recved(pcb, p->tot_len); + + if (conn->m_p_group->m_socket_rx_cb) { + struct pbuf *ptmp = p; + while (ptmp) { + /* TODO Pass mem_buf_desc_t field intead of pbuf itself as xlio_buf */ + conn->m_p_group->m_socket_rx_cb(reinterpret_cast(conn), + conn->m_xlio_socket_userdata, ptmp->payload, ptmp->len, + reinterpret_cast(ptmp)); + ptmp = ptmp->next; + } + } + pbuf_free(p); + + // TODO Stats + + return ERR_OK; +} + +/*static*/ +void sockinfo_tcp::err_lwip_cb_xlio_socket(void *pcb_container, err_t err) +{ + sockinfo_tcp *conn = reinterpret_cast(pcb_container); + + // TODO Reduce copy-paste + conn->m_conn_state = TCP_CONN_FAILED; + conn->m_error_status = ECONNABORTED; + if (err == ERR_TIMEOUT) { + conn->m_conn_state = TCP_CONN_TIMEOUT; + conn->m_error_status = ETIMEDOUT; + } else if (err == ERR_RST) { + if (conn->m_sock_state == TCP_SOCK_ASYNC_CONNECT) { + conn->m_conn_state = TCP_CONN_ERROR; + conn->m_error_status = ECONNREFUSED; + } else { + conn->m_conn_state = TCP_CONN_RESETED; + conn->m_error_status = ECONNRESET; + } + } + + // Avoid binding twice in case of calling connect again after previous call failed. + if (conn->m_sock_state != TCP_SOCK_BOUND) { // TODO: maybe we need to exclude more states? 
+ conn->m_sock_state = TCP_SOCK_INITED; + } + + if (conn->m_state != SOCKINFO_CLOSING) { + conn->xlio_socket_event(XLIO_SOCKET_EVENT_ERROR, conn->m_error_status); + } +} + sockinfo_tcp::~sockinfo_tcp() { si_tcp_logfunc(""); @@ -455,6 +587,8 @@ sockinfo_tcp::~sockinfo_tcp() g_p_agent->unregister_cb((agent_cb_t)&sockinfo_tcp::put_agent_msg, (void *)this); } si_tcp_logdbg("sock closed"); + + xlio_socket_event(XLIO_SOCKET_EVENT_TERMINATED, 0); } void sockinfo_tcp::clean_obj() @@ -741,7 +875,8 @@ bool sockinfo_tcp::prepare_dst_to_send(bool is_accepted_socket /* = false */) bool ret_val = false; if (m_p_connected_dst_entry) { - bool skip_rules = is_accepted_socket, is_connect = !is_accepted_socket; + bool skip_rules = is_accepted_socket; + bool is_connect = !is_accepted_socket; ret_val = m_p_connected_dst_entry->prepare_to_send(m_so_ratelimit, skip_rules, is_connect); if (ret_val) { /* dst_entry has resolved tx ring, @@ -1366,6 +1501,9 @@ err_t sockinfo_tcp::ip_output_syn_ack(struct pbuf *p, struct tcp_seg *seg, void */ p_si_tcp->reset_ops(); } + if (new_state == ESTABLISHED) { + p_si_tcp->xlio_socket_event(XLIO_SOCKET_EVENT_ESTABLISHED, 0); + } /* Update daemon about actual state for offloaded connection */ if (g_p_agent && likely(p_si_tcp->m_sock_offload == TCP_SOCK_LWIP)) { @@ -4130,7 +4268,7 @@ void sockinfo_tcp::fit_snd_bufs(unsigned int new_max_snd_buff) m_pcb.snd_buf += ((int)new_max_snd_buff - m_pcb.max_snd_buff); m_pcb.max_snd_buff = new_max_snd_buff; - auto mss = m_pcb.mss ?: 536; + uint16_t mss = m_pcb.mss ?: 536; m_pcb.max_unsent_len = (mss - 1 + m_pcb.max_snd_buff * 16) / mss; } @@ -5536,10 +5674,20 @@ struct pbuf *sockinfo_tcp::tcp_tx_pbuf_alloc(void *p_conn, pbuf_type type, pbuf_ if (likely(p_dst)) { p_desc = p_dst->get_buffer(type, desc); - if (p_desc && (p_desc->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY) && - ((p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_NONE) || - (p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_MKEY) || - p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_NVME_TX)) { + } + if (likely(p_desc) && p_desc->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY) { + if (p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_EXPRESS) { + p_desc->m_flags |= mem_buf_desc_t::ZCOPY; + p_desc->tx.zc.callback = tcp_express_zc_callback; + if (p_buff) { + mem_buf_desc_t *p_prev_desc = reinterpret_cast(p_buff); + p_desc->tx.zc.ctx = p_prev_desc->tx.zc.ctx; + } else { + p_desc->tx.zc.ctx = reinterpret_cast(p_si_tcp); + } + } else if ((p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_NONE) || + (p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_MKEY) || + (p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_NVME_TX)) { /* Prepare error queue fields for send zerocopy */ if (p_buff) { /* It is a special case that can happen as a result @@ -5619,6 +5767,19 @@ mem_buf_desc_t *sockinfo_tcp::tcp_tx_zc_alloc(mem_buf_desc_t *p_desc) return p_desc; } +/*static*/ +void sockinfo_tcp::tcp_express_zc_callback(mem_buf_desc_t *p_desc) +{ + sockinfo_tcp *si = reinterpret_cast(p_desc->tx.zc.ctx); + const uintptr_t opaque_op = reinterpret_cast(p_desc->lwip_pbuf.pbuf.desc.opaque); + + if (opaque_op && si->m_p_group && si->m_p_group->m_socket_comp_cb) { + si->m_p_group->m_socket_comp_cb(reinterpret_cast(si), + si->m_xlio_socket_userdata, opaque_op); + } +} + +/*static*/ void sockinfo_tcp::tcp_tx_zc_callback(mem_buf_desc_t *p_desc) { sockinfo_tcp *sock = nullptr; @@ -5806,6 +5967,18 @@ tcp_timers_collection::~tcp_timers_collection() free_tta_resources(); } +event_handler_manager *tcp_timers_collection::get_event_mgr() +{ + if (m_p_group) { + 
return m_p_group->get_event_handler(); + } else if (safe_mce_sys().tcp_ctl_thread == + option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { + return &g_event_handler_manager_local; + } else { + return g_p_event_handler_manager; + } +} + void tcp_timers_collection::free_tta_resources() { if (m_n_count) { @@ -6023,7 +6196,7 @@ inline bool sockinfo_tcp::handle_bind_no_port(int &bind_ret, in_port_t in_port, } int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint32_t mkey, - xlio_express_flags flags, void *opaque_op) + unsigned flags, void *opaque_op) { if (unlikely(!is_connected_and_ready_to_send())) { return -1; @@ -6060,6 +6233,10 @@ int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint } if (!(flags & XLIO_EXPRESS_MSG_MORE)) { tcp_output(&m_pcb); + m_b_xlio_socket_dirty = false; + } else if (m_p_group && !m_b_xlio_socket_dirty) { + m_b_xlio_socket_dirty = true; + m_p_group->add_dirty_socket(this); } unlock_tcp_con(); @@ -6067,6 +6244,52 @@ int sockinfo_tcp::tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint return bytes_written; } +int sockinfo_tcp::tcp_tx_express_inline(const struct iovec *iov, unsigned iov_len, unsigned flags) +{ + if (unlikely(!is_connected_and_ready_to_send())) { + return -1; + } + + pbuf_desc mdesc; + int bytes_written = 0; + + memset(&mdesc, 0, sizeof(mdesc)); + mdesc.attr = PBUF_DESC_NONE; + + lock_tcp_con(); + + for (unsigned i = 0; i < iov_len; ++i) { + bytes_written += iov[i].iov_len; + err_t err = tcp_write(&m_pcb, iov[i].iov_base, iov[i].iov_len, 0, &mdesc); + if (unlikely(err != ERR_OK)) { + // XXX tcp_write() can return multiple errors. + // XXX tcp_write() can also fail due to queuelen limit, but this is unlikely. + m_conn_state = TCP_CONN_ERROR; + m_error_status = ENOMEM; + return tcp_tx_handle_errno_and_unlock(ENOMEM); + } + } + if (!(flags & XLIO_EXPRESS_MSG_MORE)) { + m_b_xlio_socket_dirty = false; + tcp_output(&m_pcb); + } else if (m_p_group && !m_b_xlio_socket_dirty) { + m_b_xlio_socket_dirty = true; + m_p_group->add_dirty_socket(this); + } + + unlock_tcp_con(); + + return bytes_written; +} + +void sockinfo_tcp::flush() +{ + lock_tcp_con(); + m_b_xlio_socket_dirty = false; + tcp_output(&m_pcb); + unlock_tcp_con(); +} + ssize_t sockinfo_tcp::tcp_tx_handle_done_and_unlock(ssize_t total_tx, int errno_tmp, bool is_dummy, bool is_send_zerocopy) { diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 611007ab2..d9a5c5663 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -40,7 +40,6 @@ #include "dev/cq_mgr_rx.h" #include "xlio_extra.h" -// LWIP includes #include "lwip/opt.h" #include "lwip/tcp_impl.h" @@ -48,6 +47,10 @@ #include "sockinfo_ulp.h" #include "sockinfo_nvme.h" +/* Forward declarations */ +struct xlio_socket_attr; +class poll_group; + #define BLOCK_THIS_RUN(blocking, flags) (blocking && !(flags & MSG_DONTWAIT)) /** @@ -121,6 +124,51 @@ struct socket_option_t { } }; +class tcp_timers_collection : public timers_group, public cleanable_obj { +public: + tcp_timers_collection(int period, int resolution); + ~tcp_timers_collection() override; + + void clean_obj() override; + + void handle_timer_expired(void *user_data) override; + + void set_group(poll_group *group) { m_p_group = group; } + inline event_handler_manager *get_event_mgr(); + +protected: + // add a new timer + void add_new_timer(timer_node_t *node, timer_handler *handler, void *user_data) override; + + // remove timer from list and free it. 
+ // called for stopping (unregistering) a timer + void remove_timer(timer_node_t *node) override; + +private: + void free_tta_resources(); + +protected: + void *m_timer_handle; + +private: + timer_node_t **m_p_intervals; + + int m_n_period; + int m_n_resolution; + int m_n_intervals_size; + int m_n_location; + int m_n_count; + int m_n_next_insert_bucket; + + poll_group *m_p_group = nullptr; +}; + +class thread_local_tcp_timers : public tcp_timers_collection { +public: + thread_local_tcp_timers(); + ~thread_local_tcp_timers() override; +}; + typedef std::deque socket_options_list_t; typedef std::map ready_pcb_map_t; typedef std::map syn_received_map_t; @@ -231,6 +279,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { uint32_t get_next_tcp_seqno_rx() { return m_pcb.rcv_nxt; } mem_buf_desc_t *tcp_tx_zc_alloc(mem_buf_desc_t *p_desc); + static void tcp_express_zc_callback(mem_buf_desc_t *p_desc); static void tcp_tx_zc_callback(mem_buf_desc_t *p_desc); void tcp_tx_zc_handle(mem_buf_desc_t *p_desc); @@ -305,10 +354,11 @@ class sockinfo_tcp : public sockinfo, public timer_handler { return m_p_connected_dst_entry ? m_p_connected_dst_entry->get_ring() : nullptr; } - inline ring *get_rx_ring() { return m_p_rx_ring; } + void rx_add_ring_cb(ring *p_ring) override; + ring *get_rx_ring() { return m_p_rx_ring; } const flow_tuple_with_local_if &get_flow_tuple() { - /* XXX Dosn't handle empty map and a map with multiple elements. */ + /* XXX Doesn't handle empty map and a map with multiple elements. */ auto rx_flow_iter = m_rx_flow_map.begin(); return rx_flow_iter->first; } @@ -347,8 +397,17 @@ class sockinfo_tcp : public sockinfo, public timer_handler { return sockinfo::register_callback(callback, context); } - int tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint32_t mkey, - xlio_express_flags flags, void *opaque_op); + int tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint32_t mkey, unsigned flags, + void *opaque_op); + int tcp_tx_express_inline(const struct iovec *iov, unsigned iov_len, unsigned flags); + void flush(); + + void set_xlio_socket(const struct xlio_socket_attr *attr); + void add_tx_ring_to_group(); + bool is_xlio_socket() { return m_p_group != nullptr; } + void xlio_socket_event(int event, int value); + static err_t rx_lwip_cb_xlio_socket(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); + static void err_lwip_cb_xlio_socket(void *pcb_container, err_t err); protected: void lock_rx_q() override; @@ -529,6 +588,9 @@ class sockinfo_tcp : public sockinfo, public timer_handler { static void put_agent_msg(void *arg); bool is_connected_and_ready_to_send(); + inline event_handler_manager *get_event_mgr(); + inline tcp_timers_collection *get_tcp_timer_collection(); + public: static const int CONNECT_DEFAULT_TIMEOUT_MS = 10000; @@ -609,48 +671,16 @@ class sockinfo_tcp : public sockinfo, public timer_handler { uint64_t m_user_huge_page_mask; unsigned m_required_send_block; uint16_t m_external_vlan_tag = 0U; -}; -typedef struct tcp_seg tcp_seg; - -class tcp_timers_collection : public timers_group, public cleanable_obj { -public: - tcp_timers_collection(int period, int resolution); - ~tcp_timers_collection() override; - - void clean_obj() override; - - void handle_timer_expired(void *user_data) override; - -protected: - // add a new timer - void add_new_timer(timer_node_t *node, timer_handler *handler, void *user_data) override; - - // remove timer from list and free it. 
- // called for stopping (unregistering) a timer - void remove_timer(timer_node_t *node) override; - void *m_timer_handle; - -private: - timer_node_t **m_p_intervals; - - int m_n_period; - int m_n_resolution; - int m_n_intervals_size; - int m_n_location; - int m_n_count; - int m_n_next_insert_bucket; - - void free_tta_resources(); -}; - -class thread_local_tcp_timers : public tcp_timers_collection { -public: - thread_local_tcp_timers(); - ~thread_local_tcp_timers() override; + /* + * Storage API + * TODO Move the fields to proper cold/hot sections in the final version. + */ + bool m_b_xlio_socket_dirty = false; + uintptr_t m_xlio_socket_userdata = 0; + poll_group *m_p_group = nullptr; }; extern tcp_timers_collection *g_tcp_timers_collection; -extern thread_local thread_local_tcp_timers g_thread_local_tcp_timers; #endif diff --git a/src/core/xlio.h b/src/core/xlio.h index 98ff0a56e..e29bfbe94 100644 --- a/src/core/xlio.h +++ b/src/core/xlio.h @@ -33,13 +33,15 @@ #ifndef XLIO_H #define XLIO_H -#include -#include -#include - #include #include #include +#include +#include +#include +#include + +#include "xlio_extra.h" #ifdef __cplusplus extern "C" { @@ -383,6 +385,225 @@ int xlio_socketxtreme_ref_buff(struct xlio_buff_t *buff); */ int xlio_socketxtreme_free_buff(struct xlio_buff_t *buff); +/* + * XLIO Socket API + */ + +typedef uintptr_t xlio_poll_group_t; +typedef uintptr_t xlio_socket_t; +typedef uint32_t xlio_key_t; + +struct xlio_buf { + uint64_t userdata; +}; + +/* + * XLIO initialization. + * + * xlio_init_ex() must be called before using any XLIO Socket API. This is heavy operation. + * xlio_init_ex() is not thread-safe operation, however, subsequent serialized calls exit + * successfully without any action. + * + * If set, memory_cb() notifies about memory blocks which zerocopy RX buffers can point to. + * Current implementation allocates a single memory block and does it within xlio_init_ex() context. + */ + +/* + * Memory callback. + * + * XLIO calls the callback each time XLIO allocates a memory region which can be used for RX + * buffers. User can use this information to prepare the memory for some logic in the future. + * Zerocopy RX interface provides pointers to such memory. + * + * Current XLIO implementation does a single allocation for buffers. + */ +typedef void (*xlio_memory_cb_t)(void *addr, size_t len, size_t hugepage_size); + +struct xlio_init_attr { + unsigned flags; + xlio_memory_cb_t memory_cb; +}; + +int xlio_init_ex(const struct xlio_init_attr *attr); + +/* + * Socket callbacks. + */ + +enum { + /* TCP connection established. */ + XLIO_SOCKET_EVENT_ESTABLISHED = 1, + /* Socket terminated and no further events are possible. */ + XLIO_SOCKET_EVENT_TERMINATED, + /* Passive close. */ + XLIO_SOCKET_EVENT_CLOSED, + /* An error occurred, see the error code value. */ + XLIO_SOCKET_EVENT_ERROR, +}; + +/* + * Socket event callback. + * + * May be called from xlio_poll_group_poll() context. + * In the callback context, send operation is allowed only for the ESTABLISHED event. + * Argument value holds the error code for the ERROR event and 0 for other events. 
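+ *
+ * A minimal handler sketch (illustration only; my_event_cb is a hypothetical
+ * application callback and its body just logs the error):
+ *
+ *   static void my_event_cb(xlio_socket_t sock, uintptr_t userdata_sq, int event, int value)
+ *   {
+ *       (void)sock;
+ *       if (event == XLIO_SOCKET_EVENT_ERROR) {
+ *           fprintf(stderr, "socket %lx error: %s\n", userdata_sq, strerror(value));
+ *       }
+ *   }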
+ * + * List of possible error code values: + * ECONNABORTED - connection aborted by local side + * ECONNRESET - connection reset by remote side + * ECONNREFUSED - connection refused by remote side during TCP handshake + * ETIMEDOUT - connection timed out due to keepalive, user timeout option or TCP handshake timeout + */ +typedef void (*xlio_socket_event_cb_t)(xlio_socket_t, uintptr_t userdata_sq, int event, int value); + +/* + * Zerocopy completion event. + * + * May be called from the following contexts: + * - xlio_poll_group_poll() - likely + * - xlio_socket_send() - can happen only if data is flushed + * - xlio_socket_flush() / xlio_poll_group_flush() + * - xlio_socket_destroy() + * + * In the callback context, send operation is allowed unless the socket is under destruction. + */ +typedef void (*xlio_socket_comp_cb_t)(xlio_socket_t, uintptr_t userdata_sq, uintptr_t userdata_op); + +/* + * RX callback. + * + * Returns TCP payload upon arrival. Each call returns a single contiguous buffer. The buffer points + * to memory within a block which is provided by the memory_cb() notification. + * + * xlio_buf is a descriptor of the buffer which must be returned to XLIO. During user ownership, + * they may use the uninitialized field in the structure. + */ +typedef void (*xlio_socket_rx_cb_t)(xlio_socket_t, uintptr_t userdata_sq, void *data, size_t len, + struct xlio_buf *buf); + +/* + * XLIO polling groups. + * + * Event callbacks are registered per group. This allows to move control flow connections to + * a separate group and implement RX / completion logic differently. + * + * xlio_poll_group_poll() polls HW for events and executes TCP timers. Most of the callbacks are + * expected from the context of this call. + * + * Recommendations: + * - Groups are expected to be long lived objects. Frequent creation/destruction has a penalty. + * - Reduce the number of different network interfaces within a group to minimum. This will + * optimize the HW objects utilization. However, maintaining extra groups can have an overhead. + */ + +/* Sockets and rings will be protected with locks regardless of XLIO configuration. */ +#define XLIO_GROUP_FLAG_SAFE 0x1 +/* Group will keep dirty sockets to be flushed with xlio_poll_group_flush(). */ +#define XLIO_GROUP_FLAG_DIRTY 0x2 + +struct xlio_poll_group_attr { + unsigned flags; + + xlio_socket_event_cb_t socket_event_cb; + xlio_socket_comp_cb_t socket_comp_cb; + xlio_socket_rx_cb_t socket_rx_cb; +}; + +int xlio_poll_group_create(const struct xlio_poll_group_attr *attr, xlio_poll_group_t *group_out); +int xlio_poll_group_destroy(xlio_poll_group_t group); +void xlio_poll_group_poll(xlio_poll_group_t group); + +/* + * XLIO socket. + * + * XLIO socket is represented by xlio_socket_t instead of file descriptor. This is a TCP + * non-blocking socket abstraction. + * + * xlio_socket_destroy() triggers socket closing procedure. The process can be asynchronous + * and socket events may be expected until XLIO_SOCKET_EVENT_TERMINATED event arrives. + * Example of the possible events is zerocopy completions which can arrive from the + * xlio_socket_destroy() context or xlio_poll_group_poll() context. + * + * Limitations: + * - Only outgoing connections are supported + * - Bonding is not supported + */ + +struct xlio_socket_attr { + unsigned flags; + int domain; /* AF_INET or AF_INET6 */ + xlio_poll_group_t group; + uintptr_t userdata_sq; +}; + +/* Forward declaration. 
 */
+struct ibv_pd;
+
+int xlio_socket_create(const struct xlio_socket_attr *attr, xlio_socket_t *sock_out);
+int xlio_socket_destroy(xlio_socket_t sock);
+int xlio_socket_setsockopt(xlio_socket_t sock, int level, int optname, const void *optval,
+                           socklen_t optlen);
+int xlio_socket_bind(xlio_socket_t sock, const struct sockaddr *addr, socklen_t addrlen);
+int xlio_socket_connect(xlio_socket_t sock, const struct sockaddr *to, socklen_t tolen);
+struct ibv_pd *xlio_socket_get_pd(xlio_socket_t sock);
+
+int xlio_socket_fd(xlio_socket_t sock);
+
+/*
+ * TX flow.
+ *
+ * Properties of the TX flow:
+ * - Non-blocking
+ * - No partial write support - accepts all data unless memory allocation error happens
+ * - Each send call expects a complete or part of a single PDU or message. This is a requirement
+ *   when either crypto or CRC offload is enabled.
+ * - User requests zerocopy completion callback with non-zero userdata_op value and controls
+ *   the logic of completions. For example, each completion can complete an entire PDU object.
+ * - Inline send operations don't trigger the completion callback.
+ * - XLIO aggregates data on the socket and pushes it to the wire with the flush-like API or
+ *   XLIO_SOCKET_SEND_FLAG_FLUSH flag.
+ *
+ * **Current limitations**:
+ * - Currently, data can be pushed to the wire in the RX flow regardless of the flush logic.
+ * - Avoid using xlio_socket_flush() for a XLIO_GROUP_FLAG_DIRTY group.
+ * - For a XLIO_GROUP_FLAG_DIRTY group, usage of XLIO_SOCKET_SEND_FLAG_FLUSH is limited;
+ *   it's better to avoid using them both.
+ */
+
+/* Flush socket after queueing the data. */
+#define XLIO_SOCKET_SEND_FLAG_FLUSH 0x1
+/* Copy user data to the internal buffers instead of taking ownership. */
+#define XLIO_SOCKET_SEND_FLAG_INLINE 0x2
+
+struct xlio_socket_send_attr {
+    unsigned flags;
+    uint32_t mkey;
+    uintptr_t userdata_op;
+};
+
+/* Returns either 0 or -1. The errors, except for ENOMEM, are not recoverable. */
+int xlio_socket_send(xlio_socket_t sock, const void *data, size_t len,
+                     const struct xlio_socket_send_attr *attr);
+int xlio_socket_sendv(xlio_socket_t sock, const struct iovec *iov, unsigned iovcnt,
+                      const struct xlio_socket_send_attr *attr);
+void xlio_poll_group_flush(xlio_poll_group_t group);
+void xlio_socket_flush(xlio_socket_t sock);
+
+struct xlio_key_attr {
+    int unused;
+};
+
+/* All live socket keys are destroyed on socket destruction. */
+int xlio_key_create(xlio_socket_t sock, struct xlio_key_attr *attr, xlio_key_t *key_out);
+void xlio_key_destroy(xlio_socket_t sock, xlio_key_t key);
+
+/*
+ * RX flow.
+ */
+
+void xlio_socket_buf_free(xlio_socket_t sock, struct xlio_buf *buf);
+void xlio_poll_group_buf_free(xlio_poll_group_t group, struct xlio_buf *buf);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/tests/extra_api/xlio_socket_api.c b/tests/extra_api/xlio_socket_api.c
new file mode 100644
index 000000000..b170d0aae
--- /dev/null
+++ b/tests/extra_api/xlio_socket_api.c
@@ -0,0 +1,379 @@
+/*
+ * Copyright © 2019-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* g++ -I./install/include -L./install/lib -L../dpcp/install/lib -o test xlio_socket_api.c -lxlio -lm -lnl-3 -ldpcp -libverbs -lmlx5 -lrdmacm -lnl-route-3 -g3 */ +/* LD_LIBRARY_PATH=./install/lib:../dpcp/install/lib ./test */ +/* Use `nc -l 8080` on the remote side */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define TEST_USERDATA_MAGIC 0xfeedbeef +#define FAKE_PORT 65535 + +static bool quit = false; +static bool terminated = false; +static int g_test_events; +static int g_comp_events = 0; +static char sndbuf[256]; +static struct ibv_mr *mr_buf; + +static void memory_cb(void *data, size_t size, size_t page_size) +{ + printf("Memory area allocated data=%p size=%zu page_size=%zu\n", data, size, page_size); +} + +static void send_single_msg(xlio_socket_t sock, const void *data, size_t len, uintptr_t userdata_op, + unsigned flags) +{ + struct xlio_socket_send_attr attr = { + .flags = flags, + .mkey = mr_buf->lkey, + .userdata_op = userdata_op, + }; + memcpy(sndbuf, data, len); + int ret = xlio_socket_send(sock, sndbuf, len, &attr); + assert(ret == 0); + xlio_socket_flush(sock); +} + +static void send_inline_msg(xlio_socket_t sock, const void *data, size_t len, uintptr_t userdata_op, + unsigned flags) +{ + struct xlio_socket_send_attr attr = { + .flags = flags | XLIO_SOCKET_SEND_FLAG_INLINE, + .mkey = 0, + .userdata_op = userdata_op, + }; + int ret = xlio_socket_send(sock, data, len, &attr); + assert(ret == 0); + xlio_socket_flush(sock); +} + +static void socket_event_cb(xlio_socket_t sock, uintptr_t userdata_sq, int event, int value) +{ + if (event == XLIO_SOCKET_EVENT_ESTABLISHED) { + printf("Connection established (sock=%lx).\n", userdata_sq); + } else if (event == XLIO_SOCKET_EVENT_CLOSED) { + printf("Connection closed passively (sock=%lx).\n", userdata_sq); + } else if (event == XLIO_SOCKET_EVENT_TERMINATED) { + printf("Connection terminated (sock=%lx).\n", userdata_sq); + terminated = true; + } else { + printf("Event callback: event=%d value=%d (sock=%lx).\n", event, value, userdata_sq); + if (event == XLIO_SOCKET_EVENT_ERROR) { + quit = true; + } + } +} + +static void socket_comp_cb(xlio_socket_t sock, uintptr_t userdata_sq, uintptr_t userdata_op) +{ + const char *reply_msg = "completed\n"; + const char 
*inline_msg = "inline\n";
+
+    printf("Completed zcopy buffer userdata_sq=%lx userdata_op=%lx.\n", userdata_sq, userdata_op);
+    assert(userdata_sq != 0);
+    assert(userdata_op != 0);
+
+    ++g_comp_events;
+    if (!quit) {
+        /*
+         * Don't send data after socket destroy, completions are still possible until
+         * XLIO_SOCKET_EVENT_TERMINATED event arrives.
+         */
+        send_single_msg(sock, reply_msg, strlen(reply_msg), 0, 0);
+        send_inline_msg(sock, inline_msg, strlen(inline_msg), 0, 0);
+    }
+}
+
+static void socket_rx_cb(xlio_socket_t sock, uintptr_t userdata_sq, void *data, size_t len,
+                         struct xlio_buf *buf)
+{
+    char *msg = (char *)malloc(len + 1);
+    memcpy(msg, data, len);
+    msg[len] = '\0';
+    if (len > 0 && msg[len - 1] == '\n') {
+        msg[len - 1] = '\0';
+    }
+    printf("RECV: %s\n", msg);
+    if (strncmp(msg, "quit", 4) == 0 || strncmp(msg, "exit", 4) == 0) {
+        quit = true;
+    }
+    free(msg);
+
+    send_single_msg(sock, data, len, 0xdeadbeef, 0);
+    xlio_socket_buf_free(sock, buf);
+}
+
+static void test_event_cb(xlio_socket_t sock, uintptr_t userdata_sq, int event, int value)
+{
+    (void)sock;
+    (void)value;
+    assert(userdata_sq == TEST_USERDATA_MAGIC);
+
+    printf("Test event callback: event=%d value=%d.\n", event, value);
+
+    if (event == XLIO_SOCKET_EVENT_ERROR || event == XLIO_SOCKET_EVENT_TERMINATED) {
+        ++g_test_events;
+    }
+}
+
+static void test_comp_cb(xlio_socket_t sock, uintptr_t userdata_sq, uintptr_t userdata_op)
+{
+    (void)sock;
+    (void)userdata_op;
+    assert(userdata_sq == TEST_USERDATA_MAGIC);
+}
+
+static void test_rx_cb(xlio_socket_t sock, uintptr_t userdata_sq, void *data, size_t len,
+                       struct xlio_buf *buf)
+{
+    (void)data;
+    (void)len;
+    assert(userdata_sq == TEST_USERDATA_MAGIC);
+    xlio_socket_buf_free(sock, buf);
+}
+
+static void test_multi_groups(const char *ip)
+{
+    xlio_poll_group_t group1;
+    xlio_poll_group_t group2;
+    xlio_poll_group_t group3;
+    xlio_socket_t sock1_1;
+    xlio_socket_t sock1_2;
+    xlio_socket_t sock2;
+    xlio_socket_t sock3;
+    int rc;
+
+    struct xlio_poll_group_attr gattr = {
+        .socket_event_cb = &test_event_cb,
+        .socket_comp_cb = &test_comp_cb,
+        .socket_rx_cb = &test_rx_cb,
+    };
+
+    rc = xlio_poll_group_create(&gattr, &group1);
+    assert(rc == 0);
+    rc = xlio_poll_group_create(&gattr, &group2);
+    assert(rc == 0);
+
+    gattr.flags = XLIO_GROUP_FLAG_SAFE;
+    rc = xlio_poll_group_create(&gattr, &group3);
+    assert(rc == 0);
+
+    struct xlio_socket_attr sattr = {
+        .domain = AF_INET,
+        .userdata_sq = TEST_USERDATA_MAGIC,
+    };
+
+    sattr.group = group1;
+    rc = xlio_socket_create(&sattr, &sock1_1);
+    assert(rc == 0);
+    rc = xlio_socket_create(&sattr, &sock1_2);
+    assert(rc == 0);
+    sattr.group = group2;
+    rc = xlio_socket_create(&sattr, &sock2);
+    assert(rc == 0);
+    sattr.group = group3;
+    rc = xlio_socket_create(&sattr, &sock3);
+    assert(rc == 0);
+
+    struct sockaddr_in addr = {};
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(FAKE_PORT);
+    rc = inet_aton(ip, &addr.sin_addr);
+    assert(rc != 0);
+
+    g_test_events = 0;
+    /* Connect will fail, we need it to allocate rings for the checks below.
*/ + rc = xlio_socket_connect(sock1_1, (struct sockaddr *)&addr, sizeof(addr)); + assert(rc == 0); + rc = xlio_socket_connect(sock1_2, (struct sockaddr *)&addr, sizeof(addr)); + assert(rc == 0); + rc = xlio_socket_connect(sock2, (struct sockaddr *)&addr, sizeof(addr)); + assert(rc == 0); + rc = xlio_socket_connect(sock3, (struct sockaddr *)&addr, sizeof(addr)); + assert(rc == 0); + + int fd1_1 = xlio_socket_fd(sock1_1); + int fd1_2 = xlio_socket_fd(sock1_2); + int fd2 = xlio_socket_fd(sock2); + int fd3 = xlio_socket_fd(sock3); + assert(fd1_1 >= 0); + assert(fd1_2 >= 0); + assert(fd2 >= 0); + assert(fd3 >= 0); + + assert(xlio_get_socket_rings_num(fd1_1) == 1); + assert(xlio_get_socket_rings_num(fd1_2) == 1); + assert(xlio_get_socket_rings_num(fd2) == 1); + assert(xlio_get_socket_rings_num(fd3) == 1); + + int ring1_1; + int ring1_2; + int ring2; + int ring3; + + rc = xlio_get_socket_rings_fds(fd1_1, &ring1_1, 1); + assert(rc == 1); + rc = xlio_get_socket_rings_fds(fd1_2, &ring1_2, 1); + assert(rc == 1); + rc = xlio_get_socket_rings_fds(fd2, &ring2, 1); + assert(rc == 1); + rc = xlio_get_socket_rings_fds(fd3, &ring3, 1); + assert(rc == 1); + + assert(ring1_1 == ring1_2); + assert(ring1_1 != ring2); + assert(ring1_1 != ring3); + assert(ring2 != ring3); + + /* Wait for ERROR events (ECONREFUSED). */ + while (g_test_events < 4) { + xlio_poll_group_poll(group1); + xlio_poll_group_poll(group2); + xlio_poll_group_poll(group3); + } + + g_test_events = 0; + xlio_socket_destroy(sock1_1); + xlio_socket_destroy(sock1_2); + xlio_socket_destroy(sock2); + xlio_socket_destroy(sock3); + + /* Wait for TERMINATED events. */ + while (g_test_events < 4) { + xlio_poll_group_poll(group1); + xlio_poll_group_poll(group2); + xlio_poll_group_poll(group3); + } + + xlio_poll_group_destroy(group1); + xlio_poll_group_destroy(group2); + xlio_poll_group_destroy(group3); + + printf("Multi group test done.\n"); +} + +int main(int argc, char **argv) +{ + xlio_poll_group_t group; + xlio_socket_t sock; + int rc; + + struct xlio_init_attr iattr = { + .flags = 0, + .memory_cb = &memory_cb, + }; + struct xlio_poll_group_attr gattr = { + .socket_event_cb = &socket_event_cb, + .socket_comp_cb = &socket_comp_cb, + .socket_rx_cb = &socket_rx_cb, + }; + + if (argc < 2) { + printf("Usage: %s \n", argv[0]); + printf("Run 'nc -l 8080' on the server with the address.\n"); + printf("Type messages on the nc side.\n"); + printf("Message 'quit' or 'exit' will terminate the client.\n"); + return 1; + } + + rc = xlio_init_ex(&iattr); + assert(rc == 0); + + test_multi_groups(argv[1]); + + rc = xlio_poll_group_create(&gattr, &group); + assert(rc == 0); + + printf("Group created.\n"); + + struct xlio_socket_attr sattr = { + .domain = AF_INET, + .group = group, + .userdata_sq = 0xdeadc0de, + }; + + rc = xlio_socket_create(&sattr, &sock); + assert(rc == 0); + + printf("Socket created, connecting to %s:8080.\n", argv[1]); + + struct sockaddr_in addr = {}; + addr.sin_family = AF_INET; + addr.sin_port = htons(8080); + rc = inet_aton(argv[1], &addr.sin_addr); + assert(rc != 0); + + rc = xlio_socket_connect(sock, (struct sockaddr *)&addr, sizeof(addr)); + assert(rc == 0); + + struct ibv_pd *pd = xlio_socket_get_pd(sock); + assert(pd != NULL); + mr_buf = ibv_reg_mr(pd, sndbuf, sizeof(sndbuf), IBV_ACCESS_LOCAL_WRITE); + assert(mr_buf != NULL); + + printf("Starting polling loop.\n"); + + while (!quit) { + xlio_poll_group_poll(group); + } + + printf("Quiting...\n"); + + rc = xlio_socket_destroy(sock); + assert(rc == 0); + + while (!terminated) { + 
xlio_poll_group_poll(group); + } + + rc = xlio_poll_group_destroy(group); + assert(rc == 0); + + printf("Zerocopy completion events: %d\n", g_comp_events); + + ibv_dereg_mr(mr_buf); + xlio_exit(); + + return 0; +} From 4abcf4f3123ea258a5cfda5b4515859a7eb6addc Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 4 Mar 2024 13:42:57 +0200 Subject: [PATCH 100/169] issue: 3788369 Remove lwip/init.[ch] Remove stub initialization functions to clean the code. Signed-off-by: Dmytro Podgornyi --- debian/copyright | 2 -- src/core/Makefile.am | 2 -- src/core/lwip/init.c | 53 ------------------------------------ src/core/lwip/init.h | 48 -------------------------------- src/core/lwip/pbuf.h | 3 -- src/core/lwip/tcp_impl.h | 2 -- src/core/proto/xlio_lwip.cpp | 6 +--- 7 files changed, 1 insertion(+), 115 deletions(-) delete mode 100644 src/core/lwip/init.c delete mode 100644 src/core/lwip/init.h diff --git a/debian/copyright b/debian/copyright index 88bb20923..72d3f06dd 100644 --- a/debian/copyright +++ b/debian/copyright @@ -51,8 +51,6 @@ License: GPLv2 and 2BSD Files: src/core/lwip/def.h src/core/lwip/err.h - src/core/lwip/init.c - src/core/lwip/init.h src/core/lwip/ip_addr.h src/core/lwip/opt.h src/core/lwip/pbuf.c diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 867c70c70..15a35b3a9 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -122,7 +122,6 @@ libxlio_la_SOURCES := \ lwip/cc_lwip.c \ lwip/cc_cubic.c \ lwip/cc_none.c \ - lwip/init.c \ \ proto/ip_frag.cpp \ proto/flow_tuple.cpp \ @@ -238,7 +237,6 @@ libxlio_la_SOURCES := \ lwip/cc.h \ lwip/def.h \ lwip/err.h \ - lwip/init.h \ lwip/ip_addr.h \ lwip/opt.h \ lwip/pbuf.h \ diff --git a/src/core/lwip/init.c b/src/core/lwip/init.c deleted file mode 100644 index 653718deb..000000000 --- a/src/core/lwip/init.c +++ /dev/null @@ -1,53 +0,0 @@ -/** - * @file - * Modules initialization - * - */ - -/* - * Copyright (c) 2001-2004 Swedish Institute of Computer Science. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT - * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT - * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY - * OF SUCH DAMAGE. - * - * This file is part of the lwIP TCP/IP stack. 
- * - * Author: Adam Dunkels - * - */ - -#include "lwip/opt.h" - -#include "lwip/init.h" -#include "lwip/pbuf.h" -#include "lwip/tcp_impl.h" - -/** - * Perform Sanity check of user-configurable values, and initialize all modules. - */ -void lwip_init(void) -{ - /* Modules initialization */ - pbuf_init(); - tcp_init(); -} diff --git a/src/core/lwip/init.h b/src/core/lwip/init.h deleted file mode 100644 index 87a947afa..000000000 --- a/src/core/lwip/init.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2001-2004 Swedish Institute of Computer Science. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT - * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT - * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY - * OF SUCH DAMAGE. - * - * This file is part of the lwIP TCP/IP stack. - * - * Author: Adam Dunkels - * - */ -#ifndef __LWIP_INIT_H__ -#define __LWIP_INIT_H__ - -#include "lwip/opt.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* Modules initialization */ -void lwip_init(void); - -#ifdef __cplusplus -} -#endif - -#endif /* __LWIP_INIT_H__ */ diff --git a/src/core/lwip/pbuf.h b/src/core/lwip/pbuf.h index e25923548..ef1d14cbb 100644 --- a/src/core/lwip/pbuf.h +++ b/src/core/lwip/pbuf.h @@ -119,9 +119,6 @@ struct pbuf_custom { struct pbuf pbuf; }; -/* Initializes the pbuf module. This call is empty for now, but may not be in future. */ -#define pbuf_init() - void pbuf_realloc(struct pbuf *p, u32_t size); u8_t pbuf_header(struct pbuf *p, s32_t header_size); void pbuf_ref(struct pbuf *p); diff --git a/src/core/lwip/tcp_impl.h b/src/core/lwip/tcp_impl.h index 19f9c354e..c3e0c02b0 100644 --- a/src/core/lwip/tcp_impl.h +++ b/src/core/lwip/tcp_impl.h @@ -40,8 +40,6 @@ extern "C" { #endif -#define tcp_init() /* Compatibility define, no init needed. 
*/ - /* Functions for interfacing with TCP: */ #if defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4)) #pragma GCC visibility push(hidden) diff --git a/src/core/proto/xlio_lwip.cpp b/src/core/proto/xlio_lwip.cpp index ef2b86d23..bc26b48d3 100644 --- a/src/core/proto/xlio_lwip.cpp +++ b/src/core/proto/xlio_lwip.cpp @@ -35,7 +35,6 @@ #include "core/event/event_handler_manager.h" #include "core/sock/sockinfo_tcp.h" -#include "core/lwip/init.h" #include "core/lwip/tcp_impl.h" #include "xlio_lwip.h" @@ -118,10 +117,6 @@ xlio_lwip::xlio_lwip() rcv_wnd_scale = 0; } - // Bring up LWIP - lwip_init(); - lwip_logdbg("LWIP subsystem initialized"); - // In case of batching is not requested we fetch tcp_seg from the ring directly. // This creates hot segments, CPU cache wise. if (safe_mce_sys().tx_segs_batch_tcp == 1U) { @@ -147,6 +142,7 @@ xlio_lwip::xlio_lwip() free_lwip_resources(); throw_xlio_exception("LWIP: failed to register timer event"); } + lwip_logdbg("LWIP subsystem initialized"); } xlio_lwip::~xlio_lwip() From cdcc99c95c8fe64e3f5aa08fb9bcc1dee5729e3f Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 4 Mar 2024 15:06:39 +0200 Subject: [PATCH 101/169] issue: 3788369 Remove pbuf_custom wrapper structure Structure pbuf_custom doesn't serve its original purpose anymore and contains a single field pbuf. Remove the structure and use pbuf directly. Signed-off-by: Dmytro Podgornyi --- src/core/dev/buffer_pool.cpp | 10 ++--- src/core/dev/buffer_pool.h | 15 ++++--- src/core/dev/cq_mgr_rx.cpp | 4 +- src/core/dev/cq_mgr_rx_strq.cpp | 16 ++++---- src/core/dev/rfs_uc_tcp_gro.cpp | 25 ++++++------ src/core/dev/ring.h | 18 ++++---- src/core/dev/ring_bond.cpp | 2 +- src/core/dev/ring_simple.cpp | 23 +++++------ src/core/dev/ring_tap.cpp | 21 +++++----- src/core/lwip/pbuf.h | 9 ---- src/core/proto/dst_entry_tcp.cpp | 42 +++++++++---------- src/core/proto/mem_buf_desc.h | 15 ++++--- src/core/proto/neighbour.cpp | 8 ++-- src/core/sock/sockinfo_tcp.cpp | 70 ++++++++++++++++---------------- src/core/sock/sockinfo_ulp.cpp | 50 +++++++++++------------ 15 files changed, 155 insertions(+), 173 deletions(-) diff --git a/src/core/dev/buffer_pool.cpp b/src/core/dev/buffer_pool.cpp index 074f028f3..7de9ad0b5 100644 --- a/src/core/dev/buffer_pool.cpp +++ b/src/core/dev/buffer_pool.cpp @@ -74,15 +74,15 @@ inline void buffer_pool::put_buffer_helper(mem_buf_desc_t *buff) } #endif - if (buff->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_STRIDE) { - mem_buf_desc_t *rwqe = reinterpret_cast(buff->lwip_pbuf.pbuf.desc.mdesc); + if (buff->lwip_pbuf.desc.attr == PBUF_DESC_STRIDE) { + mem_buf_desc_t *rwqe = reinterpret_cast(buff->lwip_pbuf.desc.mdesc); if (buff->rx.strides_num == rwqe->add_ref_count(-buff->rx.strides_num)) { // Is last stride. 
g_buffer_pool_rx_rwqe->put_buffers_thread_safe(rwqe); } } buff->p_next_desc = m_p_head; - assert(buff->lwip_pbuf.pbuf.type != PBUF_ZEROCOPY || this == g_buffer_pool_zc || + assert(buff->lwip_pbuf.type != PBUF_ZEROCOPY || this == g_buffer_pool_zc || g_buffer_pool_zc == NULL); free_lwip_pbuf(&buff->lwip_pbuf); m_p_head = buff; @@ -490,7 +490,7 @@ void buffer_pool::put_buffers_after_deref_thread_safe(descq_t *pDeque) std::lock_guard lock(m_lock); while (!pDeque->empty()) { mem_buf_desc_t *list = pDeque->get_and_pop_front(); - if (list->dec_ref_count() <= 1 && (list->lwip_pbuf.pbuf.ref-- <= 1)) { + if (list->dec_ref_count() <= 1 && (list->lwip_pbuf.ref-- <= 1)) { put_buffers(list); } } @@ -498,7 +498,7 @@ void buffer_pool::put_buffers_after_deref_thread_safe(descq_t *pDeque) void buffer_pool::put_buffer_after_deref_thread_safe(mem_buf_desc_t *buff) { - if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) { + if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.ref-- <= 1)) { std::lock_guard lock(m_lock); put_buffers(buff); } diff --git a/src/core/dev/buffer_pool.h b/src/core/dev/buffer_pool.h index 8902cd21e..427f97bbe 100644 --- a/src/core/dev/buffer_pool.h +++ b/src/core/dev/buffer_pool.h @@ -49,13 +49,12 @@ enum buffer_pool_type { BUFFER_POOL_TX, }; -inline static void free_lwip_pbuf(struct pbuf_custom *pbuf_custom) +inline static void free_lwip_pbuf(struct pbuf *lwip_pbuf) { - mem_buf_desc_t *p_desc = (mem_buf_desc_t *)pbuf_custom; + mem_buf_desc_t *p_desc = reinterpret_cast(lwip_pbuf); - if (pbuf_custom->pbuf.desc.attr == PBUF_DESC_MDESC || - pbuf_custom->pbuf.desc.attr == PBUF_DESC_NVME_TX) { - mem_desc *mdesc = (mem_desc *)pbuf_custom->pbuf.desc.mdesc; + if (lwip_pbuf->desc.attr == PBUF_DESC_MDESC || lwip_pbuf->desc.attr == PBUF_DESC_NVME_TX) { + mem_desc *mdesc = reinterpret_cast(lwip_pbuf->desc.mdesc); mdesc->put(); } @@ -63,9 +62,9 @@ inline static void free_lwip_pbuf(struct pbuf_custom *pbuf_custom) p_desc->tx.zc.callback(p_desc); } p_desc->m_flags = 0; - pbuf_custom->pbuf.flags = 0; - pbuf_custom->pbuf.ref = 0; - pbuf_custom->pbuf.desc.attr = PBUF_DESC_NONE; + lwip_pbuf->flags = 0; + lwip_pbuf->ref = 0; + lwip_pbuf->desc.attr = PBUF_DESC_NONE; } /** diff --git a/src/core/dev/cq_mgr_rx.cpp b/src/core/dev/cq_mgr_rx.cpp index e2986782a..452ba5176 100644 --- a/src/core/dev/cq_mgr_rx.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -407,13 +407,13 @@ void cq_mgr_rx::compensate_qp_poll_failed() void cq_mgr_rx::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) { - if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) { + if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.ref-- <= 1)) { if (likely(buff->p_desc_owner == m_p_ring)) { mem_buf_desc_t *temp = nullptr; while (buff) { VLIST_DEBUG_CQ_MGR_PRINT_ERROR_IS_MEMBER; temp = buff; - assert(temp->lwip_pbuf.pbuf.type != PBUF_ZEROCOPY); + assert(temp->lwip_pbuf.type != PBUF_ZEROCOPY); buff = temp->p_next_desc; temp->clear_transport_data(); temp->p_next_desc = nullptr; diff --git a/src/core/dev/cq_mgr_rx_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp index ceab8e9f7..846c10a93 100644 --- a/src/core/dev/cq_mgr_rx_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ -237,8 +237,8 @@ inline bool cq_mgr_rx_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, case MLX5_CQE_RESP_SEND_INV: { status = BS_OK; _hot_buffer_stride->rx.strides_num = ((host_byte_cnt >> 16) & 0x00003FFF); - _hot_buffer_stride->lwip_pbuf.pbuf.desc.attr = PBUF_DESC_STRIDE; - _hot_buffer_stride->lwip_pbuf.pbuf.desc.mdesc = m_rx_hot_buffer; + 
_hot_buffer_stride->lwip_pbuf.desc.attr = PBUF_DESC_STRIDE; + _hot_buffer_stride->lwip_pbuf.desc.mdesc = m_rx_hot_buffer; is_filler = (host_byte_cnt >> 31 != 0U ? true : false); _hot_buffer_stride->sz_data = @@ -274,8 +274,8 @@ inline bool cq_mgr_rx_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, case MLX5_CQE_RESP_ERR: default: { _hot_buffer_stride->rx.strides_num = ((host_byte_cnt >> 16) & 0x00003FFF); - _hot_buffer_stride->lwip_pbuf.pbuf.desc.attr = PBUF_DESC_STRIDE; - _hot_buffer_stride->lwip_pbuf.pbuf.desc.mdesc = m_rx_hot_buffer; + _hot_buffer_stride->lwip_pbuf.desc.attr = PBUF_DESC_STRIDE; + _hot_buffer_stride->lwip_pbuf.desc.mdesc = m_rx_hot_buffer; is_filler = true; _current_wqe_consumed_bytes = _wqe_buff_size_bytes; _hot_buffer_stride->sz_data = 0U; @@ -526,11 +526,11 @@ void cq_mgr_rx_strq::statistics_print() void cq_mgr_rx_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) { - if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) { + if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.ref-- <= 1)) { if (likely(buff->p_desc_owner == m_p_ring)) { mem_buf_desc_t *temp = nullptr; while (buff) { - if (unlikely(buff->lwip_pbuf.pbuf.desc.attr != PBUF_DESC_STRIDE)) { + if (unlikely(buff->lwip_pbuf.desc.attr != PBUF_DESC_STRIDE)) { __log_info_err("CQ STRQ reclaim_recv_buffer_helper with incompatible " "mem_buf_desc_t object"); // We cannot continue iterating over a broken buffer object. @@ -538,7 +538,7 @@ void cq_mgr_rx_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) } mem_buf_desc_t *rwqe = - reinterpret_cast(buff->lwip_pbuf.pbuf.desc.mdesc); + reinterpret_cast(buff->lwip_pbuf.desc.mdesc); if (buff->rx.strides_num == rwqe->add_ref_count(-buff->rx.strides_num)) { // Is last stride. cq_mgr_rx::reclaim_recv_buffer_helper(rwqe); @@ -546,7 +546,7 @@ void cq_mgr_rx_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) VLIST_DEBUG_CQ_MGR_PRINT_ERROR_IS_MEMBER; temp = buff; - assert(temp->lwip_pbuf.pbuf.type != PBUF_ZEROCOPY); + assert(temp->lwip_pbuf.type != PBUF_ZEROCOPY); buff = temp->p_next_desc; temp->clear_transport_data(); temp->p_next_desc = nullptr; diff --git a/src/core/dev/rfs_uc_tcp_gro.cpp b/src/core/dev/rfs_uc_tcp_gro.cpp index cdf824480..522a6ff63 100644 --- a/src/core/dev/rfs_uc_tcp_gro.cpp +++ b/src/core/dev/rfs_uc_tcp_gro.cpp @@ -159,7 +159,7 @@ bool rfs_uc_tcp_gro::rx_dispatch_packet(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_in cq_stats_t &cq_stats = *m_p_ring_simple->m_p_cq_mgr_rx->m_p_cq_stat; cq_stats.n_rx_gro_packets++; cq_stats.n_rx_gro_frags += 1; - cq_stats.n_rx_gro_bytes += p_rx_pkt_mem_buf_desc_info->lwip_pbuf.pbuf.tot_len; + cq_stats.n_rx_gro_bytes += p_rx_pkt_mem_buf_desc_info->lwip_pbuf.tot_len; return rfs_uc::rx_dispatch_packet(p_rx_pkt_mem_buf_desc_info, pv_fd_ready_array); } @@ -187,13 +187,12 @@ bool rfs_uc_tcp_gro::add_packet(mem_buf_desc_t *mem_buf_desc, void *payload_ptr, mem_buf_desc->reset_ref_count(); - mem_buf_desc->lwip_pbuf.pbuf.len = mem_buf_desc->lwip_pbuf.pbuf.tot_len = - mem_buf_desc->rx.sz_payload; - mem_buf_desc->lwip_pbuf.pbuf.ref = 1; - mem_buf_desc->lwip_pbuf.pbuf.next = nullptr; - mem_buf_desc->lwip_pbuf.pbuf.payload = payload_ptr; + mem_buf_desc->lwip_pbuf.len = mem_buf_desc->lwip_pbuf.tot_len = mem_buf_desc->rx.sz_payload; + mem_buf_desc->lwip_pbuf.ref = 1; + mem_buf_desc->lwip_pbuf.next = nullptr; + mem_buf_desc->lwip_pbuf.payload = payload_ptr; - m_gro_desc.p_last->lwip_pbuf.pbuf.next = &(mem_buf_desc->lwip_pbuf.pbuf); + m_gro_desc.p_last->lwip_pbuf.next = &mem_buf_desc->lwip_pbuf; 
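+    /* Note: the aggregated packet is linked twice: lwip consumes it through the
+     * lwip_pbuf.next chain, while flush_gro_desc() walks the descriptors backwards
+     * via p_prev_desc, so both links are maintained here. */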
m_gro_desc.p_last->p_next_desc = nullptr; mem_buf_desc->p_prev_desc = m_gro_desc.p_last; m_gro_desc.p_last = mem_buf_desc; @@ -232,18 +231,18 @@ void rfs_uc_tcp_gro::flush_gro_desc(void *pv_fd_ready_array) p_tcp_ts_h->popts[2] = m_gro_desc.tsecr; } - m_gro_desc.p_first->lwip_pbuf.pbuf.gro = 1; + m_gro_desc.p_first->lwip_pbuf.gro = 1; - m_gro_desc.p_first->lwip_pbuf.pbuf.tot_len = m_gro_desc.p_first->lwip_pbuf.pbuf.len = + m_gro_desc.p_first->lwip_pbuf.tot_len = m_gro_desc.p_first->lwip_pbuf.len = (m_gro_desc.p_first->sz_data - m_gro_desc.p_first->rx.n_transport_header_len); - m_gro_desc.p_first->lwip_pbuf.pbuf.ref = 1; - m_gro_desc.p_first->lwip_pbuf.pbuf.payload = + m_gro_desc.p_first->lwip_pbuf.ref = 1; + m_gro_desc.p_first->lwip_pbuf.payload = (u8_t *)(m_gro_desc.p_first->p_buffer + m_gro_desc.p_first->rx.n_transport_header_len); m_gro_desc.p_first->rx.is_xlio_thr = m_gro_desc.p_last->rx.is_xlio_thr; for (mem_buf_desc_t *p_desc = m_gro_desc.p_last; p_desc != m_gro_desc.p_first; p_desc = p_desc->p_prev_desc) { - p_desc->p_prev_desc->lwip_pbuf.pbuf.tot_len += p_desc->lwip_pbuf.pbuf.tot_len; + p_desc->p_prev_desc->lwip_pbuf.tot_len += p_desc->lwip_pbuf.tot_len; } } @@ -259,7 +258,7 @@ void rfs_uc_tcp_gro::flush_gro_desc(void *pv_fd_ready_array) cq_stats_t &cq_stats = *m_p_ring_simple->m_p_cq_mgr_rx->m_p_cq_stat; cq_stats.n_rx_gro_packets++; cq_stats.n_rx_gro_frags += m_gro_desc.buf_count; - cq_stats.n_rx_gro_bytes += m_gro_desc.p_first->lwip_pbuf.pbuf.tot_len; + cq_stats.n_rx_gro_bytes += m_gro_desc.p_first->lwip_pbuf.tot_len; if (!rfs_uc::rx_dispatch_packet(m_gro_desc.p_first, pv_fd_ready_array)) { m_p_ring_simple->reclaim_recv_buffers_no_lock(m_gro_desc.p_first); diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index 4c863a2b7..1425af904 100644 --- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -95,8 +95,8 @@ class ring { bool trylock = false) = 0; virtual void mem_buf_rx_release(mem_buf_desc_t *p_mem_buf_desc) { - buffer_pool::free_rx_lwip_pbuf_custom(&p_mem_buf_desc->lwip_pbuf.pbuf); - }; + buffer_pool::free_rx_lwip_pbuf_custom(&p_mem_buf_desc->lwip_pbuf); + } virtual void send_ring_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr) = 0; virtual int send_lwip_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, @@ -107,13 +107,13 @@ class ring { { length = 1; return m_p_n_rx_channel_fds; - }; - virtual int get_tx_channel_fd() const { return -1; }; + } + virtual int get_tx_channel_fd() const { return -1; } virtual bool get_hw_dummy_send_support(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe) = 0; virtual int request_notification(cq_type_t cq_type, uint64_t poll_sn) = 0; virtual bool reclaim_recv_buffers(descq_t *rx_reuse) = 0; virtual bool reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst) = 0; - virtual bool reclaim_recv_buffers_no_lock(mem_buf_desc_t *) { return false; }; + virtual bool reclaim_recv_buffers_no_lock(mem_buf_desc_t *) { return false; } virtual int drain_and_proccess() = 0; virtual int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = nullptr) = 0; @@ -128,8 +128,8 @@ class ring { virtual void inc_tx_retransmissions_stats(ring_user_id_t id) = 0; virtual bool is_member(ring_slave *rng) = 0; virtual bool is_active_member(ring_slave *rng, ring_user_id_t id) = 0; - ring *get_parent() { return m_parent; }; - ring_user_id_t generate_id() { return 0; }; + ring *get_parent() { return m_parent; } + ring_user_id_t generate_id() { return 0; } virtual ring_user_id_t 
generate_id(const address_t src_mac, const address_t dst_mac, uint16_t eth_proto, uint16_t encap_proto, const ip_address &src_ip, const ip_address &dst_ip, @@ -222,12 +222,12 @@ class ring { { NOT_IN_USE(tis); NOT_IN_USE(config); - }; + } virtual void nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) { NOT_IN_USE(tis); NOT_IN_USE(tcp_seqno); - }; + } enum { NVME_CRC_TX = 1 << 0, diff --git a/src/core/dev/ring_bond.cpp b/src/core/dev/ring_bond.cpp index fa7623db5..c6e32d0c9 100644 --- a/src/core/dev/ring_bond.cpp +++ b/src/core/dev/ring_bond.cpp @@ -406,7 +406,7 @@ void ring_bond::mem_buf_rx_release(mem_buf_desc_t *p_mem_buf_desc) } } if (i == m_bond_rings.size()) { - buffer_pool::free_rx_lwip_pbuf_custom(&p_mem_buf_desc->lwip_pbuf.pbuf); + buffer_pool::free_rx_lwip_pbuf_custom(&p_mem_buf_desc->lwip_pbuf); } } diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index f59bbacdb..e2bd02183 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -551,8 +551,7 @@ void ring_simple::mem_buf_desc_return_single_multi_ref(mem_buf_desc_t *p_mem_buf std::lock_guard lock(m_lock_ring_tx); - p_mem_buf_desc->lwip_pbuf.pbuf.ref -= - std::min(p_mem_buf_desc->lwip_pbuf.pbuf.ref, ref - 1); + p_mem_buf_desc->lwip_pbuf.ref -= std::min(p_mem_buf_desc->lwip_pbuf.ref, ref - 1); put_tx_single_buffer(p_mem_buf_desc); } @@ -910,18 +909,18 @@ mem_buf_desc_t *ring_simple::get_tx_buffers(pbuf_type type, uint32_t n_num_mem_b } head = pool.get_and_pop_back(); - head->lwip_pbuf.pbuf.ref = 1; - assert(head->lwip_pbuf.pbuf.type == type); - head->lwip_pbuf.pbuf.type = type; + head->lwip_pbuf.ref = 1; + assert(head->lwip_pbuf.type == type); + head->lwip_pbuf.type = type; n_num_mem_bufs--; mem_buf_desc_t *next = head; while (n_num_mem_bufs) { next->p_next_desc = pool.get_and_pop_back(); next = next->p_next_desc; - next->lwip_pbuf.pbuf.ref = 1; - assert(head->lwip_pbuf.pbuf.type == type); - next->lwip_pbuf.pbuf.type = type; + next->lwip_pbuf.ref = 1; + assert(head->lwip_pbuf.type == type); + next->lwip_pbuf.type = type; n_num_mem_bufs--; } next->p_next_desc = nullptr; @@ -958,14 +957,14 @@ int ring_simple::put_tx_buffer_helper(mem_buf_desc_t *buff) // Potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & // sockinfo_tcp by tcp lock - if (likely(buff->lwip_pbuf.pbuf.ref)) { - buff->lwip_pbuf.pbuf.ref--; + if (likely(buff->lwip_pbuf.ref)) { + buff->lwip_pbuf.ref--; } else { ring_logerr("ref count of %p is already zero, double free??", buff); } - if (buff->lwip_pbuf.pbuf.ref == 0) { - descq_t &pool = buff->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY ? m_zc_pool : m_tx_pool; + if (buff->lwip_pbuf.ref == 0) { + descq_t &pool = buff->lwip_pbuf.type == PBUF_ZEROCOPY ? 
m_zc_pool : m_tx_pool; buff->p_next_desc = nullptr; free_lwip_pbuf(&buff->lwip_pbuf); pool.push_back(buff); diff --git a/src/core/dev/ring_tap.cpp b/src/core/dev/ring_tap.cpp index aebf474b7..d7d0b7471 100644 --- a/src/core/dev/ring_tap.cpp +++ b/src/core/dev/ring_tap.cpp @@ -489,14 +489,14 @@ mem_buf_desc_t *ring_tap::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_t } head = m_tx_pool.get_and_pop_back(); - head->lwip_pbuf.pbuf.ref = 1; + head->lwip_pbuf.ref = 1; n_num_mem_bufs--; mem_buf_desc_t *next = head; while (n_num_mem_bufs) { next->p_next_desc = m_tx_pool.get_and_pop_back(); next = next->p_next_desc; - next->lwip_pbuf.pbuf.ref = 1; + next->lwip_pbuf.ref = 1; n_num_mem_bufs--; } @@ -520,15 +520,15 @@ void ring_tap::mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t *p_mem_buf_ if (likely(p_mem_buf_desc)) { // potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & // sockinfo_tcp by tcp lock - if (likely(p_mem_buf_desc->lwip_pbuf.pbuf.ref)) { - p_mem_buf_desc->lwip_pbuf.pbuf.ref--; + if (likely(p_mem_buf_desc->lwip_pbuf.ref)) { + p_mem_buf_desc->lwip_pbuf.ref--; } else { ring_logerr("ref count of %p is already zero, double free??", p_mem_buf_desc); } - if (p_mem_buf_desc->lwip_pbuf.pbuf.ref == 0) { + if (p_mem_buf_desc->lwip_pbuf.ref == 0) { p_mem_buf_desc->p_next_desc = nullptr; - if (unlikely(p_mem_buf_desc->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY)) { + if (unlikely(p_mem_buf_desc->lwip_pbuf.type == PBUF_ZEROCOPY)) { g_buffer_pool_zc->put_buffers_thread_safe(p_mem_buf_desc); return; } @@ -547,8 +547,7 @@ void ring_tap::mem_buf_desc_return_single_multi_ref(mem_buf_desc_t *p_mem_buf_de } m_lock_ring_tx.lock(); - p_mem_buf_desc->lwip_pbuf.pbuf.ref -= - std::min(p_mem_buf_desc->lwip_pbuf.pbuf.ref, ref - 1); + p_mem_buf_desc->lwip_pbuf.ref -= std::min(p_mem_buf_desc->lwip_pbuf.ref, ref - 1); m_lock_ring_tx.unlock(); mem_buf_desc_return_single_to_owner_tx(p_mem_buf_desc); } @@ -572,13 +571,13 @@ int ring_tap::mem_buf_tx_release(mem_buf_desc_t *buff_list, bool b_accounting, b // potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & // sockinfo_tcp by tcp lock - if (likely(buff_list->lwip_pbuf.pbuf.ref)) { - buff_list->lwip_pbuf.pbuf.ref--; + if (likely(buff_list->lwip_pbuf.ref)) { + buff_list->lwip_pbuf.ref--; } else { ring_logerr("ref count of %p is already zero, double free??", buff_list); } - if (buff_list->lwip_pbuf.pbuf.ref == 0) { + if (buff_list->lwip_pbuf.ref == 0) { free_lwip_pbuf(&buff_list->lwip_pbuf); m_tx_pool.push_back(buff_list); freed++; diff --git a/src/core/lwip/pbuf.h b/src/core/lwip/pbuf.h index ef1d14cbb..683d3febc 100644 --- a/src/core/lwip/pbuf.h +++ b/src/core/lwip/pbuf.h @@ -110,15 +110,6 @@ struct pbuf { pbuf_desc desc; }; -/** Prototype for a function to free a custom pbuf */ -typedef void (*pbuf_free_custom_fn)(struct pbuf *p); - -/** A custom pbuf: like a pbuf, but following a function pointer to free it. 
*/ -struct pbuf_custom { - /** The actual pbuf */ - struct pbuf pbuf; -}; - void pbuf_realloc(struct pbuf *p, u32_t size); u8_t pbuf_header(struct pbuf *p, s32_t header_size); void pbuf_ref(struct pbuf *p); diff --git a/src/core/proto/dst_entry_tcp.cpp b/src/core/proto/dst_entry_tcp.cpp index a1df9fb3f..528fcf37b 100644 --- a/src/core/proto/dst_entry_tcp.cpp +++ b/src/core/proto/dst_entry_tcp.cpp @@ -177,7 +177,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ p_tcp_iov[0].iovec.iov_len = total_packet_len; } - if (unlikely(p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.ref > 1)) { + if (unlikely(p_tcp_iov[0].p_desc->lwip_pbuf.ref > 1)) { /* * First buffer in the vector is used for reference counting. * The reference is released after completion depending on @@ -195,16 +195,16 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ * * We don't change data, only pointer to buffer descriptor. */ - pbuf_type type = (pbuf_type)p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.type; + pbuf_type type = (pbuf_type)p_tcp_iov[0].p_desc->lwip_pbuf.type; mem_buf_desc_t *p_mem_buf_desc = - get_buffer(type, &(p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.desc), + get_buffer(type, &(p_tcp_iov[0].p_desc->lwip_pbuf.desc), is_set(attr.flags, XLIO_TX_PACKET_BLOCK)); if (!p_mem_buf_desc) { return -1; } p_tcp_iov[0].p_desc = p_mem_buf_desc; } else { - p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.ref++; + p_tcp_iov[0].p_desc->lwip_pbuf.ref++; } /* save pointers to ip and tcp headers for software checksum calculation */ @@ -224,7 +224,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ m_sge[i].length = p_tcp_iov[i].iovec.iov_len; if (is_zerocopy) { auto *p_desc = p_tcp_iov[i].p_desc; - auto &pbuf_descriptor = p_desc->lwip_pbuf.pbuf.desc; + auto &pbuf_descriptor = p_desc->lwip_pbuf.desc; if (PBUF_DESC_EXPRESS == pbuf_descriptor.attr) { m_sge[i].lkey = pbuf_descriptor.mkey; } else if (PBUF_DESC_MKEY == pbuf_descriptor.attr) { @@ -399,22 +399,20 @@ mem_buf_desc_t *dst_entry_tcp::get_buffer(pbuf_type type, pbuf_desc *desc, // for TX, set lwip payload to the data segment. // lwip will send it with payload pointing to the tcp header. 
if (p_mem_buf_desc->p_buffer) { - p_mem_buf_desc->lwip_pbuf.pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + + p_mem_buf_desc->lwip_pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + m_header->m_aligned_l2_l3_len + sizeof(struct tcphdr); } else { - p_mem_buf_desc->lwip_pbuf.pbuf.payload = nullptr; + p_mem_buf_desc->lwip_pbuf.payload = nullptr; } /* Initialize pbuf description */ - memset(&p_mem_buf_desc->lwip_pbuf.pbuf.desc, 0, - sizeof(p_mem_buf_desc->lwip_pbuf.pbuf.desc)); - p_mem_buf_desc->lwip_pbuf.pbuf.desc.attr = PBUF_DESC_NONE; + memset(&p_mem_buf_desc->lwip_pbuf.desc, 0, sizeof(p_mem_buf_desc->lwip_pbuf.desc)); + p_mem_buf_desc->lwip_pbuf.desc.attr = PBUF_DESC_NONE; if (desc) { - memcpy(&p_mem_buf_desc->lwip_pbuf.pbuf.desc, desc, - sizeof(p_mem_buf_desc->lwip_pbuf.pbuf.desc)); - if (p_mem_buf_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_MDESC || - p_mem_buf_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_NVME_TX) { - mem_desc *mdesc = (mem_desc *)p_mem_buf_desc->lwip_pbuf.pbuf.desc.mdesc; + memcpy(&p_mem_buf_desc->lwip_pbuf.desc, desc, sizeof(p_mem_buf_desc->lwip_pbuf.desc)); + if (p_mem_buf_desc->lwip_pbuf.desc.attr == PBUF_DESC_MDESC || + p_mem_buf_desc->lwip_pbuf.desc.attr == PBUF_DESC_NVME_TX) { + mem_desc *mdesc = (mem_desc *)p_mem_buf_desc->lwip_pbuf.desc.mdesc; mdesc->get(); } } @@ -439,26 +437,26 @@ void dst_entry_tcp::put_buffer(mem_buf_desc_t *p_desc) } else { // potential race, ref is protected here by tcp lock, and in ring by ring_tx lock - if (likely(p_desc->lwip_pbuf.pbuf.ref)) { - p_desc->lwip_pbuf.pbuf.ref--; + if (likely(p_desc->lwip_pbuf.ref)) { + p_desc->lwip_pbuf.ref--; } else { dst_tcp_logerr("ref count of %p is already zero, double free??", p_desc); } - if (p_desc->lwip_pbuf.pbuf.ref == 0) { + if (p_desc->lwip_pbuf.ref == 0) { p_desc->p_next_desc = nullptr; - buffer_pool::free_tx_lwip_pbuf_custom(&p_desc->lwip_pbuf.pbuf); + buffer_pool::free_tx_lwip_pbuf_custom(&p_desc->lwip_pbuf); } } } void dst_entry_tcp::put_zc_buffer(mem_buf_desc_t *p_desc) { - if (likely(p_desc->lwip_pbuf.pbuf.ref <= 1)) { - p_desc->lwip_pbuf.pbuf.ref = 1; + if (likely(p_desc->lwip_pbuf.ref <= 1)) { + p_desc->lwip_pbuf.ref = 1; p_desc->p_next_desc = m_p_zc_mem_buf_desc_list; m_p_zc_mem_buf_desc_list = p_desc; } else { - p_desc->lwip_pbuf.pbuf.ref--; + p_desc->lwip_pbuf.ref--; } } diff --git a/src/core/proto/mem_buf_desc.h b/src/core/proto/mem_buf_desc.h index db274b9b1..c52313c5e 100644 --- a/src/core/proto/mem_buf_desc.h +++ b/src/core/proto/mem_buf_desc.h @@ -79,13 +79,12 @@ class mem_buf_desc_t { , p_desc_owner(nullptr) , unused_padding(0) { - memset(&lwip_pbuf, 0, sizeof(lwip_pbuf)); clear_transport_data(); memset(&ee, 0, sizeof(ee)); reset_ref_count(); - lwip_pbuf.pbuf.type = type; + lwip_pbuf.type = type; } // Copy constructor for the clone() method. 
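A short aside on the invariant behind this hunk (a sketch, not part of the patch): because
lwip_pbuf is the first member of mem_buf_desc_t, code such as free_lwip_pbuf() above can
recover the owning descriptor from a bare pbuf pointer with a plain cast. A hypothetical
helper making the assumption explicit could look like:

    static inline mem_buf_desc_t *desc_from_pbuf(struct pbuf *p)
    {
        /* Valid only while lwip_pbuf stays the first member of mem_buf_desc_t. */
        return reinterpret_cast<mem_buf_desc_t *>(p);
    }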
@@ -118,19 +117,19 @@ class mem_buf_desc_t { inline int inc_ref_count() { return atomic_fetch_and_inc(&n_ref_count); } inline int dec_ref_count() { return atomic_fetch_and_dec(&n_ref_count); } inline int add_ref_count(int x) { return atomic_fetch_add_relaxed(x, &n_ref_count); } - inline unsigned int lwip_pbuf_get_ref_count() const { return lwip_pbuf.pbuf.ref; } - inline unsigned int lwip_pbuf_inc_ref_count() { return ++lwip_pbuf.pbuf.ref; } + inline unsigned int lwip_pbuf_get_ref_count() const { return lwip_pbuf.ref; } + inline unsigned int lwip_pbuf_inc_ref_count() { return ++lwip_pbuf.ref; } inline unsigned int lwip_pbuf_dec_ref_count() { - if (likely(lwip_pbuf.pbuf.ref)) { - --lwip_pbuf.pbuf.ref; + if (likely(lwip_pbuf.ref)) { + --lwip_pbuf.ref; } - return lwip_pbuf.pbuf.ref; + return lwip_pbuf.ref; } public: /* This field must be first in this class. It encapsulates pbuf structure from lwip */ - struct pbuf_custom lwip_pbuf; + struct pbuf lwip_pbuf; uint8_t *p_buffer; static inline size_t buffer_node_offset(void) diff --git a/src/core/proto/neighbour.cpp b/src/core/proto/neighbour.cpp index d152aec92..21bad8bae 100644 --- a/src/core/proto/neighbour.cpp +++ b/src/core/proto/neighbour.cpp @@ -678,7 +678,7 @@ bool neigh_entry::post_send_tcp(neigh_send_data *p_data) } BULLSEYE_EXCLUDE_BLOCK_END - p_mem_buf_desc->lwip_pbuf.pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + h->m_total_hdr_len; + p_mem_buf_desc->lwip_pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + h->m_total_hdr_len; p_mem_buf_desc->p_next_desc = nullptr; // copy L4 neigh buffer to tx buffer @@ -712,9 +712,9 @@ bool neigh_entry::post_send_tcp(neigh_send_data *p_data) neigh_logerr("p_buffer - addr=%d, m_total_hdr_len=%u, p_buffer=%p, type=%d, len=%d, " "tot_len=%d, payload=%p, hdr_alignment_diff=%zd\n", (int)(p_mem_buf_desc->p_buffer - (uint8_t *)m_sge.addr), h->m_total_hdr_len, - p_mem_buf_desc->p_buffer, p_mem_buf_desc->lwip_pbuf.pbuf.type, - p_mem_buf_desc->lwip_pbuf.pbuf.len, p_mem_buf_desc->lwip_pbuf.pbuf.tot_len, - p_mem_buf_desc->lwip_pbuf.pbuf.payload, hdr_alignment_diff); + p_mem_buf_desc->p_buffer, p_mem_buf_desc->lwip_pbuf.type, + p_mem_buf_desc->lwip_pbuf.len, p_mem_buf_desc->lwip_pbuf.tot_len, + p_mem_buf_desc->lwip_pbuf.payload, hdr_alignment_diff); } m_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc; diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 5990dc2c8..d1b7aaef2 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -168,15 +168,14 @@ static lock_base *get_new_tcp_lock() inline void sockinfo_tcp::lwip_pbuf_init_custom(mem_buf_desc_t *p_desc) { - if (!p_desc->lwip_pbuf.pbuf.gro) { - p_desc->lwip_pbuf.pbuf.len = p_desc->lwip_pbuf.pbuf.tot_len = + if (!p_desc->lwip_pbuf.gro) { + p_desc->lwip_pbuf.len = p_desc->lwip_pbuf.tot_len = (p_desc->sz_data - p_desc->rx.n_transport_header_len); - p_desc->lwip_pbuf.pbuf.ref = 1; - p_desc->lwip_pbuf.pbuf.next = nullptr; - p_desc->lwip_pbuf.pbuf.payload = - (u8_t *)p_desc->p_buffer + p_desc->rx.n_transport_header_len; + p_desc->lwip_pbuf.ref = 1; + p_desc->lwip_pbuf.next = nullptr; + p_desc->lwip_pbuf.payload = (u8_t *)p_desc->p_buffer + p_desc->rx.n_transport_header_len; } - p_desc->lwip_pbuf.pbuf.gro = 0; + p_desc->lwip_pbuf.gro = 0; } /* change default rx_wait impl to flow based one */ @@ -229,26 +228,26 @@ inline void sockinfo_tcp::return_pending_tx_buffs() inline void sockinfo_tcp::reuse_buffer(mem_buf_desc_t *buff) { /* Special case when ZC buffers are used in RX path. 
*/ - if (buff->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY) { + if (buff->lwip_pbuf.type == PBUF_ZEROCOPY) { dst_entry_tcp *p_dst = (dst_entry_tcp *)(m_p_connected_dst_entry); - mem_buf_desc_t *underlying = - reinterpret_cast(buff->lwip_pbuf.pbuf.desc.mdesc); + mem_buf_desc_t *underlying = reinterpret_cast(buff->lwip_pbuf.desc.mdesc); - buff->lwip_pbuf.pbuf.desc.mdesc = nullptr; + buff->lwip_pbuf.desc.mdesc = nullptr; if (likely(p_dst)) { p_dst->put_zc_buffer(buff); } else { g_buffer_pool_zc->put_buffers_thread_safe(buff); } - if (underlying->lwip_pbuf.pbuf.ref > 1) { - --underlying->lwip_pbuf.pbuf.ref; + if (underlying->lwip_pbuf.ref > 1) { + --underlying->lwip_pbuf.ref; return; } /* Continue and release the underlying buffer. */ buff = underlying; - buff->lwip_pbuf.pbuf.ref = 1; - buff->lwip_pbuf.pbuf.next = nullptr; + + buff->lwip_pbuf.ref = 1; + buff->lwip_pbuf.next = nullptr; buff->p_next_desc = nullptr; } @@ -5277,20 +5276,18 @@ mem_buf_desc_t *sockinfo_tcp::get_next_desc(mem_buf_desc_t *p_desc) m_n_rx_pkt_ready_list_count--; if (p_desc->p_next_desc) { - // vlog_printf(VLOG_ERROR, "detected chained pbufs! REF %u\n", - // p_desc->lwip_pbuf.pbuf.ref); mem_buf_desc_t *prev = p_desc; p_desc = p_desc->p_next_desc; - prev->rx.sz_payload = prev->lwip_pbuf.pbuf.len; - p_desc->rx.sz_payload = p_desc->lwip_pbuf.pbuf.tot_len = - prev->lwip_pbuf.pbuf.tot_len - prev->lwip_pbuf.pbuf.len; + prev->rx.sz_payload = prev->lwip_pbuf.len; + p_desc->rx.sz_payload = p_desc->lwip_pbuf.tot_len = + prev->lwip_pbuf.tot_len - prev->lwip_pbuf.len; p_desc->rx.n_frags = --prev->rx.n_frags; p_desc->rx.src = prev->rx.src; p_desc->inc_ref_count(); m_rx_pkt_ready_list.push_front(p_desc); m_n_rx_pkt_ready_list_count++; m_p_socket_stats->n_rx_ready_pkt_count++; - prev->lwip_pbuf.pbuf.next = nullptr; + prev->lwip_pbuf.next = nullptr; prev->p_next_desc = nullptr; prev->rx.n_frags = 1; reuse_buffer(prev); @@ -5377,20 +5374,21 @@ int sockinfo_tcp::zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags if (len < 0 && p_desc_iter) { // Update length of right side of chain after split - push to pkt_ready_list - p_desc_iter->rx.sz_payload = p_desc_iter->lwip_pbuf.pbuf.tot_len = - prev->lwip_pbuf.pbuf.tot_len - prev->lwip_pbuf.pbuf.len; + p_desc_iter->rx.sz_payload = p_desc_iter->lwip_pbuf.tot_len = + prev->lwip_pbuf.tot_len - prev->lwip_pbuf.len; // Update length of left side of chain after split - return to app mem_buf_desc_t *p_desc_head = reinterpret_cast(p_pkts->packet_id); // XXX TODO: subsequent buffers are not updated - p_desc_head->lwip_pbuf.pbuf.tot_len = p_desc_head->rx.sz_payload -= + p_desc_head->lwip_pbuf.tot_len = p_desc_head->rx.sz_payload -= p_desc_iter->rx.sz_payload; p_desc_iter->rx.n_frags = p_desc_head->rx.n_frags - p_pkts->sz_iov; p_desc_head->rx.n_frags = p_pkts->sz_iov; p_desc_iter->rx.src = prev->rx.src; p_desc_iter->inc_ref_count(); - prev->lwip_pbuf.pbuf.next = nullptr; + + prev->lwip_pbuf.next = nullptr; prev->p_next_desc = nullptr; m_rx_pkt_ready_list.push_front(p_desc_iter); @@ -5675,8 +5673,8 @@ struct pbuf *sockinfo_tcp::tcp_tx_pbuf_alloc(void *p_conn, pbuf_type type, pbuf_ if (likely(p_dst)) { p_desc = p_dst->get_buffer(type, desc); } - if (likely(p_desc) && p_desc->lwip_pbuf.pbuf.type == PBUF_ZEROCOPY) { - if (p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_EXPRESS) { + if (likely(p_desc) && p_desc->lwip_pbuf.type == PBUF_ZEROCOPY) { + if (p_desc->lwip_pbuf.desc.attr == PBUF_DESC_EXPRESS) { p_desc->m_flags |= mem_buf_desc_t::ZCOPY; p_desc->tx.zc.callback = tcp_express_zc_callback; if (p_buff) { 
@@ -5685,9 +5683,9 @@ struct pbuf *sockinfo_tcp::tcp_tx_pbuf_alloc(void *p_conn, pbuf_type type, pbuf_ } else { p_desc->tx.zc.ctx = reinterpret_cast(p_si_tcp); } - } else if ((p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_NONE) || - (p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_MKEY) || - (p_desc->lwip_pbuf.pbuf.desc.attr == PBUF_DESC_NVME_TX)) { + } else if ((p_desc->lwip_pbuf.desc.attr == PBUF_DESC_NONE) || + (p_desc->lwip_pbuf.desc.attr == PBUF_DESC_MKEY) || + (p_desc->lwip_pbuf.desc.attr == PBUF_DESC_NVME_TX)) { /* Prepare error queue fields for send zerocopy */ if (p_buff) { /* It is a special case that can happen as a result @@ -5697,7 +5695,7 @@ struct pbuf *sockinfo_tcp::tcp_tx_pbuf_alloc(void *p_conn, pbuf_type type, pbuf_ p_desc->m_flags |= mem_buf_desc_t::ZCOPY; p_desc->tx.zc.id = p_prev_desc->tx.zc.id; p_desc->tx.zc.count = p_prev_desc->tx.zc.count; - p_desc->tx.zc.len = p_desc->lwip_pbuf.pbuf.len; + p_desc->tx.zc.len = p_desc->lwip_pbuf.len; p_desc->tx.zc.ctx = p_prev_desc->tx.zc.ctx; p_desc->tx.zc.callback = tcp_tx_zc_callback; p_prev_desc->tx.zc.count = 0; @@ -5741,7 +5739,7 @@ void sockinfo_tcp::tcp_tx_pbuf_free(void *p_conn, struct pbuf *p_buff) __log_err("ref count of %p is already zero, double free??", p_desc); } - if (p_desc->lwip_pbuf.pbuf.ref == 0) { + if (p_desc->lwip_pbuf.ref == 0) { p_desc->p_next_desc = nullptr; buffer_pool::free_tx_lwip_pbuf_custom(p_buff); } @@ -5753,13 +5751,13 @@ mem_buf_desc_t *sockinfo_tcp::tcp_tx_zc_alloc(mem_buf_desc_t *p_desc) p_desc->m_flags |= mem_buf_desc_t::ZCOPY; p_desc->tx.zc.id = atomic_read(&m_zckey); p_desc->tx.zc.count = 1; - p_desc->tx.zc.len = p_desc->lwip_pbuf.pbuf.len; + p_desc->tx.zc.len = p_desc->lwip_pbuf.len; p_desc->tx.zc.ctx = (void *)this; p_desc->tx.zc.callback = tcp_tx_zc_callback; - if (m_last_zcdesc && (m_last_zcdesc != p_desc) && (m_last_zcdesc->lwip_pbuf.pbuf.ref > 0) && + if (m_last_zcdesc && (m_last_zcdesc != p_desc) && (m_last_zcdesc->lwip_pbuf.ref > 0) && (m_last_zcdesc->tx.zc.id == p_desc->tx.zc.id)) { - m_last_zcdesc->tx.zc.len = m_last_zcdesc->lwip_pbuf.pbuf.len; + m_last_zcdesc->tx.zc.len = m_last_zcdesc->lwip_pbuf.len; m_last_zcdesc->tx.zc.count = 0; } m_last_zcdesc = p_desc; @@ -5771,7 +5769,7 @@ mem_buf_desc_t *sockinfo_tcp::tcp_tx_zc_alloc(mem_buf_desc_t *p_desc) void sockinfo_tcp::tcp_express_zc_callback(mem_buf_desc_t *p_desc) { sockinfo_tcp *si = reinterpret_cast(p_desc->tx.zc.ctx); - const uintptr_t opaque_op = reinterpret_cast(p_desc->lwip_pbuf.pbuf.desc.opaque); + const uintptr_t opaque_op = reinterpret_cast(p_desc->lwip_pbuf.desc.opaque); if (opaque_op && si->m_p_group && si->m_p_group->m_socket_comp_cb) { si->m_p_group->m_socket_comp_cb(reinterpret_cast(si), diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp index 90ae74286..24eee400a 100644 --- a/src/core/sock/sockinfo_ulp.cpp +++ b/src/core/sock/sockinfo_ulp.cpp @@ -419,9 +419,9 @@ sockinfo_tcp_ops_tls::~sockinfo_tcp_ops_tls() * users. Note, we are under TCP connection lock here. 
*/ mem_buf_desc_t *pdesc = m_rx_bufs.front(); - if (pdesc->lwip_pbuf.pbuf.ref > 1) { + if (pdesc->lwip_pbuf.ref > 1) { m_rx_bufs.pop_front(); - pbuf_free(&pdesc->lwip_pbuf.pbuf); + pbuf_free(&pdesc->lwip_pbuf); } while (!m_rx_bufs.empty()) { pdesc = m_rx_bufs.get_and_pop_front(); @@ -452,7 +452,7 @@ void sockinfo_tcp_ops_tls::get_record_buf(mem_buf_desc_t *&buf, uint8_t *&data, m_zc_stor = m_p_sock->tcp_tx_mem_buf_alloc(PBUF_RAM); m_zc_stor_offset = 0; if (likely(m_zc_stor)) { - m_zc_stor->lwip_pbuf.pbuf.ref += m_zc_stor->sz_buffer / TLS_ZC_BLOCK; + m_zc_stor->lwip_pbuf.ref += m_zc_stor->sz_buffer / TLS_ZC_BLOCK; } } buf = m_zc_stor; @@ -688,7 +688,7 @@ err_t sockinfo_tcp_ops_tls::tls_rx_consume_ready_packets() mem_buf_desc_t *temp; temp = descs_rx_ready.front(); descs_rx_ready.pop_front(); - ret = recv(&temp->lwip_pbuf.pbuf); + ret = recv(&temp->lwip_pbuf); if (unlikely(ERR_OK != ret)) { break; } @@ -1051,18 +1051,18 @@ void sockinfo_tcp_ops_tls::copy_by_offset(uint8_t *dst, uint32_t offset, uint32_ mem_buf_desc_t *pdesc = *iter; /* Skip leading buffers */ - if (unlikely(pdesc->lwip_pbuf.pbuf.len <= offset)) { - while (pdesc && pdesc->lwip_pbuf.pbuf.len <= offset) { - offset -= pdesc->lwip_pbuf.pbuf.len; + if (unlikely(pdesc->lwip_pbuf.len <= offset)) { + while (pdesc && pdesc->lwip_pbuf.len <= offset) { + offset -= pdesc->lwip_pbuf.len; pdesc = *(++iter); } } /* Copy */ while (likely(pdesc) && len > 0) { - uint32_t buflen = std::min(pdesc->lwip_pbuf.pbuf.len - offset, len); + uint32_t buflen = std::min(pdesc->lwip_pbuf.len - offset, len); - memcpy(dst, (uint8_t *)pdesc->lwip_pbuf.pbuf.payload + offset, buflen); + memcpy(dst, (uint8_t *)pdesc->lwip_pbuf.payload + offset, buflen); len -= buflen; dst += buflen; offset = 0; @@ -1079,24 +1079,24 @@ uint16_t sockinfo_tcp_ops_tls::offset_to_host16(uint32_t offset) uint16_t res = 0; /* Skip leading buffers */ - if (unlikely(pdesc->lwip_pbuf.pbuf.len <= offset)) { - while (pdesc && pdesc->lwip_pbuf.pbuf.len <= offset) { - offset -= pdesc->lwip_pbuf.pbuf.len; + if (unlikely(pdesc->lwip_pbuf.len <= offset)) { + while (pdesc && pdesc->lwip_pbuf.len <= offset) { + offset -= pdesc->lwip_pbuf.len; pdesc = *(++iter); } } if (likely(pdesc)) { - res = (uint16_t)((uint8_t *)pdesc->lwip_pbuf.pbuf.payload)[offset] << 8U; + res = (uint16_t)((uint8_t *)pdesc->lwip_pbuf.payload)[offset] << 8U; ++offset; - if (unlikely(offset >= pdesc->lwip_pbuf.pbuf.len)) { + if (unlikely(offset >= pdesc->lwip_pbuf.len)) { offset = 0; pdesc = *(++iter); if (unlikely(!pdesc)) { return 0; } } - res |= (uint16_t)((uint8_t *)pdesc->lwip_pbuf.pbuf.payload)[offset]; + res |= (uint16_t)((uint8_t *)pdesc->lwip_pbuf.payload)[offset]; } return res; } @@ -1289,11 +1289,11 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) m_p_tx_ring->credits_get(SQ_CREDITS_TLS_RX_GET_PSV)) { /* If we fail to request credits we will retry resync flow with the next incoming packet. 
*/ m_rx_psv_buf = m_p_sock->tcp_tx_mem_buf_alloc(PBUF_RAM); - m_rx_psv_buf->lwip_pbuf.pbuf.payload = + m_rx_psv_buf->lwip_pbuf.payload = (void *)(((uintptr_t)m_rx_psv_buf->p_buffer + 63U) >> 6U << 6U); - uint8_t *payload = (uint8_t *)m_rx_psv_buf->lwip_pbuf.pbuf.payload; + uint8_t *payload = (uint8_t *)m_rx_psv_buf->lwip_pbuf.payload; if (likely(m_rx_psv_buf->sz_buffer >= (size_t)(payload - m_rx_psv_buf->p_buffer + 64))) { - memset(m_rx_psv_buf->lwip_pbuf.pbuf.payload, 0, 64); + memset(m_rx_psv_buf->lwip_pbuf.payload, 0, 64); m_rx_resync_recno = m_next_recno_rx; m_p_tx_ring->tls_get_progress_params_rx(m_p_tir, payload, LKEY_TX_DEFAULT); ++m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_resync; @@ -1348,7 +1348,7 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) uint8_t tls_decrypted = 0; mem_buf_desc_t *pdesc = *iter; - tls_type = ((uint8_t *)pdesc->lwip_pbuf.pbuf.payload)[m_rx_offset]; + tls_type = ((uint8_t *)pdesc->lwip_pbuf.payload)[m_rx_offset]; if (is_rx_tls13()) { /* TLS 1.3 sends record type as the last byte of the payload. */ ++remain; @@ -1363,7 +1363,7 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) break; } - pi = &pdesc->lwip_pbuf.pbuf; + pi = &pdesc->lwip_pbuf; if (pi->len <= offset) { offset -= pi->len; goto next_buffer; @@ -1500,18 +1500,18 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) break; } pdesc = m_rx_bufs.front(); - if (pdesc->lwip_pbuf.pbuf.len > (m_rx_rec_len + m_rx_offset)) { + if (pdesc->lwip_pbuf.len > (m_rx_rec_len + m_rx_offset)) { break; } m_rx_bufs.pop_front(); - m_rx_rec_len -= pdesc->lwip_pbuf.pbuf.len - m_rx_offset; - m_rx_rec_rcvd -= pdesc->lwip_pbuf.pbuf.len - m_rx_offset; + m_rx_rec_len -= pdesc->lwip_pbuf.len - m_rx_offset; + m_rx_rec_rcvd -= pdesc->lwip_pbuf.len - m_rx_offset; m_rx_offset = 0; /* * pbuf_free() is slow when it actually frees a buffer, however, * we expect to only reduce ref counter with this call. */ - pbuf_free(&pdesc->lwip_pbuf.pbuf); + pbuf_free(&pdesc->lwip_pbuf); } m_rx_offset += m_rx_rec_len; m_rx_rec_rcvd -= m_rx_rec_len; @@ -1556,7 +1556,7 @@ void sockinfo_tcp_ops_tls::rx_comp_callback(void *arg) if (utls->m_rx_psv_buf) { /* Resync flow, GET_PSV is completed. */ struct xlio_tls_progress_params *params = - (struct xlio_tls_progress_params *)utls->m_rx_psv_buf->lwip_pbuf.pbuf.payload; + (struct xlio_tls_progress_params *)utls->m_rx_psv_buf->lwip_pbuf.payload; uint32_t resync_seqno = be32toh(params->hw_resync_tcp_sn); int tracker = params->state >> 6U; int auth = (params->state >> 4U) & 0x3U; From fea6e8225650ae6b6a90b3557d7dde9fa929e6cb Mon Sep 17 00:00:00 2001 From: Gal Noam Date: Mon, 11 Mar 2024 23:19:41 +0200 Subject: [PATCH 102/169] version: 3.30.2 Signed-off-by: Gal Noam --- CHANGES | 14 ++++++++++++++ configure.ac | 2 +- contrib/scripts/libxlio.spec.in | 4 ++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/CHANGES b/CHANGES index 174e569f7..1c73eb04d 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,17 @@ +Version 3.30.2-1: +Date + Time 2024-03-11 +============================================================= +Added: + - RM #3792777 provide new storage API headers + - RM #3792789 provide new storage API implementation, integration level + - RM #3770816 Modernize C++ source code + - RM #3813802 Fix warnings from newer cppcheck version + - RM #3795922 Remove pbuf_split_64k() and refused_data + +Fixed: + - RM #3781322 higher CPU loads when loaded with Nginx responding to http requests of high payloads. 
+ - RM #3792731 False positive Walloc-size-larger-than warning + Version 3.30.1-1: Date + Time 2024-02-22 ============================================================= diff --git a/configure.ac b/configure.ac index 04268f9e6..d783786f4 100644 --- a/configure.ac +++ b/configure.ac @@ -14,7 +14,7 @@ dnl===-----------------------------------------------------------------------=== # define([prj_ver_major], 3) define([prj_ver_minor], 30) -define([prj_ver_revision], 1) +define([prj_ver_revision], 2) define([prj_ver_release], esyscmd([echo ${PRJ_RELEASE:=0}])) diff --git a/contrib/scripts/libxlio.spec.in b/contrib/scripts/libxlio.spec.in index ee50afee3..e948280ee 100644 --- a/contrib/scripts/libxlio.spec.in +++ b/contrib/scripts/libxlio.spec.in @@ -188,7 +188,7 @@ fi %{_mandir}/man8/xlio_stats.* %changelog -* Thu Feb 22 2024 NVIDIA CORPORATION 3.30.1-1 -- Bump version to 3.30.1 +* Mon Mar 11 2024 NVIDIA CORPORATION 3.30.2-1 +- Bump version to 3.30.2 - Please refer to CHANGES for full changelog. From 1d23a9d15a0409a630d18251be876c297370cecd Mon Sep 17 00:00:00 2001 From: Viacheslav Login Date: Thu, 7 Mar 2024 13:56:34 +0200 Subject: [PATCH 103/169] issue: HPCINFRA-1321 add Dockerfile for static tests Signed-off-by: Viacheslav Login --- .ci/dockerfiles/Dockerfile.rhel8.6 | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .ci/dockerfiles/Dockerfile.rhel8.6 diff --git a/.ci/dockerfiles/Dockerfile.rhel8.6 b/.ci/dockerfiles/Dockerfile.rhel8.6 new file mode 100644 index 000000000..679896211 --- /dev/null +++ b/.ci/dockerfiles/Dockerfile.rhel8.6 @@ -0,0 +1,26 @@ +FROM harbor.mellanox.com/hpcx/x86_64/rhel8.6/core:latest +ARG _UID=6213 +ARG _GID=101 +ARG _LOGIN=swx-jenkins +ARG _HOME=/var/home/$_LOGIN + +RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm \ + && yum install -y cppcheck \ + && yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm \ + && yum install -y csbuild clang-tools-extra sudo curl autoconf automake make libtool \ + libnl3-devel libnl3 rdma-core-devel rdma-core bc \ + && yum clean all + +RUN pip3 install -U pip --no-cache-dir \ + && pip3 install compiledb --no-cache-dir + +RUN echo "${_LOGIN} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \ + echo "root ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \ + mkdir -p ${_HOME} && \ + groupadd -f -g "$_GID" "$_LOGIN" && \ + useradd -u "$_UID" -g "$_GID" -s /bin/bash -m -d ${_HOME} "${_LOGIN}" && \ + chown -R ${_LOGIN} ${_HOME} && \ + mkdir /build && chown -R ${_LOGIN} /build + +USER "$_LOGIN" +ENTRYPOINT [ "/bin/bash", "--login", "--rcfile", "/etc/bashrc", "-c" ] From 6e9a0054bfbe8af5485e8fd4da9c20b124962ef3 Mon Sep 17 00:00:00 2001 From: Viacheslav Login Date: Tue, 5 Mar 2024 15:20:32 +0200 Subject: [PATCH 104/169] issue: HPCINFRA-1321 Switch cppcheck to a docker Signed-off-by: Viacheslav Login --- .ci/matrix_job.yaml | 21 ++++++++++++++------- contrib/jenkins_tests/cppcheck.sh | 16 ---------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/.ci/matrix_job.yaml b/.ci/matrix_job.yaml index e70562c22..78557bc29 100644 --- a/.ci/matrix_job.yaml +++ b/.ci/matrix_job.yaml @@ -4,13 +4,14 @@ job: LIBXLIO step_allow_single_selector: false registry_host: harbor.mellanox.com -registry_auth: swx-storage +registry_auth: 1daaea28-800e-425f-a91f-3bd3e9136eea +registry_path: /swx-infra/media kubernetes: privileged: false cloud: swx-k8s-spray nodeSelector: 'beta.kubernetes.io/os=linux' - + namespace: xlio-ci limits: '{memory: 8Gi, cpu: 
7000m}' requests: '{memory: 8Gi, cpu: 7000m}' @@ -32,9 +33,6 @@ volumes: # User profile for release - {mountPath: /var/home/swx-jenkins, hostPath: /labhome/swx-jenkins} -env: - build_dockers: false - runs_on_dockers: # mofed - {name: 'ub20.04-mofed-x86_64', url: 'harbor.mellanox.com/swx-infra/x86_64/ubuntu20.04/builder:mofed-5.2-2.2.0.0', category: 'base', arch: 'x86_64'} @@ -46,6 +44,15 @@ runs_on_dockers: - {name: 'toolbox', url: 'harbor.mellanox.com/hpcx/x86_64/rhel8.6/builder:inbox', category: 'tool', arch: 'x86_64'} - {name: 'blackduck', url: 'harbor.mellanox.com/toolbox/ngci-centos:7.9.2009.2', category: 'tool', arch: 'x86_64'} - {name: 'header-check', url: 'harbor.mellanox.com/toolbox/header_check:0.0.14', category: 'tool', arch: 'x86_64', tag: '0.0.14'} +# static tests + - {file: '.ci/dockerfiles/Dockerfile.rhel8.6', + arch: 'x86_64', + name: 'xlio_static.cppcheck', + uri: '$arch/$name', + tag: '20240703', + build_args: '--no-cache', + category: 'tool' + } runs_on_agents: - {nodeLabel: 'beni09', category: 'base'} @@ -218,9 +225,9 @@ steps: - name: Cppcheck enable: ${do_cppcheck} containerSelector: - - "{name: 'skip-container'}" + - "{name: 'xlio_static.cppcheck', category: 'tool', variant: 1}" agentSelector: - - "{nodeLabel: 'beni09'}" + - "{nodeLabel: 'skip-agent'}" run: | [ "x${do_cppcheck}" == "xtrue" ] && action=yes || action=no env WORKSPACE=$PWD TARGET=${flags} jenkins_test_cppcheck=${action} ./contrib/test_jenkins.sh diff --git a/contrib/jenkins_tests/cppcheck.sh b/contrib/jenkins_tests/cppcheck.sh index 3e1070500..625e20485 100755 --- a/contrib/jenkins_tests/cppcheck.sh +++ b/contrib/jenkins_tests/cppcheck.sh @@ -7,26 +7,10 @@ echo "Checking for cppcheck ..." tool_app=cppcheck # This unit requires cppcheck so check for existence -if [ $(command -v ${tool_app} >/dev/null 2>&1 || echo $?) ]; then - set +e - eval "timeout -s SIGKILL 20s https://github.com/danmar/cppcheck.git cppcheck " > /dev/null 2>&1 - if [ $? -eq 0 ]; then - eval "cd cppcheck && checkout 2.1 " > /dev/null 2>&1 - if [ $? -eq 0 ]; then - eval "make $make_opt FILESDIR=$PWD HAVE_RULES=yes " > /dev/null 2>&1 - if [ $? -eq 0 ]; then - tool_app=$PWD/cppcheck - fi - fi - cd .. - fi - set -e - if [ $(command -v ${tool_app} >/dev/null 2>&1 || echo $?) 
]; then echo "[SKIP] cppcheck tool does not exist" exit 1 fi -fi echo $(${tool_app} --version) From bda508bf0df7ec443d7abda2afb288a864f5386b Mon Sep 17 00:00:00 2001 From: Viacheslav Login Date: Wed, 6 Mar 2024 12:32:43 +0200 Subject: [PATCH 105/169] issue: HPCINFRA-1321 Switch csbuild to a docker Signed-off-by: Viacheslav Login --- .ci/matrix_job.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.ci/matrix_job.yaml b/.ci/matrix_job.yaml index 78557bc29..12c246233 100644 --- a/.ci/matrix_job.yaml +++ b/.ci/matrix_job.yaml @@ -53,6 +53,14 @@ runs_on_dockers: build_args: '--no-cache', category: 'tool' } + - {file: '.ci/dockerfiles/Dockerfile.rhel8.6', + arch: 'x86_64', + name: 'xlio_static.csbuild', + uri: '$arch/$name', + tag: '20240703', + build_args: '--no-cache', + category: 'tool' + } runs_on_agents: - {nodeLabel: 'beni09', category: 'base'} @@ -242,9 +250,9 @@ steps: - name: Csbuild enable: ${do_csbuild} containerSelector: - - "{name: 'skip-container'}" + - "{name: 'xlio_static.csbuild', category: 'tool', variant: 1}" agentSelector: - - "{nodeLabel: 'beni09'}" + - "{nodeLabel: 'skip-agent'}" run: | [ "x${do_csbuild}" == "xtrue" ] && action=yes || action=no env WORKSPACE=$PWD TARGET=${flags} jenkins_test_csbuild=${action} ./contrib/test_jenkins.sh From accbf7be3941a2a0c6c65bf04b03b230762eacfd Mon Sep 17 00:00:00 2001 From: Viacheslav Login Date: Wed, 6 Mar 2024 14:08:03 +0200 Subject: [PATCH 106/169] issue: HPCINFRA-1321 Switch Tidy to a docker Signed-off-by: Viacheslav Login --- .ci/matrix_job.yaml | 12 ++++++++++-- contrib/jenkins_tests/tidy.sh | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.ci/matrix_job.yaml b/.ci/matrix_job.yaml index 12c246233..5455da234 100644 --- a/.ci/matrix_job.yaml +++ b/.ci/matrix_job.yaml @@ -61,6 +61,14 @@ runs_on_dockers: build_args: '--no-cache', category: 'tool' } + - {file: '.ci/dockerfiles/Dockerfile.rhel8.6', + arch: 'x86_64', + name: 'xlio_static.tidy', + uri: '$arch/$name', + tag: '20240703', + build_args: '--no-cache', + category: 'tool' + } runs_on_agents: - {nodeLabel: 'beni09', category: 'base'} @@ -267,9 +275,9 @@ steps: - name: Tidy enable: ${do_tidy} containerSelector: - - "{name: 'skip-container'}" + - "{name: 'xlio_static.tidy', category: 'tool', variant: 1}" agentSelector: - - "{nodeLabel: 'beni09'}" + - "{nodeLabel: 'skip-agent'}" run: | [ "x${do_tidy}" == "xtrue" ] && action=yes || action=no env WORKSPACE=$PWD TARGET=${flags} jenkins_test_tidy=${action} ./contrib/test_jenkins.sh diff --git a/contrib/jenkins_tests/tidy.sh b/contrib/jenkins_tests/tidy.sh index 4ecbd0b5a..94b74d6d4 100755 --- a/contrib/jenkins_tests/tidy.sh +++ b/contrib/jenkins_tests/tidy.sh @@ -13,6 +13,8 @@ source $(dirname $0)/globals.sh echo "Checking for tidy ..." +git config --global --add safe.directory $WORKSPACE + cd $WORKSPACE rm -rf $tidy_dir From 95210565175b20fee58efc072004a8f5a00a5e53 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 12 Feb 2024 11:16:56 +0200 Subject: [PATCH 107/169] issue: 3777348 Remove unused pipeinfo class There is no path to create pipeinfo objects. 
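With fd_collection::addpipe() removed there is no remaining call site that constructs a pipeinfo, so pipe file descriptors are never inserted into fd_collection and redirected calls on them keep falling through to the kernel, as they already did. The helper below is purely hypothetical (it is not added by this patch; its name, placement and signature are made up) and only illustrates that lookup-or-fallback behaviour:

    // Hypothetical sketch, not part of this change: an fd that has no
    // socket_fd_api object registered in fd_collection takes the kernel path.
    static bool takes_kernel_path(int fd)
    {
        return !g_p_fd_collection || !g_p_fd_collection->get_sockfd(fd);
    }

    // e.g. in a redirected write():
    //     if (takes_kernel_path(fd))
    //         return SYSCALL(write, fd, buf, count); // pipes always end up here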
Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 2 - src/core/sock/fd_collection.cpp | 53 ---- src/core/sock/fd_collection.h | 12 +- src/core/sock/pipeinfo.cpp | 438 -------------------------------- src/core/sock/pipeinfo.h | 100 -------- 5 files changed, 2 insertions(+), 603 deletions(-) delete mode 100644 src/core/sock/pipeinfo.cpp delete mode 100644 src/core/sock/pipeinfo.h diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 15a35b3a9..d65472eba 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -151,7 +151,6 @@ libxlio_la_SOURCES := \ sock/sockinfo_tcp.cpp \ sock/tcp_seg_pool.cpp \ sock/fd_collection.cpp \ - sock/pipeinfo.cpp \ sock/socket_fd_api.cpp \ sock/sock-redirect.cpp \ sock/sock-app.cpp \ @@ -276,7 +275,6 @@ libxlio_la_SOURCES := \ \ sock/cleanable_obj.h \ sock/fd_collection.h \ - sock/pipeinfo.h \ sock/pkt_rcvr_sink.h \ sock/pkt_sndr_source.h \ sock/socket_fd_api.h \ diff --git a/src/core/sock/fd_collection.cpp b/src/core/sock/fd_collection.cpp index 1111b655c..1c71df427 100644 --- a/src/core/sock/fd_collection.cpp +++ b/src/core/sock/fd_collection.cpp @@ -38,7 +38,6 @@ #include "sock-redirect.h" #include "socket_fd_api.h" #include "sockinfo_udp.h" -#include "pipeinfo.h" #include "sockinfo_tcp.h" #include "iomux/epfd_info.h" @@ -348,58 +347,6 @@ void fd_collection::statistics_print(int fd, vlog_levels_t log_level) vlog_printf(log_level, "==================================================\n"); } -int fd_collection::addpipe(int fdrd, int fdwr) -{ - fdcoll_logfunc("fdrd=%d, fdwr=%d", fdrd, fdwr); - - if (!is_valid_fd(fdrd) || !is_valid_fd(fdwr)) { - return -1; - } - - lock(); - - // Sanity check to remove any old objects using the same fd!! - socket_fd_api *p_fdrd_api_obj = get_sockfd(fdrd); - BULLSEYE_EXCLUDE_BLOCK_START - if (p_fdrd_api_obj) { - fdcoll_logwarn("[fd=%d] Deleting old duplicate object (%p)", fdrd, p_fdrd_api_obj); - unlock(); - handle_close(fdrd, true); - lock(); - } - BULLSEYE_EXCLUDE_BLOCK_END - socket_fd_api *p_fdwr_api_obj = get_sockfd(fdwr); - BULLSEYE_EXCLUDE_BLOCK_START - if (p_fdwr_api_obj) { - fdcoll_logwarn("[fd=%d] Deleting old duplicate object (%p)", fdwr, p_fdwr_api_obj); - unlock(); - handle_close(fdwr, true); - lock(); - } - BULLSEYE_EXCLUDE_BLOCK_END - - unlock(); - p_fdrd_api_obj = new pipeinfo(fdrd); - p_fdwr_api_obj = new pipeinfo(fdwr); - lock(); - - BULLSEYE_EXCLUDE_BLOCK_START - if (!p_fdrd_api_obj) { - fdcoll_logpanic("[fd=%d] Failed creating new pipeinfo (%m)", fdrd); - } - if (!p_fdwr_api_obj) { - fdcoll_logpanic("[fd=%d] Failed creating new pipeinfo (%m)", fdwr); - } - BULLSEYE_EXCLUDE_BLOCK_END - - m_p_sockfd_map[fdrd] = p_fdrd_api_obj; - m_p_sockfd_map[fdwr] = p_fdwr_api_obj; - - unlock(); - - return 0; -} - int fd_collection::addepfd(int epfd, int size) { fdcoll_logfunc("epfd=%d", epfd); diff --git a/src/core/sock/fd_collection.h b/src/core/sock/fd_collection.h index 9e4f34215..c8ae6096a 100644 --- a/src/core/sock/fd_collection.h +++ b/src/core/sock/fd_collection.h @@ -87,14 +87,6 @@ class fd_collection : private lock_mutex_recursive { */ int addsocket(int fd, int domain, int type, bool check_offload = false); - /** - * Create pipeinfo. Use get_sock() to get it. - * @param fdrd Read fd. - * @param fdwr Write fd. - * @return 0 on success, -1 on failure. - */ - int addpipe(int fdrd, int fdwr); - /** * Create epfd_info. Use get_epfd() to get it. * @param epfd epoll fd. 
@@ -120,7 +112,7 @@ class fd_collection : private lock_mutex_recursive { int addtapfd(int tapfd, ring_tap *p_ring); /** - * Remove pipeinfo/sockinfo. + * Remove sockinfo. */ int del_sockfd(int fd, bool b_cleanup = false, bool is_for_udp_pool = false); @@ -148,7 +140,7 @@ class fd_collection : private lock_mutex_recursive { inline void reuse_sockfd(int fd, socket_fd_api *p_sfd_api_obj); inline void destroy_sockfd(socket_fd_api *p_sfd_api_obj); /** - * Get sock_fd_api (sockinfo or pipeinfo) by fd. + * Get sock_fd_api (sockinfo) by fd. */ inline socket_fd_api *get_sockfd(int fd); diff --git a/src/core/sock/pipeinfo.cpp b/src/core/sock/pipeinfo.cpp deleted file mode 100644 index 7caf3dcb7..000000000 --- a/src/core/sock/pipeinfo.cpp +++ /dev/null @@ -1,438 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "utils/bullseye.h" -#include - -#include "sock-redirect.h" - -#include "pipeinfo.h" - -#define MODULE_NAME "pi" -#undef VLOG_PRINTF -#define VLOG_PRINTF(log_level, log_fmt, log_args...) \ - vlog_printf(log_level, "fd[%#x]:%s() " log_fmt "\n", m_fd, __FUNCTION__, ##log_args) -#define VLOG_PRINTF_DETAILS(log_level, log_fmt, log_args...) \ - vlog_printf(log_level, MODULE_NAME ":%d:fd[%#x]:%s() " log_fmt "\n", __LINE__, m_fd, \ - __FUNCTION__, ##log_args) - -#define pi_logpanic(log_fmt, log_args...) \ - VLOG_PRINTF(VLOG_PANIC, log_fmt, ##log_args); \ - throw; -#define pi_logerr(log_fmt, log_args...) VLOG_PRINTF(VLOG_ERROR, log_fmt, ##log_args) -#define pi_logwarn(log_fmt, log_args...) VLOG_PRINTF(VLOG_WARNING, log_fmt, ##log_args) -#define pi_loginfo(log_fmt, log_args...) VLOG_PRINTF(VLOG_INFO, log_fmt, ##log_args) - -#if (MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_DEBUG) -#define pi_logdbg_no_funcname(log_fmt, log_args...) ((void)0) -#define pi_logdbg(log_fmt, log_args...) ((void)0) -#define si_logdbg_no_funcname(log_fmt, log_args...) ((void)0) -#else -#define pi_logdbg_no_funcname(log_fmt, log_args...) \ - if (g_vlogger_level >= VLOG_DEBUG) \ - vlog_printf(VLOG_DEBUG, MODULE_NAME ":%d:fd[%d]: " log_fmt "\n", __LINE__, m_fd, ##log_args) -#define pi_logdbg(log_fmt, log_args...) 
\ - if (g_vlogger_level >= VLOG_DEBUG) \ - VLOG_PRINTF_DETAILS(VLOG_DEBUG, log_fmt, ##log_args) -#define si_logdbg_no_funcname(log_fmt, log_args...) \ - do { \ - if (g_vlogger_level >= VLOG_DEBUG) \ - vlog_printf(VLOG_DEBUG, MODULE_NAME "[fd=%d]:%d: " log_fmt "\n", m_fd, __LINE__, \ - ##log_args); \ - } while (0) -#endif - -#if (MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINE) -#define pi_logfunc(log_fmt, log_args...) ((void)0) -#else -#define pi_logfunc(log_fmt, log_args...) \ - if (g_vlogger_level >= VLOG_FUNC) \ - VLOG_PRINTF_DETAILS(VLOG_FUNC, log_fmt, ##log_args) -#endif - -#if (MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINER) -#define pi_logfuncall(log_fmt, log_args...) ((void)0) -#else -#define pi_logfuncall(log_fmt, log_args...) \ - if (g_vlogger_level >= VLOG_FUNC_ALL) \ - VLOG_PRINTF_DETAILS(VLOG_FUNC_ALL, log_fmt, ##log_args) -#endif /* MAX_DEFINED_LOG_LEVEL */ - -pipeinfo::pipeinfo(int fd) - : socket_fd_api(fd) - , m_lock("pipeinfo::m_lock") - , m_lock_rx("pipeinfo::m_lock_rx") - , m_lock_tx("pipeinfo::m_lock_tx") -{ - pi_logfunc(""); - - m_b_closed = true; - m_timer_handle = nullptr; - - m_b_blocking = true; - - m_p_socket_stats = nullptr; // mce_stats_instance_create_socket_block(); - if (!m_p_socket_stats) { - // pi_logdbg("Got NULL from mce_stats_instance_create_socket_block, using local member"); - m_p_socket_stats = &m_socket_stats; - } - m_p_socket_stats->reset(); - m_p_socket_stats->fd = m_fd; - m_p_socket_stats->b_blocking = m_b_blocking; - m_p_socket_stats->n_rx_ready_pkt_count = 0; - m_p_socket_stats->counters.n_rx_ready_pkt_max = 0; - m_p_socket_stats->n_rx_ready_byte_count = 0; - m_p_socket_stats->n_tx_ready_byte_count = 0; - m_p_socket_stats->counters.n_rx_ready_byte_max = 0; - m_p_socket_stats->n_rx_zcopy_pkt_count = 0; - - m_b_closed = false; - - m_b_lbm_event_q_pipe_timer_on = false; - m_write_count = m_write_count_on_last_timer = 0; - m_write_count_no_change_count = 0; - - pi_logfunc("done"); -} - -pipeinfo::~pipeinfo() -{ - m_b_closed = true; - pi_logfunc(""); - - // Change to non-blocking socket so calling threads can exit - m_b_blocking = false; - - m_lock_tx.lock(); - m_lock_rx.lock(); - m_lock.lock(); - - if (m_timer_handle) { - g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = nullptr; - } - - statistics_print(); - - m_lock_tx.unlock(); - m_lock_rx.unlock(); - m_lock.unlock(); - - pi_logfunc("done"); -} - -void pipeinfo::clean_obj() -{ - if (is_cleaned()) { - return; - } - - set_cleaned(); - m_timer_handle = nullptr; - if (g_p_event_handler_manager->is_running()) { - g_p_event_handler_manager->unregister_timers_event_and_delete(this); - } else { - cleanable_obj::clean_obj(); - } -} - -int pipeinfo::fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit) -{ - - switch (__cmd) { - case F_SETFL: { - pi_logfunc("cmd=F_SETFL, arg=%#x", __cmd, __arg); - if (__arg & O_NONBLOCK) { - pi_logdbg("set to non-blocking mode"); - m_b_blocking = false; - } else { - pi_logdbg("set to blocked mode"); - m_b_blocking = true; - } - m_p_socket_stats->b_blocking = m_b_blocking; - } break; - - case F_GETFL: /* Get file status flags. */ - pi_logfunc("F_GETFL, arg=%#x", __arg); - break; - - case F_GETFD: /* Get file descriptor flags. */ - pi_logfunc("F_GETFD, arg=%#x", __arg); - break; - - case F_SETFD: /* Set file descriptor flags. 
*/ - pi_logfunc("F_SETFD, arg=%#x", __arg); - break; - - default: - pi_logfunc("cmd=%d, arg=%#x", __cmd, __arg); - break; - } - - bexit = false; - return 0; -} - -int pipeinfo::fcntl(int __cmd, unsigned long int __arg) -{ - - bool bexit = false; - int ret_val = fcntl_helper(__cmd, __arg, bexit); - if (bexit) { - return ret_val; - } - - return SYSCALL(fcntl, m_fd, __cmd, __arg); -} - -int pipeinfo::fcntl64(int __cmd, unsigned long int __arg) -{ - - bool bexit = false; - int ret_val = fcntl_helper(__cmd, __arg, bexit); - if (bexit) { - return ret_val; - } - - return SYSCALL(fcntl64, m_fd, __cmd, __arg); -} - -int pipeinfo::ioctl(unsigned long int __request, unsigned long int __arg) -{ - int *p_arg = (int *)__arg; - - switch (__request) { - case FIONBIO: { - if (*p_arg) { - pi_logdbg("FIONBIO, arg=%d - set to non-blocking mode", *p_arg); - m_b_blocking = false; - } else { - pi_logdbg("FIONBIO, arg=%d - set to blocked mode", *p_arg); - m_b_blocking = true; - } - - m_p_socket_stats->b_blocking = m_b_blocking; - } break; - - default: - pi_logfunc("request=%d, arg=%#x", __request, __arg); - break; - } - - return SYSCALL(ioctl, m_fd, __request, __arg); -} - -ssize_t pipeinfo::rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, int *p_flags, - sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg) -{ - pi_logfunc(""); - ssize_t ret = - socket_fd_api::rx_os(call_type, p_iov, sz_iov, *p_flags, __from, __fromlen, __msg); - save_stats_rx_os(ret); - return ret; -} - -void pipeinfo::handle_timer_expired(void *user_data) -{ - NOT_IN_USE(user_data); - pi_logfunc("(m_write_count=%d)", m_write_count); - m_lock_tx.lock(); - write_lbm_pipe_enhance(); - m_lock_tx.unlock(); -} - -ssize_t pipeinfo::tx(xlio_tx_call_attr_t &tx_arg) -{ - const iovec *p_iov = tx_arg.attr.iov; - const ssize_t sz_iov = tx_arg.attr.sz_iov; - const int __flags = tx_arg.attr.flags; - const struct sockaddr *__to = tx_arg.attr.addr; - const socklen_t __tolen = tx_arg.attr.len; - ssize_t ret = -1; - - pi_logfunc(""); - m_lock_tx.lock(); - switch (tx_arg.opcode) { - case TX_WRITE: - ret = SYSCALL(write, m_fd, p_iov[0].iov_base, p_iov[0].iov_len); - break; - case TX_SEND: - case TX_SENDTO: - case TX_SENDMSG: - default: - ret = socket_fd_api::tx_os(tx_arg.opcode, p_iov, sz_iov, __flags, __to, __tolen); - break; - } - - save_stats_tx_os(ret); - m_lock_tx.unlock(); - return ret; -} - -void pipeinfo::write_lbm_pipe_enhance() -{ - pi_logfunc("(m_write_count=%d)", m_write_count); - - if (m_write_count == m_write_count_on_last_timer) { - // No pipe write happened during the last timer_expired() - m_write_count_no_change_count++; - - // After 3 of these stop timer - if (m_write_count_no_change_count >= 2 && m_b_lbm_event_q_pipe_timer_on) { - if (m_timer_handle) { - g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); - m_timer_handle = nullptr; - } - m_b_lbm_event_q_pipe_timer_on = false; - - pi_logfunc("pipe_write DONE timer Un-Reg"); - } - } - - m_write_count = 0; - m_write_count_no_change_count = 0; - m_write_count_on_last_timer = 0; - - // Send the buffered data - char buf[10] = "\0"; - auto result = SYSCALL(write, m_fd, buf, 1); - if (result == -1) { - pi_logdbg("write sycall failed"); - } -} - -void pipeinfo::statistics_print(vlog_levels_t log_level) -{ - bool b_any_activiy = false; - NOT_IN_USE(log_level); - - if (m_p_socket_stats->counters.n_tx_sent_byte_count || - m_p_socket_stats->counters.n_tx_sent_pkt_count || m_p_socket_stats->counters.n_tx_errors || - m_p_socket_stats->counters.n_tx_eagain) { - 
pi_logdbg_no_funcname( - "Tx Offload: %" PRIu64 " KB / %d / %d / %d [kilobytes/packets/errors/eagains]", - m_p_socket_stats->counters.n_tx_sent_byte_count / 1024, - m_p_socket_stats->counters.n_tx_sent_pkt_count, m_p_socket_stats->counters.n_tx_errors, - m_p_socket_stats->counters.n_tx_eagain); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_tx_os_bytes || m_p_socket_stats->counters.n_tx_os_packets || - m_p_socket_stats->counters.n_tx_os_errors) { - pi_logdbg_no_funcname("Tx OS info: %" PRIu64 " KB / %d / %d [kilobytes/packets/errors]", - m_p_socket_stats->counters.n_tx_os_bytes / 1024, - m_p_socket_stats->counters.n_tx_os_packets, - m_p_socket_stats->counters.n_tx_os_errors); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_rx_bytes || m_p_socket_stats->counters.n_rx_packets || - m_p_socket_stats->counters.n_rx_errors || m_p_socket_stats->counters.n_rx_eagain) { - pi_logdbg_no_funcname( - "Rx Offload: %" PRIu64 " KB / %d / %d / %d [kilobytes/packets/errors/eagains]", - m_p_socket_stats->counters.n_rx_bytes / 1024, m_p_socket_stats->counters.n_rx_packets, - m_p_socket_stats->counters.n_rx_errors, m_p_socket_stats->counters.n_rx_eagain); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_rx_os_bytes || m_p_socket_stats->counters.n_rx_os_packets || - m_p_socket_stats->counters.n_rx_os_errors) { - pi_logdbg_no_funcname("Rx OS info: %" PRIu64 " KB / %d / %d [kilobytes/packets/errors]", - m_p_socket_stats->counters.n_rx_os_bytes / 1024, - m_p_socket_stats->counters.n_rx_os_packets, - m_p_socket_stats->counters.n_rx_os_errors); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_rx_poll_miss || m_p_socket_stats->counters.n_rx_poll_hit) { - pi_logdbg_no_funcname("Rx poll: %d / %d (%2.2f%%) [miss/hit]", - m_p_socket_stats->counters.n_rx_poll_miss, - m_p_socket_stats->counters.n_rx_poll_hit, - (float)(m_p_socket_stats->counters.n_rx_poll_hit * 100) / - (float)(m_p_socket_stats->counters.n_rx_poll_miss + - m_p_socket_stats->counters.n_rx_poll_hit)); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_rx_ready_byte_drop) { - si_logdbg_no_funcname( - "Rx byte: max %d / dropped %d (%2.2f%%) [limit is %d]", - m_p_socket_stats->counters.n_rx_ready_byte_max, - m_p_socket_stats->counters.n_rx_ready_byte_drop, - (m_p_socket_stats->counters.n_rx_packets - ? (float)(m_p_socket_stats->counters.n_rx_ready_byte_drop * 100) / - (float)m_p_socket_stats->counters.n_rx_packets - : 0), - m_p_socket_stats->n_rx_ready_byte_limit); - b_any_activiy = true; - } - if (m_p_socket_stats->counters.n_rx_ready_pkt_drop) { - si_logdbg_no_funcname("Rx pkt : max %d / dropped %d (%2.2f%%)", - m_p_socket_stats->counters.n_rx_ready_pkt_max, - m_p_socket_stats->counters.n_rx_ready_pkt_drop, - (m_p_socket_stats->counters.n_rx_packets - ? 
(float)(m_p_socket_stats->counters.n_rx_ready_pkt_drop * 100) / - (float)m_p_socket_stats->counters.n_rx_packets - : 0)); - b_any_activiy = true; - } - if (m_p_socket_stats->strq_counters.n_strq_total_strides) { - si_logdbg_no_funcname("Rx RQ Strides: %" PRIu64 " / %u [total/max-per-packet]\n", - m_p_socket_stats->strq_counters.n_strq_total_strides, - m_p_socket_stats->strq_counters.n_strq_max_strides_per_packet); - b_any_activiy = true; - } - if (b_any_activiy == false) { - pi_logdbg_no_funcname("Rx and Tx where not active"); - } -} - -void pipeinfo::save_stats_rx_os(int bytes) -{ - if (bytes >= 0) { - m_p_socket_stats->counters.n_rx_os_bytes += bytes; - m_p_socket_stats->counters.n_rx_os_packets++; - } else if (errno == EAGAIN) { - m_p_socket_stats->counters.n_rx_os_eagain++; - } else { - m_p_socket_stats->counters.n_rx_os_errors++; - } -} - -void pipeinfo::save_stats_tx_os(int bytes) -{ - if (bytes >= 0) { - m_p_socket_stats->counters.n_tx_os_bytes += bytes; - m_p_socket_stats->counters.n_tx_os_packets++; - } else if (errno == EAGAIN) { - m_p_socket_stats->counters.n_rx_os_eagain++; - } else { - m_p_socket_stats->counters.n_tx_os_errors++; - } -} diff --git a/src/core/sock/pipeinfo.h b/src/core/sock/pipeinfo.h deleted file mode 100644 index 83456cb6c..000000000 --- a/src/core/sock/pipeinfo.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef PIPEINFO_H -#define PIPEINFO_H - -#include "socket_fd_api.h" -#include "utils/lock_wrapper.h" -#include -#include - -class pipeinfo : public socket_fd_api, public timer_handler { -public: - pipeinfo(int fd); - ~pipeinfo() override; - - void clean_obj() override; - -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - void copy_sockopt_fork(const socket_fd_api *copy_from) override { NOT_IN_USE(copy_from); } -#endif - - int fcntl(int __cmd, unsigned long int __arg) override; - int fcntl64(int __cmd, unsigned long int __arg) override; - int ioctl(unsigned long int __request, unsigned long int __arg) override; - - // Process a Rx request, we might have a ready packet, or we might block until - // we have one (if sockinfo::m_b_blocking == true) - ssize_t rx(const rx_call_t call_type, struct iovec *p_iov, ssize_t sz_iov, int *p_flags, - struct sockaddr *__from = nullptr, socklen_t *__fromlen = nullptr, - struct msghdr *__msg = nullptr) override; - - // Process a Tx request, handle all that is needed to send the packet, we might block - // until the connection info is ready or a tx buffer is releast (if sockinfo::m_b_blocking == - // true) - ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; - - void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) override; - - inline fd_type_t get_type() override { return FD_TYPE_PIPE; } - -private: - bool m_b_blocking; - - // Main mutex to protect from multi threaded access to sockinfo from sock-redirect - bool m_b_closed; - lock_mutex m_lock; - lock_mutex m_lock_rx; - lock_mutex m_lock_tx; - - socket_stats_t m_socket_stats; - socket_stats_t *m_p_socket_stats; - - void *m_timer_handle; - - int m_write_count; - int m_write_count_on_last_timer; - int m_write_count_no_change_count; - bool m_b_lbm_event_q_pipe_timer_on; - - void handle_timer_expired(void *user_data) override; - - void write_lbm_pipe_enhance(); - - void save_stats_rx_os(int bytes); - void save_stats_tx_os(int bytes); - - int fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit); -}; - -#endif From e4492765b9f9ee463930ddd2f2df63f112455144 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 12 Feb 2024 11:22:24 +0200 Subject: [PATCH 108/169] issue: 3777348 Removing cleanable_obj from socket_fd_api To simplify inheritance and make socket objects cache friendly, we avoid inheriting cleanable_obj which inserts only one boolean but trashes the sapce after vptr. 
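The layout cost is easiest to see on a reduced example. The sketch below is illustrative only (simplified stand-in types, not the real socket_fd_api/sockinfo hierarchy); it shows why a polymorphic base that carries a single flag reserves a full padded slot right after the vptr, while the same flag declared as a plain member can share padding with the object's other small fields:

    // Standalone layout sketch, illustrative only; build with any C++11
    // compiler and compare the printed sizes.
    #include <cstdio>

    struct base_with_flag {            // stand-in for cleanable_obj
        virtual ~base_with_flag() = default;
        bool m_is_cleaned = false;     // one byte of data in a pointer-aligned slot
    };

    struct via_base : base_with_flag { // "before": flag inherited from the base
        void *m_owner = nullptr;       // pushed past the padding after vptr + flag
        bool m_b_blocking = true;
    };

    struct flat {                      // "after": flag folded into the class itself
        virtual ~flat() = default;
        void *m_owner = nullptr;
        bool m_is_cleaned = false;     // packs together with the other small members
        bool m_b_blocking = true;
    };

    int main()
    {
        // Typical x86_64 gcc/clang output: base=16 via_base=32 flat=24
        std::printf("base=%zu via_base=%zu flat=%zu\n",
                    sizeof(base_with_flag), sizeof(via_base), sizeof(flat));
        return 0;
    }

The exact saving inside sockinfo depends on the neighbouring members and the ABI, but the direction is the same: one base subobject less and the flag packed next to the other small fields.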
Signed-off-by: Alexander Grissik --- src/core/sock/fd_collection.cpp | 33 +++++++++++++++++++++++++++++---- src/core/sock/fd_collection.h | 5 +++-- src/core/sock/sock-redirect.cpp | 4 ++-- src/core/sock/socket_fd_api.h | 6 ++++-- src/core/sock/sockinfo_tcp.cpp | 10 +++++----- src/core/sock/sockinfo_tcp.h | 6 +++--- 6 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/core/sock/fd_collection.cpp b/src/core/sock/fd_collection.cpp index 1c71df427..665d4d7d7 100644 --- a/src/core/sock/fd_collection.cpp +++ b/src/core/sock/fd_collection.cpp @@ -130,6 +130,7 @@ void fd_collection::prepare_to_close() unlock(); } +// Called in destructor after Internal-Thread destroyed void fd_collection::clear() { int fd; @@ -147,7 +148,7 @@ void fd_collection::clear() */ while (!m_pending_to_remove_lst.empty()) { socket_fd_api *p_sfd_api = m_pending_to_remove_lst.get_and_pop_back(); - p_sfd_api->clean_obj(); + p_sfd_api->clean_socket_obj(); } g_global_stat_static.n_pending_sockets = 0; @@ -160,7 +161,7 @@ void fd_collection::clear() socket_fd_api *p_sfd_api = get_sockfd(fd); if (p_sfd_api) { p_sfd_api->statistics_print(); - p_sfd_api->clean_obj(); + p_sfd_api->clean_socket_obj(); } } @@ -464,7 +465,7 @@ int fd_collection::add_cq_channel_fd(int cq_ch_fd, ring *p_ring) return 0; } -int fd_collection::del_sockfd(int fd, bool b_cleanup /*=false*/, bool is_for_udp_pool /*=false*/) +int fd_collection::del_sockfd(int fd, bool is_for_udp_pool /*=false*/) { int ret_val = -1; socket_fd_api *p_sfd_api; @@ -479,7 +480,9 @@ int fd_collection::del_sockfd(int fd, bool b_cleanup /*=false*/, bool is_for_udp // 2. Socket deletion when TCP connection == CLOSED if (p_sfd_api->prepare_to_close()) { // the socket is already closable - ret_val = del(fd, b_cleanup, m_p_sockfd_map); + // This may register the socket to be erased by internal thread, + // However, a timer may tick on this socket before it is deleted. + ret_val = del_socket(fd, m_p_sockfd_map); } else { lock(); // The socket is not ready for close. @@ -556,6 +559,28 @@ template int fd_collection::del(int fd, bool b_cleanup, cls **map return -1; } +int fd_collection::del_socket(int fd, socket_fd_api **map_type) +{ + fdcoll_logfunc("fd=%d", fd); + + if (!is_valid_fd(fd)) { + return -1; + } + + lock(); + socket_fd_api *p_obj = map_type[fd]; + if (p_obj) { + map_type[fd] = nullptr; + unlock(); + p_obj->clean_socket_obj(); + return 0; + } + + fdcoll_logdbg("[fd=%d] Could not find related object", fd); + unlock(); + return -1; +} + void fd_collection::remove_from_all_epfds(int fd, bool passthrough) { epfd_info_list_t::iterator itr; diff --git a/src/core/sock/fd_collection.h b/src/core/sock/fd_collection.h index c8ae6096a..ad6f844a4 100644 --- a/src/core/sock/fd_collection.h +++ b/src/core/sock/fd_collection.h @@ -114,7 +114,7 @@ class fd_collection : private lock_mutex_recursive { /** * Remove sockinfo. */ - int del_sockfd(int fd, bool b_cleanup = false, bool is_for_udp_pool = false); + int del_sockfd(int fd, bool is_for_udp_pool = false); /** * Remove epfd_info. 
@@ -190,6 +190,7 @@ class fd_collection : private lock_mutex_recursive { private: template int del(int fd, bool b_cleanup, cls **map_type); template inline cls *get(int fd, cls **map_type); + int del_socket(int fd, socket_fd_api **map_type); inline bool is_valid_fd(int fd); inline bool create_offloaded_sockets(); @@ -283,7 +284,7 @@ inline void fd_collection::destroy_sockfd(socket_fd_api *p_sfd_api_obj) lock(); --g_global_stat_static.n_pending_sockets; m_pending_to_remove_lst.erase(p_sfd_api_obj); - p_sfd_api_obj->clean_obj(); + p_sfd_api_obj->clean_socket_obj(); unlock(); } diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index aa56a2dda..6a767259a 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -240,7 +240,7 @@ bool handle_close(int fd, bool cleanup, bool passthrough) bool to_close_now = true; bool is_for_udp_pool = false; - srdr_logfunc("Cleanup fd=%d", fd); + srdr_logfunc("Cleanup fd=%d cleanup=%d", fd, !!cleanup); if (g_zc_cache) { g_zc_cache->handle_close(fd); @@ -258,7 +258,7 @@ bool handle_close(int fd, bool cleanup, bool passthrough) // Save this value before pointer is destructed is_for_udp_pool = sockfd->m_is_for_socket_pool; #endif - g_p_fd_collection->del_sockfd(fd, cleanup, is_for_udp_pool); + g_p_fd_collection->del_sockfd(fd, is_for_udp_pool); if (safe_mce_sys().deferred_close) { to_close_now = false; } diff --git a/src/core/sock/socket_fd_api.h b/src/core/sock/socket_fd_api.h index 7328b63d5..1f2e82e09 100644 --- a/src/core/sock/socket_fd_api.h +++ b/src/core/sock/socket_fd_api.h @@ -142,10 +142,12 @@ typedef xlio_list_t xlio_des * */ -class socket_fd_api : public cleanable_obj { +class socket_fd_api { public: socket_fd_api(int fd); - ~socket_fd_api() override; + virtual ~socket_fd_api(); + + virtual void clean_socket_obj() { delete this; } virtual void setPassthrough() {} virtual bool isPassthrough() { return false; } diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index d1b7aaef2..fdfd63673 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -590,15 +590,15 @@ sockinfo_tcp::~sockinfo_tcp() xlio_socket_event(XLIO_SOCKET_EVENT_TERMINATED, 0); } -void sockinfo_tcp::clean_obj() +void sockinfo_tcp::clean_socket_obj() { + lock_tcp_con(); + if (is_cleaned()) { return; } - lock_tcp_con(); - set_cleaned(); - + m_is_cleaned = true; event_handler_manager *p_event_mgr = get_event_mgr(); bool delegated_timers_exit = g_b_exit && @@ -615,7 +615,7 @@ void sockinfo_tcp::clean_obj() if (p_event_mgr->is_running() && !delegated_timers_exit) { p_event_mgr->unregister_timers_event_and_delete(this); } else { - cleanable_obj::clean_obj(); + delete this; } } diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index d9a5c5663..234e5492c 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -193,7 +193,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { sockinfo_tcp(int fd, int domain); ~sockinfo_tcp() override; - void clean_obj() override; + void clean_socket_obj() override; void setPassthrough(bool _isPassthrough) { @@ -381,8 +381,8 @@ class sockinfo_tcp : public sockinfo, public timer_handler { inline int trylock_tcp_con() { return m_tcp_con_lock.trylock(); } inline void lock_tcp_con() { m_tcp_con_lock.lock(); } inline void unlock_tcp_con() { m_tcp_con_lock.unlock(); } - inline void set_reguired_send_block(unsigned sz) { m_required_send_block = sz; } + bool is_cleaned() const { return m_is_cleaned; } static 
err_t rx_lwip_cb(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); static err_t rx_lwip_cb_socketxtreme(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); @@ -645,7 +645,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { // used for reporting 'connected' on second non-blocking call to connect or // second call to failed connect blocking socket. bool report_connected; - + bool m_is_cleaned = false; // If this socket registered deletion on internal thread. int m_error_status; const buffer_batching_mode_t m_sysvar_buffer_batching_mode; From 430547396a76cd6b3ef4ed6fcb58c7906c0aa0dc Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 12 Feb 2024 11:27:40 +0200 Subject: [PATCH 109/169] issue: 3777348 Removing unused pkt_sndr_source class The class pkt_sndr_source is unused empty interface. This adds undesired vptr to inheriting classes and trashes the cache. Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 1 - src/core/sock/pkt_sndr_source.h | 47 --------------------------------- src/core/sock/sockinfo.h | 6 +---- src/core/sock/sockinfo_udp.h | 1 - 4 files changed, 1 insertion(+), 54 deletions(-) delete mode 100644 src/core/sock/pkt_sndr_source.h diff --git a/src/core/Makefile.am b/src/core/Makefile.am index d65472eba..251748ecf 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -276,7 +276,6 @@ libxlio_la_SOURCES := \ sock/cleanable_obj.h \ sock/fd_collection.h \ sock/pkt_rcvr_sink.h \ - sock/pkt_sndr_source.h \ sock/socket_fd_api.h \ sock/sockinfo.h \ sock/sockinfo_tcp.h \ diff --git a/src/core/sock/pkt_sndr_source.h b/src/core/sock/pkt_sndr_source.h deleted file mode 100644 index 96f4b795c..000000000 --- a/src/core/sock/pkt_sndr_source.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef PKT_SNDR_SOURCE_H -#define PKT_SNDR_SOURCE_H - -/** - * @class pkt_sndr_source - * An object must implement pkt_sndr_source to register with ib_conn_mgr_base - * When no packet transmitters (or receivers) are registered the objects will be - * deleted. 
- */ -class pkt_sndr_source { -public: - virtual ~pkt_sndr_source() = default; -}; - -#endif diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 2d8bd09ff..30281904e 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -52,7 +52,6 @@ #include "socket_fd_api.h" #include "pkt_rcvr_sink.h" -#include "pkt_sndr_source.h" #include "sock-redirect.h" #include "sock-app.h" @@ -155,10 +154,7 @@ typedef std::unordered_map rx_ring_map_t; // see route.c in Linux kernel const uint8_t ip_tos2prio[16] = {0, 0, 0, 0, 2, 2, 2, 2, 6, 6, 6, 6, 4, 4, 4, 4}; -class sockinfo : public socket_fd_api, - public pkt_rcvr_sink, - public pkt_sndr_source, - public wakeup_pipe { +class sockinfo : public socket_fd_api, public pkt_rcvr_sink, public wakeup_pipe { public: sockinfo(int fd, int domain, bool use_ring_locks); ~sockinfo() override; diff --git a/src/core/sock/sockinfo_udp.h b/src/core/sock/sockinfo_udp.h index ad22b23a9..48d988c97 100644 --- a/src/core/sock/sockinfo_udp.h +++ b/src/core/sock/sockinfo_udp.h @@ -49,7 +49,6 @@ #include "proto/dst_entry_udp.h" #include "pkt_rcvr_sink.h" -#include "pkt_sndr_source.h" #include "sock-redirect.h" #include "sockinfo.h" From 29c6b6362dca338f183213cf8794c89d0abd2f88 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 12 Feb 2024 11:47:31 +0200 Subject: [PATCH 110/169] issue: 3777348 Replacing pkt_rcvr_source class with sockinfo The only consumer of pkt_rcvr_source is sockinfo hierarchy objects. Removing this abstraction allows cache friendly layout of sockinfo. Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 1 - src/core/dev/rfs.cpp | 16 ++++----- src/core/dev/rfs.h | 18 +++++----- src/core/dev/rfs_mc.cpp | 1 + src/core/dev/rfs_uc.cpp | 1 + src/core/dev/ring.h | 7 ++-- src/core/dev/ring_bond.cpp | 4 +-- src/core/dev/ring_bond.h | 6 ++-- src/core/dev/ring_slave.cpp | 8 ++--- src/core/dev/ring_slave.h | 8 ++--- src/core/dev/ring_tap.cpp | 4 +-- src/core/dev/ring_tap.h | 7 ++-- src/core/proto/xlio_lwip.h | 1 - src/core/sock/pkt_rcvr_sink.h | 67 ----------------------------------- src/core/sock/sockinfo.h | 13 ++++--- src/core/sock/sockinfo_udp.h | 1 - 16 files changed, 49 insertions(+), 114 deletions(-) delete mode 100644 src/core/sock/pkt_rcvr_sink.h diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 251748ecf..98ec13979 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -275,7 +275,6 @@ libxlio_la_SOURCES := \ \ sock/cleanable_obj.h \ sock/fd_collection.h \ - sock/pkt_rcvr_sink.h \ sock/socket_fd_api.h \ sock/sockinfo.h \ sock/sockinfo_tcp.h \ diff --git a/src/core/dev/rfs.cpp b/src/core/dev/rfs.cpp index 80059ae05..68693b3d0 100644 --- a/src/core/dev/rfs.cpp +++ b/src/core/dev/rfs.cpp @@ -138,7 +138,7 @@ rfs::rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_fil memset(&m_match_value, 0, sizeof(m_match_value)); memset(&m_match_mask, 0, sizeof(m_match_mask)); - m_sinks_list = new pkt_rcvr_sink *[m_n_sinks_list_max_length]; + m_sinks_list = new sockinfo *[m_n_sinks_list_max_length]; #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) if (g_p_app->type != APP_NONE && g_p_app->get_worker_id() >= 0) { @@ -152,7 +152,7 @@ rfs::rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *rule_fil } BULLSEYE_EXCLUDE_BLOCK_END - memset(m_sinks_list, 0, sizeof(pkt_rcvr_sink *) * m_n_sinks_list_max_length); + memset(m_sinks_list, 0, sizeof(sockinfo *) * m_n_sinks_list_max_length); } rfs::~rfs() @@ -181,7 +181,7 @@ rfs::~rfs() delete[] m_sinks_list; } -bool 
rfs::add_sink(pkt_rcvr_sink *p_sink) +bool rfs::add_sink(sockinfo *p_sink) { uint32_t i; @@ -205,7 +205,7 @@ bool rfs::add_sink(pkt_rcvr_sink *p_sink) if (m_n_sinks_list_entries == m_n_sinks_list_max_length) { // Sinks list array is full // Reallocate a new array with double size uint32_t tmp_sinks_list_length = 2 * m_n_sinks_list_max_length; - pkt_rcvr_sink **tmp_sinks_list = new pkt_rcvr_sink *[tmp_sinks_list_length]; + sockinfo **tmp_sinks_list = new sockinfo *[tmp_sinks_list_length]; BULLSEYE_EXCLUDE_BLOCK_START if (!tmp_sinks_list) { @@ -214,7 +214,7 @@ bool rfs::add_sink(pkt_rcvr_sink *p_sink) } BULLSEYE_EXCLUDE_BLOCK_END - memcpy(tmp_sinks_list, m_sinks_list, sizeof(pkt_rcvr_sink *) * m_n_sinks_list_max_length); + memcpy(tmp_sinks_list, m_sinks_list, sizeof(sockinfo *) * m_n_sinks_list_max_length); delete[] m_sinks_list; m_sinks_list = tmp_sinks_list; m_n_sinks_list_max_length = tmp_sinks_list_length; @@ -227,7 +227,7 @@ bool rfs::add_sink(pkt_rcvr_sink *p_sink) return true; } -bool rfs::del_sink(pkt_rcvr_sink *p_sink) +bool rfs::del_sink(sockinfo *p_sink) { uint32_t i; @@ -258,7 +258,7 @@ bool rfs::del_sink(pkt_rcvr_sink *p_sink) return false; } -bool rfs::attach_flow(pkt_rcvr_sink *sink) +bool rfs::attach_flow(sockinfo *sink) { bool ret; int filter_counter = 1; @@ -294,7 +294,7 @@ bool rfs::attach_flow(pkt_rcvr_sink *sink) return ret; } -bool rfs::detach_flow(pkt_rcvr_sink *sink) +bool rfs::detach_flow(sockinfo *sink) { bool ret = false; int filter_counter = 0; diff --git a/src/core/dev/rfs.h b/src/core/dev/rfs.h index c9f1834e7..316f97b62 100644 --- a/src/core/dev/rfs.h +++ b/src/core/dev/rfs.h @@ -44,7 +44,7 @@ #define RFS_SINKS_LIST_DEFAULT_LEN 32 class hw_queue_rx; -class pkt_rcvr_sink; +class sockinfo; /* * Priority description: @@ -85,15 +85,15 @@ class rfs { /** * Register/Unregister a sink with this rfs object - * Get notifications about incoming packets using the pkt_rcvr_sink callback api + * Get notifications about incoming packets using the sockinfo callback api * The rfs will call ibv_attach on the QP once when at least one receiver sink is registered * An ibv_detach is called when the last receiver sink is deleted from the registered list * */ - bool attach_flow(pkt_rcvr_sink *sink); // Add a sink. If this is the first sink --> map the sink - // and attach flow to QP - bool detach_flow(pkt_rcvr_sink *sink); // Delete a sink. If this is the last sink --> delete it - // and detach flow from QP + bool attach_flow(sockinfo *sink); // Add a sink. If this is the first sink --> map the sink + // and attach flow to QP + bool detach_flow(sockinfo *sink); // Delete a sink. 
If this is the last sink --> delete it + // and detach flow from QP #ifdef DEFINED_UTLS rfs_rule *create_rule(xlio_tir *tir, const flow_tuple &flow_spec); // Create a duplicate rule which points to @@ -109,7 +109,7 @@ class rfs { ring_simple *m_p_ring_simple; rfs_rule_filter *m_p_rule_filter; rfs_rule *m_rfs_flow = nullptr; - pkt_rcvr_sink **m_sinks_list; + sockinfo **m_sinks_list; uint32_t m_n_sinks_list_entries; // Number of actual sinks in the array (we shrink the array if // a sink is removed) uint32_t m_n_sinks_list_max_length; @@ -122,8 +122,8 @@ class rfs { bool create_flow(); // Attach flow to all queues bool destroy_flow(); // Detach flow from all queues - bool add_sink(pkt_rcvr_sink *p_sink); - bool del_sink(pkt_rcvr_sink *p_sink); + bool add_sink(sockinfo *p_sink); + bool del_sink(sockinfo *p_sink); void prepare_flow_spec_eth_ip(const ip_address &dst_ip, const ip_address &src_ip); void prepare_flow_spec_tcp_udp(); virtual void prepare_flow_spec() = 0; diff --git a/src/core/dev/rfs_mc.cpp b/src/core/dev/rfs_mc.cpp index 6aaa4f251..14f7fa064 100644 --- a/src/core/dev/rfs_mc.cpp +++ b/src/core/dev/rfs_mc.cpp @@ -34,6 +34,7 @@ #include "util/utils.h" #include "dev/rfs_mc.h" #include "dev/ring_simple.h" +#include "sock/sockinfo.h" #define MODULE_NAME "rfs_mc" diff --git a/src/core/dev/rfs_uc.cpp b/src/core/dev/rfs_uc.cpp index 8642fd4cd..277492957 100644 --- a/src/core/dev/rfs_uc.cpp +++ b/src/core/dev/rfs_uc.cpp @@ -37,6 +37,7 @@ #include "util/instrumentation.h" #include "sock/sock-redirect.h" #include "sock/sock-app.h" +#include "sock/sockinfo.h" #define MODULE_NAME "rfs_uc" diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index 1425af904..751e1ee3a 100644 --- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -43,7 +43,7 @@ /* Forward declarations */ struct xlio_tls_info; -class pkt_rcvr_sink; +class sockinfo; class rfs_rule; class poll_group; @@ -82,9 +82,8 @@ class ring { virtual void print_val(); - virtual bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, - bool force_5t = false) = 0; - virtual bool detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink) = 0; + virtual bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false) = 0; + virtual bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink) = 0; virtual void restart() = 0; diff --git a/src/core/dev/ring_bond.cpp b/src/core/dev/ring_bond.cpp index c6e32d0c9..5427f0e8d 100644 --- a/src/core/dev/ring_bond.cpp +++ b/src/core/dev/ring_bond.cpp @@ -99,7 +99,7 @@ void ring_bond::print_val() ((uintptr_t)this == (uintptr_t)m_parent ? 
nullptr : m_parent), "bond"); } -bool ring_bond::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t) +bool ring_bond::attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t) { bool ret = true; struct flow_sink_t value = {flow_spec_5t, sink}; @@ -117,7 +117,7 @@ bool ring_bond::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool return ret; } -bool ring_bond::detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink) +bool ring_bond::detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink) { bool ret = true; struct flow_sink_t value = {flow_spec_5t, sink}; diff --git a/src/core/dev/ring_bond.h b/src/core/dev/ring_bond.h index b7455b588..6c663b8b0 100644 --- a/src/core/dev/ring_bond.h +++ b/src/core/dev/ring_bond.h @@ -42,7 +42,7 @@ typedef std::vector ring_slave_vector_t; struct flow_sink_t { flow_tuple flow; - pkt_rcvr_sink *sink; + sockinfo *sink; }; class ring_bond : public ring { @@ -70,8 +70,8 @@ class ring_bond : public ring { virtual int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = nullptr); virtual int get_num_resources() const { return m_bond_rings.size(); }; - virtual bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t = false); - virtual bool detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink); + virtual bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false); + virtual bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink); virtual void restart(); virtual mem_buf_desc_t *mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, int n_num_mem_bufs = 1); diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index f90edaa51..1dded5607 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -158,7 +158,7 @@ void ring_slave::inc_tx_retransmissions_stats(ring_user_id_t) } template -bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, +bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t) { rfs *p_rfs; @@ -387,7 +387,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, return ret; } -bool ring_slave::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t) +bool ring_slave::attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t) { std::lock_guard lock(m_lock_ring_rx); @@ -397,7 +397,7 @@ bool ring_slave::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool } template -bool steering_handler::detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink) +bool steering_handler::detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink) { rfs *p_rfs = nullptr; @@ -521,7 +521,7 @@ bool steering_handler::detach_flow(flow_tuple &flow_spec_5t, return true; } -bool ring_slave::detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink) +bool ring_slave::detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink) { std::lock_guard lock(m_lock_ring_rx); diff --git a/src/core/dev/ring_slave.h b/src/core/dev/ring_slave.h index 8c7c8d0e2..439e972b6 100644 --- a/src/core/dev/ring_slave.h +++ b/src/core/dev/ring_slave.h @@ -247,8 +247,8 @@ template class steering_handler { { } - bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t = false); - bool detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink); + bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false); + bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink); inline 
bool rx_process_buffer_no_flow_id(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array, HDR *p_ip_h); @@ -292,8 +292,8 @@ class ring_slave : public ring { virtual int reclaim_recv_single_buffer(mem_buf_desc_t *rx_reuse) = 0; virtual void inc_cq_moderation_stats(size_t sz_data) = 0; - virtual bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t = false); - virtual bool detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink); + virtual bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false); + virtual bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink); #ifdef DEFINED_UTLS /* Call this method in an RX ring. */ diff --git a/src/core/dev/ring_tap.cpp b/src/core/dev/ring_tap.cpp index d7d0b7471..377372c1c 100644 --- a/src/core/dev/ring_tap.cpp +++ b/src/core/dev/ring_tap.cpp @@ -233,7 +233,7 @@ void ring_tap::tap_destroy() } } -bool ring_tap::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t) +bool ring_tap::attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t) { std::lock_guard lock(m_lock_ring_rx); bool ret = ring_slave::attach_flow(flow_spec_5t, sink, force_5t); @@ -254,7 +254,7 @@ bool ring_tap::attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool f return ret; } -bool ring_tap::detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink) +bool ring_tap::detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink) { std::lock_guard lock(m_lock_ring_rx); bool ret = ring_slave::detach_flow(flow_spec_5t, sink); diff --git a/src/core/dev/ring_tap.h b/src/core/dev/ring_tap.h index 4ef935c52..67777e9b3 100644 --- a/src/core/dev/ring_tap.h +++ b/src/core/dev/ring_tap.h @@ -42,10 +42,9 @@ class ring_tap : public ring_slave { virtual ~ring_tap(); virtual bool is_up() { return (m_vf_ring || m_active); } - virtual bool attach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink, bool force_5t = false); - virtual bool detach_flow(flow_tuple &flow_spec_5t, pkt_rcvr_sink *sink); - virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, - void *pv_fd_ready_array = nullptr); + virtual bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false); + virtual bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink); + virtual int poll_and_process_element_rx(uint64_t *p_cq_poll_sn, void *pv_fd_ready_array = NULL); virtual int poll_and_process_element_tx(uint64_t *p_cq_poll_sn) { NOT_IN_USE(p_cq_poll_sn); diff --git a/src/core/proto/xlio_lwip.h b/src/core/proto/xlio_lwip.h index 0b50ff610..dff65f502 100644 --- a/src/core/proto/xlio_lwip.h +++ b/src/core/proto/xlio_lwip.h @@ -35,7 +35,6 @@ #include "core/event/timer_handler.h" #include "core/proto/mem_buf_desc.h" -#include "core/sock/pkt_rcvr_sink.h" #include "core/lwip/tcp.h" typedef enum xlio_wr_tx_packet_attr { diff --git a/src/core/sock/pkt_rcvr_sink.h b/src/core/sock/pkt_rcvr_sink.h deleted file mode 100644 index bfc71630d..000000000 --- a/src/core/sock/pkt_rcvr_sink.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef PKT_RECVR_SINK_H -#define PKT_RECVR_SINK_H - -class mem_buf_desc_t; -class ring; - -/* - * Class pkt_rcvr_sink - * An object must implement pkt_rcvr_sink to register with ib_conn_mgr_base - * The rx_joined_notify_cb() will be called when the IBCM is ready to start - * receiving packets (MC join is complete and CQ is mapped). - * The rx_diconnect_notify_cb() will be called before the IB stops receiving - * packets (CQ is being removed and MC leave is called). - * The rx_pkt_notify_cb() will be called when a ip packet is in the ready q for the socket. - * The implementing object should register the information and release calling context immediately. 
- * When no packet receivers (or transmitters) are registered the objects will be deleted - */ -class pkt_rcvr_sink { -public: - virtual ~pkt_rcvr_sink() = default; - - // Callback from lower layer notifying new receive packets - // Return: 'true' if object queuing this receive packet - // 'false' if not interested in this receive packet - virtual bool rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, - void *pv_fd_ready_array) = 0; - - // Callback from lower layer notifying completion of RX registration process - virtual void rx_add_ring_cb(ring *p_ring) = 0; - - // Callback from lower layer notifying before RX resources deallocation - virtual void rx_del_ring_cb(ring *p_ring) = 0; -}; - -#endif diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 30281904e..b203a12df 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -51,7 +51,6 @@ #include "dev/ring_allocation_logic.h" #include "socket_fd_api.h" -#include "pkt_rcvr_sink.h" #include "sock-redirect.h" #include "sock-app.h" @@ -154,7 +153,7 @@ typedef std::unordered_map rx_ring_map_t; // see route.c in Linux kernel const uint8_t ip_tos2prio[16] = {0, 0, 0, 0, 2, 2, 2, 2, 6, 6, 6, 6, 4, 4, 4, 4}; -class sockinfo : public socket_fd_api, public pkt_rcvr_sink, public wakeup_pipe { +class sockinfo : public socket_fd_api, public wakeup_pipe { public: sockinfo(int fd, int domain, bool use_ring_locks); ~sockinfo() override; @@ -167,6 +166,12 @@ class sockinfo : public socket_fd_api, public pkt_rcvr_sink, public wakeup_pipe SOCKINFO_DESTROYING }; + // Callback from lower layer notifying new receive packets + // Return: 'true' if object queuing this receive packet + // 'false' if not interested in this receive packet + virtual bool rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, + void *pv_fd_ready_array) = 0; + #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) void copy_sockopt_fork(const socket_fd_api *copy_from) override; #endif @@ -283,8 +288,8 @@ class sockinfo : public socket_fd_api, public pkt_rcvr_sink, public wakeup_pipe const struct sockaddr *sock_addr_second = nullptr); // This callback will notify that socket is ready to receive and map the cq. - void rx_add_ring_cb(ring *p_ring) override; - void rx_del_ring_cb(ring *p_ring) override; + virtual void rx_add_ring_cb(ring *p_ring); + virtual void rx_del_ring_cb(ring *p_ring); virtual void lock_rx_q() { m_lock_rcv.lock(); } virtual void unlock_rx_q() { m_lock_rcv.unlock(); } diff --git a/src/core/sock/sockinfo_udp.h b/src/core/sock/sockinfo_udp.h index 48d988c97..eeadd4ab1 100644 --- a/src/core/sock/sockinfo_udp.h +++ b/src/core/sock/sockinfo_udp.h @@ -48,7 +48,6 @@ #include "proto/mem_buf_desc.h" #include "proto/dst_entry_udp.h" -#include "pkt_rcvr_sink.h" #include "sock-redirect.h" #include "sockinfo.h" From a1cb2091cecc4754a9ee4d0fbb911f0ebe0d7f9a Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Tue, 13 Feb 2024 13:51:32 +0200 Subject: [PATCH 111/169] issue: 3777348 Simplifying timers for TCP sockets Removing inheritance of timer_handler from sockinfo_tcp allows o order fields in cache friendly manner. timer_hander add vptr ad related fields in an undesired location in the object layout. sockinfo_tcp is always handled by tcp_timer_collection which itslef is timer_handler. Thus, there is no real need in registering the socket itself into the internal thread. This removes complex code with handling groups inside timers registration which are used only for tcp_collection. 
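Illustrative aside (not part of the patch): the layout cost cited in this and the earlier commit messages in the series ("adds undesired vptr to inheriting classes", "vptr and related fields in an undesired location") can be reproduced with a standalone sketch. The types below are hypothetical stand-ins, not the XLIO classes, and the quoted sizes assume a typical LP64 Itanium C++ ABI.

    #include <cstdio>

    struct rx_sink { virtual ~rx_sink() = default; };     // empty polymorphic interface
    struct tx_source { virtual ~tx_source() = default; }; // another empty interface
    struct base_api {
        virtual ~base_api() = default;
        int fd = -1;
    };

    struct sock_lean : base_api {};                    // one vtable pointer
    struct sock_fat : base_api, rx_sink, tx_source {}; // three base subobjects, three vtable pointers

    int main()
    {
        // Typically prints 16 and 32: every extra polymorphic base carries its own
        // vtable pointer and pushes the derived class's own members further out.
        std::printf("lean=%zu fat=%zu\n", sizeof(sock_lean), sizeof(sock_fat));
        return 0;
    }

Dropping a base class whose subobject holds nothing but a vtable pointer is exactly the per-object saving these patches are after.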
For XLIO_TCP_CTL_THREAD=with_wakeup/no_wakeup we put the listen socket events to be handled as part of tcp_timer_collections instead of being registered directly on internal thread. Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 1 - src/core/event/delta_timer.h | 2 - src/core/event/event_handler_manager.cpp | 61 ++++++--- src/core/event/event_handler_manager.h | 10 +- src/core/event/poll_group.cpp | 3 +- src/core/event/timers_group.h | 61 --------- src/core/main.cpp | 4 +- src/core/sock/sockinfo_tcp.cpp | 164 ++++++++--------------- src/core/sock/sockinfo_tcp.h | 47 +++---- 9 files changed, 126 insertions(+), 227 deletions(-) delete mode 100644 src/core/event/timers_group.h diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 98ec13979..164b5140f 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -216,7 +216,6 @@ libxlio_la_SOURCES := \ event/netlink_event.h \ event/poll_group.h \ event/timer_handler.h \ - event/timers_group.h \ event/vlogger_timer_handler.h \ \ ib/base/verbs_extra.h \ diff --git a/src/core/event/delta_timer.h b/src/core/event/delta_timer.h index c1dc228f2..311351fcc 100644 --- a/src/core/event/delta_timer.h +++ b/src/core/event/delta_timer.h @@ -39,7 +39,6 @@ #define INFINITE_TIMEOUT (-1) class timer_handler; -class timers_group; enum timer_req_type_t { // reregister itself every after timer expires. (the client doesn't need to reregister) @@ -65,7 +64,6 @@ struct timer_node_t { /* link to the context registered */ timer_handler *handler; void *user_data; - timers_group *group; timer_req_type_t req_type; struct timer_node_t *next; struct timer_node_t *prev; diff --git a/src/core/event/event_handler_manager.cpp b/src/core/event/event_handler_manager.cpp index edf4fe946..984aaa96f 100644 --- a/src/core/event/event_handler_manager.cpp +++ b/src/core/event/event_handler_manager.cpp @@ -42,7 +42,7 @@ #include "timer_handler.h" #include "event_handler_ibverbs.h" #include "event_handler_rdma_cm.h" - +#include "core/sock/sockinfo_tcp.h" #include "core/util/instrumentation.h" #define MODULE_NAME "evh:" @@ -86,8 +86,7 @@ event_handler_manager *g_p_event_handler_manager = nullptr; pthread_t g_n_internal_thread_id = 0; void *event_handler_manager::register_timer_event(int timeout_msec, timer_handler *handler, - timer_req_type_t req_type, void *user_data, - timers_group *group /* = NULL */) + timer_req_type_t req_type, void *user_data) { evh_logdbg("timer handler '%p' registered %s timer for %d msec (user data: %p)", handler, timer_req_type_str(req_type), timeout_msec, user_data); @@ -115,7 +114,6 @@ void *event_handler_manager::register_timer_event(int timeout_msec, timer_handle reg_action.type = REGISTER_TIMER; reg_action.info.timer.handler = handler; reg_action.info.timer.user_data = user_data; - reg_action.info.timer.group = group; reg_action.info.timer.node = node; reg_action.info.timer.timeout_msec = timeout_msec; reg_action.info.timer.req_type = req_type; @@ -123,6 +121,16 @@ void *event_handler_manager::register_timer_event(int timeout_msec, timer_handle return node; } +void event_handler_manager::register_socket_timer_event(sockinfo_tcp *sock_tcp) +{ + evh_logdbg("Registering TCP socket timer: %p", sock_tcp); + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = REGISTER_TCP_SOCKET_TIMER; + reg_action.info.timer.user_data = sock_tcp; + post_new_reg_action(reg_action); +} + void event_handler_manager::wakeup_timer_event(timer_handler *handler, void *node) { evh_logdbg("timer handler '%p'", handler); @@ 
-138,7 +146,6 @@ void event_handler_manager::wakeup_timer_event(timer_handler *handler, void *nod reg_action.info.timer.handler = handler; reg_action.info.timer.node = node; post_new_reg_action(reg_action); - return; } void event_handler_manager::unregister_timer_event(timer_handler *handler, void *node) @@ -175,6 +182,16 @@ void event_handler_manager::unregister_timers_event_and_delete(timer_handler *ha post_new_reg_action(reg_action); } +void event_handler_manager::unregister_socket_timer_and_delete(sockinfo_tcp *sock_tcp) +{ + evh_logdbg("Unregistering TCP socket timer: %p", sock_tcp); + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = UNREGISTER_TCP_SOCKET_TIMER_AND_DELETE; + reg_action.info.timer.user_data = sock_tcp; + post_new_reg_action(reg_action); +} + void event_handler_manager::register_ibverbs_event(int fd, event_handler_ibverbs *handler, void *channel, void *user_data) { @@ -425,6 +442,10 @@ void event_handler_manager::update_epfd(int fd, int operation, int events) const char *event_handler_manager::reg_action_str(event_action_type_e reg_action_type) { switch (reg_action_type) { + case REGISTER_TCP_SOCKET_TIMER: + return "REGISTER_TCP_SOCKET_TIMER"; + case UNREGISTER_TCP_SOCKET_TIMER_AND_DELETE: + return "UNREGISTER_TCP_SOCKET_TIMER_AND_DELETE"; case REGISTER_TIMER: return "REGISTER_TIMER"; case UNREGISTER_TIMER: @@ -475,30 +496,18 @@ void event_handler_manager::post_new_reg_action(reg_action_t ®_action) void event_handler_manager::priv_register_timer_handler(timer_reg_info_t &info) { - if (info.group) { - info.group->add_new_timer((timer_node_t *)info.node, info.handler, info.user_data); - } else { - m_timer.add_new_timer(info.timeout_msec, (timer_node_t *)info.node, info.handler, - info.user_data, info.req_type); - } + m_timer.add_new_timer(info.timeout_msec, (timer_node_t *)info.node, info.handler, + info.user_data, info.req_type); } void event_handler_manager::priv_wakeup_timer_handler(timer_reg_info_t &info) { - timer_node_t *node = (timer_node_t *)info.node; - if (node && !node->group) { - m_timer.wakeup_timer(node); - } + m_timer.wakeup_timer((timer_node_t *)info.node); } void event_handler_manager::priv_unregister_timer_handler(timer_reg_info_t &info) { - timer_node_t *node = (timer_node_t *)info.node; - if (node && node->group) { - node->group->remove_timer((timer_node_t *)info.node); - } else { - m_timer.remove_timer(node, info.handler); - } + m_timer.remove_timer((timer_node_t *)info.node, info.handler); } void event_handler_manager::priv_unregister_all_handler_timers(timer_reg_info_t &info) @@ -729,7 +738,17 @@ void event_handler_manager::handle_registration_action(reg_action_t ®_action) } evh_logfunc("event action %d", reg_action.type); + sockinfo_tcp *sock; switch (reg_action.type) { + case REGISTER_TCP_SOCKET_TIMER: + sock = reinterpret_cast(reg_action.info.timer.user_data); + sock->get_tcp_timer_collection()->add_new_timer(sock); + break; + case UNREGISTER_TCP_SOCKET_TIMER_AND_DELETE: + sock = reinterpret_cast(reg_action.info.timer.user_data); + sock->get_tcp_timer_collection()->remove_timer(sock); + delete sock; + break; case REGISTER_TIMER: priv_register_timer_handler(reg_action.info.timer); break; diff --git a/src/core/event/event_handler_manager.h b/src/core/event/event_handler_manager.h index 43b094bbe..c9e2ba69f 100644 --- a/src/core/event/event_handler_manager.h +++ b/src/core/event/event_handler_manager.h @@ -42,17 +42,19 @@ #include "core/infra/subject_observer.h" #include "core/event/command.h" #include 
"core/event/delta_timer.h" -#include "core/event/timers_group.h" #include "core/util/xlio_stats.h" class timer_handler; class event_handler_ibverbs; class event_handler_rdma_cm; +class sockinfo_tcp; typedef std::map event_handler_rdma_cm_map_t; typedef enum { + REGISTER_TCP_SOCKET_TIMER, + UNREGISTER_TCP_SOCKET_TIMER_AND_DELETE, REGISTER_TIMER, WAKEUP_TIMER, /* NOT AVAILABLE FOR GROUPED TIMERS */ UNREGISTER_TIMER, @@ -94,7 +96,6 @@ struct timer_reg_info_t { void *node; unsigned int timeout_msec; void *user_data; - timers_group *group; timer_req_type_t req_type; }; @@ -158,11 +159,14 @@ class event_handler_manager : public wakeup_pipe { ~event_handler_manager(); void *register_timer_event(int timeout_msec, timer_handler *handler, timer_req_type_t req_type, - void *user_data, timers_group *group = nullptr); + void *user_data); void wakeup_timer_event(timer_handler *handler, void *node); void unregister_timer_event(timer_handler *handler, void *node); void unregister_timers_event_and_delete(timer_handler *handler); + void register_socket_timer_event(sockinfo_tcp *sock_tcp); + void unregister_socket_timer_and_delete(sockinfo_tcp *sock_tcp); + void register_ibverbs_event(int fd, event_handler_ibverbs *handler, void *channel, void *user_data); void unregister_ibverbs_event(int fd, event_handler_ibverbs *handler); diff --git a/src/core/event/poll_group.cpp b/src/core/event/poll_group.cpp index 66f60b714..977374551 100644 --- a/src/core/event/poll_group.cpp +++ b/src/core/event/poll_group.cpp @@ -59,8 +59,7 @@ poll_group::poll_group(const struct xlio_poll_group_attr *attr) m_rings.reserve(2); m_event_handler = std::make_unique(); - m_tcp_timers = std::make_unique( - safe_mce_sys().tcp_timer_resolution_msec, safe_mce_sys().tcp_timer_resolution_msec); + m_tcp_timers = std::make_unique(1U); m_tcp_timers->set_group(this); } diff --git a/src/core/event/timers_group.h b/src/core/event/timers_group.h deleted file mode 100644 index aa67a383a..000000000 --- a/src/core/event/timers_group.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef TIMERS_GROUP_H -#define TIMERS_GROUP_H - -/* - * This is an API for batching timers into groups. - * Instead of registering each timer separately into the internal thread, the group is registered - * once, and the timers are registered to the group. The registration to the group is still done - * through the internal thread. The group must be deleted through the internal thread (must - * implement clean_obj interface). Registering to group must be used with register_timer_event() and - * unregister_timer_event() only. - */ -class timers_group : public timer_handler { -public: - virtual ~timers_group() {}; - // execute all the timers registered to the group - // according to the internal group logic. - virtual void handle_timer_expired(void *user_data) = 0; - -protected: - friend class event_handler_manager; - // add a new timer - virtual void add_new_timer(timer_node_t *node, timer_handler *handler, void *user_data) = 0; - - // remove timer from list and free it. - // called for stopping (unregistering) a timer - virtual void remove_timer(timer_node_t *node) = 0; -}; - -#endif diff --git a/src/core/main.cpp b/src/core/main.cpp index 738e95506..0c5467366 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -1102,9 +1102,7 @@ static void do_global_ctors_helper() // For delegated TCP timers the global collection is not used. if (safe_mce_sys().tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { - NEW_CTOR(g_tcp_timers_collection, - tcp_timers_collection(safe_mce_sys().tcp_timer_resolution_msec, - safe_mce_sys().timer_resolution_msec)); + NEW_CTOR(g_tcp_timers_collection, tcp_timers_collection()); } NEW_CTOR(g_p_vlogger_timer_handler, vlogger_timer_handler()); diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index fdfd63673..09245295a 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -289,7 +289,6 @@ static inline bool use_socket_ring_locks() sockinfo_tcp::sockinfo_tcp(int fd, int domain) : sockinfo(fd, domain, use_socket_ring_locks()) - , m_timer_handle(nullptr) , m_tcp_con_lock(get_new_tcp_lock()) , m_sysvar_buffer_batching_mode(safe_mce_sys().buffer_batching_mode) , m_sysvar_tx_segs_batch_tcp(safe_mce_sys().tx_segs_batch_tcp) @@ -597,23 +596,16 @@ void sockinfo_tcp::clean_socket_obj() if (is_cleaned()) { return; } - m_is_cleaned = true; - event_handler_manager *p_event_mgr = get_event_mgr(); + unlock_tcp_con(); + + event_handler_manager *p_event_mgr = get_event_mgr(); bool delegated_timers_exit = g_b_exit && (safe_mce_sys().tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS); - /* Remove group timers from g_tcp_timers_collection */ - if (p_event_mgr->is_running() && m_timer_handle && !delegated_timers_exit) { - p_event_mgr->unregister_timer_event(this, m_timer_handle); - } - - m_timer_handle = nullptr; - unlock_tcp_con(); - if (p_event_mgr->is_running() && !delegated_timers_exit) { - p_event_mgr->unregister_timers_event_and_delete(this); + p_event_mgr->unregister_socket_timer_and_delete(this); } else { delete this; } @@ -1829,9 +1821,8 @@ void sockinfo_tcp::process_rx_ctl_packets() } // Execute TCP timers of this connection -void sockinfo_tcp::handle_timer_expired(void *user_data) +void sockinfo_tcp::handle_timer_expired() { - NOT_IN_USE(user_data); si_tcp_logfunc(""); if (tcp_ctl_thread_on(m_sysvar_tcp_ctl_thread)) { @@ -2509,18 +2500,10 @@ ssize_t sockinfo_tcp::rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov void sockinfo_tcp::register_timer() { - if 
(!m_timer_handle) { - si_tcp_logdbg("Registering TCP socket timer: socket: %p, thread-col: %p, global-col: %p", - this, get_tcp_timer_collection(), g_tcp_timers_collection); + si_tcp_logdbg("Registering TCP socket timer: socket: %p, thread-col: %p, global-col: %p", this, + get_tcp_timer_collection(), g_tcp_timers_collection); - /* user_data is the socket itself for a fast cast in the timer_expired(). */ - m_timer_handle = get_event_mgr()->register_timer_event( - safe_mce_sys().tcp_timer_resolution_msec, this, PERIODIC_TIMER, - reinterpret_cast(this), get_tcp_timer_collection()); - } else { - si_tcp_logdbg("register_timer was called more than once. Something might be wrong, or " - "connect was called twice."); - } + get_event_mgr()->register_socket_timer_event(this); } void sockinfo_tcp::queue_rx_ctl_packet(struct tcp_pcb *pcb, mem_buf_desc_t *p_desc) @@ -2540,7 +2523,7 @@ void sockinfo_tcp::queue_rx_ctl_packet(struct tcp_pcb *pcb, mem_buf_desc_t *p_de } if (m_sysvar_tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_WITH_WAKEUP) { - g_p_event_handler_manager->wakeup_timer_event(this, m_timer_handle); + get_tcp_timer_collection()->register_wakeup_event(); } return; @@ -3095,8 +3078,7 @@ int sockinfo_tcp::listen(int backlog) BULLSEYE_EXCLUDE_BLOCK_END if (tcp_ctl_thread_on(m_sysvar_tcp_ctl_thread)) { - m_timer_handle = g_p_event_handler_manager->register_timer_event( - safe_mce_sys().timer_resolution_msec, this, PERIODIC_TIMER, 0, NULL); + g_p_event_handler_manager->register_socket_timer_event(this); } unlock_tcp_con(); @@ -3224,7 +3206,7 @@ int sockinfo_tcp::accept_helper(struct sockaddr *__addr, socklen_t *__addrlen, if (m_sysvar_tcp_ctl_thread == option_tcp_ctl_thread::CTL_THREAD_WITH_WAKEUP && !m_rx_peer_packets.empty()) { - g_p_event_handler_manager->wakeup_timer_event(this, m_timer_handle); + get_tcp_timer_collection()->register_wakeup_event(); } unlock_tcp_con(); @@ -5939,25 +5921,16 @@ void sockinfo_tcp::put_tcp_seg_cached(struct tcp_seg *seg) } } -tcp_timers_collection::tcp_timers_collection(int period, int resolution) +tcp_timers_collection::tcp_timers_collection() + : tcp_timers_collection(safe_mce_sys().tcp_timer_resolution_msec / + safe_mce_sys().timer_resolution_msec) { - m_n_period = period; - m_n_resolution = resolution; - m_n_intervals_size = period / resolution; - m_timer_handle = nullptr; - m_p_intervals = new timer_node_t *[m_n_intervals_size]; - BULLSEYE_EXCLUDE_BLOCK_START - if (!m_p_intervals) { - __log_dbg("failed to allocate memory"); - free_tta_resources(); - throw_xlio_exception("failed to allocate memory"); - } +} - BULLSEYE_EXCLUDE_BLOCK_END - memset(m_p_intervals, 0, sizeof(timer_node_t *) * m_n_intervals_size); - m_n_location = 0; - m_n_next_insert_bucket = 0; - m_n_count = 0; +tcp_timers_collection::tcp_timers_collection(int intervals) +{ + m_n_intervals_size = intervals; + m_p_intervals.resize(m_n_intervals_size); } tcp_timers_collection::~tcp_timers_collection() @@ -5979,19 +5952,15 @@ event_handler_manager *tcp_timers_collection::get_event_mgr() void tcp_timers_collection::free_tta_resources() { - if (m_n_count) { - for (int i = 0; i < m_n_intervals_size; i++) { - if (m_p_intervals[i]) { - remove_timer(m_p_intervals[i]); - } - } - - if (m_n_count) { - __log_dbg("not all TCP timers have been removed, count=%d", m_n_count); + for (auto &bucket : m_p_intervals) { + while (!bucket.empty()) { + remove_timer(bucket.front()); } } - delete[] m_p_intervals; + if (m_n_count) { + __log_dbg("Not all TCP socket timers have been removed, count=%d", m_n_count); + } } 
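Illustrative aside (not part of the patch): the add_new_timer()/remove_timer() rewrite just below replaces the intrusive timer_node_t chains with per-bucket std::list containers plus a map from socket to its bucket index and list iterator. A minimal standalone distillation of that shape, with simplified placeholder names, assuming removal happens outside the bucket walk (the patch defers it through the event-handler registration queue rather than erasing in place).

    #include <cstddef>
    #include <iterator>
    #include <list>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    class tcp_sock; // placeholder; pointers to an incomplete type are fine here

    class bucketed_timers {
    public:
        explicit bucketed_timers(std::size_t intervals) : m_buckets(intervals) {}

        void add(tcp_sock *s)
        {
            auto &bucket = m_buckets[m_next_insert];
            bucket.push_back(s);
            // Remember bucket index + list iterator for O(1) removal later.
            m_where.emplace(s, std::make_pair(m_next_insert, std::prev(bucket.end())));
            m_next_insert = (m_next_insert + 1) % m_buckets.size();
        }

        void remove(tcp_sock *s)
        {
            auto it = m_where.find(s);
            if (it != m_where.end()) {
                m_buckets[it->second.first].erase(it->second.second);
                m_where.erase(it);
            }
        }

        // One bucket is serviced per base-resolution tick, so every socket is
        // visited once per full rotation of the buckets.
        template <typename F> void tick(F &&expire)
        {
            for (tcp_sock *s : m_buckets[m_cursor]) {
                expire(s); // must not call remove() on sockets of this bucket
            }
            m_cursor = (m_cursor + 1) % m_buckets.size();
        }

    private:
        using bucket_t = std::list<tcp_sock *>;
        std::vector<bucket_t> m_buckets;
        std::unordered_map<tcp_sock *, std::pair<std::size_t, bucket_t::iterator>> m_where;
        std::size_t m_cursor = 0;
        std::size_t m_next_insert = 0;
    };

Keeping list iterators in the map is safe because std::list never invalidates iterators to other elements on insert or erase; that property is what makes the removal path constant time without walking the bucket.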
void tcp_timers_collection::clean_obj() @@ -6014,14 +5983,10 @@ void tcp_timers_collection::clean_obj() void tcp_timers_collection::handle_timer_expired(void *user_data) { NOT_IN_USE(user_data); - timer_node_t *iter = m_p_intervals[m_n_location]; - sockinfo_tcp *p_sock; - + sock_list &bucket = m_p_intervals[m_n_location]; m_n_location = (m_n_location + 1) % m_n_intervals_size; - while (iter) { - p_sock = reinterpret_cast(iter->user_data); - + for (sockinfo_tcp *p_sock : bucket) { /* It is not guaranteed that the same sockinfo object is met once * in this loop. * So in case sockinfo object is destroyed other processing @@ -6031,7 +5996,7 @@ void tcp_timers_collection::handle_timer_expired(void *user_data) if (!p_sock->trylock_tcp_con()) { bool destroyable = false; if (!p_sock->is_cleaned()) { - p_sock->handle_timer_expired(iter->user_data); + p_sock->handle_timer_expired(); destroyable = p_sock->is_destroyable_no_lock(); } p_sock->unlock_tcp_con(); @@ -6039,7 +6004,6 @@ void tcp_timers_collection::handle_timer_expired(void *user_data) g_p_fd_collection->destroy_sockfd(p_sock); } } - iter = iter->next; } /* Processing all messages for the daemon */ @@ -6048,69 +6012,51 @@ void tcp_timers_collection::handle_timer_expired(void *user_data) } } -void tcp_timers_collection::add_new_timer(timer_node_t *node, timer_handler *handler, - void *user_data) +void tcp_timers_collection::add_new_timer(sockinfo_tcp *sock) { - node->handler = handler; - node->user_data = user_data; - node->group = this; - node->next = nullptr; - node->prev = nullptr; - if (m_p_intervals[m_n_next_insert_bucket]) { - m_p_intervals[m_n_next_insert_bucket]->prev = node; - node->next = m_p_intervals[m_n_next_insert_bucket]; + if (!sock) { + __log_warn("Trying to add timer for null TCP socket %p", sock); + return; } - m_p_intervals[m_n_next_insert_bucket] = node; + + sock_list &bucket = m_p_intervals[m_n_next_insert_bucket]; + bucket.emplace_back(sock); + m_sock_remove_map.emplace(sock, std::make_tuple(m_n_next_insert_bucket, --(bucket.end()))); m_n_next_insert_bucket = (m_n_next_insert_bucket + 1) % m_n_intervals_size; - if (m_n_count == 0) { - m_timer_handle = - get_event_mgr()->register_timer_event(m_n_resolution, this, PERIODIC_TIMER, nullptr); + if (0 == m_n_count++) { + m_timer_handle = get_event_mgr()->register_timer_event(safe_mce_sys().timer_resolution_msec, + this, PERIODIC_TIMER, nullptr); } - m_n_count++; - __log_dbg("new TCP timer handler [%p] was added", handler); + __log_dbg("New TCP socket [%p] timer was added", sock); } -void tcp_timers_collection::remove_timer(timer_node_t *node) +void tcp_timers_collection::remove_timer(sockinfo_tcp *sock) { - if (!node) { - return; - } + auto node = m_sock_remove_map.find(sock); + if (node != m_sock_remove_map.end()) { + m_p_intervals[std::get<0>(node->second)].erase(std::get<1>(node->second)); + m_sock_remove_map.erase(node); - node->group = nullptr; - - if (node->prev) { - node->prev->next = node->next; - } else { - for (int i = 0; i < m_n_intervals_size; i++) { - if (m_p_intervals[i] == node) { - m_p_intervals[i] = node->next; - break; + if (!(--m_n_count)) { + if (m_timer_handle) { + get_event_mgr()->unregister_timer_event(this, m_timer_handle); + m_timer_handle = nullptr; } } - } - - if (node->next) { - node->next->prev = node->prev; - } - m_n_count--; - if (m_n_count == 0) { - if (m_timer_handle) { - get_event_mgr()->unregister_timer_event(this, m_timer_handle); - m_timer_handle = nullptr; - } + __log_dbg("TCP socket [%p] timer was removed", sock); } +} - __log_dbg("TCP 
timer handler [%p] was removed", node->handler); - - free(node); +void tcp_timers_collection::register_wakeup_event() +{ + g_p_event_handler_manager->wakeup_timer_event(this, m_timer_handle); } thread_local_tcp_timers::thread_local_tcp_timers() - : tcp_timers_collection(safe_mce_sys().tcp_timer_resolution_msec, - safe_mce_sys().tcp_timer_resolution_msec) + : tcp_timers_collection(1) { } diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 234e5492c..ff30916bf 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -124,42 +124,41 @@ struct socket_option_t { } }; -class tcp_timers_collection : public timers_group, public cleanable_obj { +class tcp_timers_collection : public timer_handler, public cleanable_obj { public: - tcp_timers_collection(int period, int resolution); + tcp_timers_collection(); + tcp_timers_collection(int intervals); ~tcp_timers_collection() override; void clean_obj() override; void handle_timer_expired(void *user_data) override; - void set_group(poll_group *group) { m_p_group = group; } - inline event_handler_manager *get_event_mgr(); + void register_wakeup_event(); -protected: - // add a new timer - void add_new_timer(timer_node_t *node, timer_handler *handler, void *user_data) override; + void add_new_timer(sockinfo_tcp *sock); - // remove timer from list and free it. - // called for stopping (unregistering) a timer - void remove_timer(timer_node_t *node) override; + void remove_timer(sockinfo_tcp *sock); + + void set_group(poll_group *group) { m_p_group = group; } + inline event_handler_manager *get_event_mgr(); private: void free_tta_resources(); protected: - void *m_timer_handle; + void *m_timer_handle = nullptr; private: - timer_node_t **m_p_intervals; - int m_n_period; - int m_n_resolution; + typedef std::list sock_list; + typedef typename sock_list::iterator sock_list_itr; + std::vector m_p_intervals; + std::unordered_map> m_sock_remove_map; int m_n_intervals_size; - int m_n_location; - int m_n_count; - int m_n_next_insert_bucket; - + int m_n_location = 0; + int m_n_count = 0; + int m_n_next_insert_bucket = 0; poll_group *m_p_group = nullptr; }; @@ -169,6 +168,8 @@ class thread_local_tcp_timers : public tcp_timers_collection { ~thread_local_tcp_timers() override; }; +extern tcp_timers_collection *g_tcp_timers_collection; + typedef std::deque socket_options_list_t; typedef std::map ready_pcb_map_t; typedef std::map syn_received_map_t; @@ -183,7 +184,7 @@ enum inet_ecns { INET_ECN_MASK = 3, }; -class sockinfo_tcp : public sockinfo, public timer_handler { +class sockinfo_tcp : public sockinfo { public: static inline size_t accepted_conns_node_offset() { @@ -342,7 +343,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { inline fd_type_t get_type() override { return FD_TYPE_SOCKET; } - void handle_timer_expired(void *user_data) override; + void handle_timer_expired(); inline ib_ctx_handler *get_ctx() { @@ -382,6 +383,7 @@ class sockinfo_tcp : public sockinfo, public timer_handler { inline void lock_tcp_con() { m_tcp_con_lock.lock(); } inline void unlock_tcp_con() { m_tcp_con_lock.unlock(); } inline void set_reguired_send_block(unsigned sz) { m_required_send_block = sz; } + tcp_timers_collection *get_tcp_timer_collection(); bool is_cleaned() const { return m_is_cleaned; } static err_t rx_lwip_cb(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); static err_t rx_lwip_cb_socketxtreme(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, @@ -589,7 +591,6 @@ class sockinfo_tcp : public sockinfo, public 
timer_handler { bool is_connected_and_ready_to_send(); inline event_handler_manager *get_event_mgr(); - inline tcp_timers_collection *get_tcp_timer_collection(); public: static const int CONNECT_DEFAULT_TIMEOUT_MS = 10000; @@ -639,7 +640,6 @@ class sockinfo_tcp : public sockinfo, public timer_handler { uint32_t m_ready_conn_cnt; int m_backlog; - void *m_timer_handle; multilock m_tcp_con_lock; // used for reporting 'connected' on second non-blocking call to connect or @@ -671,7 +671,6 @@ class sockinfo_tcp : public sockinfo, public timer_handler { uint64_t m_user_huge_page_mask; unsigned m_required_send_block; uint16_t m_external_vlan_tag = 0U; - /* * Storage API * TODO Move the fields to proper cold/hot sections in the final version. @@ -681,6 +680,4 @@ class sockinfo_tcp : public sockinfo, public timer_handler { poll_group *m_p_group = nullptr; }; -extern tcp_timers_collection *g_tcp_timers_collection; - #endif From 19441e41a2d587a397b61128a6982421074de05a Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 14 Feb 2024 16:32:39 +0200 Subject: [PATCH 112/169] issue: 3777348 Moving wakeup_pipe to be a member of sockinfo Having sockinfo inheriting from wakeup_pipe inserts wakeup_pipe members in an undesired location in term of cache utilization for data path. The same functionallity can be achieved by having the wakeup_pipe as a member. Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo.cpp | 6 +++--- src/core/sock/sockinfo.h | 3 ++- src/core/sock/sockinfo_tcp.cpp | 34 +++++++++++++++++----------------- src/core/sock/sockinfo_udp.cpp | 14 +++++++------- src/core/util/wakeup.cpp | 10 +++++----- src/core/util/wakeup.h | 8 ++------ src/core/util/wakeup_pipe.cpp | 6 +++--- 7 files changed, 39 insertions(+), 42 deletions(-) diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 196f89ff8..c039d64c7 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -106,7 +106,7 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) if (unlikely(m_rx_epfd == -1)) { throw_xlio_exception("create internal epoll"); } - wakeup_set_epoll_fd(m_rx_epfd); + m_sock_wakeup_pipe.wakeup_set_epoll_fd(m_rx_epfd); if (m_fd == SOCKET_FAKE_FD) { m_fd = m_rx_epfd; m_fd_context = (void *)((uintptr_t)m_fd); @@ -1587,8 +1587,8 @@ void sockinfo::rx_add_ring_cb(ring *p_ring) add_cqfd_to_sock_rx_epfd(p_ring); } - do_wakeup(); // A ready wce can be pending due to the drain logic (cq channel will not wake - // up by itself) + // A ready wce can be pending due to the drain logic (cq channel will not wake up by itself) + m_sock_wakeup_pipe.do_wakeup(); } else { // Increase ref count on cq_mgr_rx object rx_ring_iter->second->refcnt++; diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index b203a12df..40c56c5ff 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -153,7 +153,7 @@ typedef std::unordered_map rx_ring_map_t; // see route.c in Linux kernel const uint8_t ip_tos2prio[16] = {0, 0, 0, 0, 2, 2, 2, 2, 6, 6, 6, 6, 4, 4, 4, 4}; -class sockinfo : public socket_fd_api, public wakeup_pipe { +class sockinfo : public socket_fd_api { public: sockinfo(int fd, int domain, bool use_ring_locks); ~sockinfo() override; @@ -585,6 +585,7 @@ class sockinfo : public socket_fd_api, public wakeup_pipe { sa_family_t m_family; sock_addr m_bound; sock_addr m_connected; + wakeup_pipe m_sock_wakeup_pipe; dst_entry *m_p_connected_dst_entry; ip_addr m_so_bindtodevice_ip; diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 
09245295a..6dd829d94 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -529,7 +529,7 @@ sockinfo_tcp::~sockinfo_tcp() prepare_to_close(true); } - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); if (m_ops_tcp != m_ops) { delete m_ops_tcp; @@ -758,7 +758,7 @@ bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) } NOTIFY_ON_EVENTS(this, EPOLLHUP); - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); if (m_econtext) { m_econtext->fd_closed(m_fd); @@ -1608,7 +1608,7 @@ void sockinfo_tcp::err_lwip_cb(void *pcb_container, err_t err) conn->m_sock_state = TCP_SOCK_INITED; } - conn->do_wakeup(); + conn->m_sock_wakeup_pipe.do_wakeup(); } bool sockinfo_tcp::process_peer_ctl_packets(xlio_desc_list_t &peer_packets) @@ -1908,7 +1908,7 @@ void sockinfo_tcp::tcp_shutdown_rx() * null in such case and as a result update_fd_array() call means nothing */ io_mux_call::update_fd_array(m_iomux_ready_fd_array, m_fd); - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); tcp_shutdown(&m_pcb, 1, 0); @@ -1958,7 +1958,7 @@ err_t sockinfo_tcp::rx_lwip_cb(void *arg, struct tcp_pcb *pcb, struct pbuf *p, e io_mux_call::update_fd_array(conn->m_iomux_ready_fd_array, conn->m_fd); // OLG: Now we should wakeup all threads that are sleeping on this socket. - conn->do_wakeup(); + conn->m_sock_wakeup_pipe.do_wakeup(); /* * RCVBUFF Accounting: tcp_recved here(stream into the 'internal' buffer) only if the user @@ -2067,7 +2067,7 @@ inline void sockinfo_tcp::handle_rx_lwip_cb_error(pbuf *p) // notify io_mux NOTIFY_ON_EVENTS(this, EPOLLERR); - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); vlog_printf(VLOG_ERROR, "%s:%d %s\n", __func__, __LINE__, "recv error!!!"); pbuf_free(p); m_sock_state = TCP_SOCK_INITED; @@ -2177,7 +2177,7 @@ err_t sockinfo_tcp::rx_lwip_cb_socketxtreme(void *arg, struct tcp_pcb *pcb, stru conn->rx_lwip_cb_socketxtreme_helper(p); io_mux_call::update_fd_array(conn->m_iomux_ready_fd_array, conn->m_fd); - conn->do_wakeup(); + conn->m_sock_wakeup_pipe.do_wakeup(); /* * RCVBUFF Accounting: tcp_recved here(stream into the 'internal' buffer) only if the user * buffer is not 'filled' @@ -2268,7 +2268,7 @@ err_t sockinfo_tcp::rx_lwip_cb_recv_callback(void *arg, struct tcp_pcb *pcb, str if (callback_retval != XLIO_PACKET_HOLD) { // OLG: Now we should wakeup all threads that are sleeping on this socket. - conn->do_wakeup(); + conn->m_sock_wakeup_pipe.do_wakeup(); } else { conn->m_p_socket_stats->n_rx_zcopy_pkt_count++; } @@ -3488,7 +3488,7 @@ err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t e conn->m_p_socket_stats->listen_counters.n_conn_backlog++; // OLG: Now we should wakeup all threads that are sleeping on this socket. - conn->do_wakeup(); + conn->m_sock_wakeup_pipe.do_wakeup(); // Now we should register the child socket to TCP timer conn->unlock_tcp_con(); @@ -3642,7 +3642,7 @@ err_t sockinfo_tcp::syn_received_timewait_cb(void *arg, struct tcp_pcb *newpcb) tcp_err(&new_sock->m_pcb, sockinfo_tcp::err_lwip_cb); tcp_sent(&new_sock->m_pcb, sockinfo_tcp::ack_recvd_lwip_cb); new_sock->m_pcb.syn_tw_handled_cb = nullptr; - new_sock->wakeup_clear(); + new_sock->m_sock_wakeup_pipe.wakeup_clear(); if (tcp_ctl_thread_on(new_sock->m_sysvar_tcp_ctl_thread)) { tcp_ip_output(&new_sock->m_pcb, sockinfo_tcp::ip_output_syn_ack); } @@ -3824,7 +3824,7 @@ err_t sockinfo_tcp::connect_lwip_cb(void *arg, struct tcp_pcb *tpcb, err_t err) NOTIFY_ON_EVENTS(conn, EPOLLOUT); // OLG: Now we should wakeup all threads that are sleeping on this socket. 
- conn->do_wakeup(); + conn->m_sock_wakeup_pipe.do_wakeup(); conn->m_p_socket_stats->set_connected_ip(conn->m_connected); conn->m_p_socket_stats->connected_port = conn->m_connected.get_in_port(); @@ -4139,7 +4139,7 @@ int sockinfo_tcp::shutdown(int __how) } } - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); if (err == ERR_OK) { unlock_tcp_con(); @@ -5202,7 +5202,7 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) lock_tcp_con(); if (!m_n_rx_pkt_ready_list_count && !m_ready_conn_cnt) { - going_to_sleep(); + m_sock_wakeup_pipe.going_to_sleep(); unlock_tcp_con(); } else { unlock_tcp_con(); @@ -5212,7 +5212,7 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) ret = os_wait_sock_rx_epfd(rx_epfd_events, SI_RX_EPFD_EVENT_MAX); lock_tcp_con(); - return_from_sleep(); + m_sock_wakeup_pipe.return_from_sleep(); unlock_tcp_con(); if (ret <= 0) { @@ -5226,9 +5226,9 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) for (int event_idx = 0; event_idx < ret; event_idx++) { int fd = rx_epfd_events[event_idx].data.fd; - if (is_wakeup_fd(fd)) { // wakeup event + if (m_sock_wakeup_pipe.is_wakeup_fd(fd)) { // wakeup event lock_tcp_con(); - remove_wakeup_fd(); + m_sock_wakeup_pipe.remove_wakeup_fd(); unlock_tcp_con(); continue; } @@ -5840,7 +5840,7 @@ void sockinfo_tcp::tcp_tx_zc_handle(mem_buf_desc_t *p_desc) /* Signal events on socket */ NOTIFY_ON_EVENTS(sock, EPOLLERR); - sock->do_wakeup(); + sock->m_sock_wakeup_pipe.do_wakeup(); } struct tcp_seg *sockinfo_tcp::tcp_seg_alloc_direct(void *p_conn) diff --git a/src/core/sock/sockinfo_udp.cpp b/src/core/sock/sockinfo_udp.cpp index efdfd8835..317a9fcd2 100644 --- a/src/core/sock/sockinfo_udp.cpp +++ b/src/core/sock/sockinfo_udp.cpp @@ -204,7 +204,7 @@ inline int sockinfo_udp::rx_wait(bool blocking) /* coverity[double_lock] TODO: RM#1049980 */ m_lock_rcv.lock(); if (!m_n_rx_pkt_ready_list_count) { - going_to_sleep(); + m_sock_wakeup_pipe.going_to_sleep(); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_rcv.unlock(); } else { @@ -216,7 +216,7 @@ inline int sockinfo_udp::rx_wait(bool blocking) /* coverity[double_lock] TODO: RM#1049980 */ m_lock_rcv.lock(); - return_from_sleep(); + m_sock_wakeup_pipe.return_from_sleep(); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_rcv.unlock(); @@ -253,10 +253,10 @@ inline int sockinfo_udp::rx_wait(bool blocking) // Run through all ready fd's for (int event_idx = 0; event_idx < ret; ++event_idx) { int fd = rx_epfd_events[event_idx].data.fd; - if (is_wakeup_fd(fd)) { + if (m_sock_wakeup_pipe.is_wakeup_fd(fd)) { /* coverity[double_lock] TODO: RM#1049980 */ m_lock_rcv.lock(); - remove_wakeup_fd(); + m_sock_wakeup_pipe.remove_wakeup_fd(); /* coverity[double_unlock] TODO: RM#1049980 */ m_lock_rcv.unlock(); continue; @@ -461,7 +461,7 @@ sockinfo_udp::~sockinfo_udp() } */ m_lock_rcv.lock(); - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); destructor_helper(); @@ -2336,7 +2336,7 @@ inline void sockinfo_udp::update_ready(mem_buf_desc_t *p_desc, void *pv_fd_ready m_p_socket_stats->counters.n_rx_ready_byte_max = std::max((uint32_t)m_p_socket_stats->n_rx_ready_byte_count, m_p_socket_stats->counters.n_rx_ready_byte_max); - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); m_lock_rcv.unlock(); } else { m_p_socket_stats->n_rx_zcopy_pkt_count++; @@ -3233,7 +3233,7 @@ void sockinfo_udp::push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff) bool sockinfo_udp::prepare_to_close(bool process_shutdown) { m_lock_rcv.lock(); - do_wakeup(); + m_sock_wakeup_pipe.do_wakeup(); if (m_econtext) { 
m_econtext->fd_closed(m_fd); diff --git a/src/core/util/wakeup.cpp b/src/core/util/wakeup.cpp index a21e0284e..f15d60dce 100644 --- a/src/core/util/wakeup.cpp +++ b/src/core/util/wakeup.cpp @@ -49,21 +49,21 @@ #undef MODULE_HDR_INFO #define MODULE_HDR_INFO MODULE_NAME "[epfd=%d]:%d:%s() " #undef __INFO__ -#define __INFO__ m_epfd +#define __INFO__ m_wakeup_epfd wakeup::wakeup() { - m_epfd = 0; + m_wakeup_epfd = 0; m_is_sleeping = 0; memset(&m_ev, 0, sizeof(m_ev)); } void wakeup::going_to_sleep() { BULLSEYE_EXCLUDE_BLOCK_START - if (likely(m_epfd)) { + if (likely(m_wakeup_epfd)) { m_is_sleeping++; } else { - wkup_logerr(" m_epfd is not initialized - cannot use wakeup mechanism\n"); + wkup_logerr(" m_wakeup_epfd is not initialized - cannot use wakeup mechanism\n"); m_is_sleeping = 0; } BULLSEYE_EXCLUDE_BLOCK_END @@ -71,5 +71,5 @@ void wakeup::going_to_sleep() void wakeup::wakeup_set_epoll_fd(int epfd) { - m_epfd = epfd; + m_wakeup_epfd = epfd; } diff --git a/src/core/util/wakeup.h b/src/core/util/wakeup.h index 193223c72..3b3a635b4 100644 --- a/src/core/util/wakeup.h +++ b/src/core/util/wakeup.h @@ -48,15 +48,11 @@ class wakeup { void going_to_sleep(); void return_from_sleep() { --m_is_sleeping; }; void wakeup_clear() { m_is_sleeping = 0; } + void wakeup_set_epoll_fd(int epfd); protected: - virtual void wakeup_set_epoll_fd(int epfd); int m_is_sleeping; - - // lock_spin_recursive m_wakeup_lock; This lock is not needed for now. Maybe we will need it for - // epoll. - - int m_epfd; + int m_wakeup_epfd; struct epoll_event m_ev; }; diff --git a/src/core/util/wakeup_pipe.cpp b/src/core/util/wakeup_pipe.cpp index 322e1f0b7..8bac8886c 100644 --- a/src/core/util/wakeup_pipe.cpp +++ b/src/core/util/wakeup_pipe.cpp @@ -49,7 +49,7 @@ #undef MODULE_HDR_INFO #define MODULE_HDR_INFO MODULE_NAME "[epfd=%d]:%d:%s() " #undef __INFO__ -#define __INFO__ m_epfd +#define __INFO__ m_wakeup_epfd #define UNINIT_PIPE_FD (-1) int wakeup_pipe::g_wakeup_pipes[2] = {UNINIT_PIPE_FD, UNINIT_PIPE_FD}; @@ -96,7 +96,7 @@ void wakeup_pipe::do_wakeup() int errno_tmp = errno; // don't let wakeup affect errno, as this can fail with EEXIST BULLSEYE_EXCLUDE_BLOCK_START - if ((SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_ADD, g_wakeup_pipes[0], &m_ev)) && + if ((SYSCALL(epoll_ctl, m_wakeup_epfd, EPOLL_CTL_ADD, g_wakeup_pipes[0], &m_ev)) && (errno != EEXIST)) { wkup_logerr("Failed to add wakeup fd to internal epfd (errno=%d %m)", errno); } @@ -114,7 +114,7 @@ void wakeup_pipe::remove_wakeup_fd() } wkup_entry_dbg(""); int tmp_errno = errno; - if (SYSCALL(epoll_ctl, m_epfd, EPOLL_CTL_DEL, g_wakeup_pipes[0], nullptr)) { + if (SYSCALL(epoll_ctl, m_wakeup_epfd, EPOLL_CTL_DEL, g_wakeup_pipes[0], nullptr)) { BULLSEYE_EXCLUDE_BLOCK_START if (errno == ENOENT) { wkup_logdbg("Failed to delete global pipe from internal epfd it was already deleted"); From 1aa581aa891e41d7671fce36ddf55d80eb93716c Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Thu, 15 Feb 2024 11:42:37 +0200 Subject: [PATCH 113/169] issue: 3777348 Replacing socket_fd_api access with sockinfo Before merging socket_fd_api with sockinfo, replace all access through socket_fd_api with sockinfo. 
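Illustrative aside (not part of the patch): patch 112 above swaps inheritance from wakeup_pipe for a plain member, the usual composition-for-layout move. On common ABIs (for example the Itanium C++ ABI used on Linux) non-virtual base subobjects are laid out before the derived class's own members, while a member can be placed wherever the owner wants it. A hypothetical sketch, not the real classes:

    #include <cstdint>

    struct wakeup_helper { // stand-in for wakeup_pipe
        int m_is_sleeping = 0;
        int m_wakeup_epfd = -1;
        void do_wakeup() { /* writes to the wakeup pipe in the real code */ }
    };

    // Inheritance: the helper's cold fields land at the front of every socket object.
    struct sock_inherits : wakeup_helper {
        uint64_t hot_rx_bytes = 0; // hot receive-path field pushed past the cold state
    };

    // Composition: the owner decides where the cold state goes.
    struct sock_composes {
        uint64_t hot_rx_bytes = 0;        // hot field kept at the front of the object
        wakeup_helper m_sock_wakeup_pipe; // cold wakeup state placed after it
    };

    int main()
    {
        sock_composes s;
        s.m_sock_wakeup_pipe.do_wakeup();
        return static_cast<int>(s.hot_rx_bytes);
    }

The behavior is unchanged (each do_wakeup()/going_to_sleep() call simply goes through the member), but the wakeup state no longer sits in front of the fields the data path actually touches.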
Signed-off-by: Alexander Grissik --- src/core/dev/ring.h | 7 +-- src/core/dev/ring_bond.cpp | 2 +- src/core/iomux/epfd_info.cpp | 18 ++++---- src/core/iomux/epfd_info.h | 10 ++--- src/core/iomux/epoll_wait_call.cpp | 14 +++--- src/core/iomux/epoll_wait_call.h | 3 +- src/core/iomux/io_mux_call.cpp | 6 +-- src/core/iomux/io_mux_call.h | 2 +- src/core/iomux/poll_call.cpp | 4 +- src/core/iomux/select_call.cpp | 3 +- src/core/proto/dst_entry.h | 3 +- src/core/proto/dst_entry_tcp.cpp | 2 +- src/core/proto/dst_entry_tcp.h | 4 +- src/core/proto/dst_entry_udp.cpp | 3 +- src/core/proto/dst_entry_udp.h | 4 +- src/core/proto/route_table_mgr.cpp | 2 +- src/core/sock/fd_collection.cpp | 30 ++++++------- src/core/sock/fd_collection.h | 26 +++++------ src/core/sock/sock-app.cpp | 10 ++--- src/core/sock/sock-extra.cpp | 12 ++--- src/core/sock/sock-redirect.cpp | 70 +++++++++++++++--------------- src/core/sock/socket_fd_api.cpp | 7 --- src/core/sock/socket_fd_api.h | 1 - src/core/sock/sockinfo.cpp | 7 +++ src/core/sock/sockinfo.h | 20 ++++----- src/core/sock/sockinfo_ulp.h | 2 +- 26 files changed, 136 insertions(+), 136 deletions(-) diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index 751e1ee3a..737c43247 100644 --- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -35,11 +35,12 @@ #include #include "ib/base/verbs_extra.h" -#include "proto/flow_tuple.h" -#include "sock/socket_fd_api.h" +#include "dev/buffer_pool.h" +#include "dev/xlio_ti.h" #include "sock/tcp_seg_pool.h" +#include "proto/flow_tuple.h" +#include "proto/xlio_lwip.h" #include "proto/L2_address.h" -#include "dev/xlio_ti.h" /* Forward declarations */ struct xlio_tls_info; diff --git a/src/core/dev/ring_bond.cpp b/src/core/dev/ring_bond.cpp index 5427f0e8d..ce0401008 100644 --- a/src/core/dev/ring_bond.cpp +++ b/src/core/dev/ring_bond.cpp @@ -245,7 +245,7 @@ void ring_bond::restart() } epfd = si->get_epoll_context_fd(); if (epfd > 0) { -#define CQ_FD_MARK 0xabcd /* see socket_fd_api */ +#define CQ_FD_MARK 0xabcd /* see sockinfo */ epoll_event ev = {0, {0}}; fd = ring_rx_fds_array[k]; ev.events = EPOLLIN | EPOLLPRI; diff --git a/src/core/iomux/epfd_info.cpp b/src/core/iomux/epfd_info.cpp index 8c87ac128..dae4861dc 100644 --- a/src/core/iomux/epfd_info.cpp +++ b/src/core/iomux/epfd_info.cpp @@ -99,7 +99,7 @@ epfd_info::epfd_info(int epfd, int size) epfd_info::~epfd_info() { __log_funcall(""); - socket_fd_api *sock_fd; + sockinfo *sock_fd; // Meny: going over all handled fds and removing epoll context. 
@@ -207,7 +207,7 @@ int epfd_info::add_fd(int fd, epoll_event *event) __log_funcall("fd=%d", fd); - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); if (temp_sock_fd_api && temp_sock_fd_api->get_type() == FD_TYPE_SOCKET) { is_offloaded = true; } @@ -403,7 +403,7 @@ int epfd_info::del_fd(int fd, bool passthrough) __log_funcall("fd=%d", fd); epoll_fd_rec *fi; - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); if (temp_sock_fd_api && temp_sock_fd_api->skip_os_select()) { __log_dbg("fd=%d must be skipped from os epoll()", fd); } else if (!passthrough) { @@ -448,7 +448,7 @@ int epfd_info::del_fd(int fd, bool passthrough) // remove fd and replace by last fd m_p_offloaded_fds[fi->offloaded_index - 1] = m_p_offloaded_fds[m_n_offloaded_fds - 1]; - socket_fd_api *last_socket = + sockinfo *last_socket = fd_collection_get_sockfd(m_p_offloaded_fds[m_n_offloaded_fds - 1]); if (last_socket && last_socket->get_epoll_context_fd() == m_epfd) { last_socket->m_fd_rec.offloaded_index = fi->offloaded_index; @@ -485,7 +485,7 @@ int epfd_info::mod_fd(int fd, epoll_event *event) return -1; } - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); // check if fd is offloaded that new event mask is OK if (temp_sock_fd_api && temp_sock_fd_api->m_fd_rec.offloaded_index > 0) { if (m_log_invalid_events && (event->events & ~SUPPORTED_EPOLL_EVENTS)) { @@ -550,7 +550,7 @@ int epfd_info::mod_fd(int fd, epoll_event *event) epoll_fd_rec *epfd_info::get_fd_rec(int fd) { epoll_fd_rec *fd_rec = nullptr; - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); lock(); if (temp_sock_fd_api && temp_sock_fd_api->get_epoll_context_fd() == m_epfd) { @@ -575,7 +575,7 @@ void epfd_info::fd_closed(int fd, bool passthrough) unlock(); } -void epfd_info::insert_epoll_event_cb(socket_fd_api *sock_fd, uint32_t event_flags) +void epfd_info::insert_epoll_event_cb(sockinfo *sock_fd, uint32_t event_flags) { lock(); // EPOLLHUP | EPOLLERR are reported without user request @@ -585,7 +585,7 @@ void epfd_info::insert_epoll_event_cb(socket_fd_api *sock_fd, uint32_t event_fla unlock(); } -void epfd_info::insert_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags) +void epfd_info::insert_epoll_event(sockinfo *sock_fd, uint32_t event_flags) { // assumed lock if (sock_fd->ep_ready_fd_node.is_list_member()) { @@ -598,7 +598,7 @@ void epfd_info::insert_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags) do_wakeup(); } -void epfd_info::remove_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags) +void epfd_info::remove_epoll_event(sockinfo *sock_fd, uint32_t event_flags) { sock_fd->m_epoll_event_flags &= ~event_flags; if (sock_fd->m_epoll_event_flags == 0) { diff --git a/src/core/iomux/epfd_info.h b/src/core/iomux/epfd_info.h index c1ff1da0a..245d4f196 100644 --- a/src/core/iomux/epfd_info.h +++ b/src/core/iomux/epfd_info.h @@ -37,8 +37,8 @@ #include #include -typedef xlio_list_t ep_ready_fd_list_t; -typedef xlio_list_t fd_info_list_t; +typedef xlio_list_t ep_ready_fd_list_t; +typedef xlio_list_t fd_info_list_t; typedef std::unordered_map fd_info_map_t; typedef std::unordered_map ring_map_t; typedef std::deque ready_cq_fd_q_t; @@ -121,9 +121,9 @@ class epfd_info : public lock_mutex_recursive, public cleanable_obj, public wake int 
remove_fd_from_epoll_os(int fd); inline size_t get_fd_non_offloaded_size() { return m_fd_non_offloaded_map.size(); } inline size_t get_fd_offloaded_size() { return m_fd_offloaded_list.size(); } - void insert_epoll_event_cb(socket_fd_api *sock_fd, uint32_t event_flags); - void insert_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags); - void remove_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags); + void insert_epoll_event_cb(sockinfo *sock_fd, uint32_t event_flags); + void insert_epoll_event(sockinfo *sock_fd, uint32_t event_flags); + void remove_epoll_event(sockinfo *sock_fd, uint32_t event_flags); void increase_ring_ref_count(ring *ring); void decrease_ring_ref_count(ring *ring); diff --git a/src/core/iomux/epoll_wait_call.cpp b/src/core/iomux/epoll_wait_call.cpp index 0e2429756..563a569c2 100644 --- a/src/core/iomux/epoll_wait_call.cpp +++ b/src/core/iomux/epoll_wait_call.cpp @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include "epfd_info.h" @@ -85,11 +85,11 @@ int epoll_wait_call::get_current_events() return m_n_all_ready_fds; } - xlio_list_t socket_fd_list; + xlio_list_t socket_fd_list; lock(); int i, ready_rfds = 0, ready_wfds = 0; i = m_n_all_ready_fds; - socket_fd_api *p_socket_object; + sockinfo *p_socket_object; ep_ready_fd_list_t::iterator iter = m_epfd_info->m_ready_fds.begin(); while (iter != m_epfd_info->m_ready_fds.end() && i < m_maxevents) { p_socket_object = *iter; @@ -165,7 +165,7 @@ int epoll_wait_call::get_current_events() * see RM task 212058 */ while (!socket_fd_list.empty()) { - socket_fd_api *sockfd = socket_fd_list.get_and_pop_front(); + sockinfo *sockfd = socket_fd_list.get_and_pop_front(); sockfd->consider_rings_migration_rx(); } @@ -235,7 +235,7 @@ bool epoll_wait_call::_wait(int timeout) } if (m_p_ready_events[i].events & EPOLLIN) { - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); if (temp_sock_fd_api) { // Instructing the socket to sample the OS immediately to prevent hitting EAGAIN on // recvfrom(), after iomux returned a shadow fd as ready (only for non-blocking @@ -343,8 +343,8 @@ bool epoll_wait_call::immidiate_return(int &poll_os_countdown) return false; } -bool epoll_wait_call::handle_epoll_event(bool is_ready, uint32_t events, - socket_fd_api *socket_object, int index) +bool epoll_wait_call::handle_epoll_event(bool is_ready, uint32_t events, sockinfo *socket_object, + int index) { if (is_ready) { epoll_fd_rec &fd_rec = socket_object->m_fd_rec; diff --git a/src/core/iomux/epoll_wait_call.h b/src/core/iomux/epoll_wait_call.h index 71ba2eb0a..57d11b5e9 100644 --- a/src/core/iomux/epoll_wait_call.h +++ b/src/core/iomux/epoll_wait_call.h @@ -100,8 +100,7 @@ class epoll_wait_call : public io_mux_call { int get_current_events(); - bool handle_epoll_event(bool is_ready, uint32_t events, socket_fd_api *socket_object, - int index); + bool handle_epoll_event(bool is_ready, uint32_t events, sockinfo *socket_object, int index); protected: virtual int ring_poll_and_process_element(); diff --git a/src/core/iomux/io_mux_call.cpp b/src/core/iomux/io_mux_call.cpp index 24c142f98..d2f7a6c92 100644 --- a/src/core/iomux/io_mux_call.cpp +++ b/src/core/iomux/io_mux_call.cpp @@ -92,7 +92,7 @@ inline void io_mux_call::check_offloaded_wsockets() if (m_p_offloaded_modes[offloaded_index] & OFF_WRITE) { int fd = m_p_all_offloaded_fds[offloaded_index]; - socket_fd_api *p_socket_object = fd_collection_get_sockfd(fd); + sockinfo *p_socket_object = 
fd_collection_get_sockfd(fd); if (!p_socket_object) { // If we can't find this previously mapped offloaded socket // then it was probably closed. We need to get out with error code @@ -113,7 +113,7 @@ inline void io_mux_call::check_offloaded_esockets() for (int offloaded_index = 0; offloaded_index < *m_p_num_all_offloaded_fds; ++offloaded_index) { if (m_p_offloaded_modes[offloaded_index] & OFF_RDWR) { int fd = m_p_all_offloaded_fds[offloaded_index]; - socket_fd_api *p_socket_object = fd_collection_get_sockfd(fd); + sockinfo *p_socket_object = fd_collection_get_sockfd(fd); if (!p_socket_object) { // If we can't find this previously mapped offloaded socket // then it was probably closed. We need to get out with error code @@ -205,7 +205,7 @@ void io_mux_call::check_offloaded_rsockets() { int fd, offloaded_index, num_all_offloaded_fds; fd_array_t fd_ready_array; - socket_fd_api *p_socket_object; + sockinfo *p_socket_object; fd_ready_array.fd_max = FD_ARRAY_MAX; diff --git a/src/core/iomux/io_mux_call.h b/src/core/iomux/io_mux_call.h index 596a6847e..0672d8f87 100644 --- a/src/core/iomux/io_mux_call.h +++ b/src/core/iomux/io_mux_call.h @@ -38,7 +38,7 @@ #include #include -#include +#include #include // from sigset.h diff --git a/src/core/iomux/poll_call.cpp b/src/core/iomux/poll_call.cpp index ad8a60412..f767d558b 100644 --- a/src/core/iomux/poll_call.cpp +++ b/src/core/iomux/poll_call.cpp @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include #include @@ -71,7 +71,7 @@ poll_call::poll_call(int *off_rfds_buffer, offloaded_mode_t *off_modes_buffer, i } fd = m_orig_fds[i].fd; - socket_fd_api *temp_sock_fd_api = fd_collection_get_sockfd(fd); + sockinfo *temp_sock_fd_api = fd_collection_get_sockfd(fd); if (temp_sock_fd_api && (temp_sock_fd_api->get_type() == FD_TYPE_SOCKET)) { // POLLERR and POLLHUP are always enabled implicitly and considered as READ by XLIO offloaded_mode_t off_mode = OFF_READ; diff --git a/src/core/iomux/select_call.cpp b/src/core/iomux/select_call.cpp index e255f7f24..23c9e069a 100644 --- a/src/core/iomux/select_call.cpp +++ b/src/core/iomux/select_call.cpp @@ -61,7 +61,6 @@ select_call::select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer , m_b_run_prepare_to_poll(false) { int fd; - // socket_fd_api* temp_sock_fd_api = NULL; if (m_nfds > FD_SETSIZE) { errno = ENOMEM; @@ -91,7 +90,7 @@ select_call::select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer bool check_read = offloaded_read && FD_ISSET(fd, m_readfds); bool check_write = offloaded_write && FD_ISSET(fd, m_writefds); - socket_fd_api *psock = fd_collection_get_sockfd(fd); + sockinfo *psock = fd_collection_get_sockfd(fd); if (psock && psock->get_type() == FD_TYPE_SOCKET) { diff --git a/src/core/proto/dst_entry.h b/src/core/proto/dst_entry.h index 9d30dd5ff..3b2bc4813 100644 --- a/src/core/proto/dst_entry.h +++ b/src/core/proto/dst_entry.h @@ -55,6 +55,7 @@ /* Forward declarations */ class xlio_tis; +class sockinfo; struct socket_data { int fd; @@ -85,7 +86,7 @@ class dst_entry : public cache_observer, public tostr { virtual ssize_t fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr) = 0; virtual ssize_t slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, struct xlio_rate_limit_t &rate_limit, int flags = 0, - socket_fd_api *sock = nullptr, tx_call_t call_type = TX_UNDEF) = 0; + sockinfo *sock = nullptr, tx_call_t call_type = TX_UNDEF) = 0; bool try_migrate_ring_tx(lock_base &socket_lock); diff --git a/src/core/proto/dst_entry_tcp.cpp 
b/src/core/proto/dst_entry_tcp.cpp index 528fcf37b..7690b10ba 100644 --- a/src/core/proto/dst_entry_tcp.cpp +++ b/src/core/proto/dst_entry_tcp.cpp @@ -315,7 +315,7 @@ ssize_t dst_entry_tcp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ ssize_t dst_entry_tcp::slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, struct xlio_rate_limit_t &rate_limit, int flags /*= 0*/, - socket_fd_api *sock /*= 0*/, tx_call_t call_type /*= 0*/) + sockinfo *sock /*= 0*/, tx_call_t call_type /*= 0*/) { ssize_t ret_val = -1; diff --git a/src/core/proto/dst_entry_tcp.h b/src/core/proto/dst_entry_tcp.h index 2615c2abe..66223470a 100644 --- a/src/core/proto/dst_entry_tcp.h +++ b/src/core/proto/dst_entry_tcp.h @@ -50,8 +50,8 @@ class dst_entry_tcp : public dst_entry { ssize_t fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr); ssize_t slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, - struct xlio_rate_limit_t &rate_limit, int flags = 0, - socket_fd_api *sock = nullptr, tx_call_t call_type = TX_UNDEF); + struct xlio_rate_limit_t &rate_limit, int flags = 0, sockinfo *sock = nullptr, + tx_call_t call_type = TX_UNDEF); ssize_t slow_send_neigh(const iovec *p_iov, size_t sz_iov, struct xlio_rate_limit_t &rate_limit); diff --git a/src/core/proto/dst_entry_udp.cpp b/src/core/proto/dst_entry_udp.cpp index a45e2069a..09926f81b 100644 --- a/src/core/proto/dst_entry_udp.cpp +++ b/src/core/proto/dst_entry_udp.cpp @@ -33,6 +33,7 @@ #include "utils/bullseye.h" #include "core/util/utils.h" #include "dst_entry_udp.h" +#include "sock/sockinfo.h" #define MODULE_NAME "dst_udp" @@ -497,7 +498,7 @@ ssize_t dst_entry_udp::fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_ ssize_t dst_entry_udp::slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, struct xlio_rate_limit_t &rate_limit, int flags /*= 0*/, - socket_fd_api *sock /*= 0*/, tx_call_t call_type /*= 0*/) + sockinfo *sock /*= 0*/, tx_call_t call_type /*= 0*/) { ssize_t ret_val = 0; diff --git a/src/core/proto/dst_entry_udp.h b/src/core/proto/dst_entry_udp.h index 0fc183745..3b7cda9b2 100644 --- a/src/core/proto/dst_entry_udp.h +++ b/src/core/proto/dst_entry_udp.h @@ -43,8 +43,8 @@ class dst_entry_udp : public dst_entry { ssize_t fast_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr); ssize_t slow_send(const iovec *p_iov, const ssize_t sz_iov, xlio_send_attr attr, - struct xlio_rate_limit_t &rate_limit, int flags = 0, - socket_fd_api *sock = nullptr, tx_call_t call_type = TX_UNDEF); + struct xlio_rate_limit_t &rate_limit, int flags = 0, sockinfo *sock = nullptr, + tx_call_t call_type = TX_UNDEF); static bool fast_send_fragmented_ipv6(mem_buf_desc_t *p_mem_buf_desc, const iovec *p_iov, const ssize_t sz_iov, xlio_wr_tx_packet_attr attr, size_t sz_udp_payload, int n_num_frags, diff --git a/src/core/proto/route_table_mgr.cpp b/src/core/proto/route_table_mgr.cpp index 4610c9a0e..45983f3b3 100644 --- a/src/core/proto/route_table_mgr.cpp +++ b/src/core/proto/route_table_mgr.cpp @@ -49,7 +49,7 @@ #include "vlogger/vlogger.h" #include "core/util/vtypes.h" #include "core/util/utils.h" -#include "core/sock/socket_fd_api.h" +#include "core/sock/sockinfo.h" #include "core/sock/sock-redirect.h" #include "core/dev/net_device_table_mgr.h" #include "core/util/ip_address.h" diff --git a/src/core/sock/fd_collection.cpp b/src/core/sock/fd_collection.cpp index 665d4d7d7..56c30ce20 100644 --- a/src/core/sock/fd_collection.cpp +++ b/src/core/sock/fd_collection.cpp @@ -36,7 +36,7 @@ 
#include "util/libxlio.h" #include "fd_collection.h" #include "sock-redirect.h" -#include "socket_fd_api.h" +#include "sockinfo.h" #include "sockinfo_udp.h" #include "sockinfo_tcp.h" #include "iomux/epfd_info.h" @@ -75,8 +75,8 @@ fd_collection::fd_collection() } fdcoll_logdbg("using open files max limit of %d file descriptors", m_n_fd_map_size); - m_p_sockfd_map = new socket_fd_api *[m_n_fd_map_size]; - memset(m_p_sockfd_map, 0, m_n_fd_map_size * sizeof(socket_fd_api *)); + m_p_sockfd_map = new sockinfo *[m_n_fd_map_size]; + memset(m_p_sockfd_map, 0, m_n_fd_map_size * sizeof(sockinfo *)); m_p_epfd_map = new epfd_info *[m_n_fd_map_size]; memset(m_p_epfd_map, 0, m_n_fd_map_size * sizeof(epfd_info *)); @@ -120,7 +120,7 @@ void fd_collection::prepare_to_close() for (int fd = 0; fd < m_n_fd_map_size; ++fd) { if (m_p_sockfd_map[fd]) { if (!g_is_forked_child) { - socket_fd_api *p_sfd_api = get_sockfd(fd); + sockinfo *p_sfd_api = get_sockfd(fd); if (p_sfd_api) { p_sfd_api->prepare_to_close(true); } @@ -147,7 +147,7 @@ void fd_collection::clear() * these sockets can not be deleted through the it. */ while (!m_pending_to_remove_lst.empty()) { - socket_fd_api *p_sfd_api = m_pending_to_remove_lst.get_and_pop_back(); + sockinfo *p_sfd_api = m_pending_to_remove_lst.get_and_pop_back(); p_sfd_api->clean_socket_obj(); } @@ -158,7 +158,7 @@ void fd_collection::clear() for (fd = 0; fd < m_n_fd_map_size; ++fd) { if (m_p_sockfd_map[fd]) { if (!g_is_forked_child) { - socket_fd_api *p_sfd_api = get_sockfd(fd); + sockinfo *p_sfd_api = get_sockfd(fd); if (p_sfd_api) { p_sfd_api->statistics_print(); p_sfd_api->clean_socket_obj(); @@ -203,7 +203,7 @@ int fd_collection::addsocket(int fd, int domain, int type, bool check_offload /* const int SOCK_TYPE_MASK = 0xf; int sock_type = type & SOCK_TYPE_MASK; int sock_flags = type & ~SOCK_TYPE_MASK; - socket_fd_api *p_sfd_api_obj; + sockinfo *p_sfd_api_obj; fdcoll_logfunc("fd=%d domain=%d type=%d", fd, domain, type); @@ -311,7 +311,7 @@ void fd_collection::offloading_rule_change_thread(bool offloaded, pthread_t tid) void fd_collection::statistics_print_helper(int fd, vlog_levels_t log_level) { - socket_fd_api *socket_fd; + sockinfo *socket_fd; epfd_info *epoll_fd; if ((socket_fd = get_sockfd(fd))) { @@ -428,7 +428,7 @@ int fd_collection::add_cq_channel_fd(int cq_ch_fd, ring *p_ring) BULLSEYE_EXCLUDE_BLOCK_END // Sanity check to remove any old objects using the same fd!! 
- socket_fd_api *p_cq_ch_fd_api_obj = get_sockfd(cq_ch_fd); + sockinfo *p_cq_ch_fd_api_obj = get_sockfd(cq_ch_fd); BULLSEYE_EXCLUDE_BLOCK_START if (p_cq_ch_fd_api_obj) { fdcoll_logwarn("[fd=%d] Deleting old duplicate object (%p)", cq_ch_fd, p_cq_ch_fd_api_obj); @@ -468,7 +468,7 @@ int fd_collection::add_cq_channel_fd(int cq_ch_fd, ring *p_ring) int fd_collection::del_sockfd(int fd, bool is_for_udp_pool /*=false*/) { int ret_val = -1; - socket_fd_api *p_sfd_api; + sockinfo *p_sfd_api; p_sfd_api = get_sockfd(fd); @@ -559,7 +559,7 @@ template int fd_collection::del(int fd, bool b_cleanup, cls **map return -1; } -int fd_collection::del_socket(int fd, socket_fd_api **map_type) +int fd_collection::del_socket(int fd, sockinfo **map_type) { fdcoll_logfunc("fd=%d", fd); @@ -568,7 +568,7 @@ int fd_collection::del_socket(int fd, socket_fd_api **map_type) } lock(); - socket_fd_api *p_obj = map_type[fd]; + sockinfo *p_obj = map_type[fd]; if (p_obj) { map_type[fd] = nullptr; unlock(); @@ -595,7 +595,7 @@ void fd_collection::remove_from_all_epfds(int fd, bool passthrough) } #if defined(DEFINED_NGINX) -void fd_collection::push_socket_pool(socket_fd_api *sockfd) +void fd_collection::push_socket_pool(sockinfo *sockfd) { lock(); sockfd->prepare_to_close_socket_pool(true); @@ -618,7 +618,7 @@ bool fd_collection::pop_socket_pool(int &fd, bool &add_to_udp_pool, int type) lock(); if (!m_socket_pool.empty()) { // use fd from pool - will skip creation of new fd by os - socket_fd_api *sockfd = m_socket_pool.top(); + sockinfo *sockfd = m_socket_pool.top(); fd = sockfd->get_fd(); if (!m_p_sockfd_map[fd]) { m_p_sockfd_map[fd] = sockfd; @@ -649,7 +649,7 @@ void fd_collection::handle_socket_pool(int fd) return; } - socket_fd_api *sockfd = get_sockfd(fd); + sockinfo *sockfd = get_sockfd(fd); if (sockfd) { ++m_socket_pool_counter; sockfd->set_params_for_socket_pool(); diff --git a/src/core/sock/fd_collection.h b/src/core/sock/fd_collection.h index ad6f844a4..d657b4050 100644 --- a/src/core/sock/fd_collection.h +++ b/src/core/sock/fd_collection.h @@ -41,11 +41,11 @@ #include "event/event_handler_manager.h" #include "event/timer_handler.h" #include "sock/cleanable_obj.h" -#include "sock/socket_fd_api.h" +#include "sock/sockinfo.h" #include "iomux/epfd_info.h" #include "utils/lock_wrapper.h" -typedef xlio_list_t sock_fd_api_list_t; +typedef xlio_list_t sock_fd_api_list_t; typedef xlio_list_t epfd_info_list_t; typedef std::unordered_map offload_thread_rule_t; @@ -137,12 +137,12 @@ class fd_collection : private lock_mutex_recursive { */ inline bool set_immediate_os_sample(int fd); - inline void reuse_sockfd(int fd, socket_fd_api *p_sfd_api_obj); - inline void destroy_sockfd(socket_fd_api *p_sfd_api_obj); + inline void reuse_sockfd(int fd, sockinfo *p_sfd_api_obj); + inline void destroy_sockfd(sockinfo *p_sfd_api_obj); /** * Get sock_fd_api (sockinfo) by fd. */ - inline socket_fd_api *get_sockfd(int fd); + inline sockinfo *get_sockfd(int fd); /** * Get epfd_info by fd. 
@@ -184,13 +184,13 @@ class fd_collection : private lock_mutex_recursive { #if defined(DEFINED_NGINX) bool pop_socket_pool(int &fd, bool &add_to_udp_pool, int type); - void push_socket_pool(socket_fd_api *sockfd); + void push_socket_pool(sockinfo *sockfd); void handle_socket_pool(int fd); #endif private: template int del(int fd, bool b_cleanup, cls **map_type); template inline cls *get(int fd, cls **map_type); - int del_socket(int fd, socket_fd_api **map_type); + int del_socket(int fd, sockinfo **map_type); inline bool is_valid_fd(int fd); inline bool create_offloaded_sockets(); @@ -206,7 +206,7 @@ class fd_collection : private lock_mutex_recursive { private: int m_n_fd_map_size; - socket_fd_api **m_p_sockfd_map; + sockinfo **m_p_sockfd_map; epfd_info **m_p_epfd_map; cq_channel_info **m_p_cq_channel_map; ring_tap **m_p_tap_map; @@ -223,7 +223,7 @@ class fd_collection : private lock_mutex_recursive { #if defined(DEFINED_NGINX) bool m_use_socket_pool; - std::stack m_socket_pool; + std::stack m_socket_pool; int m_socket_pool_size; int m_socket_pool_counter; #endif @@ -270,7 +270,7 @@ inline bool fd_collection::set_immediate_os_sample(int fd) return false; } -inline void fd_collection::reuse_sockfd(int fd, socket_fd_api *p_sfd_api_obj) +inline void fd_collection::reuse_sockfd(int fd, sockinfo *p_sfd_api_obj) { lock(); m_pending_to_remove_lst.erase(p_sfd_api_obj); @@ -279,7 +279,7 @@ inline void fd_collection::reuse_sockfd(int fd, socket_fd_api *p_sfd_api_obj) unlock(); } -inline void fd_collection::destroy_sockfd(socket_fd_api *p_sfd_api_obj) +inline void fd_collection::destroy_sockfd(sockinfo *p_sfd_api_obj) { lock(); --g_global_stat_static.n_pending_sockets; @@ -288,7 +288,7 @@ inline void fd_collection::destroy_sockfd(socket_fd_api *p_sfd_api_obj) unlock(); } -inline socket_fd_api *fd_collection::get_sockfd(int fd) +inline sockinfo *fd_collection::get_sockfd(int fd) { return get(fd, m_p_sockfd_map); } @@ -315,7 +315,7 @@ inline int fd_collection::get_fd_map_size() extern fd_collection *g_p_fd_collection; -inline socket_fd_api *fd_collection_get_sockfd(int fd) +inline sockinfo *fd_collection_get_sockfd(int fd) { if (g_p_fd_collection) { return g_p_fd_collection->get_sockfd(fd); diff --git a/src/core/sock/sock-app.cpp b/src/core/sock/sock-app.cpp index d2141c26f..477abbae8 100644 --- a/src/core/sock/sock-app.cpp +++ b/src/core/sock/sock-app.cpp @@ -34,7 +34,7 @@ #include "config.h" #endif -#include +#include #include #include #include @@ -81,7 +81,7 @@ int app_conf::proc_nginx() */ fd_collection *p_fd_collection = (fd_collection *)g_p_app->context; for (int fd = 0; fd < p_fd_collection->get_fd_map_size(); fd++) { - socket_fd_api *sock_fd_api = p_fd_collection->get_sockfd(fd); + sockinfo *sock_fd_api = p_fd_collection->get_sockfd(fd); if (!sock_fd_api || !dynamic_cast(sock_fd_api)) { continue; } @@ -106,7 +106,7 @@ int app_conf::proc_envoy(int __op, int __fd) /* Prcess only sockets from map_listen_fd */ auto iter = g_p_app->map_listen_fd.find(__fd); if (iter != g_p_app->map_listen_fd.end()) { - socket_fd_api *p_socket_object = fd_collection_get_sockfd(__fd); + sockinfo *p_socket_object = fd_collection_get_sockfd(__fd); if (iter->second == gettid()) { /* process listen sockets from main thread and remove * them from map_listen_fd @@ -194,7 +194,7 @@ static int init_worker(int worker_id, int listen_fd) app_logdbg("worker: %d fd: %d", worker_id, listen_fd); int ret = 0; - socket_fd_api *child_sock_fd_api = nullptr; + sockinfo *child_sock_fd_api = nullptr; int parent_fd = listen_fd; fd_collection 
*p_fd_collection = (fd_collection *)g_p_app->context; @@ -236,7 +236,7 @@ static int init_worker(int worker_id, int listen_fd) * Nginx: parent_fd is equal to listen_fd */ sockinfo *si; - socket_fd_api *parent_sock_fd_api = p_fd_collection->get_sockfd(parent_fd); + sockinfo *parent_sock_fd_api = p_fd_collection->get_sockfd(parent_fd); if (!parent_sock_fd_api || !(si = dynamic_cast(parent_sock_fd_api))) { app_logerr("parent sockinfo is not found"); return -1; diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index ee50836f5..1e15b96ae 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include @@ -63,7 +63,7 @@ extern "C" int xlio_register_recv_callback(int __fd, xlio_recv_callback_t __callback, void *__context) { - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object && !safe_mce_sys().enable_socketxtreme) { p_socket_object->register_callback(__callback, __context); @@ -76,7 +76,7 @@ extern "C" int xlio_register_recv_callback(int __fd, xlio_recv_callback_t __call extern "C" int xlio_recvfrom_zcopy(int __fd, void *__buf, size_t __nbytes, int *__flags, struct sockaddr *__from, socklen_t *__fromlen) { - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1]; @@ -91,7 +91,7 @@ extern "C" int xlio_recvfrom_zcopy(int __fd, void *__buf, size_t __nbytes, int * extern "C" int xlio_recvfrom_zcopy_free_packets(int __fd, struct xlio_recvfrom_zcopy_packet_t *pkts, size_t count) { - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { return p_socket_object->recvfrom_zcopy_free_packets(pkts, count); @@ -241,7 +241,7 @@ extern "C" int xlio_socketxtreme_free_buff(xlio_buff_t *buff) extern "C" int xlio_get_socket_rings_num(int fd) { - socket_fd_api *p_socket_object = fd_collection_get_sockfd(fd); + sockinfo *p_socket_object = fd_collection_get_sockfd(fd); return p_socket_object ? p_socket_object->get_rings_num() : 0; } @@ -252,7 +252,7 @@ extern "C" int xlio_get_socket_rings_fds(int fd, int *ring_fds, int ring_fds_sz) return -1; } - socket_fd_api *p_socket_object = fd_collection_get_sockfd(fd); + sockinfo *p_socket_object = fd_collection_get_sockfd(fd); return p_socket_object ? p_socket_object->get_rings_fds(ring_fds, ring_fds_sz) : 0; } diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index 6a767259a..bb9341d42 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -250,7 +250,7 @@ bool handle_close(int fd, bool cleanup, bool passthrough) // Remove fd from all existing epoll sets g_p_fd_collection->remove_from_all_epfds(fd, passthrough); - socket_fd_api *sockfd = fd_collection_get_sockfd(fd); + sockinfo *sockfd = fd_collection_get_sockfd(fd); if (sockfd) { // Don't call close(2) for objects without a shadow socket (TCP incoming sockets). 
to_close_now = !passthrough && sockfd->is_shadow_socket_present(); @@ -336,7 +336,7 @@ int socket_internal(int __domain, int __type, int __protocol, bool shadow, bool int bind_internal(void *sock, const struct sockaddr *addr, socklen_t addrlen) { - auto p_socket_object = reinterpret_cast(sock); + auto p_socket_object = reinterpret_cast(sock); int ret = p_socket_object->bind(addr, addrlen); if (p_socket_object->isPassthrough()) { int fd = p_socket_object->get_fd(); @@ -350,7 +350,7 @@ int bind_internal(void *sock, const struct sockaddr *addr, socklen_t addrlen) ssize_t sendmsg_internal(void *sock, __const struct msghdr *__msg, int __flags) { - auto p_socket_object = reinterpret_cast(sock); + auto p_socket_object = reinterpret_cast(sock); xlio_tx_call_attr_t tx_arg; tx_arg.opcode = TX_SENDMSG; @@ -382,7 +382,7 @@ ssize_t sendmsg_internal(void *sock, __const struct msghdr *__msg, int __flags) return p_socket_object->tx(tx_arg); } -static ssize_t sendfile_helper(socket_fd_api *p_socket_object, int in_fd, __off64_t *offset, +static ssize_t sendfile_helper(sockinfo *p_socket_object, int in_fd, __off64_t *offset, size_t count) { ssize_t totSent = 0; @@ -875,7 +875,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(shutdown)(int __fd, int __how) srdr_logdbg_entry("fd=%d, how=%d", __fd, __how); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { return p_socket_object->shutdown(__how); @@ -931,7 +931,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(listen)(int __fd, int backlog) } #endif /* DEFINED_ENVOY */ - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { @@ -962,7 +962,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(accept)(int __fd, struct sockaddr *__addr, socklen { PROFILE_FUNC - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { return p_socket_object->accept(__addr, __addrlen); @@ -976,7 +976,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(accept4)(int __fd, struct sockaddr *__addr, sockle { PROFILE_FUNC - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { return p_socket_object->accept4(__addr, __addrlen, __flags); @@ -997,7 +997,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(bind)(int __fd, const struct sockaddr *__addr, soc srdr_logdbg_entry("fd=%d, %s", __fd, sprintf_sockaddr(buf, 256, __addr, __addrlen)); int ret = 0; - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { ret = bind_internal(p_socket_object, __addr, __addrlen); @@ -1034,7 +1034,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(connect)(int __fd, const struct sockaddr *__to, so srdr_logdbg_entry("fd=%d, %s", __fd, sprintf_sockaddr(buf, 256, __to, __tolen)); int ret = 0; - socket_fd_api *p_socket_object = fd_collection_get_sockfd(__fd); + sockinfo *p_socket_object = fd_collection_get_sockfd(__fd); if (!p_socket_object) { srdr_logdbg_exit("Unable to get sock_fd_api"); ret = SYSCALL(connect, __fd, __to, __tolen); @@ -1077,7 +1077,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(setsockopt)(int __fd, int __level, int __optname, PROFILE_FUNC int ret = 0; - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { @@ 
-1115,7 +1115,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(getsockopt)(int __fd, int __level, int __optname, #endif /* XLIO_STATIC_BUILD */ int ret = 0; - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { VERIFY_PASSTROUGH_CHANGED( @@ -1155,7 +1155,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(fcntl)(int __fd, int __cmd, ...) va_end(va); int ret = 0; - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->fcntl(__cmd, arg)); @@ -1199,7 +1199,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(fcntl64)(int __fd, int __cmd, ...) va_end(va); int ret = 0; - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object && VALID_SYSCALL(fcntl64)) { VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->fcntl64(__cmd, arg)); @@ -1236,7 +1236,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(ioctl)(int __fd, unsigned long int __request, ...) int ret = 0; - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object && arg) { VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->ioctl(__request, arg)); @@ -1259,7 +1259,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(getsockname)(int __fd, struct sockaddr *__name, so srdr_logdbg_entry("fd=%d", __fd); int ret = 0; - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { ret = p_socket_object->getsockname(__name, __namelen); @@ -1291,7 +1291,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(getpeername)(int __fd, struct sockaddr *__name, so srdr_logdbg_entry("fd=%d", __fd); int ret = 0; - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { ret = p_socket_object->getpeername(__name, __namelen); @@ -1318,7 +1318,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(read)(int __fd, void *__buf, size_t __nbytes) srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1]; @@ -1346,7 +1346,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__read_chk)(int __fd, void *__buf, size_t __nb srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { BULLSEYE_EXCLUDE_BLOCK_START @@ -1378,7 +1378,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(readv)(int __fd, const struct iovec *iov, int srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec *piov = (struct iovec *)iov; @@ -1400,7 +1400,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recv)(int __fd, void *__buf, size_t __nbytes, srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1]; @@ -1427,7 +1427,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__recv_chk)(int __fd, void *__buf, size_t __nb srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api 
*p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { BULLSEYE_EXCLUDE_BLOCK_START @@ -1463,7 +1463,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recvmsg)(int __fd, struct msghdr *__msg, int _ return -1; } - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { __msg->msg_flags = 0; @@ -1520,7 +1520,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(recvmmsg)(int __fd, struct mmsghdr *__mmsghdr, uns if (__timeout) { gettime(&start_time); } - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { int ret = 0; @@ -1578,7 +1578,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(recvfrom)(int __fd, void *__buf, size_t __nbyt srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1]; @@ -1609,7 +1609,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(__recvfrom_chk)(int __fd, void *__buf, size_t srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { BULLSEYE_EXCLUDE_BLOCK_START @@ -1638,7 +1638,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(write)(int __fd, __const void *__buf, size_t _ srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1] = {{(void *)__buf, __nbytes}}; @@ -1664,7 +1664,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(writev)(int __fd, const struct iovec *iov, int srdr_logfuncall_entry("fd=%d, %d iov blocks", __fd, iovcnt); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { xlio_tx_call_attr_t tx_arg; @@ -1689,7 +1689,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(send)(int __fd, __const void *__buf, size_t __ srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1] = {{(void *)__buf, __nbytes}}; @@ -1723,7 +1723,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendmsg)(int __fd, __const struct msghdr *__ms srdr_logfuncall_entry("fd=%d", __fd); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { return sendmsg_internal(p_socket_object, __msg, __flags); @@ -1758,7 +1758,7 @@ EXPORT_SYMBOL int XLIO_SYMBOL(sendmmsg)(int __fd, struct mmsghdr *__mmsghdr, uns return -1; } - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { for (unsigned int i = 0; i < __vlen; i++) { @@ -1808,7 +1808,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendto)(int __fd, __const void *__buf, size_t srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); - socket_fd_api *p_socket_object = nullptr; + sockinfo *p_socket_object = nullptr; p_socket_object = fd_collection_get_sockfd(__fd); if (p_socket_object) { struct iovec piov[1] = {{(void *)__buf, __nbytes}}; @@ 
-1840,7 +1840,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendfile)(int out_fd, int in_fd, off_t *offset srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, offset, offset ? *offset : 0, count); - socket_fd_api *p_socket_object = fd_collection_get_sockfd(out_fd); + sockinfo *p_socket_object = fd_collection_get_sockfd(out_fd); if (!p_socket_object) { return SYSCALL(sendfile, out_fd, in_fd, offset, count); } @@ -1856,7 +1856,7 @@ EXPORT_SYMBOL ssize_t XLIO_SYMBOL(sendfile64)(int out_fd, int in_fd, __off64_t * srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, offset, offset ? *offset : 0, count); - socket_fd_api *p_socket_object = fd_collection_get_sockfd(out_fd); + sockinfo *p_socket_object = fd_collection_get_sockfd(out_fd); if (!p_socket_object) { return SYSCALL(sendfile64, out_fd, in_fd, offset, count); } @@ -2289,7 +2289,7 @@ EXPORT_SYMBOL pid_t XLIO_SYMBOL(fork)(void) if (g_p_app && g_p_app->type == APP_NGINX) { g_p_app->map_thread_id[gettid()] = worker_index; /* Child process needs information about - * listen socket_fd_api objects, so pass this using parent`s g_p_fd_collection. + * listen sockinfo objects, so pass this using parent`s g_p_fd_collection. * It is possible as far as parent`s g_p_fd_collection is not deleted * by reset_globals() */ diff --git a/src/core/sock/socket_fd_api.cpp b/src/core/sock/socket_fd_api.cpp index 2714e0781..66c1b06c6 100644 --- a/src/core/sock/socket_fd_api.cpp +++ b/src/core/sock/socket_fd_api.cpp @@ -336,13 +336,6 @@ void socket_fd_api::remove_epoll_context(epfd_info *epfd) } } -void socket_fd_api::notify_epoll_context(uint32_t events) -{ - if (m_econtext) { - m_econtext->insert_epoll_event_cb(this, events); - } -} - void socket_fd_api::notify_epoll_context_add_ring(ring *ring) { if (m_econtext) { diff --git a/src/core/sock/socket_fd_api.h b/src/core/sock/socket_fd_api.h index 1f2e82e09..342bdd657 100644 --- a/src/core/sock/socket_fd_api.h +++ b/src/core/sock/socket_fd_api.h @@ -277,7 +277,6 @@ class socket_fd_api { } protected: - void notify_epoll_context(uint32_t events); void notify_epoll_context_add_ring(ring *ring); void notify_epoll_context_remove_ring(ring *ring); bool notify_epoll_context_verify(epfd_info *epfd); diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index c039d64c7..ecf459ac0 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -2219,3 +2219,10 @@ void sockinfo::handle_cmsg(struct msghdr *msg, int flags) cm_state.mhdr->msg_controllen = cm_state.cmsg_bytes_consumed; } + +void sockinfo::insert_epoll_event(uint64_t events) +{ + if (m_econtext) { + m_econtext->insert_epoll_event_cb(this, static_cast(events)); + } +} diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 40c56c5ff..c1915371f 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -43,6 +43,7 @@ #include "util/xlio_stats.h" #include "util/sys_vars.h" #include "util/wakeup_pipe.h" +#include "iomux/epfd_info.h" #include "proto/flow_tuple.h" #include "proto/mem_buf_desc.h" #include "proto/dst_entry.h" @@ -179,9 +180,15 @@ class sockinfo : public socket_fd_api { void set_m_n_sysvar_rx_num_buffs_reuse(int val) { m_n_sysvar_rx_num_buffs_reuse = val; } #endif + int fcntl(int __cmd, unsigned long int __arg) override; + int fcntl64(int __cmd, unsigned long int __arg) override; + int ioctl(unsigned long int __request, unsigned long int __arg) override; + int setsockopt(int __level, int __optname, const void *__optval, socklen_t 
__optlen) override; + int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) override; void consider_rings_migration_rx() override; int add_epoll_context(epfd_info *epfd) override; void remove_epoll_context(epfd_info *epfd) override; + int register_callback(xlio_recv_callback_t callback, void *context) override; inline bool set_flow_tag(uint32_t flow_tag_id) { @@ -205,7 +212,7 @@ class sockinfo : public socket_fd_api { void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) override; uint32_t get_flow_tag_val() { return m_flow_tag_id; } inline in_protocol_t get_protocol(void) { return m_protocol; } - + bool is_shadow_socket_present() override { return m_fd >= 0 && m_fd != m_rx_epfd; } bool validate_and_convert_mapped_ipv4(sock_addr &sock) const; void socket_stats_init(); @@ -237,13 +244,8 @@ class sockinfo : public socket_fd_api { } virtual void set_blocking(bool is_blocked); - int fcntl(int __cmd, unsigned long int __arg) override; - int fcntl64(int __cmd, unsigned long int __arg) override; - int ioctl(unsigned long int __request, unsigned long int __arg) override; - int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) override; int setsockopt_kernel(int __level, int __optname, const void *__optval, socklen_t __optlen, int supported, bool allow_priv); - int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) override; virtual mem_buf_desc_t *get_front_m_rx_pkt_ready_list() = 0; virtual size_t get_size_m_rx_pkt_ready_list() = 0; @@ -264,7 +266,6 @@ class sockinfo : public socket_fd_api { virtual void post_deqeue(bool release_buff) = 0; virtual int os_epoll_wait(epoll_event *ep_events, int maxevents); virtual int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags) = 0; - int register_callback(xlio_recv_callback_t callback, void *context) override; virtual size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, int *p_out_flags); @@ -305,7 +306,7 @@ class sockinfo : public socket_fd_api { int set_sockopt_prio(__const void *__optval, socklen_t __optlen); bool ipv6_set_addr_sel_pref(int val); int ipv6_get_addr_sel_pref(); - + void insert_epoll_event(uint64_t events); virtual void handle_ip_pktinfo(struct cmsg_state *cm_state) = 0; inline void handle_recv_timestamping(struct cmsg_state *cm_state); inline void handle_recv_errqueue(struct cmsg_state *cm_state); @@ -317,7 +318,6 @@ class sockinfo : public socket_fd_api { int os_wait_sock_rx_epfd(epoll_event *ep_events, int maxevents); virtual bool try_un_offloading(); // un-offload the socket if possible - bool is_shadow_socket_present() override { return m_fd >= 0 && m_fd != m_rx_epfd; } inline bool is_socketxtreme() { return safe_mce_sys().enable_socketxtreme; } inline void set_events_socketxtreme(uint64_t events) @@ -352,7 +352,7 @@ class sockinfo : public socket_fd_api { set_events_socketxtreme(events); } - socket_fd_api::notify_epoll_context((uint32_t)events); + insert_epoll_event(events); } inline void save_strq_stats(uint32_t packet_strides) diff --git a/src/core/sock/sockinfo_ulp.h b/src/core/sock/sockinfo_ulp.h index 74941ebc2..b9970d1a2 100644 --- a/src/core/sock/sockinfo_ulp.h +++ b/src/core/sock/sockinfo_ulp.h @@ -33,7 +33,7 @@ #ifndef _SOCKINFO_ULP_H #define _SOCKINFO_ULP_H -#include "socket_fd_api.h" /* xlio_tx_call_attr_t */ +#include "sockinfo.h" /* xlio_tx_call_attr_t */ #include "proto/dst_entry.h" /* xlio_send_attr */ #include "proto/tls.h" /* xlio_tls_info */ #include "lwip/err.h" /* err_t */ From 
fcbba6ebfd81069ebf6dd7b6644486ec34132891 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Thu, 15 Feb 2024 11:42:37 +0200 Subject: [PATCH 114/169] issue: 3777348 Merging socket_fd_api with sockinfo Removing redundant base from sockinfo allows to put data path members near the vptr for better cache utilization. Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 2 - src/core/proto/dst_entry.h | 11 +- src/core/sock/socket_fd_api.cpp | 375 -------------------------------- src/core/sock/socket_fd_api.h | 309 -------------------------- src/core/sock/sockinfo.cpp | 162 ++++++++++++-- src/core/sock/sockinfo.h | 249 +++++++++++++++++---- src/core/sock/sockinfo_tcp.cpp | 17 +- src/core/sock/sockinfo_tcp.h | 15 +- src/core/sock/sockinfo_udp.cpp | 57 ++++- src/core/sock/sockinfo_udp.h | 26 +++ 10 files changed, 462 insertions(+), 761 deletions(-) delete mode 100644 src/core/sock/socket_fd_api.cpp delete mode 100644 src/core/sock/socket_fd_api.h diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 164b5140f..a87522a3e 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -151,7 +151,6 @@ libxlio_la_SOURCES := \ sock/sockinfo_tcp.cpp \ sock/tcp_seg_pool.cpp \ sock/fd_collection.cpp \ - sock/socket_fd_api.cpp \ sock/sock-redirect.cpp \ sock/sock-app.cpp \ sock/sock-extra.cpp \ @@ -274,7 +273,6 @@ libxlio_la_SOURCES := \ \ sock/cleanable_obj.h \ sock/fd_collection.h \ - sock/socket_fd_api.h \ sock/sockinfo.h \ sock/sockinfo_tcp.h \ sock/sockinfo_udp.h \ diff --git a/src/core/proto/dst_entry.h b/src/core/proto/dst_entry.h index 3b2bc4813..f9afa28b6 100644 --- a/src/core/proto/dst_entry.h +++ b/src/core/proto/dst_entry.h @@ -40,7 +40,6 @@ #include "vlogger/vlogger.h" #include "utils/lock_wrapper.h" -#include "core/sock/socket_fd_api.h" #include "core/proto/route_entry.h" #include "core/proto/route_val.h" #include "core/proto/neighbour_table_mgr.h" @@ -57,6 +56,16 @@ class xlio_tis; class sockinfo; +typedef enum { + TX_WRITE = 13, + TX_WRITEV, + TX_SEND, + TX_SENDTO, + TX_SENDMSG, + TX_FILE, + TX_UNDEF +} tx_call_t; + struct socket_data { int fd; uint8_t ttl_hop_limit; diff --git a/src/core/sock/socket_fd_api.cpp b/src/core/sock/socket_fd_api.cpp deleted file mode 100644 index 66c1b06c6..000000000 --- a/src/core/sock/socket_fd_api.cpp +++ /dev/null @@ -1,375 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include - -#include -#include -#include "utils/bullseye.h" -#include "sock-redirect.h" -#include "sock-app.h" - -#include "socket_fd_api.h" - -#define MODULE_NAME "sapi" -#undef MODULE_HDR_INFO -#define MODULE_HDR_INFO MODULE_NAME "[fd=%d]:%d:%s() " -#undef __INFO__ -#define __INFO__ m_fd - -socket_fd_api::socket_fd_api(int fd) - : m_epoll_event_flags(0) - , m_fd(fd) - , m_n_sysvar_select_poll_os_ratio(safe_mce_sys().select_poll_os_ratio) - , m_econtext(nullptr) -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - , m_is_for_socket_pool(false) - , m_back_log(0) -#endif -{ -} - -socket_fd_api::~socket_fd_api() -{ - bool toclose = safe_mce_sys().deferred_close && m_fd >= 0; - -#if defined(DEFINED_NGINX) - if (g_p_app->type == APP_NGINX) { - // Sockets from a socket pool are not closed during close(), so do it now. - toclose = toclose || (m_is_for_socket_pool && m_fd >= 0); - } -#endif - - if (toclose) { - SYSCALL(close, m_fd); - } -} - -void socket_fd_api::destructor_helper() -{ -} - -int socket_fd_api::shutdown(int __how) -{ - __log_info_func(""); - int ret = SYSCALL(shutdown, m_fd, __how); - if (ret) { - __log_info_dbg("shutdown failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::bind(const sockaddr *__addr, socklen_t __addrlen) -{ - __log_info_func(""); - int ret = SYSCALL(bind, m_fd, __addr, __addrlen); - if (ret) { - __log_info_dbg("bind failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::connect(const sockaddr *__to, socklen_t __tolen) -{ - __log_info_func(""); - int ret = SYSCALL(connect, m_fd, __to, __tolen); - if (ret) { - __log_info_dbg("connect failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::accept(struct sockaddr *__addr, socklen_t *__addrlen) -{ - __log_info_func(""); - int ret = SYSCALL(accept, m_fd, __addr, __addrlen); - if (ret < 0) { - __log_info_dbg("accept failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) -{ - __log_info_func(""); - int ret = SYSCALL(accept4, m_fd, __addr, __addrlen, __flags); - if (ret < 0) { - __log_info_dbg("accept4 failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::listen(int backlog) -{ - __log_info_func(""); - int ret = SYSCALL(listen, m_fd, backlog); - if (ret < 0) { - __log_info_dbg("listen failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::getsockname(sockaddr *__name, socklen_t *__namelen) -{ - __log_info_func(""); - int ret = SYSCALL(getsockname, m_fd, __name, __namelen); - if (ret) { - __log_info_dbg("getsockname failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::getpeername(sockaddr *__name, socklen_t *__namelen) -{ - __log_info_func(""); - int ret = SYSCALL(getpeername, m_fd, __name, __namelen); - if (ret) { - __log_info_dbg("getpeername failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::setsockopt(int __level, int __optname, __const void *__optval, - socklen_t __optlen) -{ - __log_info_func(""); - int ret = SYSCALL(setsockopt, m_fd, __level, __optname, __optval, __optlen); - if (ret) { - __log_info_dbg("setsockopt failed (ret=%d %m)", ret); - } - return ret; -} - -int socket_fd_api::getsockopt(int __level, int __optname, void *__optval, 
socklen_t *__optlen) -{ - __log_info_func(""); - int ret = SYSCALL(getsockopt, m_fd, __level, __optname, __optval, __optlen); - if (ret) { - __log_info_dbg("getsockopt failed (ret=%d %m)", ret); - } - return ret; -} - -bool socket_fd_api::is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array) -{ - NOT_IN_USE(p_poll_sn); - NOT_IN_USE(p_fd_array); - __log_info_funcall(""); - return false; -} - -void socket_fd_api::set_immediate_os_sample() -{ - __log_info_funcall(""); - return; -} - -void socket_fd_api::unset_immediate_os_sample() -{ - __log_info_funcall(""); - return; -} - -bool socket_fd_api::is_writeable() -{ - __log_info_funcall(""); - return true; -} - -bool socket_fd_api::is_errorable(int *errors) -{ - NOT_IN_USE(errors); - __log_info_funcall(""); - return false; -} - -void socket_fd_api::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) -{ - int epoll_fd = get_epoll_context_fd(); - - // Socket data - vlog_printf(log_level, "Fd number : %d\n", m_fd); - if (epoll_fd) { - vlog_printf(log_level, "Socket epoll Fd : %d\n", epoll_fd); - vlog_printf(log_level, "Socket epoll flags : 0x%x\n", m_fd_rec.events); - } -} - -ssize_t socket_fd_api::rx_os(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, - const int flags, sockaddr *__from, socklen_t *__fromlen, - struct msghdr *__msg) -{ - errno = 0; - switch (call_type) { - case RX_READ: - __log_info_func("calling os receive with orig read"); - return SYSCALL(read, m_fd, p_iov[0].iov_base, p_iov[0].iov_len); - - case RX_READV: - __log_info_func("calling os receive with orig readv"); - return SYSCALL(readv, m_fd, p_iov, sz_iov); - - case RX_RECV: - __log_info_func("calling os receive with orig recv"); - return SYSCALL(recv, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags); - - case RX_RECVFROM: - __log_info_func("calling os receive with orig recvfrom"); - return SYSCALL(recvfrom, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags, __from, - __fromlen); - - case RX_RECVMSG: { - __log_info_func("calling os receive with orig recvmsg"); - return SYSCALL(recvmsg, m_fd, __msg, flags); - } - } - return (ssize_t)-1; -} - -ssize_t socket_fd_api::tx_os(const tx_call_t call_type, const iovec *p_iov, const ssize_t sz_iov, - const int __flags, const sockaddr *__to, const socklen_t __tolen) -{ - errno = 0; - - // Ignore dummy messages for OS - if (unlikely(IS_DUMMY_PACKET(__flags))) { - errno = EINVAL; - return -1; - } - - switch (call_type) { - case TX_WRITE: - __log_info_func("calling os transmit with orig write"); - return SYSCALL(write, m_fd, p_iov[0].iov_base, p_iov[0].iov_len); - - case TX_WRITEV: - __log_info_func("calling os transmit with orig writev"); - return SYSCALL(writev, m_fd, p_iov, sz_iov); - - case TX_SEND: - __log_info_func("calling os transmit with orig send"); - return SYSCALL(send, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags); - - case TX_SENDTO: - __log_info_func("calling os transmit with orig sendto"); - return SYSCALL(sendto, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags, __to, __tolen); - - case TX_SENDMSG: { - msghdr __message; - memset(&__message, 0, sizeof(__message)); - __message.msg_iov = (iovec *)p_iov; - __message.msg_iovlen = sz_iov; - __message.msg_name = (void *)__to; - __message.msg_namelen = __tolen; - - __log_info_func("calling os transmit with orig sendmsg"); - return SYSCALL(sendmsg, m_fd, &__message, __flags); - } - default: - __log_info_func("calling undefined os call type!"); - break; - } - return (ssize_t)-1; -} - -int socket_fd_api::register_callback(xlio_recv_callback_t callback, void 
*context) -{ - NOT_IN_USE(callback); - NOT_IN_USE(context); - return -1; -} - -int socket_fd_api::recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, - size_t count) -{ - NOT_IN_USE(pkts); - NOT_IN_USE(count); - return -1; -} - -int socket_fd_api::add_epoll_context(epfd_info *epfd) -{ - if (!m_econtext) { - // This socket is not registered to any epfd - m_econtext = epfd; - return 0; - } else { - // Currently XLIO does not support more then 1 epfd listed - errno = (m_econtext == epfd) ? EEXIST : ENOMEM; - return -1; - } -} - -void socket_fd_api::remove_epoll_context(epfd_info *epfd) -{ - if (m_econtext == epfd) { - m_econtext = nullptr; - } -} - -void socket_fd_api::notify_epoll_context_add_ring(ring *ring) -{ - if (m_econtext) { - m_econtext->increase_ring_ref_count(ring); - } -} - -void socket_fd_api::notify_epoll_context_remove_ring(ring *ring) -{ - if (m_econtext) { - m_econtext->decrease_ring_ref_count(ring); - } -} - -bool socket_fd_api::notify_epoll_context_verify(epfd_info *epfd) -{ - return m_econtext == epfd; -} - -void socket_fd_api::notify_epoll_context_fd_is_offloaded() -{ - if (m_econtext) { - m_econtext->remove_fd_from_epoll_os(m_fd); - } -} - -int socket_fd_api::get_epoll_context_fd() -{ - if (m_econtext) { - return m_econtext->get_epoll_fd(); - } - return 0; -} - -#if _BullseyeCoverage -#pragma BullseyeCoverage on -#endif diff --git a/src/core/sock/socket_fd_api.h b/src/core/sock/socket_fd_api.h deleted file mode 100644 index 342bdd657..000000000 --- a/src/core/sock/socket_fd_api.h +++ /dev/null @@ -1,309 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef SOCKET_FD_API_H -#define SOCKET_FD_API_H - -#include "config.h" -#include -#include "xlio_extra.h" - -#include -#include -#include - -#ifndef SOCK_NONBLOCK -#define SOCK_NONBLOCK 04000 -#endif -#ifndef SOCK_CLOEXEC -#define SOCK_CLOEXEC 02000000 -#endif -#ifndef SO_MAX_PACING_RATE -#define SO_MAX_PACING_RATE 47 -#endif - -#define IS_DUMMY_PACKET(flags) (flags & XLIO_SND_FLAGS_DUMMY) - -class cq_mgr_rx; -class epfd_info; -class mem_buf_desc_t; - -struct epoll_fd_rec { - uint32_t events; - epoll_data epdata; - int offloaded_index; // offloaded fd index + 1 - - epoll_fd_rec() { reset(); } - - void reset() - { - this->events = 0; - memset(&this->epdata, 0, sizeof(this->epdata)); - this->offloaded_index = 0; - } -}; - -typedef enum { - TX_WRITE = 13, - TX_WRITEV, - TX_SEND, - TX_SENDTO, - TX_SENDMSG, - TX_FILE, - TX_UNDEF -} tx_call_t; - -enum { - TX_FLAG_NO_PARTIAL_WRITE = 1 << 0, -}; - -/* This structure describes the send operation attributes - * Used attributes can be of different types TX_FILE, TX_WRITE, TX_WRITEV, TX_SEND, TX_SENDTO, - * TX_SENDMSG - */ -typedef struct xlio_tx_call_attr { - tx_call_t opcode; - struct _attr { - struct iovec *iov; - ssize_t sz_iov; - int flags; - struct sockaddr *addr; - socklen_t len; - const struct msghdr *hdr; - } attr; - - unsigned xlio_flags; - pbuf_desc priv; - - ~xlio_tx_call_attr() = default; - - void clear() - { - opcode = TX_UNDEF; - memset(&attr, 0, sizeof(attr)); - memset(&priv, 0, sizeof(priv)); - priv.attr = PBUF_DESC_NONE; - xlio_flags = 0; - } - - xlio_tx_call_attr() { clear(); } -} xlio_tx_call_attr_t; - -typedef enum { RX_READ = 23, RX_READV, RX_RECV, RX_RECVFROM, RX_RECVMSG } rx_call_t; - -#define FD_ARRAY_MAX 24 -typedef struct { - // coverity[member_decl] - int fd_list[FD_ARRAY_MAX]; // Note: An FD might appear twice in the list, - // the user of this array will need to handle it correctly - int fd_max; - int fd_count; -} fd_array_t; - -enum fd_type_t { - FD_TYPE_SOCKET = 0, - FD_TYPE_PIPE, -}; - -typedef xlio_list_t xlio_desc_list_t; - -/** - * - * class socket_fd_api - * - */ - -class socket_fd_api { -public: - socket_fd_api(int fd); - virtual ~socket_fd_api(); - - virtual void clean_socket_obj() { delete this; } - - virtual void setPassthrough() {} - virtual bool isPassthrough() { return false; } - - virtual int prepareListen() { return 0; } - - virtual void destructor_helper(); - - virtual int shutdown(int __how); - - virtual int listen(int backlog); - - virtual int accept(struct sockaddr *__addr, socklen_t *__addrlen); - - virtual int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags); - - virtual int bind(const sockaddr *__addr, socklen_t __addrlen); - - virtual int connect(const sockaddr *__to, socklen_t __tolen); - - virtual int getsockname(sockaddr *__name, socklen_t *__namelen); - virtual int getpeername(sockaddr *__name, socklen_t *__namelen); - - virtual int setsockopt(int __level, int __optname, __const void *__optval, socklen_t __optlen); - - virtual int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen); - virtual int fcntl(int __cmd, unsigned long int __arg) = 0; - virtual int fcntl64(int __cmd, unsigned long int __arg) = 0; - - virtual int ioctl(unsigned long int __request, unsigned long int __arg) = 0; - - virtual ssize_t rx(const rx_call_t call_type, iovec *iov, const ssize_t iovlen, - int *p_flags = nullptr, sockaddr *__from = nullptr, - socklen_t *__fromlen = nullptr, struct msghdr *__msg = nullptr) = 0; - - virtual bool is_readable(uint64_t *p_poll_sn, 
fd_array_t *p_fd_array = nullptr); - - virtual bool is_writeable(); - - virtual bool is_errorable(int *errors); - - // Instructing the socket to immediately sample/un-sample the OS in receive flow - virtual void set_immediate_os_sample(); - virtual void unset_immediate_os_sample(); - - virtual bool is_outgoing() { return false; } - virtual bool is_incoming() { return false; } - virtual bool is_closable() { return true; } - virtual bool is_shadow_socket_present() { return m_fd >= 0; } - -#if defined(DEFINED_NGINX) - virtual void prepare_to_close_socket_pool(bool _push_pop) { NOT_IN_USE(_push_pop); } - virtual void set_params_for_socket_pool() {} -#endif - - // In some cases we need the socket can't be deleted immidiatly - //(for example STREAME sockets) - // This prepares the socket for termination and return true if the - // Return val: true is the socket is already closable and false otherwise - virtual bool prepare_to_close(bool process_shutdown = false) - { - NOT_IN_USE(process_shutdown); - return is_closable(); - } - - virtual ssize_t tx(xlio_tx_call_attr_t &tx_arg) = 0; - - virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); - - virtual int register_callback(xlio_recv_callback_t callback, void *context); - - virtual int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, - size_t count); - - virtual int get_fd() const { return m_fd; }; - - // true if fd must be skipped from OS select() - // If m_n_sysvar_select_poll_os_ratio == 0, it means that user configured XLIO not to poll os - // (i.e. TRUE...) - virtual bool skip_os_select() { return (!m_n_sysvar_select_poll_os_ratio); }; - - virtual fd_type_t get_type() = 0; - -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - // This socket options copy is currently implemented for nginx and for very specific options. - // This copy is called as part of fork() flow of nginx specifically. - // If a generic fork() is implemented, this copy should be reimplemented in a more generic way, - // see is_inherited_option mechanism of sockinfo_tcp for an example. 
- virtual void copy_sockopt_fork(const socket_fd_api *copy_from) = 0; -#endif - - virtual void consider_rings_migration_rx() {} - virtual int add_epoll_context(epfd_info *epfd); - virtual void remove_epoll_context(epfd_info *epfd); - int get_epoll_context_fd(); - - // Calling OS transmit - ssize_t tx_os(const tx_call_t call_type, const iovec *p_iov, const ssize_t sz_iov, - const int __flags, const sockaddr *__to, const socklen_t __tolen); - - static inline size_t pendig_to_remove_node_offset() - { - return NODE_OFFSET(socket_fd_api, pendig_to_remove_node); - } - - static inline size_t socket_fd_list_node_offset() - { - return NODE_OFFSET(socket_fd_api, socket_fd_list_node); - } - - static inline size_t ep_ready_fd_node_offset() - { - return NODE_OFFSET(socket_fd_api, ep_ready_fd_node); - } - - static inline size_t ep_info_fd_node_offset() - { - return NODE_OFFSET(socket_fd_api, ep_info_fd_node); - } - - virtual int get_rings_num() { return 0; } - virtual int get_rings_fds(int *ring_fds, int ring_fds_sz) - { - NOT_IN_USE(ring_fds); - NOT_IN_USE(ring_fds_sz); - return 0; - } - -protected: - void notify_epoll_context_add_ring(ring *ring); - void notify_epoll_context_remove_ring(ring *ring); - bool notify_epoll_context_verify(epfd_info *epfd); - void notify_epoll_context_fd_is_offloaded(); - - // Calling OS receive - ssize_t rx_os(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, const int flags, - sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg); - -public: - list_node pendig_to_remove_node; - list_node socket_fd_list_node; - list_node ep_ready_fd_node; - uint32_t m_epoll_event_flags; - list_node ep_info_fd_node; - epoll_fd_rec m_fd_rec; - -protected: - // identification information - int m_fd; - const uint32_t m_n_sysvar_select_poll_os_ratio; - epfd_info *m_econtext; - -public: -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - bool m_is_for_socket_pool; // true when this fd will be used for socket pool on close - int m_back_log; -#endif -}; -#endif diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index ecf459ac0..19b91f4e4 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -60,7 +60,10 @@ #define si_logfuncall __log_info_funcall sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) - : socket_fd_api(fd) + : m_epoll_event_flags(0) + , m_fd(fd) + , m_n_sysvar_select_poll_os_ratio(safe_mce_sys().select_poll_os_ratio) + , m_econtext(NULL) , m_reuseaddr(false) , m_reuseport(false) , m_flow_tag_enabled(false) @@ -162,6 +165,19 @@ sockinfo::~sockinfo() xlio_stats_instance_remove_socket_block(m_p_socket_stats); m_socketxtreme.ec_cache.clear(); + + bool toclose = safe_mce_sys().deferred_close && m_fd >= 0; + +#if defined(DEFINED_NGINX) + if (g_p_app->type == APP_NGINX) { + // Sockets from a socket pool are not closed during close(), so do it now. + toclose = toclose || (m_is_for_socket_pool && m_fd >= 0); + } +#endif + + if (toclose) { + SYSCALL(close, m_fd); + } } void sockinfo::socket_stats_init() @@ -260,6 +276,18 @@ int sockinfo::fcntl64(int __cmd, unsigned long int __arg) return SYSCALL(fcntl64, m_fd, __cmd, __arg); } +int sockinfo::get_epoll_context_fd() +{ + return (m_econtext ? 
m_econtext->get_epoll_fd() : 0); +} + +void sockinfo::insert_epoll_event(uint64_t events) +{ + if (m_econtext) { + m_econtext->insert_epoll_event_cb(this, static_cast(events)); + } +} + int sockinfo::set_ring_attr(xlio_ring_alloc_logic_attr *attr) { if ((attr->comp_mask & XLIO_RING_ALLOC_MASK_RING_ENGRESS) && attr->engress) { @@ -773,7 +801,7 @@ int sockinfo::getsockopt(int __level, int __optname, void *__optval, socklen_t * } #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) -void sockinfo::copy_sockopt_fork(const socket_fd_api *copy_from) +void sockinfo::copy_sockopt_fork(const sockinfo *copy_from) { const sockinfo *skinfo = dynamic_cast(copy_from); if (skinfo) { @@ -801,7 +829,7 @@ int sockinfo::get_sock_by_L3_L4(in_protocol_t protocol, const ip_address &ip, in assert(g_p_fd_collection); int map_size = g_p_fd_collection->get_fd_map_size(); for (int i = 0; i < map_size; i++) { - socket_fd_api *p_sock_i = g_p_fd_collection->get_sockfd(i); + sockinfo *p_sock_i = g_p_fd_collection->get_sockfd(i); if (!p_sock_i || p_sock_i->get_type() != FD_TYPE_SOCKET) { continue; } @@ -849,15 +877,6 @@ void sockinfo::save_stats_tx_os(int bytes) } } -size_t sockinfo::handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, - int *p_out_flags) -{ - NOT_IN_USE(payload_size); - NOT_IN_USE(in_flags); - *p_out_flags &= ~MSG_TRUNC; // don't handle msg_trunc - return total_rx; -} - bool sockinfo::attach_receiver(flow_tuple_with_local_if &flow_key) { // This function should be called from within mutex protected context of the sockinfo!!! @@ -1303,7 +1322,15 @@ int sockinfo::add_epoll_context(epfd_info *epfd) m_rx_ring_map_lock.lock(); lock_rx_q(); - ret = socket_fd_api::add_epoll_context(epfd); + if (!m_econtext) { + // This socket is not registered to any epfd + m_econtext = epfd; + } else { + // Currently XLIO does not support more then 1 epfd listed + errno = (m_econtext == epfd) ? 
EEXIST : ENOMEM; + ret = -1; + } + if (ret < 0) { goto unlock_locks; } @@ -1314,7 +1341,9 @@ int sockinfo::add_epoll_context(epfd_info *epfd) sock_ring_map_iter = m_rx_ring_map.begin(); while (sock_ring_map_iter != m_rx_ring_map.end()) { - notify_epoll_context_add_ring(sock_ring_map_iter->first); + if (m_econtext) { + m_econtext->increase_ring_ref_count(sock_ring_map_iter->first); + } sock_ring_map_iter++; } @@ -1331,7 +1360,7 @@ void sockinfo::remove_epoll_context(epfd_info *epfd) m_rx_ring_map_lock.lock(); lock_rx_q(); - if (!notify_epoll_context_verify(epfd)) { + if (m_econtext != epfd) { unlock_rx_q(); m_rx_ring_map_lock.unlock(); return; @@ -1339,11 +1368,16 @@ void sockinfo::remove_epoll_context(epfd_info *epfd) rx_ring_map_t::const_iterator sock_ring_map_iter = m_rx_ring_map.begin(); while (sock_ring_map_iter != m_rx_ring_map.end()) { - notify_epoll_context_remove_ring(sock_ring_map_iter->first); + if (m_econtext) { + m_econtext->decrease_ring_ref_count(sock_ring_map_iter->first); + } sock_ring_map_iter++; } - socket_fd_api::remove_epoll_context(epfd); + if (m_econtext == epfd) { + m_econtext = NULL; + } + if (safe_mce_sys().skip_poll_in_rx == SKIP_POLL_IN_RX_EPOLL_ONLY) { m_skip_cq_poll_in_rx = false; } @@ -1370,7 +1404,14 @@ void sockinfo::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) bool b_any_activity = false; - socket_fd_api::statistics_print(log_level); + int epoll_fd = get_epoll_context_fd(); + + // Socket data + vlog_printf(log_level, "Fd number : %d\n", m_fd); + if (epoll_fd) { + vlog_printf(log_level, "Socket epoll Fd : %d\n", epoll_fd); + vlog_printf(log_level, "Socket epoll flags : 0x%x\n", m_fd_rec.events); + } vlog_printf(log_level, "Bind info : %s\n", m_bound.to_str_ip_port(true).c_str()); vlog_printf(log_level, "Connection info : %s\n", m_connected.to_str_ip_port(true).c_str()); @@ -1602,7 +1643,9 @@ void sockinfo::rx_add_ring_cb(ring *p_ring) // first in order. possible race between removal of fd from epoll (epoll_ctl del, or epoll // close) and here. need to add a third-side lock (fd_collection?) to sync between epoll and // socket. - notify_epoll_context_add_ring(p_ring); + if (m_econtext) { + m_econtext->increase_ring_ref_count(p_ring); + } } lock_rx_q(); @@ -1688,7 +1731,9 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) // first in order. possible race between removal of fd from epoll (epoll_ctl del, or epoll // close) and here. need to add a third-side lock (fd_collection?) to sync between epoll and // socket. 
- notify_epoll_context_remove_ring(base_ring); + if (m_econtext) { + m_econtext->decrease_ring_ref_count(base_ring); + } } // no need for m_lock_rcv since temp_rx_reuse is on the stack @@ -1931,7 +1976,7 @@ void sockinfo::destructor_helper() m_p_connected_dst_entry = nullptr; } -int sockinfo::register_callback(xlio_recv_callback_t callback, void *context) +int sockinfo::register_callback_ctx(xlio_recv_callback_t callback, void *context) { m_rx_callback = callback; m_rx_callback_context = context; @@ -2220,9 +2265,78 @@ void sockinfo::handle_cmsg(struct msghdr *msg, int flags) cm_state.mhdr->msg_controllen = cm_state.cmsg_bytes_consumed; } -void sockinfo::insert_epoll_event(uint64_t events) +ssize_t sockinfo::rx_os(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, const int flags, + sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg) { - if (m_econtext) { - m_econtext->insert_epoll_event_cb(this, static_cast(events)); + errno = 0; + switch (call_type) { + case RX_READ: + __log_info_func("calling os receive with orig read"); + return SYSCALL(read, m_fd, p_iov[0].iov_base, p_iov[0].iov_len); + + case RX_READV: + __log_info_func("calling os receive with orig readv"); + return SYSCALL(readv, m_fd, p_iov, sz_iov); + + case RX_RECV: + __log_info_func("calling os receive with orig recv"); + return SYSCALL(recv, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags); + + case RX_RECVFROM: + __log_info_func("calling os receive with orig recvfrom"); + return SYSCALL(recvfrom, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, flags, __from, + __fromlen); + + case RX_RECVMSG: { + __log_info_func("calling os receive with orig recvmsg"); + return SYSCALL(recvmsg, m_fd, __msg, flags); + } + } + return (ssize_t)-1; +} + +ssize_t sockinfo::tx_os(const tx_call_t call_type, const iovec *p_iov, const ssize_t sz_iov, + const int __flags, const sockaddr *__to, const socklen_t __tolen) +{ + errno = 0; + + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; + } + + switch (call_type) { + case TX_WRITE: + __log_info_func("calling os transmit with orig write"); + return SYSCALL(write, m_fd, p_iov[0].iov_base, p_iov[0].iov_len); + + case TX_WRITEV: + __log_info_func("calling os transmit with orig writev"); + return SYSCALL(writev, m_fd, p_iov, sz_iov); + + case TX_SEND: + __log_info_func("calling os transmit with orig send"); + return SYSCALL(send, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags); + + case TX_SENDTO: + __log_info_func("calling os transmit with orig sendto"); + return SYSCALL(sendto, m_fd, p_iov[0].iov_base, p_iov[0].iov_len, __flags, __to, __tolen); + + case TX_SENDMSG: { + msghdr __message; + memset(&__message, 0, sizeof(__message)); + __message.msg_iov = (iovec *)p_iov; + __message.msg_iovlen = sz_iov; + __message.msg_name = (void *)__to; + __message.msg_namelen = __tolen; + + __log_info_func("calling os transmit with orig sendmsg"); + return SYSCALL(sendmsg, m_fd, &__message, __flags); + } + default: + __log_info_func("calling undefined os call type!"); + break; } + return (ssize_t)-1; } diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index c1915371f..18915a105 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -33,11 +33,14 @@ #include #include #include - +#include #include "config.h" +#include "xlio_extra.h" +#include "dev/cq_mgr_rx.h" +#include "dev/buffer_pool.h" +#include "sock/cleanable_obj.h" #include "vlogger/vlogger.h" #include "utils/lock_wrapper.h" -#include "xlio_extra.h" #include 
"util/data_updater.h" #include "util/sock_addr.h" #include "util/xlio_stats.h" @@ -50,8 +53,6 @@ #include "dev/net_device_table_mgr.h" #include "dev/ring_simple.h" #include "dev/ring_allocation_logic.h" - -#include "socket_fd_api.h" #include "sock-redirect.h" #include "sock-app.h" @@ -61,6 +62,19 @@ #define SI_RX_EPFD_EVENT_MAX 16 #define BYTE_TO_KB(byte_value) ((byte_value) / 125) #define KB_TO_BYTE(kbit_value) ((kbit_value)*125) +#define FD_ARRAY_MAX 24 + +#ifndef SOCK_NONBLOCK +#define SOCK_NONBLOCK 04000 +#endif +#ifndef SOCK_CLOEXEC +#define SOCK_CLOEXEC 02000000 +#endif +#ifndef SO_MAX_PACING_RATE +#define SO_MAX_PACING_RATE 47 +#endif + +#define IS_DUMMY_PACKET(flags) (flags & XLIO_SND_FLAGS_DUMMY) #if DEFINED_MISSING_NET_TSTAMP enum { @@ -105,6 +119,17 @@ enum { #define MSG_ZEROCOPY 0x4000000 #endif +typedef enum { RX_READ = 23, RX_READV, RX_RECV, RX_RECVFROM, RX_RECVMSG } rx_call_t; + +enum { + TX_FLAG_NO_PARTIAL_WRITE = 1 << 0, +}; + +enum fd_type_t { + FD_TYPE_SOCKET = 0, + FD_TYPE_PIPE, +}; + struct cmsg_state { struct msghdr *mhdr; struct cmsghdr *cmhdr; @@ -124,6 +149,21 @@ struct buff_info_t { descq_t rx_reuse; }; +struct epoll_fd_rec { + uint32_t events; + epoll_data epdata; + int offloaded_index; // offloaded fd index + 1 + + epoll_fd_rec() { reset(); } + + void reset() + { + this->events = 0; + memset(&this->epdata, 0, sizeof(this->epdata)); + this->offloaded_index = 0; + } +}; + typedef struct { net_device_entry *p_nde; net_device_val *p_ndv; @@ -131,7 +171,47 @@ typedef struct { int refcnt; } net_device_resources_t; +typedef struct { + // coverity[member_decl] + int fd_list[FD_ARRAY_MAX]; // Note: An FD might appear twice in the list, + // the user of this array will need to handle it correctly + int fd_max; + int fd_count; +} fd_array_t; + +/* This structure describes the send operation attributes + * Used attributes can be of different types TX_FILE, TX_WRITE, TX_WRITEV, TX_SEND, TX_SENDTO, + * TX_SENDMSG + */ +typedef struct xlio_tx_call_attr { + tx_call_t opcode; + struct _attr { + struct iovec *iov; + ssize_t sz_iov; + int flags; + struct sockaddr *addr; + socklen_t len; + const struct msghdr *hdr; + } attr; + + unsigned xlio_flags; + pbuf_desc priv; + + ~xlio_tx_call_attr() {}; + void clear(void) + { + opcode = TX_UNDEF; + memset(&attr, 0, sizeof(attr)); + memset(&priv, 0, sizeof(priv)); + priv.attr = PBUF_DESC_NONE; + xlio_flags = 0; + } + + xlio_tx_call_attr() { clear(); } +} xlio_tx_call_attr_t; + typedef std::unordered_map rx_net_device_map_t; +typedef xlio_list_t xlio_desc_list_t; /* * Sockinfo setsockopt() return values @@ -154,10 +234,12 @@ typedef std::unordered_map rx_ring_map_t; // see route.c in Linux kernel const uint8_t ip_tos2prio[16] = {0, 0, 0, 0, 2, 2, 2, 2, 6, 6, 6, 6, 4, 4, 4, 4}; -class sockinfo : public socket_fd_api { +class epfd_info; + +class sockinfo { public: sockinfo(int fd, int domain, bool use_ring_locks); - ~sockinfo() override; + virtual ~sockinfo(); enum sockinfo_state { SOCKINFO_UNDEFINED, @@ -167,28 +249,101 @@ class sockinfo : public socket_fd_api { SOCKINFO_DESTROYING }; + static inline size_t pendig_to_remove_node_offset(void) + { + return NODE_OFFSET(sockinfo, pendig_to_remove_node); + } + + static inline size_t socket_fd_list_node_offset(void) + { + return NODE_OFFSET(sockinfo, socket_fd_list_node); + } + + static inline size_t ep_ready_fd_node_offset(void) + { + return NODE_OFFSET(sockinfo, ep_ready_fd_node); + } + + static inline size_t ep_info_fd_node_offset(void) + { + return NODE_OFFSET(sockinfo, ep_info_fd_node); + } + 
// Callback from lower layer notifying new receive packets // Return: 'true' if object queuing this receive packet // 'false' if not interested in this receive packet virtual bool rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void *pv_fd_ready_array) = 0; + int get_fd() const { return m_fd; }; + virtual void clean_socket_obj() = 0; + virtual void setPassthrough() = 0; + virtual bool isPassthrough() = 0; + virtual int prepareListen() = 0; + void destructor_helper(); + virtual int shutdown(int __how) = 0; + virtual int listen(int backlog) = 0; + virtual int accept(struct sockaddr *__addr, socklen_t *__addrlen) = 0; + virtual int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) = 0; + virtual int bind(const sockaddr *__addr, socklen_t __addrlen) = 0; + virtual int connect(const sockaddr *__to, socklen_t __tolen) = 0; + virtual int getsockname(sockaddr *__name, socklen_t *__namelen) = 0; + virtual int getpeername(sockaddr *__name, socklen_t *__namelen) = 0; + virtual int setsockopt(int __level, int __optname, __const void *__optval, + socklen_t __optlen) = 0; + virtual int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) = 0; + virtual bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL) = 0; + virtual bool is_writeable() = 0; + virtual bool is_errorable(int *errors) = 0; + virtual bool is_outgoing() = 0; + virtual bool is_incoming() = 0; + virtual bool is_closable() = 0; + virtual ssize_t tx(xlio_tx_call_attr_t &tx_arg) = 0; + virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) = 0; + virtual int register_callback(xlio_recv_callback_t callback, void *context) = 0; + int register_callback_ctx(xlio_recv_callback_t callback, void *context); + void consider_rings_migration_rx(); + int add_epoll_context(epfd_info *epfd); + void remove_epoll_context(epfd_info *epfd); + virtual int fcntl(int __cmd, unsigned long int __arg); + virtual int fcntl64(int __cmd, unsigned long int __arg); + virtual int ioctl(unsigned long int __request, unsigned long int __arg); + virtual fd_type_t get_type() = 0; + + virtual ssize_t rx(const rx_call_t call_type, iovec *iov, const ssize_t iovlen, + int *p_flags = 0, sockaddr *__from = NULL, socklen_t *__fromlen = NULL, + struct msghdr *__msg = NULL) = 0; + + virtual int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, + size_t count) = 0; + + // Instructing the socket to immediately sample/un-sample the OS in receive flow + virtual void set_immediate_os_sample() = 0; + virtual void unset_immediate_os_sample() = 0; + + // In some cases we need the socket can't be deleted immidiatly + //(for example STREAME sockets) + // This prepares the socket for termination and return true if the + // Return val: true is the socket is already closable and false otherwise + virtual bool prepare_to_close(bool process_shutdown = false) = 0; + + // true if fd must be skipped from OS select() + // If m_n_sysvar_select_poll_os_ratio == 0, it means that user configured XLIO not to poll os + // (i.e. TRUE...) + virtual bool skip_os_select() { return (!m_n_sysvar_select_poll_os_ratio); }; + #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - void copy_sockopt_fork(const socket_fd_api *copy_from) override; -#endif + // This socket options copy is currently implemented for nginx and for very specific options. + // This copy is called as part of fork() flow of nginx specifically. 
+ // If a generic fork() is implemented, this copy should be reimplemented in a more generic way, + // see is_inherited_option mechanism of sockinfo_tcp for an example. + void copy_sockopt_fork(const sockinfo *copy_from); #if defined(DEFINED_NGINX) + virtual void prepare_to_close_socket_pool(bool _push_pop) { NOT_IN_USE(_push_pop); } + virtual void set_params_for_socket_pool() {}; void set_m_n_sysvar_rx_num_buffs_reuse(int val) { m_n_sysvar_rx_num_buffs_reuse = val; } #endif - - int fcntl(int __cmd, unsigned long int __arg) override; - int fcntl64(int __cmd, unsigned long int __arg) override; - int ioctl(unsigned long int __request, unsigned long int __arg) override; - int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) override; - int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) override; - void consider_rings_migration_rx() override; - int add_epoll_context(epfd_info *epfd) override; - void remove_epoll_context(epfd_info *epfd) override; - int register_callback(xlio_recv_callback_t callback, void *context) override; +#endif inline bool set_flow_tag(uint32_t flow_tag_id) { @@ -207,12 +362,10 @@ class sockinfo : public socket_fd_api { inline bool is_blocking(void) { return m_b_blocking; } bool flow_in_reuse(void) { return m_reuseaddr | m_reuseport; } - int get_rings_fds(int *ring_fds, int ring_fds_sz) override; - int get_rings_num() override; - void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) override; + int get_rings_fds(int *ring_fds, int ring_fds_sz); + int get_rings_num(); uint32_t get_flow_tag_val() { return m_flow_tag_id; } inline in_protocol_t get_protocol(void) { return m_protocol; } - bool is_shadow_socket_present() override { return m_fd >= 0 && m_fd != m_rx_epfd; } bool validate_and_convert_mapped_ipv4(sock_addr &sock) const; void socket_stats_init(); @@ -236,6 +389,12 @@ class sockinfo : public socket_fd_api { } sa_family_t get_family() { return m_family; } + bool is_shadow_socket_present() { return m_fd >= 0 && m_fd != m_rx_epfd; } + int get_epoll_context_fd(); + + // Calling OS transmit + ssize_t tx_os(const tx_call_t call_type, const iovec *p_iov, const ssize_t sz_iov, + const int __flags, const sockaddr *__to, const socklen_t __tolen); protected: inline void set_rx_reuse_pending(bool is_pending = true) @@ -243,15 +402,16 @@ class sockinfo : public socket_fd_api { m_rx_reuse_buf_pending = is_pending; } - virtual void set_blocking(bool is_blocked); int setsockopt_kernel(int __level, int __optname, const void *__optval, socklen_t __optlen, int supported, bool allow_priv); + virtual void set_blocking(bool is_blocked); virtual mem_buf_desc_t *get_front_m_rx_pkt_ready_list() = 0; virtual size_t get_size_m_rx_pkt_ready_list() = 0; virtual void pop_front_m_rx_pkt_ready_list() = 0; virtual void push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff) = 0; + void notify_epoll_context(uint32_t events); void save_stats_rx_os(int bytes); void save_stats_tx_os(int bytes); void save_stats_rx_offload(int nbytes); @@ -266,9 +426,16 @@ class sockinfo : public socket_fd_api { virtual void post_deqeue(bool release_buff) = 0; virtual int os_epoll_wait(epoll_event *ep_events, int maxevents); virtual int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags) = 0; - + virtual void handle_ip_pktinfo(struct cmsg_state *cm_state) = 0; + virtual void lock_rx_q() = 0; + virtual void unlock_rx_q() = 0; + virtual bool try_un_offloading(); // un-offload the socket if possible virtual size_t handle_msg_trunc(size_t total_rx, 
size_t payload_size, int in_flags, - int *p_out_flags); + int *p_out_flags) = 0; + + // This callback will notify that socket is ready to receive and map the cq. + virtual void rx_add_ring_cb(ring *p_ring); + virtual void rx_del_ring_cb(ring *p_ring); bool attach_receiver(flow_tuple_with_local_if &flow_key); bool detach_receiver(flow_tuple_with_local_if &flow_key); @@ -288,15 +455,7 @@ class sockinfo : public socket_fd_api { transport_t find_target_family(role_t role, const struct sockaddr *sock_addr_first, const struct sockaddr *sock_addr_second = nullptr); - // This callback will notify that socket is ready to receive and map the cq. - virtual void rx_add_ring_cb(ring *p_ring); - virtual void rx_del_ring_cb(ring *p_ring); - - virtual void lock_rx_q() { m_lock_rcv.lock(); } - virtual void unlock_rx_q() { m_lock_rcv.unlock(); } - void shutdown_rx(); - void destructor_helper() override; int modify_ratelimit(dst_entry *p_dst_entry, struct xlio_rate_limit_t &rate_limit); void move_descs(ring *p_ring, descq_t *toq, descq_t *fromq, bool own); @@ -306,8 +465,6 @@ class sockinfo : public socket_fd_api { int set_sockopt_prio(__const void *__optval, socklen_t __optlen); bool ipv6_set_addr_sel_pref(int val); int ipv6_get_addr_sel_pref(); - void insert_epoll_event(uint64_t events); - virtual void handle_ip_pktinfo(struct cmsg_state *cm_state) = 0; inline void handle_recv_timestamping(struct cmsg_state *cm_state); inline void handle_recv_errqueue(struct cmsg_state *cm_state); void insert_cmsg(struct cmsg_state *cm_state, int level, int type, void *data, int len); @@ -316,9 +473,12 @@ class sockinfo : public socket_fd_api { void add_cqfd_to_sock_rx_epfd(ring *p_ring); void remove_cqfd_from_sock_rx_epfd(ring *p_ring); int os_wait_sock_rx_epfd(epoll_event *ep_events, int maxevents); - virtual bool try_un_offloading(); // un-offload the socket if possible - inline bool is_socketxtreme() { return safe_mce_sys().enable_socketxtreme; } + void insert_epoll_event(uint64_t events); + + // Calling OS receive + ssize_t rx_os(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, const int flags, + sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg); inline void set_events_socketxtreme(uint64_t events) { @@ -563,7 +723,22 @@ class sockinfo : public socket_fd_api { rfs *rfs_ptr = nullptr; +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) + bool m_is_for_socket_pool = false; // true when this fd will be used for socket pool on close + int m_back_log = 0; +#endif + + list_node pendig_to_remove_node; + list_node socket_fd_list_node; + list_node ep_ready_fd_node; + uint32_t m_epoll_event_flags; + list_node ep_info_fd_node; + epoll_fd_rec m_fd_rec; + protected: + int m_fd; // identification information + const uint32_t m_n_sysvar_select_poll_os_ratio; + epfd_info *m_econtext; bool m_reuseaddr; // to track setsockopt with SO_REUSEADDR bool m_reuseport; // to track setsockopt with SO_REUSEPORT bool m_flow_tag_enabled; // for this socket diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 6dd829d94..c7c7f1f4b 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -1049,7 +1049,7 @@ ssize_t sockinfo_tcp::tcp_tx(xlio_tx_call_attr_t &tx_arg) if (unlikely(m_sock_offload != TCP_SOCK_LWIP) || unlikely(is_invalid_iovec(p_iov, sz_iov))) { struct sockaddr *dst = tx_arg.attr.addr; socklen_t dstlen = tx_arg.attr.len; - ret = socket_fd_api::tx_os(tx_arg.opcode, p_iov, sz_iov, flags, dst, dstlen); + ret = tx_os(tx_arg.opcode, p_iov, sz_iov, flags, dst, dstlen); 
save_stats_tx_os(ret); return ret; } @@ -2374,7 +2374,7 @@ ssize_t sockinfo_tcp::rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov si_tcp_logfuncall(""); if (unlikely(m_sock_offload != TCP_SOCK_LWIP)) { int ret = 0; - ret = socket_fd_api::rx_os(call_type, p_iov, sz_iov, in_flags, __from, __fromlen, __msg); + ret = rx_os(call_type, p_iov, sz_iov, in_flags, __from, __fromlen, __msg); save_stats_rx_os(ret); return ret; } @@ -2718,8 +2718,8 @@ int sockinfo_tcp::connect(const sockaddr *__to, socklen_t __tolen) TRANS_XLIO) { passthrough_unlock("non offloaded socket --> connect only via OS"); return -1; - } else { - notify_epoll_context_fd_is_offloaded(); // remove fd from os epoll + } else if (m_econtext) { + m_econtext->remove_fd_from_epoll_os(m_fd); // remove fd from os epoll } if (bound_any_addr) { @@ -6350,3 +6350,12 @@ ssize_t sockinfo_tcp::tcp_tx_handle_sndbuf_unavailable(ssize_t total_tx, bool is return tcp_tx_handle_errno_and_unlock(EAGAIN); } } + +size_t sockinfo_tcp::handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, + int *p_out_flags) +{ + NOT_IN_USE(payload_size); + NOT_IN_USE(in_flags); + *p_out_flags &= ~MSG_TRUNC; // don't handle msg_trunc + return total_rx; +} diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index ff30916bf..66b2c1b90 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -35,7 +35,7 @@ #include "utils/lock_wrapper.h" #include "proto/mem_buf_desc.h" -#include "sock/socket_fd_api.h" +#include "sock/sockinfo.h" #include "dev/buffer_pool.h" #include "dev/cq_mgr_rx.h" #include "xlio_extra.h" @@ -230,6 +230,8 @@ class sockinfo_tcp : public sockinfo { int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) override; int getsockname(sockaddr *__name, socklen_t *__namelen) override; int getpeername(sockaddr *__name, socklen_t *__namelen) override; + void set_immediate_os_sample() override {}; + void unset_immediate_os_sample() override {}; inline bool handle_bind_no_port(int &bind_ret, in_port_t in_port, const sockaddr *__addr, socklen_t __addrlen); @@ -284,9 +286,9 @@ class sockinfo_tcp : public sockinfo { static void tcp_tx_zc_callback(mem_buf_desc_t *p_desc); void tcp_tx_zc_handle(mem_buf_desc_t *p_desc); - bool inline is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = nullptr) override; - bool inline is_writeable() override; - bool inline is_errorable(int *errors) override; + bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL) override; + bool is_writeable() override; + bool is_errorable(int *errors) override; bool is_closable() override { return get_tcp_state(&m_pcb) == CLOSED && m_syn_received.empty() && @@ -396,7 +398,7 @@ class sockinfo_tcp : public sockinfo { int register_callback(xlio_recv_callback_t callback, void *context) override { tcp_recv(&m_pcb, sockinfo_tcp::rx_lwip_cb_recv_callback); - return sockinfo::register_callback(callback, context); + return register_callback_ctx(callback, context); } int tcp_tx_express(const struct iovec *iov, unsigned iov_len, uint32_t mkey, unsigned flags, @@ -417,6 +419,9 @@ class sockinfo_tcp : public sockinfo { bool try_un_offloading() override; // un-offload the socket if possible int os_epoll_wait(epoll_event *ep_events, int maxevents) override; + size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, + int *p_out_flags) override; + private: int fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit); void get_tcp_info(struct tcp_info *ti); diff --git 
a/src/core/sock/sockinfo_udp.cpp b/src/core/sock/sockinfo_udp.cpp index 317a9fcd2..6b8e55758 100644 --- a/src/core/sock/sockinfo_udp.cpp +++ b/src/core/sock/sockinfo_udp.cpp @@ -648,6 +648,46 @@ int sockinfo_udp::connect(const struct sockaddr *__to, socklen_t __tolen) return 0; } +int sockinfo_udp::shutdown(int __how) +{ + si_udp_logfunc(""); + int ret = SYSCALL(shutdown, m_fd, __how); + if (ret) { + si_udp_logdbg("shutdown failed (ret=%d %m)", ret); + } + return ret; +} + +int sockinfo_udp::accept(struct sockaddr *__addr, socklen_t *__addrlen) +{ + si_udp_logfunc(""); + int ret = SYSCALL(accept, m_fd, __addr, __addrlen); + if (ret < 0) { + si_udp_logdbg("accept failed (ret=%d %m)", ret); + } + return ret; +} + +int sockinfo_udp::accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) +{ + si_udp_logfunc(""); + int ret = SYSCALL(accept4, m_fd, __addr, __addrlen, __flags); + if (ret < 0) { + si_udp_logdbg("accept4 failed (ret=%d %m)", ret); + } + return ret; +} + +int sockinfo_udp::listen(int backlog) +{ + si_udp_logfunc(""); + int ret = SYSCALL(listen, m_fd, backlog); + if (ret < 0) { + si_udp_logdbg("listen failed (ret=%d %m)", ret); + } + return ret; +} + int sockinfo_udp::getsockname(struct sockaddr *__name, socklen_t *__namelen) { si_udp_logdbg(""); @@ -660,6 +700,16 @@ int sockinfo_udp::getsockname(struct sockaddr *__name, socklen_t *__namelen) return SYSCALL(getsockname, m_fd, __name, __namelen); } +int sockinfo_udp::getpeername(sockaddr *__name, socklen_t *__namelen) +{ + si_udp_logfunc(""); + int ret = SYSCALL(getpeername, m_fd, __name, __namelen); + if (ret) { + si_udp_logdbg("getpeername failed (ret=%d %m)", ret); + } + return ret; +} + int sockinfo_udp::on_sockname_change(struct sockaddr *__name, socklen_t __namelen) { BULLSEYE_EXCLUDE_BLOCK_START @@ -1813,7 +1863,7 @@ ssize_t sockinfo_udp::rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov } in_flags &= ~MSG_XLIO_ZCOPY; - ret = socket_fd_api::rx_os(call_type, p_iov, sz_iov, in_flags, __from, __fromlen, __msg); + ret = rx_os(call_type, p_iov, sz_iov, in_flags, __from, __fromlen, __msg); *p_flags = in_flags; save_stats_rx_os(ret); if (ret > 0) { @@ -2178,7 +2228,7 @@ ssize_t sockinfo_udp::tx(xlio_tx_call_attr_t &tx_arg) tx_packet_to_os: // Calling OS transmit - ret = socket_fd_api::tx_os(tx_arg.opcode, p_iov, sz_iov, __flags, __dst, __dstlen); + ret = tx_os(tx_arg.opcode, p_iov, sz_iov, __flags, __dst, __dstlen); tx_packet_to_os_stats: save_stats_tx_os(ret); @@ -2479,8 +2529,7 @@ bool sockinfo_udp::rx_input_cb(mem_buf_desc_t *p_desc, void *pv_fd_ready_array) m_port_map_index = ((m_port_map_index + 1) >= m_port_map.size() ? 
0 : (m_port_map_index + 1)); int new_port = m_port_map[m_port_map_index].port; - socket_fd_api *sock_api = - g_p_fd_collection->get_sockfd(m_port_map[m_port_map_index].fd); + sockinfo *sock_api = g_p_fd_collection->get_sockfd(m_port_map[m_port_map_index].fd); if (!sock_api || sock_api->get_type() != FD_TYPE_SOCKET) { m_port_map.erase(std::remove(m_port_map.begin(), m_port_map.end(), m_port_map[m_port_map_index].port)); diff --git a/src/core/sock/sockinfo_udp.h b/src/core/sock/sockinfo_udp.h index eeadd4ab1..7d81ca112 100644 --- a/src/core/sock/sockinfo_udp.h +++ b/src/core/sock/sockinfo_udp.h @@ -95,7 +95,22 @@ class sockinfo_udp : public sockinfo { int bind_no_os(); int bind(const struct sockaddr *__addr, socklen_t __addrlen) override; int connect(const struct sockaddr *__to, socklen_t __tolen) override; + void clean_socket_obj() override { delete this; } + bool is_writeable() override { return true; }; + bool is_errorable(int *errors) override + { + NOT_IN_USE(errors); + return false; + } + bool is_outgoing() override { return false; } + bool is_incoming() override { return false; } + int shutdown(int __how) override; + int prepareListen() override { return 0; } + int listen(int backlog) override; + int accept(struct sockaddr *__addr, socklen_t *__addrlen) override; + int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) override; int getsockname(sockaddr *__name, socklen_t *__namelen) override; + int getpeername(sockaddr *__name, socklen_t *__namelen) override; int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) override; int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) override; @@ -176,8 +191,19 @@ class sockinfo_udp : public sockinfo { set_m_n_sysvar_rx_num_buffs_reuse(safe_mce_sys().nginx_udp_socket_pool_rx_num_buffs_reuse); } bool is_closable() override { return !m_is_for_socket_pool; } +#else + bool is_closable() override { return true; } #endif + int register_callback(xlio_recv_callback_t callback, void *context) override + { + return register_callback_ctx(callback, context); + } + +protected: + void lock_rx_q() override { m_lock_rcv.lock(); } + void unlock_rx_q() override { m_lock_rcv.unlock(); } + private: bool packet_is_loopback(mem_buf_desc_t *p_desc); ssize_t check_payload_size(const iovec *p_iov, ssize_t sz_iov); From 0168f14cb2afb15c7a40490a5d6e703b87496711 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 25 Feb 2024 11:34:33 +0000 Subject: [PATCH 115/169] issue: 3777348 Moving sockinfo inline impl outside the class Having inline methods with long implementation outside the class makes the class definition more readable. 
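For illustration only (hypothetical class and member names, not taken from
sockinfo): the pattern applied here is to keep only the declaration inside the
class body and move the long definition below the class, marked 'inline' so it
keeps inline linkage when the header is included from several translation
units.

    class packet_queue {
    public:
        // short accessors can stay in-body
        size_t size() const { return m_count; }

        // long implementation: declaration only inside the class
        inline void push(int value);

    private:
        int m_items[64] = {};
        size_t m_count = 0;
    };

    // definition placed after the class; 'inline' avoids multiple-definition
    // errors when the header is included by more than one .cpp file
    inline void packet_queue::push(int value)
    {
        if (m_count < 64) {
            m_items[m_count++] = value;
        }
    }

The class definition then reads as a compact interface summary, while call
sites still see the full definitions and can inline them as before.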
Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo.cpp | 54 +++++ src/core/sock/sockinfo.h | 461 +++++++++++++++++-------------------- 2 files changed, 266 insertions(+), 249 deletions(-) diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 19b91f4e4..d40facc65 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -59,6 +59,45 @@ #define si_logfunc __log_info_func #define si_logfuncall __log_info_funcall +const char *sockinfo::setsockopt_so_opt_to_str(int opt) +{ + switch (opt) { + case SO_REUSEADDR: + return "SO_REUSEADDR"; + case SO_REUSEPORT: + return "SO_REUSEPORT"; + case SO_BROADCAST: + return "SO_BROADCAST"; + case SO_RCVBUF: + return "SO_RCVBUF"; + case SO_SNDBUF: + return "SO_SNDBUF"; + case SO_TIMESTAMP: + return "SO_TIMESTAMP"; + case SO_TIMESTAMPNS: + return "SO_TIMESTAMPNS"; + case SO_BINDTODEVICE: + return "SO_BINDTODEVICE"; + case SO_ZEROCOPY: + return "SO_ZEROCOPY"; + case SO_XLIO_RING_ALLOC_LOGIC: + return "SO_XLIO_RING_ALLOC_LOGIC"; + case SO_MAX_PACING_RATE: + return "SO_MAX_PACING_RATE"; + case SO_XLIO_FLOW_TAG: + return "SO_XLIO_FLOW_TAG"; + case SO_XLIO_SHUTDOWN_RX: + return "SO_XLIO_SHUTDOWN_RX"; + case IPV6_V6ONLY: + return "IPV6_V6ONLY"; + case IPV6_ADDR_PREFERENCES: + return "IPV6_ADDR_PREFERENCES"; + default: + break; + } + return "UNKNOWN SO opt"; +} + sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) : m_epoll_event_flags(0) , m_fd(fd) @@ -2340,3 +2379,18 @@ ssize_t sockinfo::tx_os(const tx_call_t call_type, const iovec *p_iov, const ssi } return (ssize_t)-1; } + +int sockinfo::handle_exception_flow() +{ + if (safe_mce_sys().exception_handling.is_suit_un_offloading()) { + try_un_offloading(); + } + if (safe_mce_sys().exception_handling == xlio_exception_handling::MODE_RETURN_ERROR) { + errno = EINVAL; + return -1; + } + if (safe_mce_sys().exception_handling == xlio_exception_handling::MODE_ABORT) { + return -2; + } + return 0; +} diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 18915a105..0a166f397 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -345,16 +345,7 @@ class sockinfo { #endif #endif - inline bool set_flow_tag(uint32_t flow_tag_id) - { - if (flow_tag_id && (flow_tag_id != FLOW_TAG_MASK)) { - m_flow_tag_id = flow_tag_id; - m_flow_tag_enabled = true; - return true; - } - m_flow_tag_id = FLOW_TAG_MASK; - return false; - } + inline bool set_flow_tag(uint32_t flow_tag_id); inline bool get_reuseaddr(void) { return m_reuseaddr; } inline bool get_reuseport(void) { return m_reuseport; } inline bool flow_tag_enabled(void) { return m_flow_tag_enabled; } @@ -369,24 +360,7 @@ class sockinfo { bool validate_and_convert_mapped_ipv4(sock_addr &sock) const; void socket_stats_init(); - void sock_pop_descs_rx_ready(descq_t *cache) - { - lock_rx_q(); - mem_buf_desc_t *temp; - const size_t size = get_size_m_rx_pkt_ready_list(); - - for (size_t i = 0; i < size; i++) { - temp = get_front_m_rx_pkt_ready_list(); - pop_front_m_rx_pkt_ready_list(); - cache->push_back(temp); - } - m_n_rx_pkt_ready_list_count = 0; - m_rx_ready_byte_count = 0; - m_p_socket_stats->n_rx_ready_pkt_count = 0; - m_p_socket_stats->n_rx_ready_byte_count = 0; - - unlock_rx_q(); - } + inline void sock_pop_descs_rx_ready(descq_t *cache); sa_family_t get_family() { return m_family; } bool is_shadow_socket_present() { return m_fd >= 0 && m_fd != m_rx_epfd; } @@ -397,10 +371,7 @@ class sockinfo { const int __flags, const sockaddr *__to, const socklen_t __tolen); protected: - inline void 
set_rx_reuse_pending(bool is_pending = true) - { - m_rx_reuse_buf_pending = is_pending; - } + inline void set_rx_reuse_pending(bool is_pending = true); int setsockopt_kernel(int __level, int __optname, const void *__optval, socklen_t __optlen, int supported, bool allow_priv); @@ -480,230 +451,20 @@ class sockinfo { ssize_t rx_os(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, const int flags, sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg); - inline void set_events_socketxtreme(uint64_t events) - { - m_socketxtreme.ec->completion.user_data = (uint64_t)m_fd_context; - if (!m_socketxtreme.ec->completion.events) { - m_socketxtreme.ec->completion.events |= events; - m_p_rx_ring->put_ec(m_socketxtreme.ec); - - m_socketxtreme.ec = NULL; - for (auto &ec : m_socketxtreme.ec_cache) { - if (0 == ec.completion.events) { - m_socketxtreme.ec = &ec; - break; - } - } - if (NULL == m_socketxtreme.ec) { - struct ring_ec ec; - ec.clear(); - m_socketxtreme.ec_cache.push_back(ec); - m_socketxtreme.ec = &m_socketxtreme.ec_cache.back(); - } - } else { - m_socketxtreme.ec->completion.events |= events; - } - } - - inline void set_events(uint64_t events) - { - /* Collect all events if rx ring is enabled */ - if (is_socketxtreme() && m_state == SOCKINFO_OPENED) { - set_events_socketxtreme(events); - } - - insert_epoll_event(events); - } - - inline void save_strq_stats(uint32_t packet_strides) - { - m_socket_stats.strq_counters.n_strq_total_strides += static_cast(packet_strides); - m_socket_stats.strq_counters.n_strq_max_strides_per_packet = - std::max(m_socket_stats.strq_counters.n_strq_max_strides_per_packet, packet_strides); - } + inline void set_events_socketxtreme(uint64_t events); + inline void set_events(uint64_t events); + inline void save_strq_stats(uint32_t packet_strides); inline int dequeue_packet(iovec *p_iov, ssize_t sz_iov, sockaddr *__from, socklen_t *__fromlen, - int in_flags, int *p_out_flags) - { - mem_buf_desc_t *pdesc; - int total_rx = 0; - uint32_t nbytes, pos; - bool relase_buff = true; - - bool is_peek = in_flags & MSG_PEEK; - int rx_pkt_ready_list_idx = 1; - int rx_pkt_ready_offset = m_rx_pkt_ready_offset; - - pdesc = get_front_m_rx_pkt_ready_list(); - void *iov_base = (uint8_t *)pdesc->rx.frag.iov_base + m_rx_pkt_ready_offset; - size_t bytes_left = pdesc->rx.frag.iov_len - m_rx_pkt_ready_offset; - size_t payload_size = pdesc->rx.sz_payload; - - if (__from && __fromlen) { - pdesc->rx.src.get_sa_by_family(__from, *__fromlen, m_family); - } + int in_flags, int *p_out_flags); - if (in_flags & MSG_XLIO_ZCOPY) { - relase_buff = false; - total_rx = zero_copy_rx(p_iov, pdesc, p_out_flags); - if (unlikely(total_rx < 0)) { - return -1; - } - m_rx_pkt_ready_offset = 0; - } else { -#ifdef DEFINED_UTLS - uint8_t tls_type = pdesc->rx.tls_type; -#endif /* DEFINED_UTLS */ - for (int i = 0; i < sz_iov && pdesc; i++) { - pos = 0; - while (pos < p_iov[i].iov_len && pdesc) { -#ifdef DEFINED_UTLS - if (unlikely(pdesc->rx.tls_type != tls_type)) { - break; - } -#endif /* DEFINED_UTLS */ - nbytes = p_iov[i].iov_len - pos; - if (nbytes > bytes_left) { - nbytes = bytes_left; - } - memcpy((char *)(p_iov[i].iov_base) + pos, iov_base, nbytes); - pos += nbytes; - total_rx += nbytes; - m_rx_pkt_ready_offset += nbytes; - bytes_left -= nbytes; - iov_base = (uint8_t *)iov_base + nbytes; - if (m_b_rcvtstamp || m_n_tsing_flags) { - update_socket_timestamps(&pdesc->rx.timestamps); - } - if (bytes_left <= 0) { - if (unlikely(is_peek)) { - pdesc = get_next_desc_peek(pdesc, rx_pkt_ready_list_idx); - } else 
{ - pdesc = get_next_desc(pdesc); - } - m_rx_pkt_ready_offset = 0; - if (pdesc) { - iov_base = pdesc->rx.frag.iov_base; - bytes_left = pdesc->rx.frag.iov_len; - } - } - } - } - } + inline void reuse_buffer(mem_buf_desc_t *buff); - if (unlikely(is_peek)) { - m_rx_pkt_ready_offset = - rx_pkt_ready_offset; // if MSG_PEEK is on, m_rx_pkt_ready_offset must be zero-ed - // save_stats_rx_offload(total_rx); //TODO?? - } else { - m_rx_ready_byte_count -= total_rx; - m_p_socket_stats->n_rx_ready_byte_count -= total_rx; - post_deqeue(relase_buff); - save_stats_rx_offload(total_rx); - } - - total_rx = handle_msg_trunc(total_rx, payload_size, in_flags, p_out_flags); - - return total_rx; - } - - inline void reuse_buffer(mem_buf_desc_t *buff) - { - set_rx_reuse_pending(false); - ring *p_ring = buff->p_desc_owner->get_parent(); - rx_ring_map_t::iterator iter = m_rx_ring_map.find(p_ring); - if (likely(iter != m_rx_ring_map.end())) { - if (safe_mce_sys().buffer_batching_mode == BUFFER_BATCHING_NONE) { - if (!p_ring->reclaim_recv_buffers(buff)) { - g_buffer_pool_rx_ptr->put_buffer_after_deref_thread_safe(buff); - } - return; - } - - descq_t *rx_reuse = &iter->second->rx_reuse_info.rx_reuse; - int &n_buff_num = iter->second->rx_reuse_info.n_buff_num; - rx_reuse->push_back(buff); - n_buff_num += buff->rx.n_frags; - if (n_buff_num < m_n_sysvar_rx_num_buffs_reuse) { - return; - } - if (n_buff_num >= 2 * m_n_sysvar_rx_num_buffs_reuse) { - if (p_ring->reclaim_recv_buffers(rx_reuse)) { - n_buff_num = 0; - } else { - g_buffer_pool_rx_ptr->put_buffers_after_deref_thread_safe(rx_reuse); - n_buff_num = 0; - } - m_rx_reuse_buf_postponed = false; - } else { - m_rx_reuse_buf_postponed = true; - } - } else { - // Retuned buffer to global pool when owner can't be found - // In case ring was deleted while buffers where still queued - vlog_printf(VLOG_DEBUG, "Buffer owner not found\n"); - // Awareness: these are best efforts: decRef without lock in case no CQ - g_buffer_pool_rx_ptr->put_buffer_after_deref_thread_safe(buff); - } - } - - static const char *setsockopt_so_opt_to_str(int opt) - { - switch (opt) { - case SO_REUSEADDR: - return "SO_REUSEADDR"; - case SO_REUSEPORT: - return "SO_REUSEPORT"; - case SO_BROADCAST: - return "SO_BROADCAST"; - case SO_RCVBUF: - return "SO_RCVBUF"; - case SO_SNDBUF: - return "SO_SNDBUF"; - case SO_TIMESTAMP: - return "SO_TIMESTAMP"; - case SO_TIMESTAMPNS: - return "SO_TIMESTAMPNS"; - case SO_BINDTODEVICE: - return "SO_BINDTODEVICE"; - case SO_ZEROCOPY: - return "SO_ZEROCOPY"; - case SO_XLIO_RING_ALLOC_LOGIC: - return "SO_XLIO_RING_ALLOC_LOGIC"; - case SO_MAX_PACING_RATE: - return "SO_MAX_PACING_RATE"; - case SO_XLIO_FLOW_TAG: - return "SO_XLIO_FLOW_TAG"; - case SO_XLIO_SHUTDOWN_RX: - return "SO_XLIO_SHUTDOWN_RX"; - case IPV6_V6ONLY: - return "IPV6_V6ONLY"; - case IPV6_ADDR_PREFERENCES: - return "IPV6_ADDR_PREFERENCES"; - default: - break; - } - return "UNKNOWN SO opt"; - } + static const char *setsockopt_so_opt_to_str(int opt); int get_sock_by_L3_L4(in_protocol_t protocol, const ip_address &ip, in_port_t port); + int handle_exception_flow(); - ////////////////////////////////////////////////////////////////// - int handle_exception_flow() - { - if (safe_mce_sys().exception_handling.is_suit_un_offloading()) { - try_un_offloading(); - } - if (safe_mce_sys().exception_handling == xlio_exception_handling::MODE_RETURN_ERROR) { - errno = EINVAL; - return -1; - } - if (safe_mce_sys().exception_handling == xlio_exception_handling::MODE_ABORT) { - return -2; - } - return 0; - } - 
////////////////////////////////////////////////////////////////// private: int fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit); bool attach_as_uc_receiver_anyip(sa_family_t family, role_t role, bool skip_rules); @@ -825,4 +586,206 @@ class sockinfo { bool m_is_ipv6only; }; +void sockinfo::set_rx_reuse_pending(bool is_pending) +{ + m_rx_reuse_buf_pending = is_pending; +} + +bool sockinfo::set_flow_tag(uint32_t flow_tag_id) +{ + if (flow_tag_id && (flow_tag_id != FLOW_TAG_MASK)) { + m_flow_tag_id = flow_tag_id; + m_flow_tag_enabled = true; + return true; + } + m_flow_tag_id = FLOW_TAG_MASK; + return false; +} + +void sockinfo::sock_pop_descs_rx_ready(descq_t *cache) +{ + lock_rx_q(); + mem_buf_desc_t *temp; + const size_t size = get_size_m_rx_pkt_ready_list(); + + for (size_t i = 0; i < size; i++) { + temp = get_front_m_rx_pkt_ready_list(); + pop_front_m_rx_pkt_ready_list(); + cache->push_back(temp); + } + m_n_rx_pkt_ready_list_count = 0; + m_rx_ready_byte_count = 0; + m_p_socket_stats->n_rx_ready_pkt_count = 0; + m_p_socket_stats->n_rx_ready_byte_count = 0; + + unlock_rx_q(); +} + +void sockinfo::set_events_socketxtreme(uint64_t events) +{ + m_socketxtreme.ec->completion.user_data = (uint64_t)m_fd_context; + if (!m_socketxtreme.ec->completion.events) { + m_socketxtreme.ec->completion.events |= events; + m_p_rx_ring->put_ec(m_socketxtreme.ec); + + m_socketxtreme.ec = NULL; + for (auto &ec : m_socketxtreme.ec_cache) { + if (0 == ec.completion.events) { + m_socketxtreme.ec = &ec; + break; + } + } + if (NULL == m_socketxtreme.ec) { + struct ring_ec ec; + ec.clear(); + m_socketxtreme.ec_cache.push_back(ec); + m_socketxtreme.ec = &m_socketxtreme.ec_cache.back(); + } + } else { + m_socketxtreme.ec->completion.events |= events; + } +} + +void sockinfo::set_events(uint64_t events) +{ + /* Collect all events if rx ring is enabled */ + if (is_socketxtreme() && m_state == SOCKINFO_OPENED) { + set_events_socketxtreme(events); + } + + insert_epoll_event(events); +} + +void sockinfo::save_strq_stats(uint32_t packet_strides) +{ + m_socket_stats.strq_counters.n_strq_total_strides += static_cast(packet_strides); + m_socket_stats.strq_counters.n_strq_max_strides_per_packet = + std::max(m_socket_stats.strq_counters.n_strq_max_strides_per_packet, packet_strides); +} + +int sockinfo::dequeue_packet(iovec *p_iov, ssize_t sz_iov, sockaddr *__from, socklen_t *__fromlen, + int in_flags, int *p_out_flags) +{ + mem_buf_desc_t *pdesc; + int total_rx = 0; + uint32_t nbytes, pos; + bool relase_buff = true; + + bool is_peek = in_flags & MSG_PEEK; + int rx_pkt_ready_list_idx = 1; + int rx_pkt_ready_offset = m_rx_pkt_ready_offset; + + pdesc = get_front_m_rx_pkt_ready_list(); + void *iov_base = (uint8_t *)pdesc->rx.frag.iov_base + m_rx_pkt_ready_offset; + size_t bytes_left = pdesc->rx.frag.iov_len - m_rx_pkt_ready_offset; + size_t payload_size = pdesc->rx.sz_payload; + + if (__from && __fromlen) { + pdesc->rx.src.get_sa_by_family(__from, *__fromlen, m_family); + } + + if (in_flags & MSG_XLIO_ZCOPY) { + relase_buff = false; + total_rx = zero_copy_rx(p_iov, pdesc, p_out_flags); + if (unlikely(total_rx < 0)) { + return -1; + } + m_rx_pkt_ready_offset = 0; + } else { +#ifdef DEFINED_UTLS + uint8_t tls_type = pdesc->rx.tls_type; +#endif /* DEFINED_UTLS */ + for (int i = 0; i < sz_iov && pdesc; i++) { + pos = 0; + while (pos < p_iov[i].iov_len && pdesc) { +#ifdef DEFINED_UTLS + if (unlikely(pdesc->rx.tls_type != tls_type)) { + break; + } +#endif /* DEFINED_UTLS */ + nbytes = p_iov[i].iov_len - pos; + if (nbytes > 
bytes_left) { + nbytes = bytes_left; + } + memcpy((char *)(p_iov[i].iov_base) + pos, iov_base, nbytes); + pos += nbytes; + total_rx += nbytes; + m_rx_pkt_ready_offset += nbytes; + bytes_left -= nbytes; + iov_base = (uint8_t *)iov_base + nbytes; + if (m_b_rcvtstamp || m_n_tsing_flags) { + update_socket_timestamps(&pdesc->rx.timestamps); + } + if (bytes_left <= 0) { + if (unlikely(is_peek)) { + pdesc = get_next_desc_peek(pdesc, rx_pkt_ready_list_idx); + } else { + pdesc = get_next_desc(pdesc); + } + m_rx_pkt_ready_offset = 0; + if (pdesc) { + iov_base = pdesc->rx.frag.iov_base; + bytes_left = pdesc->rx.frag.iov_len; + } + } + } + } + } + + if (unlikely(is_peek)) { + m_rx_pkt_ready_offset = + rx_pkt_ready_offset; // if MSG_PEEK is on, m_rx_pkt_ready_offset must be zero-ed + // save_stats_rx_offload(total_rx); //TODO?? + } else { + m_rx_ready_byte_count -= total_rx; + m_p_socket_stats->n_rx_ready_byte_count -= total_rx; + post_deqeue(relase_buff); + save_stats_rx_offload(total_rx); + } + + total_rx = handle_msg_trunc(total_rx, payload_size, in_flags, p_out_flags); + + return total_rx; +} + +void sockinfo::reuse_buffer(mem_buf_desc_t *buff) +{ + set_rx_reuse_pending(false); + ring *p_ring = buff->p_desc_owner->get_parent(); + rx_ring_map_t::iterator iter = m_rx_ring_map.find(p_ring); + if (likely(iter != m_rx_ring_map.end())) { + if (safe_mce_sys().buffer_batching_mode == BUFFER_BATCHING_NONE) { + if (!p_ring->reclaim_recv_buffers(buff)) { + g_buffer_pool_rx_ptr->put_buffer_after_deref_thread_safe(buff); + } + return; + } + + descq_t *rx_reuse = &iter->second->rx_reuse_info.rx_reuse; + int &n_buff_num = iter->second->rx_reuse_info.n_buff_num; + rx_reuse->push_back(buff); + n_buff_num += buff->rx.n_frags; + if (n_buff_num < m_n_sysvar_rx_num_buffs_reuse) { + return; + } + if (n_buff_num >= 2 * m_n_sysvar_rx_num_buffs_reuse) { + if (p_ring->reclaim_recv_buffers(rx_reuse)) { + n_buff_num = 0; + } else { + g_buffer_pool_rx_ptr->put_buffers_after_deref_thread_safe(rx_reuse); + n_buff_num = 0; + } + m_rx_reuse_buf_postponed = false; + } else { + m_rx_reuse_buf_postponed = true; + } + } else { + // Retuned buffer to global pool when owner can't be found + // In case ring was deleted while buffers where still queued + vlog_printf(VLOG_DEBUG, "Buffer owner not found\n"); + // Awareness: these are best efforts: decRef without lock in case no CQ + g_buffer_pool_rx_ptr->put_buffer_after_deref_thread_safe(buff); + } +} + #endif /* BASE_SOCKINFO_H */ From d135aac7f1c10f678fb0261082aec9482f5fe6c9 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 25 Feb 2024 12:01:02 +0000 Subject: [PATCH 116/169] issue: 3777348 sockinfo Reordering methods Reordering sockinfo methods for more readable definition. 
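The pattern applied throughout the header is roughly the following: the class body keeps
only the declaration, and the inline definition moves below the class in the same header.
This is a minimal sketch with hypothetical names (sockinfo_sketch, add_rx_bytes,
m_rx_bytes), not the actual sockinfo members:

    #include <cstddef>

    class sockinfo_sketch {
    public:
        // Declaration only - the body is defined after the class.
        inline void add_rx_bytes(size_t nbytes);

    private:
        size_t m_rx_bytes = 0;
    };

    // Definition stays in the same header so the compiler can still inline it
    // at every call site; the 'inline' on the in-class declaration keeps it
    // ODR-safe when the header is included from multiple translation units.
    void sockinfo_sketch::add_rx_bytes(size_t nbytes)
    {
        m_rx_bytes += nbytes;
    }
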
Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo.h | 239 ++++++++++++++++------------------ src/core/sock/sockinfo_nvme.h | 1 - src/core/sock/sockinfo_tcp.h | 1 - 3 files changed, 115 insertions(+), 126 deletions(-) diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 0a166f397..44f067eab 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -67,30 +67,15 @@ #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK 04000 #endif + #ifndef SOCK_CLOEXEC #define SOCK_CLOEXEC 02000000 #endif + #ifndef SO_MAX_PACING_RATE #define SO_MAX_PACING_RATE 47 #endif -#define IS_DUMMY_PACKET(flags) (flags & XLIO_SND_FLAGS_DUMMY) - -#if DEFINED_MISSING_NET_TSTAMP -enum { - SOF_TIMESTAMPING_TX_HARDWARE = (1 << 0), - SOF_TIMESTAMPING_TX_SOFTWARE = (1 << 1), - SOF_TIMESTAMPING_RX_HARDWARE = (1 << 2), - SOF_TIMESTAMPING_RX_SOFTWARE = (1 << 3), - SOF_TIMESTAMPING_SOFTWARE = (1 << 4), - SOF_TIMESTAMPING_SYS_HARDWARE = (1 << 5), - SOF_TIMESTAMPING_RAW_HARDWARE = (1 << 6), - SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_RAW_HARDWARE - 1) | SOF_TIMESTAMPING_RAW_HARDWARE -}; -#else -#include -#endif - #ifndef SO_TIMESTAMPNS #define SO_TIMESTAMPNS 35 #endif @@ -119,6 +104,34 @@ enum { #define MSG_ZEROCOPY 0x4000000 #endif +#define IS_DUMMY_PACKET(flags) (flags & XLIO_SND_FLAGS_DUMMY) +#define NOTIFY_ON_EVENTS(context, events) context->set_events(events) + +// Sockinfo setsockopt() return values +// Internal socket option, should not pass request to OS. +#define SOCKOPT_INTERNAL_XLIO_SUPPORT 0 +// Socket option was found but not supported, error should be returned to user. +#define SOCKOPT_NO_XLIO_SUPPORT -1 +// Should pass to TCP/UDP level or OS. +#define SOCKOPT_PASS_TO_OS 1 +// Pass the option also to the OS. +#define SOCKOPT_HANDLE_BY_OS -2 + +#if DEFINED_MISSING_NET_TSTAMP +enum { + SOF_TIMESTAMPING_TX_HARDWARE = (1 << 0), + SOF_TIMESTAMPING_TX_SOFTWARE = (1 << 1), + SOF_TIMESTAMPING_RX_HARDWARE = (1 << 2), + SOF_TIMESTAMPING_RX_SOFTWARE = (1 << 3), + SOF_TIMESTAMPING_SOFTWARE = (1 << 4), + SOF_TIMESTAMPING_SYS_HARDWARE = (1 << 5), + SOF_TIMESTAMPING_RAW_HARDWARE = (1 << 6), + SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_RAW_HARDWARE - 1) | SOF_TIMESTAMPING_RAW_HARDWARE +}; +#else +#include +#endif + typedef enum { RX_READ = 23, RX_READV, RX_RECV, RX_RECVFROM, RX_RECVMSG } rx_call_t; enum { @@ -136,8 +149,6 @@ struct cmsg_state { size_t cmsg_bytes_consumed; }; -#define NOTIFY_ON_EVENTS(context, events) context->set_events(events) - struct buff_info_t { buff_info_t() { @@ -164,26 +175,30 @@ struct epoll_fd_rec { } }; -typedef struct { +struct net_device_resources_t { net_device_entry *p_nde; net_device_val *p_ndv; ring *p_ring; int refcnt; -} net_device_resources_t; +}; -typedef struct { +struct fd_array_t { // coverity[member_decl] int fd_list[FD_ARRAY_MAX]; // Note: An FD might appear twice in the list, // the user of this array will need to handle it correctly int fd_max; int fd_count; -} fd_array_t; +}; -/* This structure describes the send operation attributes - * Used attributes can be of different types TX_FILE, TX_WRITE, TX_WRITEV, TX_SEND, TX_SENDTO, - * TX_SENDMSG - */ -typedef struct xlio_tx_call_attr { +struct ring_info_t { + int refcnt; + buff_info_t rx_reuse_info; +}; + +// This structure describes the send operation attributes +// Used attributes can be of different types TX_FILE, TX_WRITE, TX_WRITEV, TX_SEND, TX_SENDTO, +// TX_SENDMSG +struct xlio_tx_call_attr_t { tx_call_t opcode; struct _attr { struct iovec *iov; @@ -197,7 +212,7 @@ typedef struct xlio_tx_call_attr { 
unsigned xlio_flags; pbuf_desc priv; - ~xlio_tx_call_attr() {}; + ~xlio_tx_call_attr_t() {}; void clear(void) { opcode = TX_UNDEF; @@ -207,28 +222,12 @@ typedef struct xlio_tx_call_attr { xlio_flags = 0; } - xlio_tx_call_attr() { clear(); } -} xlio_tx_call_attr_t; + xlio_tx_call_attr_t() { clear(); } +}; typedef std::unordered_map rx_net_device_map_t; typedef xlio_list_t xlio_desc_list_t; - -/* - * Sockinfo setsockopt() return values - */ -#define SOCKOPT_INTERNAL_XLIO_SUPPORT 0 // Internal socket option, should not pass request to OS. -#define SOCKOPT_NO_XLIO_SUPPORT \ - -1 // Socket option was found but not supported, error should be returned to user. -#define SOCKOPT_PASS_TO_OS 1 // Should pass to TCP/UDP level or OS. -#define SOCKOPT_HANDLE_BY_OS -2 // Pass the option also to the OS. - typedef std::unordered_map rx_flow_map_t; - -typedef struct { - int refcnt; - buff_info_t rx_reuse_info; -} ring_info_t; - typedef std::unordered_map rx_ring_map_t; // see route.c in Linux kernel @@ -238,9 +237,6 @@ class epfd_info; class sockinfo { public: - sockinfo(int fd, int domain, bool use_ring_locks); - virtual ~sockinfo(); - enum sockinfo_state { SOCKINFO_UNDEFINED, SOCKINFO_OPENED, @@ -269,18 +265,23 @@ class sockinfo { return NODE_OFFSET(sockinfo, ep_info_fd_node); } + sockinfo(int fd, int domain, bool use_ring_locks); + virtual ~sockinfo(); + // Callback from lower layer notifying new receive packets // Return: 'true' if object queuing this receive packet // 'false' if not interested in this receive packet virtual bool rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void *pv_fd_ready_array) = 0; - int get_fd() const { return m_fd; }; + virtual ssize_t tx(xlio_tx_call_attr_t &tx_arg) = 0; + virtual bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = nullptr) = 0; + virtual bool is_writeable() = 0; + virtual bool is_errorable(int *errors) = 0; virtual void clean_socket_obj() = 0; virtual void setPassthrough() = 0; virtual bool isPassthrough() = 0; virtual int prepareListen() = 0; - void destructor_helper(); virtual int shutdown(int __how) = 0; virtual int listen(int backlog) = 0; virtual int accept(struct sockaddr *__addr, socklen_t *__addrlen) = 0; @@ -292,27 +293,19 @@ class sockinfo { virtual int setsockopt(int __level, int __optname, __const void *__optval, socklen_t __optlen) = 0; virtual int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) = 0; - virtual bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL) = 0; - virtual bool is_writeable() = 0; - virtual bool is_errorable(int *errors) = 0; virtual bool is_outgoing() = 0; virtual bool is_incoming() = 0; virtual bool is_closable() = 0; - virtual ssize_t tx(xlio_tx_call_attr_t &tx_arg) = 0; virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG) = 0; virtual int register_callback(xlio_recv_callback_t callback, void *context) = 0; - int register_callback_ctx(xlio_recv_callback_t callback, void *context); - void consider_rings_migration_rx(); - int add_epoll_context(epfd_info *epfd); - void remove_epoll_context(epfd_info *epfd); virtual int fcntl(int __cmd, unsigned long int __arg); virtual int fcntl64(int __cmd, unsigned long int __arg); virtual int ioctl(unsigned long int __request, unsigned long int __arg); virtual fd_type_t get_type() = 0; virtual ssize_t rx(const rx_call_t call_type, iovec *iov, const ssize_t iovlen, - int *p_flags = 0, sockaddr *__from = NULL, socklen_t *__fromlen = NULL, - struct msghdr *__msg = NULL) = 0; + int *p_flags = 0, sockaddr *__from = 
nullptr, socklen_t *__fromlen = nullptr, + struct msghdr *__msg = nullptr) = 0; virtual int recvfrom_zcopy_free_packets(struct xlio_recvfrom_zcopy_packet_t *pkts, size_t count) = 0; @@ -332,61 +325,57 @@ class sockinfo { // (i.e. TRUE...) virtual bool skip_os_select() { return (!m_n_sysvar_select_poll_os_ratio); }; -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - // This socket options copy is currently implemented for nginx and for very specific options. - // This copy is called as part of fork() flow of nginx specifically. - // If a generic fork() is implemented, this copy should be reimplemented in a more generic way, - // see is_inherited_option mechanism of sockinfo_tcp for an example. - void copy_sockopt_fork(const sockinfo *copy_from); -#if defined(DEFINED_NGINX) - virtual void prepare_to_close_socket_pool(bool _push_pop) { NOT_IN_USE(_push_pop); } - virtual void set_params_for_socket_pool() {}; - void set_m_n_sysvar_rx_num_buffs_reuse(int val) { m_n_sysvar_rx_num_buffs_reuse = val; } -#endif -#endif - inline bool set_flow_tag(uint32_t flow_tag_id); - inline bool get_reuseaddr(void) { return m_reuseaddr; } - inline bool get_reuseport(void) { return m_reuseport; } - inline bool flow_tag_enabled(void) { return m_flow_tag_enabled; } - inline int get_rx_epfd(void) { return m_rx_epfd; } - inline bool is_blocking(void) { return m_b_blocking; } + inline void sock_pop_descs_rx_ready(descq_t *cache); + int get_fd() const { return m_fd; }; + sa_family_t get_family() { return m_family; } + bool get_reuseaddr(void) { return m_reuseaddr; } + bool get_reuseport(void) { return m_reuseport; } + bool flow_tag_enabled(void) { return m_flow_tag_enabled; } + int get_rx_epfd(void) { return m_rx_epfd; } + bool is_blocking(void) { return m_b_blocking; } bool flow_in_reuse(void) { return m_reuseaddr | m_reuseport; } + bool is_shadow_socket_present() { return m_fd >= 0 && m_fd != m_rx_epfd; } + uint32_t get_flow_tag_val() { return m_flow_tag_id; } + in_protocol_t get_protocol(void) { return m_protocol; } + void destructor_helper(); int get_rings_fds(int *ring_fds, int ring_fds_sz); int get_rings_num(); - uint32_t get_flow_tag_val() { return m_flow_tag_id; } - inline in_protocol_t get_protocol(void) { return m_protocol; } bool validate_and_convert_mapped_ipv4(sock_addr &sock) const; void socket_stats_init(); - - inline void sock_pop_descs_rx_ready(descq_t *cache); - - sa_family_t get_family() { return m_family; } - bool is_shadow_socket_present() { return m_fd >= 0 && m_fd != m_rx_epfd; } + int register_callback_ctx(xlio_recv_callback_t callback, void *context); + void consider_rings_migration_rx(); + int add_epoll_context(epfd_info *epfd); + void remove_epoll_context(epfd_info *epfd); int get_epoll_context_fd(); // Calling OS transmit ssize_t tx_os(const tx_call_t call_type, const iovec *p_iov, const ssize_t sz_iov, const int __flags, const sockaddr *__to, const socklen_t __tolen); +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) + // This socket options copy is currently implemented for nginx and for very specific options. + // This copy is called as part of fork() flow of nginx specifically. + // If a generic fork() is implemented, this copy should be reimplemented in a more generic way, + // see is_inherited_option mechanism of sockinfo_tcp for an example. 
+ void copy_sockopt_fork(const sockinfo *copy_from); +#if defined(DEFINED_NGINX) + virtual void prepare_to_close_socket_pool(bool _push_pop) { NOT_IN_USE(_push_pop); } + virtual void set_params_for_socket_pool() {}; + void set_m_n_sysvar_rx_num_buffs_reuse(int val) { m_n_sysvar_rx_num_buffs_reuse = val; } +#endif +#endif protected: - inline void set_rx_reuse_pending(bool is_pending = true); - - int setsockopt_kernel(int __level, int __optname, const void *__optval, socklen_t __optlen, - int supported, bool allow_priv); + static const char *setsockopt_so_opt_to_str(int opt); + virtual void lock_rx_q() = 0; + virtual void unlock_rx_q() = 0; virtual void set_blocking(bool is_blocked); virtual mem_buf_desc_t *get_front_m_rx_pkt_ready_list() = 0; virtual size_t get_size_m_rx_pkt_ready_list() = 0; virtual void pop_front_m_rx_pkt_ready_list() = 0; virtual void push_back_m_rx_pkt_ready_list(mem_buf_desc_t *buff) = 0; - - void notify_epoll_context(uint32_t events); - void save_stats_rx_os(int bytes); - void save_stats_tx_os(int bytes); - void save_stats_rx_offload(int nbytes); - virtual int rx_verify_available_data() = 0; virtual void update_header_field(data_updater *updater) = 0; virtual mem_buf_desc_t *get_next_desc(mem_buf_desc_t *p_desc) = 0; @@ -398,9 +387,8 @@ class sockinfo { virtual int os_epoll_wait(epoll_event *ep_events, int maxevents); virtual int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags) = 0; virtual void handle_ip_pktinfo(struct cmsg_state *cm_state) = 0; - virtual void lock_rx_q() = 0; - virtual void unlock_rx_q() = 0; virtual bool try_un_offloading(); // un-offload the socket if possible + virtual size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, int *p_out_flags) = 0; @@ -408,6 +396,21 @@ class sockinfo { virtual void rx_add_ring_cb(ring *p_ring); virtual void rx_del_ring_cb(ring *p_ring); + inline void set_rx_reuse_pending(bool is_pending = true); + inline void reuse_buffer(mem_buf_desc_t *buff); + inline void set_events_socketxtreme(uint64_t events); + inline void set_events(uint64_t events); + inline void save_strq_stats(uint32_t packet_strides); + + inline int dequeue_packet(iovec *p_iov, ssize_t sz_iov, sockaddr *__from, socklen_t *__fromlen, + int in_flags, int *p_out_flags); + + bool is_socketxtreme() { return safe_mce_sys().enable_socketxtreme; } + int get_sock_by_L3_L4(in_protocol_t protocol, const ip_address &ip, in_port_t port); + void notify_epoll_context(uint32_t events); + void save_stats_rx_os(int bytes); + void save_stats_tx_os(int bytes); + void save_stats_rx_offload(int nbytes); bool attach_receiver(flow_tuple_with_local_if &flow_key); bool detach_receiver(flow_tuple_with_local_if &flow_key); net_device_resources_t *create_nd_resources(const ip_addr &ip_local); @@ -417,18 +420,8 @@ class sockinfo { int set_ring_attr_helper(ring_alloc_logic_attr *sock_attr, xlio_ring_alloc_logic_attr *attr); void set_ring_logic_rx(ring_alloc_logic_attr ral); void set_ring_logic_tx(ring_alloc_logic_attr ral); - - // Attach to all relevant rings for offloading receive flows - always used from slow path - // According to bounded information we need to attach to all UC relevant flows - // If local_ip is ANY then we need to attach to all offloaded interfaces OR to the one our - // connected_ip is routed to - bool attach_as_uc_receiver(role_t role, bool skip_rules = false); - transport_t find_target_family(role_t role, const struct sockaddr *sock_addr_first, - const struct sockaddr *sock_addr_second = nullptr); - void shutdown_rx(); int 
modify_ratelimit(dst_entry *p_dst_entry, struct xlio_rate_limit_t &rate_limit); - void move_descs(ring *p_ring, descq_t *toq, descq_t *fromq, bool own); void pop_descs_rx_ready(descq_t *cache, ring *p_ring = nullptr); void push_descs_rx_ready(descq_t *cache); @@ -444,26 +437,24 @@ class sockinfo { void add_cqfd_to_sock_rx_epfd(ring *p_ring); void remove_cqfd_from_sock_rx_epfd(ring *p_ring); int os_wait_sock_rx_epfd(epoll_event *ep_events, int maxevents); - inline bool is_socketxtreme() { return safe_mce_sys().enable_socketxtreme; } void insert_epoll_event(uint64_t events); + int handle_exception_flow(); + + // Attach to all relevant rings for offloading receive flows - always used from slow path + // According to bounded information we need to attach to all UC relevant flows + // If local_ip is ANY then we need to attach to all offloaded interfaces OR to the one our + // connected_ip is routed to + bool attach_as_uc_receiver(role_t role, bool skip_rules = false); // Calling OS receive ssize_t rx_os(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, const int flags, sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg); - inline void set_events_socketxtreme(uint64_t events); - inline void set_events(uint64_t events); - inline void save_strq_stats(uint32_t packet_strides); - - inline int dequeue_packet(iovec *p_iov, ssize_t sz_iov, sockaddr *__from, socklen_t *__fromlen, - int in_flags, int *p_out_flags); - - inline void reuse_buffer(mem_buf_desc_t *buff); - - static const char *setsockopt_so_opt_to_str(int opt); + int setsockopt_kernel(int __level, int __optname, const void *__optval, socklen_t __optlen, + int supported, bool allow_priv); - int get_sock_by_L3_L4(in_protocol_t protocol, const ip_address &ip, in_port_t port); - int handle_exception_flow(); + transport_t find_target_family(role_t role, const struct sockaddr *sock_addr_first, + const struct sockaddr *sock_addr_second = nullptr); private: int fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit); @@ -628,14 +619,14 @@ void sockinfo::set_events_socketxtreme(uint64_t events) m_socketxtreme.ec->completion.events |= events; m_p_rx_ring->put_ec(m_socketxtreme.ec); - m_socketxtreme.ec = NULL; + m_socketxtreme.ec = nullptr; for (auto &ec : m_socketxtreme.ec_cache) { if (0 == ec.completion.events) { m_socketxtreme.ec = &ec; break; } } - if (NULL == m_socketxtreme.ec) { + if (!m_socketxtreme.ec) { struct ring_ec ec; ec.clear(); m_socketxtreme.ec_cache.push_back(ec); diff --git a/src/core/sock/sockinfo_nvme.h b/src/core/sock/sockinfo_nvme.h index 6bf9ac4b0..40703b688 100644 --- a/src/core/sock/sockinfo_nvme.h +++ b/src/core/sock/sockinfo_nvme.h @@ -41,7 +41,6 @@ #include "xlio_extra.h" #include "lwip/err.h" /* err_t */ -typedef struct xlio_tx_call_attr xlio_tx_call_attr_t; struct xlio_send_attr; class sockinfo_tcp_ops_nvme : public sockinfo_tcp_ops { diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 66b2c1b90..f416e164c 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -150,7 +150,6 @@ class tcp_timers_collection : public timer_handler, public cleanable_obj { void *m_timer_handle = nullptr; private: - typedef std::list sock_list; typedef typename sock_list::iterator sock_list_itr; std::vector m_p_intervals; From eca2eba028aa9de8b82c2e9137dc1178f1be4b35 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 26 Feb 2024 12:34:25 +0000 Subject: [PATCH 117/169] issue: 3777348 Moving sock stats outside the socket Moving socket stats to be a global resource 
instead of inside of each socket. This reduces socket size by +500B and allows to disable stats for sockets or make few sockets to use stats. Socket stats are generally unneeded and limited in monitoring to 1024 anyway. In cases of 1K+ sockets these stats are overhead. This also allows to disable stats for sockets and improve cache misses. Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 2 + src/core/main.cpp | 4 +- src/core/sock/sock_stats.cpp | 79 ++++++++++++++++++++++++++++++ src/core/sock/sock_stats.h | 60 +++++++++++++++++++++++ src/core/sock/sockinfo.cpp | 32 +++++++++--- src/core/sock/sockinfo.h | 24 ++++++--- src/core/sock/sockinfo_tcp.cpp | 87 +++++++++++++++++++-------------- src/core/sock/sockinfo_udp.cpp | 89 ++++++++++++++++------------------ src/core/sock/sockinfo_udp.h | 1 + src/core/sock/sockinfo_ulp.cpp | 24 +++++---- src/core/util/sys_vars.cpp | 3 +- src/core/util/sys_vars.h | 5 +- src/core/util/xlio_stats.h | 65 +++++++++++++------------ src/stats/stats_printer.cpp | 5 +- src/stats/stats_publisher.cpp | 28 +++++++---- src/stats/stats_reader.cpp | 1 - 16 files changed, 353 insertions(+), 156 deletions(-) create mode 100644 src/core/sock/sock_stats.cpp create mode 100644 src/core/sock/sock_stats.h diff --git a/src/core/Makefile.am b/src/core/Makefile.am index a87522a3e..3b30bf9d2 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -145,6 +145,7 @@ libxlio_la_SOURCES := \ proto/header.cpp \ proto/arp.cpp \ \ + sock/sock_stats.cpp \ sock/sockinfo.cpp \ sock/sockinfo_udp.cpp \ sock/sockinfo_ulp.cpp \ @@ -273,6 +274,7 @@ libxlio_la_SOURCES := \ \ sock/cleanable_obj.h \ sock/fd_collection.h \ + sock/sock_stats.h \ sock/sockinfo.h \ sock/sockinfo_tcp.h \ sock/sockinfo_udp.h \ diff --git a/src/core/main.cpp b/src/core/main.cpp index 0c5467366..c78d50ca8 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -65,7 +65,7 @@ #include "proto/neighbour_table_mgr.h" #include "netlink/netlink_wrapper.h" #include "event/command.h" - +#include "sock/sock_stats.h" #include "sock/sock-redirect.h" #include "sock/sock-app.h" #include "sock/fd_collection.h" @@ -1036,6 +1036,8 @@ static void do_global_ctors_helper() *g_p_vlogger_level = g_vlogger_level; *g_p_vlogger_details = g_vlogger_details; + sock_stats::instance().init_sock_stats(safe_mce_sys().stats_fd_num_max); + g_global_stat_static.init(); xlio_stats_instance_create_global_block(&g_global_stat_static); diff --git a/src/core/sock/sock_stats.cpp b/src/core/sock/sock_stats.cpp new file mode 100644 index 000000000..58c6bd6a9 --- /dev/null +++ b/src/core/sock/sock_stats.cpp @@ -0,0 +1,79 @@ + +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "sock_stats.h" + +thread_local socket_stats_t sock_stats::t_dummy_stats; + +sock_stats &sock_stats::instance() +{ + static sock_stats the_instance; + return the_instance; +} + +socket_stats_t *sock_stats::get_stats_obj() +{ + std::lock_guard lock(_stats_lock); + + if (!_socket_stats_list) { + return nullptr; + } + + socket_stats_t *stat = _socket_stats_list; + _socket_stats_list = _socket_stats_list->_next_stat; + return stat; +} + +void sock_stats::return_stats_obj(socket_stats_t *stats) +{ + std::lock_guard lock(_stats_lock); + stats->_next_stat = _socket_stats_list; + _socket_stats_list = stats; +} + +void sock_stats::init_sock_stats(size_t max_stats) +{ + if (max_stats == 0U) { + return; + } + + std::lock_guard lock(_stats_lock); + + _socket_stats_vec.resize(max_stats); + for (size_t idx = 1; idx < _socket_stats_vec.size(); ++idx) { + _socket_stats_vec[idx - 1U]._next_stat = &_socket_stats_vec[idx]; + } + + _socket_stats_vec[_socket_stats_vec.size() - 1U]._next_stat = nullptr; + _socket_stats_list = &_socket_stats_vec[0]; +} diff --git a/src/core/sock/sock_stats.h b/src/core/sock/sock_stats.h new file mode 100644 index 000000000..3dcde7d2a --- /dev/null +++ b/src/core/sock/sock_stats.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef SOCK_STATS_H +#define SOCK_STATS_H + +#include +#include +#include +#include +#include "util/ip_address.h" +#include "util/xlio_stats.h" + +class sock_stats { +public: + static sock_stats &instance(); + static thread_local socket_stats_t t_dummy_stats; + + void init_sock_stats(size_t max_stats); + socket_stats_t *get_stats_obj(); + void return_stats_obj(socket_stats_t *stats); + +private: + sock_stats() {} + + std::mutex _stats_lock; + socket_stats_t *_socket_stats_list = nullptr; + std::vector _socket_stats_vec; +}; + +#endif diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index d40facc65..c71c13f2e 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -156,10 +156,8 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) m_ring_alloc_logic_rx = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx); - m_p_socket_stats = &m_socket_stats; // Save stats as local copy and allow state publisher to - // copy from this location socket_stats_init(); - xlio_stats_instance_create_socket_block(m_p_socket_stats); + m_rx_reuse_buff.n_buff_num = 0; memset(&m_so_ratelimit, 0, sizeof(xlio_rate_limit_t)); set_flow_tag(m_fd + 1); @@ -201,7 +199,10 @@ sockinfo::~sockinfo() } } - xlio_stats_instance_remove_socket_block(m_p_socket_stats); + if (m_has_stats) { + xlio_stats_instance_remove_socket_block(m_p_socket_stats); + sock_stats::instance().return_stats_obj(m_p_socket_stats); + } m_socketxtreme.ec_cache.clear(); @@ -221,6 +222,18 @@ sockinfo::~sockinfo() void sockinfo::socket_stats_init() { + if (!m_p_socket_stats) { // This check is for listen sockets. + m_p_socket_stats = sock_stats::instance().get_stats_obj(); + if (!m_p_socket_stats) { + m_p_socket_stats = &sock_stats::t_dummy_stats; + return; + } + + // Save stats as local copy and allow state publisher to copy from this location + xlio_stats_instance_create_socket_block(m_p_socket_stats); + } + + m_has_stats = true; m_p_socket_stats->reset(); m_p_socket_stats->fd = m_fd; m_p_socket_stats->inode = fd2inode(m_fd); @@ -883,7 +896,7 @@ int sockinfo::get_sock_by_L3_L4(in_protocol_t protocol, const ip_address &ip, in void sockinfo::save_stats_rx_offload(int nbytes) { - if (nbytes < 0) { + if (unlikely(has_stats()) && nbytes < 0) { if (errno == EAGAIN) { m_p_socket_stats->counters.n_rx_eagain++; } else { @@ -1467,6 +1480,10 @@ void sockinfo::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) m_p_connected_dst_entry->is_offloaded() ? 
"true" : "false"); } + if (!has_stats()) { + return; + } + if (m_p_socket_stats->ring_alloc_logic_rx == RING_LOGIC_PER_USER_ID) { vlog_printf(log_level, "RX Ring User ID : %lu\n", m_p_socket_stats->ring_user_id_rx); } @@ -1525,10 +1542,9 @@ void sockinfo::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) (float)(m_p_socket_stats->counters.n_rx_ready_byte_drop * 100) / (float)m_p_socket_stats->counters.n_rx_packets; } - vlog_printf(log_level, "Rx byte : max %d / dropped %d (%2.2f%%) / limit %d\n", + vlog_printf(log_level, "Rx byte : max %d / dropped %d (%2.2f%%)\n", m_p_socket_stats->counters.n_rx_ready_byte_max, - m_p_socket_stats->counters.n_rx_ready_byte_drop, rx_drop_percentage, - m_p_socket_stats->n_rx_ready_byte_limit); + m_p_socket_stats->counters.n_rx_ready_byte_drop, rx_drop_percentage); if (m_p_socket_stats->n_rx_ready_pkt_count) { rx_drop_percentage = (float)(m_p_socket_stats->counters.n_rx_ready_pkt_drop * 100) / diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 44f067eab..c29787e07 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -55,6 +55,7 @@ #include "dev/ring_allocation_logic.h" #include "sock-redirect.h" #include "sock-app.h" +#include "sock_stats.h" #ifndef BASE_SOCKINFO_H #define BASE_SOCKINFO_H @@ -328,6 +329,8 @@ class sockinfo { inline bool set_flow_tag(uint32_t flow_tag_id); inline void sock_pop_descs_rx_ready(descq_t *cache); + bool has_stats() const { return m_has_stats; } + bool get_rx_pkt_ready_list_count() const { return m_n_rx_pkt_ready_list_count; } int get_fd() const { return m_fd; }; sa_family_t get_family() { return m_family; } bool get_reuseaddr(void) { return m_reuseaddr; } @@ -343,7 +346,6 @@ class sockinfo { int get_rings_fds(int *ring_fds, int ring_fds_sz); int get_rings_num(); bool validate_and_convert_mapped_ipv4(sock_addr &sock) const; - void socket_stats_init(); int register_callback_ctx(xlio_recv_callback_t callback, void *context); void consider_rings_migration_rx(); int add_epoll_context(epfd_info *epfd); @@ -411,6 +413,7 @@ class sockinfo { void save_stats_rx_os(int bytes); void save_stats_tx_os(int bytes); void save_stats_rx_offload(int nbytes); + void socket_stats_init(); bool attach_receiver(flow_tuple_with_local_if &flow_key); bool detach_receiver(flow_tuple_with_local_if &flow_key); net_device_resources_t *create_nd_resources(const ip_addr &ip_local); @@ -461,7 +464,7 @@ class sockinfo { bool attach_as_uc_receiver_anyip(sa_family_t family, role_t role, bool skip_rules); public: - socket_stats_t *m_p_socket_stats; + socket_stats_t *m_p_socket_stats = nullptr; /* Last memory descriptor with zcopy operation method */ mem_buf_desc_t *m_last_zcdesc; struct { @@ -488,9 +491,11 @@ class sockinfo { epoll_fd_rec m_fd_rec; protected: + int m_fd; // identification information const uint32_t m_n_sysvar_select_poll_os_ratio; epfd_info *m_econtext; + bool m_has_stats = false; bool m_reuseaddr; // to track setsockopt with SO_REUSEADDR bool m_reuseport; // to track setsockopt with SO_REUSEPORT bool m_flow_tag_enabled; // for this socket @@ -516,8 +521,6 @@ class sockinfo { dst_entry *m_p_connected_dst_entry; ip_addr m_so_bindtodevice_ip; - socket_stats_t m_socket_stats; - int m_rx_epfd; cache_observer m_rx_nd_observer; rx_net_device_map_t m_rx_nd_map; @@ -649,9 +652,12 @@ void sockinfo::set_events(uint64_t events) void sockinfo::save_strq_stats(uint32_t packet_strides) { - m_socket_stats.strq_counters.n_strq_total_strides += static_cast(packet_strides); - 
m_socket_stats.strq_counters.n_strq_max_strides_per_packet = - std::max(m_socket_stats.strq_counters.n_strq_max_strides_per_packet, packet_strides); + if (unlikely(has_stats())) { + m_p_socket_stats->counters.n_rx_packets++; + m_p_socket_stats->strq_counters.n_strq_total_strides += static_cast(packet_strides); + m_p_socket_stats->strq_counters.n_strq_max_strides_per_packet = + std::max(m_p_socket_stats->strq_counters.n_strq_max_strides_per_packet, packet_strides); + } } int sockinfo::dequeue_packet(iovec *p_iov, ssize_t sz_iov, sockaddr *__from, socklen_t *__fromlen, @@ -728,8 +734,10 @@ int sockinfo::dequeue_packet(iovec *p_iov, ssize_t sz_iov, sockaddr *__from, soc rx_pkt_ready_offset; // if MSG_PEEK is on, m_rx_pkt_ready_offset must be zero-ed // save_stats_rx_offload(total_rx); //TODO?? } else { + if (unlikely(has_stats())) { + m_p_socket_stats->n_rx_ready_byte_count -= total_rx; + } m_rx_ready_byte_count -= total_rx; - m_p_socket_stats->n_rx_ready_byte_count -= total_rx; post_deqeue(relase_buff); save_stats_rx_offload(total_rx); } diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index c7c7f1f4b..dc45b7c0a 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -1886,7 +1886,9 @@ err_t sockinfo_tcp::ack_recvd_lwip_cb(void *arg, struct tcp_pcb *tpcb, u16_t ack ASSERT_LOCKED(conn->m_tcp_con_lock); - conn->m_p_socket_stats->n_tx_ready_byte_count -= ack; + if (unlikely(conn->has_stats())) { + conn->m_p_socket_stats->n_tx_ready_byte_count -= ack; + } if (conn->sndbuf_available() >= conn->m_required_send_block) { NOTIFY_ON_EVENTS(conn, EPOLLOUT); @@ -1950,7 +1952,6 @@ err_t sockinfo_tcp::rx_lwip_cb(void *arg, struct tcp_pcb *pcb, struct pbuf *p, e } conn->rx_lwip_process_chained_pbufs(p); - conn->save_packet_info_in_ready_list(p); // notify io_mux @@ -2082,12 +2083,16 @@ inline void sockinfo_tcp::rx_lwip_process_chained_pbufs(pbuf *p) m_connected.get_sa(reinterpret_cast(&p_first_desc->rx.src), static_cast(sizeof(p_first_desc->rx.src))); - // We go over the p_first_desc again, so decrement what we did in rx_input_cb. - m_socket_stats.strq_counters.n_strq_total_strides -= - static_cast(p_first_desc->rx.strides_num); - m_socket_stats.counters.n_rx_data_pkts++; - // Assume that all chained buffers are GRO packets - m_socket_stats.counters.n_gro += !!p->next; + if (unlikely(has_stats())) { + m_p_socket_stats->counters.n_rx_bytes += p->tot_len; + + // We go over the p_first_desc again, so decrement what we did in rx_input_cb. 
+ m_p_socket_stats->strq_counters.n_strq_total_strides -= + static_cast(p_first_desc->rx.strides_num); + m_p_socket_stats->counters.n_rx_data_pkts++; + // Assume that all chained buffers are GRO packets + m_p_socket_stats->counters.n_gro += !!p->next; + } // To avoid reset ref count for first mem_buf_desc, save it and set after the while int head_ref = p_first_desc->get_ref_count(); @@ -2110,7 +2115,12 @@ inline void sockinfo_tcp::rx_lwip_process_chained_pbufs(pbuf *p) p_curr_desc->p_next_desc = reinterpret_cast(p->next); process_timestamps(p_curr_desc); } + p_first_desc->set_ref_count(head_ref); + + if (unlikely(has_stats())) { + m_p_socket_stats->counters.n_rx_frags += p_first_desc->rx.n_frags; + } } inline void sockinfo_tcp::save_packet_info_in_ready_list(pbuf *p) @@ -2118,16 +2128,15 @@ inline void sockinfo_tcp::save_packet_info_in_ready_list(pbuf *p) m_rx_pkt_ready_list.push_back(reinterpret_cast(p)); m_n_rx_pkt_ready_list_count++; m_rx_ready_byte_count += p->tot_len; - m_p_socket_stats->counters.n_rx_bytes += p->tot_len; - m_p_socket_stats->n_rx_ready_byte_count += p->tot_len; - m_p_socket_stats->n_rx_ready_pkt_count++; - m_socket_stats.counters.n_rx_frags += reinterpret_cast(p)->rx.n_frags; - m_p_socket_stats->counters.n_rx_ready_pkt_max = - std::max((uint32_t)m_p_socket_stats->n_rx_ready_pkt_count, - m_p_socket_stats->counters.n_rx_ready_pkt_max); - m_p_socket_stats->counters.n_rx_ready_byte_max = - std::max((uint32_t)m_p_socket_stats->n_rx_ready_byte_count, - m_p_socket_stats->counters.n_rx_ready_byte_max); + + if (unlikely(has_stats())) { + m_p_socket_stats->n_rx_ready_byte_count += p->tot_len; + m_p_socket_stats->n_rx_ready_pkt_count++; + m_p_socket_stats->counters.n_rx_ready_pkt_max = std::max( + (uint32_t)m_n_rx_pkt_ready_list_count, m_p_socket_stats->counters.n_rx_ready_pkt_max); + m_p_socket_stats->counters.n_rx_ready_byte_max = std::max( + (uint32_t)m_rx_ready_byte_count, m_p_socket_stats->counters.n_rx_ready_byte_max); + } } inline void sockinfo_tcp::rx_lwip_shrink_rcv_wnd(size_t pbuf_tot_len, int bytes_received) @@ -2170,12 +2179,10 @@ err_t sockinfo_tcp::rx_lwip_cb_socketxtreme(void *arg, struct tcp_pcb *pcb, stru conn->handle_rx_lwip_cb_error(p); return err; } - conn->rx_lwip_process_chained_pbufs(p); - - conn->m_p_socket_stats->counters.n_rx_bytes += p->tot_len; - conn->m_socket_stats.counters.n_rx_frags += reinterpret_cast(p)->rx.n_frags; + conn->rx_lwip_process_chained_pbufs(p); conn->rx_lwip_cb_socketxtreme_helper(p); + io_mux_call::update_fd_array(conn->m_iomux_ready_fd_array, conn->m_fd); conn->m_sock_wakeup_pipe.do_wakeup(); /* @@ -2214,10 +2221,8 @@ err_t sockinfo_tcp::rx_lwip_cb_recv_callback(void *arg, struct tcp_pcb *pcb, str conn->handle_rx_lwip_cb_error(p); return err; } - conn->rx_lwip_process_chained_pbufs(p); - conn->m_p_socket_stats->counters.n_rx_bytes += p->tot_len; - conn->m_socket_stats.counters.n_rx_frags += reinterpret_cast(p)->rx.n_frags; + conn->rx_lwip_process_chained_pbufs(p); xlio_recv_callback_retval_t callback_retval = XLIO_PACKET_RECV; @@ -2231,8 +2236,8 @@ err_t sockinfo_tcp::rx_lwip_cb_recv_callback(void *arg, struct tcp_pcb *pcb, str pkt_info.packet_id = (void *)p_first_desc; pkt_info.src = p_first_desc->rx.src.get_p_sa(); pkt_info.dst = p_first_desc->rx.dst.get_p_sa(); - pkt_info.socket_ready_queue_pkt_count = conn->m_p_socket_stats->n_rx_ready_pkt_count; - pkt_info.socket_ready_queue_byte_count = conn->m_p_socket_stats->n_rx_ready_byte_count; + pkt_info.socket_ready_queue_pkt_count = conn->m_n_rx_pkt_ready_list_count; + 
pkt_info.socket_ready_queue_byte_count = conn->m_rx_ready_byte_count; if (conn->m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { pkt_info.hw_timestamp = p_first_desc->rx.timestamps.hw; @@ -2537,7 +2542,7 @@ bool sockinfo_tcp::rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void lock_tcp_con(); save_strq_stats(p_rx_pkt_mem_buf_desc_info->rx.strides_num); - m_socket_stats.counters.n_rx_packets++; + m_iomux_ready_fd_array = (fd_array_t *)pv_fd_ready_array; if (unlikely(get_tcp_state(&m_pcb) == LISTEN)) { @@ -4757,10 +4762,12 @@ void sockinfo_tcp::get_tcp_info(struct tcp_info *ti) ti->tcpi_snd_mss = m_pcb.mss; ti->tcpi_retransmits = m_pcb.nrtx; // ti->tcpi_retrans - we don't keep it and calculation would be O(N). - ti->tcpi_total_retrans = m_p_socket_stats->counters.n_tx_retransmits; ti->tcpi_snd_cwnd = m_pcb.cwnd / m_pcb.mss; ti->tcpi_snd_ssthresh = m_pcb.ssthresh / m_pcb.mss; + // This will be incorrect if sockets number is bigger than safe_mce_sys().stats_fd_num_max. + ti->tcpi_total_retrans = m_p_socket_stats->counters.n_tx_retransmits; + // Currently we miss per segment statistics and most of congestion control fields. } @@ -5254,7 +5261,9 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) mem_buf_desc_t *sockinfo_tcp::get_next_desc(mem_buf_desc_t *p_desc) { m_rx_pkt_ready_list.pop_front(); - m_p_socket_stats->n_rx_ready_pkt_count--; + if (unlikely(has_stats())) { + m_p_socket_stats->n_rx_ready_pkt_count--; + } m_n_rx_pkt_ready_list_count--; if (p_desc->p_next_desc) { @@ -5268,10 +5277,12 @@ mem_buf_desc_t *sockinfo_tcp::get_next_desc(mem_buf_desc_t *p_desc) p_desc->inc_ref_count(); m_rx_pkt_ready_list.push_front(p_desc); m_n_rx_pkt_ready_list_count++; - m_p_socket_stats->n_rx_ready_pkt_count++; prev->lwip_pbuf.next = nullptr; prev->p_next_desc = nullptr; prev->rx.n_frags = 1; + if (unlikely(has_stats())) { + m_p_socket_stats->n_rx_ready_pkt_count++; + } reuse_buffer(prev); } else { reuse_buffer(p_desc); @@ -6239,12 +6250,14 @@ ssize_t sockinfo_tcp::tcp_tx_handle_done_and_unlock(ssize_t total_tx, int errno_ { tcp_output(&m_pcb); // force data out - if (unlikely(is_dummy)) { - m_p_socket_stats->counters.n_tx_dummy++; - } else if (total_tx) { - m_p_socket_stats->counters.n_tx_sent_byte_count += total_tx; - m_p_socket_stats->counters.n_tx_sent_pkt_count++; - m_p_socket_stats->n_tx_ready_byte_count += total_tx; + if (unlikely(has_stats())) { + if (unlikely(is_dummy)) { + m_p_socket_stats->counters.n_tx_dummy++; + } else if (total_tx) { + m_p_socket_stats->counters.n_tx_sent_byte_count += total_tx; + m_p_socket_stats->counters.n_tx_sent_pkt_count++; + m_p_socket_stats->n_tx_ready_byte_count += total_tx; + } } /* Each send call with MSG_ZEROCOPY that successfully sends diff --git a/src/core/sock/sockinfo_udp.cpp b/src/core/sock/sockinfo_udp.cpp index 6b8e55758..60c69de3e 100644 --- a/src/core/sock/sockinfo_udp.cpp +++ b/src/core/sock/sockinfo_udp.cpp @@ -439,7 +439,7 @@ sockinfo_udp::~sockinfo_udp() // Remove all RX ready queue buffers (Push into reuse queue per ring) si_udp_logdbg("Releasing %d ready rx packets (total of %lu bytes)", m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_rx_ready_byte_count); rx_ready_byte_count_limit_update(0); // Clear the dst_entry map @@ -1665,9 +1665,9 @@ int sockinfo_udp::getsockopt(int __level, int __optname, void *__optval, socklen uint32_t n_so_rcvbuf_bytes = *(int *)__optval; si_udp_logdbg("SOL_SOCKET, SO_RCVBUF=%d", n_so_rcvbuf_bytes); - if (m_p_socket_stats->n_rx_ready_byte_count > n_so_rcvbuf_bytes) 
{ + if (m_rx_ready_byte_count > n_so_rcvbuf_bytes) { si_udp_logdbg("Releasing at least %lu bytes from ready rx packets queue", - m_p_socket_stats->n_rx_ready_byte_count - n_so_rcvbuf_bytes); + m_rx_ready_byte_count - n_so_rcvbuf_bytes); } rx_ready_byte_count_limit_update(n_so_rcvbuf_bytes); @@ -1719,14 +1719,14 @@ int sockinfo_udp::getsockopt(int __level, int __optname, void *__optval, socklen void sockinfo_udp::rx_ready_byte_count_limit_update(size_t n_rx_ready_bytes_limit_new) { si_udp_logfunc("new limit: %d Bytes (old: %d Bytes, min value %d Bytes)", - n_rx_ready_bytes_limit_new, m_p_socket_stats->n_rx_ready_byte_limit, + n_rx_ready_bytes_limit_new, m_rx_ready_byte_limit, m_n_sysvar_rx_ready_byte_min_limit); if (n_rx_ready_bytes_limit_new > 0 && n_rx_ready_bytes_limit_new < m_n_sysvar_rx_ready_byte_min_limit) { n_rx_ready_bytes_limit_new = m_n_sysvar_rx_ready_byte_min_limit; } - m_p_socket_stats->n_rx_ready_byte_limit = n_rx_ready_bytes_limit_new; - drop_rx_ready_byte_count(m_p_socket_stats->n_rx_ready_byte_limit); + m_rx_ready_byte_limit = n_rx_ready_bytes_limit_new; + drop_rx_ready_byte_count(n_rx_ready_bytes_limit_new); return; } @@ -1737,8 +1737,7 @@ void sockinfo_udp::drop_rx_ready_byte_count(size_t n_rx_bytes_limit) m_lock_rcv.lock(); while (m_n_rx_pkt_ready_list_count) { mem_buf_desc_t *p_rx_pkt_desc = m_rx_pkt_ready_list.front(); - if (m_p_socket_stats->n_rx_ready_byte_count > n_rx_bytes_limit || - p_rx_pkt_desc->rx.sz_payload == 0U) { + if (m_rx_ready_byte_count > n_rx_bytes_limit || p_rx_pkt_desc->rx.sz_payload == 0U) { m_rx_pkt_ready_list.pop_front(); m_n_rx_pkt_ready_list_count--; m_rx_ready_byte_count -= p_rx_pkt_desc->rx.sz_payload; @@ -1954,15 +1953,14 @@ bool sockinfo_udp::is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_ready_array if (m_n_sysvar_rx_cq_drain_rate_nsec == MCE_RX_CQ_DRAIN_RATE_DISABLED) { si_udp_logfunc("=> true (ready count = %d packets / %d bytes)", - m_n_rx_pkt_ready_list_count, m_p_socket_stats->n_rx_ready_byte_count); + m_n_rx_pkt_ready_list_count, m_rx_ready_byte_count); return true; } else { tscval_t tsc_now = TSCVAL_INITIALIZER; gettimeoftsc(&tsc_now); if (tsc_now - g_si_tscv_last_poll < m_n_sysvar_rx_delta_tsc_between_cq_polls) { si_udp_logfunc("=> true (ready count = %d packets / %d bytes)", - m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_n_rx_pkt_ready_list_count, m_rx_ready_byte_count); return true; } @@ -1997,8 +1995,7 @@ bool sockinfo_udp::is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_ready_array if (m_n_rx_pkt_ready_list_count) { // Get out of the CQ polling loop si_udp_logfunc("=> polled true (ready count = %d packets / %d bytes)", - m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_n_rx_pkt_ready_list_count, m_rx_ready_byte_count); m_rx_ring_map_lock.unlock(); return true; } @@ -2013,13 +2010,13 @@ bool sockinfo_udp::is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_ready_array // m_n_rx_pkt_ready_list_count if (m_n_rx_pkt_ready_list_count) { si_udp_logfunc("=> true (ready count = %d packets / %d bytes)", m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_rx_ready_byte_count); return true; } // Not ready packets in ready queue, return false si_udp_logfuncall("=> false (ready count = %d packets / %d bytes)", m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_rx_ready_byte_count); return false; } @@ -2308,8 +2305,8 @@ inline xlio_recv_callback_retval_t sockinfo_udp::inspect_by_user_cb(mem_buf_desc pkt_info.packet_id = (void 
*)p_desc; pkt_info.src = p_desc->rx.src.get_p_sa(); pkt_info.dst = p_desc->rx.dst.get_p_sa(); - pkt_info.socket_ready_queue_pkt_count = m_p_socket_stats->n_rx_ready_pkt_count; - pkt_info.socket_ready_queue_byte_count = m_p_socket_stats->n_rx_ready_byte_count; + pkt_info.socket_ready_queue_pkt_count = m_n_rx_pkt_ready_list_count; + pkt_info.socket_ready_queue_byte_count = m_rx_ready_byte_count; if (m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { pkt_info.hw_timestamp = p_desc->rx.timestamps.hw; @@ -2378,14 +2375,15 @@ inline void sockinfo_udp::update_ready(mem_buf_desc_t *p_desc, void *pv_fd_ready m_rx_pkt_ready_list.push_back(p_desc); m_n_rx_pkt_ready_list_count++; m_rx_ready_byte_count += p_desc->rx.sz_payload; - m_p_socket_stats->n_rx_ready_pkt_count++; - m_p_socket_stats->n_rx_ready_byte_count += p_desc->rx.sz_payload; - m_p_socket_stats->counters.n_rx_ready_pkt_max = - std::max((uint32_t)m_p_socket_stats->n_rx_ready_pkt_count, - m_p_socket_stats->counters.n_rx_ready_pkt_max); - m_p_socket_stats->counters.n_rx_ready_byte_max = - std::max((uint32_t)m_p_socket_stats->n_rx_ready_byte_count, - m_p_socket_stats->counters.n_rx_ready_byte_max); + if (unlikely(has_stats())) { + m_p_socket_stats->n_rx_ready_byte_count += p_desc->rx.sz_payload; + m_p_socket_stats->n_rx_ready_pkt_count++; + m_p_socket_stats->counters.n_rx_ready_pkt_max = + std::max((uint32_t)m_n_rx_pkt_ready_list_count, + m_p_socket_stats->counters.n_rx_ready_pkt_max); + m_p_socket_stats->counters.n_rx_ready_byte_max = std::max( + (uint32_t)m_rx_ready_byte_count, m_p_socket_stats->counters.n_rx_ready_byte_max); + } m_sock_wakeup_pipe.do_wakeup(); m_lock_rcv.unlock(); } else { @@ -2402,7 +2400,7 @@ inline void sockinfo_udp::update_ready(mem_buf_desc_t *p_desc, void *pv_fd_ready io_mux_call::update_fd_array((fd_array_t *)pv_fd_ready_array, m_fd); si_udp_logfunc("rx ready count = %d packets / %d bytes", m_n_rx_pkt_ready_list_count, - m_p_socket_stats->n_rx_ready_byte_count); + m_rx_ready_byte_count); } bool sockinfo_udp::packet_is_loopback(mem_buf_desc_t *p_desc) @@ -2415,18 +2413,15 @@ bool sockinfo_udp::packet_is_loopback(mem_buf_desc_t *p_desc) bool sockinfo_udp::rx_input_cb(mem_buf_desc_t *p_desc, void *pv_fd_ready_array) { - m_p_socket_stats->counters.n_rx_packets++; - if (unlikely((m_state == SOCKINFO_DESTROYING) || g_b_exit)) { si_udp_logfunc("rx packet discarded - fd closed"); return false; } /* Check if sockinfo rx byte SO_RCVBUF reached - then disregard this packet */ - if (unlikely(m_p_socket_stats->n_rx_ready_byte_count >= - m_p_socket_stats->n_rx_ready_byte_limit)) { + if (unlikely(m_rx_ready_byte_count >= m_rx_ready_byte_limit)) { si_udp_logfunc("rx packet discarded - socket limit reached (%d bytes)", - m_p_socket_stats->n_rx_ready_byte_limit); + m_rx_ready_byte_limit); m_p_socket_stats->counters.n_rx_ready_byte_drop += p_desc->rx.sz_payload; m_p_socket_stats->counters.n_rx_ready_pkt_drop++; return false; @@ -2830,7 +2825,7 @@ int sockinfo_udp::mc_change_membership_ip4(const mc_pending_pram *p_mc_pram) // we will get RX from OS return -1; } - xlio_stats_mc_group_add(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_add(mc_grp, has_stats() ? m_p_socket_stats : nullptr); original_os_setsockopt_helper(&mreq_src, pram_size, p_mc_pram->optname, IPPROTO_IP); m_multicast = true; break; @@ -2842,7 +2837,7 @@ int sockinfo_udp::mc_change_membership_ip4(const mc_pending_pram *p_mc_pram) // we will get RX from OS return -1; } - xlio_stats_mc_group_add(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_add(mc_grp, has_stats() ? 
m_p_socket_stats : nullptr); pram_size = sizeof(ip_mreq_source); original_os_setsockopt_helper(&mreq_src, pram_size, p_mc_pram->optname, IPPROTO_IP); m_multicast = true; @@ -2855,7 +2850,7 @@ int sockinfo_udp::mc_change_membership_ip4(const mc_pending_pram *p_mc_pram) if (!detach_receiver(flow_key)) { return -1; } - xlio_stats_mc_group_remove(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_remove(mc_grp, has_stats() ? m_p_socket_stats : nullptr); m_multicast = false; break; } @@ -2868,7 +2863,7 @@ int sockinfo_udp::mc_change_membership_ip4(const mc_pending_pram *p_mc_pram) if (!detach_receiver(flow_key)) { return -1; } - xlio_stats_mc_group_remove(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_remove(mc_grp, has_stats() ? m_p_socket_stats : nullptr); m_multicast = false; // get out from MC group } break; @@ -3070,7 +3065,7 @@ int sockinfo_udp::mc_change_membership_ip6(const mc_pending_pram *p_mc_pram) // we will get RX from OS return -1; } - xlio_stats_mc_group_add(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_add(mc_grp, has_stats() ? m_p_socket_stats : nullptr); original_os_setsockopt_helper(&p_mc_pram->req, p_mc_pram->pram_size, p_mc_pram->optname, IPPROTO_IPV6); } break; @@ -3087,7 +3082,7 @@ int sockinfo_udp::mc_change_membership_ip6(const mc_pending_pram *p_mc_pram) if (!detach_receiver(flow_key)) { return -1; } - xlio_stats_mc_group_remove(mc_grp, m_p_socket_stats); + xlio_stats_mc_group_remove(mc_grp, has_stats() ? m_p_socket_stats : nullptr); } break; } @@ -3143,16 +3138,18 @@ void sockinfo_udp::save_stats_threadid_tx() void sockinfo_udp::save_stats_tx_offload(int bytes, bool is_dummy) { - if (unlikely(is_dummy)) { - m_p_socket_stats->counters.n_tx_dummy++; - } else { - if (bytes >= 0) { - m_p_socket_stats->counters.n_tx_sent_byte_count += bytes; - m_p_socket_stats->counters.n_tx_sent_pkt_count++; - } else if (errno == EAGAIN) { - m_p_socket_stats->counters.n_rx_os_eagain++; + if (unlikely(has_stats())) { + if (unlikely(is_dummy)) { + m_p_socket_stats->counters.n_tx_dummy++; } else { - m_p_socket_stats->counters.n_tx_errors++; + if (bytes >= 0) { + m_p_socket_stats->counters.n_tx_sent_byte_count += bytes; + m_p_socket_stats->counters.n_tx_sent_pkt_count++; + } else if (errno == EAGAIN) { + m_p_socket_stats->counters.n_rx_os_eagain++; + } else { + m_p_socket_stats->counters.n_tx_errors++; + } } } } diff --git a/src/core/sock/sockinfo_udp.h b/src/core/sock/sockinfo_udp.h index 7d81ca112..cf7e7f026 100644 --- a/src/core/sock/sockinfo_udp.h +++ b/src/core/sock/sockinfo_udp.h @@ -293,6 +293,7 @@ class sockinfo_udp : public sockinfo { bool operator==(const int &r_port) { return port == r_port; } }; + uint32_t m_rx_ready_byte_limit; ip_addr m_mc_tx_src_ip; bool m_b_mc_tx_loop; uint8_t m_n_mc_ttl_hop_lim; diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp index 24eee400a..aaa599362 100644 --- a/src/core/sock/sockinfo_ulp.cpp +++ b/src/core/sock/sockinfo_ulp.cpp @@ -680,7 +680,7 @@ err_t sockinfo_tcp_ops_tls::tls_rx_consume_ready_packets() * receive encrypted TLS records with header and TAG after successful * setsockopt() call. 
*/ - if (m_p_sock->m_p_socket_stats->n_rx_ready_pkt_count != 0) { + if (m_p_sock->get_rx_pkt_ready_list_count() > 0) { descq_t descs_rx_ready; m_p_sock->sock_pop_descs_rx_ready(&descs_rx_ready); @@ -857,8 +857,10 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg) /* Statistics */ if (ret > 0) { errno = errno_save; - m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_records += m_next_recno_tx - last_recno; - m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_bytes += ret; + if (unlikely(m_p_sock->has_stats())) { + m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_records += m_next_recno_tx - last_recno; + m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_bytes += ret; + } } return ret; } @@ -1441,8 +1443,10 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) } /* Statistics */ - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records_enc += !!(decrypted_nr == 0); - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records_partial += !!(decrypted_nr != 0); + if (unlikely(m_p_sock->has_stats())) { + m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records_enc += !!(decrypted_nr == 0); + m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records_partial += !!(decrypted_nr != 0); + } } /* Handle decryption failures. */ @@ -1474,10 +1478,12 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) } /* Statistics */ - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records += 1U; - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_bytes += likely(pres) ? pres->tot_len : 0; - /* Adjust TCP counters with received TLS header/trailer. */ - m_p_sock->m_p_socket_stats->counters.n_rx_bytes += m_tls_rec_overhead; + if (unlikely(m_p_sock->has_stats())) { + m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records += 1U; + m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_bytes += likely(pres) ? pres->tot_len : 0; + /* Adjust TCP counters with received TLS header/trailer. 
*/ + m_p_sock->m_p_socket_stats->counters.n_rx_bytes += m_tls_rec_overhead; + } ++m_next_recno_rx; diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index 717140cb9..83214ea20 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -762,6 +762,7 @@ void mce_sys_var::get_env_params() handle_sigintr = MCE_DEFAULT_HANDLE_SIGINTR; handle_segfault = MCE_DEFAULT_HANDLE_SIGFAULT; stats_fd_num_max = MCE_DEFAULT_STATS_FD_NUM; + stats_fd_num_monitor = MCE_DEFAULT_STATS_FD_NUM; ring_allocation_logic_tx = MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX; ring_allocation_logic_rx = MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX; @@ -1247,10 +1248,10 @@ void mce_sys_var::get_env_params() if ((env_ptr = getenv(SYS_VAR_STATS_FD_NUM))) { stats_fd_num_max = (uint32_t)atoi(env_ptr); + stats_fd_num_monitor = std::min(stats_fd_num_max, MAX_STATS_FD_NUM); if (stats_fd_num_max > MAX_STATS_FD_NUM) { vlog_printf(VLOG_WARNING, " Can only monitor maximum %d sockets in statistics \n", MAX_STATS_FD_NUM); - stats_fd_num_max = MAX_STATS_FD_NUM; } } diff --git a/src/core/util/sys_vars.h b/src/core/util/sys_vars.h index eaee8e793..794fefcba 100644 --- a/src/core/util/sys_vars.h +++ b/src/core/util/sys_vars.h @@ -359,6 +359,7 @@ struct mce_sys_var { bool handle_sigintr; bool handle_segfault; uint32_t stats_fd_num_max; + uint32_t stats_fd_num_monitor; ring_logic_t ring_allocation_logic_tx; ring_logic_t ring_allocation_logic_rx; @@ -723,7 +724,7 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_DEFAULT_APP_ID ("XLIO_DEFAULT_APPLICATION_ID") #define MCE_DEFAULT_HANDLE_SIGINTR (true) #define MCE_DEFAULT_HANDLE_SIGFAULT (false) -#define MCE_DEFAULT_STATS_FD_NUM 100 +#define MCE_DEFAULT_STATS_FD_NUM 0 #define MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX (RING_LOGIC_PER_INTERFACE) #define MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX (RING_LOGIC_PER_INTERFACE) #define MCE_DEFAULT_RING_MIGRATION_RATIO_TX (-1) @@ -913,7 +914,7 @@ extern mce_sys_var &safe_mce_sys(); #define NETVSC_DEVICE_UPPER_FILE "/sys/class/net/%s/upper_%s/ifindex" #define NETVSC_ID "{f8615163-df3e-46c5-913f-f2d2f965ed0e}\n" -#define MAX_STATS_FD_NUM 1024 +#define MAX_STATS_FD_NUM 1024U #define MAX_WINDOW_SCALING 14 #define STRQ_MIN_STRIDES_NUM 512 diff --git a/src/core/util/xlio_stats.h b/src/core/util/xlio_stats.h index 725f7fb3f..5b77774f7 100644 --- a/src/core/util/xlio_stats.h +++ b/src/core/util/xlio_stats.h @@ -225,44 +225,47 @@ typedef struct socket_listen_counters { } } socket_listen_counters_t; -typedef struct socket_stats_t { - int fd; - uint32_t inode; - uint32_t tcp_state; // enum tcp_state - uint8_t socket_type; // SOCK_STREAM, SOCK_DGRAM, ... 
- bool padding1; - sa_family_t sa_family; - bool b_is_offloaded; - bool b_blocking; - bool b_mc_loop; - bool padding2; - in_port_t bound_port; - in_port_t connected_port; - ip_address bound_if; - ip_address connected_ip; - ip_address mc_tx_if; - pid_t threadid_last_rx; - pid_t threadid_last_tx; - uint32_t n_rx_ready_pkt_count; - uint32_t n_rx_ready_byte_limit; - uint64_t n_rx_ready_byte_count; +struct socket_stats_t { + // Data Path uint64_t n_tx_ready_byte_count; - uint32_t n_rx_zcopy_pkt_count; + uint64_t n_rx_ready_byte_count; + uint32_t n_rx_ready_pkt_count; socket_counters_t counters; + socket_strq_counters_t strq_counters; #ifdef DEFINED_UTLS - bool tls_tx_offload; - bool tls_rx_offload; - uint16_t tls_version; - uint16_t tls_cipher; socket_tls_counters_t tls_counters; #endif /* DEFINED_UTLS */ - socket_strq_counters_t strq_counters; socket_listen_counters_t listen_counters; + + // Control Path std::bitset mc_grp_map; ring_logic_t ring_alloc_logic_rx; ring_logic_t ring_alloc_logic_tx; + ip_address bound_if; + ip_address connected_ip; + ip_address mc_tx_if; + int fd; + uint32_t inode; + uint32_t tcp_state; // enum tcp_state + uint32_t n_rx_zcopy_pkt_count; + pid_t threadid_last_rx; + pid_t threadid_last_tx; uint64_t ring_user_id_rx; uint64_t ring_user_id_tx; + sa_family_t sa_family; + in_port_t bound_port; + in_port_t connected_port; + uint8_t socket_type; // SOCK_STREAM, SOCK_DGRAM, ... + bool b_is_offloaded; + bool b_blocking; + bool b_mc_loop; +#ifdef DEFINED_UTLS + uint16_t tls_version; + uint16_t tls_cipher; + bool tls_tx_offload; + bool tls_rx_offload; +#endif /* DEFINED_UTLS */ + socket_stats_t *_next_stat; void reset() { @@ -274,8 +277,8 @@ typedef struct socket_stats_t { bound_if = connected_ip = mc_tx_if = ip_address(in6addr_any); bound_port = connected_port = (in_port_t)0; threadid_last_rx = threadid_last_tx = pid_t(0); - n_rx_ready_pkt_count = n_rx_ready_byte_count = n_rx_ready_byte_limit = - n_rx_zcopy_pkt_count = n_tx_ready_byte_count = 0; + n_rx_ready_pkt_count = n_rx_ready_byte_count = n_rx_zcopy_pkt_count = + n_tx_ready_byte_count = 0; memset(&counters, 0, sizeof(counters)); #ifdef DEFINED_UTLS tls_tx_offload = tls_rx_offload = false; @@ -287,7 +290,6 @@ typedef struct socket_stats_t { mc_grp_map.reset(); ring_user_id_rx = ring_user_id_tx = 0; ring_alloc_logic_rx = ring_alloc_logic_tx = RING_LOGIC_PER_INTERFACE; - padding1 = padding2 = 0; }; void set_bound_if(sock_addr &sock) @@ -312,10 +314,11 @@ typedef struct socket_stats_t { : bound_if(in6addr_any) , connected_ip(in6addr_any) , mc_tx_if(in6addr_any) + , _next_stat(nullptr) { reset(); }; -} socket_stats_t; +}; typedef struct { bool b_enabled; diff --git a/src/stats/stats_printer.cpp b/src/stats/stats_printer.cpp index aca16c25d..d7eade9d3 100644 --- a/src/stats/stats_printer.cpp +++ b/src/stats/stats_printer.cpp @@ -202,10 +202,9 @@ void print_full_stats(socket_stats_t *p_si_stats, mc_grp_info_t *p_mc_grp_info, b_any_activiy = true; } if (p_si_stats->counters.n_rx_data_pkts || p_si_stats->n_rx_ready_pkt_count) { - fprintf(filename, "Rx byte: cur %lu / max %u / dropped%s %u / limit %u\n", + fprintf(filename, "Rx byte: cur %lu / max %u / dropped%s %u\n", p_si_stats->n_rx_ready_byte_count, p_si_stats->counters.n_rx_ready_byte_max, - post_fix, p_si_stats->counters.n_rx_ready_byte_drop, - p_si_stats->n_rx_ready_byte_limit); + post_fix, p_si_stats->counters.n_rx_ready_byte_drop); fprintf(filename, "Rx pkt : cur %u / max %u / dropped%s %u\n", p_si_stats->n_rx_ready_pkt_count, p_si_stats->counters.n_rx_ready_pkt_max, 
post_fix, p_si_stats->counters.n_rx_ready_pkt_drop); diff --git a/src/stats/stats_publisher.cpp b/src/stats/stats_publisher.cpp index 02f671e56..9649694ff 100644 --- a/src/stats/stats_publisher.cpp +++ b/src/stats/stats_publisher.cpp @@ -181,7 +181,7 @@ void xlio_shmem_stats_open(vlog_levels_t **p_p_xlio_log_level, uint8_t **p_p_xli } BULLSEYE_EXCLUDE_BLOCK_END - shmem_size = SHMEM_STATS_SIZE(safe_mce_sys().stats_fd_num_max); + shmem_size = SHMEM_STATS_SIZE(safe_mce_sys().stats_fd_num_monitor); buf = malloc(shmem_size); if (buf == NULL) { goto shmem_error; @@ -266,11 +266,11 @@ void xlio_shmem_stats_open(vlog_levels_t **p_p_xlio_log_level, uint8_t **p_p_xli write_version_details_to_shmem(&g_sh_mem->ver_info); memcpy(g_sh_mem->stats_protocol_ver, STATS_PROTOCOL_VER, std::min(sizeof(g_sh_mem->stats_protocol_ver), sizeof(STATS_PROTOCOL_VER))); - g_sh_mem->max_skt_inst_num = safe_mce_sys().stats_fd_num_max; + g_sh_mem->max_skt_inst_num = safe_mce_sys().stats_fd_num_monitor; g_sh_mem->reader_counter = 0; __log_dbg("file '%s' fd %d shared memory at %p with %d max blocks", g_sh_mem_info.filename_sh_stats, g_sh_mem_info.fd_sh_stats, g_sh_mem_info.p_sh_stats, - safe_mce_sys().stats_fd_num_max); + safe_mce_sys().stats_fd_num_monitor); // Update the shmem initial log values g_sh_mem->log_level = **p_p_xlio_log_level; @@ -306,11 +306,11 @@ void xlio_shmem_stats_close() if (g_sh_mem_info.p_sh_stats && g_sh_mem_info.p_sh_stats != MAP_FAILED) { __log_dbg("file '%s' fd %d shared memory at %p with %d max blocks", g_sh_mem_info.filename_sh_stats, g_sh_mem_info.fd_sh_stats, - g_sh_mem_info.p_sh_stats, safe_mce_sys().stats_fd_num_max); + g_sh_mem_info.p_sh_stats, safe_mce_sys().stats_fd_num_monitor); BULLSEYE_EXCLUDE_BLOCK_START - if (munmap(g_sh_mem_info.p_sh_stats, SHMEM_STATS_SIZE(safe_mce_sys().stats_fd_num_max)) != - 0) { + if (munmap(g_sh_mem_info.p_sh_stats, + SHMEM_STATS_SIZE(safe_mce_sys().stats_fd_num_monitor)) != 0) { vlog_printf(VLOG_ERROR, "%s: file [%s] fd [%d] error while unmap shared memory at [%p]\n", __func__, g_sh_mem_info.filename_sh_stats, g_sh_mem_info.fd_sh_stats, @@ -351,7 +351,7 @@ void xlio_stats_instance_create_socket_block(socket_stats_t *local_stats_addr) goto out; } } - if (g_sh_mem->max_skt_inst_num + 1 < safe_mce_sys().stats_fd_num_max) { + if (g_sh_mem->max_skt_inst_num + 1 < safe_mce_sys().stats_fd_num_monitor) { // allocate next sh_mem block p_skt_stats = &g_sh_mem->skt_inst_arr[g_sh_mem->max_skt_inst_num].skt_stats; g_sh_mem->skt_inst_arr[g_sh_mem->max_skt_inst_num].b_enabled = true; @@ -360,8 +360,10 @@ void xlio_stats_instance_create_socket_block(socket_stats_t *local_stats_addr) } else { if (!printed_sock_limit_info) { printed_sock_limit_info = true; - vlog_printf(VLOG_INFO, "Statistics can monitor up to %d sockets - increase %s\n", - safe_mce_sys().stats_fd_num_max, SYS_VAR_STATS_FD_NUM); + if (safe_mce_sys().stats_fd_num_monitor < MAX_STATS_FD_NUM) { + vlog_printf(VLOG_INFO, "Statistics can monitor up to %d sockets - increase %s\n", + safe_mce_sys().stats_fd_num_monitor, SYS_VAR_STATS_FD_NUM); + } } goto out; } @@ -418,6 +420,10 @@ void xlio_stats_mc_group_add(const ip_address &mc_grp, socket_stats_t *p_socket_ int empty_entry = -1; int index_to_insert = -1; + if (!p_socket_stats) { + return; + } + g_lock_mc_info.lock(); for (int grp_idx = 0; grp_idx < g_sh_mem->mc_info.max_grp_num && index_to_insert == -1; grp_idx++) { @@ -451,6 +457,10 @@ void xlio_stats_mc_group_add(const ip_address &mc_grp, socket_stats_t *p_socket_ void xlio_stats_mc_group_remove(const ip_address 
&mc_grp, socket_stats_t *p_socket_stats) { + if (!p_socket_stats) { + return; + } + g_lock_mc_info.lock(); for (int grp_idx = 0; grp_idx < g_sh_mem->mc_info.max_grp_num; grp_idx++) { if (g_sh_mem->mc_info.mc_grp_tbl[grp_idx].sock_num && diff --git a/src/stats/stats_reader.cpp b/src/stats/stats_reader.cpp index d4991280b..0a8621c32 100644 --- a/src/stats/stats_reader.cpp +++ b/src/stats/stats_reader.cpp @@ -252,7 +252,6 @@ void update_delta_stat(socket_stats_t *p_curr_stat, socket_stats_t *p_prev_stat) (p_curr_stat->counters.n_rx_poll_hit - p_prev_stat->counters.n_rx_poll_hit) / delay; p_prev_stat->n_rx_ready_byte_count = p_curr_stat->n_rx_ready_byte_count; p_prev_stat->n_tx_ready_byte_count = p_curr_stat->n_tx_ready_byte_count; - p_prev_stat->n_rx_ready_byte_limit = p_curr_stat->n_rx_ready_byte_limit; p_prev_stat->counters.n_rx_ready_byte_max = p_curr_stat->counters.n_rx_ready_byte_max; p_prev_stat->counters.n_rx_ready_byte_drop = (p_curr_stat->counters.n_rx_ready_byte_drop - p_prev_stat->counters.n_rx_ready_byte_drop) / From 38f8cd037cd210d10de3502231d65e7941b37e5a Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 11 Mar 2024 15:14:58 +0000 Subject: [PATCH 118/169] issue: 3777348 Reordering sockinfo members Reordering the sockinfo fields from random locations to group data path frequently accessed fields together. Placing frequently fields at the beggining of the object allows these members to be fetched with the cache line that fetches the vptr. Signed-off-by: Alexander Grissik --- src/core/dev/ring_slave.cpp | 4 +- src/core/sock/sock-redirect.cpp | 6 +- src/core/sock/sockinfo.cpp | 37 +------- src/core/sock/sockinfo.h | 162 +++++++++++++++++--------------- src/core/sock/sockinfo_ulp.cpp | 30 +++--- 5 files changed, 110 insertions(+), 129 deletions(-) diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index 1dded5607..bf0019e6b 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -347,7 +347,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, BULLSEYE_EXCLUDE_BLOCK_END p_rfs = p_tmp_rfs; - si->rfs_ptr = p_rfs; + si->set_rfs_ptr(p_rfs); #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) if (g_p_app->type == APP_NONE || !g_p_app->add_second_4t_rule) #endif @@ -666,7 +666,7 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd p_tcp_h->fin ? 
"F" : "", ntohl(p_tcp_h->seq), ntohl(p_tcp_h->ack_seq), ntohs(p_tcp_h->window), p_rx_wc_buf_desc->rx.sz_payload); - return si->rfs_ptr->rx_dispatch_packet(p_rx_wc_buf_desc, pv_fd_ready_array); + return si->get_rfs_ptr()->rx_dispatch_packet(p_rx_wc_buf_desc, pv_fd_ready_array); } if (likely(protocol == IPPROTO_UDP)) { diff --git a/src/core/sock/sock-redirect.cpp b/src/core/sock/sock-redirect.cpp index bb9341d42..16ead07fa 100644 --- a/src/core/sock/sock-redirect.cpp +++ b/src/core/sock/sock-redirect.cpp @@ -432,7 +432,7 @@ static ssize_t sendfile_helper(sockinfo *p_socket_object, int in_fd, __off64_t * mapping->put(); rc = fstat(in_fd, &st_buf); if ((rc == 0) && (st_buf.st_size >= (off_t)(cur_offset + count))) { - s->m_p_socket_stats->counters.n_tx_sendfile_overflows++; + s->get_sock_stats()->counters.n_tx_sendfile_overflows++; goto fallback; } else { errno = EOVERFLOW; @@ -455,7 +455,7 @@ static ssize_t sendfile_helper(sockinfo *p_socket_object, int in_fd, __off64_t * fallback: /* Fallback to readv() implementation */ if (totSent == 0) { - s->m_p_socket_stats->counters.n_tx_sendfile_fallbacks++; + s->get_sock_stats()->counters.n_tx_sendfile_fallbacks++; tx_arg.clear(); tx_arg.opcode = TX_FILE; tx_arg.attr.iov = piov; @@ -526,7 +526,7 @@ static ssize_t sendfile_helper(sockinfo *p_socket_object, int in_fd, __off64_t * char buf[sysconf(_SC_PAGE_SIZE)]; ssize_t toRead, numRead, numSent = 0; - s->m_p_socket_stats->counters.n_tx_sendfile_fallbacks++; + s->get_sock_stats()->counters.n_tx_sendfile_fallbacks++; while (count > 0) { toRead = min(sizeof(buf), count); diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index c71c13f2e..2c58a9ab6 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -99,49 +99,23 @@ const char *sockinfo::setsockopt_so_opt_to_str(int opt) } sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) - : m_epoll_event_flags(0) - , m_fd(fd) + : m_fd(fd) + , m_fd_context((void *)((uintptr_t)m_fd)) + , m_n_sysvar_rx_num_buffs_reuse(safe_mce_sys().rx_bufs_batch) + , m_n_sysvar_rx_poll_num(safe_mce_sys().rx_poll_num) , m_n_sysvar_select_poll_os_ratio(safe_mce_sys().select_poll_os_ratio) - , m_econtext(NULL) - , m_reuseaddr(false) - , m_reuseport(false) - , m_flow_tag_enabled(false) - , m_b_blocking(true) - , m_b_pktinfo(false) - , m_b_rcvtstamp(false) - , m_b_rcvtstampns(false) - , m_b_zc(false) + , m_rx_cq_wait_ctrl(safe_mce_sys().rx_cq_wait_ctrl) , m_skip_cq_poll_in_rx(safe_mce_sys().skip_poll_in_rx == SKIP_POLL_IN_RX_ENABLE) - , m_n_tsing_flags(0) - , m_protocol(PROTO_UNDEFINED) - , m_src_sel_flags(0U) , m_lock_rcv(MULTILOCK_RECURSIVE, MODULE_NAME "::m_lock_rcv") , m_lock_snd(MODULE_NAME "::m_lock_snd") - , m_state(SOCKINFO_OPENED) , m_family(domain) - , m_p_connected_dst_entry(nullptr) , m_so_bindtodevice_ip(ip_address::any_addr(), domain) - , m_p_rx_ring(nullptr) - , m_rx_reuse_buf_pending(false) - , m_rx_reuse_buf_postponed(false) , m_rx_ring_map_lock(MODULE_NAME "::m_rx_ring_map_lock") - , m_n_rx_pkt_ready_list_count(0) - , m_rx_pkt_ready_offset(0) - , m_rx_ready_byte_count(0) - , m_n_sysvar_rx_num_buffs_reuse(safe_mce_sys().rx_bufs_batch) - , m_n_sysvar_rx_poll_num(safe_mce_sys().rx_poll_num) , m_ring_alloc_log_rx(safe_mce_sys().ring_allocation_logic_rx, use_ring_locks) , m_ring_alloc_log_tx(safe_mce_sys().ring_allocation_logic_tx, use_ring_locks) - , m_pcp(0) - , m_rx_callback(nullptr) - , m_rx_callback_context(nullptr) - , m_fd_context((void *)((uintptr_t)m_fd)) - , m_flow_tag_id(0) - , 
m_rx_cq_wait_ctrl(safe_mce_sys().rx_cq_wait_ctrl) , m_n_uc_ttl_hop_lim(m_family == AF_INET ? safe_mce_sys().sysctl_reader.get_net_ipv4_ttl() : safe_mce_sys().sysctl_reader.get_net_ipv6_hop_limit()) - , m_bind_no_port(false) , m_is_ipv6only(safe_mce_sys().sysctl_reader.get_ipv6_bindv6only()) { m_rx_epfd = SYSCALL(epoll_create, 128); @@ -163,7 +137,6 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) set_flow_tag(m_fd + 1); atomic_set(&m_zckey, 0); - m_last_zcdesc = nullptr; m_socketxtreme.ec_cache.clear(); struct ring_ec ec; diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index c29787e07..98e781435 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -238,7 +238,7 @@ class epfd_info; class sockinfo { public: - enum sockinfo_state { + enum sockinfo_state : uint16_t { SOCKINFO_UNDEFINED, SOCKINFO_OPENED, SOCKINFO_CLOSING, @@ -342,6 +342,9 @@ class sockinfo { bool is_shadow_socket_present() { return m_fd >= 0 && m_fd != m_rx_epfd; } uint32_t get_flow_tag_val() { return m_flow_tag_id; } in_protocol_t get_protocol(void) { return m_protocol; } + socket_stats_t *get_sock_stats() const { return m_p_socket_stats; } + rfs *get_rfs_ptr() const { return m_rfs_ptr; } + void set_rfs_ptr(rfs *r) { m_rfs_ptr = r; } void destructor_helper(); int get_rings_fds(int *ring_fds, int ring_fds_sz); int get_rings_num(); @@ -463,121 +466,123 @@ class sockinfo { int fcntl_helper(int __cmd, unsigned long int __arg, bool &bexit); bool attach_as_uc_receiver_anyip(sa_family_t family, role_t role, bool skip_rules); -public: - socket_stats_t *m_p_socket_stats = nullptr; +protected: /* Last memory descriptor with zcopy operation method */ - mem_buf_desc_t *m_last_zcdesc; + dst_entry *m_p_connected_dst_entry = nullptr; + ring *m_p_rx_ring = nullptr; // used in TCP/UDP + epfd_info *m_econtext = nullptr; + socket_stats_t *m_p_socket_stats = nullptr; + mem_buf_desc_t *m_last_zcdesc = nullptr; + sockinfo_state m_state = SOCKINFO_OPENED; // socket current state + uint8_t m_n_tsing_flags = 0U; + bool m_has_stats = false; + bool m_flow_tag_enabled = false; // for this socket + bool m_b_rcvtstamp = false; + bool m_b_zc = false; + bool m_b_blocking = true; + + /* TX zcopy counter + * The notification itself for tx zcopy operation is a simple scalar value. + * Each socket maintains an internal unsigned 32-bit counter. + * Each send call with MSG_ZEROCOPY that successfully sends data increments + * the counter. The counter is not incremented on failure or if called with + * length zero. + * The counter counts system call invocations, not bytes. + * It wraps after UINT_MAX calls. + */ + atomic_t m_zckey; + + int m_fd; // identification information + + // End of first cache line + + /* Socket error queue that keeps local errors and internal data required + * to provide notification ability. + */ + descq_t m_error_queue; + lock_spin m_error_queue_lock; + void *m_fd_context; // Context data stored with socket + + // End of second cache line + + wakeup_pipe m_sock_wakeup_pipe; + rfs *m_rfs_ptr = nullptr; struct { /* Use std::deque in current design as far as it allows pushing * elements on either end without moving around any other element * but trade this for slightly worse iteration speeds. 
*/ - std::deque ec_cache; struct ring_ec *ec; + std::deque ec_cache; } m_socketxtreme; - rfs *rfs_ptr = nullptr; - -#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - bool m_is_for_socket_pool = false; // true when this fd will be used for socket pool on close - int m_back_log = 0; -#endif - - list_node pendig_to_remove_node; + // End of fourth cache line +public: list_node socket_fd_list_node; list_node ep_ready_fd_node; - uint32_t m_epoll_event_flags; list_node ep_info_fd_node; + list_node pendig_to_remove_node; epoll_fd_rec m_fd_rec; + uint32_t m_epoll_event_flags = 0U; protected: - - int m_fd; // identification information + int m_rx_epfd; + /** + * list of pending ready packet on the Rx, + * each element is a pointer to the ib_conn_mgr that holds this ready rx datagram + */ + size_t m_rx_pkt_ready_offset = 0U; + size_t m_rx_ready_byte_count = 0U; + buff_info_t m_rx_reuse_buff; // used in TCP instead of m_rx_ring_map + int m_n_rx_pkt_ready_list_count = 0; + int m_n_sysvar_rx_num_buffs_reuse; + const int32_t m_n_sysvar_rx_poll_num; const uint32_t m_n_sysvar_select_poll_os_ratio; - epfd_info *m_econtext; - bool m_has_stats = false; - bool m_reuseaddr; // to track setsockopt with SO_REUSEADDR - bool m_reuseport; // to track setsockopt with SO_REUSEPORT - bool m_flow_tag_enabled; // for this socket - bool m_b_blocking; - bool m_b_pktinfo; - bool m_b_rcvtstamp; - bool m_b_rcvtstampns; - bool m_b_zc; + // used to periodically return buffers, even if threshold was not reached + bool m_rx_reuse_buf_pending = false; + // used to mark threshold was reached, but free was not done yet + bool m_rx_reuse_buf_postponed = false; + bool m_rx_cq_wait_ctrl; bool m_skip_cq_poll_in_rx; - uint8_t m_n_tsing_flags; - in_protocol_t m_protocol; - uint8_t m_src_sel_flags; + bool m_reuseaddr = false; // to track setsockopt with SO_REUSEADDR + bool m_reuseport = false; // to track setsockopt with SO_REUSEPORT + bool m_b_pktinfo = false; + bool m_b_rcvtstampns = false; multilock m_lock_rcv; lock_mutex m_lock_snd; lock_mutex m_rx_migration_lock; - sockinfo_state m_state; // socket current state + uint32_t m_flow_tag_id = 0U; // Flow Tag for this socket + in_protocol_t m_protocol = PROTO_UNDEFINED; sa_family_t m_family; sock_addr m_bound; sock_addr m_connected; - wakeup_pipe m_sock_wakeup_pipe; - dst_entry *m_p_connected_dst_entry; ip_addr m_so_bindtodevice_ip; - - int m_rx_epfd; cache_observer m_rx_nd_observer; rx_net_device_map_t m_rx_nd_map; rx_flow_map_t m_rx_flow_map; - // we either listen on ALL system cqs or bound to the specific cq - ring *m_p_rx_ring; // used in TCP/UDP - buff_info_t m_rx_reuse_buff; // used in TCP instead of m_rx_ring_map - bool m_rx_reuse_buf_pending; // used to periodically return buffers, even if threshold was not - // reached - bool m_rx_reuse_buf_postponed; // used to mark threshold was reached, but free was not done yet rx_ring_map_t m_rx_ring_map; // CQ map lock_mutex_recursive m_rx_ring_map_lock; ring_allocation_logic_rx m_ring_alloc_logic_rx; - loops_timer m_loops_timer; - - /** - * list of pending ready packet on the Rx, - * each element is a pointer to the ib_conn_mgr that holds this ready rx datagram - */ - int m_n_rx_pkt_ready_list_count; - size_t m_rx_pkt_ready_offset; - size_t m_rx_ready_byte_count; - - int m_n_sysvar_rx_num_buffs_reuse; - const int32_t m_n_sysvar_rx_poll_num; ring_alloc_logic_attr m_ring_alloc_log_rx; ring_alloc_logic_attr m_ring_alloc_log_tx; - uint32_t m_pcp; - - /* Socket error queue that keeps local errors and internal data required - * to provide 
notification ability. - */ - descq_t m_error_queue; - lock_spin m_error_queue_lock; - - /* TX zcopy counter - * The notification itself for tx zcopy operation is a simple scalar value. - * Each socket maintains an internal unsigned 32-bit counter. - * Each send call with MSG_ZEROCOPY that successfully sends data increments - * the counter. The counter is not incremented on failure or if called with - * length zero. - * The counter counts system call invocations, not bytes. - * It wraps after UINT_MAX calls. - */ - atomic_t m_zckey; - // Callback function pointer to support VMA extra API (xlio_extra.h) - xlio_recv_callback_t m_rx_callback; - void *m_rx_callback_context; // user context + xlio_recv_callback_t m_rx_callback = nullptr; + void *m_rx_callback_context = nullptr; // user context struct xlio_rate_limit_t m_so_ratelimit; - void *m_fd_context; // Context data stored with socket - uint32_t m_flow_tag_id; // Flow Tag for this socket - bool m_rx_cq_wait_ctrl; + uint32_t m_pcp = 0U; uint8_t m_n_uc_ttl_hop_lim; - bool m_bind_no_port; + uint8_t m_src_sel_flags = 0U; + bool m_bind_no_port = false; bool m_is_ipv6only; + +public: +#if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) + int m_back_log = 0; + bool m_is_for_socket_pool = false; // true when this fd will be used for socket pool on close +#endif }; void sockinfo::set_rx_reuse_pending(bool is_pending) @@ -654,7 +659,8 @@ void sockinfo::save_strq_stats(uint32_t packet_strides) { if (unlikely(has_stats())) { m_p_socket_stats->counters.n_rx_packets++; - m_p_socket_stats->strq_counters.n_strq_total_strides += static_cast(packet_strides); + m_p_socket_stats->strq_counters.n_strq_total_strides += + static_cast(packet_strides); m_p_socket_stats->strq_counters.n_strq_max_strides_per_packet = std::max(m_p_socket_stats->strq_counters.n_strq_max_strides_per_packet, packet_strides); } diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp index aaa599362..ef449939b 100644 --- a/src/core/sock/sockinfo_ulp.cpp +++ b/src/core/sock/sockinfo_ulp.cpp @@ -603,7 +603,7 @@ int sockinfo_tcp_ops_tls::setsockopt(int __level, int __optname, const void *__o return -1; } m_is_tls_tx = true; - m_p_sock->m_p_socket_stats->tls_tx_offload = true; + m_p_sock->get_sock_stats()->tls_tx_offload = true; } else { m_p_cipher_ctx = (void *)g_tls_api->EVP_CIPHER_CTX_new(); if (unlikely(!m_p_cipher_ctx)) { @@ -656,12 +656,12 @@ int sockinfo_tcp_ops_tls::setsockopt(int __level, int __optname, const void *__o } tcp_recv(m_p_sock->get_pcb(), sockinfo_tcp_ops_tls::rx_lwip_cb); - m_p_sock->m_p_socket_stats->tls_rx_offload = true; + m_p_sock->get_sock_stats()->tls_rx_offload = true; m_p_sock->unlock_tcp_con(); } - m_p_sock->m_p_socket_stats->tls_version = base_info->version; - m_p_sock->m_p_socket_stats->tls_cipher = base_info->cipher_type; + m_p_sock->get_sock_stats()->tls_version = base_info->version; + m_p_sock->get_sock_stats()->tls_cipher = base_info->cipher_type; si_ulp_logdbg("TLS%s %s offload is configured, keylen=%u", base_info->version == TLS_1_2_VERSION ? 
"1.2" : "1.3", @@ -858,8 +858,9 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg) if (ret > 0) { errno = errno_save; if (unlikely(m_p_sock->has_stats())) { - m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_records += m_next_recno_tx - last_recno; - m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_bytes += ret; + m_p_sock->get_sock_stats()->tls_counters.n_tls_tx_records += + m_next_recno_tx - last_recno; + m_p_sock->get_sock_stats()->tls_counters.n_tls_tx_bytes += ret; } } return ret; @@ -975,8 +976,8 @@ int sockinfo_tcp_ops_tls::postrouting(struct pbuf *p, struct tcp_seg *seg, xlio_ m_expected_seqno = seg->seqno; /* Statistics */ - ++m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_resync; - m_p_sock->m_p_socket_stats->tls_counters.n_tls_tx_resync_replay += + ++m_p_sock->get_sock_stats()->tls_counters.n_tls_tx_resync; + m_p_sock->get_sock_stats()->tls_counters.n_tls_tx_resync_replay += (seg->seqno != rec->m_seqno); } m_expected_seqno += seg->len; @@ -1298,7 +1299,7 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) memset(m_rx_psv_buf->lwip_pbuf.payload, 0, 64); m_rx_resync_recno = m_next_recno_rx; m_p_tx_ring->tls_get_progress_params_rx(m_p_tir, payload, LKEY_TX_DEFAULT); - ++m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_resync; + ++m_p_sock->get_sock_stats()->tls_counters.n_tls_rx_resync; } } @@ -1444,8 +1445,9 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) /* Statistics */ if (unlikely(m_p_sock->has_stats())) { - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records_enc += !!(decrypted_nr == 0); - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records_partial += !!(decrypted_nr != 0); + m_p_sock->get_sock_stats()->tls_counters.n_tls_rx_records_enc += !!(decrypted_nr == 0); + m_p_sock->get_sock_stats()->tls_counters.n_tls_rx_records_partial += + !!(decrypted_nr != 0); } } @@ -1479,10 +1481,10 @@ err_t sockinfo_tcp_ops_tls::recv(struct pbuf *p) /* Statistics */ if (unlikely(m_p_sock->has_stats())) { - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_records += 1U; - m_p_sock->m_p_socket_stats->tls_counters.n_tls_rx_bytes += likely(pres) ? pres->tot_len : 0; + m_p_sock->get_sock_stats()->tls_counters.n_tls_rx_records += 1U; + m_p_sock->get_sock_stats()->tls_counters.n_tls_rx_bytes += likely(pres) ? pres->tot_len : 0; /* Adjust TCP counters with received TLS header/trailer. */ - m_p_sock->m_p_socket_stats->counters.n_rx_bytes += m_tls_rec_overhead; + m_p_sock->get_sock_stats()->counters.n_rx_bytes += m_tls_rec_overhead; } ++m_next_recno_rx; From 06a7917934c9c83ca56740b3a5dd5f4aa136c23f Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 13 Mar 2024 07:49:56 +0000 Subject: [PATCH 119/169] issue: 3777348 Removing m_flow_tag_enabled check The check for m_flow_tag_enabled is redundant as we already got flow-tag packet and know the socket. In worst case the flow_tag_id can be checked. Avoid unneeded memory access for critical data path. 
Signed-off-by: Alexander Grissik --- src/core/dev/ring_slave.cpp | 2 +- src/core/sock/sockinfo.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index bf0019e6b..54bb5cc2e 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -600,7 +600,7 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd si = static_cast( g_p_fd_collection->get_sockfd(p_rx_wc_buf_desc->rx.flow_tag_id - 1)); - if (likely((si) && si->flow_tag_enabled())) { + if (likely(si)) { // will process packets with set flow_tag_id and enabled for the socket if (p_eth_h->h_proto == NET_ETH_P_8021Q) { // Handle VLAN header as next protocol diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 98e781435..1bba3db3b 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -335,7 +335,6 @@ class sockinfo { sa_family_t get_family() { return m_family; } bool get_reuseaddr(void) { return m_reuseaddr; } bool get_reuseport(void) { return m_reuseport; } - bool flow_tag_enabled(void) { return m_flow_tag_enabled; } int get_rx_epfd(void) { return m_rx_epfd; } bool is_blocking(void) { return m_b_blocking; } bool flow_in_reuse(void) { return m_reuseaddr | m_reuseport; } @@ -476,7 +475,6 @@ class sockinfo { sockinfo_state m_state = SOCKINFO_OPENED; // socket current state uint8_t m_n_tsing_flags = 0U; bool m_has_stats = false; - bool m_flow_tag_enabled = false; // for this socket bool m_b_rcvtstamp = false; bool m_b_zc = false; bool m_b_blocking = true; @@ -594,7 +592,6 @@ bool sockinfo::set_flow_tag(uint32_t flow_tag_id) { if (flow_tag_id && (flow_tag_id != FLOW_TAG_MASK)) { m_flow_tag_id = flow_tag_id; - m_flow_tag_enabled = true; return true; } m_flow_tag_id = FLOW_TAG_MASK; From 1949822ef9de976bb051d42e067267db2dba7b16 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 13 Mar 2024 07:59:03 +0000 Subject: [PATCH 120/169] issue: 3777348 Remove support for SO_XLIO_FLOW_TAG This support was added long ago. However, there are no more users for this option. 
Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo.cpp | 38 -------------------------------------- src/core/xlio_extra.h | 1 - 2 files changed, 39 deletions(-) diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 2c58a9ab6..163bf5e86 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -84,8 +84,6 @@ const char *sockinfo::setsockopt_so_opt_to_str(int opt) return "SO_XLIO_RING_ALLOC_LOGIC"; case SO_MAX_PACING_RATE: return "SO_MAX_PACING_RATE"; - case SO_XLIO_FLOW_TAG: - return "SO_XLIO_FLOW_TAG"; case SO_XLIO_SHUTDOWN_RX: return "SO_XLIO_SHUTDOWN_RX"; case IPV6_V6ONLY: @@ -442,34 +440,6 @@ int sockinfo::setsockopt(int __level, int __optname, const void *__optval, sockl errno = EINVAL; } break; - case SO_XLIO_FLOW_TAG: - if (__optval) { - if (__optlen == sizeof(uint32_t)) { - if (set_flow_tag(*(uint32_t *)__optval)) { - si_logdbg("SO_XLIO_FLOW_TAG, set " - "socket fd: %d to flow id: %d", - m_fd, m_flow_tag_id); - // not supported in OS - ret = SOCKOPT_INTERNAL_XLIO_SUPPORT; - } else { - ret = SOCKOPT_NO_XLIO_SUPPORT; - errno = EINVAL; - } - } else { - ret = SOCKOPT_NO_XLIO_SUPPORT; - errno = EINVAL; - si_logdbg("SO_XLIO_FLOW_TAG, bad length " - "expected %zu got %d", - sizeof(uint32_t), __optlen); - break; - } - } else { - ret = SOCKOPT_NO_XLIO_SUPPORT; - errno = EINVAL; - si_logdbg("SO_XLIO_FLOW_TAG - NOT HANDLED, " - "optval == NULL"); - } - break; case SO_REUSEADDR: if (__optval && __optlen == sizeof(int)) { @@ -756,14 +726,6 @@ int sockinfo::getsockopt(int __level, int __optname, void *__optval, socklen_t * errno = EINVAL; } break; - case SO_XLIO_FLOW_TAG: - if (*__optlen >= sizeof(uint32_t)) { - *(uint32_t *)__optval = m_flow_tag_id; - ret = 0; - } else { - errno = EINVAL; - } - break; case SO_MAX_PACING_RATE: if (*__optlen == sizeof(struct xlio_rate_limit_t)) { *(struct xlio_rate_limit_t *)__optval = m_so_ratelimit; diff --git a/src/core/xlio_extra.h b/src/core/xlio_extra.h index e1b64f0ce..780d56d64 100644 --- a/src/core/xlio_extra.h +++ b/src/core/xlio_extra.h @@ -50,7 +50,6 @@ #define SO_XLIO_GET_API 2800 #define SO_XLIO_USER_DATA 2801 #define SO_XLIO_RING_ALLOC_LOGIC 2810 -#define SO_XLIO_FLOW_TAG 2820 #define SO_XLIO_SHUTDOWN_RX 2821 #define SO_XLIO_PD 2822 #define SCM_XLIO_PD SO_XLIO_PD From 28ded23fd981e94dae7ef42bc44c3a40e5789b0b Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 13 Mar 2024 09:05:39 +0000 Subject: [PATCH 121/169] issue: 3777348 Avoid process_timestamps checking on each packet Checking if timestamps are needed per each packet consumes more cpu cycles than checking once and then running another loop is better for most cases when timestamps are not required. wq Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo_tcp.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index dc45b7c0a..eb4f878ed 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -2113,7 +2113,15 @@ inline void sockinfo_tcp::rx_lwip_process_chained_pbufs(pbuf *p) p_curr_desc->rx.frag.iov_base = p->payload; p_curr_desc->rx.frag.iov_len = p->len; p_curr_desc->p_next_desc = reinterpret_cast(p->next); - process_timestamps(p_curr_desc); + } + + // To avoid redundant checking for every packet a seperate loop runs + // only in case timestamps are needed. 
+ if (m_b_rcvtstamp || m_n_tsing_flags) { + for (auto *p_curr_desc = p_first_desc; p_curr_desc; + p_curr_desc = p_curr_desc->p_next_desc) { + process_timestamps(p_curr_desc); + } } p_first_desc->set_ref_count(head_ref); From 0d975c0f7e559139f28f3f1ff18ffe229a4d1f0a Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 13 Mar 2024 09:22:56 +0000 Subject: [PATCH 122/169] issue: 3777348 Remove precached sysvars from sockinfo Precaching constant sysvar inside socket enlarges the socket and requires more cache lines. Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo.cpp | 22 +++++++++++++--------- src/core/sock/sockinfo.h | 17 +++++------------ src/core/sock/sockinfo_tcp.cpp | 8 ++++---- src/core/sock/sockinfo_tcp.h | 4 ++-- src/core/sock/sockinfo_udp.cpp | 6 +++--- src/core/sock/sockinfo_udp.h | 4 ++-- 6 files changed, 29 insertions(+), 32 deletions(-) diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 163bf5e86..2d68594fc 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -99,10 +99,7 @@ const char *sockinfo::setsockopt_so_opt_to_str(int opt) sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) : m_fd(fd) , m_fd_context((void *)((uintptr_t)m_fd)) - , m_n_sysvar_rx_num_buffs_reuse(safe_mce_sys().rx_bufs_batch) - , m_n_sysvar_rx_poll_num(safe_mce_sys().rx_poll_num) - , m_n_sysvar_select_poll_os_ratio(safe_mce_sys().select_poll_os_ratio) - , m_rx_cq_wait_ctrl(safe_mce_sys().rx_cq_wait_ctrl) + , m_rx_num_buffs_reuse(safe_mce_sys().rx_bufs_batch) , m_skip_cq_poll_in_rx(safe_mce_sys().skip_poll_in_rx == SKIP_POLL_IN_RX_ENABLE) , m_lock_rcv(MULTILOCK_RECURSIVE, MODULE_NAME "::m_lock_rcv") , m_lock_snd(MODULE_NAME "::m_lock_snd") @@ -1524,7 +1521,7 @@ void sockinfo::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) // Sleep on different CQs and OS listen socket int sockinfo::os_wait_sock_rx_epfd(epoll_event *ep_events, int maxevents) { - if (unlikely(m_rx_cq_wait_ctrl)) { + if (unlikely(safe_mce_sys().rx_cq_wait_ctrl)) { add_cqfd_to_sock_rx_epfd(m_p_rx_ring); int ret = SYSCALL(epoll_wait, m_rx_epfd, ep_events, maxevents, m_loops_timer.time_left_msec()); @@ -1611,10 +1608,10 @@ void sockinfo::rx_add_ring_cb(ring *p_ring) // each event on the cq-fd. This causes high latency and increased CPU usage by the Kernel // which leads to decreased performance. For example, for 350K connections and a single // ring. there will be 350K epfds watching a single cq-fd. When this cq-fd has an event, the - // Kernel loops through all the 350K epfds. By setting m_rx_cq_wait_ctrl=true, we add the - // cq-fd only to the epfds of the sockets that are going to sleep inside + // Kernel loops through all the 350K epfds. By setting safe_mce_sys().rx_cq_wait_ctrl=true, + // we add the cq-fd only to the epfds of the sockets that are going to sleep inside // sockinfo_tcp::rx_wait_helper/sockinfo_udp::rx_wait. - if (!m_rx_cq_wait_ctrl) { + if (!safe_mce_sys().rx_cq_wait_ctrl) { add_cqfd_to_sock_rx_epfd(p_ring); } @@ -1680,7 +1677,7 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) p_ring_info->rx_reuse_info.rx_reuse.size()); } - if (!m_rx_cq_wait_ctrl) { + if (!safe_mce_sys().rx_cq_wait_ctrl) { remove_cqfd_from_sock_rx_epfd(base_ring); } @@ -2345,3 +2342,10 @@ int sockinfo::handle_exception_flow() } return 0; } + +bool sockinfo::skip_os_select() +{ + // If safe_mce_sys().select_poll_os_ratio == 0, it means that user configured XLIO not to poll + // os (i.e. TRUE...) 
+ return (!safe_mce_sys().select_poll_os_ratio); +} diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 1bba3db3b..d633c327c 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -320,11 +320,7 @@ class sockinfo { // This prepares the socket for termination and return true if the // Return val: true is the socket is already closable and false otherwise virtual bool prepare_to_close(bool process_shutdown = false) = 0; - - // true if fd must be skipped from OS select() - // If m_n_sysvar_select_poll_os_ratio == 0, it means that user configured XLIO not to poll os - // (i.e. TRUE...) - virtual bool skip_os_select() { return (!m_n_sysvar_select_poll_os_ratio); }; + virtual bool skip_os_select(); // true if fd must be skipped from OS select() inline bool set_flow_tag(uint32_t flow_tag_id); inline void sock_pop_descs_rx_ready(descq_t *cache); @@ -367,7 +363,7 @@ class sockinfo { #if defined(DEFINED_NGINX) virtual void prepare_to_close_socket_pool(bool _push_pop) { NOT_IN_USE(_push_pop); } virtual void set_params_for_socket_pool() {}; - void set_m_n_sysvar_rx_num_buffs_reuse(int val) { m_n_sysvar_rx_num_buffs_reuse = val; } + void set_rx_num_buffs_reuse(int val) { m_rx_num_buffs_reuse = val; } #endif #endif protected: @@ -533,14 +529,11 @@ class sockinfo { size_t m_rx_ready_byte_count = 0U; buff_info_t m_rx_reuse_buff; // used in TCP instead of m_rx_ring_map int m_n_rx_pkt_ready_list_count = 0; - int m_n_sysvar_rx_num_buffs_reuse; - const int32_t m_n_sysvar_rx_poll_num; - const uint32_t m_n_sysvar_select_poll_os_ratio; + int m_rx_num_buffs_reuse; // used to periodically return buffers, even if threshold was not reached bool m_rx_reuse_buf_pending = false; // used to mark threshold was reached, but free was not done yet bool m_rx_reuse_buf_postponed = false; - bool m_rx_cq_wait_ctrl; bool m_skip_cq_poll_in_rx; bool m_reuseaddr = false; // to track setsockopt with SO_REUSEADDR bool m_reuseport = false; // to track setsockopt with SO_REUSEPORT @@ -767,10 +760,10 @@ void sockinfo::reuse_buffer(mem_buf_desc_t *buff) int &n_buff_num = iter->second->rx_reuse_info.n_buff_num; rx_reuse->push_back(buff); n_buff_num += buff->rx.n_frags; - if (n_buff_num < m_n_sysvar_rx_num_buffs_reuse) { + if (n_buff_num < m_rx_num_buffs_reuse) { return; } - if (n_buff_num >= 2 * m_n_sysvar_rx_num_buffs_reuse) { + if (n_buff_num >= 2 * m_rx_num_buffs_reuse) { if (p_ring->reclaim_recv_buffers(rx_reuse)) { n_buff_num = 0; } else { diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index eb4f878ed..63c2c48fb 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -262,10 +262,10 @@ inline void sockinfo_tcp::reuse_buffer(mem_buf_desc_t *buff) if (likely(m_p_rx_ring)) { m_rx_reuse_buff.n_buff_num += buff->rx.n_frags; m_rx_reuse_buff.rx_reuse.push_back(buff); - if (m_rx_reuse_buff.n_buff_num < m_n_sysvar_rx_num_buffs_reuse) { + if (m_rx_reuse_buff.n_buff_num < m_rx_num_buffs_reuse) { return; } - if (m_rx_reuse_buff.n_buff_num >= 2 * m_n_sysvar_rx_num_buffs_reuse) { + if (m_rx_reuse_buff.n_buff_num >= 2 * m_rx_num_buffs_reuse) { if (m_p_rx_ring->reclaim_recv_buffers(&m_rx_reuse_buff.rx_reuse)) { m_rx_reuse_buff.n_buff_num = 0; } else { @@ -3118,7 +3118,7 @@ int sockinfo_tcp::accept_helper(struct sockaddr *__addr, socklen_t *__addrlen, { sockinfo_tcp *ns; // todo do one CQ poll and go to sleep even if infinite polling was set - int poll_count = m_n_sysvar_rx_poll_num; // do one poll and go to sleep (if blocking) + int poll_count = 
safe_mce_sys().rx_poll_num; // do one poll and go to sleep (if blocking) int ret; si_tcp_logfuncall(""); @@ -5172,7 +5172,7 @@ int sockinfo_tcp::rx_wait_helper(int &poll_count, bool blocking) return -1; } - if (poll_count < m_n_sysvar_rx_poll_num || m_n_sysvar_rx_poll_num == -1) { + if (poll_count < safe_mce_sys().rx_poll_num || safe_mce_sys().rx_poll_num == -1) { return 0; } diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index f416e164c..7d64b8568 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -533,7 +533,7 @@ class sockinfo_tcp : public sockinfo { m_rx_reuse_buf_postponed = false; if (m_p_rx_ring) { - if (m_rx_reuse_buff.n_buff_num >= m_n_sysvar_rx_num_buffs_reuse) { + if (m_rx_reuse_buff.n_buff_num >= m_rx_num_buffs_reuse) { if (m_p_rx_ring->reclaim_recv_buffers(&m_rx_reuse_buff.rx_reuse)) { m_rx_reuse_buff.n_buff_num = 0; } else { @@ -545,7 +545,7 @@ class sockinfo_tcp : public sockinfo { while (iter != m_rx_ring_map.end()) { descq_t *rx_reuse = &iter->second->rx_reuse_info.rx_reuse; int &n_buff_num = iter->second->rx_reuse_info.n_buff_num; - if (n_buff_num >= m_n_sysvar_rx_num_buffs_reuse) { + if (n_buff_num >= m_rx_num_buffs_reuse) { if (iter->first->reclaim_recv_buffers(rx_reuse)) { n_buff_num = 0; } else { diff --git a/src/core/sock/sockinfo_udp.cpp b/src/core/sock/sockinfo_udp.cpp index 60c69de3e..f9ee35b7b 100644 --- a/src/core/sock/sockinfo_udp.cpp +++ b/src/core/sock/sockinfo_udp.cpp @@ -150,7 +150,7 @@ inline int sockinfo_udp::rx_wait(bool blocking) } loops++; - if (!blocking || m_n_sysvar_rx_poll_num != -1) { + if (!blocking || safe_mce_sys().rx_poll_num != -1) { loops_to_go--; } if (m_loops_timer.is_timeout()) { @@ -2571,7 +2571,7 @@ void sockinfo_udp::rx_add_ring_cb(ring *p_ring) // Now that we got at least 1 CQ attached start polling the CQs if (m_b_blocking) { - m_loops_to_go = m_n_sysvar_rx_poll_num; + m_loops_to_go = safe_mce_sys().rx_poll_num; } else { m_loops_to_go = 1; // Force single CQ poll in case of non-blocking socket } @@ -2601,7 +2601,7 @@ void sockinfo_udp::set_blocking(bool is_blocked) // Set the high CQ polling RX_POLL value // depending on where we have mapped offloaded MC gorups if (m_rx_ring_map.size() > 0) { - m_loops_to_go = m_n_sysvar_rx_poll_num; + m_loops_to_go = safe_mce_sys().rx_poll_num; } else { m_loops_to_go = safe_mce_sys().rx_poll_num_init; } diff --git a/src/core/sock/sockinfo_udp.h b/src/core/sock/sockinfo_udp.h index cf7e7f026..3db469120 100644 --- a/src/core/sock/sockinfo_udp.h +++ b/src/core/sock/sockinfo_udp.h @@ -188,7 +188,7 @@ class sockinfo_udp : public sockinfo { void set_params_for_socket_pool() override { m_is_for_socket_pool = true; - set_m_n_sysvar_rx_num_buffs_reuse(safe_mce_sys().nginx_udp_socket_pool_rx_num_buffs_reuse); + set_rx_num_buffs_reuse(safe_mce_sys().nginx_udp_socket_pool_rx_num_buffs_reuse); } bool is_closable() override { return !m_is_for_socket_pool; } #else @@ -257,7 +257,7 @@ class sockinfo_udp : public sockinfo { while (iter != m_rx_ring_map.end()) { descq_t *rx_reuse = &iter->second->rx_reuse_info.rx_reuse; int &n_buff_num = iter->second->rx_reuse_info.n_buff_num; - if (n_buff_num >= m_n_sysvar_rx_num_buffs_reuse) { + if (n_buff_num >= m_rx_num_buffs_reuse) { if (iter->first->reclaim_recv_buffers(rx_reuse)) { n_buff_num = 0; } else { From cb0b278280e9c3024a388f3d7a27a69af5543f23 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 13 Mar 2024 15:42:31 +0000 Subject: [PATCH 123/169] issue: 3777348 Remove access to m_sock_wakeup_pipe for 
socketxtreme Socketxtrme does not need to wakeup sockets by using m_sock_wakeup_pipe. This mechanism is used when a seperate thread sleeps on a blocking socket. Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo.h | 3 ++- src/core/sock/sockinfo_tcp.cpp | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index d633c327c..f381dd7fc 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -499,7 +499,6 @@ class sockinfo { // End of second cache line - wakeup_pipe m_sock_wakeup_pipe; rfs *m_rfs_ptr = nullptr; struct { /* Use std::deque in current design as far as it allows pushing @@ -510,6 +509,8 @@ class sockinfo { std::deque ec_cache; } m_socketxtreme; + wakeup_pipe m_sock_wakeup_pipe; + // End of fourth cache line public: list_node socket_fd_list_node; diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 63c2c48fb..f4e4e1e44 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -2192,7 +2192,7 @@ err_t sockinfo_tcp::rx_lwip_cb_socketxtreme(void *arg, struct tcp_pcb *pcb, stru conn->rx_lwip_cb_socketxtreme_helper(p); io_mux_call::update_fd_array(conn->m_iomux_ready_fd_array, conn->m_fd); - conn->m_sock_wakeup_pipe.do_wakeup(); + /* * RCVBUFF Accounting: tcp_recved here(stream into the 'internal' buffer) only if the user * buffer is not 'filled' @@ -5859,7 +5859,12 @@ void sockinfo_tcp::tcp_tx_zc_handle(mem_buf_desc_t *p_desc) /* Signal events on socket */ NOTIFY_ON_EVENTS(sock, EPOLLERR); - sock->m_sock_wakeup_pipe.do_wakeup(); + + // Avoid cache access unnecessarily. + // Non-blocking sockets are waked-up as part of mux handling. + if (unlikely(is_blocking())) { + sock->m_sock_wakeup_pipe.do_wakeup(); + } } struct tcp_seg *sockinfo_tcp::tcp_seg_alloc_direct(void *p_conn) From 08956f8ba5f412fbdac946a9fd25d9a9565ea21c Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 13 Mar 2024 16:49:33 +0000 Subject: [PATCH 124/169] issue: 3777348 Avoid checking m_iomux_ready_fd_array for Socketxtrme This mechanism is used for poll/select only. Socketxtrme has its own polling mechanism. Accesing and checking this member/method is cycles/cache overhead. Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo_tcp.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index f4e4e1e44..92506661d 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -2191,8 +2191,6 @@ err_t sockinfo_tcp::rx_lwip_cb_socketxtreme(void *arg, struct tcp_pcb *pcb, stru conn->rx_lwip_process_chained_pbufs(p); conn->rx_lwip_cb_socketxtreme_helper(p); - io_mux_call::update_fd_array(conn->m_iomux_ready_fd_array, conn->m_fd); - /* * RCVBUFF Accounting: tcp_recved here(stream into the 'internal' buffer) only if the user * buffer is not 'filled' From 6434e585a1818695bb31103ae9105496c4dbbccb Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 13 Mar 2024 17:22:14 +0000 Subject: [PATCH 125/169] issue: 3777348 Avoid unnecessary access to ring_allocation_tx members Cache optimization for data path. 
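
The idea, sketched below using the names from the hunks (a simplified illustration, not the
literal patched code), is to read one cached global setting before touching any dst_entry or lock
state, so the common case of disabled TX ring migration costs a single well-predicted branch per
send:

    // Guard applied in both the TCP and UDP transmit paths.
    if (unlikely(safe_mce_sys().ring_migration_ratio_tx > 0)) {
        // Only when migration is enabled do we access the dst_entry members
        // and the lock that try_migrate_ring_tx() needs.
        if (p_dst_entry->try_migrate_ring_tx(m_lock_snd)) {
            m_p_socket_stats->counters.n_tx_migrations++;
        }
    }
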
Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo_tcp.cpp | 8 +++++--- src/core/sock/sockinfo_udp.cpp | 7 +++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 92506661d..2a01f1d97 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -1423,11 +1423,13 @@ err_t sockinfo_tcp::ip_output(struct pbuf *p, struct tcp_seg *seg, void *v_p_con rc = p_si_tcp->m_ops->handle_send_ret(ret, seg); - if (p_dst->try_migrate_ring_tx(p_si_tcp->m_tcp_con_lock.get_lock_base())) { - p_si_tcp->m_p_socket_stats->counters.n_tx_migrations++; + if (unlikely(safe_mce_sys().ring_migration_ratio_tx > 0)) { // Condition for cache optimization + if (p_dst->try_migrate_ring_tx(p_si_tcp->m_tcp_con_lock.get_lock_base())) { + p_si_tcp->m_p_socket_stats->counters.n_tx_migrations++; + } } - if (rc && is_set(attr.flags, XLIO_TX_PACKET_REXMIT)) { + if (unlikely(is_set(attr.flags, XLIO_TX_PACKET_REXMIT) && rc)) { p_si_tcp->m_p_socket_stats->counters.n_tx_retransmits++; } diff --git a/src/core/sock/sockinfo_udp.cpp b/src/core/sock/sockinfo_udp.cpp index f9ee35b7b..c44d0821a 100644 --- a/src/core/sock/sockinfo_udp.cpp +++ b/src/core/sock/sockinfo_udp.cpp @@ -2195,8 +2195,11 @@ ssize_t sockinfo_udp::tx(xlio_tx_call_attr_t &tx_arg) tx_arg.opcode); } - if (unlikely(p_dst_entry->try_migrate_ring_tx(m_lock_snd))) { - m_p_socket_stats->counters.n_tx_migrations++; + // Condition for cache optimization + if (unlikely(safe_mce_sys().ring_migration_ratio_tx > 0)) { + if (unlikely(p_dst_entry->try_migrate_ring_tx(m_lock_snd))) { + m_p_socket_stats->counters.n_tx_migrations++; + } } // TODO ALEXR - still need to handle "is_dropped" in send path From 87a76ea5f74e10c6d60c8277bd187b8ee70f27de Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Thu, 14 Mar 2024 06:16:11 +0000 Subject: [PATCH 126/169] issue: 3777348 Use thread_local dummy lock It is enough to hold a thread_local dummy lock instead of allocating a seperate object for each socket. Dummy lock contains only empty methods. Signed-off-by: Alexander Grissik --- src/core/dev/ring_slave.cpp | 4 +++- src/core/sock/sockinfo_tcp.cpp | 3 ++- src/utils/lock_wrapper.h | 24 ++++++++++++++++-------- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index 54bb5cc2e..73a4cab63 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -47,11 +47,13 @@ // AF_INET address 0.0.0.0:0, used for 3T flow spec keys. static const sock_addr s_sock_addrany; +static thread_local lock_dummy t_lock_dummy_ring; + static lock_base *get_new_lock(const char *name, bool real_lock) { return (real_lock ? 
static_cast(multilock::create_new_lock(MULTILOCK_RECURSIVE, name)) - : static_cast(new lock_dummy())); + : static_cast(&t_lock_dummy_ring)); } ring_slave::ring_slave(int if_index, ring *parent, ring_type_t type, bool use_locks) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 2a01f1d97..3ce0d7b94 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -84,6 +84,7 @@ extern global_stats_t g_global_stat_static; tcp_timers_collection *g_tcp_timers_collection = nullptr; thread_local thread_local_tcp_timers g_thread_local_tcp_timers; bind_no_port *g_bind_no_port = nullptr; +static thread_local lock_dummy t_lock_dummy_socket; /* * The following socket options are inherited by a connected TCP socket from the listening socket: @@ -163,7 +164,7 @@ static lock_base *get_new_tcp_lock() return ( safe_mce_sys().tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS ? static_cast(multilock::create_new_lock(MULTILOCK_RECURSIVE, "tcp_con")) - : static_cast(new lock_dummy)); + : static_cast(&t_lock_dummy_socket)); } inline void sockinfo_tcp::lwip_pbuf_init_custom(mem_buf_desc_t *p_desc) diff --git a/src/utils/lock_wrapper.h b/src/utils/lock_wrapper.h index 9f4925971..1562106c7 100644 --- a/src/utils/lock_wrapper.h +++ b/src/utils/lock_wrapper.h @@ -42,7 +42,7 @@ #include "types.h" #include "utils/bullseye.h" #include "utils/rdtsc.h" -#include +#include #include #include @@ -83,6 +83,7 @@ class lock_base { lock_base(const char *_lock_name = NULL) : m_lock_name(_lock_name) {}; virtual ~lock_base() {}; + virtual void delete_obj() { delete this; } virtual int lock() = 0; virtual int trylock() = 0; virtual int unlock() = 0; @@ -458,21 +459,27 @@ class lock_dummy : public lock_base { { } - inline int lock() { return 0; } - inline int trylock() { return 0; } - inline int unlock() { return 0; } - inline int is_locked_by_me() { return 1; } + void delete_obj() override {} + int lock() override { return 0; } + int trylock() override { return 0; } + int unlock() override { return 0; } + int is_locked_by_me() override { return 1; } }; +static inline void lock_deleter_func(lock_base *lock) +{ + lock->delete_obj(); +} + class multilock { public: multilock(lock_base *_lock) - : m_lock(_lock) + : m_lock(_lock, lock_deleter_func) { } multilock(multilock_recursive_t _recursive, const char *_str) - : m_lock(create_new_lock(_recursive, _str)) + : m_lock(create_new_lock(_recursive, _str), lock_deleter_func) { } @@ -504,7 +511,8 @@ class multilock { inline const char *to_str() { return m_lock->to_str(); } private: - std::unique_ptr m_lock; + typedef std::function lock_deleter; + std::unique_ptr m_lock; }; #endif // LOCK_WRAPPER_H From 1ff8f40f5042a8d978506ec9cd924444ebe4b991 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Thu, 14 Mar 2024 08:07:04 +0000 Subject: [PATCH 127/169] issue: 3777348 Avoid copying src/dst addresses for TCP flow-tag DP TCP 5T flow-tag data path always contains src/dst addresses in m_connected/m_bound memebers. 
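
Put differently, only unconnected sockets still need the per-packet source address; for a
connected TCP socket the peer is fixed, so recvfrom()/recvmsg() can be answered from m_connected.
A minimal sketch of the selection now done in sockinfo::dequeue_packet() (simplified from the
sockinfo.h hunk below):

    if (m_protocol == PROTO_UDP || m_connected.is_anyport()) {
        // Unconnected UDP and TCP listen sockets: take the address from the packet.
        pdesc->rx.src.get_sa_by_family(__from, *__fromlen, m_family);
    } else {
        // Connected TCP (5-tuple / flow-tag) sockets: the peer never changes,
        // so the per-packet rx.src/rx.dst copy can be skipped on the data path.
        m_connected.get_sa_by_family(__from, *__fromlen, m_family);
    }
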
Signed-off-by: Alexander Grissik --- src/core/dev/ring_slave.cpp | 4 ---- src/core/sock/sockinfo.h | 11 +++++++++-- src/core/sock/sockinfo_tcp.cpp | 17 +++++------------ 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index 73a4cab63..c6bb055b0 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -646,10 +646,6 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd if (likely(protocol == IPPROTO_TCP)) { struct tcphdr *p_tcp_h = (struct tcphdr *)((uint8_t *)p_ip_h + ip_hdr_len); - // Update the L3 and L4 info - p_rx_wc_buf_desc->rx.src.set_ip_port(family, saddr, p_tcp_h->source); - p_rx_wc_buf_desc->rx.dst.set_ip_port(family, daddr, p_tcp_h->dest); - // Update packet descriptor with datagram base address and length p_rx_wc_buf_desc->rx.frag.iov_base = (uint8_t *)p_tcp_h + sizeof(struct tcphdr); p_rx_wc_buf_desc->rx.frag.iov_len = ip_payload_len - sizeof(struct tcphdr); diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index f381dd7fc..3cb9f8e14 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -560,7 +560,7 @@ class sockinfo { loops_timer m_loops_timer; ring_alloc_logic_attr m_ring_alloc_log_rx; ring_alloc_logic_attr m_ring_alloc_log_tx; - // Callback function pointer to support VMA extra API (xlio_extra.h) + // Callback function pointer to support XLIO extra API (xlio_extra.h) xlio_recv_callback_t m_rx_callback = nullptr; void *m_rx_callback_context = nullptr; // user context struct xlio_rate_limit_t m_so_ratelimit; @@ -675,7 +675,14 @@ int sockinfo::dequeue_packet(iovec *p_iov, ssize_t sz_iov, sockaddr *__from, soc size_t payload_size = pdesc->rx.sz_payload; if (__from && __fromlen) { - pdesc->rx.src.get_sa_by_family(__from, *__fromlen, m_family); + if (m_protocol == PROTO_UDP || m_connected.is_anyport()) { + // For UDP non-connected or TCP listen socket fetch from packet. + pdesc->rx.src.get_sa_by_family(__from, *__fromlen, m_family); + } else { + // For TCP connected 5T fetch from m_connected. + // For TCP flow-tag we avoid filling packet with src for performance. + m_connected.get_sa_by_family(__from, *__fromlen, m_family); + } } if (in_flags & MSG_XLIO_ZCOPY) { diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 3ce0d7b94..d651d395f 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -1626,6 +1626,7 @@ bool sockinfo_tcp::process_peer_ctl_packets(xlio_desc_list_t &peer_packets) return false; } + // Listen socket is 3T and so rx.src/dst are set as part of rx_process_buffer_no_flow_id. struct tcp_pcb *pcb = get_syn_received_pcb(desc->rx.src, desc->rx.dst); // 2.1.2 get the pcb and sockinfo @@ -1987,9 +1988,6 @@ static inline void _rx_lwip_cb_socketxtreme_helper(pbuf *p, { mem_buf_desc_t *current_desc = reinterpret_cast(p); - // Is IPv4 only. 
- assert(current_desc->rx.src.get_sa_family() == AF_INET); - if (!buff_list_tail) { // New completion completion->packet.buff_lst = reinterpret_cast(p); @@ -1997,8 +1995,7 @@ static inline void _rx_lwip_cb_socketxtreme_helper(pbuf *p, completion->packet.num_bufs = current_desc->rx.n_frags; assert(reinterpret_cast(p)->rx.n_frags > 0); - current_desc->rx.src.get_sa(reinterpret_cast(&completion->src), - sizeof(completion->src)); + if (use_hw_timestamp) { completion->packet.hw_timestamp = current_desc->rx.timestamps.hw; } @@ -2083,9 +2080,6 @@ inline void sockinfo_tcp::rx_lwip_process_chained_pbufs(pbuf *p) p_first_desc->rx.sz_payload = p->tot_len; p_first_desc->rx.n_frags = 0; - m_connected.get_sa(reinterpret_cast(&p_first_desc->rx.src), - static_cast(sizeof(p_first_desc->rx.src))); - if (unlikely(has_stats())) { m_p_socket_stats->counters.n_rx_bytes += p->tot_len; @@ -2243,8 +2237,8 @@ err_t sockinfo_tcp::rx_lwip_cb_recv_callback(void *arg, struct tcp_pcb *pcb, str pkt_info.struct_sz = sizeof(pkt_info); pkt_info.packet_id = (void *)p_first_desc; - pkt_info.src = p_first_desc->rx.src.get_p_sa(); - pkt_info.dst = p_first_desc->rx.dst.get_p_sa(); + pkt_info.src = conn->m_connected.get_p_sa(); + pkt_info.dst = conn->m_bound.get_p_sa(); pkt_info.socket_ready_queue_pkt_count = conn->m_n_rx_pkt_ready_list_count; pkt_info.socket_ready_queue_byte_count = conn->m_rx_ready_byte_count; @@ -2555,6 +2549,7 @@ bool sockinfo_tcp::rx_input_cb(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_info, void m_iomux_ready_fd_array = (fd_array_t *)pv_fd_ready_array; if (unlikely(get_tcp_state(&m_pcb) == LISTEN)) { + // Listen socket is always 3T and so rx.src/dst are set as part of no-flow-id path. pcb = get_syn_received_pcb(p_rx_pkt_mem_buf_desc_info->rx.src, p_rx_pkt_mem_buf_desc_info->rx.dst); bool established_backlog_full = false; @@ -5282,7 +5277,6 @@ mem_buf_desc_t *sockinfo_tcp::get_next_desc(mem_buf_desc_t *p_desc) p_desc->rx.sz_payload = p_desc->lwip_pbuf.tot_len = prev->lwip_pbuf.tot_len - prev->lwip_pbuf.len; p_desc->rx.n_frags = --prev->rx.n_frags; - p_desc->rx.src = prev->rx.src; p_desc->inc_ref_count(); m_rx_pkt_ready_list.push_front(p_desc); m_n_rx_pkt_ready_list_count++; @@ -5387,7 +5381,6 @@ int sockinfo_tcp::zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags p_desc_iter->rx.n_frags = p_desc_head->rx.n_frags - p_pkts->sz_iov; p_desc_head->rx.n_frags = p_pkts->sz_iov; - p_desc_iter->rx.src = prev->rx.src; p_desc_iter->inc_ref_count(); prev->lwip_pbuf.next = nullptr; From 7fb59634f86848adfabbb915f164a2dbb62d44ee Mon Sep 17 00:00:00 2001 From: Alex Briskin Date: Mon, 18 Mar 2024 12:01:11 +0200 Subject: [PATCH 128/169] issue: 3808935 Add nullptr checks before dereferencing There is no bug, this fix is aimed at suppressing the optimizer errors. 
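
The likely motivation is to make the non-null invariant visible to the compiler:
get_and_pop_front() can in principle return a null pointer, and once inlining exposes that path
the optimizer may flag a potential null dereference even though the queue is known to be
non-empty at these call sites. A minimal sketch of the guarded form used in the hunks below (not
the literal patched code):

    mem_buf_desc_t *buff = m_rx_queue.get_and_pop_front();
    if (likely(buff)) { // cheap branch that also documents the invariant
        reclaim_recv_buffer_helper(buff);
    }
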
Signed-off-by: Alex Briskin --- src/core/dev/buffer_pool.cpp | 2 +- src/core/dev/cq_mgr_rx_strq.cpp | 5 ++++- src/core/sock/sockinfo_tcp.cpp | 10 ++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/core/dev/buffer_pool.cpp b/src/core/dev/buffer_pool.cpp index 7de9ad0b5..6176c32ea 100644 --- a/src/core/dev/buffer_pool.cpp +++ b/src/core/dev/buffer_pool.cpp @@ -490,7 +490,7 @@ void buffer_pool::put_buffers_after_deref_thread_safe(descq_t *pDeque) std::lock_guard lock(m_lock); while (!pDeque->empty()) { mem_buf_desc_t *list = pDeque->get_and_pop_front(); - if (list->dec_ref_count() <= 1 && (list->lwip_pbuf.ref-- <= 1)) { + if (likely(list) && list->dec_ref_count() <= 1 && (list->lwip_pbuf.ref-- <= 1)) { put_buffers(list); } } diff --git a/src/core/dev/cq_mgr_rx_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp index 846c10a93..041364177 100644 --- a/src/core/dev/cq_mgr_rx_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ -84,7 +84,10 @@ cq_mgr_rx_strq::~cq_mgr_rx_strq() cq_logdbg("Clearing %zu stride objects)", m_rx_queue.size()); while (!m_rx_queue.empty()) { - reclaim_recv_buffer_helper(m_rx_queue.get_and_pop_front()); + mem_buf_desc_t *buff = m_rx_queue.get_and_pop_front(); + if (likely(buff)) { + reclaim_recv_buffer_helper(buff); + } } m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index d651d395f..c10a26fa8 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -3468,10 +3468,12 @@ err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t e while (!temp_list.empty()) { mem_buf_desc_t *desc = temp_list.get_and_pop_front(); - desc->inc_ref_count(); - L3_level_tcp_input((pbuf *)desc, &new_sock->m_pcb); - if (desc->dec_ref_count() <= 1) { // todo reuse needed? - new_sock->m_rx_ctl_reuse_list.push_back(desc); + if (likely(desc)) { + desc->inc_ref_count(); + L3_level_tcp_input((pbuf *)desc, &new_sock->m_pcb); + if (desc->dec_ref_count() <= 1) { // todo reuse needed? + new_sock->m_rx_ctl_reuse_list.push_back(desc); + } } } } From af806d1bd013bb228d91bccd60d11070529001af Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Tue, 19 Mar 2024 17:34:45 +0200 Subject: [PATCH 129/169] issue: 3829626 Fix new TCP timers registration for reused sockets A TCP time-wait used socket can call register_timer although it is already registered. This made the socket to be added more than once to the timers list. Adding a check to verify that the socket is not already in the list. Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo_tcp.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index c10a26fa8..140cd7f0b 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -6039,6 +6039,11 @@ void tcp_timers_collection::add_new_timer(sockinfo_tcp *sock) return; } + // A reused time-wait socket wil try to add a timer although it is already registered. + if (m_sock_remove_map.find(sock) != m_sock_remove_map.end()) { + return; + } + sock_list &bucket = m_p_intervals[m_n_next_insert_bucket]; bucket.emplace_back(sock); m_sock_remove_map.emplace(sock, std::make_tuple(m_n_next_insert_bucket, --(bucket.end()))); @@ -6067,6 +6072,10 @@ void tcp_timers_collection::remove_timer(sockinfo_tcp *sock) } __log_dbg("TCP socket [%p] timer was removed", sock); + } else { + // Listen sockets are not added to timers. 
+        // As part of general socket unregister and destroy they will not be found.
+        __log_dbg("TCP socket [%p] timer was not found (listen socket)", sock);
     }
 }

From 3a98c90563b3355c5d1d35831fc4ac8e1768ea78 Mon Sep 17 00:00:00 2001
From: Alexander Grissik
Date: Tue, 19 Mar 2024 17:37:03 +0200
Subject: [PATCH 130/169] issue: 3829626 Fixing statistics init for reused sockets

A reused TCP socket calls socket_stats_init. However, the check inside
socket_stats_init tested m_p_socket_stats, which may point to
sock_stats::t_dummy_stats and thus pass the condition, initializing the
socket as one that has stats. Furthermore, a pointer to the dummy object
was returned to the stats vector as if it were a regular stats object.

Checking m_has_stats instead of m_p_socket_stats solves the issue, since
m_has_stats is set to true only if a real stats object is returned by
sock_stats. This stats object is held by the socket until its destruction.
A reused time-wait socket that has stats simply resets them as part of
socket_stats_init.

Signed-off-by: Alexander Grissik
---
 src/core/sock/sockinfo.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp
index 2d68594fc..1e01a5f38 100644
--- a/src/core/sock/sockinfo.cpp
+++ b/src/core/sock/sockinfo.cpp
@@ -190,18 +190,18 @@ sockinfo::~sockinfo()

 void sockinfo::socket_stats_init()
 {
-    if (!m_p_socket_stats) { // This check is for listen sockets.
+    if (!m_has_stats) { // This check is for listen sockets.
         m_p_socket_stats = sock_stats::instance().get_stats_obj();
         if (!m_p_socket_stats) {
             m_p_socket_stats = &sock_stats::t_dummy_stats;
             return;
         }

+        m_has_stats = true;
         // Save stats as local copy and allow state publisher to copy from this location
         xlio_stats_instance_create_socket_block(m_p_socket_stats);
     }

-    m_has_stats = true;
     m_p_socket_stats->reset();
     m_p_socket_stats->fd = m_fd;
     m_p_socket_stats->inode = fd2inode(m_fd);

From 27931ced5f6a20f2cb3215ac6be9b9410992e8ef Mon Sep 17 00:00:00 2001
From: Dmytro Podgornyi
Date: Tue, 12 Mar 2024 17:28:09 +0200
Subject: [PATCH 131/169] issue: 3788369 Replace XLIO_HUGEPAGE_LOG2 with XLIO_HUGEPAGE_SIZE

The XLIO Socket API introduced a memory callback which provides information
about internal memory allocation for RX/TX buffers. The callback also
reports the hugepage size and guarantees hugepage boundary alignment if
hugepages are used. This makes the XLIO hugepage configuration more
critical to users whose logic depends on hugepage properties.

XLIO_HUGEPAGE_LOG2 is not user friendly, so replace it with the
XLIO_HUGEPAGE_SIZE configuration, which accepts the hugepage size in a
human-readable format. For example, XLIO_HUGEPAGE_SIZE=2MB.

XLIO checks whether the requested hugepage size is supported by the
system, but this check is moved to the hugepage_mgr constructor, because
g_hugepage_mgr is not yet initialized while the XLIO configuration is
being read.
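
For illustration, the acceptance rule applied below boils down to the following sketch (hypothetical helper, not the patch code): a non-zero value must be a power of two and must not exceed MCE_MAX_HUGEPAGE_SIZE, otherwise XLIO falls back to the default value 0, i.e. autodetection:

    static bool hugepage_size_is_acceptable(size_t sz)
    {
        /* 0 keeps the autodetection behavior; any other value must be a power of two. */
        return sz == 0 || ((sz & (sz - 1)) == 0 && sz <= MCE_MAX_HUGEPAGE_SIZE);
    }

Typical settings are XLIO_HUGEPAGE_SIZE=2MB for 2 MiB pages, XLIO_HUGEPAGE_SIZE=1GB for 1 GiB pages (if supported by the system), or XLIO_HUGEPAGE_SIZE=0 (the default) to let XLIO pick any supported hugepage size.
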
Signed-off-by: Dmytro Podgornyi --- README | 7 +++++++ src/core/main.cpp | 4 ++-- src/core/util/hugepage_mgr.cpp | 23 +++++++++++++++++------ src/core/util/sys_vars.cpp | 31 +++++++++++-------------------- src/core/util/sys_vars.h | 7 ++++--- 5 files changed, 41 insertions(+), 31 deletions(-) diff --git a/README b/README index b3f19bc79..65617d8db 100644 --- a/README +++ b/README @@ -153,6 +153,7 @@ Example: XLIO DETAILS: Mem Allocation type Huge pages [XLIO_MEM_ALLOC_TYPE] XLIO DETAILS: Memory limit 2 GB [XLIO_MEMORY_LIMIT] XLIO DETAILS: Memory limit (user allocator) 0 [XLIO_MEMORY_LIMIT_USER] + XLIO DETAILS: Hugepage size 0 [XLIO_HUGEPAGE_SIZE] XLIO DETAILS: Num of UC ARPs 3 [XLIO_NEIGH_UC_ARP_QUATA] XLIO DETAILS: UC ARP delay (msec) 10000 [XLIO_NEIGH_UC_ARP_DELAY_MSEC] XLIO DETAILS: Num of neigh restart retries 1 [XLIO_NEIGH_NUM_ERR_RETRIES] @@ -963,6 +964,12 @@ provided with XLIO extra API. Default value 0 makes XLIO use XLIO_MEMORY_LIMIT value for user allocations. Default value is 0 +XLIO_HUGEPAGE_SIZE +Force specific hugepage size for XLIO internal memory allocations. Value 0 allows +to use any supported and available hugepages. The size may be specified with +suffixes such as KB, MB, GB. +Default value is 0 + XLIO_NEIGH_UC_ARP_QUATA XLIO will send UC ARP in case neigh state is NUD_STALE. In case that neigh state is still NUD_STALE XLIO will try diff --git a/src/core/main.cpp b/src/core/main.cpp index c78d50ca8..2bb7cfc5e 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -780,8 +780,8 @@ void print_xlio_global_settings() VLOG_PARAM_STRING("Memory limit (user allocator)", safe_mce_sys().memory_limit_user, MCE_DEFAULT_MEMORY_LIMIT_USER, SYS_VAR_MEMORY_LIMIT_USER, option_size::to_str(safe_mce_sys().memory_limit_user)); - VLOG_PARAM_NUMBER("Hugepage log2", safe_mce_sys().hugepage_log2, MCE_DEFAULT_HUGEPAGE_LOG2, - SYS_VAR_HUGEPAGE_LOG2); + VLOG_PARAM_STRING("Hugepage size", safe_mce_sys().hugepage_size, MCE_DEFAULT_HUGEPAGE_SIZE, + SYS_VAR_HUGEPAGE_SIZE, option_size::to_str(safe_mce_sys().hugepage_size)); VLOG_PARAM_NUMBER("Num of UC ARPs", safe_mce_sys().neigh_uc_arp_quata, MCE_DEFAULT_NEIGH_UC_ARP_QUATA, SYS_VAR_NEIGH_UC_ARP_QUATA); diff --git a/src/core/util/hugepage_mgr.cpp b/src/core/util/hugepage_mgr.cpp index 60d1768b2..c5bc254a1 100644 --- a/src/core/util/hugepage_mgr.cpp +++ b/src/core/util/hugepage_mgr.cpp @@ -55,6 +55,18 @@ hugepage_mgr::hugepage_mgr() memset(&m_stats, 0, sizeof(m_stats)); m_default_hugepage = read_meminfo("Hugepagesize:"); update(); + + /* Check hugepage size if requested by user explicitly. */ + if (safe_mce_sys().hugepage_size != 0 && !is_hugepage_supported(safe_mce_sys().hugepage_size)) { + vlog_printf(VLOG_WARNING, + "Requested hugepage %s is not supported by the system. " + "XLIO will autodetect optimal hugepage.\n", + option_size::to_str(safe_mce_sys().hugepage_size)); + /* Value 0 means default autodetection behavior. Don't set MCE_DEFAULT_HUGEPAGE_SIZE + * here, because it can be defined to an unsupported specific value. + */ + safe_mce_sys().hugepage_size = 0; + } } void hugepage_mgr::update() @@ -115,12 +127,12 @@ void *hugepage_mgr::alloc_hugepages(size_t &size) void *ptr = nullptr; std::vector hugepages; - if (safe_mce_sys().hugepage_log2 == 0) { + if (safe_mce_sys().hugepage_size == 0) { get_supported_hugepages(hugepages); std::sort(hugepages.begin(), hugepages.end(), std::greater()); } else { // User requested specific hugepage size - don't check other types. 
- hugepages.push_back(1LU << safe_mce_sys().hugepage_log2); + hugepages.push_back(safe_mce_sys().hugepage_size); } for (auto iter = hugepages.begin(); !ptr && iter != hugepages.end(); ++iter) { @@ -173,10 +185,9 @@ void hugepage_mgr::print_report(bool short_report /*=false*/) get_supported_hugepages(hugepages); vlog_printf(VLOG_INFO, "Hugepages info:\n"); - if (safe_mce_sys().hugepage_log2) { - vlog_printf(VLOG_INFO, " User forced to use %lu kB hugepages (%s=%u).\n", - (1LU << safe_mce_sys().hugepage_log2) / 1024U, SYS_VAR_HUGEPAGE_LOG2, - safe_mce_sys().hugepage_log2); + if (safe_mce_sys().hugepage_size) { + vlog_printf(VLOG_INFO, " User forced to use %lu kB hugepages.\n", + (safe_mce_sys().hugepage_size) / 1024U); } for (size_t hugepage : hugepages) { vlog_printf(VLOG_INFO, " %zu kB : total=%u free=%u\n", hugepage / 1024U, diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index 83214ea20..025b885aa 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -858,7 +858,7 @@ void mce_sys_var::get_env_params() memory_limit = MCE_DEFAULT_MEMORY_LIMIT; memory_limit_user = MCE_DEFAULT_MEMORY_LIMIT_USER; heap_metadata_block = MCE_DEFAULT_HEAP_METADATA_BLOCK; - hugepage_log2 = MCE_DEFAULT_HUGEPAGE_LOG2; + hugepage_size = MCE_DEFAULT_HUGEPAGE_SIZE; enable_socketxtreme = MCE_DEFAULT_SOCKETXTREME; enable_tso = MCE_DEFAULT_TSO; #ifdef DEFINED_UTLS @@ -1866,26 +1866,17 @@ void mce_sys_var::get_env_params() if ((env_ptr = getenv(SYS_VAR_HEAP_METADATA_BLOCK))) { heap_metadata_block = option_size::from_str(env_ptr) ?: MCE_DEFAULT_HEAP_METADATA_BLOCK; } - if ((env_ptr = getenv(SYS_VAR_HUGEPAGE_LOG2))) { - unsigned val = (unsigned)atoi(env_ptr); - - // mmap() uses 6 bits for the hugepage size log2 - if (val < 64U) { - hugepage_log2 = val; - } else { - hugepage_log2 = MCE_DEFAULT_HUGEPAGE_LOG2; - vlog_printf(VLOG_WARNING, "%s parameter can be in range [0, 63], but set to %u\n", - SYS_VAR_HUGEPAGE_LOG2, val); + if ((env_ptr = getenv(SYS_VAR_HUGEPAGE_SIZE))) { + hugepage_size = option_size::from_str(env_ptr); + if (hugepage_size & (hugepage_size - 1)) { + vlog_printf(VLOG_WARNING, "%s must be a power of 2. Fallback to default value (%s)\n", + SYS_VAR_HUGEPAGE_SIZE, option_size::to_str(MCE_DEFAULT_HUGEPAGE_SIZE)); + hugepage_size = MCE_DEFAULT_HUGEPAGE_SIZE; } - if (hugepage_log2 != 0 && !g_hugepage_mgr.is_hugepage_supported(1LU << hugepage_log2)) { - vlog_printf(VLOG_WARNING, - "Requested hugepage %zu kB is not supported. " - "XLIO will autodetect optimal hugepage.", - (1LU << hugepage_log2) / 1024LU); - /* Value 0 means default autodetection behavior. Don't set MCE_DEFAULT_HUGEPAGE_LOG2 - * here, because it can be defined to an unsupported specific value. - */ - hugepage_log2 = 0; + if (hugepage_size > MCE_MAX_HUGEPAGE_SIZE) { + vlog_printf(VLOG_WARNING, "%s exceeds maximum possible hugepage size (%s)\n", + SYS_VAR_HUGEPAGE_SIZE, option_size::to_str(MCE_MAX_HUGEPAGE_SIZE)); + hugepage_size = MCE_DEFAULT_HUGEPAGE_SIZE; } } diff --git a/src/core/util/sys_vars.h b/src/core/util/sys_vars.h index 794fefcba..64736f610 100644 --- a/src/core/util/sys_vars.h +++ b/src/core/util/sys_vars.h @@ -454,7 +454,7 @@ struct mce_sys_var { size_t memory_limit; size_t memory_limit_user; size_t heap_metadata_block; - uint8_t hugepage_log2; + size_t hugepage_size; bool handle_fork; bool close_on_dup2; uint32_t mtu; /* effective MTU. 
If mtu==0 then auto calculate the MTU */
@@ -655,7 +655,7 @@ extern mce_sys_var &safe_mce_sys();
 #define SYS_VAR_MEMORY_LIMIT "XLIO_MEMORY_LIMIT"
 #define SYS_VAR_MEMORY_LIMIT_USER "XLIO_MEMORY_LIMIT_USER"
 #define SYS_VAR_HEAP_METADATA_BLOCK "XLIO_HEAP_METADATA_BLOCK"
-#define SYS_VAR_HUGEPAGE_LOG2 "XLIO_HUGEPAGE_LOG2"
+#define SYS_VAR_HUGEPAGE_SIZE "XLIO_HUGEPAGE_SIZE"
 #define SYS_VAR_FORK "XLIO_FORK"
 #define SYS_VAR_BF "XLIO_BF"
 #define SYS_VAR_CLOSE_ON_DUP2 "XLIO_CLOSE_ON_DUP2"
@@ -822,7 +822,8 @@ extern mce_sys_var &safe_mce_sys();
 #define MCE_DEFAULT_MEMORY_LIMIT (2LU * 1024 * 1024 * 1024)
 #define MCE_DEFAULT_MEMORY_LIMIT_USER (0)
 #define MCE_DEFAULT_HEAP_METADATA_BLOCK (32LU * 1024 * 1024)
-#define MCE_DEFAULT_HUGEPAGE_LOG2 (0)
+#define MCE_DEFAULT_HUGEPAGE_SIZE (0)
+#define MCE_MAX_HUGEPAGE_SIZE (1ULL << 63ULL)
 #define MCE_DEFAULT_FORK_SUPPORT (true)
 #define MCE_DEFAULT_BF_FLAG (true)
 #define MCE_DEFAULT_CLOSE_ON_DUP2 (true)

From bf77b2c4a57d0593865b44ad4655620b1221f1c5 Mon Sep 17 00:00:00 2001
From: Dmytro Podgornyi
Date: Mon, 18 Mar 2024 17:21:39 +0200
Subject: [PATCH 132/169] issue: 3788369 Remove xlio_key prototypes

The XLIO Socket API doesn't support crypto offload at this stage.

Signed-off-by: Dmytro Podgornyi
---
 src/core/xlio.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/core/xlio.h b/src/core/xlio.h
index e29bfbe94..aa3d36785 100644
--- a/src/core/xlio.h
+++ b/src/core/xlio.h
@@ -391,7 +391,6 @@ int xlio_socketxtreme_free_buff(struct xlio_buff_t *buff);

 typedef uintptr_t xlio_poll_group_t;
 typedef uintptr_t xlio_socket_t;
-typedef uint32_t xlio_key_t;

 struct xlio_buf {
     uint64_t userdata;
@@ -589,14 +588,6 @@ int xlio_socket_sendv(xlio_socket_t sock, const struct iovec *iov, unsigned iovc
 void xlio_poll_group_flush(xlio_poll_group_t group);
 void xlio_socket_flush(xlio_socket_t sock);

-struct xlio_key_attr {
-    int unused;
-};
-
-/* All the alive socket keys are destroyed on socket destruction. */
-int xlio_key_create(xlio_socket_t sock, struct xlio_key_attr *attr, xlio_key_t *key_out);
-void xlio_key_destroy(xlio_socket_t sock, xlio_key_t key);
-
 /*
  * RX flow.
  */

From 0dcadd393e64ef0a864557718aeeeffdab84fb20 Mon Sep 17 00:00:00 2001
From: Dmytro Podgornyi
Date: Mon, 18 Mar 2024 20:08:30 +0200
Subject: [PATCH 133/169] issue: 3788369 Move public types definitions to xlio_types.h

xlio.h and xlio_extra.h provide public APIs for different scenarios:
explicit linkage and LD_PRELOAD, respectively. Therefore, an application
usually needs only one of the two headers. However, the headers share
common type definitions. Move them to a separate xlio_types.h instead of
including xlio_extra.h from xlio.h.
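
A minimal consumer sketch of the intended layout after this change (assumed usage with the installed headers, not part of the patch): an explicitly linked application includes xlio.h, an LD_PRELOAD consumer includes xlio_extra.h, and both get the shared definitions through xlio_types.h:

    /* Explicit linkage against libxlio: */
    #include <mellanox/xlio.h>       /* pulls in mellanox/xlio_types.h */

    /* LD_PRELOAD / extra-API consumer: */
    #include <mellanox/xlio_extra.h> /* also pulls in mellanox/xlio_types.h */
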
Signed-off-by: Dmytro Podgornyi --- contrib/scripts/libxlio.spec.in | 1 + debian/libxlio-dev.install | 1 + src/core/Makefile.am | 10 +- src/core/xlio.h | 123 +-------- src/core/xlio_extra.h | 305 +-------------------- src/core/xlio_types.h | 456 ++++++++++++++++++++++++++++++++ 6 files changed, 478 insertions(+), 418 deletions(-) create mode 100644 src/core/xlio_types.h diff --git a/contrib/scripts/libxlio.spec.in b/contrib/scripts/libxlio.spec.in index e948280ee..59d71153f 100644 --- a/contrib/scripts/libxlio.spec.in +++ b/contrib/scripts/libxlio.spec.in @@ -178,6 +178,7 @@ fi %files devel %dir %{_includedir}/mellanox %{_includedir}/mellanox/xlio_extra.h +%{_includedir}/mellanox/xlio_types.h %{_includedir}/mellanox/xlio.h %if %{use_rel} > 0 %{_libdir}/%{name}-debug.so diff --git a/debian/libxlio-dev.install b/debian/libxlio-dev.install index 49f9eb771..e52df90cc 100644 --- a/debian/libxlio-dev.install +++ b/debian/libxlio-dev.install @@ -1,3 +1,4 @@ usr/include/mellanox/xlio_extra.h +usr/include/mellanox/xlio_types.h usr/include/mellanox/xlio.h libxlio-debug.so usr/lib diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 3b30bf9d2..09d9e6c27 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -29,8 +29,10 @@ EXTRA_DIST = \ sysconf_DATA = util/libxlio.conf otherincludedir = $(includedir)/mellanox -otherinclude_HEADERS = xlio_extra.h \ - xlio.h +otherinclude_HEADERS = \ + xlio.h \ + xlio_extra.h \ + xlio_types.h install-exec-hook: rm -f $(DESTDIR)$(libdir)/libxlio.la @@ -312,7 +314,9 @@ libxlio_la_SOURCES := \ \ config_parser.h \ main.h \ - xlio_extra.h + xlio.h \ + xlio_extra.h \ + xlio_types.h libxlio_la_DEPENDENCIES = \ $(top_builddir)/src/vlogger/libvlogger.la \ diff --git a/src/core/xlio.h b/src/core/xlio.h index aa3d36785..f1964b3bc 100644 --- a/src/core/xlio.h +++ b/src/core/xlio.h @@ -40,8 +40,9 @@ #include #include #include +#include -#include "xlio_extra.h" +#include "xlio_types.h" #ifdef __cplusplus extern "C" { @@ -387,15 +388,10 @@ int xlio_socketxtreme_free_buff(struct xlio_buff_t *buff); /* * XLIO Socket API + * + * This is performance-oriented event based API. */ -typedef uintptr_t xlio_poll_group_t; -typedef uintptr_t xlio_socket_t; - -struct xlio_buf { - uint64_t userdata; -}; - /* * XLIO initialization. * @@ -406,80 +402,8 @@ struct xlio_buf { * If set, memory_cb() notifies about memory blocks which zerocopy RX buffers can point to. * Current implementation allocates a single memory block and does it within xlio_init_ex() context. */ - -/* - * Memory callback. - * - * XLIO calls the callback each time XLIO allocates a memory region which can be used for RX - * buffers. User can use this information to prepare the memory for some logic in the future. - * Zerocopy RX interface provides pointers to such memory. - * - * Current XLIO implementation does a single allocation for buffers. - */ -typedef void (*xlio_memory_cb_t)(void *addr, size_t len, size_t hugepage_size); - -struct xlio_init_attr { - unsigned flags; - xlio_memory_cb_t memory_cb; -}; - int xlio_init_ex(const struct xlio_init_attr *attr); -/* - * Socket callbacks. - */ - -enum { - /* TCP connection established. */ - XLIO_SOCKET_EVENT_ESTABLISHED = 1, - /* Socket terminated and no further events are possible. */ - XLIO_SOCKET_EVENT_TERMINATED, - /* Passive close. */ - XLIO_SOCKET_EVENT_CLOSED, - /* An error occurred, see the error code value. */ - XLIO_SOCKET_EVENT_ERROR, -}; - -/* - * Socket event callback. - * - * May be called from xlio_poll_group_poll() context. 
- * In the callback context, send operation is allowed only for the ESTABLISHED event. - * Argument value holds the error code for the ERROR event and 0 for other events. - * - * List of possible error code values: - * ECONNABORTED - connection aborted by local side - * ECONNRESET - connection reset by remote side - * ECONNREFUSED - connection refused by remote side during TCP handshake - * ETIMEDOUT - connection timed out due to keepalive, user timeout option or TCP handshake timeout - */ -typedef void (*xlio_socket_event_cb_t)(xlio_socket_t, uintptr_t userdata_sq, int event, int value); - -/* - * Zerocopy completion event. - * - * May be called from the following contexts: - * - xlio_poll_group_poll() - likely - * - xlio_socket_send() - can happen only if data is flushed - * - xlio_socket_flush() / xlio_poll_group_flush() - * - xlio_socket_destroy() - * - * In the callback context, send operation is allowed unless the socket is under destruction. - */ -typedef void (*xlio_socket_comp_cb_t)(xlio_socket_t, uintptr_t userdata_sq, uintptr_t userdata_op); - -/* - * RX callback. - * - * Returns TCP payload upon arrival. Each call returns a single contiguous buffer. The buffer points - * to memory within a block which is provided by the memory_cb() notification. - * - * xlio_buf is a descriptor of the buffer which must be returned to XLIO. During user ownership, - * they may use the uninitialized field in the structure. - */ -typedef void (*xlio_socket_rx_cb_t)(xlio_socket_t, uintptr_t userdata_sq, void *data, size_t len, - struct xlio_buf *buf); - /* * XLIO polling groups. * @@ -495,19 +419,6 @@ typedef void (*xlio_socket_rx_cb_t)(xlio_socket_t, uintptr_t userdata_sq, void * * optimize the HW objects utilization. However, maintaining extra groups can have an overhead. */ -/* Sockets and rings will be protected with locks regardless of XLIO configuration. */ -#define XLIO_GROUP_FLAG_SAFE 0x1 -/* Group will keep dirty sockets to be flushed with xlio_poll_group_flush(). */ -#define XLIO_GROUP_FLAG_DIRTY 0x2 - -struct xlio_poll_group_attr { - unsigned flags; - - xlio_socket_event_cb_t socket_event_cb; - xlio_socket_comp_cb_t socket_comp_cb; - xlio_socket_rx_cb_t socket_rx_cb; -}; - int xlio_poll_group_create(const struct xlio_poll_group_attr *attr, xlio_poll_group_t *group_out); int xlio_poll_group_destroy(xlio_poll_group_t group); void xlio_poll_group_poll(xlio_poll_group_t group); @@ -528,13 +439,6 @@ void xlio_poll_group_poll(xlio_poll_group_t group); * - Bonding is not supported */ -struct xlio_socket_attr { - unsigned flags; - int domain; /* AF_INET or AF_INET6 */ - xlio_poll_group_t group; - uintptr_t userdata_sq; -}; - /* Forward declaration. */ struct ibv_pd; @@ -546,8 +450,6 @@ int xlio_socket_bind(xlio_socket_t sock, const struct sockaddr *addr, socklen_t int xlio_socket_connect(xlio_socket_t sock, const struct sockaddr *to, socklen_t tolen); struct ibv_pd *xlio_socket_get_pd(xlio_socket_t sock); -int xlio_socket_fd(xlio_socket_t sock); - /* * TX flow. * @@ -569,17 +471,6 @@ int xlio_socket_fd(xlio_socket_t sock); * it's better to avoid using them both. */ -/* Flush socket after queueing the data. */ -#define XLIO_SOCKET_SEND_FLAG_FLUSH 0x1 -/* Copy user data to the internal buffers instead of taking ownership. */ -#define XLIO_SOCKET_SEND_FLAG_INLINE 0x2 - -struct xlio_socket_send_attr { - unsigned flags; - uint32_t mkey; - uintptr_t userdata_op; -}; - /* Returns either 0 or -1. The errors, except of ENOMEM, are not recoverable. 
*/ int xlio_socket_send(xlio_socket_t sock, const void *data, size_t len, const struct xlio_socket_send_attr *attr); @@ -595,6 +486,12 @@ void xlio_socket_flush(xlio_socket_t sock); void xlio_socket_buf_free(xlio_socket_t sock, struct xlio_buf *buf); void xlio_poll_group_buf_free(xlio_poll_group_t group, struct xlio_buf *buf); +/* + * Experimental level API. + */ + +int xlio_socket_fd(xlio_socket_t sock); + #ifdef __cplusplus } #endif diff --git a/src/core/xlio_extra.h b/src/core/xlio_extra.h index 780d56d64..4a3f5cad7 100644 --- a/src/core/xlio_extra.h +++ b/src/core/xlio_extra.h @@ -35,273 +35,16 @@ #include #include -#include #include -/* - * Flags for recvfrom_zcopy() - */ -#define MSG_XLIO_ZCOPY_FORCE 0x01000000 // don't fallback to bcopy -#define MSG_XLIO_ZCOPY 0x00040000 // return: zero copy was done - -/* - * Options for setsockopt()/getsockopt() - */ -#define SO_XLIO_GET_API 2800 -#define SO_XLIO_USER_DATA 2801 -#define SO_XLIO_RING_ALLOC_LOGIC 2810 -#define SO_XLIO_SHUTDOWN_RX 2821 -#define SO_XLIO_PD 2822 -#define SCM_XLIO_PD SO_XLIO_PD -#define SCM_XLIO_NVME_PD 2823 -#define SO_XLIO_EXT_VLAN_TAG 2824 - -/** - * @def SO_XLIO_ISOLATE - * Socket isolation option groups sockets under specified policy. - * - * Supported policies: - * - SO_XLIO_ISOLATE_DEFAULT - default behavior according to XLIO configuration. - * - * - SO_XLIO_ISOLATE_SAFE - isolate sockets from the default sockets and guarantee thread - * safety regardless of XLIO configuration (note: this option doesn't change socket API - * thread safety model). This policy is mostly effective in XLIO_TCP_CTL_THREAD=delegate - * configuration. - * - * Current limitations: - * - SO_XLIO_ISOLATE option is supported only by TCP sockets - * - SO_XLIO_ISOLATE must be called according to thread safety model and XLIO configuration - * - SO_XLIO_ISOLATE may be called after socket() syscall and before either listen() or connect() - */ -#define SO_XLIO_ISOLATE 2825 -#define SO_XLIO_ISOLATE_DEFAULT 0 -#define SO_XLIO_ISOLATE_SAFE 1 - -enum { CMSG_XLIO_IOCTL_USER_ALLOC = 2900 }; +#include "xlio_types.h" -/* - * Flags for Dummy send API - */ -#define XLIO_SND_FLAGS_DUMMY MSG_SYN // equals to 0x400 - -/* - * Magic value for xlio_get_api (NVDAXLIO) - */ +/** Magic value for xlio_get_api (NVDAXLIO) */ #define XLIO_MAGIC_NUMBER (0x4f494c584144564eULL) -/* - * Return values for the receive packet notify callback function - */ -typedef enum { - XLIO_PACKET_DROP, /* The library will drop the received packet and recycle - the buffer if no other socket needs it */ - - XLIO_PACKET_RECV, /* The library will queue the received packet on this socket ready queue. - The application will read it with the usual recv socket APIs */ - - XLIO_PACKET_HOLD /* Application will handle the queuing of the received packet. The application - must return the descriptor to the library using the free packet function - But not in the context of XLIO's callback itself. */ -} xlio_recv_callback_retval_t; - /** - * @brief Pass this structure as an argument into getsockopt() with @ref SO_XLIO_PD - * to get protection domain information from ring used for current socket. - * This information can be available after setting connection for TX ring - * and bounding to device for RX ring. - * By default getting PD for TX ring. - * This case can be used with sendmsg(SCM_XLIO_PD) when the data portion contains - * an array of the elements with datatype as struct xlio_pd_key. Number of elements in this - * array should be equal to msg_iovlen value. 
Every data pointer in msg_iov has - * correspondent memory key. - * - * @param flags - to specify needed information. - * @param pd - protection domain (PD) for the RDMA device context - */ -struct xlio_pd_attr { - uint32_t flags; - void *ib_pd; -}; - -/** - * @brief elements with this datatype can be passed into sendmsg(SCM_XLIO_PD) - * as control message with correspondent pointer to data. - * - * @param flags - to specify needed information. By default mkey value is used. - * @param mkey - memory key - */ -struct xlio_pd_key { - union { - uint32_t flags; - uint32_t message_length; - }; - uint32_t mkey; -}; - -#define NVDA_NVME 666 -#define NVME_TX 1 -#define NVME_RX 2 - -enum { - XLIO_NVME_DDGST_ENABLE = 1U << 31, - XLIO_NVME_DDGST_OFFLOAD = 1U << 30, - XLIO_NVME_HDGST_ENABLE = 1U << 29, - XLIO_NVME_HDGST_OFFLOAD = 1U << 28, - XLIO_NVME_PDA_MASK = ((1U << 4) - 1U), - XLIO_NVME_DDGST_MASK = (XLIO_NVME_DDGST_ENABLE | XLIO_NVME_DDGST_OFFLOAD), -}; - -/************ SocketXtreme API types definition start***************/ - -enum { - XLIO_SOCKETXTREME_PACKET = (1ULL << 32), /* New packet is available */ - XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED = - (1ULL << 33) /* New connection is auto accepted by server */ -}; - -/* - * Represents specific buffer - * Used in SocketXtreme extended API. - */ -struct xlio_buff_t { - struct xlio_buff_t *next; /* next buffer (for last buffer next == NULL) */ - void *payload; /* pointer to data */ - uint16_t len; /* data length */ -}; - -/** - * Represents one specific packet - * Used in SocketXtreme extended API. - */ -struct xlio_socketxtreme_packet_desc_t { - size_t num_bufs; /* number of packet's buffers */ - uint16_t total_len; /* total data length */ - struct xlio_buff_t *buff_lst; /* list of packet's buffers */ - struct timespec hw_timestamp; /* packet hw_timestamp */ -}; - -/* - * Represents specific completion form. - * Used in SocketXtreme extended API. - */ -struct xlio_socketxtreme_completion_t { - /* Packet is valid in case XLIO_SOCKETXTREME_PACKET event is set - */ - struct xlio_socketxtreme_packet_desc_t packet; - /* Set of events - */ - uint64_t events; - /* User provided data. - * By default this field has FD of the socket - * User is able to change the content using setsockopt() - * with level argument SOL_SOCKET and opname as SO_XLIO_USER_DATA - */ - uint64_t user_data; - /* Source address (in network byte order) set for: - * XLIO_SOCKETXTREME_PACKET and XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED events - */ - struct sockaddr_in src; - /* Connected socket's parent/listen socket fd number. - * Valid in case XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event is set. - */ - int listen_fd; -}; - -/************ SocketXtreme API types definition end ***************/ - -/** - * Represents one packet - * Used in receive zero-copy extended API. - */ -struct __attribute__((packed)) xlio_recvfrom_zcopy_packet_t { - void *packet_id; // packet identifier - size_t sz_iov; // number of fragments - struct iovec iov[]; // fragments size+data -}; - -/** - * Represents received packets - * Used in receive zero-copy extended API. 
- */ -struct __attribute__((packed)) xlio_recvfrom_zcopy_packets_t { - size_t n_packet_num; // number of received packets - struct xlio_recvfrom_zcopy_packet_t pkts[]; // array of received packets -}; - -/* - * Structure holding additional information on the packet and socket - * Note: Check structure size value for future library changes - */ -struct __attribute__((packed)) xlio_info_t { - size_t - struct_sz; /* Compare this value with sizeof(xlio_info_t) to check version compatability */ - void *packet_id; /* Handle to received packet buffer to be return if zero copy logic is used */ - - /* Packet addressing information (in network byte order) */ - const struct sockaddr *src; - const struct sockaddr *dst; - - /* Packet information */ - size_t payload_sz; - - /* Socket's information */ - uint32_t socket_ready_queue_pkt_count; /* Current count of packets waiting to be read from the - socket */ - uint32_t socket_ready_queue_byte_count; /* Current count of bytes waiting to be read from the - socket */ - - /* Packet timestamping information */ - struct timespec hw_timestamp; - struct timespec sw_timestamp; -}; - -struct xlio_rate_limit_t { - uint32_t rate; /* rate limit in Kbps */ - uint32_t max_burst_sz; /* maximum burst size in bytes */ - uint16_t typical_pkt_sz; /* typical packet size in bytes */ -}; - -typedef enum { - RING_LOGIC_PER_INTERFACE = 0, //!< RING_LOGIC_PER_INTERFACE - RING_LOGIC_PER_IP = 1, //!< RING_LOGIC_PER_IP - RING_LOGIC_PER_SOCKET = 10, //!< RING_LOGIC_PER_SOCKET - RING_LOGIC_PER_USER_ID = 11, //!< RING_LOGIC_PER_USER_ID - RING_LOGIC_PER_THREAD = 20, //!< RING_LOGIC_PER_THREAD - RING_LOGIC_PER_CORE = 30, //!< RING_LOGIC_PER_CORE - RING_LOGIC_PER_CORE_ATTACH_THREADS = 31, //!< RING_LOGIC_PER_CORE_ATTACH_THREADS - RING_LOGIC_PER_OBJECT = 32, //!< RING_LOGIC_PER_OBJECT - RING_LOGIC_ISOLATE = 33, //!< RING_LOGIC_ISOLATE - RING_LOGIC_LAST //!< RING_LOGIC_LAST -} ring_logic_t; - -typedef enum { - XLIO_RING_ALLOC_MASK_RING_USER_ID = (1 << 0), - XLIO_RING_ALLOC_MASK_RING_INGRESS = (1 << 1), - XLIO_RING_ALLOC_MASK_RING_ENGRESS = (1 << 2), -} xlio_ring_alloc_logic_attr_comp_mask; - -/** - * @brief pass this struct to process by the library using setsockopt with - * @ref SO_XLIO_RING_ALLOC_LOGIC - * to set the allocation logic of this FD when he requests a ring. - * @note ring_alloc_logic is a mandatory - * @param comp_mask - what fields are read when processing this struct - * see @ref xlio_ring_alloc_logic_attr_comp_mask - * @param ring_alloc_logic- allocation ratio to use - * @param user_idx - when used RING_LOGIC_PER_USER_ID int @ref ring_alloc_logic - * this is the user id to define. This lets you define the same ring for - * few FD's regardless the interface\thread\core. 
- * @param ingress - RX ring - * @param engress - TX ring + * XLIO Extended Socket API */ -struct xlio_ring_alloc_logic_attr { - uint32_t comp_mask; - ring_logic_t ring_alloc_logic; - uint32_t user_id; - uint32_t ingress : 1; - uint32_t engress : 1; - uint32_t reserved : 30; -}; enum { XLIO_EXTRA_API_REGISTER_RECV_CALLBACK = (1 << 0), @@ -319,48 +62,6 @@ enum { XLIO_EXTRA_API_IOCTL = (1 << 12), }; -/** - * - * Notification callback for incoming packet on socket - * @param fd Socket's file descriptor which this packet refers to - * @param iov iovector structure array point holding the packet - * received data buffer pointers and size of each buffer - * @param iov_sz Size of iov array - * @param xlio_info Additional information on the packet and socket - * @param context User-defined value provided during callback - * registration for each socket - * - * This callback function should be registered by the library calling - * register_recv_callback() in the extended API. It can be unregistered by - * setting a NULL function pointer. The library will call the callback to notify - * of new incoming packets after the IP & UDP header processing and before - * they are queued in the socket's receive queue. - * Context of the callback will always be from one of the user's application - * threads when calling the following socket APIs: select, poll, epoll, recv, - * recvfrom, recvmsg, read, readv. - * - * Notes: - * - The application can call all of the Socket APIs control and send from - * within the callback context. - * - Packet loss might occur depending on the applications behavior in the - * callback context. - * - Parameters `iov' and `xlio_info' are only valid until callback context - * is returned to the library. User should copy these structures for later use - * if working with zero copy logic. - */ -typedef xlio_recv_callback_retval_t (*xlio_recv_callback_t)(int fd, size_t sz_iov, - struct iovec iov[], - struct xlio_info_t *xlio_info, - void *context); - -/** - * XLIO Extended Socket API - */ - -enum { - SOCKETXTREME_POLL_TX = (1 << 15), -}; - struct __attribute__((packed)) xlio_api_t { /** diff --git a/src/core/xlio_types.h b/src/core/xlio_types.h new file mode 100644 index 000000000..b9cbf3b9c --- /dev/null +++ b/src/core/xlio_types.h @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef XLIO_TYPES_H +#define XLIO_TYPES_H + +#include +#include +#include +#include +#include + +/* + * Flags for recvfrom_zcopy() + */ +#define MSG_XLIO_ZCOPY_FORCE 0x01000000 // don't fallback to bcopy +#define MSG_XLIO_ZCOPY 0x00040000 // return: zero copy was done + +/* + * Options for setsockopt()/getsockopt() + */ +#define SO_XLIO_GET_API 2800 +#define SO_XLIO_USER_DATA 2801 +#define SO_XLIO_RING_ALLOC_LOGIC 2810 +#define SO_XLIO_SHUTDOWN_RX 2821 +#define SO_XLIO_PD 2822 +#define SCM_XLIO_PD SO_XLIO_PD +#define SCM_XLIO_NVME_PD 2823 +#define SO_XLIO_EXT_VLAN_TAG 2824 + +/** + * @def SO_XLIO_ISOLATE + * Socket isolation option groups sockets under specified policy. + * + * Supported policies: + * - SO_XLIO_ISOLATE_DEFAULT - default behavior according to XLIO configuration. + * + * - SO_XLIO_ISOLATE_SAFE - isolate sockets from the default sockets and guarantee thread + * safety regardless of XLIO configuration (note: this option doesn't change socket API + * thread safety model). This policy is mostly effective in XLIO_TCP_CTL_THREAD=delegate + * configuration. + * + * Current limitations: + * - SO_XLIO_ISOLATE option is supported only by TCP sockets + * - SO_XLIO_ISOLATE must be called according to thread safety model and XLIO configuration + * - SO_XLIO_ISOLATE may be called after socket() syscall and before either listen() or connect() + */ +#define SO_XLIO_ISOLATE 2825 +#define SO_XLIO_ISOLATE_DEFAULT 0 +#define SO_XLIO_ISOLATE_SAFE 1 + +enum { CMSG_XLIO_IOCTL_USER_ALLOC = 2900 }; + +/* + * Flags for Dummy send API + */ +#define XLIO_SND_FLAGS_DUMMY MSG_SYN // equals to 0x400 + +/* + * Return values for the receive packet notify callback function + */ +typedef enum { + XLIO_PACKET_DROP, /* The library will drop the received packet and recycle + the buffer if no other socket needs it */ + + XLIO_PACKET_RECV, /* The library will queue the received packet on this socket ready queue. + The application will read it with the usual recv socket APIs */ + + XLIO_PACKET_HOLD /* Application will handle the queuing of the received packet. The application + must return the descriptor to the library using the free packet function + But not in the context of XLIO's callback itself. */ +} xlio_recv_callback_retval_t; + +/** + * @brief Pass this structure as an argument into getsockopt() with @ref SO_XLIO_PD + * to get protection domain information from ring used for current socket. + * This information can be available after setting connection for TX ring + * and bounding to device for RX ring. + * By default getting PD for TX ring. + * This case can be used with sendmsg(SCM_XLIO_PD) when the data portion contains + * an array of the elements with datatype as struct xlio_pd_key. Number of elements in this + * array should be equal to msg_iovlen value. Every data pointer in msg_iov has + * correspondent memory key. + * + * @param flags - to specify needed information. + * @param pd - protection domain (PD) for the RDMA device context + */ +struct xlio_pd_attr { + uint32_t flags; + void *ib_pd; +}; + +/** + * @brief elements with this datatype can be passed into sendmsg(SCM_XLIO_PD) + * as control message with correspondent pointer to data. + * + * @param flags - to specify needed information. 
By default mkey value is used. + * @param mkey - memory key + */ +struct xlio_pd_key { + union { + uint32_t flags; + uint32_t message_length; + }; + uint32_t mkey; +}; + +#define NVDA_NVME 666 +#define NVME_TX 1 +#define NVME_RX 2 + +enum { + XLIO_NVME_DDGST_ENABLE = 1U << 31, + XLIO_NVME_DDGST_OFFLOAD = 1U << 30, + XLIO_NVME_HDGST_ENABLE = 1U << 29, + XLIO_NVME_HDGST_OFFLOAD = 1U << 28, + XLIO_NVME_PDA_MASK = ((1U << 4) - 1U), + XLIO_NVME_DDGST_MASK = (XLIO_NVME_DDGST_ENABLE | XLIO_NVME_DDGST_OFFLOAD), +}; + +/************ SocketXtreme API types definition start***************/ + +enum { + SOCKETXTREME_POLL_TX = (1 << 15), +}; + +enum { + XLIO_SOCKETXTREME_PACKET = (1ULL << 32), /* New packet is available */ + XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED = + (1ULL << 33) /* New connection is auto accepted by server */ +}; + +/* + * Represents specific buffer + * Used in SocketXtreme extended API. + */ +struct xlio_buff_t { + struct xlio_buff_t *next; /* next buffer (for last buffer next == NULL) */ + void *payload; /* pointer to data */ + uint16_t len; /* data length */ +}; + +/** + * Represents one specific packet + * Used in SocketXtreme extended API. + */ +struct xlio_socketxtreme_packet_desc_t { + size_t num_bufs; /* number of packet's buffers */ + uint16_t total_len; /* total data length */ + struct xlio_buff_t *buff_lst; /* list of packet's buffers */ + struct timespec hw_timestamp; /* packet hw_timestamp */ +}; + +/* + * Represents specific completion form. + * Used in SocketXtreme extended API. + */ +struct xlio_socketxtreme_completion_t { + /* Packet is valid in case XLIO_SOCKETXTREME_PACKET event is set + */ + struct xlio_socketxtreme_packet_desc_t packet; + /* Set of events + */ + uint64_t events; + /* User provided data. + * By default this field has FD of the socket + * User is able to change the content using setsockopt() + * with level argument SOL_SOCKET and opname as SO_XLIO_USER_DATA + */ + uint64_t user_data; + /* Source address (in network byte order) set for: + * XLIO_SOCKETXTREME_PACKET and XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED events + */ + struct sockaddr_in src; + /* Connected socket's parent/listen socket fd number. + * Valid in case XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event is set. + */ + int listen_fd; +}; + +/************ SocketXtreme API types definition end ***************/ + +/** + * Represents one packet + * Used in receive zero-copy extended API. + */ +struct __attribute__((packed)) xlio_recvfrom_zcopy_packet_t { + void *packet_id; // packet identifier + size_t sz_iov; // number of fragments + struct iovec iov[]; // fragments size+data +}; + +/** + * Represents received packets + * Used in receive zero-copy extended API. 
+ */ +struct __attribute__((packed)) xlio_recvfrom_zcopy_packets_t { + size_t n_packet_num; // number of received packets + struct xlio_recvfrom_zcopy_packet_t pkts[]; // array of received packets +}; + +/* + * Structure holding additional information on the packet and socket + * Note: Check structure size value for future library changes + */ +struct __attribute__((packed)) xlio_info_t { + size_t + struct_sz; /* Compare this value with sizeof(xlio_info_t) to check version compatability */ + void *packet_id; /* Handle to received packet buffer to be return if zero copy logic is used */ + + /* Packet addressing information (in network byte order) */ + const struct sockaddr *src; + const struct sockaddr *dst; + + /* Packet information */ + size_t payload_sz; + + /* Socket's information */ + uint32_t socket_ready_queue_pkt_count; /* Current count of packets waiting to be read from the + socket */ + uint32_t socket_ready_queue_byte_count; /* Current count of bytes waiting to be read from the + socket */ + + /* Packet timestamping information */ + struct timespec hw_timestamp; + struct timespec sw_timestamp; +}; + +struct xlio_rate_limit_t { + uint32_t rate; /* rate limit in Kbps */ + uint32_t max_burst_sz; /* maximum burst size in bytes */ + uint16_t typical_pkt_sz; /* typical packet size in bytes */ +}; + +typedef enum { + RING_LOGIC_PER_INTERFACE = 0, //!< RING_LOGIC_PER_INTERFACE + RING_LOGIC_PER_IP = 1, //!< RING_LOGIC_PER_IP + RING_LOGIC_PER_SOCKET = 10, //!< RING_LOGIC_PER_SOCKET + RING_LOGIC_PER_USER_ID = 11, //!< RING_LOGIC_PER_USER_ID + RING_LOGIC_PER_THREAD = 20, //!< RING_LOGIC_PER_THREAD + RING_LOGIC_PER_CORE = 30, //!< RING_LOGIC_PER_CORE + RING_LOGIC_PER_CORE_ATTACH_THREADS = 31, //!< RING_LOGIC_PER_CORE_ATTACH_THREADS + RING_LOGIC_PER_OBJECT = 32, //!< RING_LOGIC_PER_OBJECT + RING_LOGIC_ISOLATE = 33, //!< RING_LOGIC_ISOLATE + RING_LOGIC_LAST //!< RING_LOGIC_LAST +} ring_logic_t; + +typedef enum { + XLIO_RING_ALLOC_MASK_RING_USER_ID = (1 << 0), + XLIO_RING_ALLOC_MASK_RING_INGRESS = (1 << 1), + XLIO_RING_ALLOC_MASK_RING_ENGRESS = (1 << 2), +} xlio_ring_alloc_logic_attr_comp_mask; + +/** + * @brief pass this struct to process by the library using setsockopt with + * @ref SO_XLIO_RING_ALLOC_LOGIC + * to set the allocation logic of this FD when he requests a ring. + * @note ring_alloc_logic is a mandatory + * @param comp_mask - what fields are read when processing this struct + * see @ref xlio_ring_alloc_logic_attr_comp_mask + * @param ring_alloc_logic- allocation ratio to use + * @param user_idx - when used RING_LOGIC_PER_USER_ID int @ref ring_alloc_logic + * this is the user id to define. This lets you define the same ring for + * few FD's regardless the interface\thread\core. 
+ * @param ingress - RX ring + * @param engress - TX ring + */ +struct xlio_ring_alloc_logic_attr { + uint32_t comp_mask; + ring_logic_t ring_alloc_logic; + uint32_t user_id; + uint32_t ingress : 1; + uint32_t engress : 1; + uint32_t reserved : 30; +}; + +/** + * + * Notification callback for incoming packet on socket + * @param fd Socket's file descriptor which this packet refers to + * @param iov iovector structure array point holding the packet + * received data buffer pointers and size of each buffer + * @param iov_sz Size of iov array + * @param xlio_info Additional information on the packet and socket + * @param context User-defined value provided during callback + * registration for each socket + * + * This callback function should be registered by the library calling + * register_recv_callback() in the extended API. It can be unregistered by + * setting a NULL function pointer. The library will call the callback to notify + * of new incoming packets after the IP & UDP header processing and before + * they are queued in the socket's receive queue. + * Context of the callback will always be from one of the user's application + * threads when calling the following socket APIs: select, poll, epoll, recv, + * recvfrom, recvmsg, read, readv. + * + * Notes: + * - The application can call all of the Socket APIs control and send from + * within the callback context. + * - Packet loss might occur depending on the applications behavior in the + * callback context. + * - Parameters `iov' and `xlio_info' are only valid until callback context + * is returned to the library. User should copy these structures for later use + * if working with zero copy logic. + */ +typedef xlio_recv_callback_retval_t (*xlio_recv_callback_t)(int fd, size_t sz_iov, + struct iovec iov[], + struct xlio_info_t *xlio_info, + void *context); + +/* + * XLIO Socket API main objects + */ + +typedef uintptr_t xlio_poll_group_t; +typedef uintptr_t xlio_socket_t; + +struct xlio_buf { + uint64_t userdata; +}; + +/* + * XLIO Socket API callbacks + */ + +/* + * Memory callback. + * + * XLIO calls the callback each time XLIO allocates a memory region which can be used for RX + * buffers. User can use this information to prepare the memory for some logic in the future. + * Zerocopy RX interface provides pointers to such memory. + * + * Current XLIO implementation does a single allocation for buffers. + */ +typedef void (*xlio_memory_cb_t)(void *addr, size_t len, size_t hugepage_size); + +/* + * Socket event callback. + * + * May be called from xlio_poll_group_poll() context. + * In the callback context, send operation is allowed only for the ESTABLISHED event. + * Argument value holds the error code for the ERROR event and 0 for other events. + * + * List of possible error code values: + * ECONNABORTED - connection aborted by local side + * ECONNRESET - connection reset by remote side + * ECONNREFUSED - connection refused by remote side during TCP handshake + * ETIMEDOUT - connection timed out due to keepalive, user timeout option or TCP handshake timeout + */ +enum { + /* TCP connection established. */ + XLIO_SOCKET_EVENT_ESTABLISHED = 1, + /* Socket terminated and no further events are possible. */ + XLIO_SOCKET_EVENT_TERMINATED, + /* Passive close. */ + XLIO_SOCKET_EVENT_CLOSED, + /* An error occurred, see the error code value. */ + XLIO_SOCKET_EVENT_ERROR, +}; +typedef void (*xlio_socket_event_cb_t)(xlio_socket_t, uintptr_t userdata_sq, int event, int value); + +/* + * Zerocopy completion event. 
+ * + * May be called from the following contexts: + * - xlio_poll_group_poll() - likely + * - xlio_socket_send() - can happen only if data is flushed + * - xlio_socket_flush() / xlio_poll_group_flush() + * - xlio_socket_destroy() + * + * In the callback context, send operation is allowed unless the socket is under destruction. + */ +typedef void (*xlio_socket_comp_cb_t)(xlio_socket_t, uintptr_t userdata_sq, uintptr_t userdata_op); + +/* + * RX callback. + * + * Returns TCP payload upon arrival. Each call returns a single contiguous buffer. The buffer points + * to memory within a block which is provided by the memory_cb() notification. + * + * xlio_buf is a descriptor of the buffer which must be returned to XLIO. During user ownership, + * they may use the uninitialized field in the structure. + */ +typedef void (*xlio_socket_rx_cb_t)(xlio_socket_t, uintptr_t userdata_sq, void *data, size_t len, + struct xlio_buf *buf); + +/* + * XLIO Socket API attribute structures + */ + +struct xlio_init_attr { + unsigned flags; + xlio_memory_cb_t memory_cb; +}; + +/* Sockets and rings will be protected with locks regardless of XLIO configuration. */ +#define XLIO_GROUP_FLAG_SAFE 0x1 +/* Group will keep dirty sockets to be flushed with xlio_poll_group_flush(). */ +#define XLIO_GROUP_FLAG_DIRTY 0x2 + +struct xlio_poll_group_attr { + unsigned flags; + + xlio_socket_event_cb_t socket_event_cb; + xlio_socket_comp_cb_t socket_comp_cb; + xlio_socket_rx_cb_t socket_rx_cb; +}; + +struct xlio_socket_attr { + unsigned flags; + int domain; /* AF_INET or AF_INET6 */ + xlio_poll_group_t group; + uintptr_t userdata_sq; +}; + +/* Flush socket after queueing the data. */ +#define XLIO_SOCKET_SEND_FLAG_FLUSH 0x1 +/* Copy user data to the internal buffers instead of taking ownership. */ +#define XLIO_SOCKET_SEND_FLAG_INLINE 0x2 + +struct xlio_socket_send_attr { + unsigned flags; + uint32_t mkey; + uintptr_t userdata_op; +}; + +#endif /* XLIO_TYPES_H */ From 4df427c77fe4b351634460392ca227d3fc1a098d Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 18 Mar 2024 21:10:25 +0200 Subject: [PATCH 134/169] issue: 3788369 Add external allocator to XLIO Socket API Allow user to take responsibility for XLIO buffers allocation. This can be useful if RX buffers needs to be placed to a special memory for further handling. 
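
A hedged application-side sketch of the new hooks (my_alloc, my_free and the 4096-byte alignment are assumptions for illustration, not part of the patch): the user passes matching allocate/release functions through xlio_init_attr, and XLIO then draws its buffer memory from them:

    #include <stdlib.h>
    #include <mellanox/xlio.h>

    static void *my_alloc(size_t size)
    {
        void *ptr = NULL;
        return posix_memalign(&ptr, 4096, size) == 0 ? ptr : NULL;
    }

    static void my_free(void *ptr) { free(ptr); }

    static int app_init_xlio(void)
    {
        struct xlio_init_attr attr = {0};
        attr.memory_alloc = my_alloc; /* XLIO buffer memory comes from this hook */
        attr.memory_free = my_free;   /* ...and is released through this one */
        return xlio_init_ex(&attr);
    }
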
Signed-off-by: Dmytro Podgornyi --- src/core/sock/sock-extra.cpp | 11 ++++++++++- src/core/xlio.h | 6 ++++-- src/core/xlio_types.h | 10 +++++++++- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index 1e15b96ae..bc6da6e04 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -390,10 +390,19 @@ extern "C" int xlio_init_ex(const struct xlio_init_attr *attr) setenv(SYS_VAR_TCP_ABORT_ON_CLOSE, "1", 1); } + xlio_init(); + extern xlio_memory_cb_t g_user_memory_cb; g_user_memory_cb = attr->memory_cb; - xlio_init(); + if (attr->memory_alloc) { + safe_mce_sys().m_ioctl.user_alloc.flags = IOCTL_USER_ALLOC_TX | IOCTL_USER_ALLOC_RX; + safe_mce_sys().m_ioctl.user_alloc.memalloc = attr->memory_alloc; + safe_mce_sys().m_ioctl.user_alloc.memfree = attr->memory_free; + safe_mce_sys().memory_limit_user = + std::max(safe_mce_sys().memory_limit_user, safe_mce_sys().memory_limit); + } + DO_GLOBAL_CTORS(); return 0; diff --git a/src/core/xlio.h b/src/core/xlio.h index f1964b3bc..8294986a3 100644 --- a/src/core/xlio.h +++ b/src/core/xlio.h @@ -399,8 +399,10 @@ int xlio_socketxtreme_free_buff(struct xlio_buff_t *buff); * xlio_init_ex() is not thread-safe operation, however, subsequent serialized calls exit * successfully without any action. * - * If set, memory_cb() notifies about memory blocks which zerocopy RX buffers can point to. - * Current implementation allocates a single memory block and does it within xlio_init_ex() context. + * If set, xlio_init_attr::memory_cb() notifies about memory blocks which are allocated to + * buffers. Each zerocopy RX buffer resides within one such memory block. + * If set, XLIO uses external allocator xlio_init_attr::memory_alloc() instead of the internal. + * Current implementation allocates a single memory block and does it in xlio_init_ex() context. */ int xlio_init_ex(const struct xlio_init_attr *attr); diff --git a/src/core/xlio_types.h b/src/core/xlio_types.h index b9cbf3b9c..3fa7d472c 100644 --- a/src/core/xlio_types.h +++ b/src/core/xlio_types.h @@ -359,7 +359,11 @@ struct xlio_buf { * buffers. User can use this information to prepare the memory for some logic in the future. * Zerocopy RX interface provides pointers to such memory. * - * Current XLIO implementation does a single allocation for buffers. + * Argument hugepage_size provides the page size if XLIO uses hugepages for the allocation. + * If hugepage_size is not zero, the both addr and len are aligned to the page size boundary. + * There is no alignment guarantee for regular pages and hugepage_size is zero in this case. + * In case of external user allocator, XLIO reports hugepage_size zero regardless of the underlying + * pages properties. */ typedef void (*xlio_memory_cb_t)(void *addr, size_t len, size_t hugepage_size); @@ -420,6 +424,10 @@ typedef void (*xlio_socket_rx_cb_t)(xlio_socket_t, uintptr_t userdata_sq, void * struct xlio_init_attr { unsigned flags; xlio_memory_cb_t memory_cb; + + /* Optional external user allocator for XLIO buffers. */ + void *(*memory_alloc)(size_t); + void (*memory_free)(void *); }; /* Sockets and rings will be protected with locks regardless of XLIO configuration. */ From f959cff5a61ff56dd55c6826582e620919ced662 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 18 Mar 2024 21:25:53 +0200 Subject: [PATCH 135/169] issue: 3788369 Add XLIO Socket API to the xlio_api_t pointers This allows to use XLIO Socket API with LD_PRELOAD approach via pointers. 
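
An illustrative LD_PRELOAD consumer sketch (assumptions: the api pointer is obtained through the existing extra-API retrieval helper, e.g. an xlio_get_api()-style function, and the capability mask field name follows the usual convention; neither is defined by this patch):

    struct xlio_api_t *api = xlio_get_api();                   /* assumed retrieval helper */
    if (api && (api->cap_mask & XLIO_EXTRA_API_XLIO_SOCKET)) { /* assumed mask field name */
        xlio_poll_group_t group;
        struct xlio_poll_group_attr gattr = {0};
        /* ... set gattr.socket_event_cb / socket_comp_cb / socket_rx_cb ... */
        if (api->xlio_poll_group_create(&gattr, &group) == 0) {
            api->xlio_poll_group_poll(group);
            api->xlio_poll_group_destroy(group);
        }
    }
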
Signed-off-by: Dmytro Podgornyi --- src/core/sock/sock-extra.cpp | 21 ++++++++++++++++++++- src/core/xlio_extra.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index bc6da6e04..7ecb2f533 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -371,6 +371,25 @@ struct xlio_api_t *extra_api() XLIO_EXTRA_API_SOCKETXTREME_FREE_XLIO_BUFF); SET_EXTRA_API(dump_fd_stats, xlio_dump_fd_stats, XLIO_EXTRA_API_DUMP_FD_STATS); SET_EXTRA_API(ioctl, xlio_extra_ioctl, XLIO_EXTRA_API_IOCTL); + + // XLIO Socket API. + SET_EXTRA_API(xlio_init_ex, xlio_init_ex, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_poll_group_create, xlio_poll_group_create, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_poll_group_destroy, xlio_poll_group_destroy, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_poll_group_poll, xlio_poll_group_poll, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_create, xlio_socket_create, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_destroy, xlio_socket_destroy, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_setsockopt, xlio_socket_setsockopt, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_bind, xlio_socket_bind, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_connect, xlio_socket_connect, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_get_pd, xlio_socket_get_pd, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_send, xlio_socket_send, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_sendv, xlio_socket_sendv, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_poll_group_flush, xlio_poll_group_flush, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_flush, xlio_socket_flush, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_socket_buf_free, xlio_socket_buf_free, XLIO_EXTRA_API_XLIO_SOCKET); + SET_EXTRA_API(xlio_poll_group_buf_free, xlio_poll_group_buf_free, + XLIO_EXTRA_API_XLIO_SOCKET); } return xlio_api; @@ -382,7 +401,7 @@ struct xlio_api_t *extra_api() extern "C" int xlio_init_ex(const struct xlio_init_attr *attr) { - // Set XLIO socket API specific parameter unless user sets them explicitly + // Set XLIO Socket API specific parameters unless user sets them explicitly if (!getenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL)) { setenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL, "0", 1); } diff --git a/src/core/xlio_extra.h b/src/core/xlio_extra.h index 4a3f5cad7..3bd15b022 100644 --- a/src/core/xlio_extra.h +++ b/src/core/xlio_extra.h @@ -42,6 +42,9 @@ /** Magic value for xlio_get_api (NVDAXLIO) */ #define XLIO_MAGIC_NUMBER (0x4f494c584144564eULL) +/* Forward declaration. */ +struct ibv_pd; + /** * XLIO Extended Socket API */ @@ -60,6 +63,7 @@ enum { XLIO_EXTRA_API_SOCKETXTREME_FREE_XLIO_BUFF = (1 << 10), XLIO_EXTRA_API_DUMP_FD_STATS = (1 << 11), XLIO_EXTRA_API_IOCTL = (1 << 12), + XLIO_EXTRA_API_XLIO_SOCKET = (1 << 13), }; struct __attribute__((packed)) xlio_api_t { @@ -312,6 +316,30 @@ struct __attribute__((packed)) xlio_api_t { * EOPNOTSUPP - socketXtreme was not enabled during configuration time. */ int (*socketxtreme_free_buff)(struct xlio_buff_t *buff); + + /** + * XLIO Socket API. 
+ */ + int (*xlio_init_ex)(const struct xlio_init_attr *attr); + int (*xlio_poll_group_create)(const struct xlio_poll_group_attr *attr, + xlio_poll_group_t *group_out); + int (*xlio_poll_group_destroy)(xlio_poll_group_t group); + void (*xlio_poll_group_poll)(xlio_poll_group_t group); + int (*xlio_socket_create)(const struct xlio_socket_attr *attr, xlio_socket_t *sock_out); + int (*xlio_socket_destroy)(xlio_socket_t sock); + int (*xlio_socket_setsockopt)(xlio_socket_t sock, int level, int optname, const void *optval, + socklen_t optlen); + int (*xlio_socket_bind)(xlio_socket_t sock, const struct sockaddr *addr, socklen_t addrlen); + int (*xlio_socket_connect)(xlio_socket_t sock, const struct sockaddr *to, socklen_t tolen); + struct ibv_pd *(*xlio_socket_get_pd)(xlio_socket_t sock); + int (*xlio_socket_send)(xlio_socket_t sock, const void *data, size_t len, + const struct xlio_socket_send_attr *attr); + int (*xlio_socket_sendv)(xlio_socket_t sock, const struct iovec *iov, unsigned iovcnt, + const struct xlio_socket_send_attr *attr); + void (*xlio_poll_group_flush)(xlio_poll_group_t group); + void (*xlio_socket_flush)(xlio_socket_t sock); + void (*xlio_socket_buf_free)(xlio_socket_t sock, struct xlio_buf *buf); + void (*xlio_poll_group_buf_free)(xlio_poll_group_t group, struct xlio_buf *buf); }; /** From 1baf57484fd3a6cae29592c20bb4cc5f5e5fbbe1 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 17 Mar 2024 14:03:28 +0200 Subject: [PATCH 136/169] issue: 3777348 Adding lock_spin_simple for smaller space utilization" While lock_spin requires 16 bytes, the lock_spin_simple requires 4 bytes. This allows squashing data path members into less cache lines. Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo.cpp | 4 ++-- src/core/sock/sockinfo.h | 6 ++---- src/utils/lock_wrapper.h | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index 1e01a5f38..a9ab53b8f 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -97,8 +97,8 @@ const char *sockinfo::setsockopt_so_opt_to_str(int opt) } sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) - : m_fd(fd) - , m_fd_context((void *)((uintptr_t)m_fd)) + : m_fd_context((void *)((uintptr_t)m_fd)) + , m_fd(fd) , m_rx_num_buffs_reuse(safe_mce_sys().rx_bufs_batch) , m_skip_cq_poll_in_rx(safe_mce_sys().skip_poll_in_rx == SKIP_POLL_IN_RX_ENABLE) , m_lock_rcv(MULTILOCK_RECURSIVE, MODULE_NAME "::m_lock_rcv") diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 3cb9f8e14..23d9e4290 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -486,7 +486,7 @@ class sockinfo { */ atomic_t m_zckey; - int m_fd; // identification information + lock_spin_simple m_error_queue_lock; // End of first cache line @@ -494,11 +494,8 @@ class sockinfo { * to provide notification ability. 
*/ descq_t m_error_queue; - lock_spin m_error_queue_lock; void *m_fd_context; // Context data stored with socket - // End of second cache line - rfs *m_rfs_ptr = nullptr; struct { /* Use std::deque in current design as far as it allows pushing @@ -521,6 +518,7 @@ class sockinfo { uint32_t m_epoll_event_flags = 0U; protected: + int m_fd; // identification information int m_rx_epfd; /** * list of pending ready packet on the Rx, diff --git a/src/utils/lock_wrapper.h b/src/utils/lock_wrapper.h index 1562106c7..d35f93528 100644 --- a/src/utils/lock_wrapper.h +++ b/src/utils/lock_wrapper.h @@ -225,6 +225,39 @@ class lock_spin : public lock_base { pthread_spinlock_t m_lock; }; +/** + * pthread spinlock + */ +/* coverity[missing_move_assignment] */ +class lock_spin_simple { +public: + lock_spin_simple() { pthread_spin_init(&m_lock, 0); }; + ~lock_spin_simple() { pthread_spin_destroy(&m_lock); }; + inline int lock() + { + DEFINED_NO_THREAD_LOCK_RETURN_0 + return pthread_spin_lock(&m_lock); + }; + inline int trylock() + { + DEFINED_NO_THREAD_LOCK_RETURN_0 + return pthread_spin_trylock(&m_lock); + }; + inline int unlock() + { + DEFINED_NO_THREAD_LOCK_RETURN_0 + return pthread_spin_unlock(&m_lock); + }; + inline int is_locked_by_me() + { + assert(!"lock_spin_simple::is_locked_by_me is unsupported"); + return 0; // Unsupported + } + +protected: + pthread_spinlock_t m_lock; +}; + /** * pthread spinlock */ From 6ab22faae365d31b8cae5053b7460b01b55d7870 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Sun, 17 Mar 2024 14:24:08 +0200 Subject: [PATCH 137/169] issue: 3777348 Adding template cached_obj_pool Adding a class to act as a pool of different objects. Other objects that use this pool are introduced next. Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 3 +- src/core/dev/ring.cpp | 77 +++++++------ src/core/dev/ring.h | 6 +- src/core/main.cpp | 6 +- src/core/sock/sockinfo_tcp.cpp | 11 +- src/core/sock/tcp_seg_pool.cpp | 164 --------------------------- src/core/sock/tcp_seg_pool.h | 68 ----------- src/core/util/cached_obj_pool.h | 192 ++++++++++++++++++++++++++++++++ 8 files changed, 253 insertions(+), 274 deletions(-) delete mode 100644 src/core/sock/tcp_seg_pool.cpp delete mode 100644 src/core/sock/tcp_seg_pool.h create mode 100644 src/core/util/cached_obj_pool.h diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 09d9e6c27..996ac4a99 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -152,7 +152,6 @@ libxlio_la_SOURCES := \ sock/sockinfo_udp.cpp \ sock/sockinfo_ulp.cpp \ sock/sockinfo_tcp.cpp \ - sock/tcp_seg_pool.cpp \ sock/fd_collection.cpp \ sock/sock-redirect.cpp \ sock/sock-app.cpp \ @@ -281,7 +280,6 @@ libxlio_la_SOURCES := \ sock/sockinfo_tcp.h \ sock/sockinfo_udp.h \ sock/sockinfo_ulp.h \ - sock/tcp_seg_pool.h \ sock/sock-redirect.h \ sock/sock-redirect-internal.h \ sock/sock-app.h \ @@ -295,6 +293,7 @@ libxlio_la_SOURCES := \ util/instrumentation.h \ util/libxlio.h \ util/list.h \ + util/cached_obj_pool.h \ util/sg_array.h \ util/ip_address.h \ util/sock_addr.h \ diff --git a/src/core/dev/ring.cpp b/src/core/dev/ring.cpp index 00a86e634..cf8124434 100644 --- a/src/core/dev/ring.cpp +++ b/src/core/dev/ring.cpp @@ -33,13 +33,14 @@ #include "ring.h" #include "event/poll_group.h" #include "proto/route_table_mgr.h" -#include "sock/tcp_seg_pool.h" #undef MODULE_NAME #define MODULE_NAME "ring" #undef MODULE_HDR #define MODULE_HDR MODULE_NAME "%d:%s() " +tcp_seg_pool *g_tcp_seg_pool = nullptr; + ring::ring() : m_p_group(nullptr) , 
m_p_n_rx_channel_fds(nullptr) @@ -57,29 +58,28 @@ ring::~ring() m_p_group->del_ring(this); } if (m_tcp_seg_list) { - g_tcp_seg_pool->put_tcp_segs(m_tcp_seg_list); + g_tcp_seg_pool->put_objs(m_tcp_seg_list); } } -// Assumed num > 0. -tcp_seg *ring::get_tcp_segs(uint32_t num) +template +static inline T *get_obj_list(cached_obj_pool *obj_pool, uint32_t num, T *&obj_list_from, + uint32_t &obj_count, uint32_t batch_size) { - std::lock_guard lock(m_tcp_seg_lock); - - if (unlikely(num > m_tcp_seg_count)) { - uint32_t getsize = std::max(safe_mce_sys().tx_segs_ring_batch_tcp, num - m_tcp_seg_count); - auto seg_list = g_tcp_seg_pool->get_tcp_seg_list(getsize); - if (!seg_list.first) { + if (unlikely(num > obj_count)) { + uint32_t getsize = std::max(batch_size, num - obj_count); + auto obj_list = obj_pool->get_obj_list(getsize); + if (!obj_list.first) { return nullptr; } - seg_list.second->next = m_tcp_seg_list; - m_tcp_seg_list = seg_list.first; - m_tcp_seg_count += getsize; + obj_list.second->next = obj_list_from; + obj_list_from = obj_list.first; + obj_count += getsize; } - tcp_seg *head = m_tcp_seg_list; - tcp_seg *last = head; - m_tcp_seg_count -= num; + T *head = obj_list_from; + T *last = head; + obj_count -= num; // For non-batching, improves branch prediction. For batching, we do not get here often. if (unlikely(num > 1U)) { @@ -88,37 +88,52 @@ tcp_seg *ring::get_tcp_segs(uint32_t num) } } - m_tcp_seg_list = last->next; + obj_list_from = last->next; last->next = nullptr; return head; } -// Assumed seg is not nullptr -void ring::put_tcp_segs(tcp_seg *seg) +// Assumed num > 0. +tcp_seg *ring::get_tcp_segs(uint32_t num) { - static const uint32_t return_treshold = safe_mce_sys().tx_segs_ring_batch_tcp * 2U; - std::lock_guard lock(m_tcp_seg_lock); - tcp_seg *seg_temp = m_tcp_seg_list; - m_tcp_seg_list = seg; + return get_obj_list(g_tcp_seg_pool, num, m_tcp_seg_list, m_tcp_seg_count, + safe_mce_sys().tx_segs_ring_batch_tcp); +} + +template +static inline void put_obj_list(cached_obj_pool *obj_pool, T *&obj_list_to, T *&obj_list_from, + uint32_t &obj_count, uint32_t return_treshold) +{ + T *obj_temp = obj_list_to; + obj_list_to = obj_list_from; // For non-batching, improves branch prediction. For batching, we do not get here often. - if (unlikely(seg->next)) { - while (likely(seg->next)) { - seg = seg->next; - ++m_tcp_seg_count; // Count all except the first. + if (unlikely(obj_list_from->next)) { + while (likely(obj_list_from->next)) { + obj_list_from = obj_list_from->next; + ++obj_count; // Count all except the first. 
} } - seg->next = seg_temp; - if (unlikely(++m_tcp_seg_count > return_treshold)) { - g_tcp_seg_pool->put_tcp_segs( - tcp_seg_pool::split_tcp_segs(m_tcp_seg_count / 2, m_tcp_seg_list, m_tcp_seg_count)); + obj_list_from->next = obj_temp; + if (unlikely(++obj_count > return_treshold)) { + obj_pool->put_objs(obj_pool->split_obj_list(obj_count / 2, obj_list_to, obj_count)); } } +// Assumed seg is not nullptr +void ring::put_tcp_segs(tcp_seg *seg) +{ + static const uint32_t return_treshold = safe_mce_sys().tx_segs_ring_batch_tcp * 2U; + + std::lock_guard lock(m_tcp_seg_lock); + + put_obj_list(g_tcp_seg_pool, m_tcp_seg_list, seg, m_tcp_seg_count, return_treshold); +} + void ring::print_val() { ring_logdbg("%d: %p: parent %p", m_if_index, this, diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index 737c43247..852092c46 100644 --- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -37,10 +37,11 @@ #include "ib/base/verbs_extra.h" #include "dev/buffer_pool.h" #include "dev/xlio_ti.h" -#include "sock/tcp_seg_pool.h" #include "proto/flow_tuple.h" #include "proto/xlio_lwip.h" #include "proto/L2_address.h" +#include "util/cached_obj_pool.h" +#include "lwip/tcp_impl.h" /* Forward declarations */ struct xlio_tls_info; @@ -75,6 +76,9 @@ struct ring_ec { } }; +typedef cached_obj_pool tcp_seg_pool; +extern tcp_seg_pool *g_tcp_seg_pool; + class ring { public: ring(); diff --git a/src/core/main.cpp b/src/core/main.cpp index 2bb7cfc5e..38f5d3b01 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -71,7 +71,6 @@ #include "sock/fd_collection.h" #include "sock/sockinfo_tcp.h" #include "sock/sockinfo_udp.h" -#include "sock/tcp_seg_pool.h" #include "sock/bind_no_port.h" #include "iomux/io_mux_call.h" @@ -1100,7 +1099,10 @@ static void do_global_ctors_helper() NEW_CTOR(g_buffer_pool_zc, buffer_pool(BUFFER_POOL_TX, 0)); - NEW_CTOR(g_tcp_seg_pool, tcp_seg_pool()); + NEW_CTOR(g_tcp_seg_pool, + tcp_seg_pool("TCP segments", safe_mce_sys().tx_segs_pool_batch_tcp, + g_global_stat_static.n_tcp_seg_pool_size, + g_global_stat_static.n_tcp_seg_pool_no_segs)); // For delegated TCP timers the global collection is not used. if (safe_mce_sys().tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 140cd7f0b..838841479 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -55,7 +55,6 @@ #include "sock-redirect.h" #include "fd_collection.h" #include "sockinfo_tcp.h" -#include "tcp_seg_pool.h" #include "bind_no_port.h" #include "xlio.h" @@ -555,7 +554,7 @@ sockinfo_tcp::~sockinfo_tcp() si_tcp_logwarn("still %d tcp segs in use!", m_tcp_seg_in_use); } if (m_tcp_seg_list) { - g_tcp_seg_pool->put_tcp_segs(m_tcp_seg_list); + g_tcp_seg_pool->put_objs(m_tcp_seg_list); } while (!m_socket_options_list.empty()) { @@ -5889,12 +5888,12 @@ void sockinfo_tcp::tcp_seg_free_cached(void *p_conn, struct tcp_seg *seg) void sockinfo_tcp::return_tcp_segs(struct tcp_seg *seg) { - (likely(m_p_rx_ring)) ? m_p_rx_ring->put_tcp_segs(seg) : g_tcp_seg_pool->put_tcp_segs(seg); + (likely(m_p_rx_ring)) ? m_p_rx_ring->put_tcp_segs(seg) : g_tcp_seg_pool->put_objs(seg); } struct tcp_seg *sockinfo_tcp::get_tcp_seg_direct() { - return likely(m_p_rx_ring) ? m_p_rx_ring->get_tcp_segs(1U) : g_tcp_seg_pool->get_tcp_segs(1U); + return likely(m_p_rx_ring) ? 
m_p_rx_ring->get_tcp_segs(1U) : g_tcp_seg_pool->get_objs(1U); } struct tcp_seg *sockinfo_tcp::get_tcp_seg_cached() @@ -5902,7 +5901,7 @@ struct tcp_seg *sockinfo_tcp::get_tcp_seg_cached() if (!m_tcp_seg_list) { m_tcp_seg_list = (likely(m_p_rx_ring)) ? m_p_rx_ring->get_tcp_segs(m_sysvar_tx_segs_batch_tcp) - : g_tcp_seg_pool->get_tcp_segs(m_sysvar_tx_segs_batch_tcp); + : g_tcp_seg_pool->get_objs(m_sysvar_tx_segs_batch_tcp); if (unlikely(!m_tcp_seg_list)) { return nullptr; @@ -5936,7 +5935,7 @@ void sockinfo_tcp::put_tcp_seg_cached(struct tcp_seg *seg) --m_tcp_seg_in_use; if (m_tcp_seg_count > 2U * m_sysvar_tx_segs_batch_tcp && m_tcp_seg_in_use < m_tcp_seg_count / 2U) { - return_tcp_segs(tcp_seg_pool::split_tcp_segs((m_tcp_seg_count - m_tcp_seg_in_use) / 2U, + return_tcp_segs(tcp_seg_pool::split_obj_list((m_tcp_seg_count - m_tcp_seg_in_use) / 2U, m_tcp_seg_list, m_tcp_seg_count)); } } diff --git a/src/core/sock/tcp_seg_pool.cpp b/src/core/sock/tcp_seg_pool.cpp deleted file mode 100644 index 05c54cfd8..000000000 --- a/src/core/sock/tcp_seg_pool.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
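An illustrative aside (not part of the patch): the per-socket segment cache handled above gives segments back only when it holds more than twice the refill batch and less than half of them are in use, and then it sheds half of the idle surplus; this hysteresis avoids bouncing chains between socket and ring on every call. Below is a standalone numeric sketch of that policy; segs_to_return and the batch value 64 are invented for the example.

#include <cstdint>
#include <cstdio>

// Mirror of the trimming condition: shed half of the idle surplus once the cache is
// both large (more than 2x the batch) and mostly idle (less than half in use).
static uint32_t segs_to_return(uint32_t cached, uint32_t in_use, uint32_t batch)
{
    if (cached > 2U * batch && in_use < cached / 2U) {
        return (cached - in_use) / 2U;
    }
    return 0U;
}

int main()
{
    std::printf("%u\n", segs_to_return(200U, 10U, 64U)); // 95 -> shrink the cache
    std::printf("%u\n", segs_to_return(100U, 80U, 64U)); // 0  -> below the 2x batch bar
    return 0;
}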
- */ - -#include "tcp_seg_pool.h" -#include "core/util/utils.h" -#include "vlogger/vlogger.h" - -#define MODULE_NAME "tcp_seg_pool" - -extern global_stats_t g_global_stat_static; - -tcp_seg_pool *g_tcp_seg_pool = nullptr; - -tcp_seg_pool::tcp_seg_pool() - : m_p_head(nullptr) - , m_allocator(false) -{ - memset(&m_stats, 0, sizeof(m_stats)); - expand(); -} - -tcp_seg_pool::~tcp_seg_pool() -{ - print_report(); -} - -tcp_seg *tcp_seg_pool::get_tcp_segs(uint32_t amount) -{ - return get_tcp_seg_list(amount).first; -} - -std::pair tcp_seg_pool::get_tcp_seg_list(uint32_t amount) -{ - uint32_t count; - tcp_seg *head, *next, *prev; - if (unlikely(amount <= 0)) { - return std::make_pair(nullptr, nullptr); - } - lock(); -repeat: - count = amount; - head = next = m_p_head; - prev = nullptr; - while (count > 0 && next) { - prev = next; - next = next->next; - count--; - } - if (count) { - // run out of segments - if (expand()) { - goto repeat; - } - g_global_stat_static.n_tcp_seg_pool_no_segs++; - unlock(); - return std::make_pair(nullptr, nullptr); - } - prev->next = nullptr; - m_p_head = next; - m_stats.allocations++; - g_global_stat_static.n_tcp_seg_pool_size -= amount; - unlock(); - - return std::make_pair(head, prev); -} - -void tcp_seg_pool::put_tcp_segs(tcp_seg *seg_list) -{ - tcp_seg *next = seg_list; - if (unlikely(!seg_list)) { - return; - } - - int i; - for (i = 1; next->next; i++) { - next = next->next; - } - - lock(); - next->next = m_p_head; - m_p_head = seg_list; - g_global_stat_static.n_tcp_seg_pool_size += i; - unlock(); -} - -// Splitting seg list such that first 'count' segs are returned and 'tcp_seg_list' -// is updated to point to the remaining segs. -// The length of tcp_seg_list is assumed to be at least 'count' long. -tcp_seg *tcp_seg_pool::split_tcp_segs(uint32_t count, tcp_seg *&tcp_seg_list, uint32_t &total_count) -{ - struct tcp_seg *head = tcp_seg_list; - struct tcp_seg *last = head; - total_count -= count; - while (count-- > 1U) { - last = last->next; - } - - tcp_seg_list = last->next; - last->next = nullptr; - return head; -} - -bool tcp_seg_pool::expand() -{ - size_t size = sizeof(tcp_seg) * safe_mce_sys().tx_segs_pool_batch_tcp; - tcp_seg *tcp_segs_array = (tcp_seg *)m_allocator.alloc(size); - - if (!tcp_segs_array) { - __log_dbg("TCP segments allocation failed"); - return false; - } - - // Allocator can allocate more memory than requested - utilize it. - size_t segs_nr = size / sizeof(tcp_seg); - - if (segs_nr > 0) { - memset(tcp_segs_array, 0, size); - for (size_t i = 0; i < segs_nr - 1; i++) { - tcp_segs_array[i].next = &tcp_segs_array[i + 1]; - } - tcp_segs_array[segs_nr - 1].next = m_p_head; - m_p_head = &tcp_segs_array[0]; - m_stats.total_segs += segs_nr; - m_stats.expands++; - g_global_stat_static.n_tcp_seg_pool_size += segs_nr; - } - return true; -} - -void tcp_seg_pool::print_report(vlog_levels_t log_level /*=VLOG_DEBUG*/) -{ - vlog_printf(log_level, "TCP segments pool statistics:\n"); - vlog_printf(log_level, " allocations=%u expands=%u total_segs=%u\n", m_stats.allocations, - m_stats.expands, m_stats.total_segs); -} diff --git a/src/core/sock/tcp_seg_pool.h b/src/core/sock/tcp_seg_pool.h deleted file mode 100644 index cfe355467..000000000 --- a/src/core/sock/tcp_seg_pool.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef TCP_SEG_POOL_H -#define TCP_SEG_POOL_H - -#include -#include "dev/allocator.h" -#include "utils/lock_wrapper.h" -#include "lwip/tcp_impl.h" - -class tcp_seg_pool : lock_spin { -public: - tcp_seg_pool(); - ~tcp_seg_pool() override; - - std::pair get_tcp_seg_list(uint32_t amount); - tcp_seg *get_tcp_segs(uint32_t amount); - void put_tcp_segs(tcp_seg *seg_list); - - static tcp_seg *split_tcp_segs(uint32_t count, tcp_seg *&tcp_seg_list, uint32_t &total_count); - -private: - bool expand(); - void print_report(vlog_levels_t log_level = VLOG_DEBUG); - - tcp_seg *m_p_head; - xlio_allocator_heap m_allocator; - - struct { - unsigned total_segs; - unsigned allocations; - unsigned expands; - } m_stats; -}; - -extern tcp_seg_pool *g_tcp_seg_pool; - -#endif diff --git a/src/core/util/cached_obj_pool.h b/src/core/util/cached_obj_pool.h new file mode 100644 index 000000000..ea197c17a --- /dev/null +++ b/src/core/util/cached_obj_pool.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CACHED_OBJ_POOL_H +#define CACHED_OBJ_POOL_H + +#include +#include "dev/allocator.h" +#include "utils/lock_wrapper.h" + +template class cached_obj_pool : lock_spin { +public: + cached_obj_pool(const char *pool_name, size_t alloc_batch, uint32_t &global_obj_pool_size_ref, + uint32_t &global_obj_pool_no_objs_ref); + ~cached_obj_pool() override; + + std::pair get_obj_list(uint32_t amount); + T *get_objs(uint32_t amount); + void put_objs(T *obj_list); + + static T *split_obj_list(uint32_t count, T *&obj_list, uint32_t &total_count); + +protected: + bool expand(); + + T *m_p_head = nullptr; + xlio_allocator_heap m_allocator; + + struct { + unsigned total_objs; + unsigned allocations; + unsigned expands; + uint32_t &global_obj_pool_size; + uint32_t &global_obj_pool_no_objs; + } m_stats; + + const size_t m_alloc_batch; + const char *m_pool_name; +}; + +template +cached_obj_pool::cached_obj_pool(const char *pool_name, size_t alloc_batch, + uint32_t &global_obj_pool_size_ref, + uint32_t &global_obj_pool_no_objs_ref) + : m_allocator(false) + , m_stats {0U, 0U, 0U, global_obj_pool_size_ref, global_obj_pool_no_objs_ref} + , m_alloc_batch(alloc_batch) + , m_pool_name(pool_name) +{ + expand(); +} + +template cached_obj_pool::~cached_obj_pool() +{ + vlog_printf(VLOG_DEBUG, "%s pool statistics:\n", m_pool_name); + vlog_printf(VLOG_DEBUG, " allocations=%u expands=%u total_segs=%u\n", m_stats.allocations, + m_stats.expands, m_stats.total_objs); +} + +template T *cached_obj_pool::get_objs(uint32_t amount) +{ + return get_obj_list(amount).first; +} + +template std::pair cached_obj_pool::get_obj_list(uint32_t amount) +{ + uint32_t count; + T *head, *next, *prev; + if (unlikely(amount <= 0)) { + return std::make_pair(nullptr, nullptr); + } + lock(); +repeat: + count = amount; + head = next = m_p_head; + prev = nullptr; + while (count > 0 && next) { + prev = next; + next = next->next; + count--; + } + if (count) { + // Ran out of objects + if (expand()) { + goto repeat; + } + m_stats.global_obj_pool_no_objs++; + unlock(); + return std::make_pair(nullptr, nullptr); + } + prev->next = nullptr; + m_p_head = next; + m_stats.allocations++; + m_stats.global_obj_pool_size -= amount; + unlock(); + + return std::make_pair(head, prev); +} + +template void cached_obj_pool::put_objs(T *obj_list) +{ + if (unlikely(!obj_list)) { + return; + } + + T *next = obj_list; + int i; + for (i = 1; next->next; i++) { + next = next->next; + } + + lock(); + next->next = m_p_head; + m_p_head = obj_list; + m_stats.global_obj_pool_size += i; + unlock(); +} + +// Splitting obj list such that first 'count' objs are returned and 'obj_list' +// is updated to point to the remaining objs. +// The length of obj_list is assumed to be at least 'count' long. 
+template +T *cached_obj_pool::split_obj_list(uint32_t count, T *&obj_list, uint32_t &total_count) +{ + T *head = obj_list; + T *last = head; + total_count -= count; + while (count-- > 1U) { + last = last->next; + } + + obj_list = last->next; + last->next = nullptr; + return head; +} + +template bool cached_obj_pool::expand() +{ + size_t size = sizeof(T) * m_alloc_batch; + T *objs_array = (T *)m_allocator.alloc(size); + if (!objs_array) { + vlog_printf(VLOG_DEBUG, "Cached pool failed to allocate objects (%s)", m_pool_name); + return false; + } + + // Allocator can allocate more memory than requested - utilize it. + size_t objs_nr = size / sizeof(T); + + if (objs_nr > 0) { + memset(objs_array, 0, size); + for (size_t i = 0; i < objs_nr - 1; i++) { + objs_array[i].next = &objs_array[i + 1]; + } + objs_array[objs_nr - 1].next = m_p_head; + m_p_head = &objs_array[0]; + m_stats.total_objs += objs_nr; + m_stats.expands++; + m_stats.global_obj_pool_size += objs_nr; + } + return true; +} + +#endif From 347c36269bf98ceef50d0f549e72e8d4be1dbf75 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Thu, 29 Feb 2024 10:55:20 +0000 Subject: [PATCH 138/169] issue: 3777348 Socketxtreme completions ring pool Introduce ring pool of ring_ec objects which are used for Socketxtreme completion. Using such a pool decreases significantly the number of ring_ec obejcts needed. This, in turn, improves cache utilization. Signed-off-by: Alexander Grissik --- src/core/dev/ring.cpp | 127 ++++++++++++++++++++++++++-- src/core/dev/ring.h | 53 +++++++----- src/core/dev/ring_bond.h | 4 - src/core/dev/ring_simple.cpp | 21 +---- src/core/dev/ring_simple.h | 41 --------- src/core/dev/ring_tap.h | 4 - src/core/main.cpp | 11 +++ src/core/sock/sockinfo.cpp | 76 ++++++++++------- src/core/sock/sockinfo.h | 125 ++++++++++++++------------- src/core/sock/sockinfo_tcp.cpp | 84 ++++++------------ src/core/sock/sockinfo_tcp.h | 2 +- src/core/sock/sockinfo_udp.cpp | 14 ++- src/core/util/cached_obj_pool.h | 2 +- tests/gtest/extra_api/extra_poll.cc | 32 +++++-- 14 files changed, 338 insertions(+), 258 deletions(-) diff --git a/src/core/dev/ring.cpp b/src/core/dev/ring.cpp index cf8124434..7f89228fd 100644 --- a/src/core/dev/ring.cpp +++ b/src/core/dev/ring.cpp @@ -33,6 +33,7 @@ #include "ring.h" #include "event/poll_group.h" #include "proto/route_table_mgr.h" +#include "sock/sockinfo.h" #undef MODULE_NAME #define MODULE_NAME "ring" @@ -40,15 +41,10 @@ #define MODULE_HDR MODULE_NAME "%d:%s() " tcp_seg_pool *g_tcp_seg_pool = nullptr; +socketxtreme_ec_pool *g_socketxtreme_ec_pool = nullptr; ring::ring() - : m_p_group(nullptr) - , m_p_n_rx_channel_fds(nullptr) - , m_parent(nullptr) - , m_tcp_seg_list(nullptr) - , m_tcp_seg_count(0U) { - m_if_index = 0; print_val(); } @@ -60,6 +56,10 @@ ring::~ring() if (m_tcp_seg_list) { g_tcp_seg_pool->put_objs(m_tcp_seg_list); } + + if (m_socketxtreme_ec_list) { + g_socketxtreme_ec_pool->put_objs(m_socketxtreme_ec_list); + } } template @@ -103,6 +103,15 @@ tcp_seg *ring::get_tcp_segs(uint32_t num) safe_mce_sys().tx_segs_ring_batch_tcp); } +// Assumed num > 0. 
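An illustrative aside (not part of the patch): the sketch below strips the new cached_obj_pool down to its core idea so the batched API is easier to follow -- objects are chained through an intrusive next pointer and handed out or returned as {head, tail} chains. node_t and simple_pool are invented names; locking, the xlio_allocator_heap backing, on-demand expansion and statistics are deliberately omitted.

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

struct node_t {
    node_t *next = nullptr;
    uint64_t payload = 0;
};

class simple_pool {
public:
    explicit simple_pool(size_t prealloc) : m_storage(prealloc)
    {
        for (size_t i = 0; i + 1 < prealloc; ++i) {
            m_storage[i].next = &m_storage[i + 1];
        }
        m_head = prealloc ? &m_storage[0] : nullptr;
    }

    // Detach 'amount' nodes (amount > 0); returns {head, tail}, or two nullptrs if the
    // pool runs short -- the real pool would try to expand at that point.
    std::pair<node_t *, node_t *> get_list(uint32_t amount)
    {
        node_t *head = m_head;
        node_t *prev = nullptr;
        node_t *cur = m_head;
        while (amount && cur) {
            prev = cur;
            cur = cur->next;
            --amount;
        }
        if (amount) {
            return {nullptr, nullptr};
        }
        prev->next = nullptr;
        m_head = cur;
        return {head, prev};
    }

    // Return a whole chain; cost is one walk to find its tail.
    void put_list(node_t *list)
    {
        node_t *tail = list;
        while (tail->next) {
            tail = tail->next;
        }
        tail->next = m_head;
        m_head = list;
    }

private:
    std::vector<node_t> m_storage;
    node_t *m_head = nullptr;
};

int main()
{
    simple_pool pool(64);
    auto batch = pool.get_list(8); // e.g. a per-ring cache refilling from the global pool
    if (batch.first) {
        pool.put_list(batch.first); // hand the chain back when the cache overflows
    }
    return 0;
}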
+ring_ec *ring::socketxtreme_get_ecs(uint32_t num) +{ + std::lock_guard lock(m_ec_lock); + + return get_obj_list(g_socketxtreme_ec_pool, num, m_socketxtreme_ec_list, + m_socketxtreme_ec_count, 256U); +} + template static inline void put_obj_list(cached_obj_pool *obj_pool, T *&obj_list_to, T *&obj_list_from, uint32_t &obj_count, uint32_t return_treshold) @@ -134,6 +143,112 @@ void ring::put_tcp_segs(tcp_seg *seg) put_obj_list(g_tcp_seg_pool, m_tcp_seg_list, seg, m_tcp_seg_count, return_treshold); } +// Assumed ec is not nullptr +void ring::socketxtreme_put_ecs(ring_ec *ec) +{ + static const uint32_t return_treshold = 256 * 2U; + + std::lock_guard lock(m_ec_lock); + + put_obj_list(g_socketxtreme_ec_pool, m_socketxtreme_ec_list, ec, m_socketxtreme_ec_count, + return_treshold); +} + +void ring::socketxtreme_ec_sock_list_add(sockinfo *sock) +{ + sock->set_ec_ring_list_next(nullptr); + if (likely(m_socketxtreme.ec_sock_list_end)) { + m_socketxtreme.ec_sock_list_end->set_ec_ring_list_next(sock); + m_socketxtreme.ec_sock_list_end = sock; + } else { + m_socketxtreme.ec_sock_list_end = m_socketxtreme.ec_sock_list_start = sock; + } +} + +xlio_socketxtreme_completion_t &ring::socketxtreme_start_ec_operation(sockinfo *sock, + bool always_new) +{ + m_socketxtreme.lock_ec_list.lock(); + if (likely(!sock->get_last_ec())) { + socketxtreme_ec_sock_list_add(sock); + always_new = true; + } + + if (always_new) { + sock->add_ec(socketxtreme_get_ecs(1U)); + } + + return sock->get_last_ec()->completion; +} + +void ring::socketxtreme_end_ec_operation() +{ + m_socketxtreme.lock_ec_list.unlock(); +} + +bool ring::socketxtreme_ec_pop_completion(xlio_socketxtreme_completion_t *completion) +{ + struct ring_ec *ec = nullptr; + + m_socketxtreme.lock_ec_list.lock(); + if (m_socketxtreme.ec_sock_list_start) { + ec = m_socketxtreme.ec_sock_list_start->pop_next_ec(); + + ring_logfunc( + "tid: %d completion %p: events:%lu, ud:%lu, b:%p, %p\n", gettid(), ec, + ec->completion.events, ec->completion.user_data, ec->completion.packet.buff_lst, + ec->completion.packet.buff_lst ? ec->completion.packet.buff_lst->next : nullptr); + + memcpy(completion, &ec->completion, sizeof(ec->completion)); + ec->next = nullptr; + socketxtreme_put_ecs(ec); + if (!m_socketxtreme.ec_sock_list_start + ->has_next_ec()) { // Last ec of the socket was popped. + // Remove socket from ready list. 
+ sockinfo *temp = m_socketxtreme.ec_sock_list_start; + m_socketxtreme.ec_sock_list_start = temp->get_ec_ring_list_next(); + if (!m_socketxtreme.ec_sock_list_start) { + m_socketxtreme.ec_sock_list_end = nullptr; + } + temp->set_ec_ring_list_next(nullptr); + } + } + m_socketxtreme.lock_ec_list.unlock(); + return (ec != nullptr); +} + +void ring::socketxtreme_ec_clear_sock(sockinfo *sock) +{ + m_socketxtreme.lock_ec_list.lock(); + + ring_ec *ecs = sock->clear_ecs(); + if (ecs) { + socketxtreme_put_ecs(ecs); + sockinfo *temp = m_socketxtreme.ec_sock_list_start; + sockinfo *prev = nullptr; + while (temp && temp != sock) { + prev = temp; + temp = temp->get_ec_ring_list_next(); + } + + if (prev) { + prev->set_ec_ring_list_next(sock->get_ec_ring_list_next()); + } + + if (sock == m_socketxtreme.ec_sock_list_start) { + m_socketxtreme.ec_sock_list_start = sock->get_ec_ring_list_next(); + } + + if (sock == m_socketxtreme.ec_sock_list_end) { + m_socketxtreme.ec_sock_list_end = prev; + } + + sock->set_ec_ring_list_next(nullptr); + } + + m_socketxtreme.lock_ec_list.unlock(); +} + void ring::print_val() { ring_logdbg("%d: %p: parent %p", m_if_index, this, diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index 852092c46..859262fd2 100644 --- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -62,22 +62,17 @@ typedef enum { CQT_RX, CQT_TX } cq_type_t; typedef size_t ring_user_id_t; -/* Ring event completion */ +// Socketxtreme completion struct ring_ec { - struct list_head list; struct xlio_socketxtreme_completion_t completion; - struct xlio_buff_t *last_buff_lst; - - inline void clear() - { - INIT_LIST_HEAD(&list); - memset(&completion, 0, sizeof(completion)); - last_buff_lst = nullptr; - } + ring_ec *next; }; typedef cached_obj_pool tcp_seg_pool; +typedef cached_obj_pool socketxtreme_ec_pool; + extern tcp_seg_pool *g_tcp_seg_pool; +extern socketxtreme_ec_pool *g_socketxtreme_ec_pool; class ring { public: @@ -151,10 +146,6 @@ class ring { virtual int socketxtreme_poll(struct xlio_socketxtreme_completion_t *xlio_completions, unsigned int ncompletions, int flags) = 0; - virtual bool is_socketxtreme(void) = 0; - virtual void put_ec(struct ring_ec *ec) = 0; - virtual void del_ec(struct ring_ec *ec) = 0; - inline int get_if_index() { return m_if_index; } #ifdef DEFINED_UTLS @@ -270,19 +261,41 @@ class ring { void set_group(poll_group *grp) { m_p_group = grp; } poll_group *get_group() const { return m_p_group; } + ring_ec *socketxtreme_get_ecs(uint32_t num); + void socketxtreme_put_ecs(struct ring_ec *ec); + + void socketxtreme_ec_clear_sock(sockinfo *sock); + void socketxtreme_ec_sock_list_add(sockinfo *sock); + bool socketxtreme_ec_pop_completion(xlio_socketxtreme_completion_t *completion); + void socketxtreme_end_ec_operation(); + xlio_socketxtreme_completion_t &socketxtreme_start_ec_operation(sockinfo *sock, + bool always_new); + protected: inline void set_parent(ring *parent) { m_parent = (parent ? parent : this); } inline void set_if_index(int if_index) { m_if_index = if_index; } - poll_group *m_p_group; - int *m_p_n_rx_channel_fds; - ring *m_parent; + poll_group *m_p_group = nullptr; + int *m_p_n_rx_channel_fds = nullptr; + ring *m_parent = nullptr; - struct tcp_seg *m_tcp_seg_list; - uint32_t m_tcp_seg_count; + struct tcp_seg *m_tcp_seg_list = nullptr; + ring_ec *m_socketxtreme_ec_list = nullptr; + uint32_t m_tcp_seg_count = 0U; + uint32_t m_socketxtreme_ec_count = 0U; lock_spin_recursive m_tcp_seg_lock; + lock_spin_recursive m_ec_lock; + + struct { + // Queue of ready sockets. 
Each socket can be added only once to this queue. + sockinfo *ec_sock_list_start = nullptr; + sockinfo *ec_sock_list_end = nullptr; + + // Thread-safety lock for get/put operations under the queue. + lock_spin lock_ec_list; + } m_socketxtreme; - int m_if_index; /* Interface index */ + int m_if_index = 0; /* Interface index */ }; #endif /* RING_H */ diff --git a/src/core/dev/ring_bond.h b/src/core/dev/ring_bond.h index 6c663b8b0..888efe901 100644 --- a/src/core/dev/ring_bond.h +++ b/src/core/dev/ring_bond.h @@ -131,10 +131,6 @@ class ring_bond : public ring { int devide_buffers_helper(mem_buf_desc_t *p_mem_buf_desc_list, mem_buf_desc_t **buffer_per_ring); - bool is_socketxtreme(void) { return false; } - void put_ec(struct ring_ec *ec) { NOT_IN_USE(ec); } - void del_ec(struct ring_ec *ec) { NOT_IN_USE(ec); } - protected: /* Array of all aggregated rings * Every ring can be Active or Backup diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index e2bd02183..2d2a43bdd 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -118,8 +118,6 @@ ring_simple::ring_simple(int if_index, ring *parent, ring_type_t type, bool use_ memset(&m_tls, 0, sizeof(m_tls)); #endif /* DEFINED_UTLS */ memset(&m_lro, 0, sizeof(m_lro)); - - INIT_LIST_HEAD(&m_socketxtreme.ec_list); } ring_simple::~ring_simple() @@ -205,16 +203,6 @@ ring_simple::~ring_simple() m_lock_ring_tx.unlock(); m_lock_ring_rx.unlock(); - ring_logdbg("queue of event completion elements is %s", - (list_empty(&m_socketxtreme.ec_list) ? "empty" : "not empty")); - while (!list_empty(&m_socketxtreme.ec_list)) { - struct ring_ec *ec = nullptr; - ec = get_ec(); - if (ec) { - del_ec(ec); - } - } - ring_logdbg("delete ring_simple() completed"); } @@ -435,7 +423,7 @@ int ring_simple::socketxtreme_poll(struct xlio_socketxtreme_completion_t *xlio_c bool do_poll = true; if (likely(xlio_completions) && ncompletions) { - if ((flags & SOCKETXTREME_POLL_TX) && list_empty(&m_socketxtreme.ec_list)) { + if ((flags & SOCKETXTREME_POLL_TX) && !m_socketxtreme.ec_sock_list_start) { uint64_t poll_sn = 0; const std::lock_guard lock(m_lock_ring_tx); m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); @@ -443,11 +431,8 @@ int ring_simple::socketxtreme_poll(struct xlio_socketxtreme_completion_t *xlio_c const std::lock_guard lock(m_lock_ring_rx); while (!g_b_exit && (i < (int)ncompletions)) { - if (!list_empty(&m_socketxtreme.ec_list)) { - ring_ec *ec = get_ec(); - if (ec) { - memcpy(xlio_completions, &ec->completion, sizeof(ec->completion)); - ec->clear(); + if (m_socketxtreme.ec_sock_list_start) { + if (socketxtreme_ec_pop_completion(xlio_completions)) { xlio_completions++; i++; } diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index b302165b9..ca5d2f541 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -308,35 +308,6 @@ class ring_simple : public ring_slave { inline uint32_t get_mtu() { return m_mtu; } private: - bool is_socketxtreme(void) override { return safe_mce_sys().enable_socketxtreme; } - - void put_ec(struct ring_ec *ec) override - { - m_socketxtreme.lock_ec_list.lock(); - list_add_tail(&ec->list, &m_socketxtreme.ec_list); - m_socketxtreme.lock_ec_list.unlock(); - } - - void del_ec(struct ring_ec *ec) override - { - m_socketxtreme.lock_ec_list.lock(); - list_del_init(&ec->list); - ec->clear(); - m_socketxtreme.lock_ec_list.unlock(); - } - - inline ring_ec *get_ec(void) - { - struct ring_ec *ec = nullptr; - - m_socketxtreme.lock_ec_list.lock(); - if 
(!list_empty(&m_socketxtreme.ec_list)) { - ec = list_entry(m_socketxtreme.ec_list.next, struct ring_ec, list); - list_del_init(&ec->list); - } - m_socketxtreme.lock_ec_list.unlock(); - return ec; - } inline void send_status_handler(int ret, xlio_ibv_send_wr *p_send_wqe); inline mem_buf_desc_t *get_tx_buffers(pbuf_type type, uint32_t n_num_mem_bufs); inline int put_tx_buffer_helper(mem_buf_desc_t *buff); @@ -367,18 +338,6 @@ class ring_simple : public ring_slave { std::unordered_map m_user_lkey_map; private: - struct { - /* queue of event completion elements - * this queue is stored events related different sockinfo (sockets) - * In current implementation every sockinfo (socket) can have single event - * in this queue - */ - struct list_head ec_list; - - /* Thread-safety lock for get/put operations under the queue */ - lock_spin lock_ec_list; - } m_socketxtreme; - lock_mutex m_lock_ring_tx_buf_wait; uint32_t m_tx_num_bufs = 0U; uint32_t m_zc_num_bufs = 0U; diff --git a/src/core/dev/ring_tap.h b/src/core/dev/ring_tap.h index 67777e9b3..b358e292f 100644 --- a/src/core/dev/ring_tap.h +++ b/src/core/dev/ring_tap.h @@ -136,10 +136,6 @@ class ring_tap : public ring_slave { void tap_create(net_device_val *p_ndev); void tap_destroy(); - bool is_socketxtreme(void) { return false; } - void put_ec(struct ring_ec *ec) { NOT_IN_USE(ec); } - void del_ec(struct ring_ec *ec) { NOT_IN_USE(ec); } - /* These fields are NETVSC mode specific */ int m_tap_fd; /* file descriptor of tap device */ ring_slave *m_vf_ring; diff --git a/src/core/main.cpp b/src/core/main.cpp index 38f5d3b01..f2713577a 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -109,6 +109,8 @@ static command_netlink *s_cmd_nl = nullptr; #define MAX_VERSION_STR_LEN 128 global_stats_t g_global_stat_static; +static uint32_t g_ec_pool_size = 0U; +static uint32_t g_ec_pool_no_objs = 0U; static int free_libxlio_resources() { @@ -184,6 +186,11 @@ static int free_libxlio_resources() } g_tcp_seg_pool = nullptr; + if (g_socketxtreme_ec_pool) { + delete g_socketxtreme_ec_pool; + } + g_socketxtreme_ec_pool = NULL; + if (safe_mce_sys().print_report) { buffer_pool::print_report_on_errors(VLOG_INFO); } @@ -1104,6 +1111,9 @@ static void do_global_ctors_helper() g_global_stat_static.n_tcp_seg_pool_size, g_global_stat_static.n_tcp_seg_pool_no_segs)); + NEW_CTOR(g_socketxtreme_ec_pool, + socketxtreme_ec_pool("Socketxtreme ec", 512, g_ec_pool_size, g_ec_pool_no_objs)); + // For delegated TCP timers the global collection is not used. 
if (safe_mce_sys().tcp_ctl_thread != option_tcp_ctl_thread::CTL_THREAD_DELEGATE_TCP_TIMERS) { NEW_CTOR(g_tcp_timers_collection, tcp_timers_collection()); @@ -1187,6 +1197,7 @@ void reset_globals() g_buffer_pool_tx = nullptr; g_buffer_pool_zc = nullptr; g_tcp_seg_pool = nullptr; + g_socketxtreme_ec_pool = NULL; g_tcp_timers_collection = nullptr; g_p_vlogger_timer_handler = nullptr; g_p_event_handler_manager = nullptr; diff --git a/src/core/sock/sockinfo.cpp b/src/core/sock/sockinfo.cpp index a9ab53b8f..ede97961d 100644 --- a/src/core/sock/sockinfo.cpp +++ b/src/core/sock/sockinfo.cpp @@ -97,13 +97,14 @@ const char *sockinfo::setsockopt_so_opt_to_str(int opt) } sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) - : m_fd_context((void *)((uintptr_t)m_fd)) + : m_fd_context((void *)((uintptr_t)fd)) + , m_family(domain) , m_fd(fd) , m_rx_num_buffs_reuse(safe_mce_sys().rx_bufs_batch) , m_skip_cq_poll_in_rx(safe_mce_sys().skip_poll_in_rx == SKIP_POLL_IN_RX_ENABLE) + , m_is_ipv6only(safe_mce_sys().sysctl_reader.get_ipv6_bindv6only()) , m_lock_rcv(MULTILOCK_RECURSIVE, MODULE_NAME "::m_lock_rcv") , m_lock_snd(MODULE_NAME "::m_lock_snd") - , m_family(domain) , m_so_bindtodevice_ip(ip_address::any_addr(), domain) , m_rx_ring_map_lock(MODULE_NAME "::m_rx_ring_map_lock") , m_ring_alloc_log_rx(safe_mce_sys().ring_allocation_logic_rx, use_ring_locks) @@ -111,7 +112,6 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) , m_n_uc_ttl_hop_lim(m_family == AF_INET ? safe_mce_sys().sysctl_reader.get_net_ipv4_ttl() : safe_mce_sys().sysctl_reader.get_net_ipv6_hop_limit()) - , m_is_ipv6only(safe_mce_sys().sysctl_reader.get_ipv6_bindv6only()) { m_rx_epfd = SYSCALL(epoll_create, 128); if (unlikely(m_rx_epfd == -1)) { @@ -133,12 +133,6 @@ sockinfo::sockinfo(int fd, int domain, bool use_ring_locks) atomic_set(&m_zckey, 0); - m_socketxtreme.ec_cache.clear(); - struct ring_ec ec; - ec.clear(); - m_socketxtreme.ec_cache.push_back(ec); - m_socketxtreme.ec = &m_socketxtreme.ec_cache.back(); - m_connected.set_sa_family(m_family); m_bound.set_sa_family(m_family); } @@ -172,8 +166,6 @@ sockinfo::~sockinfo() sock_stats::instance().return_stats_obj(m_p_socket_stats); } - m_socketxtreme.ec_cache.clear(); - bool toclose = safe_mce_sys().deferred_close && m_fd >= 0; #if defined(DEFINED_NGINX) @@ -214,6 +206,39 @@ void sockinfo::socket_stats_init() m_p_socket_stats->sa_family = m_family; } +ring_ec *sockinfo::pop_next_ec() +{ + if (likely(m_socketxtreme_ec_first)) { + ring_ec *temp = m_socketxtreme_ec_first; + m_socketxtreme_ec_first = m_socketxtreme_ec_first->next; + if (likely(!m_socketxtreme_ec_first)) { // We likely to have a single ec most of the time. + m_socketxtreme_ec_last = nullptr; + } + + return temp; + } + + return nullptr; +} + +ring_ec *sockinfo::clear_ecs() +{ + ring_ec *temp = m_socketxtreme_ec_first; + m_socketxtreme_ec_first = m_socketxtreme_ec_last = nullptr; + return temp; +} + +void sockinfo::add_ec(ring_ec *ec) +{ + memset(&ec->completion, 0, sizeof(ec->completion)); + if (likely(!m_socketxtreme_ec_last)) { + m_socketxtreme_ec_last = m_socketxtreme_ec_first = ec; + } else { + m_socketxtreme_ec_last->next = ec; + m_socketxtreme_ec_last = ec; + } +} + void sockinfo::set_blocking(bool is_blocked) { si_logdbg("set socket to %s mode", is_blocked ? "blocked" : "non-blocking"); @@ -298,12 +323,12 @@ int sockinfo::fcntl64(int __cmd, unsigned long int __arg) int sockinfo::get_epoll_context_fd() { - return (m_econtext ? m_econtext->get_epoll_fd() : 0); + return (has_epoll_context() ? 
m_econtext->get_epoll_fd() : 0); } void sockinfo::insert_epoll_event(uint64_t events) { - if (m_econtext) { + if (has_epoll_context()) { m_econtext->insert_epoll_event_cb(this, static_cast(events)); } } @@ -1306,7 +1331,7 @@ int sockinfo::add_epoll_context(epfd_info *epfd) m_rx_ring_map_lock.lock(); lock_rx_q(); - if (!m_econtext) { + if (!m_econtext && !safe_mce_sys().enable_socketxtreme) { // This socket is not registered to any epfd m_econtext = epfd; } else { @@ -1325,7 +1350,7 @@ int sockinfo::add_epoll_context(epfd_info *epfd) sock_ring_map_iter = m_rx_ring_map.begin(); while (sock_ring_map_iter != m_rx_ring_map.end()) { - if (m_econtext) { + if (has_epoll_context()) { m_econtext->increase_ring_ref_count(sock_ring_map_iter->first); } sock_ring_map_iter++; @@ -1344,7 +1369,7 @@ void sockinfo::remove_epoll_context(epfd_info *epfd) m_rx_ring_map_lock.lock(); lock_rx_q(); - if (m_econtext != epfd) { + if (!has_epoll_context() || m_econtext != epfd) { unlock_rx_q(); m_rx_ring_map_lock.unlock(); return; @@ -1352,9 +1377,7 @@ void sockinfo::remove_epoll_context(epfd_info *epfd) rx_ring_map_t::const_iterator sock_ring_map_iter = m_rx_ring_map.begin(); while (sock_ring_map_iter != m_rx_ring_map.end()) { - if (m_econtext) { - m_econtext->decrease_ring_ref_count(sock_ring_map_iter->first); - } + m_econtext->decrease_ring_ref_count(sock_ring_map_iter->first); sock_ring_map_iter++; } @@ -1630,7 +1653,7 @@ void sockinfo::rx_add_ring_cb(ring *p_ring) // first in order. possible race between removal of fd from epoll (epoll_ctl del, or epoll // close) and here. need to add a third-side lock (fd_collection?) to sync between epoll and // socket. - if (m_econtext) { + if (has_epoll_context()) { m_econtext->increase_ring_ref_count(p_ring); } } @@ -1687,14 +1710,9 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) delete p_ring_info; if (m_p_rx_ring == base_ring) { - /* Ring should not have completion events related closed socket - * in wait list - */ - for (auto &ec : m_socketxtreme.ec_cache) { - if (0 != ec.completion.events) { - m_p_rx_ring->del_ec(&ec); - } - } + // Ring should not have completion events related closed socket in wait list + m_p_rx_ring->socketxtreme_ec_clear_sock(this); + if (m_rx_ring_map.size() == 1) { m_p_rx_ring = m_rx_ring_map.begin()->first; } else { @@ -1718,7 +1736,7 @@ void sockinfo::rx_del_ring_cb(ring *p_ring) // first in order. possible race between removal of fd from epoll (epoll_ctl del, or epoll // close) and here. need to add a third-side lock (fd_collection?) to sync between epoll and // socket. - if (m_econtext) { + if (has_epoll_context()) { m_econtext->decrease_ring_ref_count(base_ring); } } diff --git a/src/core/sock/sockinfo.h b/src/core/sock/sockinfo.h index 23d9e4290..86a21c404 100644 --- a/src/core/sock/sockinfo.h +++ b/src/core/sock/sockinfo.h @@ -201,16 +201,17 @@ struct ring_info_t { // TX_SENDMSG struct xlio_tx_call_attr_t { tx_call_t opcode; + unsigned xlio_flags; + struct _attr { struct iovec *iov; ssize_t sz_iov; int flags; - struct sockaddr *addr; socklen_t len; + struct sockaddr *addr; const struct msghdr *hdr; } attr; - unsigned xlio_flags; pbuf_desc priv; ~xlio_tx_call_attr_t() {}; @@ -325,6 +326,16 @@ class sockinfo { inline bool set_flow_tag(uint32_t flow_tag_id); inline void sock_pop_descs_rx_ready(descq_t *cache); + // Socketxtreme related. 
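An illustrative aside (not part of the patch): the declarations just below describe a two-level structure -- the ring keeps a FIFO in which each ready socket appears at most once, and every socket chains its own pending completions. The toy model here shows how posting events and popping completions interact under that scheme; toy_ec, toy_sock and toy_ring are invented names, and all locking is omitted.

#include <cstdint>
#include <cstdio>

struct toy_ec {
    uint64_t events = 0;
    toy_ec *next = nullptr;
};

struct toy_sock {
    toy_ec *ec_first = nullptr;
    toy_ec *ec_last = nullptr;
    toy_sock *ring_next = nullptr; // link in the ring's ready-socket FIFO

    void add_ec(toy_ec *ec)
    {
        ec->next = nullptr;
        if (ec_last) {
            ec_last->next = ec;
        } else {
            ec_first = ec;
        }
        ec_last = ec;
    }

    toy_ec *pop_ec()
    {
        toy_ec *ec = ec_first;
        if (ec) {
            ec_first = ec->next;
            if (!ec_first) {
                ec_last = nullptr;
            }
        }
        return ec;
    }
};

struct toy_ring {
    toy_sock *ready_first = nullptr;
    toy_sock *ready_last = nullptr;

    // A socket enters the FIFO only when its first pending completion arrives.
    void post_event(toy_sock *s, toy_ec *ec, uint64_t events)
    {
        if (!s->ec_first) {
            s->ring_next = nullptr;
            if (ready_last) {
                ready_last->ring_next = s;
            } else {
                ready_first = s;
            }
            ready_last = s;
        }
        ec->events = events;
        s->add_ec(ec);
    }

    // Pop one completion; drop the socket from the FIFO once its list is drained.
    bool poll_one(uint64_t &events_out)
    {
        if (!ready_first) {
            return false;
        }
        toy_sock *s = ready_first;
        events_out = s->pop_ec()->events; // non-null: a queued socket always has an ec
        if (!s->ec_first) {
            ready_first = s->ring_next;
            if (!ready_first) {
                ready_last = nullptr;
            }
        }
        return true;
    }
};

int main()
{
    toy_ring ring;
    toy_sock a, b;
    toy_ec e1, e2, e3;
    ring.post_event(&a, &e1, 0x1);
    ring.post_event(&a, &e2, 0x2); // 'a' is not queued a second time
    ring.post_event(&b, &e3, 0x4);
    uint64_t ev;
    while (ring.poll_one(ev)) { // prints 0x1, 0x2, 0x4
        std::printf("completion events=0x%llx\n", (unsigned long long)ev);
    }
    return 0;
}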
+ ring_ec *pop_next_ec(); + ring_ec *clear_ecs(); + void add_ec(ring_ec *ec); + ring_ec *get_last_ec() { return m_socketxtreme_ec_last; } + bool has_next_ec() { return (m_socketxtreme_ec_first != nullptr); } + sockinfo *get_ec_ring_list_next() { return m_socketxtreme_ring_list_next; } + void set_ec_ring_list_next(sockinfo *sock) { m_socketxtreme_ring_list_next = sock; } + + bool has_epoll_context() { return (!safe_mce_sys().enable_socketxtreme && m_econtext); } bool has_stats() const { return m_has_stats; } bool get_rx_pkt_ready_list_count() const { return m_n_rx_pkt_ready_list_count; } int get_fd() const { return m_fd; }; @@ -398,14 +409,14 @@ class sockinfo { inline void set_rx_reuse_pending(bool is_pending = true); inline void reuse_buffer(mem_buf_desc_t *buff); - inline void set_events_socketxtreme(uint64_t events); + inline xlio_socketxtreme_completion_t *set_events_socketxtreme(uint64_t events, + bool full_transaction); inline void set_events(uint64_t events); inline void save_strq_stats(uint32_t packet_strides); inline int dequeue_packet(iovec *p_iov, ssize_t sz_iov, sockaddr *__from, socklen_t *__fromlen, int in_flags, int *p_out_flags); - bool is_socketxtreme() { return safe_mce_sys().enable_socketxtreme; } int get_sock_by_L3_L4(in_protocol_t protocol, const ip_address &ip, in_port_t port); void notify_epoll_context(uint32_t events); void save_stats_rx_os(int bytes); @@ -462,18 +473,31 @@ class sockinfo { bool attach_as_uc_receiver_anyip(sa_family_t family, role_t role, bool skip_rules); protected: - /* Last memory descriptor with zcopy operation method */ dst_entry *m_p_connected_dst_entry = nullptr; - ring *m_p_rx_ring = nullptr; // used in TCP/UDP - epfd_info *m_econtext = nullptr; - socket_stats_t *m_p_socket_stats = nullptr; - mem_buf_desc_t *m_last_zcdesc = nullptr; sockinfo_state m_state = SOCKINFO_OPENED; // socket current state uint8_t m_n_tsing_flags = 0U; bool m_has_stats = false; bool m_b_rcvtstamp = false; bool m_b_zc = false; bool m_b_blocking = true; + bool m_b_rcvtstampns = false; + rfs *m_rfs_ptr = nullptr; + ring *m_p_rx_ring = nullptr; // used in TCP/UDP + ring_ec *m_socketxtreme_ec_first = nullptr; + ring_ec *m_socketxtreme_ec_last = nullptr; + sockinfo *m_socketxtreme_ring_list_next = nullptr; + + // End of first cache line + + void *m_fd_context; // Context data stored with socket + mem_buf_desc_t *m_last_zcdesc = nullptr; + socket_stats_t *m_p_socket_stats = nullptr; + + /* Socket error queue that keeps local errors and internal data required + * to provide notification ability. + */ + descq_t m_error_queue; + lock_spin_simple m_error_queue_lock; /* TX zcopy counter * The notification itself for tx zcopy operation is a simple scalar value. @@ -486,29 +510,14 @@ class sockinfo { */ atomic_t m_zckey; - lock_spin_simple m_error_queue_lock; - - // End of first cache line - - /* Socket error queue that keeps local errors and internal data required - * to provide notification ability. - */ - descq_t m_error_queue; - void *m_fd_context; // Context data stored with socket - - rfs *m_rfs_ptr = nullptr; - struct { - /* Use std::deque in current design as far as it allows pushing - * elements on either end without moving around any other element - * but trade this for slightly worse iteration speeds. 
- */ - struct ring_ec *ec; - std::deque ec_cache; - } m_socketxtreme; + // End of second cache line + epfd_info *m_econtext = nullptr; wakeup_pipe m_sock_wakeup_pipe; + int m_rx_epfd; + in_protocol_t m_protocol = PROTO_UNDEFINED; + sa_family_t m_family; - // End of fourth cache line public: list_node socket_fd_list_node; list_node ep_ready_fd_node; @@ -519,7 +528,6 @@ class sockinfo { protected: int m_fd; // identification information - int m_rx_epfd; /** * list of pending ready packet on the Rx, * each element is a pointer to the ib_conn_mgr that holds this ready rx datagram @@ -537,15 +545,12 @@ class sockinfo { bool m_reuseaddr = false; // to track setsockopt with SO_REUSEADDR bool m_reuseport = false; // to track setsockopt with SO_REUSEPORT bool m_b_pktinfo = false; - bool m_b_rcvtstampns = false; + bool m_bind_no_port = false; + bool m_is_ipv6only; multilock m_lock_rcv; lock_mutex m_lock_snd; lock_mutex m_rx_migration_lock; - - uint32_t m_flow_tag_id = 0U; // Flow Tag for this socket - in_protocol_t m_protocol = PROTO_UNDEFINED; - sa_family_t m_family; sock_addr m_bound; sock_addr m_connected; ip_addr m_so_bindtodevice_ip; @@ -563,15 +568,14 @@ class sockinfo { void *m_rx_callback_context = nullptr; // user context struct xlio_rate_limit_t m_so_ratelimit; uint32_t m_pcp = 0U; + uint32_t m_flow_tag_id = 0U; // Flow Tag for this socket uint8_t m_n_uc_ttl_hop_lim; uint8_t m_src_sel_flags = 0U; - bool m_bind_no_port = false; - bool m_is_ipv6only; public: #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - int m_back_log = 0; bool m_is_for_socket_pool = false; // true when this fd will be used for socket pool on close + int m_back_log = 0; #endif }; @@ -609,39 +613,34 @@ void sockinfo::sock_pop_descs_rx_ready(descq_t *cache) unlock_rx_q(); } -void sockinfo::set_events_socketxtreme(uint64_t events) +xlio_socketxtreme_completion_t *sockinfo::set_events_socketxtreme(uint64_t events, + bool full_transaction) { - m_socketxtreme.ec->completion.user_data = (uint64_t)m_fd_context; - if (!m_socketxtreme.ec->completion.events) { - m_socketxtreme.ec->completion.events |= events; - m_p_rx_ring->put_ec(m_socketxtreme.ec); - - m_socketxtreme.ec = nullptr; - for (auto &ec : m_socketxtreme.ec_cache) { - if (0 == ec.completion.events) { - m_socketxtreme.ec = &ec; - break; - } - } - if (!m_socketxtreme.ec) { - struct ring_ec ec; - ec.clear(); - m_socketxtreme.ec_cache.push_back(ec); - m_socketxtreme.ec = &m_socketxtreme.ec_cache.back(); - } - } else { - m_socketxtreme.ec->completion.events |= events; + bool always_new = + ((events & (XLIO_SOCKETXTREME_PACKET | XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED)) != 0U); + xlio_socketxtreme_completion_t &completion = + m_p_rx_ring->socketxtreme_start_ec_operation(this, always_new); + completion.user_data = (uint64_t)m_fd_context; + completion.events |= events; + + if (full_transaction) { + m_p_rx_ring->socketxtreme_end_ec_operation(); + return nullptr; } + + return &completion; } void sockinfo::set_events(uint64_t events) { /* Collect all events if rx ring is enabled */ - if (is_socketxtreme() && m_state == SOCKINFO_OPENED) { - set_events_socketxtreme(events); + if (safe_mce_sys().enable_socketxtreme) { + if (m_state == SOCKINFO_OPENED) { + set_events_socketxtreme(events, true); + } + } else { + insert_epoll_event(events); } - - insert_epoll_event(events); } void sockinfo::save_strq_stats(uint32_t packet_strides) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 838841479..d43510e5c 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ 
b/src/core/sock/sockinfo_tcp.cpp @@ -338,7 +338,7 @@ sockinfo_tcp::sockinfo_tcp(int fd, int domain) si_tcp_logdbg("new pcb %p pcb state %d", &m_pcb, get_tcp_state(&m_pcb)); tcp_arg(&m_pcb, this); tcp_ip_output(&m_pcb, sockinfo_tcp::ip_output); - if (is_socketxtreme()) { + if (safe_mce_sys().enable_socketxtreme) { tcp_recv(&m_pcb, sockinfo_tcp::rx_lwip_cb_socketxtreme); } else { tcp_recv(&m_pcb, sockinfo_tcp::rx_lwip_cb); @@ -760,7 +760,7 @@ bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) NOTIFY_ON_EVENTS(this, EPOLLHUP); m_sock_wakeup_pipe.do_wakeup(); - if (m_econtext) { + if (has_epoll_context()) { m_econtext->fd_closed(m_fd); } @@ -798,7 +798,7 @@ void sockinfo_tcp::handle_socket_linger() /* SOCKETXTREME WA: Don't call rx_wait() in order not to miss events in socketxtreme_poll() * flow. TBD: find proper solution! rx_wait(poll_cnt, false); * */ - if (!is_socketxtreme()) { + if (!safe_mce_sys().enable_socketxtreme) { rx_wait(poll_cnt, false); } tcp_output(&m_pcb); @@ -1979,54 +1979,28 @@ err_t sockinfo_tcp::rx_lwip_cb(void *arg, struct tcp_pcb *pcb, struct pbuf *p, e return ERR_OK; } -static inline void _rx_lwip_cb_socketxtreme_helper(pbuf *p, - xlio_socketxtreme_completion_t *completion, - xlio_buff_t *&buff_list_tail, - bool use_hw_timestamp, - std::function notify) +inline void sockinfo_tcp::rx_lwip_cb_socketxtreme_helper(pbuf *p) { + xlio_socketxtreme_completion_t *completion = + set_events_socketxtreme(XLIO_SOCKETXTREME_PACKET, false); + mem_buf_desc_t *current_desc = reinterpret_cast(p); - if (!buff_list_tail) { - // New completion - completion->packet.buff_lst = reinterpret_cast(p); - completion->packet.total_len = p->tot_len; - completion->packet.num_bufs = current_desc->rx.n_frags; + // Is IPv4 only. + assert(p); + assert(current_desc->rx.src.get_sa_family() == AF_INET); + assert(current_desc->rx.n_frags > 0); - assert(reinterpret_cast(p)->rx.n_frags > 0); + completion->packet.buff_lst = reinterpret_cast(p); + completion->packet.total_len = p->tot_len; + completion->packet.num_bufs = current_desc->rx.n_frags; - if (use_hw_timestamp) { - completion->packet.hw_timestamp = current_desc->rx.timestamps.hw; - } - notify(); - } else { - // Update existing completion - xlio_buff_t *&buff_list_head = completion->packet.buff_lst; - completion->packet.total_len += p->tot_len; - completion->packet.num_bufs += current_desc->rx.n_frags; - - auto membuff_list_tail = reinterpret_cast(buff_list_tail); - while (membuff_list_tail->p_next_desc) { - membuff_list_tail = membuff_list_tail->p_next_desc; - } - membuff_list_tail->p_next_desc = current_desc; - reinterpret_cast(buff_list_head)->rx.n_frags = - completion->packet.num_bufs; - pbuf_cat(reinterpret_cast(buff_list_head), p); - current_desc->rx.n_frags = 0; + if (m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { + completion->packet.hw_timestamp = current_desc->rx.timestamps.hw; } - buff_list_tail = reinterpret_cast(p); -} - -inline void sockinfo_tcp::rx_lwip_cb_socketxtreme_helper(pbuf *p) -{ - auto notify = [this]() { NOTIFY_ON_EVENTS(this, XLIO_SOCKETXTREME_PACKET); }; - bool use_hw_timestamp = (m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE); - assert(p); - _rx_lwip_cb_socketxtreme_helper(p, &m_socketxtreme.ec->completion, - m_socketxtreme.ec->last_buff_lst, use_hw_timestamp, notify); - save_stats_rx_offload(m_socketxtreme.ec->completion.packet.total_len); + m_p_rx_ring->socketxtreme_end_ec_operation(); + save_stats_rx_offload(p->tot_len); } inline err_t sockinfo_tcp::handle_fin(struct tcp_pcb *pcb, err_t err) @@ 
-2726,7 +2700,7 @@ int sockinfo_tcp::connect(const sockaddr *__to, socklen_t __tolen) TRANS_XLIO) { passthrough_unlock("non offloaded socket --> connect only via OS"); return -1; - } else if (m_econtext) { + } else if (has_epoll_context()) { m_econtext->remove_fd_from_epoll_os(m_fd); // remove fd from os epoll } @@ -3358,19 +3332,17 @@ void sockinfo_tcp::accept_connection_socketxtreme(sockinfo_tcp *parent, sockinfo child->m_p_socket_stats->set_bound_if(child->m_bound); child->m_p_socket_stats->bound_port = child->m_bound.get_in_port(); - xlio_socketxtreme_completion_t &parent_compl = parent->m_socketxtreme.ec->completion; - - child->m_connected.get_sa(reinterpret_cast(&parent_compl.src), - static_cast(sizeof(parent_compl.src))); - /* Update xlio_completion with * XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED related data */ if (likely(child->m_parent)) { + xlio_socketxtreme_completion_t &completion = + *(child->set_events_socketxtreme(XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED, false)); + completion.listen_fd = child->m_parent->get_fd(); - child->m_socketxtreme.ec->completion.src = parent->m_socketxtreme.ec->completion.src; - child->m_socketxtreme.ec->completion.listen_fd = child->m_parent->get_fd(); - NOTIFY_ON_EVENTS(child, XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED); + child->m_connected.get_sa(reinterpret_cast(&completion.src), + static_cast(sizeof(completion.src))); + child->m_p_rx_ring->socketxtreme_end_ec_operation(); } else { vlog_printf(VLOG_ERROR, "XLIO_SOCKETXTREME_NEW_CONNECTION_ACCEPTED: can't find listen socket for new " @@ -3417,7 +3389,7 @@ err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t e tcp_ip_output(&(new_sock->m_pcb), sockinfo_tcp::ip_output); tcp_arg(&(new_sock->m_pcb), new_sock); - if (new_sock->is_socketxtreme()) { + if (safe_mce_sys().enable_socketxtreme) { tcp_recv(&new_sock->m_pcb, sockinfo_tcp::rx_lwip_cb_socketxtreme); } else { tcp_recv(&new_sock->m_pcb, sockinfo_tcp::rx_lwip_cb); @@ -3486,7 +3458,7 @@ err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t e // todo check that listen socket was not closed by now ? 
(is_server()) conn->m_ready_pcbs.erase(&new_sock->m_pcb); - if (conn->is_socketxtreme()) { + if (safe_mce_sys().enable_socketxtreme) { accept_connection_socketxtreme(conn, new_sock); } else { conn->m_accepted_conns.push_back(new_sock); @@ -3643,7 +3615,7 @@ err_t sockinfo_tcp::syn_received_timewait_cb(void *arg, struct tcp_pcb *newpcb) new_sock->m_conn_state = TCP_CONN_INIT; new_sock->m_parent = listen_sock; - if (new_sock->is_socketxtreme()) { + if (safe_mce_sys().enable_socketxtreme) { tcp_recv(&new_sock->m_pcb, sockinfo_tcp::rx_lwip_cb_socketxtreme); } else { tcp_recv(&new_sock->m_pcb, sockinfo_tcp::rx_lwip_cb); diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 7d64b8568..b12c12b34 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -607,6 +607,7 @@ class sockinfo_tcp : public sockinfo { // lwip specific things struct tcp_pcb m_pcb; + fd_array_t *m_iomux_ready_fd_array; socket_options_list_t m_socket_options_list; timestamps_t m_rx_timestamps; tcp_sock_offload_e m_sock_offload; @@ -623,7 +624,6 @@ class sockinfo_tcp : public sockinfo { int m_rcvbuff_current; int m_rcvbuff_non_tcp_recved; tcp_conn_state_e m_conn_state; - fd_array_t *m_iomux_ready_fd_array; struct linger m_linger; /* local & peer addresses */ diff --git a/src/core/sock/sockinfo_udp.cpp b/src/core/sock/sockinfo_udp.cpp index c44d0821a..7c4eb8068 100644 --- a/src/core/sock/sockinfo_udp.cpp +++ b/src/core/sock/sockinfo_udp.cpp @@ -2335,13 +2335,11 @@ inline xlio_recv_callback_retval_t sockinfo_udp::inspect_by_user_cb(mem_buf_desc */ inline void sockinfo_udp::rx_udp_cb_socketxtreme_helper(mem_buf_desc_t *p_desc) { - struct xlio_socketxtreme_completion_t *completion; - // xlio_socketxtreme_completion_t is IPv4 only. assert(p_desc->rx.src.get_sa_family() == AF_INET); - completion = &m_socketxtreme.ec->completion; - + xlio_socketxtreme_completion_t *completion = + set_events_socketxtreme(XLIO_SOCKETXTREME_PACKET, false); completion->packet.num_bufs = p_desc->rx.n_frags; completion->packet.total_len = 0; p_desc->rx.src.get_sa(reinterpret_cast(&completion->src), @@ -2359,9 +2357,9 @@ inline void sockinfo_udp::rx_udp_cb_socketxtreme_helper(mem_buf_desc_t *p_desc) completion->packet.buff_lst->len = p_desc->rx.frag.iov_len; } - NOTIFY_ON_EVENTS(this, XLIO_SOCKETXTREME_PACKET); - save_stats_rx_offload(completion->packet.total_len); + + m_p_rx_ring->socketxtreme_end_ec_operation(); } /** @@ -2556,7 +2554,7 @@ bool sockinfo_udp::rx_input_cb(mem_buf_desc_t *p_desc, void *pv_fd_ready_array) p_desc->inc_ref_count(); save_strq_stats(p_desc->rx.strides_num); - if (is_socketxtreme()) { + if (safe_mce_sys().enable_socketxtreme) { rx_udp_cb_socketxtreme_helper(p_desc); } else { update_ready(p_desc, pv_fd_ready_array, cb_ret); @@ -3284,7 +3282,7 @@ bool sockinfo_udp::prepare_to_close(bool process_shutdown) m_lock_rcv.lock(); m_sock_wakeup_pipe.do_wakeup(); - if (m_econtext) { + if (has_epoll_context()) { m_econtext->fd_closed(m_fd); } diff --git a/src/core/util/cached_obj_pool.h b/src/core/util/cached_obj_pool.h index ea197c17a..f3268adb4 100644 --- a/src/core/util/cached_obj_pool.h +++ b/src/core/util/cached_obj_pool.h @@ -168,7 +168,7 @@ template bool cached_obj_pool::expand() size_t size = sizeof(T) * m_alloc_batch; T *objs_array = (T *)m_allocator.alloc(size); if (!objs_array) { - vlog_printf(VLOG_DEBUG, "Cached pool failed to allocate objects (%s)", m_pool_name); + vlog_printf(VLOG_DEBUG, "Cached pool failed to allocate objects (%s)\n", m_pool_name); return false; } diff --git 
a/tests/gtest/extra_api/extra_poll.cc b/tests/gtest/extra_api/extra_poll.cc index d9b096293..29259933f 100644 --- a/tests/gtest/extra_api/extra_poll.cc +++ b/tests/gtest/extra_api/extra_poll.cc @@ -86,8 +86,6 @@ TEST_F(socketxtreme_poll, ti_1) int pid = fork(); if (0 == pid) { /* I am the child */ - struct epoll_event event; - barrier_fork(pid); fd = m_tcp_base.sock_create_fa_nb(m_family); @@ -100,11 +98,31 @@ TEST_F(socketxtreme_poll, ti_1) ASSERT_EQ(EINPROGRESS, errno); ASSERT_EQ((-1), rc); - event.events = EPOLLOUT | EPOLLIN; - event.data.fd = fd; - rc = test_base::event_wait(&event); - EXPECT_LT(0, rc); - EXPECT_EQ((uint32_t)(EPOLLOUT), event.events); + // Wait for connect to complete. + struct xlio_socketxtreme_completion_t xlio_comps; + int xlio_ring_fd[2] = {-1, -1}; + rc = xlio_api->get_socket_rings_fds(fd, xlio_ring_fd, 2); + ASSERT_LE(1, rc); + + rc = 0; + while (rc == 0) { + if (xlio_ring_fd[0] > 0) { + rc = xlio_api->socketxtreme_poll(xlio_ring_fd[0], &xlio_comps, 1, 0); + ASSERT_LE(0, rc); + if (rc > 0) { + ASSERT_LT(0U, (xlio_comps.events & EPOLLOUT)); + break; + } + } + + if (xlio_ring_fd[1] > 0) { + rc = xlio_api->socketxtreme_poll(xlio_ring_fd[1], &xlio_comps, 1, 0); + ASSERT_LE(0, rc); + if (rc > 0) { + ASSERT_LT(0U, (xlio_comps.events & EPOLLOUT)); + } + } + } log_trace("Established connection: fd=%d to %s\n", fd, sys_addr2str((struct sockaddr *)&server_addr)); From 4de688bcff364866f952ce7f889b281efb5bf0dd Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Tue, 12 Mar 2024 00:32:12 +0200 Subject: [PATCH 139/169] issue: Fix big endian build and clean unused macros Compiler defines __BYTE_ORDER__ and others, but XLIO uses wrong names in the macros. Preprocessor replaces undefined names with 0, so XLIO always builds little endian branch. Fix the byte order names and fail a build if __BYTE_ORDER__ is not defined. Otherwise, the build would be broken on a big endian system. Signed-off-by: Dmytro Podgornyi --- src/core/dev/hw_queue_tx.cpp | 5 +-- src/core/event/delta_timer.cpp | 2 +- src/core/lwip/def.h | 70 ++++------------------------------ src/core/lwip/opt.h | 41 ++++++++------------ src/core/lwip/tcp_out.c | 2 +- src/core/util/vtypes.h | 17 +++++---- src/stats/stats_reader.cpp | 21 ---------- 7 files changed, 36 insertions(+), 122 deletions(-) diff --git a/src/core/dev/hw_queue_tx.cpp b/src/core/dev/hw_queue_tx.cpp index 293a64a06..6a5877cf5 100644 --- a/src/core/dev/hw_queue_tx.cpp +++ b/src/core/dev/hw_queue_tx.cpp @@ -50,9 +50,6 @@ #define hwqtx_logfunc __log_info_func #define hwqtx_logfuncall __log_info_funcall -//#define ALIGN_WR_UP(_num_wr_) (max(32, ((_num_wr_ + 0xf) & ~(0xf)))) -#define ALIGN_WR_DOWN(_num_wr_) (max(32, ((_num_wr_) & ~(0xf)))) - #if !defined(MLX5_ETH_INLINE_HEADER_SIZE) #define MLX5_ETH_INLINE_HEADER_SIZE 18 #endif @@ -60,7 +57,7 @@ #define OCTOWORD 16 #define WQEBB 64 -//#define DBG_DUMP_WQE 1 +//#define DBG_DUMP_WQE 1 #ifdef DBG_DUMP_WQE #define dbg_dump_wqe(_addr, _size) \ diff --git a/src/core/event/delta_timer.cpp b/src/core/event/delta_timer.cpp index ffec21826..bf5590020 100644 --- a/src/core/event/delta_timer.cpp +++ b/src/core/event/delta_timer.cpp @@ -48,7 +48,7 @@ #define tmr_loginfo __log_info #define tmr_logdbg __log_dbg #define tmr_logfunc __log_func -//#define tmr_logfuncall __log_funcall +//#define tmr_logfuncall __log_funcall #define tmr_logfuncall(fmt, ...) 
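An illustrative aside (not part of the patch): a standalone sketch of compile-time byte swapping keyed off the compiler-provided __BYTE_ORDER__ macro, in the spirit of the PP_HTONS/PP_HTONL macros this patch reworks in def.h below. EX_HTONS is an invented example macro; GCC and clang predefine __BYTE_ORDER__ and the __ORDER_*_ENDIAN__ constants, and the #error mirrors the build-time guard the patch adds to opt.h.

#include <stdint.h>

#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__)
#error "__BYTE_ORDER__ or __ORDER_..._ENDIAN__ is not defined"
#endif

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
// Constant-foldable swap: no run-time cost when applied to compile-time constants.
#define EX_HTONS(x) ((uint16_t)((((x)&0x00ffU) << 8) | (((x)&0xff00U) >> 8)))
#else
#define EX_HTONS(x) ((uint16_t)(x))
#endif

static_assert(EX_HTONS(0x0035) ==
                  (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ? 0x3500 : 0x0035),
              "byte swap is evaluated at compile time");

int main()
{
    return EX_HTONS(0) == 0 ? 0 : 1;
}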
#define IS_NODE_INVALID(_node_) \ diff --git a/src/core/lwip/def.h b/src/core/lwip/def.h index 5cb2bed41..4e4f22a81 100644 --- a/src/core/lwip/def.h +++ b/src/core/lwip/def.h @@ -32,8 +32,6 @@ #ifndef __LWIP_DEF_H__ #define __LWIP_DEF_H__ -/* arch.h might define NULL already */ - #include "core/lwip/opt.h" #ifdef __cplusplus @@ -47,65 +45,7 @@ extern "C" { #define NULL ((void *)0) #endif -/** Get the absolute difference between 2 u32_t values (correcting overflows) - * 'a' is expected to be 'higher' (without overflow) than 'b'. */ -#define LWIP_U32_DIFF(a, b) (((a) >= (b)) ? ((a) - (b)) : (((a) + ((b) ^ 0xFFFFFFFF) + 1))) - -/* Endianess-optimized shifting of two u8_t to create one u16_t */ -#if BYTE_ORDER == LITTLE_ENDIAN -#define LWIP_MAKE_U16(a, b) ((a << 8) | b) -#else -#define LWIP_MAKE_U16(a, b) ((b << 8) | a) -#endif - -#ifndef LWIP_PLATFORM_BYTESWAP -#define LWIP_PLATFORM_BYTESWAP 0 -#endif - -#ifndef LWIP_PREFIX_BYTEORDER_FUNCS -/* workaround for naming collisions on some platforms */ - -#ifdef htons -#undef htons -#endif /* htons */ -#ifdef htonl -#undef htonl -#endif /* htonl */ -#ifdef ntohs -#undef ntohs -#endif /* ntohs */ -#ifdef ntohl -#undef ntohl -#endif /* ntohl */ - -#define htons(x) lwip_htons(x) -#define ntohs(x) lwip_ntohs(x) -#define htonl(x) lwip_htonl(x) -#define ntohl(x) lwip_ntohl(x) -#endif /* LWIP_PREFIX_BYTEORDER_FUNCS */ - -#if BYTE_ORDER == BIG_ENDIAN -#define lwip_htons(x) (x) -#define lwip_ntohs(x) (x) -#define lwip_htonl(x) (x) -#define lwip_ntohl(x) (x) -#define PP_HTONS(x) (x) -#define PP_NTOHS(x) (x) -#define PP_HTONL(x) (x) -#define PP_NTOHL(x) (x) -#else /* BYTE_ORDER != BIG_ENDIAN */ -#if LWIP_PLATFORM_BYTESWAP -#define lwip_htons(x) LWIP_PLATFORM_HTONS(x) -#define lwip_ntohs(x) LWIP_PLATFORM_HTONS(x) -#define lwip_htonl(x) LWIP_PLATFORM_HTONL(x) -#define lwip_ntohl(x) LWIP_PLATFORM_HTONL(x) -#else /* LWIP_PLATFORM_BYTESWAP */ -u16_t lwip_htons(u16_t x); -u16_t lwip_ntohs(u16_t x); -u32_t lwip_htonl(u32_t x); -u32_t lwip_ntohl(u32_t x); -#endif /* LWIP_PLATFORM_BYTESWAP */ - +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* These macros should be calculated by the preprocessor and are used with compile-time constants only (so that there is no little-endian overhead at runtime). 
*/ @@ -115,8 +55,12 @@ u32_t lwip_ntohl(u32_t x); ((((x)&0xff) << 24) | (((x)&0xff00) << 8) | (((x)&0xff0000UL) >> 8) | \ (((x)&0xff000000UL) >> 24)) #define PP_NTOHL(x) PP_HTONL(x) - -#endif /* BYTE_ORDER == BIG_ENDIAN */ +#else /* __BYTE_ORDER__ */ +#define PP_HTONS(x) (x) +#define PP_NTOHS(x) (x) +#define PP_HTONL(x) (x) +#define PP_NTOHL(x) (x) +#endif /* __BYTE_ORDER__ */ static inline u32_t read32_be(const void *addr) { diff --git a/src/core/lwip/opt.h b/src/core/lwip/opt.h index 618a8a080..7f3aefbb1 100644 --- a/src/core/lwip/opt.h +++ b/src/core/lwip/opt.h @@ -93,27 +93,18 @@ /* Misc */ -// replace lwip byte swapping to optimized one -#include - -#define LWIP_PLATFORM_BYTESWAP 1 -#define LWIP_PLATFORM_HTONS(x) bswap_16(x) -#define LWIP_PLATFORM_HTONL(x) bswap_32(x) - -// enable LWIP DEBUG here -#if 1 -//#define PBUF_DEBUG LWIP_DBG_ON -//#define TCP_DEBUG LWIP_DBG_ON -//#define TCP_INPUT_DEBUG LWIP_DBG_ON -//#define TCP_FR_DEBUG LWIP_DBG_ON -//#define TCP_RTO_DEBUG LWIP_DBG_ON -//#define TCP_CWND_DEBUG LWIP_DBG_ON -//#define TCP_WND_DEBUG LWIP_DBG_ON -//#define TCP_OUTPUT_DEBUG LWIP_DBG_ON -//#define TCP_RST_DEBUG LWIP_DBG_ON -//#define TCP_QLEN_DEBUG LWIP_DBG_ON -//#define TCP_TSO_DEBUG LWIP_DBG_ON -#endif +// Enable LWIP DEBUG here +//#define PBUF_DEBUG LWIP_DBG_ON +//#define TCP_DEBUG LWIP_DBG_ON +//#define TCP_INPUT_DEBUG LWIP_DBG_ON +//#define TCP_FR_DEBUG LWIP_DBG_ON +//#define TCP_RTO_DEBUG LWIP_DBG_ON +//#define TCP_CWND_DEBUG LWIP_DBG_ON +//#define TCP_WND_DEBUG LWIP_DBG_ON +//#define TCP_OUTPUT_DEBUG LWIP_DBG_ON +//#define TCP_RST_DEBUG LWIP_DBG_ON +//#define TCP_QLEN_DEBUG LWIP_DBG_ON +//#define TCP_TSO_DEBUG LWIP_DBG_ON /* --------------------------------- @@ -221,10 +212,10 @@ #define LWIP_TCP_KEEPALIVE 0 #endif -/* Define platform endianness */ -#ifndef BYTE_ORDER -#define BYTE_ORDER LITTLE_ENDIAN -#endif /* BYTE_ORDER */ +/* Platform endianness */ +#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) +#error "__BYTE_ORDER__ or __ORDER_..._ENDIAN__ is not defined" +#endif /* __BYTE_ORDER__ */ /* Define generic types used in lwIP */ typedef uint8_t u8_t; diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index e5e15f01d..c307f5e81 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -2406,7 +2406,7 @@ void tcp_zero_window_probe(struct tcp_pcb *pcb) } /* The byte may be acknowledged without the window being opened. 
*/ - snd_nxt = lwip_ntohl(seg->tcphdr->seqno) + 1; + snd_nxt = ntohl(seg->tcphdr->seqno) + 1; if (TCP_SEQ_LT(pcb->snd_nxt, snd_nxt)) { pcb->snd_nxt = snd_nxt; } diff --git a/src/core/util/vtypes.h b/src/core/util/vtypes.h index 181790a7c..15138ca30 100644 --- a/src/core/util/vtypes.h +++ b/src/core/util/vtypes.h @@ -41,6 +41,7 @@ #include "utils/types.h" #include "utils/bullseye.h" + #ifndef IN #define IN #endif @@ -53,7 +54,11 @@ #define INOUT #endif -#if __BYTE_ORDER == __LITTLE_ENDIAN +#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) +#error "__BYTE_ORDER__ or __ORDER_..._ENDIAN__ is not defined" +#endif + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ static inline uint64_t htonll(uint64_t x) { return bswap_64(x); @@ -62,7 +67,7 @@ static inline uint64_t ntohll(uint64_t x) { return bswap_64(x); } -#elif __BYTE_ORDER == __BIG_ENDIAN +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ static inline uint64_t htonll(uint64_t x) { return x; @@ -72,7 +77,7 @@ static inline uint64_t ntohll(uint64_t x) return x; } #else -#error __BYTE_ORDER is neither __LITTLE_ENDIAN nor __BIG_ENDIAN +#error __BYTE_ORDER__ is neither __ORDER_LITTLE_ENDIAN__ nor __ORDER_BIG_ENDIAN__ #endif #define likely(x) __builtin_expect(!!(x), 1) @@ -96,7 +101,7 @@ static inline uint64_t ntohll(uint64_t x) (uint8_t)(((ip) >> 24) & 0xff), (uint8_t)(((ip) >> 16) & 0xff), (uint8_t)(((ip) >> 8) & 0xff), \ (uint8_t)((ip)&0xff) -#if __BYTE_ORDER == __LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* The host byte order is the same as network byte order, so these functions are all just identity. */ @@ -104,13 +109,11 @@ static inline uint64_t ntohll(uint64_t x) #define NIPQUAD(ip) NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) #define HIPQUAD(ip) HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) -#else -#if __BYTE_ORDER == __BIG_ENDIAN +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define NIPQUAD(ip) HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) #define HIPQUAD(ip) NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) -#endif #endif #define ETH_HW_ADDR_PRINT_FMT "%02x:%02x:%02x:%02x:%02x:%02x" diff --git a/src/stats/stats_reader.cpp b/src/stats/stats_reader.cpp index 0a8621c32..18a118b16 100644 --- a/src/stats/stats_reader.cpp +++ b/src/stats/stats_reader.cpp @@ -133,25 +133,6 @@ typedef enum { e_K = 1024, e_M = 1048576 } units_t; #define SEC_TO_MICRO(n) ((n)*1000000) #define TIME_DIFF_in_MICRO(start, end) \ (SEC_TO_MICRO((end).tv_sec - (start).tv_sec) + (NANO_TO_MICRO((end).tv_nsec - (start).tv_nsec))) -// printf formating when IP is in network byte ordering (for LITTLE_ENDIAN) -#define NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) \ - (uint8_t)((ip)&0xff), (uint8_t)(((ip) >> 8) & 0xff), (uint8_t)(((ip) >> 16) & 0xff), \ - (uint8_t)(((ip) >> 24) & 0xff) - -// printf formating when IP is in host byte ordering (for LITTLE_ENDIAN) -#define HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) \ - (uint8_t)(((ip) >> 24) & 0xff), (uint8_t)(((ip) >> 16) & 0xff), (uint8_t)(((ip) >> 8) & 0xff), \ - (uint8_t)((ip)&0xff) - -#if __BYTE_ORDER == __LITTLE_ENDIAN -/* The host byte order is the same as network byte order, so these functions are all just identity. 
- */ -#define NIPQUAD(ip) NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) -#else -#if __BYTE_ORDER == __BIG_ENDIAN -#define NIPQUAD(ip) HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) -#endif -#endif bool g_b_exit = false; struct sigaction g_sigact; @@ -1241,8 +1222,6 @@ void show_mc_group_stats(mc_grp_info_t *p_mc_grp_info, socket_instance_block_t * socket_stats_t *p_si_stats = &p_instance[i].skt_stats; for (int grp_idx = 0; grp_idx < p_mc_grp_info->max_grp_num; grp_idx++) { if (p_si_stats->mc_grp_map.test(grp_idx)) { - // printf("fd %d Member of = [%d.%d.%d.%d]\n",p_si_stats->fd, - // NIPQUAD(p_si_stats->mc_grp[grp_idx])); add_fd_to_array(p_si_stats->fd, p_mc_grp_info->mc_grp_tbl[grp_idx].mc_grp, mc_group_fds, &array_size); } From 221ea721e4de5221e4f820e9fa9562a4ccde021a Mon Sep 17 00:00:00 2001 From: Gal Noam Date: Wed, 20 Mar 2024 13:47:09 +0200 Subject: [PATCH 140/169] version: 3.30.3 Signed-off-by: Gal Noam --- CHANGES | 11 +++++++++++ configure.ac | 2 +- contrib/scripts/libxlio.spec.in | 4 ++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/CHANGES b/CHANGES index 1c73eb04d..c0b532a8c 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,14 @@ +Version 3.30.3-1: +Date + Time 2024-03-20 +============================================================= +Added: + - RM #3788369 New Storage API + - RM #3777348 Improve sockinfo cache utilization + +Fixed: + - RM #3829626 Nginx http CPS tests report XLIO ERROR + - RM #3808935 SNAP4 static build error + Version 3.30.2-1: Date + Time 2024-03-11 ============================================================= diff --git a/configure.ac b/configure.ac index d783786f4..b3255693c 100644 --- a/configure.ac +++ b/configure.ac @@ -14,7 +14,7 @@ dnl===-----------------------------------------------------------------------=== # define([prj_ver_major], 3) define([prj_ver_minor], 30) -define([prj_ver_revision], 2) +define([prj_ver_revision], 3) define([prj_ver_release], esyscmd([echo ${PRJ_RELEASE:=0}])) diff --git a/contrib/scripts/libxlio.spec.in b/contrib/scripts/libxlio.spec.in index 59d71153f..174599081 100644 --- a/contrib/scripts/libxlio.spec.in +++ b/contrib/scripts/libxlio.spec.in @@ -189,7 +189,7 @@ fi %{_mandir}/man8/xlio_stats.* %changelog -* Mon Mar 11 2024 NVIDIA CORPORATION 3.30.2-1 -- Bump version to 3.30.2 +* Wed Mar 20 2024 NVIDIA CORPORATION 3.30.3-1 +- Bump version to 3.30.3 - Please refer to CHANGES for full changelog. From dcdcd643b7b177f535f976c4a1f0e93eab709d90 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sun, 17 Mar 2024 03:15:16 +0200 Subject: [PATCH 141/169] issue: 3788369 Keep global collection of the polling groups Keep a global collection to destroy remaining groups at exit. Usually, polling groups are pre-initialized in a small number (up to the number of CPU cores) and destroyed at exit. Therefore, the collection doesn't have to be efficient. Moreover, a group destruction is a heavy operation itself, therefore, optimizing the collection doesn't make sense. Signed-off-by: Dmytro Podgornyi --- src/core/event/poll_group.cpp | 33 +++++++++++++++++++++++++++++++++ src/core/event/poll_group.h | 1 + src/core/main.cpp | 3 +++ 3 files changed, 37 insertions(+) diff --git a/src/core/event/poll_group.cpp b/src/core/event/poll_group.cpp index 977374551..742fe678b 100644 --- a/src/core/event/poll_group.cpp +++ b/src/core/event/poll_group.cpp @@ -45,6 +45,14 @@ #define grp_loginfo __log_info #define grp_logdbg __log_dbg +/* + * Collection of the groups to destroy leftovers in the library destructor. 
+ * Groups are likely pre-initialized in a small number (up to the number of CPU cores) + * and destroyed at exit. Therefore, a simple collection data structure is enough. + */ +static std::vector s_poll_groups; +static lock_spin s_poll_groups_lock; + poll_group::poll_group(const struct xlio_poll_group_attr *attr) : m_socket_event_cb(attr->socket_event_cb) , m_socket_comp_cb(attr->socket_comp_cb) @@ -61,10 +69,35 @@ poll_group::poll_group(const struct xlio_poll_group_attr *attr) m_event_handler = std::make_unique(); m_tcp_timers = std::make_unique(1U); m_tcp_timers->set_group(this); + + s_poll_groups_lock.lock(); + s_poll_groups.push_back(this); + s_poll_groups_lock.unlock(); + + grp_logdbg("Polling group %p created", this); } poll_group::~poll_group() { + s_poll_groups_lock.lock(); + auto iter = std::find(s_poll_groups.begin(), s_poll_groups.end(), this); + if (iter != std::end(s_poll_groups)) { + s_poll_groups.erase(iter); + } + s_poll_groups_lock.unlock(); + + grp_logdbg("Polling group %p destroyed", this); +} + +/*static*/ +void poll_group::destroy_all_groups() +{ + s_poll_groups_lock.lock(); + std::vector groups(std::move(s_poll_groups)); + s_poll_groups_lock.unlock(); + for (poll_group *grp : groups) { + delete grp; + } } void poll_group::poll() diff --git a/src/core/event/poll_group.h b/src/core/event/poll_group.h index c13e01c96..e5573c697 100644 --- a/src/core/event/poll_group.h +++ b/src/core/event/poll_group.h @@ -49,6 +49,7 @@ class poll_group { public: poll_group(const struct xlio_poll_group_attr *attr); ~poll_group(); + static void destroy_all_groups(); void poll(); diff --git a/src/core/main.cpp b/src/core/main.cpp index f2713577a..2b7327cb5 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -52,6 +52,7 @@ #include "util/xlio_stats.h" #include "util/utils.h" #include "event/event_handler_manager.h" +#include "event/poll_group.h" #include "event/vlogger_timer_handler.h" #include "dev/buffer_pool.h" #include "dev/ib_ctx_handler_collection.h" @@ -145,6 +146,8 @@ static int free_libxlio_resources() delete g_p_fd_collection_temp; } + poll_group::destroy_all_groups(); + if (g_p_lwip) { delete g_p_lwip; } From dde0276cd66bb41cd69c321e57b7aaefce8dfd02 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sun, 17 Mar 2024 05:09:51 +0200 Subject: [PATCH 142/169] issue: 3788369 Keep sockets list per polling group Keep a sockets list per polling group to close the remaining sockets on a group destruction. 
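For illustration, a minimal self-contained sketch of the destruction flow described above (placeholder names, not the actual XLIO classes): closing a socket unlinks it from the group's list, which is why the destructor drains the list through front() instead of iterating over it.

    // Hypothetical sketch only -- the real types are poll_group and sockinfo_tcp.
    #include <list>

    struct socket_obj { /* ... */ };

    struct group_sketch {
        std::list<socket_obj *> sockets;

        void close_socket(socket_obj *s)
        {
            sockets.remove(s); // unlink first, then release the socket
            delete s;
        }

        ~group_sketch()
        {
            // close_socket() erases from the list, so drain the front element
            // instead of using an iterator that the erase would invalidate.
            while (!sockets.empty()) {
                close_socket(sockets.front());
            }
        }
    };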
Signed-off-by: Dmytro Podgornyi --- src/core/event/poll_group.cpp | 23 +++++++++++++++++++++++ src/core/event/poll_group.h | 7 ++++++- src/core/sock/sock-extra.cpp | 24 +++++++++++++++--------- src/core/sock/sockinfo_tcp.h | 1 + 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/src/core/event/poll_group.cpp b/src/core/event/poll_group.cpp index 742fe678b..22a650d57 100644 --- a/src/core/event/poll_group.cpp +++ b/src/core/event/poll_group.cpp @@ -86,6 +86,13 @@ poll_group::~poll_group() } s_poll_groups_lock.unlock(); + while (!m_sockets_list.empty()) { + sockinfo_tcp *si = dynamic_cast(m_sockets_list.front()); + if (likely(si)) { + close_socket(si, true); + } + } + grp_logdbg("Polling group %p destroyed", this); } @@ -150,3 +157,19 @@ void poll_group::del_ring(ring *rng) m_rings.erase(iter); } } + +void poll_group::add_socket(sockinfo_tcp *si) +{ + m_sockets_list.push_back(si); +} + +void poll_group::close_socket(sockinfo_tcp *si, bool force /*=false*/) +{ + m_sockets_list.erase(si); + + bool closed = si->prepare_to_close(force); + if (closed) { + si->clean_socket_obj(); + } + // TODO If not closed, the socket will be destroyed after the last completion notification. +} diff --git a/src/core/event/poll_group.h b/src/core/event/poll_group.h index e5573c697..ecb0f6c33 100644 --- a/src/core/event/poll_group.h +++ b/src/core/event/poll_group.h @@ -36,6 +36,7 @@ #include #include +#include "sock/fd_collection.h" #include "xlio.h" /* Forward declarations */ @@ -59,6 +60,9 @@ class poll_group { void add_ring(ring *); void del_ring(ring *); + void add_socket(sockinfo_tcp *si); + void close_socket(sockinfo_tcp *si, bool force = false); + unsigned get_flags() const { return m_group_flags; } event_handler_manager_local *get_event_handler() const { return m_event_handler.get(); } tcp_timers_collection *get_tcp_timers() const { return m_tcp_timers.get(); } @@ -73,9 +77,10 @@ class poll_group { std::unique_ptr m_event_handler; std::unique_ptr m_tcp_timers; + unsigned m_group_flags; std::vector m_dirty_sockets; - unsigned m_group_flags; + sock_fd_api_list_t m_sockets_list; }; #endif /* XLIO_GROUP_H */ diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index 7ecb2f533..d12d81110 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -405,9 +405,6 @@ extern "C" int xlio_init_ex(const struct xlio_init_attr *attr) if (!getenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL)) { setenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL, "0", 1); } - if (!getenv(SYS_VAR_TCP_ABORT_ON_CLOSE)) { - setenv(SYS_VAR_TCP_ABORT_ON_CLOSE, "1", 1); - } xlio_init(); @@ -470,19 +467,21 @@ extern "C" int xlio_socket_create(const struct xlio_socket_attr *attr, xlio_sock return -1; } - int sockfd = socket_internal(attr->domain, SOCK_STREAM, 0, true, false); - if (sockfd < 0) { + int fd = SYSCALL(socket, attr->domain, SOCK_STREAM, 0); + if (fd < 0) { return -1; } - sockinfo_tcp *si = dynamic_cast(g_p_fd_collection->get_sockfd(sockfd)); + sockinfo_tcp *si = new sockinfo_tcp(fd, attr->domain); if (!si) { - errno = EBADF; + errno = ENOMEM; return -1; } - si->set_xlio_socket(attr); + poll_group *grp = reinterpret_cast(attr->group); + grp->add_socket(si); + *sock_out = reinterpret_cast(si); return 0; } @@ -490,8 +489,15 @@ extern "C" int xlio_socket_create(const struct xlio_socket_attr *attr, xlio_sock extern "C" int xlio_socket_destroy(xlio_socket_t sock) { sockinfo_tcp *si = reinterpret_cast(sock); + poll_group *grp = si->get_poll_group(); - return XLIO_CALL(close, si->get_fd()); + if (likely(grp)) { + 
// We always force TCP reset not to handle FIN handshake and TIME-WAIT state. + grp->close_socket(si, true); + } else { + return XLIO_CALL(close, si->get_fd()); + } + return 0; } extern "C" int xlio_socket_setsockopt(xlio_socket_t sock, int level, int optname, diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index b12c12b34..0d731536b 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -408,6 +408,7 @@ class sockinfo_tcp : public sockinfo { void set_xlio_socket(const struct xlio_socket_attr *attr); void add_tx_ring_to_group(); bool is_xlio_socket() { return m_p_group != nullptr; } + poll_group *get_poll_group() { return m_p_group; } void xlio_socket_event(int event, int value); static err_t rx_lwip_cb_xlio_socket(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); static void err_lwip_cb_xlio_socket(void *pcb_container, err_t err); From 8452e0014c0fe365e7838208253e2411a50121b8 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sun, 17 Mar 2024 06:55:18 +0200 Subject: [PATCH 143/169] issue: 3788369 poll_group takes reference to ring To avoid a race and extra destruction operations, take additional reference to each ring within a group. The race can happen if a socket destruction leads to its ring destruction, but user returns RX buffers after that. User may return buffers to a group regardless of socket lifecycle. In a scenario when user closes all the sockets in runtime, the respective ring would be destroyed and created with new socket again. This is expensive operation and the reference avoids the destruction. Current implementation keeps all the rings until net_device_val destructor. In the future, poll_group should release the reference and destroy the ring. Signed-off-by: Dmytro Podgornyi --- src/core/dev/ring.cpp | 3 --- src/core/dev/ring.h | 4 ---- src/core/event/poll_group.cpp | 33 +++++++++++++++++---------------- src/core/event/poll_group.h | 8 ++++---- src/core/sock/sockinfo_tcp.cpp | 4 ++-- 5 files changed, 23 insertions(+), 29 deletions(-) diff --git a/src/core/dev/ring.cpp b/src/core/dev/ring.cpp index 7f89228fd..661e2aaa8 100644 --- a/src/core/dev/ring.cpp +++ b/src/core/dev/ring.cpp @@ -50,9 +50,6 @@ ring::ring() ring::~ring() { - if (m_p_group) { - m_p_group->del_ring(this); - } if (m_tcp_seg_list) { g_tcp_seg_pool->put_objs(m_tcp_seg_list); } diff --git a/src/core/dev/ring.h b/src/core/dev/ring.h index 859262fd2..4cacdf962 100644 --- a/src/core/dev/ring.h +++ b/src/core/dev/ring.h @@ -258,9 +258,6 @@ class ring { struct tcp_seg *get_tcp_segs(uint32_t num); void put_tcp_segs(struct tcp_seg *seg); - void set_group(poll_group *grp) { m_p_group = grp; } - poll_group *get_group() const { return m_p_group; } - ring_ec *socketxtreme_get_ecs(uint32_t num); void socketxtreme_put_ecs(struct ring_ec *ec); @@ -275,7 +272,6 @@ class ring { inline void set_parent(ring *parent) { m_parent = (parent ? parent : this); } inline void set_if_index(int if_index) { m_if_index = if_index; } - poll_group *m_p_group = nullptr; int *m_p_n_rx_channel_fds = nullptr; ring *m_parent = nullptr; diff --git a/src/core/event/poll_group.cpp b/src/core/event/poll_group.cpp index 22a650d57..f7676f0c4 100644 --- a/src/core/event/poll_group.cpp +++ b/src/core/event/poll_group.cpp @@ -93,6 +93,8 @@ poll_group::~poll_group() } } + // TODO Release the rings. Current implementation destroys the rings in the library destructor. 
+ grp_logdbg("Polling group %p destroyed", this); } @@ -134,27 +136,26 @@ void poll_group::flush() // TODO Ring doorbell and request TX completion. } -void poll_group::add_ring(ring *rng) +void poll_group::add_ring(ring *rng, ring_alloc_logic_attr *attr) { if (std::find(m_rings.begin(), m_rings.end(), rng) == std::end(m_rings)) { grp_logdbg("New ring %p in group %p", rng, this); - if (rng->get_group()) { - grp_logwarn("Ring belongs to a group %p (current group %p)", rng->get_group(), this); - } - rng->set_group(this); m_rings.push_back(rng); - // TODO Increase ref count for the ring and keep it until the group is destroyed. - // In this way we don't have to implement del_ring() and there won't be a race between - // socket destruction and xlio_group_buf_free(). - } -} -void poll_group::del_ring(ring *rng) -{ - auto iter = std::find(m_rings.begin(), m_rings.end(), rng); - if (iter != std::end(m_rings)) { - grp_logdbg("Removed ring %p from group %p", rng, this); - m_rings.erase(iter); + /* + * Take reference to the ring. This avoids a race between socket destruction and buffer + * return to the group. Socket destruction can lead to the ring destruction. But user + * may return a buffer outside of the socket lifecycle. + * This also avoids extra ring destruction in a scenario when application closes all + * the sockets multiple times in runtime. + */ + net_device_val *nd = g_p_net_device_table_mgr->get_net_device_val(rng->get_if_index()); + if (nd) { + ring *reserved = nd->reserve_ring(attr); + if (reserved != rng) { + grp_logerr("Cannot reserve ring %p (reserved=%p)", rng, reserved); + } + } } } diff --git a/src/core/event/poll_group.h b/src/core/event/poll_group.h index ecb0f6c33..76dbc1f3e 100644 --- a/src/core/event/poll_group.h +++ b/src/core/event/poll_group.h @@ -41,10 +41,11 @@ /* Forward declarations */ struct xlio_poll_group_attr; -class ring; class event_handler_manager_local; -class tcp_timers_collection; +class ring; +class ring_alloc_logic_attr; class sockinfo_tcp; +class tcp_timers_collection; class poll_group { public: @@ -57,8 +58,7 @@ class poll_group { void add_dirty_socket(sockinfo_tcp *si); void flush(); - void add_ring(ring *); - void del_ring(ring *); + void add_ring(ring *rng, ring_alloc_logic_attr *attr); void add_socket(sockinfo_tcp *si); void close_socket(sockinfo_tcp *si, bool force = false); diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index d43510e5c..2fa352814 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -397,7 +397,7 @@ sockinfo_tcp::sockinfo_tcp(int fd, int domain) void sockinfo_tcp::rx_add_ring_cb(ring *p_ring) { if (m_p_group) { - m_p_group->add_ring(p_ring); + m_p_group->add_ring(p_ring, &m_ring_alloc_log_rx); } sockinfo::rx_add_ring_cb(p_ring); } @@ -433,7 +433,7 @@ void sockinfo_tcp::add_tx_ring_to_group() { ring *rng = get_tx_ring(); if (m_p_group && rng) { - m_p_group->add_ring(rng); + m_p_group->add_ring(rng, &m_ring_alloc_log_tx); } } From 617759ea93fe1fc7853e9cf3cac2cad1e0b9d788 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 18 Mar 2024 00:45:16 +0200 Subject: [PATCH 144/169] issue: 3788369 Throw exception if netdev not found for a ring Rings are created in context of a net_device_val. Therefore, the net_device_val object must be present for an existing ring. However, API allows to return NULL by the netdev lookup and coverity generates a warning because of that. Throw an exception to suppress the coverity warning. 
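For reference, a small self-contained example of the pattern used below (hypothetical names, not XLIO code): the lookup can formally return NULL, so an explicit check-and-throw documents the "cannot happen" invariant instead of leaving a path that a static analyzer flags as a NULL dereference.

    #include <stdexcept>

    struct netdev { int mtu; };

    static netdev g_dev {1500};
    // By contract the lookup may return nullptr, even though a caller that
    // owns a ring is guaranteed a hit.
    static netdev *lookup(int if_index) { return if_index == 0 ? &g_dev : nullptr; }

    static int ring_mtu(int if_index)
    {
        netdev *nd = lookup(if_index);
        if (!nd) {
            // Unreachable by design; throwing documents the invariant so the
            // analyzer does not assume nd is dereferenced as NULL below.
            throw std::runtime_error("Cannot find netdev for a ring");
        }
        return nd->mtu;
    }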
Signed-off-by: Dmytro Podgornyi --- src/core/dev/ring_simple.cpp | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index 2d2a43bdd..c6ccbe98e 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -90,6 +90,13 @@ ring_simple::ring_simple(int if_index, ring *parent, ring_type_t type, bool use_ , m_gro_mgr(safe_mce_sys().gro_streams_max, MAX_GRO_BUFS) { net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + BULLSEYE_EXCLUDE_BLOCK_START + if (!p_ndev) { + // Coverity warning suppression + throw_xlio_exception("Cannot find netdev for a ring"); + } + BULLSEYE_EXCLUDE_BLOCK_END + const slave_data_t *p_slave = p_ndev->get_slave(get_if_index()); ring_logdbg("new ring_simple()"); @@ -209,10 +216,18 @@ ring_simple::~ring_simple() void ring_simple::create_resources() { net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + BULLSEYE_EXCLUDE_BLOCK_START + if (!p_ndev) { + // Coverity warning suppression + throw_xlio_exception("Cannot find netdev for a ring"); + } + BULLSEYE_EXCLUDE_BLOCK_END + const slave_data_t *p_slave = p_ndev->get_slave(get_if_index()); save_l2_address(p_slave->p_L2_addr); m_p_tx_comp_event_channel = ibv_create_comp_channel(m_p_ib_ctx->get_ibv_context()); + BULLSEYE_EXCLUDE_BLOCK_START if (!m_p_tx_comp_event_channel) { VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS( VLOG_ERROR, VLOG_DEBUG, @@ -319,9 +334,7 @@ void ring_simple::create_resources() #endif ring_logdbg("ring attributes: m_flow_tag_enabled = %d", m_flow_tag_enabled); - m_p_rx_comp_event_channel = ibv_create_comp_channel( - m_p_ib_ctx->get_ibv_context()); // ODED TODO: Adjust the ibv_context to be the exact one in - // case of different devices + m_p_rx_comp_event_channel = ibv_create_comp_channel(m_p_ib_ctx->get_ibv_context()); BULLSEYE_EXCLUDE_BLOCK_START if (!m_p_rx_comp_event_channel) { VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS( From 25351d9477b2b743896c7847af6c5b3a12a2c8eb Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Thu, 21 Mar 2024 02:16:10 +0200 Subject: [PATCH 145/169] issue: 3788369 Release native rings in the poll_group destructor poll_group takes additional reference to each its ring. But it doesn't release it once the group is destroyed. This leads to two issues: 1. Extra resources are utilized if user destroys a polling group before the application terminates. 2. Polling is not possible for a destroyed group. Therefore, if there are not completed WQEs in the SQ, respective sockets won't report TX completions and cannot be fully terminated. The ring needs to be destroyed to flush all the completions. Release all the native rings explicitly in the poll_group destructor to resolve the above issues. Signed-off-by: Dmytro Podgornyi --- src/core/event/poll_group.cpp | 14 +++++++++++++- src/core/event/poll_group.h | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/core/event/poll_group.cpp b/src/core/event/poll_group.cpp index f7676f0c4..ad25559d7 100644 --- a/src/core/event/poll_group.cpp +++ b/src/core/event/poll_group.cpp @@ -33,6 +33,8 @@ #include "config.h" #include "poll_group.h" +#include "dev/net_device_table_mgr.h" +#include "dev/net_device_val.h" #include "dev/ring.h" #include "event/event_handler_manager_local.h" #include "sock/sockinfo_tcp.h" @@ -93,7 +95,11 @@ poll_group::~poll_group() } } - // TODO Release the rings. Current implementation destroys the rings in the library destructor. 
+ // Release references to the rings that we take in add_ring() + for (auto &item : m_rings_ref) { + item.second->release_ring(item.first.get()); + } + m_rings_ref.clear(); grp_logdbg("Polling group %p destroyed", this); } @@ -154,6 +160,12 @@ void poll_group::add_ring(ring *rng, ring_alloc_logic_attr *attr) ring *reserved = nd->reserve_ring(attr); if (reserved != rng) { grp_logerr("Cannot reserve ring %p (reserved=%p)", rng, reserved); + if (reserved) { + nd->release_ring(attr); + } + } else { + m_rings_ref.push_back( + std::make_pair(std::make_unique(*attr), nd)); } } } diff --git a/src/core/event/poll_group.h b/src/core/event/poll_group.h index 76dbc1f3e..3fbbf59c6 100644 --- a/src/core/event/poll_group.h +++ b/src/core/event/poll_group.h @@ -81,6 +81,7 @@ class poll_group { std::vector m_dirty_sockets; sock_fd_api_list_t m_sockets_list; + std::vector, net_device_val *>> m_rings_ref; }; #endif /* XLIO_GROUP_H */ From 0b3eb599ab1a3e19622d57d66bfaaf634dd9ffc9 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sun, 24 Mar 2024 03:31:10 +0200 Subject: [PATCH 146/169] issue: 3788369 Don't free buffer unconditionally in XLIO Socket API When an RX packet event happens, XLIO passes the ownership to user. Further, user releases the buffer explicitly. However, XLIO frees the buffer unconditionally just after emitting the event. Fix this and free buffers only if user doesn't provides the RX event callback. Signed-off-by: Dmytro Podgornyi --- src/core/sock/sockinfo_tcp.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 2fa352814..8b97b7d96 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -476,8 +476,9 @@ err_t sockinfo_tcp::rx_lwip_cb_xlio_socket(void *arg, struct tcp_pcb *pcb, struc reinterpret_cast(ptmp)); ptmp = ptmp->next; } + } else { + pbuf_free(p); } - pbuf_free(p); // TODO Stats From 69ca61fc04ee2f25191cf58cc06b620174cde3e0 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sun, 24 Mar 2024 03:34:34 +0200 Subject: [PATCH 147/169] issue: 3788369 Use reclaim_recv_buffers() in XLIO Socket API reclaim_recv_single_buffer() accumulates buffers in a list. In the performance oriented API we want to reuse hot buffers immediately, so reclaim_recv_buffers() implementation is more suitable. Signed-off-by: Dmytro Podgornyi --- src/core/sock/sock-extra.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index d12d81110..3bc0fc80b 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -549,7 +549,11 @@ static void xlio_buf_free(struct xlio_buf *buf) mem_buf_desc_t *desc = reinterpret_cast(buf); ring_slave *rng = desc->p_desc_owner; - (void)rng->reclaim_recv_single_buffer(desc); + desc->p_next_desc = nullptr; + bool ret = rng->reclaim_recv_buffers(desc); + if (unlikely(!ret)) { + g_buffer_pool_rx_ptr->put_buffer_after_deref_thread_safe(desc); + } } extern "C" void xlio_socket_buf_free(xlio_socket_t sock, struct xlio_buf *buf) From 8ad2b35ac4d724aeaea63332c0d63052c884f926 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 25 Mar 2024 02:21:05 +0200 Subject: [PATCH 148/169] issue: 3788369 Pass proper hugepage_size to XLIO Socket API The memory callback provides hugepage size of the underlying pages. Replace hardcoded 0 with real hugepage size. Keep the page size in xlio_allocator object. 
This field is relevant only for the hugepage allocation method and is 0 in all other cases. Signed-off-by: Dmytro Podgornyi --- src/core/dev/allocator.cpp | 5 +++-- src/core/dev/allocator.h | 6 ++++-- src/core/util/hugepage_mgr.cpp | 3 ++- src/core/util/hugepage_mgr.h | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/core/dev/allocator.cpp b/src/core/dev/allocator.cpp index ffd1db1b7..9af6613b9 100644 --- a/src/core/dev/allocator.cpp +++ b/src/core/dev/allocator.cpp @@ -66,6 +66,7 @@ xlio_allocator::xlio_allocator(alloc_t alloc_func, free_t free_func) m_type = static_cast(safe_mce_sys().mem_alloc_type); m_data = nullptr; m_size = 0; + m_page_size = 0; m_memalloc = alloc_func; m_memfree = free_func; if (m_memalloc) { @@ -155,7 +156,7 @@ void *xlio_allocator::alloc_huge(size_t size) __log_info_dbg("Allocating %zu bytes in huge tlb using mmap", size); size_t actual_size = size; - m_data = g_hugepage_mgr.alloc_hugepages(actual_size); + m_data = g_hugepage_mgr.alloc_hugepages(actual_size, m_page_size); if (!m_data && g_hugepage_mgr.get_default_hugepage() && m_type == ALLOC_TYPE_HUGEPAGES) { // Print a warning message on allocation error if hugepages are supported // and this is not a fallback from a different allocation method. @@ -505,7 +506,7 @@ bool xlio_heap::expand(size_t size /*=0*/) m_latest_offset = 0; if (m_b_hw && g_user_memory_cb) { - g_user_memory_cb(data, size, 0); + g_user_memory_cb(data, size, block->page_size()); } return true; diff --git a/src/core/dev/allocator.h b/src/core/dev/allocator.h index 39a749947..9e27693cf 100644 --- a/src/core/dev/allocator.h +++ b/src/core/dev/allocator.h @@ -61,8 +61,9 @@ class xlio_allocator { void dealloc(); - inline size_t size() { return m_size; } - inline void *data() { return m_data; } + size_t size() { return m_size; } + size_t page_size() { return m_page_size; } + void *data() { return m_data; } private: void print_hugepages_warning(size_t requested_size); @@ -71,6 +72,7 @@ class xlio_allocator { alloc_mode_t m_type; void *m_data; size_t m_size; + size_t m_page_size; private: alloc_t m_memalloc; diff --git a/src/core/util/hugepage_mgr.cpp b/src/core/util/hugepage_mgr.cpp index c5bc254a1..f502786bf 100644 --- a/src/core/util/hugepage_mgr.cpp +++ b/src/core/util/hugepage_mgr.cpp @@ -118,7 +118,7 @@ void *hugepage_mgr::alloc_hugepages_helper(size_t &size, size_t hugepage) return ptr; } -void *hugepage_mgr::alloc_hugepages(size_t &size) +void *hugepage_mgr::alloc_hugepages(size_t &size, size_t &hugepage_size) { std::lock_guard lock(m_lock); @@ -149,6 +149,7 @@ void *hugepage_mgr::alloc_hugepages(size_t &size) } if (ptr) { size = actual_size; + hugepage_size = hugepage; } // Statistics diff --git a/src/core/util/hugepage_mgr.h b/src/core/util/hugepage_mgr.h index 33f5a8485..b0a8c76ca 100644 --- a/src/core/util/hugepage_mgr.h +++ b/src/core/util/hugepage_mgr.h @@ -66,7 +66,7 @@ class hugepage_mgr { size_t get_default_hugepage() { return m_default_hugepage; } bool is_hugepage_supported(size_t hugepage); - void *alloc_hugepages(size_t &size); + void *alloc_hugepages(size_t &size, size_t &hugepage_size); void dealloc_hugepages(void *ptr, size_t size); void print_report(bool short_report = false); From 7b8853d28192bd6e29216d3915991c3837d8edce Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 25 Mar 2024 02:39:39 +0200 Subject: [PATCH 149/169] issue: 3788369 Poll local ring before XLIO socket destruction XLIO Socket API must guarantee that the XLIO_SOCKET_EVENT_TERMINATED is not followed by any other events.
Therefore, all the TX completion events must be completed by that moment. Do a polling iteration before calling socket destructor to increase the chance that all the relevant WQEs are completed. This mechanism needs to be improved in the future. Signed-off-by: Dmytro Podgornyi --- src/core/event/poll_group.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/core/event/poll_group.cpp b/src/core/event/poll_group.cpp index ad25559d7..2a5aac2a5 100644 --- a/src/core/event/poll_group.cpp +++ b/src/core/event/poll_group.cpp @@ -182,7 +182,15 @@ void poll_group::close_socket(sockinfo_tcp *si, bool force /*=false*/) bool closed = si->prepare_to_close(force); if (closed) { + /* + * Current implementation forces TCP reset, so the socket is expected to be closable. + * Do a polling iteration to increase the chance that all the relevant WQEs are completed + * and XLIO emitted all the TX completion before the XLIO_SOCKET_EVENT_TERMINATED event. + * + * TODO Implement more reliable mechanism of deferred socket destruction if there are + * not completed TX operations. + */ + poll(); si->clean_socket_obj(); } - // TODO If not closed, the socket will be destroyed after the last completion notification. } From 64bf555cc6c8d09deaaca2365f86c87700845752 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 25 Mar 2024 10:26:14 +0200 Subject: [PATCH 150/169] issue: 3788369 Re-read env params in xlio_init_ex() xlio_init_ex() changes some default parameters. However, a global object can trigger safe_mce_sys() constructor at the start. Therefore, we need to re-read the environment variables again to guarantee that the changed parameters take place. Signed-off-by: Dmytro Podgornyi --- src/core/sock/sock-extra.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index 3bc0fc80b..83a329aa8 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -406,6 +406,9 @@ extern "C" int xlio_init_ex(const struct xlio_init_attr *attr) setenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL, "0", 1); } + // Read the updated parameters. A global object could trigger the reading earlier. + safe_mce_sys().get_env_params(); + xlio_init(); extern xlio_memory_cb_t g_user_memory_cb; From 4771bdd7803bb73a175592c9c552455bc579394b Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 25 Mar 2024 10:29:37 +0200 Subject: [PATCH 151/169] issue: 3788369 Avoid POSIX connect() in xlio_socket_connect() Avoid using connect() with sock fd interface, because fd_collection doesn't keep xlio_socket_t objects. Signed-off-by: Dmytro Podgornyi --- src/core/sock/sock-extra.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index 83a329aa8..b8ee2584d 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -523,7 +523,7 @@ extern "C" int xlio_socket_connect(xlio_socket_t sock, const struct sockaddr *to sockinfo_tcp *si = reinterpret_cast(sock); int errno_save = errno; - int rc = XLIO_CALL(connect, si->get_fd(), to, tolen); + int rc = si->connect(to, tolen); rc = (rc == -1 && (errno == EINPROGRESS || errno == EAGAIN)) ? 0 : rc; if (rc == 0) { si->add_tx_ring_to_group(); From dfdbd8bc20ff2538bb37625dc00ffb0ebe310753 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 25 Mar 2024 19:37:47 +0200 Subject: [PATCH 152/169] issue: 3788369 Remove get_fd() from XLIO Socket API xlio_socket_t objects aren't connected to the fd_collection anymore. 
Therefore, all the methods must be called from the sockinfo_tcp objects directly. Also, xlio_socket_fd() is not relevant anymore and can be removed. Signed-off-by: Dmytro Podgornyi --- src/core/sock/sock-extra.cpp | 20 +++++++++++-------- src/core/xlio.h | 6 ------ tests/extra_api/xlio_socket_api.c | 33 +------------------------------ 3 files changed, 13 insertions(+), 46 deletions(-) diff --git a/src/core/sock/sock-extra.cpp b/src/core/sock/sock-extra.cpp index b8ee2584d..48cb63250 100644 --- a/src/core/sock/sock-extra.cpp +++ b/src/core/sock/sock-extra.cpp @@ -507,15 +507,25 @@ extern "C" int xlio_socket_setsockopt(xlio_socket_t sock, int level, int optname const void *optval, socklen_t optlen) { sockinfo_tcp *si = reinterpret_cast(sock); + int errno_save = errno; - return XLIO_CALL(setsockopt, si->get_fd(), level, optname, optval, optlen); + int rc = si->setsockopt(level, optname, optval, optlen); + if (rc == 0) { + errno = errno_save; + } + return rc; } extern "C" int xlio_socket_bind(xlio_socket_t sock, const struct sockaddr *addr, socklen_t addrlen) { sockinfo_tcp *si = reinterpret_cast(sock); + int errno_save = errno; - return XLIO_CALL(bind, si->get_fd(), addr, addrlen); + int rc = si->bind(addr, addrlen); + if (rc == 0) { + errno = errno_save; + } + return rc; } extern "C" int xlio_socket_connect(xlio_socket_t sock, const struct sockaddr *to, socklen_t tolen) @@ -540,12 +550,6 @@ extern "C" struct ibv_pd *xlio_socket_get_pd(xlio_socket_t sock) return ctx ? ctx->get_ibv_pd() : nullptr; } -extern "C" int xlio_socket_fd(xlio_socket_t sock) -{ - sockinfo_tcp *si = reinterpret_cast(sock); - return si->get_fd(); -} - static void xlio_buf_free(struct xlio_buf *buf) { // TODO Use mem_buf_desc_t field as xlio_buf diff --git a/src/core/xlio.h b/src/core/xlio.h index 8294986a3..f3daa80bb 100644 --- a/src/core/xlio.h +++ b/src/core/xlio.h @@ -488,12 +488,6 @@ void xlio_socket_flush(xlio_socket_t sock); void xlio_socket_buf_free(xlio_socket_t sock, struct xlio_buf *buf); void xlio_poll_group_buf_free(xlio_poll_group_t group, struct xlio_buf *buf); -/* - * Experimental level API. - */ - -int xlio_socket_fd(xlio_socket_t sock); - #ifdef __cplusplus } #endif diff --git a/tests/extra_api/xlio_socket_api.c b/tests/extra_api/xlio_socket_api.c index b170d0aae..9bb7b11a6 100644 --- a/tests/extra_api/xlio_socket_api.c +++ b/tests/extra_api/xlio_socket_api.c @@ -233,38 +233,7 @@ static void test_multi_groups(const char *ip) rc = xlio_socket_connect(sock3, (struct sockaddr *)&addr, sizeof(addr)); assert(rc == 0); - int fd1_1 = xlio_socket_fd(sock1_1); - int fd1_2 = xlio_socket_fd(sock1_2); - int fd2 = xlio_socket_fd(sock2); - int fd3 = xlio_socket_fd(sock3); - assert(fd1_1 >= 0); - assert(fd1_2 >= 0); - assert(fd2 >= 0); - assert(fd3 >= 0); - - assert(xlio_get_socket_rings_num(fd1_1) == 1); - assert(xlio_get_socket_rings_num(fd1_2) == 1); - assert(xlio_get_socket_rings_num(fd2) == 1); - assert(xlio_get_socket_rings_num(fd3) == 1); - - int ring1_1; - int ring1_2; - int ring2; - int ring3; - - rc = xlio_get_socket_rings_fds(fd1_1, &ring1_1, 1); - assert(rc == 1); - rc = xlio_get_socket_rings_fds(fd1_2, &ring1_2, 1); - assert(rc == 1); - rc = xlio_get_socket_rings_fds(fd2, &ring2, 1); - assert(rc == 1); - rc = xlio_get_socket_rings_fds(fd3, &ring3, 1); - assert(rc == 1); - - assert(ring1_1 == ring1_2); - assert(ring1_1 != ring2); - assert(ring1_1 != ring3); - assert(ring2 != ring3); + /* TODO There is no API to check expected internal ring distribution. */ /* Wait for ERROR events (ECONREFUSED). 
*/ while (g_test_events < 4) { From 86fc67a4065c0d13233ccc7723ac1a1f82c3e622 Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Wed, 27 Mar 2024 20:21:19 +0000 Subject: [PATCH 153/169] issue: 3829626 Fix seg fault in TCP timers Iterating over an std::list of TCP sockets while erasing a socket during the iteration causes a crash. This is overcome by advancing the iterator before the erase. Signed-off-by: Iftah Levi --- src/core/sock/sockinfo_tcp.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 8b97b7d96..bd35d7fd8 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -5978,7 +5978,13 @@ void tcp_timers_collection::handle_timer_expired(void *user_data) sock_list &bucket = m_p_intervals[m_n_location]; m_n_location = (m_n_location + 1) % m_n_intervals_size; - for (sockinfo_tcp *p_sock : bucket) { + auto iter = bucket.begin(); + while (iter != bucket.end()) { + sockinfo_tcp *p_sock = *iter; + // Must inc iter first because handle_timer_expired can erase + // the socket that the iter points to, with delegated timers. + iter++; + /* It is not guaranteed that the same sockinfo object is met once * in this loop. * So in case sockinfo object is destroyed other processing From 65130bd4721d4569e7edf7e07cdf9f7091abb43c Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 1 Apr 2024 16:53:28 +0300 Subject: [PATCH 154/169] issue: 3818038 Remove BlueFlame doorbell method rdma-core limits the number of UARs per context to 16 by default. After creating 16 QPs, XLIO receives duplicates of the blueflame registers for each subsequent QP. As a result, the blueflame doorbell method can write WQEs concurrently without serialization, and this leads to data corruption. BlueFlame can also impact throughput, since the copy to the blueflame register is expensive. It can improve latency in some low-latency scenarios, however, XLIO targets high traffic/PPS rates. Removing the blueflame method also slightly improves performance in some scenarios. BlueFlame can be brought back in the future to improve low-latency scenarios, however, it will need some rework to avoid the data corruption. Signed-off-by: Dmytro Podgornyi --- src/core/dev/hw_queue_tx.cpp | 103 ++++++++---------------------------------- src/core/dev/hw_queue_tx.h | 4 +- src/core/ib/mlx5/ib_mlx5.cpp | 2 - src/core/ib/mlx5/ib_mlx5.h | 2 - 4 files changed, 25 insertions(+), 86 deletions(-) diff --git a/src/core/dev/hw_queue_tx.cpp b/src/core/dev/hw_queue_tx.cpp index 6a5877cf5..f4c786527 100644 --- a/src/core/dev/hw_queue_tx.cpp +++ b/src/core/dev/hw_queue_tx.cpp @@ -140,11 +140,6 @@ hw_queue_tx::hw_queue_tx(ring_simple *ring, const slave_data_t *slave, const uin // Check device capabilities for dummy send support m_hw_dummy_send_support = xlio_is_nop_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); - m_db_method = - (is_bf((slave->p_ib_ctx)->get_ibv_context()) ?
MLX5_DB_METHOD_BF : MLX5_DB_METHOD_DB); - - hwqtx_logdbg("m_db_method=%d", m_db_method); - if (configure(slave)) { throw_xlio_exception("Failed to configure"); } @@ -498,9 +493,9 @@ void hw_queue_tx::init_queue() m_sq_wqe_hot->eseg.cs_flags = XLIO_TX_PACKET_L3_CSUM | XLIO_TX_PACKET_L4_CSUM; hwqtx_logfunc("%p allocated for %d QPs sq_wqes:%p sq_wqes_end: %p and configured %d WRs " - "BlueFlame: %p buf_size: %d offset: %d", + "BlueFlame: %p", m_mlx5_qp.qp, m_mlx5_qp.qpn, m_sq_wqes, m_sq_wqes_end, m_tx_num_wr, - m_mlx5_qp.bf.reg, m_mlx5_qp.bf.size, m_mlx5_qp.bf.offset); + m_mlx5_qp.bf.reg); } void hw_queue_tx::init_device_memory() @@ -508,18 +503,10 @@ void hw_queue_tx::init_device_memory() /* This limitation is done because of a observation * that dm_copy takes a lot of time on VMs w/o BF (RM:1542628) */ - if (m_p_ib_ctx_handler->get_on_device_memory_size() > 0) { - if (m_db_method == MLX5_DB_METHOD_BF) { - m_dm_enabled = - m_dm_mgr.allocate_resources(m_p_ib_ctx_handler, m_p_ring->m_p_ring_stat.get()); - - } else { -#if defined(DEFINED_IBV_DM) - VLOG_PRINTF_ONCE_THEN_DEBUG( - VLOG_WARNING, - "Device Memory functionality is not used on devices w/o Blue Flame support\n"); -#endif /* DEFINED_IBV_DM */ - } + if (m_p_ib_ctx_handler->get_on_device_memory_size() > 0 && + is_bf(m_p_ib_ctx_handler->get_ibv_context())) { + m_dm_enabled = + m_dm_mgr.allocate_resources(m_p_ib_ctx_handler, m_p_ring->m_p_ring_stat.get()); } } @@ -543,10 +530,9 @@ cq_mgr_tx *hw_queue_tx::init_tx_cq_mgr() m_p_ring->get_tx_comp_event_channel()); } -inline void hw_queue_tx::ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top, - bool skip_comp /*=false*/) +inline void hw_queue_tx::ring_doorbell(int num_wqebb, bool skip_comp /*=false*/) { - uint64_t *dst = (uint64_t *)((uint8_t *)m_mlx5_qp.bf.reg + m_mlx5_qp.bf.offset); + uint64_t *dst = (uint64_t *)m_mlx5_qp.bf.reg; uint64_t *src = reinterpret_cast(m_sq_wqe_hot); struct xlio_mlx5_wqe_ctrl_seg *ctrl = reinterpret_cast(src); @@ -566,7 +552,7 @@ inline void hw_queue_tx::ring_doorbell(int db_method, int num_wqebb, int num_wqe m_b_fence_needed = false; } - m_sq_wqe_counter = (m_sq_wqe_counter + num_wqebb + num_wqebb_top) & 0xFFFF; + m_sq_wqe_counter = (m_sq_wqe_counter + num_wqebb) & 0xFFFF; // Make sure that descriptors are written before // updating doorbell record and ringing the doorbell @@ -575,29 +561,13 @@ inline void hw_queue_tx::ring_doorbell(int db_method, int num_wqebb, int num_wqe // This wc_wmb ensures ordering between DB record and BF copy wc_wmb(); - if (likely(db_method == MLX5_DB_METHOD_BF)) { - /* Copying src to BlueFlame register buffer by Write Combining cnt WQEBBs - * Avoid using memcpy() to copy to BlueFlame page, since memcpy() - * implementations may use move-string-buffer assembler instructions, - * which do not guarantee order of copying. - */ - while (num_wqebb--) { - COPY_64B_NT(dst, src); - } - src = (uint64_t *)m_sq_wqes; - while (num_wqebb_top--) { - COPY_64B_NT(dst, src); - } - } else { - *dst = *src; - } + *dst = *src; /* Use wc_wmb() to ensure write combining buffers are flushed out * of the running CPU. 
* sfence instruction affects only the WC buffers of the CPU that executes it */ wc_wmb(); - m_mlx5_qp.bf.offset ^= m_mlx5_qp.bf.size; } inline int hw_queue_tx::fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, @@ -670,7 +640,7 @@ inline int hw_queue_tx::fill_wqe(xlio_ibv_send_wr *pswr) rest_space = align_to_WQEBB_up(wqe_size) / 4; hwqtx_logfunc("data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", data_len - inline_len, inline_len, wqe_size, rest_space); - ring_doorbell(m_db_method, rest_space); + ring_doorbell(rest_space); return rest_space; } else { // wrap around case, first filling till the end of m_sq_wqes @@ -715,7 +685,7 @@ inline int hw_queue_tx::fill_wqe(xlio_ibv_send_wr *pswr) dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, rest_space * 4 * 16); dbg_dump_wqe((uint32_t *)m_sq_wqes, max_inline_len * 4 * 16); - ring_doorbell(m_db_method, rest_space, max_inline_len); + ring_doorbell(rest_space + max_inline_len); return rest_space + max_inline_len; } } else { @@ -773,8 +743,7 @@ inline int hw_queue_tx::fill_wqe_send(xlio_ibv_send_wr *pswr) m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); int wqebbs = align_to_WQEBB_up(wqe_size) / 4; - /* TODO FIXME Split into top and bottom parts */ - ring_doorbell(m_db_method, wqebbs); + ring_doorbell(wqebbs); return wqebbs; } @@ -825,7 +794,6 @@ inline int hw_queue_tx::fill_wqe_lso(xlio_ibv_send_wr *pswr) max_inline_len = align_to_octoword_up(inl_hdr_copy_size); cur_seg = (uint8_t *)m_sq_wqes + max_inline_len; wqe_size += rest / OCTOWORD; - inl_hdr_copy_size = align_to_WQEBB_up(wqe_size) / 4; } wqe_size += max_inline_len / OCTOWORD; hwqtx_logfunc("TSO: num_sge: %d max_inline_len: %d inl_hdr_size: %d rest: %d", pswr->num_sge, @@ -835,7 +803,6 @@ inline int hw_queue_tx::fill_wqe_lso(xlio_ibv_send_wr *pswr) for (i = 0; i < pswr->num_sge; i++) { if (unlikely((uintptr_t)dpseg >= (uintptr_t)m_sq_wqes_end)) { dpseg = (struct mlx5_wqe_data_seg *)m_sq_wqes; - inl_hdr_copy_size = align_to_WQEBB_up(wqe_size) / 4; } dpseg->addr = htonll((uint64_t)pswr->sg_list[i].addr); dpseg->lkey = htonl(pswr->sg_list[i].lkey); @@ -847,21 +814,11 @@ inline int hw_queue_tx::fill_wqe_lso(xlio_ibv_send_wr *pswr) dpseg++; wqe_size += sizeof(struct mlx5_wqe_data_seg) / OCTOWORD; } - inl_hdr_size = align_to_WQEBB_up(wqe_size) / 4; m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); - // sending by BlueFlame or DoorBell covering wrap around - // TODO Make a single doorbell call - if (likely(inl_hdr_size <= 4)) { - if (likely(inl_hdr_copy_size == 0)) { - ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_size); - } else { - ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_copy_size, inl_hdr_size - inl_hdr_copy_size); - } - } else { - ring_doorbell(MLX5_DB_METHOD_DB, inl_hdr_size); - } - return align_to_WQEBB_up(wqe_size) / 4; + int wqebbs = align_to_WQEBB_up(wqe_size) / 4; + ring_doorbell(wqebbs); + return wqebbs; } void hw_queue_tx::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, xlio_ti *ti) @@ -878,8 +835,7 @@ void hw_queue_tx::store_current_wqe_prop(mem_buf_desc_t *buf, unsigned credits, } } -//! Send one RAW packet by MLX5 BlueFlame -// +//! 
Send one RAW packet void hw_queue_tx::send_to_wire(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr, bool request_comp, xlio_tis *tis, unsigned credits) { @@ -1028,7 +984,7 @@ void hw_queue_tx::nvme_set_static_context(xlio_tis *tis, uint32_t config) auto *params = wqebb_get(2U); nvme_fill_static_params_transport_params(params, config); store_current_wqe_prop(nullptr, SQ_CREDITS_UMR, tis); - ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS); + ring_doorbell(MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS); update_next_wqe_hot(); } @@ -1038,7 +994,7 @@ void hw_queue_tx::nvme_set_progress_context(xlio_tis *tis, uint32_t tcp_seqno) nvme_fill_progress_wqe(wqe, m_sq_wqe_counter, m_mlx5_qp.qpn, tis->get_tisn(), tcp_seqno, MLX5_FENCE_MODE_INITIATOR_SMALL); store_current_wqe_prop(nullptr, SQ_CREDITS_SET_PSV, tis); - ring_doorbell(MLX5_DB_METHOD_DB, MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS); + ring_doorbell(MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS); update_next_wqe_hot(); } @@ -1311,8 +1267,6 @@ inline void hw_queue_tx::tls_post_static_params_wqe(xlio_ti *ti, const struct xl ucseg->flags = MLX5_UMR_INLINE; ucseg->bsf_octowords = htobe16(DEVX_ST_SZ_BYTES(tls_static_params) / 16); - int num_wqebbs = TLS_SET_STATIC_PARAMS_WQEBBS; - int num_wqebbs_top = 0; int sq_wqebbs_room_left = (static_cast(m_sq_wqes_end - reinterpret_cast(cseg)) / MLX5_SEND_WQE_BB); @@ -1324,14 +1278,10 @@ inline void hw_queue_tx::tls_post_static_params_wqe(xlio_ti *ti, const struct xl if (unlikely(sq_wqebbs_room_left == 2)) { // Case 2: Change tspseg pointer: tspseg = reinterpret_cast(m_sq_wqes); - num_wqebbs = 2; - num_wqebbs_top = 1; } else if (unlikely(sq_wqebbs_room_left == 1)) { // Case 3: Change mkcseg and tspseg pointers: mkcseg = reinterpret_cast(m_sq_wqes); tspseg = reinterpret_cast( reinterpret_cast(m_sq_wqes) + sizeof(*mkcseg)); - num_wqebbs = 1; - num_wqebbs_top = 2; } memset(mkcseg, 0, sizeof(*mkcseg)); @@ -1340,7 +1290,7 @@ inline void hw_queue_tx::tls_post_static_params_wqe(xlio_ti *ti, const struct xl tls_fill_static_params_wqe(tspseg, info, key_id, resync_tcp_sn); store_current_wqe_prop(nullptr, SQ_CREDITS_UMR, ti); - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs, num_wqebbs_top, true); + ring_doorbell(TLS_SET_STATIC_PARAMS_WQEBBS, true); dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, sizeof(mlx5_set_tls_static_params_wqe)); update_next_wqe_hot(); @@ -1364,8 +1314,6 @@ inline void hw_queue_tx::tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_ uint32_t next_record_tcp_sn, bool fence, bool is_tx) { - uint16_t num_wqebbs = TLS_SET_PROGRESS_PARAMS_WQEBBS; - struct mlx5_set_tls_progress_params_wqe *wqe = reinterpret_cast(m_sq_wqe_hot); struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; @@ -1386,7 +1334,7 @@ inline void hw_queue_tx::tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_ tls_fill_progress_params_wqe(&wqe->params, tis_tir_number, next_record_tcp_sn); store_current_wqe_prop(nullptr, SQ_CREDITS_SET_PSV, ti); - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs); + ring_doorbell(TLS_SET_PROGRESS_PARAMS_WQEBBS); dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, sizeof(mlx5_set_tls_progress_params_wqe)); update_next_wqe_hot(); @@ -1395,8 +1343,6 @@ inline void hw_queue_tx::tls_post_progress_params_wqe(xlio_ti *ti, uint32_t tis_ inline void hw_queue_tx::tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, void *buf, uint32_t lkey) { - uint16_t num_wqebbs = TLS_GET_PROGRESS_WQEBBS; - struct mlx5_get_tls_progress_params_wqe *wqe = reinterpret_cast(m_sq_wqe_hot); struct xlio_mlx5_wqe_ctrl_seg *cseg = 
&wqe->ctrl.ctrl; @@ -1419,7 +1365,7 @@ inline void hw_queue_tx::tls_get_progress_params_wqe(xlio_ti *ti, uint32_t tirn, store_current_wqe_prop(nullptr, SQ_CREDITS_GET_PSV, ti); - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs); + ring_doorbell(TLS_GET_PROGRESS_WQEBBS); update_next_wqe_hot(); } @@ -1483,7 +1429,7 @@ void hw_queue_tx::post_nop_fence(void) store_current_wqe_prop(nullptr, SQ_CREDITS_NOP, nullptr); - ring_doorbell(MLX5_DB_METHOD_DB, 1); + ring_doorbell(1); update_next_wqe_hot(); } @@ -1495,7 +1441,6 @@ void hw_queue_tx::post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_ struct xlio_mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl.ctrl; struct mlx5_wqe_data_seg *dseg = &wqe->data; uint32_t tisn = tis ? tis->get_tisn() : 0; - uint16_t num_wqebbs = XLIO_DUMP_WQEBBS; uint16_t ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; memset(wqe, 0, sizeof(*wqe)); @@ -1511,7 +1456,7 @@ void hw_queue_tx::post_dump_wqe(xlio_tis *tis, void *addr, uint32_t len, uint32_ store_current_wqe_prop(nullptr, SQ_CREDITS_DUMP, tis); - ring_doorbell(MLX5_DB_METHOD_DB, num_wqebbs, 0, true); + ring_doorbell(XLIO_DUMP_WQEBBS, true); update_next_wqe_hot(); } diff --git a/src/core/dev/hw_queue_tx.h b/src/core/dev/hw_queue_tx.h index a09707e5d..dc6f60302 100644 --- a/src/core/dev/hw_queue_tx.h +++ b/src/core/dev/hw_queue_tx.h @@ -264,8 +264,7 @@ class hw_queue_tx : public xlio_ti_owner { inline int fill_wqe_lso(xlio_ibv_send_wr *pswr); inline int fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t *data_addr, int max_inline_len, int inline_len); - inline void ring_doorbell(int db_method, int num_wqebb, int num_wqebb_top = 0, - bool skip_comp = false); + inline void ring_doorbell(int num_wqebb, bool skip_comp = false); struct xlio_rate_limit_t m_rate_limit; xlio_ib_mlx5_qp_t m_mlx5_qp; @@ -279,7 +278,6 @@ class hw_queue_tx : public xlio_ti_owner { struct mlx5_eth_wqe (*m_sq_wqes)[] = nullptr; struct mlx5_eth_wqe *m_sq_wqe_hot = nullptr; uint8_t *m_sq_wqes_end = nullptr; - enum { MLX5_DB_METHOD_BF, MLX5_DB_METHOD_DB } m_db_method; const uint32_t m_n_sysvar_tx_num_wr_to_signal; uint32_t m_tx_num_wr; diff --git a/src/core/ib/mlx5/ib_mlx5.cpp b/src/core/ib/mlx5/ib_mlx5.cpp index ec20b89f4..3fc4da36c 100644 --- a/src/core/ib/mlx5/ib_mlx5.cpp +++ b/src/core/ib/mlx5/ib_mlx5.cpp @@ -69,8 +69,6 @@ int xlio_ib_mlx5_get_qp_tx(xlio_ib_mlx5_qp_t *mlx5_qp) mlx5_qp->sq.wqe_cnt = dqp.sq.wqe_cnt; mlx5_qp->sq.stride = dqp.sq.stride; mlx5_qp->bf.reg = dqp.bf.reg; - mlx5_qp->bf.size = dqp.bf.size; - mlx5_qp->bf.offset = 0; #if defined(DEFINED_DV_RAW_QP_HANDLES) mlx5_qp->tisn = dqp.tisn; mlx5_qp->sqn = dqp.sqn; diff --git a/src/core/ib/mlx5/ib_mlx5.h b/src/core/ib/mlx5/ib_mlx5.h index e85c83681..1ecf5a5a7 100644 --- a/src/core/ib/mlx5/ib_mlx5.h +++ b/src/core/ib/mlx5/ib_mlx5.h @@ -80,8 +80,6 @@ typedef struct xlio_ib_mlx5_qp { } sq; struct { void *reg; - uint32_t size; - uint32_t offset; } bf; uint32_t tisn; uint32_t sqn; From 6f485a1e2557d388f9a2a482ce20b5e553f390b0 Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Tue, 2 Apr 2024 15:45:12 +0300 Subject: [PATCH 155/169] issue: 3818038 Remove likely() from the inline WQE branch The inline WQE branch is not likely in most throughput scenarios. 
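For context, likely() in this codebase expands to a __builtin_expect() hint (see src/core/util/vtypes.h earlier in this series). A small self-contained illustration of why a wrong hint hurts (assumed example, not XLIO code):

    // The macro matches the vtypes.h definition.
    #define likely(x) __builtin_expect(!!(x), 1)

    static int consume_inline(int len) { return len; }
    static int consume_sge(int len) { return len + 1; }

    int handle(int len, int max_inline)
    {
        // With likely(), the compiler lays out the inline branch as the
        // fall-through (hot) path. When large, non-inline sends dominate --
        // the typical throughput case -- every call then takes a jump through
        // colder code layout, so dropping the hint is the safer default.
        if (likely(len <= max_inline)) {
            return consume_inline(len);
        }
        return consume_sge(len);
    }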
Signed-off-by: Dmytro Podgornyi --- src/core/dev/hw_queue_tx.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/dev/hw_queue_tx.cpp b/src/core/dev/hw_queue_tx.cpp index f4c786527..6be8c5a62 100644 --- a/src/core/dev/hw_queue_tx.cpp +++ b/src/core/dev/hw_queue_tx.cpp @@ -600,7 +600,7 @@ inline int hw_queue_tx::fill_wqe(xlio_ibv_send_wr *pswr) int max_inline_len = get_max_inline_data(); // assume packet is full inline - if (likely(data_len <= max_inline_len && xlio_send_wr_opcode(*pswr) == XLIO_IBV_WR_SEND)) { + if (data_len <= max_inline_len && xlio_send_wr_opcode(*pswr) == XLIO_IBV_WR_SEND) { uint8_t *data_addr = sga.get_data(&inline_len); // data for inlining in ETH header data_len -= inline_len; hwqtx_logfunc( From ea38dd7d7e0757ec8bae2645350bfc71e85daa8f Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Tue, 2 Apr 2024 14:37:28 +0300 Subject: [PATCH 156/169] issue: 3844385 Fix new TCP timers registration lock contention Avoid calling register_socket_timer_event when a socket is already registered (TIME-WAIT). Although there is no functionality issue with that, it produces too high rate of posting events for internal-thread. This leads to lock contantion inside internal-thread and degraded performance of HTTP CPS. Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo_tcp.cpp | 17 ++++++++++++----- src/core/sock/sockinfo_tcp.h | 3 +++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index bd35d7fd8..e087076c8 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -2482,10 +2482,15 @@ ssize_t sockinfo_tcp::rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov void sockinfo_tcp::register_timer() { - si_tcp_logdbg("Registering TCP socket timer: socket: %p, thread-col: %p, global-col: %p", this, - get_tcp_timer_collection(), g_tcp_timers_collection); + // A reused time-wait socket wil try to add a timer although it is already registered. + // We should avoid calling register_socket_timer_event unnecessarily because it introduces + // internal-thread locks contention. + if (!is_timer_registered()) { + si_tcp_logdbg("Registering TCP socket timer: socket: %p, thread-col: %p, global-col: %p", + this, get_tcp_timer_collection(), g_tcp_timers_collection); - get_event_mgr()->register_socket_timer_event(this); + get_event_mgr()->register_socket_timer_event(this); + } } void sockinfo_tcp::queue_rx_ctl_packet(struct tcp_pcb *pcb, mem_buf_desc_t *p_desc) @@ -6017,8 +6022,8 @@ void tcp_timers_collection::add_new_timer(sockinfo_tcp *sock) return; } - // A reused time-wait socket wil try to add a timer although it is already registered. 
- if (m_sock_remove_map.find(sock) != m_sock_remove_map.end()) { + if (sock->is_timer_registered()) { + __log_warn("Trying to add timer twice for TCP socket %p", sock); return; } @@ -6026,6 +6031,7 @@ void tcp_timers_collection::add_new_timer(sockinfo_tcp *sock) bucket.emplace_back(sock); m_sock_remove_map.emplace(sock, std::make_tuple(m_n_next_insert_bucket, --(bucket.end()))); m_n_next_insert_bucket = (m_n_next_insert_bucket + 1) % m_n_intervals_size; + sock->set_timer_registered(true); if (0 == m_n_count++) { m_timer_handle = get_event_mgr()->register_timer_event(safe_mce_sys().timer_resolution_msec, @@ -6041,6 +6047,7 @@ void tcp_timers_collection::remove_timer(sockinfo_tcp *sock) if (node != m_sock_remove_map.end()) { m_p_intervals[std::get<0>(node->second)].erase(std::get<1>(node->second)); m_sock_remove_map.erase(node); + sock->set_timer_registered(false); if (!(--m_n_count)) { if (m_timer_handle) { diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 0d731536b..757c598eb 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -320,6 +320,8 @@ class sockinfo_tcp : public sockinfo { } bool is_incoming() override { return m_b_incoming; } + bool is_timer_registered() const { return m_timer_registered; } + void set_timer_registered(bool v) { m_timer_registered = v; } bool is_connected() { return m_sock_state == TCP_SOCK_CONNECTED_RDWR; } @@ -618,6 +620,7 @@ class sockinfo_tcp : public sockinfo { bool m_xlio_thr; bool m_b_incoming; bool m_b_attached; + bool m_timer_registered = false; /* connection state machine */ int m_conn_timeout; /* RCVBUF acconting */ From 8e64060ed1d452d524c2732e3d02f88e295f57a3 Mon Sep 17 00:00:00 2001 From: Gal Noam Date: Thu, 4 Apr 2024 17:07:04 +0300 Subject: [PATCH 157/169] version: 3.30.4 Signed-off-by: Gal Noam --- CHANGES | 8 ++++++++ configure.ac | 2 +- contrib/scripts/libxlio.spec.in | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/CHANGES b/CHANGES index c0b532a8c..3fd175bed 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,11 @@ +Version 3.30.4-1: +Date + Time 2024-04-04 +============================================================= +Fixed: + - RM #3792798 Do code cleanup for new storage API + - RM #3829626 Nginx http CPS tests report error + - RM #3818038 I/O errors during FIO + Version 3.30.3-1: Date + Time 2024-03-20 ============================================================= diff --git a/configure.ac b/configure.ac index b3255693c..492d68d55 100644 --- a/configure.ac +++ b/configure.ac @@ -14,7 +14,7 @@ dnl===-----------------------------------------------------------------------=== # define([prj_ver_major], 3) define([prj_ver_minor], 30) -define([prj_ver_revision], 3) +define([prj_ver_revision], 4) define([prj_ver_release], esyscmd([echo ${PRJ_RELEASE:=0}])) diff --git a/contrib/scripts/libxlio.spec.in b/contrib/scripts/libxlio.spec.in index 174599081..1feb49efd 100644 --- a/contrib/scripts/libxlio.spec.in +++ b/contrib/scripts/libxlio.spec.in @@ -189,7 +189,7 @@ fi %{_mandir}/man8/xlio_stats.* %changelog -* Wed Mar 20 2024 NVIDIA CORPORATION 3.30.3-1 -- Bump version to 3.30.3 +* Thu Apr 4 2024 NVIDIA CORPORATION 3.30.4-1 +- Bump version to 3.30.4 - Please refer to CHANGES for full changelog. From 9b7eec0657ceccedb4c9d5dd18329b835300673b Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Sat, 6 Apr 2024 00:12:54 +0300 Subject: [PATCH 158/169] issue: 3788164 Fix RX poll on TX option for UTLS UTLS uses tcp_tx_express() for non blocking sockets. 
However, this TX method doesn't support XLIO_RX_POLL_ON_TX_TCP. Additional RX polling improves scenarios such as WEB servers. Insert RX polling into UTLS TX path to resolve performance degradation. Signed-off-by: Dmytro Podgornyi --- src/core/sock/sockinfo_tcp.h | 8 ++++++++ src/core/sock/sockinfo_ulp.cpp | 1 + 2 files changed, 9 insertions(+) diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index 757c598eb..f8b7a67ac 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -367,6 +367,14 @@ class sockinfo_tcp : public sockinfo { return rx_flow_iter->first; } + void rx_poll_on_tx_if_needed() + { + if (m_sysvar_rx_poll_on_tx_tcp) { + int poll_count = 0; + rx_wait_helper(poll_count, false); + } + } + /* Proxy to support ULP. TODO Refactor. */ inline sockinfo_tcp_ops *get_ops() { return m_ops; } inline void set_ops(sockinfo_tcp_ops *ops) noexcept diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp index ef449939b..f0c6125e7 100644 --- a/src/core/sock/sockinfo_ulp.cpp +++ b/src/core/sock/sockinfo_ulp.cpp @@ -801,6 +801,7 @@ ssize_t sockinfo_tcp_ops_tls::tx(xlio_tx_call_attr_t &tx_arg) } retry: if (!block_this_run) { + m_p_sock->rx_poll_on_tx_if_needed(); ret2 = m_p_sock->tcp_tx_express(tls_arg.attr.iov, tls_arg.attr.sz_iov, 0, XLIO_EXPRESS_OP_TYPE_FILE_ZEROCOPY, reinterpret_cast(rec)); From 0678a45b42eb4e944e75e4ec42d4726d6fa3f32f Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 8 Apr 2024 11:40:31 +0300 Subject: [PATCH 159/169] issue: 3855390 Fixing adding TCP timer twice warning In heavy CPS scenarios a socket may go to TIME-WAIT state and be reused before first TCP timer registration is performed by internal-thread. 1. Setting timer_registered=true while posting the event prevents the second attemp to try and post the event again. 2. Adding sanity check in add_new_timer that verifies that the socket is not already in the timer map. Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo_tcp.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index e087076c8..85332757c 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -2489,6 +2489,7 @@ void sockinfo_tcp::register_timer() si_tcp_logdbg("Registering TCP socket timer: socket: %p, thread-col: %p, global-col: %p", this, get_tcp_timer_collection(), g_tcp_timers_collection); + set_timer_registered(true); get_event_mgr()->register_socket_timer_event(this); } } @@ -6022,17 +6023,20 @@ void tcp_timers_collection::add_new_timer(sockinfo_tcp *sock) return; } - if (sock->is_timer_registered()) { + sock_list &bucket = m_p_intervals[m_n_next_insert_bucket]; + bucket.emplace_back(sock); + auto rc = + m_sock_remove_map.emplace(sock, std::make_tuple(m_n_next_insert_bucket, --(bucket.end()))); + + // If the socket already exists in m_sock_remove_map, emplace returns false in rc.second + // Mainly for sanity check, we dont expect it. 
+ if (unlikely(!rc.second)) { __log_warn("Trying to add timer twice for TCP socket %p", sock); + bucket.pop_back(); return; } - sock_list &bucket = m_p_intervals[m_n_next_insert_bucket]; - bucket.emplace_back(sock); - m_sock_remove_map.emplace(sock, std::make_tuple(m_n_next_insert_bucket, --(bucket.end()))); m_n_next_insert_bucket = (m_n_next_insert_bucket + 1) % m_n_intervals_size; - sock->set_timer_registered(true); - if (0 == m_n_count++) { m_timer_handle = get_event_mgr()->register_timer_event(safe_mce_sys().timer_resolution_msec, this, PERIODIC_TIMER, nullptr); From db61660cf840e60c6b1632a2def457825d03f4fa Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Thu, 4 Apr 2024 17:08:09 +0300 Subject: [PATCH 160/169] issue: 3795997 Control TSO max payload size Added new env parameter - XLIO_MAX_TSO_SIZE. It allows the user to control maximum size of TSO, instead of taking the maximum cap by HW. The default size is 256KB (maximum by current HW). Values higher than HW capabilities won't be taken into account. Signed-off-by: Iftah Levi --- src/core/dev/ring_simple.cpp | 5 +++++ src/core/main.cpp | 2 ++ src/core/sock/sockinfo_tcp.cpp | 5 +++-- src/core/util/sys_vars.cpp | 5 +++++ src/core/util/sys_vars.h | 3 +++ 5 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index c6ccbe98e..c0826132a 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -261,6 +261,11 @@ void ring_simple::create_resources() const xlio_ibv_tso_caps *caps = &xlio_get_tso_caps(m_p_ib_ctx->get_ibv_device_attr_ex()); if (ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_RAW_PACKET)) { + if (caps->max_tso && (caps->max_tso > MCE_DEFAULT_MAX_TSO_SIZE)) { + ring_logwarn("max_tso cap (=%u) is higher than default TSO size (=%u). 
" + "Increase XLIO_MAX_TSO_SIZE to get full TSO potential.", + caps->max_tso, MCE_DEFAULT_MAX_TSO_SIZE); + } m_tso.max_payload_sz = caps->max_tso; /* ETH(14) + IP(20) + TCP(20) + TCP OPTIONS(40) */ m_tso.max_header_sz = 94; diff --git a/src/core/main.cpp b/src/core/main.cpp index 2b7327cb5..7335abc35 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -805,6 +805,8 @@ void print_xlio_global_settings() VLOG_STR_PARAM_STRING("TSO support", option_3::to_str(safe_mce_sys().enable_tso), option_3::to_str(MCE_DEFAULT_TSO), SYS_VAR_TSO, option_3::to_str(safe_mce_sys().enable_tso)); + VLOG_PARAM_STRING("TSO max size", safe_mce_sys().max_tso_sz, MCE_DEFAULT_MAX_TSO_SIZE, + SYS_VAR_MAX_TSO_SIZE, option_size::to_str(safe_mce_sys().max_tso_sz)); VLOG_STR_PARAM_STRING("LRO support", option_3::to_str(safe_mce_sys().enable_lro), option_3::to_str(MCE_DEFAULT_LRO), SYS_VAR_LRO, option_3::to_str(safe_mce_sys().enable_lro)); diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 85332757c..e9671074e 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -875,8 +875,9 @@ bool sockinfo_tcp::prepare_dst_to_send(bool is_accepted_socket /* = false */) * so it is a time to provide TSO information to PCB */ auto *ring = m_p_connected_dst_entry->get_ring(); - m_pcb.tso.max_buf_sz = std::min(safe_mce_sys().tx_buf_size, ring->get_max_payload_sz()); - m_pcb.tso.max_payload_sz = ring->get_max_payload_sz(); + uint32_t max_tso_sz = std::min(ring->get_max_payload_sz(), safe_mce_sys().max_tso_sz); + m_pcb.tso.max_buf_sz = std::min(safe_mce_sys().tx_buf_size, max_tso_sz); + m_pcb.tso.max_payload_sz = max_tso_sz; m_pcb.tso.max_header_sz = ring->get_max_header_sz(); m_pcb.tso.max_send_sge = ring->get_max_send_sge(); } diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index 025b885aa..c62709b49 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -837,6 +837,7 @@ void mce_sys_var::get_env_params() progress_engine_wce_max = MCE_DEFAULT_PROGRESS_ENGINE_WCE_MAX; cq_keep_qp_full = MCE_DEFAULT_CQ_KEEP_QP_FULL; qp_compensation_level = MCE_DEFAULT_QP_COMPENSATION_LEVEL; + max_tso_sz = MCE_DEFAULT_MAX_TSO_SIZE; user_huge_page_size = MCE_DEFAULT_USER_HUGE_PAGE_SIZE; internal_thread_arm_cq_enabled = MCE_DEFAULT_INTERNAL_THREAD_ARM_CQ_ENABLED; @@ -1892,6 +1893,10 @@ void mce_sys_var::get_env_params() enable_tso = option_3::from_str(env_ptr, MCE_DEFAULT_TSO); } + if ((env_ptr = getenv(SYS_VAR_MAX_TSO_SIZE))) { + max_tso_sz = option_size::from_str(env_ptr); + } + if ((enable_tso != option_3::OFF) && (ring_migration_ratio_tx != -1)) { ring_migration_ratio_tx = -1; vlog_printf(VLOG_DEBUG, "%s parameter is forced to %d in case %s is enabled\n", diff --git a/src/core/util/sys_vars.h b/src/core/util/sys_vars.h index 64736f610..c97c1f9da 100644 --- a/src/core/util/sys_vars.h +++ b/src/core/util/sys_vars.h @@ -434,6 +434,7 @@ struct mce_sys_var { uint32_t progress_engine_wce_max; bool cq_keep_qp_full; uint32_t qp_compensation_level; + uint32_t max_tso_sz; size_t user_huge_page_size; bool offloaded_sockets; @@ -637,6 +638,7 @@ extern mce_sys_var &safe_mce_sys(); #define SYS_VAR_PROGRESS_ENGINE_WCE_MAX "XLIO_PROGRESS_ENGINE_WCE_MAX" #define SYS_VAR_CQ_KEEP_QP_FULL "XLIO_CQ_KEEP_QP_FULL" #define SYS_VAR_QP_COMPENSATION_LEVEL "XLIO_QP_COMPENSATION_LEVEL" +#define SYS_VAR_MAX_TSO_SIZE "XLIO_MAX_TSO_SIZE" #define SYS_VAR_USER_HUGE_PAGE_SIZE "XLIO_USER_HUGE_PAGE_SIZE" #define SYS_VAR_OFFLOADED_SOCKETS "XLIO_OFFLOADED_SOCKETS" #define 
SYS_VAR_TIMER_RESOLUTION_MSEC "XLIO_TIMER_RESOLUTION_MSEC" @@ -861,6 +863,7 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_MAX_CQ_POLL_BATCH (128) #define MCE_DEFAULT_SOCKETXTREME (false) #define MCE_DEFAULT_TSO (option_3::AUTO) +#define MCE_DEFAULT_MAX_TSO_SIZE (256 * 1024) #ifdef DEFINED_UTLS #define MCE_DEFAULT_UTLS_RX (false) #define MCE_DEFAULT_UTLS_TX (true) From 1e18c6a55c27e933086073ca112d402197b6f29c Mon Sep 17 00:00:00 2001 From: Gal Noam Date: Tue, 9 Apr 2024 15:22:01 +0300 Subject: [PATCH 161/169] version: 3.30.5 Signed-off-by: Gal Noam --- CHANGES | 8 ++++++++ configure.ac | 2 +- contrib/scripts/libxlio.spec.in | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/CHANGES b/CHANGES index 3fd175bed..53d4f6c84 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,11 @@ +Version 3.30.5-1: +Date + Time 2024-0-09 +============================================================= +Fixed: + - RM #3795997 OCI degradation IO operations with 8k conn, block size of 256k + - RM #3855390 http CPS test with IPv6 is reporting XLIO warning + - RM #3788164 Nginx Degradation 10MB with a low number of workers. + Version 3.30.4-1: Date + Time 2024-04-04 ============================================================= diff --git a/configure.ac b/configure.ac index 492d68d55..25a5dc8fc 100644 --- a/configure.ac +++ b/configure.ac @@ -14,7 +14,7 @@ dnl===-----------------------------------------------------------------------=== # define([prj_ver_major], 3) define([prj_ver_minor], 30) -define([prj_ver_revision], 4) +define([prj_ver_revision], 5) define([prj_ver_release], esyscmd([echo ${PRJ_RELEASE:=0}])) diff --git a/contrib/scripts/libxlio.spec.in b/contrib/scripts/libxlio.spec.in index 1feb49efd..1d79c29e3 100644 --- a/contrib/scripts/libxlio.spec.in +++ b/contrib/scripts/libxlio.spec.in @@ -189,7 +189,7 @@ fi %{_mandir}/man8/xlio_stats.* %changelog -* Thu Apr 4 2024 NVIDIA CORPORATION 3.30.4-1 -- Bump version to 3.30.4 +* Tue Apr 9 2024 NVIDIA CORPORATION 3.30.5-1 +- Bump version to 3.30.5 - Please refer to CHANGES for full changelog. From 2ca7928b3f9d6d1e8b3d181c4ef7f91aebd1e930 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 17 Apr 2024 10:49:30 +0300 Subject: [PATCH 162/169] issue: 3858121 Fixing socket stats corruption on termination When sock_stats was static its destructor was called before xlio_exit that destroys the internal-thread which destroys sockets. We should avoid having global objects with untrivial constructors/destructors, since there is no control of their execution order. 
Signed-off-by: Alexander Grissik --- src/core/main.cpp | 4 +++- src/core/sock/sock_stats.cpp | 27 +++++++++++++++++++++------ src/core/sock/sock_stats.h | 8 ++++++-- src/core/sock/sockinfo_tcp.cpp | 5 ++++- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/src/core/main.cpp b/src/core/main.cpp index 7335abc35..782df548e 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -262,6 +262,8 @@ static int free_libxlio_resources() vlog_printf(VLOG_DEBUG, "Stopping logger module\n"); + sock_stats::destroy_instance(); + sock_redirect_exit(); vlog_stop(); @@ -1047,7 +1049,7 @@ static void do_global_ctors_helper() *g_p_vlogger_level = g_vlogger_level; *g_p_vlogger_details = g_vlogger_details; - sock_stats::instance().init_sock_stats(safe_mce_sys().stats_fd_num_max); + sock_stats::init_instance(safe_mce_sys().stats_fd_num_max); g_global_stat_static.init(); xlio_stats_instance_create_global_block(&g_global_stat_static); diff --git a/src/core/sock/sock_stats.cpp b/src/core/sock/sock_stats.cpp index 58c6bd6a9..0d5b8d123 100644 --- a/src/core/sock/sock_stats.cpp +++ b/src/core/sock/sock_stats.cpp @@ -34,11 +34,30 @@ #include "sock_stats.h" thread_local socket_stats_t sock_stats::t_dummy_stats; +sock_stats *sock_stats::s_instance = nullptr; +void sock_stats::init_instance(size_t max_stats) +{ + if (!s_instance) { + s_instance = new sock_stats(); + if (max_stats) { + s_instance->init_sock_stats(max_stats); + } + } +} + +void sock_stats::destroy_instance() +{ + if (s_instance) { + delete s_instance; + s_instance = nullptr; + } +} + +// Calling init_instance() before instance() is a hard requirement. sock_stats &sock_stats::instance() { - static sock_stats the_instance; - return the_instance; + return *s_instance; } socket_stats_t *sock_stats::get_stats_obj() @@ -63,10 +82,6 @@ void sock_stats::return_stats_obj(socket_stats_t *stats) void sock_stats::init_sock_stats(size_t max_stats) { - if (max_stats == 0U) { - return; - } - std::lock_guard lock(_stats_lock); _socket_stats_vec.resize(max_stats); diff --git a/src/core/sock/sock_stats.h b/src/core/sock/sock_stats.h index 3dcde7d2a..b1faaa6d1 100644 --- a/src/core/sock/sock_stats.h +++ b/src/core/sock/sock_stats.h @@ -42,16 +42,20 @@ class sock_stats { public: + static void init_instance(size_t max_stats); + static void destroy_instance(); static sock_stats &instance(); - static thread_local socket_stats_t t_dummy_stats; - void init_sock_stats(size_t max_stats); socket_stats_t *get_stats_obj(); void return_stats_obj(socket_stats_t *stats); + static thread_local socket_stats_t t_dummy_stats; + private: sock_stats() {} + void init_sock_stats(size_t max_stats); + static sock_stats *s_instance; std::mutex _stats_lock; socket_stats_t *_socket_stats_list = nullptr; std::vector _socket_stats_vec; diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index e9671074e..a34da1088 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -3610,7 +3610,10 @@ err_t sockinfo_tcp::syn_received_timewait_cb(void *arg, struct tcp_pcb *newpcb) new_sock->m_b_blocking = true; /* Dump statistics of the previous incarnation of the socket. 
*/ - print_full_stats(new_sock->m_p_socket_stats, nullptr, safe_mce_sys().stats_file); + if (new_sock->has_stats()) { + print_full_stats(new_sock->m_p_socket_stats, nullptr, safe_mce_sys().stats_file); + } + new_sock->socket_stats_init(); /* Reset zerocopy state */ From 6c90533700e4542dc43444b39c658d04a82c840b Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 8 Apr 2024 12:40:43 +0300 Subject: [PATCH 163/169] issue: 3854806 Fixing TX buffers return order in TCP socket destruction When TCP socket is destroyed it frees the preallocated buffers after dst_entry is deleted. This returns the buffers to the global pool directly and breaks m_tx_num_bufs,m_zc_num_bufs ring counters. 1. Move the preallocated buffers cleanup before dst_entry destruction. 2. Add ring stats for m_tx_num_bufs and m_zc_num_bufs. Signed-off-by: Alexander Grissik --- src/core/dev/ring_simple.cpp | 5 +++++ src/core/sock/sockinfo_tcp.cpp | 19 ++++++++++++------- src/core/sock/sockinfo_tcp.h | 1 + src/core/util/xlio_stats.h | 2 ++ src/stats/stats_reader.cpp | 13 +++++++++++++ 5 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index c0826132a..4d82ca523 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -877,6 +877,7 @@ void ring_simple::init_tx_buffers(uint32_t count) { request_more_tx_buffers(PBUF_RAM, count, m_tx_lkey); m_tx_num_bufs = m_tx_pool.size(); + m_p_ring_stat->simple.n_tx_num_bufs = m_tx_num_bufs; } void ring_simple::inc_cq_moderation_stats(size_t sz_data) @@ -901,8 +902,10 @@ mem_buf_desc_t *ring_simple::get_tx_buffers(pbuf_type type, uint32_t n_num_mem_b */ if (type == PBUF_ZEROCOPY) { m_zc_num_bufs += count; + m_p_ring_stat->simple.n_zc_num_bufs = m_zc_num_bufs; } else { m_tx_num_bufs += count; + m_p_ring_stat->simple.n_tx_num_bufs = m_tx_num_bufs; } } @@ -937,12 +940,14 @@ void ring_simple::return_to_global_pool() m_tx_num_bufs >= RING_TX_BUFS_COMPENSATE * 2)) { int return_bufs = m_tx_pool.size() / 2; m_tx_num_bufs -= return_bufs; + m_p_ring_stat->simple.n_tx_num_bufs = m_tx_num_bufs; g_buffer_pool_tx->put_buffers_thread_safe(&m_tx_pool, return_bufs); } if (unlikely(m_zc_pool.size() > (m_zc_num_bufs / 2) && m_zc_num_bufs >= RING_TX_BUFS_COMPENSATE * 2)) { int return_bufs = m_zc_pool.size() / 2; m_zc_num_bufs -= return_bufs; + m_p_ring_stat->simple.n_zc_num_bufs = m_zc_num_bufs; g_buffer_pool_zc->put_buffers_thread_safe(&m_zc_pool, return_bufs); } } diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index a34da1088..ace285d8b 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -546,10 +546,7 @@ sockinfo_tcp::~sockinfo_tcp() g_bind_no_port->release_port(m_bound, m_connected); } - destructor_helper(); - - // Release preallocated buffers - tcp_tx_preallocted_buffers_free(&m_pcb); + destructor_helper_tcp(); if (m_tcp_seg_in_use) { si_tcp_logwarn("still %d tcp segs in use!", m_tcp_seg_in_use); @@ -590,6 +587,14 @@ sockinfo_tcp::~sockinfo_tcp() xlio_socket_event(XLIO_SOCKET_EVENT_TERMINATED, 0); } +void sockinfo_tcp::destructor_helper_tcp() +{ + // Release preallocated buffers + tcp_tx_preallocted_buffers_free(&m_pcb); + + destructor_helper(); +} + void sockinfo_tcp::clean_socket_obj() { lock_tcp_con(); @@ -2733,7 +2738,7 @@ int sockinfo_tcp::connect(const sockaddr *__to, socklen_t __tolen) ntohs(m_connected.get_in_port()), m_pcb.is_ipv6, sockinfo_tcp::connect_lwip_cb); if (err != ERR_OK) { // todo consider setPassthrough and go to OS - 
destructor_helper(); + destructor_helper_tcp(); m_conn_state = TCP_CONN_FAILED; errno = ECONNREFUSED; si_tcp_logerr("bad connect, err=%d", err); @@ -2771,7 +2776,7 @@ int sockinfo_tcp::connect(const sockaddr *__to, socklen_t __tolen) int keep_errno = errno; tcp_close(&m_pcb); - destructor_helper(); + destructor_helper_tcp(); unlock_tcp_con(); si_tcp_logdbg("Blocking connect error, m_sock_state=%d", static_cast(m_sock_state)); @@ -3060,7 +3065,7 @@ int sockinfo_tcp::listen(int backlog) si_tcp_logdbg("failed to add user's fd to internal epfd errno=%d (%m)", errno); } else { si_tcp_logerr("failed to add user's fd to internal epfd errno=%d (%m)", errno); - destructor_helper(); + destructor_helper_tcp(); passthrough_unlock("Fallback the connection to os"); return 0; } diff --git a/src/core/sock/sockinfo_tcp.h b/src/core/sock/sockinfo_tcp.h index f8b7a67ac..3fc7f12bd 100644 --- a/src/core/sock/sockinfo_tcp.h +++ b/src/core/sock/sockinfo_tcp.h @@ -212,6 +212,7 @@ class sockinfo_tcp : public sockinfo { // otherwise bool prepare_to_close(bool process_shutdown = false) override; void create_dst_entry(); + void destructor_helper_tcp(); bool prepare_dst_to_send(bool is_accepted_socket = false); int fcntl(int __cmd, unsigned long int __arg) override; diff --git a/src/core/util/xlio_stats.h b/src/core/util/xlio_stats.h index 5b77774f7..f3d9d48bb 100644 --- a/src/core/util/xlio_stats.h +++ b/src/core/util/xlio_stats.h @@ -382,6 +382,8 @@ typedef struct { uint64_t n_tx_dev_mem_byte_count; uint64_t n_tx_dev_mem_oob; uint32_t n_tx_dev_mem_allocated; + uint32_t n_tx_num_bufs; + uint32_t n_zc_num_bufs; } simple; struct { char s_tap_name[IFNAMSIZ]; diff --git a/src/stats/stats_reader.cpp b/src/stats/stats_reader.cpp index 18a118b16..287853c24 100644 --- a/src/stats/stats_reader.cpp +++ b/src/stats/stats_reader.cpp @@ -358,6 +358,12 @@ void update_delta_ring_stat(ring_stats_t *p_curr_ring_stats, ring_stats_t *p_pre (p_curr_ring_stats->simple.n_tx_dropped_wqes - p_prev_ring_stats->simple.n_tx_dropped_wqes) / delay; + p_prev_ring_stats->simple.n_tx_num_bufs = + (p_curr_ring_stats->simple.n_tx_num_bufs - p_prev_ring_stats->simple.n_tx_num_bufs) / + delay; + p_prev_ring_stats->simple.n_zc_num_bufs = + (p_curr_ring_stats->simple.n_zc_num_bufs - p_prev_ring_stats->simple.n_zc_num_bufs) / + delay; #ifdef DEFINED_UTLS p_prev_ring_stats->n_tx_tls_contexts = (p_curr_ring_stats->n_tx_tls_contexts - p_prev_ring_stats->n_tx_tls_contexts) / delay; @@ -548,6 +554,11 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) p_ring_stats->simple.n_tx_dev_mem_pkt_count, p_ring_stats->simple.n_tx_dev_mem_oob, post_fix); } + + printf(FORMAT_STATS_32bit, + "TX buffers inflight:", p_ring_stats->simple.n_tx_num_bufs); + printf(FORMAT_STATS_32bit, + "TX ZC buffers inflight:", p_ring_stats->simple.n_zc_num_bufs); } } } @@ -1789,6 +1800,8 @@ void zero_ring_stats(ring_stats_t *p_ring_stats) p_ring_stats->simple.n_tx_dev_mem_byte_count = 0; p_ring_stats->simple.n_tx_dev_mem_pkt_count = 0; p_ring_stats->simple.n_tx_dev_mem_oob = 0; + p_ring_stats->simple.n_tx_num_bufs = 0; + p_ring_stats->simple.n_zc_num_bufs = 0; } } From ffc6804006c2e20867a2a6b63cce5ed612f2bdb0 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Thu, 4 Apr 2024 10:11:21 +0300 Subject: [PATCH 164/169] issue: 3598943 Fixing adaptive CQ moderation 1. Removing hardcoded check that switches AIM to latency mode. In case of low packet rate the calculation will result in 0 count anyway. 
In case packet rate is higher than the desired interrupt rate we do want to utilize the AIM correctly. 2. Changing default AIM values to more reasonable. 3. Removing default values for Nginx and use AIM by default. This improves CPU utilization in low congested cases significantly.: Signed-off-by: Alexander Grissik --- src/core/dev/ring_simple.cpp | 17 +++-------------- src/core/dev/ring_simple.h | 4 +--- src/core/dev/ring_slave.cpp | 2 +- src/core/dev/ring_slave.h | 2 +- src/core/dev/ring_tap.h | 2 +- src/core/util/sys_vars.cpp | 13 +------------ src/core/util/sys_vars.h | 8 ++++---- 7 files changed, 12 insertions(+), 36 deletions(-) diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index 4d82ca523..41bc5c047 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -880,9 +880,8 @@ void ring_simple::init_tx_buffers(uint32_t count) m_p_ring_stat->simple.n_tx_num_bufs = m_tx_num_bufs; } -void ring_simple::inc_cq_moderation_stats(size_t sz_data) +void ring_simple::inc_cq_moderation_stats() { - m_cq_moderation_info.bytes += sz_data; ++m_cq_moderation_info.packets; } @@ -1049,15 +1048,13 @@ void ring_simple::adapt_cq_moderation() uint32_t missed_rounds = m_cq_moderation_info.missed_rounds; // todo collect bytes and packets from all rings ?? - int64_t interval_bytes = m_cq_moderation_info.bytes - m_cq_moderation_info.prev_bytes; int64_t interval_packets = m_cq_moderation_info.packets - m_cq_moderation_info.prev_packets; - m_cq_moderation_info.prev_bytes = m_cq_moderation_info.bytes; m_cq_moderation_info.prev_packets = m_cq_moderation_info.packets; m_cq_moderation_info.missed_rounds = 0; BULLSEYE_EXCLUDE_BLOCK_START - if (interval_bytes < 0 || interval_packets < 0) { + if (interval_packets < 0) { // rare wrap-around of 64 bit, just ignore m_lock_ring_rx.unlock(); return; @@ -1072,7 +1069,6 @@ void ring_simple::adapt_cq_moderation() return; } - uint32_t avg_packet_size = interval_bytes / interval_packets; uint32_t avg_packet_rate = (interval_packets * 1000) / (safe_mce_sys().cq_aim_interval_msec * (1 + missed_rounds)); @@ -1083,14 +1079,7 @@ void ring_simple::adapt_cq_moderation() safe_mce_sys().cq_aim_max_period_usec, ((1000000UL / ir_rate) - (1000000UL / std::max(avg_packet_rate, ir_rate)))); - if (avg_packet_size < 1024 && avg_packet_rate < 450000) { - modify_cq_moderation(0, 0); // latency mode - // todo latency for big messages is not good - // the rate is affected by the moderation and the moderation by the rate.. - // so each cycle change from 0 to max, and max to 0, .. 
- } else { - modify_cq_moderation(period, count); // throughput mode - } + modify_cq_moderation(period, count); m_lock_ring_rx.unlock(); } diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index ca5d2f541..4a4a96ba1 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -47,9 +47,7 @@ struct cq_moderation_info { uint32_t period; uint32_t count; uint64_t packets; - uint64_t bytes; uint64_t prev_packets; - uint64_t prev_bytes; uint32_t missed_rounds; }; @@ -302,7 +300,7 @@ class ring_simple : public ring_slave { protected: void create_resources(); virtual void init_tx_buffers(uint32_t count); - void inc_cq_moderation_stats(size_t sz_data) override; + void inc_cq_moderation_stats() override; inline void set_tx_num_wr(uint32_t num_wr) { m_tx_num_wr = num_wr; } inline uint32_t get_tx_num_wr() { return m_tx_num_wr; } inline uint32_t get_mtu() { return m_mtu; } diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index c6bb055b0..8fcb8e41e 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -586,7 +586,7 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd return false; } - inc_cq_moderation_stats(sz_data); + inc_cq_moderation_stats(); m_p_ring_stat->n_rx_byte_count += sz_data; ++m_p_ring_stat->n_rx_pkt_count; diff --git a/src/core/dev/ring_slave.h b/src/core/dev/ring_slave.h index 439e972b6..b015f7cf3 100644 --- a/src/core/dev/ring_slave.h +++ b/src/core/dev/ring_slave.h @@ -290,7 +290,7 @@ class ring_slave : public ring { virtual void inc_tx_retransmissions_stats(ring_user_id_t id); bool rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd_ready_array); virtual int reclaim_recv_single_buffer(mem_buf_desc_t *rx_reuse) = 0; - virtual void inc_cq_moderation_stats(size_t sz_data) = 0; + virtual void inc_cq_moderation_stats() = 0; virtual bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false); virtual bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink); diff --git a/src/core/dev/ring_tap.h b/src/core/dev/ring_tap.h index b358e292f..f61e99c0e 100644 --- a/src/core/dev/ring_tap.h +++ b/src/core/dev/ring_tap.h @@ -98,7 +98,7 @@ class ring_tap : public ring_slave { NOT_IN_USE(rate_limit); return 0; } - void inc_cq_moderation_stats(size_t sz_data) { NOT_IN_USE(sz_data); } + void inc_cq_moderation_stats() {} virtual uint32_t get_tx_user_lkey(void *addr, size_t length) { NOT_IN_USE(addr); diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index c62709b49..a135097aa 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -1047,12 +1047,6 @@ void mce_sys_var::get_env_params() rx_bufs_batch = 8; // MCE_DEFAULT_RX_BUFS_BATCH (64), RX buffers batch size. progress_engine_interval_msec = 0; // MCE_DEFAULT_PROGRESS_ENGINE_INTERVAL_MSEC (10), // Disable internal thread CQ draining logic. - cq_moderation_period_usec = - 1024; // MCE_DEFAULT_CQ_MODERATION_PERIOD_USEC (50), CQ moderation threshold in time. - cq_moderation_count = - 1024; // MCE_DEFAULT_CQ_MODERATION_COUNT(48), CQ moderation threshold in WCEs. - cq_aim_interval_msec = - 0; // MCE_DEFAULT_CQ_AIM_INTERVAL_MSEC (250), Disable adaptive CQ moderation. cq_poll_batch_max = 128; // MCE_DEFAULT_CQ_POLL_BATCH (16), Maximum CQEs to poll in one batch. 
thread_mode = THREAD_MODE_SINGLE; // MCE_DEFAULT_THREAD_MODE (THREAD_MODE_MULTI), Single @@ -1099,12 +1093,7 @@ void mce_sys_var::get_env_params() rx_bufs_batch = 8; // MCE_DEFAULT_RX_BUFS_BATCH (64), RX buffers batch size. progress_engine_interval_msec = 0; // MCE_DEFAULT_PROGRESS_ENGINE_INTERVAL_MSEC (10), // Disable internal thread CQ draining logic. - cq_moderation_period_usec = - 1024; // MCE_DEFAULT_CQ_MODERATION_PERIOD_USEC (50), CQ moderation threshold in time. - cq_moderation_count = - 1024; // MCE_DEFAULT_CQ_MODERATION_COUNT(48), CQ moderation threshold in WCEs. - cq_aim_interval_msec = - 0; // MCE_DEFAULT_CQ_AIM_INTERVAL_MSEC (250), Disable adaptive CQ moderation. + cq_poll_batch_max = 128; // MCE_DEFAULT_CQ_POLL_BATCH (16), Maximum CQEs to poll in one batch. thread_mode = THREAD_MODE_SINGLE; // MCE_DEFAULT_THREAD_MODE (THREAD_MODE_MULTI), Single diff --git a/src/core/util/sys_vars.h b/src/core/util/sys_vars.h index c97c1f9da..3c3f966e3 100644 --- a/src/core/util/sys_vars.h +++ b/src/core/util/sys_vars.h @@ -795,10 +795,10 @@ extern mce_sys_var &safe_mce_sys(); #endif #define MCE_DEFAULT_CQ_MODERATION_COUNT (48) #define MCE_DEFAULT_CQ_MODERATION_PERIOD_USEC (50) -#define MCE_DEFAULT_CQ_AIM_MAX_COUNT (560) -#define MCE_DEFAULT_CQ_AIM_MAX_PERIOD_USEC (250) -#define MCE_DEFAULT_CQ_AIM_INTERVAL_MSEC (250) -#define MCE_DEFAULT_CQ_AIM_INTERRUPTS_RATE_PER_SEC (5000) +#define MCE_DEFAULT_CQ_AIM_MAX_COUNT (500) +#define MCE_DEFAULT_CQ_AIM_MAX_PERIOD_USEC (1000) +#define MCE_DEFAULT_CQ_AIM_INTERVAL_MSEC (1000) +#define MCE_DEFAULT_CQ_AIM_INTERRUPTS_RATE_PER_SEC (1000) #define MCE_DEFAULT_CQ_POLL_BATCH (16) #define MCE_DEFAULT_PROGRESS_ENGINE_INTERVAL_MSEC (10) #define MCE_DEFAULT_PROGRESS_ENGINE_WCE_MAX (10000) From ef71ee661562618fed24b7483d4bdf05e882d12a Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 22 Apr 2024 18:18:48 +0300 Subject: [PATCH 165/169] issue: 3832212 Print a deprecation warning for XLIO_TX/RX_BUFS These parameters are deprecated and will be removed in the future. Use XLIO_MEMORY_LIMIT instead. Signed-off-by: Dmytro Podgornyi --- src/core/util/sys_vars.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/core/util/sys_vars.cpp b/src/core/util/sys_vars.cpp index a135097aa..3133a594b 100644 --- a/src/core/util/sys_vars.cpp +++ b/src/core/util/sys_vars.cpp @@ -1990,6 +1990,15 @@ void mce_sys_var::get_env_params() } multilock = (multilock_t)temp; } + + std::vector deprecated_params = {SYS_VAR_TX_NUM_BUFS, SYS_VAR_RX_NUM_BUFS}; + for (const char *param : deprecated_params) { + env_ptr = getenv(param); + if (env_ptr) { + vlog_printf(VLOG_WARNING, + "%s is deprecated and will be removed in the future versions\n", param); + } + } } void set_env_params() From 3cc173e99f8c0cf853a5f98d89c4b612eab9b5aa Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Tue, 9 Apr 2024 14:33:02 +0300 Subject: [PATCH 166/169] issue: 3857909 Avoid using MCE_MAX_CQ_POLL_BATCH in runtime MCE_MAX_CQ_POLL_BATCH usage requires it to be small enough. However, this is a logical upper limit and we want be able to raise it if necessary. Remove unused cq_mgr_tx::clean_cq() which uses MCE_MAX_CQ_POLL_BATCH for an array on stack. Adjust condition for RX buffers compensation to remove MCE_MAX_CQ_POLL_BATCH. However, this changes the logic and now, we forcibly compensate only the last RX buffer in RQ. 
Signed-off-by: Dmytro Podgornyi --- src/core/dev/cq_mgr_rx.cpp | 3 +- src/core/dev/cq_mgr_tx.cpp | 64 -------------------------------------- src/core/dev/cq_mgr_tx.h | 3 -- 3 files changed, 1 insertion(+), 69 deletions(-) diff --git a/src/core/dev/cq_mgr_rx.cpp b/src/core/dev/cq_mgr_rx.cpp index 452ba5176..0a9826479 100644 --- a/src/core/dev/cq_mgr_rx.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -380,8 +380,7 @@ bool cq_mgr_rx::compensate_qp_poll_success(mem_buf_desc_t *buff_cur) m_hqrx_ptr->post_recv_buffers(&m_rx_pool, buffers); m_debt -= buffers; m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); - } else if (m_b_sysvar_cq_keep_qp_full || - m_debt + MCE_MAX_CQ_POLL_BATCH > (int)m_hqrx_ptr->m_rx_num_wr) { + } else if (m_b_sysvar_cq_keep_qp_full || m_debt >= (int)m_hqrx_ptr->m_rx_num_wr) { m_p_cq_stat->n_rx_pkt_drop++; m_hqrx_ptr->post_recv_buffer(buff_cur); --m_debt; diff --git a/src/core/dev/cq_mgr_tx.cpp b/src/core/dev/cq_mgr_tx.cpp index 17fceb582..84f7c4921 100644 --- a/src/core/dev/cq_mgr_tx.cpp +++ b/src/core/dev/cq_mgr_tx.cpp @@ -76,70 +76,6 @@ cq_mgr_tx::~cq_mgr_tx() cq_logdbg("Destroying CQ as Tx done"); } -uint32_t cq_mgr_tx::clean_cq() -{ - uint32_t ret_total = 0; - uint64_t cq_poll_sn = 0; - mem_buf_desc_t *buff; - - int ret = 0; - /* coverity[stack_use_local_overflow] */ - xlio_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; - while ((ret = clean_cq_poll_tx(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn)) > 0) { - for (int i = 0; i < ret; i++) { - buff = (mem_buf_desc_t *)(uintptr_t)(wce[i].wr_id); - if (buff) { - m_p_ring->mem_buf_desc_return_single_to_owner_tx(buff); - } - } - ret_total += ret; - } - - return ret_total; -} - -int cq_mgr_tx::clean_cq_poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn) -{ - // Assume locked!!! - cq_logfuncall(""); - - int ret = xlio_ibv_poll_cq(m_p_ibv_cq, num_entries, p_wce); - if (ret <= 0) { - // Zero polled wce OR ibv_poll_cq() has driver specific errors - // so we can't really do anything with them - *p_cq_poll_sn = m_n_global_sn_tx; - return 0; - } - - if (unlikely(g_vlogger_level >= VLOG_FUNC_ALL)) { - for (int i = 0; i < ret; i++) { - cq_logfuncall("wce[%d] info wr_id=%x, status=%x, opcode=%x, vendor_err=%x, " - "byte_len=%d, imm_data=%x", - i, p_wce[i].wr_id, p_wce[i].status, xlio_wc_opcode(p_wce[i]), - p_wce[i].vendor_err, p_wce[i].byte_len, p_wce[i].imm_data); - cq_logfuncall("qp_num=%x, src_qp=%x, wc_flags=%x, pkey_index=%x, slid=%x, sl=%x, " - "dlid_path_bits=%x", - p_wce[i].qp_num, p_wce[i].src_qp, xlio_wc_flags(p_wce[i]), - p_wce[i].pkey_index, p_wce[i].slid, p_wce[i].sl, p_wce[i].dlid_path_bits); - } - } - - // spoil the global sn if we have packets ready - union __attribute__((packed)) { - uint64_t global_sn; - struct { - uint32_t cq_id; - uint32_t cq_sn; - } bundle; - } next_sn; - next_sn.bundle.cq_sn = ++m_n_cq_poll_sn_tx; - next_sn.bundle.cq_id = m_cq_id_tx; - - *p_cq_poll_sn = m_n_global_sn_tx = next_sn.global_sn; - - return ret; -} - void cq_mgr_tx::configure(int cq_size) { xlio_ibv_cq_init_attr attr; diff --git a/src/core/dev/cq_mgr_tx.h b/src/core/dev/cq_mgr_tx.h index f5b5b7fec..a3bfb7c60 100644 --- a/src/core/dev/cq_mgr_tx.h +++ b/src/core/dev/cq_mgr_tx.h @@ -56,8 +56,6 @@ class cq_mgr_tx { void add_qp_tx(hw_queue_tx *hqtx_ptr); void del_qp_tx(hw_queue_tx *hqtx_ptr); - uint32_t clean_cq(); - /** * Arm the managed CQ's notification channel * Calling this more then once without get_event() will return without @@ -77,7 +75,6 @@ class cq_mgr_tx { private: void log_cqe_error(struct xlio_mlx5_cqe *cqe); void 
handle_sq_wqe_prop(unsigned index); - int clean_cq_poll_tx(xlio_ibv_wc *p_wce, int num_entries, uint64_t *p_cq_poll_sn); void get_cq_event(int count = 1) { xlio_ib_mlx5_get_cq_event(&m_mlx5_cq, count); }; From 7c616346c5120f37b29f00c37dbdf1f1a79cba4a Mon Sep 17 00:00:00 2001 From: Dmytro Podgornyi Date: Mon, 8 Apr 2024 20:06:43 +0300 Subject: [PATCH 167/169] issue: 3857909 Increase upper limit for XLIO_CQ_POLL_BATCH_MAX MCE_MAX_CQ_POLL_BATCH is a logical upper limit for CQ polling batch size. There is no hard limitation for it, so raise it to maximum CQ size. This value can even exceed CQ size, because HW continue receiving packets during polling. Be default, this change doesn't have effect unless a higher value for XLIO_CQ_POLL_BATCH_MAX is set explicitly. This can be helpful in a scenario when a high traffic rate stops for a long time and number of packets in an RQ exceeds the batch size. Signed-off-by: Dmytro Podgornyi --- src/core/util/sys_vars.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/util/sys_vars.h b/src/core/util/sys_vars.h index 3c3f966e3..9dddbce91 100644 --- a/src/core/util/sys_vars.h +++ b/src/core/util/sys_vars.h @@ -860,7 +860,7 @@ extern mce_sys_var &safe_mce_sys(); #define MCE_CQ_DRAIN_INTERVAL_DISABLED (0) #define MCE_CQ_ADAPTIVE_MODERATION_DISABLED (0) #define MCE_MIN_CQ_POLL_BATCH (1) -#define MCE_MAX_CQ_POLL_BATCH (128) +#define MCE_MAX_CQ_POLL_BATCH (32768) #define MCE_DEFAULT_SOCKETXTREME (false) #define MCE_DEFAULT_TSO (option_3::AUTO) #define MCE_DEFAULT_MAX_TSO_SIZE (256 * 1024) From fc48742d4f1528a7cc3d25ebb2e1b0e0ead20b57 Mon Sep 17 00:00:00 2001 From: Gal Noam Date: Thu, 2 May 2024 08:21:27 +0300 Subject: [PATCH 168/169] version: 3.31.0 Signed-off-by: Gal Noam --- CHANGES | 12 ++++++++++++ configure.ac | 4 ++-- contrib/scripts/libxlio.spec.in | 4 ++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index 53d4f6c84..7b9b923d8 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,15 @@ +Version 3.31.0-1: +Date + Time 2024-05-01 +============================================================= +Added: + - RM #3832212 Print a deprecation warning for XLIO_TX/RX_BUFS + +Fixed: + - RM #3857909 Missed interrupts when high RX traffic rate stops + - RM #3598943 higher CPU util then kernel with Nginx with high payloads + - RM #3854806 Tx buffer pool leak with sockperf scenarios enabling zcopy + - RM #3858121 Issues with XLIO stats functionality + Version 3.30.5-1: Date + Time 2024-0-09 ============================================================= diff --git a/configure.ac b/configure.ac index 25a5dc8fc..ac8e01dd4 100644 --- a/configure.ac +++ b/configure.ac @@ -13,8 +13,8 @@ dnl===-----------------------------------------------------------------------=== # Update version number here: # define([prj_ver_major], 3) -define([prj_ver_minor], 30) -define([prj_ver_revision], 5) +define([prj_ver_minor], 31) +define([prj_ver_revision], 0) define([prj_ver_release], esyscmd([echo ${PRJ_RELEASE:=0}])) diff --git a/contrib/scripts/libxlio.spec.in b/contrib/scripts/libxlio.spec.in index 1d79c29e3..e592a820a 100644 --- a/contrib/scripts/libxlio.spec.in +++ b/contrib/scripts/libxlio.spec.in @@ -189,7 +189,7 @@ fi %{_mandir}/man8/xlio_stats.* %changelog -* Tue Apr 9 2024 NVIDIA CORPORATION 3.30.5-1 -- Bump version to 3.30.5 +* Wed May 1 2024 NVIDIA CORPORATION 3.31.0-1 +- Bump version to 3.31.0 - Please refer to CHANGES for full changelog. 
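A note on the XLIO_CQ_POLL_BATCH_MAX change (issue 3857909) above: MCE_MAX_CQ_POLL_BATCH now only caps what a user may request through the environment, so the effective batch size is expected to be the requested value clamped into the [MCE_MIN_CQ_POLL_BATCH, MCE_MAX_CQ_POLL_BATCH] range. The sketch below illustrates that clamping only; the helper name and parsing details are assumptions and do not reproduce the actual code in sys_vars.cpp.

#include <algorithm>
#include <cstdint>
#include <cstdlib>

// Values mirror the defines touched by the patches above (illustration only).
constexpr uint32_t MCE_MIN_CQ_POLL_BATCH = 1;
constexpr uint32_t MCE_MAX_CQ_POLL_BATCH = 32768; // raised from 128
constexpr uint32_t MCE_DEFAULT_CQ_POLL_BATCH = 16;

// Hypothetical helper (not the real sys_vars.cpp parser): read the env var,
// fall back to the default when unset, and clamp into the allowed range.
static uint32_t read_cq_poll_batch_max()
{
    const char *env = std::getenv("XLIO_CQ_POLL_BATCH_MAX");
    uint32_t val = env ? static_cast<uint32_t>(std::strtoul(env, nullptr, 0))
                       : MCE_DEFAULT_CQ_POLL_BATCH;
    return std::clamp(val, MCE_MIN_CQ_POLL_BATCH, MCE_MAX_CQ_POLL_BATCH);
}

With such a bound in place, setting for example XLIO_CQ_POLL_BATCH_MAX=4096 would let a single poll drain an RQ that filled up while traffic was idle, which is the scenario the commit message describes.
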
From 03a98fda07f5ed4b679f5bbfe40d9cda4b1ec877 Mon Sep 17 00:00:00 2001 From: Iftah Levi Date: Tue, 26 Mar 2024 22:41:05 +0200 Subject: [PATCH 169/169] issue: 3795997 Allow split segment with unacked q When the send window is not big enough for the required TCP segment to send, we may split the segment so it will fit into the window. Before this change - We didn't split the segment in the case we have unacked segments. The motivation was that we anticipate to get ACK on the inflight segments, which will trigger the next send operation. This flow counts on RTT for receiving ACKs, which may be delayed depending on the remote side. When RTT is long - we would block sending although the TCP send window allows it. The change is to split TCP segments although we have unacked data, in case the send window is big enough (mss). Signed-off-by: Iftah Levi --- src/core/lwip/tcp_out.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/lwip/tcp_out.c b/src/core/lwip/tcp_out.c index c307f5e81..54269ad44 100644 --- a/src/core/lwip/tcp_out.c +++ b/src/core/lwip/tcp_out.c @@ -1813,7 +1813,8 @@ err_t tcp_output(struct tcp_pcb *pcb) } /* Split the segment in case of a small window */ - if ((NULL == pcb->unacked) && (wnd) && ((seg->len + seg->seqno - pcb->lastack) > wnd)) { + if (wnd && ((NULL == pcb->unacked) || (wnd >= pcb->mss)) && + ((seg->len + seg->seqno - pcb->lastack) > wnd)) { LWIP_ASSERT("tcp_output: no window for dummy packet", !LWIP_IS_DUMMY_SEGMENT(seg)); tcp_split_segment(pcb, seg, wnd); }
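
To make the revised split condition in tcp_output() concrete, the sketch below restates it as a standalone predicate. It is an illustration only, not lwIP code; the parameter names are assumptions standing in for the pcb and segment fields used in the hunk above.

#include <cstdint>

// Decide whether the head segment may be split so that its in-window part can
// be transmitted now. 'lastack' is the last acknowledged sequence number,
// 'wnd' the effective send window, 'mss' the negotiated MSS, and 'has_unacked'
// tells whether the unacked queue is non-empty.
static bool may_split_segment(uint32_t seg_seqno, uint32_t seg_len, uint32_t lastack,
                              uint32_t wnd, uint32_t mss, bool has_unacked)
{
    const bool exceeds_window = (seg_len + seg_seqno - lastack) > wnd;

    // Old behavior: split only when nothing is in flight (!has_unacked).
    // New behavior: also split with data in flight, provided at least one full
    // MSS fits in the window, so a long RTT no longer stalls transmission.
    return wnd != 0 && (!has_unacked || wnd >= mss) && exceeds_window;
}

Example: with mss=1460, wnd=2920, an 8760-byte segment at the left edge of the window and data already in flight, the old condition deferred sending until an ACK arrived, while the new condition splits the segment so the in-window 2920 bytes go out immediately.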