From ac9b09d8c4478dc4dc20cb9002b3a6df52cd2fa9 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 2 Aug 2024 15:48:19 -0500 Subject: [PATCH 01/15] GH-459 Reset sync_known_lib_num and sync_last_requested_num when unable to request range. Verify request is above LIB. --- plugins/net_plugin/net_plugin.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index 5baf52a775..bb3b4daf14 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2113,6 +2113,7 @@ namespace eosio { } if (conns.size() == 1) { // only one available ++sync_ordinal; + fc_dlog(logger, "sync from ${c}", ("c", conns.front()->connection_id)); conns.front()->sync_ordinal = sync_ordinal.load(); return conns.front(); } @@ -2155,23 +2156,27 @@ namespace eosio { * a provider is supplied and able to be used, use it. * otherwise select the next available from the list, round-robin style. */ + connection_ptr new_sync_source = (conn && conn->current()) ? conn : find_next_sync_node(); - connection_ptr new_sync_source = (conn && conn->current()) ? conn : - find_next_sync_node(); + auto reset_on_failure = [&]() { + sync_source.reset(); + sync_known_lib_num = chain_info.lib_num; + sync_last_requested_num = 0; + // not in sync, but need to be out of lib_catchup for start_sync to work + set_state( in_sync ); + send_handshakes(); + }; // verify there is an available source if( !new_sync_source ) { fc_wlog( logger, "Unable to continue syncing at this time"); - sync_source.reset(); - sync_known_lib_num = chain_info.lib_num; - sync_last_requested_num = 0; - set_state( in_sync ); // probably not, but we can't do anything else + reset_on_failure(); return; } bool request_sent = false; if( sync_last_requested_num != sync_known_lib_num ) { - uint32_t start = sync_next_expected_num; + uint32_t start = std::min(sync_next_expected_num, chain_info.lib_num+1); uint32_t end = start + sync_req_span - 1; if( end > sync_known_lib_num ) end = sync_known_lib_num; @@ -2186,10 +2191,8 @@ namespace eosio { } } if( !request_sent ) { - sync_source.reset(); fc_wlog(logger, "Unable to request range, sending handshakes to everyone"); - set_state( in_sync ); // need to be out of lib_catchup so start_sync will work - send_handshakes(); + reset_on_failure(); } } From 3fee28cf4b4a1d2a70fdc27520a597d50407e859 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 2 Aug 2024 15:50:08 -0500 Subject: [PATCH 02/15] GH-459 Should not consider a block accepted unless it was applied --- plugins/net_plugin/net_plugin.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index bb3b4daf14..4aa1ee0576 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2478,7 +2478,8 @@ namespace eosio { return; } c->latest_blk_time = std::chrono::system_clock::now(); - c->block_status_monitor_.accepted(); + if (blk_applied) + c->block_status_monitor_.accepted(); if (blk_latency.count() < config::block_interval_us && c->peer_syncing_from_us) { // a peer will not send us a recent block unless it is synced c->peer_syncing_from_us = false; From ef1e4b2e1953249521220b22d4a3cf8b8e2d701e Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 2 Aug 2024 15:51:51 -0500 Subject: [PATCH 03/15] GH-459 Do not request next chunk if reached sync_known_lib_num --- plugins/net_plugin/net_plugin.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index 4aa1ee0576..68e4015e5d 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2550,22 +2550,22 @@ namespace eosio { } } if (blk_num >= sync_known_lib_num) { - peer_dlog(c, "received non-applied block ${bn} > ${kn}, will send handshakes when caught up", + peer_dlog(c, "received non-applied block ${bn} >= ${kn}, will send handshakes when caught up", ("bn", blk_num)("kn", sync_known_lib_num)); send_handshakes_when_synced = true; - } - - // use chain head instead of fork head so we do not get too far ahead of applied blocks - uint32_t head = my_impl->get_chain_head_num(); - // do not allow to get too far ahead (one sync_req_span) of chain head - if (blk_num >= sync_last_requested_num && blk_num < head + sync_req_span) { - // block was not applied, possibly because we already have the block - fc_dlog(logger, "Requesting blocks ahead, head: ${h} fhead ${fh} blk_num: ${bn} sync_next_expected_num ${nen} " - "sync_last_requested_num: ${lrn}, sync_last_requested_block: ${lrb}", - ("h", my_impl->get_chain_head_num())("fh", my_impl->get_fork_head_num()) - ("bn", blk_num)("nen", sync_next_expected_num) - ("lrn", sync_last_requested_num)("lrb", c->sync_last_requested_block)); - request_next_chunk(); + } else { + // use chain head instead of fork head so we do not get too far ahead of applied blocks + uint32_t head = my_impl->get_chain_head_num(); + // do not allow to get too far ahead (one sync_req_span) of chain head + if (blk_num >= sync_last_requested_num && blk_num < head + sync_req_span) { + // block was not applied, possibly because we already have the block + fc_dlog(logger, "Requesting blocks ahead, head: ${h} fhead ${fh} blk_num: ${bn} sync_next_expected_num ${nen} " + "sync_last_requested_num: ${lrn}, sync_last_requested_block: ${lrb}", + ("h", my_impl->get_chain_head_num())("fh", my_impl->get_fork_head_num()) + ("bn", blk_num)("nen", sync_next_expected_num) + ("lrn", sync_last_requested_num)("lrb", c->sync_last_requested_block)); + request_next_chunk(); + } } } else { // blk_applied if (blk_num >= sync_last_requested_num) { From 406b87742f43c1d88db09a49e5a8f65c68c774a8 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 2 Aug 2024 16:04:01 -0500 Subject: [PATCH 04/15] GH-459 Fix mutex warning --- plugins/net_plugin/net_plugin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index 68e4015e5d..9673ba6355 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2158,7 +2158,7 @@ namespace eosio { */ connection_ptr new_sync_source = (conn && conn->current()) ? conn : find_next_sync_node(); - auto reset_on_failure = [&]() { + auto reset_on_failure = [&]() REQUIRES(sync_mtx) { sync_source.reset(); sync_known_lib_num = chain_info.lib_num; sync_last_requested_num = 0; From 4ff71519a09b00928b35ecb2ef7644e1cdfd5163 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 2 Aug 2024 16:09:17 -0500 Subject: [PATCH 05/15] GH-459 Remove unused last_req --- plugins/net_plugin/net_plugin.cpp | 97 +------------------------------ 1 file changed, 1 insertion(+), 96 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index 9673ba6355..23a5d0df95 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -295,8 +295,6 @@ namespace eosio { void expire_blocks( uint32_t lib_num ); void recv_notice(const connection_ptr& conn, const notice_message& msg, bool generated); - void retry_fetch(const connection_ptr& conn); - bool add_peer_block( const block_id_type& blkid, uint32_t connection_id ); bool peer_has_block(const block_id_type& blkid, uint32_t connection_id) const; bool have_block(const block_id_type& blkid) const; @@ -967,8 +965,7 @@ namespace eosio { std::atomic no_retry{no_reason}; alignas(hardware_destructive_interference_size) - mutable fc::mutex conn_mtx; //< mtx for last_req .. remote_endpoint_ip - std::optional last_req GUARDED_BY(conn_mtx); + mutable fc::mutex conn_mtx; //< mtx for last_handshake_recv .. remote_endpoint_ip handshake_message last_handshake_recv GUARDED_BY(conn_mtx); handshake_message last_handshake_sent GUARDED_BY(conn_mtx); block_id_type conn_fork_head GUARDED_BY(conn_mtx); @@ -1076,9 +1073,7 @@ namespace eosio { void cancel_wait(); void sync_wait(); - void fetch_wait(); void sync_timeout(boost::system::error_code ec); - void fetch_timeout(boost::system::error_code ec); void queue_write(const std::shared_ptr>& buff, std::function callback, @@ -1466,18 +1461,13 @@ namespace eosio { peer_syncing_from_us = false; block_status_monitor_.reset(); ++consecutive_immediate_connection_close; - bool has_last_req = false; { fc::lock_guard g_conn( conn_mtx ); - has_last_req = last_req.has_value(); last_handshake_recv = handshake_message(); last_handshake_sent = handshake_message(); last_close = fc::time_point::now(); conn_node_id = fc::sha256(); } - if( has_last_req && !shutdown ) { - my_impl->dispatcher.retry_fetch( shared_from_this() ); - } peer_lib_num = 0; peer_requested.reset(); sent_handshake_count = 0; @@ -1959,17 +1949,6 @@ namespace eosio { } ) ); } - // thread safe - void connection::fetch_wait() { - connection_ptr c( shared_from_this() ); - fc::lock_guard g( response_expected_timer_mtx ); - response_expected_timer.expires_from_now( my_impl->resp_expected_period ); - response_expected_timer.async_wait( - boost::asio::bind_executor( c->strand, [c]( boost::system::error_code ec ) { - c->fetch_timeout(ec); - } ) ); - } - // called from connection strand void connection::sync_timeout( boost::system::error_code ec ) { if( !ec ) { @@ -1980,15 +1959,6 @@ namespace eosio { } } - // called from connection strand - void connection::fetch_timeout( boost::system::error_code ec ) { - if( !ec ) { - my_impl->dispatcher.retry_fetch( shared_from_this() ); - } else if( ec != boost::asio::error::operation_aborted ) { // don't log on operation_aborted, called on destroy - peer_elog( this, "setting timer for fetch request got error ${ec}", ("ec", ec.message() ) ); - } - } - // called from connection strand void connection::request_sync_blocks(uint32_t start, uint32_t end) { sync_last_requested_block = end; @@ -2730,17 +2700,6 @@ namespace eosio { // called from c's connection strand void dispatch_manager::recv_block(const connection_ptr& c, const block_id_type& id, uint32_t bnum) { - fc::unique_lock g( c->conn_mtx ); - if (c && - c->last_req && - c->last_req->req_blocks.mode != none && - !c->last_req->req_blocks.ids.empty() && - c->last_req->req_blocks.ids.back() == id) { - peer_dlog( c, "resetting last_req" ); - c->last_req.reset(); - } - g.unlock(); - peer_dlog(c, "canceling wait"); c->cancel_wait(); } @@ -2795,60 +2754,6 @@ namespace eosio { } } - // called from c's connection strand - void dispatch_manager::retry_fetch(const connection_ptr& c) { - peer_dlog( c, "retry fetch" ); - request_message last_req; - block_id_type bid; - { - fc::lock_guard g_c_conn( c->conn_mtx ); - if( !c->last_req ) { - return; - } - peer_wlog( c, "failed to fetch from peer" ); - if( c->last_req->req_blocks.mode == normal && !c->last_req->req_blocks.ids.empty() ) { - bid = c->last_req->req_blocks.ids.back(); - } else { - peer_wlog( c, "no retry, block mpde = ${b} trx mode = ${t}", - ("b", modes_str( c->last_req->req_blocks.mode ))( "t", modes_str( c->last_req->req_trx.mode ) ) ); - return; - } - last_req = *c->last_req; - } - auto request_from_peer = [this, &c, &last_req, &bid]( auto& conn ) { - if( conn == c ) - return false; - - { - fc::lock_guard guard( conn->conn_mtx ); - if( conn->last_req ) { - return false; - } - } - - bool sendit = peer_has_block( bid, conn->connection_id ); - if( sendit ) { - conn->strand.post( [conn, last_req{std::move(last_req)}]() { - conn->enqueue( last_req ); - conn->fetch_wait(); - fc::lock_guard g_conn_conn( conn->conn_mtx ); - conn->last_req = last_req; - } ); - return true; - } - return false; - }; - - if (!my_impl->connections.any_of_block_connections(request_from_peer)) { - // at this point no other peer has it, re-request or do nothing? - peer_wlog(c, "no peer has last_req"); - if (c->connected()) { - c->enqueue(last_req); - c->fetch_wait(); - } - } - } - //------------------------------------------------------------------------ bool connection::reconnect() { From 83cc7cc425b14807d8c687544de596156372516c Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 2 Aug 2024 17:04:49 -0500 Subject: [PATCH 06/15] GH-459 Fix case where sync-fetch-span is 1 --- plugins/net_plugin/net_plugin.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index 23a5d0df95..dbe104bb7e 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2146,7 +2146,9 @@ namespace eosio { bool request_sent = false; if( sync_last_requested_num != sync_known_lib_num ) { - uint32_t start = std::min(sync_next_expected_num, chain_info.lib_num+1); + uint32_t start = sync_next_expected_num; + if (sync_req_span > 1) + start = std::min(start, chain_info.lib_num+1); uint32_t end = start + sync_req_span - 1; if( end > sync_known_lib_num ) end = sync_known_lib_num; From f3871362c3b1dade8a56a80bdef8646e39009adb Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 2 Aug 2024 17:13:42 -0500 Subject: [PATCH 07/15] GH-459 Better handling of sync wait --- plugins/net_plugin/net_plugin.cpp | 78 ++++++++++++++----------------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index dbe104bb7e..7959955308 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -260,7 +260,7 @@ namespace eosio { bool syncing_from_peer() const { return sync_state == lib_catchup; } bool is_in_sync() const { return sync_state == in_sync; } void sync_reset_lib_num( const connection_ptr& conn, bool closing ); - void sync_reassign_fetch( const connection_ptr& c, go_away_reason reason ); + void sync_reassign_fetch( const connection_ptr& c ); void rejected_block( const connection_ptr& c, uint32_t blk_num, closing_mode mode ); void sync_recv_block( const connection_ptr& c, const block_id_type& blk_id, uint32_t blk_num, bool blk_applied, const fc::microseconds& blk_latency ); @@ -958,8 +958,8 @@ namespace eosio { block_status_monitor block_status_monitor_; alignas(hardware_destructive_interference_size) - fc::mutex response_expected_timer_mtx; - boost::asio::steady_timer response_expected_timer GUARDED_BY(response_expected_timer_mtx); + fc::mutex sync_response_expected_timer_mtx; + boost::asio::steady_timer sync_response_expected_timer GUARDED_BY(sync_response_expected_timer_mtx); alignas(hardware_destructive_interference_size) std::atomic no_retry{no_reason}; @@ -1066,12 +1066,12 @@ namespace eosio { void enqueue_buffer( const std::shared_ptr>& send_buffer, go_away_reason close_after_send, bool to_sync_queue = false); - void cancel_sync(go_away_reason reason); + void cancel_sync(); void flush_queues(); bool enqueue_sync_block(); void request_sync_blocks(uint32_t start, uint32_t end); - void cancel_wait(); + void cancel_sync_wait(); void sync_wait(); void sync_timeout(boost::system::error_code ec); @@ -1273,7 +1273,7 @@ namespace eosio { listen_address( listen_address ), log_p2p_address( endpoint ), connection_id( ++my_impl->current_connection_id ), - response_expected_timer( my_impl->thread_pool.get_executor() ), + sync_response_expected_timer( my_impl->thread_pool.get_executor() ), last_handshake_recv(), last_handshake_sent(), p2p_address( endpoint ) @@ -1290,7 +1290,7 @@ namespace eosio { socket( new tcp::socket( std::move(s) ) ), listen_address( listen_address ), connection_id( ++my_impl->current_connection_id ), - response_expected_timer( my_impl->thread_pool.get_executor() ), + sync_response_expected_timer( my_impl->thread_pool.get_executor() ), last_handshake_recv(), last_handshake_sent() { @@ -1472,7 +1472,7 @@ namespace eosio { peer_requested.reset(); sent_handshake_count = 0; if( !shutdown) my_impl->sync_master->sync_reset_lib_num( shared_from_this(), true ); - cancel_wait(); + cancel_sync_wait(); sync_last_requested_block = 0; org = std::chrono::nanoseconds{0}; latest_msg_time = std::chrono::system_clock::time_point::min(); @@ -1722,23 +1722,13 @@ namespace eosio { } // called from connection strand - void connection::cancel_sync(go_away_reason reason) { - peer_dlog( this, "cancel sync reason = ${m}, write queue size ${o} bytes", - ("m", reason_str( reason ))("o", buffer_queue.write_queue_size()) ); - cancel_wait(); + void connection::cancel_sync() { + peer_dlog( this, "cancel sync, write queue size ${o} bytes", ("o", buffer_queue.write_queue_size()) ); + cancel_sync_wait(); sync_last_requested_block = 0; flush_queues(); - switch (reason) { - case validation : - case fatal_other : { - no_retry = reason; - enqueue( go_away_message( reason )); - break; - } - default: - peer_ilog(this, "sending empty request but not calling sync wait"); - enqueue( ( sync_request_message ) {0,0} ); - } + peer_ilog(this, "sending empty request but not calling sync wait"); + enqueue( ( sync_request_message ) {0,0} ); } // called from connection strand @@ -1933,17 +1923,17 @@ namespace eosio { } // thread safe - void connection::cancel_wait() { - fc::lock_guard g( response_expected_timer_mtx ); - response_expected_timer.cancel(); + void connection::cancel_sync_wait() { + fc::lock_guard g( sync_response_expected_timer_mtx ); + sync_response_expected_timer.cancel(); } // thread safe void connection::sync_wait() { connection_ptr c(shared_from_this()); - fc::lock_guard g( response_expected_timer_mtx ); - response_expected_timer.expires_from_now( my_impl->resp_expected_period ); - response_expected_timer.async_wait( + fc::lock_guard g( sync_response_expected_timer_mtx ); + sync_response_expected_timer.expires_from_now( my_impl->resp_expected_period ); + sync_response_expected_timer.async_wait( boost::asio::bind_executor( c->strand, [c]( boost::system::error_code ec ) { c->sync_timeout( ec ); } ) ); @@ -1952,7 +1942,7 @@ namespace eosio { // called from connection strand void connection::sync_timeout( boost::system::error_code ec ) { if( !ec ) { - my_impl->sync_master->sync_reassign_fetch( shared_from_this(), benign_other ); + my_impl->sync_master->sync_reassign_fetch( shared_from_this() ); close(true); } else if( ec != boost::asio::error::operation_aborted ) { // don't log on operation_aborted, called on destroy peer_elog( this, "setting timer for sync request got error ${ec}", ("ec", ec.message()) ); @@ -2206,6 +2196,7 @@ namespace eosio { sync_next_expected_num = chain_info.lib_num + 1; } else { peer_dlog(c, "already syncing, start sync ignored"); + c->sync_wait(); return; } @@ -2213,13 +2204,13 @@ namespace eosio { } // called from connection strand - void sync_manager::sync_reassign_fetch(const connection_ptr& c, go_away_reason reason) { + void sync_manager::sync_reassign_fetch(const connection_ptr& c) { fc::unique_lock g( sync_mtx ); peer_ilog( c, "reassign_fetch, our last req is ${cc}, next expected is ${ne}", ("cc", sync_last_requested_num)("ne", sync_next_expected_num) ); if( c == sync_source ) { - c->cancel_sync(reason); + c->cancel_sync(); sync_last_requested_num = 0; request_next_chunk(); } @@ -2503,9 +2494,9 @@ namespace eosio { } else { if (!blk_applied) { if (blk_num >= c->sync_last_requested_block) { - peer_dlog(c, "calling cancel_wait, block ${b}, sync_last_requested_block ${lrb}", + peer_dlog(c, "calling cancel_sync_wait, block ${b}, sync_last_requested_block ${lrb}", ("b", blk_num)("lrb", c->sync_last_requested_block)); - c->cancel_wait(); + c->cancel_sync_wait(); } else { peer_dlog(c, "calling sync_wait, block ${b}", ("b", blk_num)); c->sync_wait(); @@ -2552,9 +2543,12 @@ namespace eosio { } } } - } else { - if (blk_applied) + } else { // in_sync + if (blk_applied) { send_handshakes_if_synced(blk_latency); + } else { + c->cancel_sync_wait(); + } } } @@ -2702,8 +2696,6 @@ namespace eosio { // called from c's connection strand void dispatch_manager::recv_block(const connection_ptr& c, const block_id_type& id, uint32_t bnum) { - peer_dlog(c, "canceling wait"); - c->cancel_wait(); } void dispatch_manager::rejected_block(const block_id_type& id) { @@ -3062,10 +3054,9 @@ namespace eosio { const fc::microseconds age(fc::time_point::now() - bh.timestamp); // don't add_peer_block because we have not validated this block header yet if( my_impl->dispatcher.have_block( blk_id ) ) { - peer_dlog( this, "canceling wait, already received block ${num}, id ${id}..., latency ${l}ms", + peer_dlog( this, "already received block ${num}, id ${id}..., latency ${l}ms", ("num", blk_num)("id", blk_id.str().substr(8,16))("l", age.count()/1000) ); my_impl->sync_master->sync_recv_block( shared_from_this(), blk_id, blk_num, false, age ); - cancel_wait(); pending_message_buffer.advance_read_ptr( message_length ); return true; @@ -3084,7 +3075,7 @@ namespace eosio { ("lib", blk_num < last_sent_lib ? last_sent_lib : lib_num) ); enqueue( (sync_request_message) {0, 0} ); send_handshake(); - cancel_wait(); + cancel_sync_wait(); pending_message_buffer.advance_read_ptr( message_length ); return true; @@ -3092,13 +3083,12 @@ namespace eosio { } else { block_sync_bytes_received += message_length; uint32_t lib_num = my_impl->get_chain_lib_num(); + my_impl->sync_master->sync_recv_block(shared_from_this(), blk_id, blk_num, false, age); if( blk_num <= lib_num ) { - cancel_wait(); - + peer_dlog( this, "received block ${n} less than lib ${lib} while syncing", ("n", blk_num)("lib", lib_num) ); pending_message_buffer.advance_read_ptr( message_length ); return true; } - my_impl->sync_master->sync_recv_block(shared_from_this(), blk_id, blk_num, false, age); } auto ds = pending_message_buffer.create_datastream(); From c4b173f6dfd56c0b87dec44d9a6c44cc0f8cbf6e Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 2 Aug 2024 17:28:07 -0500 Subject: [PATCH 08/15] GH-459 Revert unneeded sync_wait --- plugins/net_plugin/net_plugin.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index 7959955308..27aae05cba 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2191,12 +2191,11 @@ namespace eosio { return; } - if( sync_state != lib_catchup ) { + if( sync_state != lib_catchup || !sync_source ) { set_state( lib_catchup ); sync_next_expected_num = chain_info.lib_num + 1; } else { peer_dlog(c, "already syncing, start sync ignored"); - c->sync_wait(); return; } From 7d9b53fe1d2e11cd53559cdafd7149e0b456ce2d Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 2 Aug 2024 17:57:52 -0500 Subject: [PATCH 09/15] GH-459 Revert syncing from lib+1 --- plugins/net_plugin/net_plugin.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index 27aae05cba..038445215b 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2137,8 +2137,6 @@ namespace eosio { bool request_sent = false; if( sync_last_requested_num != sync_known_lib_num ) { uint32_t start = sync_next_expected_num; - if (sync_req_span > 1) - start = std::min(start, chain_info.lib_num+1); uint32_t end = start + sync_req_span - 1; if( end > sync_known_lib_num ) end = sync_known_lib_num; From 310eaafe2f2957e47f089ab6614a240df262cd9d Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Sat, 3 Aug 2024 08:08:34 -0500 Subject: [PATCH 10/15] GH-459 syncing from lib+1 --- plugins/net_plugin/net_plugin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index 038445215b..9a5f214d42 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2136,7 +2136,7 @@ namespace eosio { bool request_sent = false; if( sync_last_requested_num != sync_known_lib_num ) { - uint32_t start = sync_next_expected_num; + uint32_t start = std::max(sync_next_expected_num, chain_info.lib_num+1); uint32_t end = start + sync_req_span - 1; if( end > sync_known_lib_num ) end = sync_known_lib_num; From 0077c30f4f4ae95f91b9511a875ab3991a11878e Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Sat, 3 Aug 2024 11:03:58 -0500 Subject: [PATCH 11/15] GH-459 Make sure sync_nex_expected set correctly --- plugins/net_plugin/net_plugin.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index 9a5f214d42..bd860d1501 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2039,10 +2039,11 @@ namespace eosio { // if closing the connection we are currently syncing from then request from a diff peer if( c == sync_source ) { - sync_last_requested_num = 0; // if starting to sync need to always start from lib as we might be on our own fork uint32_t lib_num = my_impl->get_chain_lib_num(); + sync_last_requested_num = 0; sync_next_expected_num = std::max( lib_num + 1, sync_next_expected_num ); + sync_source.reset(); request_next_chunk(); } } @@ -2122,6 +2123,7 @@ namespace eosio { sync_source.reset(); sync_known_lib_num = chain_info.lib_num; sync_last_requested_num = 0; + sync_next_expected_num = std::max( sync_known_lib_num + 1, sync_next_expected_num ); // not in sync, but need to be out of lib_catchup for start_sync to work set_state( in_sync ); send_handshakes(); @@ -2136,7 +2138,7 @@ namespace eosio { bool request_sent = false; if( sync_last_requested_num != sync_known_lib_num ) { - uint32_t start = std::max(sync_next_expected_num, chain_info.lib_num+1); + uint32_t start = sync_next_expected_num; uint32_t end = start + sync_req_span - 1; if( end > sync_known_lib_num ) end = sync_known_lib_num; @@ -2191,6 +2193,7 @@ namespace eosio { if( sync_state != lib_catchup || !sync_source ) { set_state( lib_catchup ); + sync_last_requested_num = 0; sync_next_expected_num = chain_info.lib_num + 1; } else { peer_dlog(c, "already syncing, start sync ignored"); @@ -2208,7 +2211,9 @@ namespace eosio { if( c == sync_source ) { c->cancel_sync(); + auto lib = my_impl->get_chain_lib_num(); sync_last_requested_num = 0; + sync_next_expected_num = std::max(sync_next_expected_num, lib + 1); request_next_chunk(); } } From da75754475a6fd9630977d451d3476f627921e55 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Sat, 3 Aug 2024 14:59:57 -0500 Subject: [PATCH 12/15] GH-459 Additional logging --- plugins/net_plugin/net_plugin.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index bd860d1501..ec9482684a 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -1723,7 +1723,7 @@ namespace eosio { // called from connection strand void connection::cancel_sync() { - peer_dlog( this, "cancel sync, write queue size ${o} bytes", ("o", buffer_queue.write_queue_size()) ); + peer_dlog( this, "cancel sync_wait, write queue size ${o} bytes", ("o", buffer_queue.write_queue_size()) ); cancel_sync_wait(); sync_last_requested_block = 0; flush_queues(); @@ -1942,6 +1942,7 @@ namespace eosio { // called from connection strand void connection::sync_timeout( boost::system::error_code ec ) { if( !ec ) { + peer_dlog(this, "sync timeout"); my_impl->sync_master->sync_reassign_fetch( shared_from_this() ); close(true); } else if( ec != boost::asio::error::operation_aborted ) { // don't log on operation_aborted, called on destroy @@ -1954,6 +1955,7 @@ namespace eosio { sync_last_requested_block = end; sync_request_message srm = {start,end}; enqueue( net_message(srm) ); + peer_dlog(this, "calling sync_wait, sync_request_message ${s} - ${e}", ("s", start)("e", end)); sync_wait(); } @@ -2206,10 +2208,9 @@ namespace eosio { // called from connection strand void sync_manager::sync_reassign_fetch(const connection_ptr& c) { fc::unique_lock g( sync_mtx ); - peer_ilog( c, "reassign_fetch, our last req is ${cc}, next expected is ${ne}", - ("cc", sync_last_requested_num)("ne", sync_next_expected_num) ); - if( c == sync_source ) { + peer_ilog(c, "reassign_fetch, our last req is ${cc}, next expected is ${ne}", + ("cc", sync_last_requested_num)("ne", sync_next_expected_num)); c->cancel_sync(); auto lib = my_impl->get_chain_lib_num(); sync_last_requested_num = 0; @@ -2549,6 +2550,7 @@ namespace eosio { if (blk_applied) { send_handshakes_if_synced(blk_latency); } else { + peer_dlog(c, "in_sync, cancel_sync_wait"); c->cancel_sync_wait(); } } From 98a6e112de5fc03764c92be83653cd0ba5e90c11 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Sat, 3 Aug 2024 15:23:40 -0500 Subject: [PATCH 13/15] GH-459 Remove unneeded cancel_sync_wait --- plugins/net_plugin/net_plugin.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index ec9482684a..e43ecf711f 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2549,9 +2549,6 @@ namespace eosio { } else { // in_sync if (blk_applied) { send_handshakes_if_synced(blk_latency); - } else { - peer_dlog(c, "in_sync, cancel_sync_wait"); - c->cancel_sync_wait(); } } } From 261ae2372c95da379b84b012bc5f2a3618822767 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Sat, 3 Aug 2024 15:52:02 -0500 Subject: [PATCH 14/15] GH-459 Increase timeout as sync wait times out and connection has to be re-established --- tests/p2p_sync_throttle_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/p2p_sync_throttle_test.py b/tests/p2p_sync_throttle_test.py index 93397d4875..41430a67b9 100755 --- a/tests/p2p_sync_throttle_test.py +++ b/tests/p2p_sync_throttle_test.py @@ -205,7 +205,7 @@ def extractPrometheusMetric(connID: str, metric: str, text: str): 'block_sync_bytes_sent', response) Print(f'End sync throttling bytes sent: {endSyncThrottlingBytesSent}') - assert throttledNode.waitForBlock(beginLargeBlocksHeadBlock, timeout=90), f'Wait for begin block {beginLargeBlocksHeadBlock} on throttled sync node timed out' + assert throttledNode.waitForBlock(beginLargeBlocksHeadBlock, timeout=120), f'Wait for begin block {beginLargeBlocksHeadBlock} on throttled sync node timed out' # Throttled node is connecting to a listen port with a block sync throttle applied so it will receive # blocks more slowly during syncing than an unthrottled node. wasThrottled = False @@ -218,7 +218,7 @@ def extractPrometheusMetric(connID: str, metric: str, text: str): if throttledState: wasThrottled = True break - assert throttledNode.waitForBlock(endLargeBlocksHeadBlock, timeout=90), f'Wait for block {endLargeBlocksHeadBlock} on sync node timed out' + assert throttledNode.waitForBlock(endLargeBlocksHeadBlock, timeout=120), f'Wait for block {endLargeBlocksHeadBlock} on sync node timed out' endThrottledSync = time.time() response = throttledNode.processUrllibRequest('prometheus', 'metrics', exitOnError=True, returnType=ReturnType.raw, printReturnLimit=16).decode() Print('Throttled Node End State') From 95ca20ba25a1c93e389eba6d058b8c813ce98a2d Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Mon, 5 Aug 2024 07:25:50 -0500 Subject: [PATCH 15/15] GH-459 Reset sync_source when sync complete --- plugins/net_plugin/net_plugin.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index e43ecf711f..78500de0d6 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2215,6 +2215,7 @@ namespace eosio { auto lib = my_impl->get_chain_lib_num(); sync_last_requested_num = 0; sync_next_expected_num = std::max(sync_next_expected_num, lib + 1); + sync_source.reset(); request_next_chunk(); } } @@ -2499,6 +2500,7 @@ namespace eosio { if (blk_num >= c->sync_last_requested_block) { peer_dlog(c, "calling cancel_sync_wait, block ${b}, sync_last_requested_block ${lrb}", ("b", blk_num)("lrb", c->sync_last_requested_block)); + sync_source.reset(); c->cancel_sync_wait(); } else { peer_dlog(c, "calling sync_wait, block ${b}", ("b", blk_num));