diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index 1b91f21cfe..93c82cdf1b 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2142,8 +2142,8 @@ namespace eosio { void sync_manager::request_next_chunk( const connection_ptr& conn ) REQUIRES(sync_mtx) { auto chain_info = my_impl->get_chain_info(); - fc_dlog( logger, "sync_last_requested_num: ${r}, sync_next_expected_num: ${e}, sync_known_lib_num: ${k}, sync_req_span: ${s}, head: ${h}", - ("r", sync_last_requested_num)("e", sync_next_expected_num)("k", sync_known_lib_num)("s", sync_req_span)("h", chain_info.head_num) ); + fc_dlog( logger, "sync_last_requested_num: ${r}, sync_next_expected_num: ${e}, sync_known_lib_num: ${k}, sync_req_span: ${s}, head: ${h}, lib: ${lib}", + ("r", sync_last_requested_num)("e", sync_next_expected_num)("k", sync_known_lib_num)("s", sync_req_span)("h", chain_info.head_num)("lib", chain_info.lib_num) ); if( chain_info.head_num + sync_req_span < sync_last_requested_num && sync_source && sync_source->current() ) { fc_dlog( logger, "ignoring request, head is ${h} last req = ${r}, sync_next_expected_num: ${e}, sync_known_lib_num: ${k}, sync_req_span: ${s}, source connection ${c}", @@ -2187,8 +2187,8 @@ namespace eosio { sync_last_requested_num = end; sync_source = new_sync_source; request_sent = true; - new_sync_source->strand.post( [new_sync_source, start, end, head_num=chain_info.head_num]() { - peer_ilog( new_sync_source, "requesting range ${s} to ${e}, head ${h}", ("s", start)("e", end)("h", head_num) ); + new_sync_source->strand.post( [new_sync_source, start, end, head_num=chain_info.head_num, lib=chain_info.lib_num]() { + peer_ilog( new_sync_source, "requesting range ${s} to ${e}, head ${h}, lib ${lib}", ("s", start)("e", end)("h", head_num)("lib", lib) ); new_sync_source->request_sync_blocks( start, end ); } ); } @@ -2234,8 +2234,10 @@ namespace eosio { if( sync_state != lib_catchup ) { set_state( lib_catchup ); 
+ sync_next_expected_num = chain_info.lib_num + 1; + } else { + sync_next_expected_num = std::max( chain_info.lib_num + 1, sync_next_expected_num ); } - sync_next_expected_num = std::max( chain_info.lib_num + 1, sync_next_expected_num ); request_next_chunk( c ); } @@ -2443,11 +2445,10 @@ namespace eosio { // called from connection strand void sync_manager::rejected_block( const connection_ptr& c, uint32_t blk_num, closing_mode mode ) { c->block_status_monitor_.rejected(); + // reset sync on rejected block fc::unique_lock g( sync_mtx ); sync_last_requested_num = 0; - if (blk_num < sync_next_expected_num) { - sync_next_expected_num = my_impl->get_chain_lib_num(); - } + sync_next_expected_num = my_impl->get_chain_lib_num() + 1; if( mode == closing_mode::immediately || c->block_status_monitor_.max_events_violated()) { peer_wlog( c, "block ${bn} not accepted, closing connection", ("bn", blk_num) ); sync_source.reset(); @@ -2526,7 +2527,11 @@ namespace eosio { c->sync_wait(); } - sync_next_expected_num = blk_num + 1; + if (sync_last_requested_num == 0) { // block was rejected + sync_next_expected_num = my_impl->get_chain_lib_num(); + } else { + sync_next_expected_num = blk_num + 1; + } } uint32_t head = my_impl->get_chain_head_num(); @@ -3824,8 +3829,9 @@ namespace eosio { // use c in this method instead of this to highlight that all methods called on c-> must be thread safe connection_ptr c = shared_from_this(); + uint32_t lib = cc.last_irreversible_block_num(); try { - if( blk_num <= cc.last_irreversible_block_num() || cc.block_exists(blk_id) ) { + if( blk_num <= lib || cc.block_exists(blk_id) ) { c->strand.post( [sync_master = my_impl->sync_master.get(), dispatcher = my_impl->dispatcher.get(), c, blk_id, blk_num]() { dispatcher->add_peer_block( blk_id, c->connection_id ); @@ -3841,8 +3847,8 @@ namespace eosio { } fc::microseconds age( fc::time_point::now() - block->timestamp); - fc_dlog( logger, "received signed_block: #${n} block age in secs = ${age}, connection 
${cid}, ${v}", - ("n", blk_num)("age", age.to_seconds())("cid", c->connection_id)("v", obt ? "pre-validated" : "validation pending") ); + fc_dlog( logger, "received signed_block: #${n} block age in secs = ${age}, connection ${cid}, ${v}, lib #${lib}", + ("n", blk_num)("age", age.to_seconds())("cid", c->connection_id)("v", obt ? "pre-validated" : "validation pending")("lib", lib) ); go_away_reason reason = no_reason; bool accepted = false; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a0a9831c3d..ec6b2c7c2e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -50,6 +50,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_test.py ${CMAKE_CURRENT_BINARY_D configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_streamer_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_streamer_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/bridge_for_fork_test_shape.json ${CMAKE_CURRENT_BINARY_DIR}/bridge_for_fork_test_shape.json COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/large-lib-test.py ${CMAKE_CURRENT_BINARY_DIR}/large-lib-test.py COPYONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lib_advance_test.py ${CMAKE_CURRENT_BINARY_DIR}/lib_advance_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/http_plugin_test.py ${CMAKE_CURRENT_BINARY_DIR}/http_plugin_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/p2p_high_latency_test.py ${CMAKE_CURRENT_BINARY_DIR}/p2p_high_latency_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/p2p_multiple_listen_test.py ${CMAKE_CURRENT_BINARY_DIR}/p2p_multiple_listen_test.py COPYONLY) @@ -328,6 +329,11 @@ set_property(TEST larger_lib_test PROPERTY LABELS nonparallelizable_tests) add_test(NAME larger_lib_if_test COMMAND tests/large-lib-test.py --activate-if ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_property(TEST larger_lib_if_test PROPERTY LABELS nonparallelizable_tests) +add_test(NAME lib_advance_test COMMAND tests/lib_advance_test.py -v ${UNSHARE} WORKING_DIRECTORY 
${CMAKE_BINARY_DIR}) +set_property(TEST lib_advance_test PROPERTY LABELS nonparallelizable_tests) +add_test(NAME lib_advance_if_test COMMAND tests/lib_advance_test.py --activate-if -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +set_property(TEST lib_advance_if_test PROPERTY LABELS nonparallelizable_tests) + add_test(NAME leap_util_bls_test COMMAND tests/leap_util_bls_test.py WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) add_test(NAME http_plugin_test COMMAND tests/http_plugin_test.py ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) diff --git a/tests/lib_advance_test.py b/tests/lib_advance_test.py new file mode 100755 index 0000000000..69c4d21d6c --- /dev/null +++ b/tests/lib_advance_test.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 + +import time +import decimal +import json +import math +import re +import signal + +from TestHarness import Account, Cluster, Node, TestHelper, Utils, WalletMgr, CORE_SYMBOL +from TestHarness.Node import BlockType + +############################################################### +# lib_advance_test +# +# Setup 4 producers separated by a bridge node. +# Kill bridge node, allow both sides to produce and +# verify they can sync back together after they are +# reconnected. 
+# +############################################################### +Print=Utils.Print +errorExit=Utils.errorExit + + +args = TestHelper.parse_args({"--activate-if","--dump-error-details","--keep-logs","-v","--leave-running", + "--wallet-port","--unshared"}) +Utils.Debug=args.v +totalProducerNodes=4 +totalNonProducerNodes=1 +totalNodes=totalProducerNodes+totalNonProducerNodes +maxActiveProducers=3 +totalProducers=maxActiveProducers +cluster=Cluster(unshared=args.unshared, keepRunning=args.leave_running, keepLogs=args.keep_logs) +activateIF=args.activate_if +dumpErrorDetails=args.dump_error_details +walletPort=args.wallet_port + +walletMgr=WalletMgr(True, port=walletPort) +testSuccessful=False + +try: + TestHelper.printSystemInfo("BEGIN") + + cluster.setWalletMgr(walletMgr) + Print("Stand up cluster") + specificExtraNodeosArgs={} + # producer nodes will be mapped to 0 through totalProducerNodes-1, so the number totalProducerNodes will be the non-producing node + specificExtraNodeosArgs[totalProducerNodes]="--plugin eosio::test_control_api_plugin" + + # *** setup topography *** + + # "bridge" shape connects defproducera (node0) defproducerb (node1) defproducerc (node2) to each other and defproducerd (node3) + # and the only connection between those 2 groups is through the bridge (node4) + if cluster.launch(topo="./tests/bridge_for_fork_test_shape.json", pnodes=totalProducerNodes, + totalNodes=totalNodes, totalProducers=totalProducerNodes, loadSystemContract=False, + activateIF=activateIF, biosFinalizer=False, + specificExtraNodeosArgs=specificExtraNodeosArgs) is False: + Utils.cmdError("launcher") + Utils.errorExit("Failed to stand up eos cluster.") + Print("Validating system accounts after bootstrap") + cluster.validateAccounts(None) + + # *** identify each node (producers and non-producing node) *** + + prodNode0 = cluster.getNode(0) + prodNode1 = cluster.getNode(1) + prodNode2 = cluster.getNode(2) + prodNode3 = cluster.getNode(3) # other side of bridge + 
nonProdNode = cluster.getNode(4) + + prodNodes=[ prodNode0, prodNode1, prodNode2, prodNode3 ] + + prodA=prodNode0 # defproducera + prodD=prodNode3 # defproducerd + + # *** Identify a block where production is stable *** + + #verify nodes are in sync and advancing + cluster.biosNode.kill(signal.SIGTERM) + cluster.waitOnClusterSync(blockAdvancing=5) + + libProdABeforeKill = prodA.getIrreversibleBlockNum() + libProdDBeforeKill = prodD.getIrreversibleBlockNum() + + # *** Killing the "bridge" node *** + Print('Sending command to kill "bridge" node to separate the 2 producer groups.') + # kill at the beginning of the production window for defproducera, so there is time for the fork for + # defproducerd to grow before it would overtake the fork for defproducera and defproducerb and defproducerc + killAtProducer="defproducera" + nonProdNode.killNodeOnProducer(producer=killAtProducer, whereInSequence=1) + + #verify that the non producing node is not alive (and populate the producer nodes with current getInfo data to report if + #an error occurs) + numPasses = 2 + blocksPerProducer = 12 + blocksPerRound = totalProducers * blocksPerProducer + count = blocksPerRound * numPasses + while nonProdNode.verifyAlive() and count > 0: + # wait on prodNode 0 since it will continue to advance, since defproducera and defproducerb are its producers + Print("Wait for next block") + assert prodA.waitForNextBlock(timeout=6), "Production node A should continue to advance, even after bridge node is killed" + count -= 1 + + assert not nonProdNode.verifyAlive(), "Bridge node should have been killed if test was functioning correctly." 
+ + transferAmount = 10 + # Does not use transaction retry (not needed) + transfer = prodD.transferFunds(cluster.eosioAccount, cluster.defproduceraAccount, f"{transferAmount}.0000 {CORE_SYMBOL}", "fund account") + transBlockNum = transfer['processed']['block_num'] + transId = prodD.getLastTrackedTransactionId() + + beforeBlockNum = prodA.getBlockNum() + prodA.waitForProducer("defproducera") + prodA.waitForProducer("defproducerb") + prodA.waitForProducer("defproducera") + prodA.waitForProducer("defproducerc") # produce enough that sync will have over 30 blocks to sync + assert prodA.waitForLibToAdvance(), "Production node A should advance lib without D" + assert prodD.waitForNextBlock(), "Production node D should continue to advance, even after bridge node is killed" + afterBlockNum = prodA.getBlockNum() + + Print("Relaunching the non-producing bridge node to connect the nodes") + if not nonProdNode.relaunch(): + errorExit(f"Failure - (non-production) node {nonProdNode.nodeNum} should have restarted") + + while prodD.getInfo()['last_irreversible_block_num'] < transBlockNum: + Print("Wait for LIB to move, which indicates prodD may have forked out the branch") + assert prodD.waitForLibToAdvance(60), \ + "ERROR: Network did not reach consensus after bridge node was restarted." 
+ + assert prodD.waitForLibToAdvance() + assert prodD.waitForProducer("defproducera") + assert prodA.waitForProducer("defproducerd") + + for prodNode in prodNodes: + info=prodNode.getInfo() + Print(f"node info: {json.dumps(info, indent=1)}") + + assert prodA.getIrreversibleBlockNum() > max(libProdABeforeKill, libProdDBeforeKill) + assert prodD.getIrreversibleBlockNum() > max(libProdABeforeKill, libProdDBeforeKill) + + logFile = Utils.getNodeDataDir(prodNode3.nodeId) + "/stderr.txt" + f = open(logFile) + contents = f.read() + if contents.count("3030001 unlinkable_block_exception: Unlinkable block") > (afterBlockNum-beforeBlockNum): # a few are fine + errorExit(f"Node{prodNode3.nodeId} has more than {afterBlockNum-beforeBlockNum} unlinkable blocks: {logFile}.") + + testSuccessful=True +finally: + TestHelper.shutdown(cluster, walletMgr, testSuccessful=testSuccessful, dumpErrorDetails=dumpErrorDetails) + +errorCode = 0 if testSuccessful else 1 +exit(errorCode)