Skip to content

Commit

Permalink
GH-2215 Reset syncing on unlinkable blocks
Browse files Browse the repository at this point in the history
  • Loading branch information
heifner committed Feb 6, 2024
1 parent ed4209f commit 736b4af
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 12 deletions.
30 changes: 18 additions & 12 deletions plugins/net_plugin/net_plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2142,8 +2142,8 @@ namespace eosio {
void sync_manager::request_next_chunk( const connection_ptr& conn ) REQUIRES(sync_mtx) {
auto chain_info = my_impl->get_chain_info();

fc_dlog( logger, "sync_last_requested_num: ${r}, sync_next_expected_num: ${e}, sync_known_lib_num: ${k}, sync_req_span: ${s}, head: ${h}",
("r", sync_last_requested_num)("e", sync_next_expected_num)("k", sync_known_lib_num)("s", sync_req_span)("h", chain_info.head_num) );
fc_dlog( logger, "sync_last_requested_num: ${r}, sync_next_expected_num: ${e}, sync_known_lib_num: ${k}, sync_req_span: ${s}, head: ${h}, lib: ${lib}",
("r", sync_last_requested_num)("e", sync_next_expected_num)("k", sync_known_lib_num)("s", sync_req_span)("h", chain_info.head_num)("lib", chain_info.lib_num) );

if( chain_info.head_num + sync_req_span < sync_last_requested_num && sync_source && sync_source->current() ) {
fc_dlog( logger, "ignoring request, head is ${h} last req = ${r}, sync_next_expected_num: ${e}, sync_known_lib_num: ${k}, sync_req_span: ${s}, source connection ${c}",
Expand Down Expand Up @@ -2187,8 +2187,8 @@ namespace eosio {
sync_last_requested_num = end;
sync_source = new_sync_source;
request_sent = true;
new_sync_source->strand.post( [new_sync_source, start, end, head_num=chain_info.head_num]() {
peer_ilog( new_sync_source, "requesting range ${s} to ${e}, head ${h}", ("s", start)("e", end)("h", head_num) );
new_sync_source->strand.post( [new_sync_source, start, end, head_num=chain_info.head_num, lib=chain_info.lib_num]() {
peer_ilog( new_sync_source, "requesting range ${s} to ${e}, head ${h}, lib ${lib}", ("s", start)("e", end)("h", head_num)("lib", lib) );
new_sync_source->request_sync_blocks( start, end );
} );
}
Expand Down Expand Up @@ -2234,8 +2234,10 @@ namespace eosio {

if( sync_state != lib_catchup ) {
set_state( lib_catchup );
sync_next_expected_num = chain_info.lib_num + 1;
} else {
sync_next_expected_num = std::max( chain_info.lib_num + 1, sync_next_expected_num );
}
sync_next_expected_num = std::max( chain_info.lib_num + 1, sync_next_expected_num );

request_next_chunk( c );
}
Expand Down Expand Up @@ -2443,11 +2445,10 @@ namespace eosio {
// called from connection strand
void sync_manager::rejected_block( const connection_ptr& c, uint32_t blk_num, closing_mode mode ) {
c->block_status_monitor_.rejected();
// reset sync on rejected block
fc::unique_lock g( sync_mtx );
sync_last_requested_num = 0;
if (blk_num < sync_next_expected_num) {
sync_next_expected_num = my_impl->get_chain_lib_num();
}
sync_next_expected_num = my_impl->get_chain_lib_num() + 1;
if( mode == closing_mode::immediately || c->block_status_monitor_.max_events_violated()) {
peer_wlog( c, "block ${bn} not accepted, closing connection", ("bn", blk_num) );
sync_source.reset();
Expand Down Expand Up @@ -2526,7 +2527,11 @@ namespace eosio {
c->sync_wait();
}

sync_next_expected_num = blk_num + 1;
if (sync_last_requested_num == 0) { // block was rejected
sync_next_expected_num = my_impl->get_chain_lib_num();
} else {
sync_next_expected_num = blk_num + 1;
}
}

uint32_t head = my_impl->get_chain_head_num();
Expand Down Expand Up @@ -3824,8 +3829,9 @@ namespace eosio {
// use c in this method instead of this to highlight that all methods called on c-> must be thread safe
connection_ptr c = shared_from_this();

uint32_t lib = cc.last_irreversible_block_num();
try {
if( blk_num <= cc.last_irreversible_block_num() || cc.block_exists(blk_id) ) {
if( blk_num <= lib || cc.block_exists(blk_id) ) {
c->strand.post( [sync_master = my_impl->sync_master.get(),
dispatcher = my_impl->dispatcher.get(), c, blk_id, blk_num]() {
dispatcher->add_peer_block( blk_id, c->connection_id );
Expand All @@ -3841,8 +3847,8 @@ namespace eosio {
}

fc::microseconds age( fc::time_point::now() - block->timestamp);
fc_dlog( logger, "received signed_block: #${n} block age in secs = ${age}, connection ${cid}, ${v}",
("n", blk_num)("age", age.to_seconds())("cid", c->connection_id)("v", obt ? "pre-validated" : "validation pending") );
fc_dlog( logger, "received signed_block: #${n} block age in secs = ${age}, connection ${cid}, ${v}, lib #${lib}",
("n", blk_num)("age", age.to_seconds())("cid", c->connection_id)("v", obt ? "pre-validated" : "validation pending")("lib", lib) );

go_away_reason reason = no_reason;
bool accepted = false;
Expand Down
6 changes: 6 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_test.py ${CMAKE_CURRENT_BINARY_D
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_streamer_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_streamer_test.py COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/bridge_for_fork_test_shape.json ${CMAKE_CURRENT_BINARY_DIR}/bridge_for_fork_test_shape.json COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/large-lib-test.py ${CMAKE_CURRENT_BINARY_DIR}/large-lib-test.py COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lib_advance_test.py ${CMAKE_CURRENT_BINARY_DIR}/lib_advance_test.py COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/http_plugin_test.py ${CMAKE_CURRENT_BINARY_DIR}/http_plugin_test.py COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/p2p_high_latency_test.py ${CMAKE_CURRENT_BINARY_DIR}/p2p_high_latency_test.py COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/p2p_multiple_listen_test.py ${CMAKE_CURRENT_BINARY_DIR}/p2p_multiple_listen_test.py COPYONLY)
Expand Down Expand Up @@ -328,6 +329,11 @@ set_property(TEST larger_lib_test PROPERTY LABELS nonparallelizable_tests)
add_test(NAME larger_lib_if_test COMMAND tests/large-lib-test.py --activate-if ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
set_property(TEST larger_lib_if_test PROPERTY LABELS nonparallelizable_tests)

add_test(NAME lib_advance_test COMMAND tests/lib_advance_test.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
set_property(TEST lib_advance_test PROPERTY LABELS nonparallelizable_tests)
add_test(NAME lib_advance_if_test COMMAND tests/lib_advance_test.py --activate-if -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
set_property(TEST lib_advance_if_test PROPERTY LABELS nonparallelizable_tests)

add_test(NAME leap_util_bls_test COMMAND tests/leap_util_bls_test.py WORKING_DIRECTORY ${CMAKE_BINARY_DIR})

add_test(NAME http_plugin_test COMMAND tests/http_plugin_test.py ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
Expand Down
153 changes: 153 additions & 0 deletions tests/lib_advance_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#!/usr/bin/env python3

import time
import decimal
import json
import math
import re
import signal

from TestHarness import Account, Cluster, Node, TestHelper, Utils, WalletMgr, CORE_SYMBOL
from TestHarness.Node import BlockType

###############################################################
# lib_advance_test
#
# Setup 4 producers separated by a bridge node.
# Kill the bridge node, allow both sides to produce, and
# verify they can sync back together after they are
# reconnected.
#
###############################################################
Print=Utils.Print
errorExit=Utils.errorExit


args = TestHelper.parse_args({"--activate-if","--dump-error-details","--keep-logs","-v","--leave-running",
                              "--wallet-port","--unshared"})
Utils.Debug=args.v
totalProducerNodes=4
totalNonProducerNodes=1
totalNodes=totalProducerNodes+totalNonProducerNodes
maxActiveProducers=3
totalProducers=maxActiveProducers
cluster=Cluster(unshared=args.unshared, keepRunning=args.leave_running, keepLogs=args.keep_logs)
activateIF=args.activate_if
dumpErrorDetails=args.dump_error_details
walletPort=args.wallet_port

walletMgr=WalletMgr(True, port=walletPort)
testSuccessful=False

try:
    TestHelper.printSystemInfo("BEGIN")

    cluster.setWalletMgr(walletMgr)
    Print("Stand up cluster")
    specificExtraNodeosArgs={}
    # producer nodes will be mapped to 0 through totalProducerNodes-1, so the number totalProducerNodes will be the non-producing node
    specificExtraNodeosArgs[totalProducerNodes]="--plugin eosio::test_control_api_plugin"

    # *** setup topology ***

    # "bridge" shape connects defproducera (node0) defproducerb (node1) defproducerc (node2) to each other and defproducerd (node3)
    # and the only connection between those 2 groups is through the bridge (node4)
    if cluster.launch(topo="./tests/bridge_for_fork_test_shape.json", pnodes=totalProducerNodes,
                      totalNodes=totalNodes, totalProducers=totalProducerNodes, loadSystemContract=False,
                      activateIF=activateIF, biosFinalizer=False,
                      specificExtraNodeosArgs=specificExtraNodeosArgs) is False:
        Utils.cmdError("launcher")
        Utils.errorExit("Failed to stand up eos cluster.")
    Print("Validating system accounts after bootstrap")
    cluster.validateAccounts(None)

    # *** identify each node (producers and non-producing node) ***

    prodNode0 = cluster.getNode(0)
    prodNode1 = cluster.getNode(1)
    prodNode2 = cluster.getNode(2)
    prodNode3 = cluster.getNode(3) # other side of bridge
    nonProdNode = cluster.getNode(4)

    prodNodes=[ prodNode0, prodNode1, prodNode2, prodNode3 ]

    prodA=prodNode0 # defproducera
    prodD=prodNode3 # defproducerd

    # *** Identify a block where production is stable ***

    #verify nodes are in sync and advancing
    cluster.biosNode.kill(signal.SIGTERM)
    cluster.waitOnClusterSync(blockAdvancing=5)

    # record LIB on both sides of the bridge before the split; after the
    # reconnect both sides must have advanced past these points
    libProdABeforeKill = prodA.getIrreversibleBlockNum()
    libProdDBeforeKill = prodD.getIrreversibleBlockNum()

    # *** Killing the "bridge" node ***
    Print('Sending command to kill "bridge" node to separate the 2 producer groups.')
    # kill at the beginning of the production window for defproducera, so there is time for the fork for
    # defproducerd to grow before it would overtake the fork for defproducera and defproducerb and defproducerc
    killAtProducer="defproducera"
    nonProdNode.killNodeOnProducer(producer=killAtProducer, whereInSequence=1)

    #verify that the non producing node is not alive (and populate the producer nodes with current getInfo data to report if
    #an error occurs)
    numPasses = 2
    blocksPerProducer = 12
    blocksPerRound = totalProducers * blocksPerProducer
    count = blocksPerRound * numPasses
    while nonProdNode.verifyAlive() and count > 0:
        # wait on prodNode 0 since it will continue to advance, since defproducera and defproducerb are its producers
        Print("Wait for next block")
        assert prodA.waitForNextBlock(timeout=6), "Production node A should continue to advance, even after bridge node is killed"
        count -= 1

    assert not nonProdNode.verifyAlive(), "Bridge node should have been killed if test was functioning correctly."

    transferAmount = 10
    # Does not use transaction retry (not needed)
    transfer = prodD.transferFunds(cluster.eosioAccount, cluster.defproduceraAccount, f"{transferAmount}.0000 {CORE_SYMBOL}", "fund account")
    transBlockNum = transfer['processed']['block_num']
    transId = prodD.getLastTrackedTransactionId()

    beforeBlockNum = prodA.getBlockNum()
    prodA.waitForProducer("defproducera")
    prodA.waitForProducer("defproducerb")
    prodA.waitForProducer("defproducera")
    prodA.waitForProducer("defproducerc") # produce enough that sync will have over 30 blocks to sync
    assert prodA.waitForLibToAdvance(), "Production node A should advance lib without D"
    assert prodD.waitForNextBlock(), "Production node D should continue to advance, even after bridge node is killed"
    afterBlockNum = prodA.getBlockNum()

    Print("Relaunching the non-producing bridge node to connect the nodes")
    if not nonProdNode.relaunch():
        errorExit(f"Failure - (non-production) node {nonProdNode.nodeNum} should have restarted")

    # prodD's minority fork must be abandoned; once its LIB passes the block that
    # carried the transfer we know it has switched to the majority branch
    while prodD.getInfo()['last_irreversible_block_num'] < transBlockNum:
        Print("Wait for LIB to move, which indicates prodD may have forked out the branch")
        assert prodD.waitForLibToAdvance(60), \
            "ERROR: Network did not reach consensus after bridge node was restarted."

    assert prodD.waitForLibToAdvance()
    assert prodD.waitForProducer("defproducera")
    assert prodA.waitForProducer("defproducerd")

    for prodNode in prodNodes:
        info=prodNode.getInfo()
        Print(f"node info: {json.dumps(info, indent=1)}")

    assert prodA.getIrreversibleBlockNum() > max(libProdABeforeKill, libProdDBeforeKill)
    assert prodD.getIrreversibleBlockNum() > max(libProdABeforeKill, libProdDBeforeKill)

    # scan node3's log; a handful of unlinkable-block rejections while the forks
    # rejoin is expected, but more than the number of blocks produced during the
    # split indicates sync kept re-requesting unlinkable blocks
    logFile = Utils.getNodeDataDir(prodNode3.nodeId) + "/stderr.txt"
    with open(logFile) as f:  # 'with' ensures the handle is closed even if errorExit() raises below
        contents = f.read()
    if contents.count("3030001 unlinkable_block_exception: Unlinkable block") > (afterBlockNum-beforeBlockNum): # a few are fine
        errorExit(f"Node{prodNode3.nodeId} has more than {afterBlockNum-beforeBlockNum} unlinkable blocks: {logFile}.")

    testSuccessful=True
finally:
    TestHelper.shutdown(cluster, walletMgr, testSuccessful=testSuccessful, dumpErrorDetails=dumpErrorDetails)

errorCode = 0 if testSuccessful else 1
exit(errorCode)

0 comments on commit 736b4af

Please sign in to comment.