From 3a6cd53fffeefbc0563bb5a52f19c7b2c6a0290d Mon Sep 17 00:00:00 2001 From: Balamurali Gopalswami Date: Mon, 18 Nov 2024 12:36:34 -0500 Subject: [PATCH 1/2] Fix flake in detecting reorg by nodes --- integration-tests/ccip-tests/smoke/ccip_test.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/integration-tests/ccip-tests/smoke/ccip_test.go b/integration-tests/ccip-tests/smoke/ccip_test.go index 08054459481..31c0dbef1db 100644 --- a/integration-tests/ccip-tests/smoke/ccip_test.go +++ b/integration-tests/ccip-tests/smoke/ccip_test.go @@ -885,7 +885,7 @@ func TestSmokeCCIPReorgBelowFinality(t *testing.T) { // Test creates above finality reorg at destination and // expects ccip transactions in-flight and the one initiated after reorg -// doesn't go through and verifies every node is able to detect reorg. +// doesn't go through and verifies at least half of the nodes is able to detect reorg. // Note: LogPollInterval interval is set as 1s to detect the reorg immediately func TestSmokeCCIPReorgAboveFinalityAtDestination(t *testing.T) { t.Parallel() @@ -896,7 +896,7 @@ func TestSmokeCCIPReorgAboveFinalityAtDestination(t *testing.T) { // Test creates above finality reorg at destination and // expects ccip transactions in-flight doesn't go through, the transaction initiated after reorg -// shouldn't even get initiated and verifies every node is able to detect reorg. +// shouldn't even get initiated and verifies at least half of the nodes is able to detect reorg. // Note: LogPollInterval interval is set as 1s to detect the reorg immediately func TestSmokeCCIPReorgAboveFinalityAtSource(t *testing.T) { t.Parallel() @@ -936,7 +936,8 @@ func performAboveFinalityReorgAndValidate(t *testing.T, network string) { rs.RunReorg(rs.SrcClient, int(rs.Cfg.SrcFinalityDepth)+rs.Cfg.FinalityDelta, network, 2*time.Second) } clNodes := setUpOutput.Env.CLNodes - // assert every node is detecting the reorg (LogPollInterval is set as 1s for faster detection) + // assert at least half of the nodes is detecting the reorg (LogPollInterval is set as 1s for faster detection) + // expecting to detect reorg by every node is flake. So, checking with at least half of the nodes. nodesDetectedViolation := make(map[string]bool) assert.Eventually(t, func() bool { for _, node := range clNodes { @@ -952,8 +953,8 @@ func performAboveFinalityReorgAndValidate(t *testing.T, network string) { } } } - return len(nodesDetectedViolation) == len(clNodes) - }, 3*time.Minute, 20*time.Second, "Reorg above finality depth is not detected by every node") + return len(nodesDetectedViolation) >= len(clNodes)/2 + }, 3*time.Minute, 20*time.Second, "Reorg above finality depth is not detected by half of the nodes") log.Debug().Interface("Nodes", nodesDetectedViolation).Msg("Violation detection details") // send another request and verify it fails err = lane.SendRequests(1, gasLimit) From 9d1be8794978f244d94b87b60d2b894a63ae453a Mon Sep 17 00:00:00 2001 From: Balamurali Gopalswami Date: Tue, 19 Nov 2024 12:26:15 -0500 Subject: [PATCH 2/2] represent as f+1 nodes --- .../ccip-tests/smoke/ccip_test.go | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/integration-tests/ccip-tests/smoke/ccip_test.go b/integration-tests/ccip-tests/smoke/ccip_test.go index 31c0dbef1db..2f5e35d75e4 100644 --- a/integration-tests/ccip-tests/smoke/ccip_test.go +++ b/integration-tests/ccip-tests/smoke/ccip_test.go @@ -2,6 +2,7 @@ package smoke import ( "fmt" + "math" "math/big" "testing" "time" @@ -885,7 +886,7 @@ func TestSmokeCCIPReorgBelowFinality(t *testing.T) { // Test creates above finality reorg at destination and // expects ccip transactions in-flight and the one initiated after reorg -// doesn't go through and verifies at least half of the nodes is able to detect reorg. +// doesn't go through and verifies f+1 nodes is able to detect reorg. // Note: LogPollInterval interval is set as 1s to detect the reorg immediately func TestSmokeCCIPReorgAboveFinalityAtDestination(t *testing.T) { t.Parallel() @@ -896,7 +897,7 @@ func TestSmokeCCIPReorgAboveFinalityAtDestination(t *testing.T) { // Test creates above finality reorg at destination and // expects ccip transactions in-flight doesn't go through, the transaction initiated after reorg -// shouldn't even get initiated and verifies at least half of the nodes is able to detect reorg. +// shouldn't even get initiated and verifies f+1 nodes is able to detect reorg. // Note: LogPollInterval interval is set as 1s to detect the reorg immediately func TestSmokeCCIPReorgAboveFinalityAtSource(t *testing.T) { t.Parallel() @@ -935,12 +936,13 @@ func performAboveFinalityReorgAndValidate(t *testing.T, network string) { logPollerName = fmt.Sprintf("EVM.%d.LogPoller", lane.SourceChain.GetChainID()) rs.RunReorg(rs.SrcClient, int(rs.Cfg.SrcFinalityDepth)+rs.Cfg.FinalityDelta, network, 2*time.Second) } - clNodes := setUpOutput.Env.CLNodes - // assert at least half of the nodes is detecting the reorg (LogPollInterval is set as 1s for faster detection) - // expecting to detect reorg by every node is flake. So, checking with at least half of the nodes. + // DON is 3F+1, finding f+1 from the given number of nodes in the environment + fPlus1Nodes := int(math.Ceil(float64(len(setUpOutput.Env.CLNodes)-1)/3)) + 1 + // assert at least f+1 nodes is detecting the reorg (LogPollInterval is set as 1s for faster detection) + // additional context: Commit requires 2f+1 observations, so f+1 nodes need to detect it in order to force the entire DON to stop processing messages. nodesDetectedViolation := make(map[string]bool) assert.Eventually(t, func() bool { - for _, node := range clNodes { + for _, node := range setUpOutput.Env.CLNodes { if _, ok := nodesDetectedViolation[node.ChainlinkClient.URL()]; ok { continue } @@ -953,8 +955,8 @@ func performAboveFinalityReorgAndValidate(t *testing.T, network string) { } } } - return len(nodesDetectedViolation) >= len(clNodes)/2 - }, 3*time.Minute, 20*time.Second, "Reorg above finality depth is not detected by half of the nodes") + return len(nodesDetectedViolation) >= fPlus1Nodes + }, 3*time.Minute, 20*time.Second, "Reorg above finality depth is not detected by f+1 nodes") log.Debug().Interface("Nodes", nodesDetectedViolation).Msg("Violation detection details") // send another request and verify it fails err = lane.SendRequests(1, gasLimit)