From 553538498c3cad4f5be3931f2bd0fd554bf69bbe Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Wed, 10 Apr 2024 20:05:59 +0100 Subject: [PATCH] rafttest: reduce waitLeader flakiness There are a few tests requiring a stable leader. For example, TestBasicProgress waits for a leader, submits 100 proposals, and expects that all 100 proposals are committed. In rare cases, a leader is elected, and the test proceeds, but in the meantime another node campaigns and wins a higher-term election. After this, some proposals end up not committed (legitimately), and the test fails. This commit modifies the waitLeader function with a better heuristic for a stable leader. It now waits until the leader has the highest term in the cluster, which more reliably (although not 100%) guarantees that there is no in-flight campaign that is about to win. Signed-off-by: Pavel Kalinnikov --- rafttest/node_test.go | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/rafttest/node_test.go b/rafttest/node_test.go index 60b10411..544a1ef4 100644 --- a/rafttest/node_test.go +++ b/rafttest/node_test.go @@ -35,7 +35,7 @@ func TestBasicProgress(t *testing.T) { nodes = append(nodes, n) } - waitLeader(nodes) + waitStableLeader(nodes) for i := 0; i < 100; i++ { nodes[0].Propose(context.TODO(), []byte("somedata")) @@ -59,7 +59,7 @@ func TestRestart(t *testing.T) { nodes = append(nodes, n) } - l := waitLeader(nodes) + l := waitStableLeader(nodes) k1, k2 := (l+1)%5, (l+2)%5 for i := 0; i < 30; i++ { @@ -97,7 +97,7 @@ func TestPause(t *testing.T) { nodes = append(nodes, n) } - waitLeader(nodes) + waitStableLeader(nodes) for i := 0; i < 30; i++ { nodes[0].Propose(context.TODO(), []byte("somedata")) @@ -123,26 +123,31 @@ func TestPause(t *testing.T) { } } -func waitLeader(ns []*node) int { - var l map[uint64]struct{} - var lindex = -1 - +// waitStableLeader waits until there is a stable leader in the cluster. It +// heuristically assumes that there is a stable leader when there is a node in +// StateLeader among the highest-term nodes. +// +// Note that this function would not work properly in clusters with "network" +// partitions, in which a node can have the highest term, and yet never become a +// leader. +func waitStableLeader(ns []*node) int { for { - l = make(map[uint64]struct{}) - + lead := -1 + var maxTerm uint64 for i, n := range ns { - lead := n.Status().SoftState.Lead - if lead != 0 { - l[lead] = struct{}{} - if n.id == lead { - lindex = i - } + st := n.Status() + if st.Term > maxTerm { + lead = -1 + maxTerm = st.Term + } + if st.RaftState == raft.StateLeader { + lead = i } } - - if len(l) == 1 && lindex != -1 { - return lindex + if lead != -1 { + return lead } + time.Sleep(time.Millisecond) } }