Skip to content

Commit

Permalink
1. raft TLA spec with etcd implementation
Browse files Browse the repository at this point in the history
2. etcd TLA trace validation
  • Loading branch information
joshuazh-x committed Oct 16, 2023
1 parent 838e6ff commit 730a130
Show file tree
Hide file tree
Showing 8 changed files with 1,484 additions and 300 deletions.
21 changes: 21 additions & 0 deletions raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -578,11 +578,14 @@ func (r *raft) send(m pb.Message) {
// we err on the side of safety and omit a `&& !m.Reject` condition
// above.
r.msgsAfterAppend = append(r.msgsAfterAppend, m)
traceSendMessage(r, &m)

} else {
if m.To == r.id {
r.logger.Panicf("message should not be self-addressed when sending %s", m.Type)
}
r.msgs = append(r.msgs, m)
traceSendMessage(r, &m)
}
}

Expand Down Expand Up @@ -753,6 +756,8 @@ func (r *raft) appliedSnap(snap *pb.Snapshot) {
// the commit index changed (in which case the caller should call
// r.bcastAppend).
func (r *raft) maybeCommit() bool {
traceNodeEvent(RsmCommit, r)

mci := r.prs.Committed()
return r.raftLog.maybeCommit(mci, r.Term)
}
Expand Down Expand Up @@ -868,19 +873,24 @@ func (r *raft) becomeFollower(term uint64, lead uint64) {
r.lead = lead
r.state = StateFollower
r.logger.Infof("%x became follower at term %d", r.id, r.Term)

traceNodeEvent(RsmBecomeFollower, r)
}

func (r *raft) becomeCandidate() {
// TODO(xiangli) remove the panic when the raft implementation is stable
if r.state == StateLeader {
panic("invalid transition [leader -> candidate]")
}

r.step = stepCandidate
r.reset(r.Term + 1)
r.tick = r.tickElection
r.Vote = r.id
r.state = StateCandidate
r.logger.Infof("%x became candidate at term %d", r.id, r.Term)

traceNodeEvent(RsmBecomeCandidate, r)
}

func (r *raft) becomePreCandidate() {
Expand All @@ -904,6 +914,7 @@ func (r *raft) becomeLeader() {
if r.state == StateFollower {
panic("invalid transition [follower -> leader]")
}

r.step = stepLeader
r.reset(r.Term)
r.tick = r.tickHeartbeat
Expand All @@ -926,6 +937,8 @@ func (r *raft) becomeLeader() {
// could be expensive.
r.pendingConfIndex = r.raftLog.lastIndex()

traceNodeEvent(RsmBecomeLeader, r)

emptyEnt := pb.Entry{Data: nil}
if !r.appendEntry(emptyEnt) {
// This won't happen because we just called reset() above.
Expand Down Expand Up @@ -1049,6 +1062,8 @@ func (r *raft) poll(id uint64, t pb.MessageType, v bool) (granted int, rejected
}

func (r *raft) Step(m pb.Message) error {
traceReceiveMessage(r, &m)

// Handle the message term, which may result in our stepping down to a follower.
switch {
case m.Term == 0:
Expand Down Expand Up @@ -1273,6 +1288,8 @@ func stepLeader(r *raft, m pb.Message) error {
cc = ccc
}
if cc != nil {
traceChangeConfEvent(cc, r)

alreadyPending := r.pendingConfIndex > r.raftLog.applied
alreadyJoint := len(r.prs.Config.Voters[1]) > 0
wantsLeaveJoint := len(cc.AsV2().Changes) == 0
Expand All @@ -1292,6 +1309,8 @@ func stepLeader(r *raft, m pb.Message) error {
} else {
r.pendingConfIndex = r.raftLog.lastIndex() + uint64(i) + 1
}
} else {
traceNodeEvent(RsmReplicate, r)
}
}

Expand Down Expand Up @@ -1915,6 +1934,8 @@ func (r *raft) applyConfChange(cc pb.ConfChangeV2) pb.ConfState {
//
// The inputs usually result from restoring a ConfState or applying a ConfChange.
func (r *raft) switchToConfig(cfg tracker.Config, prs tracker.ProgressMap) pb.ConfState {
traceConfChangeEvent(cfg, r)

r.prs.Config = cfg
r.prs.Progress = prs

Expand Down
226 changes: 226 additions & 0 deletions state_trace.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
package raft

import (
"go.etcd.io/raft/v3/raftpb"
"go.etcd.io/raft/v3/tracker"
)

type RaftStateMachineEventType int

Check warning on line 8 in state_trace.go

View workflow job for this annotation

GitHub Actions / run

exported: type name will be used as raft.RaftStateMachineEventType by other packages, and that stutters; consider calling this StateMachineEventType (revive)

const (
RsmInitState RaftStateMachineEventType = iota
RsmBecomeCandidate
RsmBecomeFollower
RsmBecomeLeader
RsmCommit
RsmReplicate
RsmChangeConf
RsmApplyConfChange
RsmSendAppendEntriesRequest
RsmReceiveAppendEntriesRequest
RsmSendAppendEntriesResponse
RsmReceiveAppendEntriesResponse
RsmSendRequestVoteRequest
RsmReceiveRequestVoteRequest
RsmSendRequestVoteResponse
RsmReceiveRequestVoteResponse
)

func (e RaftStateMachineEventType) String() string {
return []string{
"InitState",
"BecomeCandidate",
"BecomeFollower",
"BecomeLeader",
"Commit",
"Replicate",
"ChangeConf",
"ApplyConfChange",
"SendAppendEntriesRequest",
"ReceiveAppendEntriesRequest",
"SendAppendEntriesResponse",
"ReceiveAppendEntriesResponse",
"SendRequestVoteRequest",
"ReceiveRequestVoteRequest",
"SendRequestVoteResponse",
"ReceiveRequestVoteResponse",
}[e]
}

const (
ConfChangeAddNewServer string = "AddNewServer"
ConfChangeRemoveServer string = "RemoveServer"
)

type TracingEvent struct {
Name string `json:"name"`
NodeID uint64 `json:"nid"`
State raftpb.HardState `json:"state"`
Role string `json:"role"`
Conf [2][]uint64 `json:"conf"`
Message *TracingMessage `json:"msg,omitempty"`
ConfChange *TracingConfChange `json:"cc,omitempty"`
}

type TracingMessage struct {
Type string `json:"type"`
Term uint64 `json:"term"`
From uint64 `json:"from"`
To uint64 `json:"to"`
EntryLength int `json:"entries"`
LogTerm uint64 `json:"logTerm"`
Index uint64 `json:"index"`
Commit uint64 `json:"commit"`
Vote uint64 `json:"vote"`
Reject bool `json:"reject"`
RejectHint uint64 `json:"rejectHint"`
}

type SingleConfChange struct {
NodeId uint64 `json:"nid"`

Check warning on line 80 in state_trace.go

View workflow job for this annotation

GitHub Actions / run

var-naming: struct field NodeId should be NodeID (revive)
Action string `json:"action"`
}

type TracingConfChange struct {
Changes []SingleConfChange `json:"changes,omitempty"`
NewConf []uint64 `json:"newconf,omitempty"`
}

func makeTracingMessage(m *raftpb.Message) *TracingMessage {
if m == nil {
return nil
}

return &TracingMessage{
Type: m.Type.String(),
Term: m.Term,
From: m.From,
To: m.To,
EntryLength: len(m.Entries),
LogTerm: m.LogTerm,
Index: m.Index,
Commit: m.Commit,
Vote: m.Vote,
Reject: m.Reject,
RejectHint: m.RejectHint,
}
}

type RaftStateMachineTracer interface {

Check warning on line 109 in state_trace.go

View workflow job for this annotation

GitHub Actions / run

exported: type name will be used as raft.RaftStateMachineTracer by other packages, and that stutters; consider calling this StateMachineTracer (revive)
TraceState(*TracingEvent)
}

var stateTracer RaftStateMachineTracer

func SetStateTracer(t RaftStateMachineTracer) {
stateTracer = t
}

func traceEvent(evt RaftStateMachineEventType, r *raft, m *raftpb.Message, cc *TracingConfChange) {
if stateTracer == nil {
return
}

stateTracer.TraceState(&TracingEvent{
Name: evt.String(),
NodeID: r.id,
State: r.hardState(),
Conf: [2][]uint64{nonNilSlice(r.prs.Voters[0].Slice()), nonNilSlice(r.prs.Voters[1].Slice())},
Role: r.state.String(),
Message: makeTracingMessage(m),
ConfChange: cc,
})
}

func traceNodeEvent(evt RaftStateMachineEventType, r *raft) {
traceEvent(evt, r, nil, nil)
}

func traceChangeConfEvent(cci raftpb.ConfChangeI, r *raft) {
cc2 := cci.AsV2()
cc := &TracingConfChange{
Changes: []SingleConfChange{},
NewConf: []uint64{},
}
for _, c := range cc2.Changes {
switch c.Type {
case raftpb.ConfChangeAddNode:
cc.Changes = append(cc.Changes, SingleConfChange{
NodeId: c.NodeID,
Action: ConfChangeAddNewServer,
})
case raftpb.ConfChangeRemoveNode:
cc.Changes = append(cc.Changes, SingleConfChange{
NodeId: c.NodeID,
Action: ConfChangeRemoveServer,
})
}
}

traceEvent(RsmChangeConf, r, nil, cc)
}

func traceConfChangeEvent(cfg tracker.Config, r *raft) {
if stateTracer == nil {
return
}

cc := &TracingConfChange{
Changes: []SingleConfChange{},
NewConf: nonNilSlice(cfg.Voters[0].Slice()),
}

traceEvent(RsmApplyConfChange, r, nil, cc)
}

func traceSendMessage(r *raft, m *raftpb.Message) {
if stateTracer == nil {
return
}

var evt RaftStateMachineEventType
switch m.Type {
case raftpb.MsgApp:
evt = RsmSendAppendEntriesRequest
case raftpb.MsgAppResp:
evt = RsmSendAppendEntriesResponse
case raftpb.MsgVote:
evt = RsmSendRequestVoteRequest
case raftpb.MsgVoteResp:
evt = RsmSendRequestVoteResponse
default:
return
}

traceEvent(evt, r, m, nil)
}

func traceReceiveMessage(r *raft, m *raftpb.Message) {
if stateTracer == nil {
return
}

var evt RaftStateMachineEventType
switch m.Type {
case raftpb.MsgApp:
evt = RsmReceiveAppendEntriesRequest
case raftpb.MsgAppResp:
evt = RsmReceiveAppendEntriesResponse
case raftpb.MsgVote:
evt = RsmReceiveRequestVoteRequest
case raftpb.MsgVoteResp:
evt = RsmReceiveRequestVoteResponse
default:
return
}

traceEvent(evt, r, m, nil)
}

func nonNilSlice(s []uint64) []uint64 {
if s == nil {
return []uint64{}
}

return s
}
49 changes: 49 additions & 0 deletions tla/Traceetcdraft.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
\* A trace specification defines a set of one or more traces, wherein the value of some
\* variables in each state aligns with the values specified on the corresponding line
\* of the log or trace file. Commonly, the log file might be incomplete, failing to
\* provide the value of all variables. In such instances, the omitted values are
\* determined non-deterministically, adhering to the high-level specification's parameters.
\* Furthermore, a trace may not be complete such that it only matches a prefix of the log.
SPECIFICATION
TraceSpec
VIEW
TraceView
\* TLA has only limited support for hyperproperties. The following property is a poorman's
\* hyperproperty that asserts that TLC generated *at least one* trace that fully matches the
\* log file.
PROPERTIES
TraceMatched

\* Checking for deadlocks during trace validation is disabled, as it may lead to false
\* counterexamples. A trace specification defines a set of traces, where at least one
\* trace is expected to match the log file in terms of variable values and length.
\* However, partial matches may occur where the trace cannot be extended to fully
\* correspond with the log file. In such cases, deadlock checking would report the first
\* of these traces.
CHECK_DEADLOCK
FALSE

CONSTANTS
InitServer <- TraceInitServer
Server <- TraceServer

InitConfVars <- TraceInitConfVars
InitLogVars <- TraceInitLogVars

EntriesToAppend <- TraceEntriesToAppend
NextIndexForUnsuccessfulAppendResponse <- TraceNextIndexForUnsuccessfulAppendResponse

Nil = 0

ValueEntry = "ValueEntry"
ConfigEntry = "ConfigEntry"

Follower = "Follower"
Candidate = "Candidate"
Leader = "Leader"
RequestVoteRequest = "RequestVoteRequest"
RequestVoteResponse = "RequestVoteResponse"
AppendEntriesRequest = "AppendEntriesRequest"
AppendEntriesResponse = "AppendEntriesResponse"
Loading

0 comments on commit 730a130

Please sign in to comment.