Skip to content

Commit

Permalink
Detect if our node is behind the majority
Browse files Browse the repository at this point in the history
This commit adds a mechanism that detects that our node is behind the majority of the stake.
The intent is to later have this mechanism be the trigger for the bootstrapping mechanism.
Currently, the bootstrapping mechanism is only active upon node boot, but not at a later point.

The mechanism works in the following manner:

- It intercepts the snowman engine's Chits message handling, and upon every reception of the Chits message,
  the mechanism that detects if the node is a straggler (a node with a ledger height behind the rest) may be invoked,
  if it wasn't invoked too recently.
- The mechanism draws statistics from the validators known to it, and computes the latest accepted block for each validator.
- The mechanism then proceeds to determine which blocks are pending to be processed (a block pending to be processed was not accepted).
- The mechanism then collects a snapshot of all blocks it hasn't accepted yet, and the amount of stake that has accepted this block.
- The mechanism then waits for its next invocation, in order to see if it has accepted blocks correlated with enough stake.
- If there is too much stake that has accepted blocks by other nodes correlated to it that the node hasn't accepted,
  then the mechanism announces the node is behind, and returns the time period between the two invocations.
- The mechanism sums the total time it has detected the node is behind, until a sampling concludes it is not behind, and then
  the total time is nullified.

Signed-off-by: Yacov Manevich <[email protected]>
  • Loading branch information
yacovm committed Sep 16, 2024
1 parent 42d0cff commit 91cc26e
Show file tree
Hide file tree
Showing 10 changed files with 891 additions and 13 deletions.
11 changes: 9 additions & 2 deletions chains/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -1339,12 +1339,19 @@ func (m *manager) createSnowmanChain(
Consensus: consensus,
PartialSync: m.PartialSyncPrimaryNetwork && ctx.ChainID == constants.PlatformChainID,
}
var engine common.Engine
engine, err = smeng.New(engineConfig)

sme, err := smeng.New(engineConfig)
if err != nil {
return nil, fmt.Errorf("error initializing snowman engine: %w", err)
}

ed := smeng.EngineStragglerDetector{
Listener: func(_ time.Duration) {},
Time: time.Now,
}

engine := ed.AttachToEngine(sme)

if m.TracingEnabled {
engine = common.TraceEngine(engine, m.Tracer)
}
Expand Down
9 changes: 9 additions & 0 deletions ids/node_weight.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package ids

type NodeWeight struct {
Node NodeID
Weight uint64
}
26 changes: 16 additions & 10 deletions snow/engine/common/tracker/peers.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (
"sync"

"github.com/prometheus/client_golang/prometheus"
"golang.org/x/exp/maps"

"github.com/ava-labs/avalanchego/ids"
"github.com/ava-labs/avalanchego/snow/validators"
Expand Down Expand Up @@ -37,10 +36,10 @@ type Peers interface {
SampleValidator() (ids.NodeID, bool)
// GetValidators returns the set of all validators
// known to this peer manager
GetValidators() set.Set[ids.NodeID]
GetValidators() set.Set[ids.NodeWeight]
// ConnectedValidators returns the set of all validators
// that are currently connected
ConnectedValidators() set.Set[ids.NodeID]
ConnectedValidators() set.Set[ids.NodeWeight]
}

type lockedPeers struct {
Expand Down Expand Up @@ -112,14 +111,14 @@ func (p *lockedPeers) SampleValidator() (ids.NodeID, bool) {
return p.peers.SampleValidator()
}

func (p *lockedPeers) GetValidators() set.Set[ids.NodeID] {
func (p *lockedPeers) GetValidators() set.Set[ids.NodeWeight] {
p.lock.RLock()
defer p.lock.RUnlock()

return p.peers.GetValidators()
}

func (p *lockedPeers) ConnectedValidators() set.Set[ids.NodeID] {
func (p *lockedPeers) ConnectedValidators() set.Set[ids.NodeWeight] {
p.lock.RLock()
defer p.lock.RUnlock()

Expand Down Expand Up @@ -272,14 +271,21 @@ func (p *peerData) SampleValidator() (ids.NodeID, bool) {
return p.connectedValidators.Peek()
}

func (p *peerData) GetValidators() set.Set[ids.NodeID] {
return set.Of(maps.Keys(p.validators)...)
func (p *peerData) GetValidators() set.Set[ids.NodeWeight] {
res := set.NewSet[ids.NodeWeight](len(p.validators))
for k, v := range p.validators {
res.Add(ids.NodeWeight{Node: k, Weight: v})
}
return res
}

func (p *peerData) ConnectedValidators() set.Set[ids.NodeID] {
func (p *peerData) ConnectedValidators() set.Set[ids.NodeWeight] {
// The set is copied to avoid future changes from being reflected in the
// returned set.
copied := set.NewSet[ids.NodeID](len(p.connectedValidators))
copied.Union(p.connectedValidators)
copied := set.NewSet[ids.NodeWeight](len(p.connectedValidators))
for _, vdrID := range p.connectedValidators.List() {
weight := p.validators[vdrID]
copied.Add(ids.NodeWeight{Node: vdrID, Weight: weight})
}
return copied
}
25 changes: 25 additions & 0 deletions snow/engine/common/tracker/peers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/stretchr/testify/require"

"github.com/ava-labs/avalanchego/ids"
"github.com/ava-labs/avalanchego/utils/set"
"github.com/ava-labs/avalanchego/version"
)

Expand Down Expand Up @@ -40,3 +41,27 @@ func TestPeers(t *testing.T) {
require.NoError(p.Disconnected(context.Background(), nodeID))
require.Zero(p.ConnectedWeight())
}

func TestConnectedValidators(t *testing.T) {
require := require.New(t)

nodeID1 := ids.GenerateTestNodeID()
nodeID2 := ids.GenerateTestNodeID()

p := NewPeers()

p.OnValidatorAdded(nodeID1, nil, ids.Empty, 5)
p.OnValidatorAdded(nodeID2, nil, ids.Empty, 6)

require.NoError(p.Connected(context.Background(), nodeID1, version.CurrentApp))
require.Equal(uint64(5), p.ConnectedWeight())

require.NoError(p.Connected(context.Background(), nodeID2, version.CurrentApp))
require.Equal(uint64(11), p.ConnectedWeight())
require.True(set.Of(ids.NodeWeight{Node: nodeID1, Weight: 5}, ids.NodeWeight{Node: nodeID2, Weight: 6}).Equals(p.GetValidators()))
require.True(set.Of(ids.NodeWeight{Node: nodeID1, Weight: 5}, ids.NodeWeight{Node: nodeID2, Weight: 6}).Equals(p.ConnectedValidators()))

require.NoError(p.Disconnected(context.Background(), nodeID2))
require.True(set.Of(ids.NodeWeight{Node: nodeID1, Weight: 5}, ids.NodeWeight{Node: nodeID2, Weight: 6}).Equals(p.GetValidators()))
require.True(set.Of(ids.NodeWeight{Node: nodeID1, Weight: 5}).Equals(p.ConnectedValidators()))
}
59 changes: 59 additions & 0 deletions snow/engine/snowman/engine_decorator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package snowman

import (
"context"
"time"

"go.uber.org/zap"

"github.com/ava-labs/avalanchego/ids"
"github.com/ava-labs/avalanchego/snow/engine/common"
)

type EngineStragglerDetector struct {
Listener func(duration time.Duration)
Time func() time.Time
}

func (ed *EngineStragglerDetector) AttachToEngine(e *Engine) common.Engine {
minConfRatio := float64(e.Params.AlphaConfidence) / float64(e.Params.K)
sd := newStragglerDetector(ed.Time, e.Config.Ctx.Log, minConfRatio, e.Consensus.LastAccepted,
e.Config.ConnectedValidators.ConnectedValidators, e.Config.ConnectedValidators.ConnectedPercent,
e.Consensus.Processing, e.acceptedFrontiers.LastAccepted)
de := &DecoratedEngine{Engine: e}
de.decorate("Chits", func(e *Engine) {
behindDuration := sd.CheckIfWeAreStragglingBehind()
if behindDuration > 0 {
e.Config.Ctx.Log.Info("We are behind the rest of the network", zap.Float64("seconds", behindDuration.Seconds()))
}
e.metrics.stragglingDuration.Set(float64(behindDuration))
ed.Listener(behindDuration)
})

return de
}

type DecoratedEngine struct {
decorations map[string]func(*Engine)

*Engine
}

func (de *DecoratedEngine) decorate(method string, f func(*Engine)) {
if de.decorations == nil {
de.decorations = map[string]func(*Engine){}
}
de.decorations[method] = f
}

func (de *DecoratedEngine) Chits(ctx context.Context, nodeID ids.NodeID, requestID uint32, preferredID ids.ID, preferredIDAtHeight ids.ID, acceptedID ids.ID) error {
f, ok := de.decorations["Chits"]
if !ok {
panic("programming error: decorator for Chits not registered")
}
f(de.Engine)
return de.Engine.Chits(ctx, nodeID, requestID, preferredID, preferredIDAtHeight, acceptedID)
}
75 changes: 75 additions & 0 deletions snow/engine/snowman/engine_decorator_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package snowman

import (
"context"
"testing"
"time"

"github.com/stretchr/testify/require"

"github.com/ava-labs/avalanchego/ids"
"github.com/ava-labs/avalanchego/snow/consensus/snowman"
"github.com/ava-labs/avalanchego/snow/consensus/snowman/snowmantest"
)

func TestEngineStragglerDetector(t *testing.T) {
require := require.New(t)

fakeClock := make(chan time.Time, 1)

conf := DefaultConfig(t)
peerID, _, sender, vm, engine := setup(t, conf)

parent := snowmantest.BuildChild(snowmantest.Genesis)
require.NoError(conf.Consensus.Add(parent))

listenerShouldInvokeWith := []time.Duration{0, 0, time.Second * 2}

esd := &EngineStragglerDetector{
Listener: func(duration time.Duration) {
require.Equal(listenerShouldInvokeWith[0], duration)
listenerShouldInvokeWith = listenerShouldInvokeWith[1:]
},
Time: func() time.Time {
select {
case now := <-fakeClock:
return now
default:
require.Fail("should have a time.Time in the channel")
return time.Time{}
}
},
}

decoratedEngine := esd.AttachToEngine(engine)

vm.GetBlockF = func(_ context.Context, blkID ids.ID) (snowman.Block, error) {
switch blkID {
case snowmantest.GenesisID:
return snowmantest.Genesis, nil
default:
return nil, errUnknownBlock
}
}

sender.SendGetF = func(_ context.Context, _ ids.NodeID, _ uint32, _ ids.ID) {
}
vm.ParseBlockF = func(_ context.Context, _ []byte) (snowman.Block, error) {
require.FailNow("should not be called")
return nil, nil
}

now := time.Now()
fakeClock <- now
require.NoError(decoratedEngine.Chits(context.Background(), peerID, 0, parent.ID(), parent.ID(), parent.ID()))
now = now.Add(time.Second * 2)
fakeClock <- now
require.NoError(decoratedEngine.Chits(context.Background(), peerID, 0, parent.ID(), parent.ID(), parent.ID()))
now = now.Add(time.Second * 2)
fakeClock <- now
require.NoError(decoratedEngine.Chits(context.Background(), peerID, 0, parent.ID(), parent.ID(), parent.ID()))
require.Empty(listenerShouldInvokeWith)
}
6 changes: 6 additions & 0 deletions snow/engine/snowman/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ type metrics struct {
numBlocked prometheus.Gauge
numBlockers prometheus.Gauge
numNonVerifieds prometheus.Gauge
stragglingDuration prometheus.Gauge
numBuilt prometheus.Counter
numBuildsFailed prometheus.Counter
numUselessPutBytes prometheus.Counter
Expand All @@ -41,6 +42,10 @@ type metrics struct {
func newMetrics(reg prometheus.Registerer) (*metrics, error) {
errs := wrappers.Errs{}
m := &metrics{
stragglingDuration: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "straggling_duration",
Help: "For how long we have been straggling behind the rest, in nano-seconds.",
}),
bootstrapFinished: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "bootstrap_finished",
Help: "Whether or not bootstrap process has completed. 1 is success, 0 is fail or ongoing.",
Expand Down Expand Up @@ -128,6 +133,7 @@ func newMetrics(reg prometheus.Registerer) (*metrics, error) {
m.issued.WithLabelValues(unknownSource)

errs.Add(
reg.Register(m.stragglingDuration),
reg.Register(m.bootstrapFinished),
reg.Register(m.numRequests),
reg.Register(m.numBlocked),
Expand Down
Loading

0 comments on commit 91cc26e

Please sign in to comment.