-
Notifications
You must be signed in to change notification settings - Fork 674
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Detect node is behind #3373
base: master
Are you sure you want to change the base?
Detect node is behind #3373
Changes from all commits
77ff361
dac0382
4d014c0
0228a65
eb68c75
438768d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. | ||
// See the file LICENSE for licensing terms. | ||
|
||
package ids | ||
|
||
type NodeWeight struct { | ||
ID NodeID | ||
Weight uint64 | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,6 @@ import ( | |
"sync" | ||
|
||
"github.com/prometheus/client_golang/prometheus" | ||
"golang.org/x/exp/maps" | ||
|
||
"github.com/ava-labs/avalanchego/ids" | ||
"github.com/ava-labs/avalanchego/snow/validators" | ||
|
@@ -37,10 +36,10 @@ type Peers interface { | |
SampleValidator() (ids.NodeID, bool) | ||
// GetValidators returns the set of all validators | ||
// known to this peer manager | ||
GetValidators() set.Set[ids.NodeID] | ||
GetValidators() set.Set[ids.NodeWeight] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we return a map instead of set if we need both? The nodeIDs should be unique within the set, but this change makes it possible to add the nodeID twice with two different weights. This shouldn't happen because we're creating it on demand, but it seems like an odd use of set to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it make sense to use the validator set type on the snowman config type rather than making this change to |
||
// ConnectedValidators returns the set of all validators | ||
// that are currently connected | ||
ConnectedValidators() set.Set[ids.NodeID] | ||
ConnectedValidators() set.Set[ids.NodeWeight] | ||
} | ||
|
||
type lockedPeers struct { | ||
|
@@ -112,14 +111,14 @@ func (p *lockedPeers) SampleValidator() (ids.NodeID, bool) { | |
return p.peers.SampleValidator() | ||
} | ||
|
||
func (p *lockedPeers) GetValidators() set.Set[ids.NodeID] { | ||
func (p *lockedPeers) GetValidators() set.Set[ids.NodeWeight] { | ||
p.lock.RLock() | ||
defer p.lock.RUnlock() | ||
|
||
return p.peers.GetValidators() | ||
} | ||
|
||
func (p *lockedPeers) ConnectedValidators() set.Set[ids.NodeID] { | ||
func (p *lockedPeers) ConnectedValidators() set.Set[ids.NodeWeight] { | ||
p.lock.RLock() | ||
defer p.lock.RUnlock() | ||
|
||
|
@@ -272,14 +271,21 @@ func (p *peerData) SampleValidator() (ids.NodeID, bool) { | |
return p.connectedValidators.Peek() | ||
} | ||
|
||
func (p *peerData) GetValidators() set.Set[ids.NodeID] { | ||
return set.Of(maps.Keys(p.validators)...) | ||
func (p *peerData) GetValidators() set.Set[ids.NodeWeight] { | ||
res := set.NewSet[ids.NodeWeight](len(p.validators)) | ||
for k, v := range p.validators { | ||
res.Add(ids.NodeWeight{ID: k, Weight: v}) | ||
} | ||
return res | ||
} | ||
|
||
func (p *peerData) ConnectedValidators() set.Set[ids.NodeID] { | ||
func (p *peerData) ConnectedValidators() set.Set[ids.NodeWeight] { | ||
ceyonur marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// The set is copied to avoid future changes from being reflected in the | ||
// returned set. | ||
copied := set.NewSet[ids.NodeID](len(p.connectedValidators)) | ||
copied.Union(p.connectedValidators) | ||
copied := set.NewSet[ids.NodeWeight](len(p.connectedValidators)) | ||
for _, vdrID := range p.connectedValidators.List() { | ||
weight := p.validators[vdrID] | ||
copied.Add(ids.NodeWeight{ID: vdrID, Weight: weight}) | ||
} | ||
return copied | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. | ||
// See the file LICENSE for licensing terms. | ||
|
||
package snowman | ||
|
||
import ( | ||
"context" | ||
"time" | ||
|
||
"go.uber.org/zap" | ||
|
||
"github.com/ava-labs/avalanchego/ids" | ||
"github.com/ava-labs/avalanchego/snow/engine/common" | ||
) | ||
|
||
type decoratedEngineWithStragglerDetector struct { | ||
*Engine | ||
sd *stragglerDetector | ||
f func(time.Duration) | ||
} | ||
|
||
func NewDecoratedEngineWithStragglerDetector(e *Engine, time func() time.Time, f func(time.Duration)) common.Engine { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, I changed the tests to use However I kept the function pointer |
||
minConfRatio := float64(e.Params.AlphaConfidence) / float64(e.Params.K) | ||
|
||
subnet := e.Ctx.SubnetID | ||
|
||
sa := &snapshotAnalyzer{ | ||
lastAcceptedHeight: onlyHeight(e.Consensus.LastAccepted), | ||
log: e.Config.Ctx.Log, | ||
} | ||
|
||
s := &snapshotter{ | ||
totalWeight: func() (uint64, error) { | ||
return e.Validators.TotalWeight(subnet) | ||
}, | ||
log: e.Config.Ctx.Log, | ||
connectedValidators: e.Config.ConnectedValidators.ConnectedValidators, | ||
minConfirmationThreshold: minConfRatio, | ||
lastAcceptedHeightByNodeID: e.acceptedFrontiers.LastAccepted, | ||
} | ||
|
||
conf := stragglerDetectorConfig{ | ||
getSnapshot: s.getNetworkSnapshot, | ||
areWeBehindTheRest: sa.areWeBehindTheRest, | ||
minStragglerCheckInterval: minStragglerCheckInterval, | ||
log: e.Config.Ctx.Log, | ||
getTime: time, | ||
} | ||
|
||
sd := newStragglerDetector(conf) | ||
|
||
return &decoratedEngineWithStragglerDetector{ | ||
Engine: e, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any reason we need Engine as a struct rather than interface? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. because I need its accepted frontiers, which is a field in the struct, and not in the interface. |
||
f: f, | ||
sd: sd, | ||
} | ||
} | ||
|
||
func (de *decoratedEngineWithStragglerDetector) Chits(ctx context.Context, nodeID ids.NodeID, requestID uint32, preferredID ids.ID, preferredIDAtHeight ids.ID, acceptedID ids.ID, acceptedHeight uint64) error { | ||
behindDuration := de.sd.CheckIfWeAreStragglingBehind() | ||
if behindDuration > 0 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess this won't return any small numbers (like 1 ms) and making this too spammy. but if it does, maybe we can consider adding a threshold. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The implementation of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we add a todo in the code? thanks. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I changed to 10 instead of adding a TODO. |
||
de.Engine.Config.Ctx.Log.Info("We are behind the rest of the network", zap.Float64("seconds", behindDuration.Seconds())) | ||
} | ||
de.Engine.metrics.stragglingDuration.Set(float64(behindDuration)) | ||
de.f(behindDuration) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this for testing only? If so, I feel like this hook is equivalent of directly testing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not for testing. This is a decorator, so it invokes the method it decorates but also invokes a function, which is pluggable. The pluggable function it invokes, if this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still slightly think this is a bit extra (since we don't actually use it in production). but if you think we should keep it, can we rename it and add comment so that it's visible for consumers? |
||
return de.Engine.Chits(ctx, nodeID, requestID, preferredID, preferredIDAtHeight, acceptedID, acceptedHeight) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. | ||
// See the file LICENSE for licensing terms. | ||
|
||
package snowman | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
"time" | ||
|
||
"github.com/stretchr/testify/require" | ||
|
||
"github.com/ava-labs/avalanchego/ids" | ||
"github.com/ava-labs/avalanchego/snow/consensus/snowman" | ||
"github.com/ava-labs/avalanchego/snow/consensus/snowman/snowmantest" | ||
"github.com/ava-labs/avalanchego/utils/timer/mockable" | ||
) | ||
|
||
func TestEngineStragglerDetector(t *testing.T) { | ||
require := require.New(t) | ||
|
||
var fakeClock mockable.Clock | ||
|
||
conf := DefaultConfig(t) | ||
peerID, _, sender, vm, engine := setup(t, conf) | ||
|
||
parent := snowmantest.BuildChild(snowmantest.Genesis) | ||
require.NoError(conf.Consensus.Add(parent)) | ||
|
||
listenerShouldInvokeWith := []time.Duration{0, 0, minStragglerCheckInterval * 2} | ||
|
||
f := func(duration time.Duration) { | ||
require.Equal(listenerShouldInvokeWith[0], duration) | ||
listenerShouldInvokeWith = listenerShouldInvokeWith[1:] | ||
} | ||
|
||
decoratedEngine := NewDecoratedEngineWithStragglerDetector(engine, fakeClock.Time, f) | ||
|
||
vm.GetBlockF = func(_ context.Context, blkID ids.ID) (snowman.Block, error) { | ||
switch blkID { | ||
case snowmantest.GenesisID: | ||
return snowmantest.Genesis, nil | ||
default: | ||
return nil, errUnknownBlock | ||
} | ||
} | ||
|
||
sender.SendGetF = func(_ context.Context, _ ids.NodeID, _ uint32, _ ids.ID) { | ||
} | ||
vm.ParseBlockF = func(_ context.Context, _ []byte) (snowman.Block, error) { | ||
require.FailNow("should not be called") | ||
return nil, nil | ||
} | ||
|
||
now := time.Now() | ||
fakeClock.Set(now) | ||
require.NoError(decoratedEngine.Chits(context.Background(), peerID, 0, parent.ID(), parent.ID(), parent.ID(), 100)) | ||
now = now.Add(minStragglerCheckInterval * 2) | ||
fakeClock.Set(now) | ||
require.NoError(decoratedEngine.Chits(context.Background(), peerID, 0, parent.ID(), parent.ID(), parent.ID(), 100)) | ||
now = now.Add(minStragglerCheckInterval * 2) | ||
fakeClock.Set(now) | ||
require.NoError(decoratedEngine.Chits(context.Background(), peerID, 0, parent.ID(), parent.ID(), parent.ID(), 100)) | ||
require.Empty(listenerShouldInvokeWith) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ type metrics struct { | |
numBlocked prometheus.Gauge | ||
numBlockers prometheus.Gauge | ||
numNonVerifieds prometheus.Gauge | ||
stragglingDuration prometheus.Gauge | ||
numBuilt prometheus.Counter | ||
numBuildsFailed prometheus.Counter | ||
numUselessPutBytes prometheus.Counter | ||
|
@@ -41,6 +42,10 @@ type metrics struct { | |
func newMetrics(reg prometheus.Registerer) (*metrics, error) { | ||
errs := wrappers.Errs{} | ||
m := &metrics{ | ||
stragglingDuration: prometheus.NewGauge(prometheus.GaugeOpts{ | ||
Name: "straggling_duration", | ||
Help: "For how long we have been straggling behind the rest, in nano-seconds.", | ||
}), | ||
Comment on lines
+45
to
+48
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: could we put this at the end ? |
||
bootstrapFinished: prometheus.NewGauge(prometheus.GaugeOpts{ | ||
Name: "bootstrap_finished", | ||
Help: "Whether or not bootstrap process has completed. 1 is success, 0 is fail or ongoing.", | ||
|
@@ -128,6 +133,7 @@ func newMetrics(reg prometheus.Registerer) (*metrics, error) { | |
m.issued.WithLabelValues(unknownSource) | ||
|
||
errs.Add( | ||
reg.Register(m.stragglingDuration), | ||
reg.Register(m.bootstrapFinished), | ||
reg.Register(m.numRequests), | ||
reg.Register(m.numBlocked), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we include a comment on why the anonymous function here is empty?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why make this a decorator instead of part of the engine?