-
Notifications
You must be signed in to change notification settings - Fork 179
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[EFM] Recoverable Random Beacon State Machine #6771
Changes from 35 commits
100a92b
719b6a6
01bbdcf
1d3c6a4
6c34886
8aac526
064b651
6bcaf38
c13c7f6
489c871
10aab93
fb28249
fb161f7
650b8b8
2386623
bfe95b0
79aac04
1119c8c
c8dfca6
c9182c5
577712a
7e728aa
1fa11c6
a92d8b7
f0be4ba
62d399d
bea9c1a
89005a8
0b9f732
5a98f76
b639742
be18c57
d0df4fb
0f98cf7
6ea64d6
6f7017a
ea0f412
b9bc92e
8f15c29
43d6a63
9302497
550fd3f
7053c88
0687e30
8226642
66f67d4
56b1a63
c0c6b7e
00f952a
21ff3ef
12a633a
aa377c2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -154,24 +154,24 @@ func (e *ReactorEngine) startDKGForEpoch(currentEpochCounter uint64, first *flow | |
Hex("first_block_id", firstID[:]). // id of first block in EpochSetup phase | ||
Logger() | ||
|
||
// if we have started the dkg for this epoch already, exit | ||
// if we have dkgState the dkg for this epoch already, exit | ||
durkmurder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
started, err := e.dkgState.GetDKGStarted(nextEpochCounter) | ||
if err != nil { | ||
// unexpected storage-level error | ||
// TODO use irrecoverable context | ||
log.Fatal().Err(err).Msg("could not check whether DKG is started") | ||
log.Fatal().Err(err).Msg("could not check whether DKG is dkgState") | ||
} | ||
if started { | ||
log.Warn().Msg("DKG started before, skipping starting the DKG for this epoch") | ||
return | ||
} | ||
|
||
// flag that we are starting the dkg for this epoch | ||
err = e.dkgState.SetDKGStarted(nextEpochCounter) | ||
err = e.dkgState.SetDKGState(nextEpochCounter, flow.DKGStateStarted) | ||
if err != nil { | ||
// unexpected storage-level error | ||
// TODO use irrecoverable context | ||
log.Fatal().Err(err).Msg("could not set dkg started") | ||
log.Fatal().Err(err).Msg("could not set dkg dkgState") | ||
durkmurder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
curDKGInfo, err := e.getDKGInfo(firstID) | ||
|
@@ -271,9 +271,13 @@ func (e *ReactorEngine) handleEpochCommittedPhaseStarted(currentEpochCounter uin | |
// This can happen if the DKG failed locally, if we failed to generate | ||
// a local private beacon key, or if we crashed while performing this | ||
// check previously. | ||
endState, err := e.dkgState.GetDKGEndState(nextEpochCounter) | ||
if err == nil { | ||
log.Warn().Msgf("checking beacon key consistency: exiting because dkg end state was already set: %s", endState.String()) | ||
currentState, err := e.dkgState.GetDKGState(nextEpochCounter) | ||
if err != nil { | ||
log.Fatal().Err(err).Msg("failed to get dkg state, by this point it should have been set") | ||
return | ||
} | ||
if currentState != flow.DKGStateCompleted { | ||
log.Warn().Msgf("checking beacon key consistency: exiting because dkg didn't reach completed state: %s", currentState.String()) | ||
return | ||
} | ||
|
||
|
@@ -289,10 +293,10 @@ func (e *ReactorEngine) handleEpochCommittedPhaseStarted(currentEpochCounter uin | |
return | ||
} | ||
|
||
myBeaconPrivKey, err := e.dkgState.RetrieveMyBeaconPrivateKey(nextEpochCounter) | ||
myBeaconPrivKey, err := e.dkgState.UnsafeRetrieveMyBeaconPrivateKey(nextEpochCounter) | ||
if errors.Is(err, storage.ErrNotFound) { | ||
log.Warn().Msg("checking beacon key consistency: no key found") | ||
err := e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateNoKey) | ||
err := e.dkgState.SetDKGState(nextEpochCounter, flow.DKGStateFailure) | ||
if err != nil { | ||
// TODO use irrecoverable context | ||
log.Fatal().Err(err).Msg("failed to set dkg end state") | ||
|
@@ -319,15 +323,15 @@ func (e *ReactorEngine) handleEpochCommittedPhaseStarted(currentEpochCounter uin | |
Str("computed_beacon_pub_key", localPubKey.String()). | ||
Str("canonical_beacon_pub_key", nextDKGPubKey.String()). | ||
Msg("checking beacon key consistency: locally computed beacon public key does not match beacon public key for next epoch") | ||
err := e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateInconsistentKey) | ||
err := e.dkgState.SetDKGState(nextEpochCounter, flow.DKGStateFailure) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Intuitively it seems to be a too relaxed behavior. For failure states we allow self-transitions, for everything else where we deviate from happy path and get an unexpected error I would be inclined to return an error so we can figure out what was wrong. For your particular scenario I am not very worried, it means that operator has to try again. |
||
if err != nil { | ||
// TODO use irrecoverable context | ||
log.Fatal().Err(err).Msg("failed to set dkg end state") | ||
} | ||
return | ||
} | ||
|
||
err = e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateSuccess) | ||
err = e.dkgState.SetDKGState(nextEpochCounter, flow.RandomBeaconKeyCommitted) | ||
if err != nil { | ||
// TODO use irrecoverable context | ||
e.log.Fatal().Err(err).Msg("failed to set dkg end state") | ||
|
@@ -427,10 +431,10 @@ func (e *ReactorEngine) end(nextEpochCounter uint64) func() error { | |
// has already abandoned the happy path, because on the happy path the ReactorEngine is the only writer. | ||
// Then this function just stops and returns without error. | ||
e.log.Warn().Err(err).Msgf("node %s with index %d failed DKG locally", e.me.NodeID(), e.controller.GetIndex()) | ||
err := e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateDKGFailure) | ||
err := e.dkgState.SetDKGState(nextEpochCounter, flow.DKGStateFailure) | ||
if err != nil { | ||
if errors.Is(err, storage.ErrAlreadyExists) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The comment above is outdated now, but it implies that we might already have persisted a failure state:
The state machine does not allow transitions from one state to itself (except for Suggestions:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have allowed transition failure -> failure to handle such situations since the timing might be tricky and I have also added updated comment regarding possible error return. Let me know what do you think: https://github.com/onflow/flow-go/pull/6771/files/43d6a63349788084c057a42134326dcb4e721ad5..550fd3f739237b9fb241f13a69b19de5b7ce56b5 |
||
return nil // DKGEndState already being set is expected in case of epoch recovery | ||
return nil // DKGState already being set is expected in case of epoch recovery | ||
} | ||
return fmt.Errorf("failed to set dkg end state following dkg end error: %w", err) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If that is easy to do, I'd prefer if we can check that the random beacon key matches the information in the Epoch Commit event ... just to be safe against human configuration errors
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will take care of this in follow up PR.