From c93c56871bf29ad8b9a64f47ded0a54f67d67147 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 1 Jun 2023 13:55:46 +0300 Subject: [PATCH 01/56] Added config flags and connect circuit breaker. --- cmd/access/node_builder/access_node_builder.go | 13 +++++++++++++ .../access/rpc/backend/connection_factory.go | 18 +++++++++++++++--- engine/access/rpc/engine.go | 2 ++ go.mod | 1 + go.sum | 2 ++ 5 files changed, 33 insertions(+), 3 deletions(-) diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index 66355eaed39..b44f8592526 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -158,6 +158,11 @@ func DefaultAccessNodeConfig() *AccessNodeConfig { FixedExecutionNodeIDs: nil, ArchiveAddressList: nil, MaxMsgSize: grpcutils.DefaultMaxMsgSize, + CircuitBreakerConfig: backend.CircuitBreakerConfig{ + CircuitBreakerEnabled: false, + RestoreTimeout: time.Duration(60) * time.Second, + MaxRequestToBreak: 5, + }, }, stateStreamConf: state_stream.Config{ MaxExecutionDataMsgSize: grpcutils.DefaultMaxMsgSize, @@ -644,6 +649,9 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { flags.StringToIntVar(&builder.apiBurstlimits, "api-burst-limits", defaultConfig.apiBurstlimits, "burst limits for Access API methods e.g. 
Ping=100,GetTransaction=100 etc.") flags.BoolVar(&builder.supportsObserver, "supports-observer", defaultConfig.supportsObserver, "true if this staked access node supports observer or follower connections") flags.StringVar(&builder.PublicNetworkConfig.BindAddress, "public-network-address", defaultConfig.PublicNetworkConfig.BindAddress, "staked access node's public network bind address") + flags.BoolVar(&builder.rpcConf.CircuitBreakerConfig.CircuitBreakerEnabled, "circuit-breaker-enabled", defaultConfig.rpcConf.CircuitBreakerConfig.CircuitBreakerEnabled, "whether to enable the circuit breaker for collection and execution node connections") + flags.DurationVar(&builder.rpcConf.CircuitBreakerConfig.RestoreTimeout, "circuit-breaker-restore-timeout", defaultConfig.rpcConf.CircuitBreakerConfig.RestoreTimeout, "initial timeout for circuit breaker to try connect again. Default value is 60s") + flags.Uint32Var(&builder.rpcConf.CircuitBreakerConfig.MaxRequestToBreak, "circuit-breaker-max-request-to-break", defaultConfig.rpcConf.CircuitBreakerConfig.MaxRequestToBreak, "number of consecutive failures to break connection. 
Default value is 5") // ExecutionDataRequester config flags.BoolVar(&builder.executionDataSyncEnabled, "execution-data-sync-enabled", defaultConfig.executionDataSyncEnabled, "whether to enable the execution data sync protocol") @@ -704,6 +712,11 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { } } } + if builder.rpcConf.CircuitBreakerConfig.CircuitBreakerEnabled { + if builder.rpcConf.CircuitBreakerConfig.MaxRequestToBreak == 0 { + return errors.New("circuit-breaker-max-request-to-break must be greater than 0") + } + } return nil }) diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 63ead3d3e32..f56ddc55471 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -12,6 +12,7 @@ import ( "github.com/onflow/flow/protobuf/go/flow/access" "github.com/onflow/flow/protobuf/go/flow/execution" "github.com/rs/zerolog" + "github.com/sony/gobreaker" "google.golang.org/grpc" "google.golang.org/grpc/connectivity" "google.golang.org/grpc/credentials/insecure" @@ -61,6 +62,14 @@ type ConnectionFactoryImpl struct { AccessMetrics module.AccessMetrics Log zerolog.Logger mutex sync.Mutex + CircuitBreakerConfig CircuitBreakerConfig +} + +// TODO: describe +type CircuitBreakerConfig struct { + CircuitBreakerEnabled bool + RestoreTimeout time.Duration + MaxRequestToBreak uint32 } type CachedClient struct { @@ -250,7 +259,7 @@ func getGRPCAddress(address string, grpcPort uint) (string, error) { } func WithClientUnaryInterceptor(timeout time.Duration) grpc.DialOption { - + circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{}) clientTimeoutInterceptor := func( ctx context.Context, method string, @@ -265,9 +274,12 @@ func WithClientUnaryInterceptor(timeout time.Duration) grpc.DialOption { ctxWithTimeout, cancel := context.WithTimeout(ctx, timeout) defer cancel() + _, err := circuitBreaker.Execute(func() (interface{}, error) { + // call the 
remote GRPC using the short context + err := invoker(ctxWithTimeout, method, req, reply, cc, opts...) - // call the remote GRPC using the short context - err := invoker(ctxWithTimeout, method, req, reply, cc, opts...) + return nil, err + }) return err } diff --git a/engine/access/rpc/engine.go b/engine/access/rpc/engine.go index 76df14a2127..ae25ecfcb76 100644 --- a/engine/access/rpc/engine.go +++ b/engine/access/rpc/engine.go @@ -48,6 +48,7 @@ type Config struct { PreferredExecutionNodeIDs []string // preferred list of upstream execution node IDs FixedExecutionNodeIDs []string // fixed list of execution node IDs to choose from if no node node ID can be chosen from the PreferredExecutionNodeIDs ArchiveAddressList []string // the archive node address list to send script executions. when configured, script executions will be all sent to the archive node + CircuitBreakerConfig backend.CircuitBreakerConfig //TODO: } // Engine exposes the server with a simplified version of the Access API. @@ -171,6 +172,7 @@ func NewBuilder(log zerolog.Logger, MaxMsgSize: config.MaxMsgSize, AccessMetrics: accessMetrics, Log: log, + CircuitBreakerConfig: config.CircuitBreakerConfig, } backend := backend.New(state, diff --git a/go.mod b/go.mod index 602fb4c15fd..6e2a08a10de 100644 --- a/go.mod +++ b/go.mod @@ -242,6 +242,7 @@ require ( github.com/psiemens/sconfig v0.1.0 // indirect github.com/raulk/go-watchdog v1.3.0 // indirect github.com/rivo/uniseg v0.2.1-0.20211004051800-57c86be7915a // indirect + github.com/sony/gobreaker v0.5.0 // indirect github.com/spacemonkeygo/spacelog v0.0.0-20180420211403-2296661a0572 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/spf13/afero v1.9.0 // indirect diff --git a/go.sum b/go.sum index ed305eed14f..16dd9935de9 100644 --- a/go.sum +++ b/go.sum @@ -1421,6 +1421,8 @@ github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9 github.com/smola/gocompat v0.2.0/go.mod 
h1:1B0MlxbmoZNo3h8guHp8HztB3BSYR5itql9qtVc0ypY= github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= github.com/sony/gobreaker v0.4.1/go.mod h1:ZKptC7FHNvhBz7dN2LGjPVBz2sZJmc0/PkyDJOjmxWY= +github.com/sony/gobreaker v0.5.0 h1:dRCvqm0P490vZPmy7ppEk2qCnCieBooFJ+YoXGYB+yg= +github.com/sony/gobreaker v0.5.0/go.mod h1:ZKptC7FHNvhBz7dN2LGjPVBz2sZJmc0/PkyDJOjmxWY= github.com/sourcegraph/annotate v0.0.0-20160123013949-f4cad6c6324d/go.mod h1:UdhH50NIW0fCiwBSr0co2m7BnFLdv4fQTgdqdJTHFeE= github.com/sourcegraph/syntaxhighlight v0.0.0-20170531221838-bd320f5d308e/go.mod h1:HuIsMU8RRBOtsCgI77wP899iHVBQpCmg4ErYMZB+2IA= github.com/spacemonkeygo/openssl v0.0.0-20181017203307-c2dcc5cca94a/go.mod h1:7AyxJNCJ7SBZ1MfVQCWD6Uqo2oubI2Eq2y2eqf+A5r0= From 3f37d4fea94bba9b657c2ecfc0b2182e6c90918f Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 1 Jun 2023 15:01:25 +0300 Subject: [PATCH 02/56] Created CB based on configuration. --- .../rpc/backend/node_connection_guard.go | 243 ++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 engine/access/rpc/backend/node_connection_guard.go diff --git a/engine/access/rpc/backend/node_connection_guard.go b/engine/access/rpc/backend/node_connection_guard.go new file mode 100644 index 00000000000..a46029219a1 --- /dev/null +++ b/engine/access/rpc/backend/node_connection_guard.go @@ -0,0 +1,243 @@ +package backend + +import ( + "context" + "fmt" + "github.com/onflow/flow-go/storage" + "time" + + "github.com/rs/zerolog" + "github.com/sony/gobreaker" + + "github.com/onflow/flow-go/model/flow" + "github.com/onflow/flow-go/model/flow/filter" + "github.com/onflow/flow-go/state/protocol" +) + +type NodeSelector interface { + GetExecutionNodesForBlockID(ctx context.Context, blockID flow.Identifier) (flow.IdentityList, error) + GetCollectionNodes(txID flow.Identifier) ([]string, error) +} + +type NodeConnectionGuard struct { + state protocol.State + executionReceipts storage.ExecutionReceipts + log 
zerolog.Logger + circuitBreaker *gobreaker.CircuitBreaker + connectionFactory ConnectionFactory +} + +var _ NodeSelector = (*NodeConnectionGuard)(nil) + +func NewNodeConnectionGuard(connectionFactory ConnectionFactory, state protocol.State, executionReceipts storage.ExecutionReceipts, log zerolog.Logger) NodeConnectionGuard { + return NodeConnectionGuard{ + state: state, + executionReceipts: executionReceipts, + log: log, + circuitBreaker: gobreaker.NewCircuitBreaker(gobreaker.Settings{}), + connectionFactory: connectionFactory, + } +} + +func (ncg *NodeConnectionGuard) Invoke(req func() (interface{}, error)) (interface{}, error) { + result, err := ncg.circuitBreaker.Execute(req) + return result, err +} + +func (ncg *NodeConnectionGuard) GetCollectionNodes(txId flow.Identifier) ([]string, error) { + // retrieve the set of collector clusters + clusters, err := ncg.state.Final().Epochs().Current().Clustering() + if err != nil { + return nil, fmt.Errorf("could not cluster collection nodes: %w", err) + } + + // get the cluster responsible for the transaction + txCluster, ok := clusters.ByTxID(txId) + if !ok { + return nil, fmt.Errorf("could not get local cluster by txID: %x", txId) + } + + // select a random subset of collection nodes from the cluster to be tried in order + //TODO: Change to cb selection of nodes. + targetNodes := txCluster.Sample(3) + + // collect the addresses of all the chosen collection nodes + var targetAddrs = make([]string, len(targetNodes)) + for i, id := range targetNodes { + targetAddrs[i] = id.Address + } + + return targetAddrs, nil +} + +// GetExecutionNodesForBlockID returns upto maxExecutionNodesCnt number of randomly chosen execution node identities +// which have executed the given block ID. +// If no such execution node is found, an InsufficientExecutionReceipts error is returned. 
+func (ncg *NodeConnectionGuard) GetExecutionNodesForBlockID( + ctx context.Context, + blockID flow.Identifier) (flow.IdentityList, error) { + + var executorIDs flow.IdentifierList + + // check if the block ID is of the root block. If it is then don't look for execution receipts since they + // will not be present for the root block. + rootBlock, err := ncg.state.Params().Root() + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } + + if rootBlock.ID() == blockID { + executorIdentities, err := ncg.state.Final().Identities(filter.HasRole(flow.RoleExecution)) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } + executorIDs = executorIdentities.NodeIDs() + } else { + // try to find atleast minExecutionNodesCnt execution node ids from the execution receipts for the given blockID + for attempt := 0; attempt < maxAttemptsForExecutionReceipt; attempt++ { + executorIDs, err = ncg.findAllExecutionNodes(blockID) + if err != nil { + return nil, err + } + + if len(executorIDs) >= minExecutionNodesCnt { + break + } + + // log the attempt + ncg.log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). + Int("execution_receipts_found", len(executorIDs)). + Str("block_id", blockID.String()). 
+ Msg("insufficient execution receipts") + + // if one or less execution receipts may have been received then re-query + // in the hope that more might have been received by now + //TODO: Should be removed + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(100 * time.Millisecond << time.Duration(attempt)): + //retry after an exponential backoff + } + } + + receiptCnt := len(executorIDs) + // if less than minExecutionNodesCnt execution receipts have been received so far, then return random ENs + if receiptCnt < minExecutionNodesCnt { + newExecutorIDs, err := ncg.state.AtBlockID(blockID).Identities(filter.HasRole(flow.RoleExecution)) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } + executorIDs = newExecutorIDs.NodeIDs() + } + } + + // choose from the preferred or fixed execution nodes + subsetENs, err := ncg.chooseExecutionNodes(executorIDs) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } + + // randomly choose upto maxExecutionNodesCnt identities + executionIdentitiesRandom := subsetENs.Sample(maxExecutionNodesCnt) + + if len(executionIdentitiesRandom) == 0 { + return nil, fmt.Errorf("no matching execution node found for block ID %v", blockID) + } + + return executionIdentitiesRandom, nil +} + +// findAllExecutionNodes find all the execution nodes ids from the execution receipts that have been received for the +// given blockID +func (ncg *NodeConnectionGuard) findAllExecutionNodes( + blockID flow.Identifier) (flow.IdentifierList, error) { + + // lookup the receipt's storage with the block ID + allReceipts, err := ncg.executionReceipts.ByBlockID(blockID) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution receipts for block ID %v: %w", blockID, err) + } + + executionResultMetaList := make(flow.ExecutionReceiptMetaList, 0, len(allReceipts)) + for _, r := range allReceipts { + 
executionResultMetaList = append(executionResultMetaList, r.Meta()) + } + executionResultGroupedMetaList := executionResultMetaList.GroupByResultID() + + // maximum number of matching receipts found so far for any execution result id + maxMatchedReceiptCnt := 0 + // execution result id key for the highest number of matching receipts in the identicalReceipts map + var maxMatchedReceiptResultID flow.Identifier + + // find the largest list of receipts which have the same result ID + for resultID, executionReceiptList := range executionResultGroupedMetaList { + currentMatchedReceiptCnt := executionReceiptList.Size() + if currentMatchedReceiptCnt > maxMatchedReceiptCnt { + maxMatchedReceiptCnt = currentMatchedReceiptCnt + maxMatchedReceiptResultID = resultID + } + } + + // if there are more than one execution result for the same block ID, log as error + if executionResultGroupedMetaList.NumberGroups() > 1 { + identicalReceiptsStr := fmt.Sprintf("%v", flow.GetIDs(allReceipts)) + ncg.log.Error(). + Str("block_id", blockID.String()). + Str("execution_receipts", identicalReceiptsStr). + Msg("execution receipt mismatch") + } + + // pick the largest list of matching receipts + matchingReceiptMetaList := executionResultGroupedMetaList.GetGroup(maxMatchedReceiptResultID) + + metaReceiptGroupedByExecutorID := matchingReceiptMetaList.GroupByExecutorID() + + // collect all unique execution node ids from the receipts + var executorIDs flow.IdentifierList + for executorID := range metaReceiptGroupedByExecutorID { + executorIDs = append(executorIDs, executorID) + } + + return executorIDs, nil +} + +// chooseExecutionNodes finds the subset of execution nodes defined in the identity table by first +// choosing the preferred execution nodes which have executed the transaction. 
If no such preferred +// execution nodes are found, then the fixed execution nodes defined in the identity table are returned +// If neither preferred nor fixed nodes are defined, then all execution node matching the executor IDs are returned. +// e.g. If execution nodes in identity table are {1,2,3,4}, preferred ENs are defined as {2,3,4} +// and the executor IDs is {1,2,3}, then {2, 3} is returned as the chosen subset of ENs +func (ncg *NodeConnectionGuard) chooseExecutionNodes(executorIDs flow.IdentifierList) (flow.IdentityList, error) { + + allENs, err := ncg.state.Final().Identities(filter.HasRole(flow.RoleExecution)) + if err != nil { + return nil, fmt.Errorf("failed to retreive all execution IDs: %w", err) + } + + // first try and choose from the preferred EN IDs + var chosenIDs flow.IdentityList + if len(preferredENIdentifiers) > 0 { + // find the preferred execution node IDs which have executed the transaction + chosenIDs = allENs.Filter(filter.And(filter.HasNodeID(preferredENIdentifiers...), + filter.HasNodeID(executorIDs...))) + if len(chosenIDs) > 0 { + return chosenIDs, nil + } + } + + // if no preferred EN ID is found, then choose from the fixed EN IDs + if len(fixedENIdentifiers) > 0 { + // choose fixed ENs which have executed the transaction + chosenIDs = allENs.Filter(filter.And(filter.HasNodeID(fixedENIdentifiers...), filter.HasNodeID(executorIDs...))) + if len(chosenIDs) > 0 { + return chosenIDs, nil + } + // if no such ENs are found then just choose all fixed ENs + chosenIDs = allENs.Filter(filter.HasNodeID(fixedENIdentifiers...)) + return chosenIDs, nil + } + + // If no preferred or fixed ENs have been specified, then return all executor IDs i.e. no preference at all + return allENs.Filter(filter.HasNodeID(executorIDs...)), nil +} From 41e1b99ef3a1f6804d822f88eeb6c009bb47442c Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 1 Jun 2023 15:02:42 +0300 Subject: [PATCH 03/56] Created CB based on configuration. 
--- apiproxy/access_api_proxy.go | 8 +++- .../node_builder/access_node_builder.go | 12 +++--- engine/access/apiproxy/access_api_proxy.go | 8 +++- .../access/rpc/backend/connection_factory.go | 41 ++++++++++++++----- 4 files changed, 48 insertions(+), 21 deletions(-) diff --git a/apiproxy/access_api_proxy.go b/apiproxy/access_api_proxy.go index 8e0b781af5e..d54b1dab483 100644 --- a/apiproxy/access_api_proxy.go +++ b/apiproxy/access_api_proxy.go @@ -86,7 +86,9 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(grpcutils.DefaultMaxMsgSize)), grpc.WithInsecure(), //nolint:staticcheck - backend.WithClientUnaryInterceptor(timeout)) + backend.WithClientUnaryInterceptor(timeout, backend.CircuitBreakerConfig{ + Enabled: false, + })) if err != nil { return err } @@ -100,7 +102,9 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(grpcutils.DefaultMaxMsgSize)), grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)), - backend.WithClientUnaryInterceptor(timeout)) + backend.WithClientUnaryInterceptor(timeout, backend.CircuitBreakerConfig{ + Enabled: false, + })) if err != nil { return fmt.Errorf("cannot connect to %s %w", identity.Address, err) } diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index b44f8592526..0716ed6de62 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -159,9 +159,9 @@ func DefaultAccessNodeConfig() *AccessNodeConfig { ArchiveAddressList: nil, MaxMsgSize: grpcutils.DefaultMaxMsgSize, CircuitBreakerConfig: backend.CircuitBreakerConfig{ - CircuitBreakerEnabled: false, - RestoreTimeout: time.Duration(60) * time.Second, - MaxRequestToBreak: 5, + Enabled: false, + RestoreTimeout: time.Duration(60) * time.Second, + MaxRequestToBreak: 5, }, }, stateStreamConf: 
state_stream.Config{ @@ -649,7 +649,7 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { flags.StringToIntVar(&builder.apiBurstlimits, "api-burst-limits", defaultConfig.apiBurstlimits, "burst limits for Access API methods e.g. Ping=100,GetTransaction=100 etc.") flags.BoolVar(&builder.supportsObserver, "supports-observer", defaultConfig.supportsObserver, "true if this staked access node supports observer or follower connections") flags.StringVar(&builder.PublicNetworkConfig.BindAddress, "public-network-address", defaultConfig.PublicNetworkConfig.BindAddress, "staked access node's public network bind address") - flags.BoolVar(&builder.rpcConf.CircuitBreakerConfig.CircuitBreakerEnabled, "circuit-breaker-enabled", defaultConfig.rpcConf.CircuitBreakerConfig.CircuitBreakerEnabled, "whether to enable the circuit breaker for collection and execution node connections") + flags.BoolVar(&builder.rpcConf.CircuitBreakerConfig.Enabled, "circuit-breaker-enabled", defaultConfig.rpcConf.CircuitBreakerConfig.Enabled, "whether to enable the circuit breaker for collection and execution node connections") flags.DurationVar(&builder.rpcConf.CircuitBreakerConfig.RestoreTimeout, "circuit-breaker-restore-timeout", defaultConfig.rpcConf.CircuitBreakerConfig.RestoreTimeout, "initial timeout for circuit breaker to try connect again. Default value is 60s") flags.Uint32Var(&builder.rpcConf.CircuitBreakerConfig.MaxRequestToBreak, "circuit-breaker-max-request-to-break", defaultConfig.rpcConf.CircuitBreakerConfig.MaxRequestToBreak, "number of consecutive failures to break connection. 
Default value is 5") @@ -712,7 +712,7 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { } } } - if builder.rpcConf.CircuitBreakerConfig.CircuitBreakerEnabled { + if builder.rpcConf.CircuitBreakerConfig.Enabled { if builder.rpcConf.CircuitBreakerConfig.MaxRequestToBreak == 0 { return errors.New("circuit-breaker-max-request-to-break must be greater than 0") } @@ -876,7 +876,7 @@ func (builder *FlowAccessNodeBuilder) Build() (cmd.Node, error) { builder.rpcConf.CollectionAddr, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(builder.rpcConf.MaxMsgSize))), grpc.WithTransportCredentials(insecure.NewCredentials()), - backend.WithClientUnaryInterceptor(builder.rpcConf.CollectionClientTimeout)) + backend.WithClientUnaryInterceptor(builder.rpcConf.CollectionClientTimeout, builder.rpcConf.CircuitBreakerConfig)) if err != nil { return err } diff --git a/engine/access/apiproxy/access_api_proxy.go b/engine/access/apiproxy/access_api_proxy.go index d72ec5bb5e2..7123411cc2b 100644 --- a/engine/access/apiproxy/access_api_proxy.go +++ b/engine/access/apiproxy/access_api_proxy.go @@ -65,7 +65,9 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(h.maxMsgSize))), grpc.WithTransportCredentials(insecure.NewCredentials()), - backend.WithClientUnaryInterceptor(timeout)) + backend.WithClientUnaryInterceptor(timeout, backend.CircuitBreakerConfig{ + Enabled: false, + })) if err != nil { return err } @@ -79,7 +81,9 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(h.maxMsgSize))), grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)), - backend.WithClientUnaryInterceptor(timeout)) + backend.WithClientUnaryInterceptor(timeout, backend.CircuitBreakerConfig{ + Enabled: false, + })) if err != nil { return fmt.Errorf("cannot connect to %s %w", identity.Address, err) } diff --git 
a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index f56ddc55471..bb4f5e3548f 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -67,9 +67,9 @@ type ConnectionFactoryImpl struct { // TODO: describe type CircuitBreakerConfig struct { - CircuitBreakerEnabled bool - RestoreTimeout time.Duration - MaxRequestToBreak uint32 + Enabled bool + RestoreTimeout time.Duration + MaxRequestToBreak uint32 } type CachedClient struct { @@ -102,7 +102,7 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(cf.MaxMsgSize))), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithKeepaliveParams(keepaliveParams), - WithClientUnaryInterceptor(timeout)) + WithClientUnaryInterceptor(timeout, cf.CircuitBreakerConfig)) if err != nil { return nil, fmt.Errorf("failed to connect to address %s: %w", address, err) } @@ -258,8 +258,18 @@ func getGRPCAddress(address string, grpcPort uint) (string, error) { return grpcAddress, nil } -func WithClientUnaryInterceptor(timeout time.Duration) grpc.DialOption { - circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{}) +func WithClientUnaryInterceptor(timeout time.Duration, circuitBreakerConfig CircuitBreakerConfig) grpc.DialOption { + var circuitBreaker *gobreaker.CircuitBreaker + + if circuitBreakerConfig.Enabled { + circuitBreaker = gobreaker.NewCircuitBreaker(gobreaker.Settings{ + Timeout: circuitBreakerConfig.RestoreTimeout, + ReadyToTrip: func(counts gobreaker.Counts) bool { + return counts.ConsecutiveFailures > circuitBreakerConfig.MaxRequestToBreak + }, + }) + } + clientTimeoutInterceptor := func( ctx context.Context, method string, @@ -269,17 +279,26 @@ func WithClientUnaryInterceptor(timeout time.Duration) grpc.DialOption { invoker grpc.UnaryInvoker, opts ...grpc.CallOption, ) error { + exec := func() (interface{}, 
error) { + // create a context that expires after timeout + ctxWithTimeout, cancel := context.WithTimeout(ctx, timeout) - // create a context that expires after timeout - ctxWithTimeout, cancel := context.WithTimeout(ctx, timeout) + defer cancel() - defer cancel() - _, err := circuitBreaker.Execute(func() (interface{}, error) { // call the remote GRPC using the short context err := invoker(ctxWithTimeout, method, req, reply, cc, opts...) + //TODO: As invoker do not return any results, for now nil returned return nil, err - }) + } + + var err error + + if circuitBreakerConfig.Enabled { + _, err = circuitBreaker.Execute(exec) + } else { + _, err = exec() + } return err } From ed7938c23fa947e44532bc36e4e687c4cb710630 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 6 Jun 2023 16:34:20 +0300 Subject: [PATCH 04/56] Moved connection selection to separate module. --- Makefile | 1 + engine/access/rpc/backend/backend.go | 187 +------------ engine/access/rpc/backend/backend_accounts.go | 3 +- engine/access/rpc/backend/backend_events.go | 3 +- engine/access/rpc/backend/backend_scripts.go | 5 +- engine/access/rpc/backend/backend_test.go | 58 +++- .../rpc/backend/backend_transactions.go | 39 +-- ...ection_guard.go => connection_selector.go} | 259 ++++++++++++------ .../rpc/backend/mock/connection_selector.go | 82 ++++++ engine/access/rpc/engine.go | 3 + module/mock/finalized_header_cache.go | 44 +++ 11 files changed, 378 insertions(+), 306 deletions(-) rename engine/access/rpc/backend/{node_connection_guard.go => connection_selector.go} (57%) create mode 100644 engine/access/rpc/backend/mock/connection_selector.go create mode 100644 module/mock/finalized_header_cache.go diff --git a/Makefile b/Makefile index 5e55f9fe57b..9c9de19b91d 100644 --- a/Makefile +++ b/Makefile @@ -179,6 +179,7 @@ generate-mocks: install-mock-generators mockery --name 'API' --dir="./engine/protocol" --case=underscore --output="./engine/protocol/mock" --outpkg="mock" mockery --name 'API' 
--dir="./engine/access/state_stream" --case=underscore --output="./engine/access/state_stream/mock" --outpkg="mock" mockery --name 'ConnectionFactory' --dir="./engine/access/rpc/backend" --case=underscore --output="./engine/access/rpc/backend/mock" --outpkg="mock" + mockery --name 'ConnectionSelector' --dir="./engine/access/rpc/backend" --case=underscore --output="./engine/access/rpc/backend/mock" --outpkg="mock" mockery --name 'IngestRPC' --dir="./engine/execution/ingestion" --case=underscore --tags relic --output="./engine/execution/ingestion/mock" --outpkg="mock" mockery --name '.*' --dir=model/fingerprint --case=underscore --output="./model/fingerprint/mock" --outpkg="mock" mockery --name 'ExecForkActor' --structname 'ExecForkActorMock' --dir=module/mempool/consensus/mock/ --case=underscore --output="./module/mempool/consensus/mock/" --outpkg="mock" diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index 721b3b063c9..4832f0b7cde 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -3,8 +3,6 @@ package backend import ( "context" "fmt" - "time" - "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -17,7 +15,6 @@ import ( "github.com/onflow/flow-go/engine/common/rpc" "github.com/onflow/flow-go/engine/common/rpc/convert" "github.com/onflow/flow-go/model/flow" - "github.com/onflow/flow-go/model/flow/filter" "github.com/onflow/flow-go/module" "github.com/onflow/flow-go/state/protocol" "github.com/onflow/flow-go/storage" @@ -76,6 +73,7 @@ type Backend struct { collections storage.Collections executionReceipts storage.ExecutionReceipts connFactory ConnectionFactory + connSelector ConnectionSelector } func New( @@ -91,6 +89,7 @@ func New( chainID flow.ChainID, transactionMetrics module.TransactionMetrics, connFactory ConnectionFactory, + connSelector ConnectionSelector, retryEnabled bool, maxHeightRange uint, preferredExecutionNodeIDs []string, @@ -116,6 +115,7 @@ func New( 
headers: headers, executionReceipts: executionReceipts, connFactory: connFactory, + connSelector: connSelector, state: state, log: log, metrics: transactionMetrics, @@ -134,6 +134,7 @@ func New( transactionMetrics: transactionMetrics, retry: retry, connFactory: connFactory, + connSelector: connSelector, previousAccessNodes: historicalAccessNodes, log: log, }, @@ -142,6 +143,7 @@ func New( headers: headers, executionReceipts: executionReceipts, connFactory: connFactory, + connSelector: connSelector, log: log, maxHeightRange: maxHeightRange, }, @@ -158,6 +160,7 @@ func New( headers: headers, executionReceipts: executionReceipts, connFactory: connFactory, + connSelector: connSelector, log: log, }, backendExecutionResults: backendExecutionResults{ @@ -171,6 +174,7 @@ func New( collections: collections, executionReceipts: executionReceipts, connFactory: connFactory, + connSelector: connSelector, chainID: chainID, } @@ -285,180 +289,3 @@ func (b *Backend) GetLatestProtocolStateSnapshot(_ context.Context) ([]byte, err return convert.SnapshotToBytes(validSnapshot) } - -// executionNodesForBlockID returns upto maxExecutionNodesCnt number of randomly chosen execution node identities -// which have executed the given block ID. -// If no such execution node is found, an InsufficientExecutionReceipts error is returned. -func executionNodesForBlockID( - ctx context.Context, - blockID flow.Identifier, - executionReceipts storage.ExecutionReceipts, - state protocol.State, - log zerolog.Logger) (flow.IdentityList, error) { - - var executorIDs flow.IdentifierList - - // check if the block ID is of the root block. If it is then don't look for execution receipts since they - // will not be present for the root block. 
- rootBlock, err := state.Params().Root() - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - - if rootBlock.ID() == blockID { - executorIdentities, err := state.Final().Identities(filter.HasRole(flow.RoleExecution)) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - executorIDs = executorIdentities.NodeIDs() - } else { - // try to find atleast minExecutionNodesCnt execution node ids from the execution receipts for the given blockID - for attempt := 0; attempt < maxAttemptsForExecutionReceipt; attempt++ { - executorIDs, err = findAllExecutionNodes(blockID, executionReceipts, log) - if err != nil { - return nil, err - } - - if len(executorIDs) >= minExecutionNodesCnt { - break - } - - // log the attempt - log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). - Int("execution_receipts_found", len(executorIDs)). - Str("block_id", blockID.String()). 
- Msg("insufficient execution receipts") - - // if one or less execution receipts may have been received then re-query - // in the hope that more might have been received by now - - select { - case <-ctx.Done(): - return nil, ctx.Err() - case <-time.After(100 * time.Millisecond << time.Duration(attempt)): - //retry after an exponential backoff - } - } - - receiptCnt := len(executorIDs) - // if less than minExecutionNodesCnt execution receipts have been received so far, then return random ENs - if receiptCnt < minExecutionNodesCnt { - newExecutorIDs, err := state.AtBlockID(blockID).Identities(filter.HasRole(flow.RoleExecution)) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - executorIDs = newExecutorIDs.NodeIDs() - } - } - - // choose from the preferred or fixed execution nodes - subsetENs, err := chooseExecutionNodes(state, executorIDs) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - - // randomly choose upto maxExecutionNodesCnt identities - executionIdentitiesRandom := subsetENs.Sample(maxExecutionNodesCnt) - - if len(executionIdentitiesRandom) == 0 { - return nil, fmt.Errorf("no matching execution node found for block ID %v", blockID) - } - - return executionIdentitiesRandom, nil -} - -// findAllExecutionNodes find all the execution nodes ids from the execution receipts that have been received for the -// given blockID -func findAllExecutionNodes( - blockID flow.Identifier, - executionReceipts storage.ExecutionReceipts, - log zerolog.Logger) (flow.IdentifierList, error) { - - // lookup the receipt's storage with the block ID - allReceipts, err := executionReceipts.ByBlockID(blockID) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution receipts for block ID %v: %w", blockID, err) - } - - executionResultMetaList := make(flow.ExecutionReceiptMetaList, 0, len(allReceipts)) - for _, r := range allReceipts { 
- executionResultMetaList = append(executionResultMetaList, r.Meta()) - } - executionResultGroupedMetaList := executionResultMetaList.GroupByResultID() - - // maximum number of matching receipts found so far for any execution result id - maxMatchedReceiptCnt := 0 - // execution result id key for the highest number of matching receipts in the identicalReceipts map - var maxMatchedReceiptResultID flow.Identifier - - // find the largest list of receipts which have the same result ID - for resultID, executionReceiptList := range executionResultGroupedMetaList { - currentMatchedReceiptCnt := executionReceiptList.Size() - if currentMatchedReceiptCnt > maxMatchedReceiptCnt { - maxMatchedReceiptCnt = currentMatchedReceiptCnt - maxMatchedReceiptResultID = resultID - } - } - - // if there are more than one execution result for the same block ID, log as error - if executionResultGroupedMetaList.NumberGroups() > 1 { - identicalReceiptsStr := fmt.Sprintf("%v", flow.GetIDs(allReceipts)) - log.Error(). - Str("block_id", blockID.String()). - Str("execution_receipts", identicalReceiptsStr). - Msg("execution receipt mismatch") - } - - // pick the largest list of matching receipts - matchingReceiptMetaList := executionResultGroupedMetaList.GetGroup(maxMatchedReceiptResultID) - - metaReceiptGroupedByExecutorID := matchingReceiptMetaList.GroupByExecutorID() - - // collect all unique execution node ids from the receipts - var executorIDs flow.IdentifierList - for executorID := range metaReceiptGroupedByExecutorID { - executorIDs = append(executorIDs, executorID) - } - - return executorIDs, nil -} - -// chooseExecutionNodes finds the subset of execution nodes defined in the identity table by first -// choosing the preferred execution nodes which have executed the transaction. 
If no such preferred -// execution nodes are found, then the fixed execution nodes defined in the identity table are returned -// If neither preferred nor fixed nodes are defined, then all execution node matching the executor IDs are returned. -// e.g. If execution nodes in identity table are {1,2,3,4}, preferred ENs are defined as {2,3,4} -// and the executor IDs is {1,2,3}, then {2, 3} is returned as the chosen subset of ENs -func chooseExecutionNodes(state protocol.State, executorIDs flow.IdentifierList) (flow.IdentityList, error) { - - allENs, err := state.Final().Identities(filter.HasRole(flow.RoleExecution)) - if err != nil { - return nil, fmt.Errorf("failed to retreive all execution IDs: %w", err) - } - - // first try and choose from the preferred EN IDs - var chosenIDs flow.IdentityList - if len(preferredENIdentifiers) > 0 { - // find the preferred execution node IDs which have executed the transaction - chosenIDs = allENs.Filter(filter.And(filter.HasNodeID(preferredENIdentifiers...), - filter.HasNodeID(executorIDs...))) - if len(chosenIDs) > 0 { - return chosenIDs, nil - } - } - - // if no preferred EN ID is found, then choose from the fixed EN IDs - if len(fixedENIdentifiers) > 0 { - // choose fixed ENs which have executed the transaction - chosenIDs = allENs.Filter(filter.And(filter.HasNodeID(fixedENIdentifiers...), filter.HasNodeID(executorIDs...))) - if len(chosenIDs) > 0 { - return chosenIDs, nil - } - // if no such ENs are found then just choose all fixed ENs - chosenIDs = allENs.Filter(filter.HasNodeID(fixedENIdentifiers...)) - return chosenIDs, nil - } - - // If no preferred or fixed ENs have been specified, then return all executor IDs i.e. 
no preference at all - return allENs.Filter(filter.HasNodeID(executorIDs...)), nil -} diff --git a/engine/access/rpc/backend/backend_accounts.go b/engine/access/rpc/backend/backend_accounts.go index a3a41053c61..4781e395d23 100644 --- a/engine/access/rpc/backend/backend_accounts.go +++ b/engine/access/rpc/backend/backend_accounts.go @@ -22,6 +22,7 @@ type backendAccounts struct { headers storage.Headers executionReceipts storage.ExecutionReceipts connFactory ConnectionFactory + connSelector ConnectionSelector log zerolog.Logger } @@ -82,7 +83,7 @@ func (b *backendAccounts) getAccountAtBlockID( BlockId: blockID[:], } - execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + execNodes, err := b.connSelector.GetExecutionNodesForBlockID(ctx, blockID) if err != nil { return nil, rpc.ConvertError(err, "failed to get account from the execution node", codes.Internal) } diff --git a/engine/access/rpc/backend/backend_events.go b/engine/access/rpc/backend/backend_events.go index e097843b933..83824e74733 100644 --- a/engine/access/rpc/backend/backend_events.go +++ b/engine/access/rpc/backend/backend_events.go @@ -25,6 +25,7 @@ type backendEvents struct { executionReceipts storage.ExecutionReceipts state protocol.State connFactory ConnectionFactory + connSelector ConnectionSelector log zerolog.Logger maxHeightRange uint } @@ -129,7 +130,7 @@ func (b *backendEvents) getBlockEventsFromExecutionNode( // choose the last block ID to find the list of execution nodes lastBlockID := blockIDs[len(blockIDs)-1] - execNodes, err := executionNodesForBlockID(ctx, lastBlockID, b.executionReceipts, b.state, b.log) + execNodes, err := b.connSelector.GetExecutionNodesForBlockID(ctx, lastBlockID) if err != nil { b.log.Error().Err(err).Msg("failed to retrieve events from execution node") return nil, rpc.ConvertError(err, "failed to retrieve events from execution node", codes.Internal) diff --git a/engine/access/rpc/backend/backend_scripts.go 
b/engine/access/rpc/backend/backend_scripts.go index 9f4ec5dffb2..48df09d6f12 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -28,6 +28,7 @@ type backendScripts struct { executionReceipts storage.ExecutionReceipts state protocol.State connFactory ConnectionFactory + connSelector ConnectionSelector log zerolog.Logger metrics module.BackendScriptsMetrics loggedScripts *lru.Cache @@ -86,12 +87,12 @@ func (b *backendScripts) findScriptExecutors( ctx context.Context, blockID flow.Identifier, ) ([]string, error) { - // send script queries to archive nodes if archive addres is configured + // send script queries to archive nodes if archive address is configured if len(b.archiveAddressList) > 0 { return b.archiveAddressList, nil } - executors, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + executors, err := b.connSelector.GetExecutionNodesForBlockID(ctx, blockID) if err != nil { return nil, err } diff --git a/engine/access/rpc/backend/backend_test.go b/engine/access/rpc/backend/backend_test.go index 11109130222..03294796633 100644 --- a/engine/access/rpc/backend/backend_test.go +++ b/engine/access/rpc/backend/backend_test.go @@ -47,6 +47,7 @@ type Suite struct { execClient *access.ExecutionAPIClient historicalAccessClient *access.AccessAPIClient connectionFactory *backendmock.ConnectionFactory + connSelector *backendmock.ConnectionSelector chainID flow.ChainID } @@ -74,6 +75,7 @@ func (suite *Suite) SetupTest() { suite.chainID = flow.Testnet suite.historicalAccessClient = new(access.AccessAPIClient) suite.connectionFactory = new(backendmock.ConnectionFactory) + suite.connSelector = new(backendmock.ConnectionSelector) } func (suite *Suite) TestPing() { @@ -98,6 +100,7 @@ func (suite *Suite) TestPing() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -133,6 +136,7 @@ func (suite *Suite) TestGetLatestFinalizedBlockHeader() { 
suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -198,6 +202,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_NoTransitionSpan() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, 100, nil, @@ -270,6 +275,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_TransitionSpans() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, 100, nil, @@ -335,6 +341,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_PhaseTransitionSpan() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, 100, nil, @@ -411,6 +418,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_EpochTransitionSpan() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, 100, nil, @@ -471,6 +479,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_HistoryLimit() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -509,6 +518,7 @@ func (suite *Suite) TestGetLatestSealedBlockHeader() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -555,6 +565,7 @@ func (suite *Suite) TestGetTransaction() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -595,6 +606,7 @@ func (suite *Suite) TestGetCollection() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -636,6 +648,8 @@ func (suite *Suite) TestGetTransactionResultByIndex() { connFactory := new(backendmock.ConnectionFactory) connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) + connSelector := new(backendmock.ConnectionSelector) + exeEventReq := &execproto.GetTransactionByIndexRequest{ BlockId: blockId[:], Index: index, @@ -658,6 +672,7 @@ func (suite *Suite) TestGetTransactionResultByIndex() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + 
connSelector, false, DefaultMaxHeightRange, nil, @@ -700,6 +715,8 @@ func (suite *Suite) TestGetTransactionResultsByBlockID() { connFactory := new(backendmock.ConnectionFactory) connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) + connSelector := new(backendmock.ConnectionSelector) + exeEventReq := &execproto.GetTransactionsByBlockIDRequest{ BlockId: blockId[:], } @@ -721,6 +738,7 @@ func (suite *Suite) TestGetTransactionResultsByBlockID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -790,6 +808,8 @@ func (suite *Suite) TestTransactionStatusTransition() { connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) connFactory.On("InvalidateExecutionAPIClient", mock.Anything) + connSelector := new(backendmock.ConnectionSelector) + exeEventReq := &execproto.GetTransactionResultRequest{ BlockId: blockID[:], TransactionId: txID[:], @@ -812,6 +832,7 @@ func (suite *Suite) TestTransactionStatusTransition() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -932,6 +953,7 @@ func (suite *Suite) TestTransactionExpiredStatusTransition() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -1085,6 +1107,7 @@ func (suite *Suite) TestTransactionPendingToFinalizedStatusTransition() { // create a mock connection factory connFactory := suite.setupConnectionFactory() + connSelector := new(backendmock.ConnectionSelector) backend := New( suite.state, @@ -1099,6 +1122,7 @@ func (suite *Suite) TestTransactionPendingToFinalizedStatusTransition() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + 
connSelector, false, 100, nil, @@ -1157,6 +1181,7 @@ func (suite *Suite) TestTransactionResultUnknown() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -1211,6 +1236,7 @@ func (suite *Suite) TestGetLatestFinalizedBlock() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -1276,6 +1302,8 @@ func (suite *Suite) TestGetEventsForBlockIDs() { connFactory := new(backendmock.ConnectionFactory) connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) + connSelector := new(backendmock.ConnectionSelector) + // create the expected results from execution node and access node exeResults := make([]*execproto.GetEventsForBlockIDsResponse_Result, len(blockHeaders)) @@ -1341,6 +1369,7 @@ func (suite *Suite) TestGetEventsForBlockIDs() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -1373,6 +1402,7 @@ func (suite *Suite) TestGetEventsForBlockIDs() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -1399,6 +1429,7 @@ func (suite *Suite) TestGetExecutionResultByID() { // create a mock connection factory connFactory := new(backendmock.ConnectionFactory) + connSelector := new(backendmock.ConnectionSelector) nonexistingID := unittest.IdentifierFixture() blockID := unittest.IdentifierFixture() @@ -1432,6 +1463,7 @@ func (suite *Suite) TestGetExecutionResultByID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -1462,6 +1494,7 @@ func (suite *Suite) TestGetExecutionResultByID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection 
factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -1490,6 +1523,7 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { // create a mock connection factory connFactory := new(backendmock.ConnectionFactory) + connSelector := new(backendmock.ConnectionSelector) blockID := unittest.IdentifierFixture() executionResult := unittest.ExecutionResultFixture( @@ -1525,6 +1559,7 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -1556,6 +1591,7 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -1691,6 +1727,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { } connFactory := suite.setupConnectionFactory() + connSelector := new(backendmock.ConnectionSelector) suite.Run("invalid request max height < min height", func() { backend := New( @@ -1706,6 +1743,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -1745,6 +1783,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -1783,6 +1822,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -1820,6 +1860,7 @@ func (suite *Suite) 
TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, 1, // set maximum range to 1 nil, @@ -1857,6 +1898,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -1920,6 +1962,8 @@ func (suite *Suite) TestGetAccount() { connFactory := new(backendmock.ConnectionFactory) connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) + connSelector := new(backendmock.ConnectionSelector) + // create the handler with the mock backend := New( suite.state, @@ -1934,6 +1978,7 @@ func (suite *Suite) TestGetAccount() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -1983,6 +2028,8 @@ func (suite *Suite) TestGetAccountAtBlockHeight() { connFactory := new(backendmock.ConnectionFactory) connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) + connSelector := new(backendmock.ConnectionSelector) + // create the expected execution API request blockID := h.ID() exeReq := &execproto.GetAccountAtBlockIDRequest{ @@ -2015,6 +2062,7 @@ func (suite *Suite) TestGetAccountAtBlockHeight() { flow.Testnet, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, @@ -2054,6 +2102,7 @@ func (suite *Suite) TestGetNetworkParameters() { flow.Mainnet, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -2121,6 +2170,8 @@ func (suite *Suite) TestExecutionNodesForBlockID() { func(flow.IdentityFilter) error { return nil }) 
suite.state.On("Final").Return(suite.snapshot, nil).Maybe() + connSelector := new(backendmock.ConnectionSelector) + testExecutionNodesForBlockID := func(preferredENs, fixedENs, expectedENs flow.IdentityList) { if preferredENs != nil { @@ -2129,7 +2180,7 @@ func (suite *Suite) TestExecutionNodesForBlockID() { if fixedENs != nil { fixedENIdentifiers = fixedENs.NodeIDs() } - actualList, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) + actualList, err := connSelector.GetExecutionNodesForBlockID(context.Background(), block.ID()) require.NoError(suite.T(), err) if expectedENs == nil { expectedENs = flow.IdentityList{} @@ -2149,7 +2200,7 @@ func (suite *Suite) TestExecutionNodesForBlockID() { attempt2Receipts = flow.ExecutionReceiptList{} attempt3Receipts = flow.ExecutionReceiptList{} suite.state.On("AtBlockID", mock.Anything).Return(suite.snapshot) - actualList, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) + actualList, err := connSelector.GetExecutionNodesForBlockID(context.Background(), block.ID()) require.NoError(suite.T(), err) require.Equal(suite.T(), len(actualList), maxExecutionNodesCnt) }) @@ -2219,6 +2270,8 @@ func (suite *Suite) TestExecuteScriptOnExecutionNode() { connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) connFactory.On("InvalidateExecutionAPIClient", mock.Anything) + connSelector := new(backendmock.ConnectionSelector) + // create the handler with the mock backend := New( suite.state, @@ -2233,6 +2286,7 @@ func (suite *Suite) TestExecuteScriptOnExecutionNode() { flow.Mainnet, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client + connSelector, false, DefaultMaxHeightRange, nil, diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index 661fc3f90f8..286cd544aae 
100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -24,8 +24,6 @@ import ( "github.com/onflow/flow-go/storage" ) -const collectionNodesToTry uint = 3 - type backendTransactions struct { staticCollectionRPC accessproto.AccessAPIClient // rpc client tied to a fixed collection node transactions storage.Transactions @@ -38,6 +36,7 @@ type backendTransactions struct { transactionValidator *access.TransactionValidator retry *Retry connFactory ConnectionFactory + connSelector ConnectionSelector previousAccessNodes []accessproto.AccessAPIClient log zerolog.Logger @@ -86,7 +85,7 @@ func (b *backendTransactions) trySendTransaction(ctx context.Context, tx *flow.T } // otherwise choose a random set of collections nodes to try - collAddrs, err := b.chooseCollectionNodes(tx, collectionNodesToTry) + collAddrs, err := b.connSelector.GetCollectionNodes(tx.ID()) if err != nil { return fmt.Errorf("failed to determine collection node for tx %x: %w", tx, err) } @@ -112,34 +111,6 @@ func (b *backendTransactions) trySendTransaction(ctx context.Context, tx *flow.T return sendErrors.ErrorOrNil() } -// chooseCollectionNodes finds a random subset of size sampleSize of collection node addresses from the -// collection node cluster responsible for the given tx -func (b *backendTransactions) chooseCollectionNodes(tx *flow.TransactionBody, sampleSize uint) ([]string, error) { - - // retrieve the set of collector clusters - clusters, err := b.state.Final().Epochs().Current().Clustering() - if err != nil { - return nil, fmt.Errorf("could not cluster collection nodes: %w", err) - } - - // get the cluster responsible for the transaction - txCluster, ok := clusters.ByTxID(tx.ID()) - if !ok { - return nil, fmt.Errorf("could not get local cluster by txID: %x", tx.ID()) - } - - // select a random subset of collection nodes from the cluster to be tried in order - targetNodes := txCluster.Sample(sampleSize) - - // collect the addresses 
of all the chosen collection nodes - var targetAddrs = make([]string, len(targetNodes)) - for i, id := range targetNodes { - targetAddrs[i] = id.Address - } - - return targetAddrs, nil -} - // sendTransactionToCollection sends the transaction to the given collection node via grpc func (b *backendTransactions) sendTransactionToCollector(ctx context.Context, tx *flow.TransactionBody, @@ -371,7 +342,7 @@ func (b *backendTransactions) GetTransactionResultsByBlockID( req := &execproto.GetTransactionsByBlockIDRequest{ BlockId: blockID[:], } - execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + execNodes, err := b.connSelector.GetExecutionNodesForBlockID(ctx, blockID) if err != nil { if IsInsufficientExecutionReceipts(err) { return nil, status.Errorf(codes.NotFound, err.Error()) @@ -491,7 +462,7 @@ func (b *backendTransactions) GetTransactionResultByIndex( BlockId: blockID[:], Index: index, } - execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + execNodes, err := b.connSelector.GetExecutionNodesForBlockID(ctx, blockID) if err != nil { if IsInsufficientExecutionReceipts(err) { return nil, status.Errorf(codes.NotFound, err.Error()) @@ -715,7 +686,7 @@ func (b *backendTransactions) getTransactionResultFromExecutionNode( TransactionId: transactionID, } - execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + execNodes, err := b.connSelector.GetExecutionNodesForBlockID(ctx, blockID) if err != nil { // if no execution receipt were found, return a NotFound GRPC error if IsInsufficientExecutionReceipts(err) { diff --git a/engine/access/rpc/backend/node_connection_guard.go b/engine/access/rpc/backend/connection_selector.go similarity index 57% rename from engine/access/rpc/backend/node_connection_guard.go rename to engine/access/rpc/backend/connection_selector.go index a46029219a1..8a70e0af5e8 100644 --- 
a/engine/access/rpc/backend/node_connection_guard.go +++ b/engine/access/rpc/backend/connection_selector.go @@ -3,50 +3,56 @@ package backend import ( "context" "fmt" - "github.com/onflow/flow-go/storage" "time" - "github.com/rs/zerolog" - "github.com/sony/gobreaker" - "github.com/onflow/flow-go/model/flow" "github.com/onflow/flow-go/model/flow/filter" "github.com/onflow/flow-go/state/protocol" + "github.com/onflow/flow-go/storage" + "github.com/rs/zerolog" ) -type NodeSelector interface { +const collectionNodesToTry uint = 3 + +type ConnectionSelector interface { GetExecutionNodesForBlockID(ctx context.Context, blockID flow.Identifier) (flow.IdentityList, error) GetCollectionNodes(txID flow.Identifier) ([]string, error) } -type NodeConnectionGuard struct { +type MainConnectionSelector struct { state protocol.State executionReceipts storage.ExecutionReceipts log zerolog.Logger - circuitBreaker *gobreaker.CircuitBreaker - connectionFactory ConnectionFactory } -var _ NodeSelector = (*NodeConnectionGuard)(nil) - -func NewNodeConnectionGuard(connectionFactory ConnectionFactory, state protocol.State, executionReceipts storage.ExecutionReceipts, log zerolog.Logger) NodeConnectionGuard { - return NodeConnectionGuard{ - state: state, - executionReceipts: executionReceipts, - log: log, - circuitBreaker: gobreaker.NewCircuitBreaker(gobreaker.Settings{}), - connectionFactory: connectionFactory, +type CircuitBreakerConnectionSelector MainConnectionSelector + +var _ ConnectionSelector = (*MainConnectionSelector)(nil) + +func NewConnectionSelector( + state protocol.State, + executionReceipts storage.ExecutionReceipts, + log zerolog.Logger, + isCircuitBreakerEnabled bool, +) ConnectionSelector { + if isCircuitBreakerEnabled { + return &CircuitBreakerConnectionSelector{ + state: state, + executionReceipts: executionReceipts, + log: log, + } + } else { + return &MainConnectionSelector{ + state: state, + executionReceipts: executionReceipts, + log: log, + } } } -func (ncg 
*NodeConnectionGuard) Invoke(req func() (interface{}, error)) (interface{}, error) { - result, err := ncg.circuitBreaker.Execute(req) - return result, err -} - -func (ncg *NodeConnectionGuard) GetCollectionNodes(txId flow.Identifier) ([]string, error) { +func (ncs *MainConnectionSelector) GetCollectionNodes(txId flow.Identifier) ([]string, error) { // retrieve the set of collector clusters - clusters, err := ncg.state.Final().Epochs().Current().Clustering() + clusters, err := ncs.state.Final().Epochs().Current().Clustering() if err != nil { return nil, fmt.Errorf("could not cluster collection nodes: %w", err) } @@ -58,8 +64,7 @@ func (ncg *NodeConnectionGuard) GetCollectionNodes(txId flow.Identifier) ([]stri } // select a random subset of collection nodes from the cluster to be tried in order - //TODO: Change to cb selection of nodes. - targetNodes := txCluster.Sample(3) + targetNodes := txCluster.Sample(collectionNodesToTry) // collect the addresses of all the chosen collection nodes var targetAddrs = make([]string, len(targetNodes)) @@ -73,46 +78,46 @@ func (ncg *NodeConnectionGuard) GetCollectionNodes(txId flow.Identifier) ([]stri // GetExecutionNodesForBlockID returns upto maxExecutionNodesCnt number of randomly chosen execution node identities // which have executed the given block ID. // If no such execution node is found, an InsufficientExecutionReceipts error is returned. -func (ncg *NodeConnectionGuard) GetExecutionNodesForBlockID( +func (ncs *MainConnectionSelector) GetExecutionNodesForBlockID( ctx context.Context, - blockID flow.Identifier) (flow.IdentityList, error) { + blockID flow.Identifier, +) (flow.IdentityList, error) { var executorIDs flow.IdentifierList // check if the block ID is of the root block. If it is then don't look for execution receipts since they // will not be present for the root block. 
- rootBlock, err := ncg.state.Params().Root() + rootBlock, err := ncs.state.Params().Root() if err != nil { return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } if rootBlock.ID() == blockID { - executorIdentities, err := ncg.state.Final().Identities(filter.HasRole(flow.RoleExecution)) + executorIdentities, err := ncs.state.Final().Identities(filter.HasRole(flow.RoleExecution)) if err != nil { return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } executorIDs = executorIdentities.NodeIDs() } else { - // try to find atleast minExecutionNodesCnt execution node ids from the execution receipts for the given blockID + // try to find at least minExecutionNodesCnt execution node ids from the execution receipts for the given blockID for attempt := 0; attempt < maxAttemptsForExecutionReceipt; attempt++ { - executorIDs, err = ncg.findAllExecutionNodes(blockID) + executorIDs, err = findAllExecutionNodes(blockID, ncs.executionReceipts, ncs.log) if err != nil { return nil, err } - if len(executorIDs) >= minExecutionNodesCnt { + if len(executorIDs) > 0 { break } // log the attempt - ncg.log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). + ncs.log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). Int("execution_receipts_found", len(executorIDs)). Str("block_id", blockID.String()). 
Msg("insufficient execution receipts") // if one or less execution receipts may have been received then re-query // in the hope that more might have been received by now - //TODO: Should be removed select { case <-ctx.Done(): return nil, ctx.Err() @@ -120,20 +125,10 @@ func (ncg *NodeConnectionGuard) GetExecutionNodesForBlockID( //retry after an exponential backoff } } - - receiptCnt := len(executorIDs) - // if less than minExecutionNodesCnt execution receipts have been received so far, then return random ENs - if receiptCnt < minExecutionNodesCnt { - newExecutorIDs, err := ncg.state.AtBlockID(blockID).Identities(filter.HasRole(flow.RoleExecution)) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - executorIDs = newExecutorIDs.NodeIDs() - } } // choose from the preferred or fixed execution nodes - subsetENs, err := ncg.chooseExecutionNodes(executorIDs) + subsetENs, err := chooseExecutionNodes(ncs.state, executorIDs) if err != nil { return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } @@ -148,58 +143,94 @@ func (ncg *NodeConnectionGuard) GetExecutionNodesForBlockID( return executionIdentitiesRandom, nil } -// findAllExecutionNodes find all the execution nodes ids from the execution receipts that have been received for the -// given blockID -func (ncg *NodeConnectionGuard) findAllExecutionNodes( - blockID flow.Identifier) (flow.IdentifierList, error) { - - // lookup the receipt's storage with the block ID - allReceipts, err := ncg.executionReceipts.ByBlockID(blockID) +func (nccbs *CircuitBreakerConnectionSelector) GetCollectionNodes(txId flow.Identifier) ([]string, error) { + // retrieve the set of collector clusters + clusters, err := nccbs.state.Final().Epochs().Current().Clustering() if err != nil { - return nil, fmt.Errorf("failed to retreive execution receipts for block ID %v: %w", blockID, err) + return nil, fmt.Errorf("could not cluster collection 
nodes: %w", err) } - executionResultMetaList := make(flow.ExecutionReceiptMetaList, 0, len(allReceipts)) - for _, r := range allReceipts { - executionResultMetaList = append(executionResultMetaList, r.Meta()) + // get the cluster responsible for the transaction + txCluster, ok := clusters.ByTxID(txId) + if !ok { + return nil, fmt.Errorf("could not get local cluster by txID: %x", txId) } - executionResultGroupedMetaList := executionResultMetaList.GroupByResultID() - // maximum number of matching receipts found so far for any execution result id - maxMatchedReceiptCnt := 0 - // execution result id key for the highest number of matching receipts in the identicalReceipts map - var maxMatchedReceiptResultID flow.Identifier + // collect the addresses of all the chosen collection nodes + var targetAddress = make([]string, len(txCluster)) + for i, id := range txCluster { + targetAddress[i] = id.Address + } - // find the largest list of receipts which have the same result ID - for resultID, executionReceiptList := range executionResultGroupedMetaList { - currentMatchedReceiptCnt := executionReceiptList.Size() - if currentMatchedReceiptCnt > maxMatchedReceiptCnt { - maxMatchedReceiptCnt = currentMatchedReceiptCnt - maxMatchedReceiptResultID = resultID - } + return targetAddress, nil +} + +// GetExecutionNodesForBlockID returns upto maxExecutionNodesCnt number of randomly chosen execution node identities +// which have executed the given block ID. +// If no such execution node is found, an InsufficientExecutionReceipts error is returned. +func (nccbs *CircuitBreakerConnectionSelector) GetExecutionNodesForBlockID( + ctx context.Context, + blockID flow.Identifier, +) (flow.IdentityList, error) { + + var executorIDs flow.IdentifierList + + // check if the block ID is of the root block. If it is then don't look for execution receipts since they + // will not be present for the root block. 
+ rootBlock, err := nccbs.state.Params().Root() + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } - // if there are more than one execution result for the same block ID, log as error - if executionResultGroupedMetaList.NumberGroups() > 1 { - identicalReceiptsStr := fmt.Sprintf("%v", flow.GetIDs(allReceipts)) - ncg.log.Error(). - Str("block_id", blockID.String()). - Str("execution_receipts", identicalReceiptsStr). - Msg("execution receipt mismatch") + if rootBlock.ID() == blockID { + executorIdentities, err := nccbs.state.Final().Identities(filter.HasRole(flow.RoleExecution)) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } + executorIDs = executorIdentities.NodeIDs() + } else { + // try to find at least minExecutionNodesCnt execution node ids from the execution receipts for the given blockID + for attempt := 0; attempt < maxAttemptsForExecutionReceipt; attempt++ { + executorIDs, err = findAllExecutionNodes(blockID, nccbs.executionReceipts, nccbs.log) + if err != nil { + return nil, err + } + + if len(executorIDs) > 0 { + break + } + + // log the attempt + nccbs.log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). + Int("execution_receipts_found", len(executorIDs)). + Str("block_id", blockID.String()). 
+ Msg("insufficient execution receipts") + + // if one or less execution receipts may have been received then re-query + // in the hope that more might have been received by now + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(100 * time.Millisecond << time.Duration(attempt)): + //retry after an exponential backoff + } + } } - // pick the largest list of matching receipts - matchingReceiptMetaList := executionResultGroupedMetaList.GetGroup(maxMatchedReceiptResultID) + // choose from the preferred or fixed execution nodes + subsetENs, err := chooseExecutionNodes(nccbs.state, executorIDs) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } - metaReceiptGroupedByExecutorID := matchingReceiptMetaList.GroupByExecutorID() + // randomly choose upto maxExecutionNodesCnt identities + executionIdentitiesRandom := subsetENs.Sample(maxExecutionNodesCnt) - // collect all unique execution node ids from the receipts - var executorIDs flow.IdentifierList - for executorID := range metaReceiptGroupedByExecutorID { - executorIDs = append(executorIDs, executorID) + if len(executionIdentitiesRandom) == 0 { + return nil, fmt.Errorf("no matching execution node found for block ID %v", blockID) } - return executorIDs, nil + return executionIdentitiesRandom, nil } // chooseExecutionNodes finds the subset of execution nodes defined in the identity table by first @@ -208,9 +239,9 @@ func (ncg *NodeConnectionGuard) findAllExecutionNodes( // If neither preferred nor fixed nodes are defined, then all execution node matching the executor IDs are returned. // e.g. 
If execution nodes in identity table are {1,2,3,4}, preferred ENs are defined as {2,3,4} // and the executor IDs is {1,2,3}, then {2, 3} is returned as the chosen subset of ENs -func (ncg *NodeConnectionGuard) chooseExecutionNodes(executorIDs flow.IdentifierList) (flow.IdentityList, error) { +func chooseExecutionNodes(state protocol.State, executorIDs flow.IdentifierList) (flow.IdentityList, error) { - allENs, err := ncg.state.Final().Identities(filter.HasRole(flow.RoleExecution)) + allENs, err := state.Final().Identities(filter.HasRole(flow.RoleExecution)) if err != nil { return nil, fmt.Errorf("failed to retreive all execution IDs: %w", err) } @@ -241,3 +272,59 @@ func (ncg *NodeConnectionGuard) chooseExecutionNodes(executorIDs flow.Identifier // If no preferred or fixed ENs have been specified, then return all executor IDs i.e. no preference at all return allENs.Filter(filter.HasNodeID(executorIDs...)), nil } + +// findAllExecutionNodes find all the execution nodes ids from the execution receipts that have been received for the +// given blockID +func findAllExecutionNodes( + blockID flow.Identifier, + executionReceipts storage.ExecutionReceipts, + log zerolog.Logger) (flow.IdentifierList, error) { + + // lookup the receipt's storage with the block ID + allReceipts, err := executionReceipts.ByBlockID(blockID) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution receipts for block ID %v: %w", blockID, err) + } + + executionResultMetaList := make(flow.ExecutionReceiptMetaList, 0, len(allReceipts)) + for _, r := range allReceipts { + executionResultMetaList = append(executionResultMetaList, r.Meta()) + } + executionResultGroupedMetaList := executionResultMetaList.GroupByResultID() + + // maximum number of matching receipts found so far for any execution result id + maxMatchedReceiptCnt := 0 + // execution result id key for the highest number of matching receipts in the identicalReceipts map + var maxMatchedReceiptResultID flow.Identifier + + // 
find the largest list of receipts which have the same result ID + for resultID, executionReceiptList := range executionResultGroupedMetaList { + currentMatchedReceiptCnt := executionReceiptList.Size() + if currentMatchedReceiptCnt > maxMatchedReceiptCnt { + maxMatchedReceiptCnt = currentMatchedReceiptCnt + maxMatchedReceiptResultID = resultID + } + } + + // if there are more than one execution result for the same block ID, log as error + if executionResultGroupedMetaList.NumberGroups() > 1 { + identicalReceiptsStr := fmt.Sprintf("%v", flow.GetIDs(allReceipts)) + log.Error(). + Str("block_id", blockID.String()). + Str("execution_receipts", identicalReceiptsStr). + Msg("execution receipt mismatch") + } + + // pick the largest list of matching receipts + matchingReceiptMetaList := executionResultGroupedMetaList.GetGroup(maxMatchedReceiptResultID) + + metaReceiptGroupedByExecutorID := matchingReceiptMetaList.GroupByExecutorID() + + // collect all unique execution node ids from the receipts + var executorIDs flow.IdentifierList + for executorID := range metaReceiptGroupedByExecutorID { + executorIDs = append(executorIDs, executorID) + } + + return executorIDs, nil +} diff --git a/engine/access/rpc/backend/mock/connection_selector.go b/engine/access/rpc/backend/mock/connection_selector.go new file mode 100644 index 00000000000..6337683391f --- /dev/null +++ b/engine/access/rpc/backend/mock/connection_selector.go @@ -0,0 +1,82 @@ +// Code generated by mockery v2.21.4. DO NOT EDIT. 
+ +package mock + +import ( + context "context" + + flow "github.com/onflow/flow-go/model/flow" + mock "github.com/stretchr/testify/mock" +) + +// ConnectionSelector is an autogenerated mock type for the ConnectionSelector type +type ConnectionSelector struct { + mock.Mock +} + +// GetCollectionNodes provides a mock function with given fields: txID +func (_m *ConnectionSelector) GetCollectionNodes(txID flow.Identifier) ([]string, error) { + ret := _m.Called(txID) + + var r0 []string + var r1 error + if rf, ok := ret.Get(0).(func(flow.Identifier) ([]string, error)); ok { + return rf(txID) + } + if rf, ok := ret.Get(0).(func(flow.Identifier) []string); ok { + r0 = rf(txID) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]string) + } + } + + if rf, ok := ret.Get(1).(func(flow.Identifier) error); ok { + r1 = rf(txID) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// GetExecutionNodesForBlockID provides a mock function with given fields: ctx, blockID +func (_m *ConnectionSelector) GetExecutionNodesForBlockID(ctx context.Context, blockID flow.Identifier) (flow.IdentityList, error) { + ret := _m.Called(ctx, blockID) + + var r0 flow.IdentityList + var r1 error + if rf, ok := ret.Get(0).(func(context.Context, flow.Identifier) (flow.IdentityList, error)); ok { + return rf(ctx, blockID) + } + if rf, ok := ret.Get(0).(func(context.Context, flow.Identifier) flow.IdentityList); ok { + r0 = rf(ctx, blockID) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(flow.IdentityList) + } + } + + if rf, ok := ret.Get(1).(func(context.Context, flow.Identifier) error); ok { + r1 = rf(ctx, blockID) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +type mockConstructorTestingTNewConnectionSelector interface { + mock.TestingT + Cleanup(func()) +} + +// NewConnectionSelector creates a new instance of ConnectionSelector. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. 
+func NewConnectionSelector(t mockConstructorTestingTNewConnectionSelector) *ConnectionSelector { + mock := &ConnectionSelector{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/engine/access/rpc/engine.go b/engine/access/rpc/engine.go index ae25ecfcb76..4db6d0ab14f 100644 --- a/engine/access/rpc/engine.go +++ b/engine/access/rpc/engine.go @@ -175,6 +175,8 @@ func NewBuilder(log zerolog.Logger, CircuitBreakerConfig: config.CircuitBreakerConfig, } + connectionSelector := backend.NewConnectionSelector(state, executionReceipts, log, config.CircuitBreakerConfig.Enabled) + backend := backend.New(state, collectionRPC, historicalAccessNodes, @@ -187,6 +189,7 @@ func NewBuilder(log zerolog.Logger, chainID, transactionMetrics, connectionFactory, + connectionSelector, retryEnabled, config.MaxHeightRange, config.PreferredExecutionNodeIDs, diff --git a/module/mock/finalized_header_cache.go b/module/mock/finalized_header_cache.go new file mode 100644 index 00000000000..018981fb347 --- /dev/null +++ b/module/mock/finalized_header_cache.go @@ -0,0 +1,44 @@ +// Code generated by mockery v2.21.4. DO NOT EDIT. + +package mock + +import ( + flow "github.com/onflow/flow-go/model/flow" + mock "github.com/stretchr/testify/mock" +) + +// FinalizedHeaderCache is an autogenerated mock type for the FinalizedHeaderCache type +type FinalizedHeaderCache struct { + mock.Mock +} + +// Get provides a mock function with given fields: +func (_m *FinalizedHeaderCache) Get() *flow.Header { + ret := _m.Called() + + var r0 *flow.Header + if rf, ok := ret.Get(0).(func() *flow.Header); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*flow.Header) + } + } + + return r0 +} + +type mockConstructorTestingTNewFinalizedHeaderCache interface { + mock.TestingT + Cleanup(func()) +} + +// NewFinalizedHeaderCache creates a new instance of FinalizedHeaderCache. 
It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +func NewFinalizedHeaderCache(t mockConstructorTestingTNewFinalizedHeaderCache) *FinalizedHeaderCache { + mock := &FinalizedHeaderCache{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} From 43d995f0657cb6fc02835bb6a648468c8db66ae9 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Wed, 7 Jun 2023 01:09:18 +0300 Subject: [PATCH 05/56] Fixed broken tests --- engine/access/access_test.go | 5 +++++ engine/access/rpc/backend/historical_access_test.go | 2 ++ engine/access/rpc/backend/retry_test.go | 2 ++ 3 files changed, 9 insertions(+) diff --git a/engine/access/access_test.go b/engine/access/access_test.go index 8aa301ba49b..677bd218882 100644 --- a/engine/access/access_test.go +++ b/engine/access/access_test.go @@ -150,6 +150,7 @@ func (suite *Suite) RunTest( suite.chainID, suite.metrics, nil, + nil, false, backend.DefaultMaxHeightRange, nil, @@ -322,6 +323,7 @@ func (suite *Suite) TestSendTransactionToRandomCollectionNode() { suite.chainID, metrics, connFactory, + nil, false, backend.DefaultMaxHeightRange, nil, @@ -648,6 +650,7 @@ func (suite *Suite) TestGetSealedTransaction() { suite.chainID, suite.metrics, connFactory, + nil, false, backend.DefaultMaxHeightRange, nil, @@ -787,6 +790,7 @@ func (suite *Suite) TestGetTransactionResult() { suite.chainID, suite.metrics, connFactory, + nil, false, backend.DefaultMaxHeightRange, nil, @@ -978,6 +982,7 @@ func (suite *Suite) TestExecuteScript() { suite.chainID, suite.metrics, connFactory, + nil, false, backend.DefaultMaxHeightRange, nil, diff --git a/engine/access/rpc/backend/historical_access_test.go b/engine/access/rpc/backend/historical_access_test.go index b66904f6604..3ba35c15d70 100644 --- a/engine/access/rpc/backend/historical_access_test.go +++ b/engine/access/rpc/backend/historical_access_test.go @@ -49,6 +49,7 @@ func (suite *Suite) 
TestHistoricalTransactionResult() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -107,6 +108,7 @@ func (suite *Suite) TestHistoricalTransaction() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, diff --git a/engine/access/rpc/backend/retry_test.go b/engine/access/rpc/backend/retry_test.go index c10b66bbbc0..7a4ddf6b375 100644 --- a/engine/access/rpc/backend/retry_test.go +++ b/engine/access/rpc/backend/retry_test.go @@ -54,6 +54,7 @@ func (suite *Suite) TestTransactionRetry() { suite.chainID, metrics.NewNoopCollector(), nil, + nil, false, DefaultMaxHeightRange, nil, @@ -143,6 +144,7 @@ func (suite *Suite) TestSuccessfulTransactionsDontRetry() { suite.chainID, metrics.NewNoopCollector(), connFactory, + nil, false, DefaultMaxHeightRange, nil, From 065494d309e12e2b2aafaeaa481d5abe64cfe435 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 8 Jun 2023 12:56:17 +0300 Subject: [PATCH 06/56] Added tests for circuit breaker. 
--- .../export_report.json | 6 + engine/access/rpc/backend/backend_test.go | 75 ++++++------ .../access/rpc/backend/connection_factory.go | 2 +- .../rpc/backend/connection_factory_test.go | 112 ++++++++++++++++++ .../access/rpc/backend/connection_selector.go | 4 +- go.mod | 2 +- insecure/go.mod | 1 + insecure/go.sum | 2 + 8 files changed, 160 insertions(+), 44 deletions(-) create mode 100644 cmd/util/cmd/execution-state-extract/export_report.json diff --git a/cmd/util/cmd/execution-state-extract/export_report.json b/cmd/util/cmd/execution-state-extract/export_report.json new file mode 100644 index 00000000000..72af21af279 --- /dev/null +++ b/cmd/util/cmd/execution-state-extract/export_report.json @@ -0,0 +1,6 @@ +{ + "EpochCounter": 0, + "PreviousStateCommitment": "53749b13e1f99759abb35a7ab7d7a4f180d8f6bc24e5ef6b29f3565d459765f0", + "CurrentStateCommitment": "53749b13e1f99759abb35a7ab7d7a4f180d8f6bc24e5ef6b29f3565d459765f0", + "ReportSucceeded": true +} \ No newline at end of file diff --git a/engine/access/rpc/backend/backend_test.go b/engine/access/rpc/backend/backend_test.go index 31834efc190..a2d34eca298 100644 --- a/engine/access/rpc/backend/backend_test.go +++ b/engine/access/rpc/backend/backend_test.go @@ -3,8 +3,6 @@ package backend import ( "context" "fmt" - "testing" - "github.com/dgraph-io/badger/v2" accessproto "github.com/onflow/flow/protobuf/go/flow/access" entitiesproto "github.com/onflow/flow/protobuf/go/flow/entities" @@ -16,6 +14,7 @@ import ( "github.com/stretchr/testify/suite" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "testing" access "github.com/onflow/flow-go/engine/access/mock" backendmock "github.com/onflow/flow-go/engine/access/rpc/backend/mock" @@ -27,6 +26,7 @@ import ( "github.com/onflow/flow-go/state/protocol/util" "github.com/onflow/flow-go/storage" storagemock "github.com/onflow/flow-go/storage/mock" + "github.com/onflow/flow-go/utils/rand" "github.com/onflow/flow-go/utils/unittest" ) @@ -47,7 +47,7 @@ type 
Suite struct { execClient *access.ExecutionAPIClient historicalAccessClient *access.AccessAPIClient connectionFactory *backendmock.ConnectionFactory - connSelector *backendmock.ConnectionSelector + nil *backendmock.ConnectionSelector chainID flow.ChainID } @@ -75,7 +75,7 @@ func (suite *Suite) SetupTest() { suite.chainID = flow.Testnet suite.historicalAccessClient = new(access.AccessAPIClient) suite.connectionFactory = new(backendmock.ConnectionFactory) - suite.connSelector = new(backendmock.ConnectionSelector) + suite.nil = new(backendmock.ConnectionSelector) } func (suite *Suite) TestPing() { @@ -648,8 +648,6 @@ func (suite *Suite) TestGetTransactionResultByIndex() { connFactory := new(backendmock.ConnectionFactory) connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) - connSelector := new(backendmock.ConnectionSelector) - exeEventReq := &execproto.GetTransactionByIndexRequest{ BlockId: blockId[:], Index: index, @@ -672,7 +670,7 @@ func (suite *Suite) TestGetTransactionResultByIndex() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -715,8 +713,6 @@ func (suite *Suite) TestGetTransactionResultsByBlockID() { connFactory := new(backendmock.ConnectionFactory) connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) - connSelector := new(backendmock.ConnectionSelector) - exeEventReq := &execproto.GetTransactionsByBlockIDRequest{ BlockId: blockId[:], } @@ -738,7 +734,7 @@ func (suite *Suite) TestGetTransactionResultsByBlockID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -808,8 +804,6 @@ func (suite *Suite) TestTransactionStatusTransition() { connFactory.On("GetExecutionAPIClient", 
mock.Anything).Return(suite.execClient, &mockCloser{}, nil) connFactory.On("InvalidateExecutionAPIClient", mock.Anything) - connSelector := new(backendmock.ConnectionSelector) - exeEventReq := &execproto.GetTransactionResultRequest{ BlockId: blockID[:], TransactionId: txID[:], @@ -832,7 +826,7 @@ func (suite *Suite) TestTransactionStatusTransition() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -1107,7 +1101,6 @@ func (suite *Suite) TestTransactionPendingToFinalizedStatusTransition() { // create a mock connection factory connFactory := suite.setupConnectionFactory() - connSelector := new(backendmock.ConnectionSelector) backend := New( suite.state, @@ -1122,7 +1115,7 @@ func (suite *Suite) TestTransactionPendingToFinalizedStatusTransition() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, 100, nil, @@ -1302,8 +1295,6 @@ func (suite *Suite) TestGetEventsForBlockIDs() { connFactory := new(backendmock.ConnectionFactory) connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) - connSelector := new(backendmock.ConnectionSelector) - // create the expected results from execution node and access node exeResults := make([]*execproto.GetEventsForBlockIDsResponse_Result, len(blockHeaders)) @@ -1369,7 +1360,7 @@ func (suite *Suite) TestGetEventsForBlockIDs() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -1402,7 +1393,7 @@ func (suite *Suite) TestGetEventsForBlockIDs() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, 
DefaultMaxHeightRange, nil, @@ -1429,7 +1420,6 @@ func (suite *Suite) TestGetExecutionResultByID() { // create a mock connection factory connFactory := new(backendmock.ConnectionFactory) - connSelector := new(backendmock.ConnectionSelector) nonexistingID := unittest.IdentifierFixture() blockID := unittest.IdentifierFixture() @@ -1463,7 +1453,7 @@ func (suite *Suite) TestGetExecutionResultByID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -1494,7 +1484,7 @@ func (suite *Suite) TestGetExecutionResultByID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -1523,7 +1513,6 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { // create a mock connection factory connFactory := new(backendmock.ConnectionFactory) - connSelector := new(backendmock.ConnectionSelector) blockID := unittest.IdentifierFixture() executionResult := unittest.ExecutionResultFixture( @@ -1559,7 +1548,7 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -1591,7 +1580,7 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -1727,7 +1716,6 @@ func (suite *Suite) TestGetEventsForHeightRange() { } connFactory := suite.setupConnectionFactory() - connSelector := new(backendmock.ConnectionSelector) suite.Run("invalid request max height < min height", func() { backend := New( @@ -1743,7 +1731,7 @@ func (suite *Suite) 
TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -1783,7 +1771,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -1822,7 +1810,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -1860,7 +1848,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, 1, // set maximum range to 1 nil, @@ -1898,7 +1886,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -1962,8 +1950,6 @@ func (suite *Suite) TestGetAccount() { connFactory := new(backendmock.ConnectionFactory) connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) - connSelector := new(backendmock.ConnectionSelector) - // create the handler with the mock backend := New( suite.state, @@ -1978,7 +1964,7 @@ func (suite *Suite) TestGetAccount() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -2028,8 +2014,6 @@ func (suite *Suite) TestGetAccountAtBlockHeight() { connFactory := new(backendmock.ConnectionFactory) 
connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) - connSelector := new(backendmock.ConnectionSelector) - // create the expected execution API request blockID := h.ID() exeReq := &execproto.GetAccountAtBlockIDRequest{ @@ -2062,7 +2046,7 @@ func (suite *Suite) TestGetAccountAtBlockHeight() { flow.Testnet, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, @@ -2171,6 +2155,19 @@ func (suite *Suite) TestExecutionNodesForBlockID() { suite.state.On("Final").Return(suite.snapshot, nil).Maybe() connSelector := new(backendmock.ConnectionSelector) + connSelector.On("GetExecutionNodesForBlockID").Return(func() flow.IdentityList { + randomItems := make(flow.IdentityList, 0, maxExecutionNodesCnt) + + for i := 0; i < maxExecutionNodesCnt; i++ { + // Generate a random index within the range of the array + randomIndex, err := rand.Uintn(uint(len(allExecutionNodes))) + require.NoError(suite.T(), err) + // Append the item at the random index to the new slice + randomItems = append(randomItems, allExecutionNodes[randomIndex]) + } + + return randomItems + }) testExecutionNodesForBlockID := func(preferredENs, fixedENs, expectedENs flow.IdentityList) { @@ -2270,8 +2267,6 @@ func (suite *Suite) TestExecuteScriptOnExecutionNode() { connFactory.On("GetExecutionAPIClient", mock.Anything).Return(suite.execClient, &mockCloser{}, nil) connFactory.On("InvalidateExecutionAPIClient", mock.Anything) - connSelector := new(backendmock.ConnectionSelector) - // create the handler with the mock backend := New( suite.state, @@ -2286,7 +2281,7 @@ func (suite *Suite) TestExecuteScriptOnExecutionNode() { flow.Mainnet, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - connSelector, + nil, false, DefaultMaxHeightRange, nil, diff --git 
a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index bb4f5e3548f..cf6940f006e 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -265,7 +265,7 @@ func WithClientUnaryInterceptor(timeout time.Duration, circuitBreakerConfig Circ circuitBreaker = gobreaker.NewCircuitBreaker(gobreaker.Settings{ Timeout: circuitBreakerConfig.RestoreTimeout, ReadyToTrip: func(counts gobreaker.Counts) bool { - return counts.ConsecutiveFailures > circuitBreakerConfig.MaxRequestToBreak + return counts.ConsecutiveFailures >= circuitBreakerConfig.MaxRequestToBreak }, }) } diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index fa4801a5897..24953bb22a1 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -3,6 +3,7 @@ package backend import ( "context" "fmt" + "github.com/sony/gobreaker" "net" "strconv" "strings" @@ -407,6 +408,117 @@ func TestConnectionPoolStale(t *testing.T) { assert.Equal(t, resp, expected) } +// TestCircuitBreakerExecutionNode +func TestCircuitBreakerExecutionNode(t *testing.T) { + timeout := time.Second + + // create an execution node + en := new(executionNode) + en.start(t) + defer en.stop(t) + + // setup the handler mock to not respond within the timeout + req := &execution.PingRequest{} + resp := &execution.PingResponse{} + en.handler.On("Ping", testifymock.Anything, req).After(timeout+time.Second).Return(resp, nil) + + // create the factory + connectionFactory := new(ConnectionFactoryImpl) + // set the execution grpc port + connectionFactory.ExecutionGRPCPort = en.port + // set the execution grpc client timeout + connectionFactory.ExecutionNodeGRPCTimeout = timeout + // set the connection pool cache size + cacheSize := 5 + connectionFactory.CacheSize = uint(cacheSize) + // set metrics reporting + 
connectionFactory.AccessMetrics = metrics.NewNoopCollector() + connectionFactory.CircuitBreakerConfig = CircuitBreakerConfig{ + Enabled: true, + MaxRequestToBreak: 1, + RestoreTimeout: 10 * time.Second, + } + + // create the execution API client + client, _, err := connectionFactory.GetExecutionAPIClient(en.listener.Addr().String()) + assert.NoError(t, err) + + ctx := context.Background() + callAndMeasurePingDuration := func() (time.Duration, error) { + start := time.Now() + + // make the call to the execution node + _, err = client.Ping(ctx, req) + + return time.Since(start), err + } + + duration, err := callAndMeasurePingDuration() + // assert that the client timed out + assert.Equal(t, codes.DeadlineExceeded, status.Code(err)) + assert.LessOrEqual(t, timeout, duration) + + duration, err = callAndMeasurePingDuration() + assert.Equal(t, gobreaker.ErrOpenState, err) + assert.Greater(t, timeout, duration) +} + +// TestCircuitBreakerCollectionNode +func TestCircuitBreakerCollectionNode(t *testing.T) { + timeout := time.Second + + // create a collection node + cn := new(collectionNode) + cn.start(t) + defer cn.stop(t) + + // set up the handler mock to not respond within the timeout + req := &access.PingRequest{} + resp := &access.PingResponse{} + cn.handler.On("Ping", testifymock.Anything, req).After(timeout+time.Second).Return(resp, nil) + + // create the factory + connectionFactory := new(ConnectionFactoryImpl) + // set the collection grpc port + connectionFactory.CollectionGRPCPort = cn.port + // set the collection grpc client timeout + connectionFactory.CollectionNodeGRPCTimeout = timeout + // set the connection pool cache size + cacheSize := 5 + connectionFactory.CacheSize = uint(cacheSize) + // set metrics reporting + connectionFactory.AccessMetrics = metrics.NewNoopCollector() + + connectionFactory.CircuitBreakerConfig = CircuitBreakerConfig{ + Enabled: true, + MaxRequestToBreak: 1, + RestoreTimeout: 10 * time.Second, + } + + // create the collection API client 
+ client, _, err := connectionFactory.GetAccessAPIClient(cn.listener.Addr().String()) + assert.NoError(t, err) + + ctx := context.Background() + callAndMeasurePingDuration := func() (time.Duration, error) { + start := time.Now() + + // make the call to the collection node + _, err = client.Ping(ctx, req) + + return time.Since(start), err + } + + duration, err := callAndMeasurePingDuration() + // assert that the client timed out + assert.Equal(t, codes.DeadlineExceeded, status.Code(err)) + assert.LessOrEqual(t, timeout, duration) + + duration, err = callAndMeasurePingDuration() + assert.Equal(t, gobreaker.ErrOpenState, err) + assert.Greater(t, timeout, duration) +} + // node mocks a flow node that runs a GRPC server type node struct { server *grpc.Server diff --git a/engine/access/rpc/backend/connection_selector.go b/engine/access/rpc/backend/connection_selector.go index 8a70e0af5e8..fe351fdb5b2 100644 --- a/engine/access/rpc/backend/connection_selector.go +++ b/engine/access/rpc/backend/connection_selector.go @@ -87,7 +87,7 @@ func (ncs *MainConnectionSelector) GetExecutionNodesForBlockID( // check if the block ID is of the root block. If it is then don't look for execution receipts since they // will not be present for the root block. - rootBlock, err := ncs.state.Params().Root() + rootBlock, err := ncs.state.Params().FinalizedRoot() if err != nil { return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } @@ -177,7 +177,7 @@ func (nccbs *CircuitBreakerConnectionSelector) GetExecutionNodesForBlockID( // check if the block ID is of the root block. If it is then don't look for execution receipts since they // will not be present for the root block. 
- rootBlock, err := nccbs.state.Params().Root() + rootBlock, err := nccbs.state.Params().FinalizedRoot() if err != nil { return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } diff --git a/go.mod b/go.mod index 4413c64fd11..22b92538103 100644 --- a/go.mod +++ b/go.mod @@ -101,6 +101,7 @@ require ( github.com/coreos/go-semver v0.3.0 github.com/onflow/wal v0.0.0-20230529184820-bc9f8244608d github.com/slok/go-http-metrics v0.10.0 + github.com/sony/gobreaker v0.5.0 gonum.org/v1/gonum v0.8.2 ) @@ -242,7 +243,6 @@ require ( github.com/psiemens/sconfig v0.1.0 // indirect github.com/raulk/go-watchdog v1.3.0 // indirect github.com/rivo/uniseg v0.4.4 // indirect - github.com/sony/gobreaker v0.5.0 // indirect github.com/spacemonkeygo/spacelog v0.0.0-20180420211403-2296661a0572 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/spf13/afero v1.9.0 // indirect diff --git a/insecure/go.mod b/insecure/go.mod index 008b173cd61..a068e1efdab 100644 --- a/insecure/go.mod +++ b/insecure/go.mod @@ -212,6 +212,7 @@ require ( github.com/sethvargo/go-retry v0.2.3 // indirect github.com/shirou/gopsutil/v3 v3.22.2 // indirect github.com/slok/go-http-metrics v0.10.0 // indirect + github.com/sony/gobreaker v0.5.0 // indirect github.com/spacemonkeygo/spacelog v0.0.0-20180420211403-2296661a0572 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/spf13/afero v1.9.0 // indirect diff --git a/insecure/go.sum b/insecure/go.sum index 5b525b5d7da..a6b14f91090 100644 --- a/insecure/go.sum +++ b/insecure/go.sum @@ -1368,6 +1368,8 @@ github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9 github.com/smola/gocompat v0.2.0/go.mod h1:1B0MlxbmoZNo3h8guHp8HztB3BSYR5itql9qtVc0ypY= github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= github.com/sony/gobreaker v0.4.1/go.mod h1:ZKptC7FHNvhBz7dN2LGjPVBz2sZJmc0/PkyDJOjmxWY= +github.com/sony/gobreaker v0.5.0 
h1:dRCvqm0P490vZPmy7ppEk2qCnCieBooFJ+YoXGYB+yg= +github.com/sony/gobreaker v0.5.0/go.mod h1:ZKptC7FHNvhBz7dN2LGjPVBz2sZJmc0/PkyDJOjmxWY= github.com/sourcegraph/annotate v0.0.0-20160123013949-f4cad6c6324d/go.mod h1:UdhH50NIW0fCiwBSr0co2m7BnFLdv4fQTgdqdJTHFeE= github.com/sourcegraph/syntaxhighlight v0.0.0-20170531221838-bd320f5d308e/go.mod h1:HuIsMU8RRBOtsCgI77wP899iHVBQpCmg4ErYMZB+2IA= github.com/spacemonkeygo/openssl v0.0.0-20181017203307-c2dcc5cca94a/go.mod h1:7AyxJNCJ7SBZ1MfVQCWD6Uqo2oubI2Eq2y2eqf+A5r0= From ab604f9dab01a187d7a476017bc9e3769d2779c0 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Mon, 12 Jun 2023 14:09:33 +0300 Subject: [PATCH 07/56] Added additional checks to test. --- .../rpc/backend/connection_factory_test.go | 89 ++++++++++++------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index 24953bb22a1..96dba28889b 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -410,34 +410,39 @@ func TestConnectionPoolStale(t *testing.T) { // TestCircuitBreakerExecutionNode func TestCircuitBreakerExecutionNode(t *testing.T) { - timeout := time.Second - + requestTimeout := 1 * time.Second + circuitBreakerRestoreTimeout := 3 * time.Second // create an execution node en := new(executionNode) en.start(t) defer en.stop(t) - // setup the handler mock to not respond within the timeout + // setup the handler mock to not respond within the requestTimeout req := &execution.PingRequest{} resp := &execution.PingResponse{} - en.handler.On("Ping", testifymock.Anything, req).After(timeout+time.Second).Return(resp, nil) + en.handler.On("Ping", testifymock.Anything, req).After(2*requestTimeout).Return(resp, nil) // create the factory connectionFactory := new(ConnectionFactoryImpl) // set the execution grpc port connectionFactory.ExecutionGRPCPort = en.port - // set the 
execution grpc client timeout - connectionFactory.ExecutionNodeGRPCTimeout = timeout - // set the connection pool cache size - cacheSize := 5 - connectionFactory.CacheSize = uint(cacheSize) - // set metrics reporting - connectionFactory.AccessMetrics = metrics.NewNoopCollector() + // set the execution grpc client requestTimeout + connectionFactory.ExecutionNodeGRPCTimeout = requestTimeout + // set the configuration for circuit breaker connectionFactory.CircuitBreakerConfig = CircuitBreakerConfig{ Enabled: true, MaxRequestToBreak: 1, - RestoreTimeout: 10 * time.Second, + RestoreTimeout: circuitBreakerRestoreTimeout, } + // set the connection pool cache size + cacheSize := 1 + cache, _ := lru.NewWithEvict(cacheSize, func(_, evictedValue interface{}) { + evictedValue.(*CachedClient).Close() + }) + connectionFactory.ConnectionsCache = cache + connectionFactory.CacheSize = uint(cacheSize) + // set metrics reporting + connectionFactory.AccessMetrics = metrics.NewNoopCollector() // create the execution API client client, _, err := connectionFactory.GetExecutionAPIClient(en.listener.Addr().String()) @@ -454,46 +459,59 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { } duration, err := callAndMeasurePingDuration() - // assert that the client timed out assert.Equal(t, codes.DeadlineExceeded, status.Code(err)) - assert.LessOrEqual(t, timeout, duration) + assert.LessOrEqual(t, requestTimeout, duration) duration, err = callAndMeasurePingDuration() assert.Equal(t, gobreaker.ErrOpenState, err) - assert.Greater(t, timeout, duration) + assert.Greater(t, requestTimeout, duration) + + //Wait until Circuit breaker go to Half-open state + time.Sleep(circuitBreakerRestoreTimeout + time.Second) + + en.handler.On("Ping", testifymock.Anything, req).Unset() + en.handler.On("Ping", testifymock.Anything, req).Return(resp, nil) + + duration, err = callAndMeasurePingDuration() + assert.Greater(t, requestTimeout, duration) + assert.Equal(t, nil, err) } // TestCircuitBreakerCollectionNode 
func TestCircuitBreakerCollectionNode(t *testing.T) { - timeout := time.Second - + requestTimeout := 1 * time.Second + circuitBreakerRestoreTimeout := 3 * time.Second // create a collection node cn := new(collectionNode) cn.start(t) defer cn.stop(t) - // set up the handler mock to not respond within the timeout + // set up the handler mock to not respond within the requestTimeout req := &access.PingRequest{} resp := &access.PingResponse{} - cn.handler.On("Ping", testifymock.Anything, req).After(timeout+time.Second).Return(resp, nil) + cn.handler.On("Ping", testifymock.Anything, req).After(2*requestTimeout).Return(resp, nil) // create the factory connectionFactory := new(ConnectionFactoryImpl) // set the collection grpc port connectionFactory.CollectionGRPCPort = cn.port - // set the collection grpc client timeout - connectionFactory.CollectionNodeGRPCTimeout = timeout - // set the connection pool cache size - cacheSize := 5 - connectionFactory.CacheSize = uint(cacheSize) - // set metrics reporting - connectionFactory.AccessMetrics = metrics.NewNoopCollector() - + // set the collection grpc client requestTimeout + connectionFactory.CollectionNodeGRPCTimeout = requestTimeout + // set the configuration for circuit breaker connectionFactory.CircuitBreakerConfig = CircuitBreakerConfig{ Enabled: true, MaxRequestToBreak: 1, - RestoreTimeout: 10 * time.Second, + RestoreTimeout: circuitBreakerRestoreTimeout, } + // set the connection pool cache size + cacheSize := 1 + cache, _ := lru.NewWithEvict(cacheSize, func(_, evictedValue interface{}) { + evictedValue.(*CachedClient).Close() + }) + connectionFactory.ConnectionsCache = cache + connectionFactory.CacheSize = uint(cacheSize) + // set metrics reporting + connectionFactory.AccessMetrics = metrics.NewNoopCollector() // create the collection API client client, _, err := connectionFactory.GetAccessAPIClient(cn.listener.Addr().String()) @@ -510,13 +528,22 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { } duration, err 
:= callAndMeasurePingDuration() - // assert that the client timed out assert.Equal(t, codes.DeadlineExceeded, status.Code(err)) - assert.LessOrEqual(t, timeout, duration) + assert.LessOrEqual(t, requestTimeout, duration) duration, err = callAndMeasurePingDuration() assert.Equal(t, gobreaker.ErrOpenState, err) - assert.Greater(t, timeout, duration) + assert.Greater(t, requestTimeout, duration) + + //Wait until Circuit breaker go to Half-open state + time.Sleep(circuitBreakerRestoreTimeout + time.Second) + + cn.handler.On("Ping", testifymock.Anything, req).Unset() + cn.handler.On("Ping", testifymock.Anything, req).Return(resp, nil) + + duration, err = callAndMeasurePingDuration() + assert.Greater(t, requestTimeout, duration) + assert.Equal(t, nil, err) } // node mocks a flow node that runs a GRPC server From bd510e948d0dd3a803e90b9e6e1e6132211fc313 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Mon, 12 Jun 2023 14:56:43 +0300 Subject: [PATCH 08/56] Rename circuit breaker configurations. 
--- cmd/access/node_builder/access_node_builder.go | 10 +++++----- engine/access/rpc/backend/connection_factory.go | 9 ++++----- engine/access/rpc/backend/connection_factory_test.go | 12 ++++++------ 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index 42d5ad29a4c..d24fe4537bd 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -162,9 +162,9 @@ func DefaultAccessNodeConfig() *AccessNodeConfig { ArchiveAddressList: nil, MaxMsgSize: grpcutils.DefaultMaxMsgSize, CircuitBreakerConfig: backend.CircuitBreakerConfig{ - Enabled: false, - RestoreTimeout: time.Duration(60) * time.Second, - MaxRequestToBreak: 5, + Enabled: false, + RestoreTimeout: time.Duration(60) * time.Second, + MaxFailures: 5, }, }, stateStreamConf: state_stream.Config{ @@ -688,7 +688,7 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { flags.StringVar(&builder.PublicNetworkConfig.BindAddress, "public-network-address", defaultConfig.PublicNetworkConfig.BindAddress, "staked access node's public network bind address") flags.BoolVar(&builder.rpcConf.CircuitBreakerConfig.Enabled, "circuit-breaker-enabled", defaultConfig.rpcConf.CircuitBreakerConfig.Enabled, "whether to enable the circuit breaker for collection and execution node connections") flags.DurationVar(&builder.rpcConf.CircuitBreakerConfig.RestoreTimeout, "circuit-breaker-restore-timeout", defaultConfig.rpcConf.CircuitBreakerConfig.RestoreTimeout, "initial timeout for circuit breaker to try connect again. Default value is 60s") - flags.Uint32Var(&builder.rpcConf.CircuitBreakerConfig.MaxRequestToBreak, "circuit-breaker-max-request-to-break", defaultConfig.rpcConf.CircuitBreakerConfig.MaxRequestToBreak, "number of consecutive failures to break connection. 
Default value is 5") + flags.Uint32Var(&builder.rpcConf.CircuitBreakerConfig.MaxFailures, "circuit-breaker-max-failures", defaultConfig.rpcConf.CircuitBreakerConfig.MaxFailures, "number of consecutive failures to break connection. Default value is 5") // ExecutionDataRequester config flags.BoolVar(&builder.executionDataSyncEnabled, "execution-data-sync-enabled", defaultConfig.executionDataSyncEnabled, "whether to enable the execution data sync protocol") @@ -754,7 +754,7 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { } } if builder.rpcConf.CircuitBreakerConfig.Enabled { - if builder.rpcConf.CircuitBreakerConfig.MaxRequestToBreak == 0 { + if builder.rpcConf.CircuitBreakerConfig.MaxFailures == 0 { return errors.New("circuit-breaker-max-request-to-break must be greater than 0") } } diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index cf6940f006e..37cf965c372 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -65,11 +65,10 @@ type ConnectionFactoryImpl struct { CircuitBreakerConfig CircuitBreakerConfig } -// TODO: describe type CircuitBreakerConfig struct { - Enabled bool - RestoreTimeout time.Duration - MaxRequestToBreak uint32 + Enabled bool + RestoreTimeout time.Duration + MaxFailures uint32 } type CachedClient struct { @@ -265,7 +264,7 @@ func WithClientUnaryInterceptor(timeout time.Duration, circuitBreakerConfig Circ circuitBreaker = gobreaker.NewCircuitBreaker(gobreaker.Settings{ Timeout: circuitBreakerConfig.RestoreTimeout, ReadyToTrip: func(counts gobreaker.Counts) bool { - return counts.ConsecutiveFailures >= circuitBreakerConfig.MaxRequestToBreak + return counts.ConsecutiveFailures >= circuitBreakerConfig.MaxFailures }, }) } diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index 96dba28889b..3c96b9c124a 100644 --- 
a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -430,9 +430,9 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { connectionFactory.ExecutionNodeGRPCTimeout = requestTimeout // set the configuration for circuit breaker connectionFactory.CircuitBreakerConfig = CircuitBreakerConfig{ - Enabled: true, - MaxRequestToBreak: 1, - RestoreTimeout: circuitBreakerRestoreTimeout, + Enabled: true, + MaxFailures: 1, + RestoreTimeout: circuitBreakerRestoreTimeout, } // set the connection pool cache size cacheSize := 1 @@ -499,9 +499,9 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { connectionFactory.CollectionNodeGRPCTimeout = requestTimeout // set the configuration for circuit breaker connectionFactory.CircuitBreakerConfig = CircuitBreakerConfig{ - Enabled: true, - MaxRequestToBreak: 1, - RestoreTimeout: circuitBreakerRestoreTimeout, + Enabled: true, + MaxFailures: 1, + RestoreTimeout: circuitBreakerRestoreTimeout, } // set the connection pool cache size cacheSize := 1 From 8a6439fd9f9e5b3b44a0aa0985d51a1b388c30b6 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Mon, 12 Jun 2023 21:32:04 +0300 Subject: [PATCH 09/56] Added missing part of code. 
--- .../access/rpc/backend/connection_selector.go | 64 ++++++++++++------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/engine/access/rpc/backend/connection_selector.go b/engine/access/rpc/backend/connection_selector.go index fe351fdb5b2..c3bbfcf722c 100644 --- a/engine/access/rpc/backend/connection_selector.go +++ b/engine/access/rpc/backend/connection_selector.go @@ -50,9 +50,9 @@ func NewConnectionSelector( } } -func (ncs *MainConnectionSelector) GetCollectionNodes(txId flow.Identifier) ([]string, error) { +func (mcs *MainConnectionSelector) GetCollectionNodes(txId flow.Identifier) ([]string, error) { // retrieve the set of collector clusters - clusters, err := ncs.state.Final().Epochs().Current().Clustering() + clusters, err := mcs.state.Final().Epochs().Current().Clustering() if err != nil { return nil, fmt.Errorf("could not cluster collection nodes: %w", err) } @@ -78,7 +78,7 @@ func (ncs *MainConnectionSelector) GetCollectionNodes(txId flow.Identifier) ([]s // GetExecutionNodesForBlockID returns upto maxExecutionNodesCnt number of randomly chosen execution node identities // which have executed the given block ID. // If no such execution node is found, an InsufficientExecutionReceipts error is returned. -func (ncs *MainConnectionSelector) GetExecutionNodesForBlockID( +func (mcs *MainConnectionSelector) GetExecutionNodesForBlockID( ctx context.Context, blockID flow.Identifier, ) (flow.IdentityList, error) { @@ -87,37 +87,38 @@ func (ncs *MainConnectionSelector) GetExecutionNodesForBlockID( // check if the block ID is of the root block. If it is then don't look for execution receipts since they // will not be present for the root block. 
- rootBlock, err := ncs.state.Params().FinalizedRoot() + rootBlock, err := mcs.state.Params().FinalizedRoot() if err != nil { return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } if rootBlock.ID() == blockID { - executorIdentities, err := ncs.state.Final().Identities(filter.HasRole(flow.RoleExecution)) + executorIdentities, err := mcs.state.Final().Identities(filter.HasRole(flow.RoleExecution)) if err != nil { return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } executorIDs = executorIdentities.NodeIDs() } else { - // try to find at least minExecutionNodesCnt execution node ids from the execution receipts for the given blockID + // try to find atleast minExecutionNodesCnt execution node ids from the execution receipts for the given blockID for attempt := 0; attempt < maxAttemptsForExecutionReceipt; attempt++ { - executorIDs, err = findAllExecutionNodes(blockID, ncs.executionReceipts, ncs.log) + executorIDs, err = findAllExecutionNodes(blockID, mcs.executionReceipts, mcs.log) if err != nil { return nil, err } - if len(executorIDs) > 0 { + if len(executorIDs) >= minExecutionNodesCnt { break } // log the attempt - ncs.log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). + mcs.log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). Int("execution_receipts_found", len(executorIDs)). Str("block_id", blockID.String()). 
Msg("insufficient execution receipts") // if one or less execution receipts may have been received then re-query // in the hope that more might have been received by now + select { case <-ctx.Done(): return nil, ctx.Err() @@ -125,10 +126,20 @@ func (ncs *MainConnectionSelector) GetExecutionNodesForBlockID( //retry after an exponential backoff } } + + receiptCnt := len(executorIDs) + // if less than minExecutionNodesCnt execution receipts have been received so far, then return random ENs + if receiptCnt < minExecutionNodesCnt { + newExecutorIDs, err := mcs.state.AtBlockID(blockID).Identities(filter.HasRole(flow.RoleExecution)) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } + executorIDs = newExecutorIDs.NodeIDs() + } } // choose from the preferred or fixed execution nodes - subsetENs, err := chooseExecutionNodes(ncs.state, executorIDs) + subsetENs, err := chooseExecutionNodes(mcs.state, executorIDs) if err != nil { return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } @@ -143,9 +154,9 @@ func (ncs *MainConnectionSelector) GetExecutionNodesForBlockID( return executionIdentitiesRandom, nil } -func (nccbs *CircuitBreakerConnectionSelector) GetCollectionNodes(txId flow.Identifier) ([]string, error) { +func (cbcs *CircuitBreakerConnectionSelector) GetCollectionNodes(txId flow.Identifier) ([]string, error) { // retrieve the set of collector clusters - clusters, err := nccbs.state.Final().Epochs().Current().Clustering() + clusters, err := cbcs.state.Final().Epochs().Current().Clustering() if err != nil { return nil, fmt.Errorf("could not cluster collection nodes: %w", err) } @@ -168,7 +179,7 @@ func (nccbs *CircuitBreakerConnectionSelector) GetCollectionNodes(txId flow.Iden // GetExecutionNodesForBlockID returns upto maxExecutionNodesCnt number of randomly chosen execution node identities // which have executed the given block ID. 
// If no such execution node is found, an InsufficientExecutionReceipts error is returned. -func (nccbs *CircuitBreakerConnectionSelector) GetExecutionNodesForBlockID( +func (cbcs *CircuitBreakerConnectionSelector) GetExecutionNodesForBlockID( ctx context.Context, blockID flow.Identifier, ) (flow.IdentityList, error) { @@ -177,13 +188,13 @@ func (nccbs *CircuitBreakerConnectionSelector) GetExecutionNodesForBlockID( // check if the block ID is of the root block. If it is then don't look for execution receipts since they // will not be present for the root block. - rootBlock, err := nccbs.state.Params().FinalizedRoot() + rootBlock, err := cbcs.state.Params().FinalizedRoot() if err != nil { return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } if rootBlock.ID() == blockID { - executorIdentities, err := nccbs.state.Final().Identities(filter.HasRole(flow.RoleExecution)) + executorIdentities, err := cbcs.state.Final().Identities(filter.HasRole(flow.RoleExecution)) if err != nil { return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } @@ -191,7 +202,7 @@ func (nccbs *CircuitBreakerConnectionSelector) GetExecutionNodesForBlockID( } else { // try to find at least minExecutionNodesCnt execution node ids from the execution receipts for the given blockID for attempt := 0; attempt < maxAttemptsForExecutionReceipt; attempt++ { - executorIDs, err = findAllExecutionNodes(blockID, nccbs.executionReceipts, nccbs.log) + executorIDs, err = findAllExecutionNodes(blockID, cbcs.executionReceipts, cbcs.log) if err != nil { return nil, err } @@ -201,7 +212,7 @@ func (nccbs *CircuitBreakerConnectionSelector) GetExecutionNodesForBlockID( } // log the attempt - nccbs.log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). + cbcs.log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). Int("execution_receipts_found", len(executorIDs)). 
Str("block_id", blockID.String()). Msg("insufficient execution receipts") @@ -215,22 +226,29 @@ func (nccbs *CircuitBreakerConnectionSelector) GetExecutionNodesForBlockID( //retry after an exponential backoff } } + + receiptCnt := len(executorIDs) + // if less than minExecutionNodesCnt execution receipts have been received so far, then return random ENs + if receiptCnt < minExecutionNodesCnt { + newExecutorIDs, err := cbcs.state.AtBlockID(blockID).Identities(filter.HasRole(flow.RoleExecution)) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } + executorIDs = newExecutorIDs.NodeIDs() + } } // choose from the preferred or fixed execution nodes - subsetENs, err := chooseExecutionNodes(nccbs.state, executorIDs) + subsetENs, err := chooseExecutionNodes(cbcs.state, executorIDs) if err != nil { return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } - // randomly choose upto maxExecutionNodesCnt identities - executionIdentitiesRandom := subsetENs.Sample(maxExecutionNodesCnt) - - if len(executionIdentitiesRandom) == 0 { + if len(subsetENs) == 0 { return nil, fmt.Errorf("no matching execution node found for block ID %v", blockID) } - return executionIdentitiesRandom, nil + return subsetENs, nil } // chooseExecutionNodes finds the subset of execution nodes defined in the identity table by first From 5290e7f0a0a3511d2ebb74c6e5183b1f0baa563b Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Wed, 14 Jun 2023 00:02:45 +0300 Subject: [PATCH 10/56] Added chained interceptor --- Makefile | 1 - apiproxy/access_api_proxy.go | 8 +- .../node_builder/access_node_builder.go | 2 +- engine/access/apiproxy/access_api_proxy.go | 8 +- engine/access/rpc/backend/backend.go | 186 +++++++++- engine/access/rpc/backend/backend_accounts.go | 3 +- engine/access/rpc/backend/backend_events.go | 3 +- engine/access/rpc/backend/backend_scripts.go | 3 +- engine/access/rpc/backend/backend_test.go | 
32 -- .../rpc/backend/backend_transactions.go | 39 +- .../access/rpc/backend/connection_factory.go | 64 ++-- .../access/rpc/backend/connection_selector.go | 348 ------------------ .../rpc/backend/mock/connection_selector.go | 82 ----- engine/access/rpc/engine.go | 3 - 14 files changed, 259 insertions(+), 523 deletions(-) delete mode 100644 engine/access/rpc/backend/connection_selector.go delete mode 100644 engine/access/rpc/backend/mock/connection_selector.go diff --git a/Makefile b/Makefile index 3e4c5ac62ec..f6726da1395 100644 --- a/Makefile +++ b/Makefile @@ -180,7 +180,6 @@ generate-mocks: install-mock-generators mockery --name 'API' --dir="./engine/protocol" --case=underscore --output="./engine/protocol/mock" --outpkg="mock" mockery --name '.*' --dir="./engine/access/state_stream" --case=underscore --output="./engine/access/state_stream/mock" --outpkg="mock" mockery --name 'ConnectionFactory' --dir="./engine/access/rpc/backend" --case=underscore --output="./engine/access/rpc/backend/mock" --outpkg="mock" - mockery --name 'ConnectionSelector' --dir="./engine/access/rpc/backend" --case=underscore --output="./engine/access/rpc/backend/mock" --outpkg="mock" mockery --name 'IngestRPC' --dir="./engine/execution/ingestion" --case=underscore --tags relic --output="./engine/execution/ingestion/mock" --outpkg="mock" mockery --name '.*' --dir=model/fingerprint --case=underscore --output="./model/fingerprint/mock" --outpkg="mock" mockery --name 'ExecForkActor' --structname 'ExecForkActorMock' --dir=module/mempool/consensus/mock/ --case=underscore --output="./module/mempool/consensus/mock/" --outpkg="mock" diff --git a/apiproxy/access_api_proxy.go b/apiproxy/access_api_proxy.go index d54b1dab483..dfe610f5857 100644 --- a/apiproxy/access_api_proxy.go +++ b/apiproxy/access_api_proxy.go @@ -86,9 +86,7 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(grpcutils.DefaultMaxMsgSize)), 
grpc.WithInsecure(), //nolint:staticcheck - backend.WithClientUnaryInterceptor(timeout, backend.CircuitBreakerConfig{ - Enabled: false, - })) + grpc.WithUnaryInterceptor(backend.WithClientUnaryInterceptor(timeout))) if err != nil { return err } @@ -102,9 +100,7 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(grpcutils.DefaultMaxMsgSize)), grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)), - backend.WithClientUnaryInterceptor(timeout, backend.CircuitBreakerConfig{ - Enabled: false, - })) + grpc.WithUnaryInterceptor(backend.WithClientUnaryInterceptor(timeout))) if err != nil { return fmt.Errorf("cannot connect to %s %w", identity.Address, err) } diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index d24fe4537bd..cdf66e66bf0 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -926,7 +926,7 @@ func (builder *FlowAccessNodeBuilder) Build() (cmd.Node, error) { builder.rpcConf.CollectionAddr, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(builder.rpcConf.MaxMsgSize))), grpc.WithTransportCredentials(insecure.NewCredentials()), - backend.WithClientUnaryInterceptor(builder.rpcConf.CollectionClientTimeout, builder.rpcConf.CircuitBreakerConfig)) + grpc.WithUnaryInterceptor(backend.WithClientUnaryInterceptor(builder.rpcConf.CollectionClientTimeout))) if err != nil { return err } diff --git a/engine/access/apiproxy/access_api_proxy.go b/engine/access/apiproxy/access_api_proxy.go index 7123411cc2b..ce95c1cca28 100644 --- a/engine/access/apiproxy/access_api_proxy.go +++ b/engine/access/apiproxy/access_api_proxy.go @@ -65,9 +65,7 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(h.maxMsgSize))), grpc.WithTransportCredentials(insecure.NewCredentials()), - 
backend.WithClientUnaryInterceptor(timeout, backend.CircuitBreakerConfig{ - Enabled: false, - })) + grpc.WithUnaryInterceptor(backend.WithClientUnaryInterceptor(timeout))) if err != nil { return err } @@ -81,9 +79,7 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(h.maxMsgSize))), grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)), - backend.WithClientUnaryInterceptor(timeout, backend.CircuitBreakerConfig{ - Enabled: false, - })) + grpc.WithUnaryInterceptor(backend.WithClientUnaryInterceptor(timeout))) if err != nil { return fmt.Errorf("cannot connect to %s %w", identity.Address, err) } diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index 19996285231..868858c8876 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -3,8 +3,10 @@ package backend import ( "context" "fmt" + "github.com/onflow/flow-go/model/flow/filter" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "time" lru "github.com/hashicorp/golang-lru" accessproto "github.com/onflow/flow/protobuf/go/flow/access" @@ -73,7 +75,6 @@ type Backend struct { collections storage.Collections executionReceipts storage.ExecutionReceipts connFactory ConnectionFactory - connSelector ConnectionSelector } func New( @@ -89,7 +90,6 @@ func New( chainID flow.ChainID, accessMetrics module.AccessMetrics, connFactory ConnectionFactory, - connSelector ConnectionSelector, retryEnabled bool, maxHeightRange uint, preferredExecutionNodeIDs []string, @@ -115,7 +115,6 @@ func New( headers: headers, executionReceipts: executionReceipts, connFactory: connFactory, - connSelector: connSelector, state: state, log: log, metrics: accessMetrics, @@ -134,7 +133,6 @@ func New( transactionMetrics: accessMetrics, retry: retry, connFactory: connFactory, - connSelector: connSelector, previousAccessNodes: historicalAccessNodes, log: log, }, @@ 
-143,7 +141,6 @@ func New( headers: headers, executionReceipts: executionReceipts, connFactory: connFactory, - connSelector: connSelector, log: log, maxHeightRange: maxHeightRange, }, @@ -160,7 +157,6 @@ func New( headers: headers, executionReceipts: executionReceipts, connFactory: connFactory, - connSelector: connSelector, log: log, }, backendExecutionResults: backendExecutionResults{ @@ -174,7 +170,6 @@ func New( collections: collections, executionReceipts: executionReceipts, connFactory: connFactory, - connSelector: connSelector, chainID: chainID, } @@ -289,3 +284,180 @@ func (b *Backend) GetLatestProtocolStateSnapshot(_ context.Context) ([]byte, err return convert.SnapshotToBytes(validSnapshot) } + +// executionNodesForBlockID returns upto maxExecutionNodesCnt number of randomly chosen execution node identities +// which have executed the given block ID. +// If no such execution node is found, an InsufficientExecutionReceipts error is returned. +func executionNodesForBlockID( + ctx context.Context, + blockID flow.Identifier, + executionReceipts storage.ExecutionReceipts, + state protocol.State, + log zerolog.Logger) (flow.IdentityList, error) { + + var executorIDs flow.IdentifierList + + // check if the block ID is of the root block. If it is then don't look for execution receipts since they + // will not be present for the root block. 
+ rootBlock, err := state.Params().FinalizedRoot() + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } + + if rootBlock.ID() == blockID { + executorIdentities, err := state.Final().Identities(filter.HasRole(flow.RoleExecution)) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } + executorIDs = executorIdentities.NodeIDs() + } else { + // try to find atleast minExecutionNodesCnt execution node ids from the execution receipts for the given blockID + for attempt := 0; attempt < maxAttemptsForExecutionReceipt; attempt++ { + executorIDs, err = findAllExecutionNodes(blockID, executionReceipts, log) + if err != nil { + return nil, err + } + + if len(executorIDs) >= minExecutionNodesCnt { + break + } + + // log the attempt + log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). + Int("execution_receipts_found", len(executorIDs)). + Str("block_id", blockID.String()). 
+ Msg("insufficient execution receipts") + + // if one or less execution receipts may have been received then re-query + // in the hope that more might have been received by now + + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(100 * time.Millisecond << time.Duration(attempt)): + //retry after an exponential backoff + } + } + + receiptCnt := len(executorIDs) + // if less than minExecutionNodesCnt execution receipts have been received so far, then return random ENs + if receiptCnt < minExecutionNodesCnt { + newExecutorIDs, err := state.AtBlockID(blockID).Identities(filter.HasRole(flow.RoleExecution)) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } + executorIDs = newExecutorIDs.NodeIDs() + } + } + + // choose from the preferred or fixed execution nodes + subsetENs, err := chooseExecutionNodes(state, executorIDs) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) + } + + // randomly choose upto maxExecutionNodesCnt identities + executionIdentitiesRandom := subsetENs.Sample(maxExecutionNodesCnt) + + if len(executionIdentitiesRandom) == 0 { + return nil, fmt.Errorf("no matching execution node found for block ID %v", blockID) + } + + return executionIdentitiesRandom, nil +} + +// findAllExecutionNodes find all the execution nodes ids from the execution receipts that have been received for the +// given blockID +func findAllExecutionNodes( + blockID flow.Identifier, + executionReceipts storage.ExecutionReceipts, + log zerolog.Logger) (flow.IdentifierList, error) { + + // lookup the receipt's storage with the block ID + allReceipts, err := executionReceipts.ByBlockID(blockID) + if err != nil { + return nil, fmt.Errorf("failed to retreive execution receipts for block ID %v: %w", blockID, err) + } + + executionResultMetaList := make(flow.ExecutionReceiptMetaList, 0, len(allReceipts)) + for _, r := range allReceipts { 
+ executionResultMetaList = append(executionResultMetaList, r.Meta()) + } + executionResultGroupedMetaList := executionResultMetaList.GroupByResultID() + + // maximum number of matching receipts found so far for any execution result id + maxMatchedReceiptCnt := 0 + // execution result id key for the highest number of matching receipts in the identicalReceipts map + var maxMatchedReceiptResultID flow.Identifier + + // find the largest list of receipts which have the same result ID + for resultID, executionReceiptList := range executionResultGroupedMetaList { + currentMatchedReceiptCnt := executionReceiptList.Size() + if currentMatchedReceiptCnt > maxMatchedReceiptCnt { + maxMatchedReceiptCnt = currentMatchedReceiptCnt + maxMatchedReceiptResultID = resultID + } + } + + // if there are more than one execution result for the same block ID, log as error + if executionResultGroupedMetaList.NumberGroups() > 1 { + identicalReceiptsStr := fmt.Sprintf("%v", flow.GetIDs(allReceipts)) + log.Error(). + Str("block_id", blockID.String()). + Str("execution_receipts", identicalReceiptsStr). + Msg("execution receipt mismatch") + } + + // pick the largest list of matching receipts + matchingReceiptMetaList := executionResultGroupedMetaList.GetGroup(maxMatchedReceiptResultID) + + metaReceiptGroupedByExecutorID := matchingReceiptMetaList.GroupByExecutorID() + + // collect all unique execution node ids from the receipts + var executorIDs flow.IdentifierList + for executorID := range metaReceiptGroupedByExecutorID { + executorIDs = append(executorIDs, executorID) + } + + return executorIDs, nil +} + +// chooseExecutionNodes finds the subset of execution nodes defined in the identity table by first +// choosing the preferred execution nodes which have executed the transaction. 
If no such preferred +// execution nodes are found, then the fixed execution nodes defined in the identity table are returned +// If neither preferred nor fixed nodes are defined, then all execution node matching the executor IDs are returned. +// e.g. If execution nodes in identity table are {1,2,3,4}, preferred ENs are defined as {2,3,4} +// and the executor IDs is {1,2,3}, then {2, 3} is returned as the chosen subset of ENs +func chooseExecutionNodes(state protocol.State, executorIDs flow.IdentifierList) (flow.IdentityList, error) { + + allENs, err := state.Final().Identities(filter.HasRole(flow.RoleExecution)) + if err != nil { + return nil, fmt.Errorf("failed to retreive all execution IDs: %w", err) + } + + // first try and choose from the preferred EN IDs + var chosenIDs flow.IdentityList + if len(preferredENIdentifiers) > 0 { + // find the preferred execution node IDs which have executed the transaction + chosenIDs = allENs.Filter(filter.And(filter.HasNodeID(preferredENIdentifiers...), + filter.HasNodeID(executorIDs...))) + if len(chosenIDs) > 0 { + return chosenIDs, nil + } + } + + // if no preferred EN ID is found, then choose from the fixed EN IDs + if len(fixedENIdentifiers) > 0 { + // choose fixed ENs which have executed the transaction + chosenIDs = allENs.Filter(filter.And(filter.HasNodeID(fixedENIdentifiers...), filter.HasNodeID(executorIDs...))) + if len(chosenIDs) > 0 { + return chosenIDs, nil + } + // if no such ENs are found then just choose all fixed ENs + chosenIDs = allENs.Filter(filter.HasNodeID(fixedENIdentifiers...)) + return chosenIDs, nil + } + + // If no preferred or fixed ENs have been specified, then return all executor IDs i.e. 
no preference at all + return allENs.Filter(filter.HasNodeID(executorIDs...)), nil +} diff --git a/engine/access/rpc/backend/backend_accounts.go b/engine/access/rpc/backend/backend_accounts.go index 4781e395d23..a3a41053c61 100644 --- a/engine/access/rpc/backend/backend_accounts.go +++ b/engine/access/rpc/backend/backend_accounts.go @@ -22,7 +22,6 @@ type backendAccounts struct { headers storage.Headers executionReceipts storage.ExecutionReceipts connFactory ConnectionFactory - connSelector ConnectionSelector log zerolog.Logger } @@ -83,7 +82,7 @@ func (b *backendAccounts) getAccountAtBlockID( BlockId: blockID[:], } - execNodes, err := b.connSelector.GetExecutionNodesForBlockID(ctx, blockID) + execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { return nil, rpc.ConvertError(err, "failed to get account from the execution node", codes.Internal) } diff --git a/engine/access/rpc/backend/backend_events.go b/engine/access/rpc/backend/backend_events.go index f2481b61288..f48ba395947 100644 --- a/engine/access/rpc/backend/backend_events.go +++ b/engine/access/rpc/backend/backend_events.go @@ -25,7 +25,6 @@ type backendEvents struct { executionReceipts storage.ExecutionReceipts state protocol.State connFactory ConnectionFactory - connSelector ConnectionSelector log zerolog.Logger maxHeightRange uint } @@ -130,7 +129,7 @@ func (b *backendEvents) getBlockEventsFromExecutionNode( // choose the last block ID to find the list of execution nodes lastBlockID := blockIDs[len(blockIDs)-1] - execNodes, err := b.connSelector.GetExecutionNodesForBlockID(ctx, lastBlockID) + execNodes, err := executionNodesForBlockID(ctx, lastBlockID, b.executionReceipts, b.state, b.log) if err != nil { b.log.Error().Err(err).Msg("failed to retrieve events from execution node") return nil, rpc.ConvertError(err, "failed to retrieve events from execution node", codes.Internal) diff --git a/engine/access/rpc/backend/backend_scripts.go 
b/engine/access/rpc/backend/backend_scripts.go index 48df09d6f12..e6da62a5b7b 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -28,7 +28,6 @@ type backendScripts struct { executionReceipts storage.ExecutionReceipts state protocol.State connFactory ConnectionFactory - connSelector ConnectionSelector log zerolog.Logger metrics module.BackendScriptsMetrics loggedScripts *lru.Cache @@ -92,7 +91,7 @@ func (b *backendScripts) findScriptExecutors( return b.archiveAddressList, nil } - executors, err := b.connSelector.GetExecutionNodesForBlockID(ctx, blockID) + executors, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { return nil, err } diff --git a/engine/access/rpc/backend/backend_test.go b/engine/access/rpc/backend/backend_test.go index a2d34eca298..9751f64b865 100644 --- a/engine/access/rpc/backend/backend_test.go +++ b/engine/access/rpc/backend/backend_test.go @@ -100,7 +100,6 @@ func (suite *Suite) TestPing() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -136,7 +135,6 @@ func (suite *Suite) TestGetLatestFinalizedBlockHeader() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -202,7 +200,6 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_NoTransitionSpan() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, 100, nil, @@ -275,7 +272,6 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_TransitionSpans() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, 100, nil, @@ -341,7 +337,6 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_PhaseTransitionSpan() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, 100, nil, @@ -418,7 +413,6 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_EpochTransitionSpan() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, 100, nil, @@ -479,7 +473,6 @@ func 
(suite *Suite) TestGetLatestProtocolStateSnapshot_HistoryLimit() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -518,7 +511,6 @@ func (suite *Suite) TestGetLatestSealedBlockHeader() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -565,7 +557,6 @@ func (suite *Suite) TestGetTransaction() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -606,7 +597,6 @@ func (suite *Suite) TestGetCollection() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -670,7 +660,6 @@ func (suite *Suite) TestGetTransactionResultByIndex() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -734,7 +723,6 @@ func (suite *Suite) TestGetTransactionResultsByBlockID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -826,7 +814,6 @@ func (suite *Suite) TestTransactionStatusTransition() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -947,7 +934,6 @@ func (suite *Suite) TestTransactionExpiredStatusTransition() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -1115,7 +1101,6 @@ func (suite *Suite) TestTransactionPendingToFinalizedStatusTransition() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, 100, nil, @@ -1174,7 +1159,6 @@ func (suite *Suite) TestTransactionResultUnknown() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -1229,7 +1213,6 @@ func (suite 
*Suite) TestGetLatestFinalizedBlock() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -1360,7 +1343,6 @@ func (suite *Suite) TestGetEventsForBlockIDs() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -1393,7 +1375,6 @@ func (suite *Suite) TestGetEventsForBlockIDs() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -1453,7 +1434,6 @@ func (suite *Suite) TestGetExecutionResultByID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -1484,7 +1464,6 @@ func (suite *Suite) TestGetExecutionResultByID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -1548,7 +1527,6 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -1580,7 +1558,6 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -1731,7 +1708,6 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -1771,7 +1747,6 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection 
factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -1810,7 +1785,6 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -1848,7 +1822,6 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, 1, // set maximum range to 1 nil, @@ -1886,7 +1859,6 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -1964,7 +1936,6 @@ func (suite *Suite) TestGetAccount() { suite.chainID, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -2046,7 +2017,6 @@ func (suite *Suite) TestGetAccountAtBlockHeight() { flow.Testnet, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, @@ -2086,7 +2056,6 @@ func (suite *Suite) TestGetNetworkParameters() { flow.Mainnet, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -2281,7 +2250,6 @@ func (suite *Suite) TestExecuteScriptOnExecutionNode() { flow.Mainnet, metrics.NewNoopCollector(), connFactory, // the connection factory should be used to get the execution node client - nil, false, DefaultMaxHeightRange, nil, diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index 535387b63f2..5bc31162a07 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ 
-24,6 +24,8 @@ import ( "github.com/onflow/flow-go/storage" ) +const collectionNodesToTry uint = 3 + type backendTransactions struct { staticCollectionRPC accessproto.AccessAPIClient // rpc client tied to a fixed collection node transactions storage.Transactions @@ -36,7 +38,6 @@ type backendTransactions struct { transactionValidator *access.TransactionValidator retry *Retry connFactory ConnectionFactory - connSelector ConnectionSelector previousAccessNodes []accessproto.AccessAPIClient log zerolog.Logger @@ -85,7 +86,7 @@ func (b *backendTransactions) trySendTransaction(ctx context.Context, tx *flow.T } // otherwise choose a random set of collections nodes to try - collAddrs, err := b.connSelector.GetCollectionNodes(tx.ID()) + collAddrs, err := b.chooseCollectionNodes(tx, collectionNodesToTry) if err != nil { return fmt.Errorf("failed to determine collection node for tx %x: %w", tx, err) } @@ -111,6 +112,34 @@ func (b *backendTransactions) trySendTransaction(ctx context.Context, tx *flow.T return sendErrors.ErrorOrNil() } +// chooseCollectionNodes finds a random subset of size sampleSize of collection node addresses from the +// collection node cluster responsible for the given tx +func (b *backendTransactions) chooseCollectionNodes(tx *flow.TransactionBody, sampleSize uint) ([]string, error) { + + // retrieve the set of collector clusters + clusters, err := b.state.Final().Epochs().Current().Clustering() + if err != nil { + return nil, fmt.Errorf("could not cluster collection nodes: %w", err) + } + + // get the cluster responsible for the transaction + txCluster, ok := clusters.ByTxID(tx.ID()) + if !ok { + return nil, fmt.Errorf("could not get local cluster by txID: %x", tx.ID()) + } + + // select a random subset of collection nodes from the cluster to be tried in order + targetNodes := txCluster.Sample(sampleSize) + + // collect the addresses of all the chosen collection nodes + var targetAddrs = make([]string, len(targetNodes)) + for i, id := range targetNodes 
{ + targetAddrs[i] = id.Address + } + + return targetAddrs, nil +} + // sendTransactionToCollection sends the transaction to the given collection node via grpc func (b *backendTransactions) sendTransactionToCollector(ctx context.Context, tx *flow.TransactionBody, @@ -342,7 +371,7 @@ func (b *backendTransactions) GetTransactionResultsByBlockID( req := &execproto.GetTransactionsByBlockIDRequest{ BlockId: blockID[:], } - execNodes, err := b.connSelector.GetExecutionNodesForBlockID(ctx, blockID) + execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { if IsInsufficientExecutionReceipts(err) { return nil, status.Errorf(codes.NotFound, err.Error()) @@ -473,7 +502,7 @@ func (b *backendTransactions) GetTransactionResultByIndex( BlockId: blockID[:], Index: index, } - execNodes, err := b.connSelector.GetExecutionNodesForBlockID(ctx, blockID) + execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { if IsInsufficientExecutionReceipts(err) { return nil, status.Errorf(codes.NotFound, err.Error()) @@ -702,7 +731,7 @@ func (b *backendTransactions) getTransactionResultFromExecutionNode( TransactionId: transactionID, } - execNodes, err := b.connSelector.GetExecutionNodesForBlockID(ctx, blockID) + execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { // if no execution receipt were found, return a NotFound GRPC error if IsInsufficientExecutionReceipts(err) { diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 37cf965c372..a5f63ab9eb1 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -101,7 +101,8 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(cf.MaxMsgSize))), 
grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithKeepaliveParams(keepaliveParams), - WithClientUnaryInterceptor(timeout, cf.CircuitBreakerConfig)) + cf.withChainUnaryInterceptor(timeout), + ) if err != nil { return nil, fmt.Errorf("failed to connect to address %s: %w", address, err) } @@ -257,18 +258,42 @@ func getGRPCAddress(address string, grpcPort uint) (string, error) { return grpcAddress, nil } -func WithClientUnaryInterceptor(timeout time.Duration, circuitBreakerConfig CircuitBreakerConfig) grpc.DialOption { - var circuitBreaker *gobreaker.CircuitBreaker +func (cf *ConnectionFactoryImpl) withChainUnaryInterceptor(timeout time.Duration) grpc.DialOption { + var clientInterceptors []grpc.UnaryClientInterceptor - if circuitBreakerConfig.Enabled { - circuitBreaker = gobreaker.NewCircuitBreaker(gobreaker.Settings{ - Timeout: circuitBreakerConfig.RestoreTimeout, + if cf.CircuitBreakerConfig.Enabled { + circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{ + Timeout: cf.CircuitBreakerConfig.RestoreTimeout, ReadyToTrip: func(counts gobreaker.Counts) bool { - return counts.ConsecutiveFailures >= circuitBreakerConfig.MaxFailures + return counts.ConsecutiveFailures >= cf.CircuitBreakerConfig.MaxFailures }, }) + + interceptor := func( + ctx context.Context, + method string, + req interface{}, + reply interface{}, + cc *grpc.ClientConn, + invoker grpc.UnaryInvoker, + opts ...grpc.CallOption, + ) error { + _, err := circuitBreaker.Execute(func() (interface{}, error) { + err := invoker(ctx, method, req, reply, cc, opts...) + + return nil, err + }) + return err + } + clientInterceptors = append(clientInterceptors, interceptor) } + clientInterceptors = append(clientInterceptors, WithClientUnaryInterceptor(timeout)) + + return grpc.WithChainUnaryInterceptor(clientInterceptors...) 
+} + +func WithClientUnaryInterceptor(timeout time.Duration) grpc.UnaryClientInterceptor { clientTimeoutInterceptor := func( ctx context.Context, method string, @@ -278,29 +303,16 @@ func WithClientUnaryInterceptor(timeout time.Duration, circuitBreakerConfig Circ invoker grpc.UnaryInvoker, opts ...grpc.CallOption, ) error { - exec := func() (interface{}, error) { - // create a context that expires after timeout - ctxWithTimeout, cancel := context.WithTimeout(ctx, timeout) - - defer cancel() + // create a context that expires after timeout + ctxWithTimeout, cancel := context.WithTimeout(ctx, timeout) - // call the remote GRPC using the short context - err := invoker(ctxWithTimeout, method, req, reply, cc, opts...) + defer cancel() - //TODO: As invoker do not return any results, for now nil returned - return nil, err - } - - var err error - - if circuitBreakerConfig.Enabled { - _, err = circuitBreaker.Execute(exec) - } else { - _, err = exec() - } + // call the remote GRPC using the short context + err := invoker(ctxWithTimeout, method, req, reply, cc, opts...) 
return err } - return grpc.WithUnaryInterceptor(clientTimeoutInterceptor) + return clientTimeoutInterceptor } diff --git a/engine/access/rpc/backend/connection_selector.go b/engine/access/rpc/backend/connection_selector.go deleted file mode 100644 index c3bbfcf722c..00000000000 --- a/engine/access/rpc/backend/connection_selector.go +++ /dev/null @@ -1,348 +0,0 @@ -package backend - -import ( - "context" - "fmt" - "time" - - "github.com/onflow/flow-go/model/flow" - "github.com/onflow/flow-go/model/flow/filter" - "github.com/onflow/flow-go/state/protocol" - "github.com/onflow/flow-go/storage" - "github.com/rs/zerolog" -) - -const collectionNodesToTry uint = 3 - -type ConnectionSelector interface { - GetExecutionNodesForBlockID(ctx context.Context, blockID flow.Identifier) (flow.IdentityList, error) - GetCollectionNodes(txID flow.Identifier) ([]string, error) -} - -type MainConnectionSelector struct { - state protocol.State - executionReceipts storage.ExecutionReceipts - log zerolog.Logger -} - -type CircuitBreakerConnectionSelector MainConnectionSelector - -var _ ConnectionSelector = (*MainConnectionSelector)(nil) - -func NewConnectionSelector( - state protocol.State, - executionReceipts storage.ExecutionReceipts, - log zerolog.Logger, - isCircuitBreakerEnabled bool, -) ConnectionSelector { - if isCircuitBreakerEnabled { - return &CircuitBreakerConnectionSelector{ - state: state, - executionReceipts: executionReceipts, - log: log, - } - } else { - return &MainConnectionSelector{ - state: state, - executionReceipts: executionReceipts, - log: log, - } - } -} - -func (mcs *MainConnectionSelector) GetCollectionNodes(txId flow.Identifier) ([]string, error) { - // retrieve the set of collector clusters - clusters, err := mcs.state.Final().Epochs().Current().Clustering() - if err != nil { - return nil, fmt.Errorf("could not cluster collection nodes: %w", err) - } - - // get the cluster responsible for the transaction - txCluster, ok := clusters.ByTxID(txId) - if !ok { - 
return nil, fmt.Errorf("could not get local cluster by txID: %x", txId) - } - - // select a random subset of collection nodes from the cluster to be tried in order - targetNodes := txCluster.Sample(collectionNodesToTry) - - // collect the addresses of all the chosen collection nodes - var targetAddrs = make([]string, len(targetNodes)) - for i, id := range targetNodes { - targetAddrs[i] = id.Address - } - - return targetAddrs, nil -} - -// GetExecutionNodesForBlockID returns upto maxExecutionNodesCnt number of randomly chosen execution node identities -// which have executed the given block ID. -// If no such execution node is found, an InsufficientExecutionReceipts error is returned. -func (mcs *MainConnectionSelector) GetExecutionNodesForBlockID( - ctx context.Context, - blockID flow.Identifier, -) (flow.IdentityList, error) { - - var executorIDs flow.IdentifierList - - // check if the block ID is of the root block. If it is then don't look for execution receipts since they - // will not be present for the root block. 
- rootBlock, err := mcs.state.Params().FinalizedRoot() - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - - if rootBlock.ID() == blockID { - executorIdentities, err := mcs.state.Final().Identities(filter.HasRole(flow.RoleExecution)) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - executorIDs = executorIdentities.NodeIDs() - } else { - // try to find atleast minExecutionNodesCnt execution node ids from the execution receipts for the given blockID - for attempt := 0; attempt < maxAttemptsForExecutionReceipt; attempt++ { - executorIDs, err = findAllExecutionNodes(blockID, mcs.executionReceipts, mcs.log) - if err != nil { - return nil, err - } - - if len(executorIDs) >= minExecutionNodesCnt { - break - } - - // log the attempt - mcs.log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). - Int("execution_receipts_found", len(executorIDs)). - Str("block_id", blockID.String()). 
- Msg("insufficient execution receipts") - - // if one or less execution receipts may have been received then re-query - // in the hope that more might have been received by now - - select { - case <-ctx.Done(): - return nil, ctx.Err() - case <-time.After(100 * time.Millisecond << time.Duration(attempt)): - //retry after an exponential backoff - } - } - - receiptCnt := len(executorIDs) - // if less than minExecutionNodesCnt execution receipts have been received so far, then return random ENs - if receiptCnt < minExecutionNodesCnt { - newExecutorIDs, err := mcs.state.AtBlockID(blockID).Identities(filter.HasRole(flow.RoleExecution)) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - executorIDs = newExecutorIDs.NodeIDs() - } - } - - // choose from the preferred or fixed execution nodes - subsetENs, err := chooseExecutionNodes(mcs.state, executorIDs) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - - // randomly choose upto maxExecutionNodesCnt identities - executionIdentitiesRandom := subsetENs.Sample(maxExecutionNodesCnt) - - if len(executionIdentitiesRandom) == 0 { - return nil, fmt.Errorf("no matching execution node found for block ID %v", blockID) - } - - return executionIdentitiesRandom, nil -} - -func (cbcs *CircuitBreakerConnectionSelector) GetCollectionNodes(txId flow.Identifier) ([]string, error) { - // retrieve the set of collector clusters - clusters, err := cbcs.state.Final().Epochs().Current().Clustering() - if err != nil { - return nil, fmt.Errorf("could not cluster collection nodes: %w", err) - } - - // get the cluster responsible for the transaction - txCluster, ok := clusters.ByTxID(txId) - if !ok { - return nil, fmt.Errorf("could not get local cluster by txID: %x", txId) - } - - // collect the addresses of all the chosen collection nodes - var targetAddress = make([]string, len(txCluster)) - for i, id := range 
txCluster { - targetAddress[i] = id.Address - } - - return targetAddress, nil -} - -// GetExecutionNodesForBlockID returns upto maxExecutionNodesCnt number of randomly chosen execution node identities -// which have executed the given block ID. -// If no such execution node is found, an InsufficientExecutionReceipts error is returned. -func (cbcs *CircuitBreakerConnectionSelector) GetExecutionNodesForBlockID( - ctx context.Context, - blockID flow.Identifier, -) (flow.IdentityList, error) { - - var executorIDs flow.IdentifierList - - // check if the block ID is of the root block. If it is then don't look for execution receipts since they - // will not be present for the root block. - rootBlock, err := cbcs.state.Params().FinalizedRoot() - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - - if rootBlock.ID() == blockID { - executorIdentities, err := cbcs.state.Final().Identities(filter.HasRole(flow.RoleExecution)) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - executorIDs = executorIdentities.NodeIDs() - } else { - // try to find at least minExecutionNodesCnt execution node ids from the execution receipts for the given blockID - for attempt := 0; attempt < maxAttemptsForExecutionReceipt; attempt++ { - executorIDs, err = findAllExecutionNodes(blockID, cbcs.executionReceipts, cbcs.log) - if err != nil { - return nil, err - } - - if len(executorIDs) > 0 { - break - } - - // log the attempt - cbcs.log.Debug().Int("attempt", attempt).Int("max_attempt", maxAttemptsForExecutionReceipt). - Int("execution_receipts_found", len(executorIDs)). - Str("block_id", blockID.String()). 
- Msg("insufficient execution receipts") - - // if one or less execution receipts may have been received then re-query - // in the hope that more might have been received by now - select { - case <-ctx.Done(): - return nil, ctx.Err() - case <-time.After(100 * time.Millisecond << time.Duration(attempt)): - //retry after an exponential backoff - } - } - - receiptCnt := len(executorIDs) - // if less than minExecutionNodesCnt execution receipts have been received so far, then return random ENs - if receiptCnt < minExecutionNodesCnt { - newExecutorIDs, err := cbcs.state.AtBlockID(blockID).Identities(filter.HasRole(flow.RoleExecution)) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - executorIDs = newExecutorIDs.NodeIDs() - } - } - - // choose from the preferred or fixed execution nodes - subsetENs, err := chooseExecutionNodes(cbcs.state, executorIDs) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) - } - - if len(subsetENs) == 0 { - return nil, fmt.Errorf("no matching execution node found for block ID %v", blockID) - } - - return subsetENs, nil -} - -// chooseExecutionNodes finds the subset of execution nodes defined in the identity table by first -// choosing the preferred execution nodes which have executed the transaction. If no such preferred -// execution nodes are found, then the fixed execution nodes defined in the identity table are returned -// If neither preferred nor fixed nodes are defined, then all execution node matching the executor IDs are returned. -// e.g. 
If execution nodes in identity table are {1,2,3,4}, preferred ENs are defined as {2,3,4} -// and the executor IDs is {1,2,3}, then {2, 3} is returned as the chosen subset of ENs -func chooseExecutionNodes(state protocol.State, executorIDs flow.IdentifierList) (flow.IdentityList, error) { - - allENs, err := state.Final().Identities(filter.HasRole(flow.RoleExecution)) - if err != nil { - return nil, fmt.Errorf("failed to retreive all execution IDs: %w", err) - } - - // first try and choose from the preferred EN IDs - var chosenIDs flow.IdentityList - if len(preferredENIdentifiers) > 0 { - // find the preferred execution node IDs which have executed the transaction - chosenIDs = allENs.Filter(filter.And(filter.HasNodeID(preferredENIdentifiers...), - filter.HasNodeID(executorIDs...))) - if len(chosenIDs) > 0 { - return chosenIDs, nil - } - } - - // if no preferred EN ID is found, then choose from the fixed EN IDs - if len(fixedENIdentifiers) > 0 { - // choose fixed ENs which have executed the transaction - chosenIDs = allENs.Filter(filter.And(filter.HasNodeID(fixedENIdentifiers...), filter.HasNodeID(executorIDs...))) - if len(chosenIDs) > 0 { - return chosenIDs, nil - } - // if no such ENs are found then just choose all fixed ENs - chosenIDs = allENs.Filter(filter.HasNodeID(fixedENIdentifiers...)) - return chosenIDs, nil - } - - // If no preferred or fixed ENs have been specified, then return all executor IDs i.e. 
no preference at all - return allENs.Filter(filter.HasNodeID(executorIDs...)), nil -} - -// findAllExecutionNodes find all the execution nodes ids from the execution receipts that have been received for the -// given blockID -func findAllExecutionNodes( - blockID flow.Identifier, - executionReceipts storage.ExecutionReceipts, - log zerolog.Logger) (flow.IdentifierList, error) { - - // lookup the receipt's storage with the block ID - allReceipts, err := executionReceipts.ByBlockID(blockID) - if err != nil { - return nil, fmt.Errorf("failed to retreive execution receipts for block ID %v: %w", blockID, err) - } - - executionResultMetaList := make(flow.ExecutionReceiptMetaList, 0, len(allReceipts)) - for _, r := range allReceipts { - executionResultMetaList = append(executionResultMetaList, r.Meta()) - } - executionResultGroupedMetaList := executionResultMetaList.GroupByResultID() - - // maximum number of matching receipts found so far for any execution result id - maxMatchedReceiptCnt := 0 - // execution result id key for the highest number of matching receipts in the identicalReceipts map - var maxMatchedReceiptResultID flow.Identifier - - // find the largest list of receipts which have the same result ID - for resultID, executionReceiptList := range executionResultGroupedMetaList { - currentMatchedReceiptCnt := executionReceiptList.Size() - if currentMatchedReceiptCnt > maxMatchedReceiptCnt { - maxMatchedReceiptCnt = currentMatchedReceiptCnt - maxMatchedReceiptResultID = resultID - } - } - - // if there are more than one execution result for the same block ID, log as error - if executionResultGroupedMetaList.NumberGroups() > 1 { - identicalReceiptsStr := fmt.Sprintf("%v", flow.GetIDs(allReceipts)) - log.Error(). - Str("block_id", blockID.String()). - Str("execution_receipts", identicalReceiptsStr). 
- Msg("execution receipt mismatch") - } - - // pick the largest list of matching receipts - matchingReceiptMetaList := executionResultGroupedMetaList.GetGroup(maxMatchedReceiptResultID) - - metaReceiptGroupedByExecutorID := matchingReceiptMetaList.GroupByExecutorID() - - // collect all unique execution node ids from the receipts - var executorIDs flow.IdentifierList - for executorID := range metaReceiptGroupedByExecutorID { - executorIDs = append(executorIDs, executorID) - } - - return executorIDs, nil -} diff --git a/engine/access/rpc/backend/mock/connection_selector.go b/engine/access/rpc/backend/mock/connection_selector.go deleted file mode 100644 index 6337683391f..00000000000 --- a/engine/access/rpc/backend/mock/connection_selector.go +++ /dev/null @@ -1,82 +0,0 @@ -// Code generated by mockery v2.21.4. DO NOT EDIT. - -package mock - -import ( - context "context" - - flow "github.com/onflow/flow-go/model/flow" - mock "github.com/stretchr/testify/mock" -) - -// ConnectionSelector is an autogenerated mock type for the ConnectionSelector type -type ConnectionSelector struct { - mock.Mock -} - -// GetCollectionNodes provides a mock function with given fields: txID -func (_m *ConnectionSelector) GetCollectionNodes(txID flow.Identifier) ([]string, error) { - ret := _m.Called(txID) - - var r0 []string - var r1 error - if rf, ok := ret.Get(0).(func(flow.Identifier) ([]string, error)); ok { - return rf(txID) - } - if rf, ok := ret.Get(0).(func(flow.Identifier) []string); ok { - r0 = rf(txID) - } else { - if ret.Get(0) != nil { - r0 = ret.Get(0).([]string) - } - } - - if rf, ok := ret.Get(1).(func(flow.Identifier) error); ok { - r1 = rf(txID) - } else { - r1 = ret.Error(1) - } - - return r0, r1 -} - -// GetExecutionNodesForBlockID provides a mock function with given fields: ctx, blockID -func (_m *ConnectionSelector) GetExecutionNodesForBlockID(ctx context.Context, blockID flow.Identifier) (flow.IdentityList, error) { - ret := _m.Called(ctx, blockID) - - var r0 
flow.IdentityList - var r1 error - if rf, ok := ret.Get(0).(func(context.Context, flow.Identifier) (flow.IdentityList, error)); ok { - return rf(ctx, blockID) - } - if rf, ok := ret.Get(0).(func(context.Context, flow.Identifier) flow.IdentityList); ok { - r0 = rf(ctx, blockID) - } else { - if ret.Get(0) != nil { - r0 = ret.Get(0).(flow.IdentityList) - } - } - - if rf, ok := ret.Get(1).(func(context.Context, flow.Identifier) error); ok { - r1 = rf(ctx, blockID) - } else { - r1 = ret.Error(1) - } - - return r0, r1 -} - -type mockConstructorTestingTNewConnectionSelector interface { - mock.TestingT - Cleanup(func()) -} - -// NewConnectionSelector creates a new instance of ConnectionSelector. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. -func NewConnectionSelector(t mockConstructorTestingTNewConnectionSelector) *ConnectionSelector { - mock := &ConnectionSelector{} - mock.Mock.Test(t) - - t.Cleanup(func() { mock.AssertExpectations(t) }) - - return mock -} diff --git a/engine/access/rpc/engine.go b/engine/access/rpc/engine.go index 10916425aa6..5947ee6cd7d 100644 --- a/engine/access/rpc/engine.go +++ b/engine/access/rpc/engine.go @@ -175,8 +175,6 @@ func NewBuilder(log zerolog.Logger, CircuitBreakerConfig: config.CircuitBreakerConfig, } - connectionSelector := backend.NewConnectionSelector(state, executionReceipts, log, config.CircuitBreakerConfig.Enabled) - backend := backend.New(state, collectionRPC, historicalAccessNodes, @@ -189,7 +187,6 @@ func NewBuilder(log zerolog.Logger, chainID, accessMetrics, connectionFactory, - connectionSelector, retryEnabled, config.MaxHeightRange, config.PreferredExecutionNodeIDs, From d95f01e26223cfdcb67efe9d8a9d56e593f1bd27 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Wed, 14 Jun 2023 00:40:17 +0300 Subject: [PATCH 11/56] fixed tests --- engine/access/access_test.go | 5 ---- engine/access/rpc/backend/backend.go | 6 ++--- engine/access/rpc/backend/backend_test.go | 25 
+++---------------- .../rpc/backend/connection_factory_test.go | 2 +- .../rpc/backend/historical_access_test.go | 2 -- engine/access/rpc/backend/retry_test.go | 2 -- 6 files changed, 8 insertions(+), 34 deletions(-) diff --git a/engine/access/access_test.go b/engine/access/access_test.go index 3dccd604980..63d7af2d76c 100644 --- a/engine/access/access_test.go +++ b/engine/access/access_test.go @@ -150,7 +150,6 @@ func (suite *Suite) RunTest( suite.chainID, suite.metrics, nil, - nil, false, backend.DefaultMaxHeightRange, nil, @@ -323,7 +322,6 @@ func (suite *Suite) TestSendTransactionToRandomCollectionNode() { suite.chainID, metrics, connFactory, - nil, false, backend.DefaultMaxHeightRange, nil, @@ -650,7 +648,6 @@ func (suite *Suite) TestGetSealedTransaction() { suite.chainID, suite.metrics, connFactory, - nil, false, backend.DefaultMaxHeightRange, nil, @@ -790,7 +787,6 @@ func (suite *Suite) TestGetTransactionResult() { suite.chainID, suite.metrics, connFactory, - nil, false, backend.DefaultMaxHeightRange, nil, @@ -982,7 +978,6 @@ func (suite *Suite) TestExecuteScript() { suite.chainID, suite.metrics, connFactory, - nil, false, backend.DefaultMaxHeightRange, nil, diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index 868858c8876..954a04981c3 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -3,20 +3,20 @@ package backend import ( "context" "fmt" - "github.com/onflow/flow-go/model/flow/filter" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" "time" lru "github.com/hashicorp/golang-lru" accessproto "github.com/onflow/flow/protobuf/go/flow/access" "github.com/rs/zerolog" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" "github.com/onflow/flow-go/access" "github.com/onflow/flow-go/cmd/build" "github.com/onflow/flow-go/engine/common/rpc" "github.com/onflow/flow-go/engine/common/rpc/convert" "github.com/onflow/flow-go/model/flow" + 
"github.com/onflow/flow-go/model/flow/filter" "github.com/onflow/flow-go/module" "github.com/onflow/flow-go/state/protocol" "github.com/onflow/flow-go/storage" diff --git a/engine/access/rpc/backend/backend_test.go b/engine/access/rpc/backend/backend_test.go index 9751f64b865..923af00cacc 100644 --- a/engine/access/rpc/backend/backend_test.go +++ b/engine/access/rpc/backend/backend_test.go @@ -3,6 +3,8 @@ package backend import ( "context" "fmt" + "testing" + "github.com/dgraph-io/badger/v2" accessproto "github.com/onflow/flow/protobuf/go/flow/access" entitiesproto "github.com/onflow/flow/protobuf/go/flow/entities" @@ -14,7 +16,6 @@ import ( "github.com/stretchr/testify/suite" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "testing" access "github.com/onflow/flow-go/engine/access/mock" backendmock "github.com/onflow/flow-go/engine/access/rpc/backend/mock" @@ -26,7 +27,6 @@ import ( "github.com/onflow/flow-go/state/protocol/util" "github.com/onflow/flow-go/storage" storagemock "github.com/onflow/flow-go/storage/mock" - "github.com/onflow/flow-go/utils/rand" "github.com/onflow/flow-go/utils/unittest" ) @@ -47,7 +47,6 @@ type Suite struct { execClient *access.ExecutionAPIClient historicalAccessClient *access.AccessAPIClient connectionFactory *backendmock.ConnectionFactory - nil *backendmock.ConnectionSelector chainID flow.ChainID } @@ -75,7 +74,6 @@ func (suite *Suite) SetupTest() { suite.chainID = flow.Testnet suite.historicalAccessClient = new(access.AccessAPIClient) suite.connectionFactory = new(backendmock.ConnectionFactory) - suite.nil = new(backendmock.ConnectionSelector) } func (suite *Suite) TestPing() { @@ -2123,21 +2121,6 @@ func (suite *Suite) TestExecutionNodesForBlockID() { func(flow.IdentityFilter) error { return nil }) suite.state.On("Final").Return(suite.snapshot, nil).Maybe() - connSelector := new(backendmock.ConnectionSelector) - connSelector.On("GetExecutionNodesForBlockID").Return(func() flow.IdentityList { - randomItems := 
make(flow.IdentityList, 0, maxExecutionNodesCnt) - - for i := 0; i < maxExecutionNodesCnt; i++ { - // Generate a random index within the range of the array - randomIndex, err := rand.Uintn(uint(len(allExecutionNodes))) - require.NoError(suite.T(), err) - // Append the item at the random index to the new slice - randomItems = append(randomItems, allExecutionNodes[randomIndex]) - } - - return randomItems - }) - testExecutionNodesForBlockID := func(preferredENs, fixedENs, expectedENs flow.IdentityList) { if preferredENs != nil { @@ -2146,7 +2129,7 @@ func (suite *Suite) TestExecutionNodesForBlockID() { if fixedENs != nil { fixedENIdentifiers = fixedENs.NodeIDs() } - actualList, err := connSelector.GetExecutionNodesForBlockID(context.Background(), block.ID()) + actualList, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) require.NoError(suite.T(), err) if expectedENs == nil { expectedENs = flow.IdentityList{} @@ -2166,7 +2149,7 @@ func (suite *Suite) TestExecutionNodesForBlockID() { attempt2Receipts = flow.ExecutionReceiptList{} attempt3Receipts = flow.ExecutionReceiptList{} suite.state.On("AtBlockID", mock.Anything).Return(suite.snapshot) - actualList, err := connSelector.GetExecutionNodesForBlockID(context.Background(), block.ID()) + actualList, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) require.NoError(suite.T(), err) require.Equal(suite.T(), len(actualList), maxExecutionNodesCnt) }) diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index 3c96b9c124a..d1fc6238df5 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -3,7 +3,6 @@ package backend import ( "context" "fmt" - "github.com/sony/gobreaker" "net" "strconv" "strings" @@ -14,6 +13,7 @@ import ( lru "github.com/hashicorp/golang-lru" 
"github.com/onflow/flow/protobuf/go/flow/access" "github.com/onflow/flow/protobuf/go/flow/execution" + "github.com/sony/gobreaker" "github.com/stretchr/testify/assert" testifymock "github.com/stretchr/testify/mock" "google.golang.org/grpc" diff --git a/engine/access/rpc/backend/historical_access_test.go b/engine/access/rpc/backend/historical_access_test.go index 3ba35c15d70..b66904f6604 100644 --- a/engine/access/rpc/backend/historical_access_test.go +++ b/engine/access/rpc/backend/historical_access_test.go @@ -49,7 +49,6 @@ func (suite *Suite) TestHistoricalTransactionResult() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -108,7 +107,6 @@ func (suite *Suite) TestHistoricalTransaction() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, diff --git a/engine/access/rpc/backend/retry_test.go b/engine/access/rpc/backend/retry_test.go index 7a4ddf6b375..c10b66bbbc0 100644 --- a/engine/access/rpc/backend/retry_test.go +++ b/engine/access/rpc/backend/retry_test.go @@ -54,7 +54,6 @@ func (suite *Suite) TestTransactionRetry() { suite.chainID, metrics.NewNoopCollector(), nil, - nil, false, DefaultMaxHeightRange, nil, @@ -144,7 +143,6 @@ func (suite *Suite) TestSuccessfulTransactionsDontRetry() { suite.chainID, metrics.NewNoopCollector(), connFactory, - nil, false, DefaultMaxHeightRange, nil, From dda2d3a22d3758bbcc90a7b2527ac08fbd2042f5 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Wed, 14 Jun 2023 02:04:35 +0300 Subject: [PATCH 12/56] Added assert call. 
--- .../access/rpc/backend/connection_factory_test.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index d1fc6238df5..99bbad2e05f 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -421,7 +421,6 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { req := &execution.PingRequest{} resp := &execution.PingResponse{} en.handler.On("Ping", testifymock.Anything, req).After(2*requestTimeout).Return(resp, nil) - // create the factory connectionFactory := new(ConnectionFactoryImpl) // set the execution grpc port @@ -454,6 +453,7 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { // make the call to the execution node _, err = client.Ping(ctx, req) + en.handler.AssertCalled(t, "Ping", testifymock.Anything, req) return time.Since(start), err } @@ -466,12 +466,12 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { assert.Equal(t, gobreaker.ErrOpenState, err) assert.Greater(t, requestTimeout, duration) - //Wait until Circuit breaker go to Half-open state - time.Sleep(circuitBreakerRestoreTimeout + time.Second) - en.handler.On("Ping", testifymock.Anything, req).Unset() en.handler.On("Ping", testifymock.Anything, req).Return(resp, nil) + //Wait until Circuit breaker go to Half-open state + time.Sleep(circuitBreakerRestoreTimeout + time.Second) + duration, err = callAndMeasurePingDuration() assert.Greater(t, requestTimeout, duration) assert.Equal(t, nil, err) @@ -523,6 +523,7 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { // make the call to the collection node _, err = client.Ping(ctx, req) + cn.handler.AssertCalled(t, "Ping", testifymock.Anything, req) return time.Since(start), err } @@ -535,12 +536,12 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { assert.Equal(t, gobreaker.ErrOpenState, err) assert.Greater(t, requestTimeout, 
duration) - //Wait until Circuit breaker go to Half-open state - time.Sleep(circuitBreakerRestoreTimeout + time.Second) - cn.handler.On("Ping", testifymock.Anything, req).Unset() cn.handler.On("Ping", testifymock.Anything, req).Return(resp, nil) + //Wait until Circuit breaker go to Half-open state + time.Sleep(circuitBreakerRestoreTimeout + time.Second) + duration, err = callAndMeasurePingDuration() assert.Greater(t, requestTimeout, duration) assert.Equal(t, nil, err) From d95aed3f25cd17fe7b8ac69431d353b6f680e5a4 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Wed, 14 Jun 2023 11:00:24 +0300 Subject: [PATCH 13/56] Return all exec nodes that matches criteria. --- engine/access/access_test.go | 5 ++ engine/access/rpc/backend/backend.go | 49 +++++++-------- engine/access/rpc/backend/backend_accounts.go | 22 +++++-- engine/access/rpc/backend/backend_events.go | 23 ++++--- engine/access/rpc/backend/backend_scripts.go | 26 +++++--- engine/access/rpc/backend/backend_test.go | 35 +++++++++++ .../rpc/backend/backend_transactions.go | 62 +++++++++++++------ .../rpc/backend/historical_access_test.go | 2 + engine/access/rpc/backend/retry_test.go | 2 + engine/access/rpc/engine.go | 3 +- 10 files changed, 161 insertions(+), 68 deletions(-) diff --git a/engine/access/access_test.go b/engine/access/access_test.go index 63d7af2d76c..b6d4051e9ba 100644 --- a/engine/access/access_test.go +++ b/engine/access/access_test.go @@ -157,6 +157,7 @@ func (suite *Suite) RunTest( suite.log, backend.DefaultSnapshotHistoryLimit, nil, + false, ) handler := access.NewHandler(suite.backend, suite.chainID.Chain(), suite.finalizedHeaderCache, suite.me, access.WithBlockSignerDecoder(suite.signerIndicesDecoder)) f(handler, db, all) @@ -329,6 +330,7 @@ func (suite *Suite) TestSendTransactionToRandomCollectionNode() { suite.log, backend.DefaultSnapshotHistoryLimit, nil, + false, ) handler := access.NewHandler(backend, suite.chainID.Chain(), suite.finalizedHeaderCache, suite.me) @@ -655,6 +657,7 @@ 
func (suite *Suite) TestGetSealedTransaction() { suite.log, backend.DefaultSnapshotHistoryLimit, nil, + false, ) handler := access.NewHandler(backend, suite.chainID.Chain(), suite.finalizedHeaderCache, suite.me) @@ -794,6 +797,7 @@ func (suite *Suite) TestGetTransactionResult() { suite.log, backend.DefaultSnapshotHistoryLimit, nil, + false, ) handler := access.NewHandler(backend, suite.chainID.Chain(), suite.finalizedHeaderCache, suite.me) @@ -985,6 +989,7 @@ func (suite *Suite) TestExecuteScript() { suite.log, backend.DefaultSnapshotHistoryLimit, nil, + false, ) handler := access.NewHandler(suite.backend, suite.chainID.Chain(), suite.finalizedHeaderCache, suite.me) diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index 954a04981c3..8676631b7ae 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -3,13 +3,12 @@ package backend import ( "context" "fmt" - "time" - lru "github.com/hashicorp/golang-lru" accessproto "github.com/onflow/flow/protobuf/go/flow/access" "github.com/rs/zerolog" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "time" "github.com/onflow/flow-go/access" "github.com/onflow/flow-go/cmd/build" @@ -97,6 +96,7 @@ func New( log zerolog.Logger, snapshotHistoryLimit int, archiveAddressList []string, + circuitBreakerEnabled bool, ) *Backend { retry := newRetry() if retryEnabled { @@ -122,19 +122,20 @@ func New( archiveAddressList: archiveAddressList, }, backendTransactions: backendTransactions{ - staticCollectionRPC: collectionRPC, - state: state, - chainID: chainID, - collections: collections, - blocks: blocks, - transactions: transactions, - executionReceipts: executionReceipts, - transactionValidator: configureTransactionValidator(state, chainID), - transactionMetrics: accessMetrics, - retry: retry, - connFactory: connFactory, - previousAccessNodes: historicalAccessNodes, - log: log, + staticCollectionRPC: collectionRPC, + state: state, + chainID: 
chainID, + collections: collections, + blocks: blocks, + transactions: transactions, + executionReceipts: executionReceipts, + transactionValidator: configureTransactionValidator(state, chainID), + transactionMetrics: accessMetrics, + retry: retry, + connFactory: connFactory, + previousAccessNodes: historicalAccessNodes, + log: log, + circuitBreakerEnabled: circuitBreakerEnabled, }, backendEvents: backendEvents{ state: state, @@ -153,11 +154,12 @@ func New( state: state, }, backendAccounts: backendAccounts{ - state: state, - headers: headers, - executionReceipts: executionReceipts, - connFactory: connFactory, - log: log, + state: state, + headers: headers, + executionReceipts: executionReceipts, + connFactory: connFactory, + log: log, + circuitBreakerEnabled: circuitBreakerEnabled, }, backendExecutionResults: backendExecutionResults{ executionResults: executionResults, @@ -356,14 +358,11 @@ func executionNodesForBlockID( return nil, fmt.Errorf("failed to retreive execution IDs for block ID %v: %w", blockID, err) } - // randomly choose upto maxExecutionNodesCnt identities - executionIdentitiesRandom := subsetENs.Sample(maxExecutionNodesCnt) - - if len(executionIdentitiesRandom) == 0 { + if len(subsetENs) == 0 { return nil, fmt.Errorf("no matching execution node found for block ID %v", blockID) } - return executionIdentitiesRandom, nil + return subsetENs, nil } // findAllExecutionNodes find all the execution nodes ids from the execution receipts that have been received for the diff --git a/engine/access/rpc/backend/backend_accounts.go b/engine/access/rpc/backend/backend_accounts.go index a3a41053c61..c5864b3fd86 100644 --- a/engine/access/rpc/backend/backend_accounts.go +++ b/engine/access/rpc/backend/backend_accounts.go @@ -18,11 +18,12 @@ import ( ) type backendAccounts struct { - state protocol.State - headers storage.Headers - executionReceipts storage.ExecutionReceipts - connFactory ConnectionFactory - log zerolog.Logger + state protocol.State + headers 
storage.Headers + executionReceipts storage.ExecutionReceipts + connFactory ConnectionFactory + log zerolog.Logger + circuitBreakerEnabled bool } func (b *backendAccounts) GetAccount(ctx context.Context, address flow.Address) (*flow.Account, error) { @@ -82,7 +83,11 @@ func (b *backendAccounts) getAccountAtBlockID( BlockId: blockID[:], } - execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + var execNodes flow.IdentityList + var err error + + execNodes, err = executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + if err != nil { return nil, rpc.ConvertError(err, "failed to get account from the execution node", codes.Internal) } @@ -107,7 +112,12 @@ func (b *backendAccounts) getAccountAtBlockID( // other ENs are logged and swallowed. If all ENs fail to return a valid response, then an // error aggregating all failures is returned. func (b *backendAccounts) getAccountFromAnyExeNode(ctx context.Context, execNodes flow.IdentityList, req *execproto.GetAccountAtBlockIDRequest) (*execproto.GetAccountAtBlockIDResponse, error) { + if !b.circuitBreakerEnabled { + execNodes = execNodes.Sample(maxExecutionNodesCnt) + } + var errors *multierror.Error + for _, execNode := range execNodes { // TODO: use the GRPC Client interceptor start := time.Now() diff --git a/engine/access/rpc/backend/backend_events.go b/engine/access/rpc/backend/backend_events.go index f48ba395947..b58d19c8b6d 100644 --- a/engine/access/rpc/backend/backend_events.go +++ b/engine/access/rpc/backend/backend_events.go @@ -21,12 +21,13 @@ import ( ) type backendEvents struct { - headers storage.Headers - executionReceipts storage.ExecutionReceipts - state protocol.State - connFactory ConnectionFactory - log zerolog.Logger - maxHeightRange uint + headers storage.Headers + executionReceipts storage.ExecutionReceipts + state protocol.State + connFactory ConnectionFactory + log zerolog.Logger + maxHeightRange uint + circuitBreakerEnabled bool } // 
GetEventsForHeightRange retrieves events for all sealed blocks between the start block height and @@ -129,7 +130,11 @@ func (b *backendEvents) getBlockEventsFromExecutionNode( // choose the last block ID to find the list of execution nodes lastBlockID := blockIDs[len(blockIDs)-1] - execNodes, err := executionNodesForBlockID(ctx, lastBlockID, b.executionReceipts, b.state, b.log) + var execNodes flow.IdentityList + var err error + + execNodes, err = executionNodesForBlockID(ctx, lastBlockID, b.executionReceipts, b.state, b.log) + if err != nil { b.log.Error().Err(err).Msg("failed to retrieve events from execution node") return nil, rpc.ConvertError(err, "failed to retrieve events from execution node", codes.Internal) @@ -209,6 +214,10 @@ func verifyAndConvertToAccessEvents( func (b *backendEvents) getEventsFromAnyExeNode(ctx context.Context, execNodes flow.IdentityList, req *execproto.GetEventsForBlockIDsRequest) (*execproto.GetEventsForBlockIDsResponse, *flow.Identity, error) { + if !b.circuitBreakerEnabled { + execNodes = execNodes.Sample(maxExecutionNodesCnt) + } + var errors *multierror.Error // try to get events from one of the execution nodes for _, execNode := range execNodes { diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index e6da62a5b7b..80b611d243c 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -24,14 +24,15 @@ import ( const uniqueScriptLoggingTimeWindow = 10 * time.Minute type backendScripts struct { - headers storage.Headers - executionReceipts storage.ExecutionReceipts - state protocol.State - connFactory ConnectionFactory - log zerolog.Logger - metrics module.BackendScriptsMetrics - loggedScripts *lru.Cache - archiveAddressList []string + headers storage.Headers + executionReceipts storage.ExecutionReceipts + state protocol.State + connFactory ConnectionFactory + log zerolog.Logger + metrics module.BackendScriptsMetrics + 
loggedScripts *lru.Cache + archiveAddressList []string + circuitBreakerEnabled bool } func (b *backendScripts) ExecuteScriptAtLatestBlock( @@ -91,11 +92,18 @@ func (b *backendScripts) findScriptExecutors( return b.archiveAddressList, nil } - executors, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + var executors flow.IdentityList + var err error + + executors, err = executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { return nil, err } + if !b.circuitBreakerEnabled { + executors = executors.Sample(maxExecutionNodesCnt) + } + executorAddrs := make([]string, 0, len(executors)) for _, executor := range executors { executorAddrs = append(executorAddrs, executor.Address) diff --git a/engine/access/rpc/backend/backend_test.go b/engine/access/rpc/backend/backend_test.go index 923af00cacc..8f572daf172 100644 --- a/engine/access/rpc/backend/backend_test.go +++ b/engine/access/rpc/backend/backend_test.go @@ -105,6 +105,7 @@ func (suite *Suite) TestPing() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) err := backend.Ping(context.Background()) @@ -140,6 +141,7 @@ func (suite *Suite) TestGetLatestFinalizedBlockHeader() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // query the handler for the latest finalized block @@ -205,6 +207,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_NoTransitionSpan() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // query the handler for the latest finalized snapshot @@ -277,6 +280,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_TransitionSpans() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // query the handler for the latest finalized snapshot @@ -342,6 +346,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_PhaseTransitionSpan() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // query the handler for the latest finalized snapshot @@ -418,6 +423,7 @@ func (suite *Suite) 
TestGetLatestProtocolStateSnapshot_EpochTransitionSpan() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // query the handler for the latest finalized snapshot @@ -478,6 +484,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_HistoryLimit() { suite.log, snapshotHistoryLimit, nil, + false, ) // the handler should return a snapshot history limit error @@ -516,6 +523,7 @@ func (suite *Suite) TestGetLatestSealedBlockHeader() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // query the handler for the latest sealed block @@ -562,6 +570,7 @@ func (suite *Suite) TestGetTransaction() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) actual, err := backend.GetTransaction(context.Background(), transaction.ID()) @@ -602,6 +611,7 @@ func (suite *Suite) TestGetCollection() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) actual, err := backend.GetCollectionByID(context.Background(), expected.ID()) @@ -665,6 +675,7 @@ func (suite *Suite) TestGetTransactionResultByIndex() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) suite.execClient. On("GetTransactionResultByIndex", ctx, exeEventReq). @@ -728,6 +739,7 @@ func (suite *Suite) TestGetTransactionResultsByBlockID() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) suite.execClient. On("GetTransactionResultsByBlockID", ctx, exeEventReq). 
@@ -819,6 +831,7 @@ func (suite *Suite) TestTransactionStatusTransition() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // Successfully return empty event list @@ -939,6 +952,7 @@ func (suite *Suite) TestTransactionExpiredStatusTransition() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // should return pending status when we have not observed an expiry block @@ -1106,6 +1120,7 @@ func (suite *Suite) TestTransactionPendingToFinalizedStatusTransition() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) preferredENIdentifiers = flow.IdentifierList{receipts[0].ExecutorID} @@ -1164,6 +1179,7 @@ func (suite *Suite) TestTransactionResultUnknown() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // first call - when block under test is greater height than the sealed head, but execution node does not know about Tx @@ -1218,6 +1234,7 @@ func (suite *Suite) TestGetLatestFinalizedBlock() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // query the handler for the latest finalized header @@ -1348,6 +1365,7 @@ func (suite *Suite) TestGetEventsForBlockIDs() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // execute request @@ -1380,6 +1398,7 @@ func (suite *Suite) TestGetEventsForBlockIDs() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // execute request with an empty block id list and expect an empty list of events and no error @@ -1439,6 +1458,7 @@ func (suite *Suite) TestGetExecutionResultByID() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // execute request @@ -1469,6 +1489,7 @@ func (suite *Suite) TestGetExecutionResultByID() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // execute request @@ -1532,6 +1553,7 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // execute request @@ -1563,6 +1585,7 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // 
execute request @@ -1713,6 +1736,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) _, err := backend.GetEventsForHeightRange(ctx, string(flow.EventAccountCreated), maxHeight, minHeight) @@ -1752,6 +1776,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // execute request @@ -1790,6 +1815,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) actualResp, err := backend.GetEventsForHeightRange(ctx, string(flow.EventAccountCreated), minHeight, maxHeight) @@ -1827,6 +1853,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) _, err := backend.GetEventsForHeightRange(ctx, string(flow.EventAccountCreated), minHeight, minHeight+1) @@ -1864,6 +1891,7 @@ func (suite *Suite) TestGetEventsForHeightRange() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) _, err := backend.GetEventsForHeightRange(ctx, string(flow.EventAccountCreated), minHeight, maxHeight) @@ -1941,6 +1969,7 @@ func (suite *Suite) TestGetAccount() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) preferredENIdentifiers = flow.IdentifierList{receipts[0].ExecutorID} @@ -2022,6 +2051,7 @@ func (suite *Suite) TestGetAccountAtBlockHeight() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) preferredENIdentifiers = flow.IdentifierList{receipts[0].ExecutorID} @@ -2061,6 +2091,7 @@ func (suite *Suite) TestGetNetworkParameters() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) params := backend.GetNetworkParameters(context.Background()) @@ -2131,6 +2162,8 @@ func (suite *Suite) TestExecutionNodesForBlockID() { } actualList, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) require.NoError(suite.T(), err) + actualList = actualList.Sample(maxExecutionNodesCnt) + if expectedENs == nil { 
expectedENs = flow.IdentityList{} } @@ -2150,6 +2183,7 @@ func (suite *Suite) TestExecutionNodesForBlockID() { attempt3Receipts = flow.ExecutionReceiptList{} suite.state.On("AtBlockID", mock.Anything).Return(suite.snapshot) actualList, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) + actualList = actualList.Sample(maxExecutionNodesCnt) require.NoError(suite.T(), err) require.Equal(suite.T(), len(actualList), maxExecutionNodesCnt) }) @@ -2240,6 +2274,7 @@ func (suite *Suite) TestExecuteScriptOnExecutionNode() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // mock parameters diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index 5bc31162a07..c92cf0212dc 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -27,20 +27,20 @@ import ( const collectionNodesToTry uint = 3 type backendTransactions struct { - staticCollectionRPC accessproto.AccessAPIClient // rpc client tied to a fixed collection node - transactions storage.Transactions - executionReceipts storage.ExecutionReceipts - collections storage.Collections - blocks storage.Blocks - state protocol.State - chainID flow.ChainID - transactionMetrics module.TransactionMetrics - transactionValidator *access.TransactionValidator - retry *Retry - connFactory ConnectionFactory - - previousAccessNodes []accessproto.AccessAPIClient - log zerolog.Logger + staticCollectionRPC accessproto.AccessAPIClient // rpc client tied to a fixed collection node + transactions storage.Transactions + executionReceipts storage.ExecutionReceipts + collections storage.Collections + blocks storage.Blocks + state protocol.State + chainID flow.ChainID + transactionMetrics module.TransactionMetrics + transactionValidator *access.TransactionValidator + retry *Retry + connFactory ConnectionFactory + previousAccessNodes []accessproto.AccessAPIClient + 
log zerolog.Logger + circuitBreakerEnabled bool } // SendTransaction forwards the transaction to the collection node @@ -123,13 +123,15 @@ func (b *backendTransactions) chooseCollectionNodes(tx *flow.TransactionBody, sa } // get the cluster responsible for the transaction - txCluster, ok := clusters.ByTxID(tx.ID()) + targetNodes, ok := clusters.ByTxID(tx.ID()) if !ok { return nil, fmt.Errorf("could not get local cluster by txID: %x", tx.ID()) } - // select a random subset of collection nodes from the cluster to be tried in order - targetNodes := txCluster.Sample(sampleSize) + if !b.circuitBreakerEnabled { + // select a random subset of collection nodes from the cluster to be tried in order + targetNodes = targetNodes.Sample(sampleSize) + } // collect the addresses of all the chosen collection nodes var targetAddrs = make([]string, len(targetNodes)) @@ -371,7 +373,10 @@ func (b *backendTransactions) GetTransactionResultsByBlockID( req := &execproto.GetTransactionsByBlockIDRequest{ BlockId: blockID[:], } - execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + + var execNodes flow.IdentityList + + execNodes, err = executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { if IsInsufficientExecutionReceipts(err) { return nil, status.Errorf(codes.NotFound, err.Error()) @@ -502,7 +507,9 @@ func (b *backendTransactions) GetTransactionResultByIndex( BlockId: blockID[:], Index: index, } - execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + var execNodes flow.IdentityList + + execNodes, err = executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { if IsInsufficientExecutionReceipts(err) { return nil, status.Errorf(codes.NotFound, err.Error()) @@ -731,7 +738,10 @@ func (b *backendTransactions) getTransactionResultFromExecutionNode( TransactionId: transactionID, } - execNodes, err := executionNodesForBlockID(ctx, blockID, 
b.executionReceipts, b.state, b.log) + var execNodes flow.IdentityList + var err error + + execNodes, err = executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { // if no execution receipt were found, return a NotFound GRPC error if IsInsufficientExecutionReceipts(err) { @@ -762,6 +772,10 @@ func (b *backendTransactions) getTransactionResultFromAnyExeNode( execNodes flow.IdentityList, req *execproto.GetTransactionResultRequest, ) (*execproto.GetTransactionResultResponse, error) { + if !b.circuitBreakerEnabled { + execNodes = execNodes.Sample(maxExecutionNodesCnt) + } + var errs *multierror.Error logAnyError := func() { errToReturn := errs.ErrorOrNil() @@ -817,6 +831,10 @@ func (b *backendTransactions) getTransactionResultsByBlockIDFromAnyExeNode( execNodes flow.IdentityList, req *execproto.GetTransactionsByBlockIDRequest, ) (*execproto.GetTransactionResultsResponse, error) { + if !b.circuitBreakerEnabled { + execNodes = execNodes.Sample(maxExecutionNodesCnt) + } + var errs *multierror.Error defer func() { @@ -876,6 +894,10 @@ func (b *backendTransactions) getTransactionResultByIndexFromAnyExeNode( execNodes flow.IdentityList, req *execproto.GetTransactionByIndexRequest, ) (*execproto.GetTransactionResultResponse, error) { + if !b.circuitBreakerEnabled { + execNodes = execNodes.Sample(maxExecutionNodesCnt) + } + var errs *multierror.Error logAnyError := func() { errToReturn := errs.ErrorOrNil() diff --git a/engine/access/rpc/backend/historical_access_test.go b/engine/access/rpc/backend/historical_access_test.go index b66904f6604..42dd829dbbc 100644 --- a/engine/access/rpc/backend/historical_access_test.go +++ b/engine/access/rpc/backend/historical_access_test.go @@ -56,6 +56,7 @@ func (suite *Suite) TestHistoricalTransactionResult() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // Successfully return the transaction from the historical node @@ -114,6 +115,7 @@ func (suite *Suite) TestHistoricalTransaction() { 
suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) // Successfully return the transaction from the historical node diff --git a/engine/access/rpc/backend/retry_test.go b/engine/access/rpc/backend/retry_test.go index c10b66bbbc0..2189223118a 100644 --- a/engine/access/rpc/backend/retry_test.go +++ b/engine/access/rpc/backend/retry_test.go @@ -61,6 +61,7 @@ func (suite *Suite) TestTransactionRetry() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) retry := newRetry().SetBackend(backend).Activate() backend.retry = retry @@ -150,6 +151,7 @@ func (suite *Suite) TestSuccessfulTransactionsDontRetry() { suite.log, DefaultSnapshotHistoryLimit, nil, + false, ) retry := newRetry().SetBackend(backend).Activate() backend.retry = retry diff --git a/engine/access/rpc/engine.go b/engine/access/rpc/engine.go index 5947ee6cd7d..c4b36c263b6 100644 --- a/engine/access/rpc/engine.go +++ b/engine/access/rpc/engine.go @@ -48,7 +48,7 @@ type Config struct { PreferredExecutionNodeIDs []string // preferred list of upstream execution node IDs FixedExecutionNodeIDs []string // fixed list of execution node IDs to choose from if no node node ID can be chosen from the PreferredExecutionNodeIDs ArchiveAddressList []string // the archive node address list to send script executions. when configured, script executions will be all sent to the archive node - CircuitBreakerConfig backend.CircuitBreakerConfig //TODO: + CircuitBreakerConfig backend.CircuitBreakerConfig // the configuration for circuit breaker } // Engine exposes the server with a simplified version of the Access API. 
@@ -194,6 +194,7 @@ func NewBuilder(log zerolog.Logger, log, backend.DefaultSnapshotHistoryLimit, config.ArchiveAddressList, + config.CircuitBreakerConfig.Enabled, ) finalizedCache, finalizedCacheWorker, err := events.NewFinalizedHeaderCache(state) From 3ab2573d67cb3dbf70ec41bf0b60cf9dc0c820d4 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Wed, 14 Jun 2023 11:01:27 +0300 Subject: [PATCH 14/56] linted --- engine/access/rpc/backend/backend.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index 8676631b7ae..7e4e5ef0137 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -3,12 +3,13 @@ package backend import ( "context" "fmt" + "time" + lru "github.com/hashicorp/golang-lru" accessproto "github.com/onflow/flow/protobuf/go/flow/access" "github.com/rs/zerolog" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "time" "github.com/onflow/flow-go/access" "github.com/onflow/flow-go/cmd/build" From cda4d37a141e790166e2404598b895c958a54af2 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Wed, 14 Jun 2023 11:37:36 +0300 Subject: [PATCH 15/56] Remove unnecessary code --- .../export_report.json | 4 +-- engine/access/rpc/backend/backend.go | 30 ++++++++++--------- engine/access/rpc/backend/backend_accounts.go | 6 +--- engine/access/rpc/backend/backend_events.go | 6 +--- .../rpc/backend/backend_transactions.go | 12 ++------ 5 files changed, 23 insertions(+), 35 deletions(-) diff --git a/cmd/util/cmd/execution-state-extract/export_report.json b/cmd/util/cmd/execution-state-extract/export_report.json index 72af21af279..35f744445ee 100644 --- a/cmd/util/cmd/execution-state-extract/export_report.json +++ b/cmd/util/cmd/execution-state-extract/export_report.json @@ -1,6 +1,6 @@ { "EpochCounter": 0, - "PreviousStateCommitment": "53749b13e1f99759abb35a7ab7d7a4f180d8f6bc24e5ef6b29f3565d459765f0", - "CurrentStateCommitment": 
"53749b13e1f99759abb35a7ab7d7a4f180d8f6bc24e5ef6b29f3565d459765f0", + "PreviousStateCommitment": "6d2eb6ae0aa575274a45362f9bd175ec6f030f79552c5fed27ada030d3799a53", + "CurrentStateCommitment": "6d2eb6ae0aa575274a45362f9bd175ec6f030f79552c5fed27ada030d3799a53", "ReportSucceeded": true } \ No newline at end of file diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index 7e4e5ef0137..6c63ca1d35f 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -113,14 +113,15 @@ func New( state: state, // create the sub-backends backendScripts: backendScripts{ - headers: headers, - executionReceipts: executionReceipts, - connFactory: connFactory, - state: state, - log: log, - metrics: accessMetrics, - loggedScripts: loggedScripts, - archiveAddressList: archiveAddressList, + headers: headers, + executionReceipts: executionReceipts, + connFactory: connFactory, + state: state, + log: log, + metrics: accessMetrics, + loggedScripts: loggedScripts, + archiveAddressList: archiveAddressList, + circuitBreakerEnabled: circuitBreakerEnabled, }, backendTransactions: backendTransactions{ staticCollectionRPC: collectionRPC, @@ -139,12 +140,13 @@ func New( circuitBreakerEnabled: circuitBreakerEnabled, }, backendEvents: backendEvents{ - state: state, - headers: headers, - executionReceipts: executionReceipts, - connFactory: connFactory, - log: log, - maxHeightRange: maxHeightRange, + state: state, + headers: headers, + executionReceipts: executionReceipts, + connFactory: connFactory, + log: log, + maxHeightRange: maxHeightRange, + circuitBreakerEnabled: circuitBreakerEnabled, }, backendBlockHeaders: backendBlockHeaders{ headers: headers, diff --git a/engine/access/rpc/backend/backend_accounts.go b/engine/access/rpc/backend/backend_accounts.go index c5864b3fd86..205fa0f3ab8 100644 --- a/engine/access/rpc/backend/backend_accounts.go +++ b/engine/access/rpc/backend/backend_accounts.go @@ -83,11 +83,7 @@ func (b 
*backendAccounts) getAccountAtBlockID( BlockId: blockID[:], } - var execNodes flow.IdentityList - var err error - - execNodes, err = executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) - + execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { return nil, rpc.ConvertError(err, "failed to get account from the execution node", codes.Internal) } diff --git a/engine/access/rpc/backend/backend_events.go b/engine/access/rpc/backend/backend_events.go index b58d19c8b6d..fc163459518 100644 --- a/engine/access/rpc/backend/backend_events.go +++ b/engine/access/rpc/backend/backend_events.go @@ -130,11 +130,7 @@ func (b *backendEvents) getBlockEventsFromExecutionNode( // choose the last block ID to find the list of execution nodes lastBlockID := blockIDs[len(blockIDs)-1] - var execNodes flow.IdentityList - var err error - - execNodes, err = executionNodesForBlockID(ctx, lastBlockID, b.executionReceipts, b.state, b.log) - + execNodes, err := executionNodesForBlockID(ctx, lastBlockID, b.executionReceipts, b.state, b.log) if err != nil { b.log.Error().Err(err).Msg("failed to retrieve events from execution node") return nil, rpc.ConvertError(err, "failed to retrieve events from execution node", codes.Internal) diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index c92cf0212dc..a9ba3002d40 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -374,9 +374,7 @@ func (b *backendTransactions) GetTransactionResultsByBlockID( BlockId: blockID[:], } - var execNodes flow.IdentityList - - execNodes, err = executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { if IsInsufficientExecutionReceipts(err) { return nil, status.Errorf(codes.NotFound, 
err.Error()) @@ -507,9 +505,8 @@ func (b *backendTransactions) GetTransactionResultByIndex( BlockId: blockID[:], Index: index, } - var execNodes flow.IdentityList - execNodes, err = executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { if IsInsufficientExecutionReceipts(err) { return nil, status.Errorf(codes.NotFound, err.Error()) @@ -738,10 +735,7 @@ func (b *backendTransactions) getTransactionResultFromExecutionNode( TransactionId: transactionID, } - var execNodes flow.IdentityList - var err error - - execNodes, err = executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) + execNodes, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { // if no execution receipt were found, return a NotFound GRPC error if IsInsufficientExecutionReceipts(err) { From fddb2a84e2ac6ebc1d6265264ae749271941a8a2 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 20 Jun 2023 20:30:12 +0300 Subject: [PATCH 16/56] Added integration test for CB. 
--- engine/access/rpc/backend/backend_accounts.go | 2 +- engine/access/rpc/backend/backend_events.go | 2 +- engine/access/rpc/backend/backend_scripts.go | 2 +- .../rpc/backend/backend_transactions.go | 8 +- .../access/access_circuit_breaker_test.go | 190 ++++++++++++++++++ 5 files changed, 197 insertions(+), 7 deletions(-) create mode 100644 integration/tests/access/access_circuit_breaker_test.go diff --git a/engine/access/rpc/backend/backend_accounts.go b/engine/access/rpc/backend/backend_accounts.go index 205fa0f3ab8..119c59a8b15 100644 --- a/engine/access/rpc/backend/backend_accounts.go +++ b/engine/access/rpc/backend/backend_accounts.go @@ -152,7 +152,7 @@ func (b *backendAccounts) tryGetAccount(ctx context.Context, execNode *flow.Iden resp, err := execRPCClient.GetAccountAtBlockID(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable { + if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { b.connFactory.InvalidateExecutionAPIClient(execNode.Address) } return nil, err diff --git a/engine/access/rpc/backend/backend_events.go b/engine/access/rpc/backend/backend_events.go index fc163459518..e1fcfb087d2 100644 --- a/engine/access/rpc/backend/backend_events.go +++ b/engine/access/rpc/backend/backend_events.go @@ -252,7 +252,7 @@ func (b *backendEvents) tryGetEvents(ctx context.Context, resp, err := execRPCClient.GetEventsForBlockIDs(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable { + if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { b.connFactory.InvalidateExecutionAPIClient(execNode.Address) } return nil, err diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index 80b611d243c..2777944299b 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -206,7 +206,7 @@ func (b *backendScripts) tryExecuteScript(ctx context.Context, executorAddress s execResp, err := 
execRPCClient.ExecuteScriptAtBlockID(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable { + if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { b.connFactory.InvalidateExecutionAPIClient(executorAddress) } return nil, status.Errorf(status.Code(err), "failed to execute the script on the execution node %s: %v", executorAddress, err) diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index a9ba3002d40..048cb2d223f 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -155,7 +155,7 @@ func (b *backendTransactions) sendTransactionToCollector(ctx context.Context, err = b.grpcTxSend(ctx, collectionRPC, tx) if err != nil { - if status.Code(err) == codes.Unavailable { + if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { b.connFactory.InvalidateAccessAPIClient(collectionNodeAddr) } return fmt.Errorf("failed to send transaction to collection node at %s: %w", collectionNodeAddr, err) @@ -811,7 +811,7 @@ func (b *backendTransactions) tryGetTransactionResult( resp, err := execRPCClient.GetTransactionResult(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable { + if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { b.connFactory.InvalidateExecutionAPIClient(execNode.Address) } return nil, err @@ -874,7 +874,7 @@ func (b *backendTransactions) tryGetTransactionResultsByBlockID( resp, err := execRPCClient.GetTransactionResultsByBlockID(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable { + if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { b.connFactory.InvalidateExecutionAPIClient(execNode.Address) } return nil, err @@ -938,7 +938,7 @@ func (b *backendTransactions) tryGetTransactionResultByIndex( resp, err := execRPCClient.GetTransactionResultByIndex(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable { + if 
status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { b.connFactory.InvalidateExecutionAPIClient(execNode.Address) } return nil, err diff --git a/integration/tests/access/access_circuit_breaker_test.go b/integration/tests/access/access_circuit_breaker_test.go new file mode 100644 index 00000000000..0c025b11565 --- /dev/null +++ b/integration/tests/access/access_circuit_breaker_test.go @@ -0,0 +1,190 @@ +package access + +import ( + "context" + "fmt" + sdk "github.com/onflow/flow-go-sdk" + sdkcrypto "github.com/onflow/flow-go-sdk/crypto" + "github.com/onflow/flow-go-sdk/templates" + "github.com/onflow/flow-go/integration/testnet" + "github.com/onflow/flow-go/integration/tests/lib" + "github.com/onflow/flow-go/model/flow" + "github.com/onflow/flow-go/utils/unittest" + "github.com/rs/zerolog" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/stretchr/testify/suite" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "testing" + "time" +) + +func TestAccessCircuitBreaker(t *testing.T) { + suite.Run(t, new(AccessCircuitBreakerSuite)) +} + +type AccessCircuitBreakerSuite struct { + suite.Suite + + log zerolog.Logger + + // root context for the current test + ctx context.Context + cancel context.CancelFunc + + net *testnet.FlowNetwork +} + +var requestTimeout = 3*time.Second +var cbRestoreTimeout = 6*time.Second + +func (s *AccessCircuitBreakerSuite) TearDownTest() { + s.log.Info().Msg("================> Start TearDownTest") + s.net.Remove() + s.cancel() + s.log.Info().Msg("================> Finish TearDownTest") +} + + +func (s *AccessCircuitBreakerSuite) SetupTest() { + s.log = unittest.LoggerForTest(s.Suite.T(), zerolog.InfoLevel) + s.log.Info().Msg("================> SetupTest") + defer func() { + s.log.Info().Msg("================> Finish SetupTest") + }() + + // need one access node with enabled circuit breaker + nodeConfigs := []testnet.NodeConfig{ + testnet.NewNodeConfig( + flow.RoleAccess, 
+ testnet.WithLogLevel(zerolog.InfoLevel), + testnet.WithAdditionalFlag("--circuit-breaker-enabled=true"), + testnet.WithAdditionalFlag(fmt.Sprintf("--circuit-breaker-restore-timeout=%s", cbRestoreTimeout.String())), + testnet.WithAdditionalFlag("--circuit-breaker-max-failures=1"), + testnet.WithAdditionalFlag(fmt.Sprintf("--collection-client-timeout=%s", requestTimeout.String())), + ), + } + // need one execution node + exeConfig := testnet.NewNodeConfig(flow.RoleExecution, testnet.WithLogLevel(zerolog.FatalLevel)) + nodeConfigs = append(nodeConfigs, exeConfig) + + // need one dummy verification node (unused ghost) + verConfig := testnet.NewNodeConfig(flow.RoleVerification, testnet.WithLogLevel(zerolog.FatalLevel), testnet.AsGhost()) + nodeConfigs = append(nodeConfigs, verConfig) + + // need one controllable collection node + collConfig := testnet.NewNodeConfig(flow.RoleCollection, testnet.WithLogLevel(zerolog.FatalLevel), testnet.WithAdditionalFlag("--hotstuff-proposal-duration=100ms")) + nodeConfigs = append(nodeConfigs, collConfig) + + // need three consensus nodes (unused ghost) + for n := 0; n < 3; n++ { + conID := unittest.IdentifierFixture() + nodeConfig := testnet.NewNodeConfig(flow.RoleConsensus, + testnet.WithLogLevel(zerolog.FatalLevel), + testnet.WithID(conID), + testnet.AsGhost()) + nodeConfigs = append(nodeConfigs, nodeConfig) + } + + conf := testnet.NewNetworkConfig("access_api_test", nodeConfigs) + s.net = testnet.PrepareFlowNetwork(s.T(), conf, flow.Localnet) + + // start the network + s.T().Logf("starting flow network with docker containers") + s.ctx, s.cancel = context.WithCancel(context.Background()) + + s.net.Start(s.ctx) +} + +func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { + ctx, cancel := context.WithCancel(s.ctx) + defer cancel() + + // 1. Get collection node + collectionContainer := s.net.ContainerByName("collection_1") + + + // 2. 
Get Access Node container and client + accessContainer := s.net.ContainerByName(testnet.PrimaryAN) + + // Check if access node was created with circuit breaker flags + require.True(s.T(), accessContainer.IsFlagSet("circuit-breaker-enabled")) + require.True(s.T(), accessContainer.IsFlagSet("circuit-breaker-restore-timeout")) + require.True(s.T(), accessContainer.IsFlagSet("circuit-breaker-max-failures")) + + accessClient, err := accessContainer.TestnetClient() + assert.NoError(s.T(), err, "failed to get collection node client") + + latestBlockID, err := accessClient.GetLatestBlockID(ctx) + require.NoError(s.T(), err) + + // create new account to deploy Counter to + accountPrivateKey := lib.RandomPrivateKey() + + accountKey := sdk.NewAccountKey(). + FromPrivateKey(accountPrivateKey). + SetHashAlgo(sdkcrypto.SHA3_256). + SetWeight(sdk.AccountKeyWeightThreshold) + + serviceAddress := sdk.Address(accessClient.Chain.ServiceAddress()) + + // Generate the account creation transaction + createAccountTx, err := templates.CreateAccount( + []*sdk.AccountKey{accountKey}, + []templates.Contract{ + { + Name: lib.CounterContract.Name, + Source: lib.CounterContract.ToCadence(), + }, + }, serviceAddress) + require.NoError(s.T(), err) + + createAccountTx. + SetReferenceBlockID(sdk.Identifier(latestBlockID)). + SetProposalKey(serviceAddress, 0, accessClient.GetSeqNumber()). + SetPayer(serviceAddress). + SetGasLimit(9999) + + // sign transaction + + childCtx, cancel := context.WithTimeout(ctx, time.Second*10) + signedTx, err := accessClient.SignTransaction(createAccountTx) + require.NoError(s.T(), err) + cancel() + + // 3. Disconnect collection node from network to activate Circuit Breaker + err = collectionContainer.Disconnect() + require.NoError(s.T(), err, "failed to pause connection node") + + //4. 
Send couple transactions to prove circuit breaker opens correctly + sendTransaction := func(ctx context.Context, tx *sdk.Transaction) (time.Duration, error) { + childCtx, cancel = context.WithTimeout(ctx, time.Second*10) + start := time.Now() + err := accessClient.SendTransaction(childCtx, tx) + duration := time.Since(start) + defer cancel() + + return duration, err + } + + // try to send transaction first time. Should wait at least timeout time and return Unknown error + duration, err := sendTransaction(ctx, signedTx) + assert.Equal(s.T(), codes.Unknown, status.Code(err)) + assert.GreaterOrEqual(s.T(), requestTimeout, duration) + + // try to send transaction second time. Should wait less than a second because CB is configured to break after the first failure + duration, err = sendTransaction(ctx, signedTx) + assert.Equal(s.T(), codes.Unknown, status.Code(err)) + assert.Greater(s.T(), time.Second, duration) + + // connect again + err = collectionContainer.Connect() + require.NoError(s.T(), err, "failed to start collection node") + // wait to restore circuit breaker + time.Sleep(cbRestoreTimeout) + + // try to send transaction third time. Transaction should be sent successfully + _, err = sendTransaction(ctx, signedTx) + require.NoError(s.T(), err, "transaction should be sent") +} From 84d33960a872445dd8c26389fc166cbdaf74c404 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 20 Jun 2023 21:02:56 +0300 Subject: [PATCH 17/56] Change circuit breaker config to pointer. 
--- cmd/access/node_builder/access_node_builder.go | 2 +- engine/access/rpc/backend/backend_transactions.go | 1 + engine/access/rpc/backend/connection_factory.go | 5 ++++- engine/access/rpc/backend/connection_factory_test.go | 4 ++-- engine/access/rpc/engine.go | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index cdf66e66bf0..1d08851e780 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -161,7 +161,7 @@ func DefaultAccessNodeConfig() *AccessNodeConfig { FixedExecutionNodeIDs: nil, ArchiveAddressList: nil, MaxMsgSize: grpcutils.DefaultMaxMsgSize, - CircuitBreakerConfig: backend.CircuitBreakerConfig{ + CircuitBreakerConfig: &backend.CircuitBreakerConfig{ Enabled: false, RestoreTimeout: time.Duration(60) * time.Second, MaxFailures: 5, diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index 048cb2d223f..f8c9f5385ea 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -128,6 +128,7 @@ func (b *backendTransactions) chooseCollectionNodes(tx *flow.TransactionBody, sa return nil, fmt.Errorf("could not get local cluster by txID: %x", tx.ID()) } + // samples only used when circuit breaker is disabled if !b.circuitBreakerEnabled { // select a random subset of collection nodes from the cluster to be tried in order targetNodes = targetNodes.Sample(sampleSize) diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index a5f63ab9eb1..acb635bd37c 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -62,7 +62,7 @@ type ConnectionFactoryImpl struct { AccessMetrics module.AccessMetrics Log zerolog.Logger mutex sync.Mutex - CircuitBreakerConfig 
CircuitBreakerConfig + CircuitBreakerConfig *CircuitBreakerConfig } type CircuitBreakerConfig struct { @@ -263,8 +263,10 @@ func (cf *ConnectionFactoryImpl) withChainUnaryInterceptor(timeout time.Duration if cf.CircuitBreakerConfig.Enabled { circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{ + // here the restore timeout is defined to automatically return the circuit breaker to the Half-Open state Timeout: cf.CircuitBreakerConfig.RestoreTimeout, ReadyToTrip: func(counts gobreaker.Counts) bool { + // here the number of maximum failures will be checked, before the circuit breaker goes to the Open state return counts.ConsecutiveFailures >= cf.CircuitBreakerConfig.MaxFailures }, }) @@ -278,6 +280,7 @@ invoker grpc.UnaryInvoker, opts ...grpc.CallOption, ) error { + // The invoker should be called from the circuit breaker's Execute, to catch each failure and react according to settings _, err := circuitBreaker.Execute(func() (interface{}, error) { err := invoker(ctx, method, req, reply, cc, opts...) 
diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index 99bbad2e05f..b7b7aedda40 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -428,7 +428,7 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { // set the execution grpc client requestTimeout connectionFactory.ExecutionNodeGRPCTimeout = requestTimeout // set the configuration for circuit breaker - connectionFactory.CircuitBreakerConfig = CircuitBreakerConfig{ + connectionFactory.CircuitBreakerConfig = &CircuitBreakerConfig{ Enabled: true, MaxFailures: 1, RestoreTimeout: circuitBreakerRestoreTimeout, @@ -498,7 +498,7 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { // set the collection grpc client requestTimeout connectionFactory.CollectionNodeGRPCTimeout = requestTimeout // set the configuration for circuit breaker - connectionFactory.CircuitBreakerConfig = CircuitBreakerConfig{ + connectionFactory.CircuitBreakerConfig = &CircuitBreakerConfig{ Enabled: true, MaxFailures: 1, RestoreTimeout: circuitBreakerRestoreTimeout, diff --git a/engine/access/rpc/engine.go b/engine/access/rpc/engine.go index c4b36c263b6..0634f908ba9 100644 --- a/engine/access/rpc/engine.go +++ b/engine/access/rpc/engine.go @@ -48,7 +48,7 @@ type Config struct { PreferredExecutionNodeIDs []string // preferred list of upstream execution node IDs FixedExecutionNodeIDs []string // fixed list of execution node IDs to choose from if no node node ID can be chosen from the PreferredExecutionNodeIDs ArchiveAddressList []string // the archive node address list to send script executions. 
when configured, script executions will be all sent to the archive node - CircuitBreakerConfig backend.CircuitBreakerConfig // the configuration for circuit breaker + CircuitBreakerConfig *backend.CircuitBreakerConfig // the configuration for circuit breaker } // Engine exposes the server with a simplified version of the Access API. From 152b7699d19ee7c894f292cc026391ec8577dae7 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 20 Jun 2023 21:10:08 +0300 Subject: [PATCH 18/56] Added CB config checks. --- engine/access/rpc/backend/connection_factory.go | 2 +- engine/access/rpc/engine.go | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index acb635bd37c..ffaa1c72172 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -261,7 +261,7 @@ func getGRPCAddress(address string, grpcPort uint) (string, error) { func (cf *ConnectionFactoryImpl) withChainUnaryInterceptor(timeout time.Duration) grpc.DialOption { var clientInterceptors []grpc.UnaryClientInterceptor - if cf.CircuitBreakerConfig.Enabled { + if cf.CircuitBreakerConfig != nil && cf.CircuitBreakerConfig.Enabled { circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{ // here restore timeout defined to automatically return circuit breaker to HalfClose state Timeout: cf.CircuitBreakerConfig.RestoreTimeout, diff --git a/engine/access/rpc/engine.go b/engine/access/rpc/engine.go index 0634f908ba9..7f9a62809c8 100644 --- a/engine/access/rpc/engine.go +++ b/engine/access/rpc/engine.go @@ -175,6 +175,11 @@ func NewBuilder(log zerolog.Logger, CircuitBreakerConfig: config.CircuitBreakerConfig, } + circuitBreakerEnabled := false + if config.CircuitBreakerConfig != nil { + circuitBreakerEnabled = config.CircuitBreakerConfig.Enabled + } + backend := backend.New(state, collectionRPC, historicalAccessNodes, @@ -194,7 +199,7 @@ 
func NewBuilder(log zerolog.Logger, log, backend.DefaultSnapshotHistoryLimit, config.ArchiveAddressList, - config.CircuitBreakerConfig.Enabled, + circuitBreakerEnabled, ) finalizedCache, finalizedCacheWorker, err := events.NewFinalizedHeaderCache(state) From 09669612ffd337ca40a3b4ec9a56ae793f1ddbbc Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 20 Jun 2023 21:11:42 +0300 Subject: [PATCH 19/56] remove unnecessary changes --- cmd/util/cmd/execution-state-extract/export_report.json | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cmd/util/cmd/execution-state-extract/export_report.json b/cmd/util/cmd/execution-state-extract/export_report.json index 35f744445ee..e69de29bb2d 100644 --- a/cmd/util/cmd/execution-state-extract/export_report.json +++ b/cmd/util/cmd/execution-state-extract/export_report.json @@ -1,6 +0,0 @@ -{ - "EpochCounter": 0, - "PreviousStateCommitment": "6d2eb6ae0aa575274a45362f9bd175ec6f030f79552c5fed27ada030d3799a53", - "CurrentStateCommitment": "6d2eb6ae0aa575274a45362f9bd175ec6f030f79552c5fed27ada030d3799a53", - "ReportSucceeded": true -} \ No newline at end of file From 691438e33de3732fcc9c53416cf2bde9aac39212 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 20 Jun 2023 21:14:59 +0300 Subject: [PATCH 20/56] revert unnecessary committed file --- cmd/util/cmd/execution-state-extract/export_report.json | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 cmd/util/cmd/execution-state-extract/export_report.json diff --git a/cmd/util/cmd/execution-state-extract/export_report.json b/cmd/util/cmd/execution-state-extract/export_report.json deleted file mode 100644 index e69de29bb2d..00000000000 From 1915b4a76745f91e9a67c5c52d1c1375b058257a Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 20 Jun 2023 21:48:52 +0300 Subject: [PATCH 21/56] Added comment to unit tests --- engine/access/rpc/backend/connection_factory_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index b7b7aedda40..8a60430b0a2 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -408,7 +408,7 @@ func TestConnectionPoolStale(t *testing.T) { assert.Equal(t, resp, expected) } -// TestCircuitBreakerExecutionNode +// TestCircuitBreakerExecutionNode tests circuit breaker states changed for execution nodes func TestCircuitBreakerExecutionNode(t *testing.T) { requestTimeout := 1 * time.Second circuitBreakerRestoreTimeout := 3 * time.Second @@ -477,7 +477,7 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { assert.Equal(t, nil, err) } -// TestCircuitBreakerCollectionNode +// TestCircuitBreakerCollectionNode tests circuit breaker states changed for collection nodes func TestCircuitBreakerCollectionNode(t *testing.T) { requestTimeout := 1 * time.Second circuitBreakerRestoreTimeout := 3 * time.Second @@ -536,7 +536,7 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { assert.Equal(t, gobreaker.ErrOpenState, err) assert.Greater(t, requestTimeout, duration) - cn.handler.On("Ping", testifymock.Anything, req).Unset() + //cn.handler.On("Ping", testifymock.Anything, req).Unset() cn.handler.On("Ping", testifymock.Anything, req).Return(resp, nil) //Wait until Circuit breaker go to Half-open state From 236269a1e30321825459123a571274cfb535a307 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Wed, 21 Jun 2023 14:43:55 +0300 Subject: [PATCH 22/56] linted --- .../access/access_circuit_breaker_test.go | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/integration/tests/access/access_circuit_breaker_test.go b/integration/tests/access/access_circuit_breaker_test.go index 0c025b11565..07b9f98ddbf 100644 --- a/integration/tests/access/access_circuit_breaker_test.go +++ b/integration/tests/access/access_circuit_breaker_test.go @@ -3,6 +3,16 @@ package 
access import ( "context" "fmt" + "testing" + "time" + + "github.com/rs/zerolog" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/stretchr/testify/suite" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + sdk "github.com/onflow/flow-go-sdk" sdkcrypto "github.com/onflow/flow-go-sdk/crypto" "github.com/onflow/flow-go-sdk/templates" @@ -10,14 +20,6 @@ import ( "github.com/onflow/flow-go/integration/tests/lib" "github.com/onflow/flow-go/model/flow" "github.com/onflow/flow-go/utils/unittest" - "github.com/rs/zerolog" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "github.com/stretchr/testify/suite" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" - "testing" - "time" ) func TestAccessCircuitBreaker(t *testing.T) { @@ -36,8 +38,8 @@ type AccessCircuitBreakerSuite struct { net *testnet.FlowNetwork } -var requestTimeout = 3*time.Second -var cbRestoreTimeout = 6*time.Second +var requestTimeout = 3 * time.Second +var cbRestoreTimeout = 6 * time.Second func (s *AccessCircuitBreakerSuite) TearDownTest() { s.log.Info().Msg("================> Start TearDownTest") @@ -46,7 +48,6 @@ func (s *AccessCircuitBreakerSuite) TearDownTest() { s.log.Info().Msg("================> Finish TearDownTest") } - func (s *AccessCircuitBreakerSuite) SetupTest() { s.log = unittest.LoggerForTest(s.Suite.T(), zerolog.InfoLevel) s.log.Info().Msg("================> SetupTest") @@ -104,7 +105,6 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { // 1. Get collection node collectionContainer := s.net.ContainerByName("collection_1") - // 2. 
Get Access Node container and client accessContainer := s.net.ContainerByName(testnet.PrimaryAN) From ac0d9e87ca24a6b1685f6d2cddd1baed94ca93c1 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Wed, 21 Jun 2023 18:30:37 +0300 Subject: [PATCH 23/56] simplify logic for interceptors --- apiproxy/access_api_proxy.go | 4 +-- .../node_builder/access_node_builder.go | 2 +- .../export_report.json | 6 ++++ engine/access/apiproxy/access_api_proxy.go | 4 +-- .../access/rpc/backend/connection_factory.go | 31 ++++++++++++------- 5 files changed, 31 insertions(+), 16 deletions(-) create mode 100644 cmd/util/cmd/execution-state-extract/export_report.json diff --git a/apiproxy/access_api_proxy.go b/apiproxy/access_api_proxy.go index dfe610f5857..0bd90bd0999 100644 --- a/apiproxy/access_api_proxy.go +++ b/apiproxy/access_api_proxy.go @@ -86,7 +86,7 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(grpcutils.DefaultMaxMsgSize)), grpc.WithInsecure(), //nolint:staticcheck - grpc.WithUnaryInterceptor(backend.WithClientUnaryInterceptor(timeout))) + backend.WithClientTimeoutOption(timeout)) if err != nil { return err } @@ -100,7 +100,7 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(grpcutils.DefaultMaxMsgSize)), grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)), - grpc.WithUnaryInterceptor(backend.WithClientUnaryInterceptor(timeout))) + backend.WithClientTimeoutOption(timeout)) if err != nil { return fmt.Errorf("cannot connect to %s %w", identity.Address, err) } diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index 766462f1ded..6e653805f44 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -923,7 +923,7 @@ func (builder *FlowAccessNodeBuilder) Build() (cmd.Node, error) 
{ builder.rpcConf.CollectionAddr, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(builder.rpcConf.MaxMsgSize))), grpc.WithTransportCredentials(insecure.NewCredentials()), - grpc.WithUnaryInterceptor(backend.WithClientUnaryInterceptor(builder.rpcConf.CollectionClientTimeout))) + backend.WithClientTimeoutOption(builder.rpcConf.CollectionClientTimeout)) if err != nil { return err } diff --git a/cmd/util/cmd/execution-state-extract/export_report.json b/cmd/util/cmd/execution-state-extract/export_report.json new file mode 100644 index 00000000000..6393b54664b --- /dev/null +++ b/cmd/util/cmd/execution-state-extract/export_report.json @@ -0,0 +1,6 @@ +{ + "EpochCounter": 0, + "PreviousStateCommitment": "f4a516703d828532a8537dc81f98fffecf6228733364d17efa3332e177592b54", + "CurrentStateCommitment": "f4a516703d828532a8537dc81f98fffecf6228733364d17efa3332e177592b54", + "ReportSucceeded": true +} \ No newline at end of file diff --git a/engine/access/apiproxy/access_api_proxy.go b/engine/access/apiproxy/access_api_proxy.go index ce95c1cca28..ed47ef167eb 100644 --- a/engine/access/apiproxy/access_api_proxy.go +++ b/engine/access/apiproxy/access_api_proxy.go @@ -65,7 +65,7 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(h.maxMsgSize))), grpc.WithTransportCredentials(insecure.NewCredentials()), - grpc.WithUnaryInterceptor(backend.WithClientUnaryInterceptor(timeout))) + backend.WithClientTimeoutOption(timeout)) if err != nil { return err } @@ -79,7 +79,7 @@ func (h *FlowAccessAPIForwarder) reconnectingClient(i int) error { identity.Address, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(h.maxMsgSize))), grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)), - grpc.WithUnaryInterceptor(backend.WithClientUnaryInterceptor(timeout))) + backend.WithClientTimeoutOption(timeout)) if err != nil { return fmt.Errorf("cannot connect to %s %w", identity.Address, err) } diff 
--git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index ffaa1c72172..00ecf14d2e2 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -92,6 +92,13 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D Timeout: timeout, } + var connInterceptors []grpc.UnaryClientInterceptor + cbInterceptor := cf.withCircuitBreakerInterceptor() + if cbInterceptor != nil { + connInterceptors = append(connInterceptors, cbInterceptor) + } + connInterceptors = append(connInterceptors, WithClientTimeoutInterceptor(timeout)) + // ClientConn's default KeepAlive on connections is indefinite, assuming the timeout isn't reached // The connections should be safe to be persisted and reused // https://pkg.go.dev/google.golang.org/grpc#WithKeepaliveParams @@ -101,7 +108,7 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(cf.MaxMsgSize))), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithKeepaliveParams(keepaliveParams), - cf.withChainUnaryInterceptor(timeout), + grpc.WithChainUnaryInterceptor(connInterceptors...), ) if err != nil { return nil, fmt.Errorf("failed to connect to address %s: %w", address, err) @@ -258,9 +265,7 @@ func getGRPCAddress(address string, grpcPort uint) (string, error) { return grpcAddress, nil } -func (cf *ConnectionFactoryImpl) withChainUnaryInterceptor(timeout time.Duration) grpc.DialOption { - var clientInterceptors []grpc.UnaryClientInterceptor - +func (cf *ConnectionFactoryImpl) withCircuitBreakerInterceptor() grpc.UnaryClientInterceptor { if cf.CircuitBreakerConfig != nil && cf.CircuitBreakerConfig.Enabled { circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{ // here restore timeout defined to automatically return circuit breaker to HalfClose state @@ -271,7 +276,7 @@ func (cf *ConnectionFactoryImpl) 
withChainUnaryInterceptor(timeout time.Duration }, }) - interceptor := func( + circuitBreakerInterceptor := func( ctx context.Context, method string, req interface{}, @@ -280,6 +285,7 @@ func (cf *ConnectionFactoryImpl) withChainUnaryInterceptor(timeout time.Duration invoker grpc.UnaryInvoker, opts ...grpc.CallOption, ) error { + fmt.Println("!!! circuitBreakerInterceptor") // The invoker should be called from circuit breaker execute, to catch each fails and react according to settings _, err := circuitBreaker.Execute(func() (interface{}, error) { err := invoker(ctx, method, req, reply, cc, opts...) @@ -288,15 +294,14 @@ func (cf *ConnectionFactoryImpl) withChainUnaryInterceptor(timeout time.Duration }) return err } - clientInterceptors = append(clientInterceptors, interceptor) - } - clientInterceptors = append(clientInterceptors, WithClientUnaryInterceptor(timeout)) + return circuitBreakerInterceptor + } - return grpc.WithChainUnaryInterceptor(clientInterceptors...) + return nil } -func WithClientUnaryInterceptor(timeout time.Duration) grpc.UnaryClientInterceptor { +func WithClientTimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInterceptor { clientTimeoutInterceptor := func( ctx context.Context, method string, @@ -310,7 +315,7 @@ func WithClientUnaryInterceptor(timeout time.Duration) grpc.UnaryClientIntercept ctxWithTimeout, cancel := context.WithTimeout(ctx, timeout) defer cancel() - + fmt.Println("!!! clientTimeoutInterceptor") // call the remote GRPC using the short context err := invoker(ctxWithTimeout, method, req, reply, cc, opts...) 
@@ -319,3 +324,7 @@ func WithClientUnaryInterceptor(timeout time.Duration) grpc.UnaryClientIntercept return clientTimeoutInterceptor } + +func WithClientTimeoutOption(timeout time.Duration) grpc.DialOption { + return grpc.WithUnaryInterceptor(WithClientTimeoutInterceptor(timeout)) +} From 6017683edd1e2bd1eda8bc495ddd7da4ee1f0861 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Wed, 21 Jun 2023 20:33:36 +0300 Subject: [PATCH 24/56] added invalidate interceptor --- .../export_report.json | 4 +- engine/access/rpc/backend/backend_accounts.go | 3 - engine/access/rpc/backend/backend_events.go | 3 - engine/access/rpc/backend/backend_scripts.go | 3 - .../rpc/backend/backend_transactions.go | 12 ---- .../access/rpc/backend/connection_factory.go | 62 ++++++++++++++++--- .../rpc/backend/connection_factory_test.go | 2 +- 7 files changed, 58 insertions(+), 31 deletions(-) diff --git a/cmd/util/cmd/execution-state-extract/export_report.json b/cmd/util/cmd/execution-state-extract/export_report.json index 6393b54664b..6f3e833a6a5 100644 --- a/cmd/util/cmd/execution-state-extract/export_report.json +++ b/cmd/util/cmd/execution-state-extract/export_report.json @@ -1,6 +1,6 @@ { "EpochCounter": 0, - "PreviousStateCommitment": "f4a516703d828532a8537dc81f98fffecf6228733364d17efa3332e177592b54", - "CurrentStateCommitment": "f4a516703d828532a8537dc81f98fffecf6228733364d17efa3332e177592b54", + "PreviousStateCommitment": "f305e934fb48402557dbf73b9b6abf6217a582ffa441ce6f6e84a22d7e82b887", + "CurrentStateCommitment": "f305e934fb48402557dbf73b9b6abf6217a582ffa441ce6f6e84a22d7e82b887", "ReportSucceeded": true } \ No newline at end of file diff --git a/engine/access/rpc/backend/backend_accounts.go b/engine/access/rpc/backend/backend_accounts.go index 119c59a8b15..f0e3bf1aed3 100644 --- a/engine/access/rpc/backend/backend_accounts.go +++ b/engine/access/rpc/backend/backend_accounts.go @@ -152,9 +152,6 @@ func (b *backendAccounts) tryGetAccount(ctx context.Context, execNode *flow.Iden resp, 
err := execRPCClient.GetAccountAtBlockID(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { - b.connFactory.InvalidateExecutionAPIClient(execNode.Address) - } return nil, err } return resp, nil diff --git a/engine/access/rpc/backend/backend_events.go b/engine/access/rpc/backend/backend_events.go index e1fcfb087d2..bc1c0f50f62 100644 --- a/engine/access/rpc/backend/backend_events.go +++ b/engine/access/rpc/backend/backend_events.go @@ -252,9 +252,6 @@ func (b *backendEvents) tryGetEvents(ctx context.Context, resp, err := execRPCClient.GetEventsForBlockIDs(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { - b.connFactory.InvalidateExecutionAPIClient(execNode.Address) - } return nil, err } return resp, nil diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index 2777944299b..cbb73fb25bc 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -206,9 +206,6 @@ func (b *backendScripts) tryExecuteScript(ctx context.Context, executorAddress s execResp, err := execRPCClient.ExecuteScriptAtBlockID(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { - b.connFactory.InvalidateExecutionAPIClient(executorAddress) - } return nil, status.Errorf(status.Code(err), "failed to execute the script on the execution node %s: %v", executorAddress, err) } return execResp.GetValue(), nil diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index 2f2111a9ca3..025be9c04e7 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -156,9 +156,6 @@ func (b *backendTransactions) sendTransactionToCollector(ctx context.Context, err = b.grpcTxSend(ctx, collectionRPC, tx) if err != nil { - if status.Code(err) == 
codes.Unavailable && !b.circuitBreakerEnabled { - b.connFactory.InvalidateAccessAPIClient(collectionNodeAddr) - } return fmt.Errorf("failed to send transaction to collection node at %s: %w", collectionNodeAddr, err) } return nil @@ -828,9 +825,6 @@ func (b *backendTransactions) tryGetTransactionResult( resp, err := execRPCClient.GetTransactionResult(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { - b.connFactory.InvalidateExecutionAPIClient(execNode.Address) - } return nil, err } @@ -891,9 +885,6 @@ func (b *backendTransactions) tryGetTransactionResultsByBlockID( resp, err := execRPCClient.GetTransactionResultsByBlockID(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { - b.connFactory.InvalidateExecutionAPIClient(execNode.Address) - } return nil, err } @@ -955,9 +946,6 @@ func (b *backendTransactions) tryGetTransactionResultByIndex( resp, err := execRPCClient.GetTransactionResultByIndex(ctx, req) if err != nil { - if status.Code(err) == codes.Unavailable && !b.circuitBreakerEnabled { - b.connFactory.InvalidateExecutionAPIClient(execNode.Address) - } return nil, err } diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 00ecf14d2e2..bca83eb70b2 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -8,6 +8,9 @@ import ( "sync" "time" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + lru "github.com/hashicorp/golang-lru" "github.com/onflow/flow/protobuf/go/flow/access" "github.com/onflow/flow/protobuf/go/flow/execution" @@ -24,6 +27,13 @@ import ( // DefaultClientTimeout is used when making a GRPC request to a collection node or an execution node const DefaultClientTimeout = 3 * time.Second +type clientType int + +const ( + AccessClient clientType = iota + ExecutionClient +) + // ConnectionFactory is used to create an access 
api client type ConnectionFactory interface { GetAccessAPIClient(address string) (access.AccessAPIClient, io.Closer, error) @@ -79,7 +89,7 @@ type CachedClient struct { } // createConnection creates new gRPC connections to remote node -func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.Duration) (*grpc.ClientConn, error) { +func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.Duration, clientType clientType) (*grpc.ClientConn, error) { if timeout == 0 { timeout = DefaultClientTimeout @@ -93,10 +103,17 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D } var connInterceptors []grpc.UnaryClientInterceptor + cbInterceptor := cf.withCircuitBreakerInterceptor() if cbInterceptor != nil { connInterceptors = append(connInterceptors, cbInterceptor) } + + ciInterceptor := cf.withClientInvalidationInterceptor(address, clientType) + if ciInterceptor != nil { + connInterceptors = append(connInterceptors, ciInterceptor) + } + connInterceptors = append(connInterceptors, WithClientTimeoutInterceptor(timeout)) // ClientConn's default KeepAlive on connections is indefinite, assuming the timeout isn't reached @@ -116,7 +133,7 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D return conn, nil } -func (cf *ConnectionFactoryImpl) retrieveConnection(grpcAddress string, timeout time.Duration) (*grpc.ClientConn, error) { +func (cf *ConnectionFactoryImpl) retrieveConnection(grpcAddress string, timeout time.Duration, clientType clientType) (*grpc.ClientConn, error) { var conn *grpc.ClientConn var store *CachedClient cacheHit := false @@ -143,7 +160,7 @@ func (cf *ConnectionFactoryImpl) retrieveConnection(grpcAddress string, timeout if conn == nil || conn.GetState() == connectivity.Shutdown { var err error - conn, err = cf.createConnection(grpcAddress, timeout) + conn, err = cf.createConnection(grpcAddress, timeout, clientType) if err != nil { return nil, err } @@ -170,14 
+187,14 @@ func (cf *ConnectionFactoryImpl) GetAccessAPIClient(address string) (access.Acce var conn *grpc.ClientConn if cf.ConnectionsCache != nil { - conn, err = cf.retrieveConnection(grpcAddress, cf.CollectionNodeGRPCTimeout) + conn, err = cf.retrieveConnection(grpcAddress, cf.CollectionNodeGRPCTimeout, AccessClient) if err != nil { return nil, nil, err } return access.NewAccessAPIClient(conn), &noopCloser{}, err } - conn, err = cf.createConnection(grpcAddress, cf.CollectionNodeGRPCTimeout) + conn, err = cf.createConnection(grpcAddress, cf.CollectionNodeGRPCTimeout, AccessClient) if err != nil { return nil, nil, err } @@ -203,14 +220,14 @@ func (cf *ConnectionFactoryImpl) GetExecutionAPIClient(address string) (executio var conn *grpc.ClientConn if cf.ConnectionsCache != nil { - conn, err = cf.retrieveConnection(grpcAddress, cf.ExecutionNodeGRPCTimeout) + conn, err = cf.retrieveConnection(grpcAddress, cf.ExecutionNodeGRPCTimeout, ExecutionClient) if err != nil { return nil, nil, err } return execution.NewExecutionAPIClient(conn), &noopCloser{}, nil } - conn, err = cf.createConnection(grpcAddress, cf.ExecutionNodeGRPCTimeout) + conn, err = cf.createConnection(grpcAddress, cf.ExecutionNodeGRPCTimeout, ExecutionClient) if err != nil { return nil, nil, err } @@ -265,6 +282,37 @@ func getGRPCAddress(address string, grpcPort uint) (string, error) { return grpcAddress, nil } +func (cf *ConnectionFactoryImpl) withClientInvalidationInterceptor(address string, clientType clientType) grpc.UnaryClientInterceptor { + if cf.CircuitBreakerConfig == nil || !cf.CircuitBreakerConfig.Enabled { + clientInvalidationInterceptor := func( + ctx context.Context, + method string, + req interface{}, + reply interface{}, + cc *grpc.ClientConn, + invoker grpc.UnaryInvoker, + opts ...grpc.CallOption, + ) error { + fmt.Println("!!! clientInvalidationInterceptor") + err := invoker(ctx, method, req, reply, cc, opts...) 
+ if status.Code(err) == codes.Unavailable { + switch clientType { + case AccessClient: + cf.InvalidateAccessAPIClient(address) + case ExecutionClient: + cf.InvalidateExecutionAPIClient(address) + } + } + + return err + } + + return clientInvalidationInterceptor + } + + return nil +} + func (cf *ConnectionFactoryImpl) withCircuitBreakerInterceptor() grpc.UnaryClientInterceptor { if cf.CircuitBreakerConfig != nil && cf.CircuitBreakerConfig.Enabled { circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{ diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index 8a60430b0a2..c81201fc953 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -536,7 +536,7 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { assert.Equal(t, gobreaker.ErrOpenState, err) assert.Greater(t, requestTimeout, duration) - //cn.handler.On("Ping", testifymock.Anything, req).Unset() + cn.handler.On("Ping", testifymock.Anything, req).Unset() cn.handler.On("Ping", testifymock.Anything, req).Return(resp, nil) //Wait until Circuit breaker go to Half-open state From 559c4e9cfb5ddbccf85353604b8eec916e40d1ef Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 22 Jun 2023 00:48:03 +0300 Subject: [PATCH 25/56] Added execution and collection nodes iterators. 
--- .../export_report.json | 4 +- engine/access/rpc/backend/backend.go | 79 +++++++++--------- engine/access/rpc/backend/backend_accounts.go | 20 ++--- engine/access/rpc/backend/backend_events.go | 23 +++--- engine/access/rpc/backend/backend_scripts.go | 27 +++--- engine/access/rpc/backend/backend_test.go | 28 +++++-- .../rpc/backend/backend_transactions.go | 82 ++++++++----------- engine/access/rpc/backend/node_iterator.go | 79 ++++++++++++++++++ 8 files changed, 210 insertions(+), 132 deletions(-) create mode 100644 engine/access/rpc/backend/node_iterator.go diff --git a/cmd/util/cmd/execution-state-extract/export_report.json b/cmd/util/cmd/execution-state-extract/export_report.json index 6f3e833a6a5..fe9eccd9727 100644 --- a/cmd/util/cmd/execution-state-extract/export_report.json +++ b/cmd/util/cmd/execution-state-extract/export_report.json @@ -1,6 +1,6 @@ { "EpochCounter": 0, - "PreviousStateCommitment": "f305e934fb48402557dbf73b9b6abf6217a582ffa441ce6f6e84a22d7e82b887", - "CurrentStateCommitment": "f305e934fb48402557dbf73b9b6abf6217a582ffa441ce6f6e84a22d7e82b887", + "PreviousStateCommitment": "5bdf6c643d9f0dbc78adf5923895df3bbf52867c406444d7fd101b9942f7d1f4", + "CurrentStateCommitment": "5bdf6c643d9f0dbc78adf5923895df3bbf52867c406444d7fd101b9942f7d1f4", "ReportSucceeded": true } \ No newline at end of file diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index 6c63ca1d35f..993f88fb046 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -22,9 +22,6 @@ import ( "github.com/onflow/flow-go/storage" ) -// maxExecutionNodesCnt is the max number of execution nodes that will be contacted to complete an execution api request -const maxExecutionNodesCnt = 3 - // minExecutionNodesCnt is the minimum number of execution nodes expected to have sent the execution receipt for a block const minExecutionNodesCnt = 2 @@ -109,44 +106,48 @@ func New( log.Fatal().Err(err).Msg("failed to initialize 
script logging cache") } + collIteratorFactory := CollectionNodeIteratorFactory{circuitBreakerEnabled: circuitBreakerEnabled} + execIteratorFactory := ExecutionNodeIteratorFactory{circuitBreakerEnabled: circuitBreakerEnabled} + b := &Backend{ state: state, // create the sub-backends backendScripts: backendScripts{ - headers: headers, - executionReceipts: executionReceipts, - connFactory: connFactory, - state: state, - log: log, - metrics: accessMetrics, - loggedScripts: loggedScripts, - archiveAddressList: archiveAddressList, - circuitBreakerEnabled: circuitBreakerEnabled, + headers: headers, + executionReceipts: executionReceipts, + connFactory: connFactory, + state: state, + log: log, + metrics: accessMetrics, + loggedScripts: loggedScripts, + archiveAddressList: archiveAddressList, + execIteratorFactory: execIteratorFactory, }, backendTransactions: backendTransactions{ - staticCollectionRPC: collectionRPC, - state: state, - chainID: chainID, - collections: collections, - blocks: blocks, - transactions: transactions, - executionReceipts: executionReceipts, - transactionValidator: configureTransactionValidator(state, chainID), - transactionMetrics: accessMetrics, - retry: retry, - connFactory: connFactory, - previousAccessNodes: historicalAccessNodes, - log: log, - circuitBreakerEnabled: circuitBreakerEnabled, + staticCollectionRPC: collectionRPC, + state: state, + chainID: chainID, + collections: collections, + blocks: blocks, + transactions: transactions, + executionReceipts: executionReceipts, + transactionValidator: configureTransactionValidator(state, chainID), + transactionMetrics: accessMetrics, + retry: retry, + connFactory: connFactory, + previousAccessNodes: historicalAccessNodes, + log: log, + collIteratorFactory: collIteratorFactory, + execIteratorFactory: execIteratorFactory, }, backendEvents: backendEvents{ - state: state, - headers: headers, - executionReceipts: executionReceipts, - connFactory: connFactory, - log: log, - maxHeightRange: 
maxHeightRange, - circuitBreakerEnabled: circuitBreakerEnabled, + state: state, + headers: headers, + executionReceipts: executionReceipts, + connFactory: connFactory, + log: log, + maxHeightRange: maxHeightRange, + execIteratorFactory: execIteratorFactory, }, backendBlockHeaders: backendBlockHeaders{ headers: headers, @@ -157,12 +158,12 @@ func New( state: state, }, backendAccounts: backendAccounts{ - state: state, - headers: headers, - executionReceipts: executionReceipts, - connFactory: connFactory, - log: log, - circuitBreakerEnabled: circuitBreakerEnabled, + state: state, + headers: headers, + executionReceipts: executionReceipts, + connFactory: connFactory, + log: log, + execIteratorFactory: execIteratorFactory, }, backendExecutionResults: backendExecutionResults{ executionResults: executionResults, diff --git a/engine/access/rpc/backend/backend_accounts.go b/engine/access/rpc/backend/backend_accounts.go index f0e3bf1aed3..b38f552631e 100644 --- a/engine/access/rpc/backend/backend_accounts.go +++ b/engine/access/rpc/backend/backend_accounts.go @@ -18,12 +18,12 @@ import ( ) type backendAccounts struct { - state protocol.State - headers storage.Headers - executionReceipts storage.ExecutionReceipts - connFactory ConnectionFactory - log zerolog.Logger - circuitBreakerEnabled bool + state protocol.State + headers storage.Headers + executionReceipts storage.ExecutionReceipts + connFactory ConnectionFactory + log zerolog.Logger + execIteratorFactory ExecutionNodeIteratorFactory } func (b *backendAccounts) GetAccount(ctx context.Context, address flow.Address) (*flow.Account, error) { @@ -108,13 +108,11 @@ func (b *backendAccounts) getAccountAtBlockID( // other ENs are logged and swallowed. If all ENs fail to return a valid response, then an // error aggregating all failures is returned. 
func (b *backendAccounts) getAccountFromAnyExeNode(ctx context.Context, execNodes flow.IdentityList, req *execproto.GetAccountAtBlockIDRequest) (*execproto.GetAccountAtBlockIDResponse, error) { - if !b.circuitBreakerEnabled { - execNodes = execNodes.Sample(maxExecutionNodesCnt) - } - var errors *multierror.Error - for _, execNode := range execNodes { + execNodeIter := b.execIteratorFactory.CreateNodeIterator(execNodes) + + for execNode := execNodeIter.Next(); execNode != nil; execNode = execNodeIter.Next() { // TODO: use the GRPC Client interceptor start := time.Now() diff --git a/engine/access/rpc/backend/backend_events.go b/engine/access/rpc/backend/backend_events.go index bc1c0f50f62..74a64cc10bb 100644 --- a/engine/access/rpc/backend/backend_events.go +++ b/engine/access/rpc/backend/backend_events.go @@ -21,13 +21,13 @@ import ( ) type backendEvents struct { - headers storage.Headers - executionReceipts storage.ExecutionReceipts - state protocol.State - connFactory ConnectionFactory - log zerolog.Logger - maxHeightRange uint - circuitBreakerEnabled bool + headers storage.Headers + executionReceipts storage.ExecutionReceipts + state protocol.State + connFactory ConnectionFactory + log zerolog.Logger + maxHeightRange uint + execIteratorFactory ExecutionNodeIteratorFactory } // GetEventsForHeightRange retrieves events for all sealed blocks between the start block height and @@ -210,13 +210,12 @@ func verifyAndConvertToAccessEvents( func (b *backendEvents) getEventsFromAnyExeNode(ctx context.Context, execNodes flow.IdentityList, req *execproto.GetEventsForBlockIDsRequest) (*execproto.GetEventsForBlockIDsResponse, *flow.Identity, error) { - if !b.circuitBreakerEnabled { - execNodes = execNodes.Sample(maxExecutionNodesCnt) - } - var errors *multierror.Error + + execNodeIter := b.execIteratorFactory.CreateNodeIterator(execNodes) + // try to get events from one of the execution nodes - for _, execNode := range execNodes { + for execNode := execNodeIter.Next(); execNode 
!= nil; execNode = execNodeIter.Next() { start := time.Now() resp, err := b.tryGetEvents(ctx, execNode, req) duration := time.Since(start) diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index cbb73fb25bc..8a1cd7b9cb2 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -24,15 +24,15 @@ import ( const uniqueScriptLoggingTimeWindow = 10 * time.Minute type backendScripts struct { - headers storage.Headers - executionReceipts storage.ExecutionReceipts - state protocol.State - connFactory ConnectionFactory - log zerolog.Logger - metrics module.BackendScriptsMetrics - loggedScripts *lru.Cache - archiveAddressList []string - circuitBreakerEnabled bool + headers storage.Headers + executionReceipts storage.ExecutionReceipts + state protocol.State + connFactory ConnectionFactory + log zerolog.Logger + metrics module.BackendScriptsMetrics + loggedScripts *lru.Cache + archiveAddressList []string + execIteratorFactory ExecutionNodeIteratorFactory } func (b *backendScripts) ExecuteScriptAtLatestBlock( @@ -100,12 +100,11 @@ func (b *backendScripts) findScriptExecutors( return nil, err } - if !b.circuitBreakerEnabled { - executors = executors.Sample(maxExecutionNodesCnt) - } - executorAddrs := make([]string, 0, len(executors)) - for _, executor := range executors { + execNodeIter := b.execIteratorFactory.CreateNodeIterator(executors) + + // try to get events from one of the execution nodes + for executor := execNodeIter.Next(); executor != nil; executor = execNodeIter.Next() { executorAddrs = append(executorAddrs, executor.Address) } return executorAddrs, nil diff --git a/engine/access/rpc/backend/backend_test.go b/engine/access/rpc/backend/backend_test.go index 8f572daf172..01f1c9bddc5 100644 --- a/engine/access/rpc/backend/backend_test.go +++ b/engine/access/rpc/backend/backend_test.go @@ -2160,13 +2160,22 @@ func (suite *Suite) TestExecutionNodesForBlockID() { if 
fixedENs != nil { fixedENIdentifiers = fixedENs.NodeIDs() } - actualList, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) - require.NoError(suite.T(), err) - actualList = actualList.Sample(maxExecutionNodesCnt) if expectedENs == nil { expectedENs = flow.IdentityList{} } + + allExecNodes, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) + require.NoError(suite.T(), err) + + execIteratorFactory := ExecutionNodeIteratorFactory{circuitBreakerEnabled: false} + execIterator := execIteratorFactory.CreateNodeIterator(allExecNodes) + + actualList := flow.IdentityList{} + for actual := execIterator.Next(); actual != nil; actual = execIterator.Next() { + actualList = append(actualList, actual) + } + if len(expectedENs) > maxExecutionNodesCnt { for _, actual := range actualList { require.Contains(suite.T(), expectedENs, actual) @@ -2182,9 +2191,18 @@ func (suite *Suite) TestExecutionNodesForBlockID() { attempt2Receipts = flow.ExecutionReceiptList{} attempt3Receipts = flow.ExecutionReceiptList{} suite.state.On("AtBlockID", mock.Anything).Return(suite.snapshot) - actualList, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) - actualList = actualList.Sample(maxExecutionNodesCnt) + + allExecNodes, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) require.NoError(suite.T(), err) + + execIteratorFactory := ExecutionNodeIteratorFactory{circuitBreakerEnabled: false} + execIterator := execIteratorFactory.CreateNodeIterator(allExecNodes) + + actualList := flow.IdentityList{} + for actual := execIterator.Next(); actual != nil; actual = execIterator.Next() { + actualList = append(actualList, actual) + } + require.Equal(suite.T(), len(actualList), maxExecutionNodesCnt) }) diff --git a/engine/access/rpc/backend/backend_transactions.go 
b/engine/access/rpc/backend/backend_transactions.go index 025be9c04e7..f9adf6fa4e4 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -24,23 +24,22 @@ import ( "github.com/onflow/flow-go/storage" ) -const collectionNodesToTry uint = 3 - type backendTransactions struct { - staticCollectionRPC accessproto.AccessAPIClient // rpc client tied to a fixed collection node - transactions storage.Transactions - executionReceipts storage.ExecutionReceipts - collections storage.Collections - blocks storage.Blocks - state protocol.State - chainID flow.ChainID - transactionMetrics module.TransactionMetrics - transactionValidator *access.TransactionValidator - retry *Retry - connFactory ConnectionFactory - previousAccessNodes []accessproto.AccessAPIClient - log zerolog.Logger - circuitBreakerEnabled bool + staticCollectionRPC accessproto.AccessAPIClient // rpc client tied to a fixed collection node + transactions storage.Transactions + executionReceipts storage.ExecutionReceipts + collections storage.Collections + blocks storage.Blocks + state protocol.State + chainID flow.ChainID + transactionMetrics module.TransactionMetrics + transactionValidator *access.TransactionValidator + retry *Retry + connFactory ConnectionFactory + previousAccessNodes []accessproto.AccessAPIClient + log zerolog.Logger + collIteratorFactory CollectionNodeIteratorFactory + execIteratorFactory ExecutionNodeIteratorFactory } // SendTransaction forwards the transaction to the collection node @@ -85,8 +84,8 @@ func (b *backendTransactions) trySendTransaction(ctx context.Context, tx *flow.T return b.grpcTxSend(ctx, b.staticCollectionRPC, tx) } - // otherwise choose a random set of collections nodes to try - collAddrs, err := b.chooseCollectionNodes(tx, collectionNodesToTry) + // otherwise choose all collection nodes to try + collNodes, err := b.chooseCollectionNodes(tx) if err != nil { return fmt.Errorf("failed to determine collection node 
for tx %x: %w", tx, err) } @@ -100,9 +99,11 @@ func (b *backendTransactions) trySendTransaction(ctx context.Context, tx *flow.T } defer logAnyError() + collNodeIter := b.collIteratorFactory.CreateNodeIterator(collNodes) + // try sending the transaction to one of the chosen collection nodes - for _, addr := range collAddrs { - err = b.sendTransactionToCollector(ctx, tx, addr) + for colNode := collNodeIter.Next(); colNode != nil; colNode = collNodeIter.Next() { + err = b.sendTransactionToCollector(ctx, tx, colNode.Address) if err == nil { return nil } @@ -114,7 +115,7 @@ func (b *backendTransactions) trySendTransaction(ctx context.Context, tx *flow.T // chooseCollectionNodes finds a random subset of size sampleSize of collection node addresses from the // collection node cluster responsible for the given tx -func (b *backendTransactions) chooseCollectionNodes(tx *flow.TransactionBody, sampleSize uint) ([]string, error) { +func (b *backendTransactions) chooseCollectionNodes(tx *flow.TransactionBody) (flow.IdentityList, error) { // retrieve the set of collector clusters clusters, err := b.state.Final().Epochs().Current().Clustering() @@ -128,19 +129,7 @@ func (b *backendTransactions) chooseCollectionNodes(tx *flow.TransactionBody, sa return nil, fmt.Errorf("could not get local cluster by txID: %x", tx.ID()) } - // samples ony used when circuit breaker is disabled - if !b.circuitBreakerEnabled { - // select a random subset of collection nodes from the cluster to be tried in order - targetNodes = targetNodes.Sample(sampleSize) - } - - // collect the addresses of all the chosen collection nodes - var targetAddrs = make([]string, len(targetNodes)) - for i, id := range targetNodes { - targetAddrs[i] = id.Address - } - - return targetAddrs, nil + return targetNodes, nil } // sendTransactionToCollection sends the transaction to the given collection node via grpc @@ -780,10 +769,6 @@ func (b *backendTransactions) getTransactionResultFromAnyExeNode( execNodes flow.IdentityList, 
req *execproto.GetTransactionResultRequest, ) (*execproto.GetTransactionResultResponse, error) { - if !b.circuitBreakerEnabled { - execNodes = execNodes.Sample(maxExecutionNodesCnt) - } - var errs *multierror.Error logAnyError := func() { errToReturn := errs.ErrorOrNil() @@ -792,8 +777,11 @@ func (b *backendTransactions) getTransactionResultFromAnyExeNode( } } defer logAnyError() + + execNodeIter := b.execIteratorFactory.CreateNodeIterator(execNodes) + // try to execute the script on one of the execution nodes - for _, execNode := range execNodes { + for execNode := execNodeIter.Next(); execNode != nil; execNode = execNodeIter.Next() { resp, err := b.tryGetTransactionResult(ctx, execNode, req) if err == nil { b.log.Debug(). @@ -836,10 +824,6 @@ func (b *backendTransactions) getTransactionResultsByBlockIDFromAnyExeNode( execNodes flow.IdentityList, req *execproto.GetTransactionsByBlockIDRequest, ) (*execproto.GetTransactionResultsResponse, error) { - if !b.circuitBreakerEnabled { - execNodes = execNodes.Sample(maxExecutionNodesCnt) - } - var errs *multierror.Error defer func() { @@ -854,7 +838,9 @@ func (b *backendTransactions) getTransactionResultsByBlockIDFromAnyExeNode( return nil, errors.New("zero execution nodes") } - for _, execNode := range execNodes { + execNodeIter := b.execIteratorFactory.CreateNodeIterator(execNodes) + + for execNode := execNodeIter.Next(); execNode != nil; execNode = execNodeIter.Next() { resp, err := b.tryGetTransactionResultsByBlockID(ctx, execNode, req) if err == nil { b.log.Debug(). 
@@ -896,10 +882,6 @@ func (b *backendTransactions) getTransactionResultByIndexFromAnyExeNode( execNodes flow.IdentityList, req *execproto.GetTransactionByIndexRequest, ) (*execproto.GetTransactionResultResponse, error) { - if !b.circuitBreakerEnabled { - execNodes = execNodes.Sample(maxExecutionNodesCnt) - } - var errs *multierror.Error logAnyError := func() { errToReturn := errs.ErrorOrNil() @@ -913,8 +895,10 @@ func (b *backendTransactions) getTransactionResultByIndexFromAnyExeNode( return nil, errors.New("zero execution nodes provided") } + execNodeIter := b.execIteratorFactory.CreateNodeIterator(execNodes) + // try to execute the script on one of the execution nodes - for _, execNode := range execNodes { + for execNode := execNodeIter.Next(); execNode != nil; execNode = execNodeIter.Next() { resp, err := b.tryGetTransactionResultByIndex(ctx, execNode, req) if err == nil { b.log.Debug(). diff --git a/engine/access/rpc/backend/node_iterator.go b/engine/access/rpc/backend/node_iterator.go new file mode 100644 index 00000000000..a929f0e7899 --- /dev/null +++ b/engine/access/rpc/backend/node_iterator.go @@ -0,0 +1,79 @@ +package backend + +import "github.com/onflow/flow-go/model/flow" + +// maxExecutionNodesCnt is the max number of execution nodes that will be contacted to complete an execution api request +const maxExecutionNodesCnt = 3 + +const collectionNodesToTry = 3 + +type NodeIterator interface { + Next() *flow.Identity +} + +type NodeIteratorFactory interface { + CreateNodeIterator(nodes flow.IdentityList) NodeIterator +} + +var _ NodeIteratorFactory = (*ExecutionNodeIteratorFactory)(nil) +var _ NodeIterator = (*ExecutionNodeIterator)(nil) +var _ NodeIteratorFactory = (*CollectionNodeIteratorFactory)(nil) +var _ NodeIterator = (*CollectionNodeIterator)(nil) + +type ExecutionNodeIteratorFactory struct { + circuitBreakerEnabled bool +} + +func (e *ExecutionNodeIteratorFactory) CreateNodeIterator(nodes flow.IdentityList) NodeIterator { + if 
!e.circuitBreakerEnabled { + nodes = nodes.Sample(maxExecutionNodesCnt) + } + + return &ExecutionNodeIterator{ + nodes: nodes, + index: 0, + } +} + +type ExecutionNodeIterator struct { + nodes flow.IdentityList + index int +} + +func (e *ExecutionNodeIterator) Next() *flow.Identity { + if e.index < len(e.nodes) { + next := e.nodes[e.index] + e.index++ + return next + } + return nil +} + +type CollectionNodeIteratorFactory struct { + circuitBreakerEnabled bool +} + +func (c *CollectionNodeIteratorFactory) CreateNodeIterator(nodes flow.IdentityList) NodeIterator { + if !c.circuitBreakerEnabled { + nodes = nodes.Sample(collectionNodesToTry) + } + + return &CollectionNodeIterator{ + nodes: nodes, + index: 0, + } +} + +type CollectionNodeIterator struct { + nodes flow.IdentityList + index int +} + +func (c *CollectionNodeIterator) Next() *flow.Identity { + if c.index < len(c.nodes) { + next := c.nodes[c.index] + c.index++ + return next + } + return nil +} From 41edd8f7abb282183b075dcbb36815ea96bfcf96 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 22 Jun 2023 11:09:27 +0300 Subject: [PATCH 26/56] Removed debug output --- engine/access/rpc/backend/connection_factory.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index bca83eb70b2..7e447fe94e4 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -293,7 +293,6 @@ func (cf *ConnectionFactoryImpl) withClientInvalidationInterceptor(address strin invoker grpc.UnaryInvoker, opts ...grpc.CallOption, ) error { - fmt.Println("!!! clientInvalidationInterceptor") err := invoker(ctx, method, req, reply, cc, opts...) 
if status.Code(err) == codes.Unavailable { switch clientType { @@ -333,7 +332,6 @@ func (cf *ConnectionFactoryImpl) withCircuitBreakerInterceptor() grpc.UnaryClien invoker grpc.UnaryInvoker, opts ...grpc.CallOption, ) error { - fmt.Println("!!! circuitBreakerInterceptor") // The invoker should be called from circuit breaker execute, to catch each fails and react according to settings _, err := circuitBreaker.Execute(func() (interface{}, error) { err := invoker(ctx, method, req, reply, cc, opts...) @@ -363,7 +361,6 @@ func WithClientTimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInterce ctxWithTimeout, cancel := context.WithTimeout(ctx, timeout) defer cancel() - fmt.Println("!!! clientTimeoutInterceptor") // call the remote GRPC using the short context err := invoker(ctxWithTimeout, method, req, reply, cc, opts...) From bbf5bebdeec9afb6b2f2836587a3f40b4302787f Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 22 Jun 2023 15:33:24 +0300 Subject: [PATCH 27/56] Renamed node selector and interceptor creator functions --- .../node_builder/access_node_builder.go | 2 +- .../export_report.json | 6 --- engine/access/rpc/backend/backend.go | 12 +++-- engine/access/rpc/backend/backend_accounts.go | 6 +-- engine/access/rpc/backend/backend_events.go | 6 +-- engine/access/rpc/backend/backend_scripts.go | 6 +-- engine/access/rpc/backend/backend_test.go | 12 ++--- .../rpc/backend/backend_transactions.go | 19 ++++---- .../access/rpc/backend/connection_factory.go | 14 +++--- engine/access/rpc/backend/node_iterator.go | 45 ++++++++----------- 10 files changed, 55 insertions(+), 73 deletions(-) delete mode 100644 cmd/util/cmd/execution-state-extract/export_report.json diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index 6e653805f44..151bcbbf74f 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -163,7 +163,7 @@ func 
DefaultAccessNodeConfig() *AccessNodeConfig { MaxMsgSize: grpcutils.DefaultMaxMsgSize, CircuitBreakerConfig: &backend.CircuitBreakerConfig{ Enabled: false, - RestoreTimeout: time.Duration(60) * time.Second, + RestoreTimeout: 60 * time.Second, MaxFailures: 5, }, }, diff --git a/cmd/util/cmd/execution-state-extract/export_report.json b/cmd/util/cmd/execution-state-extract/export_report.json deleted file mode 100644 index fe9eccd9727..00000000000 --- a/cmd/util/cmd/execution-state-extract/export_report.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "EpochCounter": 0, - "PreviousStateCommitment": "5bdf6c643d9f0dbc78adf5923895df3bbf52867c406444d7fd101b9942f7d1f4", - "CurrentStateCommitment": "5bdf6c643d9f0dbc78adf5923895df3bbf52867c406444d7fd101b9942f7d1f4", - "ReportSucceeded": true -} \ No newline at end of file diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index 993f88fb046..42a0fcd26a1 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -106,8 +106,7 @@ func New( log.Fatal().Err(err).Msg("failed to initialize script logging cache") } - collIteratorFactory := CollectionNodeIteratorFactory{circuitBreakerEnabled: circuitBreakerEnabled} - execIteratorFactory := ExecutionNodeIteratorFactory{circuitBreakerEnabled: circuitBreakerEnabled} + nodeSelectorFactory := NodeSelectorFactory{circuitBreakerEnabled: circuitBreakerEnabled} b := &Backend{ state: state, @@ -121,7 +120,7 @@ func New( metrics: accessMetrics, loggedScripts: loggedScripts, archiveAddressList: archiveAddressList, - execIteratorFactory: execIteratorFactory, + nodeSelectorFactory: nodeSelectorFactory, }, backendTransactions: backendTransactions{ staticCollectionRPC: collectionRPC, @@ -137,8 +136,7 @@ func New( connFactory: connFactory, previousAccessNodes: historicalAccessNodes, log: log, - collIteratorFactory: collIteratorFactory, - execIteratorFactory: execIteratorFactory, + nodeSelectorFactory: nodeSelectorFactory, }, 
backendEvents: backendEvents{ state: state, @@ -147,7 +145,7 @@ func New( connFactory: connFactory, log: log, maxHeightRange: maxHeightRange, - execIteratorFactory: execIteratorFactory, + nodeSelectorFactory: nodeSelectorFactory, }, backendBlockHeaders: backendBlockHeaders{ headers: headers, @@ -163,7 +161,7 @@ func New( executionReceipts: executionReceipts, connFactory: connFactory, log: log, - execIteratorFactory: execIteratorFactory, + nodeSelectorFactory: nodeSelectorFactory, }, backendExecutionResults: backendExecutionResults{ executionResults: executionResults, diff --git a/engine/access/rpc/backend/backend_accounts.go b/engine/access/rpc/backend/backend_accounts.go index b38f552631e..f3fd12c82f9 100644 --- a/engine/access/rpc/backend/backend_accounts.go +++ b/engine/access/rpc/backend/backend_accounts.go @@ -23,7 +23,7 @@ type backendAccounts struct { executionReceipts storage.ExecutionReceipts connFactory ConnectionFactory log zerolog.Logger - execIteratorFactory ExecutionNodeIteratorFactory + nodeSelectorFactory NodeSelectorFactory } func (b *backendAccounts) GetAccount(ctx context.Context, address flow.Address) (*flow.Account, error) { @@ -110,9 +110,9 @@ func (b *backendAccounts) getAccountAtBlockID( func (b *backendAccounts) getAccountFromAnyExeNode(ctx context.Context, execNodes flow.IdentityList, req *execproto.GetAccountAtBlockIDRequest) (*execproto.GetAccountAtBlockIDResponse, error) { var errors *multierror.Error - execNodeIter := b.execIteratorFactory.CreateNodeIterator(execNodes) + execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(execNodes) - for execNode := execNodeIter.Next(); execNode != nil; execNode = execNodeIter.Next() { + for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { // TODO: use the GRPC Client interceptor start := time.Now() diff --git a/engine/access/rpc/backend/backend_events.go b/engine/access/rpc/backend/backend_events.go index 74a64cc10bb..b82abf0b8c1 100644 --- 
a/engine/access/rpc/backend/backend_events.go +++ b/engine/access/rpc/backend/backend_events.go @@ -27,7 +27,7 @@ type backendEvents struct { connFactory ConnectionFactory log zerolog.Logger maxHeightRange uint - execIteratorFactory ExecutionNodeIteratorFactory + nodeSelectorFactory NodeSelectorFactory } // GetEventsForHeightRange retrieves events for all sealed blocks between the start block height and @@ -212,10 +212,10 @@ func (b *backendEvents) getEventsFromAnyExeNode(ctx context.Context, req *execproto.GetEventsForBlockIDsRequest) (*execproto.GetEventsForBlockIDsResponse, *flow.Identity, error) { var errors *multierror.Error - execNodeIter := b.execIteratorFactory.CreateNodeIterator(execNodes) + execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(execNodes) // try to get events from one of the execution nodes - for execNode := execNodeIter.Next(); execNode != nil; execNode = execNodeIter.Next() { + for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { start := time.Now() resp, err := b.tryGetEvents(ctx, execNode, req) duration := time.Since(start) diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index 8a1cd7b9cb2..3d251139531 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -32,7 +32,7 @@ type backendScripts struct { metrics module.BackendScriptsMetrics loggedScripts *lru.Cache archiveAddressList []string - execIteratorFactory ExecutionNodeIteratorFactory + nodeSelectorFactory NodeSelectorFactory } func (b *backendScripts) ExecuteScriptAtLatestBlock( @@ -101,10 +101,10 @@ func (b *backendScripts) findScriptExecutors( } executorAddrs := make([]string, 0, len(executors)) - execNodeIter := b.execIteratorFactory.CreateNodeIterator(executors) + execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(executors) // try to get events from one of the execution nodes - for executor := 
execNodeIter.Next(); executor != nil; executor = execNodeIter.Next() { + for executor := execNodeSelector.Next(); executor != nil; executor = execNodeSelector.Next() { executorAddrs = append(executorAddrs, executor.Address) } return executorAddrs, nil diff --git a/engine/access/rpc/backend/backend_test.go b/engine/access/rpc/backend/backend_test.go index 01f1c9bddc5..eed923727d0 100644 --- a/engine/access/rpc/backend/backend_test.go +++ b/engine/access/rpc/backend/backend_test.go @@ -2168,11 +2168,11 @@ func (suite *Suite) TestExecutionNodesForBlockID() { allExecNodes, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) require.NoError(suite.T(), err) - execIteratorFactory := ExecutionNodeIteratorFactory{circuitBreakerEnabled: false} - execIterator := execIteratorFactory.CreateNodeIterator(allExecNodes) + execNodeSelectorFactory := NodeSelectorFactory{circuitBreakerEnabled: false} + execSelector := execNodeSelectorFactory.SelectExecutionNodes(allExecNodes) actualList := flow.IdentityList{} - for actual := execIterator.Next(); actual != nil; actual = execIterator.Next() { + for actual := execSelector.Next(); actual != nil; actual = execSelector.Next() { actualList = append(actualList, actual) } @@ -2195,11 +2195,11 @@ func (suite *Suite) TestExecutionNodesForBlockID() { allExecNodes, err := executionNodesForBlockID(context.Background(), block.ID(), suite.receipts, suite.state, suite.log) require.NoError(suite.T(), err) - execIteratorFactory := ExecutionNodeIteratorFactory{circuitBreakerEnabled: false} - execIterator := execIteratorFactory.CreateNodeIterator(allExecNodes) + execNodeSelectorFactory := NodeSelectorFactory{circuitBreakerEnabled: false} + execSelector := execNodeSelectorFactory.SelectExecutionNodes(allExecNodes) actualList := flow.IdentityList{} - for actual := execIterator.Next(); actual != nil; actual = execIterator.Next() { + for actual := execSelector.Next(); actual != nil; actual = 
execSelector.Next() { actualList = append(actualList, actual) } diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index f9adf6fa4e4..05cd0645e42 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -38,8 +38,7 @@ type backendTransactions struct { connFactory ConnectionFactory previousAccessNodes []accessproto.AccessAPIClient log zerolog.Logger - collIteratorFactory CollectionNodeIteratorFactory - execIteratorFactory ExecutionNodeIteratorFactory + nodeSelectorFactory NodeSelectorFactory } // SendTransaction forwards the transaction to the collection node @@ -99,10 +98,10 @@ func (b *backendTransactions) trySendTransaction(ctx context.Context, tx *flow.T } defer logAnyError() - collNodeIter := b.collIteratorFactory.CreateNodeIterator(collNodes) + collNodeSelector := b.nodeSelectorFactory.SelectCollectionNodes(collNodes) // try sending the transaction to one of the chosen collection nodes - for colNode := collNodeIter.Next(); colNode != nil; colNode = collNodeIter.Next() { + for colNode := collNodeSelector.Next(); colNode != nil; colNode = collNodeSelector.Next() { err = b.sendTransactionToCollector(ctx, tx, colNode.Address) if err == nil { return nil @@ -778,10 +777,10 @@ func (b *backendTransactions) getTransactionResultFromAnyExeNode( } defer logAnyError() - execNodeIter := b.execIteratorFactory.CreateNodeIterator(execNodes) + execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(execNodes) // try to execute the script on one of the execution nodes - for execNode := execNodeIter.Next(); execNode != nil; execNode = execNodeIter.Next() { + for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { resp, err := b.tryGetTransactionResult(ctx, execNode, req) if err == nil { b.log.Debug(). 
@@ -838,9 +837,9 @@ func (b *backendTransactions) getTransactionResultsByBlockIDFromAnyExeNode( return nil, errors.New("zero execution nodes") } - execNodeIter := b.execIteratorFactory.CreateNodeIterator(execNodes) + execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(execNodes) - for execNode := execNodeIter.Next(); execNode != nil; execNode = execNodeIter.Next() { + for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { resp, err := b.tryGetTransactionResultsByBlockID(ctx, execNode, req) if err == nil { b.log.Debug(). @@ -895,10 +894,10 @@ func (b *backendTransactions) getTransactionResultByIndexFromAnyExeNode( return nil, errors.New("zero execution nodes provided") } - execNodeIter := b.execIteratorFactory.CreateNodeIterator(execNodes) + execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(execNodes) // try to execute the script on one of the execution nodes - for execNode := execNodeIter.Next(); execNode != nil; execNode = execNodeIter.Next() { + for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { resp, err := b.tryGetTransactionResultByIndex(ctx, execNode, req) if err == nil { b.log.Debug(). 
diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 7e447fe94e4..2df7cb305c0 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -104,17 +104,17 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D var connInterceptors []grpc.UnaryClientInterceptor - cbInterceptor := cf.withCircuitBreakerInterceptor() + cbInterceptor := cf.createCircuitBreakerInterceptor() if cbInterceptor != nil { connInterceptors = append(connInterceptors, cbInterceptor) } - ciInterceptor := cf.withClientInvalidationInterceptor(address, clientType) + ciInterceptor := cf.createClientInvalidationInterceptor(address, clientType) if ciInterceptor != nil { connInterceptors = append(connInterceptors, ciInterceptor) } - connInterceptors = append(connInterceptors, WithClientTimeoutInterceptor(timeout)) + connInterceptors = append(connInterceptors, createClientTimeoutInterceptor(timeout)) // ClientConn's default KeepAlive on connections is indefinite, assuming the timeout isn't reached // The connections should be safe to be persisted and reused @@ -282,7 +282,7 @@ func getGRPCAddress(address string, grpcPort uint) (string, error) { return grpcAddress, nil } -func (cf *ConnectionFactoryImpl) withClientInvalidationInterceptor(address string, clientType clientType) grpc.UnaryClientInterceptor { +func (cf *ConnectionFactoryImpl) createClientInvalidationInterceptor(address string, clientType clientType) grpc.UnaryClientInterceptor { if cf.CircuitBreakerConfig == nil || !cf.CircuitBreakerConfig.Enabled { clientInvalidationInterceptor := func( ctx context.Context, @@ -312,7 +312,7 @@ func (cf *ConnectionFactoryImpl) withClientInvalidationInterceptor(address strin return nil } -func (cf *ConnectionFactoryImpl) withCircuitBreakerInterceptor() grpc.UnaryClientInterceptor { +func (cf *ConnectionFactoryImpl) createCircuitBreakerInterceptor() 
grpc.UnaryClientInterceptor { if cf.CircuitBreakerConfig != nil && cf.CircuitBreakerConfig.Enabled { circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{ // here restore timeout defined to automatically return circuit breaker to HalfClose state @@ -347,7 +347,7 @@ func (cf *ConnectionFactoryImpl) withCircuitBreakerInterceptor() grpc.UnaryClien return nil } -func WithClientTimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInterceptor { +func createClientTimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInterceptor { clientTimeoutInterceptor := func( ctx context.Context, method string, @@ -371,5 +371,5 @@ func WithClientTimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInterce } func WithClientTimeoutOption(timeout time.Duration) grpc.DialOption { - return grpc.WithUnaryInterceptor(WithClientTimeoutInterceptor(timeout)) + return grpc.WithUnaryInterceptor(createClientTimeoutInterceptor(timeout)) } diff --git a/engine/access/rpc/backend/node_iterator.go b/engine/access/rpc/backend/node_iterator.go index a929f0e7899..df6e0cf395b 100644 --- a/engine/access/rpc/backend/node_iterator.go +++ b/engine/access/rpc/backend/node_iterator.go @@ -7,25 +7,16 @@ const maxExecutionNodesCnt = 3 const collectionNodesToTry = 3 -type NodeIterator interface { +type NodeSelector interface { Next() *flow.Identity } -type NodeIteratorFactory interface { - CreateNodeIterator(nodes flow.IdentityList) NodeIterator -} - -var _ NodeIteratorFactory = (*ExecutionNodeIteratorFactory)(nil) -var _ NodeIterator = (*ExecutionNodeIterator)(nil) -var _ NodeIteratorFactory = (*CollectionNodeIteratorFactory)(nil) -var _ NodeIterator = (*CollectionNodeIterator)(nil) - -type ExecutionNodeIteratorFactory struct { +type NodeSelectorFactory struct { circuitBreakerEnabled bool } -func (e *ExecutionNodeIteratorFactory) CreateNodeIterator(nodes flow.IdentityList) NodeIterator { - if !e.circuitBreakerEnabled { +func (n *NodeSelectorFactory) SelectExecutionNodes(nodes 
flow.IdentityList) NodeSelector { + if !n.circuitBreakerEnabled { nodes = nodes.Sample(maxExecutionNodesCnt) } @@ -35,6 +26,19 @@ func (e *ExecutionNodeIteratorFactory) CreateNodeIterator(nodes flow.IdentityLis } } +func (n *NodeSelectorFactory) SelectCollectionNodes(nodes flow.IdentityList) NodeSelector { + if !n.circuitBreakerEnabled { + nodes = nodes.Sample(collectionNodesToTry) + } + + return &CollectionNodeIterator{ + nodes: nodes, + index: 0, + } +} + +var _ NodeSelector = (*ExecutionNodeIterator)(nil) + type ExecutionNodeIterator struct { nodes flow.IdentityList index int @@ -49,20 +53,7 @@ func (e *ExecutionNodeIterator) Next() *flow.Identity { return nil } -type CollectionNodeIteratorFactory struct { - circuitBreakerEnabled bool -} - -func (c *CollectionNodeIteratorFactory) CreateNodeIterator(nodes flow.IdentityList) NodeIterator { - if !c.circuitBreakerEnabled { - nodes = nodes.Sample(collectionNodesToTry) - } - - return &CollectionNodeIterator{ - nodes: nodes, - index: 0, - } -} +var _ NodeSelector = (*CollectionNodeIterator)(nil) type CollectionNodeIterator struct { nodes flow.IdentityList From 6054ef7bb9fca39ef4e3baa2da7ecf56411f8514 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 22 Jun 2023 16:40:11 +0300 Subject: [PATCH 28/56] Added comments --- .../node_builder/access_node_builder.go | 8 +- .../access/rpc/backend/connection_factory.go | 9 ++ engine/access/rpc/backend/node_iterator.go | 70 --------------- engine/access/rpc/backend/node_selector.go | 86 +++++++++++++++++++ 4 files changed, 99 insertions(+), 74 deletions(-) delete mode 100644 engine/access/rpc/backend/node_iterator.go create mode 100644 engine/access/rpc/backend/node_selector.go diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index 151bcbbf74f..d0b7696c7a1 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -687,9 +687,9 @@ func (builder 
*FlowAccessNodeBuilder) extraFlags() { flags.StringToIntVar(&builder.apiBurstlimits, "api-burst-limits", defaultConfig.apiBurstlimits, "burst limits for Access API methods e.g. Ping=100,GetTransaction=100 etc.") flags.BoolVar(&builder.supportsObserver, "supports-observer", defaultConfig.supportsObserver, "true if this staked access node supports observer or follower connections") flags.StringVar(&builder.PublicNetworkConfig.BindAddress, "public-network-address", defaultConfig.PublicNetworkConfig.BindAddress, "staked access node's public network bind address") - flags.BoolVar(&builder.rpcConf.CircuitBreakerConfig.Enabled, "circuit-breaker-enabled", defaultConfig.rpcConf.CircuitBreakerConfig.Enabled, "whether to enable the circuit breaker for collection and execution node connections") - flags.DurationVar(&builder.rpcConf.CircuitBreakerConfig.RestoreTimeout, "circuit-breaker-restore-timeout", defaultConfig.rpcConf.CircuitBreakerConfig.RestoreTimeout, "initial timeout for circuit breaker to try connect again. Default value is 60s") - flags.Uint32Var(&builder.rpcConf.CircuitBreakerConfig.MaxFailures, "circuit-breaker-max-failures", defaultConfig.rpcConf.CircuitBreakerConfig.MaxFailures, "number of consecutive failures to break connection. Default value is 5") + flags.BoolVar(&builder.rpcConf.CircuitBreakerConfig.Enabled, "circuit-breaker-enabled", defaultConfig.rpcConf.CircuitBreakerConfig.Enabled, "specifies whether the circuit breaker is enabled for collection and execution API clients.") + flags.DurationVar(&builder.rpcConf.CircuitBreakerConfig.RestoreTimeout, "circuit-breaker-restore-timeout", defaultConfig.rpcConf.CircuitBreakerConfig.RestoreTimeout, "duration after which the circuit breaker will restore the connection to the client after closing it due to failures. 
Default value is 60s") + flags.Uint32Var(&builder.rpcConf.CircuitBreakerConfig.MaxFailures, "circuit-breaker-max-failures", defaultConfig.rpcConf.CircuitBreakerConfig.MaxFailures, "maximum number of failed calls to the client that will cause the circuit breaker to close the connection. Default value is 5") // ExecutionDataRequester config flags.BoolVar(&builder.executionDataSyncEnabled, "execution-data-sync-enabled", defaultConfig.executionDataSyncEnabled, "whether to enable the execution data sync protocol") @@ -756,7 +756,7 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { } if builder.rpcConf.CircuitBreakerConfig.Enabled { if builder.rpcConf.CircuitBreakerConfig.MaxFailures == 0 { - return errors.New("circuit-breaker-max-request-to-break must be greater than 0") + return errors.New("circuit-breaker-max-failures must be greater than 0") } } diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 2df7cb305c0..cab254e879e 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -75,6 +75,15 @@ type ConnectionFactoryImpl struct { CircuitBreakerConfig *CircuitBreakerConfig } +// CircuitBreakerConfig is a configuration struct for the circuit breaker. +// +// Enabled specifies whether the circuit breaker is enabled for collection and execution API clients. +// +// RestoreTimeout specifies the duration after which the circuit breaker will restore the connection to the client +// after closing it due to failures. +// +// MaxFailures specifies the maximum number of failed calls to the client that will cause the circuit breaker +// to close the connection. 
type CircuitBreakerConfig struct { Enabled bool RestoreTimeout time.Duration diff --git a/engine/access/rpc/backend/node_iterator.go b/engine/access/rpc/backend/node_iterator.go deleted file mode 100644 index df6e0cf395b..00000000000 --- a/engine/access/rpc/backend/node_iterator.go +++ /dev/null @@ -1,70 +0,0 @@ -package backend - -import "github.com/onflow/flow-go/model/flow" - -// maxExecutionNodesCnt is the max number of execution nodes that will be contacted to complete an execution api request -const maxExecutionNodesCnt = 3 - -const collectionNodesToTry = 3 - -type NodeSelector interface { - Next() *flow.Identity -} - -type NodeSelectorFactory struct { - circuitBreakerEnabled bool -} - -func (n *NodeSelectorFactory) SelectExecutionNodes(nodes flow.IdentityList) NodeSelector { - if !n.circuitBreakerEnabled { - nodes = nodes.Sample(maxExecutionNodesCnt) - } - - return &ExecutionNodeIterator{ - nodes: nodes, - index: 0, - } -} - -func (n *NodeSelectorFactory) SelectCollectionNodes(nodes flow.IdentityList) NodeSelector { - if !n.circuitBreakerEnabled { - nodes = nodes.Sample(collectionNodesToTry) - } - - return &CollectionNodeIterator{ - nodes: nodes, - index: 0, - } -} - -var _ NodeSelector = (*ExecutionNodeIterator)(nil) - -type ExecutionNodeIterator struct { - nodes flow.IdentityList - index int -} - -func (e *ExecutionNodeIterator) Next() *flow.Identity { - if e.index < len(e.nodes) { - next := e.nodes[e.index] - e.index++ - return next - } - return nil -} - -var _ NodeSelector = (*CollectionNodeIterator)(nil) - -type CollectionNodeIterator struct { - nodes flow.IdentityList - index int -} - -func (c *CollectionNodeIterator) Next() *flow.Identity { - if c.index < len(c.nodes) { - next := c.nodes[c.index] - c.index++ - return next - } - return nil -} diff --git a/engine/access/rpc/backend/node_selector.go b/engine/access/rpc/backend/node_selector.go new file mode 100644 index 00000000000..48831337ba3 --- /dev/null +++ 
b/engine/access/rpc/backend/node_selector.go @@ -0,0 +1,86 @@ +package backend + +import "github.com/onflow/flow-go/model/flow" + +// maxExecutionNodesCnt is the maximum number of execution nodes that will be contacted to complete an execution API request. +const maxExecutionNodesCnt = 3 + +// maxCollectionNodesCnt is the maximum number of collection nodes that will be contacted to complete a collection API request. +const maxCollectionNodesCnt = 3 + +// NodeSelector is an interface that represents the ability to select node identities that the access node is trying to reach. +// It encapsulates the internal logic of node selection and provides a way to change implementations for different types +// of nodes. Implementations of this interface should define the Next method, which returns the next node identity to be +// selected. +type NodeSelector interface { + Next() *flow.Identity +} + +// NodeSelectorFactory is a factory for creating node selectors based on factory configuration and node type. +type NodeSelectorFactory struct { + circuitBreakerEnabled bool +} + +// SelectExecutionNodes selects the configured number of execution node identities from the provided list of execution nodes +// and returns an execution node selector to iterate through them. +func (n *NodeSelectorFactory) SelectExecutionNodes(executionNodes flow.IdentityList) NodeSelector { + // If the circuit breaker is disabled, the legacy logic should be used, which selects only a specified number of nodes. + if !n.circuitBreakerEnabled { + executionNodes = executionNodes.Sample(maxExecutionNodesCnt) + } + + return &ExecutionNodeSelector{ + nodes: executionNodes, + index: 0, + } +} + +// SelectCollectionNodes selects the configured number of collection node identities from the provided list of collection nodes +// and returns a collection node selector to iterate through them. 
+func (n *NodeSelectorFactory) SelectCollectionNodes(collectionNodes flow.IdentityList) NodeSelector { + // If the circuit breaker is disabled, the legacy logic should be used, which selects only a specified number of nodes. + if !n.circuitBreakerEnabled { + collectionNodes = collectionNodes.Sample(maxCollectionNodesCnt) + } + + return &CollectionNodeSelector{ + nodes: collectionNodes, + index: 0, + } +} + +var _ NodeSelector = (*ExecutionNodeSelector)(nil) + +// ExecutionNodeSelector is a specific implementation of an execution node selector. +type ExecutionNodeSelector struct { + nodes flow.IdentityList + index int +} + +// Next returns the next execution node in the selector. +func (e *ExecutionNodeSelector) Next() *flow.Identity { + if e.index < len(e.nodes) { + next := e.nodes[e.index] + e.index++ + return next + } + return nil +} + +var _ NodeSelector = (*CollectionNodeSelector)(nil) + +// CollectionNodeSelector is a specific implementation of a collection node selector. +type CollectionNodeSelector struct { + nodes flow.IdentityList + index int +} + +// Next returns the next collection node in the selector. 
+func (c *CollectionNodeSelector) Next() *flow.Identity { + if c.index < len(c.nodes) { + next := c.nodes[c.index] + c.index++ + return next + } + return nil +} From b045dd0c2291c15fe34b9391695aa460eb308ccd Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 22 Jun 2023 17:54:59 +0300 Subject: [PATCH 29/56] Added more comments --- engine/access/rpc/backend/backend.go | 1 + engine/access/rpc/backend/backend_scripts.go | 1 - .../access/rpc/backend/connection_factory.go | 49 ++++++++++++++++--- 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index 42a0fcd26a1..2da4be060c7 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -106,6 +106,7 @@ func New( log.Fatal().Err(err).Msg("failed to initialize script logging cache") } + // create configured node selection factory to be used in sub-backend logic nodeSelectorFactory := NodeSelectorFactory{circuitBreakerEnabled: circuitBreakerEnabled} b := &Backend{ diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index 3d251139531..46d247a12aa 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -103,7 +103,6 @@ func (b *backendScripts) findScriptExecutors( executorAddrs := make([]string, 0, len(executors)) execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(executors) - // try to get events from one of the execution nodes for executor := execNodeSelector.Next(); executor != nil; executor = execNodeSelector.Next() { executorAddrs = append(executorAddrs, executor.Address) } diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index cab254e879e..9d66a4caeba 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -291,7 +291,13 @@ func 
getGRPCAddress(address string, grpcPort uint) (string, error) { return grpcAddress, nil } -func (cf *ConnectionFactoryImpl) createClientInvalidationInterceptor(address string, clientType clientType) grpc.UnaryClientInterceptor { +// createClientInvalidationInterceptor creates a client interceptor for client invalidation. It should only be created +// if the circuit breaker is disabled. If the response from the server indicates an unavailable status, it invalidates +// the corresponding client. +func (cf *ConnectionFactoryImpl) createClientInvalidationInterceptor( + address string, + clientType clientType, +) grpc.UnaryClientInterceptor { if cf.CircuitBreakerConfig == nil || !cf.CircuitBreakerConfig.Enabled { clientInvalidationInterceptor := func( ctx context.Context, @@ -321,13 +327,33 @@ func (cf *ConnectionFactoryImpl) createClientInvalidationInterceptor(address str return nil } +// The simplified representation and description of circuit breaker pattern, that used to handle node connectivity: +// +// Circuit Open --> Circuit Half-Open --> Circuit Closed +// ^ | +// | | +// +--------------------------------------+ +// +// The "Circuit Open" state represents the circuit being open, indicating that the node is not available. +// This state is entered when the number of consecutive failures exceeds the maximum allowed failures. +// +// The "Circuit Half-Open" state represents the circuit transitioning from the open state to the half-open +// state after a configured restore timeout. In this state, the circuit allows a limited number of requests +// to test if the node has recovered. +// +// The "Circuit Closed" state represents the circuit being closed, indicating that the node is available. +// This state is initial or entered when the test requests in the half-open state succeed. + +// createCircuitBreakerInterceptor creates a client interceptor for circuit breaker functionality. It should only be +// created if the circuit breaker is enabled. 
All invocations will go through the circuit breaker to be tracked for +// success or failure of the call. func (cf *ConnectionFactoryImpl) createCircuitBreakerInterceptor() grpc.UnaryClientInterceptor { if cf.CircuitBreakerConfig != nil && cf.CircuitBreakerConfig.Enabled { circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{ - // here restore timeout defined to automatically return circuit breaker to HalfClose state + // The restore timeout is defined to automatically return the circuit breaker to the HalfClose state. Timeout: cf.CircuitBreakerConfig.RestoreTimeout, ReadyToTrip: func(counts gobreaker.Counts) bool { - // here number of maximum failures will be checked, before circuit breaker go to Open state + // The number of maximum failures is checked before the circuit breaker goes to the Open state. return counts.ConsecutiveFailures >= cf.CircuitBreakerConfig.MaxFailures }, }) @@ -341,7 +367,13 @@ func (cf *ConnectionFactoryImpl) createCircuitBreakerInterceptor() grpc.UnaryCli invoker grpc.UnaryInvoker, opts ...grpc.CallOption, ) error { - // The invoker should be called from circuit breaker execute, to catch each fails and react according to settings + // The circuit breaker integration occurs here, where all invoked calls to the node pass through the + // CircuitBreaker.Execute method. This method counts successful and failed invocations, and switches to the + // "StateOpen" when the maximum failure threshold is reached. When the circuit breaker is in the "StateOpen" + // it immediately rejects connections and returns without waiting for the call timeout. After the + // "RestoreTimeout" period elapses, the circuit breaker transitions to the "StateHalfOpen" and attempts the + // invocation again. If the invocation fails, it returns to the "StateOpen"; otherwise, it transitions to + // the "StateClosed" and handles invocations as usual. 
_, err := circuitBreaker.Execute(func() (interface{}, error) { err := invoker(ctx, method, req, reply, cc, opts...) @@ -356,6 +388,7 @@ func (cf *ConnectionFactoryImpl) createCircuitBreakerInterceptor() grpc.UnaryCli return nil } +// createClientTimeoutInterceptor creates a client interceptor with a context that expires after the timeout. func createClientTimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInterceptor { clientTimeoutInterceptor := func( ctx context.Context, @@ -366,11 +399,11 @@ func createClientTimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInter invoker grpc.UnaryInvoker, opts ...grpc.CallOption, ) error { - // create a context that expires after timeout + // Create a context that expires after the specified timeout. ctxWithTimeout, cancel := context.WithTimeout(ctx, timeout) - defer cancel() - // call the remote GRPC using the short context + + // Call the remote GRPC using the short context. err := invoker(ctxWithTimeout, method, req, reply, cc, opts...) return err @@ -379,6 +412,8 @@ func createClientTimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInter return clientTimeoutInterceptor } +// WithClientTimeoutOption is a helper function to create a GRPC dial option +// with the specified client timeout interceptor. 
func WithClientTimeoutOption(timeout time.Duration) grpc.DialOption { return grpc.WithUnaryInterceptor(createClientTimeoutInterceptor(timeout)) } From 0e8fd98f4a6dd49d053245ce0f710b5dbb3f8994 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 22 Jun 2023 21:13:31 +0300 Subject: [PATCH 30/56] Added comments for tests --- .../rpc/backend/connection_factory_test.go | 73 +++++++++++++------ .../access/access_circuit_breaker_test.go | 31 +++++--- 2 files changed, 68 insertions(+), 36 deletions(-) diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index c81201fc953..553948f0a93 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -408,140 +408,165 @@ func TestConnectionPoolStale(t *testing.T) { assert.Equal(t, resp, expected) } -// TestCircuitBreakerExecutionNode tests circuit breaker states changed for execution nodes +// TestCircuitBreakerExecutionNode tests the circuit breaker state changes for execution nodes. func TestCircuitBreakerExecutionNode(t *testing.T) { requestTimeout := 1 * time.Second circuitBreakerRestoreTimeout := 3 * time.Second - // create an execution node + + // Create an execution node for testing. en := new(executionNode) en.start(t) defer en.stop(t) - // setup the handler mock to not respond within the requestTimeout + // Set up the handler mock to not respond within the requestTimeout. req := &execution.PingRequest{} resp := &execution.PingResponse{} en.handler.On("Ping", testifymock.Anything, req).After(2*requestTimeout).Return(resp, nil) - // create the factory + + // Create the connection factory. connectionFactory := new(ConnectionFactoryImpl) - // set the execution grpc port + + // Set the execution gRPC port. connectionFactory.ExecutionGRPCPort = en.port - // set the execution grpc client requestTimeout + + // Set the execution gRPC client requestTimeout. 
connectionFactory.ExecutionNodeGRPCTimeout = requestTimeout - // set the configuration for circuit breaker + + // Set the configuration for the circuit breaker. connectionFactory.CircuitBreakerConfig = &CircuitBreakerConfig{ Enabled: true, MaxFailures: 1, RestoreTimeout: circuitBreakerRestoreTimeout, } - // set the connection pool cache size + + // Set the connection pool cache size. cacheSize := 1 cache, _ := lru.NewWithEvict(cacheSize, func(_, evictedValue interface{}) { evictedValue.(*CachedClient).Close() }) connectionFactory.ConnectionsCache = cache connectionFactory.CacheSize = uint(cacheSize) - // set metrics reporting + + // Set metrics reporting. connectionFactory.AccessMetrics = metrics.NewNoopCollector() - // create the execution API client + // Create the execution API client. client, _, err := connectionFactory.GetExecutionAPIClient(en.listener.Addr().String()) assert.NoError(t, err) ctx := context.Background() + + // Helper function to make the Ping call to the execution node and measure the duration. callAndMeasurePingDuration := func() (time.Duration, error) { start := time.Now() - // make the call to the execution node + // Make the call to the execution node. _, err = client.Ping(ctx, req) en.handler.AssertCalled(t, "Ping", testifymock.Anything, req) return time.Since(start), err } + // Call and measure the duration for the first invocation. duration, err := callAndMeasurePingDuration() assert.Equal(t, codes.DeadlineExceeded, status.Code(err)) assert.LessOrEqual(t, requestTimeout, duration) + // Call and measure the duration for the second invocation (circuit breaker state is now "Open"). 
duration, err = callAndMeasurePingDuration() assert.Equal(t, gobreaker.ErrOpenState, err) assert.Greater(t, requestTimeout, duration) + // Reset the mock Ping for the next invocation to return response without delay en.handler.On("Ping", testifymock.Anything, req).Unset() en.handler.On("Ping", testifymock.Anything, req).Return(resp, nil) - //Wait until Circuit breaker go to Half-open state + // Wait until the circuit breaker transitions to the "HalfOpen" state. time.Sleep(circuitBreakerRestoreTimeout + time.Second) + // Call and measure the duration for the third invocation (circuit breaker state is now "HalfOpen"). duration, err = callAndMeasurePingDuration() assert.Greater(t, requestTimeout, duration) assert.Equal(t, nil, err) } -// TestCircuitBreakerCollectionNode tests circuit breaker states changed for collection nodes +// TestCircuitBreakerCollectionNode tests the circuit breaker state changes for collection nodes. func TestCircuitBreakerCollectionNode(t *testing.T) { requestTimeout := 1 * time.Second circuitBreakerRestoreTimeout := 3 * time.Second - // create a collection node + + // Create a collection node for testing. cn := new(collectionNode) cn.start(t) defer cn.stop(t) - // set up the handler mock to not respond within the requestTimeout + // Set up the handler mock to not respond within the requestTimeout. req := &access.PingRequest{} resp := &access.PingResponse{} cn.handler.On("Ping", testifymock.Anything, req).After(2*requestTimeout).Return(resp, nil) - // create the factory + // Create the connection factory. connectionFactory := new(ConnectionFactoryImpl) - // set the collection grpc port + + // Set the collection gRPC port. connectionFactory.CollectionGRPCPort = cn.port - // set the collection grpc client requestTimeout + + // Set the collection gRPC client requestTimeout. connectionFactory.CollectionNodeGRPCTimeout = requestTimeout - // set the configuration for circuit breaker + + // Set the configuration for the circuit breaker. 
connectionFactory.CircuitBreakerConfig = &CircuitBreakerConfig{ Enabled: true, MaxFailures: 1, RestoreTimeout: circuitBreakerRestoreTimeout, } - // set the connection pool cache size + + // Set the connection pool cache size. cacheSize := 1 cache, _ := lru.NewWithEvict(cacheSize, func(_, evictedValue interface{}) { evictedValue.(*CachedClient).Close() }) connectionFactory.ConnectionsCache = cache connectionFactory.CacheSize = uint(cacheSize) - // set metrics reporting + + // Set metrics reporting. connectionFactory.AccessMetrics = metrics.NewNoopCollector() - // create the collection API client + // Create the collection API client. client, _, err := connectionFactory.GetAccessAPIClient(cn.listener.Addr().String()) assert.NoError(t, err) ctx := context.Background() + + // Helper function to make the Ping call to the collection node and measure the duration. callAndMeasurePingDuration := func() (time.Duration, error) { start := time.Now() - // make the call to the collection node + // Make the call to the collection node. _, err = client.Ping(ctx, req) cn.handler.AssertCalled(t, "Ping", testifymock.Anything, req) return time.Since(start), err } + // Call and measure the duration for the first invocation. duration, err := callAndMeasurePingDuration() assert.Equal(t, codes.DeadlineExceeded, status.Code(err)) assert.LessOrEqual(t, requestTimeout, duration) + // Call and measure the duration for the second invocation (circuit breaker state is now "Open"). duration, err = callAndMeasurePingDuration() assert.Equal(t, gobreaker.ErrOpenState, err) assert.Greater(t, requestTimeout, duration) + // Reset the mock Ping for the next invocation to return response without delay cn.handler.On("Ping", testifymock.Anything, req).Unset() cn.handler.On("Ping", testifymock.Anything, req).Return(resp, nil) - //Wait until Circuit breaker go to Half-open state + // Wait until the circuit breaker transitions to the "HalfOpen" state. 
time.Sleep(circuitBreakerRestoreTimeout + time.Second) + // Call and measure the duration for the third invocation (circuit breaker state is now "HalfOpen"). duration, err = callAndMeasurePingDuration() assert.Greater(t, requestTimeout, duration) assert.Equal(t, nil, err) diff --git a/integration/tests/access/access_circuit_breaker_test.go b/integration/tests/access/access_circuit_breaker_test.go index 07b9f98ddbf..d99c292465e 100644 --- a/integration/tests/access/access_circuit_breaker_test.go +++ b/integration/tests/access/access_circuit_breaker_test.go @@ -98,14 +98,20 @@ func (s *AccessCircuitBreakerSuite) SetupTest() { s.net.Start(s.ctx) } +// TestCircuitBreaker tests the behavior of the circuit breaker. It verifies the circuit breaker's ability to open, +// prevent further requests, and restore after a timeout. It is done in a few steps: +// 1. Get the collection node and disconnect it from the network. +// 2. Try to send a transaction multiple times to observe the decrease in waiting time for a failed response. +// 3. Connect the collection node to the network and wait for the circuit breaker restore time. +// 4. Successfully send a transaction. func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { ctx, cancel := context.WithCancel(s.ctx) defer cancel() - // 1. Get collection node + // 1. Get the collection node collectionContainer := s.net.ContainerByName("collection_1") - // 2. Get Access Node container and client + // 2. Get the Access Node container and client accessContainer := s.net.ContainerByName(testnet.PrimaryAN) // Check if access node was created with circuit breaker flags @@ -119,7 +125,7 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { latestBlockID, err := accessClient.GetLatestBlockID(ctx) require.NoError(s.T(), err) - // create new account to deploy Counter to + // Create a new account to deploy Counter to accountPrivateKey := lib.RandomPrivateKey() accountKey := sdk.NewAccountKey(). 
@@ -146,18 +152,17 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { SetPayer(serviceAddress). SetGasLimit(9999) - // sign transaction - + // Sign the transaction childCtx, cancel := context.WithTimeout(ctx, time.Second*10) signedTx, err := accessClient.SignTransaction(createAccountTx) require.NoError(s.T(), err) cancel() - // 3. Disconnect collection node from network to activate Circuit Breaker + // 3. Disconnect the collection node from the network to activate the Circuit Breaker err = collectionContainer.Disconnect() require.NoError(s.T(), err, "failed to pause connection node") - //4. Send couple transactions to proof circuit breaker opens correctly + // 4. Send a couple of transactions to test if the circuit breaker opens correctly sendTransaction := func(ctx context.Context, tx *sdk.Transaction) (time.Duration, error) { childCtx, cancel = context.WithTimeout(ctx, time.Second*10) start := time.Now() @@ -168,23 +173,25 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { return duration, err } - // try to send transaction first time. Should wait at least timeout time and return Unknown error + // Try to send the transaction for the first time. It should wait at least the timeout time and return Unknown error duration, err := sendTransaction(ctx, signedTx) assert.Equal(s.T(), codes.Unknown, status.Code(err)) assert.GreaterOrEqual(s.T(), requestTimeout, duration) - // try to send transaction second time. Should wait less then a second cause CB configured to break after firs fail + // Try to send the transaction for the second time. 
It should wait less than a second because the circuit breaker + // is configured to break after the first failure duration, err = sendTransaction(ctx, signedTx) assert.Equal(s.T(), codes.Unknown, status.Code(err)) assert.Greater(s.T(), time.Second, duration) - // connect again + // Reconnect the collection node err = collectionContainer.Connect() require.NoError(s.T(), err, "failed to start collection node") - // wait to restore circuit breaker + + // Wait for the circuit breaker to restore time.Sleep(cbRestoreTimeout) - // try to send transaction third time. Transaction should be send successful + // Try to send the transaction for the third time. The transaction should be sent successfully _, err = sendTransaction(ctx, signedTx) require.NoError(s.T(), err, "transaction should be sent") } From 60909308f8fb7462e071d3db2d11278ad3ee1fcb Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Sat, 24 Jun 2023 21:55:48 +0300 Subject: [PATCH 31/56] Apply suggestions from code review Co-authored-by: Peter Argue <89119817+peterargue@users.noreply.github.com> --- engine/access/rpc/backend/connection_factory.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 9d66a4caeba..701f1458b4f 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -113,14 +113,10 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D var connInterceptors []grpc.UnaryClientInterceptor - cbInterceptor := cf.createCircuitBreakerInterceptor() - if cbInterceptor != nil { - connInterceptors = append(connInterceptors, cbInterceptor) - } - - ciInterceptor := cf.createClientInvalidationInterceptor(address, clientType) - if ciInterceptor != nil { - connInterceptors = append(connInterceptors, ciInterceptor) + if cf.CircuitBreakerConfig.Enabled { + connInterceptors = append(connInterceptors, 
cf.createCircuitBreakerInterceptor()) + } else { + connInterceptors = append(connInterceptors, cf.createClientInvalidationInterceptor(address, clientType)) } connInterceptors = append(connInterceptors, createClientTimeoutInterceptor(timeout)) From 5d8f69d07c2a7308ae6bdcc943dd29b9e1f07bd6 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 4 Jul 2023 13:04:56 +0300 Subject: [PATCH 32/56] Added retries --- cmd/access/node_builder/access_node_builder.go | 6 +++++- engine/access/rpc/backend/connection_factory.go | 4 ++++ engine/access/rpc/backend/connection_factory_test.go | 2 ++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index d0b7696c7a1..84554dda002 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -165,6 +165,7 @@ func DefaultAccessNodeConfig() *AccessNodeConfig { Enabled: false, RestoreTimeout: 60 * time.Second, MaxFailures: 5, + MaxRetries: 1, }, }, stateStreamConf: state_stream.Config{ @@ -690,7 +691,7 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { flags.BoolVar(&builder.rpcConf.CircuitBreakerConfig.Enabled, "circuit-breaker-enabled", defaultConfig.rpcConf.CircuitBreakerConfig.Enabled, "specifies whether the circuit breaker is enabled for collection and execution API clients.") flags.DurationVar(&builder.rpcConf.CircuitBreakerConfig.RestoreTimeout, "circuit-breaker-restore-timeout", defaultConfig.rpcConf.CircuitBreakerConfig.RestoreTimeout, "duration after which the circuit breaker will restore the connection to the client after closing it due to failures. Default value is 60s") flags.Uint32Var(&builder.rpcConf.CircuitBreakerConfig.MaxFailures, "circuit-breaker-max-failures", defaultConfig.rpcConf.CircuitBreakerConfig.MaxFailures, "maximum number of failed calls to the client that will cause the circuit breaker to close the connection. 
Default value is 5") - + flags.Uint32Var(&builder.rpcConf.CircuitBreakerConfig.MaxRetries, "circuit-breaker-max-retries", defaultConfig.rpcConf.CircuitBreakerConfig.MaxRetries, "maximum number of retries call to check if connection restored after timeout. Default value is 1") // ExecutionDataRequester config flags.BoolVar(&builder.executionDataSyncEnabled, "execution-data-sync-enabled", defaultConfig.executionDataSyncEnabled, "whether to enable the execution data sync protocol") flags.StringVar(&builder.executionDataDir, "execution-data-dir", defaultConfig.executionDataDir, "directory to use for Execution Data database") @@ -758,6 +759,9 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { if builder.rpcConf.CircuitBreakerConfig.MaxFailures == 0 { return errors.New("circuit-breaker-max-failures must be greater than 0") } + if builder.rpcConf.CircuitBreakerConfig.MaxRetries == 0 { + return errors.New("circuit-breaker-max-retries must be greater than 0") + } } return nil diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 701f1458b4f..d9bca57aab8 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -84,10 +84,13 @@ type ConnectionFactoryImpl struct { // // MaxFailures specifies the maximum number of failed calls to the client that will cause the circuit breaker // to close the connection. +// +// MaxRetries specifies the maximum number of retries call to check if connection restored after timeout. type CircuitBreakerConfig struct { Enabled bool RestoreTimeout time.Duration MaxFailures uint32 + MaxRetries uint32 } type CachedClient struct { @@ -352,6 +355,7 @@ func (cf *ConnectionFactoryImpl) createCircuitBreakerInterceptor() grpc.UnaryCli // The number of maximum failures is checked before the circuit breaker goes to the Open state. 
return counts.ConsecutiveFailures >= cf.CircuitBreakerConfig.MaxFailures }, + MaxRequests: cf.CircuitBreakerConfig.MaxRetries, }) circuitBreakerInterceptor := func( diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index 553948f0a93..ff746b93a98 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -436,6 +436,7 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { connectionFactory.CircuitBreakerConfig = &CircuitBreakerConfig{ Enabled: true, MaxFailures: 1, + MaxRetries: 1, RestoreTimeout: circuitBreakerRestoreTimeout, } @@ -518,6 +519,7 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { connectionFactory.CircuitBreakerConfig = &CircuitBreakerConfig{ Enabled: true, MaxFailures: 1, + MaxRetries: 1, RestoreTimeout: circuitBreakerRestoreTimeout, } From f63d05903b4671bdd814ae5487bfcb36efe95db0 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Fri, 7 Jul 2023 15:03:34 +0300 Subject: [PATCH 33/56] Added node communicator. 
--- engine/access/rpc/backend/backend.go | 34 ++-- engine/access/rpc/backend/backend_accounts.go | 71 ++++---- engine/access/rpc/backend/backend_events.go | 74 ++++---- engine/access/rpc/backend/backend_test.go | 1 + .../rpc/backend/backend_transactions.go | 164 +++++++++--------- .../access/rpc/backend/connection_factory.go | 2 +- .../access/rpc/backend/node_communicator.go | 90 ++++++++++ 7 files changed, 265 insertions(+), 171 deletions(-) create mode 100644 engine/access/rpc/backend/node_communicator.go diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index 8cf6d246791..2c3fdcc7e0d 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -117,8 +117,8 @@ func New( archivePorts[idx] = port } - // create configured node selection factory to be used in sub-backend logic - nodeSelectorFactory := NodeSelectorFactory{circuitBreakerEnabled: circuitBreakerEnabled} + // create node communicator, that will be used in sub-backend logic for interacting with API calls + nodeCommunicator := NewNodeCommunicator(circuitBreakerEnabled) b := &Backend{ state: state, @@ -133,7 +133,7 @@ func New( loggedScripts: loggedScripts, archiveAddressList: archiveAddressList, archivePorts: archivePorts, - nodeSelectorFactory: nodeSelectorFactory, + nodeSelectorFactory: NodeSelectorFactory{circuitBreakerEnabled: circuitBreakerEnabled}, }, backendTransactions: backendTransactions{ staticCollectionRPC: collectionRPC, @@ -149,16 +149,16 @@ func New( connFactory: connFactory, previousAccessNodes: historicalAccessNodes, log: log, - nodeSelectorFactory: nodeSelectorFactory, + nodeCommunicator: nodeCommunicator, }, backendEvents: backendEvents{ - state: state, - headers: headers, - executionReceipts: executionReceipts, - connFactory: connFactory, - log: log, - maxHeightRange: maxHeightRange, - nodeSelectorFactory: nodeSelectorFactory, + state: state, + headers: headers, + executionReceipts: executionReceipts, + 
connFactory: connFactory, + log: log, + maxHeightRange: maxHeightRange, + nodeCommunicator: nodeCommunicator, }, backendBlockHeaders: backendBlockHeaders{ headers: headers, @@ -169,12 +169,12 @@ func New( state: state, }, backendAccounts: backendAccounts{ - state: state, - headers: headers, - executionReceipts: executionReceipts, - connFactory: connFactory, - log: log, - nodeSelectorFactory: nodeSelectorFactory, + state: state, + headers: headers, + executionReceipts: executionReceipts, + connFactory: connFactory, + log: log, + nodeCommunicator: nodeCommunicator, }, backendExecutionResults: backendExecutionResults{ executionResults: executionResults, diff --git a/engine/access/rpc/backend/backend_accounts.go b/engine/access/rpc/backend/backend_accounts.go index f3fd12c82f9..c1eaa1e98e2 100644 --- a/engine/access/rpc/backend/backend_accounts.go +++ b/engine/access/rpc/backend/backend_accounts.go @@ -4,7 +4,6 @@ import ( "context" "time" - "github.com/hashicorp/go-multierror" execproto "github.com/onflow/flow/protobuf/go/flow/execution" "github.com/rs/zerolog" "google.golang.org/grpc/codes" @@ -18,12 +17,12 @@ import ( ) type backendAccounts struct { - state protocol.State - headers storage.Headers - executionReceipts storage.ExecutionReceipts - connFactory ConnectionFactory - log zerolog.Logger - nodeSelectorFactory NodeSelectorFactory + state protocol.State + headers storage.Headers + executionReceipts storage.ExecutionReceipts + connFactory ConnectionFactory + log zerolog.Logger + nodeCommunicator *NodeCommunicator } func (b *backendAccounts) GetAccount(ctx context.Context, address flow.Address) (*flow.Account, error) { @@ -108,37 +107,39 @@ func (b *backendAccounts) getAccountAtBlockID( // other ENs are logged and swallowed. If all ENs fail to return a valid response, then an // error aggregating all failures is returned. 
func (b *backendAccounts) getAccountFromAnyExeNode(ctx context.Context, execNodes flow.IdentityList, req *execproto.GetAccountAtBlockIDRequest) (*execproto.GetAccountAtBlockIDResponse, error) { - var errors *multierror.Error - - execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(execNodes) - - for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { - // TODO: use the GRPC Client interceptor - start := time.Now() - - resp, err := b.tryGetAccount(ctx, execNode, req) - duration := time.Since(start) - if err == nil { - // return if any execution node replied successfully - b.log.Debug(). - Str("execution_node", execNode.String()). + var resp *execproto.GetAccountAtBlockIDResponse + errToReturn := b.nodeCommunicator.CallAvailableExecutionNode( + execNodes, + func(node *flow.Identity) error { + var err error + // TODO: use the GRPC Client interceptor + start := time.Now() + + resp, err = b.tryGetAccount(ctx, node, req) + duration := time.Since(start) + if err == nil { + // return if any execution node replied successfully + b.log.Debug(). + Str("execution_node", node.String()). + Hex("block_id", req.GetBlockId()). + Hex("address", req.GetAddress()). + Int64("rtt_ms", duration.Milliseconds()). + Msg("Successfully got account info") + return nil + } + b.log.Error(). + Str("execution_node", node.String()). Hex("block_id", req.GetBlockId()). Hex("address", req.GetAddress()). Int64("rtt_ms", duration.Milliseconds()). - Msg("Successfully got account info") - return resp, nil - } - b.log.Error(). - Str("execution_node", execNode.String()). - Hex("block_id", req.GetBlockId()). - Hex("address", req.GetAddress()). - Int64("rtt_ms", duration.Milliseconds()). - Err(err). - Msg("failed to execute GetAccount") - errors = multierror.Append(errors, err) - } - - return nil, errors.ErrorOrNil() + Err(err). 
+ Msg("failed to execute GetAccount") + return err + }, + nil, + ) + + return resp, errToReturn } func (b *backendAccounts) tryGetAccount(ctx context.Context, execNode *flow.Identity, req *execproto.GetAccountAtBlockIDRequest) (*execproto.GetAccountAtBlockIDResponse, error) { diff --git a/engine/access/rpc/backend/backend_events.go b/engine/access/rpc/backend/backend_events.go index b82abf0b8c1..38bd723a077 100644 --- a/engine/access/rpc/backend/backend_events.go +++ b/engine/access/rpc/backend/backend_events.go @@ -7,7 +7,6 @@ import ( "fmt" "time" - "github.com/hashicorp/go-multierror" execproto "github.com/onflow/flow/protobuf/go/flow/execution" "github.com/rs/zerolog" "google.golang.org/grpc/codes" @@ -21,13 +20,13 @@ import ( ) type backendEvents struct { - headers storage.Headers - executionReceipts storage.ExecutionReceipts - state protocol.State - connFactory ConnectionFactory - log zerolog.Logger - maxHeightRange uint - nodeSelectorFactory NodeSelectorFactory + headers storage.Headers + executionReceipts storage.ExecutionReceipts + state protocol.State + connFactory ConnectionFactory + log zerolog.Logger + maxHeightRange uint + nodeCommunicator *NodeCommunicator } // GetEventsForHeightRange retrieves events for all sealed blocks between the start block height and @@ -210,34 +209,37 @@ func verifyAndConvertToAccessEvents( func (b *backendEvents) getEventsFromAnyExeNode(ctx context.Context, execNodes flow.IdentityList, req *execproto.GetEventsForBlockIDsRequest) (*execproto.GetEventsForBlockIDsResponse, *flow.Identity, error) { - var errors *multierror.Error - - execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(execNodes) - - // try to get events from one of the execution nodes - for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { - start := time.Now() - resp, err := b.tryGetEvents(ctx, execNode, req) - duration := time.Since(start) - - logger := b.log.With(). - Str("execution_node", execNode.String()). 
- Str("event", req.GetType()). - Int("blocks", len(req.BlockIds)). - Int64("rtt_ms", duration.Milliseconds()). - Logger() - - if err == nil { - // return if any execution node replied successfully - logger.Debug().Msg("Successfully got events") - return resp, execNode, nil - } - - logger.Err(err).Msg("failed to execute GetEvents") - - errors = multierror.Append(errors, err) - } - return nil, nil, errors.ErrorOrNil() + var resp *execproto.GetEventsForBlockIDsResponse + var execNode *flow.Identity + errToReturn := b.nodeCommunicator.CallAvailableExecutionNode( + execNodes, + func(node *flow.Identity) error { + var err error + start := time.Now() + resp, err = b.tryGetEvents(ctx, node, req) + duration := time.Since(start) + + logger := b.log.With(). + Str("execution_node", node.String()). + Str("event", req.GetType()). + Int("blocks", len(req.BlockIds)). + Int64("rtt_ms", duration.Milliseconds()). + Logger() + + if err == nil { + // return if any execution node replied successfully + logger.Debug().Msg("Successfully got events") + execNode = node + return nil + } + + logger.Err(err).Msg("failed to execute GetEvents") + return err + }, + nil, + ) + + return resp, execNode, errToReturn } func (b *backendEvents) tryGetEvents(ctx context.Context, diff --git a/engine/access/rpc/backend/backend_test.go b/engine/access/rpc/backend/backend_test.go index 493d5be069e..be1024ec4bd 100644 --- a/engine/access/rpc/backend/backend_test.go +++ b/engine/access/rpc/backend/backend_test.go @@ -2372,6 +2372,7 @@ func (suite *Suite) TestExecuteScriptOnArchiveNode() { suite.log, DefaultSnapshotHistoryLimit, []string{fullArchiveAddress}, + false, ) // mock parameters diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index de5cc4f878c..b2abff13eb1 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -6,7 +6,6 @@ import ( "fmt" "time" - 
"github.com/hashicorp/go-multierror" accessproto "github.com/onflow/flow/protobuf/go/flow/access" "github.com/onflow/flow/protobuf/go/flow/entities" execproto "github.com/onflow/flow/protobuf/go/flow/execution" @@ -38,7 +37,7 @@ type backendTransactions struct { connFactory ConnectionFactory previousAccessNodes []accessproto.AccessAPIClient log zerolog.Logger - nodeSelectorFactory NodeSelectorFactory + nodeCommunicator *NodeCommunicator } // SendTransaction forwards the transaction to the collection node @@ -89,27 +88,24 @@ func (b *backendTransactions) trySendTransaction(ctx context.Context, tx *flow.T return fmt.Errorf("failed to determine collection node for tx %x: %w", tx, err) } - var sendErrors *multierror.Error + var sendError error logAnyError := func() { - err = sendErrors.ErrorOrNil() - if err != nil { + if sendError != nil { b.log.Info().Err(err).Msg("failed to send transactions to collector nodes") } } defer logAnyError() - collNodeSelector := b.nodeSelectorFactory.SelectCollectionNodes(collNodes) - // try sending the transaction to one of the chosen collection nodes - for colNode := collNodeSelector.Next(); colNode != nil; colNode = collNodeSelector.Next() { - err = b.sendTransactionToCollector(ctx, tx, colNode.Address) - if err == nil { - return nil + sendError = b.nodeCommunicator.CallAvailableConnectionNode(collNodes, func(node *flow.Identity) error { + err = b.sendTransactionToCollector(ctx, tx, node.Address) + if err != nil { + return err } - sendErrors = multierror.Append(sendErrors, err) - } + return nil + }) - return sendErrors.ErrorOrNil() + return sendError } // chooseCollectionNodes finds a random subset of size sampleSize of collection node addresses from the @@ -768,35 +764,36 @@ func (b *backendTransactions) getTransactionResultFromAnyExeNode( execNodes flow.IdentityList, req *execproto.GetTransactionResultRequest, ) (*execproto.GetTransactionResultResponse, error) { - var errs *multierror.Error - logAnyError := func() { - errToReturn := 
errs.ErrorOrNil() + var errToReturn error + + defer func() { if errToReturn != nil { b.log.Info().Err(errToReturn).Msg("failed to get transaction result from execution nodes") } - } - defer logAnyError() - - execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(execNodes) + }() - // try to execute the script on one of the execution nodes - for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { - resp, err := b.tryGetTransactionResult(ctx, execNode, req) - if err == nil { - b.log.Debug(). - Str("execution_node", execNode.String()). - Hex("block_id", req.GetBlockId()). - Hex("transaction_id", req.GetTransactionId()). - Msg("Successfully got transaction results from any node") - return resp, nil - } - if status.Code(err) == codes.NotFound { - return nil, err - } - errs = multierror.Append(errs, err) - } + var resp *execproto.GetTransactionResultResponse + errToReturn = b.nodeCommunicator.CallAvailableExecutionNode( + execNodes, + func(node *flow.Identity) error { + var err error + resp, err = b.tryGetTransactionResult(ctx, node, req) + if err == nil { + b.log.Debug(). + Str("execution_node", node.String()). + Hex("block_id", req.GetBlockId()). + Hex("transaction_id", req.GetTransactionId()). 
+ Msg("Successfully got transaction results from any node") + return nil + } + return err + }, + func(err error) bool { + return status.Code(err) == codes.NotFound + }, + ) - return nil, errs.ErrorOrNil() + return resp, errToReturn } func (b *backendTransactions) tryGetTransactionResult( @@ -823,12 +820,12 @@ func (b *backendTransactions) getTransactionResultsByBlockIDFromAnyExeNode( execNodes flow.IdentityList, req *execproto.GetTransactionsByBlockIDRequest, ) (*execproto.GetTransactionResultsResponse, error) { - var errs *multierror.Error + var errToReturn error defer func() { // log the errors - if err := errs.ErrorOrNil(); err != nil { - b.log.Err(errs).Msg("failed to get transaction results from execution nodes") + if errToReturn != nil { + b.log.Err(errToReturn).Msg("failed to get transaction results from execution nodes") } }() @@ -837,24 +834,27 @@ func (b *backendTransactions) getTransactionResultsByBlockIDFromAnyExeNode( return nil, errors.New("zero execution nodes") } - execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(execNodes) - - for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { - resp, err := b.tryGetTransactionResultsByBlockID(ctx, execNode, req) - if err == nil { - b.log.Debug(). - Str("execution_node", execNode.String()). - Hex("block_id", req.GetBlockId()). - Msg("Successfully got transaction results from any node") - return resp, nil - } - if status.Code(err) == codes.NotFound { - return nil, err - } - errs = multierror.Append(errs, err) - } + var resp *execproto.GetTransactionResultsResponse + errToReturn = b.nodeCommunicator.CallAvailableExecutionNode( + execNodes, + func(node *flow.Identity) error { + var err error + resp, err = b.tryGetTransactionResultsByBlockID(ctx, node, req) + if err == nil { + b.log.Debug(). + Str("execution_node", node.String()). + Hex("block_id", req.GetBlockId()). 
+ Msg("Successfully got transaction results from any node") + return nil + } + return err + }, + func(err error) bool { + return status.Code(err) == codes.NotFound + }, + ) - return nil, errs.ErrorOrNil() + return resp, errToReturn } func (b *backendTransactions) tryGetTransactionResultsByBlockID( @@ -881,39 +881,39 @@ func (b *backendTransactions) getTransactionResultByIndexFromAnyExeNode( execNodes flow.IdentityList, req *execproto.GetTransactionByIndexRequest, ) (*execproto.GetTransactionResultResponse, error) { - var errs *multierror.Error - logAnyError := func() { - errToReturn := errs.ErrorOrNil() + var errToReturn error + defer func() { if errToReturn != nil { b.log.Info().Err(errToReturn).Msg("failed to get transaction result from execution nodes") } - } - defer logAnyError() + }() if len(execNodes) == 0 { return nil, errors.New("zero execution nodes provided") } - execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(execNodes) - - // try to execute the script on one of the execution nodes - for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { - resp, err := b.tryGetTransactionResultByIndex(ctx, execNode, req) - if err == nil { - b.log.Debug(). - Str("execution_node", execNode.String()). - Hex("block_id", req.GetBlockId()). - Uint32("index", req.GetIndex()). - Msg("Successfully got transaction results from any node") - return resp, nil - } - if status.Code(err) == codes.NotFound { - return nil, err - } - errs = multierror.Append(errs, err) - } + var resp *execproto.GetTransactionResultResponse + errToReturn = b.nodeCommunicator.CallAvailableExecutionNode( + execNodes, + func(node *flow.Identity) error { + var err error + resp, err = b.tryGetTransactionResultByIndex(ctx, node, req) + if err == nil { + b.log.Debug(). + Str("execution_node", node.String()). + Hex("block_id", req.GetBlockId()). + Uint32("index", req.GetIndex()). 
+ Msg("Successfully got transaction results from any node") + return nil + } + return err + }, + func(err error) bool { + return status.Code(err) == codes.NotFound + }, + ) - return nil, errs.ErrorOrNil() + return resp, errToReturn } func (b *backendTransactions) tryGetTransactionResultByIndex( diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 93b74dc7989..3f459f4d674 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -116,7 +116,7 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D var connInterceptors []grpc.UnaryClientInterceptor - if cf.CircuitBreakerConfig.Enabled { + if cf.CircuitBreakerConfig != nil && cf.CircuitBreakerConfig.Enabled { connInterceptors = append(connInterceptors, cf.createCircuitBreakerInterceptor()) } else { connInterceptors = append(connInterceptors, cf.createClientInvalidationInterceptor(address, clientType)) diff --git a/engine/access/rpc/backend/node_communicator.go b/engine/access/rpc/backend/node_communicator.go new file mode 100644 index 00000000000..70932764155 --- /dev/null +++ b/engine/access/rpc/backend/node_communicator.go @@ -0,0 +1,90 @@ +package backend + +import ( + "github.com/hashicorp/go-multierror" + "github.com/sony/gobreaker" + + "github.com/onflow/flow-go/model/flow" +) + +// maxFailedRequestCount represents the maximum number of failed requests before returning errors. +const maxFailedRequestCount = 3 + +// NodeCommunicator is responsible for calling available nodes in the backend. +type NodeCommunicator struct { + circuitBreakerEnabled bool + nodeSelectorFactory NodeSelectorFactory +} + +// NewNodeCommunicator creates a new instance of NodeCommunicator. 
+func NewNodeCommunicator(circuitBreakerEnabled bool) *NodeCommunicator { + return &NodeCommunicator{ + circuitBreakerEnabled: circuitBreakerEnabled, + nodeSelectorFactory: NodeSelectorFactory{circuitBreakerEnabled: circuitBreakerEnabled}, + } +} + +// CallAvailableExecutionNode calls the provided function on the available execution nodes. +// It iterates through the execution nodes and executes the function. +// If an error occurs, it applies the custom error handler (if provided) and keeps track of the errors. +// If the error occurs in circuit breaker, it continues to the next execution node. +// If the maximum failed request count is reached, it returns the accumulated errors. +func (b *NodeCommunicator) CallAvailableExecutionNode( + nodes flow.IdentityList, + call func(node *flow.Identity) error, + customErrorHandler func(err error) bool, +) error { + var errs *multierror.Error + execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(nodes) + + for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { + err := call(execNode) + if err == nil { + return nil + } + + if customErrorHandler != nil && customErrorHandler(err) { + return err + } + + if err == gobreaker.ErrOpenState { + continue + } + + errs = multierror.Append(errs, err) + if len(errs.Errors) >= maxFailedRequestCount { + return errs.ErrorOrNil() + } + } + + return errs.ErrorOrNil() +} + +// CallAvailableConnectionNode calls the provided function on the available connection nodes. +// It iterates through the connection nodes and executes the function. +// If an error occurs, it keeps track of the errors. +// If the error occurs in circuit breaker, it continues to the next execution node. +// If the maximum failed request count is reached, it returns the accumulated errors. 
+func (b *NodeCommunicator) CallAvailableConnectionNode(nodes flow.IdentityList, call func(node *flow.Identity) error) error { + var errs *multierror.Error + + collNodeSelector := b.nodeSelectorFactory.SelectCollectionNodes(nodes) + + for colNode := collNodeSelector.Next(); colNode != nil; colNode = collNodeSelector.Next() { + err := call(colNode) + if err == nil { + return nil + } + + if err == gobreaker.ErrOpenState { + continue + } + + errs = multierror.Append(errs, err) + if len(errs.Errors) >= maxFailedRequestCount { + return errs.ErrorOrNil() + } + } + + return errs.ErrorOrNil() +} From c55bb5acb66c5accf10fbc74ef37f1eded93b734 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Fri, 7 Jul 2023 15:16:45 +0300 Subject: [PATCH 34/56] Rename MaxRetries to MaxRequest --- cmd/access/node_builder/access_node_builder.go | 8 ++++---- engine/access/rpc/backend/connection_factory.go | 6 +++--- engine/access/rpc/backend/connection_factory_test.go | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index 672969e66b4..1400bf46873 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -164,7 +164,7 @@ func DefaultAccessNodeConfig() *AccessNodeConfig { Enabled: false, RestoreTimeout: 60 * time.Second, MaxFailures: 5, - MaxRetries: 1, + MaxRequests: 1, }, }, stateStreamConf: state_stream.Config{ @@ -690,7 +690,7 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { flags.BoolVar(&builder.rpcConf.CircuitBreakerConfig.Enabled, "circuit-breaker-enabled", defaultConfig.rpcConf.CircuitBreakerConfig.Enabled, "specifies whether the circuit breaker is enabled for collection and execution API clients.") flags.DurationVar(&builder.rpcConf.CircuitBreakerConfig.RestoreTimeout, "circuit-breaker-restore-timeout", defaultConfig.rpcConf.CircuitBreakerConfig.RestoreTimeout, "duration after which the circuit breaker 
will restore the connection to the client after closing it due to failures. Default value is 60s") flags.Uint32Var(&builder.rpcConf.CircuitBreakerConfig.MaxFailures, "circuit-breaker-max-failures", defaultConfig.rpcConf.CircuitBreakerConfig.MaxFailures, "maximum number of failed calls to the client that will cause the circuit breaker to close the connection. Default value is 5") - flags.Uint32Var(&builder.rpcConf.CircuitBreakerConfig.MaxRetries, "circuit-breaker-max-retries", defaultConfig.rpcConf.CircuitBreakerConfig.MaxRetries, "maximum number of retries call to check if connection restored after timeout. Default value is 1") + flags.Uint32Var(&builder.rpcConf.CircuitBreakerConfig.MaxRequests, "circuit-breaker-max-requests", defaultConfig.rpcConf.CircuitBreakerConfig.MaxRequests, "maximum number of requests to check if connection restored after timeout. Default value is 1") // ExecutionDataRequester config flags.BoolVar(&builder.executionDataSyncEnabled, "execution-data-sync-enabled", defaultConfig.executionDataSyncEnabled, "whether to enable the execution data sync protocol") flags.StringVar(&builder.executionDataDir, "execution-data-dir", defaultConfig.executionDataDir, "directory to use for Execution Data database") @@ -758,8 +758,8 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { if builder.rpcConf.CircuitBreakerConfig.MaxFailures == 0 { return errors.New("circuit-breaker-max-failures must be greater than 0") } - if builder.rpcConf.CircuitBreakerConfig.MaxRetries == 0 { - return errors.New("circuit-breaker-max-retries must be greater than 0") + if builder.rpcConf.CircuitBreakerConfig.MaxRequests == 0 { + return errors.New("circuit-breaker-max-requests must be greater than 0") } } diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 3f459f4d674..98e84ddffe2 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -85,12 +85,12 @@ 
type ConnectionFactoryImpl struct { // MaxFailures specifies the maximum number of failed calls to the client that will cause the circuit breaker // to close the connection. // -// MaxRetries specifies the maximum number of retries call to check if connection restored after timeout. +// MaxRequests specifies the maximum number of requests to check if connection restored after timeout. type CircuitBreakerConfig struct { Enabled bool RestoreTimeout time.Duration MaxFailures uint32 - MaxRetries uint32 + MaxRequests uint32 } type CachedClient struct { @@ -359,7 +359,7 @@ func (cf *ConnectionFactoryImpl) createCircuitBreakerInterceptor() grpc.UnaryCli // The number of maximum failures is checked before the circuit breaker goes to the Open state. return counts.ConsecutiveFailures >= cf.CircuitBreakerConfig.MaxFailures }, - MaxRequests: cf.CircuitBreakerConfig.MaxRetries, + MaxRequests: cf.CircuitBreakerConfig.MaxRequests, }) circuitBreakerInterceptor := func( diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index ff746b93a98..73a0be84f84 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -436,7 +436,7 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { connectionFactory.CircuitBreakerConfig = &CircuitBreakerConfig{ Enabled: true, MaxFailures: 1, - MaxRetries: 1, + MaxRequests: 1, RestoreTimeout: circuitBreakerRestoreTimeout, } @@ -519,7 +519,7 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { connectionFactory.CircuitBreakerConfig = &CircuitBreakerConfig{ Enabled: true, MaxFailures: 1, - MaxRetries: 1, + MaxRequests: 1, RestoreTimeout: circuitBreakerRestoreTimeout, } From 0ce5a84e18d0e358827e6c4ae6cee1db9564b605 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Fri, 7 Jul 2023 21:00:15 +0300 Subject: [PATCH 35/56] fixed mistakes and typos --- engine/access/rpc/backend/backend_transactions.go | 2 +- 
engine/access/rpc/backend/node_communicator.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index b2abff13eb1..45e5b6eb1ef 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -97,7 +97,7 @@ func (b *backendTransactions) trySendTransaction(ctx context.Context, tx *flow.T defer logAnyError() // try sending the transaction to one of the chosen collection nodes - sendError = b.nodeCommunicator.CallAvailableConnectionNode(collNodes, func(node *flow.Identity) error { + sendError = b.nodeCommunicator.CallAvailableCollectionNode(collNodes, func(node *flow.Identity) error { err = b.sendTransactionToCollector(ctx, tx, node.Address) if err != nil { return err diff --git a/engine/access/rpc/backend/node_communicator.go b/engine/access/rpc/backend/node_communicator.go index 70932764155..00963457d86 100644 --- a/engine/access/rpc/backend/node_communicator.go +++ b/engine/access/rpc/backend/node_communicator.go @@ -60,12 +60,12 @@ func (b *NodeCommunicator) CallAvailableExecutionNode( return errs.ErrorOrNil() } -// CallAvailableConnectionNode calls the provided function on the available connection nodes. -// It iterates through the connection nodes and executes the function. +// CallAvailableCollectionNode calls the provided function on the available collection nodes. +// It iterates through the collection nodes and executes the function. // If an error occurs, it keeps track of the errors. -// If the error occurs in circuit breaker, it continues to the next execution node. +// If the error occurs in circuit breaker, it continues to the next collection node. // If the maximum failed request count is reached, it returns the accumulated errors. 
-func (b *NodeCommunicator) CallAvailableConnectionNode(nodes flow.IdentityList, call func(node *flow.Identity) error) error { +func (b *NodeCommunicator) CallAvailableCollectionNode(nodes flow.IdentityList, call func(node *flow.Identity) error) error { var errs *multierror.Error collNodeSelector := b.nodeSelectorFactory.SelectCollectionNodes(nodes) From dbe364cf52761229f5181e859d07c4469101afe7 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 18 Jul 2023 16:19:25 +0300 Subject: [PATCH 36/56] Change config from pointer to value --- cmd/access/node_builder/access_node_builder.go | 2 +- engine/access/rpc/backend/connection_factory.go | 8 ++++---- engine/access/rpc/backend/connection_factory_test.go | 4 ++-- engine/access/rpc/engine.go | 9 ++------- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index 1400bf46873..a3bf23df03c 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -160,7 +160,7 @@ func DefaultAccessNodeConfig() *AccessNodeConfig { FixedExecutionNodeIDs: nil, ArchiveAddressList: nil, MaxMsgSize: grpcutils.DefaultMaxMsgSize, - CircuitBreakerConfig: &backend.CircuitBreakerConfig{ + CircuitBreakerConfig: backend.CircuitBreakerConfig{ Enabled: false, RestoreTimeout: 60 * time.Second, MaxFailures: 5, diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 98e84ddffe2..b5013d4550d 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -72,7 +72,7 @@ type ConnectionFactoryImpl struct { AccessMetrics module.AccessMetrics Log zerolog.Logger mutex sync.Mutex - CircuitBreakerConfig *CircuitBreakerConfig + CircuitBreakerConfig CircuitBreakerConfig } // CircuitBreakerConfig is a configuration struct for the circuit breaker. 
@@ -116,7 +116,7 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D var connInterceptors []grpc.UnaryClientInterceptor - if cf.CircuitBreakerConfig != nil && cf.CircuitBreakerConfig.Enabled { + if cf.CircuitBreakerConfig.Enabled { connInterceptors = append(connInterceptors, cf.createCircuitBreakerInterceptor()) } else { connInterceptors = append(connInterceptors, cf.createClientInvalidationInterceptor(address, clientType)) @@ -301,7 +301,7 @@ func (cf *ConnectionFactoryImpl) createClientInvalidationInterceptor( address string, clientType clientType, ) grpc.UnaryClientInterceptor { - if cf.CircuitBreakerConfig == nil || !cf.CircuitBreakerConfig.Enabled { + if !cf.CircuitBreakerConfig.Enabled { clientInvalidationInterceptor := func( ctx context.Context, method string, @@ -351,7 +351,7 @@ func (cf *ConnectionFactoryImpl) createClientInvalidationInterceptor( // created if the circuit breaker is enabled. All invocations will go through the circuit breaker to be tracked for // success or failure of the call. func (cf *ConnectionFactoryImpl) createCircuitBreakerInterceptor() grpc.UnaryClientInterceptor { - if cf.CircuitBreakerConfig != nil && cf.CircuitBreakerConfig.Enabled { + if cf.CircuitBreakerConfig.Enabled { circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{ // The restore timeout is defined to automatically return the circuit breaker to the HalfClose state. Timeout: cf.CircuitBreakerConfig.RestoreTimeout, diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index 73a0be84f84..e9f6f1a4819 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -433,7 +433,7 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { connectionFactory.ExecutionNodeGRPCTimeout = requestTimeout // Set the configuration for the circuit breaker. 
- connectionFactory.CircuitBreakerConfig = &CircuitBreakerConfig{ + connectionFactory.CircuitBreakerConfig = CircuitBreakerConfig{ Enabled: true, MaxFailures: 1, MaxRequests: 1, @@ -516,7 +516,7 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { connectionFactory.CollectionNodeGRPCTimeout = requestTimeout // Set the configuration for the circuit breaker. - connectionFactory.CircuitBreakerConfig = &CircuitBreakerConfig{ + connectionFactory.CircuitBreakerConfig = CircuitBreakerConfig{ Enabled: true, MaxFailures: 1, MaxRequests: 1, diff --git a/engine/access/rpc/engine.go b/engine/access/rpc/engine.go index 7f9a62809c8..c4b36c263b6 100644 --- a/engine/access/rpc/engine.go +++ b/engine/access/rpc/engine.go @@ -48,7 +48,7 @@ type Config struct { PreferredExecutionNodeIDs []string // preferred list of upstream execution node IDs FixedExecutionNodeIDs []string // fixed list of execution node IDs to choose from if no node node ID can be chosen from the PreferredExecutionNodeIDs ArchiveAddressList []string // the archive node address list to send script executions. when configured, script executions will be all sent to the archive node - CircuitBreakerConfig *backend.CircuitBreakerConfig // the configuration for circuit breaker + CircuitBreakerConfig backend.CircuitBreakerConfig // the configuration for circuit breaker } // Engine exposes the server with a simplified version of the Access API. 
@@ -175,11 +175,6 @@ func NewBuilder(log zerolog.Logger, CircuitBreakerConfig: config.CircuitBreakerConfig, } - circuitBreakerEnabled := false - if config.CircuitBreakerConfig != nil { - circuitBreakerEnabled = config.CircuitBreakerConfig.Enabled - } - backend := backend.New(state, collectionRPC, historicalAccessNodes, @@ -199,7 +194,7 @@ func NewBuilder(log zerolog.Logger, log, backend.DefaultSnapshotHistoryLimit, config.ArchiveAddressList, - circuitBreakerEnabled, + config.CircuitBreakerConfig.Enabled, ) finalizedCache, finalizedCacheWorker, err := events.NewFinalizedHeaderCache(state) From 1b2e6dfe88c7858186cf73eadf61fa7630ca52a5 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 18 Jul 2023 16:20:17 +0300 Subject: [PATCH 37/56] Removed unused field --- engine/access/rpc/backend/node_communicator.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/engine/access/rpc/backend/node_communicator.go b/engine/access/rpc/backend/node_communicator.go index 00963457d86..08852180eba 100644 --- a/engine/access/rpc/backend/node_communicator.go +++ b/engine/access/rpc/backend/node_communicator.go @@ -12,15 +12,13 @@ const maxFailedRequestCount = 3 // NodeCommunicator is responsible for calling available nodes in the backend. type NodeCommunicator struct { - circuitBreakerEnabled bool - nodeSelectorFactory NodeSelectorFactory + nodeSelectorFactory NodeSelectorFactory } // NewNodeCommunicator creates a new instance of NodeCommunicator. 
func NewNodeCommunicator(circuitBreakerEnabled bool) *NodeCommunicator { return &NodeCommunicator{ - circuitBreakerEnabled: circuitBreakerEnabled, - nodeSelectorFactory: NodeSelectorFactory{circuitBreakerEnabled: circuitBreakerEnabled}, + nodeSelectorFactory: NodeSelectorFactory{circuitBreakerEnabled: circuitBreakerEnabled}, } } From da69a05c7e7ad90be1cf32daa94380e501a5f37e Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 18 Jul 2023 17:37:31 +0300 Subject: [PATCH 38/56] Fixed remarks --- engine/access/rpc/backend/backend.go | 2 +- engine/access/rpc/backend/backend_accounts.go | 2 +- engine/access/rpc/backend/backend_events.go | 2 +- engine/access/rpc/backend/backend_scripts.go | 2 +- engine/access/rpc/backend/backend_test.go | 14 ++--- .../rpc/backend/backend_transactions.go | 24 +++++--- .../access/rpc/backend/connection_factory.go | 11 +++- .../access/rpc/backend/node_communicator.go | 59 ++++++------------ engine/access/rpc/backend/node_selector.go | 61 +++++-------------- 9 files changed, 68 insertions(+), 109 deletions(-) diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index 2c3fdcc7e0d..d0e88b255a8 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -302,7 +302,7 @@ func (b *Backend) GetLatestProtocolStateSnapshot(_ context.Context) ([]byte, err return convert.SnapshotToBytes(validSnapshot) } -// executionNodesForBlockID returns upto maxExecutionNodesCnt number of randomly chosen execution node identities +// executionNodesForBlockID returns up to maxNodesCnt number of randomly chosen execution node identities // which have executed the given block ID. // If no such execution node is found, an InsufficientExecutionReceipts error is returned.
func executionNodesForBlockID( diff --git a/engine/access/rpc/backend/backend_accounts.go b/engine/access/rpc/backend/backend_accounts.go index c1eaa1e98e2..44862c8c2aa 100644 --- a/engine/access/rpc/backend/backend_accounts.go +++ b/engine/access/rpc/backend/backend_accounts.go @@ -108,7 +108,7 @@ func (b *backendAccounts) getAccountAtBlockID( // error aggregating all failures is returned. func (b *backendAccounts) getAccountFromAnyExeNode(ctx context.Context, execNodes flow.IdentityList, req *execproto.GetAccountAtBlockIDRequest) (*execproto.GetAccountAtBlockIDResponse, error) { var resp *execproto.GetAccountAtBlockIDResponse - errToReturn := b.nodeCommunicator.CallAvailableExecutionNode( + errToReturn := b.nodeCommunicator.CallAvailableNode( execNodes, func(node *flow.Identity) error { var err error diff --git a/engine/access/rpc/backend/backend_events.go b/engine/access/rpc/backend/backend_events.go index 38bd723a077..5bae9d633f1 100644 --- a/engine/access/rpc/backend/backend_events.go +++ b/engine/access/rpc/backend/backend_events.go @@ -211,7 +211,7 @@ func (b *backendEvents) getEventsFromAnyExeNode(ctx context.Context, req *execproto.GetEventsForBlockIDsRequest) (*execproto.GetEventsForBlockIDsResponse, *flow.Identity, error) { var resp *execproto.GetEventsForBlockIDsResponse var execNode *flow.Identity - errToReturn := b.nodeCommunicator.CallAvailableExecutionNode( + errToReturn := b.nodeCommunicator.CallAvailableNode( execNodes, func(node *flow.Identity) error { var err error diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index fc615f61c82..7a0b5e2a741 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -96,7 +96,7 @@ func (b *backendScripts) findScriptExecutors( return nil, err } executorAddrs := make([]string, 0, len(executors)) - execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(executors) + execNodeSelector := 
b.nodeSelectorFactory.SelectNodes(executors) for executor := execNodeSelector.Next(); executor != nil; executor = execNodeSelector.Next() { executorAddrs = append(executorAddrs, executor.Address) diff --git a/engine/access/rpc/backend/backend_test.go b/engine/access/rpc/backend/backend_test.go index be1024ec4bd..213545a3ab8 100644 --- a/engine/access/rpc/backend/backend_test.go +++ b/engine/access/rpc/backend/backend_test.go @@ -171,7 +171,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_NoTransitionSpan() { epochBuilder := unittest.NewEpochBuilder(suite.T(), state) // build epoch 1 // blocks in current state - // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- F(S_D) |commit| + // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- NodeAction(S_D) |commit| epochBuilder. BuildEpoch(). CompleteEpoch() @@ -308,7 +308,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_PhaseTransitionSpan() { epochBuilder := unittest.NewEpochBuilder(suite.T(), state) // build epoch 1 // blocks in current state - // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- F(S_D) |commit| + // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- NodeAction(S_D) |commit| epochBuilder. BuildEpoch(). 
CompleteEpoch() @@ -373,7 +373,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_EpochTransitionSpan() { epochBuilder := unittest.NewEpochBuilder(suite.T(), state) // build epoch 1 // blocks in current state - // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- F(S_D) |commit| + // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- NodeAction(S_D) |commit| epochBuilder.BuildEpoch() // add more blocks to our state in the commit phase, this will allow @@ -2172,14 +2172,14 @@ func (suite *Suite) TestExecutionNodesForBlockID() { require.NoError(suite.T(), err) execNodeSelectorFactory := NodeSelectorFactory{circuitBreakerEnabled: false} - execSelector := execNodeSelectorFactory.SelectExecutionNodes(allExecNodes) + execSelector := execNodeSelectorFactory.SelectNodes(allExecNodes) actualList := flow.IdentityList{} for actual := execSelector.Next(); actual != nil; actual = execSelector.Next() { actualList = append(actualList, actual) } - if len(expectedENs) > maxExecutionNodesCnt { + if len(expectedENs) > maxNodesCnt { for _, actual := range actualList { require.Contains(suite.T(), expectedENs, actual) } @@ -2199,14 +2199,14 @@ func (suite *Suite) TestExecutionNodesForBlockID() { require.NoError(suite.T(), err) execNodeSelectorFactory := NodeSelectorFactory{circuitBreakerEnabled: false} - execSelector := execNodeSelectorFactory.SelectExecutionNodes(allExecNodes) + execSelector := execNodeSelectorFactory.SelectNodes(allExecNodes) actualList := flow.IdentityList{} for actual := execSelector.Next(); actual != nil; actual = execSelector.Next() { actualList = append(actualList, actual) } - require.Equal(suite.T(), len(actualList), maxExecutionNodesCnt) + require.Equal(suite.T(), len(actualList), maxNodesCnt) }) // if no preferred or fixed ENs are specified, the ExecutionNodesForBlockID function should diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index 
45e5b6eb1ef..f6706df4874 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -97,13 +97,17 @@ func (b *backendTransactions) trySendTransaction(ctx context.Context, tx *flow.T defer logAnyError() // try sending the transaction to one of the chosen collection nodes - sendError = b.nodeCommunicator.CallAvailableCollectionNode(collNodes, func(node *flow.Identity) error { - err = b.sendTransactionToCollector(ctx, tx, node.Address) - if err != nil { - return err - } - return nil - }) + sendError = b.nodeCommunicator.CallAvailableNode( + collNodes, + func(node *flow.Identity) error { + err = b.sendTransactionToCollector(ctx, tx, node.Address) + if err != nil { + return err + } + return nil + }, + nil, + ) return sendError } @@ -773,7 +777,7 @@ func (b *backendTransactions) getTransactionResultFromAnyExeNode( }() var resp *execproto.GetTransactionResultResponse - errToReturn = b.nodeCommunicator.CallAvailableExecutionNode( + errToReturn = b.nodeCommunicator.CallAvailableNode( execNodes, func(node *flow.Identity) error { var err error @@ -835,7 +839,7 @@ func (b *backendTransactions) getTransactionResultsByBlockIDFromAnyExeNode( } var resp *execproto.GetTransactionResultsResponse - errToReturn = b.nodeCommunicator.CallAvailableExecutionNode( + errToReturn = b.nodeCommunicator.CallAvailableNode( execNodes, func(node *flow.Identity) error { var err error @@ -893,7 +897,7 @@ func (b *backendTransactions) getTransactionResultByIndexFromAnyExeNode( } var resp *execproto.GetTransactionResultResponse - errToReturn = b.nodeCommunicator.CallAvailableExecutionNode( + errToReturn = b.nodeCommunicator.CallAvailableNode( execNodes, func(node *flow.Identity) error { var err error diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index b5013d4550d..8d338525a63 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ 
b/engine/access/rpc/backend/connection_factory.go @@ -116,7 +116,11 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D var connInterceptors []grpc.UnaryClientInterceptor + // The order in which interceptors are added to the `connInterceptors` slice is important since they will be called + // in the same order during gRPC requests. if cf.CircuitBreakerConfig.Enabled { + // If the circuit breaker interceptor is enabled, it should always be called first before passing control to + // subsequent interceptors. connInterceptors = append(connInterceptors, cf.createCircuitBreakerInterceptor()) } else { connInterceptors = append(connInterceptors, cf.createClientInvalidationInterceptor(address, clientType)) @@ -318,6 +322,8 @@ func (cf *ConnectionFactoryImpl) createClientInvalidationInterceptor( cf.InvalidateAccessAPIClient(address) case ExecutionClient: cf.InvalidateExecutionAPIClient(address) + default: + cf.Log.Info().Str("client_invalidation_interceptor", address).Msg(fmt.Sprintf("unexpected client type: %d", clientType)) } } @@ -353,12 +359,15 @@ func (cf *ConnectionFactoryImpl) createClientInvalidationInterceptor( // created if the circuit breaker is enabled. All invocations will go through the circuit breaker to be tracked for // success or failure of the call. func (cf *ConnectionFactoryImpl) createCircuitBreakerInterceptor() grpc.UnaryClientInterceptor { if cf.CircuitBreakerConfig.Enabled { circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{ - // The restore timeout is defined to automatically return the circuit breaker to the HalfClose state. + // Timeout defines how long the circuit breaker will remain open before transitioning to the half-open state. Timeout: cf.CircuitBreakerConfig.RestoreTimeout, + // ReadyToTrip returns true when the circuit breaker should trip and transition to the Open state ReadyToTrip: func(counts gobreaker.Counts) bool { // The number of maximum failures is checked before the circuit breaker goes to the Open state.
return counts.ConsecutiveFailures >= cf.CircuitBreakerConfig.MaxFailures }, + // MaxRequests defines the max number of concurrent requests while the circuit breaker is in the half-open + // state. MaxRequests: cf.CircuitBreakerConfig.MaxRequests, }) diff --git a/engine/access/rpc/backend/node_communicator.go b/engine/access/rpc/backend/node_communicator.go index 08852180eba..87d0265c182 100644 --- a/engine/access/rpc/backend/node_communicator.go +++ b/engine/access/rpc/backend/node_communicator.go @@ -10,6 +10,14 @@ import ( // maxFailedRequestCount represents the maximum number of failed requests before returning errors. const maxFailedRequestCount = 3 +// NodeAction is a callback function type that represents an action to be performed on a node. +// It takes a node as input and returns an error indicating the result of the action. +type NodeAction func(node *flow.Identity) error + +// ErrorTerminator is a callback function that determines whether an error should terminate further execution. +// It takes an error as input and returns a boolean value indicating whether the error should be considered terminal. +type ErrorTerminator func(err error) bool + // NodeCommunicator is responsible for calling available nodes in the backend. type NodeCommunicator struct { nodeSelectorFactory NodeSelectorFactory @@ -22,26 +30,26 @@ func NewNodeCommunicator(circuitBreakerEnabled bool) *NodeCommunicator { } } -// CallAvailableExecutionNode calls the provided function on the available execution nodes. -// It iterates through the execution nodes and executes the function. -// If an error occurs, it applies the custom error handler (if provided) and keeps track of the errors. -// If the error occurs in circuit breaker, it continues to the next execution node. +// CallAvailableNode calls the provided function on the available nodes. +// It iterates through the nodes and executes the function.
+// If an error occurs, it applies the custom error terminator (if provided) and keeps track of the errors. +// If the error occurs in circuit breaker, it continues to the next node. // If the maximum failed request count is reached, it returns the accumulated errors. -func (b *NodeCommunicator) CallAvailableExecutionNode( +func (b *NodeCommunicator) CallAvailableNode( nodes flow.IdentityList, - call func(node *flow.Identity) error, - customErrorHandler func(err error) bool, + call NodeAction, + shouldTerminateOnError ErrorTerminator, ) error { var errs *multierror.Error - execNodeSelector := b.nodeSelectorFactory.SelectExecutionNodes(nodes) + nodeSelector := b.nodeSelectorFactory.SelectNodes(nodes) - for execNode := execNodeSelector.Next(); execNode != nil; execNode = execNodeSelector.Next() { - err := call(execNode) + for node := nodeSelector.Next(); node != nil; node = nodeSelector.Next() { + err := call(node) if err == nil { return nil } - if customErrorHandler != nil && customErrorHandler(err) { + if shouldTerminateOnError != nil && shouldTerminateOnError(err) { return err } @@ -57,32 +65,3 @@ func (b *NodeCommunicator) CallAvailableExecutionNode( return errs.ErrorOrNil() } - -// CallAvailableCollectionNode calls the provided function on the available collection nodes. -// It iterates through the collection nodes and executes the function. -// If an error occurs, it keeps track of the errors. -// If the error occurs in circuit breaker, it continues to the next collection node. -// If the maximum failed request count is reached, it returns the accumulated errors. 
-func (b *NodeCommunicator) CallAvailableCollectionNode(nodes flow.IdentityList, call func(node *flow.Identity) error) error { - var errs *multierror.Error - - collNodeSelector := b.nodeSelectorFactory.SelectCollectionNodes(nodes) - - for colNode := collNodeSelector.Next(); colNode != nil; colNode = collNodeSelector.Next() { - err := call(colNode) - if err == nil { - return nil - } - - if err == gobreaker.ErrOpenState { - continue - } - - errs = multierror.Append(errs, err) - if len(errs.Errors) >= maxFailedRequestCount { - return errs.ErrorOrNil() - } - } - - return errs.ErrorOrNil() -} diff --git a/engine/access/rpc/backend/node_selector.go b/engine/access/rpc/backend/node_selector.go index 48831337ba3..c4c1c724c5f 100644 --- a/engine/access/rpc/backend/node_selector.go +++ b/engine/access/rpc/backend/node_selector.go @@ -2,11 +2,8 @@ package backend import "github.com/onflow/flow-go/model/flow" -// maxExecutionNodesCnt is the maximum number of execution nodes that will be contacted to complete an execution API request. -const maxExecutionNodesCnt = 3 - -// maxCollectionNodesCnt is the maximum number of collection nodes that will be contacted to complete a collection API request. -const maxCollectionNodesCnt = 3 +// maxNodesCnt is the maximum number of nodes that will be contacted to complete an API request. +const maxNodesCnt = 3 // NodeSelector is an interface that represents the ability to select node identities that the access node is trying to reach. // It encapsulates the internal logic of node selection and provides a way to change implementations for different types @@ -21,44 +18,32 @@ type NodeSelectorFactory struct { circuitBreakerEnabled bool } -// SelectExecutionNodes selects the configured number of execution node identities from the provided list of execution nodes -// and returns an execution node selector to iterate through them. 
-func (n *NodeSelectorFactory) SelectExecutionNodes(executionNodes flow.IdentityList) NodeSelector { +// SelectNodes selects the configured number of node identities from the provided list of nodes +// and returns the node selector to iterate through them. +func (n *NodeSelectorFactory) SelectNodes(nodes flow.IdentityList) NodeSelector { // If the circuit breaker is disabled, the legacy logic should be used, which selects only a specified number of nodes. if !n.circuitBreakerEnabled { - executionNodes = executionNodes.Sample(maxExecutionNodesCnt) + nodes = nodes.Sample(maxNodesCnt) } - return &ExecutionNodeSelector{ - nodes: executionNodes, + return &MainNodeSelector{ + nodes: nodes, index: 0, } } -// SelectCollectionNodes selects the configured number of collection node identities from the provided list of collection nodes -// and returns a collection node selector to iterate through them. -func (n *NodeSelectorFactory) SelectCollectionNodes(collectionNodes flow.IdentityList) NodeSelector { - // If the circuit breaker is disabled, the legacy logic should be used, which selects only a specified number of nodes. - if !n.circuitBreakerEnabled { - collectionNodes = collectionNodes.Sample(maxCollectionNodesCnt) - } +// SelectCollectionNodes - return &CollectionNodeSelector{ - nodes: collectionNodes, - index: 0, - } -} +var _ NodeSelector = (*MainNodeSelector)(nil) -var _ NodeSelector = (*ExecutionNodeSelector)(nil) - -// ExecutionNodeSelector is a specific implementation of an execution node selector. -type ExecutionNodeSelector struct { +// MainNodeSelector is a specific implementation of the node selector. +type MainNodeSelector struct { nodes flow.IdentityList index int } -// Next returns the next execution node in the selector. -func (e *ExecutionNodeSelector) Next() *flow.Identity { +// Next returns the next node in the selector. 
+func (e *MainNodeSelector) Next() *flow.Identity { if e.index < len(e.nodes) { next := e.nodes[e.index] e.index++ @@ -66,21 +51,3 @@ func (e *ExecutionNodeSelector) Next() *flow.Identity { } return nil } - -var _ NodeSelector = (*CollectionNodeSelector)(nil) - -// CollectionNodeSelector is a specific implementation of a collection node selector. -type CollectionNodeSelector struct { - nodes flow.IdentityList - index int -} - -// Next returns the next collection node in the selector. -func (c *CollectionNodeSelector) Next() *flow.Identity { - if c.index < len(c.nodes) { - next := c.nodes[c.index] - c.index++ - return next - } - return nil -} From cc90ff9e6ae22904ec0108193b202ead7d93c5d2 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 18 Jul 2023 17:56:32 +0300 Subject: [PATCH 39/56] Fixed remarks --- .../access/rpc/backend/connection_factory.go | 22 ++++++++----------- .../rpc/backend/connection_factory_test.go | 5 +++-- .../access/access_circuit_breaker_test.go | 4 ++-- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 8d338525a63..159f48c969d 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -76,21 +76,17 @@ type ConnectionFactoryImpl struct { } // CircuitBreakerConfig is a configuration struct for the circuit breaker. -// -// Enabled specifies whether the circuit breaker is enabled for collection and execution API clients. -// -// RestoreTimeout specifies the duration after which the circuit breaker will restore the connection to the client -// after closing it due to failures. -// -// MaxFailures specifies the maximum number of failed calls to the client that will cause the circuit breaker -// to close the connection. -// -// MaxRequests specifies the maximum number of requests to check if connection restored after timeout. 
type CircuitBreakerConfig struct { - Enabled bool + // Enabled specifies whether the circuit breaker is enabled for collection and execution API clients. + Enabled bool + // RestoreTimeout specifies the duration after which the circuit breaker will restore the connection to the client + // after closing it due to failures. RestoreTimeout time.Duration - MaxFailures uint32 - MaxRequests uint32 + // MaxFailures specifies the maximum number of failed calls to the client that will cause the circuit breaker + // to close the connection. + MaxFailures uint32 + // MaxRequests specifies the maximum number of requests to check if connection restored after timeout. + MaxRequests uint32 } type CachedClient struct { diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index e9f6f1a4819..23804a66e02 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -16,6 +16,7 @@ import ( "github.com/sony/gobreaker" "github.com/stretchr/testify/assert" testifymock "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -221,7 +222,7 @@ func TestExecutionNodeClientTimeout(t *testing.T) { // create the execution API client client, _, err := connectionFactory.GetExecutionAPIClient(en.listener.Addr().String()) - assert.NoError(t, err) + require.NoError(t, err) ctx := context.Background() // make the call to the execution node @@ -453,7 +454,7 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { // Create the execution API client. 
client, _, err := connectionFactory.GetExecutionAPIClient(en.listener.Addr().String()) - assert.NoError(t, err) + require.NoError(t, err) ctx := context.Background() diff --git a/integration/tests/access/access_circuit_breaker_test.go b/integration/tests/access/access_circuit_breaker_test.go index d99c292465e..aeb37223822 100644 --- a/integration/tests/access/access_circuit_breaker_test.go +++ b/integration/tests/access/access_circuit_breaker_test.go @@ -120,7 +120,7 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { require.True(s.T(), accessContainer.IsFlagSet("circuit-breaker-max-failures")) accessClient, err := accessContainer.TestnetClient() - assert.NoError(s.T(), err, "failed to get collection node client") + assert.NoError(s.T(), err, "failed to get access node client") latestBlockID, err := accessClient.GetLatestBlockID(ctx) require.NoError(s.T(), err) @@ -176,7 +176,7 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { // Try to send the transaction for the first time. It should wait at least the timeout time and return Unknown error duration, err := sendTransaction(ctx, signedTx) assert.Equal(s.T(), codes.Unknown, status.Code(err)) - assert.GreaterOrEqual(s.T(), requestTimeout, duration) + assert.Greater(s.T(), requestTimeout, duration) // Try to send the transaction for the second time. It should wait less than a second because the circuit breaker // is configured to break after the first failure From 3ea1fd49e0c1d5dae1ecd7a3b8ef91d37f4aeeed Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 18 Jul 2023 20:57:15 +0300 Subject: [PATCH 40/56] Fixed issue with sample after merge. 
--- engine/access/rpc/backend/backend_scripts.go | 5 ++++- engine/access/rpc/backend/backend_test.go | 6 ++++-- engine/access/rpc/backend/node_communicator.go | 5 ++++- engine/access/rpc/backend/node_selector.go | 16 ++++++++++++---- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index 7a0b5e2a741..587bb9f3f3c 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -96,7 +96,10 @@ func (b *backendScripts) findScriptExecutors( return nil, err } executorAddrs := make([]string, 0, len(executors)) - execNodeSelector := b.nodeSelectorFactory.SelectNodes(executors) + execNodeSelector, err := b.nodeSelectorFactory.SelectNodes(executors) + if err != nil { + return nil, err + } for executor := execNodeSelector.Next(); executor != nil; executor = execNodeSelector.Next() { executorAddrs = append(executorAddrs, executor.Address) diff --git a/engine/access/rpc/backend/backend_test.go b/engine/access/rpc/backend/backend_test.go index 213545a3ab8..60ce82bc440 100644 --- a/engine/access/rpc/backend/backend_test.go +++ b/engine/access/rpc/backend/backend_test.go @@ -2172,7 +2172,8 @@ func (suite *Suite) TestExecutionNodesForBlockID() { require.NoError(suite.T(), err) execNodeSelectorFactory := NodeSelectorFactory{circuitBreakerEnabled: false} - execSelector := execNodeSelectorFactory.SelectNodes(allExecNodes) + execSelector, err := execNodeSelectorFactory.SelectNodes(allExecNodes) + require.NoError(suite.T(), err) actualList := flow.IdentityList{} for actual := execSelector.Next(); actual != nil; actual = execSelector.Next() { @@ -2199,7 +2200,8 @@ func (suite *Suite) TestExecutionNodesForBlockID() { require.NoError(suite.T(), err) execNodeSelectorFactory := NodeSelectorFactory{circuitBreakerEnabled: false} - execSelector := execNodeSelectorFactory.SelectNodes(allExecNodes) + execSelector, err := 
execNodeSelectorFactory.SelectNodes(allExecNodes) + require.NoError(suite.T(), err) actualList := flow.IdentityList{} for actual := execSelector.Next(); actual != nil; actual = execSelector.Next() { diff --git a/engine/access/rpc/backend/node_communicator.go b/engine/access/rpc/backend/node_communicator.go index 87d0265c182..279c281c225 100644 --- a/engine/access/rpc/backend/node_communicator.go +++ b/engine/access/rpc/backend/node_communicator.go @@ -41,7 +41,10 @@ func (b *NodeCommunicator) CallAvailableNode( shouldTerminateOnError ErrorTerminator, ) error { var errs *multierror.Error - nodeSelector := b.nodeSelectorFactory.SelectNodes(nodes) + nodeSelector, err := b.nodeSelectorFactory.SelectNodes(nodes) + if err != nil { + return err + } for node := nodeSelector.Next(); node != nil; node = nodeSelector.Next() { err := call(node) diff --git a/engine/access/rpc/backend/node_selector.go b/engine/access/rpc/backend/node_selector.go index c4c1c724c5f..fbe6c1aa1dd 100644 --- a/engine/access/rpc/backend/node_selector.go +++ b/engine/access/rpc/backend/node_selector.go @@ -1,6 +1,10 @@ package backend -import "github.com/onflow/flow-go/model/flow" +import ( + "fmt" + + "github.com/onflow/flow-go/model/flow" +) // maxNodesCnt is the maximum number of nodes that will be contacted to complete an API request. const maxNodesCnt = 3 @@ -20,16 +24,20 @@ type NodeSelectorFactory struct { // SelectNodes selects the configured number of node identities from the provided list of nodes // and returns the node selector to iterate through them. -func (n *NodeSelectorFactory) SelectNodes(nodes flow.IdentityList) NodeSelector { +func (n *NodeSelectorFactory) SelectNodes(nodes flow.IdentityList) (NodeSelector, error) { + var err error // If the circuit breaker is disabled, the legacy logic should be used, which selects only a specified number of nodes. 
if !n.circuitBreakerEnabled { - nodes = nodes.Sample(maxNodesCnt) + nodes, err = nodes.Sample(maxNodesCnt) + if err != nil { + return nil, fmt.Errorf("sampling failed: %w", err) + } } return &MainNodeSelector{ nodes: nodes, index: 0, - } + }, nil } // SelectCollectionNodes From 80132f94d2ea55c247589c03c0ce1446e9d6a29b Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 18 Jul 2023 21:41:26 +0300 Subject: [PATCH 41/56] Fixed issue with all nodes unavailable. --- engine/access/rpc/backend/node_communicator.go | 5 +++++ engine/access/rpc/backend/node_selector.go | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/engine/access/rpc/backend/node_communicator.go b/engine/access/rpc/backend/node_communicator.go index 279c281c225..31298d30894 100644 --- a/engine/access/rpc/backend/node_communicator.go +++ b/engine/access/rpc/backend/node_communicator.go @@ -3,6 +3,8 @@ package backend import ( "github.com/hashicorp/go-multierror" "github.com/sony/gobreaker" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" "github.com/onflow/flow-go/model/flow" ) @@ -57,6 +59,9 @@ func (b *NodeCommunicator) CallAvailableNode( } if err == gobreaker.ErrOpenState { + if !nodeSelector.HasNext() && len(errs.Errors) == 0 { + errs = multierror.Append(errs, status.Error(codes.Unavailable, "there are no available nodes")) + } continue } diff --git a/engine/access/rpc/backend/node_selector.go b/engine/access/rpc/backend/node_selector.go index fbe6c1aa1dd..f5682a79ccf 100644 --- a/engine/access/rpc/backend/node_selector.go +++ b/engine/access/rpc/backend/node_selector.go @@ -12,9 +12,10 @@ const maxNodesCnt = 3 // NodeSelector is an interface that represents the ability to select node identities that the access node is trying to reach. // It encapsulates the internal logic of node selection and provides a way to change implementations for different types // of nodes. 
Implementations of this interface should define the Next method, which returns the next node identity to be -// selected. +// selected. HasNext checks if there is next node available. type NodeSelector interface { Next() *flow.Identity + HasNext() bool } // NodeSelectorFactory is a factory for creating node selectors based on factory configuration and node type. @@ -50,6 +51,11 @@ type MainNodeSelector struct { index int } +// HasNext returns true if next node is available. +func (e *MainNodeSelector) HasNext() bool { + return e.index < len(e.nodes) +} + // Next returns the next node in the selector. func (e *MainNodeSelector) Next() *flow.Identity { if e.index < len(e.nodes) { From f654460241e554a27c78fe3cd50171a7842ecf95 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 18 Jul 2023 22:03:33 +0300 Subject: [PATCH 42/56] Make timeouts shorter in cf tests. --- engine/access/rpc/backend/connection_factory_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index 23804a66e02..aefdfd64488 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -411,8 +411,8 @@ func TestConnectionPoolStale(t *testing.T) { // TestCircuitBreakerExecutionNode tests the circuit breaker state changes for execution nodes. func TestCircuitBreakerExecutionNode(t *testing.T) { - requestTimeout := 1 * time.Second - circuitBreakerRestoreTimeout := 3 * time.Second + requestTimeout := 500 * time.Millisecond + circuitBreakerRestoreTimeout := 1500 * time.Millisecond // Create an execution node for testing. en := new(executionNode) @@ -484,7 +484,7 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { en.handler.On("Ping", testifymock.Anything, req).Return(resp, nil) // Wait until the circuit breaker transitions to the "HalfOpen" state. 
- time.Sleep(circuitBreakerRestoreTimeout + time.Second) + time.Sleep(circuitBreakerRestoreTimeout + (500 * time.Millisecond)) // Call and measure the duration for the third invocation (circuit breaker state is now "HalfOpen"). duration, err = callAndMeasurePingDuration() @@ -494,8 +494,8 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { // TestCircuitBreakerCollectionNode tests the circuit breaker state changes for collection nodes. func TestCircuitBreakerCollectionNode(t *testing.T) { - requestTimeout := 1 * time.Second - circuitBreakerRestoreTimeout := 3 * time.Second + requestTimeout := 500 * time.Millisecond + circuitBreakerRestoreTimeout := 1500 * time.Millisecond // Create a collection node for testing. cn := new(collectionNode) @@ -567,7 +567,7 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { cn.handler.On("Ping", testifymock.Anything, req).Return(resp, nil) // Wait until the circuit breaker transitions to the "HalfOpen" state. - time.Sleep(circuitBreakerRestoreTimeout + time.Second) + time.Sleep(circuitBreakerRestoreTimeout + (500 * time.Millisecond)) // Call and measure the duration for the third invocation (circuit breaker state is now "HalfOpen"). duration, err = callAndMeasurePingDuration() From a7e305af83d7de0a996c58b9fe5dc2ccc964c6fd Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Tue, 18 Jul 2023 23:41:20 +0300 Subject: [PATCH 43/56] Fixed integration test. --- integration/tests/access/access_circuit_breaker_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/tests/access/access_circuit_breaker_test.go b/integration/tests/access/access_circuit_breaker_test.go index aeb37223822..d537873136e 100644 --- a/integration/tests/access/access_circuit_breaker_test.go +++ b/integration/tests/access/access_circuit_breaker_test.go @@ -175,7 +175,7 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { // Try to send the transaction for the first time. 
It should wait at least the timeout time and return Unknown error duration, err := sendTransaction(ctx, signedTx) - assert.Equal(s.T(), codes.Unknown, status.Code(err)) + assert.Equal(s.T(), codes.Unavailable, status.Code(err)) assert.Greater(s.T(), requestTimeout, duration) // Try to send the transaction for the second time. It should wait less than a second because the circuit breaker From 248c4e949530c498ffee4617dcde6c848370226c Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 20 Jul 2023 11:17:26 +0300 Subject: [PATCH 44/56] Revert unnecessary changes --- engine/access/rpc/backend/backend_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/engine/access/rpc/backend/backend_test.go b/engine/access/rpc/backend/backend_test.go index 60ce82bc440..ec2d2501a45 100644 --- a/engine/access/rpc/backend/backend_test.go +++ b/engine/access/rpc/backend/backend_test.go @@ -171,7 +171,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_NoTransitionSpan() { epochBuilder := unittest.NewEpochBuilder(suite.T(), state) // build epoch 1 // blocks in current state - // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- NodeAction(S_D) |commit| + // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- F(S_D) |commit| epochBuilder. BuildEpoch(). CompleteEpoch() @@ -308,7 +308,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_PhaseTransitionSpan() { epochBuilder := unittest.NewEpochBuilder(suite.T(), state) // build epoch 1 // blocks in current state - // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- NodeAction(S_D) |commit| + // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- F(S_D) |commit| epochBuilder. BuildEpoch(). 
CompleteEpoch() @@ -373,7 +373,7 @@ func (suite *Suite) TestGetLatestProtocolStateSnapshot_EpochTransitionSpan() { epochBuilder := unittest.NewEpochBuilder(suite.T(), state) // build epoch 1 // blocks in current state - // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- NodeAction(S_D) |commit| + // P <- A(S_P-1) <- B(S_P) <- C(S_A) <- D(S_B) |setup| <- E(S_C) <- F(S_D) |commit| epochBuilder.BuildEpoch() // add more blocks to our state in the commit phase, this will allow From da821bc79ac6ccee71221bf8c597a7edfd77c16f Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 20 Jul 2023 11:33:58 +0300 Subject: [PATCH 45/56] Fixed test case --- integration/tests/access/access_circuit_breaker_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration/tests/access/access_circuit_breaker_test.go b/integration/tests/access/access_circuit_breaker_test.go index d537873136e..d02fc811a6c 100644 --- a/integration/tests/access/access_circuit_breaker_test.go +++ b/integration/tests/access/access_circuit_breaker_test.go @@ -173,10 +173,10 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { return duration, err } - // Try to send the transaction for the first time. It should wait at least the timeout time and return Unknown error + // Try to send the transaction for the first time. It should wait at least the timeout time and return Unavailable error duration, err := sendTransaction(ctx, signedTx) assert.Equal(s.T(), codes.Unavailable, status.Code(err)) - assert.Greater(s.T(), requestTimeout, duration) + assert.Less(s.T(), duration, requestTimeout) // Try to send the transaction for the second time. 
It should wait less than a second because the circuit breaker // is configured to break after the first failure From ac8c8b16373dfd1462b2d4a98d6a34f395d58bc5 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 20 Jul 2023 13:25:00 +0300 Subject: [PATCH 46/56] Implemented limitation for scripts execution nodes --- engine/access/rpc/backend/backend.go | 20 +-- engine/access/rpc/backend/backend_scripts.go | 135 ++++++++---------- .../access/rpc/backend/node_communicator.go | 4 +- 3 files changed, 74 insertions(+), 85 deletions(-) diff --git a/engine/access/rpc/backend/backend.go b/engine/access/rpc/backend/backend.go index d0e88b255a8..4552f3f3009 100644 --- a/engine/access/rpc/backend/backend.go +++ b/engine/access/rpc/backend/backend.go @@ -124,16 +124,16 @@ func New( state: state, // create the sub-backends backendScripts: backendScripts{ - headers: headers, - executionReceipts: executionReceipts, - connFactory: connFactory, - state: state, - log: log, - metrics: accessMetrics, - loggedScripts: loggedScripts, - archiveAddressList: archiveAddressList, - archivePorts: archivePorts, - nodeSelectorFactory: NodeSelectorFactory{circuitBreakerEnabled: circuitBreakerEnabled}, + headers: headers, + executionReceipts: executionReceipts, + connFactory: connFactory, + state: state, + log: log, + metrics: accessMetrics, + loggedScripts: loggedScripts, + archiveAddressList: archiveAddressList, + archivePorts: archivePorts, + nodeCommunicator: nodeCommunicator, }, backendTransactions: backendTransactions{ staticCollectionRPC: collectionRPC, diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index 587bb9f3f3c..103afea02e0 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -9,7 +9,6 @@ import ( lru "github.com/hashicorp/golang-lru" "github.com/onflow/flow/protobuf/go/flow/access" - "github.com/hashicorp/go-multierror" execproto 
"github.com/onflow/flow/protobuf/go/flow/execution" "github.com/rs/zerolog" "google.golang.org/grpc/codes" @@ -26,16 +25,16 @@ import ( const uniqueScriptLoggingTimeWindow = 10 * time.Minute type backendScripts struct { - headers storage.Headers - executionReceipts storage.ExecutionReceipts - state protocol.State - connFactory ConnectionFactory - log zerolog.Logger - metrics module.BackendScriptsMetrics - loggedScripts *lru.Cache - archiveAddressList []string - archivePorts []uint - nodeSelectorFactory NodeSelectorFactory + headers storage.Headers + executionReceipts storage.ExecutionReceipts + state protocol.State + connFactory ConnectionFactory + log zerolog.Logger + metrics module.BackendScriptsMetrics + loggedScripts *lru.Cache + archiveAddressList []string + archivePorts []uint + nodeCommunicator *NodeCommunicator } func (b *backendScripts) ExecuteScriptAtLatestBlock( @@ -86,27 +85,6 @@ func (b *backendScripts) ExecuteScriptAtBlockHeight( return b.executeScriptOnExecutor(ctx, blockID, script, arguments) } -func (b *backendScripts) findScriptExecutors( - ctx context.Context, - blockID flow.Identifier, -) ([]string, error) { - executors, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) - - if err != nil { - return nil, err - } - executorAddrs := make([]string, 0, len(executors)) - execNodeSelector, err := b.nodeSelectorFactory.SelectNodes(executors) - if err != nil { - return nil, err - } - - for executor := execNodeSelector.Next(); executor != nil; executor = execNodeSelector.Next() { - executorAddrs = append(executorAddrs, executor.Address) - } - return executorAddrs, nil -} - // executeScriptOnExecutionNode forwards the request to the execution node using the execution node // grpc client and converts the response back to the access node api response format func (b *backendScripts) executeScriptOnExecutor( @@ -116,7 +94,7 @@ func (b *backendScripts) executeScriptOnExecutor( arguments [][]byte, ) ([]byte, error) { // find few 
execution nodes which have executed the block earlier and provided an execution receipt for it - scriptExecutors, err := b.findScriptExecutors(ctx, blockID) + executors, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log) if err != nil { return nil, status.Errorf(codes.Internal, "failed to find script executors at blockId %v: %v", blockID.String(), err) } @@ -125,8 +103,8 @@ func (b *backendScripts) executeScriptOnExecutor( // *DO NOT* use this hash for any protocol-related or cryptographic functions. insecureScriptHash := md5.Sum(script) //nolint:gosec - // try execution on Archive nodes first - if len(b.archiveAddressList) > 0 { + // try execution on Archive nodes if there is no execution nodes found + if len(executors) == 0 && len(b.archiveAddressList) > 0 { startTime := time.Now() for idx, rnAddr := range b.archiveAddressList { rnPort := b.archivePorts[idx] @@ -160,54 +138,65 @@ func (b *backendScripts) executeScriptOnExecutor( } } } - // try execution nodes if the script wasn't executed - var errors *multierror.Error + // try to execute the script on one of the execution nodes found - for _, executorAddress := range scriptExecutors { - execStartTime := time.Now() // record start time - result, err := b.tryExecuteScriptOnExecutionNode(ctx, executorAddress, blockID, script, arguments) - if err == nil { - if b.log.GetLevel() == zerolog.DebugLevel { - executionTime := time.Now() - if b.shouldLogScript(executionTime, insecureScriptHash) { - b.log.Debug(). - Str("script_executor_addr", executorAddress). - Hex("block_id", blockID[:]). - Hex("script_hash", insecureScriptHash[:]). - Str("script", string(script)). 
- Msg("Successfully executed script") - b.loggedScripts.Add(insecureScriptHash, executionTime) + var result []byte + hasInvalidArgument := false + errToReturn := b.nodeCommunicator.CallAvailableNode( + executors, + func(node *flow.Identity) error { + execStartTime := time.Now() + result, err = b.tryExecuteScriptOnExecutionNode(ctx, node.Address, blockID, script, arguments) + if err == nil { + if b.log.GetLevel() == zerolog.DebugLevel { + executionTime := time.Now() + if b.shouldLogScript(executionTime, insecureScriptHash) { + b.log.Debug(). + Str("script_executor_addr", node.Address). + Hex("block_id", blockID[:]). + Hex("script_hash", insecureScriptHash[:]). + Str("script", string(script)). + Msg("Successfully executed script") + b.loggedScripts.Add(insecureScriptHash, executionTime) + } } + + // log execution time + b.metrics.ScriptExecuted( + time.Since(execStartTime), + len(script), + ) + + return nil } - // log execution time - b.metrics.ScriptExecuted( - time.Since(execStartTime), - len(script), - ) + return err + }, + func(node flow.Identity, err error) bool { + hasInvalidArgument = status.Code(err) == codes.InvalidArgument + if hasInvalidArgument { + b.log.Debug().Err(err). + Str("script_executor_addr", node.Address). + Hex("block_id", blockID[:]). + Hex("script_hash", insecureScriptHash[:]). + Str("script", string(script)). + Msg("script failed to execute on the execution node") + } + return hasInvalidArgument + }, + ) - return result, nil - } - // return if it's just a script failure as opposed to an EN/RN failure and skip trying other ENs/RNs - if status.Code(err) == codes.InvalidArgument { - b.log.Debug().Err(err). - Str("script_executor_addr", executorAddress). - Hex("block_id", blockID[:]). - Hex("script_hash", insecureScriptHash[:]). - Str("script", string(script)). 
- Msg("script failed to execute on the execution node") - return nil, err - } - errors = multierror.Append(errors, err) + if hasInvalidArgument { + return nil, errToReturn } - errToReturn := errors.ErrorOrNil() - if errToReturn != nil { + if errToReturn == nil { b.metrics.ScriptExecutionErrorOnExecutionNode() b.log.Error().Err(err).Msg("script execution failed for execution node internal reasons") + return nil, rpc.ConvertError(errToReturn, "failed to execute script on execution nodes", codes.Internal) + } else { + return result, nil } - - return nil, rpc.ConvertMultiError(errors, "failed to execute script on execution nodes", codes.Internal) } // shouldLogScript checks if the script hash is unique in the time window diff --git a/engine/access/rpc/backend/node_communicator.go b/engine/access/rpc/backend/node_communicator.go index 31298d30894..052b4efa0c9 100644 --- a/engine/access/rpc/backend/node_communicator.go +++ b/engine/access/rpc/backend/node_communicator.go @@ -18,7 +18,7 @@ type NodeAction func(node *flow.Identity) error // ErrorTerminator is a callback function that determines whether an error should terminate further execution. // It takes an error as input and returns a boolean value indicating whether the error should be considered terminal. -type ErrorTerminator func(err error) bool +type ErrorTerminator func(node flow.Identity, err error) bool // NodeCommunicator is responsible for calling available nodes in the backend. 
type NodeCommunicator struct { @@ -54,7 +54,7 @@ func (b *NodeCommunicator) CallAvailableNode( return nil } - if shouldTerminateOnError != nil && shouldTerminateOnError(err) { + if shouldTerminateOnError != nil && shouldTerminateOnError(*node, err) { return err } From 3ed6741d85bad7afb50b95c3846f4a65c42894a5 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 20 Jul 2023 14:35:42 +0300 Subject: [PATCH 47/56] Fixed if statement --- cmd/util/cmd/execution-state-extract/export_report.json | 4 ++-- engine/access/rpc/backend/backend_scripts.go | 4 ++-- engine/access/rpc/backend/backend_transactions.go | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cmd/util/cmd/execution-state-extract/export_report.json b/cmd/util/cmd/execution-state-extract/export_report.json index 4c8484e4396..3d4abf5bcf2 100644 --- a/cmd/util/cmd/execution-state-extract/export_report.json +++ b/cmd/util/cmd/execution-state-extract/export_report.json @@ -1,6 +1,6 @@ { "EpochCounter": 0, - "PreviousStateCommitment": "1c9f9d343cb8d4610e0b2c1eb74d6ea2f2f8aef2d666281dc22870e3efaa607b", - "CurrentStateCommitment": "1c9f9d343cb8d4610e0b2c1eb74d6ea2f2f8aef2d666281dc22870e3efaa607b", + "PreviousStateCommitment": "0872fc2bbcf573c016e8af58e14db6e1efdc48e8e770115d5962f9225b1fa465", + "CurrentStateCommitment": "0872fc2bbcf573c016e8af58e14db6e1efdc48e8e770115d5962f9225b1fa465", "ReportSucceeded": true } \ No newline at end of file diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index 103afea02e0..1f4a03044b2 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -191,11 +191,11 @@ func (b *backendScripts) executeScriptOnExecutor( } if errToReturn == nil { + return result, nil + } else { b.metrics.ScriptExecutionErrorOnExecutionNode() b.log.Error().Err(err).Msg("script execution failed for execution node internal reasons") return nil, rpc.ConvertError(errToReturn, 
"failed to execute script on execution nodes", codes.Internal) - } else { - return result, nil } } diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index f6706df4874..c4bab57d5c3 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -792,7 +792,7 @@ func (b *backendTransactions) getTransactionResultFromAnyExeNode( } return err }, - func(err error) bool { + func(_ flow.Identity, err error) bool { return status.Code(err) == codes.NotFound }, ) @@ -853,7 +853,7 @@ func (b *backendTransactions) getTransactionResultsByBlockIDFromAnyExeNode( } return err }, - func(err error) bool { + func(_ flow.Identity, err error) bool { return status.Code(err) == codes.NotFound }, ) @@ -912,7 +912,7 @@ func (b *backendTransactions) getTransactionResultByIndexFromAnyExeNode( } return err }, - func(err error) bool { + func(_ flow.Identity, err error) bool { return status.Code(err) == codes.NotFound }, ) From b779edf1fafc441235c30c01289ef0dbb54a4fa5 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Thu, 20 Jul 2023 14:38:30 +0300 Subject: [PATCH 48/56] Added check for argument --- cmd/access/node_builder/access_node_builder.go | 3 +++ cmd/util/cmd/execution-state-extract/export_report.json | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index e2ac124c0ba..3afdf5b313d 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -760,6 +760,9 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { if builder.rpcConf.CircuitBreakerConfig.MaxRequests == 0 { return errors.New("circuit-breaker-max-requests must be greater than 0") } + if builder.rpcConf.CircuitBreakerConfig.RestoreTimeout > 0 { + return errors.New("circuit-breaker-restore-timeout must be greater than 0") + } } return 
nil diff --git a/cmd/util/cmd/execution-state-extract/export_report.json b/cmd/util/cmd/execution-state-extract/export_report.json index 3d4abf5bcf2..067d5c44b7f 100644 --- a/cmd/util/cmd/execution-state-extract/export_report.json +++ b/cmd/util/cmd/execution-state-extract/export_report.json @@ -1,6 +1,6 @@ { "EpochCounter": 0, - "PreviousStateCommitment": "0872fc2bbcf573c016e8af58e14db6e1efdc48e8e770115d5962f9225b1fa465", - "CurrentStateCommitment": "0872fc2bbcf573c016e8af58e14db6e1efdc48e8e770115d5962f9225b1fa465", + "PreviousStateCommitment": "0af51cc7d3d8ac8307b33126a6407ac950e4b64396c55c304b313364c6a0e64d", + "CurrentStateCommitment": "0af51cc7d3d8ac8307b33126a6407ac950e4b64396c55c304b313364c6a0e64d", "ReportSucceeded": true } \ No newline at end of file From 7dc921bfcfda549b50fe717ab5bc97dd7cadc1d0 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Fri, 21 Jul 2023 00:17:54 +0300 Subject: [PATCH 49/56] Apply suggestions from code review Co-authored-by: Peter Argue <89119817+peterargue@users.noreply.github.com> --- engine/access/rpc/backend/node_selector.go | 2 -- integration/tests/access/access_circuit_breaker_test.go | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/engine/access/rpc/backend/node_selector.go b/engine/access/rpc/backend/node_selector.go index f5682a79ccf..9a68d25769f 100644 --- a/engine/access/rpc/backend/node_selector.go +++ b/engine/access/rpc/backend/node_selector.go @@ -41,8 +41,6 @@ func (n *NodeSelectorFactory) SelectNodes(nodes flow.IdentityList) (NodeSelector }, nil } -// SelectCollectionNodes - var _ NodeSelector = (*MainNodeSelector)(nil) // MainNodeSelector is a specific implementation of the node selector. 
diff --git a/integration/tests/access/access_circuit_breaker_test.go b/integration/tests/access/access_circuit_breaker_test.go index d02fc811a6c..db9853e9e8e 100644 --- a/integration/tests/access/access_circuit_breaker_test.go +++ b/integration/tests/access/access_circuit_breaker_test.go @@ -176,7 +176,7 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { // Try to send the transaction for the first time. It should wait at least the timeout time and return Unavailable error duration, err := sendTransaction(ctx, signedTx) assert.Equal(s.T(), codes.Unavailable, status.Code(err)) - assert.Less(s.T(), duration, requestTimeout) + assert.GreaterOrEqual(s.T(), duration, requestTimeout) // Try to send the transaction for the second time. It should wait less than a second because the circuit breaker // is configured to break after the first failure From 384968ad07921c0df86182753d9cb4041b1857dc Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Fri, 21 Jul 2023 00:25:29 +0300 Subject: [PATCH 50/56] change error terminator argument type --- engine/access/rpc/backend/backend_scripts.go | 2 +- engine/access/rpc/backend/backend_transactions.go | 6 +++--- engine/access/rpc/backend/node_communicator.go | 4 ++-- integration/tests/access/access_circuit_breaker_test.go | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index 1f4a03044b2..41ec28ce5ae 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -172,7 +172,7 @@ func (b *backendScripts) executeScriptOnExecutor( return err }, - func(node flow.Identity, err error) bool { + func(node *flow.Identity, err error) bool { hasInvalidArgument = status.Code(err) == codes.InvalidArgument if hasInvalidArgument { b.log.Debug().Err(err). 
diff --git a/engine/access/rpc/backend/backend_transactions.go b/engine/access/rpc/backend/backend_transactions.go index c4bab57d5c3..4622fabff78 100644 --- a/engine/access/rpc/backend/backend_transactions.go +++ b/engine/access/rpc/backend/backend_transactions.go @@ -792,7 +792,7 @@ func (b *backendTransactions) getTransactionResultFromAnyExeNode( } return err }, - func(_ flow.Identity, err error) bool { + func(_ *flow.Identity, err error) bool { return status.Code(err) == codes.NotFound }, ) @@ -853,7 +853,7 @@ func (b *backendTransactions) getTransactionResultsByBlockIDFromAnyExeNode( } return err }, - func(_ flow.Identity, err error) bool { + func(_ *flow.Identity, err error) bool { return status.Code(err) == codes.NotFound }, ) @@ -912,7 +912,7 @@ func (b *backendTransactions) getTransactionResultByIndexFromAnyExeNode( } return err }, - func(_ flow.Identity, err error) bool { + func(_ *flow.Identity, err error) bool { return status.Code(err) == codes.NotFound }, ) diff --git a/engine/access/rpc/backend/node_communicator.go b/engine/access/rpc/backend/node_communicator.go index 052b4efa0c9..d75432b0b29 100644 --- a/engine/access/rpc/backend/node_communicator.go +++ b/engine/access/rpc/backend/node_communicator.go @@ -18,7 +18,7 @@ type NodeAction func(node *flow.Identity) error // ErrorTerminator is a callback function that determines whether an error should terminate further execution. // It takes an error as input and returns a boolean value indicating whether the error should be considered terminal. -type ErrorTerminator func(node flow.Identity, err error) bool +type ErrorTerminator func(node *flow.Identity, err error) bool // NodeCommunicator is responsible for calling available nodes in the backend. 
type NodeCommunicator struct { @@ -54,7 +54,7 @@ func (b *NodeCommunicator) CallAvailableNode( return nil } - if shouldTerminateOnError != nil && shouldTerminateOnError(*node, err) { + if shouldTerminateOnError != nil && shouldTerminateOnError(node, err) { return err } diff --git a/integration/tests/access/access_circuit_breaker_test.go b/integration/tests/access/access_circuit_breaker_test.go index db9853e9e8e..717a9df98cc 100644 --- a/integration/tests/access/access_circuit_breaker_test.go +++ b/integration/tests/access/access_circuit_breaker_test.go @@ -181,7 +181,7 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { // Try to send the transaction for the second time. It should wait less than a second because the circuit breaker // is configured to break after the first failure duration, err = sendTransaction(ctx, signedTx) - assert.Equal(s.T(), codes.Unknown, status.Code(err)) + assert.Equal(s.T(), codes.Unavailable, status.Code(err)) assert.Greater(s.T(), time.Second, duration) // Reconnect the collection node From cc4c03ce77877b52305a821e95e1baf8fb6c7490 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Fri, 21 Jul 2023 15:22:11 +0300 Subject: [PATCH 51/56] Fixed broken test --- .../node_builder/access_node_builder.go | 2 +- integration/testnet/client.go | 24 +++++++++---------- .../access/access_circuit_breaker_test.go | 18 +++++++------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cmd/access/node_builder/access_node_builder.go b/cmd/access/node_builder/access_node_builder.go index 29ca72a720a..61cae1f468c 100644 --- a/cmd/access/node_builder/access_node_builder.go +++ b/cmd/access/node_builder/access_node_builder.go @@ -762,7 +762,7 @@ func (builder *FlowAccessNodeBuilder) extraFlags() { if builder.rpcConf.CircuitBreakerConfig.MaxRequests == 0 { return errors.New("circuit-breaker-max-requests must be greater than 0") } - if builder.rpcConf.CircuitBreakerConfig.RestoreTimeout > 0 { + if 
builder.rpcConf.CircuitBreakerConfig.RestoreTimeout <= 0 { return errors.New("circuit-breaker-restore-timeout must be greater than 0") } } diff --git a/integration/testnet/client.go b/integration/testnet/client.go index ab2eb0b751e..941c83a8fbd 100644 --- a/integration/testnet/client.go +++ b/integration/testnet/client.go @@ -79,18 +79,18 @@ func NewClient(addr string, chain flow.Chain) (*Client, error) { } // Uncomment for debugging keys - //json, err := key.MarshalJSON() - //if err != nil { - // return nil, fmt.Errorf("cannot marshal key json: %w", err) - //} - //public := key.PublicKey(1000) - //publicJson, err := public.MarshalJSON() - //if err != nil { - // return nil, fmt.Errorf("cannot marshal key json: %w", err) - //} - - //fmt.Printf("New client with private key: \n%s\n", json) - //fmt.Printf("and public key: \n%s\n", publicJson) + json, err := key.MarshalJSON() + if err != nil { + return nil, fmt.Errorf("cannot marshal key json: %w", err) + } + public := key.PublicKey(1000) + publicJson, err := public.MarshalJSON() + if err != nil { + return nil, fmt.Errorf("cannot marshal key json: %w", err) + } + + fmt.Printf("New client with private key: \n%s\n", json) + fmt.Printf("and public key: \n%s\n", publicJson) return NewClientWithKey(addr, sdk.Address(chain.ServiceAddress()), privateKey, chain) } diff --git a/integration/tests/access/access_circuit_breaker_test.go b/integration/tests/access/access_circuit_breaker_test.go index 717a9df98cc..d2721e41284 100644 --- a/integration/tests/access/access_circuit_breaker_test.go +++ b/integration/tests/access/access_circuit_breaker_test.go @@ -62,6 +62,7 @@ func (s *AccessCircuitBreakerSuite) SetupTest() { testnet.WithLogLevel(zerolog.InfoLevel), testnet.WithAdditionalFlag("--circuit-breaker-enabled=true"), testnet.WithAdditionalFlag(fmt.Sprintf("--circuit-breaker-restore-timeout=%s", cbRestoreTimeout.String())), + testnet.WithAdditionalFlag("--circuit-breaker-max-requests=1"), 
testnet.WithAdditionalFlag("--circuit-breaker-max-failures=1"), testnet.WithAdditionalFlag(fmt.Sprintf("--collection-client-timeout=%s", requestTimeout.String())), ), @@ -105,9 +106,6 @@ func (s *AccessCircuitBreakerSuite) SetupTest() { // 3. Connect the collection node to the network and wait for the circuit breaker restore time. // 4. Successfully send a transaction. func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { - ctx, cancel := context.WithCancel(s.ctx) - defer cancel() - // 1. Get the collection node collectionContainer := s.net.ContainerByName("collection_1") @@ -117,12 +115,14 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { // Check if access node was created with circuit breaker flags require.True(s.T(), accessContainer.IsFlagSet("circuit-breaker-enabled")) require.True(s.T(), accessContainer.IsFlagSet("circuit-breaker-restore-timeout")) + require.True(s.T(), accessContainer.IsFlagSet("circuit-breaker-max-requests")) require.True(s.T(), accessContainer.IsFlagSet("circuit-breaker-max-failures")) accessClient, err := accessContainer.TestnetClient() - assert.NoError(s.T(), err, "failed to get access node client") + require.NoError(s.T(), err, "failed to get access node client") + require.NotNil(s.T(), accessClient, "failed to get access node client") - latestBlockID, err := accessClient.GetLatestBlockID(ctx) + latestBlockID, err := accessClient.GetLatestBlockID(s.ctx) require.NoError(s.T(), err) // Create a new account to deploy Counter to @@ -153,7 +153,7 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { SetGasLimit(9999) // Sign the transaction - childCtx, cancel := context.WithTimeout(ctx, time.Second*10) + childCtx, cancel := context.WithTimeout(s.ctx, time.Second*10) signedTx, err := accessClient.SignTransaction(createAccountTx) require.NoError(s.T(), err) cancel() @@ -174,13 +174,13 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { } // Try to send the transaction for the first time. 
It should wait at least the timeout time and return Unavailable error - duration, err := sendTransaction(ctx, signedTx) + duration, err := sendTransaction(s.ctx, signedTx) assert.Equal(s.T(), codes.Unavailable, status.Code(err)) assert.GreaterOrEqual(s.T(), duration, requestTimeout) // Try to send the transaction for the second time. It should wait less than a second because the circuit breaker // is configured to break after the first failure - duration, err = sendTransaction(ctx, signedTx) + duration, err = sendTransaction(s.ctx, signedTx) assert.Equal(s.T(), codes.Unavailable, status.Code(err)) assert.Greater(s.T(), time.Second, duration) @@ -192,6 +192,6 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { time.Sleep(cbRestoreTimeout) // Try to send the transaction for the third time. The transaction should be sent successfully - _, err = sendTransaction(ctx, signedTx) + _, err = sendTransaction(s.ctx, signedTx) require.NoError(s.T(), err, "transaction should be sent") } From 30ac5aa58bd3cf33da1114857d87ff2681a43df7 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Fri, 21 Jul 2023 21:44:41 +0300 Subject: [PATCH 52/56] Apply suggestions from code review Co-authored-by: Yurii Oleksyshyn --- engine/access/rpc/backend/node_selector.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/engine/access/rpc/backend/node_selector.go b/engine/access/rpc/backend/node_selector.go index 9a68d25769f..69a8f3e11aa 100644 --- a/engine/access/rpc/backend/node_selector.go +++ b/engine/access/rpc/backend/node_selector.go @@ -19,6 +19,9 @@ type NodeSelector interface { } // NodeSelectorFactory is a factory for creating node selectors based on factory configuration and node type. +// Supported configurations: +// circuitBreakerEnabled = true - nodes will be pseudo-randomly sampled and picked in-order. +// circuitBreakerEnabled = false - nodes will be picked from proposed list in-order without any changes. 
type NodeSelectorFactory struct { circuitBreakerEnabled bool } @@ -44,6 +47,7 @@ func (n *NodeSelectorFactory) SelectNodes(nodes flow.IdentityList) (NodeSelector var _ NodeSelector = (*MainNodeSelector)(nil) // MainNodeSelector is a specific implementation of the node selector. +// Which performs in-order node selection using fixed list of pre-defined nodes. type MainNodeSelector struct { nodes flow.IdentityList index int From 82f8cbaf90cc19b966674337c0b50fdd628e1be3 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Fri, 21 Jul 2023 21:59:04 +0300 Subject: [PATCH 53/56] Fixed review remarks --- engine/access/rpc/backend/connection_factory.go | 1 + .../access/rpc/backend/connection_factory_test.go | 10 ++-------- engine/access/rpc/backend/node_selector.go | 13 +++++++------ 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index 159f48c969d..bb22af8c249 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -27,6 +27,7 @@ import ( // DefaultClientTimeout is used when making a GRPC request to a collection node or an execution node const DefaultClientTimeout = 3 * time.Second +// clientType is an enumeration type used to differentiate between different types of gRPC clients. type clientType int const ( diff --git a/engine/access/rpc/backend/connection_factory_test.go b/engine/access/rpc/backend/connection_factory_test.go index aefdfd64488..bd4393332db 100644 --- a/engine/access/rpc/backend/connection_factory_test.go +++ b/engine/access/rpc/backend/connection_factory_test.go @@ -443,10 +443,7 @@ func TestCircuitBreakerExecutionNode(t *testing.T) { // Set the connection pool cache size. 
cacheSize := 1 - cache, _ := lru.NewWithEvict(cacheSize, func(_, evictedValue interface{}) { - evictedValue.(*CachedClient).Close() - }) - connectionFactory.ConnectionsCache = cache + connectionFactory.ConnectionsCache, _ = lru.New(cacheSize) connectionFactory.CacheSize = uint(cacheSize) // Set metrics reporting. @@ -526,10 +523,7 @@ func TestCircuitBreakerCollectionNode(t *testing.T) { // Set the connection pool cache size. cacheSize := 1 - cache, _ := lru.NewWithEvict(cacheSize, func(_, evictedValue interface{}) { - evictedValue.(*CachedClient).Close() - }) - connectionFactory.ConnectionsCache = cache + connectionFactory.ConnectionsCache, _ = lru.New(cacheSize) connectionFactory.CacheSize = uint(cacheSize) // Set metrics reporting. diff --git a/engine/access/rpc/backend/node_selector.go b/engine/access/rpc/backend/node_selector.go index 69a8f3e11aa..f90f8271b2d 100644 --- a/engine/access/rpc/backend/node_selector.go +++ b/engine/access/rpc/backend/node_selector.go @@ -38,14 +38,9 @@ func (n *NodeSelectorFactory) SelectNodes(nodes flow.IdentityList) (NodeSelector } } - return &MainNodeSelector{ - nodes: nodes, - index: 0, - }, nil + return NewMainNodeSelector(nodes), nil } -var _ NodeSelector = (*MainNodeSelector)(nil) - // MainNodeSelector is a specific implementation of the node selector. // Which performs in-order node selection using fixed list of pre-defined nodes. type MainNodeSelector struct { @@ -53,6 +48,12 @@ type MainNodeSelector struct { index int } +var _ NodeSelector = (*MainNodeSelector)(nil) + +func NewMainNodeSelector(nodes flow.IdentityList) *MainNodeSelector { + return &MainNodeSelector{nodes: nodes, index: 0} +} + // HasNext returns true if next node is available. func (e *MainNodeSelector) HasNext() bool { return e.index < len(e.nodes) From 0d5a2c48d6379d9cbf3ac0623cbd74359a975476 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Fri, 21 Jul 2023 22:41:39 +0300 Subject: [PATCH 54/56] Fixed integration test. 
Removed timeouts as it is invalid check there. --- .../access/rpc/backend/connection_factory.go | 9 +++---- .../access/access_circuit_breaker_test.go | 27 +++++-------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/engine/access/rpc/backend/connection_factory.go b/engine/access/rpc/backend/connection_factory.go index bb22af8c249..957cccf7dfc 100644 --- a/engine/access/rpc/backend/connection_factory.go +++ b/engine/access/rpc/backend/connection_factory.go @@ -99,7 +99,6 @@ type CachedClient struct { // createConnection creates new gRPC connections to remote node func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.Duration, clientType clientType) (*grpc.ClientConn, error) { - if timeout == 0 { timeout = DefaultClientTimeout } @@ -113,8 +112,11 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D var connInterceptors []grpc.UnaryClientInterceptor + connInterceptors = append(connInterceptors, createClientTimeoutInterceptor(timeout)) + // The order in which interceptors are added to the `connInterceptors` slice is important since they will be called - // in the same order during gRPC requests. + // in the opposite order during gRPC requests. See documentation for more info: + // https://grpc.io/blog/grpc-web-interceptor/#binding-interceptors if cf.CircuitBreakerConfig.Enabled { // If the circuit breaker interceptor is enabled, it should always be called first before passing control to // subsequent interceptors. 
@@ -123,8 +125,6 @@ func (cf *ConnectionFactoryImpl) createConnection(address string, timeout time.D connInterceptors = append(connInterceptors, cf.createClientInvalidationInterceptor(address, clientType)) } - connInterceptors = append(connInterceptors, createClientTimeoutInterceptor(timeout)) - // ClientConn's default KeepAlive on connections is indefinite, assuming the timeout isn't reached // The connections should be safe to be persisted and reused // https://pkg.go.dev/google.golang.org/grpc#WithKeepaliveParams @@ -386,7 +386,6 @@ func (cf *ConnectionFactoryImpl) createCircuitBreakerInterceptor() grpc.UnaryCli // the "StateClosed" and handles invocations as usual. _, err := circuitBreaker.Execute(func() (interface{}, error) { err := invoker(ctx, method, req, reply, cc, opts...) - return nil, err }) return err diff --git a/integration/tests/access/access_circuit_breaker_test.go b/integration/tests/access/access_circuit_breaker_test.go index d2721e41284..a0d38f79b5c 100644 --- a/integration/tests/access/access_circuit_breaker_test.go +++ b/integration/tests/access/access_circuit_breaker_test.go @@ -38,8 +38,8 @@ type AccessCircuitBreakerSuite struct { net *testnet.FlowNetwork } -var requestTimeout = 3 * time.Second -var cbRestoreTimeout = 6 * time.Second +var requestTimeout = 1500 * time.Millisecond +var cbRestoreTimeout = 3 * time.Second func (s *AccessCircuitBreakerSuite) TearDownTest() { s.log.Info().Msg("================> Start TearDownTest") @@ -153,36 +153,23 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { SetGasLimit(9999) // Sign the transaction - childCtx, cancel := context.WithTimeout(s.ctx, time.Second*10) signedTx, err := accessClient.SignTransaction(createAccountTx) require.NoError(s.T(), err) - cancel() // 3. Disconnect the collection node from the network to activate the Circuit Breaker err = collectionContainer.Disconnect() require.NoError(s.T(), err, "failed to pause connection node") // 4. 
Send a couple of transactions to test if the circuit breaker opens correctly - sendTransaction := func(ctx context.Context, tx *sdk.Transaction) (time.Duration, error) { - childCtx, cancel = context.WithTimeout(ctx, time.Second*10) - start := time.Now() - err := accessClient.SendTransaction(childCtx, tx) - duration := time.Since(start) - defer cancel() - - return duration, err - } - // Try to send the transaction for the first time. It should wait at least the timeout time and return Unavailable error - duration, err := sendTransaction(s.ctx, signedTx) + err = accessClient.SendTransaction(s.ctx, signedTx) assert.Equal(s.T(), codes.Unavailable, status.Code(err)) - assert.GreaterOrEqual(s.T(), duration, requestTimeout) // Try to send the transaction for the second time. It should wait less than a second because the circuit breaker // is configured to break after the first failure - duration, err = sendTransaction(s.ctx, signedTx) - assert.Equal(s.T(), codes.Unavailable, status.Code(err)) - assert.Greater(s.T(), time.Second, duration) + err = accessClient.SendTransaction(s.ctx, signedTx) + //Here we catch the codes.Unknown error, as this is the one that comes from the Circuit Breaker when the state is Open. + assert.Equal(s.T(), codes.Unknown, status.Code(err)) // Reconnect the collection node err = collectionContainer.Connect() @@ -192,6 +179,6 @@ func (s *AccessCircuitBreakerSuite) TestCircuitBreaker() { time.Sleep(cbRestoreTimeout) // Try to send the transaction for the third time. 
The transaction should be sent successfully - _, err = sendTransaction(s.ctx, signedTx) + err = accessClient.SendTransaction(s.ctx, signedTx) require.NoError(s.T(), err, "transaction should be sent") } From b4c749c1bb75be70b5a3a7091c1612299b073767 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Mon, 24 Jul 2023 15:08:44 +0300 Subject: [PATCH 55/56] Changed timeouts --- integration/tests/access/access_circuit_breaker_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration/tests/access/access_circuit_breaker_test.go b/integration/tests/access/access_circuit_breaker_test.go index a0d38f79b5c..a96e9cf2ab7 100644 --- a/integration/tests/access/access_circuit_breaker_test.go +++ b/integration/tests/access/access_circuit_breaker_test.go @@ -38,8 +38,8 @@ type AccessCircuitBreakerSuite struct { net *testnet.FlowNetwork } -var requestTimeout = 1500 * time.Millisecond -var cbRestoreTimeout = 3 * time.Second +var requestTimeout = 3 * time.Second +var cbRestoreTimeout = 6 * time.Second func (s *AccessCircuitBreakerSuite) TearDownTest() { s.log.Info().Msg("================> Start TearDownTest") From 731a26c6d47a1c5d79341b25f4b09c094ee06db2 Mon Sep 17 00:00:00 2001 From: Andrii Slisarchuk Date: Fri, 28 Jul 2023 01:30:45 +0300 Subject: [PATCH 56/56] Fixed remarks --- .../export_report.json | 6 +- engine/access/rpc/backend/backend_scripts.go | 4 +- .../access/rpc/connection/connection_test.go | 70 +++---------------- integration/testnet/client.go | 24 +++---- 4 files changed, 27 insertions(+), 77 deletions(-) diff --git a/cmd/util/cmd/execution-state-extract/export_report.json b/cmd/util/cmd/execution-state-extract/export_report.json index bf70c636e61..f33cbf40cb9 100644 --- a/cmd/util/cmd/execution-state-extract/export_report.json +++ b/cmd/util/cmd/execution-state-extract/export_report.json @@ -1,6 +1,6 @@ { "EpochCounter": 0, - "PreviousStateCommitment": "524d1b0bb8c8826b42298a04c327b34f113628c617cf396e4fcd1405bdf7c70f", - 
"CurrentStateCommitment": "524d1b0bb8c8826b42298a04c327b34f113628c617cf396e4fcd1405bdf7c70f", + "PreviousStateCommitment": "1c9f9d343cb8d4610e0b2c1eb74d6ea2f2f8aef2d666281dc22870e3efaa607b", + "CurrentStateCommitment": "1c9f9d343cb8d4610e0b2c1eb74d6ea2f2f8aef2d666281dc22870e3efaa607b", "ReportSucceeded": true -} \ No newline at end of file +} diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go index 3651450b472..62d32c56211 100644 --- a/engine/access/rpc/backend/backend_scripts.go +++ b/engine/access/rpc/backend/backend_scripts.go @@ -104,8 +104,8 @@ func (b *backendScripts) executeScriptOnExecutor( // *DO NOT* use this hash for any protocol-related or cryptographic functions. insecureScriptHash := md5.Sum(script) //nolint:gosec - // try execution on Archive nodes if there is no execution nodes found - if len(executors) == 0 && len(b.archiveAddressList) > 0 { + // try execution on Archive nodes + if len(b.archiveAddressList) > 0 { startTime := time.Now() for idx, rnAddr := range b.archiveAddressList { rnPort := b.archivePorts[idx] diff --git a/engine/access/rpc/connection/connection_test.go b/engine/access/rpc/connection/connection_test.go index cbfea58c3d6..a961816605e 100644 --- a/engine/access/rpc/connection/connection_test.go +++ b/engine/access/rpc/connection/connection_test.go @@ -52,12 +52,7 @@ func TestProxyAccessAPI(t *testing.T) { unittest.Logger(), connectionFactory.AccessMetrics, 0, - CircuitBreakerConfig{ - Enabled: false, - MaxFailures: 0, - MaxRequests: 0, - RestoreTimeout: 0, - }, + CircuitBreakerConfig{}, ) proxyConnectionFactory := ProxyConnectionFactory{ @@ -99,12 +94,7 @@ func TestProxyExecutionAPI(t *testing.T) { unittest.Logger(), connectionFactory.AccessMetrics, 0, - CircuitBreakerConfig{ - Enabled: false, - MaxFailures: 0, - MaxRequests: 0, - RestoreTimeout: 0, - }, + CircuitBreakerConfig{}, ) proxyConnectionFactory := ProxyConnectionFactory{ @@ -151,12 +141,7 @@ func 
TestProxyAccessAPIConnectionReuse(t *testing.T) { unittest.Logger(), connectionFactory.AccessMetrics, 0, - CircuitBreakerConfig{ - Enabled: false, - MaxFailures: 0, - MaxRequests: 0, - RestoreTimeout: 0, - }, + CircuitBreakerConfig{}, ) proxyConnectionFactory := ProxyConnectionFactory{ @@ -210,12 +195,7 @@ func TestProxyExecutionAPIConnectionReuse(t *testing.T) { unittest.Logger(), connectionFactory.AccessMetrics, 0, - CircuitBreakerConfig{ - Enabled: false, - MaxFailures: 0, - MaxRequests: 0, - RestoreTimeout: 0, - }, + CircuitBreakerConfig{}, ) proxyConnectionFactory := ProxyConnectionFactory{ @@ -276,12 +256,7 @@ func TestExecutionNodeClientTimeout(t *testing.T) { unittest.Logger(), connectionFactory.AccessMetrics, 0, - CircuitBreakerConfig{ - Enabled: false, - MaxFailures: 0, - MaxRequests: 0, - RestoreTimeout: 0, - }, + CircuitBreakerConfig{}, ) // create the execution API client @@ -330,12 +305,7 @@ func TestCollectionNodeClientTimeout(t *testing.T) { unittest.Logger(), connectionFactory.AccessMetrics, 0, - CircuitBreakerConfig{ - Enabled: false, - MaxFailures: 0, - MaxRequests: 0, - RestoreTimeout: 0, - }, + CircuitBreakerConfig{}, ) // create the collection API client @@ -384,12 +354,7 @@ func TestConnectionPoolFull(t *testing.T) { unittest.Logger(), connectionFactory.AccessMetrics, 0, - CircuitBreakerConfig{ - Enabled: false, - MaxFailures: 0, - MaxRequests: 0, - RestoreTimeout: 0, - }, + CircuitBreakerConfig{}, ) cn1Address := "foo1:123" @@ -465,12 +430,7 @@ func TestConnectionPoolStale(t *testing.T) { unittest.Logger(), connectionFactory.AccessMetrics, 0, - CircuitBreakerConfig{ - Enabled: false, - MaxFailures: 0, - MaxRequests: 0, - RestoreTimeout: 0, - }, + CircuitBreakerConfig{}, ) proxyConnectionFactory := ProxyConnectionFactory{ @@ -559,12 +519,7 @@ func TestExecutionNodeClientClosedGracefully(t *testing.T) { unittest.Logger(), connectionFactory.AccessMetrics, 0, - CircuitBreakerConfig{ - Enabled: false, - MaxFailures: 0, - MaxRequests: 0, - 
RestoreTimeout: 0, - }, + CircuitBreakerConfig{}, ) clientAddress := en.listener.Addr().String() @@ -648,12 +603,7 @@ func TestExecutionEvictingCacheClients(t *testing.T) { unittest.Logger(), connectionFactory.AccessMetrics, 0, - CircuitBreakerConfig{ - Enabled: false, - MaxFailures: 0, - MaxRequests: 0, - RestoreTimeout: 0, - }, + CircuitBreakerConfig{}, ) clientAddress := cn.listener.Addr().String() diff --git a/integration/testnet/client.go b/integration/testnet/client.go index 941c83a8fbd..51026702085 100644 --- a/integration/testnet/client.go +++ b/integration/testnet/client.go @@ -79,18 +79,18 @@ func NewClient(addr string, chain flow.Chain) (*Client, error) { } // Uncomment for debugging keys - json, err := key.MarshalJSON() - if err != nil { - return nil, fmt.Errorf("cannot marshal key json: %w", err) - } - public := key.PublicKey(1000) - publicJson, err := public.MarshalJSON() - if err != nil { - return nil, fmt.Errorf("cannot marshal key json: %w", err) - } - - fmt.Printf("New client with private key: \n%s\n", json) - fmt.Printf("and public key: \n%s\n", publicJson) + //json, err := key.MarshalJSON() + //if err != nil { + // return nil, fmt.Errorf("cannot marshal key json: %w", err) + //} + //public := key.PublicKey(1000) + //publicJson, err := public.MarshalJSON() + //if err != nil { + // return nil, fmt.Errorf("cannot marshal key json: %w", err) + //} + // + //fmt.Printf("New client with private key: \n%s\n", json) + //fmt.Printf("and public key: \n%s\n", publicJson) return NewClientWithKey(addr, sdk.Address(chain.ServiceAddress()), privateKey, chain) }