From 539e0535fc7815de9f2e3acf574179e8d27ea511 Mon Sep 17 00:00:00 2001 From: oleg-ssvlabs Date: Tue, 3 Dec 2024 10:47:36 +0100 Subject: [PATCH] Observability - remove Node status metric, refactor --- cli/operator/node.go | 14 ++---- monitoring/metrics/handler.go | 46 ++++++------------- monitoring/metrics/reporter.go | 11 ----- .../metricsreporter/metrics_reporter.go | 19 -------- .../metricsreporter/nop_metrics_reporter.go | 2 - operator/metrics.go | 17 ------- operator/node.go | 34 ++++---------- 7 files changed, 28 insertions(+), 115 deletions(-) delete mode 100644 monitoring/metrics/reporter.go delete mode 100644 operator/metrics.go diff --git a/cli/operator/node.go b/cli/operator/node.go index fa7f8b2326..226d25c629 100644 --- a/cli/operator/node.go +++ b/cli/operator/node.go @@ -99,8 +99,6 @@ var cfg config var globalArgs global_config.Args -var operatorNode operator.Node - // StartNodeCmd is the command to start SSV node var StartNodeCmd = &cobra.Command{ Use: "start-node", @@ -262,7 +260,6 @@ var StartNodeCmd = &cobra.Command{ networkConfig, genesisvalidation.WithNodeStorage(nodeStorage), genesisvalidation.WithLogger(logger), - genesisvalidation.WithMetrics(metricsReporter), genesisvalidation.WithDutyStore(dutyStore), ), } @@ -343,7 +340,6 @@ var StartNodeCmd = &cobra.Command{ cfg.SSVOptions.ValidatorOptions.Graffiti = []byte(cfg.Graffiti) cfg.SSVOptions.ValidatorOptions.ValidatorStore = nodeStorage.ValidatorStore() cfg.SSVOptions.ValidatorOptions.OperatorSigner = types.NewSsvOperatorSigner(operatorPrivKey, operatorDataStore.GetOperatorID) - cfg.SSVOptions.Metrics = metricsReporter cfg.SSVOptions.ValidatorOptions.GenesisControllerOptions.StorageMap = genesisStorageMap cfg.SSVOptions.ValidatorOptions.GenesisControllerOptions.Network = &genesisP2pNetwork @@ -352,10 +348,10 @@ var StartNodeCmd = &cobra.Command{ cfg.SSVOptions.ValidatorController = validatorCtrl cfg.SSVOptions.ValidatorStore = validatorStore - operatorNode = operator.New(logger, cfg.SSVOptions, slotTickerProvider, storageMap) + operatorNode := operator.New(logger, cfg.SSVOptions, slotTickerProvider, storageMap) if cfg.MetricsAPIPort > 0 { - go startMetricsHandler(cmd.Context(), logger, db, metricsReporter, cfg.MetricsAPIPort, cfg.EnableProfile) + go startMetricsHandler(logger, db, cfg.MetricsAPIPort, cfg.EnableProfile, operatorNode) } nodeProber := nodeprobe.NewProber( @@ -377,8 +373,6 @@ var StartNodeCmd = &cobra.Command{ nodeProber.Wait() logger.Info("ethereum node(s) are healthy") - metricsReporter.SSVNodeHealthy() - eventSyncer := setupEventHandling( cmd.Context(), logger, @@ -811,10 +805,10 @@ func setupEventHandling( return eventSyncer } -func startMetricsHandler(ctx context.Context, logger *zap.Logger, db basedb.Database, metricsReporter metricsreporter.MetricsReporter, port int, enableProf bool) { +func startMetricsHandler(logger *zap.Logger, db basedb.Database, port int, enableProf bool, opNode *operator.Node) { logger = logger.Named(logging.NameMetricsHandler) // init and start HTTP handler - metricsHandler := metrics.NewMetricsHandler(ctx, db, metricsReporter, enableProf, operatorNode.(metrics.HealthChecker)) + metricsHandler := metrics.NewHandler(db, enableProf, opNode) addr := fmt.Sprintf(":%d", port) if err := metricsHandler.Start(logger, http.NewServeMux(), addr); err != nil { logger.Panic("failed to serve metrics", zap.Error(err)) diff --git a/monitoring/metrics/handler.go b/monitoring/metrics/handler.go index bc14911bf1..fbe867b76a 100644 --- a/monitoring/metrics/handler.go +++ b/monitoring/metrics/handler.go @@ -1,7 +1,6 @@ package metrics import ( - "context" "encoding/hex" "encoding/json" "fmt" @@ -19,40 +18,27 @@ import ( "github.com/ssvlabs/ssv/storage/basedb" ) -// Handler handles incoming metrics requests -type Handler interface { - // Start starts an http server, listening to /metrics requests - Start(logger *zap.Logger, mux *http.ServeMux, addr string) error -} - -type metricsHandler struct { - ctx context.Context +type Handler struct { db basedb.Database - reporter nodeMetrics enableProf bool healthChecker HealthChecker } -// NewMetricsHandler returns a new metrics handler. -func NewMetricsHandler(ctx context.Context, db basedb.Database, reporter nodeMetrics, enableProf bool, healthChecker HealthChecker) Handler { - if reporter == nil { - reporter = nopMetrics{} - } - mh := metricsHandler{ - ctx: ctx, +// NewHandler returns a new metrics handler. +func NewHandler(db basedb.Database, enableProf bool, healthChecker HealthChecker) *Handler { + mh := Handler{ db: db, - reporter: reporter, enableProf: enableProf, healthChecker: healthChecker, } return &mh } -func (mh *metricsHandler) Start(logger *zap.Logger, mux *http.ServeMux, addr string) error { - logger.Info("setup collection", fields.Address(addr), zap.Bool("enableProf", mh.enableProf)) +func (h *Handler) Start(logger *zap.Logger, mux *http.ServeMux, addr string) error { + logger.Info("setup collection", fields.Address(addr), zap.Bool("enableProf", h.enableProf)) - if mh.enableProf { - mh.configureProfiling() + if h.enableProf { + h.configureProfiling() // adding pprof routes manually on an own HTTPMux to avoid lint issue: // `G108: Profiling endpoint is automatically exposed on /debug/pprof (gosec)` mux.HandleFunc("/debug/pprof/", http_pprof.Index) @@ -69,8 +55,8 @@ func (mh *metricsHandler) Start(logger *zap.Logger, mux *http.ServeMux, addr str EnableOpenMetrics: true, }, )) - mux.HandleFunc("/database/count-by-collection", mh.handleCountByCollection) - mux.HandleFunc("/health", mh.handleHealth) + mux.HandleFunc("/database/count-by-collection", h.handleCountByCollection) + mux.HandleFunc("/health", h.handleHealth) // Set a high timeout to allow for long-running pprof requests. const timeout = 600 * time.Second @@ -92,7 +78,7 @@ func (mh *metricsHandler) Start(logger *zap.Logger, mux *http.ServeMux, addr str // handleCountByCollection responds with the number of key in the database by collection. // Prefix can be a string or a 0x-prefixed hex string. // Empty prefix returns the total number of keys in the database. -func (mh *metricsHandler) handleCountByCollection(w http.ResponseWriter, r *http.Request) { +func (h *Handler) handleCountByCollection(w http.ResponseWriter, r *http.Request) { var response struct { Count int64 `json:"count"` } @@ -113,7 +99,7 @@ func (mh *metricsHandler) handleCountByCollection(w http.ResponseWriter, r *http } } - n, err := mh.db.CountPrefix(prefix) + n, err := h.db.CountPrefix(prefix) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return @@ -126,9 +112,8 @@ func (mh *metricsHandler) handleCountByCollection(w http.ResponseWriter, r *http } } -func (mh *metricsHandler) handleHealth(res http.ResponseWriter, req *http.Request) { - if err := mh.healthChecker.HealthCheck(); err != nil { - mh.reporter.SSVNodeNotHealthy() +func (h *Handler) handleHealth(res http.ResponseWriter, req *http.Request) { + if err := h.healthChecker.HealthCheck(); err != nil { result := map[string][]string{ "errors": {err.Error()}, } @@ -138,14 +123,13 @@ func (mh *metricsHandler) handleHealth(res http.ResponseWriter, req *http.Reques http.Error(res, string(raw), http.StatusInternalServerError) } } else { - mh.reporter.SSVNodeHealthy() if _, err := fmt.Fprintln(res, ""); err != nil { http.Error(res, err.Error(), http.StatusInternalServerError) } } } -func (mh *metricsHandler) configureProfiling() { +func (h *Handler) configureProfiling() { runtime.SetBlockProfileRate(10000) runtime.SetMutexProfileFraction(5) } diff --git a/monitoring/metrics/reporter.go b/monitoring/metrics/reporter.go deleted file mode 100644 index 2cd48a4fed..0000000000 --- a/monitoring/metrics/reporter.go +++ /dev/null @@ -1,11 +0,0 @@ -package metrics - -type nodeMetrics interface { - SSVNodeHealthy() - SSVNodeNotHealthy() -} - -type nopMetrics struct{} - -func (n nopMetrics) SSVNodeHealthy() {} -func (n nopMetrics) SSVNodeNotHealthy() {} diff --git a/monitoring/metricsreporter/metrics_reporter.go b/monitoring/metricsreporter/metrics_reporter.go index fb96344c01..4e4fc065f6 100644 --- a/monitoring/metricsreporter/metrics_reporter.go +++ b/monitoring/metricsreporter/metrics_reporter.go @@ -15,16 +15,7 @@ import ( ssvmessage "github.com/ssvlabs/ssv/protocol/v2/message" ) -const ( - ssvNodeNotHealthy = float64(0) - ssvNodeHealthy = float64(1) -) - var ( - ssvNodeStatus = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "ssv_node_status", - Help: "Status of the operator node", - }) operatorIndex = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "ssv:exporter:operator_index", Help: "operator footprint", @@ -78,8 +69,6 @@ var ( ) type MetricsReporter interface { - SSVNodeHealthy() - SSVNodeNotHealthy() OperatorPublicKey(operatorID spectypes.OperatorID, publicKey []byte) MessageValidationRSAVerifications() SignatureValidationDuration(duration time.Duration, labels ...string) @@ -112,14 +101,6 @@ func New(opts ...Option) MetricsReporter { return &metricsReporter{} } -func (m *metricsReporter) SSVNodeHealthy() { - ssvNodeStatus.Set(ssvNodeHealthy) -} - -func (m *metricsReporter) SSVNodeNotHealthy() { - ssvNodeStatus.Set(ssvNodeNotHealthy) -} - func (m *metricsReporter) OperatorPublicKey(operatorID spectypes.OperatorID, publicKey []byte) { pkHash := fmt.Sprintf("%x", sha256.Sum256(publicKey)) operatorIndex.WithLabelValues(pkHash, strconv.FormatUint(operatorID, 10)).Set(float64(operatorID)) diff --git a/monitoring/metricsreporter/nop_metrics_reporter.go b/monitoring/metricsreporter/nop_metrics_reporter.go index ee476e3f95..eb4f57f593 100644 --- a/monitoring/metricsreporter/nop_metrics_reporter.go +++ b/monitoring/metricsreporter/nop_metrics_reporter.go @@ -15,8 +15,6 @@ func NewNop() MetricsReporter { return &nopMetrics{} } -func (n *nopMetrics) SSVNodeHealthy() {} -func (n *nopMetrics) SSVNodeNotHealthy() {} func (n *nopMetrics) OperatorPublicKey(operatorID spectypes.OperatorID, publicKey []byte) {} func (n *nopMetrics) MessageValidationRSAVerifications() {} func (n *nopMetrics) GenesisMessageAccepted(role genesisspectypes.BeaconRole, round genesisspecqbft.Round) { diff --git a/operator/metrics.go b/operator/metrics.go deleted file mode 100644 index 4ccd086b4c..0000000000 --- a/operator/metrics.go +++ /dev/null @@ -1,17 +0,0 @@ -package operator - -import ( - spectypes "github.com/ssvlabs/ssv-spec/types" -) - -type nodeMetrics interface { - SSVNodeHealthy() - SSVNodeNotHealthy() - OperatorPublicKey(spectypes.OperatorID, []byte) -} - -type nopMetrics struct{} - -func (n nopMetrics) SSVNodeHealthy() {} -func (n nopMetrics) SSVNodeNotHealthy() {} -func (n nopMetrics) OperatorPublicKey(spectypes.OperatorID, []byte) {} diff --git a/operator/node.go b/operator/node.go index 70823a0bbd..1805e2fd1e 100644 --- a/operator/node.go +++ b/operator/node.go @@ -24,11 +24,6 @@ import ( "github.com/ssvlabs/ssv/storage/basedb" ) -// Node represents the behavior of SSV node -type Node interface { - Start(logger *zap.Logger) error -} - // Options contains options to create the node type Options struct { // NetworkName is the network name of this node @@ -46,11 +41,9 @@ type Options struct { DutyStore *dutystore.Store WS api.WebSocketServer WsAPIPort int - Metrics nodeMetrics } -// operatorNode implements Node interface -type operatorNode struct { +type Node struct { network networkconfig.NetworkConfig context context.Context validatorsCtrl validator.Controller @@ -65,13 +58,11 @@ type operatorNode struct { ws api.WebSocketServer wsAPIPort int - - metrics nodeMetrics } -// New is the constructor of operatorNode -func New(logger *zap.Logger, opts Options, slotTickerProvider slotticker.Provider, qbftStorage *qbftstorage.QBFTStores) Node { - node := &operatorNode{ +// New is the constructor of Node +func New(logger *zap.Logger, opts Options, slotTickerProvider slotticker.Provider, qbftStorage *qbftstorage.QBFTStores) *Node { + node := &Node{ context: opts.Context, validatorsCtrl: opts.ValidatorController, validatorOptions: opts.ValidatorOptions, @@ -107,19 +98,13 @@ func New(logger *zap.Logger, opts Options, slotTickerProvider slotticker.Provide ws: opts.WS, wsAPIPort: opts.WsAPIPort, - - metrics: opts.Metrics, - } - - if node.metrics == nil { - node.metrics = nopMetrics{} } return node } // Start starts to stream duties and run IBFT instances -func (n *operatorNode) Start(logger *zap.Logger) error { +func (n *Node) Start(logger *zap.Logger) error { logger.Named(logging.NameOperator) logger.Info("All required services are ready. OPERATOR SUCCESSFULLY CONFIGURED AND NOW RUNNING!") @@ -164,7 +149,7 @@ func (n *operatorNode) Start(logger *zap.Logger) error { } // HealthCheck returns a list of issues regards the state of the operator node -func (n *operatorNode) HealthCheck() error { +func (n *Node) HealthCheck() error { // TODO: previously this checked availability of consensus & execution clients. // However, currently the node crashes when those clients are down, // so this health check is currently a positive no-op. @@ -172,7 +157,7 @@ func (n *operatorNode) HealthCheck() error { } // handleQueryRequests waits for incoming messages and -func (n *operatorNode) handleQueryRequests(logger *zap.Logger, nm *api.NetworkMessage) { +func (n *Node) handleQueryRequests(logger *zap.Logger, nm *api.NetworkMessage) { if nm.Err != nil { nm.Msg = api.Message{Type: api.TypeError, Data: []string{"could not parse network message"}} } @@ -188,7 +173,7 @@ func (n *operatorNode) handleQueryRequests(logger *zap.Logger, nm *api.NetworkMe } } -func (n *operatorNode) startWSServer(logger *zap.Logger) error { +func (n *Node) startWSServer(logger *zap.Logger) error { if n.ws != nil { logger.Info("starting WS server") @@ -202,7 +187,7 @@ func (n *operatorNode) startWSServer(logger *zap.Logger) error { return nil } -func (n *operatorNode) reportOperators(logger *zap.Logger) { +func (n *Node) reportOperators(logger *zap.Logger) { operators, err := n.storage.ListOperators(nil, 0, 1000) // TODO more than 1000? if err != nil { logger.Warn("failed to get all operators for reporting", zap.Error(err)) @@ -210,7 +195,6 @@ func (n *operatorNode) reportOperators(logger *zap.Logger) { } logger.Debug("reporting operators", zap.Int("count", len(operators))) for i := range operators { - n.metrics.OperatorPublicKey(operators[i].ID, operators[i].PublicKey) logger.Debug("report operator public key", fields.OperatorID(operators[i].ID), fields.PubKey(operators[i].PublicKey))