diff --git a/common/metrics/config.go b/common/metrics/config.go new file mode 100644 index 000000000..1bc8c0e4d --- /dev/null +++ b/common/metrics/config.go @@ -0,0 +1,10 @@ +package metrics + +// Config provides configuration for a Metrics instance. +type Config struct { + // Namespace is the namespace for the metrics. + Namespace string + + // HTTPPort is the port to serve metrics on. + HTTPPort int +} diff --git a/common/metrics/count_metric.go b/common/metrics/count_metric.go new file mode 100644 index 000000000..f52b942cb --- /dev/null +++ b/common/metrics/count_metric.go @@ -0,0 +1,101 @@ +package metrics + +import ( + "fmt" + "github.com/Layr-Labs/eigensdk-go/logging" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var _ CountMetric = &countMetric{} + +// countMetric a standard implementation of the CountMetric. +type countMetric struct { + Metric + + // logger is the logger used to log errors. + logger logging.Logger + + // name is the name of the metric. + name string + + // description is the description of the metric. + description string + + // counter is the prometheus counter used to report this metric. + vec *prometheus.CounterVec + + // labeler is the label maker used to create labels for this metric. + labeler *labelMaker +} + +// newCountMetric creates a new CountMetric instance. +func newCountMetric( + logger logging.Logger, + registry *prometheus.Registry, + namespace string, + name string, + description string, + labelTemplate any) (CountMetric, error) { + + labeler, err := newLabelMaker(labelTemplate) + if err != nil { + return nil, err + } + + vec := promauto.With(registry).NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: fmt.Sprintf("%s_count", name), + }, + labeler.getKeys(), + ) + + return &countMetric{ + logger: logger, + name: name, + description: description, + vec: vec, + labeler: labeler, + }, nil +} + +func (m *countMetric) Name() string { + return m.name +} + +func (m *countMetric) Unit() string { + return "count" +} + +func (m *countMetric) Description() string { + return m.description +} + +func (m *countMetric) Type() string { + return "counter" +} + +func (m *countMetric) LabelFields() []string { + return m.labeler.getKeys() +} + +func (m *countMetric) Increment(label ...any) { + m.Add(1, label...) +} + +func (m *countMetric) Add(value float64, label ...any) { + var l any + if len(label) > 0 { + l = label[0] + } + + values, err := m.labeler.extractValues(l) + if err != nil { + m.logger.Errorf("error extracting values from label for metric %s: %v", m.name, err) + return + } + + observer := m.vec.WithLabelValues(values...) + observer.Add(value) +} diff --git a/common/metrics/gauge_metric.go b/common/metrics/gauge_metric.go new file mode 100644 index 000000000..ca3c80ae7 --- /dev/null +++ b/common/metrics/gauge_metric.go @@ -0,0 +1,103 @@ +package metrics + +import ( + "fmt" + "github.com/Layr-Labs/eigensdk-go/logging" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var _ GaugeMetric = &gaugeMetric{} + +// gaugeMetric is a standard implementation of the GaugeMetric interface via prometheus. +type gaugeMetric struct { + Metric + + // logger is the logger used to log errors. + logger logging.Logger + + // name is the name of the metric. + name string + + // unit is the unit of the metric. + unit string + + // description is the description of the metric. + description string + + // gauge is the prometheus gauge used to report this metric. + vec *prometheus.GaugeVec + + // labeler is the label maker used to create labels for this metric. + labeler *labelMaker +} + +// newGaugeMetric creates a new GaugeMetric instance. +func newGaugeMetric( + logger logging.Logger, + registry *prometheus.Registry, + namespace string, + name string, + unit string, + description string, + labelTemplate any) (GaugeMetric, error) { + + labeler, err := newLabelMaker(labelTemplate) + if err != nil { + return nil, err + } + + vec := promauto.With(registry).NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: fmt.Sprintf("%s_%s", name, unit), + }, + labeler.getKeys(), + ) + + return &gaugeMetric{ + logger: logger, + name: name, + unit: unit, + description: description, + vec: vec, + labeler: labeler, + }, nil +} + +func (m *gaugeMetric) Name() string { + return m.name +} + +func (m *gaugeMetric) Unit() string { + return m.unit +} + +func (m *gaugeMetric) Description() string { + return m.description +} + +func (m *gaugeMetric) Type() string { + return "gauge" +} + +func (m *gaugeMetric) LabelFields() []string { + return m.labeler.getKeys() +} + +func (m *gaugeMetric) Set(value float64, label ...any) { + var l any + if len(label) > 0 { + l = label[0] + } + + values, err := m.labeler.extractValues(l) + if err != nil { + m.logger.Errorf("failed to extract values from label: %v", err) + return + } + + observer := m.vec.WithLabelValues(values...) + + observer.Set(value) +} diff --git a/common/metrics/label_maker.go b/common/metrics/label_maker.go new file mode 100644 index 000000000..cb56ecdfc --- /dev/null +++ b/common/metrics/label_maker.go @@ -0,0 +1,73 @@ +package metrics + +import ( + "fmt" + "reflect" +) + +// labelMaker encapsulates logic for creating labels for metrics. +type labelMaker struct { + keys []string + emptyValues []string + templateType reflect.Type + labelCount int +} + +// newLabelMaker creates a new labelMaker instance given a label template. The label template may be nil. +func newLabelMaker(labelTemplate any) (*labelMaker, error) { + labeler := &labelMaker{ + keys: make([]string, 0), + } + + if labelTemplate == nil { + return labeler, nil + } + + v := reflect.ValueOf(labelTemplate) + if v.Kind() != reflect.Struct { + return nil, fmt.Errorf("label template must be a struct") + } + + t := v.Type() + labeler.templateType = t + for i := 0; i < t.NumField(); i++ { + + fieldType := t.Field(i).Type + if fieldType.Kind() != reflect.String { + return nil, fmt.Errorf( + "field %s has type %v, only string fields are supported", t.Field(i).Name, fieldType) + } + + labeler.keys = append(labeler.keys, t.Field(i).Name) + } + + labeler.emptyValues = make([]string, len(labeler.keys)) + labeler.labelCount = len(labeler.keys) + + return labeler, nil +} + +// getKeys provides the keys for the label struct. +func (l *labelMaker) getKeys() []string { + return l.keys +} + +// extractValues extracts the values from the given label struct. +func (l *labelMaker) extractValues(label any) ([]string, error) { + if l.templateType == nil || label == nil { + return l.emptyValues, nil + } + + if l.templateType != reflect.TypeOf(label) { + return nil, fmt.Errorf( + "label type mismatch, expected %v, got %v", l.templateType, reflect.TypeOf(label)) + } + + values := make([]string, 0, l.labelCount) + for i := 0; i < l.labelCount; i++ { + v := reflect.ValueOf(label) + values = append(values, v.Field(i).String()) + } + + return values, nil +} diff --git a/common/metrics/latency_metric.go b/common/metrics/latency_metric.go new file mode 100644 index 000000000..1309a9b33 --- /dev/null +++ b/common/metrics/latency_metric.go @@ -0,0 +1,102 @@ +package metrics + +import ( + "fmt" + "github.com/Layr-Labs/eigensdk-go/logging" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "time" +) + +var _ LatencyMetric = &latencyMetric{} + +// latencyMetric is a standard implementation of the LatencyMetric interface via prometheus. +type latencyMetric struct { + Metric + + // logger is the logger used to log errors. + logger logging.Logger + + // name is the name of the metric. + name string + + // description is the description of the metric. + description string + + // vec is the prometheus summary vector used to report this metric. + vec *prometheus.SummaryVec + + // lm is the label maker used to create labels for this metric. + labeler *labelMaker +} + +// newLatencyMetric creates a new LatencyMetric instance. +func newLatencyMetric( + logger logging.Logger, + registry *prometheus.Registry, + namespace string, + name string, + description string, + objectives map[float64]float64, + labelTemplate any) (LatencyMetric, error) { + + labeler, err := newLabelMaker(labelTemplate) + if err != nil { + return nil, err + } + + vec := promauto.With(registry).NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: namespace, + Name: fmt.Sprintf("%s_ms", name), + Objectives: objectives, + }, + labeler.getKeys(), + ) + + return &latencyMetric{ + logger: logger, + name: name, + description: description, + vec: vec, + labeler: labeler, + }, nil +} + +func (m *latencyMetric) Name() string { + return m.name +} + +func (m *latencyMetric) Unit() string { + return "ms" +} + +func (m *latencyMetric) Description() string { + return m.description +} + +func (m *latencyMetric) Type() string { + return "latency" +} + +func (m *latencyMetric) LabelFields() []string { + return m.labeler.getKeys() +} + +func (m *latencyMetric) ReportLatency(latency time.Duration, label ...any) { + var l any + if len(label) > 0 { + l = label[0] + } + + values, err := m.labeler.extractValues(l) + if err != nil { + m.logger.Errorf("error extracting values from label: %v", err) + } + + observer := m.vec.WithLabelValues(values...) + + nanoseconds := float64(latency.Nanoseconds()) + milliseconds := nanoseconds / float64(time.Millisecond) + observer.Observe(milliseconds) +} diff --git a/common/metrics/metrics.go b/common/metrics/metrics.go new file mode 100644 index 000000000..ed63a1c6a --- /dev/null +++ b/common/metrics/metrics.go @@ -0,0 +1,142 @@ +package metrics + +import "time" + +// Metrics provides a convenient interface for reporting metrics. +type Metrics interface { + // Start starts the metrics server. + Start() error + + // Stop stops the metrics server. + Stop() error + + // GenerateMetricsDocumentation generates documentation for all currently registered metrics. + // Documentation is returned as a string in markdown format. + GenerateMetricsDocumentation() string + + // WriteMetricsDocumentation writes documentation for all currently registered metrics to a file. + // Documentation is written in markdown format. + WriteMetricsDocumentation(fileName string) error + + // NewLatencyMetric creates a new LatencyMetric instance. Useful for reporting the latency of an operation. + // Metric name and label may only contain alphanumeric characters and underscores. + // + // The labelTemplate parameter is the label type that will be used for this metric. Each field becomes a label for + // the metric. Each field type must be a string. If no labels are needed, pass nil. + NewLatencyMetric( + name string, + description string, + labelTemplate any, + quantiles ...*Quantile) (LatencyMetric, error) + + // NewCountMetric creates a new CountMetric instance. Useful for tracking the count of a type of event. + // Metric name and label may only contain alphanumeric characters and underscores. + // + // The labelTemplate parameter is the label type that will be used for this metric. Each field becomes a label for + // the metric. Each field type must be a string. If no labels are needed, pass nil. + NewCountMetric( + name string, + description string, + labelTemplate any) (CountMetric, error) + + // NewGaugeMetric creates a new GaugeMetric instance. Useful for reporting specific values. + // Metric name and label may only contain alphanumeric characters and underscores. + // + // The labelTemplate parameter is the label type that will be used for this metric. Each field becomes a label for + // the metric. Each field type must be a string. If no labels are needed, pass nil. + NewGaugeMetric( + name string, + unit string, + description string, + labelTemplate any) (GaugeMetric, error) + + // NewAutoGauge creates a new GaugeMetric instance that is automatically updated by the given source function. + // The function is polled at the given period. This produces a gauge type metric internally. + // Metric name and label may only contain alphanumeric characters and underscores. + // + // The label parameter accepts zero or one label. + NewAutoGauge( + name string, + unit string, + description string, + pollPeriod time.Duration, + source func() float64, + label ...any) error +} + +// Metric represents a metric that can be reported. +type Metric interface { + + // Name returns the name of the metric. + Name() string + + // Unit returns the unit of the metric. + Unit() string + + // Description returns the description of the metric. Should be a one or two sentence human-readable description. + Description() string + + // Type returns the type of the metric. + Type() string + + // LabelFields returns the fields of the label template. + LabelFields() []string +} + +// GaugeMetric allows specific values to be reported. +type GaugeMetric interface { + Metric + + // Set sets the value of a gauge metric. + // + // The label parameter accepts zero or one label. If the label type does not match the template label type provided + // when creating the metric, an error will be returned. + Set(value float64, label ...any) +} + +// CountMetric allows the count of a type of event to be tracked. +type CountMetric interface { + Metric + + // Increment increments the count by 1. + // + // The label parameter accepts zero or one label. If the label type does not match the template label type provided + // when creating the metric, an error will be returned. + Increment(label ...any) + + // Add increments the count by the given value. + // + // The label parameter accepts zero or one label. If the label type does not match the template label type provided + // when creating the metric, an error will be returned. + Add(value float64, label ...any) +} + +// Quantile describes a quantile of a latency metric that should be reported. For a description of how +// to interpret a quantile, see the prometheus documentation +// https://github.com/prometheus/client_golang/blob/v1.20.5/prometheus/summary.go#L126 +type Quantile struct { + Quantile float64 + Error float64 +} + +// NewQuantile creates a new Quantile instance. Error is set to 1% of the quantile. +func NewQuantile(quantile float64) *Quantile { + return &Quantile{ + Quantile: quantile, + Error: quantile / 100.0, + } +} + +// LatencyMetric allows the latency of an operation to be tracked. Similar to a gauge metric, but specialized for time. +// +// The label parameter accepts zero or one label. If the label type does not match the template label type provided +// when creating the metric, an error will be returned. +type LatencyMetric interface { + Metric + + // ReportLatency reports a latency value. + // + // The label parameter accepts zero or one label. If the label type does not match the template label type provided + // when creating the metric, an error will be returned. + ReportLatency(latency time.Duration, label ...any) +} diff --git a/common/metrics/metrics_server.go b/common/metrics/metrics_server.go new file mode 100644 index 000000000..f18fd02ac --- /dev/null +++ b/common/metrics/metrics_server.go @@ -0,0 +1,430 @@ +package metrics + +import ( + "errors" + "fmt" + "github.com/Layr-Labs/eigensdk-go/logging" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" + "github.com/prometheus/client_golang/prometheus/promhttp" + "net/http" + "os" + "regexp" + "slices" + "strings" + "sync" + "sync/atomic" + "time" +) + +var _ Metrics = &metrics{} + +// metrics is a standard implementation of the Metrics interface via prometheus. +type metrics struct { + // logger is the logger used to log messages. + logger logging.Logger + + // config is the configuration for the metrics. + config *Config + + // registry is the prometheus registry used to report metrics. + registry *prometheus.Registry + + // A map from metricID to Metric instance. If a metric is requested but that metric + // already exists, the existing metric will be returned instead of a new one being created. + metricMap map[metricID]Metric + + // autoGaugesToStart is a list of functions that will start auto-gauges. If an auto-gauge is created + // before the metrics server is started, we don't actually start the goroutine until the server is started. + autoGaugesToStart []func() + + // lock is a lock used to ensure that metrics are not created concurrently. + lock sync.Mutex + + // started is true if the metrics server has been started. + started bool + + // isAlize is true if the metrics server has not been stopped. + isAlive atomic.Bool + + // server is the metrics server + server *http.Server + + // quantilesMap contains a string describing the quantiles for each latency metric. Used to generate documentation. + quantilesMap map[metricID]string +} + +// NewMetrics creates a new Metrics instance. +func NewMetrics(logger logging.Logger, config *Config) Metrics { + reg := prometheus.NewRegistry() + reg.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) + reg.MustRegister(collectors.NewGoCollector()) + + logger.Infof("Starting metrics server at port %d", config.HTTPPort) + addr := fmt.Sprintf(":%d", config.HTTPPort) + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor( + reg, + promhttp.HandlerOpts{}, + )) + server := &http.Server{ + Addr: addr, + Handler: mux, + } + + m := &metrics{ + logger: logger, + config: config, + registry: reg, + metricMap: make(map[metricID]Metric), + isAlive: atomic.Bool{}, + server: server, + quantilesMap: make(map[metricID]string), + } + m.isAlive.Store(true) + return m +} + +// metricID is a unique identifier for a metric. +type metricID struct { + name string + unit string +} + +var legalCharactersRegex = regexp.MustCompile(`^[a-zA-Z0-9_]+$`) + +// containsLegalCharacters returns true if the string contains only legal characters (alphanumeric and underscore). +func containsLegalCharacters(s string) bool { + return legalCharactersRegex.MatchString(s) +} + +// newMetricID creates a new metricID instance. +func newMetricID(name string, unit string) (metricID, error) { + if !containsLegalCharacters(name) { + return metricID{}, fmt.Errorf("invalid metric name: %s", name) + } + if !containsLegalCharacters(unit) { + return metricID{}, fmt.Errorf("invalid metric unit: %s", unit) + } + return metricID{ + name: name, + unit: unit, + }, nil +} + +// NameWithUnit returns the name of the metric with the unit appended. +func (i *metricID) NameWithUnit() string { + return fmt.Sprintf("%s_%s", i.name, i.unit) +} + +// Start starts the metrics server. +func (m *metrics) Start() error { + m.lock.Lock() + defer m.lock.Unlock() + + if m.started { + return errors.New("metrics server already started") + } + m.started = true + + go func() { + err := m.server.ListenAndServe() + if err != nil && !strings.Contains(err.Error(), "http: Server closed") { + m.logger.Errorf("metrics server error: %v", err) + } + }() + + // start the auto-gauges that were created before the server was started + for _, autoGauge := range m.autoGaugesToStart { + go autoGauge() + } + + return nil +} + +// Stop stops the metrics server. +func (m *metrics) Stop() error { + m.lock.Lock() + defer m.lock.Unlock() + + if !m.started { + return errors.New("metrics server not started") + } + + if !m.isAlive.Load() { + return errors.New("metrics server already stopped") + } + + m.isAlive.Store(false) + return m.server.Close() +} + +// NewLatencyMetric creates a new LatencyMetric instance. +func (m *metrics) NewLatencyMetric( + name string, + description string, + labelTemplate any, + quantiles ...*Quantile) (LatencyMetric, error) { + + m.lock.Lock() + defer m.lock.Unlock() + + if !m.isAlive.Load() { + return nil, errors.New("metrics server is not alive") + } + + id, err := newMetricID(name, "ms") + if err != nil { + return nil, err + } + + preExistingMetric, ok := m.metricMap[id] + if ok { + return preExistingMetric.(LatencyMetric), nil + } + + quantilesString := "" + objectives := make(map[float64]float64, len(quantiles)) + for i, q := range quantiles { + objectives[q.Quantile] = q.Error + + quantilesString += fmt.Sprintf("`%.3f`", q.Quantile) + if i < len(quantiles)-1 { + quantilesString += ", " + } + } + m.quantilesMap[id] = quantilesString + + metric, err := newLatencyMetric( + m.logger, + m.registry, + m.config.Namespace, + name, + description, + objectives, + labelTemplate) + + if err != nil { + return nil, err + } + + m.metricMap[id] = metric + return metric, nil +} + +// NewCountMetric creates a new CountMetric instance. +func (m *metrics) NewCountMetric( + name string, + description string, + labelTemplate any) (CountMetric, error) { + + m.lock.Lock() + defer m.lock.Unlock() + + if !m.isAlive.Load() { + return nil, errors.New("metrics server is not alive") + } + + id, err := newMetricID(name, "count") + if err != nil { + return nil, err + } + + preExistingMetric, ok := m.metricMap[id] + if ok { + return preExistingMetric.(CountMetric), nil + } + + metric, err := newCountMetric( + m.logger, + m.registry, + m.config.Namespace, + name, description, + labelTemplate) + + if err != nil { + return nil, err + } + + m.metricMap[id] = metric + + return metric, nil +} + +// NewGaugeMetric creates a new GaugeMetric instance. +func (m *metrics) NewGaugeMetric( + name string, + unit string, + description string, + labelTemplate any) (GaugeMetric, error) { + + m.lock.Lock() + defer m.lock.Unlock() + return m.newGaugeMetricUnsafe(name, unit, description, labelTemplate) +} + +// newGaugeMetricUnsafe creates a new GaugeMetric instance without locking. +func (m *metrics) newGaugeMetricUnsafe( + name string, + unit string, + description string, + labelTemplate any) (GaugeMetric, error) { + + if !m.isAlive.Load() { + return nil, errors.New("metrics server is not alive") + } + + id, err := newMetricID(name, unit) + if err != nil { + return nil, err + } + + preExistingMetric, ok := m.metricMap[id] + if ok { + return preExistingMetric.(GaugeMetric), nil + } + + metric, err := newGaugeMetric( + m.logger, + m.registry, + m.config.Namespace, + name, + unit, + description, + labelTemplate) + + if err != nil { + return nil, err + } + + m.metricMap[id] = metric + return metric, nil +} + +func (m *metrics) NewAutoGauge( + name string, + unit string, + description string, + pollPeriod time.Duration, + source func() float64, + label ...any) error { + + m.lock.Lock() + defer m.lock.Unlock() + + if !m.isAlive.Load() { + return errors.New("metrics server is not alive") + } + + if len(label) > 1 { + return fmt.Errorf("too many labels provided, expected 1, got %d", len(label)) + } + var l any + if len(label) == 1 { + l = label[0] + } + + gauge, err := m.newGaugeMetricUnsafe(name, unit, description, l) + if err != nil { + return err + } + + pollingAgent := func() { + ticker := time.NewTicker(pollPeriod) + for m.isAlive.Load() { + value := source() + + gauge.Set(value, l) + <-ticker.C + } + } + + if m.started { + // start the polling agent immediately + go pollingAgent() + } else { + // the polling agent will be started when the metrics server is started + m.autoGaugesToStart = append(m.autoGaugesToStart, pollingAgent) + } + + return nil +} + +func (m *metrics) GenerateMetricsDocumentation() string { + sb := &strings.Builder{} + + metricIDs := make([]*metricID, 0, len(m.metricMap)) + for id := range m.metricMap { + boundID := id + metricIDs = append(metricIDs, &boundID) + } + + // sort the metric IDs alphabetically + sortFunc := func(a *metricID, b *metricID) int { + if a.name != b.name { + return strings.Compare(a.name, b.name) + } + return strings.Compare(a.unit, b.unit) + } + slices.SortFunc(metricIDs, sortFunc) + + sb.Write([]byte(fmt.Sprintf("# Metrics Documentation for namespace '%s'\n\n", m.config.Namespace))) + sb.Write([]byte(fmt.Sprintf("This documentation was automatically generated at time `%s`\n\n", + time.Now().Format(time.RFC3339)))) + + sb.Write([]byte(fmt.Sprintf("There are a total of `%d` registered metrics.\n\n", len(m.metricMap)))) + + for _, id := range metricIDs { + metric := m.metricMap[*id] + + sb.Write([]byte("---\n\n")) + sb.Write([]byte(fmt.Sprintf("## %s\n\n", id.NameWithUnit()))) + + sb.Write([]byte(fmt.Sprintf("%s\n\n", metric.Description()))) + + sb.Write([]byte("| | |\n")) + sb.Write([]byte("|---|---|\n")) + sb.Write([]byte(fmt.Sprintf("| **Name** | `%s` |\n", metric.Name()))) + sb.Write([]byte(fmt.Sprintf("| **Unit** | `%s` |\n", metric.Unit()))) + labels := metric.LabelFields() + if len(labels) > 0 { + sb.Write([]byte("| **Labels** | ")) + for i, label := range labels { + sb.Write([]byte(fmt.Sprintf("`%s`", label))) + if i < len(labels)-1 { + sb.Write([]byte(", ")) + } + } + sb.Write([]byte(" |\n")) + } + sb.Write([]byte(fmt.Sprintf("| **Type** | `%s` |\n", metric.Type()))) + if metric.Type() == "latency" { + sb.Write([]byte(fmt.Sprintf("| **Quantiles** | %s |\n", m.quantilesMap[*id]))) + } + sb.Write([]byte(fmt.Sprintf("| **Fully Qualified Name** | `%s_%s_%s` |\n", + m.config.Namespace, id.name, id.unit))) + } + + return sb.String() +} + +func (m *metrics) WriteMetricsDocumentation(fileName string) error { + doc := m.GenerateMetricsDocumentation() + + file, err := os.Create(fileName) + if err != nil { + return fmt.Errorf("error creating file: %v", err) + } + + _, err = file.Write([]byte(doc)) + if err != nil { + return fmt.Errorf("error writing to file: %v", err) + } + + err = file.Close() + if err != nil { + return fmt.Errorf("error closing file: %v", err) + } + + return nil +} diff --git a/common/metrics/mock_metrics.go b/common/metrics/mock_metrics.go new file mode 100644 index 000000000..695a2662e --- /dev/null +++ b/common/metrics/mock_metrics.go @@ -0,0 +1,154 @@ +package metrics + +import ( + "time" +) + +var _ Metrics = &mockMetrics{} + +// mockMetrics is a mock implementation of the Metrics interface. +type mockMetrics struct { +} + +// NewMockMetrics creates a new mock Metrics instance. +// Suitable for testing or for when you just want to disable all metrics. +func NewMockMetrics() Metrics { + return &mockMetrics{} +} + +func (m *mockMetrics) GenerateMetricsDocumentation() string { + return "" +} + +func (m *mockMetrics) WriteMetricsDocumentation(fileName string) error { + return nil +} + +func (m *mockMetrics) Start() error { + return nil +} + +func (m *mockMetrics) Stop() error { + return nil +} + +func (m *mockMetrics) NewLatencyMetric( + name string, + description string, + templateLabel any, + quantiles ...*Quantile) (LatencyMetric, error) { + return &mockLatencyMetric{}, nil +} + +func (m *mockMetrics) NewCountMetric(name string, description string, templateLabel any) (CountMetric, error) { + return &mockCountMetric{}, nil +} + +func (m *mockMetrics) NewGaugeMetric( + name string, + unit string, + description string, + labelTemplate any) (GaugeMetric, error) { + return &mockGaugeMetric{}, nil +} + +func (m *mockMetrics) NewAutoGauge( + name string, + unit string, + description string, + pollPeriod time.Duration, + source func() float64, + label ...any) error { + return nil +} + +var _ CountMetric = &mockCountMetric{} + +type mockCountMetric struct { +} + +func (m *mockCountMetric) Name() string { + return "" +} + +func (m *mockCountMetric) Unit() string { + return "" +} + +func (m *mockCountMetric) Description() string { + return "" +} + +func (m *mockCountMetric) Type() string { + return "" +} + +func (m *mockCountMetric) LabelFields() []string { + return make([]string, 0) +} + +func (m *mockCountMetric) Increment(label ...any) { + +} + +func (m *mockCountMetric) Add(value float64, label ...any) { + +} + +var _ GaugeMetric = &mockGaugeMetric{} + +type mockGaugeMetric struct { +} + +func (m *mockGaugeMetric) Name() string { + return "" +} + +func (m *mockGaugeMetric) Unit() string { + return "" +} + +func (m *mockGaugeMetric) Description() string { + return "" +} + +func (m *mockGaugeMetric) Type() string { + return "" +} + +func (m *mockGaugeMetric) LabelFields() []string { + return make([]string, 0) +} + +func (m *mockGaugeMetric) Set(value float64, label ...any) { + +} + +var _ LatencyMetric = &mockLatencyMetric{} + +type mockLatencyMetric struct { +} + +func (m *mockLatencyMetric) Name() string { + return "" +} + +func (m *mockLatencyMetric) Unit() string { + return "" +} + +func (m *mockLatencyMetric) Description() string { + return "" +} + +func (m *mockLatencyMetric) Type() string { + return "" +} + +func (m *mockLatencyMetric) LabelFields() []string { + return make([]string, 0) +} + +func (m *mockLatencyMetric) ReportLatency(latency time.Duration, label ...any) { + +} diff --git a/common/metrics/test/main.go b/common/metrics/test/main.go new file mode 100644 index 000000000..bdd1050b2 --- /dev/null +++ b/common/metrics/test/main.go @@ -0,0 +1,137 @@ +package main + +import ( + "fmt" + "github.com/Layr-Labs/eigenda/common" + "github.com/Layr-Labs/eigenda/common/metrics" + "math/rand" + "sync/atomic" + "time" +) + +// This is a simple test bed for validating the metrics server (since it's not straight forward to unit test). + +type LabelType1 struct { + foo string + bar string + baz string +} + +type LabelType2 struct { + X string + Y string + Z string +} + +func main() { + + metricsConfig := &metrics.Config{ + Namespace: "test", + HTTPPort: 9101, + } + + logger, err := common.NewLogger(common.DefaultLoggerConfig()) + if err != nil { + panic(err) + } + + metricsServer := metrics.NewMetrics(logger, metricsConfig) + + l1, err := metricsServer.NewLatencyMetric( + "l1", + "this metric shows the latency of the sleep cycle", + LabelType1{}, + metrics.NewQuantile(0.5), + metrics.NewQuantile(0.9), + metrics.NewQuantile(0.99)) + if err != nil { + panic(err) + } + + c1, err := metricsServer.NewCountMetric( + "c1", + "this metric shows the number of times the sleep cycle has been executed", + LabelType2{}) + if err != nil { + panic(err) + } + + c2, err := metricsServer.NewCountMetric( + "c2", + "the purpose of this counter is to test what happens if we don't provide a label template", + nil) + if err != nil { + panic(err) + } + + g1, err := metricsServer.NewGaugeMetric( + "g1", + "milliseconds", + "this metric shows the duration of the most recent sleep cycle", + LabelType1{}) + if err != nil { + panic(err) + } + + sum := atomic.Int64{} + err = metricsServer.NewAutoGauge( + "g2", + "milliseconds", + "this metric shows the sum of all sleep cycles", + 1*time.Second, + func() float64 { + return float64(sum.Load()) + }, + LabelType2{X: "sum"}) + if err != nil { + panic(err) + } + + err = metricsServer.WriteMetricsDocumentation("metrics.md") + if err != nil { + panic(err) + } + + err = metricsServer.Start() + if err != nil { + panic(err) + } + + prev := time.Now() + for i := 0; i < 100; i++ { + fmt.Printf("Iteration %d\n", i) + time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond) + now := time.Now() + elapsed := now.Sub(prev) + prev = now + + l1.ReportLatency(elapsed) + + l1.ReportLatency(elapsed/2, + LabelType1{ + foo: "half of the normal value", + bar: "42", + baz: "true", + }) + + c1.Increment() + c1.Add(2, LabelType2{ + X: "2x", + }) + c2.Increment() + + g1.Set(float64(elapsed.Milliseconds()), + LabelType1{ + foo: "bar", + bar: "baz", + baz: "foo", + }) + + sum.Store(sum.Load() + elapsed.Milliseconds()) + } + + err = metricsServer.Stop() + if err != nil { + panic(err) + } +} diff --git a/common/metrics/test/metrics.md b/common/metrics/test/metrics.md new file mode 100644 index 000000000..dfb98804a --- /dev/null +++ b/common/metrics/test/metrics.md @@ -0,0 +1,71 @@ +# Metrics Documentation for namespace 'test' + +This documentation was automatically generated at time `2024-11-25T12:46:49-06:00` + +There are a total of `5` registered metrics. + +--- + +## c1_count + +this metric shows the number of times the sleep cycle has been executed + +| | | +|---|---| +| **Name** | `c1` | +| **Unit** | `count` | +| **Labels** | `X`, `Y`, `Z` | +| **Type** | `counter` | +| **Fully Qualified Name** | `test_c1_count` | +--- + +## c2_count + +the purpose of this counter is to test what happens if we don't provide a label template + +| | | +|---|---| +| **Name** | `c2` | +| **Unit** | `count` | +| **Type** | `counter` | +| **Fully Qualified Name** | `test_c2_count` | +--- + +## g1_milliseconds + +this metric shows the duration of the most recent sleep cycle + +| | | +|---|---| +| **Name** | `g1` | +| **Unit** | `milliseconds` | +| **Labels** | `foo`, `bar`, `baz` | +| **Type** | `gauge` | +| **Fully Qualified Name** | `test_g1_milliseconds` | +--- + +## g2_milliseconds + +this metric shows the sum of all sleep cycles + +| | | +|---|---| +| **Name** | `g2` | +| **Unit** | `milliseconds` | +| **Labels** | `X`, `Y`, `Z` | +| **Type** | `gauge` | +| **Fully Qualified Name** | `test_g2_milliseconds` | +--- + +## l1_ms + +this metric shows the latency of the sleep cycle + +| | | +|---|---| +| **Name** | `l1` | +| **Unit** | `ms` | +| **Labels** | `foo`, `bar`, `baz` | +| **Type** | `latency` | +| **Quantiles** | `0.500`, `0.900`, `0.990` | +| **Fully Qualified Name** | `test_l1_ms` |