Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wip: start toying with wrapping requests with RedF metrics #222

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions cmd/zoekt-sourcegraph-indexserver/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,10 @@ var (
Buckets: prometheus.ExponentialBuckets(1, 10, 6), // 1s -> 27min
})

metricResolveRevisionDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
Name: "resolve_revision_seconds",
Help: "A histogram of latencies for resolving a repository revision.",
Buckets: prometheus.ExponentialBuckets(.25, 2, 4), // 250ms -> 2s
}, []string{"success"}) // success=true|false
metricResolveRevisionObserver = NewRedfMetrics("resolve_revision",
WithLabels("success"), // success=true|false
WithDurationBuckets(prometheus.ExponentialBuckets(.25, 2, 4)), // 250ms -> 2s
)

metricGetIndexOptionsError = promauto.NewCounter(prometheus.CounterOpts{
Name: "get_index_options_error_total",
Expand Down
149 changes: 149 additions & 0 deletions cmd/zoekt-sourcegraph-indexserver/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
package main

import (
"fmt"
"time"

"github.com/prometheus/client_golang/prometheus"
)

// Example usage:
//
//	observer := NewRedfMetrics("operation_name", WithLabels("factorA", "factorB"))
//
//	start := time.Now()
//	err := doOperation()
//
//	observer.Observe(time.Since(start), err, "valueA", "valueB")
//
// The signature is Observe(d time.Duration, err error, lvals ...string), where
// lvals are the label values, one per label name passed to WithLabels.

// RedFMetrics contains four common metrics for an operation.
// It is based on the RED method (Rate, Errors, Duration) plus some additional
// advice from Google SRE's "Monitoring Distributed Systems", which is why
// failures get their own duration histogram.
//
// See:
//   - https://www.weave.works/blog/the-red-method-key-metrics-for-microservices-architecture/
//   - https://sre.google/sre-book/monitoring-distributed-systems/
type RedFMetrics struct {
	Count    *prometheus.CounterVec   // How often did this operation run successfully?
	Duration *prometheus.HistogramVec // How long did this operation run for (in seconds)?

	ErrorCount    *prometheus.CounterVec   // How often did this operation fail?
	ErrorDuration *prometheus.HistogramVec // How long did the failures take (in seconds)?
}

// Observe records the outcome of a single run of the operation.
//
// d is how long the run took and err is its result. lvals are the label
// values, passed through unchanged to WithLabelValues on each metric, so they
// must match the label names the metrics were created with.
//
// A non-nil err counts the run as a failure (ErrorCount and ErrorDuration);
// a nil err counts it as a success (Count and Duration).
func (m *RedFMetrics) Observe(d time.Duration, err error, lvals ...string) {
	seconds := d.Seconds()

	if err != nil {
		m.ErrorCount.WithLabelValues(lvals...).Inc()
		m.ErrorDuration.WithLabelValues(lvals...).Observe(seconds)
		return
	}

	m.Count.WithLabelValues(lvals...).Inc()
	// Successful runs go into Duration; writing them to ErrorDuration would
	// leave Duration permanently empty and pollute the failure latencies.
	m.Duration.WithLabelValues(lvals...).Observe(seconds)
}

// redfMetricOptions collects the settings that NewRedfMetrics uses to build
// its metrics. NewRedfMetrics fills in defaults and then lets each
// RedfMetricsOption mutate the struct.
type redfMetricOptions struct {
	// Help texts for the success counter and the duration histogram.
	countHelp    string
	durationHelp string

	// Help texts for the failure counter and the failure duration histogram.
	errorsCountHelp    string
	errorsDurationHelp string

	// labels are the label names shared by all four metrics.
	labels []string
	// durationBuckets are the histogram bucket boundaries for duration
	// metrics; NewRedfMetrics defaults them to prometheus.DefBuckets.
	durationBuckets []float64
}

// RedfMetricsOption alters the default behavior of NewRedfMetrics by mutating
// the options it is handed. Options are applied in the order they are passed.
type RedfMetricsOption func(o *redfMetricOptions)

// WithDurationHelp returns an option that replaces the default help text of
// the duration metric with text.
func WithDurationHelp(text string) RedfMetricsOption {
	return func(opts *redfMetricOptions) {
		opts.durationHelp = text
	}
}

// WithCountHelp returns an option that replaces the default help text of the
// count metric with text.
func WithCountHelp(text string) RedfMetricsOption {
	return func(opts *redfMetricOptions) {
		opts.countHelp = text
	}
}

// WithErrorsCountHelp returns an option that replaces the default help text
// of the error count metric with text.
func WithErrorsCountHelp(text string) RedfMetricsOption {
	return func(opts *redfMetricOptions) {
		opts.errorsCountHelp = text
	}
}

// WithErrorsDurationHelp overrides the default help text for error duration
// metrics.
func WithErrorsDurationHelp(text string) RedfMetricsOption {
	return func(o *redfMetricOptions) { o.errorsDurationHelp = text }
}

// WithLabels returns an option that sets the label names used by every
// metric, replacing any labels configured earlier. The slice is stored as
// given, without copying.
func WithLabels(labels ...string) RedfMetricsOption {
	return func(opts *redfMetricOptions) {
		opts.labels = labels
	}
}

// WithDurationBuckets returns an option that replaces the default histogram
// bucket boundaries for duration metrics. An empty or nil buckets slice is
// ignored, leaving the current buckets in place.
func WithDurationBuckets(buckets []float64) RedfMetricsOption {
	return func(opts *redfMetricOptions) {
		if len(buckets) == 0 {
			return
		}
		opts.durationBuckets = buckets
	}
}

// NewRedfMetrics builds the four metrics of a RedFMetrics for the operation
// called name:
//
//   - <name>_total            counter of successful operations
//   - <name>_duration         histogram of operation durations
//   - <name>_errors_total     counter of failed operations
//   - <name>_errors_duration  histogram of failed operation durations
//
// Help texts, label names and histogram buckets start from defaults
// (prometheus.DefBuckets, no labels) and can be changed with overrides, which
// are applied in order.
//
// This function only constructs the collectors; it does not register them
// with any registry. Callers that want them exported must register the
// returned fields themselves.
func NewRedfMetrics(name string, overrides ...RedfMetricsOption) *RedFMetrics {
	options := &redfMetricOptions{
		countHelp:          fmt.Sprintf("Number of successful %s operations", name),
		durationHelp:       fmt.Sprintf("Time in seconds spent performing %s operations", name),
		errorsCountHelp:    fmt.Sprintf("Number of failed %s operations", name),
		errorsDurationHelp: fmt.Sprintf("Time in seconds spent performing failed %s operations", name),

		labels:          nil,
		durationBuckets: prometheus.DefBuckets,
	}

	for _, override := range overrides {
		override(options)
	}

	count := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: fmt.Sprintf("%s_total", name),
			Help: options.countHelp,
		},
		options.labels,
	)

	duration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: fmt.Sprintf("%s_duration", name),
			// Use the duration help text (not the counter's) and honor the
			// configured buckets so WithDurationHelp/WithDurationBuckets
			// actually take effect.
			Help:    options.durationHelp,
			Buckets: options.durationBuckets,
		},
		options.labels,
	)

	errorsCount := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: fmt.Sprintf("%s_errors_total", name),
			Help: options.errorsCountHelp,
		},
		options.labels,
	)

	errorsDuration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: fmt.Sprintf("%s_errors_duration", name),
			Help: options.errorsDurationHelp,
			// Same buckets as the success histogram so the two are comparable.
			Buckets: options.durationBuckets,
		},
		options.labels,
	)

	return &RedFMetrics{
		Count:    count,
		Duration: duration,

		ErrorCount:    errorsCount,
		ErrorDuration: errorsDuration,
	}
}
5 changes: 2 additions & 3 deletions cmd/zoekt-sourcegraph-indexserver/sg.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ func (s *sourcegraphClient) List(ctx context.Context, indexed []uint32) (*Source
first = false
s.configFingerprint.Store(lastFingerprint)

metricResolveRevisionDuration.WithLabelValues("false").Observe(time.Since(start).Seconds())
metricResolveRevisionObserver.Observe(time.Since(start), err, "false")
tr.LazyPrintf("failed fetching options batch: %v", err)
tr.SetError()
continue
Expand All @@ -125,10 +125,9 @@ func (s *sourcegraphClient) List(ctx context.Context, indexed []uint32) (*Source
s.configFingerprint.Store(fingerprint)
}

metricResolveRevisionDuration.WithLabelValues("true").Observe(time.Since(start).Seconds())
metricResolveRevisionObserver.Observe(time.Since(start), err, "false")
for _, opt := range opts {
if opt.Error != "" {
metricGetIndexOptionsError.Inc()
tr.LazyPrintf("failed fetching options for %v: %v", opt.Name, opt.Error)
tr.SetError()
continue
Expand Down