From bfaa4bcbfaf9ad6981577833047154ed9b5d4753 Mon Sep 17 00:00:00 2001 From: James Cox Date: Tue, 16 May 2023 11:31:23 +0100 Subject: [PATCH] FFM-7702 Clean up cache metrics registration **What** - Cleans up the prometheus registration for the cache - Puts a check in so we don't register CacheNotFound errors as errors in prometheus metrics. I think the error label in the prometheus should be reserved for internal/unexpected errors, we already track the status code in the http handlers so can see when requests have been made for a resource that doesn't exist **Why** I was a bit sloppy when initailly registering some of the metrics for the cache and spotted they weren't coming throuhg during my load testing. **Testing** Tested these changes out locally --- cache/metrics.go | 17 +++++++++++++---- cache/metrics_test.go | 4 ++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/cache/metrics.go b/cache/metrics.go index a8199b7f..e019c393 100644 --- a/cache/metrics.go +++ b/cache/metrics.go @@ -3,9 +3,11 @@ package cache import ( "context" "encoding" + "errors" "fmt" "time" + "github.com/harness/ff-proxy/domain" "github.com/prometheus/client_golang/prometheus" ) @@ -39,21 +41,21 @@ func NewMetricsCache(label string, reg prometheus.Registerer, next Cache) Metric next: next, writeDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "ff_proxy_%s_cache_write_duration", + Name: fmt.Sprintf("ff_proxy_%s_cache_write_duration", label), Help: "Tracks how long write operations to the cache take", Buckets: []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5}, }, []string{}, ), readDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "ff_proxy_%s_cache_read_duration", + Name: fmt.Sprintf("ff_proxy_%s_cache_read_duration", label), Help: "Tracks how long write operations to the cache take", Buckets: []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5}, }, []string{}, ), deleteDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "ff_proxy_%s_cache_delete_duration", + Name: fmt.Sprintf("ff_proxy_%s_cache_delete_duration", label), Help: "Tracks how long delete operations to the cache take", Buckets: []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5}, }, @@ -73,7 +75,7 @@ func NewMetricsCache(label string, reg prometheus.Registerer, next Cache) Metric []string{"key", "operation", "error"}, ), deleteCount: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: fmt.Sprintf("ff_proxy_%s_cache_remove_duration", label), + Name: fmt.Sprintf("ff_proxy_%s_cache_remove_count", label), Help: "Tracks how many deletes we make to the cache", }, []string{"key", "operation"}, @@ -81,6 +83,9 @@ func NewMetricsCache(label string, reg prometheus.Registerer, next Cache) Metric } reg.MustRegister( + c.writeDuration, + c.readDuration, + c.deleteDuration, c.writeCount, c.readCount, c.deleteCount, @@ -187,6 +192,10 @@ func trackCounter(metric counter, labels ...string) { func getErrorLabel(err error) string { if err != nil { + // Don't want to track NotFound as an error in prometheus metrics + if errors.Is(err, domain.ErrCacheNotFound) { + return "false" + } return "true" } return "false" diff --git a/cache/metrics_test.go b/cache/metrics_test.go index 1ea19b57..9adc8ca6 100644 --- a/cache/metrics_test.go +++ b/cache/metrics_test.go @@ -252,7 +252,7 @@ func TestCacheMetrics_GetAll(t *testing.T) { readCount *mockCounter expected result }{ - "Given I call GetAll and the decorated cache errors": { + "Given I call GetAll and the decorated cache errors with a domain.ErrCacheNotFound error": { args: args{ key: "foo", cacheData: map[string]map[string][]byte{}, @@ -264,7 +264,7 @@ func TestCacheMetrics_GetAll(t *testing.T) { expected: result{ observations: 1, - labels: []string{"foo", "GetAll", "true"}, + labels: []string{"foo", "GetAll", "false"}, }, }, "Given I call GetAll and the decorated cache doesn't error": {