Skip to content

Commit

Permalink
feat: bucketize latency attribute (#2269)
Browse files Browse the repository at this point in the history
This PR bucketizes the latency _attribute_ (NOT the latency _metric_!
That would be very bad!)

For better filtering capabilities in datadog, we need to bucket our
high-cardinality numeric attributes. This uses log base8 because:

1. That seems to give the most useful results for current production
numbers we're seeing in DD - good functional granularity for analysis
purposes without being so granular that it's annoying to sort through
the buckets. Current values range from 62 to 20279, concentrated in the
100-1000 range.
2. Base 8 bucket labels are all powers of 2, so it's relatively easy for
our brains to skim

Sample output:
```
-> ftl.async_call.time_since_scheduled_at_ms.bucket: Str([8,64))
```

Issue: #2194
  • Loading branch information
deniseli authored Aug 6, 2024
1 parent dedbed0 commit 8fcbd33
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 8 deletions.
16 changes: 8 additions & 8 deletions backend/controller/observability/async_calls.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ import (
)

const (
asyncCallMeterName = "ftl.async_call"
asyncCallOriginAttr = "ftl.async_call.origin"
asyncCallVerbRefAttr = "ftl.async_call.verb.ref"
asyncCallTimeSinceScheduledAtAttr = "ftl.async_call.time_since_scheduled_at_ms"
asyncCallRemainingAttemptsAttr = "ftl.async_call.remaining_attempts"
asyncCallExecFailureModeAttr = "ftl.async_call.execution.failure_mode"
asyncCallMeterName = "ftl.async_call"
asyncCallOriginAttr = "ftl.async_call.origin"
asyncCallVerbRefAttr = "ftl.async_call.verb.ref"
asyncCallTimeSinceScheduledAtBucketAttr = "ftl.async_call.time_since_scheduled_at_ms.bucket"
asyncCallRemainingAttemptsAttr = "ftl.async_call.remaining_attempts"
asyncCallExecFailureModeAttr = "ftl.async_call.execution.failure_mode"
)

type AsyncCallMetrics struct {
Expand Down Expand Up @@ -126,14 +126,14 @@ func (m *AsyncCallMetrics) Completed(ctx context.Context, verb schema.RefKey, or
attrs = append(attrs, attribute.Bool(observability.StatusSucceededAttribute, maybeErr == nil))
m.msToComplete.Record(ctx, msToComplete, metric.WithAttributes(attrs...))

attrs = append(attrs, attribute.Int64(asyncCallTimeSinceScheduledAtAttr, msToComplete))
attrs = append(attrs, attribute.String(asyncCallTimeSinceScheduledAtBucketAttr, logBucket(8, msToComplete)))
m.completed.Add(ctx, 1, metric.WithAttributes(attrs...))

m.queueDepth.Record(ctx, queueDepth)
}

func extractAsyncCallAttrs(verb schema.RefKey, origin string, scheduledAt time.Time) []attribute.KeyValue {
return append(extractRefAttrs(verb, origin), attribute.Int64(asyncCallTimeSinceScheduledAtAttr, timeSinceMS(scheduledAt)))
return append(extractRefAttrs(verb, origin), attribute.String(asyncCallTimeSinceScheduledAtBucketAttr, logBucket(8, timeSinceMS(scheduledAt))))
}

func extractRefAttrs(verb schema.RefKey, origin string) []attribute.KeyValue {
Expand Down
18 changes: 18 additions & 0 deletions backend/controller/observability/observability.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package observability
import (
"errors"
"fmt"
"math"
"time"

"go.opentelemetry.io/otel/metric"
Expand Down Expand Up @@ -47,3 +48,20 @@ func handleInt64UpDownCounterError(counter string, err error, errs error) (metri
func timeSinceMS(start time.Time) int64 {
return time.Since(start).Milliseconds()
}

// logBucket returns a string bucket label for a given positive number bucketed into
// powers of some arbitrary base. For base 8, for example, we would have buckets:
//
// <1, [1-8), [8-64), [64-512), etc.
//
// Go only supports a few bases with math.Log*, so this func performs a change of base:
// log_b(x) = log_a(x) / log_a(b)
func logBucket(base int, num int64) string {
if num < 1 {
return "<1"
}
b := float64(base)
logB := math.Log(float64(num)) / math.Log(b)
bucketExpLo := math.Floor(logB)
return fmt.Sprintf("[%d,%d)", int(math.Pow(b, bucketExpLo)), int(math.Pow(b, bucketExpLo+1)))
}
53 changes: 53 additions & 0 deletions backend/controller/observability/observability_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package observability

import (
"testing"

"github.com/alecthomas/assert/v2"
)

func TestLogBucket(t *testing.T) {
tests := []struct {
name string
base int
num int
want string
}{
{
name: "<1",
base: 8,
num: 0,
want: "<1",
},
{
name: "EqualLowEndOfRange",
base: 8,
num: 1,
want: "[1,8)",
},
{
name: "HigherEndOfRange",
base: 8,
num: 7,
want: "[1,8)",
},
{
name: "BigInputNum",
base: 8,
num: 16800000,
want: "[16777216,134217728)",
},
{
name: "Base2",
base: 2,
num: 8,
want: "[8,16)",
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
assert.Equal(t, test.want, logBucket(test.base, int64(test.num)))
})
}
}

0 comments on commit 8fcbd33

Please sign in to comment.