Skip to content

Commit

Permalink
profiler: avoid metrics profile log noise when stopping profiling
Browse files Browse the repository at this point in the history
The metrics profiler insisted on at least one second between collections
for two reasons:

- To avoid a division by zero because it was doing integer division to
  convert a time.Duration to seconds, which will truncate to 0, as a
  ratio in a subsequent computation
- In case "a system clock issue causes time to run backwards"

The profiler would report an error if less than one second elapsed
between collections. In practice, this resulted in misleading error logs
because it's entirely likely for profiling to be stopped less than a
second after the last profile collection.

The restriction was not really even needed. For one, we can just do
floating-point division rather than integer division to avoid the
truncation problem.

Also, Go has had monotonic time support by default since 2017, added in
Go 1.9, and time comparison operations including time.Time.Sub, work
with respect to monotonic time. We shouldn't have any issues with
negative periods. We can ensure the period is positive just as a
defensive measure, and fail if it's negative since this may indicate a
bug in the Go runtime if it's violating the monotonicity guarantees.

Fixes #2863
  • Loading branch information
nsrip-dd committed Sep 19, 2024
1 parent e081e4a commit f5d8369
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 40 deletions.
45 changes: 24 additions & 21 deletions profiler/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,47 +61,50 @@ func (m *metrics) reset(now time.Time) {

func (m *metrics) report(now time.Time, buf *bytes.Buffer) error {
period := now.Sub(m.collectedAt)

if period < time.Second {
// Profiler could be mis-configured to report more frequently than every second
// or a system clock issue causes time to run backwards.
// We can't emit valid metrics in either case.
return collectionTooFrequent{min: time.Second, observed: period}
if period <= 0 {
// It is technically possible, though very unlikely, for period
// to be 0 if the monotonic clock did not advance at all or if
// we somehow collected two metrics profiles closer together
// than the clock can measure. If the period is negative, this
// might be a Go runtime bug, since time.Time.Sub is supposed to
// work with monotonic time. Either way, bail out since
// something is probably going wrong
return fmt.Errorf(
"unexpected duration %v between metrics collections, first at %v, second at %v",
period, m.collectedAt, now,
)
}

previousStats := m.snapshot
m.reset(now)

points := m.compute(&previousStats, &m.snapshot, period, now)
data, err := json.Marshal(removeInvalid(points))

if err != nil {
// NB the minimum period check and removeInvalid ensures we don't hit this case
return err
}

if _, err := buf.Write(data); err != nil {
// NB removeInvalid ensures we don't hit this case by dropping inf/NaN
return err
}

return nil
_, err = buf.Write(data)
return err
}

func computeMetrics(prev *metricsSnapshot, curr *metricsSnapshot, period time.Duration, now time.Time) []point {
periodSeconds := float64(period) / float64(time.Second)
return []point{
{metric: "go_alloc_bytes_per_sec", value: rate(curr.TotalAlloc, prev.TotalAlloc, period/time.Second)},
{metric: "go_allocs_per_sec", value: rate(curr.Mallocs, prev.Mallocs, period/time.Second)},
{metric: "go_frees_per_sec", value: rate(curr.Frees, prev.Frees, period/time.Second)},
{metric: "go_heap_growth_bytes_per_sec", value: rate(curr.HeapAlloc, prev.HeapAlloc, period/time.Second)},
{metric: "go_gcs_per_sec", value: rate(uint64(curr.NumGC), uint64(prev.NumGC), period/time.Second)},
{metric: "go_gc_pause_time", value: rate(curr.PauseTotalNs, prev.PauseTotalNs, period)}, // % of time spent paused
{metric: "go_alloc_bytes_per_sec", value: rate(curr.TotalAlloc, prev.TotalAlloc, periodSeconds)},
{metric: "go_allocs_per_sec", value: rate(curr.Mallocs, prev.Mallocs, periodSeconds)},
{metric: "go_frees_per_sec", value: rate(curr.Frees, prev.Frees, periodSeconds)},
{metric: "go_heap_growth_bytes_per_sec", value: rate(curr.HeapAlloc, prev.HeapAlloc, periodSeconds)},
{metric: "go_gcs_per_sec", value: rate(uint64(curr.NumGC), uint64(prev.NumGC), periodSeconds)},
{metric: "go_gc_pause_time", value: rate(curr.PauseTotalNs, prev.PauseTotalNs, float64(period))}, // % of time spent paused
{metric: "go_max_gc_pause_time", value: float64(maxPauseNs(&curr.MemStats, now.Add(-period)))},
{metric: "go_num_goroutine", value: float64(curr.NumGoroutine)},
}
}

func rate(curr, prev uint64, period time.Duration) float64 {
return float64(int64(curr)-int64(prev)) / float64(period)
func rate(curr, prev uint64, period float64) float64 {
return float64(int64(curr)-int64(prev)) / period
}

// maxPauseNs returns maximum pause time within the recent period, assumes stats populated at period end
Expand Down
19 changes: 0 additions & 19 deletions profiler/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,22 +143,3 @@ func TestMetricsReport(t *testing.T) {
assert.NoError(t, err)
assert.Equal(t, "[[\"metric_name\",1.1]]", buf.String())
}

func TestMetricsCollectFrequency(t *testing.T) {
now := now()
var err error
var buf bytes.Buffer
m := newTestMetrics(now)

err = m.report(now.Add(-time.Second), &buf)
assert.Error(t, err, "collection call times must be monotonically increasing")
assert.Empty(t, buf)

err = m.report(now.Add(time.Second-1), &buf)
assert.Error(t, err, "must be at least one second between collection calls")
assert.Empty(t, buf)

err = m.report(now.Add(time.Second), &buf)
assert.NoError(t, err, "one second between calls should work")
assert.NotEmpty(t, buf)
}
26 changes: 26 additions & 0 deletions profiler/profiler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,7 @@ func TestAllUploaded(t *testing.T) {
"delta-mutex.pprof",
"goroutines.pprof",
"goroutineswait.pprof",
"metrics.json",
}
if executionTraceEnabledDefault {
expected = append(expected, "go.trace")
Expand Down Expand Up @@ -779,3 +780,28 @@ func TestOrchestrionProfileInfo(t *testing.T) {
})
}
}

func TestShortMetricsProfile(t *testing.T) {
profiles := startTestProfiler(t, 1, WithPeriod(10*time.Millisecond), WithProfileTypes(MetricsProfile))
for range 3 {
p := <-profiles
if _, ok := p.attachments["metrics.json"]; !ok {
t.Errorf("didn't get metrics profile, got %v", p.event.Attachments)
}
}
}

func TestMetricsProfileStopEarlyNoLog(t *testing.T) {
rl := new(log.RecordLogger)
defer log.UseLogger(rl)()
startTestProfiler(t, 1, WithPeriod(2*time.Second), WithProfileTypes(MetricsProfile))
// Stop the profiler immediately
Stop()
log.Flush()
for _, msg := range rl.Logs() {
// We should not see any error about stopping the metrics profile short
if strings.Contains(msg, "ERROR:") {
t.Errorf("unexpected error log: %s", msg)
}
}
}

0 comments on commit f5d8369

Please sign in to comment.