From 232e3d161f6db82dc4ea67c15b02e1499cbb9286 Mon Sep 17 00:00:00 2001 From: Nick Ripley Date: Tue, 24 Sep 2024 12:10:25 -0400 Subject: [PATCH] profiler: suppress erorrs if the profiler is stopped The fix in #2865 was intended to suppress spurious metrics profile errors when the profiler is stopped. It did so by relaxing the one-second duration constraint of the metrics profiler. However, the Windows system timer resolution is about 15 milliseconds (see https://learn.microsoft.com/en-us/windows-hardware/drivers/kernel/high-resolution-timers#controlling-timer-accuracy) This caused the metrics profile tests from #2865 to fail because the metrics profiler will likely be stopped in less than 15 milliseconds, meaning we'll see 0 duration between profile collection and log an error. This commit actually suppresses the error by checking whether the profiler was stopped (meaning interruptibleSleep was interrupted). If so, and if the metrics profiler returned an error, we instead return a sentinel error indicating that profiling was stopped. If we see that error, we just drop the profile and don't log an error. We won't upload the profile anyway. This way, we should only report an error from the metrics profiler if there is _actually_ a problem with the timer. --- profiler/profile.go | 4 +++- profiler/profiler.go | 18 ++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/profiler/profile.go b/profiler/profile.go index 2aa0257f81..c7c23e63d9 100644 --- a/profiler/profile.go +++ b/profiler/profile.go @@ -177,7 +177,9 @@ var profileTypes = map[ProfileType]profileType{ Filename: "metrics.json", Collect: func(p *profiler) ([]byte, error) { var buf bytes.Buffer - p.interruptibleSleep(p.cfg.period) + if p.interruptibleSleep(p.cfg.period) { + return nil, errProfilerStopped + } err := p.met.report(now(), &buf) return buf.Bytes(), err }, diff --git a/profiler/profiler.go b/profiler/profiler.go index 5e1d1736e8..0dba2062b8 100644 --- a/profiler/profiler.go +++ b/profiler/profiler.go @@ -36,6 +36,10 @@ var ( activeProfiler *profiler containerID = internal.ContainerID() // replaced in tests entityID = internal.EntityID() // replaced in tests + + // errProfilerStopped is a sentinel for suppressng errors if we are + // about to stop the profiler + errProfilerStopped = errors.New("profiler stopped") ) // Start starts the profiler. If the profiler is already running, it will be @@ -343,9 +347,12 @@ func (p *profiler) collect(ticker <-chan time.Time) { } profs, err := p.runProfile(t) if err != nil { - log.Error("Error getting %s profile: %v; skipping.", t, err) - tags := append(p.cfg.tags.Slice(), t.Tag()) - p.cfg.statsd.Count("datadog.profiling.go.collect_error", 1, tags, 1) + if err != errProfilerStopped { + log.Error("Error getting %s profile: %v; skipping.", t, err) + tags := append(p.cfg.tags.Slice(), t.Tag()) + p.cfg.statsd.Count("datadog.profiling.go.collect_error", 1, tags, 1) + } + return } mu.Lock() defer mu.Unlock() @@ -480,10 +487,13 @@ func (p *profiler) outputDir(bat batch) error { // interruptibleSleep sleeps for the given duration or until interrupted by the // p.exit channel being closed. -func (p *profiler) interruptibleSleep(d time.Duration) { +// Returns whether the sleep was interrupted +func (p *profiler) interruptibleSleep(d time.Duration) bool { select { case <-p.exit: + return true case <-time.After(d): + return false } }