diff --git a/docs/ProfilingDevelopment.md b/docs/ProfilingDevelopment.md index 309bac05f5..26fa6cd66c 100644 --- a/docs/ProfilingDevelopment.md +++ b/docs/ProfilingDevelopment.md @@ -209,9 +209,9 @@ from the VM callbacks and also messing with cpu/wall-time accounting for threads The `ThreadContext` collector exposes three APIs for GVL profiling: -* `thread_context_collector_on_gvl_waiting(VALUE thread)` -* `thread_context_collector_on_gvl_running(VALUE thread)` -* `thread_context_collector_sample_after_gvl_running(VALUE self_instance)` +* `thread_context_collector_on_gvl_waiting` +* `thread_context_collector_on_gvl_running` +* `thread_context_collector_sample_after_gvl_running` The intuition here is that: diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index da5e286bd7..7dbecfde7b 100644 --- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -182,6 +182,10 @@ struct cpu_and_wall_time_worker_state { unsigned int after_gvl_running; // How many times we skipped the after_gvl_running sampling unsigned int gvl_dont_sample; + // Min/max/total wall-time spent on gvl sampling + uint64_t gvl_sampling_time_ns_min; + uint64_t gvl_sampling_time_ns_max; + uint64_t gvl_sampling_time_ns_total; } stats; }; @@ -1045,8 +1049,12 @@ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance) { ID2SYM(rb_intern("allocations_during_sample")), /* => */ state->allocation_profiling_enabled ? UINT2NUM(state->stats.allocations_during_sample) : Qnil, // GVL profiling stats - ID2SYM(rb_intern("after_gvl_running")), /* => */ UINT2NUM(state->stats.after_gvl_running), - ID2SYM(rb_intern("gvl_dont_sample")), /* => */ UINT2NUM(state->stats.gvl_dont_sample), + ID2SYM(rb_intern("after_gvl_running")), /* => */ UINT2NUM(state->stats.after_gvl_running), + ID2SYM(rb_intern("gvl_dont_sample")), /* => */ UINT2NUM(state->stats.gvl_dont_sample), + ID2SYM(rb_intern("gvl_sampling_time_ns_min")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_min, != UINT64_MAX, ULL2NUM), + ID2SYM(rb_intern("gvl_sampling_time_ns_max")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_max, > 0, ULL2NUM), + ID2SYM(rb_intern("gvl_sampling_time_ns_total")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_total, > 0, ULL2NUM), + ID2SYM(rb_intern("gvl_sampling_time_ns_avg")), /* => */ RUBY_AVG_OR_NIL(state->stats.gvl_sampling_time_ns_total, state->stats.after_gvl_running), }; for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(stats_as_hash, arguments[i], arguments[i+1]); return stats_as_hash; @@ -1084,8 +1092,10 @@ static void reset_stats_not_thread_safe(struct cpu_and_wall_time_worker_state *s // Given the expected infrequency of resetting (~once per 60s profile) and the auxiliary/non-critical nature of these stats // this momentary loss of accuracy is deemed acceptable to keep overhead to a minimum. state->stats = (struct stats) { - .cpu_sampling_time_ns_min = UINT64_MAX, // Since we always take the min between existing and latest sample - .allocation_sampling_time_ns_min = UINT64_MAX, // Since we always take the min between existing and latest sample + // All these values are initialized to their highest value possible since we always take the min between existing and latest sample + .cpu_sampling_time_ns_min = UINT64_MAX, + .allocation_sampling_time_ns_min = UINT64_MAX, + .gvl_sampling_time_ns_min = UINT64_MAX, }; } @@ -1369,6 +1379,16 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) { long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE); thread_context_collector_sample_after_gvl_running(state->thread_context_collector_instance, rb_thread_current(), wall_time_ns_before_sample); + long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE); + + long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample; + + // Guard against wall-time going backwards, see https://github.com/DataDog/dd-trace-rb/pull/2336 for discussion. + uint64_t sampling_time_ns = delta_ns < 0 ? 0 : delta_ns; + + state->stats.gvl_sampling_time_ns_min = uint64_min_of(sampling_time_ns, state->stats.gvl_sampling_time_ns_min); + state->stats.gvl_sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.gvl_sampling_time_ns_max); + state->stats.gvl_sampling_time_ns_total += sampling_time_ns; state->stats.after_gvl_running++; diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 43045da98a..3626af65b7 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -507,7 +507,15 @@ expect(waiting_for_gvl_time).to be_within(5).percent_of(total_time), \ "Expected waiting_for_gvl_time to be close to total_time, debug_failures: #{debug_failures}" - expect(cpu_and_wall_time_worker.stats.fetch(:after_gvl_running)).to be > 0 + expect(cpu_and_wall_time_worker.stats).to match( + hash_including( + after_gvl_running: be > 0, + gvl_sampling_time_ns_min: be > 0, + gvl_sampling_time_ns_max: be > 0, + gvl_sampling_time_ns_total: be > 0, + gvl_sampling_time_ns_avg: be > 0, + ) + ) end context "when 'Waiting for GVL' periods are below waiting_for_gvl_threshold_ns" do @@ -530,9 +538,21 @@ cpu_and_wall_time_worker.stop - expect(cpu_and_wall_time_worker.stats.fetch(:after_gvl_running)).to be 0 + # Note: There may still be "Waiting for GVL" samples in the output, but these samples will come from the + # periodic cpu/wall-sampling, not samples directly triggered by the end of a "Waiting for GVL" period. + expect(cpu_and_wall_time_worker.stats.fetch(:gvl_dont_sample)) .to be > 100 # Arbitrary, on my machine I see 250k on a run + + expect(cpu_and_wall_time_worker.stats).to match( + hash_including( + after_gvl_running: 0, + gvl_sampling_time_ns_min: nil, + gvl_sampling_time_ns_max: nil, + gvl_sampling_time_ns_total: nil, + gvl_sampling_time_ns_avg: nil, + ) + ) end end end @@ -1164,6 +1184,10 @@ allocations_during_sample: nil, after_gvl_running: 0, gvl_dont_sample: 0, + gvl_sampling_time_ns_min: nil, + gvl_sampling_time_ns_max: nil, + gvl_sampling_time_ns_total: nil, + gvl_sampling_time_ns_avg: nil, } ) end