Skip to content

Commit

Permalink
Introduce internal stats to track time spent in extra GVL profiling s…
Browse files Browse the repository at this point in the history
…amples
  • Loading branch information
ivoanjo committed Oct 11, 2024
1 parent d196517 commit 7bba03e
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 9 deletions.
6 changes: 3 additions & 3 deletions docs/ProfilingDevelopment.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,9 +209,9 @@ from the VM callbacks and also messing with cpu/wall-time accounting for threads
The `ThreadContext` collector exposes three APIs for GVL profiling:
* `thread_context_collector_on_gvl_waiting(VALUE thread)`
* `thread_context_collector_on_gvl_running(VALUE thread)`
* `thread_context_collector_sample_after_gvl_running(VALUE self_instance)`
* `thread_context_collector_on_gvl_waiting`
* `thread_context_collector_on_gvl_running`
* `thread_context_collector_sample_after_gvl_running`
The intuition here is that:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,10 @@ struct cpu_and_wall_time_worker_state {
unsigned int after_gvl_running;
// How many times we skipped the after_gvl_running sampling
unsigned int gvl_dont_sample;
// Min/max/total wall-time spent on gvl sampling
uint64_t gvl_sampling_time_ns_min;
uint64_t gvl_sampling_time_ns_max;
uint64_t gvl_sampling_time_ns_total;
} stats;
};

Expand Down Expand Up @@ -1045,8 +1049,12 @@ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance) {
ID2SYM(rb_intern("allocations_during_sample")), /* => */ state->allocation_profiling_enabled ? UINT2NUM(state->stats.allocations_during_sample) : Qnil,

// GVL profiling stats
ID2SYM(rb_intern("after_gvl_running")), /* => */ UINT2NUM(state->stats.after_gvl_running),
ID2SYM(rb_intern("gvl_dont_sample")), /* => */ UINT2NUM(state->stats.gvl_dont_sample),
ID2SYM(rb_intern("after_gvl_running")), /* => */ UINT2NUM(state->stats.after_gvl_running),
ID2SYM(rb_intern("gvl_dont_sample")), /* => */ UINT2NUM(state->stats.gvl_dont_sample),
ID2SYM(rb_intern("gvl_sampling_time_ns_min")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_min, != UINT64_MAX, ULL2NUM),
ID2SYM(rb_intern("gvl_sampling_time_ns_max")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_max, > 0, ULL2NUM),
ID2SYM(rb_intern("gvl_sampling_time_ns_total")), /* => */ RUBY_NUM_OR_NIL(state->stats.gvl_sampling_time_ns_total, > 0, ULL2NUM),
ID2SYM(rb_intern("gvl_sampling_time_ns_avg")), /* => */ RUBY_AVG_OR_NIL(state->stats.gvl_sampling_time_ns_total, state->stats.after_gvl_running),
};
for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(stats_as_hash, arguments[i], arguments[i+1]);
return stats_as_hash;
Expand Down Expand Up @@ -1084,8 +1092,10 @@ static void reset_stats_not_thread_safe(struct cpu_and_wall_time_worker_state *s
// Given the expected infrequency of resetting (~once per 60s profile) and the auxiliary/non-critical nature of these stats
// this momentary loss of accuracy is deemed acceptable to keep overhead to a minimum.
state->stats = (struct stats) {
.cpu_sampling_time_ns_min = UINT64_MAX, // Since we always take the min between existing and latest sample
.allocation_sampling_time_ns_min = UINT64_MAX, // Since we always take the min between existing and latest sample
// All these values are initialized to their highest value possible since we always take the min between existing and latest sample
.cpu_sampling_time_ns_min = UINT64_MAX,
.allocation_sampling_time_ns_min = UINT64_MAX,
.gvl_sampling_time_ns_min = UINT64_MAX,
};
}

Expand Down Expand Up @@ -1369,6 +1379,16 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {

long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
thread_context_collector_sample_after_gvl_running(state->thread_context_collector_instance, rb_thread_current(), wall_time_ns_before_sample);
long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);

long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample;

// Guard against wall-time going backwards, see https://github.com/DataDog/dd-trace-rb/pull/2336 for discussion.
uint64_t sampling_time_ns = delta_ns < 0 ? 0 : delta_ns;

state->stats.gvl_sampling_time_ns_min = uint64_min_of(sampling_time_ns, state->stats.gvl_sampling_time_ns_min);
state->stats.gvl_sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.gvl_sampling_time_ns_max);
state->stats.gvl_sampling_time_ns_total += sampling_time_ns;

state->stats.after_gvl_running++;

Expand Down
28 changes: 26 additions & 2 deletions spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,15 @@
expect(waiting_for_gvl_time).to be_within(5).percent_of(total_time), \
"Expected waiting_for_gvl_time to be close to total_time, debug_failures: #{debug_failures}"

expect(cpu_and_wall_time_worker.stats.fetch(:after_gvl_running)).to be > 0
expect(cpu_and_wall_time_worker.stats).to match(
hash_including(
after_gvl_running: be > 0,
gvl_sampling_time_ns_min: be > 0,
gvl_sampling_time_ns_max: be > 0,
gvl_sampling_time_ns_total: be > 0,
gvl_sampling_time_ns_avg: be > 0,
)
)
end

context "when 'Waiting for GVL' periods are below waiting_for_gvl_threshold_ns" do
Expand All @@ -530,9 +538,21 @@

cpu_and_wall_time_worker.stop

expect(cpu_and_wall_time_worker.stats.fetch(:after_gvl_running)).to be 0
# Note: There may still be "Waiting for GVL" samples in the output, but these samples will come from the
# periodic cpu/wall-sampling, not samples directly triggered by the end of a "Waiting for GVL" period.

expect(cpu_and_wall_time_worker.stats.fetch(:gvl_dont_sample))
.to be > 100 # Arbitrary, on my machine I see 250k on a run

expect(cpu_and_wall_time_worker.stats).to match(
hash_including(
after_gvl_running: 0,
gvl_sampling_time_ns_min: nil,
gvl_sampling_time_ns_max: nil,
gvl_sampling_time_ns_total: nil,
gvl_sampling_time_ns_avg: nil,
)
)
end
end
end
Expand Down Expand Up @@ -1164,6 +1184,10 @@
allocations_during_sample: nil,
after_gvl_running: 0,
gvl_dont_sample: 0,
gvl_sampling_time_ns_min: nil,
gvl_sampling_time_ns_max: nil,
gvl_sampling_time_ns_total: nil,
gvl_sampling_time_ns_avg: nil,
}
)
end
Expand Down

0 comments on commit 7bba03e

Please sign in to comment.