Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: all task (wall-clock) profiler #177

Closed
wants to merge 11 commits into from
7 changes: 7 additions & 0 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,13 @@ JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEA
int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;

void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT;
extern volatile struct _jl_bt_element_t *bt_data_prof;
extern volatile size_t bt_size_max;
extern volatile size_t bt_size_cur;
extern volatile int running;
extern volatile int profile_all_tasks;

// number of cycles since power-on
static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT
{
Expand Down
11 changes: 6 additions & 5 deletions src/signal-handling.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,16 @@ extern "C" {

// Profiler control variables
// Note: these "static" variables are also used in "signals-*.c"
static volatile jl_bt_element_t *bt_data_prof = NULL;
static volatile size_t bt_size_max = 0;
static volatile size_t bt_size_cur = 0;
volatile jl_bt_element_t *bt_data_prof = NULL;
volatile size_t bt_size_max = 0;
volatile size_t bt_size_cur = 0;
static volatile uint64_t nsecprof = 0;
static volatile int running = 0;
volatile int running = 0;
volatile int profile_all_tasks = 0;
static const uint64_t GIGA = 1000000000ULL;
// Timers to take samples at intervals
JL_DLLEXPORT void jl_profile_stop_timer(void);
JL_DLLEXPORT int jl_profile_start_timer(void);
JL_DLLEXPORT int jl_profile_start_timer(uint8_t);
// File-descriptor for safe logging on signal handling
int jl_sig_fd;

Expand Down
170 changes: 92 additions & 78 deletions src/signals-mach.c
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,83 @@ void jl_unlock_stackwalk(int lockret)
jl_unlock_profile_mach(1, lockret);
}

// assumes holding `jl_lock_profile_mach`
void jl_profile_thread_mach(int tid)
{
// if there is no space left, return early
if (jl_profile_is_buffer_full()) {
jl_profile_stop_timer();
return;
}
if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
_dyld_dlopen_atfork_prepare();
if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
_dyld_atfork_prepare(); // briefly acquire the dlsym lock
host_thread_state_t state;
int valid_thread = jl_thread_suspend_and_get_state2(tid, &state);
unw_context_t *uc = (unw_context_t*)&state;
if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
_dyld_atfork_parent(); // quickly release the dlsym lock
if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
_dyld_dlopen_atfork_parent();
if (!valid_thread)
return;
if (running) {
#ifdef LLVMLIBUNWIND
/*
* Unfortunately compact unwind info is incorrectly generated for quite a number of
* libraries by quite a large number of compilers. We can fall back to DWARF unwind info
* in some cases, but in quite a number of cases (especially libraries not compiled in debug
* mode, only the compact unwind info may be available). Even more unfortunately, there is no
* way to detect such bogus compact unwind info (other than noticing the resulting segfault).
* What we do here is ugly, but necessary until the compact unwind info situation improves.
* We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
* Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
* entry will always be correct, and the number of cases in which this is an issue is rather small.
* Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
* and during stack unwinding we only ever read memory, but never write it.
*/

forceDwarf = 0;
unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point

if (forceDwarf == 0) {
// Save the backtrace
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
}
else if (forceDwarf == 1) {
bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
}
else if (forceDwarf == -1) {
jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
}

forceDwarf = -2;
#else
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
#endif
jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];

// store threadid but add 1 as 0 is preserved to indicate end of block
bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;

// store task id (never null)
bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);

// store cpu cycle clock
bt_data_prof[bt_size_cur++].uintptr = cycleclock();

// store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;

// Mark the end of this block with two 0's
bt_data_prof[bt_size_cur++].uintptr = 0;
bt_data_prof[bt_size_cur++].uintptr = 0;
}
// We're done! Resume the thread.
jl_thread_resume(tid);
}

void *mach_profile_listener(void *arg)
{
(void)arg;
Expand All @@ -621,84 +698,18 @@ void *mach_profile_listener(void *arg)
// (so that thread zero gets notified last)
int keymgr_locked = jl_lock_profile_mach(0);

int nthreads = jl_atomic_load_acquire(&jl_n_threads);
int *randperm = profile_get_randperm(nthreads);
for (int idx = nthreads; idx-- > 0; ) {
// Stop the threads in the random or reverse round-robin order.
int i = randperm[idx];
// if there is no space left, break early
if (jl_profile_is_buffer_full()) {
jl_profile_stop_timer();
break;
}

if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
_dyld_dlopen_atfork_prepare();
if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
_dyld_atfork_prepare(); // briefly acquire the dlsym lock
host_thread_state_t state;
int valid_thread = jl_thread_suspend_and_get_state2(i, &state);
unw_context_t *uc = (unw_context_t*)&state;
if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
_dyld_atfork_parent(); // quickly release the dlsym lock
if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
_dyld_dlopen_atfork_parent();
if (!valid_thread)
continue;
if (running) {
#ifdef LLVMLIBUNWIND
/*
* Unfortunately compact unwind info is incorrectly generated for quite a number of
* libraries by quite a large number of compilers. We can fall back to DWARF unwind info
* in some cases, but in quite a number of cases (especially libraries not compiled in debug
* mode, only the compact unwind info may be available). Even more unfortunately, there is no
* way to detect such bogus compact unwind info (other than noticing the resulting segfault).
* What we do here is ugly, but necessary until the compact unwind info situation improves.
* We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
* Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
* entry will always be correct, and the number of cases in which this is an issue is rather small.
* Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
* and during stack unwinding we only ever read memory, but never write it.
*/

forceDwarf = 0;
unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point

if (forceDwarf == 0) {
// Save the backtrace
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
}
else if (forceDwarf == 1) {
bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
}
else if (forceDwarf == -1) {
jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
}

forceDwarf = -2;
#else
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
#endif
jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i];

// store threadid but add 1 as 0 is preserved to indicate end of block
bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;

// store task id (never null)
bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);

// store cpu cycle clock
bt_data_prof[bt_size_cur++].uintptr = cycleclock();

// store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;

// Mark the end of this block with two 0's
bt_data_prof[bt_size_cur++].uintptr = 0;
bt_data_prof[bt_size_cur++].uintptr = 0;
if (profile_all_tasks) {
// t = select_task()
// jl_profile_task()
}
else {
int nthreads = jl_atomic_load_acquire(&jl_n_threads);
int *randperm = profile_get_randperm(nthreads);
for (int idx = nthreads; idx-- > 0; ) {
// Stop the threads in random order.
int i = randperm[idx];
jl_profile_thread_mach(i);
}
// We're done! Resume the thread.
jl_thread_resume(i);
}
jl_unlock_profile_mach(0, keymgr_locked);
if (running) {
Expand All @@ -710,7 +721,8 @@ void *mach_profile_listener(void *arg)
}
}

JL_DLLEXPORT int jl_profile_start_timer(void)

JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
{
kern_return_t ret;
if (!profile_started) {
Expand Down Expand Up @@ -740,6 +752,7 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
timerprof.tv_nsec = nsecprof%GIGA;

running = 1;
profile_all_tasks = all_tasks;
// ensure the alarm is running
ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port);
HANDLE_MACH_ERROR("clock_alarm", ret);
Expand All @@ -750,4 +763,5 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
JL_DLLEXPORT void jl_profile_stop_timer(void)
{
running = 0;
profile_all_tasks = 0;
}
Loading