From fa26768a7643cb6351a3ea62b9ae0728ea8e6ef2 Mon Sep 17 00:00:00 2001
From: "John F. Carr"
Date: Tue, 30 Jul 2024 14:39:45 -0400
Subject: [PATCH] Run the root closure on the boss thread when synced.

---
Review notes (this section is not part of the commit message):

* This copy of the patch had its line structure collapsed.  One diff line
  per text line has been restored; added/removed line counts agree with
  every hunk header and with the diffstat.  Indentation and the position
  of blank context lines are inferred, so apply with whitespace tolerance
  or regenerate the patch from the tree.  The blob ids on the "index"
  lines predate the two changes below.
* runtime/global.h: activate_boss is declared atomic_bool instead of bool.
  It is stored by arbitrary workers (provably_good_steal_maybe, Cilk_sync)
  and polled without a lock in worker_scheduler and
  handle_failed_steal_attempts, which is a data race for a plain bool.
  The existing plain assignments and reads remain valid on an atomic_bool.
* runtime/scheduler.c: worker_scheduler acts on activate_boss only when
  self == 0.  Without the check any worker in the steal loop could call
  resume_boss and run the root closure, which then fails
  CILK_ASSERT(w->self == 0) in __cilkrts_internal_exit_cilkified_root.

 runtime/global.c       |  2 +
 runtime/global.h       |  4 ++
 runtime/init.c         | 49 +++++++++---------------
 runtime/scheduler.c    | 87 +++++++++++++++++++++++++++++++-----------
 runtime/worker_sleep.h |  9 +++--
 5 files changed, 93 insertions(+), 58 deletions(-)

diff --git a/runtime/global.c b/runtime/global.c
index 2ba284bf..6ed0be67 100644
--- a/runtime/global.c
+++ b/runtime/global.c
@@ -171,6 +171,8 @@ global_state *global_state_init(int argc, char *argv[]) {
     atomic_store_explicit(&g->cilkified, 0, memory_order_relaxed);
     atomic_store_explicit(&g->disengaged_sentinel, 0, memory_order_relaxed);
 
+    g->activate_boss = false;
+
     g->terminate = false;
 
     g->worker_args =
diff --git a/runtime/global.h b/runtime/global.h
index a83af24e..c1921bc4 100644
--- a/runtime/global.h
+++ b/runtime/global.h
@@ -75,6 +75,10 @@ struct global_state {
     // optimization would improve performance.
     _Atomic uint32_t cilkified_futex __attribute__((aligned(CILK_CACHE_LINE)));
     atomic_bool cilkified;
+    // Set to true by any worker to signal that the cilkifying function
+    // needs to run on the original worker. The cilkifying closure should
+    // be locked when this is set.
+    atomic_bool activate_boss;
 
     pthread_mutex_t cilkified_lock;
     pthread_cond_t cilkified_cond_var;
diff --git a/runtime/init.c b/runtime/init.c
index 5c7538ce..cb47a7c0 100644
--- a/runtime/init.c
+++ b/runtime/init.c
@@ -577,15 +577,16 @@ void __cilkrts_internal_invoke_cilkified_root(__cilkrts_stack_frame *sf) {
     }
 }
 
-// Finish the execution of a Cilkified region. Executed by a worker in g.
+// Finish the execution of a Cilkified region. Executed by the boss worker.
 void __cilkrts_internal_exit_cilkified_root(global_state *g,
                                             __cilkrts_stack_frame *sf) {
     __cilkrts_worker *w = __cilkrts_get_tls_worker();
     CILK_ASSERT(w->l->state == WORKER_RUN);
     CILK_SWITCH_TIMING(w, INTERVAL_WORK, INTERVAL_CILKIFY_EXIT);
 
-    worker_id self = w->self;
-    const bool is_boss = (0 == self);
+    CILK_ASSERT(w->self == 0);
+
+    worker_id self = 0;
     ReadyDeque *deques = g->deques;
 
     // Mark the computation as done. Also "sleep" the workers: update global
@@ -596,16 +597,6 @@ void __cilkrts_internal_exit_cilkified_root(global_state *g,
     atomic_store_explicit(&g->done, 1, memory_order_release);
     /* wake_all_disengaged(g); */
 
-    if (!is_boss) {
-        w->l->exiting = true;
-        __cilkrts_worker **workers = g->workers;
-        __cilkrts_worker *w0 = workers[0];
-        w0->hyper_table = w->hyper_table;
-        w->hyper_table = NULL;
-        w0->extension = w->extension;
-        w->extension = NULL;
-    }
-
     // Clear this worker's deque. Nobody can successfully steal from this deque
     // at this point, because head == tail, but we still want any subsequent
     // Cilkified region to start with an empty deque. We go ahead and grab the
@@ -625,25 +616,19 @@ void __cilkrts_internal_exit_cilkified_root(global_state *g,
     sf->flags = 0;
 
     CILK_STOP_TIMING(w, INTERVAL_CILKIFY_EXIT);
-    if (is_boss) {
-        // We finished the computation on the boss thread. No need to jump to
-        // the runtime in this case; just return normally.
-        local_state *l = w->l;
-        atomic_store_explicit(&g->cilkified, 0, memory_order_relaxed);
-        l->state = WORKER_IDLE;
-        __cilkrts_need_to_cilkify = true;
-
-        // Restore the boss's original rsp, so the boss completes the Cilk
-        // function on its original stack.
-        SP(sf) = g->orig_rsp;
-        sysdep_restore_fp_state(sf);
-        sanitizer_start_switch_fiber(NULL);
-        __builtin_longjmp(sf->ctx, 1);
-    } else {
-        // done; go back to runtime
-        CILK_START_TIMING(w, INTERVAL_WORK);
-        longjmp_to_runtime(w);
-    }
+    // We finished the computation on the boss thread. No need to jump to
+    // the runtime in this case; just return normally.
+    local_state *l = w->l;
+    atomic_store_explicit(&g->cilkified, 0, memory_order_relaxed);
+    l->state = WORKER_IDLE;
+    __cilkrts_need_to_cilkify = true;
+
+    // Restore the boss's original rsp, so the boss completes the Cilk
+    // function on its original stack.
+    SP(sf) = g->orig_rsp;
+    sysdep_restore_fp_state(sf);
+    sanitizer_start_switch_fiber(NULL);
+    __builtin_longjmp(sf->ctx, 1);
 }
 
 static void global_state_terminate(global_state *g) {
diff --git a/runtime/scheduler.c b/runtime/scheduler.c
index b7291d25..7f07a1dc 100644
--- a/runtime/scheduler.c
+++ b/runtime/scheduler.c
@@ -211,6 +211,16 @@ static void setup_for_sync(__cilkrts_worker *w, worker_id self, Closure *t) {
     t->orig_rsp = NULL; // unset once we have sync-ed
 }
 
+static void resume_boss(__cilkrts_worker *w, worker_id self, Closure *t) {
+    CILK_ASSERT(t->status == CLOSURE_SUSPENDED);
+    CILK_ASSERT(!Closure_has_children(t));
+    // TODO: This should not be on any worker's deque
+    Closure_lock(self, t);
+    setup_for_sync(w, self, t);
+    Closure_set_status(t, CLOSURE_RUNNING);
+    Closure_unlock(self, t);
+}
+
 // ==============================================
 // TLS related functions
 // ==============================================
@@ -295,31 +305,44 @@ static Closure *provably_good_steal_maybe(__cilkrts_worker *const w,
 
     Closure_assert_ownership(self, parent);
     local_state *l = w->l;
+    global_state *g = w->g;
     // cilkrts_alert(STEAL, "(provably_good_steal_maybe) cl %p",
     // (void *)parent);
-    CILK_ASSERT(!l->provably_good_steal);
 
-    if (!Closure_has_children(parent) && parent->status == CLOSURE_SUSPENDED) {
-        // cilkrts_alert(STEAL | ALERT_SYNC,
-        // "(provably_good_steal_maybe) completing a sync");
+    if (Closure_has_children(parent))
+        return NULL;
 
-        CILK_ASSERT(parent->frame != NULL);
+    if (parent->status != CLOSURE_SUSPENDED)
+        return NULL;
 
-        /* do a provably-good steal; this is *really* simple */
-        l->provably_good_steal = true;
+    /* Only the cilkifying worker can run the cilkifying frame synced. */
+    if (parent == g->root_closure && w->self != 0) {
+        __cilkrts_stack_frame *sf = parent->frame;
+        CILK_ASSERT(sf);
+        if (sf->flags & CILK_FRAME_LAST) {
+            g->activate_boss = true;
+            return NULL;
+        }
+    }
 
-        setup_for_sync(w, self, parent);
-        CILK_ASSERT(parent->owner_ready_deque == NO_WORKER);
-        Closure_make_ready(parent);
+    // cilkrts_alert(STEAL | ALERT_SYNC,
+    // "(provably_good_steal_maybe) completing a sync");
 
-        cilkrts_alert(STEAL | ALERT_SYNC,
-                      "(provably_good_steal_maybe) returned %p",
-                      (void *)parent);
+    CILK_ASSERT(parent->frame != NULL);
 
-        return parent;
-    }
+    /* do a provably-good steal; this is *really* simple */
+    CILK_ASSERT(!l->provably_good_steal);
+    l->provably_good_steal = true;
+
+    setup_for_sync(w, self, parent);
+    CILK_ASSERT(parent->owner_ready_deque == NO_WORKER);
+    Closure_make_ready(parent);
+
+    cilkrts_alert(STEAL | ALERT_SYNC,
+                  "(provably_good_steal_maybe) returned %p",
+                  (void *)parent);
 
-    return NULL;
+    return parent;
 }
 
 /***
@@ -1224,7 +1247,8 @@ int Cilk_sync(__cilkrts_worker *const w, __cilkrts_stack_frame *frame) {
     int res = SYNC_READY;
 
     //----- EVENT_CILK_SYNC
-    ReadyDeque *deques = w->g->deques;
+    global_state *g = w->g;
+    ReadyDeque *deques = g->deques;
     worker_id self = w->self;
 
     deque_lock_self(deques, self);
@@ -1246,6 +1270,20 @@ int Cilk_sync(__cilkrts_worker *const w, __cilkrts_stack_frame *frame) {
     if (Closure_has_children(t)) {
         cilkrts_alert(SYNC, "(Cilk_sync) Closure %p has outstanding children",
                       (void *)t);
+        res = SYNC_NOT_READY;
+    } else if (self != 0 && t == g->root_closure && (t->frame->flags & CILK_FRAME_LAST)) {
+        cilkrts_alert(SYNC, "(Cilk_sync) Closure %p needs to run on boss",
+                      (void *)t);
+        g->activate_boss = true;
+        res = SYNC_NOT_READY;
+    } else {
+        cilkrts_alert(SYNC, "(Cilk_sync) closure %p sync successfully",
+                      (void *)t);
+        res = SYNC_READY;
+    }
+
+    if (res == SYNC_NOT_READY) {
+        // XXX not in the root closure case?
         if (t->fiber) {
             cilk_fiber_deallocate_to_pool(w, t->fiber);
         }
@@ -1263,10 +1301,7 @@ int Cilk_sync(__cilkrts_worker *const w, __cilkrts_stack_frame *frame) {
 
         Closure_suspend(deques, self, t);
         t->user_ht = ht; /* set this after state change to suspended */
-        res = SYNC_NOT_READY;
     } else {
-        cilkrts_alert(SYNC, "(Cilk_sync) closure %p sync successfully",
-                      (void *)t);
         setup_for_sync(w, self, t);
     }
 
@@ -1447,6 +1482,14 @@ void worker_scheduler(__cilkrts_worker *w) {
         while (!t && !atomic_load_explicit(&rts->done, memory_order_acquire)) {
             CILK_START_TIMING(w, INTERVAL_SCHED);
             CILK_START_TIMING(w, INTERVAL_IDLE);
+
+            if (self == 0 && rts->activate_boss) {
+                t = rts->root_closure;
+                resume_boss(w, self, t);
+                rts->activate_boss = false;
+                break;
+            }
+
 #if ENABLE_THIEF_SLEEP
             // Get the set of workers we can steal from and a local copy of the
             // index-to-worker map. We'll attempt a few steals using these
@@ -1469,10 +1512,10 @@ void worker_scheduler(__cilkrts_worker *w) {
             uint32_t sentinel = nworkers / 2;
 #endif // ENABLE_THIEF_SLEEP
 #ifndef __APPLE__
-            uint32_t lg_sentinel = sentinel == 0 ? 1
+            const uint32_t lg_sentinel = sentinel == 0 ? 1
                                                  : (8 * sizeof(sentinel)) -
                                                        __builtin_clz(sentinel);
-            uint32_t sentinel_div_lg_sentinel =
+            const uint32_t sentinel_div_lg_sentinel =
                 sentinel == 0 ? 1
                               : (sentinel >> (8 * sizeof(lg_sentinel) -
                                               __builtin_clz(lg_sentinel)));
diff --git a/runtime/worker_sleep.h b/runtime/worker_sleep.h
index 0ea058f0..644c5a9a 100644
--- a/runtime/worker_sleep.h
+++ b/runtime/worker_sleep.h
@@ -467,12 +467,13 @@ handle_failed_steal_attempts(global_state *const rts, worker_id self,
 #endif
 
     if (is_boss) {
-        if (fails % NAP_THRESHOLD == 0) {
-            // The boss thread should never disengage. Sleep instead.
+        if (fails % NAP_THRESHOLD == 0 && !rts->activate_boss) {
+            // The boss thread should never disengage or
+            // sleep for a long time.
             const struct timespec sleeptime = {
                 .tv_sec = 0,
-                .tv_nsec =
-                    (fails > SLEEP_THRESHOLD) ? SLEEP_NSEC : NAP_NSEC};
+                .tv_nsec = 1000
+            };
             nanosleep(&sleeptime, NULL);
         }
     } else {