Skip to content

Commit

Permalink
Run the root closure on the boss thread when synced.
Browse files Browse the repository at this point in the history
  • Loading branch information
VoxSciurorum committed Sep 19, 2024
1 parent f461280 commit 659c3c1
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 58 deletions.
2 changes: 2 additions & 0 deletions runtime/global.c
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ global_state *global_state_init(int argc, char *argv[]) {
atomic_store_explicit(&g->cilkified, 0, memory_order_relaxed);
atomic_store_explicit(&g->disengaged_sentinel, 0, memory_order_relaxed);

g->activate_boss = false;

g->terminate = false;

g->worker_args =
Expand Down
4 changes: 4 additions & 0 deletions runtime/global.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ struct global_state {
// optimization would improve performance.
_Atomic uint32_t cilkified_futex __attribute__((aligned(CILK_CACHE_LINE)));
atomic_bool cilkified;
// Set to true by any worker to signal that the cilkifying function
// needs to run on the original worker. The cilkifying closure should
// be locked when this is set.
bool activate_boss;

pthread_mutex_t cilkified_lock;
pthread_cond_t cilkified_cond_var;
Expand Down
49 changes: 17 additions & 32 deletions runtime/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -577,15 +577,16 @@ void __cilkrts_internal_invoke_cilkified_root(__cilkrts_stack_frame *sf) {
}
}

// Finish the execution of a Cilkified region. Executed by a worker in g.
// Finish the execution of a Cilkified region. Executed by the boss worker.
void __cilkrts_internal_exit_cilkified_root(global_state *g,
__cilkrts_stack_frame *sf) {
__cilkrts_worker *w = __cilkrts_get_tls_worker();
CILK_ASSERT(w->l->state == WORKER_RUN);
CILK_SWITCH_TIMING(w, INTERVAL_WORK, INTERVAL_CILKIFY_EXIT);

worker_id self = w->self;
const bool is_boss = (0 == self);
CILK_ASSERT(w->self == 0);

worker_id self = 0;
ReadyDeque *deques = g->deques;

// Mark the computation as done. Also "sleep" the workers: update global
Expand All @@ -596,16 +597,6 @@ void __cilkrts_internal_exit_cilkified_root(global_state *g,
atomic_store_explicit(&g->done, 1, memory_order_release);
/* wake_all_disengaged(g); */

if (!is_boss) {
w->l->exiting = true;
__cilkrts_worker **workers = g->workers;
__cilkrts_worker *w0 = workers[0];
w0->hyper_table = w->hyper_table;
w->hyper_table = NULL;
w0->extension = w->extension;
w->extension = NULL;
}

// Clear this worker's deque. Nobody can successfully steal from this deque
// at this point, because head == tail, but we still want any subsequent
// Cilkified region to start with an empty deque. We go ahead and grab the
Expand All @@ -625,25 +616,19 @@ void __cilkrts_internal_exit_cilkified_root(global_state *g,
sf->flags = 0;

CILK_STOP_TIMING(w, INTERVAL_CILKIFY_EXIT);
if (is_boss) {
// We finished the computation on the boss thread. No need to jump to
// the runtime in this case; just return normally.
local_state *l = w->l;
atomic_store_explicit(&g->cilkified, 0, memory_order_relaxed);
l->state = WORKER_IDLE;
__cilkrts_need_to_cilkify = true;

// Restore the boss's original rsp, so the boss completes the Cilk
// function on its original stack.
SP(sf) = g->orig_rsp;
sysdep_restore_fp_state(sf);
sanitizer_start_switch_fiber(NULL);
__builtin_longjmp(sf->ctx, 1);
} else {
// done; go back to runtime
CILK_START_TIMING(w, INTERVAL_WORK);
longjmp_to_runtime(w);
}
// We finished the computation on the boss thread. No need to jump to
// the runtime in this case; just return normally.
local_state *l = w->l;
atomic_store_explicit(&g->cilkified, 0, memory_order_relaxed);
l->state = WORKER_IDLE;
__cilkrts_need_to_cilkify = true;

// Restore the boss's original rsp, so the boss completes the Cilk
// function on its original stack.
SP(sf) = g->orig_rsp;
sysdep_restore_fp_state(sf);
sanitizer_start_switch_fiber(NULL);
__builtin_longjmp(sf->ctx, 1);
}

static void global_state_terminate(global_state *g) {
Expand Down
87 changes: 65 additions & 22 deletions runtime/scheduler.c
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,16 @@ static void setup_for_sync(__cilkrts_worker *w, worker_id self, Closure *t) {
t->orig_rsp = NULL; // unset once we have sync-ed
}

// Resume a suspended closure on the calling worker: prepare it with
// setup_for_sync() and mark it CLOSURE_RUNNING, all under the closure's lock.
//
// w    - the calling worker.
// self - id of the calling worker; used as the owner for Closure_lock /
//        Closure_unlock.
// t    - the closure to resume. The scheduler loop passes the global root
//        closure after observing the activate_boss flag.
//
// NOTE(review): presumably only ever reached on the boss (worker 0) --
// confirm against the caller in worker_scheduler.
static void resume_boss(__cilkrts_worker *w, worker_id self, Closure *t) {
    // TODO: This should not be on any worker's deque
    Closure_lock(self, t);

    // Validate the closure's state only once the lock is held.  The worker
    // that raises activate_boss does so while it still owns the closure and
    // (in Cilk_sync) before it has actually suspended it, so checking the
    // status without the lock can race with that worker and trip the
    // assertion spuriously.
    CILK_ASSERT(t->status == CLOSURE_SUSPENDED);
    CILK_ASSERT(!Closure_has_children(t));

    setup_for_sync(w, self, t);
    Closure_set_status(t, CLOSURE_RUNNING);
    Closure_unlock(self, t);
}

// ==============================================
// TLS related functions
// ==============================================
Expand Down Expand Up @@ -295,31 +305,44 @@ static Closure *provably_good_steal_maybe(__cilkrts_worker *const w,

Closure_assert_ownership(self, parent);
local_state *l = w->l;
global_state *g = w->g;
// cilkrts_alert(STEAL, "(provably_good_steal_maybe) cl %p",
// (void *)parent);
CILK_ASSERT(!l->provably_good_steal);

if (!Closure_has_children(parent) && parent->status == CLOSURE_SUSPENDED) {
// cilkrts_alert(STEAL | ALERT_SYNC,
// "(provably_good_steal_maybe) completing a sync");
if (Closure_has_children(parent))
return NULL;

CILK_ASSERT(parent->frame != NULL);
if (parent->status != CLOSURE_SUSPENDED)
return NULL;

/* do a provably-good steal; this is *really* simple */
l->provably_good_steal = true;
/* Only the cilkifying worker can run the cilkifying frame synced. */
if (parent == g->root_closure && w->self != 0) {
__cilkrts_stack_frame *sf = parent->frame;
CILK_ASSERT(sf);
if (sf->flags & CILK_FRAME_LAST) {
g->activate_boss = true;
return NULL;
}
}

setup_for_sync(w, self, parent);
CILK_ASSERT(parent->owner_ready_deque == NO_WORKER);
Closure_make_ready(parent);
// cilkrts_alert(STEAL | ALERT_SYNC,
// "(provably_good_steal_maybe) completing a sync");

cilkrts_alert(STEAL | ALERT_SYNC,
"(provably_good_steal_maybe) returned %p",
(void *)parent);
CILK_ASSERT(parent->frame != NULL);

return parent;
}
/* do a provably-good steal; this is *really* simple */
CILK_ASSERT(!l->provably_good_steal);
l->provably_good_steal = true;

setup_for_sync(w, self, parent);
CILK_ASSERT(parent->owner_ready_deque == NO_WORKER);
Closure_make_ready(parent);

cilkrts_alert(STEAL | ALERT_SYNC,
"(provably_good_steal_maybe) returned %p",
(void *)parent);

return NULL;
return parent;
}

/***
Expand Down Expand Up @@ -1224,7 +1247,8 @@ int Cilk_sync(__cilkrts_worker *const w, __cilkrts_stack_frame *frame) {
int res = SYNC_READY;

//----- EVENT_CILK_SYNC
ReadyDeque *deques = w->g->deques;
global_state *g = w->g;
ReadyDeque *deques = g->deques;
worker_id self = w->self;

deque_lock_self(deques, self);
Expand All @@ -1246,6 +1270,20 @@ int Cilk_sync(__cilkrts_worker *const w, __cilkrts_stack_frame *frame) {
if (Closure_has_children(t)) {
cilkrts_alert(SYNC, "(Cilk_sync) Closure %p has outstanding children",
(void *)t);
res = SYNC_NOT_READY;
} else if (self != 0 && t == g->root_closure && (t->frame->flags & CILK_FRAME_LAST)) {
cilkrts_alert(SYNC, "(Cilk_sync) Closure %p needs to run on boss",
(void *)t);
g->activate_boss = true;
res = SYNC_NOT_READY;
} else {
cilkrts_alert(SYNC, "(Cilk_sync) closure %p sync successfully",
(void *)t);
res = SYNC_READY;
}

if (res == SYNC_NOT_READY) {
// XXX not in the root closure case?
if (t->fiber) {
cilk_fiber_deallocate_to_pool(w, t->fiber);
}
Expand All @@ -1263,10 +1301,7 @@ int Cilk_sync(__cilkrts_worker *const w, __cilkrts_stack_frame *frame) {

Closure_suspend(deques, self, t);
t->user_ht = ht; /* set this after state change to suspended */
res = SYNC_NOT_READY;
} else {
cilkrts_alert(SYNC, "(Cilk_sync) closure %p sync successfully",
(void *)t);
setup_for_sync(w, self, t);
}

Expand Down Expand Up @@ -1447,6 +1482,14 @@ void worker_scheduler(__cilkrts_worker *w) {
while (!t && !atomic_load_explicit(&rts->done, memory_order_acquire)) {
CILK_START_TIMING(w, INTERVAL_SCHED);
CILK_START_TIMING(w, INTERVAL_IDLE);

if (rts->activate_boss) {
t = rts->root_closure;
resume_boss(w, self, t);
rts->activate_boss = false;
break;
}

#if ENABLE_THIEF_SLEEP
// Get the set of workers we can steal from and a local copy of the
// index-to-worker map. We'll attempt a few steals using these
Expand All @@ -1469,10 +1512,10 @@ void worker_scheduler(__cilkrts_worker *w) {
uint32_t sentinel = nworkers / 2;
#endif // ENABLE_THIEF_SLEEP
#ifndef __APPLE__
uint32_t lg_sentinel = sentinel == 0 ? 1
const uint32_t lg_sentinel = sentinel == 0 ? 1
: (8 * sizeof(sentinel)) -
__builtin_clz(sentinel);
uint32_t sentinel_div_lg_sentinel =
const uint32_t sentinel_div_lg_sentinel =
sentinel == 0 ? 1
: (sentinel >> (8 * sizeof(lg_sentinel) -
__builtin_clz(lg_sentinel)));
Expand Down
9 changes: 5 additions & 4 deletions runtime/worker_sleep.h
Original file line number Diff line number Diff line change
Expand Up @@ -467,12 +467,13 @@ handle_failed_steal_attempts(global_state *const rts, worker_id self,

#endif
if (is_boss) {
if (fails % NAP_THRESHOLD == 0) {
// The boss thread should never disengage. Sleep instead.
if (fails % NAP_THRESHOLD == 0 && !rts->activate_boss) {
// The boss thread should never disengage or
// sleep for a long time.
const struct timespec sleeptime = {
.tv_sec = 0,
.tv_nsec =
(fails > SLEEP_THRESHOLD) ? SLEEP_NSEC : NAP_NSEC};
.tv_nsec = 1000
};
nanosleep(&sleeptime, NULL);
}
} else {
Expand Down

0 comments on commit 659c3c1

Please sign in to comment.