From b3a5709ee563974f62bb7622d137ca3901d2a24a Mon Sep 17 00:00:00 2001
From: Kyle Singer
Date: Mon, 25 Nov 2024 11:51:27 -0600
Subject: [PATCH] boss only wakes 1 worker (defers thief wake); sleeping
 workers chain wakes

---
 runtime/init.c         |  2 +-
 runtime/scheduler.c    | 16 +++++++++---
 runtime/worker_coord.h | 56 ++++++++++++++++++++++++++---------------
 3 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/runtime/init.c b/runtime/init.c
index c2fb865d..820f18af 100644
--- a/runtime/init.c
+++ b/runtime/init.c
@@ -556,7 +556,7 @@ void __cilkrts_internal_invoke_cilkified_root(__cilkrts_stack_frame *sf) {
     // occur, rather than all at once. Initial testing of this approach did not
     // seem to perform well, however. One possible reason why could be because
     // of the extra kernel interactions involved in waking workers gradually.
-    wake_thieves(g);
+    async_wake_thieves(g);
     /* request_more_thieves(g, g->nworkers); */
 
     // Start the workers if necessary
diff --git a/runtime/scheduler.c b/runtime/scheduler.c
index 2ed42bab..92714cd9 100644
--- a/runtime/scheduler.c
+++ b/runtime/scheduler.c
@@ -1605,7 +1605,10 @@ void worker_scheduler(__cilkrts_worker *w) {
                atomic_load_explicit(&rts->done, memory_order_relaxed)) {
             busy_pause();
         }
-        if (thief_should_wait(rts)) {
+        const uint32_t local_wake = take_current_wake_value(rts);
+        // Disabled alternative: if this worker took the first wake value
+        // (local_wake == nworkers - 1u), chain-wake via deferred_wake_thieves.
+        if (thief_should_wait(local_wake)) {
             break;
         }
     }
@@ -1652,11 +1655,18 @@ void *scheduler_thread_proc(void *arg) {
         // Wait for g->start == 1 to start executing the work-stealing loop. We
         // use a condition variable to wait on g->start, because this approach
         // seems to result in better performance.
-        if (thief_should_wait(rts)) {
+        uint32_t local_wake = take_current_wake_value(rts);
+        if (thief_should_wait(local_wake)) {
             disengage_worker(rts, nworkers, self);
-            l->wake_val = thief_wait(rts);
+            local_wake = thief_wait(rts);
+            l->wake_val = local_wake;
             reengage_worker(rts, nworkers, self);
+            deferred_wake_thieves(rts);
         }
+
+        // Disabled alternative: issue the chained wake at this point instead,
+        // and only from the worker that took the first wake value
+        // (local_wake == rts->nworkers - 1u).
         CILK_STOP_TIMING(w, INTERVAL_SLEEP_UNCILK);
 
         // Check if we should exit this scheduling function.
diff --git a/runtime/worker_coord.h b/runtime/worker_coord.h
index c27f573d..0f681c64 100644
--- a/runtime/worker_coord.h
+++ b/runtime/worker_coord.h
@@ -17,7 +17,7 @@
 
 #include "global.h"
 
-#define USER_USE_FUTEX 1
+#define USER_USE_FUTEX 0
 #ifdef __linux__
 #define USE_FUTEX USER_USE_FUTEX
 #else
@@ -299,11 +299,7 @@ static inline uint32_t thief_wait(global_state *g) {
     return thief_disengage(g);
 }
 
-// Called by a thief thread. Check if the thief should start waiting for the
-// start of a cilkified region. If a new cilkified region has been started
-// already, update the global state to indicate that this worker is engaged in
-// work stealing.
-static inline bool thief_should_wait(global_state *g) {
+static inline uint32_t take_current_wake_value(global_state *const g) {
     _Atomic uint32_t *futexp = &g->disengaged_thieves_futex;
     uint32_t val = atomic_load_explicit(futexp, memory_order_relaxed);
 #if USE_FUTEX
@@ -311,35 +307,40 @@ static inline bool thief_should_wait(global_state *g) {
     while (val > 0) {
         if (atomic_compare_exchange_weak_explicit(futexp, &val, val - 1,
                                                   memory_order_release,
                                                   memory_order_relaxed))
-            return false;
+            break;
         busy_loop_pause();
         val = atomic_load_explicit(futexp, memory_order_relaxed);
     }
-    return true;
 #else
-    if (val == 0)
-        return true;
-
-    pthread_mutex_t *lock = &g->disengaged_lock;
-    pthread_mutex_lock(lock);
-    val = atomic_load_explicit(futexp, memory_order_relaxed);
-    if (val > 0) {
-        atomic_store_explicit(futexp, val - 1, memory_order_release);
+    if (val != 0) {
+        pthread_mutex_t *lock = &g->disengaged_lock;
+        pthread_mutex_lock(lock);
+        val = atomic_load_explicit(futexp, memory_order_relaxed);
+        if (val > 0) {
+            atomic_store_explicit(futexp, val - 1, memory_order_release);
+        }
         pthread_mutex_unlock(lock);
-        return false;
     }
-    pthread_mutex_unlock(lock);
-    return true;
 #endif
+    return val;
+}
+
+// Called by a thief thread with the wake value just obtained from
+// take_current_wake_value(). A wake value of zero means no new cilkified
+// region has started, so the thief should wait; a nonzero value means this
+// thief has already claimed a wake and should engage in work stealing.
+static inline bool thief_should_wait(const uint32_t wake_value) {
+    return wake_value == 0u;
+}
 
 // Signal the thief threads to start work-stealing (or terminate, if
 // g->terminate == 1).
-static inline void wake_thieves(global_state *g) {
+static inline void async_wake_thieves(global_state *const g) {
 #if USE_FUTEX
     atomic_store_explicit(&g->disengaged_thieves_futex, g->nworkers - 1,
                           memory_order_release);
-    long s = futex(&g->disengaged_thieves_futex, FUTEX_WAKE_PRIVATE, INT_MAX,
+    long s = futex(&g->disengaged_thieves_futex, FUTEX_WAKE_PRIVATE, 1,
                    NULL, NULL, 0);
     if (s == -1)
         errExit("futex-FUTEX_WAKE");
@@ -347,6 +348,19 @@ static inline void wake_thieves(global_state *g) {
     pthread_mutex_lock(&g->disengaged_lock);
     atomic_store_explicit(&g->disengaged_thieves_futex, g->nworkers - 1,
                           memory_order_release);
+    pthread_cond_signal(&g->disengaged_cond_var);
+    pthread_mutex_unlock(&g->disengaged_lock);
+#endif
+}
+
+static inline void deferred_wake_thieves(global_state *const g) {
+#if USE_FUTEX
+    long s = futex(&g->disengaged_thieves_futex, FUTEX_WAKE_PRIVATE, 1,
+                   NULL, NULL, 0);
+    if (s == -1)
+        errExit("futex-FUTEX_WAKE");
+#else
+    pthread_mutex_lock(&g->disengaged_lock);
     pthread_cond_broadcast(&g->disengaged_cond_var);
     pthread_mutex_unlock(&g->disengaged_lock);
 #endif
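
Note on the wake protocol: the boss publishes nworkers - 1 wake "tickets" in
g->disengaged_thieves_futex but pays for only one kernel wakeup
(FUTEX_WAKE_PRIVATE with count 1, or a single pthread_cond_signal). Each thief
that wakes claims a ticket by decrementing the counter (take_current_wake_value)
and then forwards the wake to at most one more sleeper (deferred_wake_thieves),
so the remaining wakeups happen on worker threads rather than on the boss. The
program below is a minimal, self-contained sketch of that chained-wake handoff
using only portable pthreads primitives; the names (wake_state, boss_wake_one,
worker_main, NWORKERS) are hypothetical and not part of the runtime's API, and
it mirrors the wake-one behavior of the futex path rather than the broadcast
used by the patch's non-futex deferred_wake_thieves.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NWORKERS 4 /* hypothetical worker count for the demo */

/* Analogue of g->disengaged_thieves_futex plus its lock and condvar. */
typedef struct {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    uint32_t wake_tickets; /* how many sleepers may still wake */
} wake_state;

static wake_state ws = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0};

/* Boss: publish nworkers - 1 tickets but signal only ONE sleeper.
 * Rough analogue of async_wake_thieves(). */
static void boss_wake_one(uint32_t nworkers) {
    pthread_mutex_lock(&ws.lock);
    ws.wake_tickets = nworkers - 1;
    pthread_cond_signal(&ws.cond);
    pthread_mutex_unlock(&ws.lock);
}

/* Worker: sleep until a ticket is available, claim it (analogue of
 * take_current_wake_value), then forward the wake to at most one more
 * sleeper (analogue of deferred_wake_thieves). */
static void *worker_main(void *arg) {
    long id = (long)arg;
    pthread_mutex_lock(&ws.lock);
    while (ws.wake_tickets == 0)
        pthread_cond_wait(&ws.cond, &ws.lock);
    ws.wake_tickets--;             /* claim a wake ticket */
    pthread_cond_signal(&ws.cond); /* chain the wake to the next sleeper */
    pthread_mutex_unlock(&ws.lock);
    printf("worker %ld engaged\n", id);
    return NULL;
}

int main(void) {
    pthread_t workers[NWORKERS - 1];
    for (long i = 0; i < NWORKERS - 1; i++)
        pthread_create(&workers[i], NULL, worker_main, (void *)i);
    boss_wake_one(NWORKERS); /* one signal; the workers chain the rest */
    for (long i = 0; i < NWORKERS - 1; i++)
        pthread_join(workers[i], NULL);
    return 0;
}

Built with cc -pthread, this prints one "engaged" line per worker even though
the boss issues a single signal: no wakeup is lost because the ticket count is
checked under the lock before waiting, which is the same reason the patch can
defer the remaining wakes to the thieves themselves.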