From dad558815d645b4d8afe708284d17d359cc1c991 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Thu, 14 Dec 2023 10:53:02 -0300 Subject: [PATCH] GC scheduler refinements (#52294) (#117) --- src/gc.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++-------- src/gc.h | 15 +------ 2 files changed, 102 insertions(+), 30 deletions(-) diff --git a/src/gc.c b/src/gc.c index a4161a6e63235..cb2bab9acd59f 100644 --- a/src/gc.c +++ b/src/gc.c @@ -24,6 +24,8 @@ int gc_first_tid; // Mutex/cond used to synchronize sleep/wakeup of GC threads uv_mutex_t gc_threads_lock; uv_cond_t gc_threads_cond; +// Mutex used to coordinate entry of GC threads in the mark loop +uv_mutex_t gc_queue_observer_lock; // Linked list of callback functions @@ -2867,8 +2869,10 @@ void gc_mark_and_steal(jl_ptls_t ptls) jl_gc_markqueue_t *mq = &ptls->mark_queue; jl_gc_markqueue_t *mq_master = NULL; int master_tid = jl_atomic_load(&gc_master_tid); - if (master_tid != -1) - mq_master = &gc_all_tls_states[master_tid]->mark_queue; + if (master_tid == -1) { + return; + } + mq_master = &gc_all_tls_states[master_tid]->mark_queue; void *new_obj; jl_gc_chunk_t c; pop : { @@ -2943,28 +2947,108 @@ void gc_mark_and_steal(jl_ptls_t ptls) } } +size_t gc_count_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + // assume each chunk is worth 256 units of work and each pointer + // is worth 1 unit of work + size_t work = 256 * (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) - + jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top)); + work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) - + jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top)); + return work; +} + +/** + * Correctness argument for the mark-loop termination protocol. + * + * Safety properties: + * - No work items shall be in any thread's queues when `gc_mark_loop_barrier` observes + * that `gc_n_threads_marking` is zero. + * + * - No work item shall be stolen from the master thread (i.e. mutator thread which started + * GC and which helped the `jl_n_gcthreads` - 1 threads to mark) after + * `gc_mark_loop_barrier` observes that `gc_n_threads_marking` is zero. This property is + * necessary because we call `gc_mark_loop_serial` after marking the finalizer list in + * `_jl_gc_collect`, and want to ensure that we have the serial mark-loop semantics there, + * and that no work is stolen from us at that point. + * + * Proof: + * - Suppose the master thread observes that `gc_n_threads_marking` is zero in + * `gc_mark_loop_barrier` and there is a work item left in one thread's queue at that point. + * Since threads try to steal from all threads' queues, this implies that all threads must + * have tried to steal from the queue which still has a work item left, but failed to do so, + * which violates the semantics of Chase-Lev's work-stealing queue. + * + * - Let E1 be the event "master thread writes -1 to gc_master_tid" and E2 be the event + * "master thread observes that `gc_n_threads_marking` is zero". Since we're using + * sequentially consistent atomics, E1 => E2. Now suppose one thread which is spinning in + * `gc_should_mark` tries to enter the mark-loop after E2. In order to do so, it must + * increment `gc_n_threads_marking` to 1 in an event E3, and then read `gc_master_tid` in an + * event E4. Since we're using sequentially consistent atomics, E3 => E4. Since we observed + * `gc_n_threads_marking` as zero in E2, then E2 => E3, and we conclude E1 => E4, so that + * the thread which is spinning in `gc_should_mark` must observe that `gc_master_tid` is -1 + * and therefore won't enter the mark-loop. + */ + +int gc_should_mark(jl_ptls_t ptls) +{ + int should_mark = 0; + int n_threads_marking = jl_atomic_load(&gc_n_threads_marking); + // fast path + if (n_threads_marking == 0) { + return 0; + } + uv_mutex_lock(&gc_queue_observer_lock); + while (1) { + int tid = jl_atomic_load(&gc_master_tid); + // fast path + if (tid == -1) { + break; + } + n_threads_marking = jl_atomic_load(&gc_n_threads_marking); + // fast path + if (n_threads_marking == 0) { + break; + } + size_t work = gc_count_work_in_queue(gc_all_tls_states[tid]); + for (tid = gc_first_tid; tid < gc_first_tid + jl_n_gcthreads; tid++) { + work += gc_count_work_in_queue(gc_all_tls_states[tid]); + } + // if there is a lot of work left, enter the mark loop + if (work >= 16 * n_threads_marking) { + jl_atomic_fetch_add(&gc_n_threads_marking, 1); + should_mark = 1; + break; + } + jl_cpu_pause(); + } + uv_mutex_unlock(&gc_queue_observer_lock); + return should_mark; +} + +void gc_wake_all_for_marking(jl_ptls_t ptls) +{ + jl_atomic_store(&gc_master_tid, ptls->tid); + uv_mutex_lock(&gc_threads_lock); + jl_atomic_fetch_add(&gc_n_threads_marking, 1); + uv_cond_broadcast(&gc_threads_cond); + uv_mutex_unlock(&gc_threads_lock); +} + void gc_mark_loop_parallel(jl_ptls_t ptls, int master) { - int backoff = GC_BACKOFF_MIN; if (master) { - jl_atomic_store(&gc_master_tid, ptls->tid); - // Wake threads up and try to do some work - uv_mutex_lock(&gc_threads_lock); - jl_atomic_fetch_add(&gc_n_threads_marking, 1); - uv_cond_broadcast(&gc_threads_cond); - uv_mutex_unlock(&gc_threads_lock); + gc_wake_all_for_marking(ptls); gc_mark_and_steal(ptls); jl_atomic_fetch_add(&gc_n_threads_marking, -1); } - while (jl_atomic_load(&gc_n_threads_marking) > 0) { - // Try to become a thief while other threads are marking - jl_atomic_fetch_add(&gc_n_threads_marking, 1); - if (jl_atomic_load(&gc_master_tid) != -1) { - gc_mark_and_steal(ptls); + while (1) { + int should_mark = gc_should_mark(ptls); + if (!should_mark) { + break; } + gc_mark_and_steal(ptls); jl_atomic_fetch_add(&gc_n_threads_marking, -1); - // Failed to steal - gc_backoff(&backoff); } } @@ -3745,6 +3829,7 @@ void jl_gc_init(void) uv_mutex_init(&gc_perm_lock); uv_mutex_init(&gc_threads_lock); uv_cond_init(&gc_threads_cond); + uv_mutex_init(&gc_queue_observer_lock); jl_gc_init_page(); jl_gc_debug_init(); diff --git a/src/gc.h b/src/gc.h index cadd489515456..ff52269b73af9 100644 --- a/src/gc.h +++ b/src/gc.h @@ -189,25 +189,12 @@ extern jl_gc_page_stack_t global_page_pool_lazily_freed; extern jl_gc_page_stack_t global_page_pool_clean; extern jl_gc_page_stack_t global_page_pool_freed; -#define GC_BACKOFF_MIN 4 -#define GC_BACKOFF_MAX 12 - -STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT -{ - if (*i < GC_BACKOFF_MAX) { - (*i)++; - } - for (int j = 0; j < (1 << *i); j++) { - jl_cpu_pause(); - } -} - // Lock-free stack implementation taken // from Herlihy's "The Art of Multiprocessor Programming" // XXX: this is not a general-purpose lock-free stack. We can // get away with just using a CAS and not implementing some ABA // prevention mechanism since once a node is popped from the -// `jl_gc_global_page_pool_t`, it may only be pushed back to them +// `jl_gc_page_stack_t`, it may only be pushed back to them // in the sweeping phase, which also doesn't push a node into the // same stack after it's popped