From ed028daa08feba3735b086ec5bae515972bc751b Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 11 Dec 2024 14:14:35 +0000 Subject: [PATCH] gc: improve mallocarrays locality small_arraylist_t has much better memory locality and space utilization than a linked list with individually malloc'd elements --- src/gc-common.c | 13 ++----------- src/gc-common.h | 6 ------ src/gc-debug.c | 10 ++++------ src/gc-stock.c | 36 ++++++++++++++++-------------------- src/gc-tls-common.h | 5 ++--- 5 files changed, 24 insertions(+), 46 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index c751b54f059f5..3d578b81578b1 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -557,17 +557,8 @@ size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT // tracking Memorys with malloc'd storage void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ // This is **NOT** a GC safe point. - mallocmemory_t *ma; - if (ptls->gc_tls_common.heap.mafreelist == NULL) { - ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); - } - else { - ma = ptls->gc_tls_common.heap.mafreelist; - ptls->gc_tls_common.heap.mafreelist = ma->next; - } - ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls_common.heap.mallocarrays; - ptls->gc_tls_common.heap.mallocarrays = ma; + void *a = (void*)((uintptr_t)m | !!isaligned); + small_arraylist_push(&ptls->gc_tls_common.heap.mallocarrays, a); } // =========================================================================== // diff --git a/src/gc-common.h b/src/gc-common.h index 3007151009f7d..1745df17950f9 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -61,12 +61,6 @@ extern jl_gc_callback_list_t *gc_cblist_notify_gc_pressure; // malloc wrappers, aligned allocation // =========================================================================== // -// data structure for tracking malloc'd genericmemory. -typedef struct _mallocmemory_t { - jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory - struct _mallocmemory_t *next; -} mallocmemory_t; - #if defined(_OS_WINDOWS_) STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align) { diff --git a/src/gc-debug.c b/src/gc-debug.c index 7c479484cde45..6e51064035b7b 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1025,12 +1025,11 @@ void gc_stats_big_obj(void) v = v->next; } - mallocmemory_t *ma = ptls2->gc_tls.heap.mallocarrays; - while (ma != NULL) { - uint8_t bits =jl_astaggedvalue(ma->a)->bits.gc; + void **lst = ptls2->gc_tls.heap.mallocarrays.items; + for (size_t i = 0, l = ptls2->gc_tls.heap.mallocarrays.len; i < l; i++) { + jl_genericmemory_t *m = (jl_genericmemory_t*)((uintptr_t)lst[i] & ~(uintptr_t)1); + uint8_t bits = jl_astaggedvalue(m)->bits.gc; if (gc_marked(bits)) { - jl_genericmemory_t *m = (jl_genericmemory_t*)ma->a; - m = (jl_genericmemory_t*)((uintptr_t)m & ~(uintptr_t)1); size_t sz = jl_genericmemory_nbytes(m); if (gc_old(bits)) { assert(bits == GC_OLD_MARKED); @@ -1042,7 +1041,6 @@ void gc_stats_big_obj(void) stat.nbytes_used += sz; } } - ma = ma->next; } } jl_safe_printf("%lld kB (%lld%% old) in %lld large objects (%lld%% old)\n", diff --git a/src/gc-stock.c b/src/gc-stock.c index 61a013f347975..fe10a59b89623 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -629,10 +629,9 @@ void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT reset_thread_gc_counts(); } -static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT +static void jl_gc_free_memory(jl_genericmemory_t *m, int isaligned) JL_NOTSAFEPOINT { - assert(jl_is_genericmemory(v)); - jl_genericmemory_t *m = (jl_genericmemory_t*)v; + assert(jl_is_genericmemory(m)); assert(jl_genericmemory_how(m) == 1 || jl_genericmemory_how(m) == 2); char *d = (char*)m->ptr; size_t freed_bytes = memory_block_usable_size(d, isaligned); @@ -654,25 +653,23 @@ static void sweep_malloced_memory(void) JL_NOTSAFEPOINT for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { - mallocmemory_t *ma = ptls2->gc_tls_common.heap.mallocarrays; - mallocmemory_t **pma = &ptls2->gc_tls_common.heap.mallocarrays; - while (ma != NULL) { - mallocmemory_t *nxt = ma->next; - jl_value_t *a = (jl_value_t*)((uintptr_t)ma->a & ~1); - int bits = jl_astaggedvalue(a)->bits.gc; - if (gc_marked(bits)) { - pma = &ma->next; + size_t n = 0; + size_t l = ptls2->gc_tls_common.heap.mallocarrays.len; + void **lst = ptls2->gc_tls_common.heap.mallocarrays.items; + // filter without preserving order + while (n < l) { + jl_genericmemory_t *m = (jl_genericmemory_t*)((uintptr_t)lst[n] & ~1); + if (gc_marked(jl_astaggedvalue(m)->bits.gc)) { + n++; } else { - *pma = nxt; - int isaligned = (uintptr_t)ma->a & 1; - jl_gc_free_memory(a, isaligned); - ma->next = ptls2->gc_tls_common.heap.mafreelist; - ptls2->gc_tls_common.heap.mafreelist = ma; + int isaligned = (uintptr_t)lst[n] & 1; + jl_gc_free_memory(m, isaligned); + l--; + lst[n] = lst[l]; } - gc_time_count_mallocd_memory(bits); - ma = nxt; } + ptls2->gc_tls_common.heap.mallocarrays.len = l; } } gc_time_mallocd_memory_end(); @@ -3439,8 +3436,7 @@ void jl_init_thread_heap(jl_ptls_t ptls) small_arraylist_new(&common_heap->live_tasks, 0); for (int i = 0; i < JL_N_STACK_POOLS; i++) small_arraylist_new(&common_heap->free_stacks[i], 0); - common_heap->mallocarrays = NULL; - common_heap->mafreelist = NULL; + small_arraylist_new(&common_heap->mallocarrays, 0); heap->young_generation_of_bigvals = (bigval_t*)calloc_s(sizeof(bigval_t)); // sentinel assert(gc_bigval_sentinel_tag != 0); // make sure the sentinel is initialized heap->young_generation_of_bigvals->header = gc_bigval_sentinel_tag; diff --git a/src/gc-tls-common.h b/src/gc-tls-common.h index ba36f5c1c238e..473668d648294 100644 --- a/src/gc-tls-common.h +++ b/src/gc-tls-common.h @@ -21,9 +21,8 @@ typedef struct { // that are holding onto a stack from the pool small_arraylist_t live_tasks; - // variables for tracking malloc'd arrays - struct _mallocmemory_t *mallocarrays; - struct _mallocmemory_t *mafreelist; + // variable for tracking malloc'd arrays + small_arraylist_t mallocarrays; #define JL_N_STACK_POOLS 16 small_arraylist_t free_stacks[JL_N_STACK_POOLS];