Skip to content

Commit

Permalink
WIP: create separate object pools for long-lived types
Browse files Browse the repository at this point in the history
  • Loading branch information
d-netto committed Dec 10, 2023
1 parent 005e280 commit 9f01e8d
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 26 deletions.
155 changes: 131 additions & 24 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ int gc_first_tid;
// Mutex/cond used to synchronize sleep/wakeup of GC threads
uv_mutex_t gc_threads_lock;
uv_cond_t gc_threads_cond;
FILE *page_profiling_file;

// Linked list of callback functions

Expand Down Expand Up @@ -1444,6 +1445,19 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc_instrumented(jl_ptls_t ptls, int pool_offset,
int osize, jl_value_t* type)
{
jl_datatype_t *ty = (jl_datatype_t*)type;
if (ty == jl_simplevector_type) {
pool_offset += JL_GC_N_MAX_POOLS;
}
else if (ty == jl_code_instance_type) {
pool_offset = 2 * JL_GC_N_MAX_POOLS;
}
else if (ty == jl_method_instance_type) {
pool_offset = 2 * JL_GC_N_MAX_POOLS + 1;
}
else if (ty == jl_typemap_entry_type) {
pool_offset = 2 * JL_GC_N_MAX_POOLS + 2;
}
jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize);
maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type);
return val;
Expand Down Expand Up @@ -1473,25 +1487,71 @@ JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS];

STATIC_INLINE void gc_update_page_fragmentation_data(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT
{
gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[pg->pool_n];
jl_atomic_fetch_add(&stats->n_freed_objs, pg->nfree);
jl_atomic_fetch_add(&stats->n_pages_allocd, 1);
// gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[pg->pool_n];
// jl_atomic_fetch_add(&stats->n_freed_objs, pg->nfree);
// jl_atomic_fetch_add(&stats->n_pages_allocd, 1);
}

STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT
{
for (int i = 0; i < JL_GC_N_POOLS; i++) {
gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[i];
double utilization = 1.0;
size_t n_freed_objs = jl_atomic_load_relaxed(&stats->n_freed_objs);
size_t n_pages_allocd = jl_atomic_load_relaxed(&stats->n_pages_allocd);
if (n_pages_allocd != 0) {
utilization -= ((double)n_freed_objs * (double)jl_gc_sizeclasses[i]) / (double)n_pages_allocd / (double)GC_PAGE_SZ;
}
jl_gc_page_utilization_stats[i] = utilization;
jl_atomic_store_relaxed(&stats->n_freed_objs, 0);
jl_atomic_store_relaxed(&stats->n_pages_allocd, 0);
// for (int i = 0; i < JL_GC_N_POOLS; i++) {
// gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[i];
// double utilization = 1.0;
// size_t n_freed_objs = jl_atomic_load_relaxed(&stats->n_freed_objs);
// size_t n_pages_allocd = jl_atomic_load_relaxed(&stats->n_pages_allocd);
// if (n_pages_allocd != 0) {
// utilization -= ((double)n_freed_objs * (double)jl_gc_sizeclasses[i]) / (double)n_pages_allocd / (double)GC_PAGE_SZ;
// }
// jl_gc_page_utilization_stats[i] = utilization;
// jl_atomic_store_relaxed(&stats->n_freed_objs, 0);
// jl_atomic_store_relaxed(&stats->n_pages_allocd, 0);
// }
}

// #define GC_HEAP_DUMP

// Runtime switch consulted by the gc_heap_dump_write_* helpers: while it is
// zero (the initial value) they write nothing, even in a GC_HEAP_DUMP build.
int heap_dump_enabled = 0;

// Switch the heap dump on. This function only ever sets the flag to 1;
// nothing in it clears the flag again.
void gc_enable_heap_dump(void)
{
    heap_dump_enabled = 1;
}

// Write the header line for one page to the heap-dump file.
//
// The line has the form "{data: <pointer>, osize: <decimal>}", where `data`
// is printed as a pointer and `osize` as an int. Nothing is written unless the
// build defines GC_HEAP_DUMP and `heap_dump_enabled` is non-zero.
static void gc_heap_dump_write_preamble(char *data, int osize) JL_NOTSAFEPOINT
{
#ifdef GC_HEAP_DUMP
    if (heap_dump_enabled) {
        // `%p` is specified for a `void *` argument, so convert explicitly
        // instead of passing the `char *` through the varargs list as-is.
        fprintf(page_profiling_file, "{data: %p, osize: %d}\n", (void*)data, osize);
    }
#endif
}

// Append the "[empty]" marker line to the heap-dump file.
// Compiled to a no-op unless GC_HEAP_DUMP is defined; at run time the line is
// only written while `heap_dump_enabled` is non-zero.
static void gc_heap_dump_write_empty_page(void) JL_NOTSAFEPOINT
{
#ifdef GC_HEAP_DUMP
    if (!heap_dump_enabled) {
        return;
    }
    fputs("[empty]\n", page_profiling_file);
#endif
}

// Append the "[garbage]" marker line to the heap-dump file.
// Compiled to a no-op unless GC_HEAP_DUMP is defined; at run time the line is
// only written while `heap_dump_enabled` is non-zero.
static void gc_heap_dump_write_garbage(void) JL_NOTSAFEPOINT
{
#ifdef GC_HEAP_DUMP
    if (!heap_dump_enabled) {
        return;
    }
    fputs("[garbage]\n", page_profiling_file);
#endif
}

// Append one "[<type name>]" line to the heap-dump file for the tagged
// value `v`. The name is whatever jl_typeof_str() returns for jl_valueof(v).
// Compiled to a no-op unless GC_HEAP_DUMP is defined; at run time the line is
// only written while `heap_dump_enabled` is non-zero.
static void gc_heap_dump_write_live_obj(jl_taggedvalue_t *v) JL_NOTSAFEPOINT
{
#ifdef GC_HEAP_DUMP
    if (!heap_dump_enabled) {
        return;
    }
    fprintf(page_profiling_file, "[%s]\n", jl_typeof_str(jl_valueof(v)));
#endif
}

int64_t buffered_pages = 0;
Expand All @@ -1509,6 +1569,7 @@ static void gc_sweep_page(jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_pag
}
size_t old_nfree = pg->nfree;
size_t nfree;
gc_heap_dump_write_preamble(data, osize);

int re_use_page = 1;
int keep_as_local_buffer = 0;
Expand All @@ -1526,6 +1587,7 @@ static void gc_sweep_page(jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_pag
keep_as_local_buffer = 1;
}
nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / osize;
gc_heap_dump_write_empty_page();
goto done;
}
// For quick sweep, we might be able to skip the page if the page doesn't
Expand All @@ -1535,6 +1597,7 @@ static void gc_sweep_page(jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_pag
if (!prev_sweep_full || pg->prev_nold == pg->nold) {
freedall = 0;
nfree = pg->nfree;
gc_heap_dump_write_empty_page();
goto done;
}
}
Expand All @@ -1552,18 +1615,22 @@ static void gc_sweep_page(jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_pag
int bits = v->bits.gc;
// if an object is past `lim_newpages` then we can guarantee it's garbage
if (!gc_marked(bits) || (char*)v >= lim_newpages) {
gc_heap_dump_write_garbage();
*pfl = v;
pfl = &v->next;
pfl_begin = (pfl_begin != NULL) ? pfl_begin : pfl;
pg_nfree++;
gc_heap_dump_write_garbage();
}
else { // marked young or old
gc_heap_dump_write_live_obj(v);
if (current_sweep_full || bits == GC_MARKED) { // old enough
bits = v->bits.gc = GC_OLD; // promote
}
prev_nold++;
has_marked |= gc_marked(bits);
freedall = 0;
gc_heap_dump_write_live_obj(v);
}
v = (jl_taggedvalue_t*)((char*)v + osize);
}
Expand Down Expand Up @@ -1704,6 +1771,7 @@ void gc_free_pages(void)
// setup the data-structures for a sweep over all memory pools
static void gc_sweep_pool(void)
{
// jl_safe_printf("GC sweep pools...\n");
gc_time_pool_start();
buffered_pages = 0;

Expand All @@ -1713,20 +1781,21 @@ static void gc_sweep_pool(void)

// allocate enough space to hold the end of the free list chain
// for every thread and pool size
jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
const int jl_gc_n_tot_pools = 2 * JL_GC_N_MAX_POOLS + 3;
jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * jl_gc_n_tot_pools * sizeof(jl_taggedvalue_t**));

// update metadata of pages that were pointed to by freelist or newpages from a pool
// i.e. pages being the current allocation target
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 == NULL) {
for (int i = 0; i < JL_GC_N_POOLS; i++) {
pfl[t_i * JL_GC_N_POOLS + i] = NULL;
for (int i = 0; i < jl_gc_n_tot_pools; i++) {
pfl[t_i * jl_gc_n_tot_pools + i] = NULL;
}
continue;
}
jl_atomic_store_relaxed(&ptls2->gc_num.pool_live_bytes, 0);
for (int i = 0; i < JL_GC_N_POOLS; i++) {
for (int i = 0; i < jl_gc_n_tot_pools; i++) {
jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
jl_taggedvalue_t *last = p->freelist;
if (last != NULL) {
Expand All @@ -1735,7 +1804,7 @@ static void gc_sweep_pool(void)
pg->has_young = 1;
}
p->freelist = NULL;
pfl[t_i * JL_GC_N_POOLS + i] = &p->freelist;
pfl[t_i * jl_gc_n_tot_pools + i] = &p->freelist;

last = p->newpages;
if (last != NULL) {
Expand All @@ -1755,6 +1824,7 @@ static void gc_sweep_pool(void)
}

// the actual sweeping
// jl_safe_printf("GC sweep pages... the actual sweeping\n");
jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
jl_atomic_store(&gc_allocd_scratch, tmp);
Expand All @@ -1766,14 +1836,15 @@ static void gc_sweep_pool(void)
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
ptls2->page_metadata_allocd = tmp[t_i];
for (int i = 0; i < JL_GC_N_POOLS; i++) {
for (int i = 0; i < jl_gc_n_tot_pools; i++) {
jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
p->newpages = NULL;
}
}
}

// merge free lists
// jl_safe_printf("GC sweep pages... merge free lists\n");
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 == NULL) {
Expand All @@ -1786,19 +1857,20 @@ static void gc_sweep_pool(void)
char *cur_pg = pg->data;
jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset);
jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset);
*pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg;
pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next;
*pfl[t_i * jl_gc_n_tot_pools + pg->pool_n] = fl_beg;
pfl[t_i * jl_gc_n_tot_pools + pg->pool_n] = &fl_end->next;
}
pg = pg2;
}
}

// null out terminal pointers of free lists
// jl_safe_printf("GC sweep pages... null out terminal pointers of free lists\n");
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
for (int i = 0; i < JL_GC_N_POOLS; i++) {
*pfl[t_i * JL_GC_N_POOLS + i] = NULL;
for (int i = 0; i < jl_gc_n_tot_pools; i++) {
*pfl[t_i * jl_gc_n_tot_pools + i] = NULL;
}
}
}
Expand Down Expand Up @@ -3676,11 +3748,39 @@ void jl_init_thread_heap(jl_ptls_t ptls)
{
jl_thread_heap_t *heap = &ptls->heap;
jl_gc_pool_t *p = heap->norm_pools;
// initialize pools for objects which are not special cased
for (int i = 0; i < JL_GC_N_POOLS; i++) {
p[i].osize = jl_gc_sizeclasses[i];
p[i].freelist = NULL;
p[i].newpages = NULL;
}
// initialize pools for SimpleVector
for (int i = 0; i < JL_GC_N_POOLS; i++) {
p[i + JL_GC_N_MAX_POOLS].osize = jl_gc_sizeclasses[i];
p[i + JL_GC_N_MAX_POOLS].freelist = NULL;
p[i + JL_GC_N_MAX_POOLS].newpages = NULL;
}
// initialize pool for CodeInstance
size_t ci_size = sizeof(jl_code_instance_t) + sizeof(jl_taggedvalue_t);
size_t sz = jl_gc_sizeclasses[jl_gc_szclass(ci_size)];
int i = 2 * JL_GC_N_MAX_POOLS;
p[i].osize = sz;
p[i].freelist = NULL;
p[i].newpages = NULL;
// initialize pool for MethodInstance
size_t mi_size = sizeof(jl_method_instance_t) + sizeof(jl_taggedvalue_t);
sz = jl_gc_sizeclasses[jl_gc_szclass(mi_size)];
i = 2 * JL_GC_N_MAX_POOLS + 1;
p[i].osize = sz;
p[i].freelist = NULL;
p[i].newpages = NULL;
// initialize pool for TypeMapEntry
size_t tme_size = sizeof(jl_typemap_entry_t) + sizeof(jl_taggedvalue_t);
sz = jl_gc_sizeclasses[jl_gc_szclass(tme_size)];
i = 2 * JL_GC_N_MAX_POOLS + 2;
p[i].osize = sz;
p[i].freelist = NULL;
p[i].newpages = NULL;
small_arraylist_new(&heap->weak_refs, 0);
small_arraylist_new(&heap->live_tasks, 0);
for (int i = 0; i < JL_N_STACK_POOLS; i++)
Expand Down Expand Up @@ -3728,6 +3828,13 @@ void jl_gc_init(void)
uv_mutex_init(&gc_perm_lock);
uv_mutex_init(&gc_threads_lock);
uv_cond_init(&gc_threads_cond);
#ifdef GC_HEAP_DUMP
page_profiling_file = fopen("julia_page_profile.out", "w");
if (page_profiling_file == NULL) {
fprintf(stderr, "could not open \"julia_page_profile.out\" for writing\n");
abort();
}
#endif

jl_gc_init_page();
jl_gc_debug_init();
Expand Down
4 changes: 4 additions & 0 deletions src/jlapi.c
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,8 @@ static void rr_detach_teleport(void) {
}
#endif

void gc_enable_heap_dump(void);

JL_DLLEXPORT int jl_repl_entrypoint(int argc, char *argv[])
{
// no-op on Windows, note that the caller must have already converted
Expand Down Expand Up @@ -716,6 +718,8 @@ JL_DLLEXPORT int jl_repl_entrypoint(int argc, char *argv[])
}
int ret = true_main(argc, (char**)new_argv);
jl_atexit_hook(ret);
gc_enable_heap_dump();
jl_gc_collect(JL_GC_FULL);
return ret;
}

Expand Down
14 changes: 13 additions & 1 deletion src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -483,8 +483,20 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty)
const size_t allocsz = sz + sizeof(jl_taggedvalue_t);
if (sz <= GC_MAX_SZCLASS) {
int pool_id = jl_gc_szclass(allocsz);
jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id];
int osize = jl_gc_sizeclasses[pool_id];
jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id];
if (ty == jl_simplevector_type) {
p += JL_GC_N_MAX_POOLS;
}
else if (ty == jl_code_instance_type) {
p = &ptls->heap.norm_pools[2 * JL_GC_N_MAX_POOLS];
}
else if (ty == jl_method_instance_type) {
p = &ptls->heap.norm_pools[2 * JL_GC_N_MAX_POOLS + 1];
}
else if (ty == jl_typemap_entry_type) {
p = &ptls->heap.norm_pools[2 * JL_GC_N_MAX_POOLS + 2];
}
// We call `jl_gc_pool_alloc_noinline` instead of `jl_gc_pool_alloc` to avoid double-counting in
// the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.)
v = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize);
Expand Down
8 changes: 7 additions & 1 deletion src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,13 @@ typedef struct {

// variables for allocating objects from pools
#define JL_GC_N_MAX_POOLS 51 // conservative. must be kept in sync with `src/julia_internal.h`
jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS];
// pool layout:
// - [0, JL_GC_N_MAX_POOLS): small objects which are not special cased (see below)
// - [JL_GC_N_MAX_POOLS, 2 * JL_GC_N_MAX_POOLS): SimpleVector
// - 2 * JL_GC_N_MAX_POOLS: CodeInstance
// - 2 * JL_GC_N_MAX_POOLS + 1: MethodInstance
// - 2 * JL_GC_N_MAX_POOLS + 2: TypeMapEntry
jl_gc_pool_t norm_pools[2 * JL_GC_N_MAX_POOLS + 3];

#define JL_N_STACK_POOLS 16
small_arraylist_t free_stacks[JL_N_STACK_POOLS];
Expand Down

0 comments on commit 9f01e8d

Please sign in to comment.