Skip to content

Commit

Permalink
count bytes allocated through malloc more precisely (#55223)
Browse files Browse the repository at this point in the history
Should make the accounting for memory allocated through malloc a bit
more accurate.

Should also simplify the accounting code by eliminating the use of
`jl_gc_count_freed` in `jl_genericmemory_to_string`.
  • Loading branch information
d-netto authored Nov 7, 2024
1 parent 4278ded commit 9e14bf8
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 113 deletions.
91 changes: 31 additions & 60 deletions src/gc-common.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@
#include "julia_gcext.h"
#include "julia_assert.h"
#include "threading.h"
#ifdef __GLIBC__
#include <malloc.h> // for malloc_trim
#endif

#ifdef __cplusplus
extern "C" {
Expand Down Expand Up @@ -120,6 +117,37 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t
jl_gc_deregister_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb);
}

// =========================================================================== //
// malloc wrappers, aligned allocation
// =========================================================================== //

#if defined(_OS_WINDOWS_)
// helper function based partly on wine msvcrt80+ heap.c
// but with several fixes to improve the correctness of the computation and remove unnecessary parameters
#define SAVED_PTR(x) ((void *)((DWORD_PTR)((char *)x - sizeof(void *)) & \
~(sizeof(void *) - 1)))
static size_t _aligned_msize(void *p)
{
void *alloc_ptr = *(void**)SAVED_PTR(p);
return _msize(alloc_ptr) - ((char*)p - (char*)alloc_ptr);
}
#undef SAVED_PTR
#endif

size_t memory_block_usable_size(void *p, int isaligned) JL_NOTSAFEPOINT
{
#if defined(_OS_WINDOWS_)
if (isaligned)
return _aligned_msize(p);
else
return _msize(p);
#elif defined(_OS_DARWIN_)
return malloc_size(p);
#else
return malloc_usable_size(p);
#endif
}

// =========================================================================== //
// Finalization
// =========================================================================== //
Expand Down Expand Up @@ -505,63 +533,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz)
return jl_gc_alloc(ptls, sz, NULL);
}

// allocation wrappers that save the size of allocations, to allow using
// jl_gc_counted_* functions with a libc-compatible API.

JL_DLLEXPORT void *jl_malloc(size_t sz)
{
int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT);
if (p == NULL)
return NULL;
p[0] = sz;
return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
}

//_unchecked_calloc does not check for potential overflow of nm*sz
STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) {
size_t nmsz = nm*sz;
int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1);
if (p == NULL)
return NULL;
p[0] = nmsz;
return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
}

JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
{
if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT)
return NULL;
return _unchecked_calloc(nm, sz);
}

JL_DLLEXPORT void jl_free(void *p)
{
if (p != NULL) {
int64_t *pp = (int64_t *)p - 2;
size_t sz = pp[0];
jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT);
}
}

JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
{
int64_t *pp;
size_t szold;
if (p == NULL) {
pp = NULL;
szold = 0;
}
else {
pp = (int64_t *)p - 2;
szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT;
}
int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT);
if (pnew == NULL)
return NULL;
pnew[0] = sz;
return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
}

// allocator entry points

JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty)
Expand Down
8 changes: 8 additions & 0 deletions src/gc-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@
#endif
#endif

#include <stdlib.h>

#if defined(_OS_DARWIN_)
#include <malloc/malloc.h>
#else
#include <malloc.h> // for malloc_trim
#endif

#ifdef __cplusplus
extern "C" {
#endif
Expand Down
122 changes: 74 additions & 48 deletions src/gc-stock.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@
#include "julia_atomics.h"
#include "julia_gcext.h"
#include "julia_assert.h"
#ifdef __GLIBC__
#include <malloc.h> // for malloc_trim
#endif

#ifdef __cplusplus
extern "C" {
Expand Down Expand Up @@ -569,11 +566,6 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT
jl_batch_accum_heap_size(ptls, sz);
}

void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT
{
jl_batch_accum_free_size(jl_current_task->ptls, sz);
}

// Only safe to update the heap inside the GC
static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTSAFEPOINT
{
Expand Down Expand Up @@ -643,13 +635,15 @@ static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT
jl_genericmemory_t *m = (jl_genericmemory_t*)v;
assert(jl_genericmemory_how(m) == 1 || jl_genericmemory_how(m) == 2);
char *d = (char*)m->ptr;
size_t freed_bytes = memory_block_usable_size(d, isaligned);
assert(freed_bytes != 0);
if (isaligned)
jl_free_aligned(d);
else
free(d);
jl_atomic_store_relaxed(&gc_heap_stats.heap_size,
jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_genericmemory_nbytes(m));
gc_num.freed += jl_genericmemory_nbytes(m);
jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - freed_bytes);
gc_num.freed += freed_bytes;
gc_num.freecall++;
}

Expand Down Expand Up @@ -3652,14 +3646,69 @@ JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void)
return max_total_memory;
}

// allocation wrappers that track allocation and let collection run
// allocation wrappers that add to gc pressure

JL_DLLEXPORT void *jl_malloc(size_t sz)
{
return jl_gc_counted_malloc(sz);
}

//_unchecked_calloc does not check for potential overflow of nm*sz
STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) {
size_t nmsz = nm*sz;
return jl_gc_counted_calloc(nmsz, 1);
}

JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
{
if (nm > SSIZE_MAX/sz)
return NULL;
return _unchecked_calloc(nm, sz);
}

JL_DLLEXPORT void jl_free(void *p)
{
if (p != NULL) {
size_t sz = memory_block_usable_size(p, 0);
free(p);
jl_task_t *ct = jl_get_current_task();
if (ct != NULL)
jl_batch_accum_free_size(ct->ptls, sz);
}
}

JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
{
size_t old = p ? memory_block_usable_size(p, 0) : 0;
void *data = realloc(p, sz);
jl_task_t *ct = jl_get_current_task();
if (data != NULL && ct != NULL) {
sz = memory_block_usable_size(data, 0);
jl_ptls_t ptls = ct->ptls;
maybe_collect(ptls);
if (!(sz < old))
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old));
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc,
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1);

int64_t diff = sz - old;
if (diff < 0) {
jl_batch_accum_free_size(ptls, -diff);
}
else {
jl_batch_accum_heap_size(ptls, diff);
}
}
return data;
}

JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
{
jl_gcframe_t **pgcstack = jl_get_pgcstack();
jl_task_t *ct = jl_current_task;
void *data = malloc(sz);
if (data != NULL && pgcstack != NULL && ct->world_age) {
jl_task_t *ct = jl_get_current_task();
if (data != NULL && ct != NULL) {
sz = memory_block_usable_size(data, 0);
jl_ptls_t ptls = ct->ptls;
maybe_collect(ptls);
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd,
Expand All @@ -3673,54 +3722,29 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)

JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
{
jl_gcframe_t **pgcstack = jl_get_pgcstack();
jl_task_t *ct = jl_current_task;
void *data = calloc(nm, sz);
if (data != NULL && pgcstack != NULL && ct->world_age) {
jl_task_t *ct = jl_get_current_task();
if (data != NULL && ct != NULL) {
sz = memory_block_usable_size(data, 0);
jl_ptls_t ptls = ct->ptls;
maybe_collect(ptls);
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + nm*sz);
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz);
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc,
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1);
jl_batch_accum_heap_size(ptls, sz * nm);
jl_batch_accum_heap_size(ptls, sz);
}
return data;
}

JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
{
jl_gcframe_t **pgcstack = jl_get_pgcstack();
jl_task_t *ct = jl_current_task;
free(p);
if (pgcstack != NULL && ct->world_age) {
jl_batch_accum_free_size(ct->ptls, sz);
}
return jl_free(p);
}

JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz)
{
jl_gcframe_t **pgcstack = jl_get_pgcstack();
jl_task_t *ct = jl_current_task;
void *data = realloc(p, sz);
if (data != NULL && pgcstack != NULL && ct->world_age) {
jl_ptls_t ptls = ct->ptls;
maybe_collect(ptls);
if (!(sz < old))
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old));
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc,
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1);

int64_t diff = sz - old;
if (diff < 0) {
jl_batch_accum_free_size(ptls, -diff);
}
else {
jl_batch_accum_heap_size(ptls, diff);
}
}
return data;
return jl_realloc(p, sz);
}

// allocating blocks for Arrays and Strings
Expand All @@ -3741,11 +3765,13 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
if (b == NULL)
jl_throw(jl_memory_exception);

size_t allocated_bytes = memory_block_usable_size(b, 1);
assert(allocated_bytes >= allocsz);
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz);
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocated_bytes);
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc,
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1);
jl_batch_accum_heap_size(ptls, allocsz);
jl_batch_accum_heap_size(ptls, allocated_bytes);
#ifdef _OS_WINDOWS_
SetLastError(last_error);
#endif
Expand Down
5 changes: 2 additions & 3 deletions src/genericmemory.c
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,8 @@ JL_DLLEXPORT jl_genericmemory_t *jl_ptr_to_genericmemory(jl_value_t *mtype, void
if (own_buffer) {
int isaligned = 0; // TODO: allow passing memalign'd buffers
jl_gc_track_malloced_genericmemory(ct->ptls, m, isaligned);
jl_gc_count_allocd(nel*elsz);
size_t allocated_bytes = memory_block_usable_size(data, isaligned);
jl_gc_count_allocd(allocated_bytes);
}
return m;
}
Expand Down Expand Up @@ -208,8 +209,6 @@ JL_DLLEXPORT jl_value_t *jl_genericmemory_to_string(jl_genericmemory_t *m, size_
JL_GC_PUSH1(&o);
jl_value_t *str = jl_pchar_to_string((const char*)m->ptr, len);
JL_GC_POP();
if (how == 1) // TODO: we might like to early-call jl_gc_free_memory here instead actually, but hopefully `m` will die soon
jl_gc_count_freed(mlength);
return str;
}
// n.b. how == 0 is always pool-allocated, so the freed bytes are computed from the pool not the object
Expand Down
2 changes: 1 addition & 1 deletion src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -615,8 +615,8 @@ jl_svec_t *jl_perm_symsvec(size_t n, ...);

void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned) JL_NOTSAFEPOINT;
size_t jl_genericmemory_nbytes(jl_genericmemory_t *a) JL_NOTSAFEPOINT;
size_t memory_block_usable_size(void *mem, int isaligned) JL_NOTSAFEPOINT;
void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT;
void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT;
void jl_gc_run_all_finalizers(jl_task_t *ct);
void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task);
void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT;
Expand Down
2 changes: 1 addition & 1 deletion test/compiler/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ function g_dict_hash_alloc()
end
# Warm up
f_dict_hash_alloc(); g_dict_hash_alloc();
@test (@allocated f_dict_hash_alloc()) == (@allocated g_dict_hash_alloc())
@test abs((@allocated f_dict_hash_alloc()) / (@allocated g_dict_hash_alloc()) - 1) < 0.1 # less that 10% difference

# returning an argument shouldn't alloc a new box
@noinline f33829(x) = (global called33829 = true; x)
Expand Down

5 comments on commit 9e14bf8

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily package evaluation, I will reply here when finished:

@nanosoldier runtests(isdaily = true)

@vtjnash
Copy link
Member

@vtjnash vtjnash commented on 9e14bf8 Nov 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nanosoldier runbenchmarks(ALL, isdaily = true)

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here.

@vtjnash
Copy link
Member

@vtjnash vtjnash commented on 9e14bf8 Nov 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The memory regressions reported here are due to a change in how reporting is measured 9e14bf8, not real changes (actually we allocate a little bit less now)

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The package evaluation job you requested has completed - possible new issues were detected.
The full report is available.

Please sign in to comment.