diff --git a/src/gc-common.c b/src/gc-common.c
index b552afb8228f0..c751b54f059f5 100644
--- a/src/gc-common.c
+++ b/src/gc-common.c
@@ -6,9 +6,6 @@
 #include "julia_gcext.h"
 #include "julia_assert.h"
 #include "threading.h"
-#ifdef __GLIBC__
-#include <malloc.h> // for malloc_trim
-#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -120,6 +117,37 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t
     jl_gc_deregister_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb);
 }
 
+// =========================================================================== //
+// malloc wrappers, aligned allocation
+// =========================================================================== //
+
+#if defined(_OS_WINDOWS_)
+// helper function based partly on wine msvcrt80+ heap.c
+// but with several fixes to improve the correctness of the computation and remove unnecessary parameters
+#define SAVED_PTR(x) ((void *)((DWORD_PTR)((char *)x - sizeof(void *)) & \
+                               ~(sizeof(void *) - 1)))
+static size_t _aligned_msize(void *p)
+{
+    void *alloc_ptr = *(void**)SAVED_PTR(p);
+    return _msize(alloc_ptr) - ((char*)p - (char*)alloc_ptr);
+}
+#undef SAVED_PTR
+#endif
+
+size_t memory_block_usable_size(void *p, int isaligned) JL_NOTSAFEPOINT
+{
+#if defined(_OS_WINDOWS_)
+    if (isaligned)
+        return _aligned_msize(p);
+    else
+        return _msize(p);
+#elif defined(_OS_DARWIN_)
+    return malloc_size(p);
+#else
+    return malloc_usable_size(p);
+#endif
+}
+
 // =========================================================================== //
 // Finalization
 // =========================================================================== //
@@ -505,63 +533,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz)
     return jl_gc_alloc(ptls, sz, NULL);
 }
 
-// allocation wrappers that save the size of allocations, to allow using
-// jl_gc_counted_* functions with a libc-compatible API.
-
-JL_DLLEXPORT void *jl_malloc(size_t sz)
-{
-    int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT);
-    if (p == NULL)
-        return NULL;
-    p[0] = sz;
-    return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
-}
-
-//_unchecked_calloc does not check for potential overflow of nm*sz
-STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) {
-    size_t nmsz = nm*sz;
-    int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1);
-    if (p == NULL)
-        return NULL;
-    p[0] = nmsz;
-    return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
-}
-
-JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
-{
-    if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT)
-        return NULL;
-    return _unchecked_calloc(nm, sz);
-}
-
-JL_DLLEXPORT void jl_free(void *p)
-{
-    if (p != NULL) {
-        int64_t *pp = (int64_t *)p - 2;
-        size_t sz = pp[0];
-        jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT);
-    }
-}
-
-JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
-{
-    int64_t *pp;
-    size_t szold;
-    if (p == NULL) {
-        pp = NULL;
-        szold = 0;
-    }
-    else {
-        pp = (int64_t *)p - 2;
-        szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT;
-    }
-    int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT);
-    if (pnew == NULL)
-        return NULL;
-    pnew[0] = sz;
-    return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
-}
-
 // allocator entry points
 
 JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty)
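The new memory_block_usable_size helper above is the core of the patch: instead of trusting the size the caller requested, GC bookkeeping now asks the allocator how large the block it actually handed out is (_msize/_aligned_msize on Windows, malloc_size on Darwin, malloc_usable_size elsewhere). A minimal standalone sketch of the distinction, assuming glibc (not part of the patch):

    #include <malloc.h>   /* malloc_usable_size (glibc) */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        size_t requested = 100;
        void *p = malloc(requested);
        if (p == NULL)
            return 1;
        /* The allocator may round the request up to a size class; the GC
           now charges this usable size instead of the requested size. */
        printf("requested=%zu usable=%zu\n", requested, malloc_usable_size(p));
        free(p);
        return 0;
    }
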
diff --git a/src/gc-common.h b/src/gc-common.h
index 32b7470b13a58..3007151009f7d 100644
--- a/src/gc-common.h
+++ b/src/gc-common.h
@@ -12,6 +12,14 @@
 #endif
 #endif
 
+#include <stdlib.h>
+
+#if defined(_OS_DARWIN_)
+#include <malloc/malloc.h>
+#else
+#include <malloc.h> // for malloc_trim
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
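These headers make the per-OS usable-size queries visible to every GC translation unit. The discipline the rest of the patch follows is symmetric: whatever quantity is added to the heap counters at allocation time is the same quantity credited back at free time, so the counters cannot drift when the allocator rounds requests up. A sketch of that invariant, using a hypothetical tracked_malloc/tracked_free pair and a plain counter standing in for the patch's relaxed atomics and per-thread batching (glibc-only, not part of the patch):

    #include <malloc.h>   /* malloc_usable_size (glibc) */
    #include <stdlib.h>

    static size_t live_bytes; /* stand-in for gc_heap_stats.heap_size */

    static void *tracked_malloc(size_t sz)
    {
        void *p = malloc(sz);
        if (p != NULL)
            live_bytes += malloc_usable_size(p); /* charge usable, not requested */
        return p;
    }

    static void tracked_free(void *p)
    {
        if (p != NULL) {
            live_bytes -= malloc_usable_size(p); /* credit the same quantity */
            free(p);
        }
    }

    int main(void)
    {
        void *p = tracked_malloc(100);
        tracked_free(p);
        return live_bytes != 0; /* exits 0: the two sides cancel exactly */
    }
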
diff --git a/src/gc-stock.c b/src/gc-stock.c
index 3a2027f9190a7..86dbea3b9a17a 100644
--- a/src/gc-stock.c
+++ b/src/gc-stock.c
@@ -9,9 +9,6 @@
 #include "julia_atomics.h"
 #include "julia_gcext.h"
 #include "julia_assert.h"
-#ifdef __GLIBC__
-#include <malloc.h> // for malloc_trim
-#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -569,11 +566,6 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT
     jl_batch_accum_heap_size(ptls, sz);
 }
 
-void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT
-{
-    jl_batch_accum_free_size(jl_current_task->ptls, sz);
-}
-
 // Only safe to update the heap inside the GC
 static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTSAFEPOINT
 {
@@ -643,13 +635,15 @@ static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT
     jl_genericmemory_t *m = (jl_genericmemory_t*)v;
     assert(jl_genericmemory_how(m) == 1 || jl_genericmemory_how(m) == 2);
     char *d = (char*)m->ptr;
+    size_t freed_bytes = memory_block_usable_size(d, isaligned);
+    assert(freed_bytes != 0);
     if (isaligned)
         jl_free_aligned(d);
     else
         free(d);
     jl_atomic_store_relaxed(&gc_heap_stats.heap_size,
-        jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_genericmemory_nbytes(m));
-    gc_num.freed += jl_genericmemory_nbytes(m);
+        jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - freed_bytes);
+    gc_num.freed += freed_bytes;
     gc_num.freecall++;
 }
 
@@ -3652,14 +3646,69 @@ JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void)
     return max_total_memory;
 }
 
-// allocation wrappers that track allocation and let collection run
+// allocation wrappers that add to gc pressure
+
+JL_DLLEXPORT void *jl_malloc(size_t sz)
+{
+    return jl_gc_counted_malloc(sz);
+}
+
+//_unchecked_calloc does not check for potential overflow of nm*sz
+STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) {
+    size_t nmsz = nm*sz;
+    return jl_gc_counted_calloc(nmsz, 1);
+}
+
+JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
+{
+    if (nm > SSIZE_MAX/sz)
+        return NULL;
+    return _unchecked_calloc(nm, sz);
+}
+
+JL_DLLEXPORT void jl_free(void *p)
+{
+    if (p != NULL) {
+        size_t sz = memory_block_usable_size(p, 0);
+        free(p);
+        jl_task_t *ct = jl_get_current_task();
+        if (ct != NULL)
+            jl_batch_accum_free_size(ct->ptls, sz);
+    }
+}
+
+JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
+{
+    size_t old = p ? memory_block_usable_size(p, 0) : 0;
+    void *data = realloc(p, sz);
+    jl_task_t *ct = jl_get_current_task();
+    if (data != NULL && ct != NULL) {
+        sz = memory_block_usable_size(data, 0);
+        jl_ptls_t ptls = ct->ptls;
+        maybe_collect(ptls);
+        if (!(sz < old))
+            jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd,
+                jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old));
+        jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc,
+            jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1);
+
+        int64_t diff = sz - old;
+        if (diff < 0) {
+            jl_batch_accum_free_size(ptls, -diff);
+        }
+        else {
+            jl_batch_accum_heap_size(ptls, diff);
+        }
+    }
+    return data;
+}
 
 JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 {
-    jl_gcframe_t **pgcstack = jl_get_pgcstack();
-    jl_task_t *ct = jl_current_task;
     void *data = malloc(sz);
-    if (data != NULL && pgcstack != NULL && ct->world_age) {
+    jl_task_t *ct = jl_get_current_task();
+    if (data != NULL && ct != NULL) {
+        sz = memory_block_usable_size(data, 0);
         jl_ptls_t ptls = ct->ptls;
         maybe_collect(ptls);
         jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd,
@@ -3673,54 +3722,29 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 
 JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
 {
-    jl_gcframe_t **pgcstack = jl_get_pgcstack();
-    jl_task_t *ct = jl_current_task;
     void *data = calloc(nm, sz);
-    if (data != NULL && pgcstack != NULL && ct->world_age) {
+    jl_task_t *ct = jl_get_current_task();
+    if (data != NULL && ct != NULL) {
+        sz = memory_block_usable_size(data, 0);
         jl_ptls_t ptls = ct->ptls;
         maybe_collect(ptls);
         jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd,
-            jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + nm*sz);
+            jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz);
         jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc,
             jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1);
-        jl_batch_accum_heap_size(ptls, sz * nm);
+        jl_batch_accum_heap_size(ptls, sz);
     }
     return data;
 }
 
 JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
 {
-    jl_gcframe_t **pgcstack = jl_get_pgcstack();
-    jl_task_t *ct = jl_current_task;
-    free(p);
-    if (pgcstack != NULL && ct->world_age) {
-        jl_batch_accum_free_size(ct->ptls, sz);
-    }
+    return jl_free(p);
 }
 
 JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz)
 {
-    jl_gcframe_t **pgcstack = jl_get_pgcstack();
-    jl_task_t *ct = jl_current_task;
-    void *data = realloc(p, sz);
-    if (data != NULL && pgcstack != NULL && ct->world_age) {
-        jl_ptls_t ptls = ct->ptls;
-        maybe_collect(ptls);
-        if (!(sz < old))
-            jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd,
-                jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old));
-        jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc,
-            jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1);
-
-        int64_t diff = sz - old;
-        if (diff < 0) {
-            jl_batch_accum_free_size(ptls, -diff);
-        }
-        else {
-            jl_batch_accum_heap_size(ptls, diff);
-        }
-    }
-    return data;
+    return jl_realloc(p, sz);
 }
 
 // allocating blocks for Arrays and Strings
@@ -3741,11 +3765,13 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
     if (b == NULL)
         jl_throw(jl_memory_exception);
 
+    size_t allocated_bytes = memory_block_usable_size(b, 1);
+    assert(allocated_bytes >= allocsz);
     jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd,
-        jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz);
+        jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocated_bytes);
     jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc,
         jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1);
-    jl_batch_accum_heap_size(ptls, allocsz);
+    jl_batch_accum_heap_size(ptls, allocated_bytes);
 #ifdef _OS_WINDOWS_
     SetLastError(last_error);
 #endif
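With jl_malloc, jl_calloc, jl_realloc, and jl_free reduced to thin wrappers above, the pointers they return are ordinary malloc pointers: the usable-size query makes the old 16-byte size header in front of each block unnecessary, which restores libc compatibility. An illustrative embedding sketch, assuming a program linked against libjulia with an initialized runtime (prototypes repeated here only to keep the sketch self-contained; they normally come from Julia's public headers):

    #include <stdlib.h>

    /* normally provided by Julia's headers */
    void *jl_malloc(size_t sz);
    void jl_free(void *p);

    int main(void)
    {
        void *p = jl_malloc(64);
        free(p);    /* memory-safe now (the old interior pointer made this
                       undefined behavior), though it bypasses the GC's
                       free-side accounting */
        void *q = malloc(64);
        jl_free(q); /* also safe: jl_free queries the usable size, calls
                       libc free, and credits the bytes back to the GC */
        return 0;
    }
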
diff --git a/src/genericmemory.c b/src/genericmemory.c
index 0bf2cf46edaae..2b02f021ccdd0 100644
--- a/src/genericmemory.c
+++ b/src/genericmemory.c
@@ -165,7 +165,8 @@ JL_DLLEXPORT jl_genericmemory_t *jl_ptr_to_genericmemory(jl_value_t *mtype, void
     if (own_buffer) {
         int isaligned = 0; // TODO: allow passing memalign'd buffers
         jl_gc_track_malloced_genericmemory(ct->ptls, m, isaligned);
-        jl_gc_count_allocd(nel*elsz);
+        size_t allocated_bytes = memory_block_usable_size(data, isaligned);
+        jl_gc_count_allocd(allocated_bytes);
     }
     return m;
 }
@@ -208,8 +209,6 @@ JL_DLLEXPORT jl_value_t *jl_genericmemory_to_string(jl_genericmemory_t *m, size_
     JL_GC_PUSH1(&o);
     jl_value_t *str = jl_pchar_to_string((const char*)m->ptr, len);
     JL_GC_POP();
-    if (how == 1) // TODO: we might like to early-call jl_gc_free_memory here instead actually, but hopefully `m` will die soon
-        jl_gc_count_freed(mlength);
     return str;
 }
 // n.b. how == 0 is always pool-allocated, so the freed bytes are computed from the pool not the object
diff --git a/src/julia_internal.h b/src/julia_internal.h
index c5bac37890042..5eb99be9e333f 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -615,8 +615,8 @@ jl_svec_t *jl_perm_symsvec(size_t n, ...);
 
 void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned) JL_NOTSAFEPOINT;
 size_t jl_genericmemory_nbytes(jl_genericmemory_t *a) JL_NOTSAFEPOINT;
+size_t memory_block_usable_size(void *mem, int isaligned) JL_NOTSAFEPOINT;
 void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT;
-void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT;
 void jl_gc_run_all_finalizers(jl_task_t *ct);
 void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task);
 void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT;
diff --git a/test/compiler/codegen.jl b/test/compiler/codegen.jl
index fcb3beb87b5a5..83f4001e616e7 100644
--- a/test/compiler/codegen.jl
+++ b/test/compiler/codegen.jl
@@ -407,7 +407,7 @@ function g_dict_hash_alloc()
 end
 # Warm up
 f_dict_hash_alloc(); g_dict_hash_alloc();
-@test (@allocated f_dict_hash_alloc()) == (@allocated g_dict_hash_alloc())
+@test abs((@allocated f_dict_hash_alloc()) / (@allocated g_dict_hash_alloc()) - 1) < 0.1 # less than 10% difference
 
 # returning an argument shouldn't alloc a new box
 @noinline f33829(x) = (global called33829 = true; x)
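The relaxed test tolerance follows directly from the accounting change: @allocated now reports usable bytes, and two code paths that request slightly different sizes can land in different allocator size classes, so exact equality is no longer guaranteed. A glibc-only sketch of that rounding (not part of the patch):

    #include <malloc.h>   /* malloc_usable_size (glibc) */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        /* Requests that straddle a size-class boundary report different
           usable sizes, which is why the test now allows a 10% spread. */
        for (size_t sz = 24; sz <= 40; sz += 8) {
            void *p = malloc(sz);
            if (p == NULL)
                return 1;
            printf("requested %2zu -> usable %zu\n", sz, malloc_usable_size(p));
            free(p);
        }
        return 0;
    }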