From b2f2d3c486739f20e4eb5500f3ad7b1866122fce Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 25 Oct 2024 03:18:02 +0000 Subject: [PATCH] Adding support for MMTk (non-moving Immix) --- Make.inc | 44 + base/timing.jl | 17 +- src/Makefile | 56 +- src/gc-common.c | 50 + src/gc-common.h | 35 + src/gc-interface.h | 12 +- src/gc-mmtk.c | 1185 +++++++++++++++++ src/gc-stacks.c | 97 -- src/gc-stock.c | 193 ++- src/gc-stock.h | 26 +- src/gc-tls-mmtk.h | 23 + src/{gc-tls.h => gc-tls-stock.h} | 0 src/julia_internal.h | 2 +- src/julia_locks.h | 2 +- src/julia_threads.h | 6 +- src/llvm-gc-interface-passes.h | 3 + src/llvm-late-gc-lowering-mmtk.cpp | 96 ++ src/llvm-late-gc-lowering-stock.cpp | 9 + src/llvm-late-gc-lowering.cpp | 26 + src/staticdata.c | 2 + src/threading.c | 4 + .../InteractiveUtils/src/InteractiveUtils.jl | 1 + 22 files changed, 1679 insertions(+), 210 deletions(-) create mode 100644 src/gc-mmtk.c create mode 100644 src/gc-tls-mmtk.h rename src/{gc-tls.h => gc-tls-stock.h} (100%) create mode 100644 src/llvm-late-gc-lowering-mmtk.cpp create mode 100644 src/llvm-late-gc-lowering-stock.cpp diff --git a/Make.inc b/Make.inc index a60a95d21c3db..feb0c0be733ff 100644 --- a/Make.inc +++ b/Make.inc @@ -80,6 +80,9 @@ HAVE_SSP := 0 WITH_GC_VERIFY := 0 WITH_GC_DEBUG_ENV := 0 +# Use stock if MMTK_PLAN hasn't been defined +MMTK_PLAN ?= None + # Enable DTrace support WITH_DTRACE := 0 @@ -829,6 +832,41 @@ JCXXFLAGS += -DGC_DEBUG_ENV JCFLAGS += -DGC_DEBUG_ENV endif +ifneq (${MMTK_PLAN},None) +ifeq (${MMTK_JULIA_DIR},) +$(error MMTK_JULIA_DIR must be set to use MMTk) +endif +JCXXFLAGS += -DMMTK_GC +JCFLAGS += -DMMTK_GC +ifeq (${MMTK_BUILD},) +ifeq (debug,$(findstring debug,$(MAKECMDGOALS))) +MMTK_BUILD = debug +else +MMTK_BUILD = release +endif +endif +ifeq (${MMTK_PLAN},Immix) +JCXXFLAGS += -DMMTK_PLAN_IMMIX +JCFLAGS += -DMMTK_PLAN_IMMIX +else +$(error "Unsupported MMTk plan: $(MMTK_PLAN)") +endif +MMTK_DIR = ${MMTK_JULIA_DIR}/mmtk +MMTK_API_INC = $(MMTK_DIR)/api +ifeq ($(OS),Linux) +MMTK_LIB_NAME := libmmtk_julia.so +else +$(error "Unsupported OS for MMTk") +endif +MMTK_LIB_SRC := $(MMTK_DIR)/target/$(MMTK_BUILD)/$(MMTK_LIB_NAME) +MMTK_LIB_DST := $(BUILDROOT)/usr/lib/$(MMTK_LIB_NAME) +MMTK_LIB := -lmmtk_julia +LDFLAGS += -Wl,-rpath=$(MMTK_DIR)/target/$(MMTK_BUILD)/ +else +MMTK_JULIA_INC := +MMTK_LIB := +endif + ifeq ($(WITH_DTRACE), 1) JCXXFLAGS += -DUSE_DTRACE JCFLAGS += -DUSE_DTRACE @@ -1823,6 +1861,9 @@ PRINT_PERL = printf ' %b %b\n' $(PERLCOLOR)PERL$(ENDCOLOR) $(BINCOLOR)$(GOAL) PRINT_FLISP = printf ' %b %b\n' $(FLISPCOLOR)FLISP$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) PRINT_JULIA = printf ' %b %b\n' $(JULIACOLOR)JULIA$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) PRINT_DTRACE = printf ' %b %b\n' $(DTRACECOLOR)DTRACE$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) +ifneq (${MMTK_PLAN},None) +PRINT_MMTK = printf ' %b %b\n' $(LINKCOLOR)MMTK$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) +endif else QUIET_MAKE = @@ -1833,6 +1874,9 @@ PRINT_PERL = echo '$(subst ','\'',$(1))'; $(1) PRINT_FLISP = echo '$(subst ','\'',$(1))'; $(1) PRINT_JULIA = echo '$(subst ','\'',$(1))'; $(1) PRINT_DTRACE = echo '$(subst ','\'',$(1))'; $(1) +ifneq (${MMTK_PLAN},None) +PRINT_MMTK = echo '$(subst ','\'',$(1))'; $(1) +endif endif # VERBOSE diff --git a/base/timing.jl b/base/timing.jl index 1de3727756829..0088f8bb77eca 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -106,9 +106,14 @@ function gc_page_utilization_data() return Base.unsafe_wrap(Array, page_utilization_raw, JL_GC_N_MAX_POOLS, own=false) end + 
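+# A minimal usage sketch, assuming only that the string returned by `jl_gc_active_impl`
+# identifies the GC by containing "stock" or "mmtk" (as documented in `src/gc-interface.h`):
+#     gc_impl = unsafe_string(ccall(:jl_gc_active_impl, Ptr{UInt8}, ()))
+#     is_mmtk = occursin("mmtk", gc_impl)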
+const USING_STOCK_GC = occursin("stock", unsafe_string(ccall(:jl_gc_active_impl, Ptr{UInt8}, ()))) +# Full sweep reasons are currently only available for the stock GC +@static if USING_STOCK_GC # must be kept in sync with `src/gc-stock.h`` const FULL_SWEEP_REASONS = [:FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL, :FULL_SWEEP_REASON_FORCED_FULL_SWEEP, :FULL_SWEEP_REASON_USER_MAX_EXCEEDED, :FULL_SWEEP_REASON_LARGE_PROMOTION_RATE] +end """ Base.full_sweep_reasons() @@ -124,11 +129,15 @@ The reasons are: Note that the set of reasons is not guaranteed to be stable across minor versions of Julia. """ function full_sweep_reasons() - reason = cglobal(:jl_full_sweep_reasons, UInt64) - reasons_as_array = Base.unsafe_wrap(Vector{UInt64}, reason, length(FULL_SWEEP_REASONS), own=false) d = Dict{Symbol, Int64}() - for (i, r) in enumerate(FULL_SWEEP_REASONS) - d[r] = reasons_as_array[i] + # populate the dictionary according to the reasons above for the stock GC + # otherwise return an empty dictionary for now + @static if USING_STOCK_GC + reason = cglobal(:jl_full_sweep_reasons, UInt64) + reasons_as_array = Base.unsafe_wrap(Vector{UInt64}, reason, length(FULL_SWEEP_REASONS), own=false) + for (i, r) in enumerate(FULL_SWEEP_REASONS) + d[r] = reasons_as_array[i] + end end return d end diff --git a/src/Makefile b/src/Makefile index 9355ca2c4c675..f5dd46741c6e9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -29,6 +29,10 @@ ifeq ($(USECLANG),1) FLAGS += -Wno-return-type-c-linkage -Wno-atomic-alignment endif +ifeq ($(WITH_MMTK), 1) +FLAGS += -I$(MMTK_API_INC) +endif + FLAGS += -DJL_BUILD_ARCH='"$(ARCH)"' ifeq ($(OS),WINNT) FLAGS += -DJL_BUILD_UNAME='"NT"' @@ -40,23 +44,41 @@ ifeq ($(OS),FreeBSD) FLAGS += -I$(LOCALBASE)/include endif +# GC source code. It depends on which GC implementation to use. +GC_SRCS := gc-common gc-stacks gc-alloc-profiler gc-heap-snapshot +ifneq (${MMTK_PLAN},None) +GC_SRCS += gc-mmtk +else +GC_SRCS += gc-stock gc-debug gc-pages gc-page-profiler +endif + SRCS := \ jltypes gf typemap smallintset ast builtins module interpreter symbol \ dlload sys init task array genericmemory staticdata toplevel jl_uv datatype \ simplevector runtime_intrinsics precompile jloptions mtarraylist \ - threading scheduler stackwalk gc-common gc-stock gc-debug gc-pages gc-stacks gc-alloc-profiler gc-page-profiler method \ - jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \ - crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall engine + threading scheduler stackwalk \ + method jlapi signal-handling safepoint timing subtype rtutils \ + crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall engine \ + $(GC_SRCS) RT_LLVMLINK := CG_LLVMLINK := ifeq ($(JULIACODEGEN),LLVM) +# Currently these files are used by both GCs. But we should make the list specific to stock, and MMTk should have its own implementation. 
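+# A hypothetical invocation (paths are placeholders): with the binding already built under
+# $(MMTK_JULIA_DIR)/mmtk/target/$(MMTK_BUILD), an MMTk-enabled build could be driven by
+#   make MMTK_PLAN=Immix MMTK_JULIA_DIR=/path/to/mmtk-julia MMTK_BUILD=release
+# while leaving MMTK_PLAN at its default (None) keeps the stock GC.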
+GC_CODEGEN_SRCS := llvm-final-gc-lowering llvm-late-gc-lowering llvm-gc-invariant-verifier +ifneq (${MMTK_PLAN},None) +FLAGS += -I$(MMTK_API_INC) +GC_CODEGEN_SRCS += llvm-late-gc-lowering-mmtk +else +GC_CODEGEN_SRCS += llvm-late-gc-lowering-stock +endif CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop \ - llvm-final-gc-lowering llvm-pass-helpers llvm-late-gc-lowering llvm-ptls \ - llvm-lower-handlers llvm-gc-invariant-verifier llvm-propagate-addrspaces \ + llvm-pass-helpers llvm-ptls \ + llvm-lower-handlers llvm-propagate-addrspaces \ llvm-multiversioning llvm-alloc-opt llvm-alloc-helpers cgmemmgr llvm-remove-addrspaces \ - llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures pipeline llvm_api + llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures pipeline llvm_api \ + $(GC_CODEGEN_SRCS) FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir) CG_LLVM_LIBS := all ifeq ($(USE_POLLY),1) @@ -99,7 +121,12 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h gc-tls-common.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls-common.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +ifneq (${MMTK_PLAN},None) + PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,gc-tls-mmtk.h) +else + PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,gc-tls-stock.h) +endif ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,win32_ucontext.h) endif @@ -164,8 +191,8 @@ LIBJULIA_PATH_REL := libjulia endif COMMON_LIBPATHS := -L$(build_libdir) -L$(build_shlibdir) -RT_LIBS := $(WHOLE_ARCHIVE) $(LIBUV) $(WHOLE_ARCHIVE) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) -CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) +RT_LIBS := $(WHOLE_ARCHIVE) $(LIBUV) $(WHOLE_ARCHIVE) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) $(MMTK_LIB) +CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) $(MMTK_LIB) RT_DEBUG_LIBS := $(COMMON_LIBPATHS) $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp-debug.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport-debug.a -ljulia-debug $(RT_LIBS) CG_DEBUG_LIBS := $(COMMON_LIBPATHS) $(CG_LIBS) -ljulia-debug -ljulia-internal-debug RT_RELEASE_LIBS := $(COMMON_LIBPATHS) $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport.a -ljulia $(RT_LIBS) @@ -222,6 +249,12 @@ $(BUILDDIR)/%.h.gen : $(SRCDIR)/%.d sed 's/JULIA_/JL_PROBE_/' $@ > $@.tmp mv $@.tmp $@ +# Compile files from the binding side and copy so file into lib folder +ifneq (${MMTK_PLAN},None) +$(MMTK_LIB_DST): $(MMTK_LIB_SRC) + @$(call PRINT_MMTK, cp $< $@) +endif + $(BUILDDIR)/jl_internal_funcs.inc: $(SRCDIR)/jl_exported_funcs.inc # Generate `.inc` file that contains a list of `#define` macros to rename functions defined in `libjulia-internal` # to have a `ijl_` prefix instead of `jl_`, to denote that they are coming from `libjulia-internal`. 
This avoids @@ -314,6 +347,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h +$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/gc-mmtk.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(BUILDDIR)/gc-stacks.o $(BUILDDIR)/gc-stacks.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h @@ -386,13 +420,13 @@ $(BUILDDIR)/julia.expmap: $(SRCDIR)/julia.expmap.in $(JULIAHOME)/VERSION $(LLVM_ sed <'$<' >'$@' -e "s/@JULIA_SHLIB_SYMBOL_VERSION@/JL_LIBJULIA_$(SOMAJOR)/" \ -e "s/@LLVM_SHLIB_SYMBOL_VERSION@/$(LLVM_SHLIB_SYMBOL_VERSION)/" -$(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(OBJS) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV) +$(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(OBJS) $(MMTK_LIB_DST) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV) @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(RPATH_LIB) -o $@ \ $(JLDFLAGS) $(BOLT_LDFLAGS) $(JLIBLDFLAGS) $(RT_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-internal.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-internal.$(SHLIB_EXT) $@ $(DSYMUTIL) $@ -$(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(DOBJS) $(BUILDDIR)/flisp/libflisp-debug.a $(BUILDDIR)/support/libsupport-debug.a $(LIBUV) +$(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(DOBJS) $(MMTK_LIB_DST) $(BUILDDIR)/flisp/libflisp-debug.a $(BUILDDIR)/support/libsupport-debug.a $(LIBUV) @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(DEBUGFLAGS) $(DOBJS) $(RPATH_LIB) -o $@ \ $(JLDFLAGS) $(JLIBLDFLAGS) $(RT_DEBUG_LIBS) $(call SONAME_FLAGS,libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-internal-debug.$(SHLIB_EXT) $@ diff --git a/src/gc-common.c b/src/gc-common.c index c751b54f059f5..4dd088f49b7b8 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -540,6 +540,38 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) return jl_gc_alloc_(ptls, sz, ty); } +JL_DLLEXPORT void *jl_malloc(size_t sz) +{ + return jl_gc_counted_malloc(sz); +} + +//_unchecked_calloc does not check for potential overflow of nm*sz +STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { + size_t nmsz = nm*sz; + return jl_gc_counted_calloc(nmsz, 1); +} + +JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) +{ + if (nm > SSIZE_MAX/sz) + return NULL; + return _unchecked_calloc(nm, sz); +} + +JL_DLLEXPORT void jl_free(void *p) +{ + if (p != NULL) { + size_t sz = memory_block_usable_size(p, 0); + return jl_gc_counted_free_with_size(p, sz); + } +} + +JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) +{ + size_t old = p ? 
memory_block_usable_size(p, 0) : 0; + return jl_gc_counted_realloc_with_old_size(p, old, sz); +} + // =========================================================================== // // Generic Memory // =========================================================================== // @@ -677,6 +709,24 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } +// Sweeping mtarraylist_buffers: +// These buffers are made unreachable via `mtarraylist_resizeto` from mtarraylist.c +// and are freed at the end of GC via jl_gc_sweep_stack_pools_and_mtarraylist_buffers +void sweep_mtarraylist_buffers(void) JL_NOTSAFEPOINT +{ + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls == NULL) { + continue; + } + small_arraylist_t *buffers = &ptls->lazily_freed_mtarraylist_buffers; + void *buf; + while ((buf = small_arraylist_pop(buffers)) != NULL) { + free(buf); + } + } +} + #ifdef __cplusplus } #endif diff --git a/src/gc-common.h b/src/gc-common.h index 3007151009f7d..2726b372a1150 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -24,6 +24,31 @@ extern "C" { #endif +// =========================================================================== // +// GC Big objects +// =========================================================================== // + +JL_EXTENSION typedef struct _bigval_t { + struct _bigval_t *next; + struct _bigval_t *prev; + size_t sz; +#ifdef _P64 // Add padding so that the value is 64-byte aligned + // (8 pointers of 8 bytes each) - (4 other pointers in struct) + void *_padding[8 - 4]; +#else + // (16 pointers of 4 bytes each) - (4 other pointers in struct) + void *_padding[16 - 4]; +#endif + //struct jl_taggedvalue_t <>; + union { + uintptr_t header; + struct { + uintptr_t gc:2; + } bits; + }; + // must be 64-byte aligned here, in 32 & 64 bit modes +} bigval_t; + // =========================================================================== // // GC Callbacks // =========================================================================== // @@ -193,4 +218,14 @@ extern jl_ptls_t* gc_all_tls_states; extern int gc_logging_enabled; +// =========================================================================== // +// MISC +// =========================================================================== // + +// number of stacks to always keep available per pool +#define MIN_STACK_MAPPINGS_PER_POOL 5 + +void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; +void sweep_mtarraylist_buffers(void) JL_NOTSAFEPOINT; + #endif // JL_GC_COMMON_H diff --git a/src/gc-interface.h b/src/gc-interface.h index eb6687d52d9ab..e4a27782f7520 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -98,6 +98,13 @@ JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem); JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection); // Returns whether the thread with `tid` is a collector thread JL_DLLEXPORT int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT; +// Returns which GC implementation is being used and possibly its version according to the list of supported GCs +// NB: it should clearly identify the GC by including e.g. ‘stock’ or ‘mmtk’ as a substring. +JL_DLLEXPORT const char* jl_gc_active_impl(void); +// Sweep Julia's stack pools and mtarray buffers. Note that this function has been added to the interface as +// each GC should implement it but it will most likely not be used by other code in the runtime. +// It still needs to be annotated with JL_DLLEXPORT since it is called from Rust by MMTk. 
+JL_DLLEXPORT void jl_gc_sweep_stack_pools_and_mtarraylist_buffers(jl_ptls_t ptls) JL_NOTSAFEPOINT; // ========================================================================= // // Metrics @@ -138,7 +145,6 @@ JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void); // **must** also set the type of the returning object to be `ty`. The type `ty` may also be used to record // an allocation of that type in the allocation profiler. struct _jl_value_t *jl_gc_alloc_(struct _jl_tls_states_t * ptls, size_t sz, void *ty); - // Allocates small objects and increments Julia allocation counterst. Size of the object // header must be included in the object size. The (possibly unused in some implementations) // offset to the arena in which we're allocating is passed in the second parameter, and the @@ -198,6 +204,10 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // the allocated object. All objects stored in fields of this object // must be either permanently allocated or have other roots. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; +// This function notifies the GC about memory addresses that are set when loading the boot image. +// The GC may use that information to, for instance, determine that such objects should +// be treated as marked and belonged to the old generation in nursery collections. +void jl_gc_notify_image_load(const char* img_data, size_t len); // ========================================================================= // // Runtime Write-Barriers diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c new file mode 100644 index 0000000000000..20de2789b6453 --- /dev/null +++ b/src/gc-mmtk.c @@ -0,0 +1,1185 @@ +#include "gc-common.h" +#include "gc-tls-mmtk.h" +#include "mmtkMutator.h" +#include "threading.h" + +// File exists in the binding +#include "mmtk.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// ========================================================================= // +// Julia specific +// ========================================================================= // + +extern jl_value_t *cmpswap_names JL_GLOBALLY_ROOTED; +extern const unsigned pool_sizes[]; +extern jl_mutex_t finalizers_lock; + +// FIXME: Should the values below be shared between both GC's? +// Note that MMTk uses a hard max heap limit, which is set by default +// as 70% of the free available memory. The min heap is set as the +// default_collect_interval variable below. + +// max_total_memory is a suggestion. We try very hard to stay +// under this limit, but we will go above it rather than halting. +#ifdef _P64 +typedef uint64_t memsize_t; +static const size_t default_collect_interval = 5600 * 1024 * sizeof(void*); +// We expose this to the user/ci as jl_gc_set_max_memory +static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; +#else +typedef uint32_t memsize_t; +static const size_t default_collect_interval = 3200 * 1024 * sizeof(void*); +// Work really hard to stay within 2GB +// Alternative is to risk running out of address space +// on 32 bit architectures. 
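+// (1536 * 1024 * 1024 below is 1.5 GiB, leaving headroom under the 2 GB target above.)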
+#define MAX32HEAP 1536 * 1024 * 1024 +static memsize_t max_total_memory = (memsize_t) MAX32HEAP; +#endif + +// ========================================================================= // +// Defined by the binding +// ========================================================================= // + +extern void mmtk_julia_copy_stack_check(int copy_stack); +extern void mmtk_gc_init(uintptr_t min_heap_size, uintptr_t max_heap_size, uintptr_t n_gcthreads, uintptr_t header_size, uintptr_t tag); +extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); +extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr); +extern void* mmtk_alloc(void* mutator, size_t size, size_t align, size_t offset, int allocator); +extern void mmtk_post_alloc(void* mutator, void* refer, size_t bytes, int allocator); +extern void mmtk_store_obj_size_c(void* obj, size_t size); +extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS; +extern const void* MMTK_SIDE_VO_BIT_BASE_ADDRESS; + +// ========================================================================= // +// GC Initialization and Control +// ========================================================================= // + +void jl_gc_init(void) { + // TODO: use jl_options.heap_size_hint to set MMTk's fixed heap size? (see issue: https://github.com/mmtk/mmtk-julia/issues/167) + JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock"); + + arraylist_new(&to_finalize, 0); + arraylist_new(&finalizer_list_marked, 0); + + gc_num.allocd = 0; + gc_num.max_pause = 0; + gc_num.max_memory = 0; + + long long min_heap_size; + long long max_heap_size; + char* min_size_def = getenv("MMTK_MIN_HSIZE"); + char* min_size_gb = getenv("MMTK_MIN_HSIZE_G"); + + char* max_size_def = getenv("MMTK_MAX_HSIZE"); + char* max_size_gb = getenv("MMTK_MAX_HSIZE_G"); + + // default min heap currently set as Julia's default_collect_interval + if (min_size_def != NULL) { + char *p; + double min_size = strtod(min_size_def, &p); + min_heap_size = (long) 1024 * 1024 * min_size; + } else if (min_size_gb != NULL) { + char *p; + double min_size = strtod(min_size_gb, &p); + min_heap_size = (long) 1024 * 1024 * 1024 * min_size; + } else { + min_heap_size = default_collect_interval; + } + + // default max heap currently set as 70% the free memory in the system + if (max_size_def != NULL) { + char *p; + double max_size = strtod(max_size_def, &p); + max_heap_size = (long) 1024 * 1024 * max_size; + } else if (max_size_gb != NULL) { + char *p; + double max_size = strtod(max_size_gb, &p); + max_heap_size = (long) 1024 * 1024 * 1024 * max_size; + } else { + max_heap_size = uv_get_free_memory() * 70 / 100; + } + + // Assert that the number of stock GC threads is 0; MMTK uses the number of threads in jl_options.ngcthreads + assert(jl_n_gcthreads == 0); + + // Check that the julia_copy_stack rust feature has been defined when the COPY_STACK has been defined + int copy_stacks; + +#ifdef COPY_STACKS + copy_stacks = 1; +#else + copy_stacks = 0; +#endif + + mmtk_julia_copy_stack_check(copy_stacks); + + // if only max size is specified initialize MMTk with a fixed size heap + // TODO: We just assume mark threads means GC threads, and ignore the number of concurrent sweep threads. + // If the two values are the same, we can use either. Otherwise, we need to be careful. 
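+    // Recap of the sizing logic above: e.g. MMTK_MAX_HSIZE_G=4 with no minimum set requests a
+    // fixed 4 GiB heap, whereas also providing MMTK_MIN_HSIZE(_G) selects a dynamically sized
+    // heap between the two bounds (see the mmtk_gc_init call below).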
+ uintptr_t gcthreads = jl_options.nmarkthreads; + if (max_size_def != NULL || (max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL))) { + mmtk_gc_init(0, max_heap_size, gcthreads, (sizeof(jl_taggedvalue_t)), jl_buff_tag); + } else { + mmtk_gc_init(min_heap_size, max_heap_size, gcthreads, (sizeof(jl_taggedvalue_t)), jl_buff_tag); + } +} + +void jl_start_gc_threads(void) { + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_initialize_collection((void *)ptls); +} + +void jl_init_thread_heap(struct _jl_tls_states_t *ptls) JL_NOTSAFEPOINT { + jl_thread_heap_common_t *heap = &ptls->gc_tls_common.heap; + small_arraylist_new(&heap->weak_refs, 0); + small_arraylist_new(&heap->live_tasks, 0); + for (int i = 0; i < JL_N_STACK_POOLS; i++) + small_arraylist_new(&heap->free_stacks[i], 0); + heap->mallocarrays = NULL; + heap->mafreelist = NULL; + arraylist_new(&ptls->finalizers, 0); + // Initialize `lazily_freed_mtarraylist_buffers` + small_arraylist_new(&ptls->lazily_freed_mtarraylist_buffers, 0); + // Clear the malloc sz count + jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0); + // Create mutator + MMTk_Mutator mmtk_mutator = mmtk_bind_mutator((void *)ptls, ptls->tid); + // Copy the mutator to the thread local storage + memcpy(&ptls->gc_tls.mmtk_mutator, mmtk_mutator, sizeof(MMTkMutatorContext)); + // Call post_bind to maintain a list of active mutators and to reclaim the old mutator (which is no longer needed) + mmtk_post_bind_mutator(&ptls->gc_tls.mmtk_mutator, mmtk_mutator); + memset(&ptls->gc_tls_common.gc_num, 0, sizeof(ptls->gc_tls_common.gc_num)); +} + +void jl_free_thread_gc_state(struct _jl_tls_states_t *ptls) { + mmtk_destroy_mutator(&ptls->gc_tls.mmtk_mutator); +} + +JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) { + // MMTk currently does not allow setting the heap size at runtime +} + +inline void maybe_collect(jl_ptls_t ptls) +{ + // Just do a safe point for general maybe_collect + jl_gc_safepoint_(ptls); +} + +// This is only used for malloc. We need to know if we need to do GC. However, keeping checking with MMTk (mmtk_gc_poll), +// is expensive. So we only check for every few allocations. +static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz) +{ + // We do not need to carefully maintain malloc_sz_since_last_poll. We just need to + // avoid using mmtk_gc_poll too frequently, and try to be precise on our heap usage + // as much as we can. 
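+    // In effect, mmtk_gc_poll runs roughly once per 4 KiB of counted malloc/calloc/realloc
+    // traffic; all other calls fall through to the ordinary safepoint below.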
+ if (ptls->gc_tls.malloc_sz_since_last_poll > 4096) { + jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0); + mmtk_gc_poll(ptls); + } else { + size_t curr = jl_atomic_load_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll); + jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, curr + sz); + jl_gc_safepoint_(ptls); + } +} + +// This is called when the user calls for a GC with Gc.gc() +JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); + jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); + return; + } + mmtk_handle_user_collection_request(ptls, collection); +} + + +// Based on jl_gc_collect from gc-stock.c +// called when stopping the thread in `mmtk_block_for_gc` +JL_DLLEXPORT void jl_gc_prepare_to_collect(void) +{ + // FIXME: set to JL_GC_AUTO since we're calling it from mmtk + // maybe just remove this? + JL_PROBE_GC_BEGIN(JL_GC_AUTO); + + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); + jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); + return; + } + + int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state); + jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING); + // `jl_safepoint_start_gc()` makes sure only one thread can run the GC. + uint64_t t0 = jl_hrtime(); + if (!jl_safepoint_start_gc(ct)) { + jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); + jl_safepoint_wait_thread_resume(ct); // block in thread-suspend now if requested, after clearing the gc_state + return; + } + + JL_TIMING_SUSPEND_TASK(GC, ct); + JL_TIMING(GC, GC); + + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +#endif + // Now we are ready to wait for other threads to hit the safepoint, + // we can do a few things that doesn't require synchronization. + // + // We must sync here with the tls_lock operations, so that we have a + // seq-cst order between these events now we know that either the new + // thread must run into our safepoint flag or we must observe the + // existence of the thread in the jl_n_threads count. + // + // TODO: concurrently queue objects + jl_fence(); + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + jl_gc_wait_for_the_world(gc_all_tls_states, gc_n_threads); + JL_PROBE_GC_STOP_THE_WORLD(); + + uint64_t t1 = jl_hrtime(); + uint64_t duration = t1 - t0; + if (duration > gc_num.max_time_to_safepoint) + gc_num.max_time_to_safepoint = duration; + gc_num.time_to_safepoint = duration; + gc_num.total_time_to_safepoint += duration; + + if (!jl_atomic_load_acquire(&jl_gc_disable_counter)) { + JL_LOCK_NOGC(&finalizers_lock); // all the other threads are stopped, so this does not make sense, right? 
otherwise, failing that, this seems like plausibly a deadlock +#ifndef __clang_gcanalyzer__ + mmtk_block_thread_for_gc(); +#endif + JL_UNLOCK_NOGC(&finalizers_lock); + } + + gc_n_threads = 0; + gc_all_tls_states = NULL; + jl_safepoint_end_gc(); + jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); + JL_PROBE_GC_END(); + jl_safepoint_wait_thread_resume(ct); // block in thread-suspend now if requested, after clearing the gc_state + + // Only disable finalizers on current thread + // Doing this on all threads is racy (it's impossible to check + // or wait for finalizers on other threads without dead lock). + if (!ptls->finalizers_inhibited && ptls->locks.len == 0) { + JL_TIMING(GC, GC_Finalizers); + run_finalizers(ct, 0); + } + JL_PROBE_GC_FINALIZER(); + +#ifdef _OS_WINDOWS_ + SetLastError(last_error); +#endif + errno = last_errno; +} + +// ========================================================================= // +// GC Statistics +// ========================================================================= // + +JL_DLLEXPORT const char* jl_gc_active_impl(void) { + const char* mmtk_version = get_mmtk_version(); + return mmtk_version; +} + +int64_t last_gc_total_bytes = 0; +int64_t last_live_bytes = 0; // live_bytes at last collection +int64_t live_bytes = 0; + +// FIXME: The functions combine_thread_gc_counts and reset_thread_gc_counts +// are currently nearly identical for mmtk and for stock. However, the stats +// are likely different (e.g., MMTk doesn't track the bytes allocated in the fastpath, +// but only when the slowpath is called). We might need to adapt these later so that +// the statistics are the same or as close as possible for each GC. +static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTSAFEPOINT +{ + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls) { + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc); + dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); + if (update_heap) { + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); + } + } + } +} + +void reset_thread_gc_counts(void) JL_NOTSAFEPOINT +{ + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls != NULL) { + // don't reset `pool_live_bytes` here + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + 
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); + } + } +} + +// Retrieves Julia's `GC_Num` (structure that stores GC statistics). +JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { + jl_gc_num_t num = gc_num; + combine_thread_gc_counts(&num, 0); + return num; +} + +JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) JL_NOTSAFEPOINT { + int64_t oldtb = last_gc_total_bytes; + int64_t newtb; + jl_gc_get_total_bytes(&newtb); + last_gc_total_bytes = newtb; + return newtb - oldtb; +} + +JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT +{ + int64_t oldtb = last_gc_total_bytes; + int64_t newtb; + jl_gc_get_total_bytes(&newtb); + last_gc_total_bytes = newtb - offset; + return newtb - oldtb; +} + +JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) { + return 0; +} + +void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT +{ + jl_ptls_t ptls = jl_current_task->ptls; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); +} + +void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT +{ +} + +int64_t inc_live_bytes(int64_t inc) JL_NOTSAFEPOINT +{ + jl_timing_counter_inc(JL_TIMING_COUNTER_HeapSize, inc); + return live_bytes += inc; +} + +void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT +{ + combine_thread_gc_counts(&gc_num, 0); + inc_live_bytes(gc_num.deferred_alloc + gc_num.allocd); + gc_num.allocd = 0; + gc_num.deferred_alloc = 0; + reset_thread_gc_counts(); +} + +JL_DLLEXPORT int64_t jl_gc_live_bytes(void) { + return last_live_bytes; +} + +JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT +{ + jl_gc_num_t num = gc_num; + combine_thread_gc_counts(&num, 0); + // Sync this logic with `base/util.jl:GC_Diff` + *bytes = (num.total_allocd + num.deferred_alloc + num.allocd); +} + +JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void) +{ + // FIXME: should probably return MMTk's heap size + return max_total_memory; +} + +// These are needed to collect MMTk statistics from a Julia program using ccall +JL_DLLEXPORT void (jl_mmtk_harness_begin)(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_harness_begin(ptls); +} + +JL_DLLEXPORT void (jl_mmtk_harness_end)(void) +{ + mmtk_harness_end(); +} + +// ========================================================================= // +// Root Processing, Object Scanning and Julia-specific sweeping +// ========================================================================= // + +static void add_node_to_roots_buffer(RootsWorkClosure* closure, RootsWorkBuffer* buf, size_t* buf_len, void* root) { + if (root == NULL) + return; + + buf->ptr[*buf_len] = root; + *buf_len += 1; + if (*buf_len >= buf->cap) { + RootsWorkBuffer new_buf = (closure->report_nodes_func)(buf->ptr, *buf_len, buf->cap, closure->data, true); + *buf = new_buf; + *buf_len = 0; + } +} + +static void add_node_to_tpinned_roots_buffer(RootsWorkClosure* closure, RootsWorkBuffer* buf, size_t* buf_len, void* root) { + if (root == NULL) + return; + + buf->ptr[*buf_len] = root; + *buf_len += 1; + if (*buf_len >= buf->cap) { + RootsWorkBuffer new_buf = (closure->report_tpinned_nodes_func)(buf->ptr, *buf_len, buf->cap, closure->data, true); + *buf = new_buf; + *buf_len = 0; + } +} + +JL_DLLEXPORT void jl_gc_scan_vm_specific_roots(RootsWorkClosure* closure) +{ + // Create a new buf + RootsWorkBuffer buf = (closure->report_nodes_func)((void**)0, 0, 0, closure->data, true); + size_t len = 0; + + // add module + add_node_to_roots_buffer(closure, &buf, &len, jl_main_module); + + // buildin values + 
add_node_to_roots_buffer(closure, &buf, &len, jl_an_empty_vec_any); + add_node_to_roots_buffer(closure, &buf, &len, jl_module_init_order); + for (size_t i = 0; i < jl_current_modules.size; i += 2) { + if (jl_current_modules.table[i + 1] != HT_NOTFOUND) { + add_node_to_roots_buffer(closure, &buf, &len, jl_current_modules.table[i]); + } + } + add_node_to_roots_buffer(closure, &buf, &len, jl_anytuple_type_type); + for (size_t i = 0; i < N_CALL_CACHE; i++) { + jl_typemap_entry_t *v = jl_atomic_load_relaxed(&call_cache[i]); + add_node_to_roots_buffer(closure, &buf, &len, v); + } + add_node_to_roots_buffer(closure, &buf, &len, _jl_debug_method_invalidation); + + // constants + add_node_to_roots_buffer(closure, &buf, &len, jl_emptytuple_type); + add_node_to_roots_buffer(closure, &buf, &len, cmpswap_names); + + // jl_global_roots_table must be transitively pinned + RootsWorkBuffer tpinned_buf = (closure->report_tpinned_nodes_func)((void**)0, 0, 0, closure->data, true); + size_t tpinned_len = 0; + add_node_to_tpinned_roots_buffer(closure, &tpinned_buf, &tpinned_len, jl_global_roots_list); + add_node_to_tpinned_roots_buffer(closure, &tpinned_buf, &tpinned_len, jl_global_roots_keyset); + + // Push the result of the work. + (closure->report_nodes_func)(buf.ptr, len, buf.cap, closure->data, false); + (closure->report_tpinned_nodes_func)(tpinned_buf.ptr, tpinned_len, tpinned_buf.cap, closure->data, false); +} + +JL_DLLEXPORT void jl_gc_scan_julia_exc_obj(void* obj_raw, void* closure, ProcessSlotFn process_slot) { + jl_task_t *ta = (jl_task_t*)obj_raw; + + if (ta->excstack) { // inlining label `excstack` from mark_loop + + // the excstack should always be a heap object + assert(mmtk_object_is_managed_by_mmtk(ta->excstack)); + + process_slot(closure, &ta->excstack); + jl_excstack_t *excstack = ta->excstack; + size_t itr = ta->excstack->top; + size_t bt_index = 0; + size_t jlval_index = 0; + while (itr > 0) { + size_t bt_size = jl_excstack_bt_size(excstack, itr); + jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr); + for (; bt_index < bt_size; bt_index += jl_bt_entry_size(bt_data + bt_index)) { + jl_bt_element_t *bt_entry = bt_data + bt_index; + if (jl_bt_is_native(bt_entry)) + continue; + // Found an extended backtrace entry: iterate over any + // GC-managed values inside. 
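+                // (Extended frames keep their GC-managed payload starting at element index 2,
+                // matching the layout read by jl_bt_entry_jlvalue in julia_internal.h.)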
+ size_t njlvals = jl_bt_num_jlvals(bt_entry); + while (jlval_index < njlvals) { + jl_value_t** new_obj_slot = &bt_entry[2 + jlval_index].jlvalue; + jlval_index += 1; + process_slot(closure, new_obj_slot); + } + jlval_index = 0; + } + + jl_bt_element_t *stack_raw = (jl_bt_element_t *)(excstack+1); + jl_value_t** stack_obj_slot = &stack_raw[itr-1].jlvalue; + + itr = jl_excstack_next(excstack, itr); + bt_index = 0; + jlval_index = 0; + process_slot(closure, stack_obj_slot); + } + } +} + +// This is used in mmtk_sweep_malloced_memory and it is slightly different +// from jl_gc_free_memory from gc-stock.c as the stock GC updates the +// information in the global variable gc_heap_stats (which is specific to the stock GC) +static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT +{ + assert(jl_is_genericmemory(v)); + jl_genericmemory_t *m = (jl_genericmemory_t*)v; + assert(jl_genericmemory_how(m) == 1 || jl_genericmemory_how(m) == 2); + char *d = (char*)m->ptr; + if (isaligned) + jl_free_aligned(d); + else + free(d); + gc_num.freed += jl_genericmemory_nbytes(m); + gc_num.freecall++; +} + +JL_DLLEXPORT void jl_gc_mmtk_sweep_malloced_memory(void) JL_NOTSAFEPOINT +{ + void* iter = mmtk_new_mutator_iterator(); + jl_ptls_t ptls2 = (jl_ptls_t)mmtk_get_next_mutator_tls(iter); + while(ptls2 != NULL) { + mallocmemory_t *ma = ptls2->gc_tls_common.heap.mallocarrays; + mallocmemory_t **pma = &ptls2->gc_tls_common.heap.mallocarrays; + while (ma != NULL) { + mallocmemory_t *nxt = ma->next; + jl_value_t *a = (jl_value_t*)((uintptr_t)ma->a & ~1); + // the array should always be a heap object + assert(mmtk_object_is_managed_by_mmtk(a)); + if (mmtk_is_live_object(a)) { + // if the array has been forwarded, the reference needs to be updated + jl_genericmemory_t *maybe_forwarded = (jl_genericmemory_t*)mmtk_get_possibly_forwarded(ma->a); + ma->a = maybe_forwarded; + pma = &ma->next; + } + else { + *pma = nxt; + int isaligned = (uintptr_t)ma->a & 1; + jl_gc_free_memory(a, isaligned); + ma->next = ptls2->gc_tls_common.heap.mafreelist; + ptls2->gc_tls_common.heap.mafreelist = ma; + } + ma = nxt; + } + ptls2 = (jl_ptls_t)mmtk_get_next_mutator_tls(iter); + } + mmtk_close_mutator_iterator(iter); +} + +#define jl_genericmemory_elsize(a) (((jl_datatype_t*)jl_typetagof(a))->layout->size) + +// if data is inlined inside the genericmemory object --- to->ptr needs to be updated when copying the array +JL_DLLEXPORT void jl_gc_update_inlined_array(void* from, void* to) { + jl_value_t* jl_from = (jl_value_t*) from; + jl_value_t* jl_to = (jl_value_t*) to; + + uintptr_t tag_to = (uintptr_t)jl_typeof(jl_to); + jl_datatype_t *vt = (jl_datatype_t*)tag_to; + + if(vt->name == jl_genericmemory_typename) { + jl_genericmemory_t *a = (jl_genericmemory_t*)jl_from; + jl_genericmemory_t *b = (jl_genericmemory_t*)jl_to; + int how = jl_genericmemory_how(a); + + if (how == 0 && mmtk_object_is_managed_by_mmtk(a->ptr)) { // a is inlined (a->ptr points into the mmtk object) + size_t offset_of_data = ((size_t)a->ptr - (size_t)a); + if (offset_of_data > 0) { + b->ptr = (void*)((size_t) b + offset_of_data); + } + } + } +} + +// modified sweep_stack_pools from gc-stacks.c +JL_DLLEXPORT void jl_gc_mmtk_sweep_stack_pools(void) +{ + // Stack sweeping algorithm: + // // deallocate stacks if we have too many sitting around unused + // for (stk in halfof(free_stacks)) + // free_stack(stk, pool_sz); + // // then sweep the task stacks + // for (t in live_tasks) + // if (!gc-marked(t)) + // stkbuf = t->stkbuf + // bufsz = t->bufsz + // if 
(stkbuf) + // push(free_stacks[sz], stkbuf) + assert(gc_n_threads); + for (int i = 0; i < jl_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; + + // free half of stacks that remain unused since last sweep + for (int p = 0; p < JL_N_STACK_POOLS; p++) { + small_arraylist_t *al = &ptls2->gc_tls_common.heap.free_stacks[p]; + size_t n_to_free; + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + n_to_free = al->len; // not alive yet or dead, so it does not need these anymore + } + else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { + n_to_free = al->len / 2; + if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL)) + n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL; + } + else { + n_to_free = 0; + } + for (int n = 0; n < n_to_free; n++) { + void *stk = small_arraylist_pop(al); + free_stack(stk, pool_sizes[p]); + } + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + small_arraylist_free(al); + } + } + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + small_arraylist_free(ptls2->gc_tls_common.heap.free_stacks); + } + + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; + size_t n = 0; + size_t ndel = 0; + size_t l = live_tasks->len; + void **lst = live_tasks->items; + if (l == 0) + continue; + while (1) { + jl_task_t *t = (jl_task_t*)lst[n]; + if (mmtk_is_live_object(t)) { + jl_task_t *maybe_forwarded = (jl_task_t*)mmtk_get_possibly_forwarded(t); + live_tasks->items[n] = maybe_forwarded; + t = maybe_forwarded; + assert(jl_is_task(t)); + if (t->ctx.stkbuf == NULL) + ndel++; // jl_release_task_stack called + else + n++; + } else { + ndel++; + void *stkbuf = t->ctx.stkbuf; + size_t bufsz = t->ctx.bufsz; + if (stkbuf) { + t->ctx.stkbuf = NULL; + _jl_free_stack(ptls2, stkbuf, bufsz); + } +#ifdef _COMPILER_TSAN_ENABLED_ + if (t->ctx.tsan_state) { + __tsan_destroy_fiber(t->ctx.tsan_state); + t->ctx.tsan_state = NULL; + } +#endif + } + if (n >= l - ndel) + break; + void *tmp = lst[n]; + lst[n] = lst[n + ndel]; + lst[n + ndel] = tmp; + } + live_tasks->len -= ndel; + } +} + +JL_DLLEXPORT void jl_gc_sweep_stack_pools_and_mtarraylist_buffers(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + jl_gc_mmtk_sweep_stack_pools(); + sweep_mtarraylist_buffers(); +} + +JL_DLLEXPORT void* jl_gc_get_stackbase(int16_t tid) { + assert(tid >= 0); + jl_ptls_t ptls2 = jl_all_tls_states[tid]; + return ptls2->stackbase; +} + +JL_DLLEXPORT void jl_gc_update_stats(uint64_t inc, size_t mmtk_live_bytes, bool is_nursery_gc) { + gc_num.total_time += inc; + gc_num.pause += 1; + gc_num.full_sweep += !(is_nursery_gc); + gc_num.total_allocd += gc_num.allocd; + gc_num.allocd = 0; + live_bytes = mmtk_live_bytes; +} + +#define jl_genericmemory_data_owner_field_addr(a) ((jl_value_t**)((jl_genericmemory_t*)(a) + 1)) + +JL_DLLEXPORT void* jl_gc_get_owner_address_to_mmtk(void* m) { + return (void*)jl_genericmemory_data_owner_field_addr(m); +} + +// same as jl_genericmemory_how but with JL_DLLEXPORT +// we should probably inline this in Rust +JL_DLLEXPORT size_t jl_gc_genericmemory_how(void *arg) JL_NOTSAFEPOINT +{ + jl_genericmemory_t* m = (jl_genericmemory_t*)arg; + if (m->ptr == (void*)((char*)m + 16)) // JL_SMALL_BYTE_ALIGNMENT (from julia_internal.h) + return 0; + jl_value_t *owner = jl_genericmemory_data_owner_field(m); + if (owner == (jl_value_t*)m) + return 1; + if (owner == NULL) + return 2; + return 3; +} + +// ========================================================================= // +// Weak References and Finalizers +// 
========================================================================= // + +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value) +{ + jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); + wr->value = value; // NOTE: wb not needed here + mmtk_add_weak_candidate(wr); + return wr; +} + +JL_DLLEXPORT void* jl_gc_get_thread_finalizer_list(void* ptls_raw) { + jl_ptls_t ptls = (jl_ptls_t) ptls_raw; + return (void*)&ptls->finalizers; +} + +JL_DLLEXPORT void* jl_gc_get_to_finalize_list(void) { + return (void*)&to_finalize; +} + +JL_DLLEXPORT void* jl_gc_get_marked_finalizers_list(void) { + return (void*)&finalizer_list_marked; +} + +JL_DLLEXPORT int* jl_gc_get_have_pending_finalizers(void) { + return (int*)&jl_gc_have_pending_finalizers; +} + +// ========================================================================= // +// Allocation +// ========================================================================= // + +#define MMTK_DEFAULT_IMMIX_ALLOCATOR (0) +#define MMTK_IMMORTAL_BUMP_ALLOCATOR (0) + +int jl_gc_classify_pools(size_t sz, int *osize) +{ + if (sz > GC_MAX_SZCLASS) + return -1; // call big alloc function + size_t allocsz = sz + sizeof(jl_taggedvalue_t); + *osize = LLT_ALIGN(allocsz, 16); + return 0; // use MMTk's fastpath logic +} + +#define MMTK_MIN_ALIGNMENT 4 +// MMTk assumes allocation size is aligned to min alignment. +inline size_t mmtk_align_alloc_sz(size_t sz) JL_NOTSAFEPOINT +{ + return (sz + MMTK_MIN_ALIGNMENT - 1) & ~(MMTK_MIN_ALIGNMENT - 1); +} + +inline void* bump_alloc_fast(MMTkMutatorContext* mutator, uintptr_t* cursor, uintptr_t limit, size_t size, size_t align, size_t offset, int allocator) { + intptr_t delta = (-offset - *cursor) & (align - 1); + uintptr_t result = *cursor + (uintptr_t)delta; + + if (__unlikely(result + size > limit)) { + return (void*) mmtk_alloc(mutator, size, align, offset, allocator); + } else{ + *cursor = result + size; + return (void*)result; + } +} + +inline void* mmtk_immix_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { + ImmixAllocator* allocator = &mutator->allocators.immix[MMTK_DEFAULT_IMMIX_ALLOCATOR]; + return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (intptr_t)allocator->limit, size, align, offset, 0); +} + +inline void mmtk_immix_post_alloc_slow(MMTkMutatorContext* mutator, void* obj, size_t size) { + mmtk_post_alloc(mutator, obj, size, 0); +} + +inline void mmtk_immix_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { + // FIXME: for now, we do nothing + // but when supporting moving, this is where we set the valid object (VO) bit +} + +inline void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { + BumpAllocator* allocator = &mutator->allocators.bump_pointer[MMTK_IMMORTAL_BUMP_ALLOCATOR]; + return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (uintptr_t)allocator->limit, size, align, offset, 1); +} + +inline void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { + // FIXME: Similarly, for now, we do nothing + // but when supporting moving, this is where we set the valid object (VO) bit + // and log (old gen) bit +} + +JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, size_t align, void *ty) +{ + // safepoint + jl_gc_safepoint_(ptls); + + jl_value_t *v; + if ((uintptr_t)ty != jl_buff_tag) { + // v needs to be 16 byte aligned, therefore v_tagged needs to be offset accordingly to 
consider the size of header + jl_taggedvalue_t *v_tagged = (jl_taggedvalue_t *)mmtk_immix_alloc_fast(&ptls->gc_tls.mmtk_mutator, LLT_ALIGN(osize, align), align, sizeof(jl_taggedvalue_t)); + v = jl_valueof(v_tagged); + mmtk_immix_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, v, LLT_ALIGN(osize, align)); + } else { + // allocating an extra word to store the size of buffer objects + jl_taggedvalue_t *v_tagged = (jl_taggedvalue_t *)mmtk_immix_alloc_fast(&ptls->gc_tls.mmtk_mutator, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align), align, 0); + jl_value_t* v_tagged_aligned = ((jl_value_t*)((char*)(v_tagged) + sizeof(jl_taggedvalue_t))); + v = jl_valueof(v_tagged_aligned); + mmtk_store_obj_size_c(v, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align)); + mmtk_immix_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, v, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align)); + } + + ptls->gc_tls_common.gc_num.allocd += osize; + ptls->gc_tls_common.gc_num.poolalloc++; + + return v; +} + +JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t sz) +{ + // safepoint + jl_gc_safepoint_(ptls); + + size_t offs = offsetof(bigval_t, header); + assert(sz >= sizeof(jl_taggedvalue_t) && "sz must include tag"); + static_assert(offsetof(bigval_t, header) >= sizeof(void*), "Empty bigval header?"); + static_assert(sizeof(bigval_t) % JL_HEAP_ALIGNMENT == 0, ""); + size_t allocsz = LLT_ALIGN(sz + offs, JL_CACHE_BYTE_ALIGNMENT); + if (allocsz < sz) { // overflow in adding offs, size was "negative" + assert(0 && "Error when allocating big object"); + jl_throw(jl_memory_exception); + } + + bigval_t *v = (bigval_t*)mmtk_alloc_large(&ptls->gc_tls.mmtk_mutator, allocsz, JL_CACHE_BYTE_ALIGNMENT, 0, 2); + + if (v == NULL) { + assert(0 && "Allocation failed"); + jl_throw(jl_memory_exception); + } + v->sz = allocsz; + + ptls->gc_tls_common.gc_num.allocd += allocsz; + ptls->gc_tls_common.gc_num.bigalloc++; + + jl_value_t *result = jl_valueof(&v->header); + mmtk_post_alloc(&ptls->gc_tls.mmtk_mutator, result, allocsz, 2); + + return result; +} + +// Instrumented version of jl_gc_small_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_small_alloc(jl_ptls_t ptls, int offset, int osize, jl_value_t* type) +{ + assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); + + jl_value_t *val = jl_mmtk_gc_alloc_default(ptls, osize, 16, NULL); + maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); + return val; +} + +// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz, jl_value_t *type) +{ + // TODO: assertion needed here? 
+ assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); + + jl_value_t *val = jl_mmtk_gc_alloc_big(ptls, sz); + maybe_record_alloc_to_profile(val, sz, (jl_datatype_t*)type); + return val; +} + +inline jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) +{ + jl_value_t *v; + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + if (sz <= GC_MAX_SZCLASS) { + v = jl_mmtk_gc_alloc_default(ptls, allocsz, 16, ty); + } + else { + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + v = jl_mmtk_gc_alloc_big(ptls, allocsz); + } + jl_set_typeof(v, ty); + maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); + return v; +} + +// allocation wrappers that track allocation and let collection run +JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + void *data = malloc(sz); + if (data != NULL && pgcstack != NULL && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, sz); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz); + } + return data; +} + +JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + void *data = calloc(nm, sz); + if (data != NULL && pgcstack != NULL && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, nm * sz); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, nm * sz); + } + return data; +} + +JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + free(p); + if (pgcstack != NULL && ct->world_age) { + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, -sz); + } +} + +JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + if (pgcstack && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, sz); + if (sz < old) + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, old - sz); + else + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz - old); + } + return realloc(p, sz); +} + +void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) +{ + jl_ptls_t ptls = jl_current_task->ptls; + size_t allocsz = mmtk_align_alloc_sz(sz); + void* addr = mmtk_immortal_alloc_fast(&ptls->gc_tls.mmtk_mutator, allocsz, align, offset); + return addr; +} + +void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) +{ + return jl_gc_perm_alloc_nolock(sz, zero, align, offset); +} + +jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT +{ + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + unsigned align = (sz == 0 ? sizeof(void*) : (allocsz <= sizeof(void*) * 2 ? 
+ sizeof(void*) * 2 : 16)); + jl_taggedvalue_t *o = (jl_taggedvalue_t*)jl_gc_perm_alloc(allocsz, 0, align, + sizeof(void*) % align); + + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_immortal_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, jl_valueof(o), allocsz); + o->header = (uintptr_t)ty; + return jl_valueof(o); +} + +JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) +{ + jl_ptls_t ptls = jl_current_task->ptls; + maybe_collect(ptls); + size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT); + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +#endif + void *b = malloc_cache_align(allocsz); + if (b == NULL) + jl_throw(jl_memory_exception); + + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); + // FIXME: Should these be part of mmtk's heap? + // malloc_maybe_collect(ptls, sz); + // jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, allocsz); +#ifdef _OS_WINDOWS_ + SetLastError(last_error); +#endif + errno = last_errno; + // jl_gc_managed_malloc is currently always used for allocating array buffers. + maybe_record_alloc_to_profile((jl_value_t*)b, sz, (jl_datatype_t*)jl_buff_tag); + return b; +} + +void jl_gc_notify_image_load(const char* img_data, size_t len) +{ + mmtk_set_vm_space((void*)img_data, len); +} + +// ========================================================================= // +// Code specific to stock that is not supported by MMTk +// ========================================================================= // + +// mutex for page profile +uv_mutex_t page_profile_lock; + +JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) +{ + uv_mutex_lock(&page_profile_lock); + const char *str = "Page profiler in unsupported in MMTk."; + ios_write(stream, str, strlen(str)); + uv_mutex_unlock(&page_profile_lock); +} + +// this seems to be needed by the gc tests +#define JL_GC_N_MAX_POOLS 51 +JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS]; + +STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT +{ + // FIXME: MMTk would have to provide its own stats +} + +#define MMTK_GC_PAGE_SZ (1 << 12) // MMTk's page size is defined in mmtk-core constants + +JL_DLLEXPORT uint64_t jl_get_pg_size(void) +{ + return MMTK_GC_PAGE_SZ; +} + +// Not used by mmtk +// Number of GC threads that may run parallel marking +int jl_n_markthreads; +// Number of GC threads that may run concurrent sweeping (0 or 1) +int jl_n_sweepthreads; +// `tid` of first GC thread +int gc_first_tid; +// Number of threads sweeping stacks +_Atomic(int) gc_n_threads_sweeping_stacks; +// counter for sharing work when sweeping stacks +_Atomic(int) gc_ptls_sweep_idx; +// counter for round robin of giving back stack pages to the OS +_Atomic(int) gc_stack_free_idx = 0; + +JL_DLLEXPORT void jl_gc_queue_root(const struct _jl_value_t *ptr) JL_NOTSAFEPOINT +{ + mmtk_unreachable(); +} + +JL_DLLEXPORT void jl_gc_queue_multiroot(const struct _jl_value_t *root, const void *stored, + struct _jl_datatype_t *dt) JL_NOTSAFEPOINT +{ + mmtk_unreachable(); +} + +JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +{ + mmtk_unreachable(); + return 0; +} + +JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, + jl_value_t **objs, 
size_t nobjs) +{ + mmtk_unreachable(); +} + +JL_DLLEXPORT size_t jl_gc_max_internal_obj_size(void) +{ + // TODO: meaningful for MMTk? + return GC_MAX_SZCLASS; +} + +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +{ + // FIXME: do we need to implement this? +} + +// gc-debug functions +JL_DLLEXPORT jl_taggedvalue_t *jl_gc_find_taggedvalue_pool(char *p, size_t *osize_p) +{ + return NULL; +} + +void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT +{ +} + +int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return 0; +} + +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT +{ + // May not be accurate but should be helpful enough + uint64_t pool_count = gc_num.poolalloc; + uint64_t big_count = gc_num.bigalloc; + jl_safe_printf("Allocations: %" PRIu64 " " + "(Pool: %" PRIu64 "; Big: %" PRIu64 "); GC: %d\n", + pool_count + big_count, pool_count, big_count, gc_num.pause); +} + +JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) +{ + return sizeof(bigval_t); +} + +void jl_print_gc_stats(JL_STREAM *s) +{ +} + +JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) +{ + return 0; +} + +JL_DLLEXPORT int jl_gc_conservative_gc_support_enabled(void) +{ + return 0; +} + +// TODO: if this is needed, it can be added in MMTk +JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) +{ + return NULL; +} + +#ifdef __cplusplus +} +#endif diff --git a/src/gc-stacks.c b/src/gc-stacks.c index a0ca2561c5cf9..9387c7fb065ec 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -1,7 +1,6 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license #include "gc-common.h" -#include "gc-stock.h" #include "threading.h" #ifndef _OS_WINDOWS_ # include @@ -21,9 +20,6 @@ # endif #endif -// number of stacks to always keep available per pool -#define MIN_STACK_MAPPINGS_PER_POOL 5 - const size_t jl_guard_size = (4096 * 8); static _Atomic(uint32_t) num_stack_mappings = 0; @@ -203,99 +199,6 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO return stk; } -void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT -{ - // Stack sweeping algorithm: - // // deallocate stacks if we have too many sitting around unused - // for (stk in halfof(free_stacks)) - // free_stack(stk, pool_sz); - // // then sweep the task stacks - // for (t in live_tasks) - // if (!gc-marked(t)) - // stkbuf = t->stkbuf - // bufsz = t->bufsz - // if (stkbuf) - // push(free_stacks[sz], stkbuf) - jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, 1); - while (1) { - int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1); - if (i < 0) - break; - jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - assert(gc_n_threads); - // free half of stacks that remain unused since last sweep - if (i == jl_atomic_load_relaxed(&gc_stack_free_idx)) { - for (int p = 0; p < JL_N_STACK_POOLS; p++) { - small_arraylist_t *al = &ptls2->gc_tls_common.heap.free_stacks[p]; - size_t n_to_free; - if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - n_to_free = al->len; // not alive yet or dead, so it does not need these anymore - } - else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { - n_to_free = al->len / 2; - if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL)) - n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL; - } - else { - n_to_free = 0; - } - for (int n = 0; n < n_to_free; n++) { - void *stk = small_arraylist_pop(al); - free_stack(stk, pool_sizes[p]); - } - if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - small_arraylist_free(al); - 
} - } - } - if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - small_arraylist_free(ptls2->gc_tls_common.heap.free_stacks); - } - - small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; - size_t n = 0; - size_t ndel = 0; - size_t l = live_tasks->len; - void **lst = live_tasks->items; - if (l == 0) - continue; - while (1) { - jl_task_t *t = (jl_task_t*)lst[n]; - assert(jl_is_task(t)); - if (gc_marked(jl_astaggedvalue(t)->bits.gc)) { - if (t->ctx.stkbuf == NULL) - ndel++; // jl_release_task_stack called - else - n++; - } - else { - ndel++; - void *stkbuf = t->ctx.stkbuf; - size_t bufsz = t->ctx.bufsz; - if (stkbuf) { - t->ctx.stkbuf = NULL; - _jl_free_stack(ptls2, stkbuf, bufsz); - } -#ifdef _COMPILER_TSAN_ENABLED_ - if (t->ctx.tsan_state) { - __tsan_destroy_fiber(t->ctx.tsan_state); - t->ctx.tsan_state = NULL; - } -#endif - } - if (n >= l - ndel) - break; - void *tmp = lst[n]; - lst[n] = lst[n + ndel]; - lst[n + ndel] = tmp; - } - live_tasks->len -= ndel; - } - jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, -1); -} - // Builds a list of the live tasks. Racy: `live_tasks` can expand at any time. arraylist_t *jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT { diff --git a/src/gc-stock.c b/src/gc-stock.c index 61a013f347975..77fad28ab287b 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -1019,22 +1019,102 @@ void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT } } -void sweep_mtarraylist_buffers(void) JL_NOTSAFEPOINT -{ - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls = gc_all_tls_states[i]; - if (ptls == NULL) { +extern const unsigned pool_sizes[]; + +void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT +{ + // Stack sweeping algorithm: + // // deallocate stacks if we have too many sitting around unused + // for (stk in halfof(free_stacks)) + // free_stack(stk, pool_sz); + // // then sweep the task stacks + // for (t in live_tasks) + // if (!gc-marked(t)) + // stkbuf = t->stkbuf + // bufsz = t->bufsz + // if (stkbuf) + // push(free_stacks[sz], stkbuf) + jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, 1); + while (1) { + int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1); + if (i < 0) + break; + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) continue; + assert(gc_n_threads); + // free half of stacks that remain unused since last sweep + if (i == jl_atomic_load_relaxed(&gc_stack_free_idx)) { + for (int p = 0; p < JL_N_STACK_POOLS; p++) { + small_arraylist_t *al = &ptls2->gc_tls_common.heap.free_stacks[p]; + size_t n_to_free; + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + n_to_free = al->len; // not alive yet or dead, so it does not need these anymore + } + else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { + n_to_free = al->len / 2; + if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL)) + n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL; + } + else { + n_to_free = 0; + } + for (int n = 0; n < n_to_free; n++) { + void *stk = small_arraylist_pop(al); + free_stack(stk, pool_sizes[p]); + } + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + small_arraylist_free(al); + } + } } - small_arraylist_t *buffers = &ptls->lazily_freed_mtarraylist_buffers; - void *buf; - while ((buf = small_arraylist_pop(buffers)) != NULL) { - free(buf); + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + small_arraylist_free(ptls2->gc_tls_common.heap.free_stacks); } + + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; + size_t n = 0; + size_t ndel = 0; + size_t l = live_tasks->len; + void 
**lst = live_tasks->items; + if (l == 0) + continue; + while (1) { + jl_task_t *t = (jl_task_t*)lst[n]; + assert(jl_is_task(t)); + if (gc_marked(jl_astaggedvalue(t)->bits.gc)) { + if (t->ctx.stkbuf == NULL) + ndel++; // jl_release_task_stack called + else + n++; + } + else { + ndel++; + void *stkbuf = t->ctx.stkbuf; + size_t bufsz = t->ctx.bufsz; + if (stkbuf) { + t->ctx.stkbuf = NULL; + _jl_free_stack(ptls2, stkbuf, bufsz); + } +#ifdef _COMPILER_TSAN_ENABLED_ + if (t->ctx.tsan_state) { + __tsan_destroy_fiber(t->ctx.tsan_state); + t->ctx.tsan_state = NULL; + } +#endif + } + if (n >= l - ndel) + break; + void *tmp = lst[n]; + lst[n] = lst[n + ndel]; + lst[n + ndel] = tmp; + } + live_tasks->len -= ndel; } + jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, -1); } -void sweep_stack_pools_and_mtarraylist_buffers(jl_ptls_t ptls) JL_NOTSAFEPOINT +JL_DLLEXPORT void jl_gc_sweep_stack_pools_and_mtarraylist_buffers(jl_ptls_t ptls) JL_NOTSAFEPOINT { // initialize ptls index for parallel sweeping of stack pools assert(gc_n_threads); @@ -3096,7 +3176,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) current_sweep_full = sweep_full; sweep_weak_refs(); uint64_t stack_pool_time = jl_hrtime(); - sweep_stack_pools_and_mtarraylist_buffers(ptls); + jl_gc_sweep_stack_pools_and_mtarraylist_buffers(ptls); stack_pool_time = jl_hrtime() - stack_pool_time; gc_num.total_stack_pool_sweep_time += stack_pool_time; gc_num.stack_pool_sweep_time = stack_pool_time; @@ -3655,61 +3735,6 @@ JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void) // allocation wrappers that add to gc pressure -JL_DLLEXPORT void *jl_malloc(size_t sz) -{ - return jl_gc_counted_malloc(sz); -} - -//_unchecked_calloc does not check for potential overflow of nm*sz -STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { - size_t nmsz = nm*sz; - return jl_gc_counted_calloc(nmsz, 1); -} - -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) -{ - if (nm > SSIZE_MAX/sz) - return NULL; - return _unchecked_calloc(nm, sz); -} - -JL_DLLEXPORT void jl_free(void *p) -{ - if (p != NULL) { - size_t sz = memory_block_usable_size(p, 0); - free(p); - jl_task_t *ct = jl_get_current_task(); - if (ct != NULL) - jl_batch_accum_free_size(ct->ptls, sz); - } -} - -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) -{ - size_t old = p ? 
memory_block_usable_size(p, 0) : 0; - void *data = realloc(p, sz); - jl_task_t *ct = jl_get_current_task(); - if (data != NULL && ct != NULL) { - sz = memory_block_usable_size(data, 0); - jl_ptls_t ptls = ct->ptls; - maybe_collect(ptls); - if (!(sz < old)) - jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old)); - jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, - jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1); - - int64_t diff = sz - old; - if (diff < 0) { - jl_batch_accum_free_size(ptls, -diff); - } - else { - jl_batch_accum_heap_size(ptls, diff); - } - } - return data; -} - JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { void *data = malloc(sz); @@ -3746,12 +3771,34 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) { - return jl_free(p); + free(p); + jl_task_t *ct = jl_get_current_task(); + if (ct != NULL) + jl_batch_accum_free_size(ct->ptls, sz); } JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) { - return jl_realloc(p, sz); + void *data = realloc(p, sz); + jl_task_t *ct = jl_get_current_task(); + if (data != NULL && ct != NULL) { + sz = memory_block_usable_size(data, 0); + jl_ptls_t ptls = ct->ptls; + maybe_collect(ptls); + if (!(sz < old)) + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old)); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1); + int64_t diff = sz - old; + if (diff < 0) { + jl_batch_accum_free_size(ptls, -diff); + } + else { + jl_batch_accum_heap_size(ptls, diff); + } + } + return data; } // allocating blocks for Arrays and Strings @@ -4015,12 +4062,20 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) return sizeof(bigval_t); } - JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); } +void jl_gc_notify_image_load(const char* img_data, size_t len) +{ + // Do nothing +} + +JL_DLLEXPORT const char* jl_gc_active_impl(void) { + return "Built with stock GC"; +} + #ifdef __cplusplus } #endif diff --git a/src/gc-stock.h b/src/gc-stock.h index b9a2e720f120a..d478ee1366da0 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -5,7 +5,6 @@ . non-moving, precise mark and sweep collector . 
pool-allocates small objects, keeps big objects on a simple list */ - #ifndef JL_GC_H #define JL_GC_H @@ -20,6 +19,7 @@ #include "julia_internal.h" #include "julia_assert.h" #include "threading.h" +#include "gc-common.h" #ifdef __cplusplus extern "C" { @@ -85,27 +85,6 @@ typedef struct _jl_gc_chunk_t { extern uintptr_t gc_bigval_sentinel_tag; -JL_EXTENSION typedef struct _bigval_t { - struct _bigval_t *next; - struct _bigval_t *prev; - size_t sz; -#ifdef _P64 // Add padding so that the value is 64-byte aligned - // (8 pointers of 8 bytes each) - (4 other pointers in struct) - void *_padding[8 - 4]; -#else - // (16 pointers of 4 bytes each) - (4 other pointers in struct) - void *_padding[16 - 4]; -#endif - //struct jl_taggedvalue_t <>; - union { - uintptr_t header; - struct { - uintptr_t gc:2; - } bits; - }; - // must be 64-byte aligned here, in 32 & 64 bit modes -} bigval_t; - // pool page metadata typedef struct _jl_gc_pagemeta_t { // next metadata structure in per-thread list @@ -519,9 +498,6 @@ extern uv_cond_t gc_threads_cond; extern uv_sem_t gc_sweep_assists_needed; extern _Atomic(int) gc_n_threads_marking; extern _Atomic(int) gc_n_threads_sweeping_pools; -extern _Atomic(int) gc_n_threads_sweeping_stacks; -extern _Atomic(int) gc_ptls_sweep_idx; -extern _Atomic(int) gc_stack_free_idx; extern _Atomic(int) n_threads_running; extern uv_barrier_t thread_init_done; void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); diff --git a/src/gc-tls-mmtk.h b/src/gc-tls-mmtk.h new file mode 100644 index 0000000000000..5b69aef5d55fb --- /dev/null +++ b/src/gc-tls-mmtk.h @@ -0,0 +1,23 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#ifndef JL_GC_TLS_H +#define JL_GC_TLS_H + +#include +#include "mmtkMutator.h" +#include "julia_atomics.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + MMTkMutatorContext mmtk_mutator; + _Atomic(size_t) malloc_sz_since_last_poll; +} jl_gc_tls_states_t; + +#ifdef __cplusplus +} +#endif + +#endif // JL_GC_TLS_H diff --git a/src/gc-tls.h b/src/gc-tls-stock.h similarity index 100% rename from src/gc-tls.h rename to src/gc-tls-stock.h diff --git a/src/julia_internal.h b/src/julia_internal.h index 4741316093f95..3a201e5d3201b 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -226,7 +226,7 @@ extern volatile int profile_all_tasks; // Ensures that we can safely read the `live_tasks`field of every TLS when profiling. // We want to avoid the case that a GC gets interleaved with `jl_profile_task` and shrinks // the `live_tasks` array while we are reading it or frees tasks that are being profiled. -// Because of that, this lock must be held in `jl_profile_task` and `sweep_stack_pools_and_mtarraylist_buffers`. +// Because of that, this lock must be held in `jl_profile_task` and `jl_gc_sweep_stack_pools_and_mtarraylist_buffers`. extern uv_mutex_t live_tasks_lock; // Ensures that we can safely write to `profile_bt_data_prof` and `profile_bt_size_cur`. 
// We want to avoid the case that:
diff --git a/src/julia_locks.h b/src/julia_locks.h
index 35bcf7dd97322..92d67b34b1692 100644
--- a/src/julia_locks.h
+++ b/src/julia_locks.h
@@ -116,7 +116,7 @@ class jl_unique_gcsafe_lock {
     {
         jl_task_t *ct = jl_current_task;
         gc_state = jl_gc_safe_enter(ct->ptls); // contains jl_gc_safepoint after enter
-        this->native = std::unique_lock(native);
+        this->native = std::unique_lock(native);
         ct->ptls->engine_nqueued++; // disables finalizers until inference is finished on this method graph
     }
     jl_unique_gcsafe_lock(jl_unique_gcsafe_lock &&native) = delete;
diff --git a/src/julia_threads.h b/src/julia_threads.h
index faa8ab9e0aaf4..b6ef65dc7fe52 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -4,7 +4,11 @@
 #ifndef JL_THREADS_H
 #define JL_THREADS_H

-#include "gc-tls.h"
+#ifndef MMTK_GC
+#include "gc-tls-stock.h"
+#else
+#include "gc-tls-mmtk.h"
+#endif
 #include "gc-tls-common.h"
 #include "julia_atomics.h"
 #ifndef _OS_WINDOWS_
diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h
index 278987858eab7..7b2a4bb033203 100644
--- a/src/llvm-gc-interface-passes.h
+++ b/src/llvm-gc-interface-passes.h
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -328,6 +329,7 @@ struct LateLowerGCFrame: private JuliaPassContext {
 private:
     CallInst *pgcstack;
+    Function *smallAllocFunc;
     void MaybeNoteDef(State &S, BBState &BBS, Value *Def, const ArrayRef &SafepointsSoFar, SmallVector &&RefinedPtr = SmallVector());
@@ -366,6 +368,7 @@ struct LateLowerGCFrame: private JuliaPassContext {
     void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef CalleeRoots);
     Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V);
     Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V);
+    Value* lowerGCAllocBytesLate(CallInst *target, Function &F);
 };

 // The final GC lowering pass. This pass lowers platform-agnostic GC
diff --git a/src/llvm-late-gc-lowering-mmtk.cpp b/src/llvm-late-gc-lowering-mmtk.cpp
new file mode 100644
index 0000000000000..5539c8dbcf153
--- /dev/null
+++ b/src/llvm-late-gc-lowering-mmtk.cpp
@@ -0,0 +1,96 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#include "llvm-gc-interface-passes.h"
+
+Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
+{
+ assert(target->arg_size() == 3);
+
+ IRBuilder<> builder(target);
+ auto ptls = target->getArgOperand(0);
+ auto type = target->getArgOperand(2);
+ if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
+ size_t sz = (size_t)CI->getZExtValue();
+ // This is strongly architecture and OS dependent
+ int osize;
+ int offset = jl_gc_classify_pools(sz, &osize);
+ if (offset >= 0) {
+ // In this case, instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc,
+ // we do a slowpath/fastpath check and lower it to jl_gc_small_alloc only on the slowpath,
+ // while the fastpath bumps the allocation cursor in place and returns the result directly.
+ auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
+ auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);
+
+ // Controls whether we generate the fastpath allocation sequence here. It should always be enabled for MMTk;
+ // setting it to false increases allocation overhead considerably and should only be done for debugging.
+ const bool INLINE_FASTPATH_ALLOCATION = true;
+
+ if (INLINE_FASTPATH_ALLOCATION) {
+ // Assuming we use the first immix allocator.
+ // FIXME: We should get the allocator index and type from MMTk.
+ auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
+
+ auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
+ auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));
+
+ auto cursor_ptr = builder.CreateInBoundsGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
+ auto cursor = builder.CreateAlignedLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, Align(sizeof(void *)), "cursor");
+
+ // offset = 8 (size of the object tag that precedes the payload)
+ auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
+ auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
+ auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
+ // alignment 16 (15 = 16 - 1)
+ auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
+ auto result = builder.CreateNSWAdd(cursor, delta, "result");
+
+ auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
+
+ auto limit_ptr = builder.CreateInBoundsGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
+ auto limit = builder.CreateAlignedLoad(Type::getInt64Ty(target->getContext()), limit_ptr, Align(sizeof(void *)), "limit");
+
+ auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
+
+ auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
+ auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
+
+ auto next_instr = target->getNextNode();
+ SmallVector<uint32_t, 2> Weights{1, 9};
+
+ MDBuilder MDB(F.getContext());
+ SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));
+
+ builder.SetInsertPoint(next_instr);
+ auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");
+
+ // slowpath
+ builder.SetInsertPoint(slowpath);
+ auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
+ auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
+ new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
+ builder.CreateBr(next_instr->getParent());
+
+ // fastpath
+ builder.SetInsertPoint(fastpath);
+ builder.CreateStore(new_cursor, cursor_ptr);
+
+ // ptls->gc_tls_common.gc_num.allocd += osize;
+ auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
+ auto pool_alloc_tls = builder.CreateInBoundsGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
+ auto pool_allocd = builder.CreateAlignedLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls, Align(sizeof(void *)));
+ auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
+ builder.CreateStore(pool_allocd_total, pool_alloc_tls);
+
+ auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
+ auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
+ builder.CreateBr(next_instr->getParent());
+
+ phiNode->addIncoming(new_call, slowpath);
+ phiNode->addIncoming(v_as_ptr, fastpath);
+
phiNode->takeName(target); + return phiNode; + } + } + } + return target; +} diff --git a/src/llvm-late-gc-lowering-stock.cpp b/src/llvm-late-gc-lowering-stock.cpp new file mode 100644 index 0000000000000..2a11487773396 --- /dev/null +++ b/src/llvm-late-gc-lowering-stock.cpp @@ -0,0 +1,9 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include "llvm-gc-interface-passes.h" + +Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) +{ + // Do nothing for the stock GC + return target; +} diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index ff2cac6e49406..7d6fba65a79e7 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2446,6 +2446,7 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(ArrayRef Colors, int PreAss bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { initAll(*F.getParent()); + smallAllocFunc = getOrDeclare(jl_well_known::GCSmallAlloc); LLVM_DEBUG(dbgs() << "GC ROOT PLACEMENT: Processing function " << F.getName() << "\n"); if (!pgcstack_getter && !adoptthread_func) return CleanupIR(F, nullptr, CFGModified); @@ -2460,6 +2461,31 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { std::map> CallFrames; // = OptimizeCallFrames(S, Ordering); PlaceRootsAndUpdateCalls(Colors.first, Colors.second, S, CallFrames); CleanupIR(F, &S, CFGModified); + + + // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk + // For now, we do nothing for the Stock GC + auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes); + + if (GCAllocBytes) { + for (auto it = GCAllocBytes->user_begin(); it != GCAllocBytes->user_end(); ) { + if (auto *CI = dyn_cast(*it)) { + *CFGModified = true; + + assert(CI->getCalledOperand() == GCAllocBytes); + + auto newI = lowerGCAllocBytesLate(CI, F); + if (newI != CI) { + ++it; + CI->replaceAllUsesWith(newI); + CI->eraseFromParent(); + continue; + } + } + ++it; + } + } + return true; } diff --git a/src/staticdata.c b/src/staticdata.c index 1665b5be4ed0f..d2309e3a5d053 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -657,6 +657,7 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -4224,6 +4225,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); diff --git a/src/threading.c b/src/threading.c index ac9cc276d613a..77956786af3f4 100644 --- a/src/threading.c +++ b/src/threading.c @@ -773,6 +773,10 @@ void jl_init_threading(void) } int16_t ngcthreads = jl_n_markthreads + jl_n_sweepthreads; + if (strstr(jl_gc_active_impl(), "MMTk")) { + ngcthreads = 0; + } + jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads; jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); jl_n_threads_per_pool[0] = nthreadsi; diff --git a/stdlib/InteractiveUtils/src/InteractiveUtils.jl b/stdlib/InteractiveUtils/src/InteractiveUtils.jl index aa13fa3cdd31d..4a320282610cd 100644 --- a/stdlib/InteractiveUtils/src/InteractiveUtils.jl +++ b/stdlib/InteractiveUtils/src/InteractiveUtils.jl @@ 
-166,6 +166,7 @@ function versioninfo(io::IO=stdout; verbose::Bool=false) end println(io, " WORD_SIZE: ", Sys.WORD_SIZE) println(io, " LLVM: libLLVM-",Base.libllvm_version," (", Sys.JIT, ", ", Sys.CPU_NAME, ")") + println(io, " GC: ", unsafe_string(ccall(:jl_gc_active_impl, Ptr{UInt8}, ()))) println(io, """Threads: $(Threads.nthreads(:default)) default, $(Threads.nthreads(:interactive)) interactive, \ $(Threads.ngcthreads()) GC (on $(Sys.CPU_THREADS) virtual cores)""")
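
Note on the inlined Immix fastpath: the IR built by lowerGCAllocBytesLate above is easier to audit next to ordinary C. The sketch below is not part of the patch; it merely restates the pointer arithmetic of the generated fastpath, assuming a 64-bit build where the 8-byte object tag precedes the payload and pool cells are 16-byte aligned. The names immix_fastpath_sketch, cursor_slot, and osize are illustrative only and do not exist in the tree.

#include <stdint.h>

// Returns the tagged-object address on the fastpath, or 0 when the bump
// would cross `limit` and the generated code must instead call the slowpath
// (jl_gc_small_alloc via smallAllocFunc).
static uintptr_t immix_fastpath_sketch(uintptr_t *cursor_slot, uintptr_t limit, uint64_t osize)
{
    uintptr_t cursor = *cursor_slot;
    // delta = (-8 - cursor) & 15: pad so that cursor + delta + 8 (the payload after
    // the 8-byte tag) is 16-byte aligned; this mirrors delta_offset/delta_cursor/delta_op.
    uintptr_t delta = (uintptr_t)(0 - (cursor + 8)) & (uintptr_t)15;
    uintptr_t result = cursor + delta;       // aligned start of the new cell
    uintptr_t new_cursor = result + osize;   // bump by the pool size class
    if (new_cursor > limit)                  // the pass emits a signed compare here
        return 0;                            // slowpath: this block is exhausted
    *cursor_slot = new_cursor;               // fastpath: publish the bumped cursor
    // The jl_value_t* handed back skips the tag word, i.e. result + sizeof(jl_taggedvalue_t),
    // which is 8 bytes on 64-bit builds.
    return result + 8;
}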