From df35d17559c67c7fc61ca683ea10fc80addb71bb Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 28 Aug 2024 00:44:09 +0000 Subject: [PATCH 01/38] Refactoring to be considered before adding MMTk --- src/gc-common.c | 156 +++++++++++++++++++++++++++++++++++++++++++ src/gc-common.h | 6 ++ src/gc-debug.c | 41 +----------- src/gc-interface.h | 12 ++++ src/gc-stacks.c | 4 +- src/gc-stock.c | 156 ++++++++++++------------------------------- src/gc-stock.h | 21 ------ src/julia.h | 2 +- src/julia_internal.h | 26 +------- src/scheduler.c | 11 +++ src/stackwalk.c | 4 +- src/staticdata.c | 2 + 12 files changed, 237 insertions(+), 204 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index ee461b576ea9e..2ec167caa667a 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -20,6 +20,11 @@ extern "C" { jl_gc_num_t gc_num = {0}; +JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) +{ + return gc_num.total_time; +} + // =========================================================================== // // GC Callbacks // =========================================================================== // @@ -489,6 +494,87 @@ jl_ptls_t* gc_all_tls_states; // MISC // =========================================================================== // +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return jl_gc_new_weakref_th(ptls, value); +} + +JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc(ptls, sz, ty); +} + +JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return jl_gc_alloc(ptls, sz, NULL); +} + +// allocation wrappers that save the size of allocations, to allow using +// jl_gc_counted_* functions with a libc-compatible API. 
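+// Layout sketch (the wrappers below assume JL_SMALL_BYTE_ALIGNMENT == 16, i.e.
+// two int64_t header words in front of the payload):
+//   [ int64_t size | int64_t padding | payload ... ]
+//   ^ malloc'd block                 ^ pointer handed to the caller (p + 2)
+// jl_free and jl_realloc recover the stored size as ((int64_t *)payload)[-2].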
+ +JL_DLLEXPORT void *jl_malloc(size_t sz) +{ + int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); + if (p == NULL) + return NULL; + p[0] = sz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +//_unchecked_calloc does not check for potential overflow of nm*sz +STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { + size_t nmsz = nm*sz; + int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); + if (p == NULL) + return NULL; + p[0] = nmsz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) +{ + if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) + return NULL; + return _unchecked_calloc(nm, sz); +} + +JL_DLLEXPORT void jl_free(void *p) +{ + if (p != NULL) { + int64_t *pp = (int64_t *)p - 2; + size_t sz = pp[0]; + jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); + } +} + +JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) +{ + int64_t *pp; + size_t szold; + if (p == NULL) { + pp = NULL; + szold = 0; + } + else { + pp = (int64_t *)p - 2; + szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; + } + int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); + if (pnew == NULL) + return NULL; + pnew[0] = sz; + return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +// allocator entry points + +JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc_(ptls, sz, ty); +} + const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { @@ -501,6 +587,76 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } +size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT +{ + const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; + size_t sz = layout->size * m->length; + if (layout->flags.arrayelem_isunion) + // account for isbits Union array selector bytes + sz += m->length; + return sz; +} + +// tracking Memorys with malloc'd storage +void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ + // This is **NOT** a GC safe point. 
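+    // Reuse a node from the per-thread freelist if one is available;
+    // otherwise allocate a fresh node with malloc_s.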
+ mallocmemory_t *ma; + if (ptls->gc_tls.heap.mafreelist == NULL) { + ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); + } + else { + ma = ptls->gc_tls.heap.mafreelist; + ptls->gc_tls.heap.mafreelist = ma->next; + } + ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); + ma->next = ptls->gc_tls.heap.mallocarrays; + ptls->gc_tls.heap.mallocarrays = ma; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + +// gc-debug common functions +// --- + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + #ifdef __cplusplus } #endif diff --git a/src/gc-common.h b/src/gc-common.h index 4d53830442a7d..154b9659e9ccb 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -53,6 +53,12 @@ extern jl_gc_callback_list_t *gc_cblist_notify_gc_pressure; // malloc wrappers, aligned allocation // =========================================================================== // +// data structure for tracking malloc'd genericmemory. 
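+// `a` is a tagged pointer: the lowest bit records whether the buffer was
+// allocated aligned (set in jl_gc_track_malloced_genericmemory); recover with
+//   int aligned = (uintptr_t)ma->a & 1;
+//   jl_genericmemory_t *m = (jl_genericmemory_t*)((uintptr_t)ma->a & ~(uintptr_t)1);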
+typedef struct _mallocmemory_t { + jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory + struct _mallocmemory_t *next; +} mallocmemory_t; + #if defined(_OS_WINDOWS_) STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align) { diff --git a/src/gc-debug.c b/src/gc-debug.c index 19dd93af5f236..d05fb4b49e9f7 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1105,46 +1105,7 @@ void gc_count_pool(void) jl_safe_printf("************************\n"); } -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT -{ - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; -} - -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT -{ - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; -} - -static int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; -} - -JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { - return gc_logging_enabled; -} +extern int gc_logging_enabled; void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { diff --git a/src/gc-interface.h b/src/gc-interface.h index e543b4b5879f1..682f22344d69d 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -128,6 +128,13 @@ JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void); // Allocation // ========================================================================= // +// On GCC, this function is inlined when sz is constant (see julia_internal.h) +// In general, this function should implement allocation and should use the specific GC's logic +// to decide whether to allocate a small or a large object. Finally, note that this function +// **must** also set the type of the returning object to be `ty`. The type `ty` may also be used to record +// an allocation of that type in the allocation profiler. +struct _jl_value_t *jl_gc_alloc_(struct _jl_tls_states_t * ptls, size_t sz, void *ty); + // Allocates small objects and increments Julia allocation counterst. Size of the object // header must be included in the object size. The (possibly unused in some implementations) // offset to the arena in which we're allocating is passed in the second parameter, and the @@ -211,6 +218,11 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; +// This function notifies the GC about memory addresses that are set when loading the boot image. +// The GC may use that information to, for instance, determine that such objects should +// be treated as marked and belonged to the old generation in nursery collections. 
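+// (The stock GC stubs this out as a no-op; see the gc-stock.c hunk in this patch.)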
+void jl_gc_notify_image_load(const char* img_data, size_t len); + // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 783129ea97693..8c44b65284386 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -46,7 +46,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { VirtualFree(stkbuf, 0, MEM_RELEASE); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); @@ -81,7 +81,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT return stk; } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { munmap(stkbuf, bufsz); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); diff --git a/src/gc-stock.c b/src/gc-stock.c index d25f8917f302d..4a8c6fe7decc5 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -553,24 +553,6 @@ static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT gc_time_big_end(); } -// tracking Memorys with malloc'd storage - -void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ - // This is **NOT** a GC safe point. - mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { - ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); - } - else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; - } - ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; -} - - void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; @@ -647,17 +629,6 @@ void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT reset_thread_gc_counts(); } -size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT -{ - const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; - size_t sz = layout->size * m->length; - if (layout->flags.arrayelem_isunion) - // account for isbits Union array selector bytes - sz += m->length; - return sz; -} - - static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT { assert(jl_is_genericmemory(v)); @@ -816,6 +787,29 @@ jl_value_t *jl_gc_small_alloc_noinline(jl_ptls_t ptls, int offset, int osize) { return jl_gc_small_alloc_inner(ptls, offset, osize); } +// Size does NOT include the type tag!! +inline jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) +{ + jl_value_t *v; + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + if (sz <= GC_MAX_SZCLASS) { + int pool_id = jl_gc_szclass(allocsz); + jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; + int osize = jl_gc_sizeclasses[pool_id]; + // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in + // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) 
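+        // The offset (char*)p - (char*)ptls locates this size pool relative to
+        // the thread-local state, letting the allocator find the pool again.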
+ v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); + } + else { + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + v = jl_gc_big_alloc_noinline(ptls, allocsz); + } + jl_set_typeof(v, ty); + maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); + return v; +} + int jl_gc_classify_pools(size_t sz, int *osize) { if (sz > GC_MAX_SZCLASS) @@ -2792,6 +2786,21 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } +int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + // collector entry point and control _Atomic(uint32_t) jl_gc_disable_counter = 1; @@ -2830,11 +2839,6 @@ JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT *bytes = (num.total_allocd + num.deferred_alloc + num.allocd); } -JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) -{ - return gc_num.total_time; -} - JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { jl_gc_num_t num = gc_num; @@ -3386,13 +3390,6 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) gc_mark_roots(mq); } -// allocator entry points - -JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) -{ - return jl_gc_alloc_(ptls, sz, ty); -} - // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { @@ -3674,63 +3671,6 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size return data; } -// allocation wrappers that save the size of allocations, to allow using -// jl_gc_counted_* functions with a libc-compatible API. 
- -JL_DLLEXPORT void *jl_malloc(size_t sz) -{ - int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); - if (p == NULL) - return NULL; - p[0] = sz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -//_unchecked_calloc does not check for potential overflow of nm*sz -STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { - size_t nmsz = nm*sz; - int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); - if (p == NULL) - return NULL; - p[0] = nmsz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) -{ - if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) - return NULL; - return _unchecked_calloc(nm, sz); -} - -JL_DLLEXPORT void jl_free(void *p) -{ - if (p != NULL) { - int64_t *pp = (int64_t *)p - 2; - size_t sz = pp[0]; - jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); - } -} - -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) -{ - int64_t *pp; - size_t szold; - if (p == NULL) { - pp = NULL; - szold = 0; - } - else { - pp = (int64_t *)p - 2; - szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; - } - int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); - if (pnew == NULL) - return NULL; - pnew[0] = sz; - return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - // allocating blocks for Arrays and Strings JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) @@ -3864,18 +3804,6 @@ jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT return jl_valueof(o); } -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - -JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_alloc(ptls, sz, NULL); -} - JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) { if (jl_is_initialized()) { @@ -4003,14 +3931,14 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) } -JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { - return jl_gc_alloc(ptls, sz, ty); + arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +void jl_gc_notify_image_load(const char* img_data, size_t len) { - arraylist_push(&ptls->gc_tls.sweep_objs, obj); + // Do nothing } #ifdef __cplusplus diff --git a/src/gc-stock.h b/src/gc-stock.h index 45c93bf4289ae..3f3900b349bcf 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -106,12 +106,6 @@ JL_EXTENSION typedef struct _bigval_t { // must be 64-byte aligned here, in 32 & 64 bit modes } bigval_t; -// data structure for tracking malloc'd genericmemory. 
-typedef struct _mallocmemory_t { - jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory - struct _mallocmemory_t *next; -} mallocmemory_t; - // pool page metadata typedef struct _jl_gc_pagemeta_t { // next metadata structure in per-thread list @@ -428,21 +422,6 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } -STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; -} - STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index abb8a57ff13b0..db57db1fbeb38 100644 --- a/src/julia.h +++ b/src/julia.h @@ -850,7 +850,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index f00667d016796..edddb68754fc3 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -513,30 +513,6 @@ STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass_align8(unsigned sz) JL_NOTSAFE #define GC_MAX_SZCLASS (2032-sizeof(void*)) static_assert(ARRAY_CACHE_ALIGN_THRESHOLD > GC_MAX_SZCLASS, ""); - -// Size does NOT include the type tag!! -STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) -{ - jl_value_t *v; - const size_t allocsz = sz + sizeof(jl_taggedvalue_t); - if (sz <= GC_MAX_SZCLASS) { - int pool_id = jl_gc_szclass(allocsz); - jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; - int osize = jl_gc_sizeclasses[pool_id]; - // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in - // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) - v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); - } - else { - if (allocsz < sz) // overflow in adding offs, size was "negative" - jl_throw(jl_memory_exception); - v = jl_gc_big_alloc_noinline(ptls, allocsz); - } - jl_set_typeof(v, ty); - maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); - return v; -} - /* Programming style note: When using jl_gc_alloc, do not JL_GC_PUSH it into a * gc frame, until it has been fully initialized. An uninitialized value in a * gc frame can crash upon encountering the first safepoint. By delaying use of @@ -1074,7 +1050,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern _Atomic(uint32_t) jl_gc_disable_counter; +extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. 
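
A minimal usage sketch for the allocation entry point kept above (hypothetical
caller, not part of the patch; `dt` stands for some jl_datatype_t* with two
pointer fields). Per the programming-style note retained in julia_internal.h,
the object must be fully initialized before it is rooted:

    jl_ptls_t ptls = jl_current_task->ptls;
    jl_value_t *v = jl_gc_alloc(ptls, 2 * sizeof(void *), dt);
    ((jl_value_t **)v)[0] = jl_nothing; // initialize every field first; an
    ((jl_value_t **)v)[1] = jl_nothing; // uninitialized rooted value can crash at a safepoint
    JL_GC_PUSH1(&v);                    // only now is rooting safe
    /* ... use v ... */
    JL_GC_POP();
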
diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..b85a481588e4f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,9 +80,20 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } +<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; +======= +// parallel task runtime +// --- + +JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return cong(max, &ptls->rngseed); +} +>>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) diff --git a/src/stackwalk.c b/src/stackwalk.c index 6aa36fa8b499c..5f28b61c4a8fe 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -5,7 +5,7 @@ utilities for walking the stack and looking up information about code addresses */ #include -#include "gc-stock.h" +#include "gc-common.h" #include "julia.h" #include "julia_internal.h" #include "threading.h" @@ -1294,6 +1294,8 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; +extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; +extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT diff --git a/src/staticdata.c b/src/staticdata.c index 363aa46b62221..e07a5365bf06f 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -654,6 +654,7 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -3899,6 +3900,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From d2f2b8d9c477514e93009d0b99e2ffe65bcc9831 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 29 Aug 2024 04:57:59 +0000 Subject: [PATCH 02/38] Removing jl_gc_notify_image_load, since it's a new function and not part of the refactoring --- src/gc-interface.h | 5 ----- src/gc-stock.c | 5 ----- src/staticdata.c | 2 -- 3 files changed, 12 deletions(-) diff --git a/src/gc-interface.h b/src/gc-interface.h index 682f22344d69d..25ffed4524f0c 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -218,11 +218,6 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; -// This function notifies the GC about memory addresses that are set when loading the boot image. -// The GC may use that information to, for instance, determine that such objects should -// be treated as marked and belonged to the old generation in nursery collections. 
-void jl_gc_notify_image_load(const char* img_data, size_t len); - // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stock.c b/src/gc-stock.c index 4a8c6fe7decc5..9b633cacd7870 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3936,11 +3936,6 @@ JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *o arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -void jl_gc_notify_image_load(const char* img_data, size_t len) -{ - // Do nothing -} - #ifdef __cplusplus } #endif diff --git a/src/staticdata.c b/src/staticdata.c index e07a5365bf06f..363aa46b62221 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -654,7 +654,6 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -3900,7 +3899,6 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From a42cb6410cf4f3e1773b0e41ecb5c696bc9cf836 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 2 Sep 2024 01:27:08 +0000 Subject: [PATCH 03/38] Moving gc_enable code to gc-common.c --- src/gc-common.c | 30 ++++++++++++++++++++++++++++++ src/gc-stock.c | 30 ------------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 2ec167caa667a..03c046bc300f2 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -613,6 +613,36 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +// collector entry point and control +_Atomic(uint32_t) jl_gc_disable_counter = 1; + +JL_DLLEXPORT int jl_gc_enable(int on) +{ + jl_ptls_t ptls = jl_current_task->ptls; + int prev = !ptls->disable_gc; + ptls->disable_gc = (on == 0); + if (on && !prev) { + // disable -> enable + if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { + gc_num.allocd += gc_num.deferred_alloc; + gc_num.deferred_alloc = 0; + } + } + else if (prev && !on) { + // enable -> disable + jl_atomic_fetch_add(&jl_gc_disable_counter, 1); + // check if the GC is running and wait for it to finish + jl_gc_safepoint_(ptls); + } + return prev; +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + int gc_logging_enabled = 0; JL_DLLEXPORT void jl_enable_gc_logging(int enable) { diff --git a/src/gc-stock.c b/src/gc-stock.c index 9b633cacd7870..61fc8d4e83a3a 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2801,36 +2801,6 @@ int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT return tid == concurrent_collector_thread_id; } -// collector entry point and control -_Atomic(uint32_t) jl_gc_disable_counter = 1; - -JL_DLLEXPORT int jl_gc_enable(int on) -{ - jl_ptls_t ptls = jl_current_task->ptls; - int prev = !ptls->disable_gc; - ptls->disable_gc = (on == 0); - if (on && !prev) { - // disable -> enable - if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { - gc_num.allocd += gc_num.deferred_alloc; - gc_num.deferred_alloc = 0; - } - 
} - else if (prev && !on) { - // enable -> disable - jl_atomic_fetch_add(&jl_gc_disable_counter, 1); - // check if the GC is running and wait for it to finish - jl_gc_safepoint_(ptls); - } - return prev; -} - -JL_DLLEXPORT int jl_gc_is_enabled(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return !ptls->disable_gc; -} - JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT { jl_gc_num_t num = gc_num; From 92563918292056178d6f6ed12c58a9f998ef2d54 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 16 Sep 2024 06:38:02 +0000 Subject: [PATCH 04/38] Addressing PR comments --- src/gc-common.c | 134 +++++++++++++++++++++++++------------------ src/gc-common.h | 6 ++ src/gc-debug.c | 2 - src/gc-interface.h | 30 +--------- src/gc-stock.c | 18 +----- src/gc-stock.h | 15 +++++ src/julia.h | 2 +- src/julia_internal.h | 4 +- src/stackwalk.c | 10 +--- 9 files changed, 110 insertions(+), 111 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 03c046bc300f2..046feae6aa4c5 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -491,15 +491,9 @@ int gc_n_threads; jl_ptls_t* gc_all_tls_states; // =========================================================================== // -// MISC +// Allocation // =========================================================================== // -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) { return jl_gc_alloc(ptls, sz, ty); @@ -575,17 +569,9 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) return jl_gc_alloc_(ptls, sz, ty); } -const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 -JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT -{ - return jl_buff_tag; -} - -// callback for passing OOM errors from gmp -JL_DLLEXPORT void jl_throw_out_of_memory_error(void) -{ - jl_throw(jl_memory_exception); -} +// =========================================================================== // +// Generic Memory +// =========================================================================== // size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT { @@ -613,6 +599,66 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +// =========================================================================== // +// GC Debug +// =========================================================================== // + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + +// 
=========================================================================== // +// GC Control +// =========================================================================== // + +JL_DLLEXPORT uint32_t jl_get_gc_disable_counter(void) { + return jl_atomic_load_acquire(&jl_gc_disable_counter); +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + + // collector entry point and control _Atomic(uint32_t) jl_gc_disable_counter = 1; @@ -637,54 +683,30 @@ JL_DLLEXPORT int jl_gc_enable(int on) return prev; } -JL_DLLEXPORT int jl_gc_is_enabled(void) +// =========================================================================== // +// MISC +// =========================================================================== // + +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) { jl_ptls_t ptls = jl_current_task->ptls; - return !ptls->disable_gc; -} - -int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; + return jl_gc_new_weakref_th(ptls, value); } -JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { - return gc_logging_enabled; +JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) { + return ijl_small_typeof; } -// gc-debug common functions -// --- - -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 +JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; + return jl_buff_tag; } -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +// callback for passing OOM errors from gmp +JL_DLLEXPORT void jl_throw_out_of_memory_error(void) { - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; + jl_throw(jl_memory_exception); } #ifdef __cplusplus diff --git a/src/gc-common.h b/src/gc-common.h index 154b9659e9ccb..32b7470b13a58 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -179,4 +179,10 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o); extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; +// =========================================================================== // +// Logging +// =========================================================================== // + +extern int gc_logging_enabled; + #endif // JL_GC_COMMON_H diff --git a/src/gc-debug.c b/src/gc-debug.c index d05fb4b49e9f7..7c479484cde45 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1105,8 +1105,6 @@ void gc_count_pool(void) jl_safe_printf("************************\n"); } -extern int gc_logging_enabled; - void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t 
live_bytes) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { return; diff --git a/src/gc-interface.h b/src/gc-interface.h index 25ffed4524f0c..0e9ce32697f35 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -94,6 +94,8 @@ JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem); // should run a collection cycle again (e.g. a full mark right after a full sweep to ensure // we do a full heap traversal). JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection); +// Returns whether the thread with `tid` is a collector thread +JL_DLLEXPORT int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT; // ========================================================================= // // Metrics @@ -162,26 +164,6 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz); JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz); // Wrapper around Libc realloc that updates Julia allocation counters. JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz); -// Wrapper around Libc malloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_malloc(size_t sz); -// Wrapper around Libc calloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz); -// Wrapper around Libc free that takes a pointer to the payload of a memory region allocated -// with jl_malloc or jl_calloc, and uses the size information stored in the first machine -// words of the memory buffer update Julia allocation counters, and then frees the -// corresponding memory buffer. -JL_DLLEXPORT void jl_free(void *p); -// Wrapper around Libc realloc that takes a memory region allocated with jl_malloc or -// jl_calloc, and uses the size information stored in the first machine words of the memory -// buffer to update Julia allocation counters, reallocating the corresponding memory buffer -// in the end. -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz); // Wrapper around Libc malloc that's used to dynamically allocate memory for Arrays and // Strings. It increments Julia allocation counters and should check whether we're close to // the Julia heap target, and therefore, whether we should run a collection. Note that this @@ -195,14 +177,6 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz); // thread-local allocator of the thread referenced by the first jl_ptls_t argument. JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref_th(struct _jl_tls_states_t *ptls, struct _jl_value_t *value); -// Allocates a new weak-reference, assigns its value and increments Julia allocation -// counters. If thread-local allocators are used, then this function should allocate in the -// thread-local allocator of the current thread. -JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref(struct _jl_value_t *value); -// Allocates an object whose size is specified by the function argument and increments Julia -// allocation counters. If thread-local allocators are used, then this function should -// allocate in the thread-local allocator of the current thread. 
-JL_DLLEXPORT struct _jl_value_t *jl_gc_allocobj(size_t sz); // Permanently allocates a memory slot of the size specified by the first parameter. This // block of memory is allocated in an immortal region that is never swept. The second // parameter specifies whether the memory should be filled with zeros. The third and fourth diff --git a/src/gc-stock.c b/src/gc-stock.c index 61fc8d4e83a3a..3ff37566dc6c7 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2786,19 +2786,8 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } -int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; +int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT { + return gc_is_parallel_collector_thread(tid) || gc_is_concurrent_collector_thread(tid); } JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT @@ -3182,8 +3171,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // free empty GC state for threads that have exited if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { // GC threads should never exit - assert(!gc_is_parallel_collector_thread(t_i)); - assert(!gc_is_concurrent_collector_thread(t_i)); + assert(!gc_is_collector_thread(t_i)); jl_thread_heap_t *heap = &ptls2->gc_tls.heap; if (heap->weak_refs.len == 0) small_arraylist_free(&heap->weak_refs); diff --git a/src/gc-stock.h b/src/gc-stock.h index 3f3900b349bcf..50eca3aadbd86 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -422,6 +422,21 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } +STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index db57db1fbeb38..abb8a57ff13b0 100644 --- a/src/julia.h +++ b/src/julia.h @@ -850,7 +850,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index edddb68754fc3..e677f40907dfd 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -367,6 +367,8 @@ extern jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT size_t jl_typeinf_world; extern _Atomic(jl_typemap_entry_t*) call_cache[N_CALL_CACHE] JL_GLOBALLY_ROOTED; +extern void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; + JL_DLLEXPORT 
extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; @@ -1050,7 +1052,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; +extern _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. diff --git a/src/stackwalk.c b/src/stackwalk.c index 5f28b61c4a8fe..a1de3a6d61a07 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1294,8 +1294,6 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; -extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; -extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT @@ -1304,12 +1302,8 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); for (size_t i = 0; i < nthreads; i++) { jl_ptls_t ptls2 = allstates[i]; - if (gc_is_parallel_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for parallel GC thread %zu\n", i + 1); - continue; - } - if (gc_is_concurrent_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for concurrent GC thread %zu\n", i + 1); + if (gc_is_collector_thread(i)) { + jl_safe_printf("==== Skipping backtrace for parallel/concurrent GC thread %zu\n", i + 1); continue; } if (ptls2 == NULL) { From ec398e1a98cf713a77f908a459ed37fd4b25af27 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 19 Sep 2024 04:18:13 +0000 Subject: [PATCH 05/38] Push resolution of merge conflict --- src/scheduler.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index b85a481588e4f..bb2f85b52283f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,20 +80,9 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; -======= -// parallel task runtime -// --- - -JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return cong(max, &ptls->rngseed); -} ->>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) From 68e5e11a229f253ec6de966a321bd9d3de453a3b Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 01:10:31 +0000 Subject: [PATCH 06/38] Removing jl_gc_mark_queue_obj_explicit extern definition from scheduler.c --- src/scheduler.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..7e23f654c2566 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,10 +80,6 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -// GC functions used -extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, - jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; - // initialize the threading infrastructure // (called only by the main thread) void jl_init_threadinginfra(void) From 
c23f0db8347f475e1eb2b37261dd4816537210fa Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 02:50:25 +0000 Subject: [PATCH 07/38] Don't need the getter function since it's possible to use jl_small_typeof directly --- src/gc-common.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 046feae6aa4c5..417f12f26d64d 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -693,10 +693,6 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) return jl_gc_new_weakref_th(ptls, value); } -JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) { - return ijl_small_typeof; -} - const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { From 4bfcfe5df056bb5066a545e29c29463722678892 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 27 Aug 2024 06:47:41 +0000 Subject: [PATCH 08/38] WIP: Adding support for MMTk/Immix --- Make.inc | 47 ++ contrib/refresh_checksums.mk | 2 +- src/Makefile | 43 +- src/builtins.c | 1 + src/gc-common.c | 70 +++ src/gc-debug.c | 4 +- src/gc-heap-snapshot.cpp | 1 - src/gc-interface.h | 3 + src/gc-mmtk.c | 843 +++++++++++++++++++++++++++++++++++ src/gc-mmtk.h | 34 ++ src/gc-page-profiler.c | 4 +- src/gc-pages.c | 4 +- src/gc-stock.c | 14 +- src/gc-stock.h | 18 +- src/gc-tls-mmtk.h | 49 ++ src/gc-tls.h | 4 + src/julia.h | 2 +- src/julia_internal.h | 2 +- src/julia_threads.h | 4 + src/stackwalk.c | 2 + src/staticdata.c | 2 + src/threading.c | 4 + 22 files changed, 1123 insertions(+), 34 deletions(-) create mode 100644 src/gc-mmtk.c create mode 100644 src/gc-mmtk.h create mode 100644 src/gc-tls-mmtk.h diff --git a/Make.inc b/Make.inc index f078a0c84f806..039755ce34098 100644 --- a/Make.inc +++ b/Make.inc @@ -86,6 +86,9 @@ HAVE_SSP := 0 WITH_GC_VERIFY := 0 WITH_GC_DEBUG_ENV := 0 +# Use MMTk GC +WITH_MMTK ?= 0 + # Enable DTrace support WITH_DTRACE := 0 @@ -790,6 +793,44 @@ JCXXFLAGS += -DGC_DEBUG_ENV JCFLAGS += -DGC_DEBUG_ENV endif +ifeq ($(WITH_MMTK), 1) +ifeq (${MMTK_JULIA_DIR},) +$(error MMTK_JULIA_DIR must be set to use MMTk) +endif +JCXXFLAGS += -DMMTK_GC +JCFLAGS += -DMMTK_GC +ifeq (${MMTK_BUILD},) +ifeq (debug,$(findstring debug,$(MAKECMDGOALS))) +MMTK_BUILD = debug +else +MMTK_BUILD = release +endif +endif +ifeq (${MMTK_PLAN},Immix) +JCXXFLAGS += -DMMTK_PLAN_IMMIX +JCFLAGS += -DMMTK_PLAN_IMMIX +endif +ifeq (${MMTK_PLAN},StickyImmix) +JCXXFLAGS += -DMMTK_PLAN_STICKYIMMIX +JCFLAGS += -DMMTK_PLAN_STICKYIMMIX +endif +MMTK_DIR = ${MMTK_JULIA_DIR}/mmtk +MMTK_API_INC = $(MMTK_DIR)/api +MMTK_JULIA_INC = ${MMTK_JULIA_DIR}/julia +ifeq ($(OS),Linux) +MMTK_LIB_NAME := libmmtk_julia.so +else +$(error "Unsupported OS for MMTk") +endif +MMTK_LIB_SRC := $(MMTK_DIR)/target/$(MMTK_BUILD)/$(MMTK_LIB_NAME) +MMTK_LIB_DST := $(BUILDROOT)/usr/lib/$(MMTK_LIB_NAME) +MMTK_LIB := -lmmtk_julia +LDFLAGS += -Wl,-rpath=$(MMTK_DIR)/target/$(MMTK_BUILD)/ +else +MMTK_JULIA_INC := +MMTK_LIB := +endif + ifeq ($(WITH_DTRACE), 1) JCXXFLAGS += -DUSE_DTRACE JCFLAGS += -DUSE_DTRACE @@ -1777,6 +1818,9 @@ PRINT_PERL = printf ' %b %b\n' $(PERLCOLOR)PERL$(ENDCOLOR) $(BINCOLOR)$(GOAL) PRINT_FLISP = printf ' %b %b\n' $(FLISPCOLOR)FLISP$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) PRINT_JULIA = printf ' %b %b\n' $(JULIACOLOR)JULIA$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) PRINT_DTRACE = printf ' %b %b\n' $(DTRACECOLOR)DTRACE$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) +ifeq ($(WITH_MMTK), 1) +PRINT_MMTK = printf ' 
%b %b\n' $(LINKCOLOR)MMTK$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) +endif else QUIET_MAKE = @@ -1787,6 +1831,9 @@ PRINT_PERL = echo '$(subst ','\'',$(1))'; $(1) PRINT_FLISP = echo '$(subst ','\'',$(1))'; $(1) PRINT_JULIA = echo '$(subst ','\'',$(1))'; $(1) PRINT_DTRACE = echo '$(subst ','\'',$(1))'; $(1) +ifeq ($(WITH_MMTK), 1) +PRINT_MMTK = echo '$(subst ','\'',$(1))'; $(1) +endif endif diff --git a/contrib/refresh_checksums.mk b/contrib/refresh_checksums.mk index f67088141ccd4..bf99c0fad9da2 100644 --- a/contrib/refresh_checksums.mk +++ b/contrib/refresh_checksums.mk @@ -24,7 +24,7 @@ CLANG_TRIPLETS=$(filter %-darwin %-freebsd,$(TRIPLETS)) NON_CLANG_TRIPLETS=$(filter-out %-darwin %-freebsd,$(TRIPLETS)) # These are the projects currently using BinaryBuilder; both GCC-expanded and non-GCC-expanded: -BB_PROJECTS=mbedtls libssh2 nghttp2 mpfr curl libgit2 pcre libuv unwind llvmunwind dsfmt objconv p7zip zlib libsuitesparse openlibm blastrampoline libtracyclient +BB_PROJECTS=mbedtls libssh2 nghttp2 mpfr curl libgit2 pcre libuv unwind llvmunwind dsfmt objconv p7zip zlib libsuitesparse openlibm blastrampoline libtracyclient libmmtk_julia BB_GCC_EXPANDED_PROJECTS=openblas csl BB_CXX_EXPANDED_PROJECTS=gmp llvm clang llvm-tools lld # These are non-BB source-only deps diff --git a/src/Makefile b/src/Makefile index 52e673aa6cc1a..c01848c16adf7 100644 --- a/src/Makefile +++ b/src/Makefile @@ -29,6 +29,10 @@ ifeq ($(USECLANG),1) FLAGS += -Wno-return-type-c-linkage -Wno-atomic-alignment endif +ifeq ($(WITH_MMTK), 1) +FLAGS += -I$(MMTK_API_INC) -I$(MMTK_JULIA_INC) +endif + FLAGS += -DJL_BUILD_ARCH='"$(ARCH)"' ifeq ($(OS),WINNT) FLAGS += -DJL_BUILD_UNAME='"NT"' @@ -44,8 +48,8 @@ SRCS := \ jltypes gf typemap smallintset ast builtins module interpreter symbol \ dlload sys init task array genericmemory staticdata toplevel jl_uv datatype \ simplevector runtime_intrinsics precompile jloptions mtarraylist \ - threading scheduler stackwalk gc-common gc-stock gc-debug gc-pages gc-stacks gc-alloc-profiler gc-page-profiler method \ - jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \ + threading scheduler stackwalk gc-common gc-stock gc-mmtk gc-debug gc-pages gc-stacks gc-alloc-profiler gc-page-profiler \ + method jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \ crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall engine RT_LLVMLINK := @@ -103,7 +107,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h gc-tls-mmtk.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,win32_ucontext.h) endif @@ -168,8 +172,8 @@ LIBJULIA_PATH_REL := libjulia endif COMMON_LIBPATHS := -L$(build_libdir) -L$(build_shlibdir) -RT_LIBS := $(WHOLE_ARCHIVE) $(LIBUV) $(WHOLE_ARCHIVE) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) -CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) +RT_LIBS := $(WHOLE_ARCHIVE) $(LIBUV) $(WHOLE_ARCHIVE) $(LIBUTF8PROC) 
$(NO_WHOLE_ARCHIVE) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) $(MMTK_LIB) +CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) $(MMTK_LIB) RT_DEBUG_LIBS := $(COMMON_LIBPATHS) $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp-debug.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport-debug.a -ljulia-debug $(RT_LIBS) CG_DEBUG_LIBS := $(COMMON_LIBPATHS) $(CG_LIBS) -ljulia-debug -ljulia-internal-debug RT_RELEASE_LIBS := $(COMMON_LIBPATHS) $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport.a -ljulia $(RT_LIBS) @@ -178,6 +182,15 @@ CG_RELEASE_LIBS := $(COMMON_LIBPATHS) $(CG_LIBS) -ljulia -ljulia-internal OBJS := $(SRCS:%=$(BUILDDIR)/%.o) DOBJS := $(SRCS:%=$(BUILDDIR)/%.dbg.obj) +ifeq ($(WITH_MMTK), 1) +MMTK_SRCS := mmtk_julia +MMTK_OBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.o) $(MMTK_LIB_DST) +MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.dbg.obj) $(MMTK_LIB_DST) +else +MMTK_OBJS := +MMTK_DOBJS := +endif + CODEGEN_OBJS := $(CODEGEN_SRCS:%=$(BUILDDIR)/%.o) CODEGEN_DOBJS := $(CODEGEN_SRCS:%=$(BUILDDIR)/%.dbg.obj) @@ -226,6 +239,16 @@ $(BUILDDIR)/%.h.gen : $(SRCDIR)/%.d sed 's/JULIA_/JL_PROBE_/' $@ > $@.tmp mv $@.tmp $@ +# Compile files from the binding side and copy so file into lib folder +ifeq ($(WITH_MMTK), 1) +$(MMTK_JULIA_INC)/%.o: $(MMTK_JULIA_INC)/%.c $(HEADERS) | $(MMTK_JULIA_INC) + @$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(SHIPFLAGS) $(DISABLE_ASSERTIONS) -c $< -o $@) +$(MMTK_JULIA_INC)/%.dbg.obj: $(MMTK_JULIA_INC)/%.c $(HEADERS) | $(MMTK_JULIA_INC) + @$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(DEBUGFLAGS) -c $< -o $@) +$(MMTK_LIB_DST): $(MMTK_LIB_SRC) + @$(call PRINT_MMTK, cp $< $@) +endif + $(BUILDDIR)/jl_internal_funcs.inc: $(SRCDIR)/jl_exported_funcs.inc # Generate `.inc` file that contains a list of `#define` macros to rename functions defined in `libjulia-internal` # to have a `ijl_` prefix instead of `jl_`, to denote that they are coming from `libjulia-internal`. 
This avoids @@ -318,6 +341,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h +$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/mmtk-gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc-alloc-profiler.h @@ -389,14 +413,14 @@ $(BUILDDIR)/julia.expmap: $(SRCDIR)/julia.expmap.in $(JULIAHOME)/VERSION $(LLVM_ sed <'$<' >'$@' -e "s/@JULIA_SHLIB_SYMBOL_VERSION@/JL_LIBJULIA_$(SOMAJOR)/" \ -e "s/@LLVM_SHLIB_SYMBOL_VERSION@/$(LLVM_SHLIB_SYMBOL_VERSION)/" -$(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(OBJS) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV) - @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(RPATH_LIB) -o $@ \ +$(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(OBJS) $(MMTK_OBJS) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV) + @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(MMTK_OBJS) $(RPATH_LIB) -o $@ \ $(JLDFLAGS) $(BOLT_LDFLAGS) $(JLIBLDFLAGS) $(RT_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-internal.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-internal.$(SHLIB_EXT) $@ $(DSYMUTIL) $@ -$(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(DOBJS) $(BUILDDIR)/flisp/libflisp-debug.a $(BUILDDIR)/support/libsupport-debug.a $(LIBUV) - @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(DEBUGFLAGS) $(DOBJS) $(RPATH_LIB) -o $@ \ +$(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(DOBJS) $(MMTK_DOBJS) $(BUILDDIR)/flisp/libflisp-debug.a $(BUILDDIR)/support/libsupport-debug.a $(LIBUV) + @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(DEBUGFLAGS) $(DOBJS) $(MMTK_DOBJS) $(RPATH_LIB) -o $@ \ $(JLDFLAGS) $(JLIBLDFLAGS) $(RT_DEBUG_LIBS) $(call SONAME_FLAGS,libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-internal-debug.$(SHLIB_EXT) $@ $(DSYMUTIL) $@ @@ -455,6 +479,7 @@ clean: -rm -f $(BUILDDIR)/*.dbg.obj $(BUILDDIR)/*.o $(BUILDDIR)/*.dwo $(BUILDDIR)/*.$(SHLIB_EXT) $(BUILDDIR)/*.a $(BUILDDIR)/*.h.gen -rm -f $(BUILDDIR)/julia.expmap -rm -f $(BUILDDIR)/julia_version.h + -rm -f $(MMTK_OBJS) $(MMTK_DOBJS) clean-flisp: -$(MAKE) -C $(SRCDIR)/flisp clean BUILDDIR='$(abspath $(BUILDDIR)/flisp)' diff --git a/src/builtins.c b/src/builtins.c index 96c4cec0f5087..4a778035de405 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -22,6 +22,7 @@ #include #include "julia.h" #include "julia_internal.h" +#include "gc-interface.h" #include "builtin_proto.h" #include "intrinsics.h" #include "julia_assert.h" diff --git a/src/gc-common.c 
b/src/gc-common.c index 417f12f26d64d..17f6f1330743b 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -705,6 +705,76 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } +size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT +{ + const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; + size_t sz = layout->size * m->length; + if (layout->flags.arrayelem_isunion) + // account for isbits Union array selector bytes + sz += m->length; + return sz; +} + +// tracking Memorys with malloc'd storage +void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ + // This is **NOT** a GC safe point. + mallocmemory_t *ma; + if (ptls->gc_tls.heap.mafreelist == NULL) { + ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); + } + else { + ma = ptls->gc_tls.heap.mafreelist; + ptls->gc_tls.heap.mafreelist = ma->next; + } + ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); + ma->next = ptls->gc_tls.heap.mallocarrays; + ptls->gc_tls.heap.mallocarrays = ma; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + +// gc-debug common functions +// --- + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + #ifdef __cplusplus } #endif diff --git a/src/gc-debug.c b/src/gc-debug.c index 7c479484cde45..ecd7f2328cada 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1,5 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license - +#ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" #include "julia.h" @@ -1129,3 +1129,5 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect #ifdef __cplusplus } #endif + +#endif // !MMTK_GC diff --git a/src/gc-heap-snapshot.cpp b/src/gc-heap-snapshot.cpp index fcda11dad4f8a..d3cb1e98d84a4 100644 --- a/src/gc-heap-snapshot.cpp +++ b/src/gc-heap-snapshot.cpp @@ -1,5 +1,4 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license - #include "gc-heap-snapshot.h" #include "julia.h" diff --git a/src/gc-interface.h b/src/gc-interface.h index 0e9ce32697f35..72a57f4944156 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -192,6 +192,9 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. 
struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; +// FIXME: add description here +void jl_gc_notify_image_load(const char* img_data, size_t len); + // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c new file mode 100644 index 0000000000000..e459b0f12c41d --- /dev/null +++ b/src/gc-mmtk.c @@ -0,0 +1,843 @@ +#ifdef MMTK_GC + +#include "mmtk_julia.h" +#include "gc-common.h" +#include "mmtkMutator.h" +#include "gc-mmtk.h" +#include "threading.h" + +#ifdef __cplusplus extern "C" { +#endif + +// For now we're using the same values as stock-gc. However +// for the heap size we use 70% of the free memory available +// since that is actually a hard limit in MMTk. + +// max_total_memory is a suggestion. We try very hard to stay +// under this limit, but we will go above it rather than halting. +#ifdef _P64 +typedef uint64_t memsize_t; +static const size_t default_collect_interval = 5600 * 1024 * sizeof(void*); +// We expose this to the user/ci as jl_gc_set_max_memory +static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; +#else +typedef uint32_t memsize_t; +static const size_t default_collect_interval = 3200 * 1024 * sizeof(void*); +// Work really hard to stay within 2GB +// Alternative is to risk running out of address space +// on 32 bit architectures. +#define MAX32HEAP 1536 * 1024 * 1024 +static memsize_t max_total_memory = (memsize_t) MAX32HEAP; +#endif + +void jl_gc_init(void) { + // TODO: use jl_options.heap_size_hint to set MMTk's fixed heap size? (see issue: https://github.com/mmtk/mmtk-julia/issues/167) + + JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock"); + + arraylist_new(&to_finalize, 0); + arraylist_new(&finalizer_list_marked, 0); + + gc_num.allocd = 0; + gc_num.max_pause = 0; + gc_num.max_memory = 0; + + long long min_heap_size; + long long max_heap_size; + char* min_size_def = getenv("MMTK_MIN_HSIZE"); + char* min_size_gb = getenv("MMTK_MIN_HSIZE_G"); + + char* max_size_def = getenv("MMTK_MAX_HSIZE"); + char* max_size_gb = getenv("MMTK_MAX_HSIZE_G"); + + // default min heap currently set as Julia's default_collect_interval + if (min_size_def != NULL) { + char *p; + double min_size = strtod(min_size_def, &p); + min_heap_size = (long) 1024 * 1024 * min_size; + } else if (min_size_gb != NULL) { + char *p; + double min_size = strtod(min_size_gb, &p); + min_heap_size = (long) 1024 * 1024 * 1024 * min_size; + } else { + min_heap_size = default_collect_interval; + } + + // default max heap currently set as 70% of the free memory in the system + if (max_size_def != NULL) { + char *p; + double max_size = strtod(max_size_def, &p); + max_heap_size = (long) 1024 * 1024 * max_size; + } else if (max_size_gb != NULL) { + char *p; + double max_size = strtod(max_size_gb, &p); + max_heap_size = (long) 1024 * 1024 * 1024 * max_size; + } else { + max_heap_size = uv_get_free_memory() * 70 / 100; + } + + // Assert that the number of stock GC threads is 0; MMTk uses the number of threads in jl_options.ngcthreads + assert(jl_n_gcthreads == 0); + + // Check that the julia_copy_stack Rust feature has been defined when COPY_STACKS has been defined + int copy_stacks; + +#ifdef COPY_STACKS + copy_stacks = 1; +#else + copy_stacks = 0; +#endif + + mmtk_julia_copy_stack_check(copy_stacks); + + // if only max size is specified initialize MMTk with a fixed size heap + // TODO: We just
assume mark threads means GC threads, and ignore the number of concurrent sweep threads. + // If the two values are the same, we can use either. Otherwise, we need to be careful. + uintptr_t gcthreads = jl_options.nmarkthreads; + if (max_size_def != NULL || (max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL))) { + mmtk_gc_init(0, max_heap_size, gcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag); + } else { + mmtk_gc_init(min_heap_size, max_heap_size, gcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag); + } +} + +void jl_start_gc_threads(void) { + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_initialize_collection((void *)ptls); + // int nthreads = jl_atomic_load_relaxed(&jl_n_threads); + // int ngcthreads = jl_n_gcthreads; + // int nmutator_threads = nthreads - ngcthreads; + // printf("nthreads = %d, ngcthreads = %d, nmutator_threads = %d\n", nthreads, ngcthreads, nmutator_threads); +} + +void jl_init_thread_heap(struct _jl_tls_states_t *ptls) JL_NOTSAFEPOINT { + jl_thread_heap_t *heap = &ptls->gc_tls.heap; + small_arraylist_new(&heap->weak_refs, 0); + small_arraylist_new(&heap->live_tasks, 0); + for (int i = 0; i < JL_N_STACK_POOLS; i++) + small_arraylist_new(&heap->free_stacks[i], 0); + heap->mallocarrays = NULL; + heap->mafreelist = NULL; + arraylist_new(&ptls->finalizers, 0); + // Clear the malloc sz count + jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0); + // Create mutator + MMTk_Mutator mmtk_mutator = mmtk_bind_mutator((void *)ptls, ptls->tid); + // Copy the mutator to the thread local storage + memcpy(&ptls->gc_tls.mmtk_mutator, mmtk_mutator, sizeof(MMTkMutatorContext)); + // Call post_bind to maintain a list of active mutators and to reclaim the old mutator (which is no longer needed) + mmtk_post_bind_mutator(&ptls->gc_tls.mmtk_mutator, mmtk_mutator); + memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); +} + +void jl_free_thread_gc_state(struct _jl_tls_states_t *ptls) { + mmtk_destroy_mutator(&ptls->gc_tls.mmtk_mutator); +} + +// FIXME: mmtk uses the same code as stock to enable/disable the GC +// Should this be moved to gc-common.c? 
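/*
 * Usage sketch for the environment variables parsed in jl_gc_init above
 * (illustrative only, inferred from the strtod-based parsing; not part of
 * the patch). MMTK_MIN_HSIZE/MMTK_MAX_HSIZE take sizes in MB and
 * MMTK_MIN_HSIZE_G/MMTK_MAX_HSIZE_G in GB; fractional values are accepted:
 *
 *   MMTK_MAX_HSIZE_G=4 ./julia                       # only max set: MMTk runs with a fixed 4 GB heap
 *   MMTK_MIN_HSIZE_G=0.5 MMTK_MAX_HSIZE_G=2 ./julia  # dynamic heap between 0.5 GB and 2 GB
 *
 * With none of them set, the minimum falls back to default_collect_interval
 * and the maximum to 70% of the free memory reported by uv_get_free_memory().
 */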
+ +_Atomic(uint32_t) jl_gc_disable_counter = 1; + +JL_DLLEXPORT int jl_gc_enable(int on) { + jl_ptls_t ptls = jl_current_task->ptls; + int prev = !ptls->disable_gc; + ptls->disable_gc = (on == 0); + if (on && !prev) { + // disable -> enable + if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { + gc_num.allocd += gc_num.deferred_alloc; + gc_num.deferred_alloc = 0; + } + } + else if (prev && !on) { + // enable -> disable + jl_atomic_fetch_add(&jl_gc_disable_counter, 1); + // check if the GC is running and wait for it to finish + jl_gc_safepoint_(ptls); + } + return prev; +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) { + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + +JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) { + // MMTk currently does not allow setting the heap size at runtime +} + +JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); + jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); + return; + } + mmtk_handle_user_collection_request(ptls, collection); +} + +// same as above, some of these are identical to the implementation in gc stock +static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTSAFEPOINT +{ + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls) { + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc); + dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + if (update_heap) { + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + } + } + } +} + + +void reset_thread_gc_counts(void) JL_NOTSAFEPOINT +{ + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls != NULL) { + // don't reset `pool_live_bytes` here + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + } + } +} + +// weak references +// --- +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value) +{ + jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); + 
wr->value = value; // NOTE: wb not needed here + mmtk_add_weak_candidate(wr); + return wr; +} + + +// allocation +int jl_gc_classify_pools(size_t sz, int *osize) +{ + if (sz > GC_MAX_SZCLASS) + return -1; // call big alloc function + size_t allocsz = sz + sizeof(jl_taggedvalue_t); + *osize = LLT_ALIGN(allocsz, 16); + return 0; // use MMTk's fastpath logic +} + +int64_t last_gc_total_bytes = 0; +int64_t last_live_bytes = 0; // live_bytes at last collection +int64_t live_bytes = 0; + +// Retrieves Julia's `GC_Num` (structure that stores GC statistics). +JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { + jl_gc_num_t num = gc_num; + combine_thread_gc_counts(&num, 0); + return num; +} + +JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) JL_NOTSAFEPOINT { + int64_t oldtb = last_gc_total_bytes; + int64_t newtb; + jl_gc_get_total_bytes(&newtb); + last_gc_total_bytes = newtb; + return newtb - oldtb; +} + +JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT +{ + int64_t oldtb = last_gc_total_bytes; + int64_t newtb; + jl_gc_get_total_bytes(&newtb); + last_gc_total_bytes = newtb - offset; + return newtb - oldtb; +} + +JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) { + return 0; +} + +void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT +{ + jl_ptls_t ptls = jl_current_task->ptls; + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); +} + +void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT +{ +} + +int64_t inc_live_bytes(int64_t inc) JL_NOTSAFEPOINT +{ + jl_timing_counter_inc(JL_TIMING_COUNTER_HeapSize, inc); + return live_bytes += inc; +} + +void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT +{ + combine_thread_gc_counts(&gc_num, 0); + inc_live_bytes(gc_num.deferred_alloc + gc_num.allocd); + gc_num.allocd = 0; + gc_num.deferred_alloc = 0; + reset_thread_gc_counts(); +} + +JL_DLLEXPORT int64_t jl_gc_live_bytes(void) { + return last_live_bytes; +} + +JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT +{ + jl_gc_num_t num = gc_num; + combine_thread_gc_counts(&num, 0); + // Sync this logic with `base/util.jl:GC_Diff` + *bytes = (num.total_allocd + num.deferred_alloc + num.allocd); +} + +JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void) +{ + // FIXME: should probably return MMTk's heap size + return max_total_memory; +} + +extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); +extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr); +extern void* mmtk_alloc(void* mutator, size_t size, size_t align, size_t offset, int allocator); +extern void mmtk_post_alloc(void* mutator, void* refer, size_t bytes, int allocator); + + +extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS; +extern const void* MMTK_SIDE_VO_BIT_BASE_ADDRESS; + +// These need to be constants. + +#define MMTK_OBJECT_BARRIER (1) +// Stickyimmix needs write barrier. Immix does not need write barrier. 
+#ifdef MMTK_PLAN_IMMIX +#define MMTK_NEEDS_WRITE_BARRIER (0) +#endif +#ifdef MMTK_PLAN_STICKYIMMIX +#define MMTK_NEEDS_WRITE_BARRIER (1) +#endif + +#ifdef MMTK_CONSERVATIVE_SCAN +#define MMTK_NEEDS_VO_BIT (1) +#else +#define MMTK_NEEDS_VO_BIT (0) +#endif + +#define MMTK_DEFAULT_IMMIX_ALLOCATOR (0) +#define MMTK_IMMORTAL_BUMP_ALLOCATOR (0) + +// Directly call into MMTk for write barrier (debugging only) +inline void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_post(&ptls->gc_tls.mmtk_mutator, parent, ptr); +} + +// Fastpath. Return 1 if we should go to slowpath +inline int mmtk_gc_wb_fast_check(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { + intptr_t addr = (intptr_t) (void*) parent; + uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); + intptr_t shift = (addr >> 3) & 0b111; + uint8_t byte_val = *meta_addr; + return ((byte_val >> shift) & 1) == 1; + } else { + return 0; + } +} + +// Slowpath. +inline void mmtk_gc_wb_slow(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_slow(&ptls->gc_tls.mmtk_mutator, parent, ptr); + } +} + +inline void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + if (mmtk_gc_wb_fast_check(parent, ptr)) { + mmtk_gc_wb_slow(parent, ptr); + } +} + +inline void mmtk_gc_wb_binding(const void *bnd, const void *val) JL_NOTSAFEPOINT +{ + if (mmtk_gc_wb_fast_check(bnd, val)) { + jl_astaggedvalue(bnd)->bits.gc = 2; // to indicate that the buffer is a binding + mmtk_gc_wb_slow(bnd, val); + } +} + +#define MMTK_MIN_ALIGNMENT 4 +// MMTk assumes allocation size is aligned to min alignment. 
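/*
 * Worked example for the helper below (a sketch, not part of the patch):
 * with MMTK_MIN_ALIGNMENT == 4, mmtk_align_alloc_sz computes
 * (sz + 3) & ~3, i.e. it rounds a request up to the next multiple of 4:
 *
 *   mmtk_align_alloc_sz(13) == 16
 *   mmtk_align_alloc_sz(16) == 16
 *   mmtk_align_alloc_sz(17) == 20
 */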
+inline size_t mmtk_align_alloc_sz(size_t sz) JL_NOTSAFEPOINT +{ + return (sz + MMTK_MIN_ALIGNMENT - 1) & ~(MMTK_MIN_ALIGNMENT - 1); +} + +inline void* bump_alloc_fast(MMTkMutatorContext* mutator, uintptr_t* cursor, uintptr_t limit, size_t size, size_t align, size_t offset, int allocator) { + intptr_t delta = (-offset - *cursor) & (align - 1); + uintptr_t result = *cursor + (uintptr_t)delta; + + if (__unlikely(result + size > limit)) { + return (void*) mmtk_alloc(mutator, size, align, offset, allocator); + } else { + *cursor = result + size; + return (void*)result; + } +} + +inline void* mmtk_immix_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { + ImmixAllocator* allocator = &mutator->allocators.immix[MMTK_DEFAULT_IMMIX_ALLOCATOR]; + return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (uintptr_t)allocator->limit, size, align, offset, 0); +} + +inline void mmtk_immix_post_alloc_slow(MMTkMutatorContext* mutator, void* obj, size_t size) { + mmtk_post_alloc(mutator, obj, size, 0); +} + +inline void mmtk_set_vo_bit(void* obj) { + intptr_t addr = (intptr_t) obj; + intptr_t shift = (addr >> 3) & 0b111; + uint8_t* vo_meta_addr = (uint8_t*) (MMTK_SIDE_VO_BIT_BASE_ADDRESS) + (addr >> 6); + uint8_t new_val = (*vo_meta_addr) | (1 << shift); + (*vo_meta_addr) = new_val; +} + +inline void mmtk_immix_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { + if (MMTK_NEEDS_VO_BIT) { + // set VO bit + mmtk_set_vo_bit(obj); + } +} + +inline void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { + BumpAllocator* allocator = &mutator->allocators.bump_pointer[MMTK_IMMORTAL_BUMP_ALLOCATOR]; + return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (uintptr_t)allocator->limit, size, align, offset, 1); +} + +inline void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { + if (MMTK_NEEDS_VO_BIT) { + // set VO bit + mmtk_set_vo_bit(obj); + } + + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { + intptr_t addr = (intptr_t) obj; + intptr_t shift = (addr >> 3) & 0b111; + uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); + while(1) { + uint8_t old_val = *meta_addr; + uint8_t new_val = old_val | (1 << shift); + if (jl_atomic_cmpswap((_Atomic(uint8_t)*)meta_addr, &old_val, new_val)) { + break; + } + } + } +} + +// mutex for page profile +uv_mutex_t page_profile_lock; + +JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) +{ + uv_mutex_lock(&page_profile_lock); + const char *str = "Page profiler is unsupported in MMTk."; + ios_write(stream, str, strlen(str)); + uv_mutex_unlock(&page_profile_lock); +} + +// this seems to be needed by the gc tests +#define JL_GC_N_MAX_POOLS 51 +JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS]; + +STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT +{ + // FIXME: MMTk would have to provide its own stats +} + +#define MMTK_GC_PAGE_SZ (1 << 12) // MMTk's page size is defined in mmtk-core constants + +JL_DLLEXPORT uint64_t jl_get_pg_size(void) +{ + return MMTK_GC_PAGE_SZ; +} + + +extern void mmtk_store_obj_size_c(void* obj, size_t size); + +inline void maybe_collect(jl_ptls_t ptls) +{ + // Just do a safe point for general maybe_collect + jl_gc_safepoint_(ptls); +} + +// This is only used for malloc. We need to know if we need to do GC. However, keeping checking with MMTk (mmtk_gc_poll), +// is expensive. So we only check for every few allocations.
+static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz) +{ + // We do not need to carefully maintain malloc_sz_since_last_poll. We just need to + // avoid using mmtk_gc_poll too frequently, and try to be precise on our heap usage + // as much as we can. + if (ptls->gc_tls.malloc_sz_since_last_poll > 4096) { + jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0); + mmtk_gc_poll(ptls); + } else { + jl_atomic_fetch_add_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, sz); + jl_gc_safepoint_(ptls); + } +} + +// allocation wrappers that track allocation and let collection run + +JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + void *data = malloc(sz); + if (data != NULL && pgcstack != NULL && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, sz); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz); + } + return data; +} + +JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + void *data = calloc(nm, sz); + if (data != NULL && pgcstack != NULL && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, nm * sz); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, nm * sz); + } + return data; +} + +JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + free(p); + if (pgcstack != NULL && ct->world_age) { + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, -sz); + } +} + +JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + if (pgcstack && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, sz); + if (sz < old) + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, -(int64_t)(old - sz)); // shrinking frees bytes, so the counter must decrease + else + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz - old); + } + return realloc(p, sz); +} + +void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) +{ + jl_ptls_t ptls = jl_current_task->ptls; + size_t allocsz = mmtk_align_alloc_sz(sz); + void* addr = mmtk_immortal_alloc_fast(&ptls->gc_tls.mmtk_mutator, allocsz, align, offset); + return addr; +} + +void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) +{ + return jl_gc_perm_alloc_nolock(sz, zero, align, offset); +} + +jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT +{ + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + unsigned align = (sz == 0 ? sizeof(void*) : (allocsz <= sizeof(void*) * 2 ?
+ sizeof(void*) * 2 : 16)); + jl_taggedvalue_t *o = (jl_taggedvalue_t*)jl_gc_perm_alloc(allocsz, 0, align, + sizeof(void*) % align); + + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_immortal_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, jl_valueof(o), allocsz); + o->header = (uintptr_t)ty; + return jl_valueof(o); +} + + +JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, size_t align, void *ty) +{ + // safepoint + jl_gc_safepoint_(ptls); + + jl_value_t *v; + if ((uintptr_t)ty != jl_buff_tag) { + // v needs to be 16 byte aligned, therefore v_tagged needs to be offset accordingly to consider the size of header + jl_taggedvalue_t *v_tagged = (jl_taggedvalue_t *)mmtk_immix_alloc_fast(&ptls->gc_tls.mmtk_mutator, LLT_ALIGN(osize, align), align, sizeof(jl_taggedvalue_t)); + v = jl_valueof(v_tagged); + mmtk_immix_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, v, LLT_ALIGN(osize, align)); + } else { + // allocating an extra word to store the size of buffer objects + jl_taggedvalue_t *v_tagged = (jl_taggedvalue_t *)mmtk_immix_alloc_fast(&ptls->gc_tls.mmtk_mutator, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align), align, 0); + jl_value_t* v_tagged_aligned = ((jl_value_t*)((char*)(v_tagged) + sizeof(jl_taggedvalue_t))); + v = jl_valueof(v_tagged_aligned); + mmtk_store_obj_size_c(v, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align)); + mmtk_immix_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, v, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align)); + } + + ptls->gc_tls.gc_num.allocd += osize; + ptls->gc_tls.gc_num.poolalloc++; + + return v; +} + +void jl_gc_notify_image_load(const char* img_data, size_t len) +{ + mmtk_set_vm_space((void*)img_data, len); +} + +JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t sz) +{ + // safepoint + jl_gc_safepoint_(ptls); + + size_t offs = offsetof(bigval_t, header); + assert(sz >= sizeof(jl_taggedvalue_t) && "sz must include tag"); + static_assert(offsetof(bigval_t, header) >= sizeof(void*), "Empty bigval header?"); + static_assert(sizeof(bigval_t) % JL_HEAP_ALIGNMENT == 0, ""); + size_t allocsz = LLT_ALIGN(sz + offs, JL_CACHE_BYTE_ALIGNMENT); + if (allocsz < sz) { // overflow in adding offs, size was "negative" + assert(0 && "Error when allocating big object"); + jl_throw(jl_memory_exception); + } + + bigval_t *v = (bigval_t*)mmtk_alloc_large(&ptls->gc_tls.mmtk_mutator, allocsz, JL_CACHE_BYTE_ALIGNMENT, 0, 2); + + if (v == NULL) { + assert(0 && "Allocation failed"); + jl_throw(jl_memory_exception); + } + v->sz = allocsz; + + ptls->gc_tls.gc_num.allocd += allocsz; + ptls->gc_tls.gc_num.bigalloc++; + + jl_value_t *result = jl_valueof(&v->header); + mmtk_post_alloc(&ptls->gc_tls.mmtk_mutator, result, allocsz, 2); + + return result; +} + +// Instrumented version of jl_gc_small_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_small_alloc(jl_ptls_t ptls, int offset, int osize, jl_value_t* type) +{ + assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); + + jl_value_t *val = jl_mmtk_gc_alloc_default(ptls, osize, 16, NULL); + maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); + return val; +} + +// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz, jl_value_t *type) +{ + // TODO: assertion needed here? 
+ assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); + + jl_value_t *val = jl_mmtk_gc_alloc_big(ptls, sz); + maybe_record_alloc_to_profile(val, sz, (jl_datatype_t*)type); + return val; +} + +inline jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) +{ + jl_value_t *v; + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + if (sz <= GC_MAX_SZCLASS) { + v = jl_mmtk_gc_alloc_default(ptls, allocsz, 16, ty); + } + else { + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + v = jl_mmtk_gc_alloc_big(ptls, allocsz); + } + jl_set_typeof(v, ty); + maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); + return v; +} + +JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) +{ + jl_ptls_t ptls = jl_current_task->ptls; + maybe_collect(ptls); + size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT); + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +#endif + void *b = malloc_cache_align(allocsz); + if (b == NULL) + jl_throw(jl_memory_exception); + + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + // FIXME: Should these be part of mmtk's heap? + // malloc_maybe_collect(ptls, sz); + // jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, allocsz); +#ifdef _OS_WINDOWS_ + SetLastError(last_error); +#endif + errno = last_errno; + // jl_gc_managed_malloc is currently always used for allocating array buffers. + maybe_record_alloc_to_profile((jl_value_t*)b, sz, (jl_datatype_t*)jl_buff_tag); + return b; +} + +// Not used by mmtk +// Number of GC threads that may run parallel marking +int jl_n_markthreads; +// Number of GC threads that may run concurrent sweeping (0 or 1) +int jl_n_sweepthreads; +// `tid` of first GC thread +int gc_first_tid; + +JL_DLLEXPORT void jl_gc_queue_root(const struct _jl_value_t *ptr) JL_NOTSAFEPOINT +{ + mmtk_unreachable(); +} + +JL_DLLEXPORT void jl_gc_queue_multiroot(const struct _jl_value_t *root, const void *stored, + struct _jl_datatype_t *dt) JL_NOTSAFEPOINT +{ + mmtk_unreachable(); +} + +// marking +// --- + +JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +{ + mmtk_unreachable(); + return 0; +} +JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, + jl_value_t **objs, size_t nobjs) +{ + mmtk_unreachable(); +} + +JL_DLLEXPORT size_t jl_gc_max_internal_obj_size(void) +{ + // TODO: meaningful for MMTk? + return GC_MAX_SZCLASS; +} + +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +{ + // FIXME: do we need to implement this? 
+} + +// gc-debug functions +// --- + +JL_DLLEXPORT jl_taggedvalue_t *jl_gc_find_taggedvalue_pool(char *p, size_t *osize_p) +{ + return NULL; +} + +void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT +{ +} + +int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return 0; +} + +int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return 0; +} + +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT +{ + // May not be accurate but should be helpful enough + uint64_t pool_count = gc_num.poolalloc; + uint64_t big_count = gc_num.bigalloc; + jl_safe_printf("Allocations: %" PRIu64 " " + "(Pool: %" PRIu64 "; Big: %" PRIu64 "); GC: %d\n", + pool_count + big_count, pool_count, big_count, gc_num.pause); +} + +JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) +{ + return sizeof(bigval_t); +} + +void jl_print_gc_stats(JL_STREAM *s) +{ +} + +JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) +{ + return 0; +} + +JL_DLLEXPORT int jl_gc_conservative_gc_support_enabled(void) +{ + return 0; +} + +// TODO: if this is needed, it can be added in MMTk +JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) +{ + return NULL; +} + +#ifdef __cplusplus +} +#endif + +#endif // MMTK_GC diff --git a/src/gc-mmtk.h b/src/gc-mmtk.h new file mode 100644 index 0000000000000..6c2c7a40bc81f --- /dev/null +++ b/src/gc-mmtk.h @@ -0,0 +1,34 @@ +#ifdef MMTK_GC + +#ifdef __cplusplus +extern "C" { +#endif + +extern jl_mutex_t finalizers_lock; +extern arraylist_t to_finalize; +extern arraylist_t finalizer_list_marked; + +JL_EXTENSION typedef struct _bigval_t { + size_t sz; +#ifdef _P64 // Add padding so that the value is 64-byte aligned + // (8 pointers of 8 bytes each) - (2 other pointers in struct) + void *_padding[8 - 2]; +#else + // (16 pointers of 4 bytes each) - (2 other pointers in struct) + void *_padding[16 - 2]; +#endif + //struct jl_taggedvalue_t <>; + union { + uintptr_t header; + struct { + uintptr_t gc:2; + } bits; + }; + // must be 64-byte aligned here, in 32 & 64 bit modes +} bigval_t; + +#ifdef __cplusplus +} +#endif + +#endif // MMTK_GC diff --git a/src/gc-page-profiler.c b/src/gc-page-profiler.c index 2625fa812781a..bfd1c74247df8 100644 --- a/src/gc-page-profiler.c +++ b/src/gc-page-profiler.c @@ -1,5 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license - +#ifndef MMTK_GC #include "gc-page-profiler.h" #include "julia.h" @@ -178,3 +178,5 @@ JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) #ifdef __cplusplus } #endif + +#endif // !MMTK_GC diff --git a/src/gc-pages.c b/src/gc-pages.c index 71d59de29166f..ed6e0ed20ba1c 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -1,5 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license - +#ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" #ifndef _OS_WINDOWS_ @@ -205,3 +205,5 @@ void jl_gc_free_page(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT #ifdef __cplusplus } #endif + +#endif // !MMTK_GC diff --git a/src/gc-stock.c b/src/gc-stock.c index 3ff37566dc6c7..164d3067a31de 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -1,5 +1,5 @@ // This file is a part of Julia. 
License is MIT: https://julialang.org/license - +#ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" #include "gc-alloc-profiler.h" @@ -405,7 +405,6 @@ static void sweep_weak_refs(void) } } - STATIC_INLINE void jl_batch_accum_heap_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc) + sz; @@ -453,7 +452,6 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) return jl_valueof(&v->header); } - // Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz, jl_value_t *type) { @@ -3888,12 +3886,22 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) return sizeof(bigval_t); } +<<<<<<< HEAD +======= +>>>>>>> c48a701f54 (WIP: Adding support for MMTk/Immix) JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); } +void jl_gc_notify_image_load(const char* img_data, size_t len) +{ + // Do nothing +} + #ifdef __cplusplus } #endif + +#endif // !MMTK_GC diff --git a/src/gc-stock.h b/src/gc-stock.h index 50eca3aadbd86..8e563f32ab9d3 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -5,6 +5,7 @@ . non-moving, precise mark and sweep collector . pool-allocates small objects, keeps big objects on a simple list */ +#ifndef MMTK_GC #ifndef JL_GC_H #define JL_GC_H @@ -422,21 +423,6 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } -STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; -} - STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); @@ -712,3 +698,5 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect #endif #endif + +#endif // !MMTK_GC diff --git a/src/gc-tls-mmtk.h b/src/gc-tls-mmtk.h new file mode 100644 index 0000000000000..2eb5f2a6a44d9 --- /dev/null +++ b/src/gc-tls-mmtk.h @@ -0,0 +1,49 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +#ifdef MMTK_GC + +#include +#include "mmtkMutator.h" + +#ifdef __cplusplus extern "C" { +#endif + +typedef struct { + // variable for tracking weak references + small_arraylist_t weak_refs; + // live tasks started on this thread + // that are holding onto a stack from the pool + small_arraylist_t live_tasks; + + // variables for tracking malloc'd arrays + struct _mallocmemory_t *mallocarrays; + struct _mallocmemory_t *mafreelist; + +#define JL_N_STACK_POOLS 16 + small_arraylist_t free_stacks[JL_N_STACK_POOLS]; +} jl_thread_heap_t; + +typedef struct { + _Atomic(int64_t) allocd; + _Atomic(int64_t) pool_live_bytes; + _Atomic(uint64_t) malloc; + _Atomic(uint64_t) realloc; + _Atomic(uint64_t) poolalloc; + _Atomic(uint64_t) bigalloc; + _Atomic(int64_t) free_acc; + _Atomic(uint64_t) alloc_acc; +} jl_thread_gc_num_t; + +typedef struct { + jl_thread_heap_t heap; + jl_thread_gc_num_t gc_num; + MMTkMutatorContext mmtk_mutator; + _Atomic(size_t) malloc_sz_since_last_poll; // accessed with jl_atomic_* (relaxed) from the malloc wrappers +} jl_gc_tls_states_t; + +#ifdef __cplusplus } +#endif + +#endif // MMTK_GC diff --git a/src/gc-tls.h b/src/gc-tls.h index 9e4b09404db84..43adfb8a7ff2a 100644 --- a/src/gc-tls.h +++ b/src/gc-tls.h @@ -1,5 +1,7 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license +#ifndef MMTK_GC + // Meant to be included in "julia_threads.h" #ifndef JL_GC_TLS_H #define JL_GC_TLS_H @@ -90,3 +92,5 @@ typedef struct { #endif #endif // JL_GC_TLS_H + +#endif // MMTK_GC diff --git a/src/julia.h b/src/julia.h index abb8a57ff13b0..db57db1fbeb38 100644 --- a/src/julia.h +++ b/src/julia.h @@ -850,7 +850,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index e677f40907dfd..d5013601a9124 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1052,7 +1052,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern _Atomic(uint32_t) jl_gc_disable_counter; +extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal.
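/*
 * A minimal sketch (hypothetical helper, not part of the patch) showing how
 * the TLS layout above is consumed: allocation fastpaths reach the embedded
 * MMTkMutatorContext directly through ptls, mirroring jl_mmtk_gc_alloc_default
 * in gc-mmtk.c.
 */
static inline void *example_pool_alloc(jl_ptls_t ptls, size_t sz)
{
    // assumes mmtk_align_alloc_sz and mmtk_immix_alloc_fast from gc-mmtk.c
    size_t allocsz = mmtk_align_alloc_sz(sz + sizeof(jl_taggedvalue_t));
    jl_taggedvalue_t *v_tagged = (jl_taggedvalue_t *)mmtk_immix_alloc_fast(
        &ptls->gc_tls.mmtk_mutator, allocsz, 16, sizeof(jl_taggedvalue_t));
    return jl_valueof(v_tagged);
}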
diff --git a/src/julia_threads.h b/src/julia_threads.h index b697a0bf030ed..641c50386c555 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -4,7 +4,11 @@ #ifndef JL_THREADS_H #define JL_THREADS_H +#ifndef MMTK_GC #include "gc-tls.h" +#else +#include "gc-tls-mmtk.h" +#endif #include "julia_atomics.h" #ifndef _OS_WINDOWS_ #include "pthread.h" diff --git a/src/stackwalk.c b/src/stackwalk.c index a1de3a6d61a07..e6fc2c7bbf56a 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1294,6 +1294,8 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; +extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; +extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT diff --git a/src/staticdata.c b/src/staticdata.c index 363aa46b62221..e07a5365bf06f 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -654,6 +654,7 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -3899,6 +3900,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); diff --git a/src/threading.c b/src/threading.c index 44b1192528531..df62ea107bf04 100644 --- a/src/threading.c +++ b/src/threading.c @@ -743,6 +743,10 @@ void jl_init_threading(void) } int16_t ngcthreads = jl_n_markthreads + jl_n_sweepthreads; +#ifdef MMTK_GC + ngcthreads = 0; +#endif + jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads; jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); jl_n_threads_per_pool[0] = nthreadsi; From b488bbeb22847c3740459d015878368587ecb847 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 28 Aug 2024 00:44:09 +0000 Subject: [PATCH 09/38] Refactoring to be considered before adding MMTk --- src/gc-interface.h | 4 +++- src/gc-stock.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/gc-interface.h b/src/gc-interface.h index 72a57f4944156..b1f3ab9d6908d 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -192,7 +192,9 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; -// FIXME: add description here +// This function notifies the GC about memory addresses that are set when loading the boot image. +// The GC may use that information to, for instance, determine that such objects should +// be treated as marked and belong to the old generation in nursery collections.
void jl_gc_notify_image_load(const char* img_data, size_t len); + // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stock.c b/src/gc-stock.c index 164d3067a31de..019ae481ce189 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3888,8 +3888,11 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> c48a701f54 (WIP: Adding support for MMTk/Immix) +======= +>>>>>>> 0aee3ba32a (Refactoring to be considered before adding MMTk) JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); From a4cf8e7c754fc72c9612750ccce65b87eaeb720b Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 29 Aug 2024 05:37:53 +0000 Subject: [PATCH 10/38] Adding fastpath allocation --- src/llvm-gc-interface-passes.h | 5 ++ src/llvm-late-gc-lowering.cpp | 139 +++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h index d33567e887118..ed6b94dcdc3fc 100644 --- a/src/llvm-gc-interface-passes.h +++ b/src/llvm-gc-interface-passes.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -328,6 +329,7 @@ struct LateLowerGCFrame: private JuliaPassContext { private: CallInst *pgcstack; + Function *smallAllocFunc; void MaybeNoteDef(State &S, BBState &BBS, Value *Def, const ArrayRef &SafepointsSoFar, SmallVector &&RefinedPtr = SmallVector()); @@ -365,6 +367,9 @@ struct LateLowerGCFrame: private JuliaPassContext { void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef CalleeRoots); Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V); Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V); +#ifdef MMTK_GC + Value* lowerGCAllocBytesLate(CallInst *target, Function &F); +#endif }; // The final GC lowering pass. This pass lowers platform-agnostic GC diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 1d390a5115207..d395771f6df0c 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2452,8 +2452,122 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl &Colors, St } } +#ifdef MMTK_GC +Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) +{ + assert(target->arg_size() == 3); + + IRBuilder<> builder(target); + auto ptls = target->getArgOperand(0); + auto type = target->getArgOperand(2); + if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) { + size_t sz = (size_t)CI->getZExtValue(); + // This is strongly architecture and OS dependent + int osize; + int offset = jl_gc_classify_pools(sz, &osize); + if (offset >= 0) { + // In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc, + // we do a slowpath/fastpath check and lower it only on the slowpath, returning + // the cursor and updating it in the fastpath. + auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); + auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize); + + // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk. + // Setting this to false will increase allocation overhead a lot, and should only be used for debugging. + const bool INLINE_FASTPATH_ALLOCATION = true; + + if (INLINE_FASTPATH_ALLOCATION) { + // Assuming we use the first immix allocator. + // FIXME: We should get the allocator index and type from MMTk.
+ auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix); + + auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor)); + auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit)); + + auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); + auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr"); + auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor"); + + // offset = 8 + auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); + auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); + auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); + // alignment 16 (15 = 16 - 1) + auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); + auto result = builder.CreateNSWAdd(cursor, delta, "result"); + + auto new_cursor = builder.CreateNSWAdd(result, pool_osize); + + auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos); + auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr"); + auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit"); + + auto gt_limit = builder.CreateICmpSGT(new_cursor, limit); + + auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction()); + auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction()); + + auto next_instr = target->getNextNode(); + SmallVector<uint32_t, 2> Weights{1, 9}; + + MDBuilder MDB(F.getContext()); + SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights)); + + builder.SetInsertPoint(next_instr); + auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow"); + + // slowpath + builder.SetInsertPoint(slowpath); + auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); + auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type }); + new_call->setAttributes(new_call->getCalledFunction()->getAttributes()); + builder.CreateBr(next_instr->getParent()); + + // fastpath + builder.SetInsertPoint(fastpath); + builder.CreateStore(new_cursor, cursor_ptr); + + // ptls->gc_tls.gc_num.allocd += osize; + auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num)); + auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); + auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); + auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); + auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); + builder.CreateStore(pool_allocd_total, pool_alloc_tls); + + auto v_raw = builder.CreateNSWAdd(result,
ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t))); + auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType()); + builder.CreateBr(next_instr->getParent()); + + phiNode->addIncoming(new_call, slowpath); + phiNode->addIncoming(v_as_ptr, fastpath); + phiNode->takeName(target); + return phiNode; + } + } + } + return target; +} + +template <typename TIterator> +static void replaceInstruction( + Instruction *oldInstruction, + Value *newInstruction, + TIterator &it) +{ + if (newInstruction != oldInstruction) { + oldInstruction->replaceAllUsesWith(newInstruction); + it = oldInstruction->eraseFromParent(); + } + else { + ++it; + } +} +#endif + bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { initAll(*F.getParent()); + smallAllocFunc = getOrDeclare(jl_well_known::GCSmallAlloc); LLVM_DEBUG(dbgs() << "GC ROOT PLACEMENT: Processing function " << F.getName() << "\n"); if (!pgcstack_getter && !adoptthread_func) return CleanupIR(F, nullptr, CFGModified); @@ -2468,6 +2582,31 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { std::map> CallFrames; // = OptimizeCallFrames(S, Ordering); PlaceRootsAndUpdateCalls(Colors, S, CallFrames); CleanupIR(F, &S, CFGModified); + +#ifdef MMTK_GC + // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk + for (BasicBlock &BB : F) { + for (auto it = BB.begin(); it != BB.end();) { + auto *CI = dyn_cast<CallInst>(&*it); + if (!CI) { + ++it; + continue; + } + + Value *callee = CI->getCalledOperand(); + assert(callee); + + auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes); + if (GCAllocBytes == callee) { + *CFGModified = true; + replaceInstruction(CI, lowerGCAllocBytesLate(CI, F), it); + continue; + } + ++it; + } + } +#endif + return true; } From ecb675a597ab3dcd57fc053c995252618b6b0edd Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 29 Aug 2024 05:51:26 +0000 Subject: [PATCH 11/38] Fixing removed newlines --- src/gc-debug.c | 1 + src/gc-heap-snapshot.cpp | 1 + src/gc-page-profiler.c | 1 + src/gc-pages.c | 1 + src/gc-stock.c | 7 +++++++ 5 files changed, 11 insertions(+) diff --git a/src/gc-debug.c b/src/gc-debug.c index ecd7f2328cada..2c8e1c6055414 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1,4 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license + #ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" diff --git a/src/gc-heap-snapshot.cpp b/src/gc-heap-snapshot.cpp index d3cb1e98d84a4..fcda11dad4f8a 100644 --- a/src/gc-heap-snapshot.cpp +++ b/src/gc-heap-snapshot.cpp @@ -1,4 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license + #include "gc-heap-snapshot.h" #include "julia.h" diff --git a/src/gc-page-profiler.c b/src/gc-page-profiler.c index bfd1c74247df8..e5c6b91978731 100644 --- a/src/gc-page-profiler.c +++ b/src/gc-page-profiler.c @@ -1,4 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license + #ifndef MMTK_GC #include "gc-page-profiler.h" #include "julia.h" diff --git a/src/gc-pages.c b/src/gc-pages.c index ed6e0ed20ba1c..976fc461d5b95 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -1,4 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license + #ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" diff --git a/src/gc-stock.c b/src/gc-stock.c index 019ae481ce189..05f2f5930448c 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -1,4 +1,5 @@ // This file is a part of Julia.
License is MIT: https://julialang.org/license + #ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" @@ -405,6 +406,7 @@ static void sweep_weak_refs(void) } } + STATIC_INLINE void jl_batch_accum_heap_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc) + sz; @@ -452,6 +454,7 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) return jl_valueof(&v->header); } + // Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz, jl_value_t *type) { @@ -3886,6 +3889,7 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) return sizeof(bigval_t); } +<<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD @@ -3893,6 +3897,9 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) >>>>>>> c48a701f54 (WIP: Adding support for MMTk/Immix) ======= >>>>>>> 0aee3ba32a (Refactoring to be considered before adding MMTk) +======= + +>>>>>>> 30ac6f081d (Fixing removed newlines) JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); From 77db2039905d73c9d6a30bef583d7ad15aea9ca1 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 28 Aug 2024 00:44:09 +0000 Subject: [PATCH 12/38] Refactoring to be considered before adding MMTk --- src/gc-stock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gc-stock.c b/src/gc-stock.c index 05f2f5930448c..5fd3b7efafead 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3892,6 +3892,7 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD ======= >>>>>>> c48a701f54 (WIP: Adding support for MMTk/Immix) @@ -3900,6 +3901,8 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) ======= >>>>>>> 30ac6f081d (Fixing removed newlines) +======= +>>>>>>> 2efcdf8335 (Refactoring to be considered before adding MMTk) JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); From c5d3a40880cc08014ec6347372ea35c3249f8709 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 2 Sep 2024 06:07:02 +0000 Subject: [PATCH 13/38] Adding a few comments; Moving some functions to be closer together --- src/gc-common.c | 70 ----------- src/gc-mmtk.c | 311 ++++++++++++++-------------------------------- src/gc-tls-mmtk.h | 2 + 3 files changed, 94 insertions(+), 289 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 17f6f1330743b..417f12f26d64d 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -705,76 +705,6 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } -size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT -{ - const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; - size_t sz = layout->size * m->length; - if (layout->flags.arrayelem_isunion) - // account for isbits Union array selector bytes - sz += m->length; - return sz; -} - -// tracking Memorys with malloc'd storage -void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ - // This is **NOT** a GC safe point. 
- mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { - ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); - } - else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; - } - ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; -} - -int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; -} - -JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { - return gc_logging_enabled; -} - -// gc-debug common functions -// --- - -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT -{ - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; -} - -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT -{ - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; -} - #ifdef __cplusplus } #endif diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c index e459b0f12c41d..98a5612871be0 100644 --- a/src/gc-mmtk.c +++ b/src/gc-mmtk.c @@ -10,9 +10,10 @@ extern "C" { #endif -// For now we're using the same values as stock-gc. However -// for the heap size we use 70% of the free memory available -// since that is actually a hard limit in MMTk. +// FIXME: Should the values below be shared between both GC's? +// Note that MMTk uses a hard max heap limit, which is set by default +// as 70% of the free available memory. The min heap is set as the +// default_collect_interval variable below. // max_total_memory is a suggestion. We try very hard to stay // under this limit, but we will go above it rather than halting. @@ -33,7 +34,6 @@ static memsize_t max_total_memory = (memsize_t) MAX32HEAP; void jl_gc_init(void) { // TODO: use jl_options.heap_size_hint to set MMTk's fixed heap size? (see issue: https://github.com/mmtk/mmtk-julia/issues/167) - JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock"); arraylist_new(&to_finalize, 0); @@ -105,10 +105,6 @@ void jl_gc_init(void) { void jl_start_gc_threads(void) { jl_ptls_t ptls = jl_current_task->ptls; mmtk_initialize_collection((void *)ptls); - // int nthreads = jl_atomic_load_relaxed(&jl_n_threads); - // int ngcthreads = jl_n_gcthreads; - // int nmutator_threads = nthreads - ngcthreads; - // printf("nthreads = %d, ngcthreads = %d, nmutator_threads = %d\n", nthreads, ngcthreads, nmutator_threads); } void jl_init_thread_heap(struct _jl_tls_states_t *ptls) JL_NOTSAFEPOINT { @@ -135,38 +131,31 @@ void jl_free_thread_gc_state(struct _jl_tls_states_t *ptls) { mmtk_destroy_mutator(&ptls->gc_tls.mmtk_mutator); } -// FIXME: mmtk uses the same code as stock to enable/disable the GC -// Should this be moved to gc-common.c? 
-
-_Atomic(uint32_t) jl_gc_disable_counter = 1;
-
-JL_DLLEXPORT int jl_gc_enable(int on) {
-    jl_ptls_t ptls = jl_current_task->ptls;
-    int prev = !ptls->disable_gc;
-    ptls->disable_gc = (on == 0);
-    if (on && !prev) {
-        // disable -> enable
-        if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) {
-            gc_num.allocd += gc_num.deferred_alloc;
-            gc_num.deferred_alloc = 0;
-        }
-    }
-    else if (prev && !on) {
-        // enable -> disable
-        jl_atomic_fetch_add(&jl_gc_disable_counter, 1);
-        // check if the GC is running and wait for it to finish
-        jl_gc_safepoint_(ptls);
-    }
-    return prev;
+JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) {
+    // MMTk currently does not allow setting the heap size at runtime
 }
 
-JL_DLLEXPORT int jl_gc_is_enabled(void) {
-    jl_ptls_t ptls = jl_current_task->ptls;
-    return !ptls->disable_gc;
+
+inline void maybe_collect(jl_ptls_t ptls)
+{
+    // Just do a safe point for general maybe_collect
+    jl_gc_safepoint_(ptls);
 }
 
-JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) {
-    // MMTk currently does not allow setting the heap size at runtime
+// This is only used for malloc. We need to know if we need to do GC. However, repeatedly
+// checking with MMTk (mmtk_gc_poll) is expensive, so we only check once every few allocations.
+static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz)
+{
+    // We do not need to maintain malloc_sz_since_last_poll precisely. We just need to
+    // avoid calling mmtk_gc_poll too frequently while keeping our heap usage estimate
+    // as accurate as we can.
+    if (ptls->gc_tls.malloc_sz_since_last_poll > 4096) {
+        jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0);
+        mmtk_gc_poll(ptls);
+    } else {
+        jl_atomic_fetch_add_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, sz);
+        jl_gc_safepoint_(ptls);
+    }
 }
 
 JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) {
@@ -182,7 +171,12 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) {
     mmtk_handle_user_collection_request(ptls, collection);
 }
 
-// same as above, some of these are identical to the implementation in gc stock
+// FIXME: The functions combine_thread_gc_counts and reset_thread_gc_counts
+// are currently nearly identical for MMTk and for stock. However, the stats
+// are likely different (e.g., MMTk doesn't track the bytes allocated in the fastpath,
+// but only when the slowpath is called). We might need to adapt these later so that
+// the statistics are the same or as close as possible for each GC.
+
 static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTSAFEPOINT
 {
     int gc_n_threads;
@@ -228,31 +222,6 @@ void reset_thread_gc_counts(void) JL_NOTSAFEPOINT
     }
 }
 
-// weak references
-// ---
-JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value)
-{
-    jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type);
-    wr->value = value; // NOTE: wb not needed here
-    mmtk_add_weak_candidate(wr);
-    return wr;
-}
-
-
-// allocation
-int jl_gc_classify_pools(size_t sz, int *osize)
-{
-    if (sz > GC_MAX_SZCLASS)
-        return -1; // call big alloc function
-    size_t allocsz = sz + sizeof(jl_taggedvalue_t);
-    *osize = LLT_ALIGN(allocsz, 16);
-    return 0; // use MMTk's fastpath logic
-}
-
-int64_t last_gc_total_bytes = 0;
-int64_t last_live_bytes = 0; // live_bytes at last collection
-int64_t live_bytes = 0;
-
 // Retrieves Julia's `GC_Num` (structure that stores GC statistics).
JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { jl_gc_num_t num = gc_num; @@ -260,6 +229,10 @@ JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { return num; } +int64_t last_gc_total_bytes = 0; +int64_t last_live_bytes = 0; // live_bytes at last collection +int64_t live_bytes = 0; + JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) JL_NOTSAFEPOINT { int64_t oldtb = last_gc_total_bytes; int64_t newtb; @@ -325,82 +298,38 @@ JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void) return max_total_memory; } +// weak references +// --- +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value) +{ + jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); + wr->value = value; // NOTE: wb not needed here + mmtk_add_weak_candidate(wr); + return wr; +} + +// allocation + extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr); extern void* mmtk_alloc(void* mutator, size_t size, size_t align, size_t offset, int allocator); extern void mmtk_post_alloc(void* mutator, void* refer, size_t bytes, int allocator); - - extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS; extern const void* MMTK_SIDE_VO_BIT_BASE_ADDRESS; - -// These need to be constants. - -#define MMTK_OBJECT_BARRIER (1) -// Stickyimmix needs write barrier. Immix does not need write barrier. -#ifdef MMTK_PLAN_IMMIX -#define MMTK_NEEDS_WRITE_BARRIER (0) -#endif -#ifdef MMTK_PLAN_STICKYIMMIX -#define MMTK_NEEDS_WRITE_BARRIER (1) -#endif - -#ifdef MMTK_CONSERVATIVE_SCAN -#define MMTK_NEEDS_VO_BIT (1) -#else -#define MMTK_NEEDS_VO_BIT (0) -#endif +extern void mmtk_store_obj_size_c(void* obj, size_t size); #define MMTK_DEFAULT_IMMIX_ALLOCATOR (0) #define MMTK_IMMORTAL_BUMP_ALLOCATOR (0) -// Directly call into MMTk for write barrier (debugging only) -inline void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT -{ - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - mmtk_object_reference_write_post(&ptls->gc_tls.mmtk_mutator, parent, ptr); -} - -// Fastpath. Return 1 if we should go to slowpath -inline int mmtk_gc_wb_fast_check(const void *parent, const void *ptr) JL_NOTSAFEPOINT -{ - if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { - intptr_t addr = (intptr_t) (void*) parent; - uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); - intptr_t shift = (addr >> 3) & 0b111; - uint8_t byte_val = *meta_addr; - return ((byte_val >> shift) & 1) == 1; - } else { - return 0; - } -} - -// Slowpath. 
-inline void mmtk_gc_wb_slow(const void *parent, const void *ptr) JL_NOTSAFEPOINT -{ - if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - mmtk_object_reference_write_slow(&ptls->gc_tls.mmtk_mutator, parent, ptr); - } -} -inline void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT -{ - if (mmtk_gc_wb_fast_check(parent, ptr)) { - mmtk_gc_wb_slow(parent, ptr); - } -} - -inline void mmtk_gc_wb_binding(const void *bnd, const void *val) JL_NOTSAFEPOINT +int jl_gc_classify_pools(size_t sz, int *osize) { - if (mmtk_gc_wb_fast_check(bnd, val)) { - jl_astaggedvalue(bnd)->bits.gc = 2; // to indicate that the buffer is a binding - mmtk_gc_wb_slow(bnd, val); - } + if (sz > GC_MAX_SZCLASS) + return -1; // call big alloc function + size_t allocsz = sz + sizeof(jl_taggedvalue_t); + *osize = LLT_ALIGN(allocsz, 16); + return 0; // use MMTk's fastpath logic } - #define MMTK_MIN_ALIGNMENT 4 // MMTk assumes allocation size is aligned to min alignment. inline size_t mmtk_align_alloc_sz(size_t sz) JL_NOTSAFEPOINT @@ -429,19 +358,9 @@ inline void mmtk_immix_post_alloc_slow(MMTkMutatorContext* mutator, void* obj, s mmtk_post_alloc(mutator, obj, size, 0); } -inline void mmtk_set_vo_bit(void* obj) { - intptr_t addr = (intptr_t) obj; - intptr_t shift = (addr >> 3) & 0b111; - uint8_t* vo_meta_addr = (uint8_t*) (MMTK_SIDE_VO_BIT_BASE_ADDRESS) + (addr >> 6); - uint8_t new_val = (*vo_meta_addr) | (1 << shift); - (*vo_meta_addr) = new_val; -} - inline void mmtk_immix_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { - if (MMTK_NEEDS_VO_BIT) { - // set VO bit - mmtk_set_vo_bit(obj); - } + // FIXME: for now, we do nothing + // but when supporting moving, this is where we set the valid object (VO) bit } inline void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { @@ -450,79 +369,12 @@ inline void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t size, } inline void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { - if (MMTK_NEEDS_VO_BIT) { - // set VO bit - mmtk_set_vo_bit(obj); - } - - if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { - intptr_t addr = (intptr_t) obj; - intptr_t shift = (addr >> 3) & 0b111; - uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); - while(1) { - uint8_t old_val = *meta_addr; - uint8_t new_val = old_val | (1 << shift); - if (jl_atomic_cmpswap((_Atomic(uint8_t)*)meta_addr, &old_val, new_val)) { - break; - } - } - } -} - -// mutex for page profile -uv_mutex_t page_profile_lock; - -JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) -{ - uv_mutex_lock(&page_profile_lock); - const char *str = "Page profiler in unsupported in MMTk."; - ios_write(stream, str, strlen(str)); - uv_mutex_unlock(&page_profile_lock); -} - -// this seems to be needed by the gc tests -#define JL_GC_N_MAX_POOLS 51 -JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS]; - -STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT -{ - // FIXME: MMTk would have to provide its own stats -} - -#define MMTK_GC_PAGE_SZ (1 << 12) // MMTk's page size is defined in mmtk-core constants - -JL_DLLEXPORT uint64_t jl_get_pg_size(void) -{ - return MMTK_GC_PAGE_SZ; -} - - -extern void mmtk_store_obj_size_c(void* obj, size_t size); - -inline void maybe_collect(jl_ptls_t ptls) -{ - // Just do a safe point for general maybe_collect - jl_gc_safepoint_(ptls); -} - -// This is only 
used for malloc. We need to know if we need to do GC. However, keeping checking with MMTk (mmtk_gc_poll), -// is expensive. So we only check for every few allocations. -static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz) -{ - // We do not need to carefully maintain malloc_sz_since_last_poll. We just need to - // avoid using mmtk_gc_poll too frequently, and try to be precise on our heap usage - // as much as we can. - if (ptls->gc_tls.malloc_sz_since_last_poll > 4096) { - jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0); - mmtk_gc_poll(ptls); - } else { - jl_atomic_fetch_add_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, sz); - jl_gc_safepoint_(ptls); - } + // FIXME: Similarly, for now, we do nothing + // but when supporting moving, this is where we set the valid object (VO) bit + // and log (old gen) bit } // allocation wrappers that track allocation and let collection run - JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); @@ -601,7 +453,6 @@ jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT return jl_valueof(o); } - JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, size_t align, void *ty) { // safepoint @@ -628,11 +479,6 @@ JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, siz return v; } -void jl_gc_notify_image_load(const char* img_data, size_t len) -{ - mmtk_set_vm_space((void*)img_data, len); -} - JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t sz) { // safepoint @@ -735,6 +581,38 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) return b; } +void jl_gc_notify_image_load(const char* img_data, size_t len) +{ + mmtk_set_vm_space((void*)img_data, len); +} + +// mutex for page profile +uv_mutex_t page_profile_lock; + +JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) +{ + uv_mutex_lock(&page_profile_lock); + const char *str = "Page profiler in unsupported in MMTk."; + ios_write(stream, str, strlen(str)); + uv_mutex_unlock(&page_profile_lock); +} + +// this seems to be needed by the gc tests +#define JL_GC_N_MAX_POOLS 51 +JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS]; + +STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT +{ + // FIXME: MMTk would have to provide its own stats +} + +#define MMTK_GC_PAGE_SZ (1 << 12) // MMTk's page size is defined in mmtk-core constants + +JL_DLLEXPORT uint64_t jl_get_pg_size(void) +{ + return MMTK_GC_PAGE_SZ; +} + // Not used by mmtk // Number of GC threads that may run parallel marking int jl_n_markthreads; @@ -791,12 +669,7 @@ void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT { } -int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return 0; -} - -int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT { return 0; } diff --git a/src/gc-tls-mmtk.h b/src/gc-tls-mmtk.h index 2eb5f2a6a44d9..64a1bae192445 100644 --- a/src/gc-tls-mmtk.h +++ b/src/gc-tls-mmtk.h @@ -9,6 +9,8 @@ extern "C" { #endif +// This mostly remove some fields that are not used by MMTk + typedef struct { // variable for tracking weak references small_arraylist_t weak_refs; From c26632ed5d2be1effebe86bfa5ca844195933095 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 01:20:29 +0000 Subject: [PATCH 14/38] Fixing merge conflicts --- src/gc-stock.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/gc-stock.c b/src/gc-stock.c index 5fd3b7efafead..078635f18e3ce 
100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3889,20 +3889,6 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) return sizeof(bigval_t); } -<<<<<<< HEAD -<<<<<<< HEAD - -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> c48a701f54 (WIP: Adding support for MMTk/Immix) -======= ->>>>>>> 0aee3ba32a (Refactoring to be considered before adding MMTk) -======= - ->>>>>>> 30ac6f081d (Fixing removed newlines) -======= ->>>>>>> 2efcdf8335 (Refactoring to be considered before adding MMTk) JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); From c283442edf340d882b13c9ec887a6d9bd44b2527 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 01:24:44 +0000 Subject: [PATCH 15/38] Applying changes from refactoring before adding MMTk --- src/gc-stock.h | 16 ++++++++++++++++ src/julia.h | 2 +- src/julia_internal.h | 2 +- src/stackwalk.c | 2 -- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/gc-stock.h b/src/gc-stock.h index 8e563f32ab9d3..6f75dcd014176 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -423,6 +423,21 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } +STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); @@ -699,4 +714,5 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect #endif + #endif // !MMTK_GC diff --git a/src/julia.h b/src/julia.h index db57db1fbeb38..abb8a57ff13b0 100644 --- a/src/julia.h +++ b/src/julia.h @@ -850,7 +850,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index d5013601a9124..e677f40907dfd 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1052,7 +1052,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; +extern _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. 
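[Editor's note: the gc-stock.h hunk above turns gc_is_parallel_collector_thread and gc_is_concurrent_collector_thread into static inlines. As a reading aid, here is a minimal, self-contained sketch of the thread-id partitioning those predicates encode. The three globals mirror gc_first_tid, jl_n_markthreads and jl_n_sweepthreads from the patch; the concrete values and the main() harness are invented for illustration, and the body of gc_last_parallel_collector_thread_id is an assumption, since it is referenced but not shown in this hunk.]

    #include <assert.h>

    static int gc_first_tid = 4;      // id of the first parallel GC thread (illustrative)
    static int jl_n_markthreads = 2;  // number of parallel mark threads (illustrative)
    static int jl_n_sweepthreads = 1; // 0 or 1 concurrent sweep thread (illustrative)

    // assumed definition: the parallel markers occupy a contiguous id range
    static int gc_last_parallel_collector_thread_id(void)
    {
        return gc_first_tid + jl_n_markthreads - 1;
    }

    // ids [0, gc_first_tid) are mutators; [gc_first_tid, last] are parallel
    // markers; last+1 (only if jl_n_sweepthreads != 0) is the concurrent sweeper
    static int is_parallel(int tid)
    {
        return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id();
    }

    static int is_concurrent(int tid)
    {
        return jl_n_sweepthreads != 0 &&
               tid == gc_last_parallel_collector_thread_id() + 1;
    }

    int main(void)
    {
        assert(!is_parallel(3) && is_parallel(4) && is_parallel(5) && !is_parallel(6));
        assert(!is_concurrent(5) && is_concurrent(6));
        return 0;
    }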
diff --git a/src/stackwalk.c b/src/stackwalk.c index e6fc2c7bbf56a..a1de3a6d61a07 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1294,8 +1294,6 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; -extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; -extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT From 01aa62331858a7810efbcf5857edfda990a93e72 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 28 Aug 2024 00:44:09 +0000 Subject: [PATCH 16/38] Refactoring to be considered before adding MMTk --- src/gc-common.c | 156 +++++++++++++++++++++++++++++++++++++++++++ src/gc-common.h | 6 ++ src/gc-debug.c | 41 +----------- src/gc-interface.h | 12 ++++ src/gc-stacks.c | 4 +- src/gc-stock.c | 156 ++++++++++++------------------------------- src/gc-stock.h | 21 ------ src/julia.h | 2 +- src/julia_internal.h | 26 +------- src/scheduler.c | 11 +++ src/stackwalk.c | 4 +- src/staticdata.c | 2 + 12 files changed, 237 insertions(+), 204 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index ee461b576ea9e..2ec167caa667a 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -20,6 +20,11 @@ extern "C" { jl_gc_num_t gc_num = {0}; +JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) +{ + return gc_num.total_time; +} + // =========================================================================== // // GC Callbacks // =========================================================================== // @@ -489,6 +494,87 @@ jl_ptls_t* gc_all_tls_states; // MISC // =========================================================================== // +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return jl_gc_new_weakref_th(ptls, value); +} + +JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc(ptls, sz, ty); +} + +JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return jl_gc_alloc(ptls, sz, NULL); +} + +// allocation wrappers that save the size of allocations, to allow using +// jl_gc_counted_* functions with a libc-compatible API. 
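+// (editor's note, added comment) Layout used by all the wrappers below:
+//     [int64 size][int64 padding][payload ...]
+// i.e. the payload is offset by JL_SMALL_BYTE_ALIGNMENT (16) bytes from the
+// raw allocation, which is why `p + 2` / `p - 2` (two 8-byte words) appear
+// throughout.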
+ +JL_DLLEXPORT void *jl_malloc(size_t sz) +{ + int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); + if (p == NULL) + return NULL; + p[0] = sz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +//_unchecked_calloc does not check for potential overflow of nm*sz +STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { + size_t nmsz = nm*sz; + int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); + if (p == NULL) + return NULL; + p[0] = nmsz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) +{ + if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) + return NULL; + return _unchecked_calloc(nm, sz); +} + +JL_DLLEXPORT void jl_free(void *p) +{ + if (p != NULL) { + int64_t *pp = (int64_t *)p - 2; + size_t sz = pp[0]; + jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); + } +} + +JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) +{ + int64_t *pp; + size_t szold; + if (p == NULL) { + pp = NULL; + szold = 0; + } + else { + pp = (int64_t *)p - 2; + szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; + } + int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); + if (pnew == NULL) + return NULL; + pnew[0] = sz; + return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +// allocator entry points + +JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc_(ptls, sz, ty); +} + const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { @@ -501,6 +587,76 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } +size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT +{ + const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; + size_t sz = layout->size * m->length; + if (layout->flags.arrayelem_isunion) + // account for isbits Union array selector bytes + sz += m->length; + return sz; +} + +// tracking Memorys with malloc'd storage +void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ + // This is **NOT** a GC safe point. 
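+    // (editor's note, added comment) Tracking nodes are plain malloc'd cells:
+    // reuse one from the per-thread freelist when available, otherwise allocate
+    // a fresh one. The low bit of the stored pointer (set below from `isaligned`)
+    // records whether `m` points to aligned memory.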
+ mallocmemory_t *ma; + if (ptls->gc_tls.heap.mafreelist == NULL) { + ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); + } + else { + ma = ptls->gc_tls.heap.mafreelist; + ptls->gc_tls.heap.mafreelist = ma->next; + } + ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); + ma->next = ptls->gc_tls.heap.mallocarrays; + ptls->gc_tls.heap.mallocarrays = ma; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + +// gc-debug common functions +// --- + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + #ifdef __cplusplus } #endif diff --git a/src/gc-common.h b/src/gc-common.h index 4d53830442a7d..154b9659e9ccb 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -53,6 +53,12 @@ extern jl_gc_callback_list_t *gc_cblist_notify_gc_pressure; // malloc wrappers, aligned allocation // =========================================================================== // +// data structure for tracking malloc'd genericmemory. 
+typedef struct _mallocmemory_t {
+    jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory
+    struct _mallocmemory_t *next;
+} mallocmemory_t;
+
 #if defined(_OS_WINDOWS_)
 STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align)
 {
diff --git a/src/gc-debug.c b/src/gc-debug.c
index 19dd93af5f236..d05fb4b49e9f7 100644
--- a/src/gc-debug.c
+++ b/src/gc-debug.c
@@ -1105,46 +1105,7 @@ void gc_count_pool(void)
     jl_safe_printf("************************\n");
 }
 
-int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT
-{
-    int nf = (int)jl_datatype_nfields(vt);
-    for (int i = 1; i < nf; i++) {
-        if (slot < (void*)((char*)obj + jl_field_offset(vt, i)))
-            return i - 1;
-    }
-    return nf - 1;
-}
-
-int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT
-{
-    char *slot = (char*)_slot;
-    jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj);
-    char *start = NULL;
-    size_t len = 0;
-    size_t elsize = sizeof(void*);
-    if (vt == jl_module_type) {
-        jl_module_t *m = (jl_module_t*)obj;
-        start = (char*)m->usings.items;
-        len = m->usings.len;
-    }
-    else if (vt == jl_simplevector_type) {
-        start = (char*)jl_svec_data(obj);
-        len = jl_svec_len(obj);
-    }
-    if (slot < start || slot >= start + elsize * len)
-        return -1;
-    return (slot - start) / elsize;
-}
-
-static int gc_logging_enabled = 0;
-
-JL_DLLEXPORT void jl_enable_gc_logging(int enable) {
-    gc_logging_enabled = enable;
-}
-
-JL_DLLEXPORT int jl_is_gc_logging_enabled(void) {
-    return gc_logging_enabled;
-}
+extern int gc_logging_enabled;
 
 void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT {
     if (!gc_logging_enabled) {
         return;
diff --git a/src/gc-interface.h b/src/gc-interface.h
index e543b4b5879f1..682f22344d69d 100644
--- a/src/gc-interface.h
+++ b/src/gc-interface.h
@@ -128,6 +128,13 @@ JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void);
 // Allocation
 // ========================================================================= //
 
+// On GCC, this function is inlined when sz is constant (see julia_internal.h)
+// In general, this function should implement allocation and should use the specific GC's logic
+// to decide whether to allocate a small or a large object. Finally, note that this function
+// **must** also set the type of the returned object to be `ty`. The type `ty` may also be used to record
+// an allocation of that type in the allocation profiler.
+struct _jl_value_t *jl_gc_alloc_(struct _jl_tls_states_t * ptls, size_t sz, void *ty);
+
 // Allocates small objects and increments Julia allocation counters. Size of the object
 // header must be included in the object size. The (possibly unused in some implementations)
 // offset to the arena in which we're allocating is passed in the second parameter, and the
@@ -211,6 +218,11 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align,
 // object being allocated and will be used to set the object header.
 struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT;
 
+// This function notifies the GC about memory addresses that are set when loading the boot image.
+// The GC may use that information to, for instance, determine that such objects should
+// be treated as marked and as belonging to the old generation in nursery collections.
+void jl_gc_notify_image_load(const char* img_data, size_t len); + // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 783129ea97693..8c44b65284386 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -46,7 +46,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { VirtualFree(stkbuf, 0, MEM_RELEASE); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); @@ -81,7 +81,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT return stk; } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { munmap(stkbuf, bufsz); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); diff --git a/src/gc-stock.c b/src/gc-stock.c index 6b97881909bbd..6ebac8a0c079e 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -555,24 +555,6 @@ static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT gc_time_big_end(); } -// tracking Memorys with malloc'd storage - -void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ - // This is **NOT** a GC safe point. - mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { - ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); - } - else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; - } - ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; -} - - void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; @@ -649,17 +631,6 @@ void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT reset_thread_gc_counts(); } -size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT -{ - const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; - size_t sz = layout->size * m->length; - if (layout->flags.arrayelem_isunion) - // account for isbits Union array selector bytes - sz += m->length; - return sz; -} - - static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT { assert(jl_is_genericmemory(v)); @@ -818,6 +789,29 @@ jl_value_t *jl_gc_small_alloc_noinline(jl_ptls_t ptls, int offset, int osize) { return jl_gc_small_alloc_inner(ptls, offset, osize); } +// Size does NOT include the type tag!! +inline jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) +{ + jl_value_t *v; + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + if (sz <= GC_MAX_SZCLASS) { + int pool_id = jl_gc_szclass(allocsz); + jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; + int osize = jl_gc_sizeclasses[pool_id]; + // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in + // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) 
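+    // (editor's note, added comment) The second argument is the pool's byte
+    // offset inside the thread-local state, which lets the callee recover the
+    // pool pointer from `ptls` without an extra parameter.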
+ v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); + } + else { + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + v = jl_gc_big_alloc_noinline(ptls, allocsz); + } + jl_set_typeof(v, ty); + maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); + return v; +} + int jl_gc_classify_pools(size_t sz, int *osize) { if (sz > GC_MAX_SZCLASS) @@ -2794,6 +2788,21 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } +int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + // collector entry point and control _Atomic(uint32_t) jl_gc_disable_counter = 1; @@ -2832,11 +2841,6 @@ JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT *bytes = (num.total_allocd + num.deferred_alloc + num.allocd); } -JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) -{ - return gc_num.total_time; -} - JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { jl_gc_num_t num = gc_num; @@ -3397,13 +3401,6 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) gc_mark_roots(mq); } -// allocator entry points - -JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) -{ - return jl_gc_alloc_(ptls, sz, ty); -} - // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { @@ -3685,63 +3682,6 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size return data; } -// allocation wrappers that save the size of allocations, to allow using -// jl_gc_counted_* functions with a libc-compatible API. 
- -JL_DLLEXPORT void *jl_malloc(size_t sz) -{ - int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); - if (p == NULL) - return NULL; - p[0] = sz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -//_unchecked_calloc does not check for potential overflow of nm*sz -STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { - size_t nmsz = nm*sz; - int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); - if (p == NULL) - return NULL; - p[0] = nmsz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) -{ - if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) - return NULL; - return _unchecked_calloc(nm, sz); -} - -JL_DLLEXPORT void jl_free(void *p) -{ - if (p != NULL) { - int64_t *pp = (int64_t *)p - 2; - size_t sz = pp[0]; - jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); - } -} - -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) -{ - int64_t *pp; - size_t szold; - if (p == NULL) { - pp = NULL; - szold = 0; - } - else { - pp = (int64_t *)p - 2; - szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; - } - int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); - if (pnew == NULL) - return NULL; - pnew[0] = sz; - return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - // allocating blocks for Arrays and Strings JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) @@ -3875,18 +3815,6 @@ jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT return jl_valueof(o); } -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - -JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_alloc(ptls, sz, NULL); -} - JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) { if (jl_is_initialized()) { @@ -4014,14 +3942,14 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) } -JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { - return jl_gc_alloc(ptls, sz, ty); + arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +void jl_gc_notify_image_load(const char* img_data, size_t len) { - arraylist_push(&ptls->gc_tls.sweep_objs, obj); + // Do nothing } #ifdef __cplusplus diff --git a/src/gc-stock.h b/src/gc-stock.h index 46f7d3e11e105..cc661ce6e1600 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -106,12 +106,6 @@ JL_EXTENSION typedef struct _bigval_t { // must be 64-byte aligned here, in 32 & 64 bit modes } bigval_t; -// data structure for tracking malloc'd genericmemory. 
-typedef struct _mallocmemory_t { - jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory - struct _mallocmemory_t *next; -} mallocmemory_t; - // pool page metadata typedef struct _jl_gc_pagemeta_t { // next metadata structure in per-thread list @@ -428,21 +422,6 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } -STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; -} - STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index ed3d9bf825658..b74de3060d26a 100644 --- a/src/julia.h +++ b/src/julia.h @@ -858,7 +858,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index 20d90fede3d5e..04857d440b643 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -513,30 +513,6 @@ STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass_align8(unsigned sz) JL_NOTSAFE #define GC_MAX_SZCLASS (2032-sizeof(void*)) static_assert(ARRAY_CACHE_ALIGN_THRESHOLD > GC_MAX_SZCLASS, ""); - -// Size does NOT include the type tag!! -STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) -{ - jl_value_t *v; - const size_t allocsz = sz + sizeof(jl_taggedvalue_t); - if (sz <= GC_MAX_SZCLASS) { - int pool_id = jl_gc_szclass(allocsz); - jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; - int osize = jl_gc_sizeclasses[pool_id]; - // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in - // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) - v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); - } - else { - if (allocsz < sz) // overflow in adding offs, size was "negative" - jl_throw(jl_memory_exception); - v = jl_gc_big_alloc_noinline(ptls, allocsz); - } - jl_set_typeof(v, ty); - maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); - return v; -} - /* Programming style note: When using jl_gc_alloc, do not JL_GC_PUSH it into a * gc frame, until it has been fully initialized. An uninitialized value in a * gc frame can crash upon encountering the first safepoint. By delaying use of @@ -1077,7 +1053,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern _Atomic(uint32_t) jl_gc_disable_counter; +extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. 
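[Editor's note: the julia_internal.h hunk above exports jl_gc_disable_counter because the counter is now shared by both GC implementations; patch 18 later moves jl_gc_enable itself into gc-common.c. As shown there, jl_gc_enable(0) adds a veto to the global counter and jl_gc_enable(1) removes one, with the previous per-thread state returned to the caller, so the idiomatic (illustrative, not from this series) usage is save/restore:]

    int prev = jl_gc_enable(0);   // veto collection around a GC-sensitive region
    // ... work that must not be interrupted by a collection ...
    jl_gc_enable(prev);           // restore the caller's setting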
diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..b85a481588e4f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,9 +80,20 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } +<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; +======= +// parallel task runtime +// --- + +JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return cong(max, &ptls->rngseed); +} +>>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) diff --git a/src/stackwalk.c b/src/stackwalk.c index 6aa36fa8b499c..5f28b61c4a8fe 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -5,7 +5,7 @@ utilities for walking the stack and looking up information about code addresses */ #include -#include "gc-stock.h" +#include "gc-common.h" #include "julia.h" #include "julia_internal.h" #include "threading.h" @@ -1294,6 +1294,8 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; +extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; +extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT diff --git a/src/staticdata.c b/src/staticdata.c index 0a8cbe6db7c67..bba35e6dcb5f9 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -657,6 +657,7 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -4054,6 +4055,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From e10e3caef963bd1086deb3fb7d42f014ca2a3771 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 29 Aug 2024 04:57:59 +0000 Subject: [PATCH 17/38] Removing jl_gc_notify_image_load, since it's a new function and not part of the refactoring --- src/gc-interface.h | 5 ----- src/gc-stock.c | 5 ----- src/staticdata.c | 2 -- 3 files changed, 12 deletions(-) diff --git a/src/gc-interface.h b/src/gc-interface.h index 682f22344d69d..25ffed4524f0c 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -218,11 +218,6 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; -// This function notifies the GC about memory addresses that are set when loading the boot image. -// The GC may use that information to, for instance, determine that such objects should -// be treated as marked and belonged to the old generation in nursery collections. 
-void jl_gc_notify_image_load(const char* img_data, size_t len); - // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stock.c b/src/gc-stock.c index 6ebac8a0c079e..88b201a687eba 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3947,11 +3947,6 @@ JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *o arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -void jl_gc_notify_image_load(const char* img_data, size_t len) -{ - // Do nothing -} - #ifdef __cplusplus } #endif diff --git a/src/staticdata.c b/src/staticdata.c index bba35e6dcb5f9..0a8cbe6db7c67 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -657,7 +657,6 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -4055,7 +4054,6 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From d4c4360ab89dc9052cd87933b1f4b9e3581f4daa Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 2 Sep 2024 01:27:08 +0000 Subject: [PATCH 18/38] Moving gc_enable code to gc-common.c --- src/gc-common.c | 30 ++++++++++++++++++++++++++++++ src/gc-stock.c | 30 ------------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 2ec167caa667a..03c046bc300f2 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -613,6 +613,36 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +// collector entry point and control +_Atomic(uint32_t) jl_gc_disable_counter = 1; + +JL_DLLEXPORT int jl_gc_enable(int on) +{ + jl_ptls_t ptls = jl_current_task->ptls; + int prev = !ptls->disable_gc; + ptls->disable_gc = (on == 0); + if (on && !prev) { + // disable -> enable + if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { + gc_num.allocd += gc_num.deferred_alloc; + gc_num.deferred_alloc = 0; + } + } + else if (prev && !on) { + // enable -> disable + jl_atomic_fetch_add(&jl_gc_disable_counter, 1); + // check if the GC is running and wait for it to finish + jl_gc_safepoint_(ptls); + } + return prev; +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + int gc_logging_enabled = 0; JL_DLLEXPORT void jl_enable_gc_logging(int enable) { diff --git a/src/gc-stock.c b/src/gc-stock.c index 88b201a687eba..55499bce61182 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2803,36 +2803,6 @@ int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT return tid == concurrent_collector_thread_id; } -// collector entry point and control -_Atomic(uint32_t) jl_gc_disable_counter = 1; - -JL_DLLEXPORT int jl_gc_enable(int on) -{ - jl_ptls_t ptls = jl_current_task->ptls; - int prev = !ptls->disable_gc; - ptls->disable_gc = (on == 0); - if (on && !prev) { - // disable -> enable - if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { - gc_num.allocd += gc_num.deferred_alloc; - gc_num.deferred_alloc = 0; - } - 
} - else if (prev && !on) { - // enable -> disable - jl_atomic_fetch_add(&jl_gc_disable_counter, 1); - // check if the GC is running and wait for it to finish - jl_gc_safepoint_(ptls); - } - return prev; -} - -JL_DLLEXPORT int jl_gc_is_enabled(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return !ptls->disable_gc; -} - JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT { jl_gc_num_t num = gc_num; From d07cae75b0b36b34a1b5150feab2b52d62a0c1ad Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 16 Sep 2024 06:38:02 +0000 Subject: [PATCH 19/38] Addressing PR comments --- src/gc-common.c | 134 +++++++++++++++++++++++++------------------ src/gc-common.h | 6 ++ src/gc-debug.c | 2 - src/gc-interface.h | 30 +--------- src/gc-stock.c | 18 +----- src/gc-stock.h | 15 +++++ src/julia.h | 2 +- src/julia_internal.h | 4 +- src/stackwalk.c | 10 +--- 9 files changed, 110 insertions(+), 111 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 03c046bc300f2..046feae6aa4c5 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -491,15 +491,9 @@ int gc_n_threads; jl_ptls_t* gc_all_tls_states; // =========================================================================== // -// MISC +// Allocation // =========================================================================== // -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) { return jl_gc_alloc(ptls, sz, ty); @@ -575,17 +569,9 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) return jl_gc_alloc_(ptls, sz, ty); } -const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 -JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT -{ - return jl_buff_tag; -} - -// callback for passing OOM errors from gmp -JL_DLLEXPORT void jl_throw_out_of_memory_error(void) -{ - jl_throw(jl_memory_exception); -} +// =========================================================================== // +// Generic Memory +// =========================================================================== // size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT { @@ -613,6 +599,66 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +// =========================================================================== // +// GC Debug +// =========================================================================== // + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + +// 
=========================================================================== // +// GC Control +// =========================================================================== // + +JL_DLLEXPORT uint32_t jl_get_gc_disable_counter(void) { + return jl_atomic_load_acquire(&jl_gc_disable_counter); +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + + // collector entry point and control _Atomic(uint32_t) jl_gc_disable_counter = 1; @@ -637,54 +683,30 @@ JL_DLLEXPORT int jl_gc_enable(int on) return prev; } -JL_DLLEXPORT int jl_gc_is_enabled(void) +// =========================================================================== // +// MISC +// =========================================================================== // + +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) { jl_ptls_t ptls = jl_current_task->ptls; - return !ptls->disable_gc; -} - -int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; + return jl_gc_new_weakref_th(ptls, value); } -JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { - return gc_logging_enabled; +JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) { + return ijl_small_typeof; } -// gc-debug common functions -// --- - -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 +JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; + return jl_buff_tag; } -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +// callback for passing OOM errors from gmp +JL_DLLEXPORT void jl_throw_out_of_memory_error(void) { - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; + jl_throw(jl_memory_exception); } #ifdef __cplusplus diff --git a/src/gc-common.h b/src/gc-common.h index 154b9659e9ccb..32b7470b13a58 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -179,4 +179,10 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o); extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; +// =========================================================================== // +// Logging +// =========================================================================== // + +extern int gc_logging_enabled; + #endif // JL_GC_COMMON_H diff --git a/src/gc-debug.c b/src/gc-debug.c index d05fb4b49e9f7..7c479484cde45 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1105,8 +1105,6 @@ void gc_count_pool(void) jl_safe_printf("************************\n"); } -extern int gc_logging_enabled; - void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t 
live_bytes) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { return; diff --git a/src/gc-interface.h b/src/gc-interface.h index 25ffed4524f0c..0e9ce32697f35 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -94,6 +94,8 @@ JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem); // should run a collection cycle again (e.g. a full mark right after a full sweep to ensure // we do a full heap traversal). JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection); +// Returns whether the thread with `tid` is a collector thread +JL_DLLEXPORT int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT; // ========================================================================= // // Metrics @@ -162,26 +164,6 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz); JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz); // Wrapper around Libc realloc that updates Julia allocation counters. JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz); -// Wrapper around Libc malloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_malloc(size_t sz); -// Wrapper around Libc calloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz); -// Wrapper around Libc free that takes a pointer to the payload of a memory region allocated -// with jl_malloc or jl_calloc, and uses the size information stored in the first machine -// words of the memory buffer update Julia allocation counters, and then frees the -// corresponding memory buffer. -JL_DLLEXPORT void jl_free(void *p); -// Wrapper around Libc realloc that takes a memory region allocated with jl_malloc or -// jl_calloc, and uses the size information stored in the first machine words of the memory -// buffer to update Julia allocation counters, reallocating the corresponding memory buffer -// in the end. -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz); // Wrapper around Libc malloc that's used to dynamically allocate memory for Arrays and // Strings. It increments Julia allocation counters and should check whether we're close to // the Julia heap target, and therefore, whether we should run a collection. Note that this @@ -195,14 +177,6 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz); // thread-local allocator of the thread referenced by the first jl_ptls_t argument. JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref_th(struct _jl_tls_states_t *ptls, struct _jl_value_t *value); -// Allocates a new weak-reference, assigns its value and increments Julia allocation -// counters. If thread-local allocators are used, then this function should allocate in the -// thread-local allocator of the current thread. -JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref(struct _jl_value_t *value); -// Allocates an object whose size is specified by the function argument and increments Julia -// allocation counters. If thread-local allocators are used, then this function should -// allocate in the thread-local allocator of the current thread. 
-JL_DLLEXPORT struct _jl_value_t *jl_gc_allocobj(size_t sz); // Permanently allocates a memory slot of the size specified by the first parameter. This // block of memory is allocated in an immortal region that is never swept. The second // parameter specifies whether the memory should be filled with zeros. The third and fourth diff --git a/src/gc-stock.c b/src/gc-stock.c index 55499bce61182..b345fe08ff69c 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2788,19 +2788,8 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } -int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; +int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT { + return gc_is_parallel_collector_thread(tid) || gc_is_concurrent_collector_thread(tid); } JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT @@ -3193,8 +3182,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // free empty GC state for threads that have exited if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { // GC threads should never exit - assert(!gc_is_parallel_collector_thread(t_i)); - assert(!gc_is_concurrent_collector_thread(t_i)); + assert(!gc_is_collector_thread(t_i)); jl_thread_heap_t *heap = &ptls2->gc_tls.heap; if (heap->weak_refs.len == 0) small_arraylist_free(&heap->weak_refs); diff --git a/src/gc-stock.h b/src/gc-stock.h index cc661ce6e1600..0f8d1eee67581 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -422,6 +422,21 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } +STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index b74de3060d26a..ed3d9bf825658 100644 --- a/src/julia.h +++ b/src/julia.h @@ -858,7 +858,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index 04857d440b643..c079c06f0189a 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -367,6 +367,8 @@ extern jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT size_t jl_typeinf_world; extern _Atomic(jl_typemap_entry_t*) call_cache[N_CALL_CACHE] JL_GLOBALLY_ROOTED; +extern void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; + JL_DLLEXPORT 
extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; @@ -1053,7 +1055,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; +extern _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. diff --git a/src/stackwalk.c b/src/stackwalk.c index 5f28b61c4a8fe..a1de3a6d61a07 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1294,8 +1294,6 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; -extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; -extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT @@ -1304,12 +1302,8 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); for (size_t i = 0; i < nthreads; i++) { jl_ptls_t ptls2 = allstates[i]; - if (gc_is_parallel_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for parallel GC thread %zu\n", i + 1); - continue; - } - if (gc_is_concurrent_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for concurrent GC thread %zu\n", i + 1); + if (gc_is_collector_thread(i)) { + jl_safe_printf("==== Skipping backtrace for parallel/concurrent GC thread %zu\n", i + 1); continue; } if (ptls2 == NULL) { From 8e15217b8a5eaea51335f6b7577ba929905a4a54 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 19 Sep 2024 04:18:13 +0000 Subject: [PATCH 20/38] Push resolution of merge conflict --- src/scheduler.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index b85a481588e4f..bb2f85b52283f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,20 +80,9 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; -======= -// parallel task runtime -// --- - -JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return cong(max, &ptls->rngseed); -} ->>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) From 0cb0784a43aa01803b73407c90bd5ee44d09531f Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 01:10:31 +0000 Subject: [PATCH 21/38] Removing jl_gc_mark_queue_obj_explicit extern definition from scheduler.c --- src/scheduler.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..7e23f654c2566 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,10 +80,6 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -// GC functions used -extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, - jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; - // initialize the threading infrastructure // (called only by the main thread) void jl_init_threadinginfra(void) From 
12634f36d67bd9c8275feda1e2729b0910ca2664 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 02:50:25 +0000 Subject: [PATCH 22/38] Don't need the getter function since it's possible to use jl_small_typeof directly --- src/gc-common.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 046feae6aa4c5..417f12f26d64d 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -693,10 +693,6 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) return jl_gc_new_weakref_th(ptls, value); } -JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) { - return ijl_small_typeof; -} - const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { From aa8093328cf5f70d9df78fda2315b077a76e4d8b Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 27 Sep 2024 00:49:07 +0000 Subject: [PATCH 23/38] Remove extern from free_stack declaration in julia_internal.h --- src/julia_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/julia_internal.h b/src/julia_internal.h index c079c06f0189a..6fd537ed6baf8 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -367,7 +367,7 @@ extern jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT size_t jl_typeinf_world; extern _Atomic(jl_typemap_entry_t*) call_cache[N_CALL_CACHE] JL_GLOBALLY_ROOTED; -extern void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; JL_DLLEXPORT extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; From 7ce3fe392616d4da1035de6b02a21056f05072b6 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 09:12:49 +0000 Subject: [PATCH 24/38] Putting everything that is common GC tls into gc-tls-common.h --- src/gc-common.c | 10 +-- src/gc-stacks.c | 18 +++--- src/gc-stock.c | 154 ++++++++++++++++++++++---------------------- src/gc-tls-common.h | 52 +++++++++++++++ src/gc-tls.h | 25 ------- src/julia_threads.h | 2 + src/stackwalk.c | 2 +- 7 files changed, 147 insertions(+), 116 deletions(-) create mode 100644 src/gc-tls-common.h diff --git a/src/gc-common.c b/src/gc-common.c index 417f12f26d64d..6ce455d3923ad 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -587,16 +587,16 @@ size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ // This is **NOT** a GC safe point. 
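// (An aside, sketching the recycling pattern in the hunk below: tracking
// nodes are popped from the per-thread `mafreelist` when one is available,
// malloc'd otherwise, and every tracked allocation is pushed onto
// `mallocarrays` so that sweep_malloced_memory can find it later. In
// pseudocode:
//   ma = mafreelist ? pop(mafreelist) : malloc_s(sizeof(mallocmemory_t));
//   ma->a = m;                  /* tagged with the alignment bit */
//   push(ma, mallocarrays);
// The patch itself merely moves the two list heads from gc_tls to
// gc_tls_common.)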
mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { + if (ptls->gc_tls_common.heap.mafreelist == NULL) { ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); } else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; + ma = ptls->gc_tls_common.heap.mafreelist; + ptls->gc_tls_common.heap.mafreelist = ma->next; } ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; + ma->next = ptls->gc_tls_common.heap.mallocarrays; + ptls->gc_tls_common.heap.mallocarrays = ma; } // =========================================================================== // diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 8c44b65284386..a8fec938456a3 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -131,7 +131,7 @@ void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(bufsz); if (pool_sizes[pool_id] == bufsz) { - small_arraylist_push(&ptls->gc_tls.heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->gc_tls_common.heap.free_stacks[pool_id], stkbuf); return; } } @@ -160,7 +160,7 @@ void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task) #ifdef _COMPILER_ASAN_ENABLED_ __asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz); #endif - small_arraylist_push(&ptls->gc_tls.heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->gc_tls_common.heap.free_stacks[pool_id], stkbuf); } } } @@ -175,7 +175,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO if (ssize <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(ssize); ssize = pool_sizes[pool_id]; - small_arraylist_t *pool = &ptls->gc_tls.heap.free_stacks[pool_id]; + small_arraylist_t *pool = &ptls->gc_tls_common.heap.free_stacks[pool_id]; if (pool->len > 0) { stk = small_arraylist_pop(pool); } @@ -196,7 +196,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO } *bufsz = ssize; if (owner) { - small_arraylist_t *live_tasks = &ptls->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls->gc_tls_common.heap.live_tasks; mtarraylist_push(live_tasks, owner); } return stk; @@ -223,7 +223,7 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT // free half of stacks that remain unused since last sweep for (int p = 0; p < JL_N_STACK_POOLS; p++) { - small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p]; + small_arraylist_t *al = &ptls2->gc_tls_common.heap.free_stacks[p]; size_t n_to_free; if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { n_to_free = al->len; // not alive yet or dead, so it does not need these anymore @@ -245,10 +245,10 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT } } if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - small_arraylist_free(ptls2->gc_tls.heap.free_stacks); + small_arraylist_free(ptls2->gc_tls_common.heap.free_stacks); } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = 0; size_t ndel = 0; size_t l = live_tasks->len; @@ -299,7 +299,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) jl_ptls_t ptls2 = allstates[i]; if (ptls2 == NULL) continue; - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); l += n + (ptls2->root_task->ctx.stkbuf != NULL); } @@ 
-318,7 +318,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) goto restart; jl_array_data(a,void*)[j++] = t; } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); for (size_t i = 0; i < n; i++) { jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i); diff --git a/src/gc-stock.c b/src/gc-stock.c index b345fe08ff69c..8e040c9b25dcf 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -357,7 +357,7 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *valu jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); wr->value = value; // NOTE: wb not needed here - small_arraylist_push(&ptls->gc_tls.heap.weak_refs, wr); + small_arraylist_push(&ptls->gc_tls_common.heap.weak_refs, wr); return wr; } @@ -367,8 +367,8 @@ static void clear_weak_refs(void) for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) { - size_t n, l = ptls2->gc_tls.heap.weak_refs.len; - void **lst = ptls2->gc_tls.heap.weak_refs.items; + size_t n, l = ptls2->gc_tls_common.heap.weak_refs.len; + void **lst = ptls2->gc_tls_common.heap.weak_refs.items; for (n = 0; n < l; n++) { jl_weakref_t *wr = (jl_weakref_t*)lst[n]; if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) @@ -386,8 +386,8 @@ static void sweep_weak_refs(void) if (ptls2 != NULL) { size_t n = 0; size_t ndel = 0; - size_t l = ptls2->gc_tls.heap.weak_refs.len; - void **lst = ptls2->gc_tls.heap.weak_refs.items; + size_t l = ptls2->gc_tls_common.heap.weak_refs.len; + void **lst = ptls2->gc_tls_common.heap.weak_refs.items; if (l == 0) continue; while (1) { @@ -402,7 +402,7 @@ static void sweep_weak_refs(void) lst[n] = lst[n + ndel]; lst[n + ndel] = tmp; } - ptls2->gc_tls.heap.weak_refs.len -= ndel; + ptls2->gc_tls_common.heap.weak_refs.len -= ndel; } } } @@ -410,18 +410,18 @@ static void sweep_weak_refs(void) STATIC_INLINE void jl_batch_accum_heap_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { - uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc) + sz; + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc) + sz; if (alloc_acc < 16*1024) - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, alloc_acc); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, alloc_acc); else { jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); } } STATIC_INLINE void jl_batch_accum_free_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc) + sz); } // big value list @@ -442,10 +442,10 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) jl_throw(jl_memory_exception); gc_invoke_callbacks(jl_gc_cb_notify_external_alloc_t, gc_cblist_notify_external_alloc, (v, allocsz)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + 
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc) + 1); jl_batch_accum_heap_size(ptls, allocsz); #ifdef MEMDEBUG memset(v, 0xee, allocsz); @@ -558,8 +558,8 @@ static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); jl_batch_accum_heap_size(ptls, sz); } @@ -578,18 +578,18 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTS for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { - dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval); - dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc); - dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc); - dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc); - dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc); - dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc); + dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); if (update_heap) { - uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc); - freed_in_runtime += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc); + freed_in_runtime += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_acc + jl_atomic_load_relaxed(&gc_heap_stats.heap_size)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -605,13 +605,13 @@ static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls != NULL) { // don't reset `pool_live_bytes` here - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, 0); + 
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -654,8 +654,8 @@ static void sweep_malloced_memory(void) JL_NOTSAFEPOINT for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { - mallocmemory_t *ma = ptls2->gc_tls.heap.mallocarrays; - mallocmemory_t **pma = &ptls2->gc_tls.heap.mallocarrays; + mallocmemory_t *ma = ptls2->gc_tls_common.heap.mallocarrays; + mallocmemory_t **pma = &ptls2->gc_tls_common.heap.mallocarrays; while (ma != NULL) { mallocmemory_t *nxt = ma->next; jl_value_t *a = (jl_value_t*)((uintptr_t)ma->a & ~1); @@ -667,8 +667,8 @@ static void sweep_malloced_memory(void) JL_NOTSAFEPOINT *pma = nxt; int isaligned = (uintptr_t)ma->a & 1; jl_gc_free_memory(a, isaligned); - ma->next = ptls2->gc_tls.heap.mafreelist; - ptls2->gc_tls.heap.mafreelist = ma; + ma->next = ptls2->gc_tls_common.heap.mafreelist; + ptls2->gc_tls_common.heap.mafreelist = ma; } gc_time_count_mallocd_memory(bits); ma = nxt; @@ -729,12 +729,12 @@ STATIC_INLINE jl_value_t *jl_gc_small_alloc_inner(jl_ptls_t ptls, int offset, return jl_gc_big_alloc(ptls, osize, NULL); #endif maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes) + osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + osize); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes) + osize); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc) + 1); // first try to use the freelist jl_taggedvalue_t *v = p->freelist; if (v != NULL) { @@ -971,8 +971,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ // instead of adding it to the thread that originally allocated the page, so we can avoid // an atomic-fetch-add here. 
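// (To spell out the accounting below: a page contributes GC_PAGE_SZ -
// GC_PAGE_OFFSET usable bytes, and each of the nfree freelist slots holds
// osize dead bytes, so the surviving payload is
//   delta = (GC_PAGE_SZ - GC_PAGE_OFFSET) - nfree * osize;
// and, per the comment above, it is credited to the sweeping thread's
// pool_live_bytes counter rather than the allocating thread's.)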
size_t delta = (GC_PAGE_SZ - GC_PAGE_OFFSET - nfree * osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes) + delta); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes) + delta); jl_atomic_fetch_add_relaxed((_Atomic(int64_t) *)&gc_num.freed, (nfree - old_nfree) * osize); } @@ -1228,7 +1228,7 @@ static void gc_sweep_pool(void) } continue; } - jl_atomic_store_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes, 0); + jl_atomic_store_relaxed(&ptls2->gc_tls_common.gc_num.pool_live_bytes, 0); for (int i = 0; i < JL_GC_N_POOLS; i++) { jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i]; jl_taggedvalue_t *last = p->freelist; @@ -2834,7 +2834,7 @@ JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) for (int i = 0; i < n_threads; i++) { jl_ptls_t ptls2 = all_tls_states[i]; if (ptls2 != NULL) { - pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes); + pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_tls_common.gc_num.pool_live_bytes); } } return pool_live_bytes; @@ -3183,11 +3183,12 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { // GC threads should never exit assert(!gc_is_collector_thread(t_i)); + jl_thread_heap_common_t *common_heap = &ptls2->gc_tls_common.heap; jl_thread_heap_t *heap = &ptls2->gc_tls.heap; - if (heap->weak_refs.len == 0) - small_arraylist_free(&heap->weak_refs); - if (heap->live_tasks.len == 0) - small_arraylist_free(&heap->live_tasks); + if (common_heap->weak_refs.len == 0) + small_arraylist_free(&common_heap->weak_refs); + if (common_heap->live_tasks.len == 0) + small_arraylist_free(&common_heap->live_tasks); if (heap->remset.len == 0) arraylist_free(&heap->remset); if (ptls2->finalizers.len == 0) @@ -3256,8 +3257,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { - size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; @@ -3362,6 +3363,7 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { + jl_thread_heap_common_t *common_heap = &ptls->gc_tls_common.heap; jl_thread_heap_t *heap = &ptls->gc_tls.heap; jl_gc_pool_t *p = heap->norm_pools; for (int i = 0; i < JL_GC_N_POOLS; i++) { @@ -3369,12 +3371,12 @@ void jl_init_thread_heap(jl_ptls_t ptls) p[i].freelist = NULL; p[i].newpages = NULL; } - small_arraylist_new(&heap->weak_refs, 0); - small_arraylist_new(&heap->live_tasks, 0); + small_arraylist_new(&common_heap->weak_refs, 0); + small_arraylist_new(&common_heap->live_tasks, 0); for (int i = 0; i < JL_N_STACK_POOLS; i++) - small_arraylist_new(&heap->free_stacks[i], 0); - heap->mallocarrays = NULL; - heap->mafreelist = NULL; + small_arraylist_new(&common_heap->free_stacks[i], 0); + common_heap->mallocarrays = NULL; 
+ common_heap->mafreelist = NULL; heap->young_generation_of_bigvals = (bigval_t*)calloc_s(sizeof(bigval_t)); // sentinel assert(gc_bigval_sentinel_tag != 0); // make sure the sentinel is initialized heap->young_generation_of_bigvals->header = gc_bigval_sentinel_tag; @@ -3400,8 +3402,8 @@ void jl_init_thread_heap(jl_ptls_t ptls) jl_atomic_store_relaxed(&q->array, wsa2); arraylist_new(&mq->reclaim_set, 32); - memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + memset(&ptls->gc_tls_common.gc_num, 0, sizeof(ptls->gc_tls_common.gc_num)); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); } void jl_free_thread_gc_state(jl_ptls_t ptls) @@ -3579,10 +3581,10 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) if (data != NULL && pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, sz); } return data; @@ -3596,10 +3598,10 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) if (data != NULL && pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + nm*sz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + nm*sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, sz * nm); } return data; @@ -3624,10 +3626,10 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); if (!(sz < old)) - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + (sz - old)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old)); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1); int64_t diff = sz - old; if (diff < 0) { @@ -3658,10 +3660,10 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) if (b == NULL) jl_throw(jl_memory_exception); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + 
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, allocsz); #ifdef _OS_WINDOWS_ SetLastError(last_error); diff --git a/src/gc-tls-common.h b/src/gc-tls-common.h new file mode 100644 index 0000000000000..28fbf2d0c448e --- /dev/null +++ b/src/gc-tls-common.h @@ -0,0 +1,52 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// Meant to be included in "julia_threads.h" +#ifndef JL_GC_TLS_COMMON_H +#define JL_GC_TLS_COMMON_H + +#include "julia_atomics.h" + +// GC threading ------------------------------------------------------------------ + +#include "arraylist.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + // variable for tracking weak references + small_arraylist_t weak_refs; + // live tasks started on this thread + // that are holding onto a stack from the pool + small_arraylist_t live_tasks; + + // variables for tracking malloc'd arrays + struct _mallocmemory_t *mallocarrays; + struct _mallocmemory_t *mafreelist; + +#define JL_N_STACK_POOLS 16 + small_arraylist_t free_stacks[JL_N_STACK_POOLS]; +} jl_thread_heap_common_t; + +typedef struct { + _Atomic(int64_t) allocd; + _Atomic(int64_t) pool_live_bytes; + _Atomic(uint64_t) malloc; + _Atomic(uint64_t) realloc; + _Atomic(uint64_t) poolalloc; + _Atomic(uint64_t) bigalloc; + _Atomic(int64_t) free_acc; + _Atomic(uint64_t) alloc_acc; +} jl_thread_gc_num_common_t; + +typedef struct { + jl_thread_heap_common_t heap; + jl_thread_gc_num_common_t gc_num; +} jl_gc_tls_states_common_t; + +#ifdef __cplusplus +} +#endif + +#endif // JL_GC_TLS_H diff --git a/src/gc-tls.h b/src/gc-tls.h index 9e4b09404db84..ecc815805a98b 100644 --- a/src/gc-tls.h +++ b/src/gc-tls.h @@ -21,16 +21,6 @@ typedef struct { } jl_gc_pool_t; typedef struct { - // variable for tracking weak references - small_arraylist_t weak_refs; - // live tasks started on this thread - // that are holding onto a stack from the pool - small_arraylist_t live_tasks; - - // variables for tracking malloc'd arrays - struct _mallocmemory_t *mallocarrays; - struct _mallocmemory_t *mafreelist; - // variable for tracking young (i.e. not in `GC_OLD_MARKED`/last generation) large objects struct _bigval_t *young_generation_of_bigvals; @@ -42,22 +32,8 @@ typedef struct { // variables for allocating objects from pools #define JL_GC_N_MAX_POOLS 51 // conservative. 
must be kept in sync with `src/julia_internal.h` jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS]; - -#define JL_N_STACK_POOLS 16 - small_arraylist_t free_stacks[JL_N_STACK_POOLS]; } jl_thread_heap_t; -typedef struct { - _Atomic(int64_t) allocd; - _Atomic(int64_t) pool_live_bytes; - _Atomic(uint64_t) malloc; - _Atomic(uint64_t) realloc; - _Atomic(uint64_t) poolalloc; - _Atomic(uint64_t) bigalloc; - _Atomic(int64_t) free_acc; - _Atomic(uint64_t) alloc_acc; -} jl_thread_gc_num_t; - typedef struct { ws_queue_t chunk_queue; ws_queue_t ptr_queue; @@ -78,7 +54,6 @@ typedef struct { typedef struct { jl_thread_heap_t heap; jl_gc_page_stack_t page_metadata_allocd; - jl_thread_gc_num_t gc_num; jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; _Atomic(size_t) gc_sweeps_requested; diff --git a/src/julia_threads.h b/src/julia_threads.h index b697a0bf030ed..fcc28591658cb 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -5,6 +5,7 @@ #define JL_THREADS_H #include "gc-tls.h" +#include "gc-tls-common.h" #include "julia_atomics.h" #ifndef _OS_WINDOWS_ #include "pthread.h" @@ -155,6 +156,7 @@ typedef struct _jl_tls_states_t { // Counter to disable finalizer **on the current thread** int finalizers_inhibited; jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen + jl_gc_tls_states_common_t gc_tls_common; // common tls for both GCs volatile sig_atomic_t defer_signal; _Atomic(struct _jl_task_t*) current_task; struct _jl_task_t *next_task; diff --git a/src/stackwalk.c b/src/stackwalk.c index a1de3a6d61a07..0988d7a833c94 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1309,7 +1309,7 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT if (ptls2 == NULL) { continue; } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); int t_state = JL_TASK_STATE_DONE; jl_task_t *t = ptls2->root_task; From 048af72dee003a3ded89c3bf6c6572f97cb2678a Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 09:14:24 +0000 Subject: [PATCH 25/38] Typo --- src/gc-tls-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gc-tls-common.h b/src/gc-tls-common.h index 28fbf2d0c448e..ba36f5c1c238e 100644 --- a/src/gc-tls-common.h +++ b/src/gc-tls-common.h @@ -49,4 +49,4 @@ typedef struct { } #endif -#endif // JL_GC_TLS_H +#endif // JL_GC_TLS_COMMON_H From fe61c2232d997da0ebd3b936a469024acff7afbb Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 22:46:39 +0000 Subject: [PATCH 26/38] Adding gc-tls-common.h to Makefile as a public header --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index a6b1f433b73ce..80bbdbcff67fc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -103,7 +103,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h gc-tls-common.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix 
$(SRCDIR)/,win32_ucontext.h) endif From 380fd833efba491cb167ad9c61909199e14098d8 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 23:26:33 +0000 Subject: [PATCH 27/38] Removing gc-tls-common fields from gc-tls-mmtk.h --- src/gc-mmtk.c | 58 +++++++++++++++++------------------ src/gc-tls-mmtk.h | 30 ------------------ src/llvm-late-gc-lowering.cpp | 2 +- 3 files changed, 30 insertions(+), 60 deletions(-) diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c index 98a5612871be0..aa010c73b27d2 100644 --- a/src/gc-mmtk.c +++ b/src/gc-mmtk.c @@ -108,7 +108,7 @@ void jl_start_gc_threads(void) { } void jl_init_thread_heap(struct _jl_tls_states_t *ptls) JL_NOTSAFEPOINT { - jl_thread_heap_t *heap = &ptls->gc_tls.heap; + jl_thread_heap_common_t *heap = &ptls->gc_tls_common.heap; small_arraylist_new(&heap->weak_refs, 0); small_arraylist_new(&heap->live_tasks, 0); for (int i = 0; i < JL_N_STACK_POOLS; i++) @@ -124,7 +124,7 @@ void jl_init_thread_heap(struct _jl_tls_states_t *ptls) JL_NOTSAFEPOINT { memcpy(&ptls->gc_tls.mmtk_mutator, mmtk_mutator, sizeof(MMTkMutatorContext)); // Call post_bind to maintain a list of active mutators and to reclaim the old mutator (which is no longer needed) mmtk_post_bind_mutator(&ptls->gc_tls.mmtk_mutator, mmtk_mutator); - memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); + memset(&ptls->gc_tls_common.gc_num, 0, sizeof(ptls->gc_tls_common.gc_num)); } void jl_free_thread_gc_state(struct _jl_tls_states_t *ptls) { @@ -162,8 +162,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { - size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; @@ -186,15 +186,15 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTS for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { - dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval); - dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc); - dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc); - dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc); - dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc); - dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc); + dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); if (update_heap) { - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + 
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -211,13 +211,13 @@ void reset_thread_gc_counts(void) JL_NOTSAFEPOINT jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls != NULL) { // don't reset `pool_live_bytes` here - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -257,8 +257,8 @@ JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) { void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); } void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT @@ -473,8 +473,8 @@ JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, siz mmtk_immix_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, v, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align)); } - ptls->gc_tls.gc_num.allocd += osize; - ptls->gc_tls.gc_num.poolalloc++; + ptls->gc_tls_common.gc_num.allocd += osize; + ptls->gc_tls_common.gc_num.poolalloc++; return v; } @@ -502,8 +502,8 @@ JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t sz) } v->sz = allocsz; - ptls->gc_tls.gc_num.allocd += allocsz; - ptls->gc_tls.gc_num.bigalloc++; + ptls->gc_tls_common.gc_num.allocd += allocsz; + ptls->gc_tls_common.gc_num.bigalloc++; jl_value_t *result = jl_valueof(&v->header); mmtk_post_alloc(&ptls->gc_tls.mmtk_mutator, result, allocsz, 2); @@ -565,10 +565,10 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) if (b == NULL) jl_throw(jl_memory_exception); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); // FIXME: Should these be part of mmtk's heap? 
// malloc_maybe_collect(ptls, sz); // jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, allocsz); diff --git a/src/gc-tls-mmtk.h b/src/gc-tls-mmtk.h index 64a1bae192445..7b1b249cd8ae3 100644 --- a/src/gc-tls-mmtk.h +++ b/src/gc-tls-mmtk.h @@ -9,37 +9,7 @@ extern "C" { #endif -// This mostly remove some fields that are not used by MMTk - -typedef struct { - // variable for tracking weak references - small_arraylist_t weak_refs; - // live tasks started on this thread - // that are holding onto a stack from the pool - small_arraylist_t live_tasks; - - // variables for tracking malloc'd arrays - struct _mallocmemory_t *mallocarrays; - struct _mallocmemory_t *mafreelist; - -#define JL_N_STACK_POOLS 16 - small_arraylist_t free_stacks[JL_N_STACK_POOLS]; -} jl_thread_heap_t; - -typedef struct { - _Atomic(int64_t) allocd; - _Atomic(int64_t) pool_live_bytes; - _Atomic(uint64_t) malloc; - _Atomic(uint64_t) realloc; - _Atomic(uint64_t) poolalloc; - _Atomic(uint64_t) bigalloc; - _Atomic(int64_t) free_acc; - _Atomic(uint64_t) alloc_acc; -} jl_thread_gc_num_t; - typedef struct { - jl_thread_heap_t heap; - jl_thread_gc_num_t gc_num; MMTkMutatorContext mmtk_mutator; size_t malloc_sz_since_last_poll; } jl_gc_tls_states_t; diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index d395771f6df0c..4b7dc0ec855a7 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2528,7 +2528,7 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) builder.CreateStore(new_cursor, cursor_ptr); // ptls->gc_tls.gc_num.allocd += osize; - auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num)); + auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num)); auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); From ebf478ad2783571684e64fa41c7868d40b105985 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 28 Aug 2024 00:44:09 +0000 Subject: [PATCH 28/38] Refactoring to be considered before adding MMTk --- src/gc-common.c | 156 +++++++++++++++++++++++++++++++++++++++++++ src/gc-common.h | 6 ++ src/gc-debug.c | 41 +----------- src/gc-interface.h | 12 ++++ src/gc-stacks.c | 4 +- src/gc-stock.c | 156 ++++++++++++------------------------------- src/gc-stock.h | 21 ------ src/julia.h | 2 +- src/julia_internal.h | 26 +------- src/scheduler.c | 11 +++ src/stackwalk.c | 4 +- src/staticdata.c | 2 + 12 files changed, 237 insertions(+), 204 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index ee461b576ea9e..2ec167caa667a 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -20,6 +20,11 @@ extern "C" { jl_gc_num_t gc_num = {0}; +JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) +{ + return gc_num.total_time; +} + // =========================================================================== // // GC Callbacks // =========================================================================== // @@ -489,6 +494,87 @@ jl_ptls_t* gc_all_tls_states; // MISC // =========================================================================== // +JL_DLLEXPORT jl_weakref_t 
*jl_gc_new_weakref(jl_value_t *value) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return jl_gc_new_weakref_th(ptls, value); +} + +JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc(ptls, sz, ty); +} + +JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return jl_gc_alloc(ptls, sz, NULL); +} + +// allocation wrappers that save the size of allocations, to allow using +// jl_gc_counted_* functions with a libc-compatible API. + +JL_DLLEXPORT void *jl_malloc(size_t sz) +{ + int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); + if (p == NULL) + return NULL; + p[0] = sz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +//_unchecked_calloc does not check for potential overflow of nm*sz +STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { + size_t nmsz = nm*sz; + int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); + if (p == NULL) + return NULL; + p[0] = nmsz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) +{ + if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) + return NULL; + return _unchecked_calloc(nm, sz); +} + +JL_DLLEXPORT void jl_free(void *p) +{ + if (p != NULL) { + int64_t *pp = (int64_t *)p - 2; + size_t sz = pp[0]; + jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); + } +} + +JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) +{ + int64_t *pp; + size_t szold; + if (p == NULL) { + pp = NULL; + szold = 0; + } + else { + pp = (int64_t *)p - 2; + szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; + } + int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); + if (pnew == NULL) + return NULL; + pnew[0] = sz; + return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +// allocator entry points + +JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc_(ptls, sz, ty); +} + const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { @@ -501,6 +587,76 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } +size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT +{ + const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; + size_t sz = layout->size * m->length; + if (layout->flags.arrayelem_isunion) + // account for isbits Union array selector bytes + sz += m->length; + return sz; +} + +// tracking Memorys with malloc'd storage +void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ + // This is **NOT** a GC safe point. 
+ mallocmemory_t *ma; + if (ptls->gc_tls.heap.mafreelist == NULL) { + ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); + } + else { + ma = ptls->gc_tls.heap.mafreelist; + ptls->gc_tls.heap.mafreelist = ma->next; + } + ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); + ma->next = ptls->gc_tls.heap.mallocarrays; + ptls->gc_tls.heap.mallocarrays = ma; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + +// gc-debug common functions +// --- + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + #ifdef __cplusplus } #endif diff --git a/src/gc-common.h b/src/gc-common.h index 4d53830442a7d..154b9659e9ccb 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -53,6 +53,12 @@ extern jl_gc_callback_list_t *gc_cblist_notify_gc_pressure; // malloc wrappers, aligned allocation // =========================================================================== // +// data structure for tracking malloc'd genericmemory. 
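// (A sketch of the tagging convention, assuming only that
// jl_genericmemory_t pointers are at least 2-byte aligned; the expressions
// mirror the track/sweep code elsewhere in this series:
//   ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned);  /* store + tag */
//   jl_value_t *a = (jl_value_t*)((uintptr_t)ma->a & ~1);       /* untag */
//   int isaligned = (uintptr_t)ma->a & 1;                       /* read flag */
// )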
+typedef struct _mallocmemory_t {
+    jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory
+    struct _mallocmemory_t *next;
+} mallocmemory_t;
+
 #if defined(_OS_WINDOWS_)
 STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align)
 {
diff --git a/src/gc-debug.c b/src/gc-debug.c
index 19dd93af5f236..d05fb4b49e9f7 100644
--- a/src/gc-debug.c
+++ b/src/gc-debug.c
@@ -1105,46 +1105,7 @@ void gc_count_pool(void)
     jl_safe_printf("************************\n");
 }
 
-int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT
-{
-    int nf = (int)jl_datatype_nfields(vt);
-    for (int i = 1; i < nf; i++) {
-        if (slot < (void*)((char*)obj + jl_field_offset(vt, i)))
-            return i - 1;
-    }
-    return nf - 1;
-}
-
-int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT
-{
-    char *slot = (char*)_slot;
-    jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj);
-    char *start = NULL;
-    size_t len = 0;
-    size_t elsize = sizeof(void*);
-    if (vt == jl_module_type) {
-        jl_module_t *m = (jl_module_t*)obj;
-        start = (char*)m->usings.items;
-        len = m->usings.len;
-    }
-    else if (vt == jl_simplevector_type) {
-        start = (char*)jl_svec_data(obj);
-        len = jl_svec_len(obj);
-    }
-    if (slot < start || slot >= start + elsize * len)
-        return -1;
-    return (slot - start) / elsize;
-}
-
-static int gc_logging_enabled = 0;
-
-JL_DLLEXPORT void jl_enable_gc_logging(int enable) {
-    gc_logging_enabled = enable;
-}
-
-JL_DLLEXPORT int jl_is_gc_logging_enabled(void) {
-    return gc_logging_enabled;
-}
+extern int gc_logging_enabled;
 
 void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT {
     if (!gc_logging_enabled) {
         return;
diff --git a/src/gc-interface.h b/src/gc-interface.h
index e543b4b5879f1..682f22344d69d 100644
--- a/src/gc-interface.h
+++ b/src/gc-interface.h
@@ -128,6 +128,13 @@ JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void);
 // Allocation
 // ========================================================================= //
 
+// On GCC, this function is inlined when sz is constant (see julia_internal.h).
+// In general, this function should implement allocation and should use the specific GC's logic
+// to decide whether to allocate a small or a large object. Finally, note that this function
+// **must** also set the type of the returned object to be `ty`. The type `ty` may also be used to record
+// an allocation of that type in the allocation profiler.
+struct _jl_value_t *jl_gc_alloc_(struct _jl_tls_states_t * ptls, size_t sz, void *ty);
+
 // Allocates small objects and increments Julia allocation counters. Size of the object
 // header must be included in the object size. The (possibly unused in some implementations)
 // offset to the arena in which we're allocating is passed in the second parameter, and the
@@ -211,6 +218,11 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align,
 // object being allocated and will be used to set the object header.
 struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT;
 
+// This function notifies the GC about memory addresses that are set when loading the boot image.
+// The GC may use that information to, for instance, determine that such objects should
+// be treated as marked and belong to the old generation in nursery collections.
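// (A sketch of the intended call sites, which appear verbatim in the
// staticdata.c hunk further down; the GC is notified before the image data
// is restored:
//   jl_gc_notify_image_load(sysimg_data, *plen);
//   jl_restore_system_image_data(sysimg_data, *plen);
// The stock collector's implementation is a no-op.)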
+void jl_gc_notify_image_load(const char* img_data, size_t len); + // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 783129ea97693..8c44b65284386 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -46,7 +46,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { VirtualFree(stkbuf, 0, MEM_RELEASE); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); @@ -81,7 +81,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT return stk; } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { munmap(stkbuf, bufsz); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); diff --git a/src/gc-stock.c b/src/gc-stock.c index 6b97881909bbd..6ebac8a0c079e 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -555,24 +555,6 @@ static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT gc_time_big_end(); } -// tracking Memorys with malloc'd storage - -void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ - // This is **NOT** a GC safe point. - mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { - ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); - } - else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; - } - ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; -} - - void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; @@ -649,17 +631,6 @@ void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT reset_thread_gc_counts(); } -size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT -{ - const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; - size_t sz = layout->size * m->length; - if (layout->flags.arrayelem_isunion) - // account for isbits Union array selector bytes - sz += m->length; - return sz; -} - - static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT { assert(jl_is_genericmemory(v)); @@ -818,6 +789,29 @@ jl_value_t *jl_gc_small_alloc_noinline(jl_ptls_t ptls, int offset, int osize) { return jl_gc_small_alloc_inner(ptls, offset, osize); } +// Size does NOT include the type tag!! +inline jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) +{ + jl_value_t *v; + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + if (sz <= GC_MAX_SZCLASS) { + int pool_id = jl_gc_szclass(allocsz); + jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; + int osize = jl_gc_sizeclasses[pool_id]; + // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in + // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) 
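// (Dispatch recap for the function being moved here: allocsz is sz plus the
// 8-byte type tag on 64-bit builds; requests with sz <= GC_MAX_SZCLASS stay
// on the per-thread pool path, while larger ones, or an allocsz that
// overflowed, fall through to the big-object allocator. For example, a
// hypothetical jl_gc_alloc(ptls, 16, ty) carries an allocsz of 24 bytes and
// is served from a size-class pool.)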
+ v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); + } + else { + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + v = jl_gc_big_alloc_noinline(ptls, allocsz); + } + jl_set_typeof(v, ty); + maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); + return v; +} + int jl_gc_classify_pools(size_t sz, int *osize) { if (sz > GC_MAX_SZCLASS) @@ -2794,6 +2788,21 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } +int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + // collector entry point and control _Atomic(uint32_t) jl_gc_disable_counter = 1; @@ -2832,11 +2841,6 @@ JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT *bytes = (num.total_allocd + num.deferred_alloc + num.allocd); } -JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) -{ - return gc_num.total_time; -} - JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { jl_gc_num_t num = gc_num; @@ -3397,13 +3401,6 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) gc_mark_roots(mq); } -// allocator entry points - -JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) -{ - return jl_gc_alloc_(ptls, sz, ty); -} - // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { @@ -3685,63 +3682,6 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size return data; } -// allocation wrappers that save the size of allocations, to allow using -// jl_gc_counted_* functions with a libc-compatible API. 
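// (A sketch of the header layout these wrappers share, assuming
// JL_SMALL_BYTE_ALIGNMENT == 16 as the code's own comments state:
//
//   [ int64_t size | int64_t padding | payload ... ]
//   ^ jl_gc_counted_malloc result      ^ pointer handed to the caller
//
// so jl_free can recover the original request size via ((int64_t*)p)[-2].
// Note the block below is moved to gc-common.c, not deleted.)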
- -JL_DLLEXPORT void *jl_malloc(size_t sz) -{ - int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); - if (p == NULL) - return NULL; - p[0] = sz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -//_unchecked_calloc does not check for potential overflow of nm*sz -STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { - size_t nmsz = nm*sz; - int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); - if (p == NULL) - return NULL; - p[0] = nmsz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) -{ - if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) - return NULL; - return _unchecked_calloc(nm, sz); -} - -JL_DLLEXPORT void jl_free(void *p) -{ - if (p != NULL) { - int64_t *pp = (int64_t *)p - 2; - size_t sz = pp[0]; - jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); - } -} - -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) -{ - int64_t *pp; - size_t szold; - if (p == NULL) { - pp = NULL; - szold = 0; - } - else { - pp = (int64_t *)p - 2; - szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; - } - int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); - if (pnew == NULL) - return NULL; - pnew[0] = sz; - return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - // allocating blocks for Arrays and Strings JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) @@ -3875,18 +3815,6 @@ jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT return jl_valueof(o); } -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - -JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_alloc(ptls, sz, NULL); -} - JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) { if (jl_is_initialized()) { @@ -4014,14 +3942,14 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) } -JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { - return jl_gc_alloc(ptls, sz, ty); + arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +void jl_gc_notify_image_load(const char* img_data, size_t len) { - arraylist_push(&ptls->gc_tls.sweep_objs, obj); + // Do nothing } #ifdef __cplusplus diff --git a/src/gc-stock.h b/src/gc-stock.h index 46f7d3e11e105..cc661ce6e1600 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -106,12 +106,6 @@ JL_EXTENSION typedef struct _bigval_t { // must be 64-byte aligned here, in 32 & 64 bit modes } bigval_t; -// data structure for tracking malloc'd genericmemory. 
-typedef struct _mallocmemory_t { - jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory - struct _mallocmemory_t *next; -} mallocmemory_t; - // pool page metadata typedef struct _jl_gc_pagemeta_t { // next metadata structure in per-thread list @@ -428,21 +422,6 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } -STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; -} - STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index ed3d9bf825658..b74de3060d26a 100644 --- a/src/julia.h +++ b/src/julia.h @@ -858,7 +858,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index 20d90fede3d5e..04857d440b643 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -513,30 +513,6 @@ STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass_align8(unsigned sz) JL_NOTSAFE #define GC_MAX_SZCLASS (2032-sizeof(void*)) static_assert(ARRAY_CACHE_ALIGN_THRESHOLD > GC_MAX_SZCLASS, ""); - -// Size does NOT include the type tag!! -STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) -{ - jl_value_t *v; - const size_t allocsz = sz + sizeof(jl_taggedvalue_t); - if (sz <= GC_MAX_SZCLASS) { - int pool_id = jl_gc_szclass(allocsz); - jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; - int osize = jl_gc_sizeclasses[pool_id]; - // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in - // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) - v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); - } - else { - if (allocsz < sz) // overflow in adding offs, size was "negative" - jl_throw(jl_memory_exception); - v = jl_gc_big_alloc_noinline(ptls, allocsz); - } - jl_set_typeof(v, ty); - maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); - return v; -} - /* Programming style note: When using jl_gc_alloc, do not JL_GC_PUSH it into a * gc frame, until it has been fully initialized. An uninitialized value in a * gc frame can crash upon encountering the first safepoint. By delaying use of @@ -1077,7 +1053,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern _Atomic(uint32_t) jl_gc_disable_counter; +extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. 
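A note on the julia_internal.h hunk above: jl_gc_alloc_ is the allocation fast path, and moving its body out of the internal header means alternative collectors no longer get the stock GC's pool dispatch inlined into every caller. The function picks a size-classed per-thread pool for small objects and falls back to a separately tracked big-object allocation, guarding the header addition against overflow. Below is a minimal, self-contained sketch of that dispatch; GC_MAX_SZCLASS matches the header, but TAG_SZ, pool_alloc, big_alloc, and alloc_dispatch are illustrative stand-ins, not names from the patch.

#include <stddef.h>
#include <stdlib.h>

#define GC_MAX_SZCLASS (2032 - sizeof(void*)) /* same bound as julia_internal.h */
#define TAG_SZ sizeof(void*)                  /* stand-in for sizeof(jl_taggedvalue_t) */

/* Illustrative stand-ins for jl_gc_small_alloc_noinline / jl_gc_big_alloc_noinline. */
static void *pool_alloc(size_t osize)  { return malloc(osize); }
static void *big_alloc(size_t allocsz) { return malloc(allocsz); }

static void *alloc_dispatch(size_t sz)
{
    size_t allocsz = sz + TAG_SZ;     /* room for the type-tag word before the object */
    if (sz <= GC_MAX_SZCLASS)
        return pool_alloc(allocsz);   /* small object: size-classed per-thread pool */
    if (allocsz < sz)                 /* addition wrapped; size was effectively "negative" */
        return NULL;                  /* the real code throws jl_memory_exception here */
    return big_alloc(allocsz);        /* large object: tracked on the big-value list */
}

int main(void)
{
    void *p = alloc_dispatch(64);
    free(p);  /* valid only in this mock, where pool_alloc is plain malloc */
    return 0;
}

The overflow check sits only on the big path because the small path is bounded by GC_MAX_SZCLASS, so sz + TAG_SZ cannot wrap there.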
diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..b85a481588e4f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,9 +80,20 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } +<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; +======= +// parallel task runtime +// --- + +JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return cong(max, &ptls->rngseed); +} +>>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) diff --git a/src/stackwalk.c b/src/stackwalk.c index 6aa36fa8b499c..5f28b61c4a8fe 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -5,7 +5,7 @@ utilities for walking the stack and looking up information about code addresses */ #include -#include "gc-stock.h" +#include "gc-common.h" #include "julia.h" #include "julia_internal.h" #include "threading.h" @@ -1294,6 +1294,8 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; +extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; +extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT diff --git a/src/staticdata.c b/src/staticdata.c index 0a8cbe6db7c67..bba35e6dcb5f9 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -657,6 +657,7 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -4054,6 +4055,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From 0a8444ea6f539cdb63481f45411f42629c1c97e1 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 29 Aug 2024 04:57:59 +0000 Subject: [PATCH 29/38] Removing jl_gc_notify_image_load, since it's a new function and not part of the refactoring --- src/gc-interface.h | 5 ----- src/gc-stock.c | 5 ----- src/staticdata.c | 2 -- 3 files changed, 12 deletions(-) diff --git a/src/gc-interface.h b/src/gc-interface.h index 682f22344d69d..25ffed4524f0c 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -218,11 +218,6 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; -// This function notifies the GC about memory addresses that are set when loading the boot image. -// The GC may use that information to, for instance, determine that such objects should -// be treated as marked and belonged to the old generation in nursery collections. 
-void jl_gc_notify_image_load(const char* img_data, size_t len); - // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stock.c b/src/gc-stock.c index 6ebac8a0c079e..88b201a687eba 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3947,11 +3947,6 @@ JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *o arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -void jl_gc_notify_image_load(const char* img_data, size_t len) -{ - // Do nothing -} - #ifdef __cplusplus } #endif diff --git a/src/staticdata.c b/src/staticdata.c index bba35e6dcb5f9..0a8cbe6db7c67 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -657,7 +657,6 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -4055,7 +4054,6 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From c8818eab4ec04a248121bef73e1dd5e3b29a3ceb Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 2 Sep 2024 01:27:08 +0000 Subject: [PATCH 30/38] Moving gc_enable code to gc-common.c --- src/gc-common.c | 30 ++++++++++++++++++++++++++++++ src/gc-stock.c | 30 ------------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 2ec167caa667a..03c046bc300f2 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -613,6 +613,36 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +// collector entry point and control +_Atomic(uint32_t) jl_gc_disable_counter = 1; + +JL_DLLEXPORT int jl_gc_enable(int on) +{ + jl_ptls_t ptls = jl_current_task->ptls; + int prev = !ptls->disable_gc; + ptls->disable_gc = (on == 0); + if (on && !prev) { + // disable -> enable + if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { + gc_num.allocd += gc_num.deferred_alloc; + gc_num.deferred_alloc = 0; + } + } + else if (prev && !on) { + // enable -> disable + jl_atomic_fetch_add(&jl_gc_disable_counter, 1); + // check if the GC is running and wait for it to finish + jl_gc_safepoint_(ptls); + } + return prev; +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + int gc_logging_enabled = 0; JL_DLLEXPORT void jl_enable_gc_logging(int enable) { diff --git a/src/gc-stock.c b/src/gc-stock.c index 88b201a687eba..55499bce61182 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2803,36 +2803,6 @@ int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT return tid == concurrent_collector_thread_id; } -// collector entry point and control -_Atomic(uint32_t) jl_gc_disable_counter = 1; - -JL_DLLEXPORT int jl_gc_enable(int on) -{ - jl_ptls_t ptls = jl_current_task->ptls; - int prev = !ptls->disable_gc; - ptls->disable_gc = (on == 0); - if (on && !prev) { - // disable -> enable - if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { - gc_num.allocd += gc_num.deferred_alloc; - gc_num.deferred_alloc = 0; - } - 
} - else if (prev && !on) { - // enable -> disable - jl_atomic_fetch_add(&jl_gc_disable_counter, 1); - // check if the GC is running and wait for it to finish - jl_gc_safepoint_(ptls); - } - return prev; -} - -JL_DLLEXPORT int jl_gc_is_enabled(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return !ptls->disable_gc; -} - JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT { jl_gc_num_t num = gc_num; From e721e0c121ee911c29e736668b5e20766844d85e Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 16 Sep 2024 06:38:02 +0000 Subject: [PATCH 31/38] Addressing PR comments --- src/gc-common.c | 134 +++++++++++++++++++++++++------------------ src/gc-common.h | 6 ++ src/gc-debug.c | 2 - src/gc-interface.h | 30 +--------- src/gc-stock.c | 18 +----- src/gc-stock.h | 15 +++++ src/julia.h | 2 +- src/julia_internal.h | 4 +- src/stackwalk.c | 10 +--- 9 files changed, 110 insertions(+), 111 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 03c046bc300f2..046feae6aa4c5 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -491,15 +491,9 @@ int gc_n_threads; jl_ptls_t* gc_all_tls_states; // =========================================================================== // -// MISC +// Allocation // =========================================================================== // -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) { return jl_gc_alloc(ptls, sz, ty); @@ -575,17 +569,9 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) return jl_gc_alloc_(ptls, sz, ty); } -const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 -JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT -{ - return jl_buff_tag; -} - -// callback for passing OOM errors from gmp -JL_DLLEXPORT void jl_throw_out_of_memory_error(void) -{ - jl_throw(jl_memory_exception); -} +// =========================================================================== // +// Generic Memory +// =========================================================================== // size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT { @@ -613,6 +599,66 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +// =========================================================================== // +// GC Debug +// =========================================================================== // + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + +// 
=========================================================================== // +// GC Control +// =========================================================================== // + +JL_DLLEXPORT uint32_t jl_get_gc_disable_counter(void) { + return jl_atomic_load_acquire(&jl_gc_disable_counter); +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + + // collector entry point and control _Atomic(uint32_t) jl_gc_disable_counter = 1; @@ -637,54 +683,30 @@ JL_DLLEXPORT int jl_gc_enable(int on) return prev; } -JL_DLLEXPORT int jl_gc_is_enabled(void) +// =========================================================================== // +// MISC +// =========================================================================== // + +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) { jl_ptls_t ptls = jl_current_task->ptls; - return !ptls->disable_gc; -} - -int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; + return jl_gc_new_weakref_th(ptls, value); } -JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { - return gc_logging_enabled; +JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) { + return ijl_small_typeof; } -// gc-debug common functions -// --- - -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 +JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; + return jl_buff_tag; } -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +// callback for passing OOM errors from gmp +JL_DLLEXPORT void jl_throw_out_of_memory_error(void) { - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; + jl_throw(jl_memory_exception); } #ifdef __cplusplus diff --git a/src/gc-common.h b/src/gc-common.h index 154b9659e9ccb..32b7470b13a58 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -179,4 +179,10 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o); extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; +// =========================================================================== // +// Logging +// =========================================================================== // + +extern int gc_logging_enabled; + #endif // JL_GC_COMMON_H diff --git a/src/gc-debug.c b/src/gc-debug.c index d05fb4b49e9f7..7c479484cde45 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1105,8 +1105,6 @@ void gc_count_pool(void) jl_safe_printf("************************\n"); } -extern int gc_logging_enabled; - void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t 
live_bytes) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { return; diff --git a/src/gc-interface.h b/src/gc-interface.h index 25ffed4524f0c..0e9ce32697f35 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -94,6 +94,8 @@ JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem); // should run a collection cycle again (e.g. a full mark right after a full sweep to ensure // we do a full heap traversal). JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection); +// Returns whether the thread with `tid` is a collector thread +JL_DLLEXPORT int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT; // ========================================================================= // // Metrics @@ -162,26 +164,6 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz); JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz); // Wrapper around Libc realloc that updates Julia allocation counters. JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz); -// Wrapper around Libc malloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_malloc(size_t sz); -// Wrapper around Libc calloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz); -// Wrapper around Libc free that takes a pointer to the payload of a memory region allocated -// with jl_malloc or jl_calloc, and uses the size information stored in the first machine -// words of the memory buffer update Julia allocation counters, and then frees the -// corresponding memory buffer. -JL_DLLEXPORT void jl_free(void *p); -// Wrapper around Libc realloc that takes a memory region allocated with jl_malloc or -// jl_calloc, and uses the size information stored in the first machine words of the memory -// buffer to update Julia allocation counters, reallocating the corresponding memory buffer -// in the end. -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz); // Wrapper around Libc malloc that's used to dynamically allocate memory for Arrays and // Strings. It increments Julia allocation counters and should check whether we're close to // the Julia heap target, and therefore, whether we should run a collection. Note that this @@ -195,14 +177,6 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz); // thread-local allocator of the thread referenced by the first jl_ptls_t argument. JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref_th(struct _jl_tls_states_t *ptls, struct _jl_value_t *value); -// Allocates a new weak-reference, assigns its value and increments Julia allocation -// counters. If thread-local allocators are used, then this function should allocate in the -// thread-local allocator of the current thread. -JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref(struct _jl_value_t *value); -// Allocates an object whose size is specified by the function argument and increments Julia -// allocation counters. If thread-local allocators are used, then this function should -// allocate in the thread-local allocator of the current thread. 
-JL_DLLEXPORT struct _jl_value_t *jl_gc_allocobj(size_t sz); // Permanently allocates a memory slot of the size specified by the first parameter. This // block of memory is allocated in an immortal region that is never swept. The second // parameter specifies whether the memory should be filled with zeros. The third and fourth diff --git a/src/gc-stock.c b/src/gc-stock.c index 55499bce61182..b345fe08ff69c 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2788,19 +2788,8 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } -int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; +int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT { + return gc_is_parallel_collector_thread(tid) || gc_is_concurrent_collector_thread(tid); } JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT @@ -3193,8 +3182,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // free empty GC state for threads that have exited if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { // GC threads should never exit - assert(!gc_is_parallel_collector_thread(t_i)); - assert(!gc_is_concurrent_collector_thread(t_i)); + assert(!gc_is_collector_thread(t_i)); jl_thread_heap_t *heap = &ptls2->gc_tls.heap; if (heap->weak_refs.len == 0) small_arraylist_free(&heap->weak_refs); diff --git a/src/gc-stock.h b/src/gc-stock.h index cc661ce6e1600..0f8d1eee67581 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -422,6 +422,21 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } +STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index b74de3060d26a..ed3d9bf825658 100644 --- a/src/julia.h +++ b/src/julia.h @@ -858,7 +858,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index 04857d440b643..c079c06f0189a 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -367,6 +367,8 @@ extern jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT size_t jl_typeinf_world; extern _Atomic(jl_typemap_entry_t*) call_cache[N_CALL_CACHE] JL_GLOBALLY_ROOTED; +extern void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; + JL_DLLEXPORT 
extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; @@ -1053,7 +1055,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; +extern _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. diff --git a/src/stackwalk.c b/src/stackwalk.c index 5f28b61c4a8fe..a1de3a6d61a07 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1294,8 +1294,6 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; -extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; -extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT @@ -1304,12 +1302,8 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); for (size_t i = 0; i < nthreads; i++) { jl_ptls_t ptls2 = allstates[i]; - if (gc_is_parallel_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for parallel GC thread %zu\n", i + 1); - continue; - } - if (gc_is_concurrent_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for concurrent GC thread %zu\n", i + 1); + if (gc_is_collector_thread(i)) { + jl_safe_printf("==== Skipping backtrace for parallel/concurrent GC thread %zu\n", i + 1); continue; } if (ptls2 == NULL) { From 6c0eb93fccbd77a338c6a6e2ddae8888fa6bc1b2 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 19 Sep 2024 04:18:13 +0000 Subject: [PATCH 32/38] Push resolution of merge conflict --- src/scheduler.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index b85a481588e4f..bb2f85b52283f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,20 +80,9 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; -======= -// parallel task runtime -// --- - -JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return cong(max, &ptls->rngseed); -} ->>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) From fb0ec76ecc52efae85ad65c34b1a3f49f24475e7 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 01:10:31 +0000 Subject: [PATCH 33/38] Removing jl_gc_mark_queue_obj_explicit extern definition from scheduler.c --- src/scheduler.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..7e23f654c2566 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,10 +80,6 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -// GC functions used -extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, - jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; - // initialize the threading infrastructure // (called only by the main thread) void jl_init_threadinginfra(void) From 
3eea0790d832eba1d17b1a1564447f51986c7118 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 02:50:25 +0000 Subject: [PATCH 34/38] Don't need the getter function since it's possible to use jl_small_typeof directly --- src/gc-common.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 046feae6aa4c5..417f12f26d64d 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -693,10 +693,6 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) return jl_gc_new_weakref_th(ptls, value); } -JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) { - return ijl_small_typeof; -} - const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { From ef6c79823306f2556951d6f8a70b165aceda2c76 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 27 Sep 2024 00:49:07 +0000 Subject: [PATCH 35/38] Remove extern from free_stack declaration in julia_internal.h --- src/julia_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/julia_internal.h b/src/julia_internal.h index c079c06f0189a..6fd537ed6baf8 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -367,7 +367,7 @@ extern jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT size_t jl_typeinf_world; extern _Atomic(jl_typemap_entry_t*) call_cache[N_CALL_CACHE] JL_GLOBALLY_ROOTED; -extern void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; JL_DLLEXPORT extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; From 63ca362bfaeed147887da242a6721de014ca5535 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 09:12:49 +0000 Subject: [PATCH 36/38] Putting everything that is common GC tls into gc-tls-common.h --- src/gc-common.c | 10 +-- src/gc-stacks.c | 18 +++--- src/gc-stock.c | 154 ++++++++++++++++++++++---------------------- src/gc-tls-common.h | 52 +++++++++++++++ src/gc-tls.h | 25 ------- src/julia_threads.h | 2 + src/stackwalk.c | 2 +- 7 files changed, 147 insertions(+), 116 deletions(-) create mode 100644 src/gc-tls-common.h diff --git a/src/gc-common.c b/src/gc-common.c index 417f12f26d64d..6ce455d3923ad 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -587,16 +587,16 @@ size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ // This is **NOT** a GC safe point. 
mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { + if (ptls->gc_tls_common.heap.mafreelist == NULL) { ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); } else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; + ma = ptls->gc_tls_common.heap.mafreelist; + ptls->gc_tls_common.heap.mafreelist = ma->next; } ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; + ma->next = ptls->gc_tls_common.heap.mallocarrays; + ptls->gc_tls_common.heap.mallocarrays = ma; } // =========================================================================== // diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 8c44b65284386..a8fec938456a3 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -131,7 +131,7 @@ void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(bufsz); if (pool_sizes[pool_id] == bufsz) { - small_arraylist_push(&ptls->gc_tls.heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->gc_tls_common.heap.free_stacks[pool_id], stkbuf); return; } } @@ -160,7 +160,7 @@ void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task) #ifdef _COMPILER_ASAN_ENABLED_ __asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz); #endif - small_arraylist_push(&ptls->gc_tls.heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->gc_tls_common.heap.free_stacks[pool_id], stkbuf); } } } @@ -175,7 +175,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO if (ssize <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(ssize); ssize = pool_sizes[pool_id]; - small_arraylist_t *pool = &ptls->gc_tls.heap.free_stacks[pool_id]; + small_arraylist_t *pool = &ptls->gc_tls_common.heap.free_stacks[pool_id]; if (pool->len > 0) { stk = small_arraylist_pop(pool); } @@ -196,7 +196,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO } *bufsz = ssize; if (owner) { - small_arraylist_t *live_tasks = &ptls->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls->gc_tls_common.heap.live_tasks; mtarraylist_push(live_tasks, owner); } return stk; @@ -223,7 +223,7 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT // free half of stacks that remain unused since last sweep for (int p = 0; p < JL_N_STACK_POOLS; p++) { - small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p]; + small_arraylist_t *al = &ptls2->gc_tls_common.heap.free_stacks[p]; size_t n_to_free; if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { n_to_free = al->len; // not alive yet or dead, so it does not need these anymore @@ -245,10 +245,10 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT } } if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - small_arraylist_free(ptls2->gc_tls.heap.free_stacks); + small_arraylist_free(ptls2->gc_tls_common.heap.free_stacks); } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = 0; size_t ndel = 0; size_t l = live_tasks->len; @@ -299,7 +299,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) jl_ptls_t ptls2 = allstates[i]; if (ptls2 == NULL) continue; - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); l += n + (ptls2->root_task->ctx.stkbuf != NULL); } @@ 
-318,7 +318,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) goto restart; jl_array_data(a,void*)[j++] = t; } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); for (size_t i = 0; i < n; i++) { jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i); diff --git a/src/gc-stock.c b/src/gc-stock.c index b345fe08ff69c..8e040c9b25dcf 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -357,7 +357,7 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *valu jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); wr->value = value; // NOTE: wb not needed here - small_arraylist_push(&ptls->gc_tls.heap.weak_refs, wr); + small_arraylist_push(&ptls->gc_tls_common.heap.weak_refs, wr); return wr; } @@ -367,8 +367,8 @@ static void clear_weak_refs(void) for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) { - size_t n, l = ptls2->gc_tls.heap.weak_refs.len; - void **lst = ptls2->gc_tls.heap.weak_refs.items; + size_t n, l = ptls2->gc_tls_common.heap.weak_refs.len; + void **lst = ptls2->gc_tls_common.heap.weak_refs.items; for (n = 0; n < l; n++) { jl_weakref_t *wr = (jl_weakref_t*)lst[n]; if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) @@ -386,8 +386,8 @@ static void sweep_weak_refs(void) if (ptls2 != NULL) { size_t n = 0; size_t ndel = 0; - size_t l = ptls2->gc_tls.heap.weak_refs.len; - void **lst = ptls2->gc_tls.heap.weak_refs.items; + size_t l = ptls2->gc_tls_common.heap.weak_refs.len; + void **lst = ptls2->gc_tls_common.heap.weak_refs.items; if (l == 0) continue; while (1) { @@ -402,7 +402,7 @@ static void sweep_weak_refs(void) lst[n] = lst[n + ndel]; lst[n + ndel] = tmp; } - ptls2->gc_tls.heap.weak_refs.len -= ndel; + ptls2->gc_tls_common.heap.weak_refs.len -= ndel; } } } @@ -410,18 +410,18 @@ static void sweep_weak_refs(void) STATIC_INLINE void jl_batch_accum_heap_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { - uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc) + sz; + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc) + sz; if (alloc_acc < 16*1024) - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, alloc_acc); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, alloc_acc); else { jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); } } STATIC_INLINE void jl_batch_accum_free_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc) + sz); } // big value list @@ -442,10 +442,10 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) jl_throw(jl_memory_exception); gc_invoke_callbacks(jl_gc_cb_notify_external_alloc_t, gc_cblist_notify_external_alloc, (v, allocsz)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + 
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc) + 1); jl_batch_accum_heap_size(ptls, allocsz); #ifdef MEMDEBUG memset(v, 0xee, allocsz); @@ -558,8 +558,8 @@ static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); jl_batch_accum_heap_size(ptls, sz); } @@ -578,18 +578,18 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTS for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { - dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval); - dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc); - dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc); - dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc); - dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc); - dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc); + dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); if (update_heap) { - uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc); - freed_in_runtime += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc); + freed_in_runtime += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_acc + jl_atomic_load_relaxed(&gc_heap_stats.heap_size)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -605,13 +605,13 @@ static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls != NULL) { // don't reset `pool_live_bytes` here - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, 0); + 
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -654,8 +654,8 @@ static void sweep_malloced_memory(void) JL_NOTSAFEPOINT for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { - mallocmemory_t *ma = ptls2->gc_tls.heap.mallocarrays; - mallocmemory_t **pma = &ptls2->gc_tls.heap.mallocarrays; + mallocmemory_t *ma = ptls2->gc_tls_common.heap.mallocarrays; + mallocmemory_t **pma = &ptls2->gc_tls_common.heap.mallocarrays; while (ma != NULL) { mallocmemory_t *nxt = ma->next; jl_value_t *a = (jl_value_t*)((uintptr_t)ma->a & ~1); @@ -667,8 +667,8 @@ static void sweep_malloced_memory(void) JL_NOTSAFEPOINT *pma = nxt; int isaligned = (uintptr_t)ma->a & 1; jl_gc_free_memory(a, isaligned); - ma->next = ptls2->gc_tls.heap.mafreelist; - ptls2->gc_tls.heap.mafreelist = ma; + ma->next = ptls2->gc_tls_common.heap.mafreelist; + ptls2->gc_tls_common.heap.mafreelist = ma; } gc_time_count_mallocd_memory(bits); ma = nxt; @@ -729,12 +729,12 @@ STATIC_INLINE jl_value_t *jl_gc_small_alloc_inner(jl_ptls_t ptls, int offset, return jl_gc_big_alloc(ptls, osize, NULL); #endif maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes) + osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + osize); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes) + osize); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc) + 1); // first try to use the freelist jl_taggedvalue_t *v = p->freelist; if (v != NULL) { @@ -971,8 +971,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ // instead of adding it to the thread that originally allocated the page, so we can avoid // an atomic-fetch-add here. 
size_t delta = (GC_PAGE_SZ - GC_PAGE_OFFSET - nfree * osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes) + delta); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes) + delta); jl_atomic_fetch_add_relaxed((_Atomic(int64_t) *)&gc_num.freed, (nfree - old_nfree) * osize); } @@ -1228,7 +1228,7 @@ static void gc_sweep_pool(void) } continue; } - jl_atomic_store_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes, 0); + jl_atomic_store_relaxed(&ptls2->gc_tls_common.gc_num.pool_live_bytes, 0); for (int i = 0; i < JL_GC_N_POOLS; i++) { jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i]; jl_taggedvalue_t *last = p->freelist; @@ -2834,7 +2834,7 @@ JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) for (int i = 0; i < n_threads; i++) { jl_ptls_t ptls2 = all_tls_states[i]; if (ptls2 != NULL) { - pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes); + pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_tls_common.gc_num.pool_live_bytes); } } return pool_live_bytes; @@ -3183,11 +3183,12 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { // GC threads should never exit assert(!gc_is_collector_thread(t_i)); + jl_thread_heap_common_t *common_heap = &ptls2->gc_tls_common.heap; jl_thread_heap_t *heap = &ptls2->gc_tls.heap; - if (heap->weak_refs.len == 0) - small_arraylist_free(&heap->weak_refs); - if (heap->live_tasks.len == 0) - small_arraylist_free(&heap->live_tasks); + if (common_heap->weak_refs.len == 0) + small_arraylist_free(&common_heap->weak_refs); + if (common_heap->live_tasks.len == 0) + small_arraylist_free(&common_heap->live_tasks); if (heap->remset.len == 0) arraylist_free(&heap->remset); if (ptls2->finalizers.len == 0) @@ -3256,8 +3257,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { - size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; @@ -3362,6 +3363,7 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { + jl_thread_heap_common_t *common_heap = &ptls->gc_tls_common.heap; jl_thread_heap_t *heap = &ptls->gc_tls.heap; jl_gc_pool_t *p = heap->norm_pools; for (int i = 0; i < JL_GC_N_POOLS; i++) { @@ -3369,12 +3371,12 @@ void jl_init_thread_heap(jl_ptls_t ptls) p[i].freelist = NULL; p[i].newpages = NULL; } - small_arraylist_new(&heap->weak_refs, 0); - small_arraylist_new(&heap->live_tasks, 0); + small_arraylist_new(&common_heap->weak_refs, 0); + small_arraylist_new(&common_heap->live_tasks, 0); for (int i = 0; i < JL_N_STACK_POOLS; i++) - small_arraylist_new(&heap->free_stacks[i], 0); - heap->mallocarrays = NULL; - heap->mafreelist = NULL; + small_arraylist_new(&common_heap->free_stacks[i], 0); + common_heap->mallocarrays = NULL; 
+ common_heap->mafreelist = NULL; heap->young_generation_of_bigvals = (bigval_t*)calloc_s(sizeof(bigval_t)); // sentinel assert(gc_bigval_sentinel_tag != 0); // make sure the sentinel is initialized heap->young_generation_of_bigvals->header = gc_bigval_sentinel_tag; @@ -3400,8 +3402,8 @@ void jl_init_thread_heap(jl_ptls_t ptls) jl_atomic_store_relaxed(&q->array, wsa2); arraylist_new(&mq->reclaim_set, 32); - memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + memset(&ptls->gc_tls_common.gc_num, 0, sizeof(ptls->gc_tls_common.gc_num)); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); } void jl_free_thread_gc_state(jl_ptls_t ptls) @@ -3579,10 +3581,10 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) if (data != NULL && pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, sz); } return data; @@ -3596,10 +3598,10 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) if (data != NULL && pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + nm*sz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + nm*sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, sz * nm); } return data; @@ -3624,10 +3626,10 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); if (!(sz < old)) - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + (sz - old)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old)); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1); int64_t diff = sz - old; if (diff < 0) { @@ -3658,10 +3660,10 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) if (b == NULL) jl_throw(jl_memory_exception); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + 
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, allocsz); #ifdef _OS_WINDOWS_ SetLastError(last_error); diff --git a/src/gc-tls-common.h b/src/gc-tls-common.h new file mode 100644 index 0000000000000..28fbf2d0c448e --- /dev/null +++ b/src/gc-tls-common.h @@ -0,0 +1,52 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// Meant to be included in "julia_threads.h" +#ifndef JL_GC_TLS_COMMON_H +#define JL_GC_TLS_COMMON_H + +#include "julia_atomics.h" + +// GC threading ------------------------------------------------------------------ + +#include "arraylist.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + // variable for tracking weak references + small_arraylist_t weak_refs; + // live tasks started on this thread + // that are holding onto a stack from the pool + small_arraylist_t live_tasks; + + // variables for tracking malloc'd arrays + struct _mallocmemory_t *mallocarrays; + struct _mallocmemory_t *mafreelist; + +#define JL_N_STACK_POOLS 16 + small_arraylist_t free_stacks[JL_N_STACK_POOLS]; +} jl_thread_heap_common_t; + +typedef struct { + _Atomic(int64_t) allocd; + _Atomic(int64_t) pool_live_bytes; + _Atomic(uint64_t) malloc; + _Atomic(uint64_t) realloc; + _Atomic(uint64_t) poolalloc; + _Atomic(uint64_t) bigalloc; + _Atomic(int64_t) free_acc; + _Atomic(uint64_t) alloc_acc; +} jl_thread_gc_num_common_t; + +typedef struct { + jl_thread_heap_common_t heap; + jl_thread_gc_num_common_t gc_num; +} jl_gc_tls_states_common_t; + +#ifdef __cplusplus +} +#endif + +#endif // JL_GC_TLS_H diff --git a/src/gc-tls.h b/src/gc-tls.h index 9e4b09404db84..ecc815805a98b 100644 --- a/src/gc-tls.h +++ b/src/gc-tls.h @@ -21,16 +21,6 @@ typedef struct { } jl_gc_pool_t; typedef struct { - // variable for tracking weak references - small_arraylist_t weak_refs; - // live tasks started on this thread - // that are holding onto a stack from the pool - small_arraylist_t live_tasks; - - // variables for tracking malloc'd arrays - struct _mallocmemory_t *mallocarrays; - struct _mallocmemory_t *mafreelist; - // variable for tracking young (i.e. not in `GC_OLD_MARKED`/last generation) large objects struct _bigval_t *young_generation_of_bigvals; @@ -42,22 +32,8 @@ typedef struct { // variables for allocating objects from pools #define JL_GC_N_MAX_POOLS 51 // conservative. 
must be kept in sync with `src/julia_internal.h` jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS]; - -#define JL_N_STACK_POOLS 16 - small_arraylist_t free_stacks[JL_N_STACK_POOLS]; } jl_thread_heap_t; -typedef struct { - _Atomic(int64_t) allocd; - _Atomic(int64_t) pool_live_bytes; - _Atomic(uint64_t) malloc; - _Atomic(uint64_t) realloc; - _Atomic(uint64_t) poolalloc; - _Atomic(uint64_t) bigalloc; - _Atomic(int64_t) free_acc; - _Atomic(uint64_t) alloc_acc; -} jl_thread_gc_num_t; - typedef struct { ws_queue_t chunk_queue; ws_queue_t ptr_queue; @@ -78,7 +54,6 @@ typedef struct { typedef struct { jl_thread_heap_t heap; jl_gc_page_stack_t page_metadata_allocd; - jl_thread_gc_num_t gc_num; jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; _Atomic(size_t) gc_sweeps_requested; diff --git a/src/julia_threads.h b/src/julia_threads.h index b697a0bf030ed..fcc28591658cb 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -5,6 +5,7 @@ #define JL_THREADS_H #include "gc-tls.h" +#include "gc-tls-common.h" #include "julia_atomics.h" #ifndef _OS_WINDOWS_ #include "pthread.h" @@ -155,6 +156,7 @@ typedef struct _jl_tls_states_t { // Counter to disable finalizer **on the current thread** int finalizers_inhibited; jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen + jl_gc_tls_states_common_t gc_tls_common; // common tls for both GCs volatile sig_atomic_t defer_signal; _Atomic(struct _jl_task_t*) current_task; struct _jl_task_t *next_task; diff --git a/src/stackwalk.c b/src/stackwalk.c index a1de3a6d61a07..0988d7a833c94 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1309,7 +1309,7 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT if (ptls2 == NULL) { continue; } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); int t_state = JL_TASK_STATE_DONE; jl_task_t *t = ptls2->root_task; From 3271996a9eb45899e330a274420a53d45c6b4079 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 09:14:24 +0000 Subject: [PATCH 37/38] Typo --- src/gc-tls-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gc-tls-common.h b/src/gc-tls-common.h index 28fbf2d0c448e..ba36f5c1c238e 100644 --- a/src/gc-tls-common.h +++ b/src/gc-tls-common.h @@ -49,4 +49,4 @@ typedef struct { } #endif -#endif // JL_GC_TLS_H +#endif // JL_GC_TLS_COMMON_H From cd4f5a177f0c0c7d9e0fb59bf830f2d914c46727 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 22:46:39 +0000 Subject: [PATCH 38/38] Adding gc-tls-common.h to Makefile as a public header --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index a6b1f433b73ce..80bbdbcff67fc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -103,7 +103,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h gc-tls-common.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix 
$(SRCDIR)/,win32_ucontext.h) endif
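A closing note on the gc-tls-common.h split: the fields gathered into jl_gc_tls_states_common_t are the pieces of per-thread GC state that any collector needs (weak-reference lists, live-task lists, malloc'd-memory tracking, and allocation counters), while collector-specific state such as the stock GC's pools stays behind in gc-tls.h. The counters are only written by their owning thread and are summed centrally (see combine_thread_gc_counts in the patch), which is consistent with the relaxed atomic load/store pairs used throughout. Below is a standalone C11 sketch of that accounting idiom, using a reduced mock struct rather than the real jl_ptls_t layout.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Reduced mock of jl_thread_gc_num_common_t from gc-tls-common.h. */
typedef struct {
    _Atomic(int64_t)  allocd;
    _Atomic(uint64_t) malloc_count;  /* named "malloc" in the real struct */
} gc_num_common_mock;

/* Owner-thread-only accounting: a relaxed load/store pair suffices because
 * no other thread writes these fields between safepoints. */
static void count_malloc(gc_num_common_mock *num, int64_t sz)
{
    atomic_store_explicit(&num->allocd,
        atomic_load_explicit(&num->allocd, memory_order_relaxed) + sz,
        memory_order_relaxed);
    atomic_store_explicit(&num->malloc_count,
        atomic_load_explicit(&num->malloc_count, memory_order_relaxed) + 1,
        memory_order_relaxed);
}

int main(void)
{
    gc_num_common_mock num;
    atomic_init(&num.allocd, 0);
    atomic_init(&num.malloc_count, 0);
    count_malloc(&num, 128);
    printf("allocd=%lld mallocs=%llu\n",
           (long long)atomic_load_explicit(&num.allocd, memory_order_relaxed),
           (unsigned long long)atomic_load_explicit(&num.malloc_count,
                                                    memory_order_relaxed));
    return 0;
}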