diff --git a/src/foundation/slab_alloc.c b/src/foundation/slab_alloc.c index 81818ac18..7d67cffc1 100644 --- a/src/foundation/slab_alloc.c +++ b/src/foundation/slab_alloc.c @@ -1,5 +1,5 @@ /* - * slab_alloc.c — Thread-local slab allocator for tree-sitter. + * slab_alloc.c — Slab allocator for tree-sitter. * * Replaces malloc/calloc/realloc/free for ALL tree-sitter allocations, * eliminating ptmalloc2's per-thread arena fragmentation (the root cause @@ -8,12 +8,15 @@ * Tier 1 (≤64B): Fixed-size slab free list. * Matches tree-sitter SubtreeHeapData (64 bytes). O(1) alloc/free. * Backed by 64KB slab pages (malloc = mimalloc in production). + * Pages are owned by one thread for reuse, with a global registry so + * cross-thread tree-sitter frees cannot fall through to plain free(). * * All allocations >64B go directly to malloc() which is mimalloc * in production builds (MI_OVERRIDE=1). This eliminates the complex * tier2 bump allocator and its O(n) ownership checks. * - * On slab_destroy_thread: free all slab pages. + * On reclaim/destroy: free pages with no live chunks; retire pages that still + * have foreign-live chunks and free them when the final chunk returns. * realloc handles slab-to-heap promotion with minimal copying. */ #include "foundation/constants.h" @@ -21,7 +24,10 @@ #include "foundation/compat.h" #include +#include +#include #include +#include #include #include @@ -41,44 +47,85 @@ typedef struct slab_free_node { struct slab_free_node *next; } slab_free_node_t; +typedef struct slab_state slab_state_t; + /* One slab page — a contiguous block of SLAB_PAGE_CHUNKS chunks. */ typedef struct slab_page { - struct slab_page *next; /* linked list of pages */ - char data[SLAB_PAGE_SIZE]; + struct slab_page *next; /* linked list of pages owned by a TLS state */ + struct slab_page *global_next; /* global ownership registry */ + slab_state_t *owner; /* TLS state that can reuse freed chunks */ + unsigned live_count; /* chunks currently handed to tree-sitter */ + bool retired; /* owner reclaimed/destroyed while chunks live */ + alignas(max_align_t) char data[SLAB_PAGE_SIZE]; } slab_page_t; /* Per-thread Tier 1 state. */ -typedef struct { +struct slab_state { slab_page_t *pages; /* linked list of all allocated pages */ slab_free_node_t *freelist; /* singly-linked free list */ bool installed; -} slab_state_t; +}; static CBM_TLS slab_state_t tls_slab; +static atomic_flag g_slab_lock = ATOMIC_FLAG_INIT; +static slab_page_t *g_slab_pages = NULL; /* ── Tier 1 helpers ────────────────────────────────────────────────── */ -/* Rebuild free list from all existing pages. O(pages * SLAB_PAGE_CHUNKS). */ -static void slab_rebuild_freelist(slab_state_t *s) { - s->freelist = NULL; - for (slab_page_t *p = s->pages; p; p = p->next) { - for (size_t i = 0; i < SLAB_PAGE_CHUNKS; i++) { - slab_free_node_t *node = (slab_free_node_t *)(p->data + (i * SLAB_CHUNK_SIZE)); - node->next = s->freelist; - s->freelist = node; +static void slab_lock(void) { + while (atomic_flag_test_and_set_explicit(&g_slab_lock, memory_order_acquire)) { + /* Spin until the allocator registry is available. */ + } +} + +static void slab_unlock(void) { + atomic_flag_clear_explicit(&g_slab_lock, memory_order_release); +} + +static void slab_register_page_locked(slab_page_t *page) { + page->global_next = g_slab_pages; + g_slab_pages = page; +} + +static void slab_unregister_page_locked(slab_page_t *page) { + slab_page_t **cur = &g_slab_pages; + while (*cur) { + if (*cur == page) { + *cur = page->global_next; + page->global_next = NULL; + return; } + cur = &(*cur)->global_next; } } +static slab_page_t *slab_find_page_locked(const void *ptr) { + uintptr_t p = (uintptr_t)ptr; + for (slab_page_t *page = g_slab_pages; page; page = page->global_next) { + uintptr_t lo = (uintptr_t)page->data; + if (p >= lo && p < lo + (uintptr_t)SLAB_PAGE_SIZE) { + return page; + } + } + return NULL; +} + /* Add a new page to the slab and prepend its chunks to the free list. - * Pages are allocated via malloc (= mimalloc in production). */ -static bool slab_grow(slab_state_t *s) { + * Pages are allocated via malloc (= mimalloc in production). Caller holds + * g_slab_lock because tree-sitter's allocator callbacks are global and a + * chunk allocated by one parser thread may be freed by another parser thread. */ +static bool slab_grow_locked(slab_state_t *s) { slab_page_t *page = (slab_page_t *)malloc(sizeof(slab_page_t)); if (!page) { return false; } page->next = s->pages; + page->global_next = NULL; + page->owner = s; + page->live_count = 0; + page->retired = false; s->pages = page; + slab_register_page_locked(page); /* Thread page's chunks onto the free list */ for (size_t i = 0; i < SLAB_PAGE_CHUNKS; i++) { @@ -89,19 +136,39 @@ static bool slab_grow(slab_state_t *s) { return true; } -/* Check if a pointer belongs to any slab page (for realloc/free). - * Linear scan is bounded: per-file reclaim keeps page count small. */ -static bool slab_owns(const slab_state_t *s, const void *ptr) { - uintptr_t p = (uintptr_t)ptr; - for (const slab_page_t *page = s->pages; page; page = page->next) { - uintptr_t lo = (uintptr_t)page->data; - if (p >= lo && p < lo + (uintptr_t)SLAB_PAGE_SIZE) { - return true; +static slab_page_t *slab_detach_owned_pages_locked(slab_state_t *s) { + slab_page_t *free_pages = NULL; + slab_page_t *p = s->pages; + + while (p) { + slab_page_t *next = p->next; + if (p->live_count == 0) { + slab_unregister_page_locked(p); + p->next = free_pages; + free_pages = p; + } else { + p->owner = NULL; + p->retired = true; + p->next = NULL; } + p = next; + } + + s->pages = NULL; + s->freelist = NULL; + return free_pages; +} + +static void slab_free_page_list(slab_page_t *pages) { + while (pages) { + slab_page_t *next = pages->next; + free(pages); + pages = next; } - return false; } +static void slab_free(void *ptr); + /* ── Allocator functions (installed as tree-sitter callbacks) ───── */ static void *slab_malloc(size_t size) { @@ -111,13 +178,20 @@ static void *slab_malloc(size_t size) { /* Tier 1: ≤64B → slab free list */ if (size <= SLAB_CHUNK_SIZE) { slab_state_t *s = &tls_slab; + slab_lock(); if (!s->freelist) { - if (!slab_grow(s)) { + if (!slab_grow_locked(s)) { + slab_unlock(); return malloc(size); /* fallback */ } } slab_free_node_t *node = s->freelist; s->freelist = node->next; + slab_page_t *page = slab_find_page_locked(node); + if (page) { + page->live_count++; + } + slab_unlock(); return node; } @@ -145,18 +219,15 @@ static void *slab_realloc(void *ptr, size_t new_size) { } if (new_size == 0) { /* realloc(ptr, 0) = free + return NULL */ - if (slab_owns(&tls_slab, ptr)) { - slab_free_node_t *node = (slab_free_node_t *)ptr; - node->next = tls_slab.freelist; - tls_slab.freelist = node; - } else { - free(ptr); - } + slab_free(ptr); return NULL; } /* Case 1: ptr is in slab (≤64B block) */ - if (slab_owns(&tls_slab, ptr)) { + slab_lock(); + slab_page_t *page = slab_find_page_locked(ptr); + slab_unlock(); + if (page) { if (new_size <= SLAB_CHUNK_SIZE) { /* Still fits in a slab chunk — reuse same slot */ return ptr; @@ -167,10 +238,7 @@ static void *slab_realloc(void *ptr, size_t new_size) { return NULL; } memcpy(new_ptr, ptr, SLAB_CHUNK_SIZE); - /* Return slab slot to free list */ - slab_free_node_t *node = (slab_free_node_t *)ptr; - node->next = tls_slab.freelist; - tls_slab.freelist = node; + slab_free(ptr); return new_ptr; } @@ -182,13 +250,36 @@ static void slab_free(void *ptr) { if (!ptr) { return; } - /* Slab page */ - if (slab_owns(&tls_slab, ptr)) { + + slab_lock(); + slab_page_t *page = slab_find_page_locked(ptr); + if (page) { + bool free_retired_page = false; + if (page->live_count > 0) { + page->live_count--; + } + slab_free_node_t *node = (slab_free_node_t *)ptr; - node->next = tls_slab.freelist; - tls_slab.freelist = node; + if (page->owner && !page->retired) { + node->next = page->owner->freelist; + page->owner->freelist = node; + } else { + node->next = NULL; + } + + if (page->retired && page->live_count == 0) { + slab_unregister_page_locked(page); + free_retired_page = true; + } + slab_unlock(); + + if (free_retired_page) { + free(page); + } return; } + slab_unlock(); + /* Heap fallback */ free(ptr); } @@ -204,43 +295,37 @@ void cbm_slab_install(void) { } void cbm_slab_reset_thread(void) { - slab_state_t *s = &tls_slab; - if (!s->pages) { - return; - } - slab_rebuild_freelist(s); + cbm_slab_reclaim(); } void cbm_slab_destroy_thread(void) { slab_state_t *s = &tls_slab; - slab_page_t *p = s->pages; - while (p) { - slab_page_t *next = p->next; - free(p); - p = next; - } - s->pages = NULL; - s->freelist = NULL; + + slab_lock(); + slab_page_t *free_pages = slab_detach_owned_pages_locked(s); s->installed = false; + slab_unlock(); + + slab_free_page_list(free_pages); } -/* Reclaim all slab memory for the current thread. +/* Reclaim slab memory owned by the current thread. * * Call ONLY when no live allocations remain — i.e., after ts_tree_delete() - * AND ts_parser_delete() have freed everything back to the free lists. - * This keeps peak memory bounded per-file (not cumulative across files). */ + * AND ts_parser_delete() have freed local parser-owned chunks. If tree-sitter + * still returns a foreign-live chunk later, its page is retired and freed when + * live_count reaches zero. This keeps peak memory bounded per-file without + * handing foreign slab chunks to plain free(). */ void cbm_slab_reclaim(void) { slab_state_t *s = &tls_slab; - slab_page_t *p = s->pages; - while (p) { - slab_page_t *next = p->next; - free(p); - p = next; - } - s->pages = NULL; - s->freelist = NULL; + + slab_lock(); + slab_page_t *free_pages = slab_detach_owned_pages_locked(s); /* NOTE: keep s->installed true — allocator is still active, * just with empty pages. Next slab_malloc will call slab_grow. */ + slab_unlock(); + + slab_free_page_list(free_pages); } /* ── Test API (thin wrappers for unit testing) ──────────────────── */ diff --git a/src/foundation/slab_alloc.h b/src/foundation/slab_alloc.h index 9648ba167..d8ca92e06 100644 --- a/src/foundation/slab_alloc.h +++ b/src/foundation/slab_alloc.h @@ -1,12 +1,14 @@ /* - * slab_alloc.h — Thread-local slab allocator for tree-sitter. + * slab_alloc.h — Slab allocator for tree-sitter. * * Replaces malloc/calloc/realloc/free for ALL tree-sitter allocations * to eliminate ptmalloc2's per-thread arena fragmentation. * * Tier 1 (≤64B): Fixed-size slab free list — O(1) alloc/free. * Matches tree-sitter SubtreeHeapData (CBM_SZ_64 bytes). Backed by - * 64KB slab pages via malloc (= mimalloc in production). + * 64KB slab pages via malloc (= mimalloc in production). Pages are + * reused per thread but tracked globally because tree-sitter allocator + * callbacks are process-global and may receive cross-thread frees. * * All allocations >64B go directly to malloc (= mimalloc in production), * which handles size classes, thread caching, and OS page return @@ -15,7 +17,7 @@ * Usage: * cbm_slab_install(); // once, before any parsing * ... parse files ... - * cbm_slab_destroy_thread(); // on thread exit — frees all memory + * cbm_slab_destroy_thread(); // on thread exit — frees owned memory */ #ifndef CBM_SLAB_ALLOC_H #define CBM_SLAB_ALLOC_H @@ -26,20 +28,22 @@ * Must be called once before any ts_parser_new() calls. Thread-safe. */ void cbm_slab_install(void); -/* Reset the current thread's slab: all chunks become available. +/* Reset the current thread's slab: owned pages are reclaimed or retired. * WARNING: Do NOT call between files if the parser retains live state. * Only safe after cbm_destroy_thread_parser() has been called. */ void cbm_slab_reset_thread(void); -/* Destroy the current thread's allocator state: free all slab pages. +/* Destroy the current thread's allocator state. Pages with live chunks from + * cross-thread tree-sitter ownership are retired and freed on the last free. * Call on thread exit. */ void cbm_slab_destroy_thread(void); -/* Reclaim all slab memory for the current thread. +/* Reclaim current-thread slab memory. * Call ONLY when no live allocations remain (after ts_tree_delete AND - * ts_parser_delete). Keeps the allocator installed — next allocation - * will grow fresh pages as needed. This bounds peak memory per-file - * rather than accumulating across all files in a worker. */ + * ts_parser_delete). If another parser thread still owns a chunk from this + * page, the page is retired instead of freed and is released on the last + * cross-thread free. Keeps the allocator installed — next allocation will + * grow fresh pages as needed. */ void cbm_slab_reclaim(void); /* Test/diagnostic API: direct access to the slab allocator. diff --git a/src/pipeline/pass_parallel.c b/src/pipeline/pass_parallel.c index 93656b5ec..113225b44 100644 --- a/src/pipeline/pass_parallel.c +++ b/src/pipeline/pass_parallel.c @@ -671,13 +671,14 @@ static void extract_worker(int worker_id, void *ctx_ptr) { "total", itoa_log(ec->file_count)); } - /* Reclaim all slab + tier2 memory between files. + /* Reclaim slab memory between files. * * After cbm_free_tree(result), all tree nodes are on free lists. * We then destroy the parser (frees its internal allocations too), - * leaving ZERO live slab/tier2 pointers. At that point, we can - * safely munmap/free every page, bounding peak memory per-file - * instead of accumulating across all 644 files. + * so the current thread should have no live parser-owned slab + * pointers. If tree-sitter returns a chunk from another parser + * thread later, the allocator retires that page and frees it when + * the final live chunk is returned. * * get_thread_parser() in cbm_extract_file will create a fresh * parser for the next file — cost is microseconds vs seconds @@ -2405,11 +2406,9 @@ static void resolve_worker(int worker_id, void *ctx_ptr) { } } free(filtered); - /* Contract: cbm_slab_reclaim() requires the thread parser to be - * destroyed first; otherwise its lexer holds slab pointers - * (lexer.included_ranges) that get freed underneath it, causing - * a heap-use-after-free on the next ts_lexer_goto. The next - * cbm_extract_file on this thread will recreate the parser. */ + /* Contract: destroy the thread parser before reclaim so the + * lexer releases its slab pointers. Foreign-live chunks are + * handled by the allocator's global ownership registry. */ cbm_destroy_thread_parser(); cbm_slab_reclaim(); uint64_t lsp_elapsed_ns = extract_now_ns() - lsp_t0; diff --git a/tests/test_mem.c b/tests/test_mem.c index debb9b505..96f3ce02d 100644 --- a/tests/test_mem.c +++ b/tests/test_mem.c @@ -8,6 +8,7 @@ #include "../src/foundation/mem.h" #include "../src/foundation/arena.h" #include "../src/foundation/slab_alloc.h" +#include "../src/foundation/compat_thread.h" #include "pipeline/pipeline.h" #include "pipeline/pipeline_internal.h" #include "graph_buffer/graph_buffer.h" @@ -521,6 +522,59 @@ TEST(slab_mixed_alloc_free_stress) { PASS(); } +typedef struct { + void *ptr; + atomic_int *go; +} slab_cross_thread_free_ctx_t; + +static void *slab_cross_thread_free_worker(void *arg) { + slab_cross_thread_free_ctx_t *ctx = (slab_cross_thread_free_ctx_t *)arg; + while (ctx->go && !atomic_load_explicit(ctx->go, memory_order_acquire)) { + cbm_usleep(1000); + } + cbm_slab_test_free(ctx->ptr); + return NULL; +} + +TEST(slab_cross_thread_free_is_safe) { + cbm_slab_install(); + + void *p = cbm_slab_test_malloc(32); + ASSERT_NOT_NULL(p); + memset(p, 0x5A, 32); + + atomic_int go; + atomic_init(&go, 1); + slab_cross_thread_free_ctx_t ctx = {.ptr = p, .go = &go}; + cbm_thread_t t; + ASSERT_EQ(cbm_thread_create(&t, 0, slab_cross_thread_free_worker, &ctx), 0); + ASSERT_EQ(cbm_thread_join(&t), 0); + + cbm_slab_destroy_thread(); + PASS(); +} + +TEST(slab_reclaim_with_foreign_live_chunk_is_safe) { + cbm_slab_install(); + + void *p = cbm_slab_test_malloc(32); + ASSERT_NOT_NULL(p); + memset(p, 0xA5, 32); + + atomic_int go; + atomic_init(&go, 0); + slab_cross_thread_free_ctx_t ctx = {.ptr = p, .go = &go}; + cbm_thread_t t; + ASSERT_EQ(cbm_thread_create(&t, 0, slab_cross_thread_free_worker, &ctx), 0); + + cbm_slab_reclaim(); + atomic_store_explicit(&go, 1, memory_order_release); + ASSERT_EQ(cbm_thread_join(&t), 0); + + cbm_slab_destroy_thread(); + PASS(); +} + /* ── Parallel extraction integration test ──────────────────── */ static char g_mem_tmpdir[256]; @@ -665,6 +719,8 @@ SUITE(mem) { RUN_TEST(slab_realloc_slab_to_heap); RUN_TEST(slab_calloc_zeroed); RUN_TEST(slab_mixed_alloc_free_stress); + RUN_TEST(slab_cross_thread_free_is_safe); + RUN_TEST(slab_reclaim_with_foreign_live_chunk_is_safe); /* Integration */ RUN_TEST(parallel_extract_with_slab); }