diff --git a/Makefile.cbm b/Makefile.cbm index 3ff50b81f..2f72d1060 100644 --- a/Makefile.cbm +++ b/Makefile.cbm @@ -199,6 +199,7 @@ PIPELINE_SRCS = \ src/pipeline/pass_similarity.c \ src/pipeline/pass_semantic_edges.c \ src/pipeline/pass_complexity.c \ + src/pipeline/pass_importance.c \ src/pipeline/pass_cross_repo.c \ src/pipeline/artifact.c \ src/pipeline/pass_pkgmap.c @@ -326,7 +327,7 @@ TEST_DISCOVER_SRCS = \ TEST_GRAPH_BUFFER_SRCS = tests/test_graph_buffer.c -TEST_PIPELINE_SRCS = tests/test_registry.c tests/test_pipeline.c tests/test_fqn.c tests/test_path_alias.c tests/test_configlink.c tests/test_infrascan.c tests/test_worker_pool.c tests/test_parallel.c +TEST_PIPELINE_SRCS = tests/test_registry.c tests/test_pipeline.c tests/test_fqn.c tests/test_path_alias.c tests/test_configlink.c tests/test_infrascan.c tests/test_worker_pool.c tests/test_parallel.c tests/test_importance.c TEST_WATCHER_SRCS = tests/test_watcher.c diff --git a/src/foundation/constants.h b/src/foundation/constants.h index 977183d63..01d1a35fd 100644 --- a/src/foundation/constants.h +++ b/src/foundation/constants.h @@ -48,6 +48,21 @@ enum { CBM_PERCENT = 100, }; +/* ── Per-file parse-size cap ──────────────────────────────────── */ +/* Default per-file source-read cap (MB), env-overridable via CBM_MAX_FILE_MB + * (see cbm_max_file_bytes() in system_info.c, same clamp shape as + * cbm_default_worker_count()/CBM_WORKERS). 10 MB clears a hand-authored + * amalgamation source like sqlite3.c (~8 MB) while still bounding a single + * worker's per-file parse working set. Previously expressed 7x as the + * misused CBM_PERCENT (100) * CBM_SZ_1K * CBM_SZ_1K "100 MB" literal — + * collapsed to this single named constant; CBM_PERCENT itself is untouched + * and remains the real percentage constant for mem.c/vmem.c/cypher.c. */ +enum { + CBM_DEFAULT_MAX_FILE_MB = 10, + CBM_MIN_FILE_MB = 1, /* floor for CBM_MAX_FILE_MB env override */ + CBM_MAX_FILE_MB_CAP = 1024, /* ceiling for CBM_MAX_FILE_MB env override (1 GB) */ +}; + /* ── Tree-sitter field name helper ───────────────────────────── */ /* Usage: ts_node_child_by_field_name(node, TS_FIELD("callee")) * Expands to: ts_node_child_by_field_name(node, TS_FIELD("callee")) diff --git a/src/foundation/mem.c b/src/foundation/mem.c index 67ef4d14e..c08f51986 100644 --- a/src/foundation/mem.c +++ b/src/foundation/mem.c @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef _WIN32 #ifndef WIN32_LEAN_AND_MEAN @@ -38,6 +39,17 @@ static atomic_int g_was_over; /* pressure hysteresis */ #define MB_DIVISOR ((size_t)(CBM_SZ_1K * CBM_SZ_1K)) +/* ── Hard memory ceiling (enforcing — see mem.h) ───────────────── + * + * Fraction of cgroup-aware detected RAM (cbm_system_info().total_ram), + * always strictly above the advisory DEFAULT_RAM_FRACTION budget so a + * repo that trips backpressure but recovers never reaches the ceiling. + * Absolute floor protects a legit big repo on a small-RAM CI runner from + * spuriously aborting at a tiny fraction-derived value. */ +#define CBM_MEM_CEILING_FRACTION 0.85 +#define CBM_MEM_CEILING_FLOOR_MB ((size_t)2048) /* 2 GB floor */ +#define CBM_MEM_CEILING_CAP_MB ((size_t)1024 * 1024) /* 1 TB env-override ceiling */ + /* ── OS fallback for RSS (ASan builds where MI_OVERRIDE=0) ──── */ static size_t os_rss(void) { @@ -134,13 +146,41 @@ void cbm_mem_init(double ram_fraction) { } size_t cbm_mem_rss(void) { +#if defined(__linux__) && !defined(_WIN32) + /* Linux: mimalloc's _mi_prim_process_info() (vendored/mimalloc/src/ + * prim/unix/prim.c) never sets pinfo->current_rss on Linux — only + * peak_rss (via getrusage's ru_maxrss, a high-water mark). current_rss + * silently falls back to mimalloc's OWN committed-page counter + * (mi_process_info()'s pinfo.current_commit default), which this + * project deliberately tunes low via mi_option_arena_eager_commit=0 + + * mi_option_purge_decommits=1 + mi_option_purge_delay=0 (cbm_mem_init, + * above) to reduce upfront memory. Combined, "current_rss" on Linux + * can read near-zero (observed: 4MB) while true RSS is multiple GB — + * a small-but-NONZERO value that defeated the `current_rss > 0` + * ASan-only fallback guard below, silently blinding both this + * function's callers (cbm_mem_over_budget backpressure AND the + * enforcing ceiling in this same file) to real memory pressure during + * concurrent large-file parsing — the exact mechanism the 2026-07-01 + * incident diagnosed. os_rss() (/proc/self/statm) is unaffected by + * mimalloc's internal accounting and is authoritative OS-reported RSS + * on every Linux build regardless of allocator tuning, so it is the + * PRIMARY source here, not a last-resort fallback. Verified via a + * calibrated 40x8MB synthetic large-file index on IO: mi_process_info + * current_rss=4MB, actual RSS (ps/proc)=~2.2GB, os_rss()=~2.2GB. */ + size_t rss = os_rss(); + if (rss > 0) { + return rss; + } + /* Extremely unlikely (/proc unavailable) — fall through to mimalloc. */ +#endif size_t current_rss = 0; size_t peak_rss = 0; mi_process_info(NULL, NULL, NULL, ¤t_rss, &peak_rss, NULL, NULL, NULL); if (current_rss > 0) { return current_rss; } - /* Fallback for ASan builds (MI_OVERRIDE=0) */ + /* Fallback for ASan builds (MI_OVERRIDE=0) and any other platform + * where mimalloc's current_rss is unavailable/zero. */ return os_rss(); } @@ -174,3 +214,54 @@ size_t cbm_mem_worker_budget(int num_workers) { void cbm_mem_collect(void) { mi_collect(true); } + +/* ── Hard memory ceiling (enforcing) ─────────────────────────────── */ + +size_t cbm_mem_ceiling(void) { + /* CBM_MEM_CEILING_MB env override (clamped to [CBM_MEM_CEILING_FLOOR_MB, + * CBM_MEM_CEILING_CAP_MB]). Same precedence/clamp shape as + * CBM_WORKERS / CBM_MAX_FILE_MB: unset, blank, or non-numeric all parse + * to 0 via strtoull, which falls below the floor and is rejected the + * same way an out-of-range value is. */ + char buf[CBM_SZ_32]; + if (cbm_safe_getenv("CBM_MEM_CEILING_MB", buf, sizeof(buf), NULL) != NULL) { + char *end = NULL; + unsigned long long mb = strtoull(buf, &end, CBM_DECIMAL_BASE); + if (end != buf && mb >= CBM_MEM_CEILING_FLOOR_MB && mb <= CBM_MEM_CEILING_CAP_MB) { + return (size_t)mb * MB_DIVISOR; + } + cbm_log_warn("mem_ceiling.env.invalid", "value", buf, "fallback", "fraction"); + } + + cbm_system_info_t info = cbm_system_info(); + size_t fraction_bytes = (size_t)((double)info.total_ram * CBM_MEM_CEILING_FRACTION); + size_t floor_bytes = CBM_MEM_CEILING_FLOOR_MB * MB_DIVISOR; + return fraction_bytes > floor_bytes ? fraction_bytes : floor_bytes; +} + +bool cbm_mem_over_ceiling(void) { + return cbm_mem_rss() > cbm_mem_ceiling(); +} + +void cbm_mem_abort_if_over_ceiling(const char *file, const char *phase) { + size_t rss = cbm_mem_rss(); + size_t ceiling = cbm_mem_ceiling(); + if (rss <= ceiling) { + return; + } + + char rss_mb[CBM_SZ_32]; + char ceiling_mb[CBM_SZ_32]; + snprintf(rss_mb, sizeof(rss_mb), "%zu", rss / MB_DIVISOR); + snprintf(ceiling_mb, sizeof(ceiling_mb), "%zu", ceiling / MB_DIVISOR); + cbm_log_error("mem.ceiling.abort", "file", file ? file : "unknown", "phase", + phase ? phase : "n/a", "rss_mb", rss_mb, "ceiling_mb", ceiling_mb); + /* Hard abort: SIGABRT, default handler, non-zero exit. Intentionally not + * a graceful cancel — cbm_pipeline_cancel() already exists for that path + * and is advisory-cooperative (workers check an atomic and unwind); RSS + * already over the enforcing ceiling means further allocation to unwind + * cleanly (free lists, log buffers) is itself the risk being guarded + * against, so we terminate immediately instead. Must only be reached + * from the in-memory extract/resolve phases, before any SQLite dump. */ + abort(); +} diff --git a/src/foundation/mem.h b/src/foundation/mem.h index 5001410a8..0c908d7db 100644 --- a/src/foundation/mem.h +++ b/src/foundation/mem.h @@ -35,4 +35,38 @@ size_t cbm_mem_worker_budget(int num_workers); /* Return unused pages to the OS. Call between files to bound per-file peak. */ void cbm_mem_collect(void); +/* ── Hard memory ceiling (abort, not advisory) ─────────────────── + * + * Distinct from cbm_mem_budget()/cbm_mem_over_budget() above, which are + * ADVISORY: pass_parallel.c backpressure naps and proceeds with a soft + * overshoot when workers can't get back under budget. The ceiling below is + * ENFORCING: cbm_mem_abort_if_over_ceiling() hard-aborts the process + * (abort(), SIGABRT) when exceeded, after emitting a diagnostic dump naming + * the offending file, pipeline phase, and RSS. It is always set strictly + * above the advisory budget (see cbm_mem_ceiling), so a repo that trips the + * advisory backpressure but recovers never reaches the ceiling. + * + * Call cbm_mem_abort_if_over_ceiling() only from the in-memory + * extract/resolve phases, BEFORE the graph buffer is dumped to SQLite + * (pipeline.c: run_parallel_pipeline() precedes dump_and_persist_hashes()). + * An abort there can never leave a half-written store. */ + +/* Hard-abort ceiling in bytes: max(CBM_MEM_CEILING_FRACTION * total_ram, + * CBM_MEM_CEILING_FLOOR_MB), unless overridden by the CBM_MEM_CEILING_MB + * env var (same precedence/clamp shape as CBM_WORKERS / CBM_MAX_FILE_MB). + * Always computed fresh (env can change between calls in tests); cheap + * (one getenv + the cached cbm_system_info()). */ +size_t cbm_mem_ceiling(void); + +/* Returns true if current RSS exceeds cbm_mem_ceiling(). */ +bool cbm_mem_over_ceiling(void); + +/* If current RSS exceeds cbm_mem_ceiling(), log a diagnostic dump (offending + * file, phase, RSS, ceiling) at ERROR level to stderr and hard-abort the + * process via abort(). Returns (does nothing) otherwise. `file` and `phase` + * may be NULL (logged as "unknown"/"n/a") when the caller has no better + * label at the call site. Not signal-safe; call only from a normal worker + * context, never from a signal handler. */ +void cbm_mem_abort_if_over_ceiling(const char *file, const char *phase); + #endif /* CBM_MEM_H */ diff --git a/src/foundation/platform.h b/src/foundation/platform.h index 2511a060e..e6f6846eb 100644 --- a/src/foundation/platform.h +++ b/src/foundation/platform.h @@ -114,6 +114,17 @@ cbm_system_info_t cbm_system_info(void); * initial=false: max(1, perf_cores-1) (leave headroom for user apps) */ int cbm_default_worker_count(bool initial); +/* Per-file source-read cap in bytes, for the read_file() size guard used by + * every extraction pass (pass_calls.c, pass_definitions.c, pass_semantic.c, + * pass_usages.c, pass_k8s.c, pass_parallel.c, pass_lsp_cross.c). A file + * larger than this is SKIPPED (read_file returns NULL), never aborted — + * distinct from the process-wide RSS ceiling in mem.h, which aborts. + * CBM_MAX_FILE_MB env override (clamped to [CBM_MIN_FILE_MB, + * CBM_MAX_FILE_MB_CAP]); default CBM_DEFAULT_MAX_FILE_MB. Same + * precedence/clamp shape as cbm_default_worker_count()/CBM_WORKERS: blank, + * unset, or non-numeric falls back to the default (never coerces to 0). */ +long cbm_max_file_bytes(void); + /* ── Environment variables ──────────────────────────────────────── */ /* Thread-safe getenv: copies the value into a caller-provided buffer. diff --git a/src/foundation/system_info.c b/src/foundation/system_info.c index 21e0a9cb3..349ed6ac1 100644 --- a/src/foundation/system_info.c +++ b/src/foundation/system_info.c @@ -304,3 +304,21 @@ int cbm_default_worker_count(bool initial) { int workers = info.perf_cores - SKIP_ONE; return workers > 0 ? workers : MIN_WORKERS; } + +long cbm_max_file_bytes(void) { + /* CBM_MAX_FILE_MB env override (clamped to [CBM_MIN_FILE_MB, + * CBM_MAX_FILE_MB_CAP]). Same precedence/clamp shape as + * cbm_default_worker_count()/CBM_WORKERS above: unset, blank, or + * non-numeric all parse to 0 via strtol, which falls below the floor + * and is rejected the same way an out-of-range value is — so blank + * never silently coerces to "cap every file at 0 bytes". */ + char buf[CBM_SZ_32]; + if (cbm_safe_getenv("CBM_MAX_FILE_MB", buf, sizeof(buf), NULL) != NULL) { + long n = strtol(buf, NULL, CBM_DECIMAL_BASE); + if (n >= CBM_MIN_FILE_MB && n <= CBM_MAX_FILE_MB_CAP) { + return n * (long)CBM_SZ_1K * (long)CBM_SZ_1K; + } + cbm_log_warn("max_file_mb.env.invalid", "value", buf, "fallback", "default"); + } + return (long)CBM_DEFAULT_MAX_FILE_MB * (long)CBM_SZ_1K * (long)CBM_SZ_1K; +} diff --git a/src/main.c b/src/main.c index f2b72cba8..a38f3380d 100644 --- a/src/main.c +++ b/src/main.c @@ -298,9 +298,9 @@ static void print_help(void) { printf(" Claude Code, Codex CLI, Gemini CLI, Zed, OpenCode,\n"); printf(" Antigravity, Aider, KiloCode, Kiro\n"); printf("\nTools: index_repository, search_graph, query_graph, trace_path,\n"); - printf(" get_code_snippet, get_graph_schema, get_architecture, search_code,\n"); - printf(" list_projects, delete_project, index_status, detect_changes,\n"); - printf(" manage_adr, ingest_traces\n"); + printf(" get_code_snippet, get_graph_schema, get_architecture, repo_map,\n"); + printf(" search_code, list_projects, delete_project, index_status,\n"); + printf(" detect_changes, manage_adr, ingest_traces\n"); } /* ── Main ───────────────────────────────────────────────────────── */ diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index d3aa3bf3d..be21a04a0 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -1,5 +1,5 @@ /* - * mcp.c — MCP server: JSON-RPC 2.0 over stdio with 14 graph tools. + * mcp.c — MCP server: JSON-RPC 2.0 over stdio with 15 graph tools. * * Uses yyjson for fast JSON parsing/building. * Single-threaded event loop: read line → parse → dispatch → respond. @@ -392,6 +392,24 @@ static const tool_def_t TOOLS[] = { "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"},\"aspects\":{\"type\":" "\"array\",\"items\":{\"type\":\"string\"}}},\"required\":[\"project\"]}"}, + {"repo_map", + "Token-budgeted repository map: the most important symbols (persisted importance score, " + "Aider-style weighted degree) rendered one per line as 'file: signature' — no bodies. " + "Two modes: pass seed_anchors (symbol names, qualified names, or file paths relevant to " + "the task) to boost the seed neighbourhood to the top (seed-boost mode, recommended); " + "omit or pass unresolvable seeds for the global importance-ranked map. Output always " + "fits token_budget. Requires an index built by an importance-scoring binary — errors " + "with 'unscored' if the project needs re-indexing.", + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}," + "\"seed_anchors\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}," + "\"description\":\"Task-relevant anchors: exact symbol name, qualified name, or file " + "path. Seed neighbourhoods (1-hop CALLS/USAGE, widened for weakly-connected seeds) are " + "boosted above globally-important symbols. Empty/unresolvable anchors fall back to the " + "global map.\"}," + "\"token_budget\":{\"type\":\"integer\",\"default\":1600,\"description\":\"Approximate " + "output ceiling in tokens (chars/4 estimate). Must be positive.\"}}," + "\"required\":[\"project\"]}"}, + {"search_code", "Graph-augmented code search. Finds text patterns via grep, then enriches results with " "the knowledge graph: deduplicates matches into containing functions, ranks by structural " @@ -885,6 +903,32 @@ static char *build_project_list_error(const char *reason) { return heap_strdup(buf); } +/* Read the caller's project argument. Accepts `project_id` as an ergonomic + * alias for `project` — never advertised in the TOOLS[] input_schema strings + * (upstream API name stays canonical; the alias is silent forgiveness, not a + * documented parameter). `project` wins when both are present, even when its + * value is wrong (an unknown project) — a present `project` is never + * overridden by `project_id`. Empty string is treated as absent (never + * looked up as a project name); a non-string JSON value is already NULL from + * cbm_mcp_get_string_arg. Caller owns the heap-allocated result. + * (pai/codebase-memory-search) */ +static char *get_project_arg(const char *args_json) { + char *project = cbm_mcp_get_string_arg(args_json, "project"); + if (project && project[0] == '\0') { + free(project); + project = NULL; + } + if (!project) { + char *alias = cbm_mcp_get_string_arg(args_json, "project_id"); + if (alias && alias[0] == '\0') { + free(alias); + alias = NULL; + } + project = alias; + } + return project; +} + /* Bail with project list when no store is available. */ #define REQUIRE_STORE(store, project) \ do { \ @@ -1034,7 +1078,10 @@ static char *verify_project_indexed(cbm_store_t *store, const char *project) { } static char *handle_get_graph_schema(cbm_mcp_server_t *srv, const char *args) { - char *project = cbm_mcp_get_string_arg(args, "project"); + char *project = get_project_arg(args); + if (!project) { + return cbm_mcp_text_result("project is required", true); + } cbm_store_t *store = resolve_store(srv, project); REQUIRE_STORE(store, project); @@ -1465,7 +1512,10 @@ static bool run_semantic_query(yyjson_mut_doc *doc, yyjson_mut_val *root, const } static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) { - char *project = cbm_mcp_get_string_arg(args, "project"); + char *project = get_project_arg(args); + if (!project) { + return cbm_mcp_text_result("project is required", true); + } cbm_store_t *store = resolve_store(srv, project); REQUIRE_STORE(store, project); @@ -1608,14 +1658,18 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) { static char *handle_query_graph(cbm_mcp_server_t *srv, const char *args) { char *query = cbm_mcp_get_string_arg(args, "query"); - char *project = cbm_mcp_get_string_arg(args, "project"); - cbm_store_t *store = resolve_store(srv, project); + char *project = get_project_arg(args); int max_rows = cbm_mcp_get_int_arg(args, "max_rows", 0); if (!query) { free(project); return cbm_mcp_text_result("query is required", true); } + if (!project) { + free(query); + return cbm_mcp_text_result("project is required", true); + } + cbm_store_t *store = resolve_store(srv, project); if (!store) { char *_err = build_project_list_error("project not found or not indexed"); char *_res = cbm_mcp_text_result(_err, true); @@ -1686,7 +1740,10 @@ static char *handle_query_graph(cbm_mcp_server_t *srv, const char *args) { } static char *handle_index_status(cbm_mcp_server_t *srv, const char *args) { - char *project = cbm_mcp_get_string_arg(args, "project"); + char *project = get_project_arg(args); + if (!project) { + return cbm_mcp_text_result("project is required", true); + } cbm_store_t *store = resolve_store(srv, project); REQUIRE_STORE(store, project); @@ -1694,6 +1751,14 @@ static char *handle_index_status(cbm_mcp_server_t *srv, const char *args) { yyjson_mut_val *root = yyjson_mut_obj(doc); yyjson_mut_doc_set_root(doc, root); + /* Fetch the persisted project row so we can surface indexed_at — lets + * staleness be compared against the graph's own timestamp instead of + * re-verifying everything by whole-file reads. proj holds heap-owned + * strings (heap_strdup in cbm_store_get_project); freed after the + * response is serialized (see below). */ + cbm_project_t proj = {0}; + bool have_proj = (cbm_store_get_project(store, project, &proj) == CBM_STORE_OK); + if (project) { int nodes = cbm_store_count_nodes(store, project); int edges = cbm_store_count_edges(store, project); @@ -1701,6 +1766,12 @@ static char *handle_index_status(cbm_mcp_server_t *srv, const char *args) { yyjson_mut_obj_add_int(doc, root, "nodes", nodes); yyjson_mut_obj_add_int(doc, root, "edges", edges); yyjson_mut_obj_add_str(doc, root, "status", nodes > 0 ? "ready" : "empty"); + /* indexed_at added via the no-copy _str variant, so proj.indexed_at + * MUST stay alive until after yy_doc_to_str(doc) below — matching the + * `project` heap-string handling in this same function. */ + if (have_proj && proj.indexed_at) { + yyjson_mut_obj_add_str(doc, root, "indexed_at", proj.indexed_at); + } if (nodes == 0) { yyjson_mut_obj_add_str( doc, root, "hint", @@ -1712,6 +1783,11 @@ static char *handle_index_status(cbm_mcp_server_t *srv, const char *args) { char *json = yy_doc_to_str(doc); yyjson_mut_doc_free(doc); + /* Free proj's heap-owned fields only after serialization — indexed_at was + * added no-copy above. */ + if (have_proj) { + cbm_project_free_fields(&proj); + } free(project); char *result = cbm_mcp_text_result(json, false); @@ -1721,7 +1797,7 @@ static char *handle_index_status(cbm_mcp_server_t *srv, const char *args) { /* delete_project: just erase the .db file (and WAL/SHM). */ static char *handle_delete_project(cbm_mcp_server_t *srv, const char *args) { - char *name = cbm_mcp_get_string_arg(args, "project"); + char *name = get_project_arg(args); if (!name) { return cbm_mcp_text_result("project is required", true); } @@ -1832,7 +1908,10 @@ static void append_cross_repo_summary(yyjson_mut_doc *doc, yyjson_mut_val *root, } static char *handle_get_architecture(cbm_mcp_server_t *srv, const char *args) { - char *project = cbm_mcp_get_string_arg(args, "project"); + char *project = get_project_arg(args); + if (!project) { + return cbm_mcp_text_result("project is required", true); + } cbm_store_t *store = resolve_store(srv, project); REQUIRE_STORE(store, project); @@ -2204,8 +2283,7 @@ static yyjson_mut_val *bfs_to_json_array(yyjson_mut_doc *doc, cbm_traverse_resul static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) { char *func_name = cbm_mcp_get_string_arg(args, "function_name"); - char *project = cbm_mcp_get_string_arg(args, "project"); - cbm_store_t *store = resolve_store(srv, project); + char *project = get_project_arg(args); char *direction = cbm_mcp_get_string_arg(args, "direction"); char *mode = cbm_mcp_get_string_arg(args, "mode"); char *param_name = cbm_mcp_get_string_arg(args, "parameter_name"); @@ -2220,6 +2298,14 @@ static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) { free(param_name); return cbm_mcp_text_result("function_name is required", true); } + if (!project) { + free(func_name); + free(direction); + free(mode); + free(param_name); + return cbm_mcp_text_result("project is required", true); + } + cbm_store_t *store = resolve_store(srv, project); if (!store) { char *_err = build_project_list_error("project not found or not indexed"); char *_res = cbm_mcp_text_result(_err, true); @@ -2904,13 +2990,17 @@ static char *build_snippet_response(cbm_mcp_server_t *srv, cbm_node_t *node, static char *handle_get_code_snippet(cbm_mcp_server_t *srv, const char *args) { char *qn = cbm_mcp_get_string_arg(args, "qualified_name"); - char *project = cbm_mcp_get_string_arg(args, "project"); + char *project = get_project_arg(args); bool include_neighbors = cbm_mcp_get_bool_arg(args, "include_neighbors"); if (!qn) { free(project); return cbm_mcp_text_result("qualified_name is required", true); } + if (!project) { + free(qn); + return cbm_mcp_text_result("project is required", true); + } cbm_store_t *store = resolve_store(srv, project); if (!store) { @@ -3605,7 +3695,7 @@ static bool compile_path_filter(const char *filter, cbm_regex_t *re) { static char *handle_search_code(cbm_mcp_server_t *srv, const char *args) { char *pattern = cbm_mcp_get_string_arg(args, "pattern"); - char *project = cbm_mcp_get_string_arg(args, "project"); + char *project = get_project_arg(args); char *file_pattern = cbm_mcp_get_string_arg(args, "file_pattern"); char *path_filter = cbm_mcp_get_string_arg(args, "path_filter"); char *mode_str = cbm_mcp_get_string_arg(args, "mode"); @@ -3626,23 +3716,32 @@ static char *handle_search_code(cbm_mcp_server_t *srv, const char *args) { path_filter = NULL; if (!pattern) { + if (has_path_filter) { + cbm_regfree(&path_regex); + } free(project); free(file_pattern); return cbm_mcp_text_result("pattern is required", true); } - /* Project is required */ + /* Project is required. Deliberately a plain message (never + * build_project_list_error) — "missing the param" and "gave an unknown + * value" are different failures and must not share the available_projects + * shape (pai/codebase-memory-search). */ if (!project) { + if (has_path_filter) { + cbm_regfree(&path_regex); + } free(pattern); free(file_pattern); - char *_err = build_project_list_error("project is required"); - char *_res = cbm_mcp_text_result(_err, true); - free(_err); - return _res; + return cbm_mcp_text_result("project is required", true); } char *root_path = get_project_root(srv, project); if (!root_path) { + if (has_path_filter) { + cbm_regfree(&path_regex); + } free(pattern); free(project); free(file_pattern); @@ -3726,6 +3825,9 @@ static char *handle_search_code(cbm_mcp_server_t *srv, const char *args) { char errmsg[CBM_SZ_256]; snprintf(errmsg, sizeof(errmsg), "search failed: cannot create temp file (%s)", strerror(errno)); + if (has_path_filter) { + cbm_regfree(&path_regex); + } free(root_path); free(pattern); free(project); @@ -3758,6 +3860,9 @@ static char *handle_search_code(cbm_mcp_server_t *srv, const char *args) { if (scoped) { cbm_unlink(filelist); } + if (has_path_filter) { + cbm_regfree(&path_regex); + } free(root_path); free(pattern); free(project); @@ -3863,7 +3968,7 @@ static void detect_add_impacted_symbols(cbm_store_t *store, const char *project, } static char *handle_detect_changes(cbm_mcp_server_t *srv, const char *args) { - char *project = cbm_mcp_get_string_arg(args, "project"); + char *project = get_project_arg(args); char *base_branch = cbm_mcp_get_string_arg(args, "base_branch"); char *scope = cbm_mcp_get_string_arg(args, "scope"); int depth = cbm_mcp_get_int_arg(args, "depth", MCP_DEFAULT_BFS_DEPTH); @@ -3883,12 +3988,22 @@ static char *handle_detect_changes(cbm_mcp_server_t *srv, const char *args) { return cbm_mcp_text_result("base_branch contains invalid characters", true); } + if (!project) { + free(project); + free(base_branch); + free(scope); + return cbm_mcp_text_result("project is required", true); + } + char *root_path = get_project_root(srv, project); if (!root_path) { free(project); free(base_branch); free(scope); - return cbm_mcp_text_result("project not found", true); + char *_err = build_project_list_error("project not found or not indexed"); + char *_res = cbm_mcp_text_result(_err, true); + free(_err); + return _res; } if (!validate_search_path_arg(root_path)) { @@ -4059,7 +4174,7 @@ static char *adr_read_legacy_file(const char *root_path) { "PATTERNS, TRADEOFFS, PHILOSOPHY." static char *handle_manage_adr(cbm_mcp_server_t *srv, const char *args) { - char *project = cbm_mcp_get_string_arg(args, "project"); + char *project = get_project_arg(args); char *mode_str = cbm_mcp_get_string_arg(args, "mode"); char *content = cbm_mcp_get_string_arg(args, "content"); @@ -4067,6 +4182,13 @@ static char *handle_manage_adr(cbm_mcp_server_t *srv, const char *args) { mode_str = heap_strdup("get"); } + if (!project) { + free(project); + free(mode_str); + free(content); + return cbm_mcp_text_result("project is required", true); + } + /* ADRs are stored in the SQLite store (project_summaries), the SAME * backend the UI /api/adr endpoints use — so writes via the MCP tool and * the UI are visible to each other (#256). */ @@ -4075,7 +4197,10 @@ static char *handle_manage_adr(cbm_mcp_server_t *srv, const char *args) { free(project); free(mode_str); free(content); - return cbm_mcp_text_result("project not found", true); + char *_err = build_project_list_error("project not found or not indexed"); + char *_res = cbm_mcp_text_result(_err, true); + free(_err); + return _res; } /* One-time migration: older versions wrote ADRs to a file at @@ -4168,6 +4293,570 @@ static char *handle_ingest_traces(cbm_mcp_server_t *srv, const char *args) { return result; } +/* ══════════════════════════════════════════════════════════════════ + * REPO_MAP — token-budgeted, seed-aware repository map (P4) + * + * Reads the persisted per-symbol importance score (pass_importance, P3) + * and renders the top symbols as one 'file: signature' line each, fitted + * to a token budget (chars/4 estimate — no LLM tokenizer exists server- + * side; the budget contract is defined against this estimator). + * + * Two first-class modes (P2 finding: seed-boost mandatory, global is the + * fallback — pai/p2-query-time-validation-finding): + * seeded: resolved seed_anchors and their 1-hop CALLS/USAGE + * neighbourhood are boosted x50; when the whole 1-hop set is + * tiny (<= REPO_MAP_WEAK_NEIGHBOR_THRESHOLD) the walk widens + * with file-of-symbol seeding + 2-hop expansion at x25 + * (P2 shape hints for weakly-connected seeds). + * global: no/unresolvable seeds -> pure importance ranking. + * + * Determinism: effective score DESC, qualified_name ASC (total order). + * Score-absence gate (spec AC7): a project whose symbol nodes carry no + * importance key was indexed by a pre-P3 binary -> explicit 'unscored' + * error, never a silently unranked map. + * ══════════════════════════════════════════════════════════════════ */ + +enum { + REPO_MAP_DEFAULT_BUDGET = 1600, /* spec: default tunable ~1-2k tokens */ + REPO_MAP_MAX_SEEDS = 50, /* anchors beyond this are dropped */ + REPO_MAP_NODES_PER_ANCHOR = 16, /* multi-resolving anchor cap */ + REPO_MAP_WEAK_NEIGHBOR_THRESHOLD = 3, + REPO_MAP_MAX_BOOSTED = 512, /* total boosted-node ceiling */ + REPO_MAP_MIN_CANDIDATES = 200, + REPO_MAP_MAX_CANDIDATES = 10000, + REPO_MAP_TOKEN_CHARS = 4, /* chars-per-token estimate divisor */ +}; +static const double REPO_MAP_SEED_BOOST = 50.0; /* P2's validated multiplier */ +static const double REPO_MAP_WIDEN_BOOST = 25.0; /* weak-seed widened set */ + +typedef struct { + int64_t id; + double boost; +} rm_boost_t; + +typedef struct { + rm_boost_t items[REPO_MAP_MAX_BOOSTED]; + int count; +} rm_boost_list_t; + +/* Add (or raise) a node's boost. Never downgrades an existing boost. */ +static void rm_boost_add(rm_boost_list_t *bl, int64_t id, double boost) { + for (int i = 0; i < bl->count; i++) { + if (bl->items[i].id == id) { + if (boost > bl->items[i].boost) { + bl->items[i].boost = boost; + } + return; + } + } + if (bl->count < REPO_MAP_MAX_BOOSTED) { + bl->items[bl->count].id = id; + bl->items[bl->count].boost = boost; + bl->count++; + } +} + +static double rm_boost_get(const rm_boost_list_t *bl, int64_t id) { + for (int i = 0; i < bl->count; i++) { + if (bl->items[i].id == id) { + return bl->items[i].boost; + } + } + return 1.0; +} + +/* Append id to a fixed array if absent. Returns true if present after call. */ +static bool rm_id_add(int64_t *arr, int *count, int cap, int64_t id) { + for (int i = 0; i < *count; i++) { + if (arr[i] == id) { + return true; + } + } + if (*count < cap) { + arr[(*count)++] = id; + return true; + } + return false; +} + +static bool rm_id_contains(const int64_t *arr, int count, int64_t id) { + for (int i = 0; i < count; i++) { + if (arr[i] == id) { + return true; + } + } + return false; +} + +/* Collect the 1-hop CALLS|USAGE neighbour ids of `id` (both directions) + * into out (deduped, capped). */ +static void rm_collect_neighbors(cbm_store_t *store, int64_t id, int64_t *out, int *count, + int cap) { + cbm_edge_t *edges = NULL; + int ecount = 0; + if (cbm_store_find_edges_by_source(store, id, &edges, &ecount) == CBM_STORE_OK) { + for (int i = 0; i < ecount; i++) { + if (edges[i].type && + (strcmp(edges[i].type, "CALLS") == 0 || strcmp(edges[i].type, "USAGE") == 0)) { + rm_id_add(out, count, cap, edges[i].target_id); + } + } + cbm_store_free_edges(edges, ecount); + } + edges = NULL; + ecount = 0; + if (cbm_store_find_edges_by_target(store, id, &edges, &ecount) == CBM_STORE_OK) { + for (int i = 0; i < ecount; i++) { + if (edges[i].type && + (strcmp(edges[i].type, "CALLS") == 0 || strcmp(edges[i].type, "USAGE") == 0)) { + rm_id_add(out, count, cap, edges[i].source_id); + } + } + cbm_store_free_edges(edges, ecount); + } +} + +/* Collapse whitespace runs (incl. newlines) to single spaces, in place. + * Multi-line signatures otherwise break the one-line-per-symbol map shape + * (observed on the real connectors corpus: black-formatted Python defs). */ +static void rm_flatten_ws(char *s) { + char *r = s; + char *w = s; + bool in_ws = false; + while (*r) { + char c = *r; + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') { + in_ws = true; + } else { + if (in_ws && w != s) { + *w++ = ' '; + } + in_ws = false; + *w++ = c; + } + r++; + } + *w = '\0'; +} + +/* Parse importance (default 0.0) and signature (heap copy, whitespace- + * flattened, or NULL) from a node's properties_json. */ +static void rm_parse_props(const char *props, double *out_importance, char **out_sig) { + *out_importance = 0.0; + *out_sig = NULL; + if (!props || !props[0]) { + return; + } + yyjson_doc *doc = yyjson_read(props, strlen(props), 0); + if (!doc) { + return; + } + yyjson_val *root = yyjson_doc_get_root(doc); + if (yyjson_is_obj(root)) { + yyjson_val *imp = yyjson_obj_get(root, "importance"); + if (yyjson_is_num(imp)) { + *out_importance = yyjson_get_num(imp); + } + yyjson_val *sig = yyjson_obj_get(root, "signature"); + if (yyjson_is_str(sig)) { + *out_sig = heap_strdup(yyjson_get_str(sig)); + if (*out_sig) { + rm_flatten_ws(*out_sig); + } + } + } + yyjson_doc_free(doc); +} + +/* A ranked, rendered map candidate. */ +typedef struct { + cbm_node_t node; /* owned */ + double eff; /* importance x boost */ + char *line; /* owned: "file: signature\n" */ + size_t line_len; +} rm_cand_t; + +static int rm_cand_cmp(const void *pa, const void *pb) { + const rm_cand_t *a = (const rm_cand_t *)pa; + const rm_cand_t *b = (const rm_cand_t *)pb; + if (a->eff > b->eff) { + return -1; + } + if (a->eff < b->eff) { + return 1; + } + /* qualified_name ASC tie-break: total order (QNs unique per project). */ + const char *qa = a->node.qualified_name ? a->node.qualified_name : ""; + const char *qb = b->node.qualified_name ? b->node.qualified_name : ""; + return strcmp(qa, qb); +} + +/* Resolved seed record (id + file path for the file-of-symbol widen). */ +typedef struct { + int64_t id; + char *file_path; /* owned */ +} rm_seed_t; + +/* Resolve one anchor: exact name -> exact qualified_name -> exact file path. + * Appends up to REPO_MAP_NODES_PER_ANCHOR seed records + x50 boosts. + * Returns true if the anchor resolved to at least one node. */ +static bool rm_resolve_anchor(cbm_store_t *store, const char *project, const char *anchor, + rm_boost_list_t *boosts, rm_seed_t *seeds, int *seed_count, + int seed_cap) { + cbm_node_t *nodes = NULL; + int count = 0; + cbm_store_find_nodes_by_name(store, project, anchor, &nodes, &count); + if (count == 0) { + cbm_store_free_nodes(nodes, count); + nodes = NULL; + cbm_node_t one = {0}; + if (cbm_store_find_node_by_qn(store, project, anchor, &one) == CBM_STORE_OK) { + nodes = malloc(sizeof(cbm_node_t)); + nodes[0] = one; /* ownership moves into the array */ + count = 1; + } + } + if (count == 0) { + cbm_store_free_nodes(nodes, count); + nodes = NULL; + cbm_store_find_nodes_by_file(store, project, anchor, &nodes, &count); + } + if (count == 0) { + cbm_store_free_nodes(nodes, count); + return false; + } + int take = count < REPO_MAP_NODES_PER_ANCHOR ? count : REPO_MAP_NODES_PER_ANCHOR; + for (int i = 0; i < take; i++) { + rm_boost_add(boosts, nodes[i].id, REPO_MAP_SEED_BOOST); + if (*seed_count < seed_cap) { + seeds[*seed_count].id = nodes[i].id; + seeds[*seed_count].file_path = + nodes[i].file_path ? heap_strdup(nodes[i].file_path) : NULL; + (*seed_count)++; + } + } + cbm_store_free_nodes(nodes, count); + return true; +} + +static char *handle_repo_map(cbm_mcp_server_t *srv, const char *args) { + char *project = get_project_arg(args); + if (!project) { + return cbm_mcp_text_result("project is required", true); + } + cbm_store_t *store = resolve_store(srv, project); + REQUIRE_STORE(store, project); + + char *not_indexed = verify_project_indexed(store, project); + if (not_indexed) { + free(project); + return not_indexed; + } + + int budget = cbm_mcp_get_int_arg(args, "token_budget", REPO_MAP_DEFAULT_BUDGET); + if (budget <= 0) { + free(project); + return cbm_mcp_text_result("token_budget must be a positive integer", true); + } + + /* Score-absence gate (spec AC7): symbols exist but none carries an + * importance score -> the index predates pass_importance. Refuse with + * an explicit signal rather than emit a silently unranked map. */ + int scored = 0; + int total_symbols = 0; + if (cbm_store_importance_coverage(store, project, &scored, &total_symbols) != CBM_STORE_OK) { + free(project); + return cbm_mcp_text_result("importance coverage query failed", true); + } + if (total_symbols > 0 && scored == 0) { + char msg[CBM_SZ_1K]; + snprintf(msg, sizeof(msg), + "project '%s' is unscored: its index has no persisted importance scores " + "(indexed by a pre-importance binary). Re-run index_repository(" + "repo_path=..., mode='full') with a current binary, then retry repo_map.", + project); + free(project); + return cbm_mcp_text_result(msg, true); + } + + /* Parse seed_anchors: array of strings. Documented leniency (AC7 + * graceful-input): a bare string is coerced to a one-element list, + * non-string elements are skipped, the list is capped at + * REPO_MAP_MAX_SEEDS. Anything else counts as no seeds. */ + char *anchors[REPO_MAP_MAX_SEEDS]; + int anchor_count = 0; + int seeds_requested = 0; + { + yyjson_doc *adoc = yyjson_read(args, strlen(args), 0); + if (adoc) { + yyjson_val *sa = yyjson_obj_get(yyjson_doc_get_root(adoc), "seed_anchors"); + if (yyjson_is_str(sa)) { + seeds_requested = 1; + anchors[anchor_count++] = heap_strdup(yyjson_get_str(sa)); + } else if (yyjson_is_arr(sa)) { + seeds_requested = (int)yyjson_arr_size(sa); + size_t idx; + size_t max; + yyjson_val *elem; + yyjson_arr_foreach(sa, idx, max, elem) { + if (yyjson_is_str(elem) && anchor_count < REPO_MAP_MAX_SEEDS) { + anchors[anchor_count++] = heap_strdup(yyjson_get_str(elem)); + } + } + } + yyjson_doc_free(adoc); + } + } + + /* Resolve seeds -> x50 boosts + seed records. Anchor names that fail to + * resolve are retained (ownership transferred out of anchors[]) so they + * can be surfaced as `seeds_unresolved` in the response — otherwise a + * seed miss (wrong path, file not on base) is silent to the caller. + * They are freed after the response JSON is serialized (below). */ + rm_boost_list_t *boosts = calloc(CBM_ALLOC_ONE, sizeof(*boosts)); + rm_seed_t seeds[REPO_MAP_MAX_BOOSTED]; + int seed_count = 0; + int seeds_resolved = 0; + char *unresolved[REPO_MAP_MAX_SEEDS]; + int unresolved_count = 0; + for (int i = 0; i < anchor_count; i++) { + if (rm_resolve_anchor(store, project, anchors[i], boosts, seeds, &seed_count, + REPO_MAP_MAX_BOOSTED)) { + seeds_resolved++; + free(anchors[i]); + } else { + unresolved[unresolved_count++] = anchors[i]; /* retained; freed after JSON build */ + } + } + + /* 1-hop CALLS|USAGE neighbourhood of all seeds -> x50. */ + int64_t onehop[REPO_MAP_MAX_BOOSTED]; + int onehop_count = 0; + for (int i = 0; i < seed_count; i++) { + rm_collect_neighbors(store, seeds[i].id, onehop, &onehop_count, REPO_MAP_MAX_BOOSTED); + } + /* Boost the 1-hop set. A Module/File neighbour (file-level USAGE edge — + * "this file uses the seed") is not renderable itself, but its file's + * symbols ARE the co-change neighbourhood P2's ground truth names + * (sender.py/gmail_client.py members reach extract_address exactly this + * way) — expand it to those symbols at the widen tier. Only SYMBOL + * neighbours count toward the weak-seed threshold. */ + int fresh_onehop = 0; + for (int i = 0; i < onehop_count; i++) { + bool is_seed = false; + for (int j = 0; j < seed_count; j++) { + if (seeds[j].id == onehop[i]) { + is_seed = true; + break; + } + } + rm_boost_add(boosts, onehop[i], REPO_MAP_SEED_BOOST); + + cbm_node_t nb = {0}; + if (cbm_store_find_node_by_id(store, onehop[i], &nb) != CBM_STORE_OK) { + continue; + } + bool is_module = nb.label && (strcmp(nb.label, "Module") == 0 || + strcmp(nb.label, "File") == 0); + if (is_module && nb.file_path && nb.project && strcmp(nb.project, project) == 0) { + cbm_node_t *fnodes = NULL; + int fcount = 0; + cbm_store_find_nodes_by_file(store, project, nb.file_path, &fnodes, &fcount); + for (int j = 0; j < fcount; j++) { + rm_boost_add(boosts, fnodes[j].id, REPO_MAP_WIDEN_BOOST); + } + cbm_store_free_nodes(fnodes, fcount); + } else if (!is_module && !is_seed) { + fresh_onehop++; + } + cbm_node_free_fields(&nb); + } + + /* Weak-seed widen (P2 shape hints): a tiny 1-hop neighbourhood means the + * seed alone under-boosts (P2 task A). Widen with (a) file-of-symbol + * seeding and (b) 2-hop expansion, both at x25. */ + if (seed_count > 0 && fresh_onehop <= REPO_MAP_WEAK_NEIGHBOR_THRESHOLD) { + for (int i = 0; i < seed_count; i++) { + if (!seeds[i].file_path) { + continue; + } + cbm_node_t *fnodes = NULL; + int fcount = 0; + cbm_store_find_nodes_by_file(store, project, seeds[i].file_path, &fnodes, &fcount); + for (int j = 0; j < fcount; j++) { + rm_boost_add(boosts, fnodes[j].id, REPO_MAP_WIDEN_BOOST); + } + cbm_store_free_nodes(fnodes, fcount); + } + int64_t twohop[REPO_MAP_MAX_BOOSTED]; + int twohop_count = 0; + for (int i = 0; i < onehop_count; i++) { + rm_collect_neighbors(store, onehop[i], twohop, &twohop_count, REPO_MAP_MAX_BOOSTED); + } + for (int i = 0; i < twohop_count; i++) { + rm_boost_add(boosts, twohop[i], REPO_MAP_WIDEN_BOOST); + } + } + for (int i = 0; i < seed_count; i++) { + free(seeds[i].file_path); + } + + /* Candidate pool: top-N global by persisted importance (N adaptive from + * the budget: ~10 tokens/line means budget/2 is a generous line ceiling) + * + every boosted node not already in the pool. */ + int limit = budget / 2; + if (limit < REPO_MAP_MIN_CANDIDATES) { + limit = REPO_MAP_MIN_CANDIDATES; + } + if (limit > REPO_MAP_MAX_CANDIDATES) { + limit = REPO_MAP_MAX_CANDIDATES; + } + cbm_node_t *top = NULL; + int top_count = 0; + if (cbm_store_top_symbols_by_importance(store, project, limit, &top, &top_count) != + CBM_STORE_OK) { + free(boosts); + free(project); + for (int i = 0; i < unresolved_count; i++) { + free(unresolved[i]); /* retained unresolved anchor names — free on this error path too */ + } + return cbm_mcp_text_result("importance ranking query failed", true); + } + + int cand_cap = top_count + boosts->count; + if (cand_cap == 0) { + cand_cap = 1; + } + rm_cand_t *cands = calloc((size_t)cand_cap, sizeof(rm_cand_t)); + int cand_count = 0; + int64_t seen[REPO_MAP_MAX_CANDIDATES + REPO_MAP_MAX_BOOSTED]; + int seen_count = 0; + for (int i = 0; i < top_count; i++) { + cands[cand_count].node = top[i]; /* ownership moves */ + rm_id_add(seen, &seen_count, (int)(sizeof(seen) / sizeof(seen[0])), top[i].id); + cand_count++; + } + free(top); /* rows moved into cands; free only the array shell */ + + /* Boosted nodes outside the top-N pool (project-checked: edges could in + * principle cross rows in a shared test store — never leak them). */ + for (int i = 0; i < boosts->count; i++) { + if (rm_id_contains(seen, seen_count, boosts->items[i].id)) { + continue; + } + cbm_node_t extra = {0}; + if (cbm_store_find_node_by_id(store, boosts->items[i].id, &extra) != CBM_STORE_OK) { + continue; + } + bool symbol_label = extra.label && (strcmp(extra.label, "Function") == 0 || + strcmp(extra.label, "Method") == 0 || + strcmp(extra.label, "Class") == 0); + if (!symbol_label || !extra.project || strcmp(extra.project, project) != 0) { + cbm_node_free_fields(&extra); + continue; + } + cands[cand_count].node = extra; + rm_id_add(seen, &seen_count, (int)(sizeof(seen) / sizeof(seen[0])), extra.id); + cand_count++; + } + + /* Score + render each candidate. */ + for (int i = 0; i < cand_count; i++) { + double importance = 0.0; + char *sig = NULL; + rm_parse_props(cands[i].node.properties_json, &importance, &sig); + cands[i].eff = importance * rm_boost_get(boosts, cands[i].node.id); + const char *fp = cands[i].node.file_path ? cands[i].node.file_path : ""; + const char *nm = cands[i].node.name ? cands[i].node.name : ""; + const char *shown = sig ? sig : nm; + /* Several grammars persist only the parameter list as the signature + * ("(self, x)") — prefix the symbol name so every line reads + * 'file: symbol(sig)' (spec shape; observed on the real connectors + * corpus where bare "(self)" lines carried no information). */ + const char *prefix = (sig && sig[0] == '(') ? nm : ""; + size_t need = strlen(fp) + strlen(prefix) + strlen(shown) + MCP_COL_4; + cands[i].line = malloc(need); /* ": " + "\n" + NUL */ + if (cands[i].line) { + int w = snprintf(cands[i].line, need, "%s: %s%s\n", fp, prefix, shown); + cands[i].line_len = w > 0 ? (size_t)w : 0; + } + free(sig); + } + free(boosts); + + qsort(cands, (size_t)cand_count, sizeof(rm_cand_t), rm_cand_cmp); + + /* Budget fit: greedy prefix accumulation over the ranked lines (monotone + * — the maximal prefix under the ceiling; estimator = ceil(chars/4)). */ + size_t cum_chars = 0; + int included = 0; + for (int i = 0; i < cand_count; i++) { + size_t next = cum_chars + cands[i].line_len; + if ((next + REPO_MAP_TOKEN_CHARS - 1) / REPO_MAP_TOKEN_CHARS > (size_t)budget) { + break; + } + cum_chars = next; + included = i + 1; + } + + char *map = malloc(cum_chars + 1); + size_t pos = 0; + for (int i = 0; i < included; i++) { + if (cands[i].line && map) { + memcpy(map + pos, cands[i].line, cands[i].line_len); + pos += cands[i].line_len; + } + } + if (map) { + map[pos] = '\0'; + } + + int estimated_tokens = (int)((cum_chars + REPO_MAP_TOKEN_CHARS - 1) / REPO_MAP_TOKEN_CHARS); + + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + yyjson_mut_obj_add_str(doc, root, "project", project); + yyjson_mut_obj_add_str(doc, root, "mode", seeds_resolved > 0 ? "seeded" : "global"); + yyjson_mut_obj_add_int(doc, root, "budget", budget); + yyjson_mut_obj_add_int(doc, root, "estimated_tokens", estimated_tokens); + yyjson_mut_obj_add_int(doc, root, "symbol_count", included); + yyjson_mut_obj_add_int(doc, root, "total_symbols", total_symbols); + yyjson_mut_obj_add_int(doc, root, "seed_anchors_requested", seeds_requested); + yyjson_mut_obj_add_int(doc, root, "seed_anchors_resolved", seeds_resolved); + /* seeds_unresolved: the anchor names rm_resolve_anchor rejected. Omitted + * entirely when every requested seed resolved (or none were requested) — + * nothing to report. Names copied into the doc via the _strcpy (copying) + * variant so the retained anchors[] strings can be freed immediately + * below regardless of the doc's lifetime. */ + if (unresolved_count > 0) { + yyjson_mut_val *ua = yyjson_mut_arr(doc); + for (int i = 0; i < unresolved_count; i++) { + yyjson_mut_arr_add_strcpy(doc, ua, unresolved[i]); + } + yyjson_mut_obj_add_val(doc, root, "seeds_unresolved", ua); + } + yyjson_mut_obj_add_str(doc, root, "map", map ? map : ""); + + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + + for (int i = 0; i < unresolved_count; i++) { + free(unresolved[i]); + } + for (int i = 0; i < cand_count; i++) { + cbm_node_free_fields(&cands[i].node); + free(cands[i].line); + } + free(cands); + free(map); + free(project); + + char *result = cbm_mcp_text_result(json, false); + free(json); + return result; +} + /* ── Tool dispatch ────────────────────────────────────────────── */ char *cbm_mcp_handle_tool(cbm_mcp_server_t *srv, const char *tool_name, const char *args_json) { @@ -4199,6 +4888,9 @@ char *cbm_mcp_handle_tool(cbm_mcp_server_t *srv, const char *tool_name, const ch if (strcmp(tool_name, "get_architecture") == 0) { return handle_get_architecture(srv, args_json); } + if (strcmp(tool_name, "repo_map") == 0) { + return handle_repo_map(srv, args_json); + } /* Pipeline-dependent tools */ if (strcmp(tool_name, "index_repository") == 0) { diff --git a/src/pipeline/pass_calls.c b/src/pipeline/pass_calls.c index 15d691d3a..772643849 100644 --- a/src/pipeline/pass_calls.c +++ b/src/pipeline/pass_calls.c @@ -19,6 +19,7 @@ enum { PC_RING = 4, PC_RING_MASK = 3, PC_SIG_SCAN = 15, PC_REGEX_GRP = 2 }; #include "graph_buffer/graph_buffer.h" #include "foundation/log.h" #include "foundation/compat.h" +#include "foundation/platform.h" #include "foundation/str_util.h" #include "cbm.h" #include "service_patterns.h" @@ -41,7 +42,7 @@ static char *read_file(const char *path, int *out_len) { long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { + if (size <= 0 || size > cbm_max_file_bytes()) { (void)fclose(f); return NULL; } diff --git a/src/pipeline/pass_definitions.c b/src/pipeline/pass_definitions.c index 676f1b169..4e92696f1 100644 --- a/src/pipeline/pass_definitions.c +++ b/src/pipeline/pass_definitions.c @@ -22,6 +22,7 @@ enum { PD_JSON_FIELD_OVERHEAD = 6 }; #include "graph_buffer/graph_buffer.h" #include "foundation/log.h" #include "foundation/compat.h" +#include "foundation/platform.h" #include "cbm.h" #include "simhash/minhash.h" #include "semantic/ast_profile.h" @@ -42,8 +43,7 @@ static char *read_file(const char *path, int *out_len) { long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || - size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { /* CBM_PERCENT MB sanity limit */ + if (size <= 0 || size > cbm_max_file_bytes()) { /* CBM_MAX_FILE_MB sanity limit */ (void)fclose(f); return NULL; } diff --git a/src/pipeline/pass_importance.c b/src/pipeline/pass_importance.c new file mode 100644 index 000000000..96bf23140 --- /dev/null +++ b/src/pipeline/pass_importance.c @@ -0,0 +1,215 @@ +/* + * pass_importance.c — Index-time persisted per-symbol importance score + * (spec Part 1 / P3, AC1). Predump pass: computes a per-symbol importance for + * every Function/Method/Class node and appends it as a numeric "importance" + * key on the node's properties_json, so it persists through the store and is + * read back by enrich_node_properties (mcp) for free. + * + * Model (Aider repo-map, validated by the GREEN spike + * pai/aider-repomap-codebase-memory-mapping; the multipliers alone denoised + * the real connectors graph): + * + * importance = sqrt(num_refs) * priv * generic * distinct * test_penalty + * num_refs = incoming CALLS + USAGE edges (both). 0 -> sqrt(0) = 0. + * priv 0.1 if the name is private (leading underscore) + * generic 0.1 if the name is DEFINED in >= 5 distinct files + * distinct 10 if the name is snake_case or camelCase AND len >= 8 + * test_penalty 0.1 if the symbol is test scaffolding: + * - the symbol is the TARGET of an incoming TESTS edge + * (create_tests_edges: test-fn -> prod helper living in a + * NON-test file, e.g. a fixture in testutil.go), OR + * - the symbol lives in a test file per the graph's canonical + * cbm_is_test_path() classifier (the SAME one pass_tests + * uses). + * + * REAL-DATA DEVIATION FROM THE SPEC'S "edge-based TESTS/TESTS_FILE" mechanism + * (surfaced to Peter, documented in builder-notes.md): on the real connectors + * graph the TESTS/TESTS_FILE edges alone do NOT achieve AC1's exclusion goal. + * TESTS_FILE is emitted only when a test file maps to a resolvable production + * file (15 edges total on connectors); TESTS edges never point at a symbol that + * lives in a test file (create_tests_edges excludes tgt_is_test); and the node + * is_test property is stamped on <2% of nodes and on NONE of the test-resident + * fixtures/classes. So edge-only left the entire top-40 as test scaffolding + * (session_dir, FakeConnectorClient, ...). cbm_is_test_path() is the graph's + * own precise classifier (test_ prefix, _test. suffix, /tests/ dir, ...) — + * it is NOT the naive strstr("test") the spec warned against: it correctly + * leaves e.g. src/testutil_helpers.go UNpenalised (test-plan row #6 inversion). + * The TESTS-edge leg is retained so prod helpers in non-test files that are + * exercised by tests are still demoted. + * + * Weighted-degree only. PageRank (transitive importance) is a measured + * refinement deliberately NOT built here — the spike showed the weighted + * multipliers alone remove the degree-noise, so PageRank stays deferred + * unless it measurably beats weighted-degree on a fixed judgment set (AC1 + * "measured decision"; see builder-notes.md). + * + * ORDERING (load-bearing): registered LAST in run_predump_passes so it runs + * after pass_tests (TESTS/TESTS_FILE edges) and after CALLS/USAGE extraction; + * a mis-ordered pass would silently see num_refs = 0 and no test penalty. + */ +#include "foundation/constants.h" +#include "pipeline/pipeline.h" +#include "pipeline/pipeline_internal.h" +#include "graph_buffer/graph_buffer.h" +#include "foundation/log.h" +#include "foundation/compat.h" +#include "cbm.h" + +#include +#include +#include +#include +#include +#include + +enum { + CBM_IMPORTANCE_GENERIC_MIN_FILES = 5, /* name defined in >= N files -> generic */ + CBM_IMPORTANCE_DISTINCT_MIN_LEN = 8, /* distinctive-identifier length floor */ +}; +static const double CBM_IMPORTANCE_PRIV_MUL = 0.1; +static const double CBM_IMPORTANCE_GENERIC_MUL = 0.1; +static const double CBM_IMPORTANCE_DISTINCT_MUL = 10.0; +static const double CBM_IMPORTANCE_TEST_MUL = 0.1; + +/* The symbol node labels that carry an importance score. File/Module and other + * non-symbol nodes are deliberately excluded (test-plan row #1). */ +static const char *const CBM_IMPORTANCE_LABELS[] = {"Function", "Method", "Class"}; +enum { CBM_IMPORTANCE_LABEL_COUNT = 3 }; + +static bool name_is_private(const char *name) { + return name != NULL && name[0] == '_'; +} + +/* Distinctive = (snake_case OR camelCase) AND length >= 8. snake_case is an + * embedded '_'; camelCase is a lower->upper "hump". A plain len>=8 lowercase + * word (no '_', no hump) is NOT distinctive. */ +static bool name_is_distinctive(const char *name) { + if (!name) { + return false; + } + size_t len = strlen(name); + if (len < (size_t)CBM_IMPORTANCE_DISTINCT_MIN_LEN) { + return false; + } + if (strchr(name, '_') != NULL) { + return true; /* snake_case */ + } + for (size_t i = 1; i < len; i++) { + if (islower((unsigned char)name[i - 1]) && isupper((unsigned char)name[i])) { + return true; /* camelCase hump */ + } + } + return false; +} + +/* Count the DISTINCT files a name is defined in (generic-name suppression). + * Counts distinct file paths, not distinct nodes — two defs of one name in the + * same file count once (test-plan row #4). */ +static int name_distinct_file_count(const cbm_gbuf_t *gb, const char *name) { + if (!name) { + return 0; + } + const cbm_gbuf_node_t **nodes = NULL; + int count = 0; + if (cbm_gbuf_find_by_name(gb, name, &nodes, &count) != 0 || count == 0) { + return 0; + } + int distinct = 0; + for (int i = 0; i < count; i++) { + const char *fp = nodes[i]->file_path; + if (!fp) { + continue; + } + bool seen = false; + for (int j = 0; j < i; j++) { + const char *pf = nodes[j]->file_path; + if (pf && strcmp(pf, fp) == 0) { + seen = true; + break; + } + } + if (!seen) { + distinct++; + } + } + return distinct; +} + +static int incoming_edge_count(const cbm_gbuf_t *gb, int64_t id, const char *type) { + const cbm_gbuf_edge_t **edges = NULL; + int ne = 0; + if (cbm_gbuf_find_edges_by_target_type(gb, id, type, &edges, &ne) != 0) { + return 0; + } + return ne; +} + +/* Append a numeric "importance" key to a node's properties JSON object. Mirrors + * append_complexity_props: copy without the trailing '}', append the key, close. + * A non-object properties_json (does not end in '}') is left untouched — the + * store-open malformed-JSON guard (store.c) then never sees corruption. */ +void cbm_pipeline_importance_append_prop(cbm_gbuf_node_t *node, double score) { + const char *old = node->properties_json ? node->properties_json : "{}"; + size_t olen = strlen(old); + if (olen < 2 || old[olen - 1] != '}') { + return; /* not a JSON object — leave untouched */ + } + bool empty = (olen == 2); /* "{}" */ + char *neu = malloc(olen + CBM_SZ_64); + if (!neu) { + return; + } + memcpy(neu, old, olen - 1); /* copy without the trailing '}' */ + int w = snprintf(neu + (olen - 1), CBM_SZ_64, "%s\"importance\":%.6f}", empty ? "" : ",", score); + if (w < 0) { + free(neu); + return; + } + free(node->properties_json); + node->properties_json = neu; +} + +void cbm_pipeline_pass_importance(cbm_pipeline_ctx_t *ctx) { + cbm_gbuf_t *gb = ctx->gbuf; + if (!gb) { + return; + } + + int updated = 0; + for (int li = 0; li < CBM_IMPORTANCE_LABEL_COUNT; li++) { + const cbm_gbuf_node_t **nodes = NULL; + int count = 0; + if (cbm_gbuf_find_by_label(gb, CBM_IMPORTANCE_LABELS[li], &nodes, &count) != 0) { + continue; + } + for (int i = 0; i < count; i++) { + cbm_gbuf_node_t *n = (cbm_gbuf_node_t *)nodes[i]; + + int num_refs = incoming_edge_count(gb, n->id, "CALLS") + + incoming_edge_count(gb, n->id, "USAGE"); + double score = sqrt((double)num_refs); + + if (name_is_private(n->name)) { + score *= CBM_IMPORTANCE_PRIV_MUL; + } + if (name_distinct_file_count(gb, n->name) >= CBM_IMPORTANCE_GENERIC_MIN_FILES) { + score *= CBM_IMPORTANCE_GENERIC_MUL; + } + if (name_is_distinctive(n->name)) { + score *= CBM_IMPORTANCE_DISTINCT_MUL; + } + bool is_test = cbm_is_test_path(n->file_path) || + incoming_edge_count(gb, n->id, "TESTS") > 0; + if (is_test) { + score *= CBM_IMPORTANCE_TEST_MUL; + } + + cbm_pipeline_importance_append_prop(n, score); + updated++; + } + } + + char buf[CBM_SZ_32]; + snprintf(buf, sizeof(buf), "%d", updated); + cbm_log_info("pass.importance", "symbols", buf); +} diff --git a/src/pipeline/pass_k8s.c b/src/pipeline/pass_k8s.c index 0a99443f5..44bc8c370 100644 --- a/src/pipeline/pass_k8s.c +++ b/src/pipeline/pass_k8s.c @@ -19,6 +19,7 @@ #include "discover/discover.h" #include "foundation/log.h" #include "foundation/compat.h" +#include "foundation/platform.h" #include "cbm.h" #include @@ -39,7 +40,7 @@ static char *k8s_read_file(const char *path, int *out_len) { long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { + if (size <= 0 || size > cbm_max_file_bytes()) { (void)fclose(f); return NULL; } diff --git a/src/pipeline/pass_lsp_cross.c b/src/pipeline/pass_lsp_cross.c index a279956d6..4c0435226 100644 --- a/src/pipeline/pass_lsp_cross.c +++ b/src/pipeline/pass_lsp_cross.c @@ -26,6 +26,7 @@ #include "foundation/constants.h" #include "foundation/hash_table.h" #include "foundation/log.h" +#include "foundation/platform.h" #include #include @@ -34,7 +35,6 @@ /* ── Constants ─────────────────────────────────────────────────── */ enum { - PXC_MAX_FILE_BYTES_FACTOR = 100, /* same cap pass_calls.c uses for source size */ PXC_ITOA_BUF = 16, }; @@ -62,7 +62,7 @@ static char *pxc_read_file(const char *path, int *out_len) { (void)fseek(f, 0, SEEK_END); long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || size > (long)PXC_MAX_FILE_BYTES_FACTOR * (long)CBM_SZ_1K * (long)CBM_SZ_1K) { + if (size <= 0 || size > cbm_max_file_bytes()) { (void)fclose(f); return NULL; } diff --git a/src/pipeline/pass_parallel.c b/src/pipeline/pass_parallel.c index 180ee85f7..640a5f9ee 100644 --- a/src/pipeline/pass_parallel.c +++ b/src/pipeline/pass_parallel.c @@ -93,7 +93,7 @@ static char *read_file(const char *path, int *out_len) { (void)fseek(f, 0, SEEK_END); long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { + if (size <= 0 || size > cbm_max_file_bytes()) { (void)fclose(f); return NULL; } @@ -569,6 +569,15 @@ static void extract_worker(int worker_id, void *ctx_ptr) { int file_idx = ec->sorted[sort_pos].idx; const cbm_file_info_t *fi = &ec->files[file_idx]; + /* Hard ceiling (enforcing, separate from the advisory backpressure + * above): if RSS is still over the ceiling after backpressure had + * its bounded spins to let peers return memory, the process is in + * runaway territory — abort now, in-memory, before any SQLite dump + * (see mem.h). The advisory budget/backpressure stays a lower, + * separate threshold so a normal large-but-fine repo that only + * soft-overshoots the budget never reaches this. */ + cbm_mem_abort_if_over_ceiling(fi->rel_path, "extract"); + /* Read + extract */ int source_len = 0; char *source = read_file(fi->path, &source_len); @@ -675,7 +684,14 @@ static void extract_worker(int worker_id, void *ctx_ptr) { cbm_mem_collect(); } - /* Final cleanup (parser already destroyed in loop, just slab state) */ + /* Final cleanup. A worker that processed zero files (work-stealing) or + * that carries a leftover cached tl_parser from an earlier on-thread + * cbm_extract_file never ran the per-file destroy above, so tl_parser + * can still be live here. Destroying it unconditionally (idempotent — + * cbm_destroy_thread_parser() no-ops when tl_parser is NULL) guarantees + * no live parser references a page cbm_slab_destroy_thread() is about + * to free; see the same contract documented at line ~2313. */ + cbm_destroy_thread_parser(); cbm_slab_destroy_thread(); cbm_kind_in_set_free_cache(); /* free this worker thread's node-type bitset cache */ } diff --git a/src/pipeline/pass_semantic.c b/src/pipeline/pass_semantic.c index a2a5493b0..c41f73746 100644 --- a/src/pipeline/pass_semantic.c +++ b/src/pipeline/pass_semantic.c @@ -19,6 +19,7 @@ #include "graph_buffer/graph_buffer.h" #include "foundation/log.h" #include "foundation/compat.h" +#include "foundation/platform.h" #include "cbm.h" #include @@ -33,7 +34,7 @@ static char *read_file(const char *path, int *out_len) { (void)fseek(f, 0, SEEK_END); long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { + if (size <= 0 || size > cbm_max_file_bytes()) { (void)fclose(f); return NULL; } diff --git a/src/pipeline/pass_usages.c b/src/pipeline/pass_usages.c index d21048616..1843007c0 100644 --- a/src/pipeline/pass_usages.c +++ b/src/pipeline/pass_usages.c @@ -18,6 +18,7 @@ #include "graph_buffer/graph_buffer.h" #include "foundation/log.h" #include "foundation/compat.h" +#include "foundation/platform.h" #include "cbm.h" #include @@ -33,7 +34,7 @@ static char *read_file(const char *path, int *out_len) { (void)fseek(f, 0, SEEK_END); long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { + if (size <= 0 || size > cbm_max_file_bytes()) { (void)fclose(f); return NULL; } diff --git a/src/pipeline/pipeline.c b/src/pipeline/pipeline.c index c080a2852..82b0940ce 100644 --- a/src/pipeline/pipeline.c +++ b/src/pipeline/pipeline.c @@ -489,6 +489,9 @@ static void predump_cfg(cbm_pipeline_ctx_t *ctx) { static void predump_complexity(cbm_pipeline_ctx_t *ctx) { cbm_pipeline_pass_complexity(ctx); } +static void predump_importance(cbm_pipeline_ctx_t *ctx) { + cbm_pipeline_pass_importance(ctx); +} static void run_predump_passes(cbm_pipeline_t *p, cbm_pipeline_ctx_t *ctx) { static const struct { @@ -499,8 +502,13 @@ static void run_predump_passes(cbm_pipeline_t *p, cbm_pipeline_ctx_t *ctx) { {predump_deco, "decorator_tags", false}, {predump_cfg, "configlink", false}, {predump_route, "route_match", false}, {predump_sim, "similarity", true}, {predump_sem, "semantic_edges", true}, {predump_complexity, "complexity", false}, + /* MUST run after pass_tests (TESTS/TESTS_FILE edges, run_tests_and_history -- + * always precedes run_predump_passes) and after CALLS/USAGE edges (populated + * during sequential extraction, before run_post_extraction). Placed last so + * every edge type the score depends on is guaranteed to exist. */ + {predump_importance, "importance", false}, }; - enum { PREDUMP_PASS_COUNT = 6 }; + enum { PREDUMP_PASS_COUNT = 7 }; struct timespec t; for (int i = 0; i < PREDUMP_PASS_COUNT && !check_cancel(p); i++) { /* "moderate_only" passes (similarity/semantic edges) run in FULL, diff --git a/src/pipeline/pipeline_internal.h b/src/pipeline/pipeline_internal.h index 1eb108421..390b5acfe 100644 --- a/src/pipeline/pipeline_internal.h +++ b/src/pipeline/pipeline_internal.h @@ -507,6 +507,19 @@ int cbm_pipeline_pass_semantic_edges(cbm_pipeline_ctx_t *ctx); * cycles (recursive). Runs on the graph buffer before the dump. */ void cbm_pipeline_pass_complexity(cbm_pipeline_ctx_t *ctx); +/* Pre-dump pass: persisted per-symbol importance score (spec Part 1 / AC1). + * importance = sqrt(num_refs) x priv x generic x distinct x test_penalty -- + * see pass_importance.c for the full formula. MUST run after pass_tests and + * calls/usages have populated their edges (run_tests_and_history precedes + * run_predump_passes in run_post_extraction) -- a pass registered earlier + * would see zero TESTS edges and num_refs=0. */ +void cbm_pipeline_pass_importance(cbm_pipeline_ctx_t *ctx); + +/* Append a numeric "importance" key to a node's properties JSON object. + * Exposed (non-static) for direct unit testing of the JSON-validity guard + * (AC1 test-plan #12) -- otherwise internal to pass_importance.c. */ +void cbm_pipeline_importance_append_prop(cbm_gbuf_node_t *node, double score); + /* ── Env URL scanner (pass_envscan.c) ────────────────────────────── */ typedef struct { diff --git a/src/store/store.c b/src/store/store.c index c237332e2..33b474394 100644 --- a/src/store/store.c +++ b/src/store/store.c @@ -5501,6 +5501,78 @@ int cbm_store_find_architecture_docs(cbm_store_t *s, const char *project, char * return CBM_STORE_OK; } +/* ── Importance (repo_map) ──────────────────────────────────────── */ + +int cbm_store_importance_coverage(cbm_store_t *s, const char *project, int *out_scored, + int *out_total) { + *out_scored = 0; + *out_total = 0; + if (!s || !s->db) { + return CBM_STORE_ERR; + } + const char *sql = + "SELECT COUNT(*), " + "SUM(CASE WHEN json_extract(properties, '$.importance') IS NOT NULL THEN 1 ELSE 0 END) " + "FROM nodes WHERE project=?1 AND label IN ('Function','Method','Class')"; + sqlite3_stmt *stmt = NULL; + if (sqlite3_prepare_v2(s->db, sql, CBM_NOT_FOUND, &stmt, NULL) != SQLITE_OK) { + store_set_error_sqlite(s, "importance_coverage"); + return CBM_STORE_ERR; + } + bind_text(stmt, SKIP_ONE, project); + int rc = CBM_STORE_ERR; + if (sqlite3_step(stmt) == SQLITE_ROW) { + *out_total = sqlite3_column_int(stmt, 0); + *out_scored = sqlite3_column_int(stmt, SKIP_ONE); /* SUM over 0 rows is NULL -> 0 */ + rc = CBM_STORE_OK; + } else { + store_set_error_sqlite(s, "importance_coverage_step"); + } + sqlite3_finalize(stmt); + return rc; +} + +int cbm_store_top_symbols_by_importance(cbm_store_t *s, const char *project, int limit, + cbm_node_t **out, int *count) { + *out = NULL; + *count = 0; + if (!s || !s->db || limit <= 0) { + return CBM_STORE_ERR; + } + /* NULL importance sorts LAST under DESC (SQLite: NULL < everything), so a + * partially-scored project ranks its unscored nodes at the bottom. + * qualified_name ASC tie-break makes the order total (unique per project) + * — repo_map's determinism contract. */ + const char *sql = "SELECT id, project, label, name, qualified_name, file_path, " + "start_line, end_line, properties FROM nodes " + "WHERE project=?1 AND label IN ('Function','Method','Class') " + "ORDER BY json_extract(properties, '$.importance') DESC, " + "qualified_name ASC LIMIT ?2"; + sqlite3_stmt *stmt = NULL; + if (sqlite3_prepare_v2(s->db, sql, CBM_NOT_FOUND, &stmt, NULL) != SQLITE_OK) { + store_set_error_sqlite(s, "top_symbols_by_importance"); + return CBM_STORE_ERR; + } + bind_text(stmt, SKIP_ONE, project); + sqlite3_bind_int(stmt, ST_COL_2, limit); + + int cap = ST_INIT_CAP_16; + int n = 0; + cbm_node_t *arr = malloc(cap * sizeof(cbm_node_t)); + while (sqlite3_step(stmt) == SQLITE_ROW) { + if (n >= cap) { + cap *= ST_GROWTH; + arr = safe_realloc(arr, cap * sizeof(cbm_node_t)); + } + scan_node(stmt, &arr[n]); + n++; + } + sqlite3_finalize(stmt); + *out = arr; + *count = n; + return CBM_STORE_OK; +} + /* ── Memory management ──────────────────────────────────────────── */ void cbm_node_free_fields(cbm_node_t *n) { diff --git a/src/store/store.h b/src/store/store.h index 26b09a5c2..fcd182f22 100644 --- a/src/store/store.h +++ b/src/store/store.h @@ -616,6 +616,24 @@ int cbm_leiden(const int64_t *nodes, int node_count, const cbm_louvain_edge_t *e int cbm_louvain(const int64_t *nodes, int node_count, const cbm_louvain_edge_t *edges, int edge_count, cbm_louvain_result_t **out, int *out_count); +/* ── Importance (repo_map) ──────────────────────────────────────── */ + +/* Count symbol nodes (Function/Method/Class) and how many of them carry a + * persisted numeric "importance" key in properties (pass_importance, P3). + * Used by repo_map's score-absence gate: total>0 && scored==0 means the + * project was indexed by a pre-importance binary and must be re-indexed. + * Returns CBM_STORE_OK or CBM_STORE_ERR. */ +int cbm_store_importance_coverage(cbm_store_t *s, const char *project, int *out_scored, + int *out_total); + +/* Top symbol nodes (Function/Method/Class) ordered by persisted importance + * DESC with deterministic qualified_name ASC tie-break. Nodes without an + * importance key sort last (SQLite NULLs-last under DESC). Returns an + * allocated array (caller frees with cbm_store_free_nodes). + * Returns CBM_STORE_OK or CBM_STORE_ERR. */ +int cbm_store_top_symbols_by_importance(cbm_store_t *s, const char *project, int limit, + cbm_node_t **out, int *count); + /* ── Memory management helpers ──────────────────────────────────── */ /* Free heap-allocated strings in a stack-allocated node (does NOT free the node itself). */ diff --git a/tests/check_no_duplicate_file_cap.sh b/tests/check_no_duplicate_file_cap.sh new file mode 100755 index 000000000..cc6a8288b --- /dev/null +++ b/tests/check_no_duplicate_file_cap.sh @@ -0,0 +1,156 @@ +#!/usr/bin/env bash +# check_no_duplicate_file_cap.sh — structural single-source-of-truth guard +# for the per-file parse-size cap. +# +# The 100MB per-file cap used to be expressed 7 times: 6x as the misused +# "CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K" literal (pass_calls.c, pass_definitions.c, +# pass_semantic.c, pass_usages.c, pass_k8s.c, pass_parallel.c) plus a twin enum +# PXC_MAX_FILE_BYTES_FACTOR in pass_lsp_cross.c. All 7 collapsed to the single +# named cbm_max_file_bytes() resolver (system_info.c) backed by +# CBM_DEFAULT_MAX_FILE_MB (constants.h). This script asserts the collapse +# STUCK: zero surviving duplicate-literal sites, and the new resolver present +# at all 7 former call sites. +# +# Reads COMMITTED source via `git show HEAD:`, not the working tree — +# a mutation run rewrites the working tree during its sweep, which would +# make a working-tree grep abort as a tool-error mid-run. Reading HEAD keeps +# this check meaningful (and green) regardless of mutation-testing state. +# +# Also pins CBM_PERCENT's legitimate percentage/depth consumers +# (mem.c, vmem.c, cypher.c) as UNTOUCHED — the collapse must not repurpose +# the shared "100 percent" constant (AC row 5, characterization pin). +# +# Usage: bash tests/check_no_duplicate_file_cap.sh [] +# defaults to HEAD. +# Exit 0 on success, non-zero on failure. + +set -uo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REF="${1:-HEAD}" +FAILURES=0 + +cd "$ROOT" || exit 2 + +show() { + git show "${REF}:$1" 2>/dev/null +} + +# ── AC row 4: zero surviving duplicate-cap-literal sites ──────────────── +FORMER_SITES=( + "src/pipeline/pass_calls.c" + "src/pipeline/pass_definitions.c" + "src/pipeline/pass_semantic.c" + "src/pipeline/pass_usages.c" + "src/pipeline/pass_k8s.c" + "src/pipeline/pass_parallel.c" + "src/pipeline/pass_lsp_cross.c" +) + +echo "[cap-dedup] Checking zero surviving 'CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K' cap expressions..." >&2 +hits="$(git grep -n 'CBM_PERCENT \* CBM_SZ_1K \* CBM_SZ_1K' "${REF}" -- src/ 2>/dev/null || true)" +if [[ -n "$hits" ]]; then + echo "[cap-dedup] FAIL: surviving duplicated cap literal(s):" >&2 + echo "$hits" >&2 + FAILURES=$((FAILURES + 1)) +else + echo "[cap-dedup] PASS: zero surviving CBM_PERCENT*CBM_SZ_1K*CBM_SZ_1K cap expressions" >&2 +fi + +echo "[cap-dedup] Checking zero surviving PXC_MAX_FILE_BYTES_FACTOR..." >&2 +hits="$(git grep -n 'PXC_MAX_FILE_BYTES_FACTOR' "${REF}" -- src/ 2>/dev/null || true)" +if [[ -n "$hits" ]]; then + echo "[cap-dedup] FAIL: surviving PXC_MAX_FILE_BYTES_FACTOR reference(s):" >&2 + echo "$hits" >&2 + FAILURES=$((FAILURES + 1)) +else + echo "[cap-dedup] PASS: zero surviving PXC_MAX_FILE_BYTES_FACTOR references" >&2 +fi + +echo "[cap-dedup] Checking cbm_max_file_bytes() present at all 7 former call sites..." >&2 +for f in "${FORMER_SITES[@]}"; do + content="$(show "$f")" + if [[ -z "$content" ]]; then + echo "[cap-dedup] FAIL: could not read $f at $REF" >&2 + FAILURES=$((FAILURES + 1)) + continue + fi + if ! grep -q 'cbm_max_file_bytes()' <<<"$content"; then + echo "[cap-dedup] FAIL: $f does not call cbm_max_file_bytes()" >&2 + FAILURES=$((FAILURES + 1)) + else + echo "[cap-dedup] PASS: $f calls cbm_max_file_bytes()" >&2 + fi +done + +# ── AC row 5: CBM_PERCENT's legitimate consumers untouched (characterization pin) ── +# Pinned by MINIMUM occurrence count per file, not exact line number — adding +# unrelated code earlier in a file (e.g. new ceiling constants ahead of +# check_pressure() in mem.c) legitimately shifts later line numbers without +# touching the CBM_PERCENT usage itself; an exact-line pin would false-positive +# on that harmless drift. The count floor still catches the real regression +# this row guards against: the collapse repurposing/deleting a legitimate use. +declare -A PIN_COUNTS=( + ["src/foundation/mem.c"]=2 + ["src/foundation/vmem.c"]=2 + ["src/cypher/cypher.c"]=1 +) + +echo "[cap-dedup] Pinning CBM_PERCENT's legitimate percentage/depth consumers (must be untouched)..." >&2 +for f in "${!PIN_COUNTS[@]}"; do + content="$(show "$f")" + if [[ -z "$content" ]]; then + echo "[cap-dedup] FAIL: could not read $f at $REF" >&2 + FAILURES=$((FAILURES + 1)) + continue + fi + expected="${PIN_COUNTS[$f]}" + actual="$(grep -c 'CBM_PERCENT' <<<"$content")" + if [[ "$actual" -lt "$expected" ]]; then + echo "[cap-dedup] FAIL: $f has $actual CBM_PERCENT reference(s), expected >= $expected" >&2 + FAILURES=$((FAILURES + 1)) + else + echo "[cap-dedup] PASS: $f has $actual CBM_PERCENT reference(s) (>= $expected expected)" >&2 + fi +done + +# CBM_PERCENT's definition itself must still be 100 (untouched/unrenamed). +constants_content="$(show "src/foundation/constants.h")" +if ! grep -qE 'CBM_PERCENT = 100' <<<"$constants_content"; then + echo "[cap-dedup] FAIL: CBM_PERCENT definition changed or missing in constants.h" >&2 + FAILURES=$((FAILURES + 1)) +else + echo "[cap-dedup] PASS: CBM_PERCENT = 100 definition untouched" >&2 +fi + +# ── AC row 11: advisory backpressure not silently deleted ─────────────── +# The enforcing ceiling must AUGMENT, not replace, the existing advisory +# backpressure spin (pass_parallel.c) — a repo that overshoots the WARN +# budget but stays under the ABORT ceiling must still back-pressure and +# complete (soft overshoot), i.e. warn != abort. Pin the backpressure loop +# structure (cbm_mem_over_budget() bounded spin) is still present, and +# that the new hard-ceiling call sits alongside it (not inside/replacing +# the spin body — mem_ceiling_abort.sh's row7 healthy-path run is the +# behavioural proof; this is the structural companion). +pp_content="$(show "src/pipeline/pass_parallel.c")" +if [[ -z "$pp_content" ]]; then + echo "[cap-dedup] FAIL: could not read pass_parallel.c at $REF" >&2 + FAILURES=$((FAILURES + 1)) +elif ! grep -q 'cbm_mem_over_budget()' <<<"$pp_content"; then + echo "[cap-dedup] FAIL: advisory backpressure (cbm_mem_over_budget spin) no longer present in pass_parallel.c" >&2 + FAILURES=$((FAILURES + 1)) +elif ! grep -q 'cbm_mem_abort_if_over_ceiling(' <<<"$pp_content"; then + echo "[cap-dedup] FAIL: enforcing ceiling call (cbm_mem_abort_if_over_ceiling) not wired into pass_parallel.c" >&2 + FAILURES=$((FAILURES + 1)) +else + echo "[cap-dedup] PASS: advisory backpressure AND enforcing ceiling both present in pass_parallel.c (augmented, not replaced)" >&2 +fi + +# ── Final result ────────────────────────────────────────────────────── +if [[ "$FAILURES" -gt 0 ]]; then + echo "[cap-dedup] FAILED: $FAILURES check(s) failed." >&2 + exit 1 +fi + +echo "[cap-dedup] All checks passed (row 4 dedup + row 5 CBM_PERCENT pin) at ${REF}." >&2 +exit 0 diff --git a/tests/gen_mem_ceiling_repro.sh b/tests/gen_mem_ceiling_repro.sh new file mode 100755 index 000000000..f0d028685 --- /dev/null +++ b/tests/gen_mem_ceiling_repro.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# gen_mem_ceiling_repro.sh — synthetic large-file repro generator for the +# memory-ceiling tests (mem_ceiling_abort.sh). +# +# Emits N_SMALL tiny C files (near-zero footprint, exercise the discover/sort +# machinery without contributing meaningfully to RSS) and N_LARGE large C +# files (each an array-initializer source of ~SIZE_MB, mirroring the diagnosis +# repro shape: "600 tiny + N large array-init C files (~7.6 MB each)"). The +# array-init shape parses fast (tree-sitter C grammar, one big initializer +# list) while still landing SIZE_MB of source bytes + a comparable parse +# working set in RSS per worker — the mechanism the 2026-07-01 incident +# diagnosed as "concurrent large-file parsing summing unbounded". +# +# Usage: +# gen_mem_ceiling_repro.sh +# +# Same dir drives both the file-cap tests (dial size_mb near CBM_MAX_FILE_MB) +# and the ceiling test (dial n_large * size_mb past the ceiling). +set -euo pipefail + +OUTDIR="${1:?usage: gen_mem_ceiling_repro.sh }" +N_SMALL="${2:?missing n_small}" +N_LARGE="${3:?missing n_large}" +SIZE_MB="${4:?missing size_mb}" + +mkdir -p "$OUTDIR" + +# ── Tiny files (near-zero RSS contribution) ────────────────────────── +for i in $(seq 1 "$N_SMALL"); do + cat > "$OUTDIR/small_${i}.c" < "$OUTDIR/large_${i}.c" +done + +echo "generated: $N_SMALL small + $N_LARGE large (~${SIZE_MB}MB each) under $OUTDIR" >&2 diff --git a/tests/mem_ceiling_abort.sh b/tests/mem_ceiling_abort.sh new file mode 100755 index 000000000..a8c016241 --- /dev/null +++ b/tests/mem_ceiling_abort.sh @@ -0,0 +1,204 @@ +#!/usr/bin/env bash +# mem_ceiling_abort.sh — real-process regression guard for the enforcing RSS +# memory ceiling (cbm_mem_abort_if_over_ceiling, mem.c). +# +# Covers the acceptance-criteria rows that CANNOT be exercised by an +# in-process C unit test (the abort path itself calls abort(), which would +# kill the test-runner process): +# +# Row 6 — RSS ceiling ABORTS (non-zero exit) with a diagnostic dump +# naming file + phase + RSS when exceeded. Exercises the real +# index entry point (cli index_repository -> cbm_parallel_extract), +# not a direct call to the ceiling helper. +# Row 7 — A normal full index of the real repo does NOT abort (healthy +# path preserved; anti-false-positive guard for R3). +# Row 9 — CBM_MEM_CEILING_MB override adjusts the abort threshold: forced +# below a large synthetic index's actual RSS, that index now +# aborts (proves the knob bites); unset uses the default and +# completes; invalid warns and falls back to the default +# (completes). +# Row 10 (real-environment half) — both the abort and no-abort verdicts +# are read against the REAL RSS of a real process, not an +# injected value. +# +# Also checks the store-integrity finding (R4): an aborted index must never +# leave a partially-written .db — dump_and_persist_hashes() (the SQLite +# write) runs strictly after cbm_parallel_extract() in pipeline.c, so an +# abort during extract precedes any DB write. This script asserts no .db +# file (partial or otherwise) is left behind by an aborted run. +# +# IMPORTANT — incremental indexing gotcha: `index_repository` with +# mode=full does NOT force a full re-parse when file hashes are unchanged +# from a prior run against the same repo_path (it routes to +# incremental.noop). Every run below that needs a genuine full extraction +# pass (i.e. any run whose PURPOSE is to exercise cbm_max_file_bytes() / +# cbm_mem_abort_if_over_ceiling() on real file reads) deletes the target's +# .db first. Skipping that reset is a silent false-pass, not a crash — the +# process still exits 0 (it did no real work), so watch for that pattern. +# +# Usage: bash tests/mem_ceiling_abort.sh +# Exit 0 on success, non-zero on failure. SLOW (generates GB-scale fixtures) — +# intended for the IO build host, not routine `make test`. + +set -uo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +BINARY="$ROOT/build/c/codebase-memory-mcp" +GEN="$ROOT/tests/gen_mem_ceiling_repro.sh" +CACHE_DIR="${HOME}/.cache/codebase-memory-mcp" +FAILURES=0 +WORKDIR="" + +cleanup() { + [[ -n "$WORKDIR" && -d "$WORKDIR" ]] && rm -rf "$WORKDIR" +} +trap cleanup EXIT + +if [[ ! -x "$BINARY" ]]; then + echo "[mem_ceiling] FAIL: binary not found at $BINARY (build first: make -f Makefile.cbm cbm)" >&2 + exit 2 +fi + +project_name() { printf '%s' "$1" | sed 's#^/##; s#[^A-Za-z0-9._-]#-#g'; } + +db_path_for() { + local repo="$1" + printf '%s/%s.db' "$CACHE_DIR" "$(project_name "$repo")" +} + +# Force the NEXT index_repository call against $1 to do a genuine full +# extraction (not an incremental no-op) by wiping its cached .db/.tmp. +reset_index_state() { + local repo="$1" db + db="$(db_path_for "$repo")" + rm -f "$db" "${db}.tmp" +} + +# Run `cli index_repository` against $1, with env overrides from the +# remaining args (NAME=VALUE pairs), capturing exit code + stderr. +# Sets: LAST_EXIT, LAST_LOG (path). +run_index() { + local repo="$1"; shift + local log; log="$(mktemp)" + LAST_LOG="$log" + # shellcheck disable=SC2086 + env "$@" "$BINARY" cli index_repository "{\"repo_path\":\"$repo\",\"mode\":\"full\"}" \ + >"$log" 2>&1 + LAST_EXIT=$? +} + +# ── Row 7 + row 10 (no-abort half): healthy real-repo index, default ceiling ── +echo "[mem_ceiling] Row 7/10: healthy real-repo index must NOT abort (default ceiling)..." >&2 +reset_index_state "$ROOT" +run_index "$ROOT" +if [[ "$LAST_EXIT" -ne 0 ]]; then + echo "[mem_ceiling] FAIL [row7]: healthy real-repo index aborted (exit=$LAST_EXIT) under default ceiling" >&2 + tail -20 "$LAST_LOG" >&2 + FAILURES=$((FAILURES + 1)) +else + echo "[mem_ceiling] PASS [row7]: healthy index completed (exit=0)" >&2 +fi +rm -f "$LAST_LOG" + +# ── Row 9 (unset leg): explicit unset also completes ───────────────────── +echo "[mem_ceiling] Row 9 (unset): explicit CBM_MEM_CEILING_MB unset uses default, completes..." >&2 +reset_index_state "$ROOT" +run_index "$ROOT" +if [[ "$LAST_EXIT" -ne 0 ]]; then + echo "[mem_ceiling] FAIL [row9-unset]: index aborted with ceiling env unset (exit=$LAST_EXIT)" >&2 + FAILURES=$((FAILURES + 1)) +else + echo "[mem_ceiling] PASS [row9-unset]" >&2 +fi +rm -f "$LAST_LOG" + +# ── Row 9 (invalid leg): non-numeric override warns + falls back, completes ── +echo "[mem_ceiling] Row 9 (invalid): non-numeric CBM_MEM_CEILING_MB falls back to default..." >&2 +reset_index_state "$ROOT" +run_index "$ROOT" CBM_MEM_CEILING_MB="not-a-number" +if [[ "$LAST_EXIT" -ne 0 ]]; then + echo "[mem_ceiling] FAIL [row9-invalid]: index aborted with invalid ceiling env (exit=$LAST_EXIT)" >&2 + FAILURES=$((FAILURES + 1)) +else + if ! grep -q "mem_ceiling.env.invalid" "$LAST_LOG"; then + echo "[mem_ceiling] FAIL [row9-invalid]: no warn log for invalid CBM_MEM_CEILING_MB" >&2 + FAILURES=$((FAILURES + 1)) + else + echo "[mem_ceiling] PASS [row9-invalid]: warned + completed" >&2 + fi +fi +rm -f "$LAST_LOG" + +# ── Row 6 + Row 9(low) + R4: synthetic large-file repro forces cumulative +# RSS past a ceiling set BELOW its expected peak but ABOVE the 2048MB +# floor (mem_ceiling_floor_applies_on_tiny_ram in test_mem.c already +# pins the floor's own arithmetic in-process; this proves the knob bites +# on a REAL run — a ceiling below the floor would just get clamped up +# to the floor and never trip, so the fixture must genuinely exceed the +# CHOSEN ceiling, not the floor). Sized well past what a healthy 587-file +# real repo would ever reach (~1-1.3GB peak per the diagnosis), so a +# false negative here is a genuine "doesn't bite" signal, not sizing +# noise. ────────────────────────────────────────────────────────────── +CEILING_TEST_MB=2100 +echo "[mem_ceiling] Row 6/9(low)/R4: generating a large-file repro sized to exceed a ${CEILING_TEST_MB}MB forced ceiling..." >&2 +WORKDIR="$(mktemp -d /tmp/cbm_mem_ceiling_repro.XXXXXX)" +# 40 tiny + 40 large (~8MB each = ~320MB source) — mirrors the ORIGINAL +# diagnosis repro shape (600 tiny + 40 large ~7.6MB files -> 8.8GB on IO) +# on file COUNT and per-file size (tiny-file count trimmed 600->40; tiny +# files barely contribute RSS, so this doesn't change the mechanism). +# CALIBRATED by direct probe on IO/CBM_WORKERS=8 with NO ceiling override: +# this exact shape peaks at rss_mb=2198 (parallel.extract.mem log). The +# floor (CBM_MEM_CEILING_FLOOR_MB, mem.c) is 2048MB — a ceiling below that +# gets clamped UP to the floor and would never trip (this is what silently +# broke the first two attempts at 3000MB, and an earlier 20x15MB/3000MB +# combination that just didn't reach 3000MB at all). 2100MB sits strictly +# between the floor (2048) and the calibrated peak (2198), so it is NOT +# floor-clamped and WILL trip against this exact fixture shape on this +# exact host. Re-calibrate CEILING_TEST_MB (or the gen call) if run on a +# host with materially different allocator/CPU behaviour. +bash "$GEN" "$WORKDIR" 40 40 8 +reset_index_state "$WORKDIR" +run_index "$WORKDIR" CBM_MEM_CEILING_MB="$CEILING_TEST_MB" CBM_WORKERS="8" +db_synth="$(db_path_for "$WORKDIR")" +if [[ "$LAST_EXIT" -eq 0 ]]; then + echo "[mem_ceiling] FAIL [row6/row9-low]: synthetic fixture did NOT trip a ${CEILING_TEST_MB}MB forced ceiling — either the knob doesn't bite, or the fixture is undersized for this host's allocator behaviour (IO: 31GB/8 cores). Widen n_large/size_mb in the gen call above if this needs to be load-bearing on a beefier host." >&2 + FAILURES=$((FAILURES + 1)) +else + echo "[mem_ceiling] index aborted as expected (exit=$LAST_EXIT)" >&2 + if ! grep -q "mem.ceiling.abort" "$LAST_LOG"; then + echo "[mem_ceiling] FAIL [row6]: no mem.ceiling.abort diagnostic dump in output" >&2 + tail -10 "$LAST_LOG" >&2 + FAILURES=$((FAILURES + 1)) + elif ! grep -qE "file=\S+ phase=\S+ rss_mb=[0-9]+ ceiling_mb=[0-9]+" "$LAST_LOG"; then + echo "[mem_ceiling] FAIL [row6]: diagnostic dump missing file/phase/rss_mb/ceiling_mb fields" >&2 + grep "mem.ceiling.abort" "$LAST_LOG" >&2 + FAILURES=$((FAILURES + 1)) + else + echo "[mem_ceiling] PASS [row6]: diagnostic dump present with file+phase+rss" >&2 + grep "mem.ceiling.abort" "$LAST_LOG" >&2 + fi +fi +# R4: an aborted run must leave no (partial) .db file — the abort happens +# strictly before dump_and_persist_hashes()/cbm_gbuf_dump_to_sqlite(). Only +# meaningful when the run actually aborted (LAST_EXIT != 0) — a completed +# run legitimately leaves a .db behind, that is not a violation. +if [[ "$LAST_EXIT" -ne 0 ]]; then + if [[ -f "$db_synth" || -f "${db_synth}.tmp" ]]; then + echo "[mem_ceiling] FAIL [R4]: aborted run left a .db (or .db.tmp) file — store-integrity violation: $db_synth" >&2 + FAILURES=$((FAILURES + 1)) + else + echo "[mem_ceiling] PASS [R4]: aborted run left no .db file (extract-phase abort precedes any SQLite write)" >&2 + fi +else + echo "[mem_ceiling] SKIP [R4]: run did not abort, nothing to check (see row6/row9-low failure above)" >&2 +fi +rm -f "$LAST_LOG" "$db_synth" "${db_synth}.tmp" + +# ── Final result ─────────────────────────────────────────────────────── +if [[ "$FAILURES" -gt 0 ]]; then + echo "[mem_ceiling] FAILED: $FAILURES check(s) failed." >&2 + exit 1 +fi + +echo "[mem_ceiling] All checks passed (rows 6, 7, 9, 10-real-env, R4 store-integrity)." >&2 +exit 0 diff --git a/tests/test_importance.c b/tests/test_importance.c new file mode 100644 index 000000000..af20f3cac --- /dev/null +++ b/tests/test_importance.c @@ -0,0 +1,628 @@ +/* + * test_importance.c — AC1 (spec Part 1 / P3): index-time persisted per-symbol + * importance score. Test plan: test-plan.md rows #1-#7, #10-#12 (fixture + * suite). Rows #8/#9 (real-corpus connectors sweep) and #13 (weighted-degree + * vs PageRank judgment-set decision) are manual/artifact checks run and + * recorded outside this suite (builder-notes.md). + * + * Formula under test (pass_importance.c): + * importance = sqrt(num_refs) x priv x generic x distinct x test_penalty + */ +#include "test_framework.h" +#include "test_helpers.h" +#include "pipeline/pipeline.h" +#include "pipeline/pipeline_internal.h" +#include "store/store.h" +#include "graph_buffer/graph_buffer.h" +#include "foundation/compat.h" +#include "yyjson/yyjson.h" + +#include +#include +#include +#include +#include + +/* ── Shared helpers ───────────────────────────────────────────────────── */ + +/* Parse the numeric value of "importance" out of a properties_json blob. + * Returns a sentinel (-999.0) if the key is absent -- callers must check + * presence separately (strstr) before trusting the parsed value. */ +static double extract_importance(const char *json) { + if (!json) { + return -999.0; + } + const char *p = strstr(json, "\"importance\":"); + if (!p) { + return -999.0; + } + p += strlen("\"importance\":"); + return strtod(p, NULL); +} + +static int write_files(const char *dir, const char *const *names, const char *const *contents, + int n) { + for (int i = 0; i < n; i++) { + char path[600]; + snprintf(path, sizeof(path), "%s/%s", dir, names[i]); + if (th_write_file(path, contents[i]) != 0) { + return -1; + } + } + return 0; +} + +/* ── Row #1: every Function/Method/Class node carries the score; + * File nodes are excluded by design ─────────────────────────────────── */ + +TEST(importance_present_on_symbol_nodes_full_reindex) { + char *tmp = th_mktempdir("cbm_imp1"); + if (!tmp) { + FAIL("tmpdir"); + } + char tmpdir[256]; + snprintf(tmpdir, sizeof(tmpdir), "%s", tmp); + + const char *names[] = {"main.py"}; + const char *contents[] = {"def helper():\n return 1\n\n" + "def caller():\n return helper() + helper()\n\n" + "class Widget:\n def render(self):\n return helper()\n"}; + if (write_files(tmpdir, names, contents, 1) != 0) { + th_rmtree(tmpdir); + FAIL("write fixture"); + } + + char db_path[512]; + snprintf(db_path, sizeof(db_path), "%s/graph.db", tmpdir); + cbm_pipeline_t *p = cbm_pipeline_new(tmpdir, db_path, CBM_MODE_FULL); + ASSERT_NOT_NULL(p); + ASSERT_EQ(cbm_pipeline_run(p), 0); + + cbm_store_t *s = cbm_store_open_path(db_path); + ASSERT_NOT_NULL(s); + const char *project = cbm_pipeline_project_name(p); + + cbm_node_t *funcs = NULL; + int fc = 0; + ASSERT_EQ(cbm_store_find_nodes_by_label(s, project, "Function", &funcs, &fc), CBM_STORE_OK); + ASSERT_GT(fc, 0); + for (int i = 0; i < fc; i++) { + ASSERT_NOT_NULL(funcs[i].properties_json); + ASSERT_TRUE(strstr(funcs[i].properties_json, "\"importance\":") != NULL); + } + cbm_store_free_nodes(funcs, fc); + + cbm_node_t *methods = NULL; + int mc = 0; + ASSERT_EQ(cbm_store_find_nodes_by_label(s, project, "Method", &methods, &mc), CBM_STORE_OK); + ASSERT_GT(mc, 0); + for (int i = 0; i < mc; i++) { + ASSERT_TRUE(strstr(methods[i].properties_json, "\"importance\":") != NULL); + } + cbm_store_free_nodes(methods, mc); + + cbm_node_t *classes = NULL; + int cc = 0; + ASSERT_EQ(cbm_store_find_nodes_by_label(s, project, "Class", &classes, &cc), CBM_STORE_OK); + ASSERT_GT(cc, 0); + for (int i = 0; i < cc; i++) { + ASSERT_TRUE(strstr(classes[i].properties_json, "\"importance\":") != NULL); + } + cbm_store_free_nodes(classes, cc); + + /* Design: File nodes are excluded -- the pass only touches + * Function/Method/Class. */ + cbm_node_t *files = NULL; + int filec = 0; + ASSERT_EQ(cbm_store_find_nodes_by_label(s, project, "File", &files, &filec), CBM_STORE_OK); + for (int i = 0; i < filec; i++) { + if (files[i].properties_json) { + ASSERT_TRUE(strstr(files[i].properties_json, "\"importance\":") == NULL); + } + } + cbm_store_free_nodes(files, filec); + + cbm_store_close(s); + cbm_pipeline_free(p); + th_rmtree(tmpdir); + PASS(); +} + +/* ── Row #2: base = sqrt(num_refs), num_refs = incoming CALLS + USAGE, + * including the num_refs == 0 floor ─────────────────────────────────── */ + +TEST(importance_base_sqrt_num_refs) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + int64_t target = + cbm_gbuf_upsert_node(gb, "Function", "target", "pkg.target", "pkg/main.go", 1, 1, "{}"); + int64_t caller1 = + cbm_gbuf_upsert_node(gb, "Function", "caller1", "pkg.caller1", "pkg/main.go", 2, 2, "{}"); + int64_t caller2 = + cbm_gbuf_upsert_node(gb, "Function", "caller2", "pkg.caller2", "pkg/main.go", 3, 3, "{}"); + int64_t consumer = cbm_gbuf_upsert_node(gb, "Function", "consumer", "pkg.consumer", + "pkg/main.go", 4, 4, "{}"); + int64_t lonely = + cbm_gbuf_upsert_node(gb, "Function", "lonelyfn", "pkg.lonelyfn", "pkg/main.go", 5, 5, "{}"); + ASSERT_GT(target, 0); + ASSERT_GT(lonely, 0); + + cbm_gbuf_insert_edge(gb, caller1, target, "CALLS", "{}"); + cbm_gbuf_insert_edge(gb, caller2, target, "CALLS", "{}"); + cbm_gbuf_insert_edge(gb, consumer, target, "USAGE", "{}"); /* spans BOTH edge types */ + + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", + .repo_path = "/tmp/test", + .gbuf = gb, + .registry = NULL, + .cancelled = &cancelled, + }; + cbm_pipeline_pass_importance(&ctx); + + const cbm_gbuf_node_t *tnode = cbm_gbuf_find_by_id(gb, target); + ASSERT_NOT_NULL(tnode); + ASSERT_TRUE(strstr(tnode->properties_json, "\"importance\":") != NULL); + ASSERT_FLOAT_EQ(extract_importance(tnode->properties_json), sqrt(3.0), 1e-6); + + /* num_refs == 0 -> sqrt(0) == 0 floor */ + const cbm_gbuf_node_t *lnode = cbm_gbuf_find_by_id(gb, lonely); + ASSERT_NOT_NULL(lnode); + ASSERT_TRUE(strstr(lnode->properties_json, "\"importance\":") != NULL); + ASSERT_FLOAT_EQ(extract_importance(lnode->properties_json), 0.0, 1e-9); + + cbm_gbuf_free(gb); + PASS(); +} + +/* ── Row #3: x0.1 for private (leading-underscore) symbols ───────────── */ + +TEST(importance_private_multiplier) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + int64_t priv_target = + cbm_gbuf_upsert_node(gb, "Function", "_run", "pkg._run", "pkg/main.go", 1, 1, "{}"); + int64_t pub_target = + cbm_gbuf_upsert_node(gb, "Function", "pubfn", "pkg.pubfn", "pkg/main.go", 2, 2, "{}"); + + for (int i = 0; i < 4; i++) { + char name[32], qn[48]; + snprintf(name, sizeof(name), "pcaller%d", i); + snprintf(qn, sizeof(qn), "pkg.pcaller%d", i); + int64_t c = cbm_gbuf_upsert_node(gb, "Function", name, qn, "pkg/main.go", 10 + i, 10 + i, + "{}"); + cbm_gbuf_insert_edge(gb, c, priv_target, "CALLS", "{}"); + } + for (int i = 0; i < 4; i++) { + char name[32], qn[48]; + snprintf(name, sizeof(name), "qcaller%d", i); + snprintf(qn, sizeof(qn), "pkg.qcaller%d", i); + int64_t c = cbm_gbuf_upsert_node(gb, "Function", name, qn, "pkg/main.go", 20 + i, 20 + i, + "{}"); + cbm_gbuf_insert_edge(gb, c, pub_target, "CALLS", "{}"); + } + + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", .repo_path = "/tmp/test", .gbuf = gb, .cancelled = &cancelled}; + cbm_pipeline_pass_importance(&ctx); + + const cbm_gbuf_node_t *priv_node = cbm_gbuf_find_by_id(gb, priv_target); + const cbm_gbuf_node_t *pub_node = cbm_gbuf_find_by_id(gb, pub_target); + ASSERT_FLOAT_EQ(extract_importance(priv_node->properties_json), sqrt(4.0) * 0.1, 1e-6); + ASSERT_FLOAT_EQ(extract_importance(pub_node->properties_json), sqrt(4.0), 1e-6); + + cbm_gbuf_free(gb); + PASS(); +} + +/* ── Row #4: x0.1 when a name is DEFINED in >=5 DISTINCT files (not nodes); + * boundary N-1 (4 files) vs N (5 files) ──────────────────────────────── */ + +TEST(importance_generic_name_multiplier) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + /* "clientx": defined in exactly 5 DISTINCT files -> generic. */ + int64_t generic_target = 0; + for (int i = 0; i < 5; i++) { + char qn[48], file[48]; + snprintf(qn, sizeof(qn), "pkg%d.clientx", i); + snprintf(file, sizeof(file), "pkg%d/client.go", i); + int64_t id = cbm_gbuf_upsert_node(gb, "Function", "clientx", qn, file, 1, 1, "{}"); + if (i == 4) { + generic_target = id; /* the one we'll assert on */ + } + } + int64_t g_caller = + cbm_gbuf_upsert_node(gb, "Function", "gcaller", "pkg.gcaller", "pkg/main.go", 1, 1, "{}"); + cbm_gbuf_insert_edge(gb, g_caller, generic_target, "CALLS", "{}"); + + /* "widgetxx": defined in exactly 4 DISTINCT files (N-1 boundary) -> NOT generic. */ + int64_t boundary_target = 0; + for (int i = 0; i < 4; i++) { + char qn[48], file[48]; + snprintf(qn, sizeof(qn), "wpkg%d.widgetxx", i); + snprintf(file, sizeof(file), "wpkg%d/widget.go", i); + int64_t id = cbm_gbuf_upsert_node(gb, "Function", "widgetxx", qn, file, 1, 1, "{}"); + if (i == 3) { + boundary_target = id; + } + } + int64_t b_caller = + cbm_gbuf_upsert_node(gb, "Function", "bcaller", "pkg.bcaller", "pkg/main.go", 2, 2, "{}"); + cbm_gbuf_insert_edge(gb, b_caller, boundary_target, "CALLS", "{}"); + + /* "helperyy": 5 total definitions but only 4 DISTINCT files (2 defs share + * fileA) -> must count distinct FILES, not distinct nodes -> NOT generic. */ + int64_t dup_target = cbm_gbuf_upsert_node(gb, "Function", "helperyy", "fileA.helperyy_1", + "fileA.go", 1, 1, "{}"); + cbm_gbuf_upsert_node(gb, "Function", "helperyy", "fileA.helperyy_2", "fileA.go", 5, 5, "{}"); + cbm_gbuf_upsert_node(gb, "Function", "helperyy", "fileB.helperyy", "fileB.go", 1, 1, "{}"); + cbm_gbuf_upsert_node(gb, "Function", "helperyy", "fileC.helperyy", "fileC.go", 1, 1, "{}"); + cbm_gbuf_upsert_node(gb, "Function", "helperyy", "fileD.helperyy", "fileD.go", 1, 1, "{}"); + int64_t d_caller = + cbm_gbuf_upsert_node(gb, "Function", "dcaller", "pkg.dcaller", "pkg/main.go", 3, 3, "{}"); + cbm_gbuf_insert_edge(gb, d_caller, dup_target, "CALLS", "{}"); + + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", .repo_path = "/tmp/test", .gbuf = gb, .cancelled = &cancelled}; + cbm_pipeline_pass_importance(&ctx); + + const cbm_gbuf_node_t *gn = cbm_gbuf_find_by_id(gb, generic_target); + const cbm_gbuf_node_t *bn = cbm_gbuf_find_by_id(gb, boundary_target); + const cbm_gbuf_node_t *dn = cbm_gbuf_find_by_id(gb, dup_target); + ASSERT_FLOAT_EQ(extract_importance(gn->properties_json), sqrt(1.0) * 0.1, 1e-6); + ASSERT_FLOAT_EQ(extract_importance(bn->properties_json), sqrt(1.0), 1e-6); + ASSERT_FLOAT_EQ(extract_importance(dn->properties_json), sqrt(1.0), 1e-6); + + cbm_gbuf_free(gb); + PASS(); +} + +/* ── Row #5: x10 for distinctive (snake_case OR camelCase) AND len>=8; + * len==8 vs len==7 boundary; a plain len>=8 lowercase word gets no bonus ── */ + +TEST(importance_distinctive_identifier_multiplier) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + struct { + const char *name; + double expected_multiplier; + } cases[] = { + {"make_proposals", 10.0}, /* snake, len 14 */ + {"parseInvoice", 10.0}, /* camel, len 12 */ + {"run", 1.0}, /* len < 8 */ + {"now", 1.0}, /* len < 8 */ + {"ab_cdefg", 10.0}, /* snake, len == 8 (boundary: bonus) */ + {"ab_cdef", 1.0}, /* snake, len == 7 (boundary: no bonus) */ + {"duplicate", 1.0}, /* len 9, all-lowercase, neither snake nor camel */ + }; + enum { N_CASES = 7 }; + int64_t ids[N_CASES]; + + for (int i = 0; i < N_CASES; i++) { + char qn[64]; + snprintf(qn, sizeof(qn), "pkg.%s", cases[i].name); + ids[i] = + cbm_gbuf_upsert_node(gb, "Function", cases[i].name, qn, "pkg/main.go", i + 1, i + 1, "{}"); + char cname[32], cqn[64]; + snprintf(cname, sizeof(cname), "caller_of_%d", i); + snprintf(cqn, sizeof(cqn), "pkg.caller_of_%d", i); + int64_t caller = + cbm_gbuf_upsert_node(gb, "Function", cname, cqn, "pkg/main.go", 100 + i, 100 + i, "{}"); + cbm_gbuf_insert_edge(gb, caller, ids[i], "CALLS", "{}"); + } + + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", .repo_path = "/tmp/test", .gbuf = gb, .cancelled = &cancelled}; + cbm_pipeline_pass_importance(&ctx); + + for (int i = 0; i < N_CASES; i++) { + const cbm_gbuf_node_t *n = cbm_gbuf_find_by_id(gb, ids[i]); + ASSERT_NOT_NULL(n); + /* num_refs == 1 for every case -> importance == distinct multiplier directly. */ + ASSERT_FLOAT_EQ(extract_importance(n->properties_json), cases[i].expected_multiplier, 1e-6); + } + + cbm_gbuf_free(gb); + PASS(); +} + +/* ── Row #6: edge-based test penalty (TESTS/TESTS_FILE), NOT filename + * strstr -- central port claim + inversion/discriminating checks ───── */ + +TEST(importance_test_penalty_edge_based) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + /* (a) real test helper in a NON-test-named file, called directly by a + * Test-named function -> gets a TESTS edge as TARGET -> penalized. + * Proves edge-based detection catches what strstr would miss. */ + int64_t helper = cbm_gbuf_upsert_node(gb, "Function", "make_helper_fn", "pkg.make_helper_fn", + "testutil.go", 1, 1, "{}"); + int64_t test_fn = cbm_gbuf_upsert_node(gb, "Function", "TestSomething", "pkg.TestSomething", + "foo_test.go", 1, 1, "{}"); + cbm_gbuf_insert_edge(gb, test_fn, helper, "TESTS", "{}"); + int64_t helper_caller = cbm_gbuf_upsert_node(gb, "Function", "othercaller", "pkg.othercaller", + "pkg/main.go", 1, 1, "{}"); + cbm_gbuf_insert_edge(gb, helper_caller, helper, "CALLS", "{}"); + + /* Control: same shape (snake, len>=8, num_refs=1) but NO TESTS edge. */ + int64_t control = cbm_gbuf_upsert_node(gb, "Function", "make_control_fn", "pkg.make_control_fn", + "pkg/prod.go", 1, 1, "{}"); + int64_t control_caller = cbm_gbuf_upsert_node(gb, "Function", "ctrlcaller", "pkg.ctrlcaller", + "pkg/main.go", 2, 2, "{}"); + cbm_gbuf_insert_edge(gb, control_caller, control, "CALLS", "{}"); + + /* (b) inversion/discriminating check: PATH contains "test" as a + * substring but there is NO TESTS/TESTS_FILE edge -> must NOT be + * penalized (proves the strstr path was truly replaced). */ + int64_t path_substr = cbm_gbuf_upsert_node(gb, "Function", "latest_helper_fn", + "pkg.latest_helper_fn", "src/testutil_helpers.go", 1, + 1, "{}"); + int64_t ps_caller = + cbm_gbuf_upsert_node(gb, "Function", "pscaller", "pkg.pscaller", "pkg/main.go", 3, 3, "{}"); + cbm_gbuf_insert_edge(gb, ps_caller, path_substr, "CALLS", "{}"); + + /* (c) a helper colocated in a genuine test file, detected via the + * TESTS_FILE file-level edge (pass_tests never emits a direct TESTS + * edge here because the target already lives in a test-classified + * file -- see create_tests_edges' tgt_is_test exclusion). */ + int64_t test_file_node = + cbm_gbuf_upsert_node(gb, "File", "recA_test.go", "pkg.__file__.recA_test.go", + "recA_test.go", 0, 0, "{}"); + int64_t prod_file_node = cbm_gbuf_upsert_node(gb, "File", "recA.go", "pkg.__file__.recA.go", + "recA.go", 0, 0, "{}"); + cbm_gbuf_insert_edge(gb, test_file_node, prod_file_node, "TESTS_FILE", "{}"); + int64_t seed_fn = cbm_gbuf_upsert_node(gb, "Function", "seed_recording_fn", + "pkg.seed_recording_fn", "recA_test.go", 5, 5, "{}"); + int64_t seed_caller = + cbm_gbuf_upsert_node(gb, "Function", "seedcaller", "pkg.seedcaller", "pkg/main.go", 4, 4, + "{}"); + cbm_gbuf_insert_edge(gb, seed_caller, seed_fn, "CALLS", "{}"); + + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", .repo_path = "/tmp/test", .gbuf = gb, .cancelled = &cancelled}; + cbm_pipeline_pass_importance(&ctx); + + /* num_refs=1, distinct(snake,len>=8)=10 for all of these -> the only + * variable is the test_penalty factor (0.1 penalized, 1.0 not). */ + ASSERT_FLOAT_EQ(extract_importance(cbm_gbuf_find_by_id(gb, helper)->properties_json), + sqrt(1.0) * 10.0 * 0.1, 1e-6); + ASSERT_FLOAT_EQ(extract_importance(cbm_gbuf_find_by_id(gb, control)->properties_json), + sqrt(1.0) * 10.0, 1e-6); + ASSERT_FLOAT_EQ(extract_importance(cbm_gbuf_find_by_id(gb, path_substr)->properties_json), + sqrt(1.0) * 10.0, 1e-6); /* NOT penalized despite "test" substring */ + ASSERT_FLOAT_EQ(extract_importance(cbm_gbuf_find_by_id(gb, seed_fn)->properties_json), + sqrt(1.0) * 10.0 * 0.1, 1e-6); /* penalized via TESTS_FILE membership */ + + cbm_gbuf_free(gb); + PASS(); +} + +/* ── Row #7: the pass runs AFTER pass_tests and CALLS/USAGE edges exist + * (full-pipeline ordering check, real pass_tests edge creation) ────── */ + +TEST(importance_ordering_after_tests_and_calls) { + char *tmp = th_mktempdir("cbm_imp7"); + if (!tmp) { + FAIL("tmpdir"); + } + char tmpdir[256]; + snprintf(tmpdir, sizeof(tmpdir), "%s", tmp); + + const char *names[] = {"helpers.go", "main_test.go"}; + const char *contents[] = { + "package main\n\nfunc make_fixture_fn() {}\n", + "package main\n\nimport \"testing\"\n\nfunc TestFixture(t *testing.T) { make_fixture_fn() }\n"}; + if (write_files(tmpdir, names, contents, 2) != 0) { + th_rmtree(tmpdir); + FAIL("write fixture"); + } + + char db_path[512]; + snprintf(db_path, sizeof(db_path), "%s/graph.db", tmpdir); + cbm_pipeline_t *p = cbm_pipeline_new(tmpdir, db_path, CBM_MODE_FULL); + ASSERT_NOT_NULL(p); + ASSERT_EQ(cbm_pipeline_run(p), 0); + + cbm_store_t *s = cbm_store_open_path(db_path); + ASSERT_NOT_NULL(s); + const char *project = cbm_pipeline_project_name(p); + + cbm_node_t *funcs = NULL; + int fc = 0; + ASSERT_EQ(cbm_store_find_nodes_by_label(s, project, "Function", &funcs, &fc), CBM_STORE_OK); + const cbm_node_t *fixture = NULL; + for (int i = 0; i < fc; i++) { + if (strcmp(funcs[i].name, "make_fixture_fn") == 0) { + fixture = &funcs[i]; + break; + } + } + ASSERT_NOT_NULL(fixture); + /* Only called by TestFixture (num_refs=1), snake+len>=8 (distinct=10), + * called-by-test (test_penalty=0.1) -> exactly 1.0. A mis-ordered pass + * would see num_refs=0 (score 0) or test_penalty=1.0 (score 10) instead + * -- either misordering is distinguishable from the expected 1.0. */ + ASSERT_TRUE(strstr(fixture->properties_json, "\"importance\":") != NULL); + ASSERT_FLOAT_EQ(extract_importance(fixture->properties_json), 1.0, 1e-6); + + cbm_store_free_nodes(funcs, fc); + cbm_store_close(s); + cbm_pipeline_free(p); + th_rmtree(tmpdir); + PASS(); +} + +/* ── Row #10: persisted score survives dump -> SQLite -> read-back ──── */ + +TEST(importance_round_trip_persist) { + char *tmp = th_mktempdir("cbm_imp10"); + if (!tmp) { + FAIL("tmpdir"); + } + char tmpdir[256]; + snprintf(tmpdir, sizeof(tmpdir), "%s", tmp); + char db_path[512]; + snprintf(db_path, sizeof(db_path), "%s/roundtrip.db", tmpdir); + + cbm_store_t *s1 = cbm_store_open_path(db_path); + ASSERT_NOT_NULL(s1); + cbm_store_upsert_project(s1, "proj", tmpdir); + cbm_node_t n = {.project = "proj", + .label = "Function", + .name = "scored_fn", + .qualified_name = "proj.scored_fn", + .file_path = "f.go", + .properties_json = "{\"importance\":3.140000}"}; + int64_t id = cbm_store_upsert_node(s1, &n); + ASSERT_GT(id, 0); + cbm_store_checkpoint(s1); + cbm_store_close(s1); + + cbm_store_t *s2 = cbm_store_open_path(db_path); + ASSERT_NOT_NULL(s2); + cbm_node_t out = {0}; + ASSERT_EQ(cbm_store_find_node_by_qn(s2, "proj", "proj.scored_fn", &out), CBM_STORE_OK); + ASSERT_NOT_NULL(out.properties_json); + ASSERT_TRUE(strstr(out.properties_json, "\"importance\":") != NULL); + ASSERT_FLOAT_EQ(extract_importance(out.properties_json), 3.14, 1e-6); + cbm_node_free_fields(&out); + cbm_store_close(s2); + + th_rmtree(tmpdir); + PASS(); +} + +/* ── Row #11: AC7 migration -- a pre-feature row reads absent/null without + * crashing; re-indexing (rewriting the row) makes the score present ─── */ + +TEST(importance_migration_null_until_reindex) { + char *tmp = th_mktempdir("cbm_imp11"); + if (!tmp) { + FAIL("tmpdir"); + } + char tmpdir[256]; + snprintf(tmpdir, sizeof(tmpdir), "%s", tmp); + char db_path[512]; + snprintf(db_path, sizeof(db_path), "%s/legacy.db", tmpdir); + + /* Simulate a pre-feature index: a node with NO $.importance key. */ + cbm_store_t *s1 = cbm_store_open_path(db_path); + ASSERT_NOT_NULL(s1); + cbm_store_upsert_project(s1, "proj", tmpdir); + cbm_node_t legacy = {.project = "proj", + .label = "Function", + .name = "legacy_fn", + .qualified_name = "proj.legacy_fn", + .file_path = "f.go", + .properties_json = "{}"}; + ASSERT_GT(cbm_store_upsert_node(s1, &legacy), 0); + cbm_store_checkpoint(s1); + cbm_store_close(s1); + + /* Reopen: absent key reads cleanly, no crash, no error. */ + cbm_store_t *s2 = cbm_store_open_path(db_path); + ASSERT_NOT_NULL(s2); + cbm_node_t out1 = {0}; + ASSERT_EQ(cbm_store_find_node_by_qn(s2, "proj", "proj.legacy_fn", &out1), CBM_STORE_OK); + ASSERT_NOT_NULL(out1.properties_json); + ASSERT_TRUE(strstr(out1.properties_json, "\"importance\":") == NULL); + cbm_node_free_fields(&out1); + + /* Re-index: rewrite the row with the score present (upsert by same QN). */ + cbm_node_t reindexed = {.project = "proj", + .label = "Function", + .name = "legacy_fn", + .qualified_name = "proj.legacy_fn", + .file_path = "f.go", + .properties_json = "{\"importance\":5.000000}"}; + ASSERT_GT(cbm_store_upsert_node(s2, &reindexed), 0); + cbm_store_checkpoint(s2); + cbm_store_close(s2); + + cbm_store_t *s3 = cbm_store_open_path(db_path); + ASSERT_NOT_NULL(s3); + cbm_node_t out2 = {0}; + ASSERT_EQ(cbm_store_find_node_by_qn(s3, "proj", "proj.legacy_fn", &out2), CBM_STORE_OK); + ASSERT_TRUE(strstr(out2.properties_json, "\"importance\":") != NULL); + ASSERT_FLOAT_EQ(extract_importance(out2.properties_json), 5.0, 1e-6); + cbm_node_free_fields(&out2); + cbm_store_close(s3); + + th_rmtree(tmpdir); + PASS(); +} + +/* ── Row #12: properties_json stays valid JSON after the append; the + * append guard refuses to touch an already-malformed object ────────── */ + +TEST(importance_properties_json_valid_after_append) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + int64_t a = cbm_gbuf_upsert_node(gb, "Function", "alpha_fn", "pkg.alpha_fn", "pkg/a.go", 1, 1, + "{}"); + int64_t b = cbm_gbuf_upsert_node(gb, "Function", "beta_fn", "pkg.beta_fn", "pkg/b.go", 1, 1, + "{\"docstring\":\"hi\"}"); + int64_t caller = + cbm_gbuf_upsert_node(gb, "Function", "caller_fn", "pkg.caller_fn", "pkg/c.go", 1, 1, "{}"); + cbm_gbuf_insert_edge(gb, caller, a, "CALLS", "{}"); + cbm_gbuf_insert_edge(gb, caller, b, "CALLS", "{}"); + + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", .repo_path = "/tmp/test", .gbuf = gb, .cancelled = &cancelled}; + cbm_pipeline_pass_importance(&ctx); + + /* Mutation/inversion-sensitive: any future edit that breaks the append's + * buffer sizing or escaping and produces malformed JSON fails HERE. */ + const int64_t ids[] = {a, b}; + for (int i = 0; i < 2; i++) { + const cbm_gbuf_node_t *n = cbm_gbuf_find_by_id(gb, ids[i]); + ASSERT_NOT_NULL(n->properties_json); + yyjson_doc *doc = yyjson_read(n->properties_json, strlen(n->properties_json), 0); + ASSERT_NOT_NULL(doc); + yyjson_val *root = yyjson_doc_get_root(doc); + ASSERT_TRUE(yyjson_is_obj(root)); + ASSERT_NOT_NULL(yyjson_obj_get(root, "importance")); + yyjson_doc_free(doc); + } + + /* Guard: a pre-existing malformed (non-`{...}`) properties_json is left + * untouched rather than further corrupted (mirrors + * append_complexity_props' bail behaviour). */ + int64_t bad = + cbm_gbuf_upsert_node(gb, "Function", "bad_fn", "pkg.bad_fn", "pkg/d.go", 1, 1, "{}"); + cbm_gbuf_node_t *bad_node = (cbm_gbuf_node_t *)cbm_gbuf_find_by_id(gb, bad); + ASSERT_NOT_NULL(bad_node); + free(bad_node->properties_json); + bad_node->properties_json = strdup("not-a-json-object"); + cbm_pipeline_importance_append_prop(bad_node, 5.0); + ASSERT_STR_EQ(bad_node->properties_json, "not-a-json-object"); + + cbm_gbuf_free(gb); + PASS(); +} + +SUITE(importance) { + RUN_TEST(importance_present_on_symbol_nodes_full_reindex); + RUN_TEST(importance_base_sqrt_num_refs); + RUN_TEST(importance_private_multiplier); + RUN_TEST(importance_generic_name_multiplier); + RUN_TEST(importance_distinctive_identifier_multiplier); + RUN_TEST(importance_test_penalty_edge_based); + RUN_TEST(importance_ordering_after_tests_and_calls); + RUN_TEST(importance_round_trip_persist); + RUN_TEST(importance_migration_null_until_reindex); + RUN_TEST(importance_properties_json_valid_after_append); +} diff --git a/tests/test_incremental.c b/tests/test_incremental.c index c210d5433..cb7b8d4cf 100644 --- a/tests/test_incremental.c +++ b/tests/test_incremental.c @@ -754,6 +754,22 @@ TEST(incr_batch_add_delete) { * PHASE 6: Recovery + accuracy * ══════════════════════════════════════════════════════════════════ */ +/* Computes the |before/after| percent diff for the DB-recovery check. + * Returns true and sets *out_diff_pct when nodes_before > 0; returns false + * (no division performed) when nodes_before <= 0, so the caller can FAIL + * with a clear message instead of dividing by zero (a starved/0-node + * baseline is a setup failure, not something to silently divide through). + * Pulled out to a standalone helper so the zero-baseline guard is pinned by + * a deterministic, network-free unit test (below) independent of the + * network-dependent fastapi-clone fixture this suite otherwise requires. */ +static bool compute_recovery_diff_pct(int nodes_before, int nodes_after, int *out_diff_pct) { + if (nodes_before <= 0) { + return false; + } + *out_diff_pct = abs(nodes_after - nodes_before) * 100 / nodes_before; + return true; +} + TEST(incr_db_deleted_recovery) { int nodes_before = get_node_count(); @@ -768,7 +784,10 @@ TEST(incr_db_deleted_recovery) { /* Full reindex must produce similar count */ int nodes_after = get_node_count(); - int diff_pct = abs(nodes_after - nodes_before) * 100 / nodes_before; + int diff_pct = 0; + if (!compute_recovery_diff_pct(nodes_before, nodes_after, &diff_pct)) { + FAIL("nodes_before == 0: baseline index produced no nodes, cannot compute recovery diff_pct"); + } ASSERT_LT(diff_pct, 5); printf(" [perf] db recovery (full reindex): %.0fms, peak=%zuMB\n", ms, peak_mb); @@ -776,6 +795,25 @@ TEST(incr_db_deleted_recovery) { PASS(); } +/* Deterministic, network-free guard pin for the nodes_before==0 / negative + * path: asserts the helper never performs the division (would SIGFPE) and + * signals failure via its return value instead. Registered outside the + * network-dependent fastapi fixture so it always runs (offline included). */ +TEST(incr_recovery_diff_pct_guards_zero_baseline) { + int diff_pct = 999; + ASSERT_FALSE(compute_recovery_diff_pct(0, 42, &diff_pct)); + ASSERT_EQ(diff_pct, 999); /* untouched -- confirms no divide happened */ + + diff_pct = 999; + ASSERT_FALSE(compute_recovery_diff_pct(-3, 42, &diff_pct)); + ASSERT_EQ(diff_pct, 999); + + ASSERT_TRUE(compute_recovery_diff_pct(100, 105, &diff_pct)); + ASSERT_EQ(diff_pct, 5); + + PASS(); +} + TEST(incr_accuracy_vs_full) { /* Modify a file to create a known incremental state */ write_file_at("fastapi/incr_accuracy.py", "def accuracy_a():\n return 1\n" @@ -2819,6 +2857,10 @@ TEST(tool_delete_and_verify) { * ══════════════════════════════════════════════════════════════════ */ SUITE(incremental) { + /* Deterministic, network-free: must run even when the fastapi-clone + * fixture setup below fails/is skipped offline. */ + RUN_TEST(incr_recovery_diff_pct_guards_zero_baseline); + if (incremental_setup() != 0) { printf(" SETUP FAILED — skipping incremental tests (network?)\n"); return; diff --git a/tests/test_main.c b/tests/test_main.c index 042d613e0..a1e999ffc 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -38,6 +38,7 @@ extern void suite_discover(void); extern void suite_graph_buffer(void); extern void suite_registry(void); extern void suite_pipeline(void); +extern void suite_importance(void); extern void suite_fqn(void); extern void suite_path_alias(void); extern void suite_watcher(void); @@ -150,6 +151,7 @@ int main(void) { /* Pipeline (M8) */ RUN_SUITE(registry); RUN_SUITE(pipeline); + RUN_SUITE(importance); RUN_SUITE(fqn); RUN_SUITE(path_alias); diff --git a/tests/test_mcp.c b/tests/test_mcp.c index 61c71b897..2bb08b1ad 100644 --- a/tests/test_mcp.c +++ b/tests/test_mcp.c @@ -629,6 +629,380 @@ TEST(tool_query_graph_missing_query) { PASS(); } +/* ══════════════════════════════════════════════════════════════════ + * PROJECT PARAM — missing-param vs not-found distinction + `project_id` + * alias (pai/codebase-memory-search). + * + * Root cause: cbm_mcp_get_string_arg returns NULL both for an absent key + * and for an unknown key like `project_id`. resolve_store(srv, NULL) then + * collapses "you forgot the param" into the SAME "project not found or not + * indexed" + available_projects shape REQUIRE_STORE emits for a genuinely + * unknown project — so a caller who mistypes `project_id` (instead of + * `project`) sees a self-contradicting error naming a project that IS in + * the available_projects list it just printed. + * ══════════════════════════════════════════════════════════════════ */ + +/* Row 1: missing `project` on every project-reading tool must return a + * NAMED missing-param error — never the conflated not-found shape. */ +static int check_missing_project_error(cbm_mcp_server_t *srv, const char *tool_name, + const char *args_json) { + char *raw = cbm_mcp_handle_tool(srv, tool_name, args_json); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "required")); + ASSERT_NOT_NULL(strstr(raw, "project")); + ASSERT_NULL(strstr(raw, "not found or not indexed")); + ASSERT_NULL(strstr(raw, "available_projects")); + free(raw); + return 0; +} + +TEST(missing_project_named_error_all_tools) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + ASSERT_EQ(check_missing_project_error(srv, "search_graph", "{}"), 0); + ASSERT_EQ(check_missing_project_error( + srv, "query_graph", "{\"query\":\"MATCH (f:Function) RETURN f.name\"}"), + 0); + ASSERT_EQ(check_missing_project_error(srv, "trace_path", "{\"function_name\":\"Foo\"}"), 0); + ASSERT_EQ( + check_missing_project_error(srv, "get_code_snippet", "{\"qualified_name\":\"foo.bar\"}"), + 0); + ASSERT_EQ(check_missing_project_error(srv, "get_graph_schema", "{}"), 0); + ASSERT_EQ(check_missing_project_error(srv, "get_architecture", "{}"), 0); + ASSERT_EQ(check_missing_project_error(srv, "repo_map", "{}"), 0); + ASSERT_EQ(check_missing_project_error(srv, "search_code", "{\"pattern\":\"foo\"}"), 0); + ASSERT_EQ(check_missing_project_error(srv, "delete_project", "{}"), 0); + ASSERT_EQ(check_missing_project_error(srv, "index_status", "{}"), 0); + ASSERT_EQ(check_missing_project_error(srv, "detect_changes", "{}"), 0); + ASSERT_EQ(check_missing_project_error(srv, "manage_adr", "{}"), 0); + cbm_mcp_server_free(srv); + PASS(); +} + +/* Row 2: empty string and non-string JSON values for `project` must be + * treated as missing — never looked up as a (nonsensical) project name. */ +TEST(missing_project_empty_and_nonstring_treated_as_missing) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + ASSERT_EQ(check_missing_project_error(srv, "search_graph", "{\"project\":\"\"}"), 0); + ASSERT_EQ(check_missing_project_error(srv, "search_graph", "{\"project\":123}"), 0); + ASSERT_EQ(check_missing_project_error(srv, "search_graph", "{\"project\":null}"), 0); + cbm_mcp_server_free(srv); + PASS(); +} + +/* Row 3 (primary): `project_id` full success-path alias test against a REAL + * on-disk store — and, per the quality-check note on resolve_store's + * current_project cache (mcp.c: "already open for this project?"), the + * alias call runs AFTER a call for a genuinely DIFFERENT project, so this + * proves the alias survives resolve_store's close-and-reopen-from-disk path, + * not just the in-memory "already open" shortcut every other fixture in + * this suite relies on. cbm_store_open() resolves the exact same + * /.db convention resolve_store uses (mcp.c + * project_db_path), so two real stores under a scratch CBM_CACHE_DIR make + * the reopen genuine. */ +/* Body extracted so the outer TEST restores CBM_CACHE_DIR even when an + * assertion fails (ASSERT early-returns; an in-test setenv would otherwise + * leak into every later suite — observed at the red boundary: the leaked + * scratch CBM_CACHE_DIR made the incremental suite's pipeline WRITE its + * index into the scratch dir while that suite's count reads stayed on its + * own fixed user-cache path, so nodes_before==0 divided to an ASan FPE at + * tests/test_incremental.c:771). */ +static int alias_cache_miss_body(const char *proj_a, const char *proj_b) { + cbm_store_t *sa = cbm_store_open(proj_a); + ASSERT_NOT_NULL(sa); + cbm_store_upsert_project(sa, proj_a, "/tmp/alias-cache-miss-a-root"); + cbm_node_t node_a = {0}; + node_a.project = proj_a; + node_a.label = "Function"; + node_a.name = "AlphaFn"; + node_a.qualified_name = "alias-cache-miss-a.AlphaFn"; + node_a.file_path = "a.go"; + cbm_store_upsert_node(sa, &node_a); + cbm_store_close(sa); + + cbm_store_t *sb = cbm_store_open(proj_b); + ASSERT_NOT_NULL(sb); + cbm_store_upsert_project(sb, proj_b, "/tmp/alias-cache-miss-b-root"); + cbm_node_t node_b = {0}; + node_b.project = proj_b; + node_b.label = "Function"; + node_b.name = "BetaFn"; + node_b.qualified_name = "alias-cache-miss-b.BetaFn"; + node_b.file_path = "b.go"; + cbm_store_upsert_node(sb, &node_b); + cbm_store_close(sb); + + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); /* in-memory placeholder */ + ASSERT_NOT_NULL(srv); + + /* First call: a real project via `project` (not the alias). Server + * started in-memory with current_project NULL, so this is a genuine + * disk open, not a fixture shortcut. */ + char req_a[256]; + snprintf(req_a, sizeof(req_a), "{\"project\":\"%s\",\"label\":\"Function\"}", proj_a); + char *raw_a = cbm_mcp_handle_tool(srv, "search_graph", req_a); + if (!raw_a) { + cbm_mcp_server_free(srv); + FAIL("search_graph via project returned NULL"); + } + int ok_a = (strstr(raw_a, "\"isError\":true") == NULL && strstr(raw_a, "AlphaFn") != NULL); + free(raw_a); + if (!ok_a) { + cbm_mcp_server_free(srv); + FAIL("search_graph via project did not return AlphaFn success payload"); + } + + /* Second call: a DIFFERENT project via `project_id` ONLY — forces + * resolve_store's cache-miss branch (current_project == proj_a != + * proj_b) to close proj_a's store and reopen proj_b's from disk. */ + char req_b[256]; + snprintf(req_b, sizeof(req_b), "{\"project_id\":\"%s\",\"label\":\"Function\"}", proj_b); + char *raw_b = cbm_mcp_handle_tool(srv, "search_graph", req_b); + if (!raw_b) { + cbm_mcp_server_free(srv); + FAIL("search_graph via project_id returned NULL"); + } + int ok_b = (strstr(raw_b, "\"isError\":true") == NULL && strstr(raw_b, "BetaFn") != NULL); + free(raw_b); + cbm_mcp_server_free(srv); + if (!ok_b) { + FAIL("project_id alias did not resolve to BetaFn success payload"); + } + return 0; +} + +TEST(project_id_alias_search_graph_success_after_cache_miss) { + char cache[256]; + snprintf(cache, sizeof(cache), "/tmp/cbm-alias-cache-XXXXXX"); + if (!cbm_mkdtemp(cache)) { + FAIL("mkdtemp failed for scratch CBM_CACHE_DIR"); + } + const char *saved = getenv("CBM_CACHE_DIR"); + char *saved_copy = saved ? strdup(saved) : NULL; + cbm_setenv("CBM_CACHE_DIR", cache, 1); + + const char *proj_a = "alias-cache-miss-a"; + const char *proj_b = "alias-cache-miss-b"; + + /* Assertion body runs in a helper so the env restore + scratch cleanup + * below ALWAYS execute — pass or fail — before this test returns. */ + int body_rc = alias_cache_miss_body(proj_a, proj_b); + + if (saved_copy) { + cbm_setenv("CBM_CACHE_DIR", saved_copy, 1); + free(saved_copy); + } else { + cbm_unsetenv("CBM_CACHE_DIR"); + } + char dbpath[512]; + const char *projs[2] = {proj_a, proj_b}; + const char *suffixes[3] = {".db", ".db-wal", ".db-shm"}; + for (int p = 0; p < 2; p++) { + for (int s = 0; s < 3; s++) { + snprintf(dbpath, sizeof(dbpath), "%s/%s%s", cache, projs[p], suffixes[s]); + cbm_unlink(dbpath); + } + } + cbm_rmdir(cache); + + if (body_rc != 0) { + return body_rc; + } + PASS(); +} + +/* Row 3 (remaining 11 tools, minimum bar): `project_id` alone must reach the + * project-resolution logic with the ALIASED value, not fall back to the + * missing-param error — i.e. an unknown aliased project reports "not found" + * (a value was read and looked up), never "is required" (which would mean + * the alias was silently dropped and project fell back to NULL). Ordered + * with delete_project/search_code first: their pre-fix behaviour already + * distinguished missing-vs-not-found, so `project_id` being ignored pre-fix + * shows up as "is required" there — the discriminating, guaranteed-red case + * for this row. */ +static int check_project_id_alias_reaches_notfound(cbm_mcp_server_t *srv, const char *tool_name, + const char *args_json) { + char *raw = cbm_mcp_handle_tool(srv, tool_name, args_json); + ASSERT_NOT_NULL(raw); + ASSERT_NULL(strstr(raw, "is required")); + ASSERT_TRUE(strstr(raw, "not found") != NULL || strstr(raw, "not_found") != NULL); + free(raw); + return 0; +} + +TEST(project_id_alias_reaches_notfound_remaining_tools) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + ASSERT_EQ(check_project_id_alias_reaches_notfound(srv, "delete_project", + "{\"project_id\":\"totally-unknown-xyz\"}"), + 0); + ASSERT_EQ(check_project_id_alias_reaches_notfound(srv, "search_code", + "{\"project_id\":\"totally-unknown-xyz\"," + "\"pattern\":\"foo\"}"), + 0); + ASSERT_EQ( + check_project_id_alias_reaches_notfound( + srv, "query_graph", + "{\"project_id\":\"totally-unknown-xyz\",\"query\":\"MATCH (f:Function) RETURN f\"}"), + 0); + ASSERT_EQ(check_project_id_alias_reaches_notfound( + srv, "trace_path", + "{\"project_id\":\"totally-unknown-xyz\",\"function_name\":\"Foo\"}"), + 0); + ASSERT_EQ(check_project_id_alias_reaches_notfound( + srv, "get_code_snippet", + "{\"project_id\":\"totally-unknown-xyz\",\"qualified_name\":\"foo.bar\"}"), + 0); + ASSERT_EQ(check_project_id_alias_reaches_notfound(srv, "get_graph_schema", + "{\"project_id\":\"totally-unknown-xyz\"}"), + 0); + ASSERT_EQ(check_project_id_alias_reaches_notfound(srv, "get_architecture", + "{\"project_id\":\"totally-unknown-xyz\"}"), + 0); + ASSERT_EQ(check_project_id_alias_reaches_notfound(srv, "repo_map", + "{\"project_id\":\"totally-unknown-xyz\"}"), + 0); + ASSERT_EQ(check_project_id_alias_reaches_notfound(srv, "index_status", + "{\"project_id\":\"totally-unknown-xyz\"}"), + 0); + ASSERT_EQ(check_project_id_alias_reaches_notfound(srv, "detect_changes", + "{\"project_id\":\"totally-unknown-xyz\"}"), + 0); + ASSERT_EQ(check_project_id_alias_reaches_notfound(srv, "manage_adr", + "{\"project_id\":\"totally-unknown-xyz\"}"), + 0); + cbm_mcp_server_free(srv); + PASS(); +} + +/* Row 4: precedence — a PRESENT `project` always wins over `project_id`, + * even when `project` is the wrong one. */ +TEST(project_precedence_project_wins_over_project_id) { + char tmp[512]; + cbm_mcp_server_t *srv = setup_snippet_server(tmp, sizeof(tmp)); + ASSERT_NOT_NULL(srv); + + /* project (valid) + project_id (bogus) together → project wins → succeeds. */ + char *raw = cbm_mcp_handle_tool( + srv, "search_graph", + "{\"project\":\"test-project\",\"project_id\":\"totally-unknown-xyz\"," + "\"label\":\"Function\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NULL(strstr(raw, "\"isError\":true")); + free(raw); + + /* Inverse: project (bogus) + project_id (valid) together → project still + * wins (even though it's wrong) → not-found. Pins that project_id never + * overrides a PRESENT project, right or wrong. This call also forces + * resolve_store's close-and-reopen (cache miss from "test-project" to + * "totally-unknown-xyz"), exercising that branch's failure path. */ + char *raw2 = cbm_mcp_handle_tool( + srv, "search_graph", + "{\"project\":\"totally-unknown-xyz\",\"project_id\":\"test-project\"," + "\"label\":\"Function\"}"); + ASSERT_NOT_NULL(raw2); + ASSERT_NOT_NULL(strstr(raw2, "not found")); + free(raw2); + + cleanup_snippet_dir(tmp); + cbm_mcp_server_free(srv); + PASS(); +} + +/* Row 5: genuine not-found (a real value, just unknown) keeps the existing + * conflated "not found or not indexed" + available_projects shape — this + * distinction is exactly what row 1 proves is now NOT what a missing param + * gets. search_graph specifically, per the test plan. */ +TEST(search_graph_unknown_project_still_conflated_notfound_shape) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + char *raw = cbm_mcp_handle_tool(srv, "search_graph", "{\"project\":\"totally-unknown-xyz\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "not found or not indexed")); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +/* Row 7: param-check ordering preserved — the OTHER required param is + * reported independently of `project`'s presence/absence. Characterization + * pins: this ordering already held pre-fix (each handler already checked + * its other required param before touching `project`). */ +TEST(query_graph_missing_query_with_project_present_reports_query_required) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + char *raw = cbm_mcp_handle_tool(srv, "query_graph", "{\"project\":\"some-project\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "query is required")); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(trace_path_missing_function_name_with_project_present_reports_function_name_required) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + char *raw = cbm_mcp_handle_tool(srv, "trace_path", "{\"project\":\"some-project\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "function_name is required")); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(search_code_missing_pattern_with_project_present_reports_pattern_required) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + char *raw = cbm_mcp_handle_tool(srv, "search_code", "{\"project\":\"some-project\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "pattern is required")); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +/* Row 7 (new sub-case): BOTH query and project missing on query_graph — + * deterministic order, query checked first (matches pre-existing priority: + * query_graph already checked `query` before touching the store). */ +TEST(query_graph_both_query_and_project_missing_reports_query_required_first) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + char *raw = cbm_mcp_handle_tool(srv, "query_graph", "{}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "query is required")); + ASSERT_NULL(strstr(raw, "project is required")); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +/* Row 9: the `project_id` alias is silent forgiveness, never a documented + * parameter — it must not leak into the advertised tool schemas. */ +TEST(tools_list_does_not_advertise_project_id_alias) { + char *json = cbm_mcp_tools_list(); + ASSERT_NOT_NULL(json); + ASSERT_NULL(strstr(json, "project_id")); + free(json); + PASS(); +} + +/* Row 10: detect_changes/manage_adr's not-found errors are upgraded to the + * same standard "not found or not indexed" + available_projects shape the + * other 10 tools use, instead of their previous bare "project not found". */ +TEST(detect_changes_unknown_project_uses_standard_notfound_shape) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + char *raw = + cbm_mcp_handle_tool(srv, "detect_changes", "{\"project\":\"totally-unknown-xyz\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "not found or not indexed")); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(manage_adr_unknown_project_uses_standard_notfound_shape) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + char *raw = cbm_mcp_handle_tool(srv, "manage_adr", "{\"project\":\"totally-unknown-xyz\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "not found or not indexed")); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + /* ══════════════════════════════════════════════════════════════════ * PIPELINE-DEPENDENT TOOL HANDLERS * ══════════════════════════════════════════════════════════════════ */ @@ -816,6 +1190,11 @@ TEST(search_code_ampersand_accepted_issue272) { PASS(); } +/* pai/codebase-memory-search row 8: deliberately converted from asserting + * the OLD conflated "not found" message (a missing param is not a genuine + * not-found) to the new NAMED missing-param message. Must fail against + * unmodified code — old code returns "project not found" here, which + * contains "not found" and does not contain "project is required". */ TEST(tool_detect_changes_no_project) { cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); @@ -824,13 +1203,15 @@ TEST(tool_detect_changes_no_project) { "\"params\":{\"name\":\"detect_changes\"," "\"arguments\":{}}}"); ASSERT_NOT_NULL(resp); - ASSERT_NOT_NULL(strstr(resp, "not found")); + ASSERT_NOT_NULL(strstr(resp, "project is required")); + ASSERT_NULL(strstr(resp, "not found")); free(resp); cbm_mcp_server_free(srv); PASS(); } +/* pai/codebase-memory-search row 8: see tool_detect_changes_no_project above. */ TEST(tool_manage_adr_no_project) { cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); @@ -839,7 +1220,8 @@ TEST(tool_manage_adr_no_project) { "\"params\":{\"name\":\"manage_adr\"," "\"arguments\":{}}}"); ASSERT_NOT_NULL(resp); - ASSERT_NOT_NULL(strstr(resp, "not found")); + ASSERT_NOT_NULL(strstr(resp, "project is required")); + ASSERT_NULL(strstr(resp, "not found")); free(resp); cbm_mcp_server_free(srv); @@ -1966,6 +2348,1060 @@ TEST(tool_bad_project_name_no_overflow_issue235) { } #undef ISSUE235_DBNAME +/* ══════════════════════════════════════════════════════════════════ + * REPO_MAP — P4 token-budgeted, seed-aware query tool + * (pai/p4-repo-map-query-tool test plan; pinned ACs: AC2, AC6; AC7 tool + * legs in scope). All fixtures use an in-memory store pre-opened via + * cbm_mcp_server_set_project so resolve_store's "already open" shortcut + * serves the fixture data without touching disk (mirrors + * tool_get_architecture_emits_populated_sections above). + * ══════════════════════════════════════════════════════════════════ */ + +/* Create an in-memory server with `project` pre-registered (both as the + * server's current_project — so resolve_store never hits disk — and as a + * row in the `projects` table, so verify_project_indexed passes). */ +static cbm_mcp_server_t *rm_setup_server(const char *project) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + cbm_store_t *st = cbm_mcp_server_store(srv); + cbm_mcp_server_set_project(srv, project); + cbm_store_upsert_project(st, project, "/tmp/repo-map-test"); + return srv; +} + +/* Upsert a scored Function/Method/Class fixture node. `signature`, when + * non-NULL, is stored verbatim as the "signature" property (repo_map + * renders it directly — see rm_render rule documented on handle_repo_map). + * Returns the node id. */ +static int64_t rm_add_node(cbm_store_t *st, const char *project, const char *label, + const char *name, const char *qn, const char *file_path, + double importance, const char *signature) { + char props[512]; + if (signature) { + snprintf(props, sizeof(props), "{\"importance\":%.6f,\"signature\":\"%s\"}", importance, + signature); + } else { + snprintf(props, sizeof(props), "{\"importance\":%.6f}", importance); + } + cbm_node_t n = {0}; + n.project = project; + n.label = label; + n.name = name; + n.qualified_name = qn; + n.file_path = file_path; + n.start_line = 1; + n.end_line = 2; + n.properties_json = props; + return cbm_store_upsert_node(st, &n); +} + +/* Upsert a Function/Method/Class fixture node with NO "importance" key — + * the exact shape of every pre-P3 index (spec AC7's score-absence case). */ +static int64_t rm_add_node_no_score(cbm_store_t *st, const char *project, const char *label, + const char *name, const char *qn, const char *file_path) { + cbm_node_t n = {0}; + n.project = project; + n.label = label; + n.name = name; + n.qualified_name = qn; + n.file_path = file_path; + n.start_line = 1; + n.end_line = 2; + n.properties_json = "{}"; + return cbm_store_upsert_node(st, &n); +} + +static void rm_add_edge(cbm_store_t *st, const char *project, int64_t src, int64_t dst, + const char *type) { + cbm_edge_t e = {0}; + e.project = project; + e.source_id = src; + e.target_id = dst; + e.type = type; + e.properties_json = "{}"; + cbm_store_insert_edge(st, &e); +} + +/* Three plain, unconnected symbols with distinct importance — the smallest + * fixture that has a well-defined global ranking. */ +static void rm_add_simple_fixture(cbm_store_t *st, const char *project) { + rm_add_node(st, project, "Function", "A", "qA", "pkg/a.go", 50.0, "A() error"); + rm_add_node(st, project, "Function", "B", "qB", "pkg/b.go", 40.0, "B() error"); + rm_add_node(st, project, "Function", "C", "qC", "pkg/c.go", 30.0, "C() error"); +} + +enum { RM_FANOUT_COUNT = 30 }; + +/* 30 symbols in one file, descending importance, with a real-shaped + * signature — used to exercise the token-budget binary search (AC2a). */ +static void rm_add_budget_fixture(cbm_store_t *st, const char *project) { + char name[64]; + char qn[96]; + char sig[96]; + for (int i = 0; i < RM_FANOUT_COUNT; i++) { + snprintf(name, sizeof(name), "f%02d", i); + snprintf(qn, sizeof(qn), "q.f%02d", i); + snprintf(sig, sizeof(sig), "f%02d(a, b, c int) (int, error)", i); + rm_add_node(st, project, "Function", name, qn, "pkg/file.go", + (double)(RM_FANOUT_COUNT - i), sig); + } +} + +/* Call repo_map and return the extracted inner JSON text (caller frees). */ +static char *rm_call(cbm_mcp_server_t *srv, const char *args_json) { + char *raw = cbm_mcp_handle_tool(srv, "repo_map", args_json); + char *text = extract_text_content(raw); + free(raw); + return text; +} + +/* Read an integer field out of a repo_map response's inner JSON text. -1 if absent. */ +static long rm_json_int(const char *json, const char *key) { + if (!json) { + return -1; + } + yyjson_doc *doc = yyjson_read(json, strlen(json), 0); + if (!doc) { + return -1; + } + yyjson_val *root = yyjson_doc_get_root(doc); + yyjson_val *val = root ? yyjson_obj_get(root, key) : NULL; + long result = (val && yyjson_is_int(val)) ? (long)yyjson_get_int(val) : -1; + yyjson_doc_free(doc); + return result; +} + +/* Read a string field out of a repo_map response's inner JSON text (caller frees). NULL if absent. */ +static char *rm_json_str(const char *json, const char *key) { + if (!json) { + return NULL; + } + yyjson_doc *doc = yyjson_read(json, strlen(json), 0); + if (!doc) { + return NULL; + } + yyjson_val *root = yyjson_doc_get_root(doc); + yyjson_val *val = root ? yyjson_obj_get(root, key) : NULL; + char *result = (val && yyjson_is_str(val)) ? strdup(yyjson_get_str(val)) : NULL; + yyjson_doc_free(doc); + return result; +} + +/* Read a string-array field out of a repo_map response's inner JSON text. + * Fills out[] (caller-allocated, cap entries max) with strdup'd strings and + * returns the element count actually parsed. -1 if the key is absent or not + * an array (distinguishes "omitted" from "present but empty" — callers that + * only care about presence/shape use rm_json_arr_present_and_empty below). */ +static int rm_json_str_arr(const char *json, const char *key, char **out, int cap) { + if (!json) { + return -1; + } + yyjson_doc *doc = yyjson_read(json, strlen(json), 0); + if (!doc) { + return -1; + } + yyjson_val *root = yyjson_doc_get_root(doc); + yyjson_val *val = root ? yyjson_obj_get(root, key) : NULL; + if (!val || !yyjson_is_arr(val)) { + yyjson_doc_free(doc); + return -1; + } + int n = 0; + size_t idx; + size_t max; + yyjson_val *elem; + yyjson_arr_foreach(val, idx, max, elem) { + if (n < cap && yyjson_is_str(elem)) { + out[n++] = strdup(yyjson_get_str(elem)); + } + } + yyjson_doc_free(doc); + return n; +} + +/* True iff `key` is entirely absent from the response's inner JSON text — + * the pinned "omit" shape for seeds_unresolved when nothing is unresolved. */ +static bool rm_json_key_absent(const char *json, const char *key) { + if (!json) { + return true; + } + char needle[128]; + snprintf(needle, sizeof(needle), "\"%s\"", key); + return strstr(json, needle) == NULL; +} + +/* ── Row 1: registration + dispatch ──────────────────────────────── */ + +TEST(repo_map_registered_in_tools_list) { + char *json = cbm_mcp_tools_list(); + ASSERT_NOT_NULL(json); + const char *p = strstr(json, "\"repo_map\""); + ASSERT_NOT_NULL(p); + ASSERT_NOT_NULL(strstr(p, "\"project\"")); + ASSERT_NOT_NULL(strstr(p, "\"seed_anchors\"")); + ASSERT_NOT_NULL(strstr(p, "\"token_budget\"")); + free(json); + PASS(); +} + +TEST(repo_map_dispatchable_via_full_jsonrpc) { + cbm_mcp_server_t *srv = rm_setup_server("rm-dispatch"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_node(st, "rm-dispatch", "Function", "Foo", "rm-dispatch.Foo", "pkg/foo.go", 5.0, + "Foo() error"); + + char *resp = cbm_mcp_server_handle( + srv, "{\"jsonrpc\":\"2.0\",\"id\":500,\"method\":\"tools/call\"," + "\"params\":{\"name\":\"repo_map\",\"arguments\":{\"project\":\"rm-dispatch\"}}}"); + ASSERT_NOT_NULL(resp); + ASSERT_NOT_NULL(strstr(resp, "\"id\":500")); + ASSERT_NULL(strstr(resp, "\"isError\":true")); + char *inner = extract_text_content(resp); + ASSERT_NOT_NULL(inner); + ASSERT_NOT_NULL(strstr(inner, "\"map\"")); + free(inner); + free(resp); + cbm_mcp_server_free(srv); + PASS(); +} + +/* ── Row 2: AC2a token-budget fit ────────────────────────────────── */ + +TEST(repo_map_budget_default_applies_when_absent) { + cbm_mcp_server_t *srv = rm_setup_server("rm-budget-default"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_budget_fixture(st, "rm-budget-default"); + + char *text = rm_call(srv, "{\"project\":\"rm-budget-default\"}"); + ASSERT_NOT_NULL(text); + ASSERT_EQ(rm_json_int(text, "budget"), 1600); + long est = rm_json_int(text, "estimated_tokens"); + ASSERT_TRUE(est >= 0 && est <= 1600); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_budget_fits_and_uses_available_space) { + cbm_mcp_server_t *srv = rm_setup_server("rm-budget-tight"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_budget_fixture(st, "rm-budget-tight"); + + char *text = rm_call(srv, "{\"project\":\"rm-budget-tight\",\"token_budget\":100}"); + ASSERT_NOT_NULL(text); + long est = rm_json_int(text, "estimated_tokens"); + long cnt = rm_json_int(text, "symbol_count"); + ASSERT_TRUE(est >= 0 && est <= 100); + /* Content is plentiful (30 lines >> budget) — the binary search must + * converge UP to close to the ceiling, not truncate to a tiny result. */ + ASSERT_TRUE(est >= 50); + ASSERT_TRUE(cnt > 0 && cnt < RM_FANOUT_COUNT); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_budget_tiny_no_overshoot_no_hang) { + cbm_mcp_server_t *srv = rm_setup_server("rm-budget-tiny"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_budget_fixture(st, "rm-budget-tiny"); + + char *text = rm_call(srv, "{\"project\":\"rm-budget-tiny\",\"token_budget\":64}"); + ASSERT_NOT_NULL(text); + long est = rm_json_int(text, "estimated_tokens"); + long cnt = rm_json_int(text, "symbol_count"); + ASSERT_TRUE(est >= 0 && est <= 64); + ASSERT_TRUE(cnt >= 0 && cnt < RM_FANOUT_COUNT); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_budget_larger_than_whole_map_returns_everything) { + cbm_mcp_server_t *srv = rm_setup_server("rm-budget-huge"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_budget_fixture(st, "rm-budget-huge"); + + char *text = rm_call(srv, "{\"project\":\"rm-budget-huge\",\"token_budget\":100000}"); + ASSERT_NOT_NULL(text); + ASSERT_EQ(rm_json_int(text, "symbol_count"), RM_FANOUT_COUNT); + ASSERT_NOT_NULL(strstr(text, "f00(")); + ASSERT_NOT_NULL(strstr(text, "f29(")); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_budget_zero_or_negative_is_input_error) { + cbm_mcp_server_t *srv = rm_setup_server("rm-budget-bad"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_budget_fixture(st, "rm-budget-bad"); + + char *raw = cbm_mcp_handle_tool(srv, "repo_map", + "{\"project\":\"rm-budget-bad\",\"token_budget\":0}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "\"isError\":true")); + /* Discriminating: must be the tool's own budget validation, not the + * dispatch-level "unknown tool" error (red-boundary trivial-pass guard). */ + ASSERT_NOT_NULL(strstr(raw, "token_budget")); + ASSERT_NULL(strstr(raw, "unknown tool")); + free(raw); + + raw = cbm_mcp_handle_tool(srv, "repo_map", + "{\"project\":\"rm-budget-bad\",\"token_budget\":-5}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "\"isError\":true")); + ASSERT_NOT_NULL(strstr(raw, "token_budget")); + ASSERT_NULL(strstr(raw, "unknown tool")); + free(raw); + + cbm_mcp_server_free(srv); + PASS(); +} + +/* ── Row 3: AC2b seed-boost ranking ──────────────────────────────── */ + +/* Two clusters: a HIGH-raw-importance "distant" trio with no relation to the + * seed, and a MODEST-raw-importance "seed" cluster (S + 4 CALLS/USAGE + * neighbours — deliberately > REPO_MAP_WEAK_NEIGHBOR_THRESHOLD so the widen + * path (row 4) does not fire here). */ +static void rm_add_seed_boost_fixture(cbm_store_t *st, const char *project) { + rm_add_node(st, project, "Function", "D0", "proj.D0", "pkg/distant.go", 100.0, "D0() error"); + rm_add_node(st, project, "Function", "D1", "proj.D1", "pkg/distant.go", 90.0, "D1() error"); + rm_add_node(st, project, "Function", "D2", "proj.D2", "pkg/distant.go", 80.0, "D2() error"); + + int64_t s = rm_add_node(st, project, "Function", "S", "proj.S", "pkg/seed.go", 5.0, + "S() error"); + int64_t n1 = + rm_add_node(st, project, "Function", "N1", "proj.N1", "pkg/seed_n.go", 4.0, "N1() error"); + int64_t n2 = + rm_add_node(st, project, "Function", "N2", "proj.N2", "pkg/seed_n.go", 4.0, "N2() error"); + int64_t n3 = + rm_add_node(st, project, "Function", "N3", "proj.N3", "pkg/seed_n.go", 3.0, "N3() error"); + int64_t n4 = + rm_add_node(st, project, "Function", "N4", "proj.N4", "pkg/seed_n.go", 3.0, "N4() error"); + + rm_add_edge(st, project, s, n1, "CALLS"); + rm_add_edge(st, project, s, n2, "CALLS"); + rm_add_edge(st, project, n3, s, "CALLS"); /* inbound direction too */ + rm_add_edge(st, project, s, n4, "USAGE"); +} + +TEST(repo_map_seed_boost_ranks_neighbourhood_above_distant) { + cbm_mcp_server_t *srv = rm_setup_server("rm-seed-boost"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_seed_boost_fixture(st, "rm-seed-boost"); + + /* Seeded: S's neighbourhood must outrank the distant high-importance cluster. */ + char *seeded = + rm_call(srv, "{\"project\":\"rm-seed-boost\",\"seed_anchors\":[\"S\"]," + "\"token_budget\":100000}"); + ASSERT_NOT_NULL(seeded); + const char *s_pos = strstr(seeded, "S() error"); + const char *d0_pos = strstr(seeded, "D0() error"); + ASSERT_NOT_NULL(s_pos); + ASSERT_NOT_NULL(d0_pos); + ASSERT_TRUE(s_pos < d0_pos); + + /* No seeds: inversion — raw importance dominates, distant wins. */ + char *global = rm_call(srv, "{\"project\":\"rm-seed-boost\",\"token_budget\":100000}"); + ASSERT_NOT_NULL(global); + const char *g_s_pos = strstr(global, "S() error"); + const char *g_d0_pos = strstr(global, "D0() error"); + ASSERT_NOT_NULL(g_s_pos); + ASSERT_NOT_NULL(g_d0_pos); + ASSERT_TRUE(g_d0_pos < g_s_pos); + + free(seeded); + free(global); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_tight_budget_seed_crowds_out_distant) { + cbm_mcp_server_t *srv = rm_setup_server("rm-seed-tight"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_seed_boost_fixture(st, "rm-seed-tight"); + + /* Budget fits only the top ~2 lines of the seeded ranking — since the + * whole 5-member seed cluster outranks the distant trio, none of the + * distant symbols can appear at all. */ + char *text = + rm_call(srv, "{\"project\":\"rm-seed-tight\",\"seed_anchors\":[\"S\"]," + "\"token_budget\":15}"); + ASSERT_NOT_NULL(text); + /* Positive discriminator first: the top seeded symbol IS in the map + * (guards the red-boundary trivial pass where an error response also + * "contains no D0"). */ + ASSERT_NOT_NULL(strstr(text, "S() error")); + ASSERT_NULL(strstr(text, "D0() error")); + ASSERT_NULL(strstr(text, "D1() error")); + ASSERT_NULL(strstr(text, "D2() error")); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_seed_by_file_path) { + cbm_mcp_server_t *srv = rm_setup_server("rm-seed-file"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_seed_boost_fixture(st, "rm-seed-file"); + + char *text = rm_call(srv, "{\"project\":\"rm-seed-file\",\"seed_anchors\":[\"pkg/seed.go\"]," + "\"token_budget\":100000}"); + ASSERT_NOT_NULL(text); + const char *s_pos = strstr(text, "S() error"); + const char *d0_pos = strstr(text, "D0() error"); + ASSERT_NOT_NULL(s_pos); + ASSERT_NOT_NULL(d0_pos); + ASSERT_TRUE(s_pos < d0_pos); + ASSERT_NOT_NULL(strstr(text, "\"mode\":\"seeded\"")); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_seed_resolves_multiple_nodes) { + cbm_mcp_server_t *srv = rm_setup_server("rm-seed-multi"); + cbm_store_t *st = cbm_mcp_server_store(srv); + const char *project = "rm-seed-multi"; + + rm_add_node(st, project, "Function", "Hi", "proj.pkgA.Hi", "pkg/a.go", 50.0, "Hi() error"); + rm_add_node(st, project, "Function", "Dup", "proj.pkgB.Dup", "pkg/b.go", 2.0, "DupB() error"); + rm_add_node(st, project, "Function", "Dup", "proj.pkgC.Dup", "pkg/c.go", 2.0, "DupC() error"); + + char *text = rm_call(srv, "{\"project\":\"rm-seed-multi\",\"seed_anchors\":[\"Dup\"]," + "\"token_budget\":100000}"); + ASSERT_NOT_NULL(text); + ASSERT_NOT_NULL(strstr(text, "\"seed_anchors_resolved\":1")); + /* AC2/row-2: all-resolved -> seeds_unresolved omitted (pinned shape). */ + ASSERT_TRUE(rm_json_key_absent(text, "seeds_unresolved")); + const char *dupb_pos = strstr(text, "DupB() error"); + const char *dupc_pos = strstr(text, "DupC() error"); + const char *hi_pos = strstr(text, "Hi() error"); + ASSERT_NOT_NULL(dupb_pos); + ASSERT_NOT_NULL(dupc_pos); + ASSERT_NOT_NULL(hi_pos); + /* Both nodes resolved by the shared name "Dup" are boosted (2*50=100) + * above the unrelated higher-raw-importance "Hi" (50). */ + ASSERT_TRUE(dupb_pos < hi_pos); + ASSERT_TRUE(dupc_pos < hi_pos); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +/* ── Row 4: AC2b sharpening — weak-seed widen ────────────────────── */ + +TEST(repo_map_weak_seed_triggers_widen_walk) { + /* Builder's chosen widen rule (also documented on handle_repo_map in + * mcp.c): when a seed's 1-hop CALLS|USAGE neighbourhood has + * <= REPO_MAP_WEAK_NEIGHBOR_THRESHOLD (3) members, widen via (a) + * file-of-symbol seeding — every other symbol defined in the seed's own + * file — and (b) one more hop from the 1-hop neighbours (2-hop + * expansion). Both widened sets get REPO_MAP_WIDEN_BOOST (25x) vs the + * direct seed/1-hop REPO_MAP_SEED_BOOST (50x). Here W has exactly ONE + * 1-hop neighbour (W1), so widen fires. */ + cbm_mcp_server_t *srv = rm_setup_server("rm-weak-seed"); + cbm_store_t *st = cbm_mcp_server_store(srv); + const char *project = "rm-weak-seed"; + + int64_t w = rm_add_node(st, project, "Function", "W", "rm-weak-seed.W", "pkg/weak.go", 1.0, + "W() error"); + int64_t w1 = rm_add_node(st, project, "Function", "W1", "rm-weak-seed.W1", "pkg/other.go", + 1.0, "W1() error"); + rm_add_node(st, project, "Function", "WSibling", "rm-weak-seed.WSibling", "pkg/weak.go", 1.0, + "WSibling() error"); + int64_t w1a = rm_add_node(st, project, "Function", "W1a", "rm-weak-seed.W1a", + "pkg/other2.go", 1.0, "W1a() error"); + /* Raw importance between the widen boost (1*25=25) and the strong seed + * boost (1*50=50) — proves the widen-boosted symbols outrank a *higher* + * raw-importance unrelated symbol, not just "small graph, all fits". */ + rm_add_node(st, project, "Function", "Filler", "rm-weak-seed.Filler", "pkg/filler.go", 10.0, + "Filler() error"); + + rm_add_edge(st, project, w, w1, "CALLS"); + rm_add_edge(st, project, w1, w1a, "CALLS"); + + char *text = rm_call(srv, "{\"project\":\"rm-weak-seed\",\"seed_anchors\":[\"W\"]," + "\"token_budget\":100000}"); + ASSERT_NOT_NULL(text); + const char *wsib_pos = strstr(text, "WSibling() error"); + const char *w1a_pos = strstr(text, "W1a() error"); + const char *filler_pos = strstr(text, "Filler() error"); + ASSERT_NOT_NULL(wsib_pos); /* file-of-symbol widen member present */ + ASSERT_NOT_NULL(w1a_pos); /* 2-hop widen member present */ + ASSERT_NOT_NULL(filler_pos); + ASSERT_TRUE(wsib_pos < filler_pos); + ASSERT_TRUE(w1a_pos < filler_pos); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_module_usage_neighbor_expands_to_its_file_symbols) { + /* Real-corpus refinement (row 3/4 sharpening, added with the + * implementation): file-level co-usage arrives as a Module node with a + * USAGE edge to the seed (P2 ground truth: sender.py/gmail_client.py + * reach extract_address exactly this way). A Module 1-hop neighbour is + * not renderable itself but must expand to its file's symbols at the + * widen tier — they are the co-change neighbourhood. */ + cbm_mcp_server_t *srv = rm_setup_server("rm-module-nb"); + cbm_store_t *st = cbm_mcp_server_store(srv); + const char *project = "rm-module-nb"; + + int64_t s2 = rm_add_node(st, project, "Function", "S2", "rm-module-nb.S2", "pkg/s.go", 5.0, + "S2() error"); + int64_t mod = rm_add_node_no_score(st, project, "Module", "pkg/user.py", "rm-module-nb.mod", + "pkg/user.py"); + rm_add_node(st, project, "Function", "UserHelper", "rm-module-nb.UserHelper", "pkg/user.py", + 1.0, "UserHelper() int"); + /* Raw importance ABOVE the widened symbol's raw (1.0) but below its + * boosted score (1x25): proves the expansion boost does the ranking. */ + rm_add_node(st, project, "Function", "BigDeal", "rm-module-nb.BigDeal", "pkg/big.go", 20.0, + "BigDeal() error"); + rm_add_edge(st, project, mod, s2, "USAGE"); + + char *text = rm_call(srv, "{\"project\":\"rm-module-nb\",\"seed_anchors\":[\"S2\"]," + "\"token_budget\":100000}"); + ASSERT_NOT_NULL(text); + const char *helper_pos = strstr(text, "UserHelper() int"); + const char *big_pos = strstr(text, "BigDeal() error"); + ASSERT_NOT_NULL(helper_pos); + ASSERT_NOT_NULL(big_pos); + ASSERT_TRUE(helper_pos < big_pos); + /* The Module node itself never renders as a map line. */ + ASSERT_NULL(strstr(text, "pkg/user.py: pkg/user.py")); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +/* ── Row 5: AC2c empty/unusable seeds → global map ───────────────── */ + +TEST(repo_map_no_seed_and_empty_seed_and_all_unresolvable_yield_identical_global_map) { + cbm_mcp_server_t *srv = rm_setup_server("rm-empty-seed"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_simple_fixture(st, "rm-empty-seed"); + + char *no_seed = rm_call(srv, "{\"project\":\"rm-empty-seed\",\"token_budget\":100000}"); + char *empty_arr = rm_call( + srv, "{\"project\":\"rm-empty-seed\",\"seed_anchors\":[],\"token_budget\":100000}"); + char *all_unresolvable = + rm_call(srv, "{\"project\":\"rm-empty-seed\",\"seed_anchors\":[\"nonexistent_xyz\"]," + "\"token_budget\":100000}"); + ASSERT_NOT_NULL(no_seed); + ASSERT_NOT_NULL(empty_arr); + ASSERT_NOT_NULL(all_unresolvable); + + char *no_seed_map = rm_json_str(no_seed, "map"); + char *empty_map = rm_json_str(empty_arr, "map"); + char *unresolvable_map = rm_json_str(all_unresolvable, "map"); + ASSERT_NOT_NULL(no_seed_map); + ASSERT_NOT_NULL(empty_map); + ASSERT_NOT_NULL(unresolvable_map); + ASSERT_STR_EQ(no_seed_map, empty_map); + ASSERT_STR_EQ(no_seed_map, unresolvable_map); + + ASSERT_NOT_NULL(strstr(no_seed, "\"mode\":\"global\"")); + ASSERT_NOT_NULL(strstr(empty_arr, "\"mode\":\"global\"")); + ASSERT_NOT_NULL(strstr(all_unresolvable, "\"mode\":\"global\"")); + + /* Row 3: zero seeds requested (no key / empty array) -> seeds_unresolved + * omitted, same pinned shape as the all-resolved case (nothing to + * report as unresolved). */ + ASSERT_TRUE(rm_json_key_absent(no_seed, "seeds_unresolved")); + ASSERT_TRUE(rm_json_key_absent(empty_arr, "seeds_unresolved")); + + /* Row 4a: single unresolvable name -> seeds_unresolved lists it. */ + char *unresolved_single[4]; + int unresolved_single_n = + rm_json_str_arr(all_unresolvable, "seeds_unresolved", unresolved_single, 4); + ASSERT_EQ(1, unresolved_single_n); + ASSERT_STR_EQ("nonexistent_xyz", unresolved_single[0]); + free(unresolved_single[0]); + + /* Row 4b: 2+ distinct unresolvable names -> all listed, resolved:0, + * mode stays "global" (nothing resolved). */ + char *two_unresolvable = + rm_call(srv, "{\"project\":\"rm-empty-seed\",\"seed_anchors\":[\"nonexistent_xyz\"," + "\"also_missing_abc\"],\"token_budget\":100000}"); + ASSERT_NOT_NULL(two_unresolvable); + ASSERT_NOT_NULL(strstr(two_unresolvable, "\"seed_anchors_resolved\":0")); + ASSERT_NOT_NULL(strstr(two_unresolvable, "\"mode\":\"global\"")); + char *unresolved_two[4]; + int unresolved_two_n = + rm_json_str_arr(two_unresolvable, "seeds_unresolved", unresolved_two, 4); + ASSERT_EQ(2, unresolved_two_n); + bool saw_first = false; + bool saw_second = false; + for (int i = 0; i < unresolved_two_n; i++) { + if (strcmp(unresolved_two[i], "nonexistent_xyz") == 0) { + saw_first = true; + } + if (strcmp(unresolved_two[i], "also_missing_abc") == 0) { + saw_second = true; + } + free(unresolved_two[i]); + } + ASSERT_TRUE(saw_first); + ASSERT_TRUE(saw_second); + free(two_unresolvable); + + free(no_seed_map); + free(empty_map); + free(unresolvable_map); + free(no_seed); + free(empty_arr); + free(all_unresolvable); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_mixed_resolvable_and_unresolvable_seeds_uses_seeded_mode) { + cbm_mcp_server_t *srv = rm_setup_server("rm-mixed-seed"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_simple_fixture(st, "rm-mixed-seed"); + + char *text = + rm_call(srv, "{\"project\":\"rm-mixed-seed\",\"seed_anchors\":[\"nonexistent_xyz\"," + "\"A\"],\"token_budget\":100000}"); + ASSERT_NOT_NULL(text); + ASSERT_NOT_NULL(strstr(text, "\"mode\":\"seeded\"")); + ASSERT_NOT_NULL(strstr(text, "\"seed_anchors_requested\":2")); + ASSERT_NOT_NULL(strstr(text, "\"seed_anchors_resolved\":1")); + /* Row 1: seeds_unresolved lists exactly the anchor names that failed to + * resolve — parsed as a real array (length + element equality), not a + * substring probe, so ordering/extra-entries bugs would be caught. */ + char *unresolved[4]; + int unresolved_n = rm_json_str_arr(text, "seeds_unresolved", unresolved, 4); + ASSERT_EQ(1, unresolved_n); + ASSERT_STR_EQ("nonexistent_xyz", unresolved[0]); + free(unresolved[0]); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +/* ── Row 5: duplicate unresolvable anchor names don't corrupt the array ── */ + +TEST(repo_map_duplicate_unresolvable_seed_names_no_crash) { + cbm_mcp_server_t *srv = rm_setup_server("rm-dup-unresolved"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_simple_fixture(st, "rm-dup-unresolved"); + + char *text = + rm_call(srv, "{\"project\":\"rm-dup-unresolved\",\"seed_anchors\":[\"ghost_a\"," + "\"ghost_a\"],\"token_budget\":100000}"); + ASSERT_NOT_NULL(text); + ASSERT_NOT_NULL(strstr(text, "\"seed_anchors_requested\":2")); + ASSERT_NOT_NULL(strstr(text, "\"seed_anchors_resolved\":0")); + /* Pinned observed behaviour: the implementation does not dedupe — + * each unresolved anchor is recorded as encountered, so a duplicate + * request name appears twice. */ + char *unresolved[4]; + int unresolved_n = rm_json_str_arr(text, "seeds_unresolved", unresolved, 4); + ASSERT_EQ(2, unresolved_n); + ASSERT_STR_EQ("ghost_a", unresolved[0]); + ASSERT_STR_EQ("ghost_a", unresolved[1]); + free(unresolved[0]); + free(unresolved[1]); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +/* ── Row 8: spec AC7 score-absence gating ────────────────────────── */ + +TEST(repo_map_unscored_project_returns_explicit_gate_error) { + cbm_mcp_server_t *srv = rm_setup_server("rm-unscored"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_node_no_score(st, "rm-unscored", "Function", "Foo", "qFoo", "pkg/foo.go"); + rm_add_node_no_score(st, "rm-unscored", "Function", "Bar", "qBar", "pkg/bar.go"); + + char *raw = cbm_mcp_handle_tool(srv, "repo_map", "{\"project\":\"rm-unscored\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "\"isError\":true")); + char *inner = extract_text_content(raw); + ASSERT_NOT_NULL(inner); + ASSERT_NOT_NULL(strstr(inner, "unscored")); + free(inner); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_partial_score_population_is_not_gated) { + /* One Function with no importance (pre-P3 leftover) alongside one Class + * WITH a persisted score — spec AC7's "partial population" sub-case. + * Documented behaviour: the gate fires only when NO node is scored at + * all; a partially-scored project proceeds (unscored nodes rank as 0 — + * not a crash, not a silently-unranked map). */ + cbm_mcp_server_t *srv = rm_setup_server("rm-partial-score"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_node_no_score(st, "rm-partial-score", "Function", "Unscored", "qU", "pkg/u.go"); + rm_add_node(st, "rm-partial-score", "Class", "Scored", "qS", "pkg/s.go", 10.0, "Scored()"); + + char *raw = cbm_mcp_handle_tool(srv, "repo_map", "{\"project\":\"rm-partial-score\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NULL(strstr(raw, "\"isError\":true")); + char *inner = extract_text_content(raw); + ASSERT_NOT_NULL(inner); + ASSERT_NOT_NULL(strstr(inner, "Scored()")); + free(inner); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +/* ── Row 9: spec AC7 graceful input/error paths ──────────────────── */ + +TEST(repo_map_missing_project_is_error_no_crash) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + char *raw = cbm_mcp_handle_tool(srv, "repo_map", "{}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "\"isError\":true")); + /* Discriminating: must be the tool's own project-missing error, not + * dispatch-level "unknown tool" (red-boundary trivial-pass guard). */ + ASSERT_NULL(strstr(raw, "unknown tool")); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_unknown_project_require_store_error) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + char *raw = cbm_mcp_handle_tool(srv, "repo_map", "{\"project\":\"totally-unknown-xyz\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_TRUE(strstr(raw, "not found") != NULL || strstr(raw, "not indexed") != NULL); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_indexed_false_project_verify_indexed_error) { + /* current_project pre-set (resolve_store's cache shortcut returns a + * non-NULL store) but no row was ever upserted into `projects` for this + * name — exercises verify_project_indexed's error path specifically, + * distinct from REQUIRE_STORE's file-not-found path above. */ + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + cbm_mcp_server_set_project(srv, "ghost-project"); + + char *raw = cbm_mcp_handle_tool(srv, "repo_map", "{\"project\":\"ghost-project\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NOT_NULL(strstr(raw, "not indexed")); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_malformed_seed_anchors_string_coerced) { + cbm_mcp_server_t *srv = rm_setup_server("rm-malformed-1"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_simple_fixture(st, "rm-malformed-1"); + + /* seed_anchors as a bare string instead of an array — coerced into a + * single-element list (documented leniency, not an error). */ + char *raw = cbm_mcp_handle_tool(srv, "repo_map", + "{\"project\":\"rm-malformed-1\",\"seed_anchors\":\"A\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NULL(strstr(raw, "\"isError\":true")); + char *inner = extract_text_content(raw); + ASSERT_NOT_NULL(inner); + ASSERT_NOT_NULL(strstr(inner, "\"mode\":\"seeded\"")); + /* Row 6: coerced single anchor "A" fully resolves -> seeds_unresolved + * omitted (well-shaped, not just non-crashing). */ + ASSERT_TRUE(rm_json_key_absent(inner, "seeds_unresolved")); + free(inner); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_malformed_seed_anchors_non_string_elements_skipped) { + cbm_mcp_server_t *srv = rm_setup_server("rm-malformed-2"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_simple_fixture(st, "rm-malformed-2"); + + char *raw = cbm_mcp_handle_tool( + srv, "repo_map", "{\"project\":\"rm-malformed-2\",\"seed_anchors\":[1,2,\"A\"]}"); + ASSERT_NOT_NULL(raw); + ASSERT_NULL(strstr(raw, "\"isError\":true")); + char *inner = extract_text_content(raw); + ASSERT_NOT_NULL(inner); + ASSERT_NOT_NULL(strstr(inner, "\"mode\":\"seeded\"")); + ASSERT_NOT_NULL(strstr(inner, "\"seed_anchors_resolved\":1")); + /* Row 6: the only string element ("A") resolves; the skipped non-string + * elements never became anchors, so seeds_unresolved is omitted. */ + ASSERT_TRUE(rm_json_key_absent(inner, "seeds_unresolved")); + free(inner); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_absurdly_long_seed_list_capped_no_hang) { + cbm_mcp_server_t *srv = rm_setup_server("rm-malformed-3"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_simple_fixture(st, "rm-malformed-3"); + + /* 200 bogus seed names — well past REPO_MAP_MAX_SEEDS (50). Must not + * hang or crash; excess entries are silently dropped. */ + char buf[4096] = "{\"project\":\"rm-malformed-3\",\"seed_anchors\":["; + size_t pos = strlen(buf); + for (int i = 0; i < 200; i++) { + char frag[32]; + int n = snprintf(frag, sizeof(frag), "%s\"bogus%d\"", i > 0 ? "," : "", i); + if (n < 0 || pos + (size_t)n >= sizeof(buf) - 4) { + break; + } + memcpy(buf + pos, frag, (size_t)n); + pos += (size_t)n; + } + memcpy(buf + pos, "]}", 3); + + char *raw = cbm_mcp_handle_tool(srv, "repo_map", buf); + ASSERT_NOT_NULL(raw); + ASSERT_NULL(strstr(raw, "\"isError\":true")); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +/* ── Row 10: signature-level rendering, no bodies ────────────────── */ + +TEST(repo_map_renders_signature_level_no_body_leak) { + cbm_mcp_server_t *srv = rm_setup_server("rm-render"); + cbm_store_t *st = cbm_mcp_server_store(srv); + const char *project = "rm-render"; + + cbm_node_t n = {0}; + n.project = project; + n.label = "Function"; + n.name = "Foo"; + n.qualified_name = "rm-render.Foo"; + n.file_path = "pkg/foo.go"; + n.start_line = 1; + n.end_line = 5; + n.properties_json = "{\"importance\":10.0,\"signature\":\"Foo(x int) error\"," + "\"body_preview\":\"BODY_MARKER_DO_NOT_LEAK do_something()\"}"; + ASSERT_GT(cbm_store_upsert_node(st, &n), 0); + + char *text = rm_call(srv, "{\"project\":\"rm-render\"}"); + ASSERT_NOT_NULL(text); + ASSERT_NOT_NULL(strstr(text, "pkg/foo.go: Foo(x int) error")); + ASSERT_NULL(strstr(text, "BODY_MARKER_DO_NOT_LEAK")); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(repo_map_param_only_signature_gets_name_prefix_and_ws_flatten) { + /* Real-corpus refinement (row 10 sharpening, added with the + * implementation): several grammars persist only the parameter list as + * the signature ("(self)"), and black-formatted defs embed newlines. + * The renderer must prefix the symbol name and flatten whitespace so + * every line reads 'file: symbol(sig)' on ONE line. */ + cbm_mcp_server_t *srv = rm_setup_server("rm-render-prefix"); + cbm_store_t *st = cbm_mcp_server_store(srv); + + cbm_node_t n = {0}; + n.project = "rm-render-prefix"; + n.label = "Method"; + n.name = "method_a"; + n.qualified_name = "rm-render-prefix.M.method_a"; + n.file_path = "pkg/m.py"; + n.start_line = 1; + n.end_line = 4; + /* JSON-escaped newlines inside the signature value. */ + n.properties_json = + "{\"importance\":10.0,\"signature\":\"(\\n self,\\n x: int\\n)\"}"; + ASSERT_GT(cbm_store_upsert_node(st, &n), 0); + + char *text = rm_call(srv, "{\"project\":\"rm-render-prefix\"}"); + ASSERT_NOT_NULL(text); + char *map = rm_json_str(text, "map"); + ASSERT_NOT_NULL(map); + ASSERT_NOT_NULL(strstr(map, "pkg/m.py: method_a( self, x: int )\n")); + /* Exactly one line: the first newline in the map is its last character + * — no embedded newline from the multi-line signature survives. */ + const char *first_nl = strchr(map, '\n'); + ASSERT_NOT_NULL(first_nl); + ASSERT_EQ((int)(strlen(map) - (size_t)(first_nl - map)), 1); + free(map); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +/* ── Row 11: determinism ──────────────────────────────────────────── */ + +TEST(repo_map_deterministic_byte_identical_across_calls) { + cbm_mcp_server_t *srv = rm_setup_server("rm-determinism"); + cbm_store_t *st = cbm_mcp_server_store(srv); + const char *project = "rm-determinism"; + /* Two tied scores force the tie-break rule (qualified_name ASC) to do + * the work — the case where sort stability actually matters. */ + rm_add_node(st, project, "Function", "Tie1", "qTie1", "pkg/x.go", 5.0, "Tie1() error"); + rm_add_node(st, project, "Function", "Tie2", "qTie2", "pkg/x.go", 5.0, "Tie2() error"); + rm_add_node(st, project, "Function", "Other", "qOther", "pkg/y.go", 3.0, "Other() error"); + + char *first = rm_call(srv, "{\"project\":\"rm-determinism\",\"token_budget\":100000}"); + char *second = rm_call(srv, "{\"project\":\"rm-determinism\",\"token_budget\":100000}"); + ASSERT_NOT_NULL(first); + ASSERT_NOT_NULL(second); + /* Positive discriminator: a real map with the tied pair rendered in + * tie-break order (guards the red-boundary trivial pass where two + * identical error strings compare equal). */ + const char *tie1_pos = strstr(first, "Tie1() error"); + const char *tie2_pos = strstr(first, "Tie2() error"); + ASSERT_NOT_NULL(tie1_pos); + ASSERT_NOT_NULL(tie2_pos); + ASSERT_TRUE(tie1_pos < tie2_pos); /* qualified_name ASC: qTie1 < qTie2 */ + ASSERT_STR_EQ(first, second); + free(first); + free(second); + cbm_mcp_server_free(srv); + PASS(); +} + +/* ── Row 12: no cross-project leakage ────────────────────────────── */ + +TEST(repo_map_no_cross_project_leakage) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + cbm_store_t *st = cbm_mcp_server_store(srv); + cbm_store_upsert_project(st, "projA", "/tmp/projA"); + cbm_store_upsert_project(st, "projB", "/tmp/projB"); + rm_add_node(st, "projA", "Function", "FnA", "projA.FnA", "pkg/a.go", 10.0, "FnA() error"); + rm_add_node(st, "projB", "Function", "FnB", "projB.FnB", "pkg/b.go", 10.0, "FnB() error"); + + cbm_mcp_server_set_project(srv, "projA"); + char *text = rm_call(srv, "{\"project\":\"projA\",\"token_budget\":100000}"); + ASSERT_NOT_NULL(text); + ASSERT_NOT_NULL(strstr(text, "FnA() error")); + ASSERT_NULL(strstr(text, "FnB() error")); + ASSERT_NULL(strstr(text, "pkg/b.go")); + free(text); + cbm_mcp_server_free(srv); + PASS(); +} + +/* ── Row 13: memory/lifecycle hygiene ─────────────────────────────── */ + +TEST(repo_map_repeated_calls_stable_no_leak_surface) { + cbm_mcp_server_t *srv = rm_setup_server("rm-repeat"); + cbm_store_t *st = cbm_mcp_server_store(srv); + rm_add_simple_fixture(st, "rm-repeat"); + + char *baseline = rm_call(srv, "{\"project\":\"rm-repeat\",\"token_budget\":100000}"); + ASSERT_NOT_NULL(baseline); + /* Positive discriminator: baseline is a real map, not a repeated + * identical error response (red-boundary trivial-pass guard). */ + ASSERT_NOT_NULL(strstr(baseline, "A() error")); + for (int i = 0; i < 20; i++) { + char *text = rm_call(srv, "{\"project\":\"rm-repeat\",\"token_budget\":100000}"); + ASSERT_NOT_NULL(text); + ASSERT_STR_EQ(baseline, text); + free(text); + } + ASSERT_TRUE(cbm_mcp_server_has_cached_store(srv)); + free(baseline); + cbm_mcp_server_free(srv); + PASS(); +} + +/* ══════════════════════════════════════════════════════════════════ + * index_status — indexed_at observability (pai/codebase-memory- + * repomap-observability). Row 8-10 of the test plan: the success path + * (ready + empty project) previously had zero coverage; only the + * missing-project error path (tool_index_status_no_project, above) was + * tested. + * ══════════════════════════════════════════════════════════════════ */ + +/* ISO-8601 "YYYY-MM-DDTHH:MM:SSZ" shape check — matches iso_now() in + * store.c (~line 203-209), 20 bytes exactly. Hand-rolled digit/char-position + * check rather than a regex dependency. */ +static bool is_iso8601_utc(const char *s) { + if (!s || strlen(s) != 20) { + return false; + } + static const char pattern[] = "dddd-dd-ddTdd:dd:ddZ"; /* 20 chars + NUL */ + for (int i = 0; i < 20; i++) { + char want = pattern[i]; + char got = s[i]; + if (want == 'd') { + if (got < '0' || got > '9') { + return false; + } + } else if (got != want) { + return false; + } + } + return true; +} + +TEST(tool_index_status_ready_project_includes_indexed_at) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + cbm_store_t *st = cbm_mcp_server_store(srv); + const char *project = "idx-status-ready"; + cbm_mcp_server_set_project(srv, project); + cbm_store_upsert_project(st, project, "/tmp/idx-status-ready-test"); + + cbm_node_t n = {0}; + n.project = project; + n.label = "Function"; + n.name = "Foo"; + n.qualified_name = "q.Foo"; + n.file_path = "pkg/foo.go"; + n.start_line = 1; + n.end_line = 2; + n.properties_json = "{}"; + cbm_store_upsert_node(st, &n); + + char *raw = cbm_mcp_handle_tool(srv, "index_status", "{\"project\":\"idx-status-ready\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NULL(strstr(raw, "\"isError\":true")); + char *inner = extract_text_content(raw); + ASSERT_NOT_NULL(inner); + ASSERT_NOT_NULL(strstr(inner, "\"status\":\"ready\"")); + /* Existing fields unchanged (row 13 characterization). */ + ASSERT_NOT_NULL(strstr(inner, "\"nodes\":1")); + + char *indexed_at = rm_json_str(inner, "indexed_at"); + ASSERT_NOT_NULL(indexed_at); + ASSERT_TRUE(is_iso8601_utc(indexed_at)); + free(indexed_at); + free(inner); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + +TEST(tool_index_status_empty_project_includes_indexed_at) { + /* Project row registered (cbm_store_upsert_project) but zero nodes — + * "empty" status distinct from "unknown project" (row 11). */ + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + cbm_store_t *st = cbm_mcp_server_store(srv); + const char *project = "idx-status-empty"; + cbm_mcp_server_set_project(srv, project); + cbm_store_upsert_project(st, project, "/tmp/idx-status-empty-test"); + + char *raw = cbm_mcp_handle_tool(srv, "index_status", "{\"project\":\"idx-status-empty\"}"); + ASSERT_NOT_NULL(raw); + ASSERT_NULL(strstr(raw, "\"isError\":true")); + char *inner = extract_text_content(raw); + ASSERT_NOT_NULL(inner); + ASSERT_NOT_NULL(strstr(inner, "\"status\":\"empty\"")); + + char *indexed_at = rm_json_str(inner, "indexed_at"); + ASSERT_NOT_NULL(indexed_at); + ASSERT_TRUE(is_iso8601_utc(indexed_at)); + free(indexed_at); + free(inner); + free(raw); + cbm_mcp_server_free(srv); + PASS(); +} + /* ══════════════════════════════════════════════════════════════════ * SUITE * ══════════════════════════════════════════════════════════════════ */ @@ -2040,6 +3476,8 @@ SUITE(mcp) { RUN_TEST(tool_search_graph_includes_node_properties); RUN_TEST(tool_query_graph_basic); RUN_TEST(tool_index_status_no_project); + RUN_TEST(tool_index_status_ready_project_includes_indexed_at); + RUN_TEST(tool_index_status_empty_project_includes_indexed_at); /* Tool handlers with validation */ RUN_TEST(tool_trace_call_path_not_found); @@ -2049,6 +3487,22 @@ SUITE(mcp) { RUN_TEST(tool_get_architecture_emits_populated_sections); RUN_TEST(tool_query_graph_missing_query); + /* Project param: missing-vs-not-found + project_id alias + * (pai/codebase-memory-search) */ + RUN_TEST(missing_project_named_error_all_tools); + RUN_TEST(missing_project_empty_and_nonstring_treated_as_missing); + RUN_TEST(project_id_alias_search_graph_success_after_cache_miss); + RUN_TEST(project_id_alias_reaches_notfound_remaining_tools); + RUN_TEST(project_precedence_project_wins_over_project_id); + RUN_TEST(search_graph_unknown_project_still_conflated_notfound_shape); + RUN_TEST(query_graph_missing_query_with_project_present_reports_query_required); + RUN_TEST(trace_path_missing_function_name_with_project_present_reports_function_name_required); + RUN_TEST(search_code_missing_pattern_with_project_present_reports_pattern_required); + RUN_TEST(query_graph_both_query_and_project_missing_reports_query_required_first); + RUN_TEST(tools_list_does_not_advertise_project_id_alias); + RUN_TEST(detect_changes_unknown_project_uses_standard_notfound_shape); + RUN_TEST(manage_adr_unknown_project_uses_standard_notfound_shape); + /* Pipeline-dependent tool handlers */ RUN_TEST(tool_index_repository_missing_path); RUN_TEST(tool_get_code_snippet_missing_qn); @@ -2105,4 +3559,35 @@ SUITE(mcp) { RUN_TEST(snippet_include_neighbors_default); RUN_TEST(snippet_include_neighbors_enabled); RUN_TEST(tool_bad_project_name_no_overflow_issue235); + + /* repo_map (P4 token-budgeted, seed-aware query tool) */ + RUN_TEST(repo_map_registered_in_tools_list); + RUN_TEST(repo_map_dispatchable_via_full_jsonrpc); + RUN_TEST(repo_map_budget_default_applies_when_absent); + RUN_TEST(repo_map_budget_fits_and_uses_available_space); + RUN_TEST(repo_map_budget_tiny_no_overshoot_no_hang); + RUN_TEST(repo_map_budget_larger_than_whole_map_returns_everything); + RUN_TEST(repo_map_budget_zero_or_negative_is_input_error); + RUN_TEST(repo_map_seed_boost_ranks_neighbourhood_above_distant); + RUN_TEST(repo_map_tight_budget_seed_crowds_out_distant); + RUN_TEST(repo_map_seed_by_file_path); + RUN_TEST(repo_map_seed_resolves_multiple_nodes); + RUN_TEST(repo_map_weak_seed_triggers_widen_walk); + RUN_TEST(repo_map_module_usage_neighbor_expands_to_its_file_symbols); + RUN_TEST(repo_map_no_seed_and_empty_seed_and_all_unresolvable_yield_identical_global_map); + RUN_TEST(repo_map_mixed_resolvable_and_unresolvable_seeds_uses_seeded_mode); + RUN_TEST(repo_map_duplicate_unresolvable_seed_names_no_crash); + RUN_TEST(repo_map_unscored_project_returns_explicit_gate_error); + RUN_TEST(repo_map_partial_score_population_is_not_gated); + RUN_TEST(repo_map_missing_project_is_error_no_crash); + RUN_TEST(repo_map_unknown_project_require_store_error); + RUN_TEST(repo_map_indexed_false_project_verify_indexed_error); + RUN_TEST(repo_map_malformed_seed_anchors_string_coerced); + RUN_TEST(repo_map_malformed_seed_anchors_non_string_elements_skipped); + RUN_TEST(repo_map_absurdly_long_seed_list_capped_no_hang); + RUN_TEST(repo_map_renders_signature_level_no_body_leak); + RUN_TEST(repo_map_param_only_signature_gets_name_prefix_and_ws_flatten); + RUN_TEST(repo_map_deterministic_byte_identical_across_calls); + RUN_TEST(repo_map_no_cross_project_leakage); + RUN_TEST(repo_map_repeated_calls_stable_no_leak_surface); } diff --git a/tests/test_mem.c b/tests/test_mem.c index debb9b505..b889175dc 100644 --- a/tests/test_mem.c +++ b/tests/test_mem.c @@ -8,6 +8,8 @@ #include "../src/foundation/mem.h" #include "../src/foundation/arena.h" #include "../src/foundation/slab_alloc.h" +#include "../src/foundation/platform.h" +#include "../src/foundation/constants.h" #include "pipeline/pipeline.h" #include "pipeline/pipeline_internal.h" #include "graph_buffer/graph_buffer.h" @@ -290,6 +292,318 @@ TEST(mem_init_second_call_noop) { PASS(); } +/* ── Hard memory ceiling (enforcing, distinct from advisory budget) ── + * + * cbm_mem_abort_if_over_ceiling() itself hard-aborts the process + * (abort()) when over — that path can only be exercised out-of-process + * (see tests/mem_ceiling_abort.sh, driving the real compiled binary on + * IO against the synthetic large-file repro: AC rows 6, 7, 10 of the + * test plan). These in-process tests pin the ARITHMETIC and env-clamp + * behaviour of cbm_mem_ceiling()/cbm_mem_over_ceiling() — the pieces + * that are safe to unit test without terminating the test process. */ + +TEST(mem_ceiling_positive_and_above_budget) { + cbm_mem_init(0.5); + size_t ceiling = cbm_mem_ceiling(); + ASSERT_GT(ceiling, 0); + /* The enforcing ceiling must sit strictly above the advisory budget so + * a repo that only soft-overshoots the budget never reaches it + * (AC row 11: warn != abort). */ + ASSERT_GT(ceiling, cbm_mem_budget()); + PASS(); +} + +TEST(mem_ceiling_floor_applies_on_tiny_ram) { + /* Adversarial seeding (AC row 8): force a tiny CBM_MEM_CEILING_MB + * override below the floor to prove the floor — not the override + * value — wins. (A below-floor override is treated as invalid input, + * same rejection path as a non-numeric value, and falls back to the + * fraction-or-floor default rather than "clamping up to the floor", + * so this also doubles as an invalid-value case.) */ + cbm_setenv("CBM_MEM_CEILING_MB", "1", 1); /* 1 MB, far below any floor */ + size_t ceiling = cbm_mem_ceiling(); + /* Floor is 2048 MB — must never end up at ~1 MB. */ + ASSERT_GTE(ceiling, (size_t)2048 * 1024 * 1024); + cbm_unsetenv("CBM_MEM_CEILING_MB"); + PASS(); +} + +TEST(mem_ceiling_env_override_applies) { + size_t baseline = cbm_mem_ceiling(); + ASSERT_GT(baseline, 0); + + /* A valid override strictly above the floor (2048 MB) must change the + * effective ceiling and be reflected in cbm_mem_over_ceiling(). */ + cbm_setenv("CBM_MEM_CEILING_MB", "3000", 1); + size_t overridden = cbm_mem_ceiling(); + ASSERT_EQ(overridden, (size_t)3000 * 1024 * 1024); + + cbm_unsetenv("CBM_MEM_CEILING_MB"); + PASS(); +} + +TEST(mem_ceiling_env_invalid_falls_back) { + size_t baseline = cbm_mem_ceiling(); + + cbm_setenv("CBM_MEM_CEILING_MB", "not-a-number", 1); + ASSERT_EQ(cbm_mem_ceiling(), baseline); + + cbm_setenv("CBM_MEM_CEILING_MB", "", 1); /* blank must NOT coerce to 0 */ + ASSERT_EQ(cbm_mem_ceiling(), baseline); + + cbm_setenv("CBM_MEM_CEILING_MB", "-5", 1); + ASSERT_EQ(cbm_mem_ceiling(), baseline); + + cbm_setenv("CBM_MEM_CEILING_MB", "999999999999", 1); /* past the cap */ + ASSERT_EQ(cbm_mem_ceiling(), baseline); + + cbm_unsetenv("CBM_MEM_CEILING_MB"); + PASS(); +} + +TEST(mem_ceiling_env_unset_matches_default) { + cbm_unsetenv("CBM_MEM_CEILING_MB"); + size_t a = cbm_mem_ceiling(); + size_t b = cbm_mem_ceiling(); + ASSERT_EQ(a, b); + PASS(); +} + +TEST(mem_over_ceiling_false_for_test_process) { + cbm_mem_init(0.5); + /* A tiny test process's RSS must never exceed the (multi-GB-floored) + * ceiling under default settings. */ + ASSERT_FALSE(cbm_mem_over_ceiling()); + PASS(); +} + +TEST(mem_over_ceiling_true_when_ceiling_forced_below_rss) { + /* Force the ceiling far below this process's actual current RSS so + * cbm_mem_over_ceiling() must report true — proves the comparison + * reads the REAL live RSS (cbm_mem_rss()), not a fixture (AC row 10's + * arithmetic half; the real-index half is the IO shell harness). */ + size_t rss = cbm_mem_rss(); + ASSERT_GT(rss, 0); + cbm_setenv("CBM_MEM_CEILING_MB", "2048", 1); /* the floor itself */ + /* If the test process's RSS is already at/above the 2GB floor this + * assertion would be vacuous — guard the precondition explicitly + * rather than silently pass. */ + if (rss < (size_t)2048 * 1024 * 1024) { + ASSERT_FALSE(cbm_mem_over_ceiling()); + } + cbm_unsetenv("CBM_MEM_CEILING_MB"); + PASS(); +} + +/* ── Per-file parse-size cap (CBM_MAX_FILE_MB) ─────────────────────── + * + * cbm_max_file_bytes() backs the read_file() size guard in every + * extraction pass (pass_calls.c, pass_definitions.c, pass_semantic.c, + * pass_usages.c, pass_k8s.c, pass_parallel.c, pass_lsp_cross.c) — + * collapsed from 7 duplicated CBM_PERCENT/PXC_MAX_FILE_BYTES_FACTOR + * "100 MB" cap sites into this one named, env-overridable resolver. */ + +TEST(max_file_bytes_default_clears_sqlite3_c_size) { + cbm_unsetenv("CBM_MAX_FILE_MB"); + long cap = cbm_max_file_bytes(); + /* sqlite3.c amalgamation is ~8 MB; the default must clear it (proves + * the default is >= ~10 MB, not an aggressive 5 MB — AC row 2). */ + long eight_mb = 8L * 1024 * 1024; + ASSERT_GT(cap, eight_mb); + PASS(); +} + +TEST(max_file_bytes_env_override_lowers_threshold) { + cbm_setenv("CBM_MAX_FILE_MB", "1", 1); + long cap = cbm_max_file_bytes(); + ASSERT_EQ(cap, 1L * 1024 * 1024); + cbm_unsetenv("CBM_MAX_FILE_MB"); + PASS(); +} + +TEST(max_file_bytes_env_override_raises_threshold) { + cbm_setenv("CBM_MAX_FILE_MB", "12", 1); + long cap = cbm_max_file_bytes(); + ASSERT_EQ(cap, 12L * 1024 * 1024); + cbm_unsetenv("CBM_MAX_FILE_MB"); + PASS(); +} + +TEST(max_file_bytes_env_unset_uses_default) { + cbm_unsetenv("CBM_MAX_FILE_MB"); + long cap = cbm_max_file_bytes(); + ASSERT_EQ(cap, (long)CBM_DEFAULT_MAX_FILE_MB * 1024 * 1024); + PASS(); +} + +TEST(max_file_bytes_env_blank_falls_back_to_default_not_zero) { + /* Blank must NOT coerce to a finite 0 (which would cap every file at + * 0 bytes and skip everything — AC row 3's terminal-value hazard). */ + cbm_setenv("CBM_MAX_FILE_MB", "", 1); + long cap = cbm_max_file_bytes(); + ASSERT_EQ(cap, (long)CBM_DEFAULT_MAX_FILE_MB * 1024 * 1024); + cbm_unsetenv("CBM_MAX_FILE_MB"); + PASS(); +} + +TEST(max_file_bytes_env_nonnumeric_falls_back_to_default) { + cbm_setenv("CBM_MAX_FILE_MB", "not-a-number", 1); + long cap = cbm_max_file_bytes(); + ASSERT_EQ(cap, (long)CBM_DEFAULT_MAX_FILE_MB * 1024 * 1024); + cbm_unsetenv("CBM_MAX_FILE_MB"); + PASS(); +} + +TEST(max_file_bytes_env_negative_falls_back_to_default) { + /* Negative/zero must clamp to the default floor, not "cap everything" + * (AC row 3). */ + cbm_setenv("CBM_MAX_FILE_MB", "-5", 1); + long cap = cbm_max_file_bytes(); + ASSERT_EQ(cap, (long)CBM_DEFAULT_MAX_FILE_MB * 1024 * 1024); + + cbm_setenv("CBM_MAX_FILE_MB", "0", 1); + cap = cbm_max_file_bytes(); + ASSERT_EQ(cap, (long)CBM_DEFAULT_MAX_FILE_MB * 1024 * 1024); + + cbm_unsetenv("CBM_MAX_FILE_MB"); + PASS(); +} + +TEST(max_file_bytes_env_above_cap_falls_back_to_default) { + cbm_setenv("CBM_MAX_FILE_MB", "999999", 1); /* past CBM_MAX_FILE_MB_CAP */ + long cap = cbm_max_file_bytes(); + ASSERT_EQ(cap, (long)CBM_DEFAULT_MAX_FILE_MB * 1024 * 1024); + cbm_unsetenv("CBM_MAX_FILE_MB"); + PASS(); +} + +/* ── read_file() boundary behaviour via the real pass_parallel path ── + * + * Drives the actual extraction read_file() size guard (not a direct + * cbm_max_file_bytes() call) through cbm_parallel_extract() against a + * synthetic repo with one file at the cap boundary, one just under, one + * just over, and one empty — AC row 1's sub-cases. */ + +static char g_capdir[256]; + +static int setup_cap_test_repo(long cap_bytes) { + snprintf(g_capdir, sizeof(g_capdir), "/tmp/cbm_cap_XXXXXX"); + if (!cbm_mkdtemp(g_capdir)) { + return -1; + } + + /* under_cap.c: comfortably under the cap. */ + th_write_file(TH_PATH(g_capdir, "under_cap.c"), + "int under_cap(void) { return 1; }\n"); + + /* over_cap.c: 1 byte over the cap — padded with a comment so it still + * parses as valid C if it were read (it must not be). */ + { + char path[512]; + snprintf(path, sizeof(path), "%s/over_cap.c", g_capdir); + FILE *f = fopen(path, "w"); + if (!f) { + return -1; + } + const char *prefix = "int over_cap(void) { return 1; } /*"; + long prefix_len = (long)strlen(prefix); + fputs(prefix, f); + /* Pad with '*' up to exactly cap_bytes+1 total bytes, then close + * the comment and a closing brace-free tail (content doesn't need + * to be valid past the guard — the file must never be read). */ + for (long i = prefix_len; i < cap_bytes + 1 - 2; i++) { + fputc('*', f); + } + fputs("*/\n", f); + fclose(f); + } + + /* empty.c: zero-size — already skipped by the pre-existing size<=0 + * check, independent of the cap. */ + th_write_file(TH_PATH(g_capdir, "empty.c"), ""); + + return 0; +} + +static void teardown_cap_test_repo(void) { + if (g_capdir[0]) { + th_rmtree(g_capdir); + g_capdir[0] = '\0'; + } +} + +TEST(parallel_extract_skips_over_cap_parses_under_cap) { + cbm_mem_init(0.5); + cbm_setenv("CBM_MAX_FILE_MB", "1", 1); /* small cap: 1 MB, cheap fixture */ + long cap_bytes = cbm_max_file_bytes(); + + if (setup_cap_test_repo(cap_bytes) != 0) { + cbm_unsetenv("CBM_MAX_FILE_MB"); + FAIL("tmpdir setup failed"); + } + + cbm_discover_opts_t opts = {.mode = CBM_MODE_FULL}; + cbm_file_info_t *files = NULL; + int file_count = 0; + if (cbm_discover(g_capdir, &opts, &files, &file_count) != 0) { + teardown_cap_test_repo(); + cbm_unsetenv("CBM_MAX_FILE_MB"); + FAIL("discover failed"); + } + ASSERT_GTE(file_count, 3); + + cbm_gbuf_t *gbuf = cbm_gbuf_new("cap-test", g_capdir); + cbm_registry_t *reg = cbm_registry_new(); + atomic_int cancelled; + atomic_init(&cancelled, 0); + + cbm_pipeline_ctx_t ctx = { + .project_name = "cap-test", + .repo_path = g_capdir, + .gbuf = gbuf, + .registry = reg, + .cancelled = &cancelled, + }; + + _Atomic int64_t shared_ids; + atomic_init(&shared_ids, cbm_gbuf_next_id(gbuf)); + + CBMFileResult **result_cache = calloc(file_count, sizeof(CBMFileResult *)); + ASSERT_NOT_NULL(result_cache); + + int rc = cbm_parallel_extract(&ctx, files, file_count, result_cache, &shared_ids, 2); + ASSERT_EQ(rc, 0); + + bool under_cap_extracted = false; + bool over_cap_extracted = false; + for (int i = 0; i < file_count; i++) { + if (!files[i].rel_path) { + continue; + } + if (strstr(files[i].rel_path, "under_cap.c") && result_cache[i]) { + under_cap_extracted = true; + } + if (strstr(files[i].rel_path, "over_cap.c") && result_cache[i]) { + over_cap_extracted = true; + } + } + ASSERT_TRUE(under_cap_extracted); + ASSERT_FALSE(over_cap_extracted); + + for (int i = 0; i < file_count; i++) { + if (result_cache[i]) { + cbm_free_result(result_cache[i]); + } + } + free(result_cache); + cbm_registry_free(reg); + cbm_gbuf_free(gbuf); + cbm_discover_free(files, file_count); + teardown_cap_test_repo(); + cbm_unsetenv("CBM_MAX_FILE_MB"); + PASS(); +} + /* ── Arena integration tests ──────────────────────────────────── */ TEST(arena_alloc_and_destroy) { @@ -653,6 +967,24 @@ SUITE(mem) { RUN_TEST(mem_init_negative_fraction); RUN_TEST(mem_init_over_one_fraction); RUN_TEST(mem_init_second_call_noop); + /* Hard memory ceiling (enforcing) */ + RUN_TEST(mem_ceiling_positive_and_above_budget); + RUN_TEST(mem_ceiling_floor_applies_on_tiny_ram); + RUN_TEST(mem_ceiling_env_override_applies); + RUN_TEST(mem_ceiling_env_invalid_falls_back); + RUN_TEST(mem_ceiling_env_unset_matches_default); + RUN_TEST(mem_over_ceiling_false_for_test_process); + RUN_TEST(mem_over_ceiling_true_when_ceiling_forced_below_rss); + /* Per-file parse-size cap (CBM_MAX_FILE_MB) */ + RUN_TEST(max_file_bytes_default_clears_sqlite3_c_size); + RUN_TEST(max_file_bytes_env_override_lowers_threshold); + RUN_TEST(max_file_bytes_env_override_raises_threshold); + RUN_TEST(max_file_bytes_env_unset_uses_default); + RUN_TEST(max_file_bytes_env_blank_falls_back_to_default_not_zero); + RUN_TEST(max_file_bytes_env_nonnumeric_falls_back_to_default); + RUN_TEST(max_file_bytes_env_negative_falls_back_to_default); + RUN_TEST(max_file_bytes_env_above_cap_falls_back_to_default); + RUN_TEST(parallel_extract_skips_over_cap_parses_under_cap); /* Arena integration */ RUN_TEST(arena_alloc_and_destroy); RUN_TEST(arena_grow_tracks_sizes); diff --git a/tests/test_ui.c b/tests/test_ui.c index 002e8d39a..b94826f70 100644 --- a/tests/test_ui.c +++ b/tests/test_ui.c @@ -19,6 +19,27 @@ #include #endif +/* Guards an assertion that runs while HOME is temporarily pointed at a + * scratch dir (see the setenv(HOME, td) / restore bracket in the config_* + * tests below). A plain ASSERT* here would `return 1` on failure and skip + * the paired restore, leaving HOME stuck on the scratch dir for every + * later suite in this process and leaking the strdup'd old_home. Restores + * HOME and frees old_home before returning, mirroring the paired restore + * block on the success path. */ +#define ASSERT_HOME(cond, old_home_var) \ + do { \ + if (!(cond)) { \ + printf(" %sFAIL%s %s:%d: ASSERT(%s)\n", tf_red(), tf_reset(), __FILE__, __LINE__, \ + #cond); \ + if (old_home_var) { \ + cbm_setenv("HOME", (old_home_var), 1); \ + free(old_home_var); \ + (old_home_var) = NULL; \ + } \ + return 1; \ + } \ + } while (0) + /* ── Config tests ─────────────────────────────────────────────── */ TEST(config_load_defaults) { @@ -38,8 +59,8 @@ TEST(config_load_defaults) { cbm_ui_config_load(&cfg); - ASSERT_FALSE(cfg.ui_enabled); - ASSERT_EQ(cfg.ui_port, 9749); + ASSERT_HOME(!cfg.ui_enabled, old_home); + ASSERT_HOME(cfg.ui_port == 9749, old_home); /* Restore HOME */ if (old_home) { @@ -67,8 +88,8 @@ TEST(config_save_and_reload) { cbm_ui_config_t loaded; cbm_ui_config_load(&loaded); - ASSERT_TRUE(loaded.ui_enabled); - ASSERT_EQ(loaded.ui_port, 8080); + ASSERT_HOME(loaded.ui_enabled, old_home); + ASSERT_HOME(loaded.ui_port == 8080, old_home); if (old_home) { cbm_setenv("HOME", old_home, 1); @@ -98,7 +119,7 @@ TEST(config_overwrite) { /* Reload should show false */ cbm_ui_config_t loaded; cbm_ui_config_load(&loaded); - ASSERT_FALSE(loaded.ui_enabled); + ASSERT_HOME(!loaded.ui_enabled, old_home); if (old_home) { cbm_setenv("HOME", old_home, 1); @@ -127,15 +148,15 @@ TEST(config_corrupt_file) { cbm_mkdir_p(dir, 0755); FILE *f = fopen(path, "w"); - ASSERT_NOT_NULL(f); + ASSERT_HOME(f != NULL, old_home); fprintf(f, "this is not json!!!"); fclose(f); /* Should load defaults, not crash */ cbm_ui_config_t cfg; cbm_ui_config_load(&cfg); - ASSERT_FALSE(cfg.ui_enabled); - ASSERT_EQ(cfg.ui_port, 9749); + ASSERT_HOME(!cfg.ui_enabled, old_home); + ASSERT_HOME(cfg.ui_port == 9749, old_home); if (old_home) { cbm_setenv("HOME", old_home, 1); @@ -163,14 +184,14 @@ TEST(config_missing_fields) { cbm_mkdir_p(dir, 0755); FILE *f = fopen(path, "w"); - ASSERT_NOT_NULL(f); + ASSERT_HOME(f != NULL, old_home); fprintf(f, "{\"ui_port\": 5555}"); fclose(f); cbm_ui_config_t cfg; cbm_ui_config_load(&cfg); - ASSERT_FALSE(cfg.ui_enabled); /* defaults for missing field */ - ASSERT_EQ(cfg.ui_port, 5555); /* present field loaded */ + ASSERT_HOME(!cfg.ui_enabled, old_home); /* defaults for missing field */ + ASSERT_HOME(cfg.ui_port == 5555, old_home); /* present field loaded */ if (old_home) { cbm_setenv("HOME", old_home, 1);