From e89aa776a7aabe59ba45f9fc4064caf72e53af09 Mon Sep 17 00:00:00 2001 From: Martin Vogel Date: Thu, 2 Jul 2026 22:11:00 +0200 Subject: [PATCH] feat(index): report per-file indexing failures via skipped[] + logfile (Stage 2/B2-B4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit index_repository silently dropped files that failed to index: CBMFileResult.has_error was set but never read, oversized files (>100 MB) were dropped with no signal, and read/extract failures only bumped a logged-only counter — a file that couldn't be indexed just vanished from the graph. Collect and surface them: - New retained {path, reason, phase} error list on struct cbm_pipeline (mirrors the excluded_dirs pattern) + accessor + a back-pointer on cbm_pipeline_ctx_t so both extraction paths can append. Wired in BOTH the sequential (pass_definitions) and parallel (per-worker, merged lock-free) paths — small repos take the sequential path, so wiring only one would leave the guard vacuous. - Feeds: read-fail, extract-fail, the newly-CONSUMED has_error (parse timeout / parse failed, with error_msg), and oversized. The cross_lsp phase is reserved for the crash supervisor (Track C) and not fed here (the cross-LSP passes are best-effort with no failure return; feeding the no-source case would be false positives). - MCP/CLI response gains top-level "skipped_count" (0 on clean) and, when >0, a capped "skipped":{files[<=50],count,truncated} + "logfile". Status stays "indexed" — a reported skip is the expected, handled outcome, not a degradation. - Per-run logfile (full uncapped list) written ONLY when >=1 file is skipped: $CBM_INDEX_LOG override else <cache_dir>/logs/<project>-<unix_time>.log. - Generous env-configurable caps (src/foundation/limits.c): CBM_MAX_FILE_BYTES, default raised 100 MB -> 512 MiB; over-cap files are REPORTED (phase oversized) + WARNed, never silently dropped.
Reproduce-first: tests/test_index_resilience.c (gating) — an oversized file (cap lowered via env) must appear in skipped[] with the 2 good files still indexed and a logfile written; a clean run has skipped_count 0 and no logfile. Genuine guard: no-op'ing the recorder makes the oversized file silently vanish (RED). Full suite 5750/0, no ASan/UBSan. Part of the resilient-indexing effort (Track B surfacing layer). Refs #668. Signed-off-by: Martin Vogel --- Makefile.cbm | 7 +- src/foundation/limits.c | 26 +++ src/foundation/limits.h | 33 ++++ src/mcp/mcp.c | 102 ++++++++++- src/pipeline/pass_calls.c | 3 +- src/pipeline/pass_definitions.c | 68 ++++++- src/pipeline/pass_k8s.c | 3 +- src/pipeline/pass_parallel.c | 155 +++++++++++++++- src/pipeline/pass_semantic.c | 3 +- src/pipeline/pass_usages.c | 3 +- src/pipeline/pipeline.c | 63 +++++++ src/pipeline/pipeline.h | 30 ++++ src/pipeline/pipeline_incremental.c | 1 + src/pipeline/pipeline_internal.h | 4 + tests/test_index_resilience.c | 263 ++++++++++++++++++++++++++++ tests/test_main.c | 2 + 16 files changed, 747 insertions(+), 19 deletions(-) create mode 100644 src/foundation/limits.c create mode 100644 src/foundation/limits.h create mode 100644 tests/test_index_resilience.c diff --git a/Makefile.cbm b/Makefile.cbm index 0a2d16b94..2c6044c74 100644 --- a/Makefile.cbm +++ b/Makefile.cbm @@ -112,7 +112,8 @@ FOUNDATION_SRCS = \ src/foundation/mem.c \ src/foundation/diagnostics.c \ src/foundation/profile.c \ - src/foundation/dump_verify.c + src/foundation/dump_verify.c \ + src/foundation/limits.c # Existing extraction C code (compiled from current location) EXTRACTION_SRCS = \ @@ -345,7 +346,7 @@ TEST_DISCOVER_SRCS = \ TEST_GRAPH_BUFFER_SRCS = tests/test_graph_buffer.c -TEST_PIPELINE_SRCS = tests/test_registry.c tests/test_pipeline.c tests/test_fqn.c tests/test_route_canon.c tests/test_path_alias.c tests/test_configlink.c tests/test_infrascan.c tests/test_worker_pool.c tests/test_parallel.c +TEST_PIPELINE_SRCS = 
tests/test_registry.c tests/test_pipeline.c tests/test_fqn.c tests/test_route_canon.c tests/test_path_alias.c tests/test_configlink.c tests/test_infrascan.c tests/test_worker_pool.c tests/test_parallel.c tests/test_index_resilience.c TEST_WATCHER_SRCS = tests/test_watcher.c @@ -567,7 +568,7 @@ $(UNIXCODER_OBJ): $(UNIXCODER_BLOB_SRC) vendored/nomic/code_vectors.bin | $(BUIL OBJS_VENDORED_TEST = $(MIMALLOC_OBJ_TEST) $(SQLITE3_OBJ_TEST) $(TRE_OBJ_TEST) $(GRAMMAR_OBJS_TEST) $(TS_RUNTIME_OBJ_TEST) $(LSP_OBJ_TEST) $(PP_OBJ_TEST) $(LZ4_OBJ_TEST) $(ZSTD_OBJ_TEST) $(UNIXCODER_OBJ) $(BUILD_DIR)/test-runner: $(ALL_TEST_SRCS) $(PROD_SRCS) $(EXTRACTION_SRCS) $(AC_LZ4_SRCS) $(ZSTD_SRCS) $(SQLITE_WRITER_SRC) $(OBJS_VENDORED_TEST) | $(BUILD_DIR) - $(CC) $(CFLAGS_TEST) -o $@ \ + $(CC) $(CFLAGS_TEST) -Itests -Itests/repro -o $@ \ $(ALL_TEST_SRCS) $(PROD_SRCS) \ $(EXTRACTION_SRCS) $(AC_LZ4_SRCS) $(ZSTD_SRCS) $(SQLITE_WRITER_SRC) \ $(OBJS_VENDORED_TEST) \ diff --git a/src/foundation/limits.c b/src/foundation/limits.c new file mode 100644 index 000000000..b7ad14091 --- /dev/null +++ b/src/foundation/limits.c @@ -0,0 +1,26 @@ +/* + * limits.c — Env-configurable safety limits (Stage 2 / Track B4). + */ +#include "foundation/limits.h" + +#include <errno.h> +#include <stdlib.h> + +long cbm_max_file_bytes(void) { + /* 512 MiB — generous: real source files never approach it, but a + * pathological / vendored blob degrades to a reported "oversized" skip + * instead of a silent drop or an unbounded read. */ + const long default_cap = 512L * 1024 * 1024; + + const char *raw = getenv("CBM_MAX_FILE_BYTES"); + if (raw && raw[0]) { + errno = 0; + char *end = NULL; + long v = strtol(raw, &end, 10); + if (errno == 0 && end != raw && *end == '\0' && v > 0) { + return v; + } + /* Unparseable / non-positive → fall through to the safe default.
*/ + } + return default_cap; +} diff --git a/src/foundation/limits.h b/src/foundation/limits.h new file mode 100644 index 000000000..ed33f7044 --- /dev/null +++ b/src/foundation/limits.h @@ -0,0 +1,33 @@ +/* + * limits.h — Generous, env-configurable safety limits (Stage 2 / Track B4). + * + * Each knob has a generous default. Hitting a limit degrades to a *reported* + * skip (surfaced via MCP/CLI/logfile), never a silent drop and never an + * unbounded read (unbounded just trades a crash for an OOM/hang). Every limit + * is env-overridable so an operator can tune it per-repo without a rebuild. + */ +#ifndef CBM_LIMITS_H +#define CBM_LIMITS_H + +/* Result of an attempted per-file read, so callers can attribute a skip to the + * right phase/reason instead of collapsing every failure into "read failed". */ +typedef enum { + CBM_READ_OK = 0, /* file read successfully */ + CBM_READ_OPEN_FAIL, /* could not open (missing / permission) */ + CBM_READ_EMPTY, /* zero/negative size — benign, nothing to index */ + CBM_READ_OVERSIZED, /* size exceeds cbm_max_file_bytes() */ + CBM_READ_OOM, /* buffer allocation failed */ +} cbm_read_status_t; + +/* Maximum size (bytes) of a single source file the indexer will read into + * memory. Files larger than this are skipped-and-reported (phase "oversized"), + * never silently dropped. Override with CBM_MAX_FILE_BYTES (a positive integer + * count of bytes). Default 512 MiB (raised from the historical 100 MB cap). + * + * The env var is read on each call — this is intentional: read_file() calls it + * once per file (negligible), and reading fresh means a test / operator can + * change the cap via setenv without a process restart or a stale memoized copy + * leaking across runs. 
*/ +long cbm_max_file_bytes(void); + +#endif /* CBM_LIMITS_H */ diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index 034e2e599..ac9fd00e3 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -3134,13 +3134,96 @@ static void add_excluded_summary(yyjson_mut_doc *doc, yyjson_mut_val *root, char yyjson_mut_obj_add_val(doc, root, "excluded", excluded); } +/* Cap on per-file skips embedded in the JSON response — keep it compact on + * large repos. The FULL, uncapped list always goes to the per-run logfile; + * the JSON carries "count" + "truncated" so nothing is silently hidden. */ +enum { INDEX_SKIPPED_FILE_CAP = 50 }; + +/* Attach a summary of per-file skips (Stage 2 / Track B). Always emits a + * top-level "skipped_count" (0 on clean runs) so consumers can rely on it. + * When there are skips, also emits: + * "skipped": {"files":[{path,reason,phase}..(<=50)], "count":N, "truncated":bool} + * and, if a per-run logfile was written, "logfile": "". + * The run status stays "indexed" — a skipped file is the expected handled + * outcome, not a failure. errs[] is borrowed (copied into doc). */ +static void add_skipped_summary(yyjson_mut_doc *doc, yyjson_mut_val *root, + const cbm_file_error_t *errs, int count, const char *logfile) { + yyjson_mut_obj_add_int(doc, root, "skipped_count", count < 0 ? 0 : count); + if (!errs || count <= 0) { + return; + } + yyjson_mut_val *skipped = yyjson_mut_obj(doc); + yyjson_mut_val *files = yyjson_mut_arr(doc); + int shown = count < INDEX_SKIPPED_FILE_CAP ? count : INDEX_SKIPPED_FILE_CAP; + for (int i = 0; i < shown; i++) { + yyjson_mut_val *fe = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, fe, "path", errs[i].path ? errs[i].path : ""); + yyjson_mut_obj_add_strcpy(doc, fe, "reason", errs[i].reason ? errs[i].reason : ""); + yyjson_mut_obj_add_strcpy(doc, fe, "phase", errs[i].phase ? 
errs[i].phase : ""); + yyjson_mut_arr_add_val(files, fe); + } + yyjson_mut_obj_add_val(doc, skipped, "files", files); + yyjson_mut_obj_add_int(doc, skipped, "count", count); + yyjson_mut_obj_add_bool(doc, skipped, "truncated", count > INDEX_SKIPPED_FILE_CAP); + yyjson_mut_obj_add_val(doc, root, "skipped", skipped); + if (logfile && logfile[0]) { + yyjson_mut_obj_add_strcpy(doc, root, "logfile", logfile); + } +} + +/* Write the FULL (uncapped) skip list to a per-run logfile — ONLY when >=1 file + * was skipped (no logfile on a clean run). Location: + * $CBM_INDEX_LOG (override) else <cache_dir>/logs/<project>-<unix_time>.log + * Returns true and fills out_path on success. */ +static bool write_skip_logfile(const char *project, const cbm_file_error_t *errs, int count, + char *out_path, size_t out_sz) { + if (!errs || count <= 0) { + return false; + } + char path[CBM_SZ_1K]; + const char *override = getenv("CBM_INDEX_LOG"); + if (override && override[0]) { + snprintf(path, sizeof(path), "%s", override); + } else { + const char *cdir = cbm_resolve_cache_dir(); + if (!cdir) { + return false; + } + char logdir[CBM_SZ_1K]; + snprintf(logdir, sizeof(logdir), "%s/logs", cdir); + cbm_mkdir_p(logdir, 0755); + snprintf(path, sizeof(path), "%s/%s-%lld.log", logdir, project ? project : "index", + (long long)time(NULL)); + } + FILE *f = cbm_fopen(path, "wb"); + if (!f) { + cbm_log_warn("index.logfile_open_fail", "path", path); + return false; + } + (void)fprintf(f, "# codebase-memory-mcp index skip report\n"); + (void)fprintf(f, "# project=%s skipped=%d\n", project ? project : "", count); + (void)fprintf(f, "# columns: phase\treason\tpath\n"); + for (int i = 0; i < count; i++) { + (void)fprintf(f, "%s\t%s\t%s\n", errs[i].phase ? errs[i].phase : "", + errs[i].reason ? errs[i].reason : "", errs[i].path ? errs[i].path : ""); + } + (void)fclose(f); + if (out_path && out_sz) { + snprintf(out_path, out_sz, "%s", path); + } + return true; +} + /* Build the success portion of the index_repository response.
* Returns true when status should be "degraded" (#334 plausibility gate). */ static bool build_index_success_response(cbm_mcp_server_t *srv, yyjson_mut_doc *doc, yyjson_mut_val *root, const char *project_name, const char *repo_path, bool persistence, cbm_pipeline_t *p, - char **excluded_dirs, int excluded_count) { + char **excluded_dirs, int excluded_count, + const cbm_file_error_t *file_errors, int file_error_count, + const char *logfile) { add_excluded_summary(doc, root, excluded_dirs, excluded_count); + add_skipped_summary(doc, root, file_errors, file_error_count, logfile); int exp_nodes = -1; int exp_edges = -1; @@ -3302,6 +3385,12 @@ static char *handle_index_repository(cbm_mcp_server_t *srv, const char *args) { int excluded_count = 0; cbm_pipeline_get_excluded(p, &excluded_dirs, &excluded_count); + /* Capture the per-file skip list (Stage 2 / Track B) while the pipeline + * still owns the strings; the response builder copies them into the doc. */ + cbm_file_error_t *file_errors = NULL; + int file_error_count = 0; + cbm_pipeline_get_file_errors(p, &file_errors, &file_error_count); + cbm_mem_collect(); /* return mimalloc pages to OS after large indexing */ /* Invalidate cached store so next query reopens the fresh database */ @@ -3319,8 +3408,15 @@ static char *handle_index_repository(cbm_mcp_server_t *srv, const char *args) { yyjson_mut_obj_add_str(doc, root, "project", project_name); if (rc == 0) { - bool degraded = build_index_success_response(srv, doc, root, project_name, repo_path, - persistence, p, excluded_dirs, excluded_count); + /* Write the per-run logfile ONLY when there were skips (no logfile on a + * clean run). The FULL list goes to the file; the JSON caps at 50. 
*/ + char logfile_path[CBM_SZ_1K]; + logfile_path[0] = '\0'; + bool has_logfile = write_skip_logfile(project_name, file_errors, file_error_count, + logfile_path, sizeof(logfile_path)); + bool degraded = build_index_success_response( + srv, doc, root, project_name, repo_path, persistence, p, excluded_dirs, excluded_count, + file_errors, file_error_count, has_logfile ? logfile_path : NULL); yyjson_mut_obj_add_str(doc, root, "status", degraded ? "degraded" : "indexed"); } else { yyjson_mut_obj_add_str(doc, root, "status", "error"); diff --git a/src/pipeline/pass_calls.c b/src/pipeline/pass_calls.c index 5c0f978e3..4c2bcf935 100644 --- a/src/pipeline/pass_calls.c +++ b/src/pipeline/pass_calls.c @@ -23,6 +23,7 @@ enum { PC_RING = 4, PC_RING_MASK = 3, PC_SIG_SCAN = 15, PC_REGEX_GRP = 2 }; #include "foundation/log.h" #include "foundation/compat.h" #include "foundation/compat_fs.h" +#include "foundation/limits.h" #include "foundation/str_util.h" #include "cbm.h" #include "service_patterns.h" @@ -53,7 +54,7 @@ static char *read_file(const char *path, int *out_len) { long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { + if (size <= 0 || size > cbm_max_file_bytes()) { /* generous, env-configurable cap (B4) */ (void)fclose(f); return NULL; } diff --git a/src/pipeline/pass_definitions.c b/src/pipeline/pass_definitions.c index 767703d34..e7deefd72 100644 --- a/src/pipeline/pass_definitions.c +++ b/src/pipeline/pass_definitions.c @@ -23,6 +23,7 @@ enum { PD_JSON_FIELD_OVERHEAD = 6 }; #include "foundation/log.h" #include "foundation/compat.h" #include "foundation/compat_fs.h" +#include "foundation/limits.h" #include "cbm.h" #include "simhash/minhash.h" #include "semantic/ast_profile.h" @@ -32,20 +33,45 @@ enum { PD_JSON_FIELD_OVERHEAD = 6 }; #include /* Read entire file into heap-allocated buffer. Returns NULL on error. - * Caller must free(). Sets *out_len to byte count. 
*/ -static char *read_file(const char *path, int *out_len) { + * Caller must free(). Sets *out_len to byte count. *out_size receives the + * on-disk size and *out_status the failure reason, so the caller can attribute + * a skip to the right phase/reason (read vs oversized) instead of a silent + * drop. Both out params may be NULL. */ +static char *read_file(const char *path, int *out_len, long *out_size, + cbm_read_status_t *out_status) { + if (out_size) { + *out_size = 0; + } + if (out_status) { + *out_status = CBM_READ_OK; + } FILE *f = cbm_fopen(path, "rb"); if (!f) { + if (out_status) { + *out_status = CBM_READ_OPEN_FAIL; + } return NULL; } (void)fseek(f, 0, SEEK_END); long size = ftell(f); (void)fseek(f, 0, SEEK_SET); + if (out_size) { + *out_size = size; + } - if (size <= 0 || - size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { /* CBM_PERCENT MB sanity limit */ + if (size <= 0) { + (void)fclose(f); + if (out_status) { + *out_status = CBM_READ_EMPTY; + } + return NULL; + } + if (size > cbm_max_file_bytes()) { /* generous, env-configurable cap (B4) */ (void)fclose(f); + if (out_status) { + *out_status = CBM_READ_OVERSIZED; + } return NULL; } @@ -57,6 +83,9 @@ static char *read_file(const char *path, int *out_len) { char *buf = malloc((size_t)size + CBM_TS_LOOKAHEAD_PAD); if (!buf) { (void)fclose(f); + if (out_status) { + *out_status = CBM_READ_OOM; + } return NULL; } @@ -487,9 +516,27 @@ int cbm_pipeline_pass_definitions(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t /* Read source file */ int source_len = 0; - char *source = read_file(path, &source_len); + long file_size = 0; + cbm_read_status_t rst = CBM_READ_OK; + char *source = read_file(path, &source_len, &file_size, &rst); if (!source) { errors++; + if (rst == CBM_READ_OVERSIZED) { + /* Never a silent drop: record the oversized skip + WARN so the + * file surfaces in the response/logfile with its sizes. 
*/ + long cap = cbm_max_file_bytes(); + char reason[96]; + snprintf(reason, sizeof(reason), "oversized (%lld MB > %lld MB)", + (long long)(file_size / (CBM_SZ_1K * CBM_SZ_1K)), + (long long)(cap / (CBM_SZ_1K * CBM_SZ_1K))); + cbm_pipeline_add_file_error(ctx->pipeline, rel, reason, "oversized"); + cbm_log_warn("index.file_oversized", "path", rel, "size_mb", + itoa_log((int)(file_size / (CBM_SZ_1K * CBM_SZ_1K))), "cap_mb", + itoa_log((int)(cap / (CBM_SZ_1K * CBM_SZ_1K)))); + } else if (rst == CBM_READ_OPEN_FAIL || rst == CBM_READ_OOM) { + cbm_pipeline_add_file_error(ctx->pipeline, rel, "read failed", "read"); + } + /* CBM_READ_EMPTY: benign 0-byte file — nothing to index, not reported. */ continue; } @@ -502,8 +549,19 @@ int cbm_pipeline_pass_definitions(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t if (!result) { errors++; + cbm_pipeline_add_file_error(ctx->pipeline, rel, "extract failed", "extract"); continue; } + /* Consume the previously-ignored has_error flag: a parse timeout / + * parse failure / unsupported-grammar result carries no defs but must + * still be reported (phase "extract", reason = the extractor's message). + * The empty result flows through unchanged (the defs loop is a no-op). */ + if (result->has_error) { + cbm_pipeline_add_file_error(ctx->pipeline, rel, + result->error_msg ? 
result->error_msg : "extract failed", + "extract"); + errors++; + } /* Create nodes for each definition */ for (int d = 0; d < result->defs.count; d++) { diff --git a/src/pipeline/pass_k8s.c b/src/pipeline/pass_k8s.c index 494e13895..94dcd439a 100644 --- a/src/pipeline/pass_k8s.c +++ b/src/pipeline/pass_k8s.c @@ -20,6 +20,7 @@ #include "foundation/log.h" #include "foundation/compat.h" #include "foundation/compat_fs.h" +#include "foundation/limits.h" #include "cbm.h" #include @@ -40,7 +41,7 @@ static char *k8s_read_file(const char *path, int *out_len) { long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { + if (size <= 0 || size > cbm_max_file_bytes()) { /* generous, env-configurable cap (B4) */ (void)fclose(f); return NULL; } diff --git a/src/pipeline/pass_parallel.c b/src/pipeline/pass_parallel.c index 93656b5ec..49c287ac0 100644 --- a/src/pipeline/pass_parallel.c +++ b/src/pipeline/pass_parallel.c @@ -68,6 +68,7 @@ enum { PP_CSHARP_M_PREFIX_LEN = 2 }; #include "foundation/str_util.h" #include "foundation/profile.h" #include "foundation/compat_regex.h" +#include "foundation/limits.h" #include "cbm.h" #include "simhash/minhash.h" #include "semantic/ast_profile.h" @@ -87,22 +88,51 @@ static uint64_t extract_now_ns(void) { /* ── Helpers (duplicated from pass files — kept static for isolation) ── */ -/* Read file into a malloc'd buffer (= mimalloc in production). */ -static char *read_file(const char *path, int *out_len) { +/* Read file into a malloc'd buffer (= mimalloc in production). + * *out_size receives the on-disk size and *out_status the failure reason so the + * caller can attribute a skip to the right phase (read vs oversized) instead of + * a silent drop. Both out params may be NULL. 
*/ +static char *read_file(const char *path, int *out_len, long *out_size, + cbm_read_status_t *out_status) { + if (out_size) { + *out_size = 0; + } + if (out_status) { + *out_status = CBM_READ_OK; + } FILE *f = cbm_fopen(path, "rb"); if (!f) { + if (out_status) { + *out_status = CBM_READ_OPEN_FAIL; + } return NULL; } (void)fseek(f, 0, SEEK_END); long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { + if (out_size) { + *out_size = size; + } + if (size <= 0) { (void)fclose(f); + if (out_status) { + *out_status = CBM_READ_EMPTY; + } + return NULL; + } + if (size > cbm_max_file_bytes()) { /* generous, env-configurable cap (B4) */ + (void)fclose(f); + if (out_status) { + *out_status = CBM_READ_OVERSIZED; + } return NULL; } char *buf = (char *)malloc((size_t)size + SKIP_ONE); if (!buf) { (void)fclose(f); + if (out_status) { + *out_status = CBM_READ_OOM; + } return NULL; } size_t nread = fread(buf, SKIP_ONE, (size_t)size, f); @@ -112,6 +142,50 @@ static char *read_file(const char *path, int *out_len) { return buf; } +/* ── Per-worker skip list (Stage 2 / Track B) ─────────────────────── + * Each extract worker appends read/extract/oversized skips into its OWN list + * (no lock on the hot path); the lists are merged into the pipeline's + * cbm_file_error_t array in the existing sequential merge loop. */ +typedef struct { + cbm_file_error_t *items; + int count; + int cap; +} pp_err_list_t; + +/* NULL-safe heap strdup. */ +static char *pp_err_dup(const char *s) { + if (!s) { + return NULL; + } + size_t n = strlen(s) + 1; + char *d = (char *)malloc(n); + if (d) { + memcpy(d, s, n); + } + return d; +} + +static void pp_err_add(pp_err_list_t *list, const char *path, const char *reason, + const char *phase) { + if (!list) { + return; + } + if (list->count >= list->cap) { + int ncap = list->cap ? 
list->cap * 2 : 8; + cbm_file_error_t *grown = + (cbm_file_error_t *)realloc(list->items, (size_t)ncap * sizeof(*grown)); + if (!grown) { + return; /* drop on OOM — never fail extraction to record a skip */ + } + list->items = grown; + list->cap = ncap; + } + list->items[list->count].path = pp_err_dup(path); + list->items[list->count].reason = pp_err_dup(reason); + list->items[list->count].phase = pp_err_dup(phase); + list->count++; +} + /* Free source buffer. */ static void free_source(char *buf) { free(buf); @@ -496,8 +570,17 @@ typedef struct { cbm_pkg_entries_t *pkg_entries; /* per-worker manifest arrays (separate allocation) */ _Atomic int64_t retained_bytes; /* total source bytes copied into result arenas */ + + /* Per-worker skip lists (separate allocation, indexed by worker_id — no hot- + * path lock). Merged into the pipeline in the sequential merge loop. */ + pp_err_list_t *err_lists; + _Atomic int oversized_warned; /* throttle for the index.file_oversized WARN */ } extract_ctx_t; +/* Cap on the number of index.file_oversized WARN lines (the full list still goes + * to the response/logfile — this only throttles the stderr noise). */ +enum { PP_OVERSIZED_WARN_MAX = 32 }; + /* Insert one definition node (and its route if present) into the local gbuf. */ static void insert_def_into_gbuf(extract_worker_state_t *ws, const cbm_file_info_t *fi, CBMDefinition *def) { @@ -581,12 +664,34 @@ static void extract_worker(int worker_id, void *ctx_ptr) { int file_idx = ec->sorted[sort_pos].idx; const cbm_file_info_t *fi = &ec->files[file_idx]; + pp_err_list_t *errs = ec->err_lists ? 
&ec->err_lists[worker_id] : NULL; /* Read + extract */ int source_len = 0; - char *source = read_file(fi->path, &source_len); + long file_size = 0; + cbm_read_status_t rst = CBM_READ_OK; + char *source = read_file(fi->path, &source_len, &file_size, &rst); if (!source) { ws->errors++; + if (rst == CBM_READ_OVERSIZED) { + /* Never a silent drop: record the oversized skip + a throttled + * WARN so the file surfaces in the response/logfile. */ + long cap = cbm_max_file_bytes(); + char reason[96]; + snprintf(reason, sizeof(reason), "oversized (%lld MB > %lld MB)", + (long long)(file_size / (CBM_SZ_1K * CBM_SZ_1K)), + (long long)(cap / (CBM_SZ_1K * CBM_SZ_1K))); + pp_err_add(errs, fi->rel_path, reason, "oversized"); + if (atomic_fetch_add_explicit(&ec->oversized_warned, SKIP_ONE, + memory_order_relaxed) < PP_OVERSIZED_WARN_MAX) { + cbm_log_warn("index.file_oversized", "path", fi->rel_path, "size_mb", + itoa_log((int)(file_size / (CBM_SZ_1K * CBM_SZ_1K))), "cap_mb", + itoa_log((int)(cap / (CBM_SZ_1K * CBM_SZ_1K)))); + } + } else if (rst == CBM_READ_OPEN_FAIL || rst == CBM_READ_OOM) { + pp_err_add(errs, fi->rel_path, "read failed", "read"); + } + /* CBM_READ_EMPTY: benign 0-byte file — not reported. */ continue; } @@ -608,10 +713,21 @@ static void extract_worker(int worker_id, void *ctx_ptr) { log_extract_fail(sort_pos, file_elapsed_ms, fi->rel_path); free_source(source); ws->errors++; + pp_err_add(errs, fi->rel_path, "extract failed", "extract"); continue; } log_extract_done(sort_pos, file_elapsed_ms, result->defs.count, fi->rel_path); + /* Consume the previously-ignored has_error flag: a parse timeout / parse + * failure / unsupported-grammar result carries no defs but must still be + * reported (phase "extract", reason = the extractor's message). The empty + * result flows through unchanged below (the defs loop is a no-op). */ + if (result->has_error) { + pp_err_add(errs, fi->rel_path, result->error_msg ? 
result->error_msg : "extract failed", + "extract"); + ws->errors++; + } + /* Create definition nodes in local gbuf */ for (int d = 0; d < result->defs.count; d++) { CBMDefinition *def = &result->defs.items[d]; @@ -769,6 +885,10 @@ int cbm_parallel_extract(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, /* Per-worker manifest entry arrays (separate from cache-line-aligned worker state) */ cbm_pkg_entries_t *pkg_entries = calloc(worker_count, sizeof(cbm_pkg_entries_t)); + /* Per-worker skip lists (separate allocation; merged into the pipeline in the + * sequential merge loop below). */ + pp_err_list_t *err_lists = calloc(worker_count, sizeof(pp_err_list_t)); + extract_ctx_t ec = { .files = files, .sorted = sorted, @@ -781,10 +901,12 @@ int cbm_parallel_extract(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, .shared_ids = shared_ids, .cancelled = ctx->cancelled, .pkg_entries = pkg_entries, + .err_lists = err_lists, }; atomic_init(&ec.next_worker_id, 0); atomic_init(&ec.next_file_idx, 0); atomic_init(&ec.retained_bytes, 0); + atomic_init(&ec.oversized_warned, 0); /* Sub-phase: Dispatch workers (parse + extract per file, PARALLEL) */ CBM_PROF_START(t_dispatch); @@ -806,6 +928,24 @@ int cbm_parallel_extract(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, } CBM_PROF_END_N("parallel_extract", "4_merge_gbufs_seq", t_merge, total_nodes); + /* Merge per-worker skip lists into the pipeline (SEQUENTIAL — no lock). + * Runs unconditionally (not gated on local_gbuf) so a worker whose files all + * failed still surfaces its skips. 
*/ + if (err_lists) { + for (int i = 0; i < worker_count; i++) { + for (int j = 0; j < err_lists[i].count; j++) { + cbm_pipeline_add_file_error(ctx->pipeline, err_lists[i].items[j].path, + err_lists[i].items[j].reason, + err_lists[i].items[j].phase); + free(err_lists[i].items[j].path); + free(err_lists[i].items[j].reason); + free(err_lists[i].items[j].phase); + } + free(err_lists[i].items); + } + free(err_lists); + } + merge_pkg_entries(ctx, pkg_entries, worker_count); cbm_aligned_free(workers); @@ -2422,6 +2562,13 @@ static void resolve_worker(int worker_id, void *ctx_ptr) { } atomic_fetch_add_explicit(&rc->lsp_cross_processed, SKIP_ONE, memory_order_relaxed); } else { + /* No retained source → the cross-file LSP refinement no-ops for + * this file. This is a bounded OPTIMIZATION skip, NOT a file + * failure: defs/calls were already extracted and are unaffected. + * Deliberately NOT recorded as a cbm_file_error — doing so would + * flood skipped[] with false positives (itself a false-guard + * bug). The "cross_lsp" phase string is reserved for Track C's + * real crash-attribution signal; leave it unwired here. 
*/ atomic_fetch_add_explicit(&rc->lsp_cross_skipped_no_source, SKIP_ONE, memory_order_relaxed); } diff --git a/src/pipeline/pass_semantic.c b/src/pipeline/pass_semantic.c index 2b9c03b46..8a3bb01f0 100644 --- a/src/pipeline/pass_semantic.c +++ b/src/pipeline/pass_semantic.c @@ -20,6 +20,7 @@ #include "foundation/log.h" #include "foundation/compat.h" #include "foundation/compat_fs.h" +#include "foundation/limits.h" #include "cbm.h" #include @@ -42,7 +43,7 @@ static char *read_file(const char *path, int *out_len) { (void)fseek(f, 0, SEEK_END); long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { + if (size <= 0 || size > cbm_max_file_bytes()) { /* generous, env-configurable cap (B4) */ (void)fclose(f); return NULL; } diff --git a/src/pipeline/pass_usages.c b/src/pipeline/pass_usages.c index ec01a0ab6..c6ce7e000 100644 --- a/src/pipeline/pass_usages.c +++ b/src/pipeline/pass_usages.c @@ -19,6 +19,7 @@ #include "foundation/log.h" #include "foundation/compat.h" #include "foundation/compat_fs.h" +#include "foundation/limits.h" #include "cbm.h" #include @@ -41,7 +42,7 @@ static char *read_file(const char *path, int *out_len) { (void)fseek(f, 0, SEEK_END); long size = ftell(f); (void)fseek(f, 0, SEEK_SET); - if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { + if (size <= 0 || size > cbm_max_file_bytes()) { /* generous, env-configurable cap (B4) */ (void)fclose(f); return NULL; } diff --git a/src/pipeline/pipeline.c b/src/pipeline/pipeline.c index 1cdfda02b..3a9282036 100644 --- a/src/pipeline/pipeline.c +++ b/src/pipeline/pipeline.c @@ -92,6 +92,14 @@ struct cbm_pipeline { char **excluded_dirs; int excluded_count; + /* Per-file indexing failures (skipped files) surfaced via MCP/CLI/logfile + * (Stage 2 / Track B). A skip is the expected handled outcome of a bad or + * oversized file — the run still succeeds ("indexed"). Owned by the + * pipeline; freed in cbm_pipeline_free. 
*/
+    cbm_file_error_t *file_errors;
+    int file_errors_count;
+    int file_errors_cap;
+
     /* User-defined extension overrides (loaded once per run) */
     cbm_userconfig_t *userconfig;
 
@@ -207,6 +215,15 @@ void cbm_pipeline_free(cbm_pipeline_t *p) {
     cbm_discover_free_excluded(p->excluded_dirs, p->excluded_count);
     p->excluded_dirs = NULL;
     p->excluded_count = 0;
+    for (int i = 0; i < p->file_errors_count; i++) {
+        free(p->file_errors[i].path);
+        free(p->file_errors[i].reason);
+        free(p->file_errors[i].phase);
+    }
+    free(p->file_errors);
+    p->file_errors = NULL;
+    p->file_errors_count = 0;
+    p->file_errors_cap = 0;
     free(p->branch_qn);
     free(p->saved_adr); /* freed here too: error paths can exit before the
                          * restore in dump_and_persist_hashes runs. Issue #516. */
@@ -253,6 +270,72 @@ void cbm_pipeline_get_excluded(const cbm_pipeline_t *p, char ***out, int *count)
     }
 }
 
+/* NULL-safe heap strdup (avoids a strdup dependency + guards NULL inputs).
+ * Returns NULL both for a NULL input and when malloc fails; a caller that
+ * must tell the two apart checks its own argument. */
+static char *fe_strdup(const char *s) {
+    if (!s) {
+        return NULL;
+    }
+    size_t n = strlen(s) + 1;
+    char *d = (char *)malloc(n);
+    if (d) {
+        memcpy(d, s, n);
+    }
+    return d;
+}
+
+/* Append one skipped-file record, deep-copying path/reason/phase (a NULL
+ * argument is stored as NULL). Best-effort: when the array cannot grow or a
+ * string cannot be copied the whole record is dropped, so indexing never
+ * fails because of bookkeeping. No-op when p is NULL. */
+void cbm_pipeline_add_file_error(cbm_pipeline_t *p, const char *path, const char *reason,
+                                 const char *phase) {
+    if (!p) {
+        return;
+    }
+    if (p->file_errors_count >= p->file_errors_cap) {
+        int ncap = p->file_errors_cap ? p->file_errors_cap * 2 : 16;
+        cbm_file_error_t *grown =
+            (cbm_file_error_t *)realloc(p->file_errors, (size_t)ncap * sizeof(*grown));
+        if (!grown) {
+            /* Never abort indexing just to record a skip — drop this record. */
+            return;
+        }
+        p->file_errors = grown;
+        p->file_errors_cap = ncap;
+    }
+    /* Copy every string before claiming the slot: an allocation failure must
+     * not publish a half-filled record (e.g. a NULL path for a real file)
+     * that every consumer of the list would then have to special-case. */
+    char *path_copy = fe_strdup(path);
+    char *reason_copy = fe_strdup(reason);
+    char *phase_copy = fe_strdup(phase);
+    if ((path && !path_copy) || (reason && !reason_copy) || (phase && !phase_copy)) {
+        free(path_copy);
+        free(reason_copy);
+        free(phase_copy);
+        return;
+    }
+    cbm_file_error_t *e = &p->file_errors[p->file_errors_count];
+    e->path = path_copy;
+    e->reason = reason_copy;
+    e->phase = phase_copy;
+    p->file_errors_count++;
+}
+
+/* Borrowed view of the recorded skips: *out points at the pipeline-owned
+ * array and *count is its length (NULL / 0 when p is NULL). Either
+ * out-pointer may itself be NULL to skip that value. */
+void cbm_pipeline_get_file_errors(const cbm_pipeline_t *p, cbm_file_error_t **out, int *count) {
+    if (out) {
+        *out = p ? p->file_errors : NULL;
+    }
+    if (count) {
+        *count = p ? p->file_errors_count : 0;
+    }
+}
+
 void cbm_pipeline_get_committed_counts(const cbm_pipeline_t *p, int *nodes, int *edges) {
     if (nodes) {
         *nodes = p ? p->committed_nodes : -1;
@@ -1208,6 +1291,7 @@ int cbm_pipeline_run(cbm_pipeline_t *p) {
         .gbuf = p->gbuf,
         .registry = p->registry,
         .cancelled = &p->cancelled,
+        .pipeline = p, /* so passes can record per-file skips (Track B) */
         .mode = (int)p->mode,
         .path_aliases = path_aliases,
     };
diff --git a/src/pipeline/pipeline.h b/src/pipeline/pipeline.h
index 11fc9d50e..ba50c4d08 100644
--- a/src/pipeline/pipeline.h
+++ b/src/pipeline/pipeline.h
@@ -79,6 +79,36 @@ void cbm_pipeline_get_excluded(const cbm_pipeline_t *p, char ***out, int *count)
  * Nodes are the #334 plausibility-gate axis; edges are informational only. */
 void cbm_pipeline_get_committed_counts(const cbm_pipeline_t *p, int *nodes, int *edges);
 
+/* ── Per-file indexing failures (Stage 2 / Track B) ─────────────── */
+
+/* One source file that was skipped during indexing. All strings are owned by
+ * the pipeline (copied on record, freed in cbm_pipeline_free). A skip is the
+ * expected, handled outcome of a bad/oversized file — indexing continues and
+ * the run still reports status "indexed"; these are surfaced (not errors that
+ * fail the run) via MCP `skipped[]` / the CLI / a per-run logfile. */
+typedef struct {
+    char *path;   /* repo-relative path of the skipped file */
+    char *reason; /* human-readable cause (e.g. "oversized (712 MB > 512 MB)",
+                   * "parse timeout", "read failed") */
+    char *phase;  /* "read" | "extract" | "oversized". "cross_lsp" is a RESERVED
+                   * phase string for Track C's crash-attribution signal and is
+                   * intentionally NOT emitted today (the cross-LSP passes are
+                   * best-effort/void with no genuine per-file failure). */
+} cbm_file_error_t;
+
+/* Record a skipped file. path/reason/phase are copied. NULL-safe on p.
+ *
+ * NOT thread-safe: call it from the sequential extraction pass, or from the
+ * parallel merge step (never from inside a parallel worker — workers collect
+ * into per-worker lists and merge sequentially). */
+void cbm_pipeline_add_file_error(cbm_pipeline_t *p, const char *path, const char *reason,
+                                 const char *phase);
+
+/* Borrowed accessor for the recorded skips: pipeline-owned, do not free. Valid until
+ * cbm_pipeline_free() or the next cbm_pipeline_add_file_error() (its realloc may move the
+ * array). out and count are set to NULL and 0 when p is NULL or nothing was skipped. */
+void cbm_pipeline_get_file_errors(const cbm_pipeline_t *p, cbm_file_error_t **out, int *count);
+
 /* ── Index lock (prevents concurrent pipeline runs on same DB) ──── */
 
 /* Try to acquire the global index lock. Returns true if acquired,
diff --git a/src/pipeline/pipeline_incremental.c b/src/pipeline/pipeline_incremental.c
index e5d1b4c9f..ebbfebb10 100644
--- a/src/pipeline/pipeline_incremental.c
+++ b/src/pipeline/pipeline_incremental.c
@@ -814,6 +814,7 @@ int cbm_pipeline_run_incremental(cbm_pipeline_t *p, const char *db_path, cbm_fil
         .gbuf = existing,
         .registry = registry,
         .cancelled = cbm_pipeline_cancelled_ptr(p),
+        .pipeline = p, /* so passes can record per-file skips (Track B) */
         .mode = cbm_pipeline_get_mode(p),
         .path_aliases = path_aliases,
     };
diff --git a/src/pipeline/pipeline_internal.h b/src/pipeline/pipeline_internal.h
index af1a8de12..77103980f 100644
--- a/src/pipeline/pipeline_internal.h
+++ b/src/pipeline/pipeline_internal.h
@@ -65,6 +65,10 @@ typedef struct {
     cbm_gbuf_t *gbuf; /* owned by pipeline */
     cbm_registry_t *registry; /* owned by pipeline */
     atomic_int *cancelled; /* pointer to pipeline's cancelled flag */
+    cbm_pipeline_t *pipeline; /* back-pointer for recording per-file skips
+                               * (Stage 2 / Track B). May be NULL on paths that
+                               * don't record; cbm_pipeline_add_file_error is
+                               * NULL-safe.
*/
     int mode; /* cbm_index_mode_t (0=full, 1=moderate, 2=fast, 3=advanced) */
 
     /* Extraction result cache (sequential pipeline optimization).
diff --git a/tests/test_index_resilience.c b/tests/test_index_resilience.c
new file mode 100644
index 000000000..e503af58a
--- /dev/null
+++ b/tests/test_index_resilience.c
@@ -0,0 +1,263 @@
+/*
+ * test_index_resilience.c — Stage 2 / Track B guard.
+ *
+ * A file that fails during indexing (here: exceeds the env-configurable size
+ * cap) must be SKIPPED-AND-REPORTED, never silently dropped, and it must NOT
+ * take the rest of the repo down with it. This is the genuine guard for the
+ * error-surfacing wiring (has_error / read / oversized → cbm_file_error_t →
+ * MCP `skipped[]` + `skipped_count` + per-run logfile).
+ *
+ * Each test indexes through the full production MCP `index_repository` flow.
+ * With only a handful of files the pipeline takes the SEQUENTIAL path
+ * (pass_definitions.c), so this exercises the sequential recording branch on
+ * every platform regardless of core count.
+ */
+#include "test_framework.h"
+#include "repro_harness.h" /* RProj, rh_to_fwd_slashes, rh_count_label, rh_cleanup */
+#include
+
+#include
+#include
+#include
+
+/* ── Local helpers ──────────────────────────────────────────────── */
+
+static void ri_write_text(const char *dir, const char *name, const char *content) {
+    char path[700]; /* receives "dir/name" */
+    snprintf(path, sizeof(path), "%s/%s", dir, name);
+    FILE *f = fopen(path, "wb");
+    if (f) { /* when the open fails nothing is written and no error is reported */
+        fputs(content, f);
+        fclose(f);
+    }
+}
+
+/* Write a comment-only python file padded well past `bytes`. The file is never
+ * parsed (the size cap rejects it before extraction), so its content only needs
+ * to make it a discoverable .py source that exceeds the cap.
*/
+static void ri_write_big(const char *dir, const char *name, size_t bytes) {
+    /* One comment line of Python, repeated until the requested size is reached. */
+    static const char pad[] = "# oversized filler line padding this file past the size cap\n";
+    const size_t pad_len = sizeof(pad) - 1; /* length without the trailing NUL */
+    char target[700];
+    snprintf(target, sizeof(target), "%s/%s", dir, name);
+    FILE *out = fopen(target, "wb");
+    if (out == NULL) {
+        return;
+    }
+    /* Whole lines only, so the loop overshoots `bytes` rather than stopping short. */
+    for (size_t total = 0; total < bytes; total += pad_len) {
+        fwrite(pad, 1, pad_len, out);
+    }
+    fclose(out);
+}
+
+/* Read all of `path` into a malloc'd, NUL-terminated buffer owned by the caller.
+ * Returns NULL when the file cannot be opened or sized, or on allocation failure. */
+static char *ri_slurp(const char *path) {
+    FILE *in = fopen(path, "rb");
+    if (in == NULL) {
+        return NULL;
+    }
+    /* Size the file by seeking to its end, then rewind for the read. */
+    (void)fseek(in, 0, SEEK_END);
+    long size = ftell(in);
+    (void)fseek(in, 0, SEEK_SET);
+    char *text = NULL;
+    if (size >= 0) {
+        text = (char *)malloc((size_t)size + 1);
+    }
+    if (text != NULL) {
+        /* Terminate after the bytes actually read, which may be fewer than size. */
+        size_t got = fread(text, 1, (size_t)size, in);
+        text[got] = '\0';
+    }
+    (void)fclose(in);
+    return text;
+}
+
+/* Index the files already written under lp->tmpdir through the production MCP
+ * flow, capturing the raw response. Returns the opened graph store (NULL on
+ * failure). Mirrors repro_harness.h's rh_open_indexed but keeps the response so
+ * we can assert on skipped_count / skipped[] / logfile.
*/
+static cbm_store_t *ri_index_capture(RProj *lp, char **out_resp) {
+    lp->project = cbm_project_name_from_path(lp->tmpdir);
+    if (lp->project == NULL) {
+        return NULL;
+    }
+    /* The DB lives in the cache dir under $HOME (or /tmp when HOME is unset). */
+    const char *root = getenv("HOME");
+    char cache[512];
+    snprintf(cache, sizeof(cache), "%s/.cache/codebase-memory-mcp", root ? root : "/tmp");
+    cbm_mkdir(cache);
+    snprintf(lp->dbpath, sizeof(lp->dbpath), "%s/%s.db", cache, lp->project);
+    unlink(lp->dbpath); /* remove any DB file already at that path */
+
+    lp->srv = cbm_mcp_server_new(NULL);
+    if (lp->srv == NULL) {
+        return NULL;
+    }
+
+    char request[700];
+    snprintf(request, sizeof(request), "{\"repo_path\":\"%s\"}", lp->tmpdir);
+    char *reply = cbm_mcp_handle_tool(lp->srv, "index_repository", request);
+    if (out_resp != NULL) {
+        *out_resp = reply; /* the caller frees it */
+    } else {
+        free(reply); /* nobody wants it; free(NULL) is a no-op */
+    }
+    return cbm_store_open_path(lp->dbpath);
+}
+
+/* ── Tests ──────────────────────────────────────────────────────── */
+
+/* INV(oversized-reported): with the size cap set LOW, indexing a repo that
+ * contains one > cap file plus two good files must:
+ *   - complete with status "indexed" (a skip is a handled outcome, not failure),
+ *   - report skipped_count >= 1 with the big file in skipped[] at phase "oversized",
+ *   - write a per-run logfile (path echoed in the response) that lists the file,
+ *   - and STILL index the two good files (their Function nodes are present).
+ *
+ * Guard property: on the UNWIRED code the big file is silently dropped — there
+ * is no skipped_count / skipped[] / logfile — so every assertion below fails.
+ */
+TEST(index_oversized_file_reported) {
+    RProj lp;
+    memset(&lp, 0, sizeof(lp));
+    snprintf(lp.tmpdir, sizeof(lp.tmpdir), "/tmp/cbm_resil_XXXXXX");
+    if (!cbm_mkdtemp(lp.tmpdir)) {
+        FAIL("mkdtemp failed");
+    }
+    rh_to_fwd_slashes(lp.tmpdir);
+
+    ri_write_text(lp.tmpdir, "good.py", "def alpha():\n return 1\n");
+    ri_write_text(lp.tmpdir, "good.go", "package main\n\nfunc beta() int { return 2 }\n");
+    ri_write_big(lp.tmpdir, "big.py", (size_t)2 * 1024 * 1024); /* ~2 MiB > 1 MiB cap */
+
+    char logpath[700];
+    snprintf(logpath, sizeof(logpath), "%s/skip.log", lp.tmpdir);
+    cbm_setenv("CBM_MAX_FILE_BYTES", "1048576", 1); /* 1 MiB cap */
+    cbm_setenv("CBM_INDEX_LOG", logpath, 1); /* deterministic logfile path */
+
+    char *resp = NULL;
+    cbm_store_t *store = ri_index_capture(&lp, &resp); /* resp is freed by this test */
+
+    /* Unset env IMMEDIATELY (before any assert can bail) so a low cap never
+     * leaks into other tests in this process — cbm_max_file_bytes() reads env
+     * on every file. */
+    cbm_unsetenv("CBM_MAX_FILE_BYTES");
+    cbm_unsetenv("CBM_INDEX_LOG");
+
+    if (!resp) {
+        FAIL("no MCP response");
+    }
+    if (!store) {
+        free(resp);
+        FAIL("store did not open");
+    }
+
+    yyjson_doc *d = yyjson_read(resp, strlen(resp), 0);
+    ASSERT_NOT_NULL(d);
+    yyjson_val *sc = yyjson_obj_get(yyjson_doc_get_root(d), "structuredContent");
+    ASSERT_NOT_NULL(sc);
+
+    /* Status stays "indexed" — the skip is expected + handled. */
+    const char *status = yyjson_get_str(yyjson_obj_get(sc, "status"));
+    ASSERT_NOT_NULL(status);
+    ASSERT_STR_EQ("indexed", status);
+
+    /* At least one skip, surfaced at the top level. */
+    int skipped_count = yyjson_get_int(yyjson_obj_get(sc, "skipped_count"));
+    ASSERT_GTE(skipped_count, 1);
+
+    /* The big file is listed, at phase "oversized". */
+    yyjson_val *skipped = yyjson_obj_get(sc, "skipped");
+    ASSERT_NOT_NULL(skipped);
+    yyjson_val *files = yyjson_obj_get(skipped, "files");
+    ASSERT_NOT_NULL(files);
+    int found_big = 0; /* set to 1 once an entry whose path contains "big.py" is seen */
+    size_t idx = 0;
+    size_t fmax = 0;
+    yyjson_val *fe = NULL; /* current element of skipped.files */
+    yyjson_arr_foreach(files, idx, fmax, fe) {
+        const char *fp = yyjson_get_str(yyjson_obj_get(fe, "path"));
+        const char *phase = yyjson_get_str(yyjson_obj_get(fe, "phase"));
+        if (fp && strstr(fp, "big.py")) {
+            found_big = 1;
+            ASSERT_NOT_NULL(phase);
+            ASSERT_STR_EQ("oversized", phase);
+        }
+    }
+    ASSERT_TRUE(found_big);
+
+    /* A logfile was written, its path echoed, and it lists the skipped file. */
+    const char *logfile = yyjson_get_str(yyjson_obj_get(sc, "logfile"));
+    ASSERT_NOT_NULL(logfile);
+    ASSERT_STR_EQ(logpath, logfile); /* must be the CBM_INDEX_LOG override set above */
+    char *logtext = ri_slurp(logfile);
+    ASSERT_NOT_NULL(logtext);
+    ASSERT_NOT_NULL(strstr(logtext, "big.py"));
+    ASSERT_NOT_NULL(strstr(logtext, "oversized"));
+    free(logtext);
+
+    /* The two good files ARE indexed — the skip did not take them down. */
+    int funcs = rh_count_label(store, lp.project, "Function");
+    ASSERT_GTE(funcs, 2);
+
+    yyjson_doc_free(d);
+    free(resp);
+    rh_cleanup(&lp, store);
+    PASS();
+}
+
+/* INV(clean-run): a run with no failures reports skipped_count == 0 and emits
+ * NO "skipped" object and NO "logfile" (a logfile is written only on skips). */
+TEST(index_clean_run_no_logfile) {
+    RProj lp;
+    memset(&lp, 0, sizeof(lp));
+    snprintf(lp.tmpdir, sizeof(lp.tmpdir), "/tmp/cbm_resil_XXXXXX");
+    if (!cbm_mkdtemp(lp.tmpdir)) {
+        FAIL("mkdtemp failed");
+    }
+    rh_to_fwd_slashes(lp.tmpdir);
+
+    ri_write_text(lp.tmpdir, "good.py", "def alpha():\n return 1\n");
+    ri_write_text(lp.tmpdir, "good.go", "package main\n\nfunc beta() int { return 2 }\n");
+
+    /* Defensive: make sure no stray low cap / log override leaks in. */
+    cbm_unsetenv("CBM_MAX_FILE_BYTES");
+    cbm_unsetenv("CBM_INDEX_LOG");
+
+    char *resp = NULL;
+    cbm_store_t *store = ri_index_capture(&lp, &resp); /* resp is freed by this test */
+    if (!resp) {
+        FAIL("no MCP response");
+    }
+    if (!store) {
+        free(resp);
+        FAIL("store did not open");
+    }
+
+    yyjson_doc *d = yyjson_read(resp, strlen(resp), 0);
+    ASSERT_NOT_NULL(d);
+    yyjson_val *sc = yyjson_obj_get(yyjson_doc_get_root(d), "structuredContent");
+    ASSERT_NOT_NULL(sc);
+
+    int skipped_count = yyjson_get_int(yyjson_obj_get(sc, "skipped_count"));
+    ASSERT_EQ(skipped_count, 0);
+    ASSERT_NULL(yyjson_obj_get(sc, "skipped")); /* the key must be absent on a clean run */
+    ASSERT_NULL(yyjson_obj_get(sc, "logfile")); /* likewise: no logfile key without skips */
+
+    int funcs = rh_count_label(store, lp.project, "Function");
+    ASSERT_GTE(funcs, 2);
+
+    yyjson_doc_free(d);
+    free(resp);
+    rh_cleanup(&lp, store);
+    PASS();
+}
+
+SUITE(index_resilience) {
+    RUN_TEST(index_oversized_file_reported);
+    RUN_TEST(index_clean_run_no_logfile);
+}
diff --git a/tests/test_main.c b/tests/test_main.c
index d0cca85d7..67d3d81b3 100644
--- a/tests/test_main.c
+++ b/tests/test_main.c
@@ -63,6 +63,7 @@ extern void suite_discover(void);
 extern void suite_graph_buffer(void);
 extern void suite_registry(void);
 extern void suite_pipeline(void);
+extern void suite_index_resilience(void);
 extern void suite_fqn(void);
 extern void suite_route_canon(void);
 extern void suite_path_alias(void);
@@ -183,6 +184,7 @@ int main(int argc, char **argv) {
     /* Pipeline (M8) */
     RUN_SELECTED_SUITE(registry);
     RUN_SELECTED_SUITE(pipeline);
+    RUN_SELECTED_SUITE(index_resilience);
     RUN_SELECTED_SUITE(fqn);
     RUN_SELECTED_SUITE(route_canon);
     RUN_SELECTED_SUITE(path_alias);