Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions Makefile.cbm
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ FOUNDATION_SRCS = \
src/foundation/mem.c \
src/foundation/diagnostics.c \
src/foundation/profile.c \
src/foundation/dump_verify.c
src/foundation/dump_verify.c \
src/foundation/limits.c

# Existing extraction C code (compiled from current location)
EXTRACTION_SRCS = \
Expand Down Expand Up @@ -345,7 +346,7 @@ TEST_DISCOVER_SRCS = \

TEST_GRAPH_BUFFER_SRCS = tests/test_graph_buffer.c

TEST_PIPELINE_SRCS = tests/test_registry.c tests/test_pipeline.c tests/test_fqn.c tests/test_route_canon.c tests/test_path_alias.c tests/test_configlink.c tests/test_infrascan.c tests/test_worker_pool.c tests/test_parallel.c
TEST_PIPELINE_SRCS = tests/test_registry.c tests/test_pipeline.c tests/test_fqn.c tests/test_route_canon.c tests/test_path_alias.c tests/test_configlink.c tests/test_infrascan.c tests/test_worker_pool.c tests/test_parallel.c tests/test_index_resilience.c

TEST_WATCHER_SRCS = tests/test_watcher.c

Expand Down Expand Up @@ -567,7 +568,7 @@ $(UNIXCODER_OBJ): $(UNIXCODER_BLOB_SRC) vendored/nomic/code_vectors.bin | $(BUIL
OBJS_VENDORED_TEST = $(MIMALLOC_OBJ_TEST) $(SQLITE3_OBJ_TEST) $(TRE_OBJ_TEST) $(GRAMMAR_OBJS_TEST) $(TS_RUNTIME_OBJ_TEST) $(LSP_OBJ_TEST) $(PP_OBJ_TEST) $(LZ4_OBJ_TEST) $(ZSTD_OBJ_TEST) $(UNIXCODER_OBJ)

$(BUILD_DIR)/test-runner: $(ALL_TEST_SRCS) $(PROD_SRCS) $(EXTRACTION_SRCS) $(AC_LZ4_SRCS) $(ZSTD_SRCS) $(SQLITE_WRITER_SRC) $(OBJS_VENDORED_TEST) | $(BUILD_DIR)
$(CC) $(CFLAGS_TEST) -o $@ \
$(CC) $(CFLAGS_TEST) -Itests -Itests/repro -o $@ \
$(ALL_TEST_SRCS) $(PROD_SRCS) \
$(EXTRACTION_SRCS) $(AC_LZ4_SRCS) $(ZSTD_SRCS) $(SQLITE_WRITER_SRC) \
$(OBJS_VENDORED_TEST) \
Expand Down
26 changes: 26 additions & 0 deletions src/foundation/limits.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
* limits.c — Env-configurable safety limits (Stage 2 / Track B4).
*/
#include "foundation/limits.h"

#include <errno.h>
#include <stdlib.h>

long cbm_max_file_bytes(void) {
    /* Fallback cap: 512 MiB. Far above any real source file, yet still a
     * bound, so a huge vendored blob ends up as a reported "oversized" skip
     * rather than an unbounded read. */
    const long fallback_bytes = 512L * 1024 * 1024;

    /* The override is looked up on every call (no memoization). */
    const char *env_text = getenv("CBM_MAX_FILE_BYTES");
    if (env_text == NULL || env_text[0] == '\0') {
        return fallback_bytes;
    }

    char *tail = NULL;
    errno = 0;
    const long parsed = strtol(env_text, &tail, 10);

    /* Out-of-range input (strtol reports it through errno). */
    if (errno != 0) {
        return fallback_bytes;
    }
    /* No digits consumed, or trailing characters after the number. */
    if (tail == env_text || *tail != '\0') {
        return fallback_bytes;
    }
    /* Zero and negative caps are meaningless — keep the safe default. */
    return parsed > 0 ? parsed : fallback_bytes;
}
33 changes: 33 additions & 0 deletions src/foundation/limits.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* limits.h — Generous, env-configurable safety limits (Stage 2 / Track B4).
*
* Each knob has a generous default. Hitting a limit degrades to a *reported*
* skip (surfaced via MCP/CLI/logfile), never a silent drop and never an
* unbounded read (unbounded just trades a crash for an OOM/hang). Every limit
* is env-overridable so an operator can tune it per-repo without a rebuild.
*/
#ifndef CBM_LIMITS_H
#define CBM_LIMITS_H

/* Outcome of one attempted per-file read. A reader hands this back alongside
 * its NULL/non-NULL buffer so the caller can attribute a skip to the right
 * phase/reason instead of collapsing every failure into "read failed".
 *
 * NOTE(review): in the pass_definitions.c reader the size comes from ftell(),
 * and any size <= 0 maps to CBM_READ_EMPTY. ftell() returns -1 on failure, so
 * a failed size query is classified as "empty" and is not reported — confirm
 * that is intended. */
typedef enum {
CBM_READ_OK = 0, /* file read successfully */
CBM_READ_OPEN_FAIL, /* could not open (missing / permission) */
CBM_READ_EMPTY, /* zero/negative size — benign, nothing to index */
CBM_READ_OVERSIZED, /* size exceeds cbm_max_file_bytes() */
CBM_READ_OOM, /* buffer allocation failed */
} cbm_read_status_t;

/* Maximum size (bytes) of a single source file the indexer will read into
 * memory. Default 512 MiB (raised from the historical 100 MB cap).
 *
 * Override with the CBM_MAX_FILE_BYTES environment variable: a positive,
 * base-10 integer count of bytes with no trailing characters. A value that is
 * empty, unparseable, out of range for long, zero, or negative is ignored and
 * the default is returned.
 *
 * Files larger than the cap are meant to be skipped-and-reported (phase
 * "oversized"), never silently dropped.
 * NOTE(review): the definitions pass records the oversized skip; the readers
 * in pass_calls.c and pass_k8s.c only return NULL when the cap is exceeded —
 * confirm their callers surface that skip as well.
 *
 * The env var is read on each call — this is intentional: read_file() calls it
 * once per file (negligible), and reading fresh means a test / operator can
 * change the cap via setenv without a process restart or a stale memoized copy
 * leaking across runs.
 *
 * Returns the cap in bytes; always > 0. */
long cbm_max_file_bytes(void);

#endif /* CBM_LIMITS_H */
102 changes: 99 additions & 3 deletions src/mcp/mcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -3134,13 +3134,96 @@ static void add_excluded_summary(yyjson_mut_doc *doc, yyjson_mut_val *root, char
yyjson_mut_obj_add_val(doc, root, "excluded", excluded);
}

/* Maximum number of per-file skip entries embedded in the JSON response, to
 * keep the response compact on large repos. Entries beyond the cap are left
 * out of the "files" array, but the JSON still carries the real "count" and a
 * "truncated" flag so nothing is silently hidden. The full, uncapped list is
 * written to the per-run logfile when that file can be created. */
enum { INDEX_SKIPPED_FILE_CAP = 50 };

/* Append the per-file skip summary (Stage 2 / Track B) to the response root.
 *
 * A top-level "skipped_count" is always written (negative counts are clamped
 * to 0) so consumers can rely on the key being present. If errs is non-NULL
 * and count is positive, the following are written as well:
 *   "skipped": {"files":[{path,reason,phase}, ...], "count":N, "truncated":bool}
 * where "files" holds at most INDEX_SKIPPED_FILE_CAP entries and "truncated"
 * is true when count exceeds that cap; and "logfile": "<path>" when a
 * non-empty logfile path is supplied.
 *
 * NULL path/reason/phase fields are emitted as "". Strings are copied into
 * doc, so errs[] and logfile are only borrowed for the duration of the call.
 * The run status is not touched here. */
static void add_skipped_summary(yyjson_mut_doc *doc, yyjson_mut_val *root,
                                const cbm_file_error_t *errs, int count, const char *logfile) {
    int reported = count;
    if (reported < 0) {
        reported = 0;
    }
    yyjson_mut_obj_add_int(doc, root, "skipped_count", reported);

    /* Clean run (or nothing to read from): the counter alone is enough. */
    if (errs == NULL || count <= 0) {
        return;
    }

    yyjson_mut_val *summary = yyjson_mut_obj(doc);
    yyjson_mut_val *entries = yyjson_mut_arr(doc);

    int limit = INDEX_SKIPPED_FILE_CAP;
    if (count < INDEX_SKIPPED_FILE_CAP) {
        limit = count;
    }

    const cbm_file_error_t *stop = errs + limit;
    for (const cbm_file_error_t *e = errs; e != stop; e++) {
        yyjson_mut_val *item = yyjson_mut_obj(doc);
        yyjson_mut_obj_add_strcpy(doc, item, "path", e->path ? e->path : "");
        yyjson_mut_obj_add_strcpy(doc, item, "reason", e->reason ? e->reason : "");
        yyjson_mut_obj_add_strcpy(doc, item, "phase", e->phase ? e->phase : "");
        yyjson_mut_arr_add_val(entries, item);
    }

    yyjson_mut_obj_add_val(doc, summary, "files", entries);
    yyjson_mut_obj_add_int(doc, summary, "count", count);
    yyjson_mut_obj_add_bool(doc, summary, "truncated", count > INDEX_SKIPPED_FILE_CAP);
    yyjson_mut_obj_add_val(doc, root, "skipped", summary);

    if (logfile != NULL && logfile[0] != '\0') {
        yyjson_mut_obj_add_strcpy(doc, root, "logfile", logfile);
    }
}

/* Write the FULL (uncapped) skip list to a per-run logfile — ONLY when >=1 file
 * was skipped (no logfile on a clean run). Location:
 *   $CBM_INDEX_LOG (override) else <cache_dir>/logs/<project>-<epoch>.log
 *
 * File format: three '#'-prefixed header lines, then one tab-separated
 * "phase<TAB>reason<TAB>path" line per skipped file (NULL fields become "").
 *
 * project   project name used in the default filename and header; NULL is
 *           written as "index" in the filename and "" in the header.
 * errs      skip records to dump (borrowed); NULL means nothing to write.
 * count     number of records in errs; <= 0 means nothing to write.
 * out_path  optional buffer that receives the path actually written.
 * out_sz    size of out_path in bytes.
 *
 * Returns true only when the complete report was written and closed without
 * error and, if out_path was supplied, the full path fit into it. Returns
 * false — so the caller never advertises a missing, partial or wrong logfile —
 * when there is nothing to write, the cache dir cannot be resolved, a path
 * would be truncated, the file cannot be opened, or a write/close fails. */
static bool write_skip_logfile(const char *project, const cbm_file_error_t *errs, int count,
                               char *out_path, size_t out_sz) {
    if (!errs || count <= 0) {
        return false;
    }

    char path[CBM_SZ_1K];
    int written = 0;
    const char *override = getenv("CBM_INDEX_LOG");
    if (override && override[0]) {
        written = snprintf(path, sizeof(path), "%s", override);
    } else {
        const char *cdir = cbm_resolve_cache_dir();
        if (!cdir) {
            return false;
        }
        char logdir[CBM_SZ_1K];
        int dir_len = snprintf(logdir, sizeof(logdir), "%s/logs", cdir);
        if (dir_len < 0 || (size_t)dir_len >= sizeof(logdir)) {
            /* A truncated directory would silently point somewhere else. */
            cbm_log_warn("index.logfile_path_too_long", "project", project ? project : "");
            return false;
        }
        /* Best effort: if the directory cannot be created, the open below fails. */
        cbm_mkdir_p(logdir, 0755);
        written = snprintf(path, sizeof(path), "%s/%s-%lld.log", logdir,
                           project ? project : "index", (long long)time(NULL));
    }
    if (written < 0 || (size_t)written >= sizeof(path)) {
        /* Never open (or report) a path that lost its tail to truncation. */
        cbm_log_warn("index.logfile_path_too_long", "project", project ? project : "");
        return false;
    }

    FILE *f = cbm_fopen(path, "wb");
    if (!f) {
        cbm_log_warn("index.logfile_open_fail", "path", path);
        return false;
    }
    (void)fprintf(f, "# codebase-memory-mcp index skip report\n");
    (void)fprintf(f, "# project=%s skipped=%d\n", project ? project : "", count);
    (void)fprintf(f, "# columns: phase\treason\tpath\n");
    for (int i = 0; i < count; i++) {
        (void)fprintf(f, "%s\t%s\t%s\n", errs[i].phase ? errs[i].phase : "",
                      errs[i].reason ? errs[i].reason : "", errs[i].path ? errs[i].path : "");
    }

    /* The stream error flag is sticky, so one check covers every fprintf
     * above; fclose() reports failures of the final flush (e.g. disk full). */
    bool write_failed = ferror(f) != 0;
    if (fclose(f) != 0) {
        write_failed = true;
    }
    if (write_failed) {
        cbm_log_warn("index.logfile_write_fail", "path", path);
        return false;
    }

    if (out_path && out_sz) {
        int copied = snprintf(out_path, out_sz, "%s", path);
        if (copied < 0 || (size_t)copied >= out_sz) {
            /* Caller's buffer is too small: report no path rather than a wrong one. */
            out_path[0] = '\0';
            cbm_log_warn("index.logfile_path_too_long", "path", path);
            return false;
        }
    }
    return true;
}

/* Build the success portion of the index_repository response.
* Returns true when status should be "degraded" (#334 plausibility gate). */
static bool build_index_success_response(cbm_mcp_server_t *srv, yyjson_mut_doc *doc,
yyjson_mut_val *root, const char *project_name,
const char *repo_path, bool persistence, cbm_pipeline_t *p,
char **excluded_dirs, int excluded_count) {
char **excluded_dirs, int excluded_count,
const cbm_file_error_t *file_errors, int file_error_count,
const char *logfile) {
add_excluded_summary(doc, root, excluded_dirs, excluded_count);
add_skipped_summary(doc, root, file_errors, file_error_count, logfile);

int exp_nodes = -1;
int exp_edges = -1;
Expand Down Expand Up @@ -3302,6 +3385,12 @@ static char *handle_index_repository(cbm_mcp_server_t *srv, const char *args) {
int excluded_count = 0;
cbm_pipeline_get_excluded(p, &excluded_dirs, &excluded_count);

/* Capture the per-file skip list (Stage 2 / Track B) while the pipeline
* still owns the strings; the response builder copies them into the doc. */
cbm_file_error_t *file_errors = NULL;
int file_error_count = 0;
cbm_pipeline_get_file_errors(p, &file_errors, &file_error_count);

cbm_mem_collect(); /* return mimalloc pages to OS after large indexing */

/* Invalidate cached store so next query reopens the fresh database */
Expand All @@ -3319,8 +3408,15 @@ static char *handle_index_repository(cbm_mcp_server_t *srv, const char *args) {
yyjson_mut_obj_add_str(doc, root, "project", project_name);

if (rc == 0) {
bool degraded = build_index_success_response(srv, doc, root, project_name, repo_path,
persistence, p, excluded_dirs, excluded_count);
/* Write the per-run logfile ONLY when there were skips (no logfile on a
* clean run). The FULL list goes to the file; the JSON caps at 50. */
char logfile_path[CBM_SZ_1K];
logfile_path[0] = '\0';
bool has_logfile = write_skip_logfile(project_name, file_errors, file_error_count,
logfile_path, sizeof(logfile_path));
bool degraded = build_index_success_response(
srv, doc, root, project_name, repo_path, persistence, p, excluded_dirs, excluded_count,
file_errors, file_error_count, has_logfile ? logfile_path : NULL);
yyjson_mut_obj_add_str(doc, root, "status", degraded ? "degraded" : "indexed");
} else {
yyjson_mut_obj_add_str(doc, root, "status", "error");
Expand Down
3 changes: 2 additions & 1 deletion src/pipeline/pass_calls.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ enum { PC_RING = 4, PC_RING_MASK = 3, PC_SIG_SCAN = 15, PC_REGEX_GRP = 2 };
#include "foundation/log.h"
#include "foundation/compat.h"
#include "foundation/compat_fs.h"
#include "foundation/limits.h"
#include "foundation/str_util.h"
#include "cbm.h"
#include "service_patterns.h"
Expand Down Expand Up @@ -53,7 +54,7 @@ static char *read_file(const char *path, int *out_len) {
long size = ftell(f);
(void)fseek(f, 0, SEEK_SET);

if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) {
if (size <= 0 || size > cbm_max_file_bytes()) { /* generous, env-configurable cap (B4) */
(void)fclose(f);
return NULL;
}
Expand Down
68 changes: 63 additions & 5 deletions src/pipeline/pass_definitions.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ enum { PD_JSON_FIELD_OVERHEAD = 6 };
#include "foundation/log.h"
#include "foundation/compat.h"
#include "foundation/compat_fs.h"
#include "foundation/limits.h"
#include "cbm.h"
#include "simhash/minhash.h"
#include "semantic/ast_profile.h"
Expand All @@ -32,20 +33,45 @@ enum { PD_JSON_FIELD_OVERHEAD = 6 };
#include <string.h>

/* Read entire file into heap-allocated buffer. Returns NULL on error.
* Caller must free(). Sets *out_len to byte count. */
static char *read_file(const char *path, int *out_len) {
* Caller must free(). Sets *out_len to byte count. *out_size receives the
* on-disk size and *out_status the failure reason, so the caller can attribute
* a skip to the right phase/reason (read vs oversized) instead of a silent
* drop. Both out params may be NULL. */
static char *read_file(const char *path, int *out_len, long *out_size,
cbm_read_status_t *out_status) {
if (out_size) {
*out_size = 0;
}
if (out_status) {
*out_status = CBM_READ_OK;
}
FILE *f = cbm_fopen(path, "rb");
if (!f) {
if (out_status) {
*out_status = CBM_READ_OPEN_FAIL;
}
return NULL;
}

(void)fseek(f, 0, SEEK_END);
long size = ftell(f);
(void)fseek(f, 0, SEEK_SET);
if (out_size) {
*out_size = size;
}

if (size <= 0 ||
size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) { /* CBM_PERCENT MB sanity limit */
if (size <= 0) {
(void)fclose(f);
if (out_status) {
*out_status = CBM_READ_EMPTY;
}
return NULL;
}
if (size > cbm_max_file_bytes()) { /* generous, env-configurable cap (B4) */
(void)fclose(f);
if (out_status) {
*out_status = CBM_READ_OVERSIZED;
}
return NULL;
}

Expand All @@ -57,6 +83,9 @@ static char *read_file(const char *path, int *out_len) {
char *buf = malloc((size_t)size + CBM_TS_LOOKAHEAD_PAD);
if (!buf) {
(void)fclose(f);
if (out_status) {
*out_status = CBM_READ_OOM;
}
return NULL;
}

Expand Down Expand Up @@ -487,9 +516,27 @@ int cbm_pipeline_pass_definitions(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t

/* Read source file */
int source_len = 0;
char *source = read_file(path, &source_len);
long file_size = 0;
cbm_read_status_t rst = CBM_READ_OK;
char *source = read_file(path, &source_len, &file_size, &rst);
if (!source) {
errors++;
if (rst == CBM_READ_OVERSIZED) {
/* Never a silent drop: record the oversized skip + WARN so the
* file surfaces in the response/logfile with its sizes. */
long cap = cbm_max_file_bytes();
char reason[96];
snprintf(reason, sizeof(reason), "oversized (%lld MB > %lld MB)",
(long long)(file_size / (CBM_SZ_1K * CBM_SZ_1K)),
(long long)(cap / (CBM_SZ_1K * CBM_SZ_1K)));
cbm_pipeline_add_file_error(ctx->pipeline, rel, reason, "oversized");
cbm_log_warn("index.file_oversized", "path", rel, "size_mb",
itoa_log((int)(file_size / (CBM_SZ_1K * CBM_SZ_1K))), "cap_mb",
itoa_log((int)(cap / (CBM_SZ_1K * CBM_SZ_1K))));
} else if (rst == CBM_READ_OPEN_FAIL || rst == CBM_READ_OOM) {
cbm_pipeline_add_file_error(ctx->pipeline, rel, "read failed", "read");
}
/* CBM_READ_EMPTY: benign 0-byte file — nothing to index, not reported. */
continue;
}

Expand All @@ -502,8 +549,19 @@ int cbm_pipeline_pass_definitions(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t

if (!result) {
errors++;
cbm_pipeline_add_file_error(ctx->pipeline, rel, "extract failed", "extract");
continue;
}
/* Consume the previously-ignored has_error flag: a parse timeout /
* parse failure / unsupported-grammar result carries no defs but must
* still be reported (phase "extract", reason = the extractor's message).
* The empty result flows through unchanged (the defs loop is a no-op). */
if (result->has_error) {
cbm_pipeline_add_file_error(ctx->pipeline, rel,
result->error_msg ? result->error_msg : "extract failed",
"extract");
errors++;
}

/* Create nodes for each definition */
for (int d = 0; d < result->defs.count; d++) {
Expand Down
3 changes: 2 additions & 1 deletion src/pipeline/pass_k8s.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "foundation/log.h"
#include "foundation/compat.h"
#include "foundation/compat_fs.h"
#include "foundation/limits.h"
#include "cbm.h"

#include <stdlib.h>
Expand All @@ -40,7 +41,7 @@ static char *k8s_read_file(const char *path, int *out_len) {
long size = ftell(f);
(void)fseek(f, 0, SEEK_SET);

if (size <= 0 || size > (long)CBM_PERCENT * CBM_SZ_1K * CBM_SZ_1K) {
if (size <= 0 || size > cbm_max_file_bytes()) { /* generous, env-configurable cap (B4) */
(void)fclose(f);
return NULL;
}
Expand Down
Loading
Loading