diff --git a/Makefile.toml b/Makefile.toml index e6987085..68d657ad 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -316,7 +316,7 @@ workspace = false command = "bash" args = [ "-lc", - "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"${ELF_BASELINE_PROJECTS:-ELF}\"; export ELF_BASELINE_PROFILE=\"${ELF_BASELINE_PROFILE:-backfill}\"; export ELF_BASELINE_BACKFILL_DOCS=\"${ELF_BASELINE_BACKFILL_DOCS:-2000}\"; export ELF_BASELINE_ELF_TIMEOUT_SECONDS=\"${ELF_BASELINE_ELF_TIMEOUT_SECONDS:-3600}\"; export ELF_BASELINE_MAX_ELF_SECONDS=\"${ELF_BASELINE_MAX_ELF_SECONDS:-3600}\"; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", + "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; selected_projects=\"$(printenv ELF_BASELINE_PROJECTS || true)\"; if [ -z \"$selected_projects\" ]; then selected_projects=\"ELF\"; fi; selected_profile=\"$(printenv ELF_BASELINE_PROFILE || true)\"; if [ -z \"$selected_profile\" ]; then selected_profile=\"backfill\"; fi; backfill_docs=\"$(printenv ELF_BASELINE_BACKFILL_DOCS || true)\"; if [ -z \"$backfill_docs\" ]; then backfill_docs=\"2000\"; fi; elf_timeout=\"$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)\"; if [ -z \"$elf_timeout\" ]; then elf_timeout=\"3600\"; fi; max_elf_seconds=\"$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)\"; if [ -z \"$max_elf_seconds\" ]; then max_elf_seconds=\"3600\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"$selected_projects\"; export ELF_BASELINE_PROFILE=\"$selected_profile\"; export ELF_BASELINE_BACKFILL_DOCS=\"$backfill_docs\"; export ELF_BASELINE_ELF_TIMEOUT_SECONDS=\"$elf_timeout\"; export ELF_BASELINE_MAX_ELF_SECONDS=\"$max_elf_seconds\"; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", ] 
[tasks.baseline-live-report] diff --git a/README.md b/README.md index 182ac2b5..0fb0a90f 100644 --- a/README.md +++ b/README.md @@ -120,18 +120,29 @@ flowchart TB ### Checked-In Live Benchmark Snapshot -The June 9, 2026 Docker-only live baseline uses the same generated corpus and query -manifest across ELF and the external memory projects below. ELF was run with the -production embedding provider path, `Qwen3-Embedding-8B`, and 4096-dimensional -embeddings. - -- ELF production-provider stress run: 480 documents, 16 queries, `8/8` encoded checks, - `retrieval_pass`, and `pass` in 1163 seconds. -- All-project smoke run: ELF and qmd passed every encoded check. agentmemory passed - same-corpus retrieval but failed or could not complete lifecycle checks. mem0, - memsearch, and claude-mem returned wrong same-corpus retrieval results in the encoded - smoke. OpenViking was `incomplete` because its local embedding dependency could not - complete in the Docker runner. +The June 9, 2026 Docker-only live baseline and production adoption gate use generated +corpus/query manifests across ELF and the external memory projects below. ELF was run +with the production embedding provider path, `Qwen3-Embedding-8B`, and +4096-dimensional embeddings where provider-backed ELF evidence was required. + +- Production adoption gate verdict: ELF is ready for personal production use with + bounded caveats. The private production corpus profile was not run because no + operator-owned private manifest was available; the task failed closed at the missing + manifest guard, so no private-corpus pass is claimed. +- ELF production-provider synthetic run: 8 documents, 6 queries, `8/8` encoded checks, + `retrieval_pass`, and `pass` in 59 seconds. +- ELF production-provider stress run: 480 documents, 16 queries, `9/9` encoded checks, + `retrieval_pass`, and `pass` in 779 seconds. 
+- ELF production-provider backfill run: 2,000 documents, 16 queries, `9/9` encoded + checks, resume from 1,000 to 2,000 imported documents, zero duplicate source notes, + and `pass` in 2,804 seconds. +- Single-user production restore proof: Docker Compose backup/restore plus Qdrant + rebuild returned `rebuilt_count=1`, `missing_vector_count=0`, `error_count=0`, and + search recovered the restored note. +- Fresh all-project smoke run: ELF and qmd passed every encoded check. agentmemory + passed same-corpus retrieval but failed lifecycle/cold-start coverage. memsearch, + mem0, OpenViking, and claude-mem remained `incomplete` or wrong-result typed states; + those states are reported as limitations, not hidden as proof. - The benchmark runner and report publisher are checked in and Docker-isolated: `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`, `cargo make baseline-live-report`, and `cargo make baseline-live-docker-clean`. @@ -140,6 +151,7 @@ Detailed evidence and interpretation: - [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md) - [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md) +- [Production Adoption Gate Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/guide/single_user_production.md) @@ -185,6 +197,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md) - [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md) +- [Production Adoption Gate Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md) - [Live 
Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md) - [Detailed External Comparison](docs/guide/research/comparison_external_projects.md) diff --git a/build.rs b/build.rs index b5060b99..d37f7bdc 100644 --- a/build.rs +++ b/build.rs @@ -7,6 +7,8 @@ use vergen_gitcl::{Cargo, Emitter, Gitcl}; fn main() -> Result<(), Box<dyn Error>> { let mut emitter = Emitter::default(); + println!("cargo:rustc-env=VERGEN_GIT_SHA=unknown"); + emitter.add_instructions(&Cargo::builder().target_triple(true).build())?; // Disable the git version if installed from <crates.io>. diff --git a/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md b/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md new file mode 100644 index 00000000..f8bfb7be --- /dev/null +++ b/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md @@ -0,0 +1,272 @@ +# Production Adoption Gate Report - June 9, 2026 + +Goal: Record the XY-836 full comparison gate and personal production adoption decision. +Read this when: You need the fresh evidence behind the June 9, 2026 ELF production +adoption claim. +Inputs: P0 benchmark and runbook PRs, live Docker benchmark reports, provider-backed +benchmark runs, and the single-user restore proof. +Depends on: `live_baseline_benchmark.md`, `single_user_production.md`, +`comparison_external_projects.md`, `research_projects_inventory.md`, and +`Makefile.toml`. +Outputs: Production adoption verdict, exact benchmark commands, run ids, limitations, +and README-level claim boundaries. + +## Decision + +ELF is ready for personal production use with bounded caveats. 
+ +The gate supports use as a single-user, self-hosted memory service when operated through +the checked-in Docker Compose production runbook, with backups enabled, Qdrant treated as +rebuildable, and retrieval debugging done through search traces and viewer/admin trace +surfaces rather than raw SQL. + +The caveats are material: + +- No private production corpus manifest was available in this lane. The + `baseline-production-private` task failed closed at its manifest guard, so this report + does not claim a private-corpus pass. +- External comparison remains an objective adapter matrix, not an overall superiority + claim. qmd and ELF passed the encoded smoke checks; agentmemory, memsearch, mem0, + OpenViking, and claude-mem retained typed failures or incomplete states. +- The 2,000-document provider backfill passed but took 2,804 seconds end to end. Large + imports should be planned as batch jobs, not interactive operations. + +Because the private-corpus criterion allows an explicitly bounded result, this gate does +not create a new P0 blocker. If private-corpus proof is required before a specific +deployment, supply `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` and rerun +`cargo make baseline-production-private` before relying on private retrieval quality. + +## P0 Inputs + +The current branch is based on the post-observability mainline. The named P0 lanes were +merged before this gate: + +| Issue | PR | Evidence read | +| --- | --- | --- | +| `XY-819` | `#126` | Single-user production backup and restore runbook. | +| `XY-818` | `#127` | Private production corpus benchmark task and manifest guard. | +| `XY-817` | `#128` | Resumable batch ingest and backfill benchmark. | +| `XY-820` | `#130` | Typed lifecycle and adapter failure states. | +| `XY-825` | `#131` | Additional single-user restore and Qdrant rebuild proof. | +| `XY-27` | `#132` | Retrieval observability panels and trace candidate precision repair. 
| + +## Fresh Commands + +Provider credentials were loaded from an untracked local environment file. Secret values +were not printed or committed. The command forms below assume equivalent provider +environment variables are present in the shell. + +Private manifest guard: + +```sh +cargo make baseline-production-private +``` + +Result: failed closed before the benchmark runner because +`ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` was not set. + +Production-synthetic provider run: + +```sh +set -a +source .env +set +a +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +EMBEDDING_TIMEOUT_MS=30000 \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +ELF_BASELINE_PROJECTS=ELF \ +ELF_BASELINE_MAX_ELF_SECONDS=1200 \ +cargo make baseline-production-synthetic +``` + +All-project smoke provider run: + +```sh +set -a +source .env +set +a +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +EMBEDDING_TIMEOUT_MS=30000 \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +ELF_BASELINE_PROFILE=smoke \ +cargo make baseline-live-docker +``` + +ELF provider stress run: + +```sh +set -a +source .env +set +a +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +EMBEDDING_TIMEOUT_MS=30000 \ +ELF_BASELINE_PROJECTS=ELF \ +ELF_BASELINE_PROFILE=stress \ +ELF_BASELINE_MAX_ELF_SECONDS=1800 \ +ELF_BASELINE_ELF_TIMEOUT_SECONDS=1800 \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +cargo make baseline-live-docker +``` + +ELF provider backfill run: + +```sh +set -a +source .env +set +a +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +EMBEDDING_TIMEOUT_MS=30000 \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +ELF_BASELINE_ELF_TIMEOUT_SECONDS=3600 \ +ELF_BASELINE_MAX_ELF_SECONDS=3600 \ +cargo make baseline-backfill-docker +``` + +Single-user restore proof: + +```sh +awk '/^bash <<'\''EOF'\''$/{flag=1; next} flag && /^EOF$/{exit} flag {print}' \ + docs/guide/single_user_production.md \ + | perl -0pe 
's#tmp/single-user-restore-proof#tmp/xy836-single-user-restore-proof#g; s/51988/52988/g; s/51989/52989/g; s/51990/52990/g; s/51991/52991/g; s/51992/52992/g; s/51993/52993/g; s/elf-restore-proof/elf-xy836-restore-proof/g' \ + > tmp/xy836-restore-proof.sh +bash tmp/xy836-restore-proof.sh +``` + +The proof used alternate local ports because the default proof port range was occupied +on this machine. + +## ELF Evidence + +All provider-backed ELF runs used: + +- Provider id: `provider` +- Embedding model: `Qwen3-Embedding-8B` +- Embedding dimensions: `4096` +- Timeout: `30000` ms +- API path: `/embeddings` + +| Run | Profile | Corpus | Status | Checks | Retrieval | Elapsed | Query result | Backfill and resume | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| `live-baseline-20260609083644` | `production-synthetic` | `synthetic-coding-agent-prod-corpus-2026-06-09`, 8 docs, 6 queries | `pass` | `8/8` | `retrieval_pass` | 59 s | 6/6 pass, mean 937.120 ms | 8/8 completed in 8.134 s, resume 4 -> 8, 0 duplicates | +| `live-baseline-20260609090719` | `stress` | generated public, 480 docs, 16 queries | `pass` | `9/9` | `retrieval_pass` | 779 s | 16/16 pass, mean 1128.144 ms | 480/480 completed in 508.835 s, resume 240 -> 480, 0 duplicates | +| `live-baseline-20260609092144` | `backfill` | generated public, 2000 docs, 16 queries | `pass` | `9/9` | `retrieval_pass` | 2804 s | 16/16 pass, mean 1214.454 ms | 2000/2000 completed in 2061.396 s, resume 1000 -> 2000, 0 duplicates | + +The 2,000-document backfill also passed: + +- `resumable_backfill_no_duplicates` +- `same_corpus_retrieval` +- `async_worker_indexing_e2e` +- `update_replaces_note_text` +- `delete_suppresses_retrieval` +- `cold_start_recovery_search` +- `concurrent_write_search_e2e` +- `soak_stability_e2e` +- `resource_envelope` + +The resource envelope check measured 2,793.629 seconds against a 3,600-second limit and +167,652 KB RSS against a 1,500,000 KB limit. 
+ +## Recovery Evidence + +The single-user production proof wrote a note, searched it, recreated the Docker +Compose dependency stack from backup, rebuilt Qdrant from Postgres-held vectors, and +searched again. + +| Step | Evidence | +| --- | --- | +| Note ingest | `ADD`, `remember`, note id `bfaa2f40-e076-490e-ae5a-dd88cf6b6179` | +| Search before restore | 1 result, key `single_user_restore_probe`, trace `535e49be-250f-483c-8845-b4116e591dac`, score 1.148 | +| Qdrant rebuild after restore | `rebuilt_count=1`, `missing_vector_count=0`, `error_count=0` | +| Search after restore | 1 result, key `single_user_restore_probe`, trace `e995263d-8f0e-4472-9a32-354d5cceed33`, score 1.1479998 | + +This satisfies the adoption criterion that Postgres backups, restore, and Qdrant rebuild +are tested without treating Qdrant as a source of truth. + +## External Comparison + +Fresh all-project smoke run: `live-baseline-20260609083814`. + +Corpus: generated public smoke, 3 docs, 3 queries. + +Aggregate verdict: `fail`, because the matrix is strict and external adapters retained +typed failures. The strict failure is useful evidence; it prevents hiding incomplete +adapter states. + +Full encoded check summary: 26 total, 16 pass, 3 fail, 2 wrong-result, 1 lifecycle-fail, +2 incomplete, 1 blocked, 4 not encoded. + +| Project | Status | Retrieval | Checks | Elapsed | Storage | Interpretation | +| --- | --- | --- | --- | --- | --- | --- | +| ELF | `pass` | `retrieval_pass` | `8/8` | 33 s | real | Added corpus, rebuilt Qdrant, returned expected evidence, and passed lifecycle checks. | +| qmd | `pass` | `retrieval_pass` | `4/4` | 59 s | real | Passed same-corpus retrieval, update, delete, and cold-start checks through persisted local collection files. | +| agentmemory | `lifecycle_fail` | `retrieval_pass` | `2/4` | 46 s | mocked | Same-corpus retrieval passed, but update left old text searchable and cold-start recovery is blocked by in-memory harness storage. 
| +| memsearch | `incomplete` | `invalid_json_result` | `0/1` | 432 s | real | Command completed but did not produce a valid benchmark result. | +| mem0 | `incomplete` | `invalid_json_result` | `2/4` | 462 s | real | Local FastEmbed/Qdrant search missed expected same-corpus results; delete remains not encoded. | +| OpenViking | `incomplete` | `local_embed_install_failed` | `0/1` | 513 s | incomplete | Local embedding install hit a llama-cpp-python build/import failure, so same-corpus local retrieval could not run. | +| claude-mem | `incomplete` | `invalid_json_result` | `0/4` | 107 s | mocked | Repository search missed expected same-corpus results and lifecycle behaviors remain mostly not encoded. | + +## Observability Evidence + +The gate is based on main after `XY-27`, which added read-only viewer retrieval +observability panels and a precision repair for trace candidate scores. The fresh +benchmark runs returned trace ids for every ELF search, and the search responses include +retrieval trajectory summaries. + +Representative provider stress traces: + +| Query | Trace id | +| --- | --- | +| `q-auth` | `7be1b5ce-3676-4625-8221-dcf0204669bf` | +| `q-auth-alt` | `79585c67-cdb8-46f8-bad1-d277295c1e0f` | +| `q-database` | `0cc7d130-fe51-436e-a5b0-971997ba8cb7` | +| `q-database-alt` | `4ffaf8cd-4b0d-4b3d-8154-56551538e81a` | +| `q-deploy` | `c770346e-d563-4ad0-aae6-f56dff334669` | +| `q-deploy-alt` | `84121528-c038-490b-bbc5-3352bcb9a2f5` | + +Representative restore proof traces: + +- Before restore: `535e49be-250f-483c-8845-b4116e591dac` +- After restore: `e995263d-8f0e-4472-9a32-354d5cceed33` + +This is sufficient for the personal production gate: a wrong result can be debugged via +the returned trace id, trajectory stages, trace bundle/admin endpoints, and the viewer +panels without raw SQL. 
+ +## Adoption Criteria + +| Criterion | Result | Evidence and limitation | +| --- | --- | --- | +| Private production corpus benchmark has a passing or explicitly bounded result. | Bounded caveat | `cargo make baseline-production-private` failed closed because `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` was unset. No private-corpus pass is claimed. | +| Backfill/resume proves predictable large import behavior. | Pass | `live-baseline-20260609092144`: 2000/2000 completed, resume 1000 -> 2000, zero duplicates, resource envelope passed. | +| Docker Compose backup, restore, and Qdrant rebuild are tested. | Pass | Single-user restore proof rebuilt 1 Qdrant point with 0 missing vectors and recovered searchable results. | +| Retrieval observability can debug wrong results without raw SQL. | Pass | `XY-27` landed, trace ids are returned in benchmark and restore runs, and trajectory summaries are present in search responses. | +| External comparison uses typed failure states and does not rely on mocked adapter results as proof. | Pass | `live-baseline-20260609083814` reports real, mocked, blocked, incomplete, wrong-result, and lifecycle-fail states explicitly. | + +## Follow-Up Queue + +No P0 Decodex lane needs to be requeued from this gate. + +Recommended non-blocking follow-ups: + +- Rerun `baseline-production-private` when an operator-owned private manifest is + available, and publish a private-corpus addendum that does not expose private text. +- Keep qmd as the strongest external local baseline for routing/fusion/debuggability + comparison work. +- Treat agentmemory, memsearch, mem0, OpenViking, and claude-mem adapter failures as + typed benchmark improvement opportunities only if external parity coverage remains a + roadmap goal. 
+ +## Runner Repairs Made By This Gate + +Two small runner fixes were required to collect the fresh evidence: + +- `build.rs` now provides a fallback `VERGEN_GIT_SHA=unknown` before vergen emits git + metadata, so Docker benchmark builds work when the copied context is not a usable git + checkout. +- `baseline-backfill-docker` now resolves default environment values inside the shell + instead of relying on `${VAR:-default}` in the `cargo-make` TOML string; this avoids + malformed values such as `-backfill`. diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 8d3f7506..d5921631 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -30,6 +30,9 @@ cleanup, use `docs/guide/single_user_production.md`. 2026 ELF production-provider stress run and all-project smoke comparison. - `2026-06-09-production-corpus-report.md`: checked-in synthetic production-corpus ELF adoption benchmark report with task queries and evidence IDs. +- `2026-06-09-production-adoption-gate-report.md`: XY-836 production adoption + decision report with fresh provider-backed synthetic, stress, backfill, restore, and + external adapter evidence. ## Update Rules