Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 44 additions & 14 deletions apps/elf-eval/src/bin/live_baseline_elf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,11 @@ struct CheckSummary {
total: usize,
pass: usize,
fail: usize,
wrong_result: usize,
lifecycle_fail: usize,
incomplete: usize,
blocked: usize,
not_encoded: usize,
}

#[derive(Debug, Serialize)]
Expand Down Expand Up @@ -625,7 +629,7 @@ fn retrieval_check(query_results: &[QueryResult]) -> CheckResult {

CheckResult {
name: "same_corpus_retrieval",
status: if fail_count == 0 { "pass" } else { "fail" },
status: if fail_count == 0 { "pass" } else { "wrong_result" },
reason: if fail_count == 0 {
"All same-corpus retrieval queries returned expected evidence.".to_string()
} else {
Expand All @@ -648,7 +652,7 @@ fn worker_indexing_check(evidence: WorkerRunEvidence) -> CheckResult {

CheckResult {
name: "async_worker_indexing_e2e",
status: if pass { "pass" } else { "fail" },
status: if pass { "pass" } else { "lifecycle_fail" },
reason: if pass {
"ELF worker processed corpus outbox jobs into persisted chunks and embeddings."
.to_string()
Expand All @@ -671,7 +675,7 @@ fn resumable_backfill_check(report: &BackfillReport) -> CheckResult {

CheckResult {
name: "resumable_backfill_no_duplicates",
status: if pass { "pass" } else { "fail" },
status: if pass { "pass" } else { "lifecycle_fail" },
reason: if pass {
"Checkpointed backfill resumed from durable progress and did not duplicate source documents."
.to_string()
Expand Down Expand Up @@ -1033,7 +1037,7 @@ fn resource_envelope_check(elapsed_seconds: f64) -> CheckResult {

CheckResult {
name: "resource_envelope",
status: if pass { "pass" } else { "fail" },
status: if pass { "pass" } else { "lifecycle_fail" },
reason: if pass {
"ELF live-baseline runtime stayed within the configured local resource envelope."
.to_string()
Expand Down Expand Up @@ -1070,11 +1074,34 @@ fn incomplete_check(name: &'static str, reason: &str) -> CheckResult {
}

/// Tallies `checks` by their `status` string into a [`CheckSummary`].
///
/// `total` is the number of checks supplied. `fail` is derived rather than
/// counted: it is the sum of the typed failure buckets `wrong_result` and
/// `lifecycle_fail`, so a bare `"fail"` status does not contribute to it.
/// A status that matches none of the tallied strings only shows up in
/// `total`.
fn summarize_checks(checks: &[CheckResult]) -> CheckSummary {
    // One place for the "how many checks carry this status" question, so each
    // bucket below stays a single readable expression.
    let count_status =
        |status: &str| checks.iter().filter(|check| check.status == status).count();

    let wrong_result = count_status("wrong_result");
    let lifecycle_fail = count_status("lifecycle_fail");

    CheckSummary {
        total: checks.len(),
        pass: count_status("pass"),
        // Initialised exactly once: aggregate failures are the typed failures.
        fail: wrong_result + lifecycle_fail,
        wrong_result,
        lifecycle_fail,
        incomplete: count_status("incomplete"),
        blocked: count_status("blocked"),
        not_encoded: count_status("not_encoded"),
    }
}

/// Collapses a [`CheckSummary`] into a single project-level status label.
///
/// The buckets are consulted in a fixed precedence order and the first one
/// with a non-zero count wins: `wrong_result`, `lifecycle_fail`, `blocked`,
/// `incomplete`, `not_encoded`. When all five counts are zero the result is
/// `"pass"`.
fn project_status_from_summary(summary: &CheckSummary) -> &'static str {
    // Highest-precedence bucket first; the scan below stops at the first hit.
    let precedence: [(usize, &'static str); 5] = [
        (summary.wrong_result, "wrong_result"),
        (summary.lifecycle_fail, "lifecycle_fail"),
        (summary.blocked, "blocked"),
        (summary.incomplete, "incomplete"),
        (summary.not_encoded, "not_encoded"),
    ];

    precedence
        .iter()
        .find(|&&(count, _)| count > 0)
        .map(|&(_, label)| label)
        .unwrap_or("pass")
}

Expand Down Expand Up @@ -1571,15 +1598,18 @@ async fn run(args: Args) -> color_eyre::Result<ElfBaselineReport> {
checks.push(resource_envelope_check(started_at.elapsed().as_secs_f64()));

let check_summary = summarize_checks(&checks);
let status =
if check_summary.fail == 0 && check_summary.incomplete == 0 { "pass" } else { "fail" };
let status = project_status_from_summary(&check_summary);
let reason = if status == "pass" {
"ELF added the corpus, rebuilt Qdrant, and returned expected evidence for every query"
.to_string()
} else {
format!(
"ELF failed {} live-baseline check(s) and left {} incomplete check(s)",
check_summary.fail, check_summary.incomplete
"ELF reported {} wrong-result, {} lifecycle-failure, {} blocked, {} incomplete, and {} not-encoded live-baseline check(s)",
check_summary.wrong_result,
check_summary.lifecycle_fail,
check_summary.blocked,
check_summary.incomplete,
check_summary.not_encoded
)
};
let report = ElfBaselineReport {
Expand Down Expand Up @@ -2000,7 +2030,7 @@ async fn run_update_replacement_check(

Ok(CheckResult {
name: "update_replaces_note_text",
status: if update_pass { "pass" } else { "fail" },
status: if update_pass { "pass" } else { "lifecycle_fail" },
reason: if update_pass {
"Service update plus worker indexing returned the new marker and removed the old marker from the top snippet.".to_string()
} else {
Expand Down Expand Up @@ -2047,7 +2077,7 @@ async fn run_delete_suppression_check(

Ok(CheckResult {
name: "delete_suppresses_retrieval",
status: if delete_pass { "pass" } else { "fail" },
status: if delete_pass { "pass" } else { "lifecycle_fail" },
reason: if delete_pass {
"Service delete suppressed the deleted note from subsequent search results.".to_string()
} else {
Expand Down Expand Up @@ -2083,7 +2113,7 @@ async fn run_cold_start_recovery_check(

Ok(CheckResult {
name: "cold_start_recovery_search",
status: if recovery_query.matched { "pass" } else { "fail" },
status: if recovery_query.matched { "pass" } else { "lifecycle_fail" },
reason: if recovery_query.matched {
"A newly constructed service over the same Postgres and Qdrant stores retrieved persisted evidence.".to_string()
} else {
Expand Down Expand Up @@ -2156,7 +2186,7 @@ async fn run_concurrent_write_check(

Ok(CheckResult {
name: "concurrent_write_search_e2e",
status: if pass { "pass" } else { "fail" },
status: if pass { "pass" } else { "lifecycle_fail" },
reason: if pass {
"Concurrent add_note calls were indexed by the worker and remained searchable."
.to_string()
Expand Down Expand Up @@ -2244,7 +2274,7 @@ async fn run_soak_stability_check(

Ok(Some(CheckResult {
name: "soak_stability_e2e",
status: if pass { "pass" } else { "fail" },
status: if pass { "pass" } else { "lifecycle_fail" },
reason: if pass {
"ELF sustained repeated write, worker indexing, and search probes for the configured soak window.".to_string()
} else {
Expand Down
72 changes: 47 additions & 25 deletions docs/guide/benchmarking/2026-06-09-live-baseline-report.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ Verification: Re-run the commands in this report and compare
- ELF passed the production-provider stress run with `Qwen3-Embedding-8B`,
4096-dimensional embeddings, 480 documents, 16 queries, and `8/8` encoded checks.
- In the all-project smoke comparison, ELF and qmd passed every encoded check.
agentmemory passed same-corpus retrieval but failed or could not complete lifecycle
checks. mem0, memsearch, and claude-mem returned wrong same-corpus retrieval results
in the encoded smoke. OpenViking was incomplete because its local embedding dependency
could not complete in the Docker runner.
agentmemory passed same-corpus retrieval but had a typed `lifecycle_fail` on update
replacement and blocked/incomplete durable cold-start coverage in the current mocked
adapter. mem0, memsearch, and claude-mem returned `wrong_result` same-corpus
retrieval results in the encoded smoke. OpenViking was `incomplete` because its local
embedding dependency could not complete in the Docker runner.
- Under the encoded service-style benchmark checks, ELF passed all ELF checks that were
run. Under the encoded local CLI smoke checks, qmd passed all qmd checks that were
run.
Expand Down Expand Up @@ -83,9 +84,9 @@ cargo make baseline-live-docker
| Documents | `3` |
| Queries | `3` |
| Aggregate verdict | `fail` |
| Project summary | `2 pass`, `4 fail`, `1 incomplete` |
| Same-corpus summary | `3 pass`, `3 fail`, `1 incomplete` |
| Full check summary | `17 pass`, `4 fail`, `4 incomplete` |
| Project summary | `2 pass`, `3 wrong_result`, `1 lifecycle_fail`, `1 incomplete` |
| Same-corpus summary | `3 pass`, `3 wrong_result`, `1 incomplete` |
| Full check summary | `17 pass`, `3 wrong_result`, `1 lifecycle_fail`, `4 incomplete` |

The aggregate verdict is `fail` because the top-level report only passes when every
selected project passes every encoded project check.
Expand All @@ -94,11 +95,23 @@ selected project passes every encoded project check.
| --- | --- | --- | --- | --- | --- |
| ELF | `pass` | `retrieval_pass` | `7/7` | `57s` | Service-backed provider run passed retrieval, worker indexing, lifecycle, recovery, and concurrency checks. |
| qmd | `pass` | `retrieval_pass` | `4/4` | `53s` | Local CLI hybrid retrieval baseline passed retrieval, update, delete, and cold-start checks. |
| agentmemory | `fail` | `retrieval_pass` | `2/4` | `38s` | Retrieval passed, but update replacement failed because the old marker remained searchable; cold-start is incomplete in the current in-memory adapter. |
| memsearch | `fail` | `retrieval_wrong_result` | `2/4` | `169s` | Local search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. |
| mem0 | `fail` | `retrieval_wrong_result` | `2/4` | `41s` | Local add/search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. |
| agentmemory | `lifecycle_fail` | `retrieval_pass` | `2/4` | `38s` | Retrieval passed, but update replacement failed because the old marker remained searchable; durable cold-start is blocked by the current in-memory adapter. |
| memsearch | `wrong_result` | `retrieval_wrong_result` | `2/4` | `169s` | Local search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. |
| mem0 | `wrong_result` | `retrieval_wrong_result` | `2/4` | `41s` | Local add/search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. |
| OpenViking | `incomplete` | `local_embed_install_failed` | `0/1` | `385s` | The local embed install path hit a `llama-cpp-python` build/import failure in Docker, so retrieval was not evaluated. |
| claude-mem | `fail` | `retrieval_wrong_result` | `0/1` | `97s` | Same-corpus repository search ran but did not return expected evidence. |
| claude-mem | `wrong_result` | `retrieval_wrong_result` | `0/1` | `97s` | Same-corpus repository search ran but did not return expected evidence. |

Typed adapter behavior interpretation for this snapshot:

| Project | Storage | Retrieval | Update | Delete/Expire | Cold Start | Scale/Stress |
| --- | --- | --- | --- | --- | --- | --- |
| ELF | `real` | `real` | `real` | `real` | `real` | `real` |
| qmd | `real` | `real` | `real` | `real` | `real` | `real path via ELF_BASELINE_PROJECTS=qmd and scale/stress profiles` |
| agentmemory | `mocked` | `mocked` | `mocked` | `mocked` | `blocked` | `incomplete` |
| memsearch | `real` | `real` | `real` | `real` | `real` | `incomplete` |
| mem0 | `real` | `real` | `real` | `real` | `real` | `incomplete` |
| OpenViking | `incomplete` | `incomplete` | `not_encoded` | `not_encoded` | `not_encoded` | `blocked` |
| claude-mem | `mocked` | `mocked` | `not_encoded` | `not_encoded` | `not_encoded` | `incomplete` |

Re-run command:

Expand All @@ -114,18 +127,24 @@ ELF_BASELINE_ELF_EMBEDDING_MODE=provider \
cargo make baseline-live-docker
```

## Pass, Fail, And Incomplete Rules
## Result Semantics

- `pass`: the project installed and every encoded retrieval, lifecycle, recovery, and
resource check for the selected corpus profile passed.
- `fail`: clone, install, import, build, retrieval, update, delete, recovery,
concurrency, soak, resource-envelope, or another declared project check failed.
- `incomplete`: the project partially ran, but the encoded check could not be completed
without extra provider keys, host integration, native dependency support, durable
runtime wiring, or a project-specific command mapping not yet encoded in the runner.

`incomplete` is not a pass. It means the benchmark needs more wiring before making a
quality claim for that project.
- `wrong_result`: a retrieval check completed but returned the wrong memory or missed
expected evidence.
- `lifecycle_fail`: same-corpus retrieval may pass, but an encoded update, delete,
cold-start, persistence, or related lifecycle check failed.
- `incomplete`: setup or a declared check could not complete because install, runtime,
dependency, or adapter wiring failed in Docker.
- `blocked`: a safe check cannot run without external credentials, manual setup,
durable runtime wiring, or host integration outside this run.
- `not_encoded`: the capability is not covered by the current adapter, so no pass/fail
claim is allowed.

`incomplete`, `blocked`, and `not_encoded` are not passes. They mean the benchmark
needs more wiring or runtime support before making a quality claim for that project or
capability.

## Interpretation

Expand All @@ -140,13 +159,16 @@ ELF checks covered in this run:
- worker-produced chunks and embeddings, not direct in-memory fixture shortcuts;
- explicit update, delete, cold-start, concurrency, soak, and resource checks;
- report metadata that records corpus profile, document count, query count, project
status, check summaries, elapsed seconds, and embedding configuration.
status, check summaries, adapter behavior metadata, elapsed seconds, and embedding
configuration.

qmd was the external project that passed every encoded smoke check. agentmemory passed
same-corpus retrieval, failed update replacement, and has incomplete cold-start coverage
because the current adapter uses an in-memory SDK/KV mock. mem0, memsearch, and
claude-mem failed the encoded smoke retrieval. OpenViking was not retrieval-evaluated
because the Docker local embedding install path did not complete.
same-corpus retrieval, failed update replacement, and has blocked durable cold-start
coverage because the current adapter uses an in-memory SDK/KV mock. mem0, memsearch,
and claude-mem returned wrong same-corpus retrieval results. OpenViking was not
retrieval-evaluated because the Docker local embedding install path did not complete;
retry requires a pinned or otherwise Docker-compatible `llama-cpp-python` local
embedding dependency.

## Speed And Production Stance

Expand Down
Loading