hack-ink · yvette-carlisle · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/apps/elf-eval/src/bin/live_baseline_elf.rs b/apps/elf-eval/src/bin/live_baseline_elf.rs
@@ -271,7 +271,11 @@ struct CheckSummary {
 	total: usize,
 	pass: usize,
 	fail: usize,
+	wrong_result: usize,
+	lifecycle_fail: usize,
 	incomplete: usize,
+	blocked: usize,
+	not_encoded: usize,
 }
 
 #[derive(Debug, Serialize)]
@@ -625,7 +629,7 @@ fn retrieval_check(query_results: &[QueryResult]) -> CheckResult {
 
 	CheckResult {
 		name: "same_corpus_retrieval",
-		status: if fail_count == 0 { "pass" } else { "fail" },
+		status: if fail_count == 0 { "pass" } else { "wrong_result" },
 		reason: if fail_count == 0 {
 			"All same-corpus retrieval queries returned expected evidence.".to_string()
 		} else {
@@ -648,7 +652,7 @@ fn worker_indexing_check(evidence: WorkerRunEvidence) -> CheckResult {
 
 	CheckResult {
 		name: "async_worker_indexing_e2e",
-		status: if pass { "pass" } else { "fail" },
+		status: if pass { "pass" } else { "lifecycle_fail" },
 		reason: if pass {
 			"ELF worker processed corpus outbox jobs into persisted chunks and embeddings."
 				.to_string()
@@ -671,7 +675,7 @@ fn resumable_backfill_check(report: &BackfillReport) -> CheckResult {
 
 	CheckResult {
 		name: "resumable_backfill_no_duplicates",
-		status: if pass { "pass" } else { "fail" },
+		status: if pass { "pass" } else { "lifecycle_fail" },
 		reason: if pass {
 			"Checkpointed backfill resumed from durable progress and did not duplicate source documents."
 				.to_string()
@@ -1033,7 +1037,7 @@ fn resource_envelope_check(elapsed_seconds: f64) -> CheckResult {
 
 	CheckResult {
 		name: "resource_envelope",
-		status: if pass { "pass" } else { "fail" },
+		status: if pass { "pass" } else { "lifecycle_fail" },
 		reason: if pass {
 			"ELF live-baseline runtime stayed within the configured local resource envelope."
 				.to_string()
@@ -1070,11 +1074,34 @@ fn incomplete_check(name: &'static str, reason: &str) -> CheckResult {
 }
 
 fn summarize_checks(checks: &[CheckResult]) -> CheckSummary {
+	let wrong_result = checks.iter().filter(|check| check.status == "wrong_result").count();
+	let lifecycle_fail = checks.iter().filter(|check| check.status == "lifecycle_fail").count();
+
 	CheckSummary {
 		total: checks.len(),
 		pass: checks.iter().filter(|check| check.status == "pass").count(),
-		fail: checks.iter().filter(|check| check.status == "fail").count(),
+		fail: wrong_result + lifecycle_fail,
+		wrong_result,
+		lifecycle_fail,
 		incomplete: checks.iter().filter(|check| check.status == "incomplete").count(),
+		blocked: checks.iter().filter(|check| check.status == "blocked").count(),
+		not_encoded: checks.iter().filter(|check| check.status == "not_encoded").count(),
+	}
+}
+
+fn project_status_from_summary(summary: &CheckSummary) -> &'static str {
+	if summary.wrong_result > 0 {
+		"wrong_result"
+	} else if summary.lifecycle_fail > 0 {
+		"lifecycle_fail"
+	} else if summary.blocked > 0 {
+		"blocked"
+	} else if summary.incomplete > 0 {
+		"incomplete"
+	} else if summary.not_encoded > 0 {
+		"not_encoded"
+	} else {
+		"pass"
 	}
 }
 
@@ -1571,15 +1598,18 @@ async fn run(args: Args) -> color_eyre::Result<ElfBaselineReport> {
 	checks.push(resource_envelope_check(started_at.elapsed().as_secs_f64()));
 
 	let check_summary = summarize_checks(&checks);
-	let status =
-		if check_summary.fail == 0 && check_summary.incomplete == 0 { "pass" } else { "fail" };
+	let status = project_status_from_summary(&check_summary);
 	let reason = if status == "pass" {
 		"ELF added the corpus, rebuilt Qdrant, and returned expected evidence for every query"
 			.to_string()
 	} else {
 		format!(
-			"ELF failed {} live-baseline check(s) and left {} incomplete check(s)",
-			check_summary.fail, check_summary.incomplete
+			"ELF reported {} wrong-result, {} lifecycle-failure, {} blocked, {} incomplete, and {} not-encoded live-baseline check(s)",
+			check_summary.wrong_result,
+			check_summary.lifecycle_fail,
+			check_summary.blocked,
+			check_summary.incomplete,
+			check_summary.not_encoded
 		)
 	};
 	let report = ElfBaselineReport {
@@ -2000,7 +2030,7 @@ async fn run_update_replacement_check(
 
 	Ok(CheckResult {
 		name: "update_replaces_note_text",
-		status: if update_pass { "pass" } else { "fail" },
+		status: if update_pass { "pass" } else { "lifecycle_fail" },
 		reason: if update_pass {
 			"Service update plus worker indexing returned the new marker and removed the old marker from the top snippet.".to_string()
 		} else {
@@ -2047,7 +2077,7 @@ async fn run_delete_suppression_check(
 
 	Ok(CheckResult {
 		name: "delete_suppresses_retrieval",
-		status: if delete_pass { "pass" } else { "fail" },
+		status: if delete_pass { "pass" } else { "lifecycle_fail" },
 		reason: if delete_pass {
 			"Service delete suppressed the deleted note from subsequent search results.".to_string()
 		} else {
@@ -2083,7 +2113,7 @@ async fn run_cold_start_recovery_check(
 
 	Ok(CheckResult {
 		name: "cold_start_recovery_search",
-		status: if recovery_query.matched { "pass" } else { "fail" },
+		status: if recovery_query.matched { "pass" } else { "lifecycle_fail" },
 		reason: if recovery_query.matched {
 			"A newly constructed service over the same Postgres and Qdrant stores retrieved persisted evidence.".to_string()
 		} else {
@@ -2156,7 +2186,7 @@ async fn run_concurrent_write_check(
 
 	Ok(CheckResult {
 		name: "concurrent_write_search_e2e",
-		status: if pass { "pass" } else { "fail" },
+		status: if pass { "pass" } else { "lifecycle_fail" },
 		reason: if pass {
 			"Concurrent add_note calls were indexed by the worker and remained searchable."
 				.to_string()
@@ -2244,7 +2274,7 @@ async fn run_soak_stability_check(
 
 	Ok(Some(CheckResult {
 		name: "soak_stability_e2e",
-		status: if pass { "pass" } else { "fail" },
+		status: if pass { "pass" } else { "lifecycle_fail" },
 		reason: if pass {
 			"ELF sustained repeated write, worker indexing, and search probes for the configured soak window.".to_string()
 		} else {

diff --git a/docs/guide/benchmarking/2026-06-09-live-baseline-report.md b/docs/guide/benchmarking/2026-06-09-live-baseline-report.md
@@ -15,10 +15,11 @@ Verification: Re-run the commands in this report and compare
 - ELF passed the production-provider stress run with `Qwen3-Embedding-8B`,
   4096-dimensional embeddings, 480 documents, 16 queries, and `8/8` encoded checks.
 - In the all-project smoke comparison, ELF and qmd passed every encoded check.
-  agentmemory passed same-corpus retrieval but failed or could not complete lifecycle
-  checks. mem0, memsearch, and claude-mem returned wrong same-corpus retrieval results
-  in the encoded smoke. OpenViking was incomplete because its local embedding dependency
-  could not complete in the Docker runner.
+  agentmemory passed same-corpus retrieval but had a typed `lifecycle_fail` on update
+  replacement and blocked/incomplete durable cold-start coverage in the current mocked
+  adapter. mem0, memsearch, and claude-mem returned `wrong_result` same-corpus
+  retrieval results in the encoded smoke. OpenViking was `incomplete` because its local
+  embedding dependency could not complete in the Docker runner.
 - Under the encoded service-style benchmark checks, ELF passed all ELF checks that were
   run. Under the encoded local CLI smoke checks, qmd passed all qmd checks that were
   run.
@@ -83,9 +84,9 @@ cargo make baseline-live-docker
 | Documents | `3` |
 | Queries | `3` |
 | Aggregate verdict | `fail` |
-| Project summary | `2 pass`, `4 fail`, `1 incomplete` |
-| Same-corpus summary | `3 pass`, `3 fail`, `1 incomplete` |
-| Full check summary | `17 pass`, `4 fail`, `4 incomplete` |
+| Project summary | `2 pass`, `3 wrong_result`, `1 lifecycle_fail`, `1 incomplete` |
+| Same-corpus summary | `3 pass`, `3 wrong_result`, `1 incomplete` |
+| Full check summary | `17 pass`, `3 wrong_result`, `1 lifecycle_fail`, `4 incomplete` |
 
 The aggregate verdict is `fail` because the top-level report only passes when every
 selected project passes every encoded project check.
@@ -94,11 +95,23 @@ selected project passes every encoded project check.
 | --- | --- | --- | --- | --- | --- |
 | ELF | `pass` | `retrieval_pass` | `7/7` | `57s` | Service-backed provider run passed retrieval, worker indexing, lifecycle, recovery, and concurrency checks. |
 | qmd | `pass` | `retrieval_pass` | `4/4` | `53s` | Local CLI hybrid retrieval baseline passed retrieval, update, delete, and cold-start checks. |
-| agentmemory | `fail` | `retrieval_pass` | `2/4` | `38s` | Retrieval passed, but update replacement failed because the old marker remained searchable; cold-start is incomplete in the current in-memory adapter. |
-| memsearch | `fail` | `retrieval_wrong_result` | `2/4` | `169s` | Local search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. |
-| mem0 | `fail` | `retrieval_wrong_result` | `2/4` | `41s` | Local add/search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. |
+| agentmemory | `lifecycle_fail` | `retrieval_pass` | `2/4` | `38s` | Retrieval passed, but update replacement failed because the old marker remained searchable; durable cold-start is blocked by the current in-memory adapter. |
+| memsearch | `wrong_result` | `retrieval_wrong_result` | `2/4` | `169s` | Local search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. |
+| mem0 | `wrong_result` | `retrieval_wrong_result` | `2/4` | `41s` | Local add/search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. |
 | OpenViking | `incomplete` | `local_embed_install_failed` | `0/1` | `385s` | The local embed install path hit a `llama-cpp-python` build/import failure in Docker, so retrieval was not evaluated. |
-| claude-mem | `fail` | `retrieval_wrong_result` | `0/1` | `97s` | Same-corpus repository search ran but did not return expected evidence. |
+| claude-mem | `wrong_result` | `retrieval_wrong_result` | `0/1` | `97s` | Same-corpus repository search ran but did not return expected evidence. |
+
+Typed adapter behavior interpretation for this snapshot:
+
+| Project | Storage | Retrieval | Update | Delete/Expire | Cold Start | Scale/Stress |
+| --- | --- | --- | --- | --- | --- | --- |
+| ELF | `real` | `real` | `real` | `real` | `real` | `real` |
+| qmd | `real` | `real` | `real` | `real` | `real` | `real` (path via `ELF_BASELINE_PROJECTS=qmd` and scale/stress profiles) |
+| agentmemory | `mocked` | `mocked` | `mocked` | `mocked` | `blocked` | `incomplete` |
+| memsearch | `real` | `real` | `real` | `real` | `real` | `incomplete` |
+| mem0 | `real` | `real` | `real` | `real` | `real` | `incomplete` |
+| OpenViking | `incomplete` | `incomplete` | `not_encoded` | `not_encoded` | `not_encoded` | `blocked` |
+| claude-mem | `mocked` | `mocked` | `not_encoded` | `not_encoded` | `not_encoded` | `incomplete` |
 
 Re-run command:
 
@@ -114,18 +127,24 @@ ELF_BASELINE_ELF_EMBEDDING_MODE=provider \
 cargo make baseline-live-docker
 ```
 
-## Pass, Fail, And Incomplete Rules
+## Result Semantics
 
 - `pass`: the project installed and every encoded retrieval, lifecycle, recovery, and
   resource check for the selected corpus profile passed.
-- `fail`: clone, install, import, build, retrieval, update, delete, recovery,
-  concurrency, soak, resource-envelope, or another declared project check failed.
-- `incomplete`: the project partially ran, but the encoded check could not be completed
-  without extra provider keys, host integration, native dependency support, durable
-  runtime wiring, or a project-specific command mapping not yet encoded in the runner.
-
-`incomplete` is not a pass. It means the benchmark needs more wiring before making a
-quality claim for that project.
+- `wrong_result`: a retrieval check completed but returned the wrong memory or missed
+  expected evidence.
+- `lifecycle_fail`: same-corpus retrieval may pass, but an encoded update, delete, cold-start,
+  persistence, worker-indexing, backfill, concurrency, soak, resource-envelope, or related lifecycle check failed.
+- `incomplete`: setup or a declared check could not complete because install, runtime,
+  dependency, or adapter wiring failed in Docker.
+- `blocked`: a safe check cannot run without external credentials, manual setup,
+  durable runtime wiring, or host integration outside this run.
+- `not_encoded`: the capability is not covered by the current adapter, so no pass/fail
+  claim is allowed.
+
+`incomplete`, `blocked`, and `not_encoded` are not passes. They mean the benchmark
+needs more wiring or runtime support before making a quality claim for that project or
+capability.
 
 ## Interpretation
 
@@ -140,13 +159,16 @@ ELF checks covered in this run:
 - worker-produced chunks and embeddings, not direct in-memory fixture shortcuts;
 - explicit update, delete, cold-start, concurrency, soak, and resource checks;
 - report metadata that records corpus profile, document count, query count, project
-  status, check summaries, elapsed seconds, and embedding configuration.
+  status, check summaries, adapter behavior metadata, elapsed seconds, and embedding
+  configuration.
 
 qmd was the external project that passed every encoded smoke check. agentmemory passed
-same-corpus retrieval, failed update replacement, and has incomplete cold-start coverage
-because the current adapter uses an in-memory SDK/KV mock. mem0, memsearch, and
-claude-mem failed the encoded smoke retrieval. OpenViking was not retrieval-evaluated
-because the Docker local embedding install path did not complete.
+same-corpus retrieval and failed update replacement; its durable cold-start coverage is
+blocked because the current adapter uses an in-memory SDK/KV mock. mem0, memsearch,
+and claude-mem returned wrong same-corpus retrieval results. OpenViking was not
+retrieval-evaluated because the Docker local embedding install path did not complete;
+retry requires a pinned or otherwise Docker-compatible `llama-cpp-python` local
+embedding dependency.
 
 ## Speed And Production Stance