From a0c1ca6685480c9f8b71c0fe6b3525f7bc91c14e Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 00:14:03 +0800 Subject: [PATCH 1/7] {"schema":"decodex/commit/1","summary":"Add Letta-style core archival benchmark","authority":"XY-927"} --- Makefile.toml | 52 ++++ README.md | 17 +- .../memory_projects_manifest.json | 88 ++++++- .../archival_fallback.json | 192 +++++++++++++++ .../core_block_attachment.json | 192 +++++++++++++++ .../core_block_provenance.json | 192 +++++++++++++++ .../core_block_scope.json | 192 +++++++++++++++ .../project_decision_recovery.json | 230 ++++++++++++++++++ .../stale_core_detection.json | 206 ++++++++++++++++ .../src/bin/real_world_job_benchmark.rs | 1 + .../tests/real_world_job_benchmark.rs | 178 +++++++++++--- ...-11-competitor-strength-adoption-report.md | 29 ++- .../2026-06-11-measurement-coverage-audit.md | 55 +++-- docs/guide/benchmarking/index.md | 13 +- .../real_world_agent_memory_benchmark.md | 10 +- ...1-competitor-strength-adoption-report.json | 30 ++- ...2026-06-11-measurement-coverage-audit.json | 34 +-- .../real_world_agent_memory_benchmark_v1.md | 1 + 18 files changed, 1590 insertions(+), 122 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json diff --git a/Makefile.toml b/Makefile.toml index 42b2033c..33dc2044 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -428,6 +428,9 @@ args = [ # | 
real-world-memory-production-ops | composite | | # | real-world-memory-production-ops-json | command | | # | real-world-memory-production-ops-report | command | | +# | real-world-memory-core-archival | composite | | +# | real-world-memory-core-archival-json | command | | +# | real-world-memory-core-archival-report | command | | # | real-world-memory-live-adapters | command | | [tasks.real-world-job-smoke] @@ -824,6 +827,55 @@ args = [ "tmp/real-world-memory/consolidation/report.md", ] +[tasks.real-world-memory-core-archival] +workspace = false +dependencies = [ + "real-world-memory-core-archival-report", +] + +[tasks.real-world-memory-core-archival-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/core_archival_memory", + "--out", + "tmp/real-world-memory/core-archival/report.json", + "--run-id", + "real-world-memory-core-archival", + "--adapter-id", + "fixture_core_archival_memory", + "--adapter-name", + "ELF core and archival memory fixture", +] + +[tasks.real-world-memory-core-archival-report] +workspace = false +dependencies = [ + "real-world-memory-core-archival-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/core-archival/report.json", + "--out", + "tmp/real-world-memory/core-archival/report.md", +] + [tasks.real-world-memory-live-adapters] workspace = false command = "bash" diff --git a/README.md b/README.md index 8261bf13..f2480a25 100644 --- a/README.md +++ b/README.md @@ -149,13 +149,18 @@ provider-backed ELF evidence was required. mem0, OpenViking, and claude-mem remained typed non-pass states. OpenViking now reaches its pinned Docker local embedding path and is reported as `wrong_result` when same-corpus evidence terms are missed; setup failures remain `incomplete`. 
-- Real-world agent memory aggregate after the P1 benchmark batch: 38 fixture-backed - jobs across 11 suites, 36 pass, 0 incomplete, 2 blocked, 0 wrong-result, - 0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are - production-ops operator boundaries, not hidden benchmark wins. +- Real-world agent memory aggregate after the P1 benchmark batch and XY-927 + core-vs-archival fixture update: 44 fixture-backed jobs across 12 suites, 42 pass, + 0 incomplete, 2 blocked, 0 wrong-result, 0 not-encoded, and 0 unsupported-claim + results. The remaining non-pass jobs are production-ops operator boundaries, not + hidden benchmark wins. The new `core_archival_memory` suite passes 6 fixture jobs + for core block attachment, scope, provenance, stale-core detection, archival + fallback, and project-decision recovery; it does not create an ELF-over-Letta + claim. - Full-suite live real-world adapter sweep after XY-899: ELF and qmd emit - Docker-isolated `live_real_world` records for all 38 encoded jobs across 11 suites - through `cargo make real-world-memory-live-adapters`. Both keep the original + Docker-isolated `live_real_world` records for the previously measured 38 encoded + jobs across 11 suites through `cargo make real-world-memory-live-adapters`. Both + keep the original targeted `work_resume`, `retrieval`, and `project_decisions` slice passing, but the full sweep is not a full-suite pass. The fresh ELF sweep reports 18 pass, 5 wrong_result, 2 blocked, and 13 not_encoded jobs. 
The fresh qmd sweep reports diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 2832b202..8cc03e41 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -2088,24 +2088,24 @@ "evidence_class": "research_gate", "docker_default": true, "host_global_installs_required": false, - "overall_status": "not_encoded", + "overall_status": "blocked", "setup": { - "status": "not_encoded", - "evidence": "Letta is D1 reviewed as a core/archival memory reference, but no Docker real_world_job adapter is implemented." + "status": "blocked", + "evidence": "Letta is D1 reviewed as a core/archival memory reference. The contained comparison contract is a Docker-only benchmark-created agent export that must return core block JSON, archival search readback, and source ids before any scenario claim is scored." }, "run": { "status": "not_encoded", - "evidence": "No Letta core block, archival memory, or shared-memory job is encoded." + "evidence": "No Letta materializer currently creates the benchmark agent, imports the ELF core_archival_memory fixture corpus, or exports comparable core and archival evidence." }, "result": { "status": "not_encoded", - "evidence": "No Letta personalization or project-decision suite result is claimed." + "evidence": "No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision result is claimed." }, "capabilities": [ { "capability": "core_archival_memory", - "status": "not_encoded", - "evidence": "Core blocks and archival memory are reference semantics but not scored." 
+ "status": "blocked", + "evidence": "ELF fixture jobs now score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids." }, { "capability": "docker_embedding_configuration", @@ -2133,6 +2133,67 @@ "suite_id": "work_resume", "status": "not_encoded", "evidence": "Agent resumption through Letta memory blocks is not encoded." + }, + { + "suite_id": "core_archival_memory", + "status": "blocked", + "evidence": "ELF fixture coverage exists, but Letta has no contained export/readback artifact for the same core-vs-archival jobs." + } + ], + "scenarios": [ + { + "scenario_id": "core_block_attachment_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. Letta has no comparable exported core block attachment evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json" + }, + { + "scenario_id": "core_block_scope_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. 
Letta scope behavior remains unscored without a contained export of agent, block, and visibility metadata.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json" + }, + { + "scenario_id": "core_block_provenance_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains not_tested until exported core memory includes stable source ids and audit-equivalent events.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json" + }, + { + "scenario_id": "stale_core_detection", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json" + }, + { + "scenario_id": "archival_fallback_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. 
Letta fallback comparison is blocked until archival search output can be exported with source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json" + }, + { + "scenario_id": "core_archival_project_decision_recovery", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. Letta project-decision recovery remains not_tested until the contained export/readback contract exists.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json" } ], "evidence": [ @@ -2160,14 +2221,15 @@ "evidence": "Official Docker deployment guide and embedding configuration boundary." } ], - "setup_path": "Define Docker server setup, embedding model configuration, and a core/archival memory fixture flow.", - "runtime_boundary": "Docker-only Letta server or CLI flow with benchmark-created agents and no host-global state.", - "resource_expectation": "Embedding model and agent server state must be explicit; record storage and provider boundaries.", + "setup_path": "Use a Docker-only Letta server or CLI flow that creates a benchmark-owned agent, loads the checked-in core_archival_memory fixture corpus, writes core memory and archival memory with fixture source ids, then exports core block JSON plus archival search/readback JSON.", + "runtime_boundary": "Docker-only Letta server or CLI flow with benchmark-created agents, benchmark-owned storage, no host-global state, and no unstated hosted service dependency.", + "resource_expectation": "Embedding model, agent server state, exported core memory, archival search output, and provider boundaries must be explicit in the artifact.", "retry_guidance": [ - "Create a tiny Docker agent with archival memory search.", - "Score core-versus-archival 
retrieval only after source evidence can be exported." + "Create a tiny Docker agent with core memory and archival memory loaded from the ELF core_archival_memory fixtures.", + "Export core block readback, archival search results, source ids, and any audit-equivalent metadata as JSON before scoring.", + "Score core-versus-archival scenarios only after source evidence can be exported and mapped to the fixture evidence ids." ], - "research_depth": "D1 feasibility verdict: research_only (XY-882); core/archival reference, adapter not encoded" + "research_depth": "D1 feasibility verdict: research_only (XY-882); XY-927 selects the contained export/readback contract, but the Letta adapter remains blocked until that artifact exists" } }, { diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json new file mode 100644 index 00000000..b1928711 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-archival-fallback-001", + "suite": "core_archival_memory", + "title": "Fall back to archival notes when core memory is insufficient", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "fallback-core-insufficient", + "kind": "core_block", + "text": "Core block summary: a rollback runbook exists for single-user production, but this core block intentionally omits the rollback steps.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "archival_fallback", + "evidence_id": "fallback-core-insufficient" + }, + "locator": { + "quote": "intentionally omits the rollback steps" + } + }, + "created_at": "2026-06-11T04:40:00Z" + }, + { + "evidence_id": "fallback-archival-runbook", + "kind": 
"runbook", + "text": "Archival rollback note: restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors, and verify search recovers the restored note.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "archival_fallback", + "evidence_id": "fallback-archival-runbook" + }, + "locator": { + "quote": "restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors" + } + }, + "created_at": "2026-06-11T04:41:00Z" + }, + { + "evidence_id": "fallback-core-only-trap", + "kind": "unsupported_claim", + "text": "Unsupported shortcut: answer the rollback steps from the core block alone without archival note search.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "archival_fallback", + "evidence_id": "fallback-core-only-trap" + } + }, + "created_at": "2026-06-11T04:39:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "The core block is insufficient because it says the rollback runbook exists but omits the steps. 
Fall back to archival note search: restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors, and verify search recovers the restored note.", + "claims": [ + { + "claim_id": "core_memory_insufficient", + "text": "The core block is insufficient because it omits the rollback steps.", + "evidence_ids": ["fallback-core-insufficient"], + "confidence": "high" + }, + { + "claim_id": "archival_fallback_steps", + "text": "The archival rollback steps are restore Postgres, rebuild Qdrant from Postgres vectors, and verify search recovers the restored note.", + "evidence_ids": ["fallback-archival-runbook"], + "confidence": "high" + } + ], + "evidence_ids": ["fallback-core-insufficient", "fallback-archival-runbook"], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-rollback-summary-attached", + "ts": "2026-06-11T04:40:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["fallback-core-insufficient"], + "summary": "A core block pointed at the rollback runbook but did not include the steps." + }, + { + "event_id": "archival-rollback-note-recorded", + "ts": "2026-06-11T04:41:00Z", + "actor": "agent", + "action": "recorded_runbook", + "evidence_ids": ["fallback-archival-runbook"], + "summary": "The detailed rollback steps were recorded as archival note evidence." + } + ], + "prompt": { + "role": "user", + "content": "The attached core block only says a rollback runbook exists. What are the rollback steps?", + "job_mode": "answer", + "constraints": ["cite_evidence", "use_archival_fallback", "avoid_core_only_hallucination"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "core_memory_insufficient", + "text": "The core block is insufficient because it omits the rollback steps." 
+ }, + { + "claim_id": "archival_fallback_steps", + "text": "The archival rollback steps are restore Postgres, rebuild Qdrant from Postgres vectors, and verify search recovers the restored note." + } + ], + "must_not_include": [ + "answer the rollback steps from the core block alone" + ], + "evidence_links": { + "core_memory_insufficient": ["fallback-core-insufficient"], + "archival_fallback_steps": ["fallback-archival-runbook"] + }, + "answer_type": "archival_fallback_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "fallback-core-insufficient", + "claim_id": "core_memory_insufficient", + "requirement": "explain", + "quote": "intentionally omits the rollback steps" + }, + { + "evidence_id": "fallback-archival-runbook", + "claim_id": "archival_fallback_steps", + "requirement": "cite", + "quote": "restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors" + } + ], + "negative_traps": [ + { + "trap_id": "core-only-rollback-hallucination", + "type": "unsupported_claim", + "evidence_ids": ["fallback-core-only-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Provides the archival rollback steps." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites both insufficient core memory and archival fallback evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids core-only hallucination." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Makes the fallback path explicit." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "archival_fallback", "rollback", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json new file mode 100644 index 00000000..c1f34487 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-core-block-attachment-001", + "suite": "core_archival_memory", + "title": "Read an explicitly attached core block without treating it as archival search", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "core-attachment-active", + "kind": "core_block", + "text": "Core block attachment: key project_style has an active attachment for tenant local-tenant project ELF agent local-agent read_profile private_plus_project.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_attachment", + "evidence_id": "core-attachment-active" + }, + "locator": { + "quote": "active attachment for tenant local-tenant project ELF agent local-agent read_profile private_plus_project" + } + }, + "created_at": "2026-06-11T04:00:00Z" + }, + { + "evidence_id": "core-attachment-not-search", + "kind": "core_block_contract", + "text": "Core block readback is not archival search; it does not embed, rerank, search Qdrant, create a search session, or 
record note hits.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_attachment", + "evidence_id": "core-attachment-not-search" + }, + "locator": { + "quote": "does not embed, rerank, search Qdrant" + } + }, + "created_at": "2026-06-11T04:01:00Z" + }, + { + "evidence_id": "core-attachment-qdrant-trap", + "kind": "stale_claim", + "text": "Stale shortcut: core blocks are indexed into Qdrant and returned as normal archival note search hits.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_attachment", + "evidence_id": "core-attachment-qdrant-trap" + } + }, + "created_at": "2026-06-11T03:59:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Return the project_style core block because it has an active attachment for the exact tenant, project, agent, and private_plus_project read profile. 
Keep that readback separate from archival search because core blocks do not embed, rerank, search Qdrant, create search sessions, or record note hits.", + "claims": [ + { + "claim_id": "attached_core_block_readback", + "text": "The project_style core block is returned through its exact active attachment.", + "evidence_ids": ["core-attachment-active"], + "confidence": "high" + }, + { + "claim_id": "core_not_archival_search", + "text": "Core block readback is separate from archival search and Qdrant-derived note retrieval.", + "evidence_ids": ["core-attachment-not-search"], + "confidence": "high" + } + ], + "evidence_ids": ["core-attachment-active", "core-attachment-not-search"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-project-style-attached", + "ts": "2026-06-11T04:00:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["core-attachment-active"], + "summary": "The project_style core block was attached for the exact read profile." + }, + { + "event_id": "core-archival-boundary-recorded", + "ts": "2026-06-11T04:01:00Z", + "actor": "agent", + "action": "recorded_contract", + "evidence_ids": ["core-attachment-not-search"], + "summary": "The core block readback boundary was recorded separately from archival search." + } + ], + "prompt": { + "role": "user", + "content": "Which always-loaded project style block is attached for this agent, and should it appear as a normal archival search hit?", + "job_mode": "answer", + "constraints": ["cite_evidence", "separate_core_from_archival_search", "avoid_qdrant_core_block_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "attached_core_block_readback", + "text": "The project_style core block is returned through its exact active attachment." 
+ }, + { + "claim_id": "core_not_archival_search", + "text": "Core block readback is separate from archival search and Qdrant-derived note retrieval." + } + ], + "must_not_include": [ + "core blocks are indexed into Qdrant and returned as normal archival note search hits" + ], + "evidence_links": { + "attached_core_block_readback": ["core-attachment-active"], + "core_not_archival_search": ["core-attachment-not-search"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "core-attachment-active", + "claim_id": "attached_core_block_readback", + "requirement": "cite", + "quote": "active attachment for tenant local-tenant project ELF agent local-agent read_profile private_plus_project" + }, + { + "evidence_id": "core-attachment-not-search", + "claim_id": "core_not_archival_search", + "requirement": "cite", + "quote": "does not embed, rerank, search Qdrant" + } + ], + "negative_traps": [ + { + "trap_id": "qdrant-core-block-search-hit", + "type": "stale_fact", + "evidence_ids": ["core-attachment-qdrant-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Identifies the attached core block." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites attachment and core-search boundary evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids indexing core blocks into Qdrant-backed archival search." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Preserves explicit attachment semantics." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "core_block", "attachment", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json new file mode 100644 index 00000000..f1fd4f92 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-core-block-provenance-001", + "suite": "core_archival_memory", + "title": "Return source refs and audit events for core block assertions", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "core-provenance-source-ref", + "kind": "core_block", + "text": "Provenance evidence: core block release_policy returns source_ref schema source_ref/v1 with resolver real_world_job_fixture/v1 and locator quote retained for reviewer inspection.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_provenance", + "evidence_id": "core-provenance-source-ref" + }, + "locator": { + "quote": "source_ref schema source_ref/v1" + } + }, + "created_at": "2026-06-11T04:20:00Z" + }, + { + "evidence_id": "core-provenance-audit-events", + "kind": "core_block_event", + "text": "Audit evidence: release_policy has append-only events block_created, block_updated, and attachment_added returned in audit_history.", + "source_ref": { + "schema": "source_ref/v1", + 
"resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_provenance", + "evidence_id": "core-provenance-audit-events" + }, + "locator": { + "quote": "block_created, block_updated, and attachment_added" + } + }, + "created_at": "2026-06-11T04:21:00Z" + }, + { + "evidence_id": "core-provenance-trusted-memory-trap", + "kind": "stale_claim", + "text": "Stale shortcut: always-loaded core memory is trusted without returning source_ref or audit_history.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_provenance", + "evidence_id": "core-provenance-trusted-memory-trap" + } + }, + "created_at": "2026-06-11T04:19:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "The release_policy core block must return its source_ref with source_ref/v1 resolver data and retain the locator quote for inspection. Its provenance also includes append-only block_created, block_updated, and attachment_added events in audit_history.", + "claims": [ + { + "claim_id": "core_source_ref_returned", + "text": "The release_policy core block returns source_ref/v1 provenance.", + "evidence_ids": ["core-provenance-source-ref"], + "confidence": "high" + }, + { + "claim_id": "core_audit_history_returned", + "text": "The release_policy core block returns block_created, block_updated, and attachment_added audit events.", + "evidence_ids": ["core-provenance-audit-events"], + "confidence": "high" + } + ], + "evidence_ids": ["core-provenance-source-ref", "core-provenance-audit-events"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-release-policy-created", + "ts": "2026-06-11T04:20:00Z", + "actor": "agent", + "action": "block_created", + "evidence_ids": ["core-provenance-source-ref"], + "summary": "The release_policy block was 
created with a source_ref pointer." + }, + { + "event_id": "core-release-policy-attached", + "ts": "2026-06-11T04:21:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["core-provenance-audit-events"], + "summary": "The release_policy block attachment event was added to audit history." + } + ], + "prompt": { + "role": "user", + "content": "What provenance should a returned core release_policy block include?", + "job_mode": "answer", + "constraints": ["cite_evidence", "include_source_ref", "include_audit_history"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "core_source_ref_returned", + "text": "The release_policy core block returns source_ref/v1 provenance." + }, + { + "claim_id": "core_audit_history_returned", + "text": "The release_policy core block returns block_created, block_updated, and attachment_added audit events." + } + ], + "must_not_include": [ + "always-loaded core memory is trusted without returning source_ref or audit_history" + ], + "evidence_links": { + "core_source_ref_returned": ["core-provenance-source-ref"], + "core_audit_history_returned": ["core-provenance-audit-events"] + }, + "answer_type": "provenance_bundle", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "core-provenance-source-ref", + "claim_id": "core_source_ref_returned", + "requirement": "cite", + "quote": "source_ref schema source_ref/v1" + }, + { + "evidence_id": "core-provenance-audit-events", + "claim_id": "core_audit_history_returned", + "requirement": "cite", + "quote": "block_created, block_updated, and attachment_added" + } + ], + "negative_traps": [ + { + "trap_id": "trusted-core-no-provenance", + "type": "unsupported_claim", + "evidence_ids": ["core-provenance-trusted-memory-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": 
"States the returned provenance fields." + }, + "evidence_grounding": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Cites source_ref and audit-history evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids trusted-without-provenance claims." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Answers in a reviewer-usable provenance bundle shape." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "provenance", "audit_history", "source_ref"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json new file mode 100644 index 00000000..3b379b85 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-core-block-scope-001", + "suite": "core_archival_memory", + "title": "Apply core block scope and private-owner checks before readback", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "core-scope-project-shared-readable", + "kind": "core_block", + "text": "Scope evidence: project_shared block release_gate is readable for tenant local-tenant project ELF agent local-agent only when the active attachment and read_profile all_scopes allow project_shared.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_scope", + 
"evidence_id": "core-scope-project-shared-readable" + }, + "locator": { + "quote": "active attachment and read_profile all_scopes allow project_shared" + } + }, + "created_at": "2026-06-11T04:10:00Z" + }, + { + "evidence_id": "core-scope-private-owner", + "kind": "core_block", + "text": "Private owner evidence: agent_private block agent_a_workflow belongs to agent-a and must not be returned to agent-b even if agent-b has a matching read_profile label.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_scope", + "evidence_id": "core-scope-private-owner" + }, + "locator": { + "quote": "must not be returned to agent-b" + } + }, + "created_at": "2026-06-11T04:11:00Z" + }, + { + "evidence_id": "core-scope-bypass-trap", + "kind": "stale_claim", + "text": "Stale shortcut: a core block attachment bypasses read_profile scope checks, private-owner checks, and shared grants.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_scope", + "evidence_id": "core-scope-bypass-trap" + } + }, + "created_at": "2026-06-11T04:09:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Return the release_gate core block only when the active attachment and all_scopes read profile allow project_shared. 
Do not return agent_a_workflow to agent-b, because private-owner checks still apply to agent_private core blocks.", + "claims": [ + { + "claim_id": "shared_core_scope_allowed", + "text": "The project_shared release_gate block is readable only when attachment and read_profile allow project_shared.", + "evidence_ids": ["core-scope-project-shared-readable"], + "confidence": "high" + }, + { + "claim_id": "private_core_scope_denied", + "text": "The agent_private agent_a_workflow block must not be returned to agent-b.", + "evidence_ids": ["core-scope-private-owner"], + "confidence": "high" + } + ], + "evidence_ids": ["core-scope-project-shared-readable", "core-scope-private-owner"], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-release-gate-shared", + "ts": "2026-06-11T04:10:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["core-scope-project-shared-readable"], + "summary": "The release_gate block was attached with project_shared scope." + }, + { + "event_id": "core-agent-a-private", + "ts": "2026-06-11T04:11:00Z", + "actor": "agent-a", + "action": "block_created", + "evidence_ids": ["core-scope-private-owner"], + "summary": "The agent_a_workflow block remained private to agent-a." + } + ], + "prompt": { + "role": "user", + "content": "For core memory readback, which shared block can this agent see, and can agent-b also see agent-a's private block?", + "job_mode": "answer", + "constraints": ["cite_evidence", "enforce_scope", "avoid_private_owner_leakage"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "shared_core_scope_allowed", + "text": "The project_shared release_gate block is readable only when attachment and read_profile allow project_shared." + }, + { + "claim_id": "private_core_scope_denied", + "text": "The agent_private agent_a_workflow block must not be returned to agent-b." 
+ } + ], + "must_not_include": [ + "a core block attachment bypasses read_profile scope checks" + ], + "evidence_links": { + "shared_core_scope_allowed": ["core-scope-project-shared-readable"], + "private_core_scope_denied": ["core-scope-private-owner"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "core-scope-project-shared-readable", + "claim_id": "shared_core_scope_allowed", + "requirement": "cite", + "quote": "active attachment and read_profile all_scopes allow project_shared" + }, + { + "evidence_id": "core-scope-private-owner", + "claim_id": "private_core_scope_denied", + "requirement": "cite", + "quote": "must not be returned to agent-b" + } + ], + "negative_traps": [ + { + "trap_id": "core-attachment-bypasses-scope", + "type": "scope_leak", + "evidence_ids": ["core-scope-bypass-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Applies readable shared scope and denied private owner scope." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites scope and private-owner evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids scope-bypass claims." + }, + "ownership_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not leak private core blocks across agents." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "scope", "private_owner", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json new file mode 100644 index 00000000..229ecc34 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json @@ -0,0 +1,230 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-project-decision-recovery-001", + "suite": "core_archival_memory", + "title": "Recover a project decision from core routing and archival rationale", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "decision-core-routing-block", + "kind": "core_block", + "text": "Core decision routing block: keep the benchmark outcome policy always attached and route detailed rationale to archival notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-core-routing-block" + }, + "locator": { + "quote": "route detailed rationale to archival notes" + } + }, + "created_at": "2026-06-11T04:50:00Z" + }, + { + "evidence_id": "decision-archival-outcome-policy", + "kind": "decision", + "text": "Archival decision record: scenario outcomes use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": 
"real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-archival-outcome-policy" + }, + "locator": { + "quote": "use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them" + } + }, + "created_at": "2026-06-11T04:51:00Z" + }, + { + "evidence_id": "decision-archival-core-search-boundary", + "kind": "decision", + "text": "Archival project decision: core blocks stay separate from archival note search and Qdrant-derived retrieval.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-archival-core-search-boundary" + }, + "locator": { + "quote": "core blocks stay separate from archival note search" + } + }, + "created_at": "2026-06-11T04:52:00Z" + }, + { + "evidence_id": "decision-letta-win-trap", + "kind": "unsupported_claim", + "text": "Wrong claim: Letta comparison can be scored as an ELF win because ELF has core blocks.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-letta-win-trap" + } + }, + "created_at": "2026-06-11T04:49:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Use the always-attached core routing block to find the benchmark outcome policy, then cite archival notes for the detailed decision. The archival decision says to use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them. 
It also says core blocks stay separate from archival note search and Qdrant-derived retrieval, so no ELF-over-Letta claim follows from ELF having core blocks.", + "claims": [ + { + "claim_id": "core_routes_to_archival_rationale", + "text": "The core routing block points detailed decision rationale to archival notes.", + "evidence_ids": ["decision-core-routing-block"], + "confidence": "high" + }, + { + "claim_id": "outcomes_require_evidence", + "text": "Scenario outcomes use win, tie, loss, not_tested, blocked, or non_goal only when evidence supports them.", + "evidence_ids": ["decision-archival-outcome-policy"], + "confidence": "high" + }, + { + "claim_id": "core_archival_boundary_preserved", + "text": "Core blocks stay separate from archival note search and Qdrant-derived retrieval.", + "evidence_ids": ["decision-archival-core-search-boundary"], + "confidence": "high" + } + ], + "evidence_ids": [ + "decision-core-routing-block", + "decision-archival-outcome-policy", + "decision-archival-core-search-boundary" + ], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "decision-routing-core-attached", + "ts": "2026-06-11T04:50:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["decision-core-routing-block"], + "summary": "A core block kept the outcome-policy routing pointer always attached." + }, + { + "event_id": "decision-outcome-policy-archived", + "ts": "2026-06-11T04:51:00Z", + "actor": "agent", + "action": "recorded_decision", + "evidence_ids": ["decision-archival-outcome-policy", "decision-archival-core-search-boundary"], + "summary": "Archival notes recorded the detailed outcome policy and core-search boundary." 
+ } + ], + "prompt": { + "role": "user", + "content": "What is the benchmark outcome policy, and does having ELF core blocks make Letta a measured loss?", + "job_mode": "decide", + "constraints": ["cite_evidence", "recover_project_decision", "avoid_unsupported_letta_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "core_routes_to_archival_rationale", + "text": "The core routing block points detailed decision rationale to archival notes." + }, + { + "claim_id": "outcomes_require_evidence", + "text": "Scenario outcomes use win, tie, loss, not_tested, blocked, or non_goal only when evidence supports them." + }, + { + "claim_id": "core_archival_boundary_preserved", + "text": "Core blocks stay separate from archival note search and Qdrant-derived retrieval." + } + ], + "must_not_include": [ + "Letta comparison can be scored as an ELF win because ELF has core blocks" + ], + "evidence_links": { + "core_routes_to_archival_rationale": ["decision-core-routing-block"], + "outcomes_require_evidence": ["decision-archival-outcome-policy"], + "core_archival_boundary_preserved": ["decision-archival-core-search-boundary"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "decision-core-routing-block", + "claim_id": "core_routes_to_archival_rationale", + "requirement": "cite", + "quote": "route detailed rationale to archival notes" + }, + { + "evidence_id": "decision-archival-outcome-policy", + "claim_id": "outcomes_require_evidence", + "requirement": "cite", + "quote": "use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them" + }, + { + "evidence_id": "decision-archival-core-search-boundary", + "claim_id": "core_archival_boundary_preserved", + "requirement": "cite", + "quote": "core blocks stay separate from archival note search" + } + ], + "negative_traps": [ + { + "trap_id": 
"unsupported-letta-loss-from-elf-core", + "type": "unsupported_claim", + "evidence_ids": ["decision-letta-win-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Recovers the benchmark outcome policy." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites core routing and archival decision evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids an unsupported Letta win or loss claim." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Explains how core memory and archival decision evidence work together." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "project_decisions", "letta_boundary", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json new file mode 100644 index 00000000..084c26cb --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json @@ -0,0 +1,206 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-stale-core-detection-001", + "suite": "core_archival_memory", + "title": "Detect a stale core block when archival evidence supersedes it", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stale-core-validation-gate", + "kind": "core_block", + "text": "Stale core block: 
the validation gate is cargo make lint and cargo make test.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "stale-core-validation-gate" + } + }, + "created_at": "2026-06-10T10:00:00Z" + }, + { + "evidence_id": "archival-current-validation-gate", + "kind": "decision", + "text": "Archival decision update: before pushing a refreshed PR head, run cargo make fmt, cargo make lint-fix, and cargo make checks.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "archival-current-validation-gate" + }, + "locator": { + "quote": "cargo make fmt, cargo make lint-fix, and cargo make checks" + } + }, + "created_at": "2026-06-11T04:30:00Z" + }, + { + "evidence_id": "archival-supersedes-core-rationale", + "kind": "decision", + "text": "Rationale: archival note evidence supersedes the attached core block until the core block is updated from source-of-truth state.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "archival-supersedes-core-rationale" + }, + "locator": { + "quote": "supersedes the attached core block" + } + }, + "created_at": "2026-06-11T04:31:00Z" + }, + { + "evidence_id": "stale-core-answer-trap", + "kind": "stale_claim", + "text": "Wrong answer trap: cite the core block as current and skip archival evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "stale-core-answer-trap" + } + }, + "created_at": "2026-06-11T04:29:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Treat the attached validation-gate core block as stale. 
The current archival decision says to run cargo make fmt, cargo make lint-fix, and cargo make checks before pushing a refreshed PR head, and the archival rationale says that evidence supersedes the core block until it is updated from source-of-truth state.", + "claims": [ + { + "claim_id": "stale_core_detected", + "text": "The attached validation-gate core block is stale.", + "evidence_ids": ["archival-supersedes-core-rationale"], + "confidence": "high" + }, + { + "claim_id": "archival_current_gate", + "text": "The current archival validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks.", + "evidence_ids": ["archival-current-validation-gate"], + "confidence": "high" + } + ], + "evidence_ids": ["archival-current-validation-gate", "archival-supersedes-core-rationale"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "old-core-gate-attached", + "ts": "2026-06-10T10:00:00Z", + "actor": "agent", + "action": "block_created", + "evidence_ids": ["stale-core-validation-gate"], + "summary": "A core block recorded an old validation gate." + }, + { + "event_id": "archival-gate-updated", + "ts": "2026-06-11T04:30:00Z", + "actor": "agent", + "action": "updated_decision", + "evidence_ids": ["archival-current-validation-gate", "archival-supersedes-core-rationale"], + "summary": "Archival evidence superseded the old core validation gate." + } + ], + "prompt": { + "role": "user", + "content": "The attached core block says the gate is lint and test. Is that still current before a refreshed PR push?", + "job_mode": "answer", + "constraints": ["cite_evidence", "detect_stale_core", "prefer_current_archival_evidence"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "stale_core_detected", + "text": "The attached validation-gate core block is stale." 
+ }, + { + "claim_id": "archival_current_gate", + "text": "The current archival validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks." + } + ], + "must_not_include": [ + "the validation gate is cargo make lint and cargo make test" + ], + "evidence_links": { + "stale_core_detected": ["archival-supersedes-core-rationale"], + "archival_current_gate": ["archival-current-validation-gate"] + }, + "answer_type": "current_state_with_stale_core_caveat", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "archival-current-validation-gate", + "claim_id": "archival_current_gate", + "requirement": "cite", + "quote": "cargo make fmt, cargo make lint-fix, and cargo make checks" + }, + { + "evidence_id": "archival-supersedes-core-rationale", + "claim_id": "stale_core_detected", + "requirement": "explain", + "quote": "supersedes the attached core block" + } + ], + "negative_traps": [ + { + "trap_id": "stale-core-current-answer", + "type": "stale_fact", + "evidence_ids": ["stale-core-validation-gate", "stale-core-answer-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States that the attached core block is stale." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites current archival evidence and supersession rationale." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids answering from stale core memory." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Detects stale core state when archival evidence supersedes it." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "stale_core", "archival_supersession", "no_live_claim"] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index a167d2bd..a8bd3973 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -54,6 +54,7 @@ const SUITES: &[&str] = &[ "capture_integration", "production_ops", "personalization", + "core_archival_memory", ]; #[derive(Debug, Parser)] diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index a8c7e927..2300565b 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -60,6 +60,10 @@ fn production_ops_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("production_ops") } +fn core_archival_memory_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("core_archival_memory") +} + fn workspace_root() -> Result { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let root = manifest_dir @@ -373,7 +377,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(10) + Some(16) ); assert_eq!( report @@ -385,7 +389,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") .and_then(Value::as_u64), - Some(7) + Some(11) ); let adapters = 
array_at(&report, "/external_adapters/adapters")?; @@ -472,13 +476,13 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/overall_status_counts/blocked") .and_then(Value::as_u64), - Some(5) + Some(6) ); assert_eq!( report .pointer("/external_adapters/summary/overall_status_counts/not_encoded") .and_then(Value::as_u64), - Some(7) + Some(6) ); assert_eq!( report @@ -496,7 +500,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(13) + Some(14) ); assert_eq!( report @@ -531,7 +535,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/blocked") .and_then(Value::as_u64), - Some(2) + Some(4) ); assert_eq!( report @@ -561,7 +565,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/not_encoded") .and_then(Value::as_u64), - Some(3) + Some(7) ); assert_eq!( report @@ -585,7 +589,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(11) + Some(17) ); assert_eq!( report @@ -609,13 +613,13 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") .and_then(Value::as_u64), - Some(8) + Some(12) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_outcome_counts/blocked") .and_then(Value::as_u64), - Some(1) + Some(3) ); assert_eq!( report @@ -645,6 +649,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { let graphify = find_by_field(adapters, "/adapter_id", "graphify_docker_smoke")?; let qmd_deep = find_by_field(adapters, "/adapter_id", 
"qmd_deep_profile_gate")?; let openviking_deep = find_by_field(adapters, "/adapter_id", "openviking_deep_profile_gate")?; + let letta = find_by_field(adapters, "/adapter_id", "letta_research_gate")?; assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); @@ -678,6 +683,36 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { assert_first_generation_adapter_records(agentmemory, mem0, memsearch, claude_mem); assert_eq!(openviking.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + + assert_graph_rag_research_gate_records(ragflow, lightrag, graphrag); + assert_graphiti_zep_adapter(graphiti_zep); + assert_graphify_adapter(graphify)?; + assert_letta_core_archival_gate(letta)?; + + assert_eq!( + qmd_deep.pointer("/capabilities/2/status").and_then(Value::as_str), + Some("unsupported") + ); + assert_eq!( + qmd_deep.pointer("/result/artifact").and_then(Value::as_str), + Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") + ); + assert_eq!( + openviking_deep.pointer("/adapter_kind").and_then(Value::as_str), + Some("docker_local_embed_context_trajectory_gate") + ); + + assert_openviking_deep_profile_gate(openviking_deep); + + assert_eq!( + openviking_deep.pointer("/result/artifact").and_then(Value::as_str), + Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") + ); + + Ok(()) +} + +fn assert_graph_rag_research_gate_records(ragflow: &Value, lightrag: &Value, graphrag: &Value) { assert_eq!(ragflow.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); assert_eq!(ragflow.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert_eq!( @@ -718,29 +753,54 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { Some("cargo make graphrag-docker-smoke") ); 
assert_eq!(graphrag.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded")); +} - assert_graphiti_zep_adapter(graphiti_zep); - assert_graphify_adapter(graphify)?; - - assert_eq!( - qmd_deep.pointer("/capabilities/2/status").and_then(Value::as_str), - Some("unsupported") - ); - assert_eq!( - qmd_deep.pointer("/result/artifact").and_then(Value::as_str), - Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") - ); - assert_eq!( - openviking_deep.pointer("/adapter_kind").and_then(Value::as_str), - Some("docker_local_embed_context_trajectory_gate") +fn assert_letta_core_archival_gate(adapter: &Value) -> Result<()> { + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert!( + adapter + .pointer("/setup/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("Docker-only benchmark-created agent export")) ); + assert!(adapter.pointer("/execution_metadata/setup_path").and_then(Value::as_str).is_some_and( + |setup| setup.contains("exports core block JSON plus archival search/readback JSON") + )); - assert_openviking_deep_profile_gate(openviking_deep); + let suites = array_at(adapter, "/suites")?; + let core_suite = find_by_field(suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(core_suite.pointer("/status").and_then(Value::as_str), Some("blocked")); + + let scenarios = array_at(adapter, "/scenarios")?; + let attachment = find_by_field(scenarios, "/scenario_id", "core_block_attachment_readback")?; + let scope = find_by_field(scenarios, "/scenario_id", "core_block_scope_readback")?; + let provenance = find_by_field(scenarios, "/scenario_id", "core_block_provenance_readback")?; + let stale = find_by_field(scenarios, "/scenario_id", "stale_core_detection")?; + let fallback = find_by_field(scenarios, "/scenario_id", "archival_fallback_readback")?; + let decision = + find_by_field(scenarios, "/scenario_id", "core_archival_project_decision_recovery")?; + + 
assert_eq!(scenarios.len(), 6); + + for scenario in [attachment, scope, provenance, stale, fallback, decision] { + assert_eq!(scenario.pointer("/elf_position").and_then(Value::as_str), Some("untested")); + assert!( + ["not_tested", "blocked"].contains( + &scenario + .pointer("/comparison_outcome") + .and_then(Value::as_str) + .ok_or_else(|| eyre::eyre!("missing Letta comparison_outcome"))? + ) + ); + } assert_eq!( - openviking_deep.pointer("/result/artifact").and_then(Value::as_str), - Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") + attachment.pointer("/comparison_outcome").and_then(Value::as_str), + Some("not_tested") ); + assert_eq!(stale.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); + assert_eq!(fallback.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); Ok(()) } @@ -1320,7 +1380,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(38)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(44)); Ok(()) } @@ -2497,9 +2557,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("xy844-current-worktree")); assert!(markdown.contains("Existing live-baseline reports remain valid")); assert!(markdown.contains("### Adapter Scenario Judgments")); - assert!(markdown.contains("ELF scenario positions: `wins=8, ties=8, loses=1, untested=11`")); + assert!(markdown.contains("ELF scenario positions: `wins=8, ties=8, loses=1, untested=17`")); assert!(markdown.contains( - "Scenario comparison outcomes: `win=8, tie=8, loss=1, not_tested=8, blocked=1, non_goal=2`" + "Scenario comparison outcomes: `win=8, tie=8, loss=1, not_tested=12, blocked=3, non_goal=2`" )); assert!(markdown.contains("| `claude_mem_live_baseline` | 
`same_corpus_retrieval`")); assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); @@ -2776,6 +2836,46 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { Ok(()) } +#[test] +fn core_archival_memory_fixtures_score_separate_core_and_archival_jobs() -> Result<()> { + let report = run_json_report_from(core_archival_memory_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + + let suites = array_at(&report, "/suites")?; + let core = find_by_field(suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(core.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + + let jobs = array_at(&report, "/jobs")?; + + for job_id in [ + "core-archival-core-block-attachment-001", + "core-archival-core-block-scope-001", + "core-archival-core-block-provenance-001", + "core-archival-stale-core-detection-001", + "core-archival-archival-fallback-001", + "core-archival-project-decision-recovery-001", + ] { + let job = find_by_field(jobs, "/job_id", job_id)?; + + assert_eq!(job.pointer("/suite_id").and_then(Value::as_str), Some("core_archival_memory")); + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); + } + + Ok(()) +} + fn assert_root_knowledge_summary(report: &Value) { 
assert_eq!(report.pointer("/summary/knowledge/job_count").and_then(Value::as_u64), Some(2)); assert_eq!(report.pointer("/summary/knowledge/page_count").and_then(Value::as_u64), Some(4)); @@ -2786,8 +2886,8 @@ fn assert_root_knowledge_summary(report: &Value) { } fn assert_root_aggregate_summary(report: &Value) { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(38)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(36)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(44)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(42)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); @@ -2830,9 +2930,9 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(84) + Some(97) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(84)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(97)); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); @@ -2876,6 +2976,7 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { "knowledge_compilation", "operator_debugging_ux", "memory_evolution", + "core_archival_memory", ] { let suite = find_by_field(suites, "/suite_id", suite_id)?; @@ -2898,6 +2999,11 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + let core_suite = 
find_by_field(suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(core_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); @@ -2915,6 +3021,8 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { let stage_job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; let production_restore = find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; + let core_fallback = find_by_field(jobs, "/job_id", "core-archival-archival-fallback-001")?; + let stale_core = find_by_field(jobs, "/job_id", "core-archival-stale-core-detection-001")?; assert_eq!(rebuild.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); assert_eq!( @@ -2926,6 +3034,8 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { assert_eq!(personalization.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); assert_eq!(stage_job.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core_fallback.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(stale_core.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!( stage_job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), Some("rerank.score") diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 120c6b3d..d3f19cce 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -38,14 +38,15 @@ The remaining caveats are material: 
setup exists. - Several competitor strengths remain `not_tested` or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform - behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival - memory, and graph/RAG navigation remain unproven. mem0 local OSS preference history - is measured separately and is an ELF loss on the current correction history - scenario. The XY-923 follow-up also scores qmd's immediate top-10/replay artifact - ergonomics as stronger than ELF's default stress report, while expansion, fusion, - and rerank remain untested. XY-932 adds a narrow live operator-debug slice where - ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory - UI/export and claude-mem viewer workflows remain blocked or not encoded. + behavior remains a non-goal, OpenViking trajectory and graph/RAG navigation remain + unproven, and Letta core-vs-archival comparison is blocked until the selected + contained export/readback path exists. mem0 local OSS preference history is + measured separately and is an ELF loss on the current correction history scenario. + The XY-923 follow-up also scores qmd's immediate top-10/replay artifact ergonomics + as stronger than ELF's default stress report, while expansion, fusion, and rerank + remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd + on trace hydration and candidate-drop visibility, but OpenMemory UI/export and + claude-mem viewer workflows remain blocked or not encoded. ## Evidence Classes @@ -70,7 +71,8 @@ results, or lifecycle failures into one aggregate leaderboard. | Command or run | Artifact | Supported claim | | --- | --- | --- | -| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 38 jobs across 11 suites with 36 pass and 2 blocked production-ops operator boundaries. 
| +| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 44 jobs across 12 suites with 42 pass and 2 blocked production-ops operator boundaries, including 6 passing `core_archival_memory` jobs. | +| `cargo make real-world-memory-core-archival` | `tmp/real-world-memory/core-archival/report.json` | ELF core-block behavior is scored separately from archival note search for attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 18 pass, 5 wrong_result, 2 blocked, and 13 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 13 not_encoded jobs. | | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/summary.json` | The narrow live operator-debug slice scores ELF as pass and qmd as wrong_result: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; both systems expose replay commands and repair-action guidance. | | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `2026-06-11-first-generation-oss-adapter-promotion-report.md` | mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result. | @@ -86,7 +88,7 @@ results, or lifecycle failures into one aggregate leaderboard. | --- | --- | --- | --- | --- | | Source-of-truth rebuild and evidence-bound writes | `win` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF has the strongest measured source-of-truth and rebuild story: Postgres is authoritative, Qdrant is rebuildable, trust-source jobs pass, and production restore/rebuild proof exists. 
| None | | Work resume and coding-agent continuity | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF and qmd both pass encoded live `work_resume` jobs; agentmemory, claude-mem, and OpenViking continuity strengths remain blocked or not encoded. | XY-925, XY-928 | -| Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs; Letta-style core/archival decision memory is not tested. | XY-927 | +| Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs. The ELF `core_archival_memory` fixture also scores project-decision recovery through core routing plus archival rationale, but Letta-style comparison remains blocked without contained export evidence. | XY-927 | | Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 | | Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 | | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. 
| XY-905 | @@ -98,7 +100,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Private corpus and provider boundaries | `blocked` | `blocked` | Private production profile fails closed without an operator-owned manifest; provider-backed production-ops gates require explicit credentials. | XY-930 | | Personalization and scoped preferences | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `not_encoded` | ELF and qmd both pass the single encoded live personalization job. mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss. | XY-927 | | Context trajectory and hierarchical retrieval | `not_tested` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | OpenViking reaches the pinned Docker local embedding path but misses expected same-corpus evidence; staged trajectory/hierarchy scoring is not encoded. | XY-928 | -| Core-vs-archival memory | `not_tested` | `research_gate`, `not_encoded` | ELF has core block semantics in the service contract, but comparable core-vs-archival jobs and a contained Letta export path are not encoded. | XY-927 | +| Core-vs-archival memory | `blocked` | `fixture_backed`, `research_gate`, `blocked`, `not_encoded` | ELF now has 6 fixture-backed `core_archival_memory` jobs that score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search. Letta remains blocked or not tested until its contained export/readback artifact maps core and archival source ids. | XY-927 | | Graph/RAG navigation and citations | `not_tested` | `smoke_only`, `research_gate`, `blocked`, `wrong_result`, `not_encoded` | Graph/RAG smokes produce scored or typed non-pass adapter reports where possible, but broad graph/RAG navigation and citation quality are not tested. 
| XY-929 | ## Follow-Up Queue @@ -110,7 +112,7 @@ results, or lifecycle failures into one aggregate leaderboard. | XY-924/XY-931 | P0 | Encoded local OSS history; UI/export setup blocker measured | mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export has a blocked export-helper setup probe and still needs a dedicated compose/import path before any product-UX comparison. | | XY-925 | P1 | Backlog | First-generation OSS continuity and source-store adapters. | | XY-926 | P1 | Backlog | Live operator-debugging, capture, consolidation, and knowledge-page suites. | -| XY-927 | P1 | Backlog | Letta-style core-vs-archival memory comparison. | +| XY-927 | P1 | Fixture encoded; Letta export blocked | ELF core-vs-archival fixture coverage is encoded; a contained Letta export/readback adapter remains future work before win/tie/loss claims. | | XY-928 | P1 | Backlog | OpenViking context-trajectory and hierarchy benchmark. | | XY-929 | P2 | Backlog | Graph/RAG adapters beyond scored smokes. | | XY-930 | P1 | Backlog | Private-corpus and credentialed production gates after operator inputs exist. | @@ -123,6 +125,9 @@ results, or lifecycle failures into one aggregate leaderboard. evidence among the tracked systems. - ELF ties qmd on encoded live retrieval, work-resume, project-decisions, and personalization slices. +- ELF fixture-backed `core_archival_memory` coverage passes attachment, scope, + provenance, stale-core detection, archival fallback, and project-decision recovery + jobs separately from archival search. - ELF has a narrow live operator-debug win over qmd for trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence, with replay-command availability and repair-action clarity tied. 
diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index e10ce945..ee4d9de0 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -5,9 +5,9 @@ not comparable, and which measurement reports should guide future ELF iteration. Read this when: You need to answer whether ELF has enough empirical evidence to claim a win, tie, loss, or non-claim against tracked memory, RAG, graph, and agent-continuity projects. -Inputs: Fresh local runs of `cargo make real-world-memory` and -`cargo make real-world-memory-live-adapters` in the current XY-898 lane after -adapter-report consistency repairs, plus +Inputs: Fresh local runs of `cargo make real-world-memory-core-archival`, +`cargo make real-world-memory`, and the earlier `cargo make real-world-memory-live-adapters` +measurement in the current benchmark lane after adapter-report consistency repairs, plus `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, `2026-06-11-competitor-strength-evidence-matrix.md`, and `2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md`. @@ -22,8 +22,11 @@ tracked project's strongest scenario. What is proven today: -- ELF has a strong fixture-backed real-world benchmark contract: 38 jobs, 36 pass, - 2 blocked operator boundaries, and no wrong results in the fixture aggregate. +- ELF has a strong fixture-backed real-world benchmark contract: 44 jobs across 12 + suites, 42 pass, 2 blocked operator boundaries, and no wrong results in the + fixture aggregate. The new `core_archival_memory` suite contributes 6 passing jobs + for core block attachment, scope, provenance, stale-core detection, archival + fallback, and project-decision recovery. - ELF and qmd have comparable full-suite live real-world sweeps, but neither has a full-suite live pass. 
ELF is one pass ahead in the fresh aggregate because qmd misses the memory-evolution delete/TTL tombstone job. @@ -31,9 +34,10 @@ What is proven today: checked-in provider synthetic, stress, backfill, backup/restore, and Qdrant rebuild evidence. - The current comparison still undermeasures most competitor strengths. OpenViking - trajectory, mem0/OpenMemory entity history and UI, Letta core-vs-archival memory, - Graphiti/Zep temporal graph behavior, graph/RAG navigation, agentmemory and - claude-mem capture/continuity, and knowledge-page workflows remain non-claims. + trajectory, mem0/OpenMemory entity history and UI, Letta product export/readback + for core-vs-archival memory, Graphiti/Zep temporal graph behavior, graph/RAG + navigation, agentmemory and claude-mem capture/continuity, and knowledge-page + workflows remain non-claims. The separate XY-932 operator-debug live slice now scores ELF against qmd for trace hydration and candidate-drop visibility, but does not cover OpenMemory or claude-mem UI flows. @@ -43,12 +47,13 @@ production," but the competitiveness objective remains open. ## Fresh Runs -These commands were run in the current XY-898 lane after adapter-report consistency -repairs: +These commands were run in the current benchmark lanes after adapter-report +consistency repairs and the XY-927 core-vs-archival fixture update: | Command | Result | Runtime | | --- | --- | ---: | -| `cargo make real-world-memory` | pass | 11.91 seconds | +| `cargo make real-world-memory-core-archival` | pass | 57.01 seconds | +| `cargo make real-world-memory` | pass | 8.94 seconds | | `cargo make real-world-memory-live-adapters` | pass | 121.51 seconds | The live adapter run emitted repeated Qdrant client/server compatibility warnings, but @@ -62,21 +67,21 @@ failure. 
| Metric | Value | | --- | ---: | -| Jobs | `38` | -| Encoded suites | `11` | -| Pass | `36` | +| Jobs | `44` | +| Encoded suites | `12` | +| Pass | `42` | | Blocked | `2` | | Wrong result | `0` | | Lifecycle fail | `0` | | Incomplete | `0` | | Not encoded | `0` | | Unsupported claim | `0` | -| Mean score | `0.947` | -| Mean latency | `4.411 ms` | -| Expected evidence recall | `77/77` | -| Evidence coverage | `84/84` | -| Source-ref coverage | `84/84` | -| Quote coverage | `84/84` | +| Mean score | `0.955` | +| Mean latency | `3.958 ms` | +| Expected evidence recall | `90/90` | +| Evidence coverage | `97/97` | +| Source-ref coverage | `97/97` | +| Quote coverage | `97/97` | This proves fixture contract breadth and scoring behavior. It does not prove every live adapter or competitor runtime can complete those jobs. @@ -136,8 +141,8 @@ The checked-in manifest records 23 adapter records across 17 unique project name | `pass` | `4` | | `wrong_result` | `6` | | `lifecycle_fail` | `1` | -| `blocked` | `5` | -| `not_encoded` | `7` | +| `blocked` | `6` | +| `not_encoded` | `6` | The generated JSON report emits `external_project_count: 16`, matching the unique non-ELF project-name count from the manifest. The companion audit JSON separately @@ -158,7 +163,7 @@ records `unique_project_names: 17` for the full project list including ELF. | LightRAG | `research_gate` | `blocked`. | Graph/RAG context export with source-path citations. | Docker context-export report with explicit provider config and source citation mapping. | | GraphRAG | `research_gate` | `blocked`. | Graph summaries and document/text-unit evidence tables. | Cost-bounded Docker adapter report over a tiny corpus. | | Graphiti/Zep | `research_gate` | `blocked`. | Temporal graph facts and validity windows. | Docker-local temporal graph adapter report for current and historical facts. | -| Letta | `research_gate` | `not_encoded`. | Core memory blocks versus archival memory. 
| Contained export contract, then core-vs-archival and decision-memory report. | +| Letta | `research_gate` | `blocked` for the selected contained export/readback path; scenario rows remain `not_tested` or `blocked`. | Core memory blocks versus archival memory. | Implement the Docker-only export/readback adapter before any Letta win/tie/loss claim. | | LangGraph | `research_gate` | `not_encoded`; direct memory backend is unsupported. | Checkpoint replay and fork/regression debugging. | Treat as benchmark-infra reference unless a memory-output contract emerges. | | nanograph | `research_gate` | `not_encoded`; full memory backend is unsupported. | Typed graph schema and query ergonomics. | Typed relation query report only if evidence ids can be emitted. | | llm-wiki | `research_gate` | `not_encoded`. | Wiki/page generation, query-save, lint and repair loops. | Contained page-generation report with citation and unsupported-claim lint. | @@ -171,7 +176,7 @@ records `unique_project_names: 17` for the full project list including ELF. | --- | --- | --- | --- | | Retrieval/debug | ELF and qmd live retrieval pass; qmd same-corpus baseline passes. | Tie on encoded live retrieval; no ELF-over-qmd UX claim. | qmd/ELF deep trace replay and debug ergonomics scoring. | | Work resume | ELF and qmd live pass. | ELF is credible on encoded work resume. | agentmemory, claude-mem, and OpenViking comparable continuity adapters. | -| Project decisions | ELF and qmd live pass. | ELF is credible on encoded project-decision recovery. | Letta core/archival decision memory comparison. | +| Project decisions | ELF and qmd live pass; ELF fixture coverage also passes core routing plus archival rationale recovery. | ELF is credible on encoded project-decision recovery. | Letta core/archival decision memory export and scoring. | | Source of truth | ELF and qmd live pass; ELF has stronger production restore/rebuild evidence. | ELF has strongest measured source-of-truth discipline. 
| memsearch source-of-truth reindex/reload evidence. | | Memory evolution | ELF live fails 5/6 jobs; qmd live fails 6/6 jobs after missing the delete/TTL tombstone evidence; fixture aggregate passes. | No broad live superiority claim. | Historical conflict evidence links and Graphiti/Zep temporal comparison. | | Consolidation | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live proposal generation with lineage, confidence, and review-action audit. | @@ -181,7 +186,7 @@ records `unique_project_names: 17` for the full project list including ELF. | Production ops | ELF has separate production-provider/backfill/restore evidence; live sweep is not a full production-ops pass. | Bounded personal-production adoption claim with caveats. | Private corpus manifest and credentialed provider gates. | | Personalization | ELF and qmd live pass one scoped preference job. | Narrow encoded pass only. | mem0/OpenMemory and Letta entity/preference history comparison. | | Context trajectory | Not comparable. | No claim. | OpenViking staged hierarchy/trajectory scoring. | -| Core-vs-archival memory | Not comparable. | No claim. | Letta contained export and ELF core-block benchmark. | +| Core-vs-archival memory | ELF fixture suite passes 6/6; Letta comparison is blocked until export/readback evidence exists. | Fixture-only ELF core-block claim; no ELF-over-Letta claim. | Letta contained export/readback artifact with core block JSON, archival search/readback JSON, and source ids. | | Graph/RAG navigation | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain typed research gates; graphify has a tiny scored `wrong_result` smoke. | No graph/RAG parity claim; only graphify's bounded non-pass smoke can be cited. | Larger contained RAG/graph adapters with evidence-linked outputs before any ELF graph/RAG win, tie, or loss claim. 
| ## Next Measurement Reports diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 6030af7b..7e17b183 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -55,9 +55,9 @@ cleanup, use `docs/guide/single_user_production.md`. optimization-direction report that translates measured benchmark data and competitor strengths into prioritized ELF iteration themes and explicit non-claims. - `2026-06-11-measurement-coverage-audit.md`: fresh coverage audit that separates - current measured ELF/qmd data, fixture evidence, external adapter ledger coverage, - scenario non-claims, and the next measurement reports needed before stronger - competitor claims. + current measured ELF/qmd data, fixture evidence including the XY-927 + `core_archival_memory` suite, external adapter ledger coverage, scenario non-claims, + and the next measurement reports needed before stronger competitor claims. - `2026-06-11-elf-qmd-retrieval-debug-profile.md`: fresh ELF/qmd retrieval-debug profile with real-world retrieval-suite evidence, 480-document stress baseline evidence, qmd top-10 artifact inspection, and explicit rerank/fusion non-claims. @@ -89,9 +89,10 @@ cleanup, use `docs/guide/single_user_production.md`. `real_world_job` adapter reports without converting smoke evidence into quality claims. - `2026-06-11-competitor-strength-adoption-report.md`: XY-901 final - competitor-strength adoption report with the bounded personal-production decision, - scenario-level win/tie/loss/not-tested matrix, claim boundaries, and optimization - issue queue. + competitor-strength adoption report, updated by XY-927 with fixture-backed + core-vs-archival coverage and a blocked Letta export/readback boundary, plus the + bounded personal-production decision, scenario-level win/tie/loss/not-tested + matrix, claim boundaries, and optimization issue queue. 
- `2026-06-11-mem0-openmemory-history-ui-export-report.md`: XY-924 plus XY-931 mem0/OpenMemory local OSS history, preference-correction, deletion-audit, personalization, and export-readback comparison with normalized diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index e4745d72..7cae59a3 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -58,6 +58,7 @@ compile knowledge, and state honest uncertainty. | Capture/integration | Accuracy of hooks, imports, exclusions, and write policies. | Capture a session decision while excluding private spans. | | Production ops | Backfill, restore, cold start, resource, and bounded-failure behavior. | Resume interrupted import without duplicate source notes. | | Personalization | Scoped preferences without cross-tenant leakage. | Apply the user's current preference and ignore another project's note. | +| Core/archival memory | Always-loaded core memory behavior kept separate from archival note search. | Detect a stale core block and fall back to archival evidence. | ## External Reference Mapping @@ -163,6 +164,9 @@ including the retrieval-quality slice below. The suite currently encodes: classification, and provider credential boundary `blocked` classification. - `personalization`: scoped stable preference correction without temporary or cross-project preference leakage. +- `core_archival_memory`: core block attachment, scope, provenance, stale-core + detection, archival fallback, and project-decision recovery through core routing + plus archival rationale. The generated report includes evidence coverage, source-ref coverage, quote coverage, unsupported-claim count, stale retrieval count, stale-answer count, conflict detection @@ -221,8 +225,10 @@ research gates. 
Its `external_adapters` report section distinguishes: future adapter path, not fixture-backed or live execution evidence. Current state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full -encoded-suite sweep through `cargo make real-world-memory-live-adapters`. Each adapter -materializes generated runtime answers for 38 jobs across 11 suites before scoring. +encoded-suite sweep through `cargo make real-world-memory-live-adapters`. The latest +recorded live sweep materializes generated runtime answers for 38 jobs across 11 +suites before scoring; the newer fixture-only `core_archival_memory` suite is not yet +included in that live sweep. The original targeted `work_resume`, `retrieval`, and `project_decisions` slice still passes, but the full sweep is not a full-suite pass: memory_evolution is `wrong_result`, production_ops remains typed `incomplete`/`blocked`/`not_encoded`, and diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 56ec65a5..7a9d9d85 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. 
The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked or not encoded." + "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, OpenViking trajectory and graph/RAG navigation remain unproven, and Letta core-vs-archival comparison is blocked until the selected contained export/readback path exists. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked or not encoded." ] }, "evidence_class_terms": [ @@ -39,7 +39,12 @@ { "command": "cargo make real-world-memory", "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "claim": "ELF fixture aggregate covers 38 jobs across 11 suites with 36 pass and 2 blocked production-ops operator boundaries." + "claim": "ELF fixture aggregate covers 44 jobs across 12 suites with 42 pass and 2 blocked production-ops operator boundaries, including 6 passing core_archival_memory jobs." 
+ }, + { + "command": "cargo make real-world-memory-core-archival", + "artifact": "tmp/real-world-memory/core-archival/report.json", + "claim": "ELF core_archival_memory fixture coverage scores core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search." }, { "command": "cargo make real-world-memory-live-adapters", @@ -132,14 +137,14 @@ "research_gate", "not_encoded" ], - "measured_claim": "ELF and qmd both pass encoded project_decisions jobs. Letta-style core/archival decision memory is not tested.", + "measured_claim": "ELF and qmd both pass encoded project_decisions jobs. The new ELF core_archival_memory fixture also scores project-decision recovery through core routing plus archival rationale, but Letta-style comparison remains blocked without contained export evidence.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" ], "follow_up_issues": [ "XY-927" ], - "caveat": "No Letta comparison exists until a contained export path is selected." + "caveat": "No Letta comparison exists until the selected contained export/readback path produces source-id-mapped evidence." }, { "scenario_id": "retrieval_quality", @@ -361,20 +366,24 @@ { "scenario_id": "core_vs_archival_memory", "title": "Core-vs-archival memory", - "outcome": "not_tested", + "outcome": "blocked", "evidence_classes": [ + "fixture_backed", "research_gate", + "blocked", "not_encoded" ], - "measured_claim": "ELF has core block semantics in the service contract, but comparable core-vs-archival benchmark jobs and a contained Letta export path are not encoded.", + "measured_claim": "ELF now has 6 fixture-backed core_archival_memory jobs that score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search. 
Letta remains blocked or not_tested until its contained export/readback artifact maps core and archival source ids.", "command_artifacts": [ "docs/spec/system_elf_memory_service_v2.md", - "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + "apps/elf-eval/fixtures/real_world_memory/core_archival_memory", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", + "tmp/real-world-memory/core-archival/report.json" ], "follow_up_issues": [ "XY-927" ], - "caveat": "No ELF-over-Letta claim is allowed." + "caveat": "No ELF-over-Letta claim is allowed; the selected Letta path must export core block JSON, archival search/readback JSON, and source ids before scoring." }, { "scenario_id": "graph_rag_navigation_citations", @@ -431,8 +440,8 @@ { "issue": "XY-927", "priority": "P1", - "state": "Backlog", - "gap": "Letta-style core-vs-archival memory comparison." + "state": "Fixture encoded; Letta export blocked", + "gap": "ELF core_archival_memory fixture coverage is encoded; a contained Letta export/readback adapter remains future work before win/tie/loss claims." 
}, { "issue": "XY-928", @@ -464,6 +473,7 @@ "ELF is adoptable for bounded personal production use with caveats.", "ELF has the strongest measured source-of-truth, rebuild, restore, and backfill evidence among the tracked systems.", "ELF ties qmd on encoded live retrieval, work_resume, project_decisions, and personalization slices.", + "ELF fixture-backed core_archival_memory coverage passes attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery jobs separately from archival search.", "ELF has a live temporal reconciliation loss against the benchmark expectation: five memory_evolution jobs remain wrong_result.", "Most competitor strengths outside qmd retrieval are not_tested, blocked, smoke_only, or research_gate.", "ELF has a narrow live operator-debug win over qmd for trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence, with replay-command availability and repair-action clarity tied." diff --git a/docs/research/2026-06-11-measurement-coverage-audit.json b/docs/research/2026-06-11-measurement-coverage-audit.json index ab71c30e..0ebe1ec9 100644 --- a/docs/research/2026-06-11-measurement-coverage-audit.json +++ b/docs/research/2026-06-11-measurement-coverage-audit.json @@ -1,14 +1,20 @@ { "schema": "elf.benchmark_measurement_coverage_audit/v2", "run_id": "2026-06-11-measurement-coverage-audit", - "source_revision": "current XY-898 lane after adapter-report consistency repairs", + "source_revision": "current benchmark lane after adapter-report consistency repairs and XY-927 core-vs-archival fixture update", "created_at": "2026-06-11", "scope": "ELF memory-system competitiveness measurement coverage, external competitor comparison evidence, and next report directions", "commands": [ + { + "command": "cargo make real-world-memory-core-archival", + "status": "pass", + "runtime_seconds": 57.01, + "artifact": "tmp/real-world-memory/core-archival/report.json" + }, { "command": "cargo make 
real-world-memory", "status": "pass", - "runtime_seconds": 11.91, + "runtime_seconds": 8.94, "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, { @@ -19,21 +25,21 @@ } ], "fixture_aggregate": { - "job_count": 38, - "encoded_suite_count": 11, - "pass": 36, + "job_count": 44, + "encoded_suite_count": 12, + "pass": 42, "wrong_result": 0, "lifecycle_fail": 0, "incomplete": 0, "blocked": 2, "not_encoded": 0, "unsupported_claim": 0, - "mean_score": 0.947, - "mean_latency_ms": 4.411, - "expected_evidence_total": 77, - "expected_evidence_matched": 77, - "evidence_required_count": 84, - "evidence_covered_count": 84 + "mean_score": 0.955, + "mean_latency_ms": 3.958, + "expected_evidence_total": 90, + "expected_evidence_matched": 90, + "evidence_required_count": 97, + "evidence_covered_count": 97 }, "live_real_world_adapters": [ { @@ -197,8 +203,8 @@ "pass": 4, "wrong_result": 6, "lifecycle_fail": 1, - "blocked": 5, - "not_encoded": 7 + "blocked": 6, + "not_encoded": 6 }, "xy900_update_note": "XY-900 promotes graphify from research_gate/blocked to a tiny scored live_real_world wrong_result smoke; broad graph/RAG quality remains unproven.", "xy932_update_note": "XY-932 adds narrow ELF/qmd operator-debug live_real_world records: ELF pass and qmd wrong_result for trace hydration/candidate-drop visibility, with OpenMemory and claude-mem UI still unmeasured." 
@@ -212,7 +218,7 @@ "OpenViking_context_trajectory", "mem0_OpenMemory_entity_history_ui", "agentmemory_claude_mem_capture_continuity", - "Letta_core_vs_archival_memory", + "Letta_core_vs_archival_export_path", "Graphiti_Zep_temporal_graph", "RAG_graph_navigation", "llm_wiki_gbrain_graphify_knowledge_workflows" diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 5bb56574..aa5c78c3 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -525,6 +525,7 @@ Suite ids are stable public names. Each suite MUST contain at least one | `capture_integration` | Evaluate how accurately work observations become usable memory across agents and tools. | Capture a session decision; exclude private spans; import external agent observations. | Hook/import logs, write policy audits, excluded spans, resulting note ids. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior. | agentmemory, claude-mem, memsearch, mem0. | | `production_ops` | Prove safe operation under backup, restore, backfill, cold start, resource, and credential boundaries. | Resume interrupted import; restore from backup; report missing private manifest as bounded caveat. | Command/report artifacts, resource envelope, checkpoint state, failure guard evidence. | lifecycle_behavior, latency_resource, uncertainty_handling, evidence_grounding. | ELF, qmd, memsearch, LangGraph. | | `personalization` | Apply user/project preferences correctly without leaking across scopes or overfitting stale preferences. | Remember preferred response style; avoid using another project tenant's note; update a preference. | Scoped memory ids, preference versions, tenant/project/agent context, negative cross-scope traps. | personalization_fit, trap_avoidance, evidence_grounding, answer_correctness. | mem0, Letta, agentmemory, ELF. 
| +| `core_archival_memory` | Verify always-loaded core memory behavior separately from archival note search and derived retrieval indexes. | Read an attached core block; enforce core block scope; detect stale core state from archival evidence; fall back to archival notes; recover a decision from core routing plus archival rationale. | Core block ids, attachment ids, read_profile/scope metadata, source_ref and audit history, archival note evidence ids, stale-core traps, and explicit no-Qdrant-core-block boundary evidence. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior, workflow_helpfulness. | Letta, ELF. | ## Report Semantics From 0ff95b0201a5ea139762bb2ffee5d9af0cb55612 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 00:47:57 +0800 Subject: [PATCH 2/7] {"schema":"decodex/commit/1","summary":"Repair Letta benchmark review drift","authority":"XY-927"} --- .../memory_projects_manifest.json | 7 +++- .../tests/real_world_job_benchmark.rs | 15 +++++++ ...-11-competitor-strength-evidence-matrix.md | 21 ++++++---- ...on-direction-from-competitor-benchmarks.md | 22 ++++++---- .../research/research_projects_inventory.md | 2 +- ...-11-xy-897-competitor-strength-matrix.json | 42 ++++++++++--------- 6 files changed, 69 insertions(+), 40 deletions(-) diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index a5822e69..e10585a8 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -29,7 +29,7 @@ }, "run": { "status": "blocked", - "evidence": "The current fixture set reports 40 jobs, 38 pass, 0 incomplete, 2 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim.", + "evidence": "The current fixture set reports 46 jobs across 12 suites: 44 pass, 0 incomplete, 2 
blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence.", "command": "cargo make real-world-memory", "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, @@ -101,6 +101,11 @@ "status": "pass", "evidence": "Four redaction, exclusion, source-id, evidence-binding, and capture-boundary fixtures are encoded and passing." }, + { + "suite_id": "core_archival_memory", + "status": "pass", + "evidence": "Six fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search." + }, { "suite_id": "production_ops", "status": "blocked", diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index fa20dc07..d7d5eae7 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -705,6 +705,21 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert!(elf.pointer("/run/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("46 jobs across 12 suites") + && evidence.contains("44 pass") + && evidence.contains("core_archival_memory") + })); + + let elf_suites = array_at(elf, "/suites")?; + let elf_core_archival = find_by_field(elf_suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(elf_core_archival.pointer("/status").and_then(Value::as_str), Some("pass")); + assert!(elf_core_archival.pointer("/evidence").and_then(Value::as_str).is_some_and( + |evidence| evidence.contains("core block attachment") + && evidence.contains("project-decision recovery") + && evidence.contains("archival note search") + )); assert_eq!( 
elf_live.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world") diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index d042d0ec..58692226 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -7,6 +7,8 @@ non-claim against a tracked memory, RAG, or graph project. Inputs: `docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md`, `docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md`, `docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md`, +`docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md`, +`docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`, `docs/guide/research/external_memory_improvement_plan.md`, `docs/guide/research/research_projects_inventory.md`, `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, @@ -29,9 +31,10 @@ Current boundary: live pass. The fresh ELF sweep produced 40 jobs with 22 pass, 5 wrong_result, 0 incomplete, 2 blocked, and 11 not_encoded; the fresh qmd sweep produced 17 pass, 6 wrong_result, 0 incomplete, 2 blocked, and 15 not_encoded. -- ELF fixture evidence is strong: `cargo make real-world-memory` reports 40 jobs - across 11 suites with 38 pass and 2 blocked production-ops operator boundaries. - That proves the fixture contract, not live-service parity. +- ELF fixture evidence is strong: `cargo make real-world-memory` reports 46 jobs + across 12 suites with 44 pass and 2 blocked production-ops operator boundaries. + The added `core_archival_memory` suite contributes 6 fixture-only passes for ELF + core-block behavior; it does not create an ELF-over-Letta claim. 
- qmd is the strongest measured local retrieval-debug comparison, but the current evidence still separates its same-corpus/live-retrieval strengths from the full-suite live non-pass sweep. @@ -45,7 +48,7 @@ Current boundary: The current manifest has 23 adapter records across 16 external projects plus ELF. Evidence-class counts: 1 `fixture_backed`, 6 `live_baseline_only`, 5 `live_real_world`, and 11 `research_gate`. Overall adapter-status counts: 4 `pass`, -6 `wrong_result`, 1 `lifecycle_fail`, 5 `blocked`, and 7 `not_encoded`. +6 `wrong_result`, 1 `lifecycle_fail`, 6 `blocked`, and 6 `not_encoded`. ## State Taxonomy @@ -83,7 +86,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | LightRAG | Lightweight graph/RAG context export with source file-path citation shape. | `research_gate`. | `blocked`: `ELF_LIGHTRAG_CONTEXT_START=1 cargo make lightrag-docker-context-smoke`, `tmp/real-world-memory/lightrag-context/summary.json`. | `blocked`: Docker service setup and context export are not proven. | XY-886 Docker context-export adapter with explicit provider config and source citation mapping. | Context-only query modes, graph-aware retrieval layout, and file-path citation readback. | | GraphRAG | GraphRAG indexing, graph summaries, and document/text-unit evidence tables. | `research_gate`. | `blocked`: `ELF_GRAPHRAG_SMOKE_RUN=1 cargo make graphrag-docker-smoke`, `tmp/real-world-memory/graphrag-smoke/summary.json`. | `blocked`: indexing resource envelope and source citation mapping are not proven. | XY-887 cost-bounded Docker adapter over a tiny corpus and scored output tables. | Graph summary artifacts, local/global search separation, and source table evidence mapping. | | Graphiti/Zep | Temporal graph memory with current, historical, and future fact validity windows. | `research_gate`. 
| `blocked`: `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke`, `tmp/real-world-memory/graphiti-zep-smoke/summary.json`. | `blocked`: Docker graph-store and temporal adapter are not proven. | XY-888 Docker-local temporal graph adapter scoring current/historical fact validity. | Temporal fact windows, invalidation/supersession semantics, and graph fact provenance. | -| Letta | Core memory blocks versus archival memory with explicit operating-context surfaces. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `blocked`: contained evidence export path is not selected. | Select contained export contract, then encode core-vs-archival, personalization, and project-decision jobs. | Core memory block ergonomics, archival separation, and shared operating context readback. | +| Letta | Core memory blocks versus archival memory with explicit operating-context surfaces. | `research_gate`. | `blocked`: the selected comparison contract is a Docker-only benchmark-created agent export that returns core block JSON, archival search/readback JSON, and source ids; no materialized export exists yet. | `blocked`: no Letta materializer currently creates the benchmark agent, imports the ELF `core_archival_memory` fixture corpus, or exports comparable core and archival evidence. | Implement and run the contained export/readback adapter before any Letta win, tie, or loss claim; keep personalization and project-decision scenarios blocked or not tested until that evidence exists. | Core memory block ergonomics, archival separation, and shared operating context readback. | | LangGraph | Checkpoint/replay regression workflow and durable state replay for agent runs. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: not a standalone memory backend adapter. 
| Non-goal for direct win/loss until a standalone memory output contract exists; use replay jobs as benchmark infrastructure reference. | Checkpoint replay, deterministic regression, and state-diff evaluation patterns. | | nanograph | Typed graph schema and query ergonomics for graph-lite developer experience. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: not a memory backend comparison target. | Non-goal for direct win/loss unless a contained memory-backed target emerges; measure ELF graph-lite DX instead. | Typed relation schema, query ergonomics, and small graph developer experience. | | llm-wiki | LLM-maintained wiki or knowledge-page workflow with query-save and lint loops. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: no live service runtime for adapter proof. | Select contained plugin or instruction harness, then score knowledge pages for citations, unsupported claims, rebuild, and stale-source lint. | Maintained wiki workflows, page lint, query-save loops, and topic-scoped navigation. | @@ -96,7 +99,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | --- | --- | --- | --- | --- | | Retrieval/debug | Fixture retrieval passes; live retrieval passes. | qmd. | qmd live retrieval passes and live baseline passes, but full-suite live status is `wrong_result`. | Run qmd deep profile and ELF/qmd trace-level replay with expansion, fusion, rerank, and candidate-drop diagnostics. | | Work resume | Fixture and live work_resume pass. | agentmemory, claude-mem, OpenViking. | agentmemory `lifecycle_fail`, claude-mem `wrong_result`, OpenViking work_resume `not_encoded`. | Encode durable work_resume adapters or keep each blocked with lifecycle/setup evidence. | -| Project decisions | Fixture and live project_decisions pass. | qmd, Letta. 
| qmd live project_decisions pass; Letta is `research_gate` `not_encoded`. | Add Letta core/archival decision jobs only after a contained export path exists. | +| Project decisions | Fixture and live project_decisions pass; the ELF core-archival fixture also scores project-decision recovery through core routing plus archival rationale. | qmd, Letta. | qmd live project_decisions pass; Letta project-decision recovery is `research_gate` `not_tested` or `blocked` until the contained export path exists. | Run the Letta core/archival export/readback contract before treating project-decision recovery as a comparable scenario. | | Source-of-truth | Fixture and live trust_source_of_truth pass. | memsearch. | memsearch canonical-store, reindex, delete, and reload smoke now passes, but source-of-truth real_world_job prompts are `not_encoded`. | Score memsearch source-of-truth rebuild/reload jobs before any suite-level win/loss claim. | | Temporal/current-vs-historical memory | Fixture memory_evolution passes; live memory_evolution is `wrong_result`. | Graphiti/Zep, mem0/OpenMemory. | Graphiti/Zep is `research_gate` `blocked`; mem0/OpenMemory local OSS preference history, entity scope, deletion audit, and SDK `get_all` now pass; OpenMemory UI/export is blocked by the export-helper setup probe; graph-memory scenarios are `not_encoded`. | Fix ELF/qmd live memory_evolution evidence links, add OpenMemory product app import/export readback, and run XY-888. | | Consolidation | Fixture consolidation passes; live consolidation is `not_encoded`. | agentmemory, managed-memory references, llm-wiki. | No manifest project has live consolidation scoring. | Run reviewable consolidation proposal generation with source refs, unsupported-claim flags, and audit transitions. | @@ -104,9 +107,9 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. 
| Operator debugging | Fixture operator_debugging_ux passes, and the narrow live operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity. | qmd, claude-mem, OpenMemory. | qmd ties replay-command availability and repair-action clarity but is `wrong_result` for trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence; claude-mem and OpenMemory UX remain `not_encoded` or blocked. | Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | Fixture capture_integration passes; ELF live capture_integration passes 4/4 with zero redaction leaks, source ids, write-policy audit, and evidence binding. | agentmemory, claude-mem. | agentmemory capture is `blocked` by mocked/in-memory storage; claude-mem hook/viewer capture is `not_encoded`. | Run durable agentmemory and claude-mem capture-hook jobs proving redaction, exclusion, evidence binding, source ids, and no secret leakage. | | Production ops | Fixture production_ops has 4 pass and 2 blocked; live production_ops is `blocked`; production adoption has provider/backfill/restore evidence. | ELF production gate, qmd, RAG/RAGFlow resource gates. | qmd live production_ops is `blocked`; RAG/resource gates are `research_gate` `blocked`. | Rerun private-corpus and credentialed gates only when operator-owned manifest and credentials exist. | -| Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory and Letta personalization are `not_encoded`. | Encode scoped preference readback for mem0/OpenMemory and Letta before personalization superiority claims. | +| Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. 
| mem0/OpenMemory personalization is `not_encoded`; Letta scoped preference readback remains `not_tested` until the contained core/archival export path exists. | Encode scoped preference readback for mem0/OpenMemory and Letta before personalization superiority claims. | | Context trajectory | ELF has trace direction but no comparable staged trajectory scenario. | OpenViking. | OpenViking setup is pinned, same-corpus retrieval is `wrong_result`, and hierarchy trajectory is `not_encoded`. | Make OpenViking evidence-bearing retrieval pass, then score staged context trajectory outputs. | -| Core-vs-archival memory | ELF core-block semantics exist in the service contract, but comparative benchmark coverage is not encoded here. | Letta. | Letta is `research_gate` `not_encoded` until contained export proof exists. | Add ELF core-block versus archival-search jobs; compare Letta only after contained export proof. | +| Core-vs-archival memory | Fixture `core_archival_memory` passes 6/6 and scores core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search. | Letta. | Letta is `research_gate` `blocked`/`not_tested` until the selected contained export/readback artifact exists. | Implement the Letta export/readback adapter, then compare only scenarios whose core block JSON, archival search/readback JSON, and source ids are present. | | Graph/RAG navigation | ELF relation context is not enough to claim graph/RAG navigation parity. | RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify. | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain `research_gate` blocked/incomplete without explicit setup; graphify has only a tiny scored smoke `wrong_result`. | Run larger contained graph/RAG adapters with evidence-linked outputs before any ELF graph/RAG win, tie, or loss claim. 
| ## Parallelizable Benchmark Follow-Ups @@ -129,7 +132,7 @@ now explicit: | Graphiti/Zep temporal graph adapter | XY-888 | yes | Docker-local graph store setup. | Current/historical/future fact validity and evidence ids. | | graphify graph report adapter | XY-889 plus post-XY-900 expansion | yes | Representative graph/RAG jobs beyond the tiny scored smoke. | `graph.json` and `GRAPH_REPORT` evidence mapped to scored graph navigation and knowledge synthesis ids. | | Private corpus and credentialed production ops | Operator-owned benchmark gates | no | Sanitized private manifest and routed provider credentials. | Private-corpus retrieval quality and credentialed production-ops evidence. | -| Letta, LangGraph, nanograph, llm-wiki direct adapters | Research-only until output contract | no | Contained evidence export or non-memory-backend comparability contract. | Run only after each has a comparable output contract; otherwise keep as product-reference evidence. | +| Letta, LangGraph, nanograph, llm-wiki direct adapters | Letta export artifact blocked; others research-only until output contract | no | Letta needs the selected contained export/readback artifact; the others need a non-memory-backend comparability contract. | Run only after comparable output exists; otherwise keep as product-reference evidence. 
| ## Validation Contract diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index 5948ba26..1363d3f0 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -116,8 +116,8 @@ Overall adapter statuses: | `pass` | `4` | | `wrong_result` | `6` | | `lifecycle_fail` | `1` | -| `blocked` | `5` | -| `not_encoded` | `7` | +| `blocked` | `6` | +| `not_encoded` | `6` | The ledger is intentionally not a leaderboard. It prevents fixture evidence, same-corpus checks, research gates, and live real-world runs from being collapsed into @@ -129,7 +129,7 @@ one misleading score. | --- | --- | --- | | Retrieval/debug | ELF and qmd are tied on encoded live retrieval; qmd remains the stronger debug UX reference. | Add trace-level replay, expansion/fusion/rerank knobs, candidate-drop diagnosis, and command-line replay. | | Work resume | ELF live work-resume passes; continuity-oriented competitors are undermeasured. | Borrow agentmemory/claude-mem capture breadth and OpenViking staged context, but require durable adapter proof. | -| Project decisions | ELF and qmd live project-decision suites pass; Letta is not encoded. | Add core-vs-archival decision-memory scenarios before comparing Letta. | +| Project decisions | ELF and qmd live project-decision suites pass; ELF fixture-backed `core_archival_memory` also scores project-decision recovery, while Letta remains blocked without export evidence. | Run the Letta core/archival export/readback contract before treating project-decision recovery as comparable. | | Source of truth | ELF has the strongest measured source-of-truth evidence. | Borrow memsearch's local canonical-store ergonomics without making files or vectors authoritative. 
| | Temporal memory | ELF fixture passes, but live memory evolution is wrong_result. | Prioritize current-vs-historical evidence links and Graphiti/Zep-style validity windows. | | Consolidation | ELF fixture passes, but live proposal generation is not encoded. | Build reviewable derived proposals with source refs, confidence, unsupported-claim flags, and apply/defer/discard audit. | @@ -137,9 +137,9 @@ one misleading score. | Operator debugging | Fixture UX passes and the narrow live trace/viewer slice is scored: ELF passes, qmd ties replay/repair clarity but is wrong_result for trace hydration and candidate-drop visibility. | Expand coverage to OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | ELF live capture/write-policy self-check passes with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem is `not_encoded`. | Borrow agentmemory/claude-mem capture breadth only after durable local hook/viewer evidence exists, while preserving redaction and evidence binding. | | Production ops | ELF has the strongest checked-in evidence, with private/credential gates blocked. | Keep Docker-first production proof and add private corpus only when an operator-owned manifest exists. | -| Personalization | ELF live personalization passes; mem0/OpenMemory and Letta are not encoded. | Add entity-scoped preference history and UI readback before claiming stronger personalization. | +| Personalization | ELF live personalization passes; mem0/OpenMemory is not encoded and Letta scoped preference readback remains not tested until its contained export path exists. | Add entity-scoped preference history and UI readback before claiming stronger personalization. | | Context trajectory | Not comparable yet; OpenViking remains the reference. | Score staged retrieval, hierarchy expansion, and trajectory readback. | -| Core-vs-archival | Product gap, not a measured comparison yet. 
| Borrow Letta's core memory block shape with explicit scope, provenance, and read-only attachment. | +| Core-vs-archival | ELF fixture-backed `core_archival_memory` passes 6/6, but Letta remains blocked/not tested because no contained export artifact exists. | Borrow Letta's core memory block shape while keeping any win/tie/loss claim gated on exported core block, archival readback, and source-id evidence. | | Graph/RAG navigation | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain research gates; graphify has a tiny scored `wrong_result` smoke. | Run larger contained graph/RAG adapters before any broad graph-navigation claim. | ## Project Guidance Matrix @@ -157,7 +157,7 @@ one misleading score. | LightRAG | `research_gate`; current status is `blocked`. | Lightweight graph/RAG context export and source-path citation shape. | Borrow context-export ideas for graph/RAG navigation after Docker proof. | | GraphRAG | `research_gate`; current status is `blocked`. | Graph summaries, document/text-unit tables, local/global search separation. | Borrow graph summary artifacts for knowledge pages and graph navigation after cost-bounded output proof. | | Graphiti/Zep | `research_gate`; current status is `blocked`. | Temporal graph facts, validity windows, current-vs-historical answers. | Use as the semantic model for ELF temporal memory and relation validity benchmarks. | -| Letta | `research_gate`; current status is `not_encoded`. | Core memory blocks versus archival memory. | Add explicit scoped core blocks in ELF, but compare Letta only after a contained export path exists. | +| Letta | `research_gate`; current status is `blocked` until the selected contained export/readback artifact exists. | Core memory blocks versus archival memory. | Keep ELF's fixture-backed core block coverage separate from Letta comparison claims; compare Letta only after exported core and archival evidence exists. 
| | LangGraph | `research_gate`; current status is `not_encoded` or `unsupported` as a direct memory backend. | Checkpoint, replay, fork, and regression debugging for agent state. | Borrow replay/regression patterns for benchmark infrastructure, not as direct memory parity. | | nanograph | `research_gate`; current status is `not_encoded` or `unsupported` as a full memory backend. | Typed graph schema and query ergonomics. | Borrow graph-lite DX and typed relation query ideas. | | llm-wiki | `research_gate`; current status is `not_encoded`. | Maintained wiki pages, query-save, lint, and repair loops. | Use as a reference for rebuildable, cited knowledge pages. | @@ -225,8 +225,10 @@ These improve day-to-day usefulness while preserving ELF's evidence-bound core. - Borrow from: Letta core memory versus archival memory. - ELF shape: scoped read-only blocks with provenance and attachment rules, separate from archival search. - - Benchmark gate: core-vs-archival jobs prove correct attachment, sharing, and - fallback to search. + - Benchmark gate: ELF fixture jobs now prove attachment, scope, provenance, + stale-core detection, archival fallback, and project-decision recovery; Letta + comparison remains gated on exported core block, archival readback, and source-id + evidence. ### P2 - Expand External Comparison Without Fake Wins @@ -265,7 +267,9 @@ Do not claim: - ELF beats mem0/OpenMemory on hosted memory, entity history, UI, or optional graph memory. Those scenarios are not encoded; the operator-debug win is only against qmd on a narrow trace/replay slice. -- ELF beats Letta on core-vs-archival memory. That scenario is not encoded. +- ELF beats Letta on core-vs-archival memory. ELF has fixture-backed coverage, but + Letta remains blocked/not tested until the selected contained export/readback path + produces comparable source-id-mapped evidence. - ELF beats RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, or graphify on graph/RAG navigation. 
Current evidence is research-gate or blocked except graphify's tiny non-pass smoke. diff --git a/docs/guide/research/research_projects_inventory.md b/docs/guide/research/research_projects_inventory.md index 2f1cb9c0..be322238 100644 --- a/docs/guide/research/research_projects_inventory.md +++ b/docs/guide/research/research_projects_inventory.md @@ -31,7 +31,7 @@ Last updated: June 11, 2026. | [gbrain](https://github.com/garrytan/gbrain) | D1 | Reviewed; XY-882 verdict `blocked` | `rw.knowledge-synthesis`, `rw.operator-continuity` | Operational knowledge brain, `compiled_truth` + timeline pages, enrichment and maintenance loops; blocked on Docker-local brain repo and database proof | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | | [Always-On Memory Agent](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent) | D1 | Reviewed | `rw.consolidation-review`, `rw.operator-continuity` | Always-on multimodal ingest + scheduled consolidation loop with simple local ops surface | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | | [graphify](https://github.com/safishamsi/graphify) | D1 | Reviewed; XY-882 verdict `adapter_candidate`; XY-889 adds Docker graph/report smoke | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Multimodal graph compression, deterministic code extraction, and graph/report outputs with source-file/source-location references; current ELF evidence is a generated-corpus Docker smoke, not broad graph-quality proof | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`; 
`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | -| [Letta](https://github.com/letta-ai/letta) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.core-archival`, `rw.operator-continuity` | Core vs archival memory split, shared blocks; not an implementation candidate until a supported contained server path can export evidence | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [Letta](https://github.com/letta-ai/letta) | D1 | Reviewed; XY-882 verdict `research_only`; XY-927 selects blocked contained export/readback path | `rw.core-archival`, `rw.operator-continuity` | Core vs archival memory split, shared blocks; compare only after a Docker-only benchmark-created agent export returns core block JSON, archival readback JSON, and source ids | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | | [LangGraph](https://docs.langchain.com/oss/python/langgraph/persistence) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.replay-regression`, `rw.resume-evidence` | Checkpoint/replay mindset for quality regression workflows; not a standalone memory backend adapter | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | | [Graphiti / Zep](https://help.getzep.com/graphiti/core-concepts/temporal-awareness) | D1 | Reviewed; XY-882 verdict `adapter_candidate` | `rw.graph-temporal`, `rw.resume-evidence` | Temporal fact validity model with Docker-local graph-store options and UUID/fact/validity-window output 
| `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | | [nanograph](https://github.com/nanograph/nanograph) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema + typed query ergonomics for graph-lite developer experience; official shape is no server/no Docker | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index 528fc057..558fa520 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -8,6 +8,8 @@ "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md", "docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md", "docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md", + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", "docs/guide/research/external_memory_improvement_plan.md", "docs/guide/research/research_projects_inventory.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", @@ -30,8 +32,8 @@ }, "overall_status_counts": { "lifecycle_fail": 1, - "blocked": 5, - "not_encoded": 7, + "blocked": 6, + "not_encoded": 6, "pass": 4, "wrong_result": 6 } @@ -310,17 +312,17 @@ "supporting_evidence_classes": [ "research_gate" ], - "measured_status": "not_encoded", + "measured_status": "blocked", "proof": { - "command": null, - 
"artifact": "docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json" + "command": "blocked until a Docker-only benchmark-created agent export is implemented", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" }, "unsupported_or_blocked_status": { "state": "blocked", - "typed_reason": "contained_evidence_export_path_not_selected", - "details": "Research-only until a supported contained server path can export core/archival evidence without relying on unsupported setup." + "typed_reason": "contained_export_readback_artifact_missing", + "details": "The selected contract requires a benchmark-created Letta agent export with core block JSON, archival search/readback JSON, and source ids before any scenario claim can be scored." }, - "benchmark_before_claim": "Select a contained evidence export contract, then encode core-vs-archival memory, personalization, and project-decision jobs.", + "benchmark_before_claim": "Implement and run the contained export/readback adapter before any Letta win, tie, or loss claim; keep personalization and project-decision scenarios blocked or not tested until that evidence exists.", "borrow_if_stronger": "Borrow explicit core memory block ergonomics, archival separation, and shared operating context readback." 
}, { @@ -446,11 +448,11 @@ { "scenario_id": "project_decisions", "scenario": "project decisions", - "current_elf_evidence": "ELF fixture-backed and live_real_world project_decisions suites pass.", + "current_elf_evidence": "ELF fixture-backed and live_real_world project_decisions suites pass; the ELF core_archival_memory fixture also scores project-decision recovery through core routing plus archival rationale.", "strongest_competitor_or_reference": "qmd, Letta", - "current_competitor_evidence": "qmd live_real_world project_decisions passes; Letta project_decisions is research_gate not_encoded.", - "current_state": "ELF and qmd are the only measured live competitors for this scenario.", - "next_measurement": "Add core/archival decision-memory jobs for Letta only after a contained export path exists; otherwise keep Letta as design reference." + "current_competitor_evidence": "qmd live_real_world project_decisions passes; Letta project-decision recovery is research_gate not_tested or blocked until the contained export path exists.", + "current_state": "ELF and qmd are the only measured live competitors for this scenario; Letta remains a product-reference comparison target.", + "next_measurement": "Run the Letta core/archival export/readback contract before treating project-decision recovery as a comparable scenario." 
}, { "scenario_id": "source_of_truth", @@ -520,7 +522,7 @@ "scenario": "personalization", "current_elf_evidence": "ELF fixture-backed personalization passes and ELF live_real_world personalization passes.", "strongest_competitor_or_reference": "mem0/OpenMemory, Letta", - "current_competitor_evidence": "mem0/OpenMemory personalization is not_encoded and Letta personalization is research_gate not_encoded.", + "current_competitor_evidence": "mem0/OpenMemory personalization is not_encoded and Letta scoped preference readback remains not_tested until the contained core/archival export path exists.", "current_state": "ELF and qmd have live encoded evidence; personalization-specialized competitors are not yet comparable.", "next_measurement": "Encode mem0/OpenMemory and Letta scoped-preference readback jobs before making personalization superiority claims." }, @@ -536,11 +538,11 @@ { "scenario_id": "core_vs_archival_memory", "scenario": "core-vs-archival memory", - "current_elf_evidence": "ELF spec and admin surfaces define core blocks, but comparative benchmark coverage is not yet encoded here.", + "current_elf_evidence": "ELF fixture core_archival_memory passes 6/6 and scores core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search.", "strongest_competitor_or_reference": "Letta", - "current_competitor_evidence": "Letta is research_gate not_encoded until a contained evidence export path is selected.", - "current_state": "Scenario is a product gap measurement target, not a current win/loss surface.", - "next_measurement": "Add core-block versus archival-search jobs for ELF and only compare Letta after contained export proof exists." 
+ "current_competitor_evidence": "Letta is research_gate blocked/not_tested until the selected contained export/readback artifact exists.", + "current_state": "ELF has fixture-only core-block evidence; Letta remains unscored, so no win, tie, or loss claim is allowed.", + "next_measurement": "Implement the Letta export/readback adapter, then compare only scenarios whose core block JSON, archival search/readback JSON, and source ids are present." }, { "scenario_id": "graph_rag_navigation", @@ -646,10 +648,10 @@ }, { "workstream": "Letta, LangGraph, nanograph, llm-wiki direct adapters", - "issue_or_candidate": "research-only until output contract", + "issue_or_candidate": "Letta export artifact blocked; others research-only until output contract", "parallelizable": false, - "blocked_by": "Contained evidence export or non-memory-backend comparability contract.", - "measurement": "Only run after each has a comparable output contract; otherwise treat as product-reference evidence." + "blocked_by": "Letta needs the selected contained export/readback artifact; the others need a non-memory-backend comparability contract.", + "measurement": "Only run after comparable output exists; otherwise treat as product-reference evidence." 
} ] } From 2e0b926183f58f3230ac36750c727f6afb0cc1db Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 01:01:41 +0800 Subject: [PATCH 3/7] {"schema":"decodex/commit/1","summary":"Repair Letta benchmark report drift","authority":"XY-927"} --- .../tests/real_world_job_benchmark.rs | 38 +++++++++++++++++++ ...-11-competitor-strength-evidence-matrix.md | 2 +- ...on-direction-from-competitor-benchmarks.md | 14 +++---- ...-11-xy-897-competitor-strength-matrix.json | 6 +-- 4 files changed, 49 insertions(+), 11 deletions(-) diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index d7d5eae7..44d94368 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -1923,6 +1923,7 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { let competitor_matrix_json = serde_json::from_str::(&fs::read_to_string( competitor_strength_matrix_json_path()?, )?)?; + let iteration_direction = fs::read_to_string(iteration_direction_report_path()?)?; let external_manifest = fs::read_to_string(external_adapter_manifest_path())?; let retrieval_debug_profile = serde_json::from_str::(&fs::read_to_string(retrieval_debug_profile_json_path()?)?)?; @@ -1949,6 +1950,16 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { assert!(external_manifest.contains( "The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run." 
)); + assert!(iteration_direction.contains("| Jobs | `46` |")); + assert!(iteration_direction.contains("| Encoded suites | `12` |")); + assert!(iteration_direction.contains("| Pass | `44` |")); + assert!(iteration_direction.contains("| Evidence coverage | `101/101` |")); + assert!(iteration_direction.contains("| Expected evidence recall | `93/93` |")); + assert!(competitor_matrix.contains("scenario-level `live_baseline_only` tie")); + assert!( + competitor_matrix + .contains("broader real-world personalization prompt adapter remains `not_encoded`") + ); for stale_phrase in [ "same live sweep shape as ELF", @@ -1957,9 +1968,13 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { "wrong_result, incomplete, blocked, and not_encoded states remain visible", "broader live suites remain `wrong_result`, `incomplete`, or `not_encoded`", "The qmd live real-world slice covers representative jobs only", + "| Jobs | `40` |", + "| Encoded suites | `11` |", + "| Pass | `38` |", ] { assert!(!measurement_audit.contains(stale_phrase)); assert!(!competitor_matrix.contains(stale_phrase)); + assert!(!iteration_direction.contains(stale_phrase)); assert!(!external_manifest.contains(stale_phrase)); } @@ -2243,6 +2258,7 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { let scenarios = array_at(matrix, "/scenario_matrix")?; let retrieval_debug = find_by_field(scenarios, "/scenario_id", "retrieval_debug")?; let operator_debug = find_by_field(scenarios, "/scenario_id", "operator_debugging")?; + let personalization = find_by_field(scenarios, "/scenario_id", "personalization")?; let context_trajectory = find_by_field(scenarios, "/scenario_id", "context_trajectory")?; assert_competitor_strength_matrix_manifest_counts(matrix); @@ -2330,6 +2346,9 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { .and_then(Value::as_str) .is_some_and(|claim| claim.contains("OpenMemory and claude-mem UI/export")) ); + + 
assert_personalization_matrix_record(personalization); + assert!( context_trajectory .pointer("/current_state") @@ -2346,6 +2365,25 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { Ok(()) } +fn assert_personalization_matrix_record(personalization: &Value) { + assert!( + personalization + .pointer("/current_competitor_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("scenario-level live_baseline_only tie") + && claim.contains( + "broader real_world_job personalization prompt adapter remains not_encoded" + )) + ); + assert!( + personalization + .pointer("/current_state") + .and_then(Value::as_str) + .is_some_and(|state| state.contains("ties the scoped-personalization smoke") + && state.contains("not yet comparable across the broader suite")) + ); +} + fn assert_competitor_strength_matrix_manifest_counts(matrix: &Value) { assert_eq!( matrix.pointer("/manifest_summary/adapter_records").and_then(Value::as_u64), diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index 58692226..8ce82a39 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -107,7 +107,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Operator debugging | Fixture operator_debugging_ux passes, and the narrow live operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity. | qmd, claude-mem, OpenMemory. | qmd ties replay-command availability and repair-action clarity but is `wrong_result` for trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence; claude-mem and OpenMemory UX remain `not_encoded` or blocked. 
| Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | Fixture capture_integration passes; ELF live capture_integration passes 4/4 with zero redaction leaks, source ids, write-policy audit, and evidence binding. | agentmemory, claude-mem. | agentmemory capture is `blocked` by mocked/in-memory storage; claude-mem hook/viewer capture is `not_encoded`. | Run durable agentmemory and claude-mem capture-hook jobs proving redaction, exclusion, evidence binding, source ids, and no secret leakage. | | Production ops | Fixture production_ops has 4 pass and 2 blocked; live production_ops is `blocked`; production adoption has provider/backfill/restore evidence. | ELF production gate, qmd, RAG/RAGFlow resource gates. | qmd live production_ops is `blocked`; RAG/resource gates are `research_gate` `blocked`. | Rerun private-corpus and credentialed gates only when operator-owned manifest and credentials exist. | -| Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory personalization is `not_encoded`; Letta scoped preference readback remains `not_tested` until the contained core/archival export path exists. | Encode scoped preference readback for mem0/OpenMemory and Letta before personalization superiority claims. | +| Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory has a scenario-level `live_baseline_only` tie for entity-scoped personalization, while the broader real-world personalization prompt adapter remains `not_encoded`; Letta scoped preference readback remains `not_tested` until the contained core/archival export path exists. | Encode broader mem0/OpenMemory real-world personalization prompts and Letta scoped preference readback before personalization superiority claims. | | Context trajectory | ELF has trace direction but no comparable staged trajectory scenario. | OpenViking. 
| OpenViking setup is pinned, same-corpus retrieval is `wrong_result`, and hierarchy trajectory is `not_encoded`. | Make OpenViking evidence-bearing retrieval pass, then score staged context trajectory outputs. | | Core-vs-archival memory | Fixture `core_archival_memory` passes 6/6 and scores core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search. | Letta. | Letta is `research_gate` `blocked`/`not_tested` until the selected contained export/readback artifact exists. | Implement the Letta export/readback adapter, then compare only scenarios whose core block JSON, archival search/readback JSON, and source ids are present. | | Graph/RAG navigation | ELF relation context is not enough to claim graph/RAG navigation parity. | RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify. | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain `research_gate` blocked/incomplete without explicit setup; graphify has only a tiny scored smoke `wrong_result`. | Run larger contained graph/RAG adapters with evidence-linked outputs before any ELF graph/RAG win, tie, or loss claim. 
| diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index 1363d3f0..cffe4849 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -44,18 +44,18 @@ The strongest current statement is: | Metric | Value | | --- | ---: | -| Jobs | `40` | -| Encoded suites | `11` | -| Pass | `38` | +| Jobs | `46` | +| Encoded suites | `12` | +| Pass | `44` | | Blocked | `2` | | Wrong result | `0` | | Lifecycle fail | `0` | | Incomplete | `0` | | Not encoded | `0` | | Unsupported claim | `0` | -| Mean score | `0.950` | -| Evidence coverage | `88/88` | -| Expected evidence recall | `80/80` | +| Mean score | `0.957` | +| Evidence coverage | `101/101` | +| Expected evidence recall | `93/93` | This proves the fixture contract is broad and well controlled. It does not prove that every live adapter or every competitor runtime passes those scenarios. @@ -137,7 +137,7 @@ one misleading score. | Operator debugging | Fixture UX passes and the narrow live trace/viewer slice is scored: ELF passes, qmd ties replay/repair clarity but is wrong_result for trace hydration and candidate-drop visibility. | Expand coverage to OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | ELF live capture/write-policy self-check passes with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem is `not_encoded`. | Borrow agentmemory/claude-mem capture breadth only after durable local hook/viewer evidence exists, while preserving redaction and evidence binding. | | Production ops | ELF has the strongest checked-in evidence, with private/credential gates blocked. 
| Keep Docker-first production proof and add private corpus only when an operator-owned manifest exists. | -| Personalization | ELF live personalization passes; mem0/OpenMemory is not encoded and Letta scoped preference readback remains not tested until its contained export path exists. | Add entity-scoped preference history and UI readback before claiming stronger personalization. | +| Personalization | ELF live personalization passes; mem0/OpenMemory ties the entity-scoped personalization smoke but still lacks a broader real-world prompt adapter, and Letta scoped preference readback remains not tested until its contained export path exists. | Add broader entity/preference history and UI readback before claiming stronger personalization. | | Context trajectory | Not comparable yet; OpenViking remains the reference. | Score staged retrieval, hierarchy expansion, and trajectory readback. | | Core-vs-archival | ELF fixture-backed `core_archival_memory` passes 6/6, but Letta remains blocked/not tested because no contained export artifact exists. | Borrow Letta's core memory block shape while keeping any win/tie/loss claim gated on exported core block, archival readback, and source-id evidence. | | Graph/RAG navigation | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain research gates; graphify has a tiny scored `wrong_result` smoke. | Run larger contained graph/RAG adapters before any broad graph-navigation claim. 
| diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index 558fa520..d7dd1938 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -522,9 +522,9 @@ "scenario": "personalization", "current_elf_evidence": "ELF fixture-backed personalization passes and ELF live_real_world personalization passes.", "strongest_competitor_or_reference": "mem0/OpenMemory, Letta", - "current_competitor_evidence": "mem0/OpenMemory personalization is not_encoded and Letta scoped preference readback remains not_tested until the contained core/archival export path exists.", - "current_state": "ELF and qmd have live encoded evidence; personalization-specialized competitors are not yet comparable.", - "next_measurement": "Encode mem0/OpenMemory and Letta scoped-preference readback jobs before making personalization superiority claims." + "current_competitor_evidence": "mem0/OpenMemory has a scenario-level live_baseline_only tie for entity_scoped_personalization, while the broader real_world_job personalization prompt adapter remains not_encoded; Letta scoped preference readback remains not_tested until the contained core/archival export path exists.", + "current_state": "ELF and qmd have live encoded personalization evidence; mem0/OpenMemory ties the scoped-personalization smoke but is not yet comparable across the broader suite, and Letta remains unscored.", + "next_measurement": "Encode broader mem0/OpenMemory real_world_job personalization prompts and Letta scoped-preference readback jobs before making personalization superiority claims." 
}, { "scenario_id": "context_trajectory", From 69617e455579415649601431cb979bd4cd7a32ea Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 02:08:10 +0800 Subject: [PATCH 4/7] {"schema":"decodex/commit/1","summary":"Repair core archival benchmark guide aggregate","authority":"XY-927"} --- .../real_world_agent_memory_benchmark.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 9d6f279d..a5fb2eca 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -229,16 +229,19 @@ research gates. Its `external_adapters` report section distinguishes: - `research_gate`: checked-in source/setup/runtime/resource/retry metadata for a future adapter path, not fixture-backed or live execution evidence. -Current fixture state: `cargo make real-world-memory` covers 43 jobs across 12 suites, -with 38 pass and 5 blocked. The blocked jobs are production-ops operator boundaries -plus the XY-928 OpenViking `context_trajectory` gates for staged retrieval, hierarchy -selection, and recursive/context expansion. +Current fixture state: `cargo make real-world-memory` covers 49 jobs across 13 suites, +with 44 pass and 5 blocked. The added `core_archival_memory` suite contributes six +passing fixture jobs for core block attachment, scope, provenance, stale-core +detection, archival fallback, and project-decision recovery. The blocked jobs are +production-ops operator boundaries plus the XY-928 OpenViking `context_trajectory` +gates for staged retrieval, hierarchy selection, and recursive/context expansion. Current live-adapter state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full encoded-suite sweep through `cargo make real-world-memory-live-adapters`. 
Each adapter materializes generated runtime answers for 40 jobs across 11 suites before scoring. -The newer fixture-only `core_archival_memory` suite is scored separately and is not yet -included in that live sweep. +The fixture-only `core_archival_memory` suite can also be run through +`cargo make real-world-memory-core-archival`; it is not yet included in that live +sweep. The original targeted `work_resume`, `retrieval`, and `project_decisions` slice still passes, and ELF now passes the live `capture_integration` self-checks for redaction, exclusions, source ids, evidence binding, and no secret leakage. The full sweep is From c82e9f7e2a2a24a996a1fe73b20784ffc0069784 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 02:50:43 +0800 Subject: [PATCH 5/7] {"schema":"decodex/commit/1","summary":"Enforce Letta core archival benchmark boundaries","authority":"XY-927"} --- .../project_decision_recovery.json | 53 ++++++++-- .../src/bin/real_world_job_benchmark.rs | 47 ++++++++- .../tests/real_world_job_benchmark.rs | 99 +++++++++++++++++-- ...on-direction-from-competitor-benchmarks.md | 8 +- .../2026-06-11-measurement-coverage-audit.md | 8 +- ...2026-06-11-measurement-coverage-audit.json | 8 +- 6 files changed, 192 insertions(+), 31 deletions(-) diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json index 229ecc34..423db375 100644 --- a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json @@ -58,10 +58,27 @@ }, "created_at": "2026-06-11T04:52:00Z" }, + { + "evidence_id": "decision-letta-export-boundary", + "kind": "comparison_boundary", + "text": "Letta comparison boundary: no contained export/readback artifact maps core block JSON, archival search/readback JSON, and 
source ids, so Letta remains blocked or not_tested and no win, tie, or loss claim is allowed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-letta-export-boundary" + }, + "locator": { + "quote": "no contained export/readback artifact maps core block JSON" + } + }, + "created_at": "2026-06-11T04:53:00Z" + }, { "evidence_id": "decision-letta-win-trap", "kind": "unsupported_claim", - "text": "Wrong claim: Letta comparison can be scored as an ELF win because ELF has core blocks.", + "text": "Wrong claim: Letta comparison can be scored as an ELF win or measured loss because ELF has core blocks.", "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", @@ -76,7 +93,7 @@ "adapter_response": { "adapter_id": "fixture_core_archival_memory", "answer": { - "content": "Use the always-attached core routing block to find the benchmark outcome policy, then cite archival notes for the detailed decision. The archival decision says to use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them. It also says core blocks stay separate from archival note search and Qdrant-derived retrieval, so no ELF-over-Letta claim follows from ELF having core blocks.", + "content": "Use the always-attached core routing block to find the benchmark outcome policy, then cite archival notes for the detailed decision. The archival decision says to use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them. It also says core blocks stay separate from archival note search and Qdrant-derived retrieval. 
Letta remains blocked or not_tested until a contained export/readback artifact maps core and archival source ids, so no ELF-over-Letta claim follows from ELF having core blocks.", "claims": [ { "claim_id": "core_routes_to_archival_rationale", @@ -95,12 +112,19 @@ "text": "Core blocks stay separate from archival note search and Qdrant-derived retrieval.", "evidence_ids": ["decision-archival-core-search-boundary"], "confidence": "high" + }, + { + "claim_id": "letta_comparison_requires_export", + "text": "Letta remains blocked or not_tested until a contained export/readback artifact maps core and archival source ids.", + "evidence_ids": ["decision-letta-export-boundary"], + "confidence": "high" } ], "evidence_ids": [ "decision-core-routing-block", "decision-archival-outcome-policy", - "decision-archival-core-search-boundary" + "decision-archival-core-search-boundary", + "decision-letta-export-boundary" ], "latency_ms": 1.4, "cost": { @@ -126,7 +150,11 @@ "ts": "2026-06-11T04:51:00Z", "actor": "agent", "action": "recorded_decision", - "evidence_ids": ["decision-archival-outcome-policy", "decision-archival-core-search-boundary"], + "evidence_ids": [ + "decision-archival-outcome-policy", + "decision-archival-core-search-boundary", + "decision-letta-export-boundary" + ], "summary": "Archival notes recorded the detailed outcome policy and core-search boundary." } ], @@ -149,15 +177,22 @@ { "claim_id": "core_archival_boundary_preserved", "text": "Core blocks stay separate from archival note search and Qdrant-derived retrieval." + }, + { + "claim_id": "letta_comparison_requires_export", + "text": "Letta remains blocked or not_tested until a contained export/readback artifact maps core and archival source ids." 
} ], "must_not_include": [ - "Letta comparison can be scored as an ELF win because ELF has core blocks" + "Letta comparison can be scored as an ELF win", + "Letta is a measured loss", + "Letta comparison can be scored as a measured loss" ], "evidence_links": { "core_routes_to_archival_rationale": ["decision-core-routing-block"], "outcomes_require_evidence": ["decision-archival-outcome-policy"], - "core_archival_boundary_preserved": ["decision-archival-core-search-boundary"] + "core_archival_boundary_preserved": ["decision-archival-core-search-boundary"], + "letta_comparison_requires_export": ["decision-letta-export-boundary"] }, "answer_type": "decision_record", "accepted_alternates": [], @@ -182,6 +217,12 @@ "claim_id": "core_archival_boundary_preserved", "requirement": "cite", "quote": "core blocks stay separate from archival note search" + }, + { + "evidence_id": "decision-letta-export-boundary", + "claim_id": "letta_comparison_requires_export", + "requirement": "cite", + "quote": "no contained export/readback artifact maps core block JSON" } ], "negative_traps": [ diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 11ed5106..8590b5ae 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -3111,9 +3111,15 @@ fn job_metrics(job: &RealWorldJob, answer: &ProducedAnswer) -> JobMetrics { .filter(|evidence| produced_evidence.contains(&evidence.evidence_id)) .count(); let stale_retrieval_count = trap_use_count(job, &produced_evidence, "stale_fact", answer); - let scope_violation_count = trap_use_count(job, &produced_evidence, "near_duplicate", answer); - let scope_check_count = - job.negative_traps.iter().filter(|trap| trap.trap_type == "near_duplicate").count(); + let scope_violation_count = ["near_duplicate", "scope_leak"] + .into_iter() + .map(|trap_type| trap_use_count(job, &produced_evidence, trap_type, answer)) + .sum(); + let 
scope_check_count = job + .negative_traps + .iter() + .filter(|trap| is_scope_trap_type(trap.trap_type.as_str())) + .count(); let redaction_leak_count = trap_use_count(job, &produced_evidence, "privacy_leak", answer); let scope_correct_count = scope_check_count.saturating_sub(scope_violation_count); let qdrant_rebuild_case = job.tags.iter().any(|tag| tag == "qdrant_rebuild"); @@ -3138,6 +3144,10 @@ fn source_ref_by_evidence(job: &RealWorldJob) -> BTreeMap<&str, &Value> { job.corpus.items.iter().map(|item| (item.evidence_id.as_str(), &item.source_ref)).collect() } +fn is_scope_trap_type(trap_type: &str) -> bool { + matches!(trap_type, "near_duplicate" | "scope_leak") +} + fn trap_use_count( job: &RealWorldJob, produced_evidence: &BTreeSet, @@ -3933,11 +3943,42 @@ fn validate_adapter_scenarios(path: &Path, adapter: &ExternalAdapterReport) -> R suite_id )); } + + let outcome = scenario_comparison_outcome(scenario); + + if unmeasured_status_has_measured_outcome(scenario.status, outcome) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses {} status with {} outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + adapter_status_str(scenario.status), + scenario_comparison_outcome_str(outcome) + )); + } } Ok(()) } +fn unmeasured_status_has_measured_outcome( + status: AdapterCoverageStatus, + outcome: ScenarioComparisonOutcome, +) -> bool { + matches!( + status, + AdapterCoverageStatus::Blocked + | AdapterCoverageStatus::Incomplete + | AdapterCoverageStatus::NotEncoded + | AdapterCoverageStatus::Unsupported + ) && matches!( + outcome, + ScenarioComparisonOutcome::Win + | ScenarioComparisonOutcome::Tie + | ScenarioComparisonOutcome::Loss + ) +} + fn validate_adapter_evidence(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { for evidence in &adapter.evidence { if evidence.kind.trim().is_empty() || evidence.reference.trim().is_empty() { diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs 
b/apps/elf-eval/tests/real_world_job_benchmark.rs index 7fe90f1a..26e50498 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -1428,6 +1428,59 @@ fn operator_debug_live_adapter_task_is_docker_scoped() -> Result<()> { Ok(()) } +#[test] +fn external_adapter_manifest_rejects_unmeasured_win_loss_scenario_outcomes() -> Result<()> { + let mut manifest = + serde_json::from_str::(&fs::read_to_string(external_adapter_manifest_path())?)?; + let adapters = manifest + .pointer_mut("/adapters") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing manifest adapters"))?; + let letta = adapters + .iter_mut() + .find(|adapter| { + adapter.pointer("/adapter_id").and_then(Value::as_str) == Some("letta_research_gate") + }) + .ok_or_else(|| eyre::eyre!("missing Letta adapter"))?; + let scenarios = letta + .pointer_mut("/scenarios") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing Letta scenarios"))?; + let attachment = scenarios + .iter_mut() + .find(|scenario| { + scenario.pointer("/scenario_id").and_then(Value::as_str) + == Some("core_block_attachment_readback") + }) + .ok_or_else(|| eyre::eyre!("missing Letta attachment scenario"))?; + + set_json_pointer(attachment, "/comparison_outcome", serde_json::json!("win"))?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-invalid-scenario-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let manifest_path = temp_dir.join("memory_projects_manifest.json"); + + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(fixture_dir()) + .arg("--external-adapter-manifest") + .arg(&manifest_path) + .output()?; + + assert!(!output.status.success(), "invalid scenario outcome unexpectedly passed"); + assert!( + String::from_utf8_lossy(&output.stderr).contains("not_encoded status with 
win outcome") + ); + + Ok(()) +} + #[test] fn live_adapter_supports_elf_capture_write_policy_without_external_hook_claims() -> Result<()> { let workspace = workspace_root()?; @@ -2060,8 +2113,8 @@ fn assert_current_report_text_boundaries( assert!(iteration_direction.contains("| Jobs | `49` |")); assert!(iteration_direction.contains("| Encoded suites | `13` |")); assert!(iteration_direction.contains("| Pass | `44` |")); - assert!(iteration_direction.contains("| Evidence coverage | `110/110` |")); - assert!(iteration_direction.contains("| Expected evidence recall | `99/99` |")); + assert!(iteration_direction.contains("| Evidence coverage | `111/111` |")); + assert!(iteration_direction.contains("| Expected evidence recall | `100/100` |")); for stale_phrase in [ "same live sweep shape as ELF", @@ -2850,10 +2903,10 @@ fn assert_iteration_direction_current_measurement_counts(markdown: &str) { "| Encoded suites | `13` |", "| Blocked | `5` |", "| Mean score | `0.898` |", - "| Evidence coverage | `110/110` |", - "| Source-ref coverage | `110/110` |", - "| Quote coverage | `110/110` |", - "| Expected evidence recall | `99/99` |", + "| Evidence coverage | `111/111` |", + "| Source-ref coverage | `111/111` |", + "| Quote coverage | `111/111` |", + "| Expected evidence recall | `100/100` |", "| `blocked` | `7` |", "| `not_encoded` | `5` |", "`live_baseline_only`, `fixture_backed`, and `research_gate`", @@ -3211,6 +3264,14 @@ fn core_archival_memory_fixtures_score_separate_core_and_archival_jobs() -> Resu Some(1.0) ); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), + Some(14) + ); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(14)); + assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(1)); + 
assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); let suites = array_at(&report, "/suites")?; let core = find_by_field(suites, "/suite_id", "core_archival_memory")?; @@ -3234,6 +3295,24 @@ fn core_archival_memory_fixtures_score_separate_core_and_archival_jobs() -> Resu assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); } + let scope = find_by_field(jobs, "/job_id", "core-archival-core-block-scope-001")?; + let decision = find_by_field(jobs, "/job_id", "core-archival-project-decision-recovery-001")?; + + assert_eq!(scope.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); + assert_eq!(scope.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!(scope.pointer("/scope_violation_count").and_then(Value::as_u64), Some(0)); + assert!( + decision + .pointer("/produced_answer") + .and_then(Value::as_str) + .is_some_and(|content| content.contains("Letta remains blocked or not_tested")) + ); + assert!( + array_at(decision, "/produced_evidence")? 
+ .iter() + .any(|id| id.as_str() == Some("decision-letta-export-boundary")) + ); + Ok(()) } @@ -3319,8 +3398,8 @@ fn assert_root_aggregate_summary(report: &Value) { Some(0) ); assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(2)); - assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(3)); assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), @@ -3332,11 +3411,11 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(110) + Some(111) ); assert_eq!( report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), - Some(110) + Some(111) ); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index e32910a1..6fa05a45 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -54,10 +54,10 @@ The strongest current statement is: | Not encoded | `0` | | Unsupported claim | `0` | | Mean score | `0.898` | -| Evidence coverage | `110/110` | -| Source-ref coverage | `110/110` | -| Quote coverage | `110/110` | -| Expected evidence 
recall | `99/99` | +| Evidence coverage | `111/111` | +| Source-ref coverage | `111/111` | +| Quote coverage | `111/111` | +| Expected evidence recall | `100/100` | This proves the fixture contract is broad and well controlled. It does not prove that every live adapter or every competitor runtime passes those scenarios. diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index 90cd444c..c4e8381a 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -90,10 +90,10 @@ failure. | Unsupported claim | `0` | | Mean score | `0.898` | | Mean latency | `3.940 ms` | -| Expected evidence recall | `99/99` | -| Evidence coverage | `110/110` | -| Source-ref coverage | `110/110` | -| Quote coverage | `110/110` | +| Expected evidence recall | `100/100` | +| Evidence coverage | `111/111` | +| Source-ref coverage | `111/111` | +| Quote coverage | `111/111` | This proves fixture contract breadth and scoring behavior. It does not prove every live adapter or competitor runtime can complete those jobs. 
diff --git a/docs/research/2026-06-11-measurement-coverage-audit.json b/docs/research/2026-06-11-measurement-coverage-audit.json index bd7637f0..397f781e 100644 --- a/docs/research/2026-06-11-measurement-coverage-audit.json +++ b/docs/research/2026-06-11-measurement-coverage-audit.json @@ -36,10 +36,10 @@ "unsupported_claim": 0, "mean_score": 0.898, "mean_latency_ms": 3.94, - "expected_evidence_total": 99, - "expected_evidence_matched": 99, - "evidence_required_count": 110, - "evidence_covered_count": 110 + "expected_evidence_total": 100, + "expected_evidence_matched": 100, + "evidence_required_count": 111, + "evidence_covered_count": 111 }, "live_real_world_adapters": [ { From 5534191f2b3ed8ddf04cff84cbf7d56e767bff18 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 03:06:56 +0800 Subject: [PATCH 6/7] {"schema":"decodex/commit/1","summary":"Guard unmeasured adapter scenario positions","authority":"XY-927"} --- .../src/bin/real_world_job_benchmark.rs | 35 +++++ .../tests/real_world_job_benchmark.rs | 126 +++++++++++------- ...-11-competitor-strength-evidence-matrix.md | 2 +- .../2026-06-11-measurement-coverage-audit.md | 4 +- ...2026-06-11-measurement-coverage-audit.json | 4 +- ...-11-xy-897-competitor-strength-matrix.json | 4 +- 6 files changed, 121 insertions(+), 54 deletions(-) diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 8590b5ae..81cda7c7 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -3956,6 +3956,16 @@ fn validate_adapter_scenarios(path: &Path, adapter: &ExternalAdapterReport) -> R scenario_comparison_outcome_str(outcome) )); } + if unmeasured_status_has_measured_position(scenario.status, scenario.elf_position) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses {} status with {} position.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + 
adapter_status_str(scenario.status), + scenario_position_str(scenario.elf_position) + )); + } } Ok(()) @@ -3979,6 +3989,22 @@ fn unmeasured_status_has_measured_outcome( ) } +fn unmeasured_status_has_measured_position( + status: AdapterCoverageStatus, + position: ElfScenarioPosition, +) -> bool { + matches!( + status, + AdapterCoverageStatus::Blocked + | AdapterCoverageStatus::Incomplete + | AdapterCoverageStatus::NotEncoded + | AdapterCoverageStatus::Unsupported + ) && matches!( + position, + ElfScenarioPosition::Wins | ElfScenarioPosition::Ties | ElfScenarioPosition::Loses + ) +} + fn validate_adapter_evidence(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { for evidence in &adapter.evidence { if evidence.kind.trim().is_empty() || evidence.reference.trim().is_empty() { @@ -5036,6 +5062,15 @@ fn scenario_comparison_outcome_str(outcome: ScenarioComparisonOutcome) -> &'stat } } +fn scenario_position_str(position: ElfScenarioPosition) -> &'static str { + match position { + ElfScenarioPosition::Wins => "wins", + ElfScenarioPosition::Ties => "ties", + ElfScenarioPosition::Loses => "loses", + ElfScenarioPosition::Untested => "untested", + } +} + fn adapter_status_counts_display(counts: &AdapterStatusCounts) -> String { [ ("real", counts.real), diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 26e50498..5ae959a7 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -5,7 +5,7 @@ use std::{ env, fs, path::{Path, PathBuf}, - process::{self, Command}, + process::{self, Command, Output}, }; use color_eyre::{Result, eyre}; @@ -267,6 +267,56 @@ fn set_json_pointer(value: &mut Value, pointer: &str, replacement: Value) -> Res Ok(()) } +fn run_external_manifest_with_letta_attachment_mutation( + slug: &str, + mutation: F, +) -> Result +where + F: FnOnce(&mut Value) -> Result<()>, +{ + let mut manifest = + 
serde_json::from_str::(&fs::read_to_string(external_adapter_manifest_path())?)?; + let adapters = manifest + .pointer_mut("/adapters") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing manifest adapters"))?; + let letta = adapters + .iter_mut() + .find(|adapter| { + adapter.pointer("/adapter_id").and_then(Value::as_str) == Some("letta_research_gate") + }) + .ok_or_else(|| eyre::eyre!("missing Letta adapter"))?; + let scenarios = letta + .pointer_mut("/scenarios") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing Letta scenarios"))?; + let attachment = scenarios + .iter_mut() + .find(|scenario| { + scenario.pointer("/scenario_id").and_then(Value::as_str) + == Some("core_block_attachment_readback") + }) + .ok_or_else(|| eyre::eyre!("missing Letta attachment scenario"))?; + + mutation(attachment)?; + + let temp_dir = env::temp_dir().join(format!("elf-real-world-{slug}-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let manifest_path = temp_dir.join("memory_projects_manifest.json"); + + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + Ok(Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(fixture_dir()) + .arg("--external-adapter-manifest") + .arg(&manifest_path) + .output()?) 
+} + #[test] fn smoke_fixture_produces_typed_json_report() -> Result<()> { let report = run_json_report()?; @@ -1430,52 +1480,34 @@ fn operator_debug_live_adapter_task_is_docker_scoped() -> Result<()> { #[test] fn external_adapter_manifest_rejects_unmeasured_win_loss_scenario_outcomes() -> Result<()> { - let mut manifest = - serde_json::from_str::(&fs::read_to_string(external_adapter_manifest_path())?)?; - let adapters = manifest - .pointer_mut("/adapters") - .and_then(Value::as_array_mut) - .ok_or_else(|| eyre::eyre!("missing manifest adapters"))?; - let letta = adapters - .iter_mut() - .find(|adapter| { - adapter.pointer("/adapter_id").and_then(Value::as_str) == Some("letta_research_gate") - }) - .ok_or_else(|| eyre::eyre!("missing Letta adapter"))?; - let scenarios = letta - .pointer_mut("/scenarios") - .and_then(Value::as_array_mut) - .ok_or_else(|| eyre::eyre!("missing Letta scenarios"))?; - let attachment = scenarios - .iter_mut() - .find(|scenario| { - scenario.pointer("/scenario_id").and_then(Value::as_str) - == Some("core_block_attachment_readback") - }) - .ok_or_else(|| eyre::eyre!("missing Letta attachment scenario"))?; - - set_json_pointer(attachment, "/comparison_outcome", serde_json::json!("win"))?; - - let temp_dir = - env::temp_dir().join(format!("elf-real-world-invalid-scenario-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; + let output = run_external_manifest_with_letta_attachment_mutation( + "invalid-scenario-outcome-test", + |scenario| set_json_pointer(scenario, "/comparison_outcome", serde_json::json!("win")), + )?; - let manifest_path = temp_dir.join("memory_projects_manifest.json"); + assert!(!output.status.success(), "invalid scenario outcome unexpectedly passed"); + assert!( + String::from_utf8_lossy(&output.stderr).contains("not_encoded status with win outcome") + ); - fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + Ok(()) +} - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - 
.arg("run") - .arg("--fixtures") - .arg(fixture_dir()) - .arg("--external-adapter-manifest") - .arg(&manifest_path) - .output()?; +#[test] +fn external_adapter_manifest_rejects_unmeasured_win_loss_scenario_positions() -> Result<()> { + let output = run_external_manifest_with_letta_attachment_mutation( + "invalid-scenario-position-test", + |scenario| { + set_json_pointer(scenario, "/status", serde_json::json!("not_encoded"))?; + set_json_pointer(scenario, "/elf_position", serde_json::json!("wins"))?; + + set_json_pointer(scenario, "/comparison_outcome", serde_json::json!("not_tested")) + }, + )?; - assert!(!output.status.success(), "invalid scenario outcome unexpectedly passed"); + assert!(!output.status.success(), "invalid scenario position unexpectedly passed"); assert!( - String::from_utf8_lossy(&output.stderr).contains("not_encoded status with win outcome") + String::from_utf8_lossy(&output.stderr).contains("not_encoded status with wins position") ); Ok(()) @@ -2500,13 +2532,13 @@ fn assert_competitor_strength_matrix_manifest_counts(matrix: &Value) { ); assert_eq!( matrix.pointer("/manifest_summary/overall_status_counts/blocked").and_then(Value::as_u64), - Some(6) + Some(7) ); assert_eq!( matrix .pointer("/manifest_summary/overall_status_counts/not_encoded") .and_then(Value::as_u64), - Some(6) + Some(5) ); assert_eq!( matrix @@ -2886,13 +2918,13 @@ fn assert_operator_facing_strength_profile_boundaries( fn assert_measurement_audit_adapter_status_counts(markdown: &str) { for expected in [ - "| `blocked` | `6` |", - "| `not_encoded` | `6` |", + "| `blocked` | `7` |", + "| `not_encoded` | `5` |", "The generated JSON report emits `external_project_count: 16`", ] { assert!(markdown.contains(expected), "missing measurement audit text: {expected}"); } - for stale in ["| `blocked` | `5` |", "| `not_encoded` | `7` |"] { + for stale in ["| `blocked` | `6` |", "| `not_encoded` | `6` |"] { assert!(!markdown.contains(stale), "stale measurement audit text: {stale}"); } } diff 
--git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index d5c9200a..06680c4e 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -49,7 +49,7 @@ Current boundary: The current manifest has 23 adapter records across 16 external projects plus ELF. Evidence-class counts: 1 `fixture_backed`, 6 `live_baseline_only`, 5 `live_real_world`, and 11 `research_gate`. Overall adapter-status counts: 4 `pass`, -6 `wrong_result`, 1 `lifecycle_fail`, 6 `blocked`, and 6 `not_encoded`. +6 `wrong_result`, 1 `lifecycle_fail`, 7 `blocked`, and 5 `not_encoded`. ## State Taxonomy diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index c4e8381a..67c26673 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -156,8 +156,8 @@ The checked-in manifest records 23 adapter records across 17 unique project name | `pass` | `4` | | `wrong_result` | `6` | | `lifecycle_fail` | `1` | -| `blocked` | `6` | -| `not_encoded` | `6` | +| `blocked` | `7` | +| `not_encoded` | `5` | The generated JSON report emits `external_project_count: 16`, matching the unique non-ELF project-name count from the manifest. 
The companion audit JSON separately diff --git a/docs/research/2026-06-11-measurement-coverage-audit.json b/docs/research/2026-06-11-measurement-coverage-audit.json index 397f781e..ff2405b1 100644 --- a/docs/research/2026-06-11-measurement-coverage-audit.json +++ b/docs/research/2026-06-11-measurement-coverage-audit.json @@ -203,8 +203,8 @@ "pass": 4, "wrong_result": 6, "lifecycle_fail": 1, - "blocked": 6, - "not_encoded": 6 + "blocked": 7, + "not_encoded": 5 }, "xy900_update_note": "XY-900 promotes graphify from research_gate/blocked to a tiny scored live_real_world wrong_result smoke; broad graph/RAG quality remains unproven.", "xy932_update_note": "XY-932 adds narrow ELF/qmd operator-debug live_real_world records: ELF pass and qmd wrong_result for trace hydration/candidate-drop visibility, with OpenMemory and claude-mem UI still unmeasured.", diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index 92665fdb..93e23158 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -32,8 +32,8 @@ }, "overall_status_counts": { "lifecycle_fail": 1, - "blocked": 6, - "not_encoded": 6, + "blocked": 7, + "not_encoded": 5, "pass": 4, "wrong_result": 6 } From 05232fbad4e3ad5f96cfe0757181135278b9cbda Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 03:44:56 +0800 Subject: [PATCH 7/7] {"schema":"decodex/commit/1","summary":"Align Letta core archival comparison contract","authority":"XY-927"} --- README.md | 11 +- .../memory_projects_manifest.json | 1 + .../src/bin/real_world_job_benchmark.rs | 48 +++++++ .../tests/real_world_job_benchmark.rs | 128 +++++++++++++++--- ...-11-competitor-strength-adoption-report.md | 7 +- .../2026-06-11-measurement-coverage-audit.md | 2 +- .../real_world_agent_memory_benchmark.md | 16 ++- .../research/comparison_external_projects.md | 16 ++- 
...1-competitor-strength-adoption-report.json | 2 +- .../real_world_agent_memory_benchmark_v1.md | 14 +- 10 files changed, 202 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 08c35d00..5bcef8ee 100644 --- a/README.md +++ b/README.md @@ -145,10 +145,13 @@ provider-backed ELF evidence was required. rebuild returned `rebuilt_count=1`, `missing_vector_count=0`, `error_count=0`, and search recovered the restored note. - Fresh all-project smoke run: ELF and qmd passed every encoded check. agentmemory - passed same-corpus retrieval but failed lifecycle/cold-start coverage. memsearch, - mem0, OpenViking, and claude-mem remained typed non-pass states. OpenViking now - reaches its pinned Docker local embedding path and is reported as `wrong_result` - when same-corpus evidence terms are missed; setup failures remain `incomplete`. + passed same-corpus retrieval but failed lifecycle/cold-start coverage. mem0/OpenMemory + and memsearch now pass their scoped local baseline smokes, while OpenMemory + UI/export, hosted mem0 Platform, optional graph memory, and broader memsearch prompt + and TTL coverage remain blocked, unsupported, or not encoded. OpenViking now reaches + its pinned Docker local embedding path and is reported as `wrong_result` when + same-corpus evidence terms are missed; claude-mem and OpenViking non-retrieval + coverage remain typed non-pass states. - Real-world agent memory aggregate after XY-927 and XY-928: 49 fixture-backed jobs across 13 suites, 44 pass, 0 incomplete, 5 blocked, 0 wrong-result, 0 not-encoded, and 0 unsupported-claim results. 
The remaining non-pass jobs are diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 66813da7..42d3ab15 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -858,6 +858,7 @@ "suite_id": "work_resume", "status": "blocked", "elf_position": "untested", + "comparison_outcome": "blocked", "evidence": "agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. Keep work_resume and capture claims blocked until a durable local adapter path exists.", "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" }, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 81cda7c7..d4d0c6ac 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -3946,6 +3946,14 @@ fn validate_adapter_scenarios(path: &Path, adapter: &ExternalAdapterReport) -> R let outcome = scenario_comparison_outcome(scenario); + if blocked_status_missing_blocked_outcome(scenario.status, scenario.comparison_outcome) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses blocked status without blocked comparison outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id + )); + } if unmeasured_status_has_measured_outcome(scenario.status, outcome) { return Err(eyre::eyre!( "{} adapter {} scenario {} uses {} status with {} outcome.", @@ -3966,11 +3974,28 @@ fn validate_adapter_scenarios(path: &Path, adapter: &ExternalAdapterReport) -> R scenario_position_str(scenario.elf_position) )); } + if explicit_outcome_conflicts_with_position(scenario) { + return Err(eyre::eyre!( + "{} 
adapter {} scenario {} uses {} position with {} outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + scenario_position_str(scenario.elf_position), + scenario_comparison_outcome_str(outcome) + )); + } } Ok(()) } +fn blocked_status_missing_blocked_outcome( + status: AdapterCoverageStatus, + outcome: Option, +) -> bool { + status == AdapterCoverageStatus::Blocked && outcome != Some(ScenarioComparisonOutcome::Blocked) +} + fn unmeasured_status_has_measured_outcome( status: AdapterCoverageStatus, outcome: ScenarioComparisonOutcome, @@ -4005,6 +4030,29 @@ fn unmeasured_status_has_measured_position( ) } +fn explicit_outcome_conflicts_with_position(scenario: &AdapterScenarioJudgment) -> bool { + let Some(outcome) = scenario.comparison_outcome else { + return false; + }; + + !position_supports_outcome(scenario.elf_position, outcome) +} + +fn position_supports_outcome( + position: ElfScenarioPosition, + outcome: ScenarioComparisonOutcome, +) -> bool { + matches!( + (position, outcome), + (ElfScenarioPosition::Wins, ScenarioComparisonOutcome::Win) + | (ElfScenarioPosition::Ties, ScenarioComparisonOutcome::Tie) + | (ElfScenarioPosition::Loses, ScenarioComparisonOutcome::Loss) + | (ElfScenarioPosition::Untested, ScenarioComparisonOutcome::NotTested) + | (ElfScenarioPosition::Untested, ScenarioComparisonOutcome::Blocked) + | (ElfScenarioPosition::Untested, ScenarioComparisonOutcome::NonGoal) + ) +} + fn validate_adapter_evidence(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { for evidence in &adapter.evidence { if evidence.kind.trim().is_empty() || evidence.reference.trim().is_empty() { diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 5ae959a7..024a0697 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -190,6 +190,14 @@ fn readme_path() -> Result { Ok(workspace_root()?.join("README.md")) } +fn 
comparison_external_projects_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("research") + .join("comparison_external_projects.md")) +} + fn benchmarking_index_path() -> Result { Ok(workspace_root()?.join("docs").join("guide").join("benchmarking").join("index.md")) } @@ -271,6 +279,23 @@ fn run_external_manifest_with_letta_attachment_mutation( slug: &str, mutation: F, ) -> Result +where + F: FnOnce(&mut Value) -> Result<()>, +{ + run_external_manifest_scenario_mutation( + slug, + "letta_research_gate", + "core_block_attachment_readback", + mutation, + ) +} + +fn run_external_manifest_scenario_mutation( + slug: &str, + adapter_id: &str, + scenario_id: &str, + mutation: F, +) -> Result where F: FnOnce(&mut Value) -> Result<()>, { @@ -280,25 +305,22 @@ where .pointer_mut("/adapters") .and_then(Value::as_array_mut) .ok_or_else(|| eyre::eyre!("missing manifest adapters"))?; - let letta = adapters + let adapter = adapters .iter_mut() - .find(|adapter| { - adapter.pointer("/adapter_id").and_then(Value::as_str) == Some("letta_research_gate") - }) - .ok_or_else(|| eyre::eyre!("missing Letta adapter"))?; - let scenarios = letta + .find(|adapter| adapter.pointer("/adapter_id").and_then(Value::as_str) == Some(adapter_id)) + .ok_or_else(|| eyre::eyre!("missing {adapter_id} adapter"))?; + let scenarios = adapter .pointer_mut("/scenarios") .and_then(Value::as_array_mut) - .ok_or_else(|| eyre::eyre!("missing Letta scenarios"))?; - let attachment = scenarios + .ok_or_else(|| eyre::eyre!("missing {adapter_id} scenarios"))?; + let scenario = scenarios .iter_mut() .find(|scenario| { - scenario.pointer("/scenario_id").and_then(Value::as_str) - == Some("core_block_attachment_readback") + scenario.pointer("/scenario_id").and_then(Value::as_str) == Some(scenario_id) }) - .ok_or_else(|| eyre::eyre!("missing Letta attachment scenario"))?; + .ok_or_else(|| eyre::eyre!("missing {scenario_id} scenario"))?; - mutation(attachment)?; + mutation(scenario)?; let 
temp_dir = env::temp_dir().join(format!("elf-real-world-{slug}-{}", process::id())); @@ -495,7 +517,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") .and_then(Value::as_u64), - Some(11) + Some(10) ); let adapters = array_at(&report, "/external_adapters/adapters")?; @@ -719,13 +741,13 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") .and_then(Value::as_u64), - Some(12) + Some(11) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_outcome_counts/blocked") .and_then(Value::as_u64), - Some(4) + Some(5) ); assert_eq!( report @@ -1097,6 +1119,10 @@ fn assert_first_generation_adapter_records( Some("wins") ); assert_eq!(agentmemory.pointer("/scenarios/2/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + agentmemory.pointer("/scenarios/2/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); assert_eq!( mem0.pointer("/capabilities/2/capability").and_then(Value::as_str), Some("local_lifecycle_update_delete_reload") @@ -1513,6 +1539,49 @@ fn external_adapter_manifest_rejects_unmeasured_win_loss_scenario_positions() -> Ok(()) } +#[test] +fn external_adapter_manifest_rejects_blocked_status_without_blocked_outcome() -> Result<()> { + let output = run_external_manifest_scenario_mutation( + "invalid-blocked-scenario-outcome-test", + "letta_research_gate", + "stale_core_detection", + |scenario| { + scenario + .as_object_mut() + .ok_or_else(|| eyre::eyre!("scenario is not an object"))? 
+ .remove("comparison_outcome"); + + Ok(()) + }, + )?; + + assert!(!output.status.success(), "invalid blocked scenario unexpectedly passed"); + assert!( + String::from_utf8_lossy(&output.stderr) + .contains("blocked status without blocked comparison outcome") + ); + + Ok(()) +} + +#[test] +fn external_adapter_manifest_rejects_conflicting_scenario_position_and_outcome() -> Result<()> { + let output = run_external_manifest_with_letta_attachment_mutation( + "invalid-scenario-position-outcome-test", + |scenario| { + set_json_pointer(scenario, "/status", serde_json::json!("pass"))?; + set_json_pointer(scenario, "/elf_position", serde_json::json!("ties"))?; + + set_json_pointer(scenario, "/comparison_outcome", serde_json::json!("loss")) + }, + )?; + + assert!(!output.status.success(), "conflicting scenario unexpectedly passed"); + assert!(String::from_utf8_lossy(&output.stderr).contains("ties position with loss outcome")); + + Ok(()) +} + #[test] fn live_adapter_supports_elf_capture_write_policy_without_external_hook_claims() -> Result<()> { let workspace = workspace_root()?; @@ -1648,6 +1717,8 @@ fn capture_write_policy_live_report_preserves_competitor_boundaries() -> Result< assert!(markdown.contains("Do not claim ELF broadly beats agentmemory or claude-mem")); assert!(benchmarking_index.contains("2026-06-11-capture-write-policy-live-report.md")); assert!(readme.contains("Capture/Write-Policy Live Report - June 11, 2026")); + assert!(readme.contains("mem0/OpenMemory")); + assert!(readme.contains("and memsearch now pass their scoped local baseline")); Ok(()) } @@ -2039,6 +2110,7 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { )?)?; let iteration_direction = fs::read_to_string(iteration_direction_report_path()?)?; let external_manifest = fs::read_to_string(external_adapter_manifest_path())?; + let comparison_external_projects = fs::read_to_string(comparison_external_projects_path()?)?; let retrieval_debug_profile = 
serde_json::from_str::<Value>(&fs::read_to_string(retrieval_debug_profile_json_path()?)?)?; let temporal_history = serde_json::from_str::<Value>(&fs::read_to_string( @@ -2050,6 +2122,7 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { &competitor_matrix, &iteration_direction, &external_manifest, + &comparison_external_projects, ); let qmd_live = find_by_field( @@ -2114,6 +2187,7 @@ fn assert_current_report_text_boundaries( competitor_matrix: &str, iteration_direction: &str, external_manifest: &str, + comparison_external_projects: &str, ) { assert!( measurement_audit.contains( @@ -2124,6 +2198,7 @@ fn assert_current_report_text_boundaries( measurement_audit .contains("qmd live fails 6/6 jobs after missing the delete/TTL tombstone evidence") ); + assert!(measurement_audit.contains("Basic local smoke and local OSS history/readback pass")); assert_measurement_audit_adapter_status_counts(measurement_audit); @@ -2142,6 +2217,14 @@ fn assert_current_report_text_boundaries( assert!(external_manifest.contains( "The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run." 
)); + assert!( + comparison_external_projects + .contains("Benchmark-grounded for scoped local OSS same-corpus retrieval") + ); + assert!( + comparison_external_projects + .contains("Benchmark-grounded for local same-corpus retrieval, reindex/update/delete") + ); assert!(iteration_direction.contains("| Jobs | `49` |")); assert!(iteration_direction.contains("| Encoded suites | `13` |")); assert!(iteration_direction.contains("| Pass | `44` |")); @@ -2158,11 +2241,15 @@ fn assert_current_report_text_boundaries( "| Jobs | `40` |", "| Encoded suites | `11` |", "| Pass | `38` |", + "history/UI/hosted/graph behavior remains", + "current local adapter is incomplete/wrong-result", + "current adapter is incomplete/invalid-result", ] { assert!(!measurement_audit.contains(stale_phrase)); assert!(!competitor_matrix.contains(stale_phrase)); assert!(!iteration_direction.contains(stale_phrase)); assert!(!external_manifest.contains(stale_phrase)); + assert!(!comparison_external_projects.contains(stale_phrase)); } } @@ -2187,10 +2274,19 @@ fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<() assert!(benchmarking_index.contains("qmd top-10/replay artifact")); assert!(benchmarking_index.contains("ELF trace/admin surfaces")); assert!(adoption_report.contains("| Retrieval quality and local debug UX | `loss` |")); + assert!(adoption_report.contains("Letta scenario rows remain")); + assert!(adoption_report.contains("blocked or `not_tested`")); assert!( adoption_report .contains("Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF") ); + assert!(array_at(&adoption_json, "/adoption_decision/remaining_caveats")?.iter().any( + |caveat| { + caveat.as_str().is_some_and(|text| { + text.contains("Letta scenario rows remain blocked or not_tested") + }) + } + )); assert_trace_replay_adoption_json(&adoption_json)?; @@ -3005,7 +3101,7 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("### Adapter Scenario 
Judgments")); assert!(markdown.contains("ELF scenario positions: `wins=8, ties=9, loses=1, untested=18`")); assert!(markdown.contains( - "Scenario comparison outcomes: `win=8, tie=9, loss=1, not_tested=12, blocked=4, non_goal=2`" + "Scenario comparison outcomes: `win=8, tie=9, loss=1, not_tested=11, blocked=5, non_goal=2`" )); assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index f12b52ae..ef6eafb1 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -42,9 +42,10 @@ The remaining caveats are material: memory, and graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. XY-927 adds - fixture-only `core_archival_memory` coverage, but Letta comparison remains blocked - until the selected contained export/readback path exists. mem0 local OSS preference - history is measured separately and is an ELF loss on the current correction history + fixture-only `core_archival_memory` coverage, but Letta scenario rows remain + blocked or `not_tested` until the selected contained export/readback path exists. + mem0 local OSS preference history is measured separately and is an ELF loss on the + current correction history scenario. The XY-923 follow-up also scores qmd's immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. 
XY-932 adds a narrow live operator-debug slice where diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index 67c26673..66cd69b6 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -170,7 +170,7 @@ records `unique_project_names: 17` for the full project list including ELF. | ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 5 blocked operator or measurement-gate boundaries; live full sweep is `wrong_result`; live capture/write-policy and narrow operator-debug slices pass. | Full live memory evolution, live consolidation, live knowledge pages, live production ops, competitor capture hooks, OpenViking staged trajectory artifacts, and broader operator UI runners. | Memory-evolution diagnostic report, then consolidation/knowledge reports plus agentmemory/claude-mem capture, OpenViking staged trajectory artifacts, and OpenMemory/claude-mem UI runners. | | qmd | `live_real_world` plus `live_baseline_only` | Fresh full sweep is five passes behind ELF because qmd misses the delete/TTL tombstone job and keeps capture/write-policy jobs typed `not_encoded`; same-corpus baseline passes; narrow operator-debug live slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | Deep retrieval-debug ergonomics and trace replay beyond the narrow operator-debug slice. | qmd/ELF deep retrieval-debug profile with expansion, fusion, rerank, and dropped-candidate traces. | | agentmemory | `live_baseline_only` | `lifecycle_fail`; capture comparison is `blocked` because the Docker baseline uses a process-local StateKV Map and in-memory index, with no durable local session/capture path for source ids, exclusions, write-policy audit, or evidence-bound output. | Durable coding-agent continuity and capture hooks. 
| Durable lifecycle and work-resume/capture adapter report. | -| mem0/OpenMemory | `live_baseline_only` | Basic local smoke now passes; history/UI/hosted/graph behavior remains `not_encoded`. | Entity history, lifecycle UI, OpenMemory inspection. | Entity-history, deletion-audit, and UI/export readback report. | +| mem0/OpenMemory | `live_baseline_only` | Basic local smoke and local OSS history/readback pass; OpenMemory UI/export is blocked, hosted Platform export is a non-goal, and optional graph plus broader prompt coverage remain `not_encoded`. | Entity history, lifecycle UI, OpenMemory inspection. | Entity-history, deletion-audit, and UI/export readback report. | | memsearch | `live_baseline_only` | Basic canonical Markdown reindex/reload smoke now passes; real-world prompt coverage remains `not_encoded`. | Markdown canonical store and local reindex clarity. | Source-of-truth and retrieval-debug real-world adapter report. | | OpenViking | `live_baseline_only` plus `fixture_backed` and `research_gate` | Same-corpus retrieval is `wrong_result`; staged retrieval, hierarchy selection, and recursive/context expansion are encoded as blocked fixtures. | Hierarchical staged context trajectory. | Evidence-bearing retrieval fix, then materialized staged trajectory report. | | claude-mem | `live_baseline_only` | `wrong_result`; capture breadth is `not_encoded` because hooks, timeline, observations, viewer capture, and automatic capture review were not run against real-world jobs. | Progressive disclosure and automatic capture review. | Work-resume, operator-debugging, and capture/write-policy report. 
| diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index a5fb2eca..4e6bd18d 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -252,12 +252,16 @@ operator_debugging_ux remain `not_encoded` for this live adapter path. qmd keeps `live_baseline_only` same-corpus record for update/delete/cold-start checks; that record is not a real-world suite win. agentmemory is blocked on durable upstream storage for lifecycle proof and capture breadth. mem0/OpenMemory, memsearch, and -claude-mem currently retain wrong-result, not-encoded, or incomplete live-baseline -states for the checked-in adapter evidence. OpenViking now reaches its pinned Docker -local embedding setup but remains a same-corpus `wrong_result` until it returns -evidence-bearing retrieval output. The checked-in `context_trajectory` fixtures keep -OpenViking staged retrieval, hierarchy selection, and recursive/context expansion -blocked until same-corpus evidence ids match and staged artifacts are materialized. +claude-mem no longer share one live-baseline boundary: mem0/OpenMemory and memsearch +now pass scoped local baseline paths, while OpenMemory product UI/export, hosted +Platform behavior, optional graph memory, memsearch real-world prompt/TTL coverage, +and claude-mem hook/viewer capture remain blocked, unsupported, not encoded, or +wrong-result for the checked-in adapter evidence. OpenViking now reaches its pinned +Docker local embedding setup but remains a same-corpus `wrong_result` until it +returns evidence-bearing retrieval output. The checked-in `context_trajectory` +fixtures keep OpenViking staged retrieval, hierarchy selection, and recursive/context +expansion blocked until same-corpus evidence ids match and staged artifacts are +materialized. 
The expanded RAG and graph-memory records for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper qmd/OpenViking profiles are `research_gate` records until diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md index 05e12a0d..7173ecb1 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/guide/research/comparison_external_projects.md @@ -50,10 +50,14 @@ Use the evidence class before making claims: until a deep dive or adapter run exists. Current benchmark-grounded scope is narrow. The June 9, 2026 all-project smoke run -proved encoded same-corpus/lifecycle behavior only for the current adapters: ELF and qmd -passed their encoded smoke checks; agentmemory passed same-corpus retrieval but failed -or could not prove durable lifecycle behavior; memsearch, mem0, OpenViking, and -claude-mem retained `incomplete`, wrong-result, or not-encoded states. All broader suite +proved encoded same-corpus/lifecycle behavior only for the then-current adapters: ELF +and qmd passed their encoded smoke checks; agentmemory passed same-corpus retrieval but +failed or could not prove durable lifecycle behavior; memsearch, mem0, OpenViking, and +claude-mem retained `incomplete`, wrong-result, or not-encoded states. Later June 11 +follow-ups promote scoped local mem0/OpenMemory and memsearch baseline paths, while +OpenMemory UI/export, hosted Platform behavior, optional graph memory, broader +memsearch prompt/TTL coverage, OpenViking staged trajectory, and claude-mem hook/viewer +capture remain blocked, unsupported, not encoded, or wrong-result. All broader suite fit below is research guidance, not a benchmark result. 
The real-world job runner now carries a separate external adapter coverage manifest: @@ -100,8 +104,8 @@ Project-to-suite map: | agentmemory | `rw.operator-continuity`, `rw.resume-evidence`, `rw.lifecycle-staleness` | Cross-agent hooks, MCP/REST packaging, viewer, lifecycle/consolidation claims, and coding-agent continuity focus make it the right reference for daily agent memory ergonomics. | Use durable upstream storage rather than the current in-memory mock; ingest realistic agent sessions through the public hook/API path; prove restart, update/supersede, delete, and viewer/trace readback. | Mixed: benchmark-grounded only for current same-corpus retrieval; current lifecycle evidence is a failure/blocker, while hooks/viewer/consolidation are docs-grounded. Confidence: medium for suite fit, low for durable adapter quality. | ELF is stronger on evidence-bound writes and source-of-truth discipline; agentmemory remains the reference for capture breadth and agent-continuity UX. | | qmd | `rw.retrieval-debug`, `rw.lifecycle-staleness`, `rw.resume-evidence` | Its local CLI, structured JSON query output, expansion modes, hybrid routing, weighted fusion, rerank, update, delete, and cold-start path make it the strongest local retrieval-debug baseline. | Run `qmd` over the real-world corpus, capture query JSON, then rewrite/delete corpus files and rerun update/embed/query in fresh processes. | Benchmark-grounded for current smoke retrieval/update/delete/cold-start pass; docs-grounded for deeper query planning ergonomics. Confidence: high for local adapter baseline. | ELF is not yet stronger on local CLI debug ergonomics; treat qmd as the retrieval-debug reference while keeping ELF's service/provenance model. | | claude-mem | `rw.operator-continuity`, `rw.resume-evidence`, `rw.retrieval-debug` | Progressive-disclosure search, auto-capture hooks, local viewer, and observation/timeline workflows are directly aligned with real agent resumption jobs. 
| Exercise a real local repository with hook-driven capture, then evaluate `search -> timeline -> observations` behavior after restart; do not rely on mocked storage. | Docs-grounded for progressive disclosure/viewer; current benchmark adapter evidence is incomplete/wrong-result and mostly not encoded for lifecycle. Confidence: medium for product reference, low for current adapter claims. | ELF has stronger provenance and service boundaries, but claude-mem remains a reference for operator workflow and progressive disclosure UX. | -| mem0 / OpenMemory | `rw.lifecycle-staleness`, `rw.graph-temporal`, `rw.operator-continuity`, `rw.resume-evidence` | Entity-scoped memory, memory history, expiration, hosted/OSS surfaces, OpenMemory UI, and optional graph memory make it the broadest lifecycle and ecosystem comparison target. | Separate OSS local FastEmbed/Qdrant evidence from hosted Platform claims; prove add/update/delete/history, entity-scoped retrieval, expiration exclusion, OpenMemory UI readback, and optional graph context on the same corpus. | Docs-grounded for lifecycle/entity/graph/UI claims; current local adapter is incomplete/wrong-result for same-corpus retrieval and delete remains not encoded. Confidence: medium for suite fit, low for current adapter quality. | ELF is stronger on deterministic evidence-bound writes; mem0/OpenMemory is the reference for ecosystem reach, entity-scoped history, hosted option, and optional graph UX. | -| memsearch | `rw.lifecycle-staleness`, `rw.retrieval-debug`, `rw.resume-evidence` | Markdown as canonical memory plus incremental/content-addressed reindexing is a useful model for source transparency and rebuildable derived indexes. | Index a real-world Markdown corpus, mutate/delete files, rerun index/search from fresh processes, and record Milvus mode so Lite/Server/Cloud behavior is not conflated. | Docs-grounded for architecture; current adapter is incomplete/invalid-result, so no pass/fail quality claim is allowed. 
Confidence: medium for design pattern, low for current adapter evidence. | ELF already owns source-of-truth plus rebuildable index at service level; memsearch remains a reference for simple local canonical-store ergonomics. | +| mem0 / OpenMemory | `rw.lifecycle-staleness`, `rw.graph-temporal`, `rw.operator-continuity`, `rw.resume-evidence` | Entity-scoped memory, memory history, expiration, hosted/OSS surfaces, OpenMemory UI, and optional graph memory make it the broadest lifecycle and ecosystem comparison target. | Separate OSS local FastEmbed/Qdrant evidence from hosted Platform claims; prove add/update/delete/history, entity-scoped retrieval, expiration exclusion, OpenMemory UI readback, and optional graph context on the same corpus. | Benchmark-grounded for scoped local OSS same-corpus retrieval, update/delete/reload, history, entity filters, local `get_all` readback, and deletion audit; OpenMemory product UI/export remains blocked, hosted Platform is a non-goal, and optional graph plus broader prompt coverage remain not encoded. Confidence: medium for suite fit and scoped local adapter quality, low for product UI/hosted/graph claims. | ELF is stronger on deterministic evidence-bound writes; mem0/OpenMemory remains the reference for ecosystem reach, entity-scoped history, hosted option, and optional graph UX, with local preference-correction history currently measured as an ELF loss. | +| memsearch | `rw.lifecycle-staleness`, `rw.retrieval-debug`, `rw.resume-evidence` | Markdown as canonical memory plus incremental/content-addressed reindexing is a useful model for source transparency and rebuildable derived indexes. | Index a real-world Markdown corpus, mutate/delete files, rerun index/search from fresh processes, and record Milvus mode so Lite/Server/Cloud behavior is not conflated. 
| Benchmark-grounded for local same-corpus retrieval, reindex/update/delete, and cold-start reload smoke; no real-world prompt adapter is encoded, so Markdown-first behavior remains baseline scenario evidence rather than suite pass evidence. Confidence: medium for design pattern and scoped local adapter evidence, low for broad real-world adapter coverage. | ELF already owns source-of-truth plus rebuildable index at service level; memsearch remains a reference for simple local canonical-store ergonomics and transparent local reindexing. | | OpenViking | `rw.context-trajectory`, `rw.resume-evidence`, `rw.retrieval-debug` | `viking://` context organization, intent analysis, hierarchical retrieval, staged find/search behavior, and session compression are relevant to multi-hop agent context jobs. | Use the pinned Docker local embedding path, then evaluate `add_resource`/`find`/`search` over multi-stage jobs with stage output, hierarchy, and session memory evidence. | Docs-grounded for mechanism; current benchmark adapter reaches local embedding setup and `add_resource`/`find`, but remains `wrong_result` because same-corpus evidence terms are missed. Confidence: medium for architecture reference, low for runnable adapter quality. | ELF has first-class traces and evidence-bound notes, but OpenViking is the reference for hierarchical context trajectory and filesystem-like organization. | | llm-wiki | `rw.knowledge-synthesis`, `rw.resume-evidence` | Query/save/lint flows and topic-scoped wiki pages are a useful reference for turning retrieved memory into maintained project knowledge. | Run a corpus-to-wiki job, ask resume/decision questions, require page citations back to source memory, then mutate a stale source and prove lint/repair catches it. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for derived-knowledge fit. 
| ELF is not yet stronger on derived knowledge pages; llm-wiki should inform rebuildable, evidence-cited dossiers rather than core storage. | | gbrain | `rw.knowledge-synthesis`, `rw.operator-continuity` | `compiled_truth`, timeline sections, backlinks, primary-home routing, and enrichment workflows model a living operational brain for project work. | Build or update pages from the real-world corpus, require current-truth plus timeline answers, and prove enrichment/backlink maintenance does not hide unsupported claims. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for operator knowledge UX. | ELF should keep source notes authoritative; gbrain is a reference for presentation, enrichment, and maintenance loops. | diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 71ad0918..abc0fc70 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. XY-927 adds six ELF fixture-backed core_archival_memory jobs, but the Letta comparison remains blocked until the selected contained export/readback path exists. 
mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked or not encoded. XY-933 adds an ELF live capture/write-policy self-check, but agentmemory capture breadth is blocked by mocked/in-memory storage and claude-mem hook/viewer capture remains untested." + "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. XY-927 adds six ELF fixture-backed core_archival_memory jobs, but Letta scenario rows remain blocked or not_tested until the selected contained export/readback path exists. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked or not encoded. 
XY-933 adds an ELF live capture/write-policy self-check, but agentmemory capture breadth is blocked by mocked/in-memory storage and claude-mem hook/viewer capture remains untested." ] }, "evidence_class_terms": [ diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 459f6972..059a14d8 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -190,12 +190,14 @@ Each `adapters[]` record MUST include: optional `suite_id`, `status`, `elf_position`, optional `comparison_outcome`, `evidence`, and optional `command` and `artifact`. `elf_position` MUST be one of `wins`, `ties`, `loses`, or `untested`. `comparison_outcome`, when present, MUST be - one of `win`, `tie`, `loss`, `not_tested`, `blocked`, or `non_goal`. Reports SHOULD - derive `comparison_outcome` from `elf_position` when omitted, but SHOULD use the - explicit field for scenarios where the legacy ELF-relative position is less precise - than the report outcome. Scenario judgments are report inputs for dimension-level - comparison; they MUST NOT convert live-baseline-only evidence into real-world suite - pass claims. + one of `win`, `tie`, `loss`, `not_tested`, `blocked`, or `non_goal`. Scenario rows + with `status = "blocked"` MUST set `comparison_outcome = "blocked"` explicitly so a + blocked evidence path is not derived from `elf_position = "untested"` as + `not_tested`. Reports SHOULD derive `comparison_outcome` from `elf_position` when + omitted for non-blocked rows, but SHOULD use the explicit field for scenarios where + the legacy ELF-relative position is less precise than the report outcome. Scenario + judgments are report inputs for dimension-level comparison; they MUST NOT convert + live-baseline-only evidence into real-world suite pass claims. - `evidence`: array of evidence pointers with `kind`, `ref`, and `status`. - `notes`: optional bounded explanatory strings. 
- `follow_up`: optional `title` and `reason`.