diff --git a/Makefile.toml b/Makefile.toml index 9dcc099b..eba76c24 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -428,6 +428,9 @@ args = [ # | real-world-memory-production-ops | composite | | # | real-world-memory-production-ops-json | command | | # | real-world-memory-production-ops-report | command | | +# | real-world-memory-core-archival | composite | | +# | real-world-memory-core-archival-json | command | | +# | real-world-memory-core-archival-report | command | | # | real-world-memory-live-adapters | command | | [tasks.real-world-job-smoke] @@ -824,6 +827,55 @@ args = [ "tmp/real-world-memory/consolidation/report.md", ] +[tasks.real-world-memory-core-archival] +workspace = false +dependencies = [ + "real-world-memory-core-archival-report", +] + +[tasks.real-world-memory-core-archival-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/core_archival_memory", + "--out", + "tmp/real-world-memory/core-archival/report.json", + "--run-id", + "real-world-memory-core-archival", + "--adapter-id", + "fixture_core_archival_memory", + "--adapter-name", + "ELF core and archival memory fixture", +] + +[tasks.real-world-memory-core-archival-report] +workspace = false +dependencies = [ + "real-world-memory-core-archival-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/core-archival/report.json", + "--out", + "tmp/real-world-memory/core-archival/report.md", +] + [tasks.real-world-memory-live-adapters] workspace = false command = "bash" diff --git a/README.md b/README.md index 3e7ec848..7df564fc 100644 --- a/README.md +++ b/README.md @@ -145,15 +145,22 @@ provider-backed ELF evidence was required. rebuild returned `rebuilt_count=1`, `missing_vector_count=0`, `error_count=0`, and search recovered the restored note. - Fresh all-project smoke run: ELF and qmd passed every encoded check. agentmemory - passed same-corpus retrieval but failed lifecycle/cold-start coverage. memsearch, - mem0, OpenViking, and claude-mem remained typed non-pass states. OpenViking now - reaches its pinned Docker local embedding path and is reported as `wrong_result` - when same-corpus evidence terms are missed; setup failures remain `incomplete`. -- Real-world agent memory aggregate after XY-928: 43 fixture-backed jobs across - 12 suites, 38 pass, 0 incomplete, 5 blocked, 0 wrong-result, 0 not-encoded, and - 0 unsupported-claim results. The remaining non-pass jobs are production-ops - operator boundaries plus blocked OpenViking staged trajectory, hierarchy selection, - and recursive/context expansion measurement gates, not hidden benchmark wins. + passed same-corpus retrieval but failed lifecycle/cold-start coverage. mem0/OpenMemory + and memsearch now pass their scoped local baseline smokes, while OpenMemory + UI/export, hosted mem0 Platform, optional graph memory, and broader memsearch prompt + and TTL coverage remain blocked, unsupported, or not encoded. OpenViking now reaches + its pinned Docker local embedding path and is reported as `wrong_result` when + same-corpus evidence terms are missed; claude-mem and OpenViking non-retrieval + coverage remain typed non-pass states. +- Real-world agent memory aggregate after XY-927 and XY-928: 49 fixture-backed + jobs across 13 suites, 44 pass, 0 incomplete, 5 blocked, 0 wrong-result, + 0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are + production-ops operator boundaries plus blocked OpenViking staged trajectory, + hierarchy selection, and recursive/context expansion measurement gates, not + hidden benchmark wins. The new `core_archival_memory` suite passes 6 fixture + jobs for core block attachment, scope, provenance, stale-core detection, + archival fallback, and project-decision recovery; it does not create an + ELF-over-Letta claim. - Full-suite live real-world adapter sweep after XY-899: ELF and qmd emit Docker-isolated `live_real_world` records for all 40 encoded jobs across 11 suites through `cargo make real-world-memory-live-adapters`. Both keep the original diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 1189ec5f..b70fec8b 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -29,7 +29,7 @@ }, "run": { "status": "blocked", - "evidence": "The current fixture set reports 43 jobs, 38 pass, 0 incomplete, 5 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim.", + "evidence": "The current fixture set reports 49 jobs across 13 suites: 44 pass, 0 incomplete, 5 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", "command": "cargo make real-world-memory", "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, @@ -101,6 +101,11 @@ "status": "pass", "evidence": "Four redaction, exclusion, source-id, evidence-binding, and capture-boundary fixtures are encoded and passing." }, + { + "suite_id": "core_archival_memory", + "status": "pass", + "evidence": "Six fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search." + }, { "suite_id": "production_ops", "status": "blocked", @@ -2212,24 +2217,24 @@ "evidence_class": "research_gate", "docker_default": true, "host_global_installs_required": false, - "overall_status": "not_encoded", + "overall_status": "blocked", "setup": { - "status": "not_encoded", - "evidence": "Letta is D1 reviewed as a core/archival memory reference, but no Docker real_world_job adapter is implemented." + "status": "blocked", + "evidence": "Letta is D1 reviewed as a core/archival memory reference. The contained comparison contract is a Docker-only benchmark-created agent export that must return core block JSON, archival search readback, and source ids before any scenario claim is scored." }, "run": { "status": "not_encoded", - "evidence": "No Letta core block, archival memory, or shared-memory job is encoded." + "evidence": "No Letta materializer currently creates the benchmark agent, imports the ELF core_archival_memory fixture corpus, or exports comparable core and archival evidence." }, "result": { "status": "not_encoded", - "evidence": "No Letta personalization or project-decision suite result is claimed." + "evidence": "No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision result is claimed." }, "capabilities": [ { "capability": "core_archival_memory", - "status": "not_encoded", - "evidence": "Core blocks and archival memory are reference semantics but not scored." + "status": "blocked", + "evidence": "ELF fixture jobs now score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids." }, { "capability": "docker_embedding_configuration", @@ -2257,6 +2262,67 @@ "suite_id": "work_resume", "status": "not_encoded", "evidence": "Agent resumption through Letta memory blocks is not encoded." + }, + { + "suite_id": "core_archival_memory", + "status": "blocked", + "evidence": "ELF fixture coverage exists, but Letta has no contained export/readback artifact for the same core-vs-archival jobs." + } + ], + "scenarios": [ + { + "scenario_id": "core_block_attachment_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. Letta has no comparable exported core block attachment evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json" + }, + { + "scenario_id": "core_block_scope_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. Letta scope behavior remains unscored without a contained export of agent, block, and visibility metadata.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json" + }, + { + "scenario_id": "core_block_provenance_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains not_tested until exported core memory includes stable source ids and audit-equivalent events.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json" + }, + { + "scenario_id": "stale_core_detection", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json" + }, + { + "scenario_id": "archival_fallback_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. Letta fallback comparison is blocked until archival search output can be exported with source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json" + }, + { + "scenario_id": "core_archival_project_decision_recovery", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. Letta project-decision recovery remains not_tested until the contained export/readback contract exists.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json" } ], "evidence": [ @@ -2284,14 +2350,15 @@ "evidence": "Official Docker deployment guide and embedding configuration boundary." } ], - "setup_path": "Define Docker server setup, embedding model configuration, and a core/archival memory fixture flow.", - "runtime_boundary": "Docker-only Letta server or CLI flow with benchmark-created agents and no host-global state.", - "resource_expectation": "Embedding model and agent server state must be explicit; record storage and provider boundaries.", + "setup_path": "Use a Docker-only Letta server or CLI flow that creates a benchmark-owned agent, loads the checked-in core_archival_memory fixture corpus, writes core memory and archival memory with fixture source ids, then exports core block JSON plus archival search/readback JSON.", + "runtime_boundary": "Docker-only Letta server or CLI flow with benchmark-created agents, benchmark-owned storage, no host-global state, and no unstated hosted service dependency.", + "resource_expectation": "Embedding model, agent server state, exported core memory, archival search output, and provider boundaries must be explicit in the artifact.", "retry_guidance": [ - "Create a tiny Docker agent with archival memory search.", - "Score core-versus-archival retrieval only after source evidence can be exported." + "Create a tiny Docker agent with core memory and archival memory loaded from the ELF core_archival_memory fixtures.", + "Export core block readback, archival search results, source ids, and any audit-equivalent metadata as JSON before scoring.", + "Score core-versus-archival scenarios only after source evidence can be exported and mapped to the fixture evidence ids." ], - "research_depth": "D1 feasibility verdict: research_only (XY-882); core/archival reference, adapter not encoded" + "research_depth": "D1 feasibility verdict: research_only (XY-882); XY-927 selects the contained export/readback contract, but the Letta adapter remains blocked until that artifact exists" } }, { diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json new file mode 100644 index 00000000..b1928711 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-archival-fallback-001", + "suite": "core_archival_memory", + "title": "Fall back to archival notes when core memory is insufficient", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "fallback-core-insufficient", + "kind": "core_block", + "text": "Core block summary: a rollback runbook exists for single-user production, but this core block intentionally omits the rollback steps.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "archival_fallback", + "evidence_id": "fallback-core-insufficient" + }, + "locator": { + "quote": "intentionally omits the rollback steps" + } + }, + "created_at": "2026-06-11T04:40:00Z" + }, + { + "evidence_id": "fallback-archival-runbook", + "kind": "runbook", + "text": "Archival rollback note: restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors, and verify search recovers the restored note.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "archival_fallback", + "evidence_id": "fallback-archival-runbook" + }, + "locator": { + "quote": "restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors" + } + }, + "created_at": "2026-06-11T04:41:00Z" + }, + { + "evidence_id": "fallback-core-only-trap", + "kind": "unsupported_claim", + "text": "Unsupported shortcut: answer the rollback steps from the core block alone without archival note search.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "archival_fallback", + "evidence_id": "fallback-core-only-trap" + } + }, + "created_at": "2026-06-11T04:39:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "The core block is insufficient because it says the rollback runbook exists but omits the steps. Fall back to archival note search: restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors, and verify search recovers the restored note.", + "claims": [ + { + "claim_id": "core_memory_insufficient", + "text": "The core block is insufficient because it omits the rollback steps.", + "evidence_ids": ["fallback-core-insufficient"], + "confidence": "high" + }, + { + "claim_id": "archival_fallback_steps", + "text": "The archival rollback steps are restore Postgres, rebuild Qdrant from Postgres vectors, and verify search recovers the restored note.", + "evidence_ids": ["fallback-archival-runbook"], + "confidence": "high" + } + ], + "evidence_ids": ["fallback-core-insufficient", "fallback-archival-runbook"], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-rollback-summary-attached", + "ts": "2026-06-11T04:40:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["fallback-core-insufficient"], + "summary": "A core block pointed at the rollback runbook but did not include the steps." + }, + { + "event_id": "archival-rollback-note-recorded", + "ts": "2026-06-11T04:41:00Z", + "actor": "agent", + "action": "recorded_runbook", + "evidence_ids": ["fallback-archival-runbook"], + "summary": "The detailed rollback steps were recorded as archival note evidence." + } + ], + "prompt": { + "role": "user", + "content": "The attached core block only says a rollback runbook exists. What are the rollback steps?", + "job_mode": "answer", + "constraints": ["cite_evidence", "use_archival_fallback", "avoid_core_only_hallucination"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "core_memory_insufficient", + "text": "The core block is insufficient because it omits the rollback steps." + }, + { + "claim_id": "archival_fallback_steps", + "text": "The archival rollback steps are restore Postgres, rebuild Qdrant from Postgres vectors, and verify search recovers the restored note." + } + ], + "must_not_include": [ + "answer the rollback steps from the core block alone" + ], + "evidence_links": { + "core_memory_insufficient": ["fallback-core-insufficient"], + "archival_fallback_steps": ["fallback-archival-runbook"] + }, + "answer_type": "archival_fallback_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "fallback-core-insufficient", + "claim_id": "core_memory_insufficient", + "requirement": "explain", + "quote": "intentionally omits the rollback steps" + }, + { + "evidence_id": "fallback-archival-runbook", + "claim_id": "archival_fallback_steps", + "requirement": "cite", + "quote": "restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors" + } + ], + "negative_traps": [ + { + "trap_id": "core-only-rollback-hallucination", + "type": "unsupported_claim", + "evidence_ids": ["fallback-core-only-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Provides the archival rollback steps." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites both insufficient core memory and archival fallback evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids core-only hallucination." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Makes the fallback path explicit." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "archival_fallback", "rollback", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json new file mode 100644 index 00000000..c1f34487 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-core-block-attachment-001", + "suite": "core_archival_memory", + "title": "Read an explicitly attached core block without treating it as archival search", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "core-attachment-active", + "kind": "core_block", + "text": "Core block attachment: key project_style has an active attachment for tenant local-tenant project ELF agent local-agent read_profile private_plus_project.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_attachment", + "evidence_id": "core-attachment-active" + }, + "locator": { + "quote": "active attachment for tenant local-tenant project ELF agent local-agent read_profile private_plus_project" + } + }, + "created_at": "2026-06-11T04:00:00Z" + }, + { + "evidence_id": "core-attachment-not-search", + "kind": "core_block_contract", + "text": "Core block readback is not archival search; it does not embed, rerank, search Qdrant, create a search session, or record note hits.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_attachment", + "evidence_id": "core-attachment-not-search" + }, + "locator": { + "quote": "does not embed, rerank, search Qdrant" + } + }, + "created_at": "2026-06-11T04:01:00Z" + }, + { + "evidence_id": "core-attachment-qdrant-trap", + "kind": "stale_claim", + "text": "Stale shortcut: core blocks are indexed into Qdrant and returned as normal archival note search hits.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_attachment", + "evidence_id": "core-attachment-qdrant-trap" + } + }, + "created_at": "2026-06-11T03:59:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Return the project_style core block because it has an active attachment for the exact tenant, project, agent, and private_plus_project read profile. Keep that readback separate from archival search because core blocks do not embed, rerank, search Qdrant, create search sessions, or record note hits.", + "claims": [ + { + "claim_id": "attached_core_block_readback", + "text": "The project_style core block is returned through its exact active attachment.", + "evidence_ids": ["core-attachment-active"], + "confidence": "high" + }, + { + "claim_id": "core_not_archival_search", + "text": "Core block readback is separate from archival search and Qdrant-derived note retrieval.", + "evidence_ids": ["core-attachment-not-search"], + "confidence": "high" + } + ], + "evidence_ids": ["core-attachment-active", "core-attachment-not-search"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-project-style-attached", + "ts": "2026-06-11T04:00:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["core-attachment-active"], + "summary": "The project_style core block was attached for the exact read profile." + }, + { + "event_id": "core-archival-boundary-recorded", + "ts": "2026-06-11T04:01:00Z", + "actor": "agent", + "action": "recorded_contract", + "evidence_ids": ["core-attachment-not-search"], + "summary": "The core block readback boundary was recorded separately from archival search." + } + ], + "prompt": { + "role": "user", + "content": "Which always-loaded project style block is attached for this agent, and should it appear as a normal archival search hit?", + "job_mode": "answer", + "constraints": ["cite_evidence", "separate_core_from_archival_search", "avoid_qdrant_core_block_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "attached_core_block_readback", + "text": "The project_style core block is returned through its exact active attachment." + }, + { + "claim_id": "core_not_archival_search", + "text": "Core block readback is separate from archival search and Qdrant-derived note retrieval." + } + ], + "must_not_include": [ + "core blocks are indexed into Qdrant and returned as normal archival note search hits" + ], + "evidence_links": { + "attached_core_block_readback": ["core-attachment-active"], + "core_not_archival_search": ["core-attachment-not-search"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "core-attachment-active", + "claim_id": "attached_core_block_readback", + "requirement": "cite", + "quote": "active attachment for tenant local-tenant project ELF agent local-agent read_profile private_plus_project" + }, + { + "evidence_id": "core-attachment-not-search", + "claim_id": "core_not_archival_search", + "requirement": "cite", + "quote": "does not embed, rerank, search Qdrant" + } + ], + "negative_traps": [ + { + "trap_id": "qdrant-core-block-search-hit", + "type": "stale_fact", + "evidence_ids": ["core-attachment-qdrant-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Identifies the attached core block." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites attachment and core-search boundary evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids indexing core blocks into Qdrant-backed archival search." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Preserves explicit attachment semantics." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "core_block", "attachment", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json new file mode 100644 index 00000000..f1fd4f92 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-core-block-provenance-001", + "suite": "core_archival_memory", + "title": "Return source refs and audit events for core block assertions", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "core-provenance-source-ref", + "kind": "core_block", + "text": "Provenance evidence: core block release_policy returns source_ref schema source_ref/v1 with resolver real_world_job_fixture/v1 and locator quote retained for reviewer inspection.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_provenance", + "evidence_id": "core-provenance-source-ref" + }, + "locator": { + "quote": "source_ref schema source_ref/v1" + } + }, + "created_at": "2026-06-11T04:20:00Z" + }, + { + "evidence_id": "core-provenance-audit-events", + "kind": "core_block_event", + "text": "Audit evidence: release_policy has append-only events block_created, block_updated, and attachment_added returned in audit_history.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_provenance", + "evidence_id": "core-provenance-audit-events" + }, + "locator": { + "quote": "block_created, block_updated, and attachment_added" + } + }, + "created_at": "2026-06-11T04:21:00Z" + }, + { + "evidence_id": "core-provenance-trusted-memory-trap", + "kind": "stale_claim", + "text": "Stale shortcut: always-loaded core memory is trusted without returning source_ref or audit_history.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_provenance", + "evidence_id": "core-provenance-trusted-memory-trap" + } + }, + "created_at": "2026-06-11T04:19:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "The release_policy core block must return its source_ref with source_ref/v1 resolver data and retain the locator quote for inspection. Its provenance also includes append-only block_created, block_updated, and attachment_added events in audit_history.", + "claims": [ + { + "claim_id": "core_source_ref_returned", + "text": "The release_policy core block returns source_ref/v1 provenance.", + "evidence_ids": ["core-provenance-source-ref"], + "confidence": "high" + }, + { + "claim_id": "core_audit_history_returned", + "text": "The release_policy core block returns block_created, block_updated, and attachment_added audit events.", + "evidence_ids": ["core-provenance-audit-events"], + "confidence": "high" + } + ], + "evidence_ids": ["core-provenance-source-ref", "core-provenance-audit-events"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-release-policy-created", + "ts": "2026-06-11T04:20:00Z", + "actor": "agent", + "action": "block_created", + "evidence_ids": ["core-provenance-source-ref"], + "summary": "The release_policy block was created with a source_ref pointer." + }, + { + "event_id": "core-release-policy-attached", + "ts": "2026-06-11T04:21:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["core-provenance-audit-events"], + "summary": "The release_policy block attachment event was added to audit history." + } + ], + "prompt": { + "role": "user", + "content": "What provenance should a returned core release_policy block include?", + "job_mode": "answer", + "constraints": ["cite_evidence", "include_source_ref", "include_audit_history"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "core_source_ref_returned", + "text": "The release_policy core block returns source_ref/v1 provenance." + }, + { + "claim_id": "core_audit_history_returned", + "text": "The release_policy core block returns block_created, block_updated, and attachment_added audit events." + } + ], + "must_not_include": [ + "always-loaded core memory is trusted without returning source_ref or audit_history" + ], + "evidence_links": { + "core_source_ref_returned": ["core-provenance-source-ref"], + "core_audit_history_returned": ["core-provenance-audit-events"] + }, + "answer_type": "provenance_bundle", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "core-provenance-source-ref", + "claim_id": "core_source_ref_returned", + "requirement": "cite", + "quote": "source_ref schema source_ref/v1" + }, + { + "evidence_id": "core-provenance-audit-events", + "claim_id": "core_audit_history_returned", + "requirement": "cite", + "quote": "block_created, block_updated, and attachment_added" + } + ], + "negative_traps": [ + { + "trap_id": "trusted-core-no-provenance", + "type": "unsupported_claim", + "evidence_ids": ["core-provenance-trusted-memory-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the returned provenance fields." + }, + "evidence_grounding": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Cites source_ref and audit-history evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids trusted-without-provenance claims." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Answers in a reviewer-usable provenance bundle shape." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "provenance", "audit_history", "source_ref"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json new file mode 100644 index 00000000..3b379b85 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-core-block-scope-001", + "suite": "core_archival_memory", + "title": "Apply core block scope and private-owner checks before readback", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "core-scope-project-shared-readable", + "kind": "core_block", + "text": "Scope evidence: project_shared block release_gate is readable for tenant local-tenant project ELF agent local-agent only when the active attachment and read_profile all_scopes allow project_shared.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_scope", + "evidence_id": "core-scope-project-shared-readable" + }, + "locator": { + "quote": "active attachment and read_profile all_scopes allow project_shared" + } + }, + "created_at": "2026-06-11T04:10:00Z" + }, + { + "evidence_id": "core-scope-private-owner", + "kind": "core_block", + "text": "Private owner evidence: agent_private block agent_a_workflow belongs to agent-a and must not be returned to agent-b even if agent-b has a matching read_profile label.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_scope", + "evidence_id": "core-scope-private-owner" + }, + "locator": { + "quote": "must not be returned to agent-b" + } + }, + "created_at": "2026-06-11T04:11:00Z" + }, + { + "evidence_id": "core-scope-bypass-trap", + "kind": "stale_claim", + "text": "Stale shortcut: a core block attachment bypasses read_profile scope checks, private-owner checks, and shared grants.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_scope", + "evidence_id": "core-scope-bypass-trap" + } + }, + "created_at": "2026-06-11T04:09:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Return the release_gate core block only when the active attachment and all_scopes read profile allow project_shared. Do not return agent_a_workflow to agent-b, because private-owner checks still apply to agent_private core blocks.", + "claims": [ + { + "claim_id": "shared_core_scope_allowed", + "text": "The project_shared release_gate block is readable only when attachment and read_profile allow project_shared.", + "evidence_ids": ["core-scope-project-shared-readable"], + "confidence": "high" + }, + { + "claim_id": "private_core_scope_denied", + "text": "The agent_private agent_a_workflow block must not be returned to agent-b.", + "evidence_ids": ["core-scope-private-owner"], + "confidence": "high" + } + ], + "evidence_ids": ["core-scope-project-shared-readable", "core-scope-private-owner"], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-release-gate-shared", + "ts": "2026-06-11T04:10:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["core-scope-project-shared-readable"], + "summary": "The release_gate block was attached with project_shared scope." + }, + { + "event_id": "core-agent-a-private", + "ts": "2026-06-11T04:11:00Z", + "actor": "agent-a", + "action": "block_created", + "evidence_ids": ["core-scope-private-owner"], + "summary": "The agent_a_workflow block remained private to agent-a." + } + ], + "prompt": { + "role": "user", + "content": "For core memory readback, which shared block can this agent see, and can agent-b also see agent-a's private block?", + "job_mode": "answer", + "constraints": ["cite_evidence", "enforce_scope", "avoid_private_owner_leakage"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "shared_core_scope_allowed", + "text": "The project_shared release_gate block is readable only when attachment and read_profile allow project_shared." + }, + { + "claim_id": "private_core_scope_denied", + "text": "The agent_private agent_a_workflow block must not be returned to agent-b." + } + ], + "must_not_include": [ + "a core block attachment bypasses read_profile scope checks" + ], + "evidence_links": { + "shared_core_scope_allowed": ["core-scope-project-shared-readable"], + "private_core_scope_denied": ["core-scope-private-owner"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "core-scope-project-shared-readable", + "claim_id": "shared_core_scope_allowed", + "requirement": "cite", + "quote": "active attachment and read_profile all_scopes allow project_shared" + }, + { + "evidence_id": "core-scope-private-owner", + "claim_id": "private_core_scope_denied", + "requirement": "cite", + "quote": "must not be returned to agent-b" + } + ], + "negative_traps": [ + { + "trap_id": "core-attachment-bypasses-scope", + "type": "scope_leak", + "evidence_ids": ["core-scope-bypass-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Applies readable shared scope and denied private owner scope." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites scope and private-owner evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids scope-bypass claims." + }, + "ownership_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not leak private core blocks across agents." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "scope", "private_owner", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json new file mode 100644 index 00000000..423db375 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json @@ -0,0 +1,271 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-project-decision-recovery-001", + "suite": "core_archival_memory", + "title": "Recover a project decision from core routing and archival rationale", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "decision-core-routing-block", + "kind": "core_block", + "text": "Core decision routing block: keep the benchmark outcome policy always attached and route detailed rationale to archival notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-core-routing-block" + }, + "locator": { + "quote": "route detailed rationale to archival notes" + } + }, + "created_at": "2026-06-11T04:50:00Z" + }, + { + "evidence_id": "decision-archival-outcome-policy", + "kind": "decision", + "text": "Archival decision record: scenario outcomes use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-archival-outcome-policy" + }, + "locator": { + "quote": "use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them" + } + }, + "created_at": "2026-06-11T04:51:00Z" + }, + { + "evidence_id": "decision-archival-core-search-boundary", + "kind": "decision", + "text": "Archival project decision: core blocks stay separate from archival note search and Qdrant-derived retrieval.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-archival-core-search-boundary" + }, + "locator": { + "quote": "core blocks stay separate from archival note search" + } + }, + "created_at": "2026-06-11T04:52:00Z" + }, + { + "evidence_id": "decision-letta-export-boundary", + "kind": "comparison_boundary", + "text": "Letta comparison boundary: no contained export/readback artifact maps core block JSON, archival search/readback JSON, and source ids, so Letta remains blocked or not_tested and no win, tie, or loss claim is allowed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-letta-export-boundary" + }, + "locator": { + "quote": "no contained export/readback artifact maps core block JSON" + } + }, + "created_at": "2026-06-11T04:53:00Z" + }, + { + "evidence_id": "decision-letta-win-trap", + "kind": "unsupported_claim", + "text": "Wrong claim: Letta comparison can be scored as an ELF win or measured loss because ELF has core blocks.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-letta-win-trap" + } + }, + "created_at": "2026-06-11T04:49:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Use the always-attached core routing block to find the benchmark outcome policy, then cite archival notes for the detailed decision. The archival decision says to use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them. It also says core blocks stay separate from archival note search and Qdrant-derived retrieval. Letta remains blocked or not_tested until a contained export/readback artifact maps core and archival source ids, so no ELF-over-Letta claim follows from ELF having core blocks.", + "claims": [ + { + "claim_id": "core_routes_to_archival_rationale", + "text": "The core routing block points detailed decision rationale to archival notes.", + "evidence_ids": ["decision-core-routing-block"], + "confidence": "high" + }, + { + "claim_id": "outcomes_require_evidence", + "text": "Scenario outcomes use win, tie, loss, not_tested, blocked, or non_goal only when evidence supports them.", + "evidence_ids": ["decision-archival-outcome-policy"], + "confidence": "high" + }, + { + "claim_id": "core_archival_boundary_preserved", + "text": "Core blocks stay separate from archival note search and Qdrant-derived retrieval.", + "evidence_ids": ["decision-archival-core-search-boundary"], + "confidence": "high" + }, + { + "claim_id": "letta_comparison_requires_export", + "text": "Letta remains blocked or not_tested until a contained export/readback artifact maps core and archival source ids.", + "evidence_ids": ["decision-letta-export-boundary"], + "confidence": "high" + } + ], + "evidence_ids": [ + "decision-core-routing-block", + "decision-archival-outcome-policy", + "decision-archival-core-search-boundary", + "decision-letta-export-boundary" + ], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "decision-routing-core-attached", + "ts": "2026-06-11T04:50:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["decision-core-routing-block"], + "summary": "A core block kept the outcome-policy routing pointer always attached." + }, + { + "event_id": "decision-outcome-policy-archived", + "ts": "2026-06-11T04:51:00Z", + "actor": "agent", + "action": "recorded_decision", + "evidence_ids": [ + "decision-archival-outcome-policy", + "decision-archival-core-search-boundary", + "decision-letta-export-boundary" + ], + "summary": "Archival notes recorded the detailed outcome policy and core-search boundary." + } + ], + "prompt": { + "role": "user", + "content": "What is the benchmark outcome policy, and does having ELF core blocks make Letta a measured loss?", + "job_mode": "decide", + "constraints": ["cite_evidence", "recover_project_decision", "avoid_unsupported_letta_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "core_routes_to_archival_rationale", + "text": "The core routing block points detailed decision rationale to archival notes." + }, + { + "claim_id": "outcomes_require_evidence", + "text": "Scenario outcomes use win, tie, loss, not_tested, blocked, or non_goal only when evidence supports them." + }, + { + "claim_id": "core_archival_boundary_preserved", + "text": "Core blocks stay separate from archival note search and Qdrant-derived retrieval." + }, + { + "claim_id": "letta_comparison_requires_export", + "text": "Letta remains blocked or not_tested until a contained export/readback artifact maps core and archival source ids." + } + ], + "must_not_include": [ + "Letta comparison can be scored as an ELF win", + "Letta is a measured loss", + "Letta comparison can be scored as a measured loss" + ], + "evidence_links": { + "core_routes_to_archival_rationale": ["decision-core-routing-block"], + "outcomes_require_evidence": ["decision-archival-outcome-policy"], + "core_archival_boundary_preserved": ["decision-archival-core-search-boundary"], + "letta_comparison_requires_export": ["decision-letta-export-boundary"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "decision-core-routing-block", + "claim_id": "core_routes_to_archival_rationale", + "requirement": "cite", + "quote": "route detailed rationale to archival notes" + }, + { + "evidence_id": "decision-archival-outcome-policy", + "claim_id": "outcomes_require_evidence", + "requirement": "cite", + "quote": "use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them" + }, + { + "evidence_id": "decision-archival-core-search-boundary", + "claim_id": "core_archival_boundary_preserved", + "requirement": "cite", + "quote": "core blocks stay separate from archival note search" + }, + { + "evidence_id": "decision-letta-export-boundary", + "claim_id": "letta_comparison_requires_export", + "requirement": "cite", + "quote": "no contained export/readback artifact maps core block JSON" + } + ], + "negative_traps": [ + { + "trap_id": "unsupported-letta-loss-from-elf-core", + "type": "unsupported_claim", + "evidence_ids": ["decision-letta-win-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Recovers the benchmark outcome policy." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites core routing and archival decision evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids an unsupported Letta win or loss claim." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Explains how core memory and archival decision evidence work together." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "project_decisions", "letta_boundary", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json new file mode 100644 index 00000000..084c26cb --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json @@ -0,0 +1,206 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-stale-core-detection-001", + "suite": "core_archival_memory", + "title": "Detect a stale core block when archival evidence supersedes it", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stale-core-validation-gate", + "kind": "core_block", + "text": "Stale core block: the validation gate is cargo make lint and cargo make test.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "stale-core-validation-gate" + } + }, + "created_at": "2026-06-10T10:00:00Z" + }, + { + "evidence_id": "archival-current-validation-gate", + "kind": "decision", + "text": "Archival decision update: before pushing a refreshed PR head, run cargo make fmt, cargo make lint-fix, and cargo make checks.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "archival-current-validation-gate" + }, + "locator": { + "quote": "cargo make fmt, cargo make lint-fix, and cargo make checks" + } + }, + "created_at": "2026-06-11T04:30:00Z" + }, + { + "evidence_id": "archival-supersedes-core-rationale", + "kind": "decision", + "text": "Rationale: archival note evidence supersedes the attached core block until the core block is updated from source-of-truth state.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "archival-supersedes-core-rationale" + }, + "locator": { + "quote": "supersedes the attached core block" + } + }, + "created_at": "2026-06-11T04:31:00Z" + }, + { + "evidence_id": "stale-core-answer-trap", + "kind": "stale_claim", + "text": "Wrong answer trap: cite the core block as current and skip archival evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "stale-core-answer-trap" + } + }, + "created_at": "2026-06-11T04:29:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Treat the attached validation-gate core block as stale. The current archival decision says to run cargo make fmt, cargo make lint-fix, and cargo make checks before pushing a refreshed PR head, and the archival rationale says that evidence supersedes the core block until it is updated from source-of-truth state.", + "claims": [ + { + "claim_id": "stale_core_detected", + "text": "The attached validation-gate core block is stale.", + "evidence_ids": ["archival-supersedes-core-rationale"], + "confidence": "high" + }, + { + "claim_id": "archival_current_gate", + "text": "The current archival validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks.", + "evidence_ids": ["archival-current-validation-gate"], + "confidence": "high" + } + ], + "evidence_ids": ["archival-current-validation-gate", "archival-supersedes-core-rationale"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "old-core-gate-attached", + "ts": "2026-06-10T10:00:00Z", + "actor": "agent", + "action": "block_created", + "evidence_ids": ["stale-core-validation-gate"], + "summary": "A core block recorded an old validation gate." + }, + { + "event_id": "archival-gate-updated", + "ts": "2026-06-11T04:30:00Z", + "actor": "agent", + "action": "updated_decision", + "evidence_ids": ["archival-current-validation-gate", "archival-supersedes-core-rationale"], + "summary": "Archival evidence superseded the old core validation gate." + } + ], + "prompt": { + "role": "user", + "content": "The attached core block says the gate is lint and test. Is that still current before a refreshed PR push?", + "job_mode": "answer", + "constraints": ["cite_evidence", "detect_stale_core", "prefer_current_archival_evidence"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "stale_core_detected", + "text": "The attached validation-gate core block is stale." + }, + { + "claim_id": "archival_current_gate", + "text": "The current archival validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks." + } + ], + "must_not_include": [ + "the validation gate is cargo make lint and cargo make test" + ], + "evidence_links": { + "stale_core_detected": ["archival-supersedes-core-rationale"], + "archival_current_gate": ["archival-current-validation-gate"] + }, + "answer_type": "current_state_with_stale_core_caveat", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "archival-current-validation-gate", + "claim_id": "archival_current_gate", + "requirement": "cite", + "quote": "cargo make fmt, cargo make lint-fix, and cargo make checks" + }, + { + "evidence_id": "archival-supersedes-core-rationale", + "claim_id": "stale_core_detected", + "requirement": "explain", + "quote": "supersedes the attached core block" + } + ], + "negative_traps": [ + { + "trap_id": "stale-core-current-answer", + "type": "stale_fact", + "evidence_ids": ["stale-core-validation-gate", "stale-core-answer-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States that the attached core block is stale." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites current archival evidence and supersession rationale." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids answering from stale core memory." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Detects stale core state when archival evidence supersedes it." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "stale_core", "archival_supersession", "no_live_claim"] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index efd4a34a..d4d0c6ac 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -54,6 +54,7 @@ const SUITES: &[&str] = &[ "capture_integration", "production_ops", "personalization", + "core_archival_memory", "context_trajectory", ]; @@ -3110,9 +3111,15 @@ fn job_metrics(job: &RealWorldJob, answer: &ProducedAnswer) -> JobMetrics { .filter(|evidence| produced_evidence.contains(&evidence.evidence_id)) .count(); let stale_retrieval_count = trap_use_count(job, &produced_evidence, "stale_fact", answer); - let scope_violation_count = trap_use_count(job, &produced_evidence, "near_duplicate", answer); - let scope_check_count = - job.negative_traps.iter().filter(|trap| trap.trap_type == "near_duplicate").count(); + let scope_violation_count = ["near_duplicate", "scope_leak"] + .into_iter() + .map(|trap_type| trap_use_count(job, &produced_evidence, trap_type, answer)) + .sum(); + let scope_check_count = job + .negative_traps + .iter() + .filter(|trap| is_scope_trap_type(trap.trap_type.as_str())) + .count(); let redaction_leak_count = trap_use_count(job, &produced_evidence, "privacy_leak", answer); let scope_correct_count = scope_check_count.saturating_sub(scope_violation_count); let qdrant_rebuild_case = job.tags.iter().any(|tag| tag == "qdrant_rebuild"); @@ -3137,6 +3144,10 @@ fn source_ref_by_evidence(job: &RealWorldJob) -> BTreeMap<&str, &Value> { job.corpus.items.iter().map(|item| (item.evidence_id.as_str(), &item.source_ref)).collect() } +fn is_scope_trap_type(trap_type: &str) -> bool { + matches!(trap_type, "near_duplicate" | "scope_leak") +} + fn trap_use_count( job: &RealWorldJob, produced_evidence: &BTreeSet, @@ -3932,11 +3943,116 @@ fn validate_adapter_scenarios(path: &Path, adapter: &ExternalAdapterReport) -> R suite_id )); } + + let outcome = scenario_comparison_outcome(scenario); + + if blocked_status_missing_blocked_outcome(scenario.status, scenario.comparison_outcome) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses blocked status without blocked comparison outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id + )); + } + if unmeasured_status_has_measured_outcome(scenario.status, outcome) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses {} status with {} outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + adapter_status_str(scenario.status), + scenario_comparison_outcome_str(outcome) + )); + } + if unmeasured_status_has_measured_position(scenario.status, scenario.elf_position) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses {} status with {} position.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + adapter_status_str(scenario.status), + scenario_position_str(scenario.elf_position) + )); + } + if explicit_outcome_conflicts_with_position(scenario) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses {} position with {} outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + scenario_position_str(scenario.elf_position), + scenario_comparison_outcome_str(outcome) + )); + } } Ok(()) } +fn blocked_status_missing_blocked_outcome( + status: AdapterCoverageStatus, + outcome: Option, +) -> bool { + status == AdapterCoverageStatus::Blocked && outcome != Some(ScenarioComparisonOutcome::Blocked) +} + +fn unmeasured_status_has_measured_outcome( + status: AdapterCoverageStatus, + outcome: ScenarioComparisonOutcome, +) -> bool { + matches!( + status, + AdapterCoverageStatus::Blocked + | AdapterCoverageStatus::Incomplete + | AdapterCoverageStatus::NotEncoded + | AdapterCoverageStatus::Unsupported + ) && matches!( + outcome, + ScenarioComparisonOutcome::Win + | ScenarioComparisonOutcome::Tie + | ScenarioComparisonOutcome::Loss + ) +} + +fn unmeasured_status_has_measured_position( + status: AdapterCoverageStatus, + position: ElfScenarioPosition, +) -> bool { + matches!( + status, + AdapterCoverageStatus::Blocked + | AdapterCoverageStatus::Incomplete + | AdapterCoverageStatus::NotEncoded + | AdapterCoverageStatus::Unsupported + ) && matches!( + position, + ElfScenarioPosition::Wins | ElfScenarioPosition::Ties | ElfScenarioPosition::Loses + ) +} + +fn explicit_outcome_conflicts_with_position(scenario: &AdapterScenarioJudgment) -> bool { + let Some(outcome) = scenario.comparison_outcome else { + return false; + }; + + !position_supports_outcome(scenario.elf_position, outcome) +} + +fn position_supports_outcome( + position: ElfScenarioPosition, + outcome: ScenarioComparisonOutcome, +) -> bool { + matches!( + (position, outcome), + (ElfScenarioPosition::Wins, ScenarioComparisonOutcome::Win) + | (ElfScenarioPosition::Ties, ScenarioComparisonOutcome::Tie) + | (ElfScenarioPosition::Loses, ScenarioComparisonOutcome::Loss) + | (ElfScenarioPosition::Untested, ScenarioComparisonOutcome::NotTested) + | (ElfScenarioPosition::Untested, ScenarioComparisonOutcome::Blocked) + | (ElfScenarioPosition::Untested, ScenarioComparisonOutcome::NonGoal) + ) +} + fn validate_adapter_evidence(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { for evidence in &adapter.evidence { if evidence.kind.trim().is_empty() || evidence.reference.trim().is_empty() { @@ -4994,6 +5110,15 @@ fn scenario_comparison_outcome_str(outcome: ScenarioComparisonOutcome) -> &'stat } } +fn scenario_position_str(position: ElfScenarioPosition) -> &'static str { + match position { + ElfScenarioPosition::Wins => "wins", + ElfScenarioPosition::Ties => "ties", + ElfScenarioPosition::Loses => "loses", + ElfScenarioPosition::Untested => "untested", + } +} + fn adapter_status_counts_display(counts: &AdapterStatusCounts) -> String { [ ("real", counts.real), diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 2ee9d46a..9c57c62b 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -5,7 +5,7 @@ use std::{ env, fs, path::{Path, PathBuf}, - process::{self, Command}, + process::{self, Command, Output}, }; use color_eyre::{Result, eyre}; @@ -64,6 +64,10 @@ fn production_ops_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("production_ops") } +fn core_archival_memory_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("core_archival_memory") +} + fn context_trajectory_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("context_trajectory") } @@ -190,6 +194,14 @@ fn readme_path() -> Result { Ok(workspace_root()?.join("README.md")) } +fn comparison_external_projects_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("research") + .join("comparison_external_projects.md")) +} + fn benchmarking_index_path() -> Result { Ok(workspace_root()?.join("docs").join("guide").join("benchmarking").join("index.md")) } @@ -267,6 +279,70 @@ fn set_json_pointer(value: &mut Value, pointer: &str, replacement: Value) -> Res Ok(()) } +fn run_external_manifest_with_letta_attachment_mutation( + slug: &str, + mutation: F, +) -> Result +where + F: FnOnce(&mut Value) -> Result<()>, +{ + run_external_manifest_scenario_mutation( + slug, + "letta_research_gate", + "core_block_attachment_readback", + mutation, + ) +} + +fn run_external_manifest_scenario_mutation( + slug: &str, + adapter_id: &str, + scenario_id: &str, + mutation: F, +) -> Result +where + F: FnOnce(&mut Value) -> Result<()>, +{ + let mut manifest = + serde_json::from_str::(&fs::read_to_string(external_adapter_manifest_path())?)?; + let adapters = manifest + .pointer_mut("/adapters") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing manifest adapters"))?; + let adapter = adapters + .iter_mut() + .find(|adapter| adapter.pointer("/adapter_id").and_then(Value::as_str) == Some(adapter_id)) + .ok_or_else(|| eyre::eyre!("missing {adapter_id} adapter"))?; + let scenarios = adapter + .pointer_mut("/scenarios") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing {adapter_id} scenarios"))?; + let scenario = scenarios + .iter_mut() + .find(|scenario| { + scenario.pointer("/scenario_id").and_then(Value::as_str) == Some(scenario_id) + }) + .ok_or_else(|| eyre::eyre!("missing {scenario_id} scenario"))?; + + mutation(scenario)?; + + let temp_dir = env::temp_dir().join(format!("elf-real-world-{slug}-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let manifest_path = temp_dir.join("memory_projects_manifest.json"); + + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + Ok(Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(fixture_dir()) + .arg("--external-adapter-manifest") + .arg(&manifest_path) + .output()?) +} + #[test] fn smoke_fixture_produces_typed_json_report() -> Result<()> { let report = run_json_report()?; @@ -434,7 +510,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(16) + Some(22) ); assert_eq!( report @@ -446,7 +522,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") .and_then(Value::as_u64), - Some(7) + Some(11) ); let adapters = array_at(&report, "/external_adapters/adapters")?; @@ -541,13 +617,13 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/overall_status_counts/blocked") .and_then(Value::as_u64), - Some(6) + Some(7) ); assert_eq!( report .pointer("/external_adapters/summary/overall_status_counts/not_encoded") .and_then(Value::as_u64), - Some(6) + Some(5) ); assert_eq!( report @@ -565,13 +641,13 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(18) + Some(19) ); assert_eq!( report .pointer("/external_adapters/summary/suite_status_counts/pass") .and_then(Value::as_u64), - Some(22) + Some(23) ); assert_eq!( report @@ -610,7 +686,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/blocked") .and_then(Value::as_u64), - Some(6) + Some(8) ); assert_eq!( report @@ -640,7 +716,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/not_encoded") .and_then(Value::as_u64), - Some(2) + Some(6) ); assert_eq!( report @@ -664,7 +740,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(17) + Some(23) ); assert_eq!( report @@ -688,13 +764,13 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") .and_then(Value::as_u64), - Some(8) + Some(12) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_outcome_counts/blocked") .and_then(Value::as_u64), - Some(6) + Some(8) ); assert_eq!( report @@ -724,6 +800,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { let graphify = find_by_field(adapters, "/adapter_id", "graphify_docker_smoke")?; let qmd_deep = find_by_field(adapters, "/adapter_id", "qmd_deep_profile_gate")?; let openviking_deep = find_by_field(adapters, "/adapter_id", "openviking_deep_profile_gate")?; + let letta = find_by_field(adapters, "/adapter_id", "letta_research_gate")?; assert_elf_fixture_adapter_record(elf)?; @@ -757,6 +834,37 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { assert_first_generation_adapter_records(agentmemory, mem0, memsearch, claude_mem); assert_eq!(openviking.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + + assert_graph_rag_research_gate_records(ragflow, lightrag, graphrag); + assert_graphiti_zep_adapter(graphiti_zep); + assert_graphify_adapter(graphify)?; + assert_letta_core_archival_gate(letta)?; + assert_qmd_deep_profile_gate(qmd_deep); + + assert_eq!( + qmd_deep.pointer("/capabilities/2/status").and_then(Value::as_str), + Some("unsupported") + ); + assert_eq!( + qmd_deep.pointer("/result/artifact").and_then(Value::as_str), + Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") + ); + assert_eq!( + openviking_deep.pointer("/adapter_kind").and_then(Value::as_str), + Some("docker_local_embed_context_trajectory_gate") + ); + + assert_openviking_deep_profile_gate(openviking_deep); + + assert_eq!( + openviking_deep.pointer("/result/artifact").and_then(Value::as_str), + Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") + ); + + Ok(()) +} + +fn assert_graph_rag_research_gate_records(ragflow: &Value, lightrag: &Value, graphrag: &Value) { assert_eq!(ragflow.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); assert_eq!(ragflow.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert_eq!( @@ -797,30 +905,54 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { Some("cargo make graphrag-docker-smoke") ); assert_eq!(graphrag.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded")); +} - assert_graphiti_zep_adapter(graphiti_zep); - assert_graphify_adapter(graphify)?; - assert_qmd_deep_profile_gate(qmd_deep); - - assert_eq!( - qmd_deep.pointer("/capabilities/2/status").and_then(Value::as_str), - Some("unsupported") - ); - assert_eq!( - qmd_deep.pointer("/result/artifact").and_then(Value::as_str), - Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") - ); - assert_eq!( - openviking_deep.pointer("/adapter_kind").and_then(Value::as_str), - Some("docker_local_embed_context_trajectory_gate") +fn assert_letta_core_archival_gate(adapter: &Value) -> Result<()> { + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert!( + adapter + .pointer("/setup/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("Docker-only benchmark-created agent export")) ); + assert!(adapter.pointer("/execution_metadata/setup_path").and_then(Value::as_str).is_some_and( + |setup| setup.contains("exports core block JSON plus archival search/readback JSON") + )); - assert_openviking_deep_profile_gate(openviking_deep); + let suites = array_at(adapter, "/suites")?; + let core_suite = find_by_field(suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(core_suite.pointer("/status").and_then(Value::as_str), Some("blocked")); + + let scenarios = array_at(adapter, "/scenarios")?; + let attachment = find_by_field(scenarios, "/scenario_id", "core_block_attachment_readback")?; + let scope = find_by_field(scenarios, "/scenario_id", "core_block_scope_readback")?; + let provenance = find_by_field(scenarios, "/scenario_id", "core_block_provenance_readback")?; + let stale = find_by_field(scenarios, "/scenario_id", "stale_core_detection")?; + let fallback = find_by_field(scenarios, "/scenario_id", "archival_fallback_readback")?; + let decision = + find_by_field(scenarios, "/scenario_id", "core_archival_project_decision_recovery")?; + + assert_eq!(scenarios.len(), 6); + + for scenario in [attachment, scope, provenance, stale, fallback, decision] { + assert_eq!(scenario.pointer("/elf_position").and_then(Value::as_str), Some("untested")); + assert!( + ["not_tested", "blocked"].contains( + &scenario + .pointer("/comparison_outcome") + .and_then(Value::as_str) + .ok_or_else(|| eyre::eyre!("missing Letta comparison_outcome"))? + ) + ); + } assert_eq!( - openviking_deep.pointer("/result/artifact").and_then(Value::as_str), - Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") + attachment.pointer("/comparison_outcome").and_then(Value::as_str), + Some("not_tested") ); + assert_eq!(stale.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); + assert_eq!(fallback.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); Ok(()) } @@ -828,10 +960,24 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { fn assert_elf_fixture_adapter_record(adapter: &Value) -> Result<()> { assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert!(adapter.pointer("/run/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("49 jobs across 13 suites") + && evidence.contains("44 pass") + && evidence.contains("5 blocked") + && evidence.contains("core_archival_memory") + && evidence.contains("context_trajectory") + })); let suites = array_at(adapter, "/suites")?; + let core_archival = find_by_field(suites, "/suite_id", "core_archival_memory")?; let context_trajectory = find_by_field(suites, "/suite_id", "context_trajectory")?; + assert_eq!(core_archival.pointer("/status").and_then(Value::as_str), Some("pass")); + assert!(core_archival.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("core block attachment") + && evidence.contains("project-decision recovery") + && evidence.contains("archival note search") + })); assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); assert!( adapter @@ -1003,6 +1149,10 @@ fn assert_agentmemory_first_generation_records(agentmemory: &Value) { Some("wins") ); assert_eq!(agentmemory.pointer("/scenarios/2/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + agentmemory.pointer("/scenarios/2/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); } fn assert_mem0_first_generation_records(mem0: &Value) { @@ -1446,6 +1596,84 @@ fn operator_debug_live_adapter_task_is_docker_scoped() -> Result<()> { Ok(()) } +#[test] +fn external_adapter_manifest_rejects_unmeasured_win_loss_scenario_outcomes() -> Result<()> { + let output = run_external_manifest_with_letta_attachment_mutation( + "invalid-scenario-outcome-test", + |scenario| set_json_pointer(scenario, "/comparison_outcome", serde_json::json!("win")), + )?; + + assert!(!output.status.success(), "invalid scenario outcome unexpectedly passed"); + assert!( + String::from_utf8_lossy(&output.stderr).contains("not_encoded status with win outcome") + ); + + Ok(()) +} + +#[test] +fn external_adapter_manifest_rejects_unmeasured_win_loss_scenario_positions() -> Result<()> { + let output = run_external_manifest_with_letta_attachment_mutation( + "invalid-scenario-position-test", + |scenario| { + set_json_pointer(scenario, "/status", serde_json::json!("not_encoded"))?; + set_json_pointer(scenario, "/elf_position", serde_json::json!("wins"))?; + + set_json_pointer(scenario, "/comparison_outcome", serde_json::json!("not_tested")) + }, + )?; + + assert!(!output.status.success(), "invalid scenario position unexpectedly passed"); + assert!( + String::from_utf8_lossy(&output.stderr).contains("not_encoded status with wins position") + ); + + Ok(()) +} + +#[test] +fn external_adapter_manifest_rejects_blocked_status_without_blocked_outcome() -> Result<()> { + let output = run_external_manifest_scenario_mutation( + "invalid-blocked-scenario-outcome-test", + "letta_research_gate", + "stale_core_detection", + |scenario| { + scenario + .as_object_mut() + .ok_or_else(|| eyre::eyre!("scenario is not an object"))? + .remove("comparison_outcome"); + + Ok(()) + }, + )?; + + assert!(!output.status.success(), "invalid blocked scenario unexpectedly passed"); + assert!( + String::from_utf8_lossy(&output.stderr) + .contains("blocked status without blocked comparison outcome") + ); + + Ok(()) +} + +#[test] +fn external_adapter_manifest_rejects_conflicting_scenario_position_and_outcome() -> Result<()> { + let output = run_external_manifest_with_letta_attachment_mutation( + "invalid-scenario-position-outcome-test", + |scenario| { + set_json_pointer(scenario, "/status", serde_json::json!("pass"))?; + set_json_pointer(scenario, "/elf_position", serde_json::json!("ties"))?; + + set_json_pointer(scenario, "/comparison_outcome", serde_json::json!("loss")) + }, + )?; + + assert!(!output.status.success(), "conflicting scenario unexpectedly passed"); + assert!(String::from_utf8_lossy(&output.stderr).contains("ties position with loss outcome")); + + Ok(()) +} + #[test] fn live_adapter_supports_elf_capture_write_policy_without_external_hook_claims() -> Result<()> { let workspace = workspace_root()?; @@ -1585,6 +1813,8 @@ fn capture_write_policy_live_report_preserves_competitor_boundaries() -> Result< assert!(markdown.contains("Do not claim ELF broadly beats agentmemory or claude-mem")); assert!(benchmarking_index.contains("2026-06-11-capture-write-policy-live-report.md")); assert!(readme.contains("Capture/Write-Policy Live Report - June 11, 2026")); + assert!(readme.contains("mem0/OpenMemory")); + assert!(readme.contains("and memsearch now pass their scoped local baseline")); assert!( collapse_whitespace(&readme) .contains("claude-mem hook/viewer capture remains blocked until Docker-contained") @@ -1654,7 +1884,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(43)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(49)); Ok(()) } @@ -1978,52 +2208,25 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { let competitor_matrix_json = serde_json::from_str::(&fs::read_to_string( competitor_strength_matrix_json_path()?, )?)?; + let iteration_direction = fs::read_to_string(iteration_direction_report_path()?)?; let external_manifest = fs::read_to_string(external_adapter_manifest_path())?; + let comparison_external_projects = fs::read_to_string(comparison_external_projects_path()?)?; let retrieval_debug_profile = serde_json::from_str::(&fs::read_to_string(retrieval_debug_profile_json_path()?)?)?; let temporal_history = serde_json::from_str::(&fs::read_to_string( temporal_history_competitor_gap_json_path()?, )?)?; - assert!( - measurement_audit.contains( - "| `memory_evolution` | `6` | `pass:1`, `wrong_result:5` | `wrong_result:6` |" - ) - ); - assert!( - measurement_audit - .contains("qmd live fails 6/6 jobs after missing the delete/TTL tombstone evidence") + assert_current_report_text_boundaries( + &measurement_audit, + &competitor_matrix, + &iteration_direction, + &external_manifest, + &comparison_external_projects, ); - assert_measurement_audit_adapter_status_counts(&measurement_audit); - assert_first_generation_current_summary_boundaries(&measurement_audit, &competitor_matrix); - - assert!( - competitor_matrix - .contains("broader live suites remain `wrong_result`, `blocked`, or `not_encoded`") - ); assert!(competitor_matrix.contains("claude-mem work_resume remains `not_encoded`")); assert!(!competitor_matrix.contains("claude-mem `wrong_result`, OpenViking work_resume")); - assert!(external_manifest.contains( - "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible." - )); - assert!(external_manifest.contains( - "The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run." - )); - - for stale_phrase in [ - "same live sweep shape as ELF", - "ELF and qmd live fail 5/6 jobs", - "both systems currently fail 5/6 live memory-evolution jobs", - "wrong_result, incomplete, blocked, and not_encoded states remain visible", - "broader live suites remain `wrong_result`, `incomplete`, or `not_encoded`", - "The qmd live real-world slice covers representative jobs only", - "blocked or not encoded", - ] { - assert!(!measurement_audit.contains(stale_phrase)); - assert!(!competitor_matrix.contains(stale_phrase)); - assert!(!external_manifest.contains(stale_phrase)); - } let qmd_live = find_by_field( array_at(&measurement_audit_json, "/live_real_world_adapters")?, @@ -2082,24 +2285,85 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { Ok(()) } -fn assert_first_generation_current_summary_boundaries( +fn assert_current_report_text_boundaries( measurement_audit: &str, competitor_matrix: &str, + iteration_direction: &str, + external_manifest: &str, + comparison_external_projects: &str, ) { + assert!( + measurement_audit.contains( + "| `memory_evolution` | `6` | `pass:1`, `wrong_result:5` | `wrong_result:6` |" + ) + ); + assert!( + measurement_audit + .contains("qmd live fails 6/6 jobs after missing the delete/TTL tombstone evidence") + ); + assert!(measurement_audit.contains("Basic local smoke and local OSS history/readback pass")); assert!(measurement_audit.contains("claude-mem hook/viewer capture is `blocked`")); assert!(!measurement_audit.contains("claude-mem hook/viewer capture remains untested")); assert!(!measurement_audit.contains("blocked or untested")); + + assert_measurement_audit_adapter_status_counts(measurement_audit); + + assert!( + competitor_matrix + .contains("broader live suites remain `wrong_result`, `blocked`, or `not_encoded`") + ); assert!(competitor_matrix.contains( - "Overall adapter-status counts: 4 `pass`,\n6 `wrong_result`, 1 `lifecycle_fail`, 6 `blocked`, and 6 `not_encoded`." + "Overall adapter-status counts: 4 `pass`,\n6 `wrong_result`, 1 `lifecycle_fail`, 7 `blocked`, and 5 `not_encoded`." )); assert!(!competitor_matrix.contains("5 `blocked`, and 7 `not_encoded`")); assert!( competitor_matrix .contains("mem0/OpenMemory local OSS entity-scoped personalization now passes") ); + assert!(competitor_matrix.contains("scoped preference behavior is a measured tie")); assert!( !competitor_matrix.contains("mem0/OpenMemory and Letta personalization are `not_encoded`") ); + assert!(external_manifest.contains( + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible." + )); + assert!(external_manifest.contains( + "The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run." + )); + assert!( + comparison_external_projects + .contains("Benchmark-grounded for scoped local OSS same-corpus retrieval") + ); + assert!( + comparison_external_projects + .contains("Benchmark-grounded for local same-corpus retrieval, reindex/update/delete") + ); + assert!(iteration_direction.contains("| Jobs | `49` |")); + assert!(iteration_direction.contains("| Encoded suites | `13` |")); + assert!(iteration_direction.contains("| Pass | `44` |")); + assert!(iteration_direction.contains("| Evidence coverage | `111/111` |")); + assert!(iteration_direction.contains("| Expected evidence recall | `100/100` |")); + + for stale_phrase in [ + "same live sweep shape as ELF", + "ELF and qmd live fail 5/6 jobs", + "both systems currently fail 5/6 live memory-evolution jobs", + "wrong_result, incomplete, blocked, and not_encoded states remain visible", + "broader live suites remain `wrong_result`, `incomplete`, or `not_encoded`", + "The qmd live real-world slice covers representative jobs only", + "| Jobs | `40` |", + "| Encoded suites | `11` |", + "| Pass | `38` |", + "history/UI/hosted/graph behavior remains", + "current local adapter is incomplete/wrong-result", + "current adapter is incomplete/invalid-result", + ] { + assert!(!measurement_audit.contains(stale_phrase)); + assert!(!competitor_matrix.contains(stale_phrase)); + assert!(!iteration_direction.contains(stale_phrase)); + assert!(!external_manifest.contains(stale_phrase)); + assert!(!comparison_external_projects.contains(stale_phrase)); + } } #[test] @@ -2123,6 +2387,8 @@ fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<() assert!(benchmarking_index.contains("qmd top-10/replay artifact")); assert!(benchmarking_index.contains("ELF trace/admin surfaces")); assert!(adoption_report.contains("| Retrieval quality and local debug UX | `loss` |")); + assert!(adoption_report.contains("Letta scenario rows remain")); + assert!(adoption_report.contains("blocked or `not_tested`")); assert_trace_replay_viewer_blocker_boundaries( &readme, @@ -2136,6 +2402,13 @@ fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<() adoption_report .contains("Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF") ); + assert!(array_at(&adoption_json, "/adoption_decision/remaining_caveats")?.iter().any( + |caveat| { + caveat.as_str().is_some_and(|text| { + text.contains("Letta scenario rows remain blocked or not_tested") + }) + } + )); assert_trace_replay_adoption_json(&adoption_json)?; @@ -2488,20 +2761,8 @@ fn assert_competitor_strength_matrix_scenario_json(scenarios: &[Value]) -> Resul let personalization = find_by_field(scenarios, "/scenario_id", "personalization")?; - assert!( - personalization - .pointer("/current_competitor_evidence") - .and_then(Value::as_str) - .is_some_and(|claim| claim - .contains("mem0/OpenMemory local OSS entity-scoped personalization now passes") - && claim.contains("Letta personalization is research_gate not_encoded")) - ); - assert!( - personalization - .pointer("/current_state") - .and_then(Value::as_str) - .is_some_and(|state| state.contains("scoped personalization is a tie")) - ); + assert_personalization_matrix_record(personalization); + assert!( context_trajectory .pointer("/current_state") @@ -2518,6 +2779,23 @@ fn assert_competitor_strength_matrix_scenario_json(scenarios: &[Value]) -> Resul Ok(()) } +fn assert_personalization_matrix_record(personalization: &Value) { + assert!( + personalization + .pointer("/current_competitor_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim + .contains("mem0/OpenMemory local OSS entity-scoped personalization now passes") + && claim.contains("Letta personalization is research_gate not_encoded")) + ); + assert!( + personalization + .pointer("/current_state") + .and_then(Value::as_str) + .is_some_and(|state| state.contains("scoped personalization is a tie")) + ); +} + fn assert_competitor_strength_matrix_manifest_counts(matrix: &Value) { assert_eq!( matrix.pointer("/manifest_summary/adapter_records").and_then(Value::as_u64), @@ -2535,13 +2813,13 @@ fn assert_competitor_strength_matrix_manifest_counts(matrix: &Value) { ); assert_eq!( matrix.pointer("/manifest_summary/overall_status_counts/blocked").and_then(Value::as_u64), - Some(6) + Some(7) ); assert_eq!( matrix .pointer("/manifest_summary/overall_status_counts/not_encoded") .and_then(Value::as_u64), - Some(6) + Some(5) ); assert_eq!( matrix @@ -2921,29 +3199,29 @@ fn assert_operator_facing_strength_profile_boundaries( fn assert_measurement_audit_adapter_status_counts(markdown: &str) { for expected in [ - "| `blocked` | `6` |", - "| `not_encoded` | `6` |", + "| `blocked` | `7` |", + "| `not_encoded` | `5` |", "The generated JSON report emits `external_project_count: 16`", ] { assert!(markdown.contains(expected), "missing measurement audit text: {expected}"); } - for stale in ["| `blocked` | `5` |", "| `not_encoded` | `7` |"] { + for stale in ["| `blocked` | `6` |", "| `not_encoded` | `6` |"] { assert!(!markdown.contains(stale), "stale measurement audit text: {stale}"); } } fn assert_iteration_direction_current_measurement_counts(markdown: &str) { for expected in [ - "| Jobs | `43` |", - "| Encoded suites | `12` |", + "| Jobs | `49` |", + "| Encoded suites | `13` |", "| Blocked | `5` |", - "| Mean score | `0.884` |", - "| Evidence coverage | `97/97` |", - "| Source-ref coverage | `97/97` |", - "| Quote coverage | `97/97` |", - "| Expected evidence recall | `89/89` |", - "| `blocked` | `6` |", - "| `not_encoded` | `6` |", + "| Mean score | `0.898` |", + "| Evidence coverage | `111/111` |", + "| Source-ref coverage | `111/111` |", + "| Quote coverage | `111/111` |", + "| Expected evidence recall | `100/100` |", + "| `blocked` | `7` |", + "| `not_encoded` | `5` |", "`live_baseline_only`, `fixture_backed`, and `research_gate`", "`blocked` for fixture-backed trajectory gates", ] { @@ -3006,9 +3284,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("xy844-current-worktree")); assert!(markdown.contains("Existing live-baseline reports remain valid")); assert!(markdown.contains("### Adapter Scenario Judgments")); - assert!(markdown.contains("ELF scenario positions: `wins=9, ties=9, loses=1, untested=17`")); + assert!(markdown.contains("ELF scenario positions: `wins=9, ties=9, loses=1, untested=23`")); assert!(markdown.contains( - "Scenario comparison outcomes: `win=9, tie=9, loss=1, not_tested=8, blocked=6, non_goal=3`" + "Scenario comparison outcomes: `win=9, tie=9, loss=1, not_tested=12, blocked=8, non_goal=3`" )); assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); @@ -3286,6 +3564,72 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { Ok(()) } +#[test] +fn core_archival_memory_fixtures_score_separate_core_and_archival_jobs() -> Result<()> { + let report = run_json_report_from(core_archival_memory_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), + Some(14) + ); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(14)); + assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); + + let suites = array_at(&report, "/suites")?; + let core = find_by_field(suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(core.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + + let jobs = array_at(&report, "/jobs")?; + + for job_id in [ + "core-archival-core-block-attachment-001", + "core-archival-core-block-scope-001", + "core-archival-core-block-provenance-001", + "core-archival-stale-core-detection-001", + "core-archival-archival-fallback-001", + "core-archival-project-decision-recovery-001", + ] { + let job = find_by_field(jobs, "/job_id", job_id)?; + + assert_eq!(job.pointer("/suite_id").and_then(Value::as_str), Some("core_archival_memory")); + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); + } + + let scope = find_by_field(jobs, "/job_id", "core-archival-core-block-scope-001")?; + let decision = find_by_field(jobs, "/job_id", "core-archival-project-decision-recovery-001")?; + + assert_eq!(scope.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); + assert_eq!(scope.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!(scope.pointer("/scope_violation_count").and_then(Value::as_u64), Some(0)); + assert!( + decision + .pointer("/produced_answer") + .and_then(Value::as_str) + .is_some_and(|content| content.contains("Letta remains blocked or not_tested")) + ); + assert!( + array_at(decision, "/produced_evidence")? + .iter() + .any(|id| id.as_str() == Some("decision-letta-export-boundary")) + ); + + Ok(()) +} + #[test] fn context_trajectory_fixtures_report_blocked_openviking_gates() -> Result<()> { let report = run_json_report_from(context_trajectory_fixture_dir())?; @@ -3336,9 +3680,9 @@ fn assert_root_knowledge_summary(report: &Value) { } fn assert_root_aggregate_summary(report: &Value) { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(43)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(12)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(38)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(49)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(13)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(44)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(5)); @@ -3368,8 +3712,8 @@ fn assert_root_aggregate_summary(report: &Value) { Some(0) ); assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(2)); - assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(3)); assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), @@ -3381,9 +3725,12 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(97) + Some(111) + ); + assert_eq!( + report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), + Some(111) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(97)); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); @@ -3427,6 +3774,7 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { "knowledge_compilation", "operator_debugging_ux", "memory_evolution", + "core_archival_memory", ] { let suite = find_by_field(suites, "/suite_id", suite_id)?; @@ -3449,6 +3797,11 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + let core_suite = find_by_field(suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(core_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); @@ -3471,6 +3824,8 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { let stage_job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; let production_restore = find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; + let core_fallback = find_by_field(jobs, "/job_id", "core-archival-archival-fallback-001")?; + let stale_core = find_by_field(jobs, "/job_id", "core-archival-stale-core-detection-001")?; assert_eq!(rebuild.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); assert_eq!( @@ -3482,6 +3837,8 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { assert_eq!(personalization.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); assert_eq!(stage_job.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core_fallback.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(stale_core.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!( stage_job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), Some("rerank.score") diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 5636fc71..2d99e670 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -42,8 +42,11 @@ The remaining caveats are material: behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures - behind same-corpus evidence output and missing staged artifacts. mem0 local OSS preference history - is measured separately and is an ELF loss on the current correction history + behind same-corpus evidence output and missing staged artifacts. XY-927 adds + fixture-only `core_archival_memory` coverage, but Letta scenario rows remain + blocked or `not_tested` until the selected contained export/readback path exists. + mem0 local OSS preference history is measured separately and is an ELF loss on the + current correction history scenario. The XY-923 follow-up also scores qmd's immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where @@ -81,7 +84,8 @@ results, or lifecycle failures into one aggregate leaderboard. | Command or run | Artifact | Supported claim | | --- | --- | --- | -| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 43 jobs across 12 suites with 38 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates. | +| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 49 jobs across 13 suites with 44 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates, including 6 passing `core_archival_memory` jobs. | +| `cargo make real-world-memory-core-archival` | `tmp/real-world-memory/core-archival/report.json` | ELF core-block behavior is scored separately from archival note search for attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-capture-write-policy-live-report.md` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, while agentmemory and claude-mem capture breadth are blocked until durable hook/viewer evidence exists. | | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/summary.json` | The narrow live operator-debug slice scores ELF as pass and qmd as wrong_result: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; both systems expose replay commands and repair-action guidance. | @@ -99,7 +103,7 @@ results, or lifecycle failures into one aggregate leaderboard. | --- | --- | --- | --- | --- | | Source-of-truth rebuild and evidence-bound writes | `win` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF has the strongest measured source-of-truth and rebuild story: Postgres is authoritative, Qdrant is rebuildable, trust-source jobs pass, and production restore/rebuild proof exists. | None | | Work resume and coding-agent continuity | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF and qmd both pass encoded live `work_resume` jobs. XY-925 selects agentmemory's next durable local path but keeps it blocked until the SDK KV/index and observation log survive a fresh process; claude-mem work_resume remains `not_encoded`, and OpenViking continuity trajectory remains `blocked`. | XY-928 | -| Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs; Letta-style core/archival decision memory is not tested. | XY-927 | +| Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs. The ELF `core_archival_memory` fixture also scores project-decision recovery through core routing plus archival rationale, but Letta-style comparison remains blocked without contained export evidence. | XY-927 | | Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 | | Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 | | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. | XY-905 | @@ -111,7 +115,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Private corpus and provider boundaries | `blocked` | `blocked` | Private production profile fails closed without an operator-owned manifest; provider-backed production-ops gates require explicit credentials. | XY-930 | | Personalization and scoped preferences | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `not_encoded` | ELF and qmd both pass the single encoded live personalization job. mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss. | XY-927 | | Context trajectory and hierarchical retrieval | `not_tested` | `fixture_backed`, `live_baseline_only`, `research_gate`, `wrong_result`, `blocked` | OpenViking reaches the pinned Docker local embedding path and now exposes expected/matched/missing evidence ids, but same-corpus evidence is still wrong_result; staged trajectory, hierarchy selection, and recursive expansion are encoded as blocked fixtures, not scored comparisons. | XY-928 | -| Core-vs-archival memory | `not_tested` | `research_gate`, `not_encoded` | ELF has core block semantics in the service contract, but comparable core-vs-archival jobs and a contained Letta export path are not encoded. | XY-927 | +| Core-vs-archival memory | `blocked` | `fixture_backed`, `research_gate`, `blocked`, `not_encoded` | ELF now has 6 fixture-backed `core_archival_memory` jobs that score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search. Letta remains blocked or not tested until its contained export/readback artifact maps core and archival source ids. | XY-927 | | Graph/RAG navigation and citations | `not_tested` | `smoke_only`, `research_gate`, `blocked`, `wrong_result`, `not_encoded` | Graph/RAG smokes produce scored or typed non-pass adapter reports where possible, but broad graph/RAG navigation and citation quality are not tested. | XY-929 | ## Follow-Up Queue @@ -124,7 +128,7 @@ results, or lifecycle failures into one aggregate leaderboard. | XY-925 | P1 | Fixture slice encoded; runtime paths still blocked | First-generation OSS prompt coverage and typed blockers are recorded for agentmemory, memsearch, and claude-mem; durable agentmemory hooks and claude-mem viewer/operator runs still need runtime adapters. | | XY-926 | P1 | Backlog | Live consolidation and knowledge-page suites; broad operator-debugging remains dependent on OpenMemory and claude-mem UI runners. | | XY-933 | P1 | Live ELF self-check encoded | Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked. | -| XY-927 | P1 | Backlog | Letta-style core-vs-archival memory comparison. | +| XY-927 | P1 | Fixture encoded; Letta export blocked | ELF core-vs-archival fixture coverage is encoded; a contained Letta export/readback adapter remains future work before win/tie/loss claims. | | XY-928 | P1 | Encoded blocked fixtures | OpenViking context-trajectory and hierarchy benchmark is encoded but blocked until evidence-bearing same-corpus and staged artifacts exist. | | XY-929 | P2 | Backlog | Graph/RAG adapters beyond scored smokes. | | XY-930 | P1 | Backlog | Private-corpus and credentialed production gates after operator inputs exist. | @@ -137,6 +141,9 @@ results, or lifecycle failures into one aggregate leaderboard. evidence among the tracked systems. - ELF ties qmd on encoded live retrieval, work-resume, project-decisions, and personalization slices. +- ELF fixture-backed `core_archival_memory` coverage passes attachment, scope, + provenance, stale-core detection, archival fallback, and project-decision recovery + jobs separately from archival search. - ELF has a narrow live operator-debug win over qmd for trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence, with replay-command availability and repair-action clarity tied. diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index 40c4c53a..c1ca8dcf 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -7,6 +7,8 @@ non-claim against a tracked memory, RAG, or graph project. Inputs: `docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md`, `docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md`, `docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md`, +`docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md`, +`docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`, `docs/guide/research/external_memory_improvement_plan.md`, `docs/guide/research/research_projects_inventory.md`, `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, @@ -29,10 +31,11 @@ Current boundary: live pass. The fresh ELF sweep produced 40 jobs with 22 pass, 5 wrong_result, 0 incomplete, 2 blocked, and 11 not_encoded; the fresh qmd sweep produced 17 pass, 6 wrong_result, 0 incomplete, 2 blocked, and 15 not_encoded. -- ELF fixture evidence is strong: `cargo make real-world-memory` reports 43 jobs - across 12 suites with 38 pass and 5 blocked production-ops or OpenViking - context-trajectory measurement gates. - That proves the fixture contract, not live-service parity. +- ELF fixture evidence is strong: `cargo make real-world-memory` reports 49 jobs + across 13 suites with 44 pass and 5 blocked production-ops or OpenViking + context-trajectory measurement gates. The added `core_archival_memory` suite + contributes 6 fixture-only passes for ELF core-block behavior; it does not create + an ELF-over-Letta claim. This proves the fixture contract, not live-service parity. - qmd is the strongest measured local retrieval-debug comparison, but the current evidence still separates its same-corpus/live-retrieval strengths from the full-suite live non-pass sweep. @@ -46,7 +49,7 @@ Current boundary: The current manifest has 23 adapter records across 16 external projects plus ELF. Evidence-class counts: 1 `fixture_backed`, 6 `live_baseline_only`, 5 `live_real_world`, and 11 `research_gate`. Overall adapter-status counts: 4 `pass`, -6 `wrong_result`, 1 `lifecycle_fail`, 6 `blocked`, and 6 `not_encoded`. +6 `wrong_result`, 1 `lifecycle_fail`, 7 `blocked`, and 5 `not_encoded`. ## State Taxonomy @@ -84,7 +87,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | LightRAG | Lightweight graph/RAG context export with source file-path citation shape. | `research_gate`. | `blocked`: `ELF_LIGHTRAG_CONTEXT_START=1 cargo make lightrag-docker-context-smoke`, `tmp/real-world-memory/lightrag-context/summary.json`. | `blocked`: Docker service setup and context export are not proven. | XY-886 Docker context-export adapter with explicit provider config and source citation mapping. | Context-only query modes, graph-aware retrieval layout, and file-path citation readback. | | GraphRAG | GraphRAG indexing, graph summaries, and document/text-unit evidence tables. | `research_gate`. | `blocked`: `ELF_GRAPHRAG_SMOKE_RUN=1 cargo make graphrag-docker-smoke`, `tmp/real-world-memory/graphrag-smoke/summary.json`. | `blocked`: indexing resource envelope and source citation mapping are not proven. | XY-887 cost-bounded Docker adapter over a tiny corpus and scored output tables. | Graph summary artifacts, local/global search separation, and source table evidence mapping. | | Graphiti/Zep | Temporal graph memory with current, historical, and future fact validity windows. | `research_gate`. | `blocked`: `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke`, `tmp/real-world-memory/graphiti-zep-smoke/summary.json`. | `blocked`: Docker graph-store and temporal adapter are not proven. | XY-888 Docker-local temporal graph adapter scoring current/historical fact validity. | Temporal fact windows, invalidation/supersession semantics, and graph fact provenance. | -| Letta | Core memory blocks versus archival memory with explicit operating-context surfaces. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `blocked`: contained evidence export path is not selected. | Select contained export contract, then encode core-vs-archival, personalization, and project-decision jobs. | Core memory block ergonomics, archival separation, and shared operating context readback. | +| Letta | Core memory blocks versus archival memory with explicit operating-context surfaces. | `research_gate`. | `blocked`: the selected comparison contract is a Docker-only benchmark-created agent export that returns core block JSON, archival search/readback JSON, and source ids; no materialized export exists yet. | `blocked`: no Letta materializer currently creates the benchmark agent, imports the ELF `core_archival_memory` fixture corpus, or exports comparable core and archival evidence. | Implement and run the contained export/readback adapter before any Letta win, tie, or loss claim; keep personalization and project-decision scenarios blocked or not tested until that evidence exists. | Core memory block ergonomics, archival separation, and shared operating context readback. | | LangGraph | Checkpoint/replay regression workflow and durable state replay for agent runs. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: not a standalone memory backend adapter. | Non-goal for direct win/loss until a standalone memory output contract exists; use replay jobs as benchmark infrastructure reference. | Checkpoint replay, deterministic regression, and state-diff evaluation patterns. | | nanograph | Typed graph schema and query ergonomics for graph-lite developer experience. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: not a memory backend comparison target. | Non-goal for direct win/loss unless a contained memory-backed target emerges; measure ELF graph-lite DX instead. | Typed relation schema, query ergonomics, and small graph developer experience. | | llm-wiki | LLM-maintained wiki or knowledge-page workflow with query-save and lint loops. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: no live service runtime for adapter proof. | Select contained plugin or instruction harness, then score knowledge pages for citations, unsupported claims, rebuild, and stale-source lint. | Maintained wiki workflows, page lint, query-save loops, and topic-scoped navigation. | @@ -97,7 +100,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | --- | --- | --- | --- | --- | | Retrieval/debug | Fixture retrieval passes; live retrieval passes. | qmd. | qmd live retrieval passes and live baseline passes, but full-suite live status is `wrong_result`. | Run qmd deep profile and ELF/qmd trace-level replay with expansion, fusion, rerank, and candidate-drop diagnostics. | | Work resume | Fixture and live work_resume pass. | agentmemory, claude-mem, OpenViking. | agentmemory `lifecycle_fail`; claude-mem work_resume remains `not_encoded` pending a durable repository-backed adapter; OpenViking work_resume is `not_encoded`. | Encode durable work_resume adapters or keep each blocked with lifecycle/setup evidence. | -| Project decisions | Fixture and live project_decisions pass. | qmd, Letta. | qmd live project_decisions pass; Letta is `research_gate` `not_encoded`. | Add Letta core/archival decision jobs only after a contained export path exists. | +| Project decisions | Fixture and live project_decisions pass; the ELF core-archival fixture also scores project-decision recovery through core routing plus archival rationale. | qmd, Letta. | qmd live project_decisions pass; Letta project-decision recovery is `research_gate` `not_tested` or `blocked` until the contained export path exists. | Run the Letta core/archival export/readback contract before treating project-decision recovery as a comparable scenario. | | Source-of-truth | Fixture and live trust_source_of_truth pass. | memsearch. | memsearch canonical-store, reindex, delete, and reload smoke passes; XY-925 fixture-backed source-of-truth prompts now cover the canonical Markdown rebuild/reload boundary, but no live memsearch prompt adapter pass is claimed. | Promote memsearch source-of-truth rebuild/reload prompts into a live adapter before any suite-level win/loss claim. | | Temporal/current-vs-historical memory | Fixture memory_evolution passes; live memory_evolution is `wrong_result`. | Graphiti/Zep, mem0/OpenMemory. | Graphiti/Zep is `research_gate` `blocked`; mem0/OpenMemory local OSS preference history, entity scope, deletion audit, and SDK `get_all` now pass; OpenMemory UI/export is blocked by the export-helper setup probe; graph-memory scenarios are `not_encoded`. | Fix ELF/qmd live memory_evolution evidence links, add OpenMemory product app import/export readback, and run XY-888. | | Consolidation | Fixture consolidation passes; live consolidation is `not_encoded`. | agentmemory, managed-memory references, llm-wiki. | No manifest project has live consolidation scoring. | Run reviewable consolidation proposal generation with source refs, unsupported-claim flags, and audit transitions. | @@ -107,7 +110,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Production ops | Fixture production_ops has 4 pass and 2 blocked; live production_ops is `blocked`; production adoption has provider/backfill/restore evidence. | ELF production gate, qmd, RAG/RAGFlow resource gates. | qmd live production_ops is `blocked`; RAG/resource gates are `research_gate` `blocked`. | Rerun private-corpus and credentialed gates only when operator-owned manifest and credentials exist. | | Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory local OSS entity-scoped personalization now passes, so scoped preference behavior is a measured tie; OpenMemory UI/export remains blocked, hosted Platform export is non-goal, optional graph memory remains outside local OSS scoring, and Letta personalization is `research_gate` `not_encoded`. | Add OpenMemory product app import/export and contained Letta scoped-preference readback before broader personalization superiority claims. | | Context trajectory | ELF has trace direction but no comparable staged trajectory scenario. | OpenViking. | OpenViking setup is pinned, same-corpus retrieval is `wrong_result`, and staged/hierarchy/recursive trajectory jobs are encoded as `blocked`. | Make OpenViking evidence-bearing retrieval pass, then score staged context trajectory outputs. | -| Core-vs-archival memory | ELF core-block semantics exist in the service contract, but comparative benchmark coverage is not encoded here. | Letta. | Letta is `research_gate` `not_encoded` until contained export proof exists. | Add ELF core-block versus archival-search jobs; compare Letta only after contained export proof. | +| Core-vs-archival memory | Fixture `core_archival_memory` passes 6/6 and scores core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search. | Letta. | Letta is `research_gate` `blocked`/`not_tested` until the selected contained export/readback artifact exists. | Implement the Letta export/readback adapter, then compare only scenarios whose core block JSON, archival search/readback JSON, and source ids are present. | | Graph/RAG navigation | ELF relation context is not enough to claim graph/RAG navigation parity. | RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify. | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain `research_gate` blocked/incomplete without explicit setup; graphify has only a tiny scored smoke `wrong_result`. | Run larger contained graph/RAG adapters with evidence-linked outputs before any ELF graph/RAG win, tie, or loss claim. | ## Parallelizable Benchmark Follow-Ups @@ -130,7 +133,7 @@ now explicit: | Graphiti/Zep temporal graph adapter | XY-888 | yes | Docker-local graph store setup. | Current/historical/future fact validity and evidence ids. | | graphify graph report adapter | XY-889 plus post-XY-900 expansion | yes | Representative graph/RAG jobs beyond the tiny scored smoke. | `graph.json` and `GRAPH_REPORT` evidence mapped to scored graph navigation and knowledge synthesis ids. | | Private corpus and credentialed production ops | Operator-owned benchmark gates | no | Sanitized private manifest and routed provider credentials. | Private-corpus retrieval quality and credentialed production-ops evidence. | -| Letta, LangGraph, nanograph, llm-wiki direct adapters | Research-only until output contract | no | Contained evidence export or non-memory-backend comparability contract. | Run only after each has a comparable output contract; otherwise keep as product-reference evidence. | +| Letta, LangGraph, nanograph, llm-wiki direct adapters | Letta export artifact blocked; others research-only until output contract | no | Letta needs the selected contained export/readback artifact; the others need a non-memory-backend comparability contract. | Run only after comparable output exists; otherwise keep as product-reference evidence. | ## Validation Contract diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index 55ce3ed4..6fa05a45 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -44,20 +44,20 @@ The strongest current statement is: | Metric | Value | | --- | ---: | -| Jobs | `43` | -| Encoded suites | `12` | -| Pass | `38` | +| Jobs | `49` | +| Encoded suites | `13` | +| Pass | `44` | | Blocked | `5` | | Wrong result | `0` | | Lifecycle fail | `0` | | Incomplete | `0` | | Not encoded | `0` | | Unsupported claim | `0` | -| Mean score | `0.884` | -| Evidence coverage | `97/97` | -| Source-ref coverage | `97/97` | -| Quote coverage | `97/97` | -| Expected evidence recall | `89/89` | +| Mean score | `0.898` | +| Evidence coverage | `111/111` | +| Source-ref coverage | `111/111` | +| Quote coverage | `111/111` | +| Expected evidence recall | `100/100` | This proves the fixture contract is broad and well controlled. It does not prove that every live adapter or every competitor runtime passes those scenarios. @@ -118,8 +118,8 @@ Overall adapter statuses: | `pass` | `4` | | `wrong_result` | `6` | | `lifecycle_fail` | `1` | -| `blocked` | `6` | -| `not_encoded` | `6` | +| `blocked` | `7` | +| `not_encoded` | `5` | The ledger is intentionally not a leaderboard. It prevents fixture evidence, same-corpus checks, research gates, and live real-world runs from being collapsed into @@ -131,7 +131,7 @@ one misleading score. | --- | --- | --- | | Retrieval/debug | ELF and qmd are tied on encoded live retrieval; qmd remains the stronger debug UX reference. | Add trace-level replay, expansion/fusion/rerank knobs, candidate-drop diagnosis, and command-line replay. | | Work resume | ELF live work-resume passes; continuity-oriented competitors are undermeasured. | Borrow agentmemory/claude-mem capture breadth and OpenViking staged context, but require durable adapter proof. | -| Project decisions | ELF and qmd live project-decision suites pass; Letta is not encoded. | Add core-vs-archival decision-memory scenarios before comparing Letta. | +| Project decisions | ELF and qmd live project-decision suites pass; ELF fixture-backed `core_archival_memory` also scores project-decision recovery, while Letta remains blocked without export evidence. | Run the Letta core/archival export/readback contract before treating project-decision recovery as comparable. | | Source of truth | ELF has the strongest measured source-of-truth evidence. | Borrow memsearch's local canonical-store ergonomics without making files or vectors authoritative. | | Temporal memory | ELF fixture passes, but live memory evolution is wrong_result. | Prioritize current-vs-historical evidence links and Graphiti/Zep-style validity windows. | | Consolidation | ELF fixture passes, but live proposal generation is not encoded. | Build reviewable derived proposals with source refs, confidence, unsupported-claim flags, and apply/defer/discard audit. | @@ -139,9 +139,9 @@ one misleading score. | Operator debugging | Fixture UX passes and the narrow live trace/viewer slice is scored: ELF passes, qmd ties replay/repair clarity but is wrong_result for trace hydration and candidate-drop visibility. | Expand coverage to OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | ELF live capture/write-policy self-check passes with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem is `not_encoded`. | Borrow agentmemory/claude-mem capture breadth only after durable local hook/viewer evidence exists, while preserving redaction and evidence binding. | | Production ops | ELF has the strongest checked-in evidence, with private/credential gates blocked. | Keep Docker-first production proof and add private corpus only when an operator-owned manifest exists. | -| Personalization | ELF live personalization passes; mem0/OpenMemory and Letta are not encoded. | Add entity-scoped preference history and UI readback before claiming stronger personalization. | +| Personalization | ELF live personalization passes; mem0/OpenMemory ties the entity-scoped personalization smoke but still lacks a broader real-world prompt adapter, and Letta scoped preference readback remains not tested until its contained export path exists. | Add broader entity/preference history and UI readback before claiming stronger personalization. | | Context trajectory | Not comparable yet; OpenViking remains the reference. | Score staged retrieval, hierarchy expansion, and trajectory readback. | -| Core-vs-archival | Product gap, not a measured comparison yet. | Borrow Letta's core memory block shape with explicit scope, provenance, and read-only attachment. | +| Core-vs-archival | ELF fixture-backed `core_archival_memory` passes 6/6, but Letta remains blocked/not tested because no contained export artifact exists. | Borrow Letta's core memory block shape while keeping any win/tie/loss claim gated on exported core block, archival readback, and source-id evidence. | | Graph/RAG navigation | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain research gates; graphify has a tiny scored `wrong_result` smoke. | Run larger contained graph/RAG adapters before any broad graph-navigation claim. | ## Project Guidance Matrix @@ -159,7 +159,7 @@ one misleading score. | LightRAG | `research_gate`; current status is `blocked`. | Lightweight graph/RAG context export and source-path citation shape. | Borrow context-export ideas for graph/RAG navigation after Docker proof. | | GraphRAG | `research_gate`; current status is `blocked`. | Graph summaries, document/text-unit tables, local/global search separation. | Borrow graph summary artifacts for knowledge pages and graph navigation after cost-bounded output proof. | | Graphiti/Zep | `research_gate`; current status is `blocked`. | Temporal graph facts, validity windows, current-vs-historical answers. | Use as the semantic model for ELF temporal memory and relation validity benchmarks. | -| Letta | `research_gate`; current status is `not_encoded`. | Core memory blocks versus archival memory. | Add explicit scoped core blocks in ELF, but compare Letta only after a contained export path exists. | +| Letta | `research_gate`; current status is `blocked` until the selected contained export/readback artifact exists. | Core memory blocks versus archival memory. | Keep ELF's fixture-backed core block coverage separate from Letta comparison claims; compare Letta only after exported core and archival evidence exists. | | LangGraph | `research_gate`; current status is `not_encoded` or `unsupported` as a direct memory backend. | Checkpoint, replay, fork, and regression debugging for agent state. | Borrow replay/regression patterns for benchmark infrastructure, not as direct memory parity. | | nanograph | `research_gate`; current status is `not_encoded` or `unsupported` as a full memory backend. | Typed graph schema and query ergonomics. | Borrow graph-lite DX and typed relation query ideas. | | llm-wiki | `research_gate`; current status is `not_encoded`. | Maintained wiki pages, query-save, lint, and repair loops. | Use as a reference for rebuildable, cited knowledge pages. | @@ -227,8 +227,10 @@ These improve day-to-day usefulness while preserving ELF's evidence-bound core. - Borrow from: Letta core memory versus archival memory. - ELF shape: scoped read-only blocks with provenance and attachment rules, separate from archival search. - - Benchmark gate: core-vs-archival jobs prove correct attachment, sharing, and - fallback to search. + - Benchmark gate: ELF fixture jobs now prove attachment, scope, provenance, + stale-core detection, archival fallback, and project-decision recovery; Letta + comparison remains gated on exported core block, archival readback, and source-id + evidence. ### P2 - Expand External Comparison Without Fake Wins @@ -269,7 +271,9 @@ Do not claim: - ELF beats mem0/OpenMemory on hosted memory, entity history, UI, or optional graph memory. Those scenarios are not encoded; the operator-debug win is only against qmd on a narrow trace/replay slice. -- ELF beats Letta on core-vs-archival memory. That scenario is not encoded. +- ELF beats Letta on core-vs-archival memory. ELF has fixture-backed coverage, but + Letta remains blocked/not tested until the selected contained export/readback path + produces comparable source-id-mapped evidence. - ELF beats RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, or graphify on graph/RAG navigation. Current evidence is research-gate or blocked except graphify's tiny non-pass smoke. diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index 3174aeed..470a89a7 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -5,10 +5,11 @@ not comparable, and which measurement reports should guide future ELF iteration. Read this when: You need to answer whether ELF has enough empirical evidence to claim a win, tie, loss, or non-claim against tracked memory, RAG, graph, and agent-continuity projects. -Inputs: A fresh local `cargo make real-world-memory` run in the current XY-928 lane -after OpenViking context-trajectory fixture encoding, the retained XY-933 -`cargo make real-world-memory-live-adapters` evidence after live capture/write-policy -scoring, plus +Inputs: Fresh local runs of `cargo make real-world-memory-core-archival`, +`cargo make real-world-memory`, and retained XY-933 +`cargo make real-world-memory-live-adapters` evidence after XY-927 +core-vs-archival fixture coverage, XY-928 OpenViking context-trajectory fixture +encoding, and live capture/write-policy scoring, plus `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, `2026-06-11-competitor-strength-evidence-matrix.md`, and `2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md`. @@ -23,10 +24,13 @@ tracked project's strongest scenario. What is proven today: -- ELF has a strong fixture-backed real-world benchmark contract: 43 jobs, 38 pass, - 5 blocked operator or measurement-gate boundaries, and no wrong results in the - fixture aggregate. The added XY-928 `context_trajectory` jobs are blocked - OpenViking staged/hierarchy/recursive gates, not ELF wins. +- ELF has a strong fixture-backed real-world benchmark contract: 49 jobs across 13 + suites, 44 pass, 5 blocked operator or measurement-gate boundaries, and no wrong + results in the fixture aggregate. The new `core_archival_memory` suite contributes + 6 passing jobs for core block attachment, scope, provenance, stale-core detection, + archival fallback, and project-decision recovery. The added XY-928 + `context_trajectory` jobs are blocked OpenViking staged/hierarchy/recursive gates, + not ELF wins. - ELF and qmd have comparable full-suite live real-world sweeps, but neither has a full-suite live pass. ELF is five passes ahead in the fresh aggregate because qmd misses the memory-evolution delete/TTL tombstone job and the capture/write-policy @@ -40,10 +44,10 @@ What is proven today: checked-in provider synthetic, stress, backfill, backup/restore, and Qdrant rebuild evidence. - The current comparison still undermeasures most competitor strengths. OpenViking - trajectory, mem0/OpenMemory entity history and UI, Letta core-vs-archival memory, - Graphiti/Zep temporal graph behavior, graph/RAG navigation, agentmemory and - claude-mem continuity/capture breadth, and knowledge-page workflows remain - non-claims. + trajectory, mem0/OpenMemory entity history and UI, Letta product export/readback + for core-vs-archival memory, Graphiti/Zep temporal graph behavior, graph/RAG + navigation, agentmemory and claude-mem continuity/capture breadth, and knowledge-page + workflows remain non-claims. The separate XY-932 operator-debug live slice now scores ELF against qmd for trace hydration and candidate-drop visibility, but does not cover OpenMemory or claude-mem UI flows. @@ -53,12 +57,14 @@ production," but the competitiveness objective remains open. ## Fresh Runs -The fixture command was refreshed in the current XY-928 lane after the OpenViking -context-trajectory fixtures were added. The live-adapter command records the retained -XY-933 evidence after live capture/write-policy scoring: +These commands were run in the current benchmark lanes after adapter-report +consistency repairs, the XY-927 core-vs-archival fixture update, the XY-928 +OpenViking context-trajectory fixture update, and XY-933 live capture/write-policy +scoring: | Command | Result | Runtime | | --- | --- | ---: | +| `cargo make real-world-memory-core-archival` | pass | 12.14 seconds | | `cargo make real-world-memory` | pass | 11.09 seconds | | `cargo make real-world-memory-live-adapters` | pass | 137.66 seconds | @@ -73,21 +79,21 @@ failure. | Metric | Value | | --- | ---: | -| Jobs | `43` | -| Encoded suites | `12` | -| Pass | `38` | +| Jobs | `49` | +| Encoded suites | `13` | +| Pass | `44` | | Blocked | `5` | | Wrong result | `0` | | Lifecycle fail | `0` | | Incomplete | `0` | | Not encoded | `0` | | Unsupported claim | `0` | -| Mean score | `0.884` | +| Mean score | `0.898` | | Mean latency | `3.940 ms` | -| Expected evidence recall | `89/89` | -| Evidence coverage | `97/97` | -| Source-ref coverage | `97/97` | -| Quote coverage | `97/97` | +| Expected evidence recall | `100/100` | +| Evidence coverage | `111/111` | +| Source-ref coverage | `111/111` | +| Quote coverage | `111/111` | This proves fixture contract breadth and scoring behavior. It does not prove every live adapter or competitor runtime can complete those jobs. @@ -150,8 +156,8 @@ The checked-in manifest records 23 adapter records across 17 unique project name | `pass` | `4` | | `wrong_result` | `6` | | `lifecycle_fail` | `1` | -| `blocked` | `6` | -| `not_encoded` | `6` | +| `blocked` | `7` | +| `not_encoded` | `5` | The generated JSON report emits `external_project_count: 16`, matching the unique non-ELF project-name count from the manifest. The companion audit JSON separately @@ -164,7 +170,7 @@ records `unique_project_names: 17` for the full project list including ELF. | ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 5 blocked operator or measurement-gate boundaries; live full sweep is `wrong_result`; live capture/write-policy and narrow operator-debug slices pass. | Full live memory evolution, live consolidation, live knowledge pages, live production ops, competitor capture hooks, OpenViking staged trajectory artifacts, and broader operator UI runners. | Memory-evolution diagnostic report, then consolidation/knowledge reports plus agentmemory/claude-mem capture, OpenViking staged trajectory artifacts, and OpenMemory/claude-mem UI runners. | | qmd | `live_real_world` plus `live_baseline_only` | Fresh full sweep is five passes behind ELF because qmd misses the delete/TTL tombstone job and keeps capture/write-policy jobs typed `not_encoded`; same-corpus baseline passes; narrow operator-debug live slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | Deep retrieval-debug ergonomics and trace replay beyond the narrow operator-debug slice. | qmd/ELF deep retrieval-debug profile with expansion, fusion, rerank, and dropped-candidate traces. | | agentmemory | `live_baseline_only` | `lifecycle_fail`; capture comparison is `blocked` because the Docker baseline uses a process-local StateKV Map and in-memory index, with no durable local session/capture path for source ids, exclusions, write-policy audit, or evidence-bound output. | Durable coding-agent continuity and capture hooks. | Durable lifecycle and work-resume/capture adapter report. | -| mem0/OpenMemory | `live_baseline_only` | Basic local smoke now passes; history/UI/hosted/graph behavior remains `not_encoded`. | Entity history, lifecycle UI, OpenMemory inspection. | Entity-history, deletion-audit, and UI/export readback report. | +| mem0/OpenMemory | `live_baseline_only` | Basic local smoke and local OSS history/readback pass; OpenMemory UI/export is blocked, hosted Platform export is a non-goal, and optional graph plus broader prompt coverage remain `not_encoded`. | Entity history, lifecycle UI, OpenMemory inspection. | Entity-history, deletion-audit, and UI/export readback report. | | memsearch | `live_baseline_only`; XY-925 `fixture_backed` | Basic canonical Markdown reindex/reload smoke passes, and XY-925 adds fixture-backed source-store and retrieval-debug prompts without claiming a live memsearch adapter pass. | Markdown canonical store and local reindex clarity. | Runtime source-of-truth and retrieval-debug adapter execution over the existing prompt jobs. | | OpenViking | `live_baseline_only` plus `fixture_backed` and `research_gate` | Same-corpus retrieval is `wrong_result`; staged retrieval, hierarchy selection, and recursive/context expansion are encoded as blocked fixtures. | Hierarchical staged context trajectory. | Evidence-bearing retrieval fix, then materialized staged trajectory report. | | claude-mem | `live_baseline_only`; XY-925 `fixture_backed` | Same-corpus retrieval remains `wrong_result`; XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompts, with hook capture and viewer/operator workflows still blocked. | Progressive disclosure and automatic capture review. | Work-resume, operator-debugging, capture/write-policy, and viewer/operator runtime report. | @@ -172,7 +178,7 @@ records `unique_project_names: 17` for the full project list including ELF. | LightRAG | `research_gate` | `blocked`. | Graph/RAG context export with source-path citations. | Docker context-export report with explicit provider config and source citation mapping. | | GraphRAG | `research_gate` | `blocked`. | Graph summaries and document/text-unit evidence tables. | Cost-bounded Docker adapter report over a tiny corpus. | | Graphiti/Zep | `research_gate` | `blocked`. | Temporal graph facts and validity windows. | Docker-local temporal graph adapter report for current and historical facts. | -| Letta | `research_gate` | `not_encoded`. | Core memory blocks versus archival memory. | Contained export contract, then core-vs-archival and decision-memory report. | +| Letta | `research_gate` | `blocked` for the selected contained export/readback path; scenario rows remain `not_tested` or `blocked`. | Core memory blocks versus archival memory. | Implement the Docker-only export/readback adapter before any Letta win/tie/loss claim. | | LangGraph | `research_gate` | `not_encoded`; direct memory backend is unsupported. | Checkpoint replay and fork/regression debugging. | Treat as benchmark-infra reference unless a memory-output contract emerges. | | nanograph | `research_gate` | `not_encoded`; full memory backend is unsupported. | Typed graph schema and query ergonomics. | Typed relation query report only if evidence ids can be emitted. | | llm-wiki | `research_gate` | `not_encoded`. | Wiki/page generation, query-save, lint and repair loops. | Contained page-generation report with citation and unsupported-claim lint. | @@ -185,7 +191,7 @@ records `unique_project_names: 17` for the full project list including ELF. | --- | --- | --- | --- | | Retrieval/debug | ELF and qmd live retrieval pass; qmd same-corpus baseline passes. | Tie on encoded live retrieval; no ELF-over-qmd UX claim. | qmd/ELF deep trace replay and debug ergonomics scoring. | | Work resume | ELF and qmd live pass. | ELF is credible on encoded work resume. | agentmemory, claude-mem, and OpenViking comparable continuity adapters. | -| Project decisions | ELF and qmd live pass. | ELF is credible on encoded project-decision recovery. | Letta core/archival decision memory comparison. | +| Project decisions | ELF and qmd live pass; ELF fixture coverage also passes core routing plus archival rationale recovery. | ELF is credible on encoded project-decision recovery. | Letta core/archival decision memory export and scoring. | | Source of truth | ELF and qmd live pass; ELF has stronger production restore/rebuild evidence. | ELF has strongest measured source-of-truth discipline. | memsearch source-of-truth reindex/reload evidence. | | Memory evolution | ELF live fails 5/6 jobs; qmd live fails 6/6 jobs after missing the delete/TTL tombstone evidence; fixture aggregate passes. | No broad live superiority claim. | Historical conflict evidence links and Graphiti/Zep temporal comparison. | | Consolidation | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live proposal generation with lineage, confidence, and review-action audit. | @@ -195,7 +201,7 @@ records `unique_project_names: 17` for the full project list including ELF. | Production ops | ELF has separate production-provider/backfill/restore evidence; live sweep is not a full production-ops pass. | Bounded personal-production adoption claim with caveats. | Private corpus manifest and credentialed provider gates. | | Personalization | ELF and qmd live pass one scoped preference job. | Narrow encoded pass only. | mem0/OpenMemory and Letta entity/preference history comparison. | | Context trajectory | Not comparable. | No claim. | OpenViking staged hierarchy/trajectory scoring. | -| Core-vs-archival memory | Not comparable. | No claim. | Letta contained export and ELF core-block benchmark. | +| Core-vs-archival memory | ELF fixture suite passes 6/6; Letta comparison is blocked until export/readback evidence exists. | Fixture-only ELF core-block claim; no ELF-over-Letta claim. | Letta contained export/readback artifact with core block JSON, archival search/readback JSON, and source ids. | | Graph/RAG navigation | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain typed research gates; graphify has a tiny scored `wrong_result` smoke. | No graph/RAG parity claim; only graphify's bounded non-pass smoke can be cited. | Larger contained RAG/graph adapters with evidence-linked outputs before any ELF graph/RAG win, tie, or loss claim. | ## Next Measurement Reports diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 1668aa31..e2eb3469 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -55,9 +55,9 @@ cleanup, use `docs/guide/single_user_production.md`. optimization-direction report that translates measured benchmark data and competitor strengths into prioritized ELF iteration themes and explicit non-claims. - `2026-06-11-measurement-coverage-audit.md`: fresh coverage audit that separates - current measured ELF/qmd data, fixture evidence, external adapter ledger coverage, - scenario non-claims, and the next measurement reports needed before stronger - competitor claims. + current measured ELF/qmd data, fixture evidence including the XY-927 + `core_archival_memory` suite, external adapter ledger coverage, scenario non-claims, + and the next measurement reports needed before stronger competitor claims. - `2026-06-11-elf-qmd-retrieval-debug-profile.md`: fresh ELF/qmd retrieval-debug profile with real-world retrieval-suite evidence, 480-document stress baseline evidence, qmd top-10 artifact inspection, and explicit rerank/fusion non-claims. @@ -95,9 +95,10 @@ cleanup, use `docs/guide/single_user_production.md`. `real_world_job` adapter reports without converting smoke evidence into quality claims. - `2026-06-11-competitor-strength-adoption-report.md`: XY-901 final - competitor-strength adoption report with the bounded personal-production decision, - scenario-level win/tie/loss/not-tested matrix, claim boundaries, and optimization - issue queue. + competitor-strength adoption report, updated by XY-927 with fixture-backed + core-vs-archival coverage and a blocked Letta export/readback boundary, plus the + bounded personal-production decision, scenario-level win/tie/loss/not-tested + matrix, claim boundaries, and optimization issue queue. - `2026-06-11-capture-write-policy-live-report.md`: XY-933 live capture/write-policy report that scores ELF redaction, exclusions, source ids, evidence binding, and no secret leakage while preserving typed blocked/untested boundaries for agentmemory diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index c15cc912..4e6bd18d 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -58,6 +58,7 @@ compile knowledge, and state honest uncertainty. | Capture/integration | Accuracy of hooks, imports, exclusions, and write policies. | Capture a session decision while excluding private spans. | | Production ops | Backfill, restore, cold start, resource, and bounded-failure behavior. | Resume interrupted import without duplicate source notes. | | Personalization | Scoped preferences without cross-tenant leakage. | Apply the user's current preference and ignore another project's note. | +| Core/archival memory | Always-loaded core memory behavior kept separate from archival note search. | Detect a stale core block and fall back to archival evidence. | | Context trajectory | Staged context trajectory, hierarchy selection, and recursive expansion. | Block OpenViking trajectory scoring until same-corpus evidence ids and comparable stage artifacts exist. | ## External Reference Mapping @@ -165,6 +166,9 @@ including the retrieval-quality slice below. The suite currently encodes: classification, and provider credential boundary `blocked` classification. - `personalization`: scoped stable preference correction without temporary or cross-project preference leakage. +- `core_archival_memory`: core block attachment, scope, provenance, stale-core + detection, archival fallback, and project-decision recovery through core routing + plus archival rationale. - `context_trajectory`: OpenViking staged retrieval, hierarchy selection, and recursive/context expansion jobs encoded as `blocked` until same-corpus expected evidence ids and comparable stage artifacts are available. @@ -225,14 +229,19 @@ research gates. Its `external_adapters` report section distinguishes: - `research_gate`: checked-in source/setup/runtime/resource/retry metadata for a future adapter path, not fixture-backed or live execution evidence. -Current fixture state: `cargo make real-world-memory` covers 43 jobs across 12 suites, -with 38 pass and 5 blocked. The blocked jobs are production-ops operator boundaries -plus the XY-928 OpenViking `context_trajectory` gates for staged retrieval, hierarchy -selection, and recursive/context expansion. +Current fixture state: `cargo make real-world-memory` covers 49 jobs across 13 suites, +with 44 pass and 5 blocked. The added `core_archival_memory` suite contributes six +passing fixture jobs for core block attachment, scope, provenance, stale-core +detection, archival fallback, and project-decision recovery. The blocked jobs are +production-ops operator boundaries plus the XY-928 OpenViking `context_trajectory` +gates for staged retrieval, hierarchy selection, and recursive/context expansion. Current live-adapter state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full encoded-suite sweep through `cargo make real-world-memory-live-adapters`. Each adapter materializes generated runtime answers for 40 jobs across 11 suites before scoring. +The fixture-only `core_archival_memory` suite can also be run through +`cargo make real-world-memory-core-archival`; it is not yet included in that live +sweep. The original targeted `work_resume`, `retrieval`, and `project_decisions` slice still passes, and ELF now passes the live `capture_integration` self-checks for redaction, exclusions, source ids, evidence binding, and no secret leakage. The full sweep is @@ -243,12 +252,16 @@ operator_debugging_ux remain `not_encoded` for this live adapter path. qmd keeps `live_baseline_only` same-corpus record for update/delete/cold-start checks; that record is not a real-world suite win. agentmemory is blocked on durable upstream storage for lifecycle proof and capture breadth. mem0/OpenMemory, memsearch, and -claude-mem currently retain wrong-result, not-encoded, or incomplete live-baseline -states for the checked-in adapter evidence. OpenViking now reaches its pinned Docker -local embedding setup but remains a same-corpus `wrong_result` until it returns -evidence-bearing retrieval output. The checked-in `context_trajectory` fixtures keep -OpenViking staged retrieval, hierarchy selection, and recursive/context expansion -blocked until same-corpus evidence ids match and staged artifacts are materialized. +claude-mem no longer share one live-baseline boundary: mem0/OpenMemory and memsearch +now pass scoped local baseline paths, while OpenMemory product UI/export, hosted +Platform behavior, optional graph memory, memsearch real-world prompt/TTL coverage, +and claude-mem hook/viewer capture remain blocked, unsupported, not encoded, or +wrong-result for the checked-in adapter evidence. OpenViking now reaches its pinned +Docker local embedding setup but remains a same-corpus `wrong_result` until it +returns evidence-bearing retrieval output. The checked-in `context_trajectory` +fixtures keep OpenViking staged retrieval, hierarchy selection, and recursive/context +expansion blocked until same-corpus evidence ids match and staged artifacts are +materialized. The expanded RAG and graph-memory records for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper qmd/OpenViking profiles are `research_gate` records until diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md index 05e12a0d..7173ecb1 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/guide/research/comparison_external_projects.md @@ -50,10 +50,14 @@ Use the evidence class before making claims: until a deep dive or adapter run exists. Current benchmark-grounded scope is narrow. The June 9, 2026 all-project smoke run -proved encoded same-corpus/lifecycle behavior only for the current adapters: ELF and qmd -passed their encoded smoke checks; agentmemory passed same-corpus retrieval but failed -or could not prove durable lifecycle behavior; memsearch, mem0, OpenViking, and -claude-mem retained `incomplete`, wrong-result, or not-encoded states. All broader suite +proved encoded same-corpus/lifecycle behavior only for the then-current adapters: ELF +and qmd passed their encoded smoke checks; agentmemory passed same-corpus retrieval but +failed or could not prove durable lifecycle behavior; memsearch, mem0, OpenViking, and +claude-mem retained `incomplete`, wrong-result, or not-encoded states. Later June 11 +follow-ups promote scoped local mem0/OpenMemory and memsearch baseline paths, while +OpenMemory UI/export, hosted Platform behavior, optional graph memory, broader +memsearch prompt/TTL coverage, OpenViking staged trajectory, and claude-mem hook/viewer +capture remain blocked, unsupported, not encoded, or wrong-result. All broader suite fit below is research guidance, not a benchmark result. The real-world job runner now carries a separate external adapter coverage manifest: @@ -100,8 +104,8 @@ Project-to-suite map: | agentmemory | `rw.operator-continuity`, `rw.resume-evidence`, `rw.lifecycle-staleness` | Cross-agent hooks, MCP/REST packaging, viewer, lifecycle/consolidation claims, and coding-agent continuity focus make it the right reference for daily agent memory ergonomics. | Use durable upstream storage rather than the current in-memory mock; ingest realistic agent sessions through the public hook/API path; prove restart, update/supersede, delete, and viewer/trace readback. | Mixed: benchmark-grounded only for current same-corpus retrieval; current lifecycle evidence is a failure/blocker, while hooks/viewer/consolidation are docs-grounded. Confidence: medium for suite fit, low for durable adapter quality. | ELF is stronger on evidence-bound writes and source-of-truth discipline; agentmemory remains the reference for capture breadth and agent-continuity UX. | | qmd | `rw.retrieval-debug`, `rw.lifecycle-staleness`, `rw.resume-evidence` | Its local CLI, structured JSON query output, expansion modes, hybrid routing, weighted fusion, rerank, update, delete, and cold-start path make it the strongest local retrieval-debug baseline. | Run `qmd` over the real-world corpus, capture query JSON, then rewrite/delete corpus files and rerun update/embed/query in fresh processes. | Benchmark-grounded for current smoke retrieval/update/delete/cold-start pass; docs-grounded for deeper query planning ergonomics. Confidence: high for local adapter baseline. | ELF is not yet stronger on local CLI debug ergonomics; treat qmd as the retrieval-debug reference while keeping ELF's service/provenance model. | | claude-mem | `rw.operator-continuity`, `rw.resume-evidence`, `rw.retrieval-debug` | Progressive-disclosure search, auto-capture hooks, local viewer, and observation/timeline workflows are directly aligned with real agent resumption jobs. | Exercise a real local repository with hook-driven capture, then evaluate `search -> timeline -> observations` behavior after restart; do not rely on mocked storage. | Docs-grounded for progressive disclosure/viewer; current benchmark adapter evidence is incomplete/wrong-result and mostly not encoded for lifecycle. Confidence: medium for product reference, low for current adapter claims. | ELF has stronger provenance and service boundaries, but claude-mem remains a reference for operator workflow and progressive disclosure UX. | -| mem0 / OpenMemory | `rw.lifecycle-staleness`, `rw.graph-temporal`, `rw.operator-continuity`, `rw.resume-evidence` | Entity-scoped memory, memory history, expiration, hosted/OSS surfaces, OpenMemory UI, and optional graph memory make it the broadest lifecycle and ecosystem comparison target. | Separate OSS local FastEmbed/Qdrant evidence from hosted Platform claims; prove add/update/delete/history, entity-scoped retrieval, expiration exclusion, OpenMemory UI readback, and optional graph context on the same corpus. | Docs-grounded for lifecycle/entity/graph/UI claims; current local adapter is incomplete/wrong-result for same-corpus retrieval and delete remains not encoded. Confidence: medium for suite fit, low for current adapter quality. | ELF is stronger on deterministic evidence-bound writes; mem0/OpenMemory is the reference for ecosystem reach, entity-scoped history, hosted option, and optional graph UX. | -| memsearch | `rw.lifecycle-staleness`, `rw.retrieval-debug`, `rw.resume-evidence` | Markdown as canonical memory plus incremental/content-addressed reindexing is a useful model for source transparency and rebuildable derived indexes. | Index a real-world Markdown corpus, mutate/delete files, rerun index/search from fresh processes, and record Milvus mode so Lite/Server/Cloud behavior is not conflated. | Docs-grounded for architecture; current adapter is incomplete/invalid-result, so no pass/fail quality claim is allowed. Confidence: medium for design pattern, low for current adapter evidence. | ELF already owns source-of-truth plus rebuildable index at service level; memsearch remains a reference for simple local canonical-store ergonomics. | +| mem0 / OpenMemory | `rw.lifecycle-staleness`, `rw.graph-temporal`, `rw.operator-continuity`, `rw.resume-evidence` | Entity-scoped memory, memory history, expiration, hosted/OSS surfaces, OpenMemory UI, and optional graph memory make it the broadest lifecycle and ecosystem comparison target. | Separate OSS local FastEmbed/Qdrant evidence from hosted Platform claims; prove add/update/delete/history, entity-scoped retrieval, expiration exclusion, OpenMemory UI readback, and optional graph context on the same corpus. | Benchmark-grounded for scoped local OSS same-corpus retrieval, update/delete/reload, history, entity filters, local `get_all` readback, and deletion audit; OpenMemory product UI/export remains blocked, hosted Platform is a non-goal, and optional graph plus broader prompt coverage remain not encoded. Confidence: medium for suite fit and scoped local adapter quality, low for product UI/hosted/graph claims. | ELF is stronger on deterministic evidence-bound writes; mem0/OpenMemory remains the reference for ecosystem reach, entity-scoped history, hosted option, and optional graph UX, with local preference-correction history currently measured as an ELF loss. | +| memsearch | `rw.lifecycle-staleness`, `rw.retrieval-debug`, `rw.resume-evidence` | Markdown as canonical memory plus incremental/content-addressed reindexing is a useful model for source transparency and rebuildable derived indexes. | Index a real-world Markdown corpus, mutate/delete files, rerun index/search from fresh processes, and record Milvus mode so Lite/Server/Cloud behavior is not conflated. | Benchmark-grounded for local same-corpus retrieval, reindex/update/delete, and cold-start reload smoke; no real-world prompt adapter is encoded, so Markdown-first behavior remains baseline scenario evidence rather than suite pass evidence. Confidence: medium for design pattern and scoped local adapter evidence, low for broad real-world adapter coverage. | ELF already owns source-of-truth plus rebuildable index at service level; memsearch remains a reference for simple local canonical-store ergonomics and transparent local reindexing. | | OpenViking | `rw.context-trajectory`, `rw.resume-evidence`, `rw.retrieval-debug` | `viking://` context organization, intent analysis, hierarchical retrieval, staged find/search behavior, and session compression are relevant to multi-hop agent context jobs. | Use the pinned Docker local embedding path, then evaluate `add_resource`/`find`/`search` over multi-stage jobs with stage output, hierarchy, and session memory evidence. | Docs-grounded for mechanism; current benchmark adapter reaches local embedding setup and `add_resource`/`find`, but remains `wrong_result` because same-corpus evidence terms are missed. Confidence: medium for architecture reference, low for runnable adapter quality. | ELF has first-class traces and evidence-bound notes, but OpenViking is the reference for hierarchical context trajectory and filesystem-like organization. | | llm-wiki | `rw.knowledge-synthesis`, `rw.resume-evidence` | Query/save/lint flows and topic-scoped wiki pages are a useful reference for turning retrieved memory into maintained project knowledge. | Run a corpus-to-wiki job, ask resume/decision questions, require page citations back to source memory, then mutate a stale source and prove lint/repair catches it. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for derived-knowledge fit. | ELF is not yet stronger on derived knowledge pages; llm-wiki should inform rebuildable, evidence-cited dossiers rather than core storage. | | gbrain | `rw.knowledge-synthesis`, `rw.operator-continuity` | `compiled_truth`, timeline sections, backlinks, primary-home routing, and enrichment workflows model a living operational brain for project work. | Build or update pages from the real-world corpus, require current-truth plus timeline answers, and prove enrichment/backlink maintenance does not hide unsupported claims. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for operator knowledge UX. | ELF should keep source notes authoritative; gbrain is a reference for presentation, enrichment, and maintenance loops. | diff --git a/docs/guide/research/research_projects_inventory.md b/docs/guide/research/research_projects_inventory.md index 2f1cb9c0..be322238 100644 --- a/docs/guide/research/research_projects_inventory.md +++ b/docs/guide/research/research_projects_inventory.md @@ -31,7 +31,7 @@ Last updated: June 11, 2026. | [gbrain](https://github.com/garrytan/gbrain) | D1 | Reviewed; XY-882 verdict `blocked` | `rw.knowledge-synthesis`, `rw.operator-continuity` | Operational knowledge brain, `compiled_truth` + timeline pages, enrichment and maintenance loops; blocked on Docker-local brain repo and database proof | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | | [Always-On Memory Agent](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent) | D1 | Reviewed | `rw.consolidation-review`, `rw.operator-continuity` | Always-on multimodal ingest + scheduled consolidation loop with simple local ops surface | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | | [graphify](https://github.com/safishamsi/graphify) | D1 | Reviewed; XY-882 verdict `adapter_candidate`; XY-889 adds Docker graph/report smoke | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Multimodal graph compression, deterministic code extraction, and graph/report outputs with source-file/source-location references; current ELF evidence is a generated-corpus Docker smoke, not broad graph-quality proof | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | -| [Letta](https://github.com/letta-ai/letta) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.core-archival`, `rw.operator-continuity` | Core vs archival memory split, shared blocks; not an implementation candidate until a supported contained server path can export evidence | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [Letta](https://github.com/letta-ai/letta) | D1 | Reviewed; XY-882 verdict `research_only`; XY-927 selects blocked contained export/readback path | `rw.core-archival`, `rw.operator-continuity` | Core vs archival memory split, shared blocks; compare only after a Docker-only benchmark-created agent export returns core block JSON, archival readback JSON, and source ids | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | | [LangGraph](https://docs.langchain.com/oss/python/langgraph/persistence) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.replay-regression`, `rw.resume-evidence` | Checkpoint/replay mindset for quality regression workflows; not a standalone memory backend adapter | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | | [Graphiti / Zep](https://help.getzep.com/graphiti/core-concepts/temporal-awareness) | D1 | Reviewed; XY-882 verdict `adapter_candidate` | `rw.graph-temporal`, `rw.resume-evidence` | Temporal fact validity model with Docker-local graph-store options and UUID/fact/validity-window output | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | | [nanograph](https://github.com/nanograph/nanograph) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema + typed query ergonomics for graph-lite developer experience; official shape is no server/no Docker | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 7bb448bd..5d4aa7ad 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and Letta core-vs-archival memory plus graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export remains blocked and claude-mem viewer workflows remain blocked until Docker-contained hook/viewer evidence exists. XY-925 adds fixture-backed first-generation OSS prompt coverage and typed blockers for agentmemory durable continuity, memsearch Markdown source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator surfaces without creating live external real-world suite passes. XY-933 adds an ELF live capture/write-policy self-check, but agentmemory and claude-mem hook-capture breadth remain blocked until Docker-contained hook/viewer evidence exists." + "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. XY-927 adds six ELF fixture-backed core_archival_memory jobs, but Letta scenario rows remain blocked or not_tested until the selected contained export/readback path exists. XY-925 adds fixture-backed first-generation OSS prompt coverage and typed blockers for agentmemory durable continuity, memsearch Markdown source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator surfaces without creating live external real-world suite passes. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export remains blocked and claude-mem viewer workflows remain blocked until Docker-contained hook/viewer evidence exists. XY-933 adds an ELF live capture/write-policy self-check, but agentmemory and claude-mem hook-capture breadth remain blocked until Docker-contained hook/viewer evidence exists." ] }, "evidence_class_terms": [ @@ -39,7 +39,12 @@ { "command": "cargo make real-world-memory", "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "claim": "ELF fixture aggregate covers 43 jobs across 12 suites with 38 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates." + "claim": "ELF fixture aggregate covers 49 jobs across 13 suites with 44 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates, including 6 passing core_archival_memory jobs." + }, + { + "command": "cargo make real-world-memory-core-archival", + "artifact": "tmp/real-world-memory/core-archival/report.json", + "claim": "ELF core_archival_memory fixture coverage scores core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search." }, { "command": "cargo make real-world-memory-live-adapters", @@ -142,14 +147,14 @@ "research_gate", "not_encoded" ], - "measured_claim": "ELF and qmd both pass encoded project_decisions jobs. Letta-style core/archival decision memory is not tested.", + "measured_claim": "ELF and qmd both pass encoded project_decisions jobs. The new ELF core_archival_memory fixture also scores project-decision recovery through core routing plus archival rationale, but Letta-style comparison remains blocked without contained export evidence.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" ], "follow_up_issues": [ "XY-927" ], - "caveat": "No Letta comparison exists until a contained export path is selected." + "caveat": "No Letta comparison exists until the selected contained export/readback path produces source-id-mapped evidence." }, { "scenario_id": "retrieval_quality", @@ -377,20 +382,24 @@ { "scenario_id": "core_vs_archival_memory", "title": "Core-vs-archival memory", - "outcome": "not_tested", + "outcome": "blocked", "evidence_classes": [ + "fixture_backed", "research_gate", + "blocked", "not_encoded" ], - "measured_claim": "ELF has core block semantics in the service contract, but comparable core-vs-archival benchmark jobs and a contained Letta export path are not encoded.", + "measured_claim": "ELF now has 6 fixture-backed core_archival_memory jobs that score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search. Letta remains blocked or not_tested until its contained export/readback artifact maps core and archival source ids.", "command_artifacts": [ "docs/spec/system_elf_memory_service_v2.md", - "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + "apps/elf-eval/fixtures/real_world_memory/core_archival_memory", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", + "tmp/real-world-memory/core-archival/report.json" ], "follow_up_issues": [ "XY-927" ], - "caveat": "No ELF-over-Letta claim is allowed." + "caveat": "No ELF-over-Letta claim is allowed; the selected Letta path must export core block JSON, archival search/readback JSON, and source ids before scoring." }, { "scenario_id": "graph_rag_navigation_citations", @@ -453,8 +462,8 @@ { "issue": "XY-927", "priority": "P1", - "state": "Backlog", - "gap": "Letta-style core-vs-archival memory comparison." + "state": "Fixture encoded; Letta export blocked", + "gap": "ELF core_archival_memory fixture coverage is encoded; a contained Letta export/readback adapter remains future work before win/tie/loss claims." }, { "issue": "XY-928", @@ -486,6 +495,7 @@ "ELF is adoptable for bounded personal production use with caveats.", "ELF has the strongest measured source-of-truth, rebuild, restore, and backfill evidence among the tracked systems.", "ELF ties qmd on encoded live retrieval, work_resume, project_decisions, and personalization slices.", + "ELF fixture-backed core_archival_memory coverage passes attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery jobs separately from archival search.", "ELF has a live temporal reconciliation loss against the benchmark expectation: five memory_evolution jobs remain wrong_result.", "Most competitor strengths outside qmd retrieval are not_tested, blocked, smoke_only, or research_gate.", "ELF has a narrow live operator-debug win over qmd for trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence, with replay-command availability and repair-action clarity tied.", diff --git a/docs/research/2026-06-11-measurement-coverage-audit.json b/docs/research/2026-06-11-measurement-coverage-audit.json index fd210705..ff2405b1 100644 --- a/docs/research/2026-06-11-measurement-coverage-audit.json +++ b/docs/research/2026-06-11-measurement-coverage-audit.json @@ -1,10 +1,16 @@ { "schema": "elf.benchmark_measurement_coverage_audit/v2", "run_id": "2026-06-11-measurement-coverage-audit", - "source_revision": "current XY-928 lane rebased after live capture/write-policy scoring", + "source_revision": "current benchmark lane after XY-927 core-vs-archival fixture coverage, XY-928 context-trajectory blocked fixtures, and XY-933 live capture/write-policy scoring", "created_at": "2026-06-11", "scope": "ELF memory-system competitiveness measurement coverage, external competitor comparison evidence, and next report directions", "commands": [ + { + "command": "cargo make real-world-memory-core-archival", + "status": "pass", + "runtime_seconds": 12.14, + "artifact": "tmp/real-world-memory/core-archival/report.json" + }, { "command": "cargo make real-world-memory", "status": "pass", @@ -19,21 +25,21 @@ } ], "fixture_aggregate": { - "job_count": 43, - "encoded_suite_count": 12, - "pass": 38, + "job_count": 49, + "encoded_suite_count": 13, + "pass": 44, "wrong_result": 0, "lifecycle_fail": 0, "incomplete": 0, "blocked": 5, "not_encoded": 0, "unsupported_claim": 0, - "mean_score": 0.884, + "mean_score": 0.898, "mean_latency_ms": 3.94, - "expected_evidence_total": 89, - "expected_evidence_matched": 89, - "evidence_required_count": 97, - "evidence_covered_count": 97 + "expected_evidence_total": 100, + "expected_evidence_matched": 100, + "evidence_required_count": 111, + "evidence_covered_count": 111 }, "live_real_world_adapters": [ { @@ -197,8 +203,8 @@ "pass": 4, "wrong_result": 6, "lifecycle_fail": 1, - "blocked": 6, - "not_encoded": 6 + "blocked": 7, + "not_encoded": 5 }, "xy900_update_note": "XY-900 promotes graphify from research_gate/blocked to a tiny scored live_real_world wrong_result smoke; broad graph/RAG quality remains unproven.", "xy932_update_note": "XY-932 adds narrow ELF/qmd operator-debug live_real_world records: ELF pass and qmd wrong_result for trace hydration/candidate-drop visibility, with OpenMemory and claude-mem UI still unmeasured.", @@ -214,7 +220,7 @@ "OpenViking_context_trajectory", "mem0_OpenMemory_entity_history_ui", "agentmemory_claude_mem_capture_breadth", - "Letta_core_vs_archival_memory", + "Letta_core_vs_archival_export_path", "Graphiti_Zep_temporal_graph", "RAG_graph_navigation", "llm_wiki_gbrain_graphify_knowledge_workflows" diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index 7233bf66..59e5a19f 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -8,6 +8,8 @@ "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md", "docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md", "docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md", + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", "docs/guide/research/external_memory_improvement_plan.md", "docs/guide/research/research_projects_inventory.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", @@ -30,8 +32,8 @@ }, "overall_status_counts": { "lifecycle_fail": 1, - "blocked": 6, - "not_encoded": 6, + "blocked": 7, + "not_encoded": 5, "pass": 4, "wrong_result": 6 } @@ -313,17 +315,17 @@ "supporting_evidence_classes": [ "research_gate" ], - "measured_status": "not_encoded", + "measured_status": "blocked", "proof": { - "command": null, - "artifact": "docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json" + "command": "blocked until a Docker-only benchmark-created agent export is implemented", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" }, "unsupported_or_blocked_status": { "state": "blocked", - "typed_reason": "contained_evidence_export_path_not_selected", - "details": "Research-only until a supported contained server path can export core/archival evidence without relying on unsupported setup." + "typed_reason": "contained_export_readback_artifact_missing", + "details": "The selected contract requires a benchmark-created Letta agent export with core block JSON, archival search/readback JSON, and source ids before any scenario claim can be scored." }, - "benchmark_before_claim": "Select a contained evidence export contract, then encode core-vs-archival memory, personalization, and project-decision jobs.", + "benchmark_before_claim": "Implement and run the contained export/readback adapter before any Letta win, tie, or loss claim; keep personalization and project-decision scenarios blocked or not tested until that evidence exists.", "borrow_if_stronger": "Borrow explicit core memory block ergonomics, archival separation, and shared operating context readback." }, { @@ -449,11 +451,11 @@ { "scenario_id": "project_decisions", "scenario": "project decisions", - "current_elf_evidence": "ELF fixture-backed and live_real_world project_decisions suites pass.", + "current_elf_evidence": "ELF fixture-backed and live_real_world project_decisions suites pass; the ELF core_archival_memory fixture also scores project-decision recovery through core routing plus archival rationale.", "strongest_competitor_or_reference": "qmd, Letta", - "current_competitor_evidence": "qmd live_real_world project_decisions passes; Letta project_decisions is research_gate not_encoded.", - "current_state": "ELF and qmd are the only measured live competitors for this scenario.", - "next_measurement": "Add core/archival decision-memory jobs for Letta only after a contained export path exists; otherwise keep Letta as design reference." + "current_competitor_evidence": "qmd live_real_world project_decisions passes; Letta project-decision recovery is research_gate not_tested or blocked until the contained export path exists.", + "current_state": "ELF and qmd are the only measured live competitors for this scenario; Letta remains a product-reference comparison target.", + "next_measurement": "Run the Letta core/archival export/readback contract before treating project-decision recovery as a comparable scenario." }, { "scenario_id": "source_of_truth", @@ -539,11 +541,11 @@ { "scenario_id": "core_vs_archival_memory", "scenario": "core-vs-archival memory", - "current_elf_evidence": "ELF spec and admin surfaces define core blocks, but comparative benchmark coverage is not yet encoded here.", + "current_elf_evidence": "ELF fixture core_archival_memory passes 6/6 and scores core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search.", "strongest_competitor_or_reference": "Letta", - "current_competitor_evidence": "Letta is research_gate not_encoded until a contained evidence export path is selected.", - "current_state": "Scenario is a product gap measurement target, not a current win/loss surface.", - "next_measurement": "Add core-block versus archival-search jobs for ELF and only compare Letta after contained export proof exists." + "current_competitor_evidence": "Letta is research_gate blocked/not_tested until the selected contained export/readback artifact exists.", + "current_state": "ELF has fixture-only core-block evidence; Letta remains unscored, so no win, tie, or loss claim is allowed.", + "next_measurement": "Implement the Letta export/readback adapter, then compare only scenarios whose core block JSON, archival search/readback JSON, and source ids are present." }, { "scenario_id": "graph_rag_navigation", @@ -649,10 +651,10 @@ }, { "workstream": "Letta, LangGraph, nanograph, llm-wiki direct adapters", - "issue_or_candidate": "research-only until output contract", + "issue_or_candidate": "Letta export artifact blocked; others research-only until output contract", "parallelizable": false, - "blocked_by": "Contained evidence export or non-memory-backend comparability contract.", - "measurement": "Only run after each has a comparable output contract; otherwise treat as product-reference evidence." + "blocked_by": "Letta needs the selected contained export/readback artifact; the others need a non-memory-backend comparability contract.", + "measurement": "Only run after comparable output exists; otherwise treat as product-reference evidence." } ] } diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index cfa15fed..059a14d8 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -190,12 +190,14 @@ Each `adapters[]` record MUST include: optional `suite_id`, `status`, `elf_position`, optional `comparison_outcome`, `evidence`, and optional `command` and `artifact`. `elf_position` MUST be one of `wins`, `ties`, `loses`, or `untested`. `comparison_outcome`, when present, MUST be - one of `win`, `tie`, `loss`, `not_tested`, `blocked`, or `non_goal`. Reports SHOULD - derive `comparison_outcome` from `elf_position` when omitted, but SHOULD use the - explicit field for scenarios where the legacy ELF-relative position is less precise - than the report outcome. Scenario judgments are report inputs for dimension-level - comparison; they MUST NOT convert live-baseline-only evidence into real-world suite - pass claims. + one of `win`, `tie`, `loss`, `not_tested`, `blocked`, or `non_goal`. Scenario rows + with `status = "blocked"` MUST set `comparison_outcome = "blocked"` explicitly so a + blocked evidence path is not derived from `elf_position = "untested"` as + `not_tested`. Reports SHOULD derive `comparison_outcome` from `elf_position` when + omitted for non-blocked rows, but SHOULD use the explicit field for scenarios where + the legacy ELF-relative position is less precise than the report outcome. Scenario + judgments are report inputs for dimension-level comparison; they MUST NOT convert + live-baseline-only evidence into real-world suite pass claims. - `evidence`: array of evidence pointers with `kind`, `ref`, and `status`. - `notes`: optional bounded explanatory strings. - `follow_up`: optional `title` and `reason`. @@ -537,6 +539,7 @@ Suite ids are stable public names. Each suite MUST contain at least one | `capture_integration` | Evaluate how accurately work observations become usable memory across agents and tools. | Capture a session decision; exclude private spans; import external agent observations. | Hook/import logs, write policy audits, excluded spans, resulting note ids. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior. | agentmemory, claude-mem, memsearch, mem0. | | `production_ops` | Prove safe operation under backup, restore, backfill, cold start, resource, and credential boundaries. | Resume interrupted import; restore from backup; report missing private manifest as bounded caveat. | Command/report artifacts, resource envelope, checkpoint state, failure guard evidence. | lifecycle_behavior, latency_resource, uncertainty_handling, evidence_grounding. | ELF, qmd, memsearch, LangGraph. | | `personalization` | Apply user/project preferences correctly without leaking across scopes or overfitting stale preferences. | Remember preferred response style; avoid using another project tenant's note; update a preference. | Scoped memory ids, preference versions, tenant/project/agent context, negative cross-scope traps. | personalization_fit, trap_avoidance, evidence_grounding, answer_correctness. | mem0, Letta, agentmemory, ELF. | +| `core_archival_memory` | Verify always-loaded core memory behavior separately from archival note search and derived retrieval indexes. | Read an attached core block; enforce core block scope; detect stale core state from archival evidence; fall back to archival notes; recover a decision from core routing plus archival rationale. | Core block ids, attachment ids, read_profile/scope metadata, source_ref and audit history, archival note evidence ids, stale-core traps, and explicit no-Qdrant-core-block boundary evidence. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior, workflow_helpfulness. | Letta, ELF. | | `context_trajectory` | Measure staged context trajectory, hierarchy selection, and recursive/context expansion without converting setup or retrieval preconditions into trajectory wins. | Explain whether a staged trajectory can be scored; identify selected hierarchy nodes; report recursive expansion paths and pruned branches. | Same-corpus expected evidence ids, matched/missing evidence ids, stage artifacts, selected hierarchy nodes, expansion paths, comparable ELF trace/session artifacts when a comparison is claimed. | answer_correctness, evidence_grounding, trap_avoidance, debuggability, workflow_helpfulness. | OpenViking, ELF, qmd. | ## Report Semantics