From 3660a827413aa9d747072f7fdb6d0336dbd4519d Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 16 Jun 2026 17:11:29 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add reviewable memory summary source trace contract","authority":"XY-952"} --- Makefile.toml | 52 ++ README.md | 14 +- .../memory_projects_manifest.json | 2 +- .../reviewable_summary_source_trace.json | 589 ++++++++++++ .../src/bin/real_world_job_benchmark.rs | 879 +++++++++++++++++- .../tests/real_world_job_benchmark.rs | 361 ++++++- ...-11-competitor-strength-adoption-report.md | 3 +- ...-11-competitor-strength-evidence-matrix.md | 12 +- ...on-direction-from-competitor-benchmarks.md | 18 +- ...6-06-16-dreaming-readiness-stage-ledger.md | 27 +- .../real_world_agent_memory_benchmark.md | 15 +- ...1-competitor-strength-adoption-report.json | 7 +- ...06-16-dreaming-readiness-stage-ledger.json | 28 +- docs/spec/index.md | 2 + .../real_world_agent_memory_benchmark_v1.md | 19 + docs/spec/system_elf_memory_service_v2.md | 17 + docs/spec/system_memory_summary_v1.md | 171 ++++ 17 files changed, 2105 insertions(+), 111 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/memory_summary/reviewable_summary_source_trace.json create mode 100644 docs/spec/system_memory_summary_v1.md diff --git a/Makefile.toml b/Makefile.toml index 6e8e6c56..1cc9d93b 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -418,6 +418,9 @@ args = [ # | real-world-memory-consolidation | composite | | # | real-world-memory-consolidation-json | command | | # | real-world-memory-consolidation-report | command | | +# | real-world-memory-summary | composite | | +# | real-world-memory-summary-json | command | | +# | real-world-memory-summary-report | command | | # | real-world-memory-live-consolidation | command | | # | real-world-job-operator-ux | composite | | # | real-world-job-operator-ux-json | command | | @@ -831,6 +834,55 @@ args = [ "tmp/real-world-memory/consolidation/report.md", ] 
+[tasks.real-world-memory-summary] +workspace = false +dependencies = [ + "real-world-memory-summary-report", +] + +[tasks.real-world-memory-summary-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/memory_summary", + "--out", + "tmp/real-world-memory/memory-summary/report.json", + "--run-id", + "real-world-memory-summary", + "--adapter-id", + "fixture_memory_summary", + "--adapter-name", + "ELF memory summary fixture", +] + +[tasks.real-world-memory-summary-report] +workspace = false +dependencies = [ + "real-world-memory-summary-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/memory-summary/report.json", + "--out", + "tmp/real-world-memory/memory-summary/report.md", +] + [tasks.real-world-memory-live-consolidation] workspace = false command = "bash" diff --git a/README.md b/README.md index aa3b0350..982fb341 100644 --- a/README.md +++ b/README.md @@ -152,15 +152,17 @@ provider-backed ELF evidence was required. its pinned Docker local embedding path and is reported as `wrong_result` when same-corpus evidence terms are missed; claude-mem and OpenViking non-retrieval coverage remain typed non-pass states. -- Real-world agent memory aggregate after XY-927 and XY-928: 49 fixture-backed - jobs across 13 suites, 44 pass, 0 incomplete, 5 blocked, 0 wrong-result, +- Real-world agent memory aggregate after XY-952: 50 fixture-backed + jobs across 14 suites, 45 pass, 0 incomplete, 5 blocked, 0 wrong-result, 0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are production-ops operator boundaries plus blocked OpenViking staged trajectory, hierarchy selection, and recursive/context expansion measurement gates, not - hidden benchmark wins. 
The new `core_archival_memory` suite passes 6 fixture - jobs for core block attachment, scope, provenance, stale-core detection, - archival fallback, and project-decision recovery; it does not create an - ELF-over-Letta claim. + hidden benchmark wins. The `core_archival_memory` suite passes 6 fixture jobs for + core block attachment, scope, provenance, stale-core detection, archival fallback, + and project-decision recovery; it does not create an ELF-over-Letta claim. The new + `memory_summary` fixture passes 1 source-trace job for reviewable top-of-mind, + background, stale, superseded, tombstoned, and derived project-profile entries; it + does not create a managed-memory parity claim. - Full-suite live real-world adapter sweep after XY-926: ELF and qmd emit Docker-isolated `live_real_world` records for all 55 checked-in jobs across 13 suites through `cargo make real-world-memory-live-adapters`. Both keep the original diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index f5ccdf80..f4286e24 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -29,7 +29,7 @@ }, "run": { "status": "blocked", - "evidence": "The current fixture set reports 49 jobs across 13 suites: 44 pass, 0 incomplete, 5 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", + "evidence": "The current fixture set reports 50 jobs across 14 suites: 45 pass, 0 incomplete, 5 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. 
The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", "command": "cargo make real-world-memory", "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, diff --git a/apps/elf-eval/fixtures/real_world_memory/memory_summary/reviewable_summary_source_trace.json b/apps/elf-eval/fixtures/real_world_memory/memory_summary/reviewable_summary_source_trace.json new file mode 100644 index 00000000..b7b552ca --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/memory_summary/reviewable_summary_source_trace.json @@ -0,0 +1,589 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-summary-source-trace-001", + "suite": "memory_summary", + "title": "Read back a reviewable current memory summary with source trace", + "corpus": { + "corpus_id": "real-world-memory-summary-2026-06-16", + "profile": "synthetic", + "items": [ + { + "evidence_id": "summary-contract-current", + "kind": "decision", + "text": "Current decision: ELF memory summaries are derived reviewable readback artifacts and must not mutate authoritative source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "summary-contract-current" + }, + "locator": { + "quote": "derived reviewable readback artifacts" + } + }, + "created_at": "2026-06-16T02:00:00Z" + }, + { + "evidence_id": "summary-background-sot", + "kind": "fact", + "text": "Background memory: Postgres remains the source of truth while Qdrant is a rebuildable derived retrieval index.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": 
"summary-background-sot" + }, + "locator": { + "quote": "Postgres remains the source of truth" + } + }, + "created_at": "2026-06-10T09:00:00Z" + }, + { + "evidence_id": "stale-summary-gap", + "kind": "note", + "text": "Stale summary note: memory-summary and top-of-mind behavior are not encoded and should stay not_tested.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "stale-summary-gap" + } + }, + "created_at": "2026-06-15T08:00:00Z" + }, + { + "evidence_id": "xy952-summary-contract", + "kind": "decision", + "text": "XY-952 update: memory-summary and top-of-mind behavior now has a fixture-backed reviewable source-trace contract.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "xy952-summary-contract" + }, + "locator": { + "quote": "fixture-backed reviewable source-trace contract" + } + }, + "created_at": "2026-06-16T02:30:00Z" + }, + { + "evidence_id": "superseded-live-evolution-loss", + "kind": "report", + "text": "Historical report: before XY-905, ELF live memory_evolution had one pass and five wrong_result jobs.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "superseded-live-evolution-loss" + } + }, + "created_at": "2026-06-11T10:00:00Z" + }, + { + "evidence_id": "xy905-live-evolution-pass", + "kind": "report", + "text": "Current report: after XY-905, ELF live memory_evolution passes all six encoded jobs with current, historical, rationale, tombstone, and invalidation evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "xy905-live-evolution-pass" + }, + "locator": { + "quote": 
"passes all six encoded jobs" + } + }, + "created_at": "2026-06-16T02:20:00Z" + }, + { + "evidence_id": "summary-temporary-claim", + "kind": "note", + "text": "Temporary summary claim: publish a managed-memory parity claim from fixture-only summary evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "summary-temporary-claim" + } + }, + "created_at": "2026-06-15T11:00:00Z" + }, + { + "evidence_id": "summary-ttl-tombstone", + "kind": "trace", + "text": "Summary tombstone: the fixture-only managed-memory parity claim expired at 2026-06-16T00:00:00Z and must be excluded from current top-of-mind memory.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "summary-ttl-tombstone" + }, + "locator": { + "quote": "must be excluded from current top-of-mind memory" + } + }, + "created_at": "2026-06-16T00:00:00Z" + }, + { + "evidence_id": "summary-contract-non-parity-boundary", + "kind": "decision", + "text": "Boundary: the local memory-summary contract is not evidence of parity with OpenAI or Anthropic managed memory products.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "summary-contract-non-parity-boundary" + }, + "locator": { + "quote": "not evidence of parity" + } + }, + "created_at": "2026-06-16T02:40:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_memory_summary", + "answer": { + "content": "The reviewable memory summary keeps the current XY-952 source-trace contract top of mind, keeps the Postgres/Qdrant source-of-truth rule as background, downgrades the old not-tested summary gap and pre-XY-905 live loss, preserves the TTL tombstone for the parity claim, and excludes unsupported managed-memory 
parity as a derived project-profile candidate.", + "claims": [ + { + "claim_id": "summary_contract_reviewable", + "text": "The memory summary is a derived reviewable readback artifact and must not mutate authoritative notes.", + "evidence_ids": ["summary-contract-current"], + "confidence": "high" + }, + { + "claim_id": "summary_stage_now_fixture_backed", + "text": "The memory-summary stage now has a fixture-backed reviewable source-trace contract.", + "evidence_ids": ["xy952-summary-contract"], + "confidence": "high" + }, + { + "claim_id": "summary_preserves_tombstone", + "text": "The expired managed-memory parity claim is excluded from current top-of-mind memory.", + "evidence_ids": ["summary-ttl-tombstone"], + "confidence": "high" + }, + { + "claim_id": "summary_excludes_unsupported_parity", + "text": "The local memory-summary contract is not evidence of parity with managed memory products.", + "evidence_ids": ["summary-contract-non-parity-boundary"], + "confidence": "high" + } + ], + "evidence_ids": [ + "summary-contract-current", + "xy952-summary-contract", + "summary-ttl-tombstone", + "summary-contract-non-parity-boundary" + ], + "memory_summaries": [ + { + "summary_id": "summary-xy952-reviewable-memory", + "contract_schema": "elf.memory_summary/v1", + "generated_at": "2026-06-16T03:00:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-952-fixture-agent", + "read_profile": "private_plus_project", + "entries": [ + { + "entry_id": "top-xy952-contract", + "category": "top_of_mind", + "text": "Memory summaries now use a reviewable source-trace contract.", + "source_refs": ["xy952-summary-contract"], + "freshness": { + "status": "current", + "observed_at": "2026-06-16T02:30:00Z", + "valid_from": "2026-06-16T02:30:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-16T03:00:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "rationale": { + "decision": "included", + "reason_code": 
"TOP_OF_MIND_CURRENT_REVIEWABLE_SUMMARY_CONTRACT", + "reason": "The current issue lane is adding the summary/source-trace contract and benchmark guard." + }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "background-source-truth", + "category": "background", + "text": "Postgres remains authoritative while Qdrant remains a rebuildable derived index.", + "source_refs": ["summary-background-sot"], + "freshness": { + "status": "background", + "observed_at": "2026-06-10T09:00:00Z", + "valid_from": "2026-06-10T09:00:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-16T03:00:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "rationale": { + "decision": "included", + "reason_code": "BACKGROUND_STABLE_SOURCE_OF_TRUTH_BOUNDARY", + "reason": "The source-of-truth boundary is stable context, not urgent top-of-mind work." + }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "stale-summary-not-tested", + "category": "stale", + "text": "The old memory-summary stage state was not_tested before XY-952.", + "source_refs": ["stale-summary-gap"], + "freshness": { + "status": "stale", + "observed_at": "2026-06-15T08:00:00Z", + "valid_from": "2026-06-15T08:00:00Z", + "valid_to": "2026-06-16T02:30:00Z", + "last_confirmed_at": "2026-06-15T08:00:00Z", + "superseded_by": ["xy952-summary-contract"], + "tombstone_refs": [] + }, + "rationale": { + "decision": "downgraded", + "reason_code": "DOWNGRADED_STALE_SUMMARY_STAGE_REPLACED", + "reason": "XY-952 adds a fixture-backed contract, so the earlier not_tested state is history." 
+ }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "superseded-live-evolution-loss", + "category": "superseded", + "text": "The pre-XY-905 live memory_evolution loss is historical.", + "source_refs": ["superseded-live-evolution-loss"], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-11T10:00:00Z", + "valid_from": "2026-06-11T10:00:00Z", + "valid_to": "2026-06-16T02:20:00Z", + "last_confirmed_at": "2026-06-11T10:00:00Z", + "superseded_by": ["xy905-live-evolution-pass"], + "tombstone_refs": [] + }, + "rationale": { + "decision": "downgraded", + "reason_code": "SUPERSEDED_BY_XY905_LIVE_RECONCILIATION", + "reason": "The XY-905 report superseded the older live memory_evolution wrong_result state." + }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "tombstone-managed-parity-claim", + "category": "tombstone", + "text": "The fixture-only managed-memory parity claim is tombstoned and excluded.", + "source_refs": ["summary-ttl-tombstone"], + "freshness": { + "status": "tombstoned", + "observed_at": "2026-06-16T00:00:00Z", + "valid_from": "2026-06-15T11:00:00Z", + "valid_to": "2026-06-16T00:00:00Z", + "last_confirmed_at": "2026-06-16T00:00:00Z", + "superseded_by": [], + "tombstone_refs": ["summary-ttl-tombstone"] + }, + "rationale": { + "decision": "excluded", + "reason_code": "TOMBSTONE_TTL_INVALIDATED_PARITY_CLAIM", + "reason": "The tombstone says the parity claim expired and must not appear as current top-of-mind memory." 
+ }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "derived-project-profile-summary-boundary", + "category": "derived_project_profile", + "text": "Project profile: ELF summaries are reviewable derived readback, not authoritative notes.", + "source_refs": ["summary-contract-current", "summary-background-sot"], + "freshness": { + "status": "current", + "observed_at": "2026-06-16T02:00:00Z", + "valid_from": "2026-06-16T02:00:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-16T03:00:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "rationale": { + "decision": "included", + "reason_code": "DERIVED_PROFILE_SOURCE_BACKED_BOUNDARY", + "reason": "The derived project profile is source-backed and labels summaries as non-authoritative." + }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "derived-project-profile-parity-excluded", + "category": "derived_project_profile", + "text": "Excluded candidate: the local summary contract proves parity with managed memory products.", + "source_refs": [], + "freshness": { + "status": "unsupported", + "observed_at": "2026-06-16T03:00:00Z", + "valid_from": null, + "valid_to": null, + "last_confirmed_at": null, + "superseded_by": [], + "tombstone_refs": [] + }, + "rationale": { + "decision": "excluded", + "reason_code": "EXCLUDED_UNSUPPORTED_MANAGED_MEMORY_PARITY", + "reason": "The local contract is not comparable live evidence for OpenAI or Anthropic managed memory products." 
+ }, + "unsupported_claim_flags": [ + { + "claim_id": "managed_memory_parity", + "message": "No comparable live managed-memory runner exists for this lane.", + "source": { + "evidence_id": "summary-contract-non-parity-boundary" + } + } + ] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "xy952-summary-contract", + "status": "active", + "reason": "current top-of-mind contract evidence" + }, + { + "evidence_id": "summary-background-sot", + "status": "active", + "reason": "stable background source-of-truth evidence" + } + ], + "dropped_source_refs": [ + { + "evidence_id": "summary-temporary-claim", + "status": "expired", + "reason": "tombstoned parity claim" + } + ], + "stale_source_refs": [ + { + "evidence_id": "stale-summary-gap", + "status": "stale", + "reason": "superseded by XY-952 fixture-backed contract", + "superseded_by": "xy952-summary-contract" + } + ], + "superseded_source_refs": [ + { + "evidence_id": "superseded-live-evolution-loss", + "status": "superseded", + "reason": "XY-905 live report superseded the old loss", + "superseded_by": "xy905-live-evolution-pass" + } + ], + "tombstone_source_refs": [ + { + "evidence_id": "summary-ttl-tombstone", + "status": "tombstoned", + "reason": "TTL invalidation suppresses the parity claim" + } + ], + "unsupported_claim_flags": [ + { + "claim_id": "managed_memory_parity", + "message": "Fixture-backed contract evidence is not managed-memory parity evidence." + } + ] + } + } + ], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "summary-gap-recorded", + "ts": "2026-06-15T08:00:00Z", + "actor": "agent", + "action": "recorded_not_tested_stage", + "evidence_ids": ["stale-summary-gap"], + "summary": "The stage ledger recorded memory summary behavior as not_tested." 
+ }, + { + "event_id": "temporary-parity-claim-expired", + "ts": "2026-06-16T00:00:00Z", + "actor": "worker", + "action": "ttl_invalidated_claim", + "evidence_ids": ["summary-ttl-tombstone"], + "summary": "The temporary parity claim was tombstoned." + }, + { + "event_id": "xy952-contract-recorded", + "ts": "2026-06-16T02:30:00Z", + "actor": "agent", + "action": "recorded_summary_contract", + "evidence_ids": ["xy952-summary-contract"], + "summary": "The summary/source-trace contract became fixture-backed." + } + ], + "prompt": { + "role": "user", + "content": "Show the current memory summary surface and explain why stale, tombstoned, and unsupported derived memories are not top-of-mind current facts.", + "job_mode": "summary_readback", + "constraints": [ + "cite_evidence", + "preserve_current_vs_historical_truth", + "expose_source_trace", + "do_not_claim_managed_memory_parity" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "summary_contract_reviewable", + "text": "The memory summary is a derived reviewable readback artifact and must not mutate authoritative notes." + }, + { + "claim_id": "summary_stage_now_fixture_backed", + "text": "The memory-summary stage now has a fixture-backed reviewable source-trace contract." + }, + { + "claim_id": "summary_preserves_tombstone", + "text": "The expired managed-memory parity claim is excluded from current top-of-mind memory." + }, + { + "claim_id": "summary_excludes_unsupported_parity", + "text": "The local memory-summary contract is not evidence of parity with managed memory products." 
+ } + ], + "must_not_include": [ + "ELF has parity with managed memory products.", + "memory summaries are authoritative source notes", + "memory-summary and top-of-mind behavior are not encoded and should stay not_tested" + ], + "evidence_links": { + "summary_contract_reviewable": ["summary-contract-current"], + "summary_stage_now_fixture_backed": ["xy952-summary-contract"], + "summary_preserves_tombstone": ["summary-ttl-tombstone"], + "summary_excludes_unsupported_parity": ["summary-contract-non-parity-boundary"] + }, + "answer_type": "reviewable_memory_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "summary-contract-current", + "claim_id": "summary_contract_reviewable", + "requirement": "cite", + "quote": "derived reviewable readback artifacts" + }, + { + "evidence_id": "xy952-summary-contract", + "claim_id": "summary_stage_now_fixture_backed", + "requirement": "cite", + "quote": "fixture-backed reviewable source-trace contract" + }, + { + "evidence_id": "summary-ttl-tombstone", + "claim_id": "summary_preserves_tombstone", + "requirement": "cite", + "quote": "must be excluded from current top-of-mind memory" + }, + { + "evidence_id": "summary-contract-non-parity-boundary", + "claim_id": "summary_excludes_unsupported_parity", + "requirement": "cite", + "quote": "not evidence of parity" + } + ], + "negative_traps": [ + { + "trap_id": "stale-summary-gap-current", + "type": "stale_fact", + "evidence_ids": ["stale-summary-gap"], + "failure_if_used": true + }, + { + "trap_id": "temporary-parity-claim-current", + "type": "stale_fact", + "evidence_ids": ["summary-temporary-claim"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "States that the summary is reviewable derived readback and identifies the current fixture-backed contract." 
+ }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Summary entries and answer claims carry source refs or explicit unsupported-claim flags." + }, + "lifecycle_behavior": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Stale, superseded, and tombstoned entries are downgraded or excluded instead of treated as current top-of-mind facts." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Avoids stale not_tested and expired parity traps as current facts." + }, + "uncertainty_handling": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Unsupported managed-memory parity is flagged or excluded, not silently asserted." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "stale, superseded, or tombstoned memory must not appear as current top-of-mind", + "derived summary entries must have source refs or unsupported-claim flags" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "memory_summary": { + "required_categories": [ + "top_of_mind", + "background", + "stale", + "superseded", + "tombstone", + "derived_project_profile" + ] + }, + "tags": [ + "synthetic", + "memory_summary", + "source_trace", + "reviewable_derived_readback", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 53314c5b..2038b5c5 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -49,6 +49,7 @@ const SUITES: &[&str] = &[ "retrieval", "memory_evolution", "consolidation", + "memory_summary", "knowledge_compilation", "operator_debugging_ux", "capture_integration", @@ -148,6 +149,7 @@ struct RealWorldJob { #[serde(default)] encoding: JobEncoding, 
memory_evolution: Option, + memory_summary: Option, } #[derive(Debug, Deserialize)] @@ -355,6 +357,12 @@ struct HistoryReadback { requires_note_version_links: bool, } +#[derive(Debug, Deserialize)] +struct MemorySummaryExpectation { + #[serde(default)] + required_categories: Vec, +} + #[derive(Debug, Deserialize)] struct ScoringRubric { #[serde(default)] @@ -395,6 +403,8 @@ struct ProducedAnswer { evidence_ids: Vec, #[serde(default)] pages: Vec, + #[serde(default)] + memory_summaries: Vec, #[serde(skip_serializing_if = "Option::is_none")] latency_ms: Option, #[serde(skip_serializing_if = "Option::is_none")] @@ -466,6 +476,84 @@ struct DerivedPageRebuild { allowed_variance: Vec, } +#[derive(Clone, Debug, Deserialize, Serialize)] +struct MemorySummaryArtifact { + summary_id: String, + contract_schema: String, + generated_at: String, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + #[serde(default)] + entries: Vec, + source_trace: MemorySummarySourceTrace, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct MemorySummaryEntry { + entry_id: String, + category: String, + text: String, + #[serde(default)] + source_refs: Vec, + freshness: MemorySummaryFreshness, + rationale: MemorySummaryRationale, + #[serde(default)] + unsupported_claim_flags: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct MemorySummaryFreshness { + status: String, + #[serde(skip_serializing_if = "Option::is_none")] + observed_at: Option, + #[serde(skip_serializing_if = "Option::is_none")] + valid_from: Option, + #[serde(skip_serializing_if = "Option::is_none")] + valid_to: Option, + #[serde(skip_serializing_if = "Option::is_none")] + last_confirmed_at: Option, + #[serde(default)] + superseded_by: Vec, + #[serde(default)] + tombstone_refs: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct MemorySummaryRationale { + decision: String, + reason_code: String, + reason: String, +} + +#[derive(Clone, Debug, Default, 
Deserialize, Serialize)] +struct MemorySummarySourceTrace { + #[serde(default)] + selected_source_refs: Vec, + #[serde(default)] + dropped_source_refs: Vec, + #[serde(default)] + stale_source_refs: Vec, + #[serde(default)] + superseded_source_refs: Vec, + #[serde(default)] + tombstone_source_refs: Vec, + #[serde(default)] + unsupported_claim_flags: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct MemorySummarySourceTraceItem { + evidence_id: String, + #[serde(skip_serializing_if = "Option::is_none")] + status: Option, + #[serde(skip_serializing_if = "Option::is_none")] + reason: Option, + #[serde(skip_serializing_if = "Option::is_none")] + superseded_by: Option, +} + #[derive(Clone, Debug, Deserialize)] struct ConsolidationFixture { #[serde(default)] @@ -945,6 +1033,8 @@ struct ReportSummary { #[serde(default)] consolidation: ConsolidationSummaryReport, #[serde(skip_serializing_if = "Option::is_none")] + memory_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] knowledge: Option, } @@ -959,6 +1049,41 @@ struct ConsolidationSummaryReport { executable_gap_count: usize, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct MemorySummaryReport { + job_count: usize, + summary_count: usize, + entry_count: usize, + required_category_count: usize, + covered_required_category_count: usize, + missing_required_category_count: usize, + top_of_mind_count: usize, + background_count: usize, + stale_count: usize, + superseded_count: usize, + tombstone_count: usize, + derived_project_profile_count: usize, + source_ref_required_count: usize, + source_ref_entry_count: usize, + source_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + rationale_count: usize, + rationale_coverage: f64, + invalid_top_of_mind_count: usize, + untraced_entry_count: usize, + derived_with_source_or_unsupported_count: usize, + derived_missing_source_or_unsupported_count: usize, + unsupported_derived_entry_count: usize, + 
unsupported_current_entry_count: usize, + tombstone_ref_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] struct KnowledgeSummary { job_count: usize, @@ -1033,6 +1158,8 @@ struct JobReport { trace_explainability: Option, #[serde(skip_serializing_if = "Option::is_none")] knowledge: Option, + #[serde(skip_serializing_if = "Option::is_none")] + memory_summary: Option, trap_ids_used: Vec, dimension_scores: Vec, reason: String, @@ -1161,6 +1288,40 @@ struct KnowledgeJobMetrics { page_usefulness: f64, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct MemorySummaryJobMetrics { + summary_count: usize, + entry_count: usize, + required_category_count: usize, + covered_required_category_count: usize, + missing_required_category_count: usize, + top_of_mind_count: usize, + background_count: usize, + stale_count: usize, + superseded_count: usize, + tombstone_count: usize, + derived_project_profile_count: usize, + source_ref_required_count: usize, + source_ref_entry_count: usize, + source_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + rationale_count: usize, + rationale_coverage: f64, + invalid_top_of_mind_count: usize, + untraced_entry_count: usize, + derived_with_source_or_unsupported_count: usize, + derived_missing_source_or_unsupported_count: usize, + unsupported_derived_entry_count: usize, + unsupported_current_entry_count: usize, + tombstone_ref_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] struct EvolutionSummary { stale_answer_count: usize, @@ -1226,6 +1387,7 @@ struct JobScoring { reason: 
String, evolution: Option, consolidation: Option, + memory_summary: Option, } #[derive(Debug, Default)] @@ -1248,6 +1410,12 @@ struct FailureCounts { review_action_failures: usize, source_mutations: usize, blocking_executable_gaps: usize, + memory_summary_invalid_current_entries: usize, + memory_summary_untraced_entries: usize, + memory_summary_missing_freshness: usize, + memory_summary_missing_rationale: usize, + memory_summary_missing_categories: usize, + memory_summary_unsupported_current_entries: usize, untraced_page_sections: usize, missed_stale_findings: usize, rebuild_failures: usize, @@ -1375,6 +1543,7 @@ fn validate_job(job: &RealWorldJob, path: &Path) -> Result<()> { validate_operator_debug(job, path)?; validate_job_encoding(job, path)?; validate_memory_evolution(job, path)?; + validate_memory_summary_expectation(job, path)?; validate_trace_explainability(job, path)?; Ok(()) @@ -1651,6 +1820,19 @@ fn validate_adapter_response(job: &RealWorldJob, path: &Path) -> Result<()> { for page in &adapter_response.answer.pages { validate_page_artifact(page, path, &evidence_ids, &event_ids)?; } + for summary in &adapter_response.answer.memory_summaries { + validate_memory_summary_artifact(summary, path, &evidence_ids)?; + } + + if job.suite == "memory_summary" + && adapter_response.answer.memory_summaries.is_empty() + && job.encoding.status.is_none() + { + return Err(eyre::eyre!( + "{} memory_summary jobs must provide adapter_response.answer.memory_summaries.", + path.display() + )); + } Ok(()) } @@ -1728,6 +1910,172 @@ fn validate_page_artifact( Ok(()) } +fn validate_memory_summary_artifact( + summary: &MemorySummaryArtifact, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if summary.summary_id.trim().is_empty() + || summary.contract_schema != "elf.memory_summary/v1" + || summary.generated_at.trim().is_empty() + || summary.tenant_id.trim().is_empty() + || summary.project_id.trim().is_empty() + || summary.agent_id.trim().is_empty() + || 
summary.read_profile.trim().is_empty() + || summary.entries.is_empty() + { + return Err(eyre::eyre!("{} has an incomplete memory summary.", path.display())); + } + + validate_optional_rfc3339(&summary.generated_at, path, summary.summary_id.as_str())?; + + for entry in &summary.entries { + validate_memory_summary_entry(entry, path, evidence_ids)?; + } + + validate_memory_summary_source_trace(&summary.source_trace, path, evidence_ids)?; + + Ok(()) +} + +fn validate_memory_summary_entry( + entry: &MemorySummaryEntry, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if entry.entry_id.trim().is_empty() + || entry.category.trim().is_empty() + || entry.text.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete memory summary entry.", path.display())); + } + if !is_memory_summary_category(entry.category.as_str()) { + return Err(eyre::eyre!( + "{} has unknown memory summary category {}.", + path.display(), + entry.category + )); + } + if !is_memory_summary_freshness_status(entry.freshness.status.as_str()) { + return Err(eyre::eyre!( + "{} has unknown memory summary freshness status {}.", + path.display(), + entry.freshness.status + )); + } + if !is_memory_summary_rationale_decision(entry.rationale.decision.as_str()) { + return Err(eyre::eyre!( + "{} has unknown memory summary rationale decision {}.", + path.display(), + entry.rationale.decision + )); + } + + for evidence_id in &entry.source_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for evidence_id in &entry.freshness.tombstone_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for flag in &entry.unsupported_claim_flags { + if !flag.is_object() { + return Err(eyre::eyre!( + "{} memory summary unsupported-claim flags must be JSON objects.", + path.display() + )); + } + } + + validate_optional_summary_time( + path, + entry.freshness.observed_at.as_deref(), + entry.entry_id.as_str(), + )?; + validate_optional_summary_time( + path, + 
entry.freshness.valid_from.as_deref(), + entry.entry_id.as_str(), + )?; + validate_optional_summary_time( + path, + entry.freshness.valid_to.as_deref(), + entry.entry_id.as_str(), + )?; + validate_optional_summary_time( + path, + entry.freshness.last_confirmed_at.as_deref(), + entry.entry_id.as_str(), + )?; + + Ok(()) +} + +fn validate_memory_summary_source_trace( + trace: &MemorySummarySourceTrace, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + for item in trace + .selected_source_refs + .iter() + .chain(trace.dropped_source_refs.iter()) + .chain(trace.stale_source_refs.iter()) + .chain(trace.superseded_source_refs.iter()) + .chain(trace.tombstone_source_refs.iter()) + { + if item.evidence_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty memory summary trace item.", path.display())); + } + + ensure_known_evidence(path, evidence_ids, item.evidence_id.as_str())?; + } + for flag in &trace.unsupported_claim_flags { + if !flag.is_object() { + return Err(eyre::eyre!( + "{} memory summary source-trace unsupported-claim flags must be JSON objects.", + path.display() + )); + } + } + + Ok(()) +} + +fn validate_optional_summary_time(path: &Path, value: Option<&str>, id: &str) -> Result<()> { + if let Some(value) = value { + validate_optional_rfc3339(value, path, id)?; + } + + Ok(()) +} + +fn is_memory_summary_category(category: &str) -> bool { + matches!( + category, + "top_of_mind" + | "background" + | "stale" | "superseded" + | "tombstone" + | "derived_project_profile" + ) +} + +fn is_memory_summary_freshness_status(status: &str) -> bool { + matches!( + status, + "current" + | "background" + | "historical" + | "stale" | "superseded" + | "tombstoned" + | "unsupported" + ) +} + +fn is_memory_summary_rationale_decision(decision: &str) -> bool { + matches!(decision, "included" | "downgraded" | "excluded") +} + fn validate_scoring_rubric(job: &RealWorldJob, path: &Path) -> Result<()> { if !(0.0..=1.0).contains(&job.scoring_rubric.pass_threshold) 
{ return Err(eyre::eyre!("{} has invalid pass_threshold.", path.display())); @@ -1905,6 +2253,31 @@ fn validate_memory_evolution(job: &RealWorldJob, path: &Path) -> Result<()> { Ok(()) } +fn validate_memory_summary_expectation(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(summary) = &job.memory_summary else { + if job.suite == "memory_summary" && job.encoding.status.is_none() { + return Err(eyre::eyre!( + "{} memory_summary jobs must provide memory_summary expectations.", + path.display() + )); + } + + return Ok(()); + }; + + for category in &summary.required_categories { + if !is_memory_summary_category(category.as_str()) { + return Err(eyre::eyre!( + "{} memory_summary expectation references unknown category {}.", + path.display(), + category + )); + } + } + + Ok(()) +} + fn validate_evolution_conflict( path: &Path, evidence_ids: &BTreeSet, @@ -2162,32 +2535,18 @@ fn score_job(job: &RealWorldJob) -> JobScoring { if let Some(status) = job.encoding.status { let evolution = evolution_job_report(job, answer, &trap_ids_used, 0); - return JobScoring { - status, - normalized_score: 0.0, - hard_fail_hits: Vec::new(), - unsupported_claims: Vec::new(), - wrong_result_count: 0, - knowledge: None, - trap_ids_used, - dimension_scores: declared_not_encoded_dimension_scores(job), - reason: job - .encoding - .reason - .clone() - .unwrap_or_else(|| "Job did not reach a runnable scoring state.".to_string()), - evolution, - consolidation, - }; + return score_declared_job(job, status, trap_ids_used, evolution, consolidation); } let missing_claims = missing_required_claims(job, answer); let forbidden_claims = forbidden_claim_hits(job, answer); let missing_evidence = missing_required_evidence(job, &produced_evidence); let knowledge = knowledge_metrics(job, answer); + let memory_summary = memory_summary_metrics(job, answer); let mut unsupported_claims = unsupported_claims(job, answer); unsupported_claims.extend(unsupported_page_claims(answer)); + 
unsupported_claims.extend(unsupported_memory_summary_claims(job, answer)); let operator_counts = operator_debug_failure_counts(job); let latency_violations = latency_violations(job, answer); @@ -2217,6 +2576,24 @@ fn score_job(job: &RealWorldJob) -> JobScoring { review_action_failures: review_action_failures(consolidation.as_ref()), source_mutations: consolidation.as_ref().map_or(0, |report| report.source_mutation_count), blocking_executable_gaps: blocking_executable_gaps(consolidation.as_ref()), + memory_summary_invalid_current_entries: memory_summary + .as_ref() + .map_or(0, |metrics| metrics.invalid_top_of_mind_count), + memory_summary_untraced_entries: memory_summary + .as_ref() + .map_or(0, |metrics| metrics.untraced_entry_count), + memory_summary_missing_freshness: memory_summary.as_ref().map_or(0, |metrics| { + metrics.entry_count.saturating_sub(metrics.freshness_marker_count) + }), + memory_summary_missing_rationale: memory_summary + .as_ref() + .map_or(0, |metrics| metrics.entry_count.saturating_sub(metrics.rationale_count)), + memory_summary_missing_categories: memory_summary + .as_ref() + .map_or(0, |metrics| metrics.missing_required_category_count), + memory_summary_unsupported_current_entries: memory_summary + .as_ref() + .map_or(0, |metrics| metrics.unsupported_current_entry_count), untraced_page_sections: knowledge .as_ref() .map_or(0, |metrics| metrics.untraced_section_count), @@ -2226,23 +2603,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring { }; let dimension_scores = dimension_scores(job, &counts); let normalized_score = normalized_score(&dimension_scores); - let wrong_result_count = counts.missing_claims - + counts.forbidden_claims - + counts.missing_evidence - + counts.trap_uses - + counts.operator_debug_missing - + counts.operator_debug_raw_sql - + counts.operator_debug_trace_gaps - + counts.operator_debug_repair_unclear - + counts.conflict_detection_missing - + counts.update_rationale_missing - + counts.proposal_usefulness_failures - + 
counts.lineage_failures - + counts.review_action_failures - + counts.untraced_page_sections - + counts.missed_stale_findings - + counts.rebuild_failures - + counts.page_usefulness_failures; + let wrong_result_count = wrong_result_count(&counts); let status = job_status( normalized_score, job.scoring_rubric.pass_threshold, @@ -2270,9 +2631,63 @@ fn score_job(job: &RealWorldJob) -> JobScoring { reason, evolution, consolidation, + memory_summary, + } +} + +fn score_declared_job( + job: &RealWorldJob, + status: TypedStatus, + trap_ids_used: Vec, + evolution: Option, + consolidation: Option, +) -> JobScoring { + JobScoring { + status, + normalized_score: 0.0, + hard_fail_hits: Vec::new(), + unsupported_claims: Vec::new(), + wrong_result_count: 0, + knowledge: None, + trap_ids_used, + dimension_scores: declared_not_encoded_dimension_scores(job), + reason: job + .encoding + .reason + .clone() + .unwrap_or_else(|| "Job did not reach a runnable scoring state.".to_string()), + evolution, + consolidation, + memory_summary: None, } } +fn wrong_result_count(counts: &FailureCounts) -> usize { + counts.missing_claims + + counts.forbidden_claims + + counts.missing_evidence + + counts.trap_uses + + counts.operator_debug_missing + + counts.operator_debug_raw_sql + + counts.operator_debug_trace_gaps + + counts.operator_debug_repair_unclear + + counts.conflict_detection_missing + + counts.update_rationale_missing + + counts.proposal_usefulness_failures + + counts.lineage_failures + + counts.review_action_failures + + counts.memory_summary_invalid_current_entries + + counts.memory_summary_untraced_entries + + counts.memory_summary_missing_freshness + + counts.memory_summary_missing_rationale + + counts.memory_summary_missing_categories + + counts.memory_summary_unsupported_current_entries + + counts.untraced_page_sections + + counts.missed_stale_findings + + counts.rebuild_failures + + counts.page_usefulness_failures +} + fn operator_debug_failure_counts(job: &RealWorldJob) -> 
FailureCounts { let Some(debug) = &job.operator_debug else { return FailureCounts { @@ -2320,6 +2735,7 @@ fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { claims: Vec::new(), evidence_ids: Vec::new(), pages: Vec::new(), + memory_summaries: Vec::new(), latency_ms: None, cost: None, trace_explainability: None, @@ -2801,6 +3217,202 @@ fn page_usefulness_failure_count(metrics: &KnowledgeJobMetrics) -> usize { if metrics.page_usefulness < 0.8 { 1 } else { 0 } } +fn memory_summary_metrics( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Option { + if answer.memory_summaries.is_empty() { + return None; + } + + let mut metrics = MemorySummaryJobMetrics { + summary_count: answer.memory_summaries.len(), + required_category_count: job + .memory_summary + .as_ref() + .map_or(0, |summary| summary.required_categories.len()), + ..MemorySummaryJobMetrics::default() + }; + let mut categories = BTreeSet::new(); + + for summary in &answer.memory_summaries { + accumulate_memory_summary_metrics(summary, &mut metrics, &mut categories); + } + + let covered_required_category_count = job.memory_summary.as_ref().map_or(0, |summary| { + summary.required_categories.iter().filter(|category| categories.contains(*category)).count() + }); + + metrics.covered_required_category_count = covered_required_category_count; + metrics.missing_required_category_count = + metrics.required_category_count.saturating_sub(covered_required_category_count); + metrics.source_ref_coverage = + ratio(metrics.source_ref_entry_count, metrics.source_ref_required_count); + metrics.freshness_coverage = ratio(metrics.freshness_marker_count, metrics.entry_count); + metrics.rationale_coverage = ratio(metrics.rationale_count, metrics.entry_count); + + Some(metrics) +} + +fn accumulate_memory_summary_metrics( + summary: &MemorySummaryArtifact, + metrics: &mut MemorySummaryJobMetrics, + categories: &mut BTreeSet, +) { + metrics.source_trace_selected_count += summary.source_trace.selected_source_refs.len(); + 
metrics.source_trace_dropped_count += summary.source_trace.dropped_source_refs.len(); + metrics.source_trace_stale_count += summary.source_trace.stale_source_refs.len(); + metrics.source_trace_superseded_count += summary.source_trace.superseded_source_refs.len(); + metrics.source_trace_tombstone_count += summary.source_trace.tombstone_source_refs.len(); + + let non_current_source_refs = memory_summary_non_current_trace_refs(&summary.source_trace); + + for entry in &summary.entries { + metrics.entry_count += 1; + + categories.insert(entry.category.clone()); + + accumulate_memory_summary_category(entry.category.as_str(), metrics); + + if memory_summary_entry_requires_source_ref(entry) { + metrics.source_ref_required_count += 1; + + if entry.source_refs.is_empty() { + metrics.untraced_entry_count += 1; + } + } + if !entry.source_refs.is_empty() { + metrics.source_ref_entry_count += 1; + } + if memory_summary_entry_has_freshness(entry) { + metrics.freshness_marker_count += 1; + } + if memory_summary_entry_has_rationale(entry) { + metrics.rationale_count += 1; + } + if memory_summary_entry_is_invalid_top_of_mind(entry, &non_current_source_refs) { + metrics.invalid_top_of_mind_count += 1; + } + if entry.category == "derived_project_profile" { + let has_support = + !entry.source_refs.is_empty() || !entry.unsupported_claim_flags.is_empty(); + + if has_support { + metrics.derived_with_source_or_unsupported_count += 1; + } else { + metrics.derived_missing_source_or_unsupported_count += 1; + } + if !entry.unsupported_claim_flags.is_empty() { + metrics.unsupported_derived_entry_count += 1; + } + if memory_summary_entry_includes_unsupported_current_claim(entry) { + metrics.unsupported_current_entry_count += 1; + } + } + + metrics.tombstone_ref_count += entry.freshness.tombstone_refs.len(); + } +} + +fn memory_summary_non_current_trace_refs(trace: &MemorySummarySourceTrace) -> BTreeSet<&str> { + trace + .stale_source_refs + .iter() + .chain(trace.superseded_source_refs.iter()) + 
.chain(trace.tombstone_source_refs.iter()) + .map(|item| item.evidence_id.as_str()) + .collect() +} + +fn accumulate_memory_summary_category(category: &str, metrics: &mut MemorySummaryJobMetrics) { + match category { + "top_of_mind" => metrics.top_of_mind_count += 1, + "background" => metrics.background_count += 1, + "stale" => metrics.stale_count += 1, + "superseded" => metrics.superseded_count += 1, + "tombstone" => metrics.tombstone_count += 1, + "derived_project_profile" => metrics.derived_project_profile_count += 1, + _ => {}, + } +} + +fn memory_summary_entry_requires_source_ref(entry: &MemorySummaryEntry) -> bool { + !(entry.category == "derived_project_profile" + && entry.source_refs.is_empty() + && !entry.unsupported_claim_flags.is_empty() + && entry.rationale.decision == "excluded") +} + +fn memory_summary_entry_is_invalid_top_of_mind( + entry: &MemorySummaryEntry, + non_current_source_refs: &BTreeSet<&str>, +) -> bool { + entry.category == "top_of_mind" + && (entry.freshness.status != "current" + || entry.rationale.decision != "included" + || !entry.freshness.superseded_by.is_empty() + || !entry.freshness.tombstone_refs.is_empty() + || entry + .source_refs + .iter() + .any(|source_ref| non_current_source_refs.contains(source_ref.as_str()))) +} + +fn memory_summary_entry_has_freshness(entry: &MemorySummaryEntry) -> bool { + if entry.freshness.status.trim().is_empty() { + return false; + } + + match entry.category.as_str() { + "superseded" => !entry.freshness.superseded_by.is_empty(), + "tombstone" => + entry.freshness.status == "tombstoned" && !entry.freshness.tombstone_refs.is_empty(), + _ => true, + } +} + +fn memory_summary_entry_has_rationale(entry: &MemorySummaryEntry) -> bool { + !entry.rationale.decision.trim().is_empty() + && !entry.rationale.reason_code.trim().is_empty() + && !entry.rationale.reason.trim().is_empty() +} + +fn memory_summary_entry_includes_unsupported_current_claim(entry: &MemorySummaryEntry) -> bool { + 
!entry.unsupported_claim_flags.is_empty() + && (entry.rationale.decision != "excluded" || entry.freshness.status == "current") +} + +fn unsupported_memory_summary_claims( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Vec { + answer + .memory_summaries + .iter() + .flat_map(|summary| { + summary.entries.iter().filter_map(|entry| { + if entry.category != "derived_project_profile" + || !entry.source_refs.is_empty() + || !entry.unsupported_claim_flags.is_empty() + { + return None; + } + + Some(UnsupportedClaimReport { + suite_id: job.suite.clone(), + job_id: job.job_id.clone(), + claim_id: Some(format!("{}:{}", summary.summary_id, entry.entry_id)), + claim_text: bounded_text(entry.text.as_str(), 240), + reason: + "derived memory summary entry has no source refs and no unsupported-claim flags" + .to_string(), + evidence_ids: entry.source_refs.clone(), + }) + }) + }) + .collect() +} + fn hard_fail_hits( job: &RealWorldJob, unsupported_claims: &[UnsupportedClaimReport], @@ -2873,19 +3485,31 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) || counts.conflict_detection_missing > 0 || counts.proposal_usefulness_failures > 0 || counts.review_action_failures > 0 + || counts.memory_summary_invalid_current_entries > 0 + || counts.memory_summary_missing_categories > 0 + || counts.memory_summary_unsupported_current_entries > 0 || counts.page_usefulness_failures > 0, "evidence_grounding" => counts.missing_evidence > 0 || counts.unsupported_claims > 0 || counts.lineage_failures > 0 + || counts.memory_summary_untraced_entries > 0 || counts.untraced_page_sections > 0, - "trap_avoidance" => counts.trap_uses > 0 || counts.missed_stale_findings > 0, - "uncertainty_handling" => counts.unsupported_claims > 0, + "trap_avoidance" => + counts.trap_uses > 0 + || counts.memory_summary_invalid_current_entries > 0 + || counts.missed_stale_findings > 0, + "uncertainty_handling" => + counts.unsupported_claims > 0 || 
counts.memory_summary_unsupported_current_entries > 0, "lifecycle_behavior" => counts.stale_answers > 0 || counts.conflict_detection_missing > 0 || counts.update_rationale_missing > 0 || counts.source_mutations > 0 + || counts.memory_summary_invalid_current_entries > 0 + || counts.memory_summary_missing_freshness > 0 + || counts.memory_summary_missing_rationale > 0 + || counts.memory_summary_unsupported_current_entries > 0 || counts.rebuild_failures > 0, "source_immutability" => counts.source_mutations > 0, "proposal_usefulness" => counts.proposal_usefulness_failures > 0, @@ -2998,6 +3622,12 @@ fn wrong_result_signal_count(counts: &FailureCounts) -> usize { + counts.proposal_usefulness_failures + counts.lineage_failures + counts.review_action_failures + + counts.memory_summary_invalid_current_entries + + counts.memory_summary_untraced_entries + + counts.memory_summary_missing_freshness + + counts.memory_summary_missing_rationale + + counts.memory_summary_missing_categories + + counts.memory_summary_unsupported_current_entries + counts.untraced_page_sections + counts.missed_stale_findings + counts.rebuild_failures @@ -3050,6 +3680,7 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { cost: answer.cost.clone(), trace_explainability: answer.trace_explainability.clone(), knowledge: scoring.knowledge, + memory_summary: scoring.memory_summary, trap_ids_used: scoring.trap_ids_used, dimension_scores: scoring.dimension_scores, reason: scoring.reason, @@ -3551,6 +4182,7 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { .map(|debug| debug.ux_gaps.len()) .sum(), consolidation: consolidation_summary(jobs), + memory_summary: memory_summary_summary(jobs), knowledge: knowledge_summary(jobs), ..ReportSummary::default() }; @@ -3667,6 +4299,99 @@ fn consolidation_summary(jobs: &[JobReport]) -> ConsolidationSummaryReport { } } +fn memory_summary_summary(jobs: &[JobReport]) -> Option { + let memory_jobs = 
jobs.iter().filter_map(|job| job.memory_summary.as_ref()).collect::>(); + + if memory_jobs.is_empty() { + return None; + } + + let job_count = memory_jobs.len(); + let summary_count = memory_jobs.iter().map(|metrics| metrics.summary_count).sum(); + let entry_count = memory_jobs.iter().map(|metrics| metrics.entry_count).sum(); + let required_category_count = + memory_jobs.iter().map(|metrics| metrics.required_category_count).sum(); + let covered_required_category_count = + memory_jobs.iter().map(|metrics| metrics.covered_required_category_count).sum(); + let source_ref_required_count = + memory_jobs.iter().map(|metrics| metrics.source_ref_required_count).sum(); + let source_ref_entry_count = + memory_jobs.iter().map(|metrics| metrics.source_ref_entry_count).sum(); + let freshness_marker_count = + memory_jobs.iter().map(|metrics| metrics.freshness_marker_count).sum(); + let rationale_count = memory_jobs.iter().map(|metrics| metrics.rationale_count).sum(); + + Some(MemorySummaryReport { + job_count, + summary_count, + entry_count, + required_category_count, + covered_required_category_count, + missing_required_category_count: memory_jobs + .iter() + .map(|metrics| metrics.missing_required_category_count) + .sum(), + top_of_mind_count: memory_jobs.iter().map(|metrics| metrics.top_of_mind_count).sum(), + background_count: memory_jobs.iter().map(|metrics| metrics.background_count).sum(), + stale_count: memory_jobs.iter().map(|metrics| metrics.stale_count).sum(), + superseded_count: memory_jobs.iter().map(|metrics| metrics.superseded_count).sum(), + tombstone_count: memory_jobs.iter().map(|metrics| metrics.tombstone_count).sum(), + derived_project_profile_count: memory_jobs + .iter() + .map(|metrics| metrics.derived_project_profile_count) + .sum(), + source_ref_required_count, + source_ref_entry_count, + source_ref_coverage: ratio(source_ref_entry_count, source_ref_required_count), + freshness_marker_count, + freshness_coverage: ratio(freshness_marker_count, entry_count), 
+ rationale_count, + rationale_coverage: ratio(rationale_count, entry_count), + invalid_top_of_mind_count: memory_jobs + .iter() + .map(|metrics| metrics.invalid_top_of_mind_count) + .sum(), + untraced_entry_count: memory_jobs.iter().map(|metrics| metrics.untraced_entry_count).sum(), + derived_with_source_or_unsupported_count: memory_jobs + .iter() + .map(|metrics| metrics.derived_with_source_or_unsupported_count) + .sum(), + derived_missing_source_or_unsupported_count: memory_jobs + .iter() + .map(|metrics| metrics.derived_missing_source_or_unsupported_count) + .sum(), + unsupported_derived_entry_count: memory_jobs + .iter() + .map(|metrics| metrics.unsupported_derived_entry_count) + .sum(), + unsupported_current_entry_count: memory_jobs + .iter() + .map(|metrics| metrics.unsupported_current_entry_count) + .sum(), + tombstone_ref_count: memory_jobs.iter().map(|metrics| metrics.tombstone_ref_count).sum(), + source_trace_selected_count: memory_jobs + .iter() + .map(|metrics| metrics.source_trace_selected_count) + .sum(), + source_trace_dropped_count: memory_jobs + .iter() + .map(|metrics| metrics.source_trace_dropped_count) + .sum(), + source_trace_stale_count: memory_jobs + .iter() + .map(|metrics| metrics.source_trace_stale_count) + .sum(), + source_trace_superseded_count: memory_jobs + .iter() + .map(|metrics| metrics.source_trace_superseded_count) + .sum(), + source_trace_tombstone_count: memory_jobs + .iter() + .map(|metrics| metrics.source_trace_tombstone_count) + .sum(), + }) +} + fn knowledge_summary(jobs: &[JobReport]) -> Option { let knowledge_jobs = jobs.iter().filter_map(|job| job.knowledge.as_ref()).collect::>(); @@ -4377,6 +5102,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { render_markdown_evolution(&mut out, report); render_markdown_trace_explainability(&mut out, report); render_markdown_consolidation(&mut out, report); + render_markdown_memory_summary(&mut out, report); render_markdown_knowledge(&mut out, report); 
render_markdown_unsupported_claims(&mut out, report); render_markdown_follow_ups(&mut out, report); @@ -4670,7 +5396,16 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat )); out.push_str(&format!("- Operator UX gaps: `{}`\n", report.summary.operator_ux_gap_count)); - if let Some(knowledge) = &report.summary.knowledge { + render_markdown_optional_summary_metrics(out, &report.summary); + + out.push_str(&format!( + "- Private corpus redaction: `{}`\n\n", + md_inline(report.private_corpus_redaction.policy.as_str()) + )); +} + +fn render_markdown_optional_summary_metrics(out: &mut String, summary: &ReportSummary) { + if let Some(knowledge) = &summary.knowledge { out.push_str(&format!( "- Knowledge citation coverage: `{:.3}`\n", knowledge.citation_coverage @@ -4690,11 +5425,30 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat knowledge.unsupported_summary_count )); } - - out.push_str(&format!( - "- Private corpus redaction: `{}`\n\n", - md_inline(report.private_corpus_redaction.policy.as_str()) - )); + if let Some(memory_summary) = &summary.memory_summary { + out.push_str(&format!( + "- Memory summary entries: `{}` across `{}` artifact(s)\n", + memory_summary.entry_count, memory_summary.summary_count + )); + out.push_str(&format!( + "- Memory summary source-ref coverage: `{}/{}` (`{:.3}`)\n", + memory_summary.source_ref_entry_count, + memory_summary.source_ref_required_count, + memory_summary.source_ref_coverage + )); + out.push_str(&format!( + "- Memory summary invalid top-of-mind count: `{}`\n", + memory_summary.invalid_top_of_mind_count + )); + out.push_str(&format!( + "- Memory summary unsupported derived entries: `{}`\n", + memory_summary.unsupported_derived_entry_count + )); + out.push_str(&format!( + "- Memory summary unsupported current entries: `{}`\n", + memory_summary.unsupported_current_entry_count + )); + } } fn render_markdown_quality_summary(out: &mut String, report: &RealWorldReport) { @@ 
-5128,6 +5882,46 @@ fn render_markdown_knowledge(out: &mut String, report: &RealWorldReport) { out.push('\n'); } +fn render_markdown_memory_summary(out: &mut String, report: &RealWorldReport) { + let memory_jobs = + report.jobs.iter().filter(|job| job.memory_summary.is_some()).collect::>(); + + if memory_jobs.is_empty() { + return; + } + + out.push_str("## Memory Summary Metrics\n\n"); + out.push_str("| Job | Summaries | Entries | Categories | Source Coverage | Freshness | Rationale | Invalid Top-of-Mind | Untraced | Derived Unsupported | Unsupported Current | Tombstone Refs |\n"); + out.push_str( + "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n", + ); + + for job in memory_jobs { + let Some(metrics) = &job.memory_summary else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | {} | `{}/{}` | `{:.3}` | `{:.3}` | `{:.3}` | {} | {} | {} | {} | {} |\n", + md_cell(job.job_id.as_str()), + metrics.summary_count, + metrics.entry_count, + metrics.covered_required_category_count, + metrics.required_category_count, + metrics.source_ref_coverage, + metrics.freshness_coverage, + metrics.rationale_coverage, + metrics.invalid_top_of_mind_count, + metrics.untraced_entry_count, + metrics.unsupported_derived_entry_count, + metrics.unsupported_current_entry_count, + metrics.tombstone_ref_count + )); + } + + out.push('\n'); +} + fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport) { out.push_str("## Unsupported Claims\n\n"); @@ -5198,6 +5992,7 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { out.push_str("- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links.\n"); out.push_str("- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed.\n\n"); out.push_str("For `knowledge_compilation` jobs, generated pages are benchmark artifacts. 
Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. Flagged unsupported summaries are counted separately from hidden unsupported claims.\n\n"); + out.push_str("For `memory_summary` jobs, summary artifacts are derived review surfaces. Top-of-mind entries must be current, included or downgraded entries must carry source refs, and derived project-profile entries must either cite sources or be explicitly flagged as unsupported.\n\n"); out.push_str("## Suites With `not_encoded` Status\n\n"); if report.not_encoded_suites.is_empty() { diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index a7ea546b..60c020c8 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -56,6 +56,10 @@ fn consolidation_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("consolidation") } +fn memory_summary_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("memory_summary") +} + fn knowledge_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("knowledge") } @@ -293,6 +297,10 @@ fn run_json_report() -> Result { run_json_report_from(fixture_dir()) } +fn load_json(path: &Path) -> Result { + Ok(serde_json::from_str::(&fs::read_to_string(path)?)?) 
+} + fn array_at<'a>(value: &'a Value, pointer: &str) -> Result<&'a Vec> { value .pointer(pointer) @@ -1014,10 +1022,11 @@ fn assert_elf_fixture_adapter_record(adapter: &Value) -> Result<()> { assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert!(adapter.pointer("/run/evidence").and_then(Value::as_str).is_some_and(|evidence| { - evidence.contains("49 jobs across 13 suites") - && evidence.contains("44 pass") + evidence.contains("50 jobs across 14 suites") + && evidence.contains("45 pass") && evidence.contains("5 blocked") && evidence.contains("core_archival_memory") + && evidence.contains("memory_summary") && evidence.contains("context_trajectory") })); @@ -2222,7 +2231,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(49)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(50)); Ok(()) } @@ -2676,11 +2685,11 @@ fn assert_current_report_text_boundaries( comparison_external_projects .contains("Benchmark-grounded for local same-corpus retrieval, reindex/update/delete") ); - assert!(iteration_direction.contains("| Jobs | `49` |")); - assert!(iteration_direction.contains("| Encoded suites | `13` |")); - assert!(iteration_direction.contains("| Pass | `44` |")); - assert!(iteration_direction.contains("| Evidence coverage | `111/111` |")); - assert!(iteration_direction.contains("| Expected evidence recall | `100/100` |")); + assert!(iteration_direction.contains("| Jobs | `50` |")); + assert!(iteration_direction.contains("| Encoded suites | `14` |")); + assert!(iteration_direction.contains("| Pass | `45` |")); + assert!(iteration_direction.contains("| Evidence coverage | `115/115` 
|")); + assert!(iteration_direction.contains("| Expected evidence recall | `107/107` |")); for stale_phrase in [ "same live sweep shape as ELF", @@ -3663,14 +3672,14 @@ fn assert_measurement_audit_adapter_status_counts(markdown: &str) { fn assert_iteration_direction_current_measurement_counts(markdown: &str) { for expected in [ - "| Jobs | `49` |", - "| Encoded suites | `13` |", + "| Jobs | `50` |", + "| Encoded suites | `14` |", "| Blocked | `5` |", - "| Mean score | `0.898` |", - "| Evidence coverage | `111/111` |", - "| Source-ref coverage | `111/111` |", - "| Quote coverage | `111/111` |", - "| Expected evidence recall | `100/100` |", + "| Mean score | `0.900` |", + "| Evidence coverage | `115/115` |", + "| Source-ref coverage | `115/115` |", + "| Quote coverage | `115/115` |", + "| Expected evidence recall | `107/107` |", "| `blocked` | `7` |", "| `not_encoded` | `5` |", "`live_baseline_only`, `fixture_backed`, and `research_gate`", @@ -4109,23 +4118,55 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) - assert!(array_contains_str(ledger, "/summary/improved", "current_vs_historical_correctness")?); assert!(array_contains_str(ledger, "/summary/improved", "preference_evolution")?); assert!(array_contains_str(ledger, "/summary/improved", "reviewable_consolidation")?); + assert!(array_contains_str( + ledger, + "/summary/improved", + "memory_summary_top_of_mind_behavior" + )?); assert!(array_at(ledger, "/summary/regressed")?.is_empty()); assert!(array_contains_str(ledger, "/summary/unchanged", "deletion_ttl_tombstone_behavior")?); assert!(array_contains_str(ledger, "/summary/unchanged", "final_competitor_retest_status")?); assert!(array_contains_str(ledger, "/summary/blocked", "scheduled_memory_task_readiness")?); assert!(array_contains_str(ledger, "/summary/not_tested", "proactive_brief_readiness")?); + assert_dreaming_memory_summary_stage(stages)?; + + Ok(()) +} + +fn assert_dreaming_memory_summary_stage(stages: &[Value]) -> 
Result<()> { + let summary_stage = find_by_field(stages, "/stage_id", "memory_summary_top_of_mind_behavior")?; + + assert_eq!( + summary_stage.pointer("/comparison_judgment").and_then(Value::as_str), + Some("improved") + ); + assert_eq!(summary_stage.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(9)); + assert_eq!( + summary_stage.pointer("/post_stage_counts/not_tested").and_then(Value::as_u64), + Some(0) + ); + assert!( + summary_stage + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("fixture-backed memory_summary job") + && basis.contains("unsupported-claim flags")) + ); + Ok(()) } fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) { assert!( - markdown - .contains("`improved`: current-vs-historical correctness, preference evolution, and") - && markdown.contains("reviewable consolidation") + markdown.contains("`improved`: current-vs-historical correctness, preference evolution") + && markdown.contains("reviewable") + && markdown.contains("consolidation, and memory-summary/top-of-mind fixture readback") ); + assert!(markdown.contains("memory-summary/top-of-mind fixture readback")); assert!(markdown.contains("`regressed`: none")); assert!(markdown.contains("the XY-905 run passes all six memory-evolution jobs")); + assert!(markdown.contains("XY-952 adds a reviewable `elf.memory_summary/v1`")); assert!(markdown.contains("XY-905")); assert!( markdown @@ -4172,6 +4213,267 @@ fn knowledge_json_report_renders_markdown_metrics() -> Result<()> { Ok(()) } +#[test] +fn memory_summary_fixtures_score_reviewable_source_trace_contract() -> Result<()> { + let report = run_json_report_from(memory_summary_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(1)); + 
assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/memory_summary/summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/memory_summary/entry_count").and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/summary/memory_summary/covered_required_category_count") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report.pointer("/summary/memory_summary/source_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/memory_summary/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/memory_summary/rationale_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/memory_summary/unsupported_derived_entry_count") + .and_then(Value::as_u64), + Some(1) + ); + + let suites = array_at(&report, "/suites")?; + let memory_summary = find_by_field(suites, "/suite_id", "memory_summary")?; + + assert_eq!(memory_summary.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(memory_summary.pointer("/encoded_job_count").and_then(Value::as_u64), Some(1)); + + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(job.pointer("/memory_summary/top_of_mind_count").and_then(Value::as_u64), Some(1)); + assert_eq!(job.pointer("/memory_summary/tombstone_ref_count").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn memory_summary_markdown_renders_source_trace_metrics() -> Result<()> { + let report = 
run_json_report_from(memory_summary_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-memory-summary-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("memory-summary-report.json"); + let markdown_path = temp_dir.join("memory-summary-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Memory Summary Metrics")); + assert!(markdown.contains("memory-summary-source-trace-001")); + assert!(markdown.contains("Memory summary source-ref coverage")); + assert!(markdown.contains("Invalid Top-of-Mind")); + assert!(markdown.contains("Derived Unsupported")); + + Ok(()) +} + +#[test] +fn memory_summary_fixture_fails_stale_top_of_mind_entries() -> Result<()> { + let fixture_path = memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][2]["category"] = + Value::String("top_of_mind".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][2]["freshness"] + ["status"] = Value::String("current".to_string()); + + let temp_dir = + env::temp_dir().join(format!("elf-memory-summary-stale-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("stale_current_summary.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", 
"memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn memory_summary_fixture_fails_tombstoned_top_of_mind_entries() -> Result<()> { + let fixture_path = memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["category"] = + Value::String("top_of_mind".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["freshness"] + ["status"] = Value::String("current".to_string()); + + let temp_dir = env::temp_dir() + .join(format!("elf-memory-summary-tombstone-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("tombstone_current_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn memory_summary_fixture_fails_untraced_derived_profile_entries() -> Result<()> { + let fixture_path = memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["unsupported_claim_flags"] = + Value::Array(Vec::new()); + + 
let temp_dir = + env::temp_dir().join(format!("elf-memory-summary-untraced-derived-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("untraced_derived_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); + assert_eq!( + job.pointer("/memory_summary/derived_missing_source_or_unsupported_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn memory_summary_fixture_fails_unsupported_current_derived_entries() -> Result<()> { + let fixture_path = memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["source_refs"] = + Value::Array(vec![Value::String("summary-contract-non-parity-boundary".to_string())]); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["freshness"] + ["status"] = Value::String("current".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["rationale"] + ["decision"] = Value::String("included".to_string()); + + let temp_dir = env::temp_dir() + .join(format!("elf-memory-summary-unsupported-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("unsupported_current_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + 
assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/unsupported_current_entry_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn memory_summary_fixture_fails_tombstone_entries_without_tombstone_refs() -> Result<()> { + let fixture_path = memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["freshness"] + ["tombstone_refs"] = Value::Array(Vec::new()); + + let temp_dir = + env::temp_dir().join(format!("elf-memory-summary-tombstone-refs-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("missing_tombstone_refs_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/freshness_coverage").and_then(Value::as_f64), + Some(0.857) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + #[test] fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { let report = run_json_report_from(production_ops_fixture_dir())?; @@ -4331,9 +4633,9 @@ fn assert_root_knowledge_summary(report: &Value) { } fn assert_root_aggregate_summary(report: &Value) { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(49)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(13)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(44)); + 
assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(50)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(14)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(45)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(5)); @@ -4376,11 +4678,11 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(111) + Some(115) ); assert_eq!( report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), - Some(111) + Some(115) ); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); @@ -4407,6 +4709,18 @@ fn assert_root_aggregate_summary(report: &Value) { .and_then(Value::as_u64), Some(1) ); + assert_eq!( + report.pointer("/summary/memory_summary/job_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/memory_summary/source_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); assert_root_knowledge_summary(report); } @@ -4422,6 +4736,7 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { "capture_integration", "personalization", "consolidation", + "memory_summary", "knowledge_compilation", "operator_debugging_ux", "memory_evolution", diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 686ed123..35786e4f 100644 --- 
a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -88,7 +88,8 @@ results, or lifecycle failures into one aggregate leaderboard. | Command or run | Artifact | Supported claim | | --- | --- | --- | -| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 49 jobs across 13 suites with 44 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates, including 6 passing `core_archival_memory` jobs. | +| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` plus XY-952 fixture update | ELF fixture aggregate covers 50 jobs across 14 suites with 45 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates, including 6 passing `core_archival_memory` jobs and 1 passing `memory_summary` source-trace job. | +| `cargo make real-world-memory-summary` | `tmp/real-world-memory/memory-summary/report.json` | The memory summary fixture scores reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile entries with source refs, freshness metadata, rationale, and unsupported-claim flags; this is fixture-backed contract evidence, not managed-memory parity. | | `cargo make real-world-memory-core-archival` | `tmp/real-world-memory/core-archival/report.json` | ELF core-block behavior is scored separately from archival note search for attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs. 
| | `cargo make real-world-memory-live-adapters` | `2026-06-11-capture-write-policy-live-report.md` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, while agentmemory and claude-mem capture breadth are blocked until durable hook/viewer evidence exists. | diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index 0a956467..fea85347 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -31,11 +31,13 @@ Current boundary: live pass. The fresh ELF sweep produced 40 jobs with 22 pass, 5 wrong_result, 0 incomplete, 2 blocked, and 11 not_encoded; the fresh qmd sweep produced 17 pass, 6 wrong_result, 0 incomplete, 2 blocked, and 15 not_encoded. -- ELF fixture evidence is strong: `cargo make real-world-memory` reports 49 jobs - across 13 suites with 44 pass and 5 blocked production-ops or OpenViking - context-trajectory measurement gates. The added `core_archival_memory` suite - contributes 6 fixture-only passes for ELF core-block behavior; it does not create - an ELF-over-Letta claim. This proves the fixture contract, not live-service parity. +- ELF fixture evidence is strong: `cargo make real-world-memory` reports 50 jobs + across 14 suites with 45 pass and 5 blocked production-ops or OpenViking + context-trajectory measurement gates. The `core_archival_memory` suite contributes + 6 fixture-only passes for ELF core-block behavior; it does not create an + ELF-over-Letta claim. The `memory_summary` suite contributes one fixture-backed + source-trace pass; it does not create managed-memory parity evidence. This proves + the fixture contract, not live-service parity. 
- qmd is the strongest measured local retrieval-debug comparison, but the current evidence still separates its same-corpus/live-retrieval strengths from the full-suite live non-pass sweep. diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index f5a2ad4b..f919f5d7 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -44,20 +44,20 @@ The strongest current statement is: | Metric | Value | | --- | ---: | -| Jobs | `49` | -| Encoded suites | `13` | -| Pass | `44` | +| Jobs | `50` | +| Encoded suites | `14` | +| Pass | `45` | | Blocked | `5` | | Wrong result | `0` | | Lifecycle fail | `0` | | Incomplete | `0` | | Not encoded | `0` | | Unsupported claim | `0` | -| Mean score | `0.898` | -| Evidence coverage | `111/111` | -| Source-ref coverage | `111/111` | -| Quote coverage | `111/111` | -| Expected evidence recall | `100/100` | +| Mean score | `0.900` | +| Evidence coverage | `115/115` | +| Source-ref coverage | `115/115` | +| Quote coverage | `115/115` | +| Expected evidence recall | `107/107` | This proves the fixture contract is broad and well controlled. It does not prove that every live adapter or every competitor runtime passes those scenarios. @@ -136,7 +136,7 @@ one misleading score. | Source of truth | ELF has the strongest measured source-of-truth evidence. | Borrow memsearch's local canonical-store ergonomics without making files or vectors authoritative. | | Temporal memory | ELF fixture passes, but live memory evolution is wrong_result. | Prioritize current-vs-historical evidence links and Graphiti/Zep-style validity windows. 
| | Consolidation | ELF fixture passes and XY-934 adds live service-backed proposal materialization, lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit; direct competitor runners remain untested. | Keep derived proposal review as the safety boundary and add competitor/reference runners only when they emit comparable artifacts. | -| Knowledge pages | ELF fixture pages pass; live knowledge generation is not encoded. | Borrow llm-wiki lint/query-save loops, gbrain timelines, and graphify reports behind rebuild/lint benchmarks. | +| Memory summaries and knowledge pages | ELF fixture pages pass, and XY-952 adds a fixture-backed `memory_summary` source-trace contract; live top-of-mind behavior and live knowledge generation are not encoded. | Borrow llm-wiki lint/query-save loops, gbrain timelines, graphify reports, and managed-memory review patterns behind source-linked summary and rebuild/lint benchmarks. | | Operator debugging | Fixture UX passes and the narrow live trace/viewer slice is scored: ELF passes, qmd ties replay/repair clarity but is wrong_result for trace hydration and candidate-drop visibility. | Expand coverage to OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | ELF live capture/write-policy self-check passes with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem is `not_encoded`. | Borrow agentmemory/claude-mem capture breadth only after durable local hook/viewer evidence exists, while preserving redaction and evidence binding. | | Production ops | ELF has the strongest checked-in evidence, with private/credential gates blocked. | Keep Docker-first production proof and add private corpus only when an operator-owned manifest exists. 
| diff --git a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md index df37634e..e5b9c128 100644 --- a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md +++ b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md @@ -7,8 +7,8 @@ need the baseline command matrix, typed evidence status, post-stage outcome, and report shape required before claiming the stage improved. Inputs: `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`, the June 11 competitor-strength, temporal-history, and iteration-direction reports, the XY-905 -June 16 live temporal reconciliation report, the consolidation proposal spec, and the -checked-in real-world fixture suites. +June 16 live temporal reconciliation report, the consolidation proposal spec, the +memory summary spec, and the checked-in real-world fixture suites. Outputs: A stage-by-stage ledger that downstream issues can update with `improved`, `regressed`, `unchanged`, `blocked`, or `not_tested` judgments. @@ -20,14 +20,13 @@ and now includes the XY-905 post-stage result for live temporal reconciliation. Current stage status: -- `improved`: current-vs-historical correctness, preference evolution, and - reviewable consolidation. +- `improved`: current-vs-historical correctness, preference evolution, reviewable + consolidation, and memory-summary/top-of-mind fixture readback. - `regressed`: none. - `unchanged`: deletion/TTL/tombstone behavior and the final competitor retest baseline. - `blocked`: scheduled-memory-task readiness. -- `not_tested`: memory-summary/top-of-mind live behavior and proactive brief - readiness. +- `not_tested`: proactive brief readiness. 
The known live `memory_evolution` loss is now repaired for the encoded ELF live adapter slice: the XY-905 run passes all six memory-evolution jobs and reports @@ -40,6 +39,12 @@ service-backed proposal materialization, source lineage, confidence/usefulness, unsupported-claim flags, apply/defer/discard audit transitions, and zero source mutations. Direct competitor runners remain untested or product-reference only. +Memory summary and top-of-mind behavior is improved only at the fixture-backed +contract level: XY-952 adds a reviewable `elf.memory_summary/v1` source-trace fixture +that distinguishes current top-of-mind, background, stale, superseded, tombstoned, and +derived project-profile entries. It does not prove live top-of-mind product behavior or +parity with managed memory products. + ## Ledger Rules - Every downstream Dreaming or competitor-improvement stage must write a post-stage @@ -64,7 +69,7 @@ mutations. Direct competitor runners remain untested or product-reference only. | Preference evolution and correction history | `cargo make real-world-memory-evolution`; `cargo make real-world-memory-live-adapters`; `cargo make openmemory-ui-export-readback` | Same commands; include mem0/OpenMemory boundary evidence | `pass=0`, `wrong_result=1`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Measure preference correction against mem0/OpenMemory history and UI/export surfaces before making any broader history-quality claim. | | Deletion, TTL, and tombstone behavior | `cargo make real-world-memory`; `cargo make real-world-memory-live-adapters` | Same commands | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `unchanged` | Extend tombstone and TTL readback beyond the single encoded job into update/delete/recreate history cases. 
| | Reviewable consolidation | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Keep Dreaming output derived and reviewable, and add direct competitor/reference runners only when they emit comparable source ids, confidence, unsupported-claim flags, and review audit artifacts. | -| Memory summary and top-of-mind behavior | `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival` | Same commands plus `cargo make real-world-memory-live-adapters` | `pass=8`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | not run by XY-905 | `not_tested` | Build summaries as cited, rebuildable derived pages or core blocks; do not turn hidden summaries into authoritative memory. | +| Memory summary and top-of-mind behavior | `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival` | `cargo make real-world-memory-summary`; `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival`; `cargo make real-world-memory-live-adapters` | `pass=8`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=9`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Move from fixture-backed summary/source-trace readback into service-native admin readback and later live top-of-mind behavior; do not turn hidden summaries into authoritative memory. 
| | Proactive brief readiness | `cargo make real-world-first-generation-oss`; `cargo make real-world-job-operator-ux` | Same commands plus `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | not run by XY-905 | `not_tested` | Add direct proactive-brief fixtures before any pass claim; briefs must be source-linked and repairable. | | Scheduled memory task readiness | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0` | not run by XY-905 | `blocked` | Scheduled runs are future work; start with queued derived proposal runs and keep operator review mandatory. | | Final competitor retest status | `cargo make real-world-memory-live-adapters`; `cargo make real-world-first-generation-oss`; `cargo make real-world-memory-graph-rag`; `cargo make openmemory-ui-export-readback`; `cargo make baseline-production-private-addendum` when operator input exists | Same commands; private/provider commands may remain typed blocked under XY-930 | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11`, `not_encoded=11` | partial XY-905 evidence: ELF live adapter `pass=40`, `wrong_result=0`, `blocked=5`, `not_encoded=10` | `unchanged` | Rerun the broader competitor matrix after each optimization; the XY-905 live adapter improvement does not replace private/provider or external competitor gates. | @@ -77,7 +82,7 @@ mutations. Direct competitor runners remain untested or product-reference only. 
| Preference evolution and correction history | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json` | | Deletion, TTL, and tombstone behavior | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md` | | Reviewable consolidation | `docs/spec/system_consolidation_proposals_v1.md`; `apps/elf-eval/fixtures/real_world_memory/consolidation/`; `docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md`; `docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json` | -| Memory summary and top-of-mind behavior | `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Memory summary and top-of-mind behavior | `docs/spec/system_memory_summary_v1.md`; `apps/elf-eval/fixtures/real_world_memory/memory_summary/`; `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | | Proactive brief readiness | `docs/research/2026-06-08-agent-memory-selection.json`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | | Scheduled memory task readiness | `docs/spec/system_consolidation_proposals_v1.md`; `docs/research/2026-06-08-agent-memory-selection.json` | | Final competitor 
retest status | `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/research/2026-06-11-competitor-strength-adoption-report.json`; `docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | @@ -109,6 +114,8 @@ Allowed: files. - The current ledger preserves typed non-pass states and records the XY-905 live memory-evolution improvement. +- The current ledger records the XY-952 fixture-backed memory-summary/source-trace + contract improvement. - Fixture-backed knowledge and core/archival jobs can be used as regression guards for report shape. - Reviewable consolidation now has ELF live service-backed proposal scoring evidence, @@ -117,8 +124,8 @@ Allowed: Not allowed: - Do not claim this ledger proves preference history against mem0/OpenMemory, - proactive briefs, scheduled tasks, private-corpus gates, hosted memory, broad - consolidation superiority, or competitor adapters. + live top-of-mind behavior, proactive briefs, scheduled tasks, private-corpus gates, + hosted memory, broad consolidation superiority, or competitor adapters. - Do not claim ELF has full-suite live real-world pass evidence. - Do not claim private-corpus or provider-backed production quality without the operator-owned inputs required by XY-930. diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index ce1bcc1d..2527bb5c 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -229,12 +229,15 @@ research gates. Its `external_adapters` report section distinguishes: - `research_gate`: checked-in source/setup/runtime/resource/retry metadata for a future adapter path, not fixture-backed or live execution evidence. 
-Current fixture state: `cargo make real-world-memory` covers 49 jobs across 13 suites, -with 44 pass and 5 blocked. The added `core_archival_memory` suite contributes six -passing fixture jobs for core block attachment, scope, provenance, stale-core -detection, archival fallback, and project-decision recovery. The blocked jobs are -production-ops operator boundaries plus the XY-928 OpenViking `context_trajectory` -gates for staged retrieval, hierarchy selection, and recursive/context expansion. +Current fixture state: `cargo make real-world-memory` covers 50 jobs across 14 suites, +with 45 pass and 5 blocked. The `core_archival_memory` suite contributes six passing +fixture jobs for core block attachment, scope, provenance, stale-core detection, +archival fallback, and project-decision recovery. The `memory_summary` suite +contributes one passing fixture-backed source-trace job for reviewable current, +background, stale, superseded, tombstoned, and derived project-profile entries. The +blocked jobs are production-ops operator boundaries plus the XY-928 OpenViking +`context_trajectory` gates for staged retrieval, hierarchy selection, and +recursive/context expansion. Current live-adapter state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full checked-in suite sweep through `cargo make real-world-memory-live-adapters`. 
Each adapter diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index bc5761b4..6384763e 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -40,7 +40,12 @@ { "command": "cargo make real-world-memory", "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "claim": "ELF fixture aggregate covers 49 jobs across 13 suites with 44 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates, including 6 passing core_archival_memory jobs." + "claim": "ELF fixture aggregate covers 50 jobs across 14 suites with 45 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates, including 6 passing core_archival_memory jobs and 1 passing memory_summary source-trace job." + }, + { + "command": "cargo make real-world-memory-summary", + "artifact": "tmp/real-world-memory/memory-summary/report.json", + "claim": "The memory summary fixture scores reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile entries with source refs, freshness metadata, rationale, and unsupported-claim flags; this is fixture-backed contract evidence, not managed-memory parity." 
}, { "command": "cargo make real-world-memory-core-archival", diff --git a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json index 76104dc5..1ba0eef5 100644 --- a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json +++ b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json @@ -4,7 +4,7 @@ "authority": "XY-951", "created_at": "2026-06-16T00:00:00Z", "purpose": "Define the benchmark evidence gate that every Dreaming-inspired ELF optimization stage must update before claiming completion.", - "source_evidence_cutoff": "Checked-in benchmark and research evidence through the XY-905 live temporal reconciliation run and XY-934 live consolidation proposal scoring run on 2026-06-16; no private-corpus or provider-backed production pass is claimed by this ledger.", + "source_evidence_cutoff": "Checked-in benchmark and research evidence through the XY-905 live temporal reconciliation run, XY-934 live consolidation proposal scoring run, and XY-952 fixture-backed memory summary/source-trace contract on 2026-06-16; no private-corpus or provider-backed production pass is claimed by this ledger.", "typed_status_terms": [ "pass", "wrong_result", @@ -43,7 +43,8 @@ "improved": [ "current_vs_historical_correctness", "preference_evolution", - "reviewable_consolidation" + "reviewable_consolidation", + "memory_summary_top_of_mind_behavior" ], "regressed": [], "unchanged": [ @@ -54,7 +55,6 @@ "scheduled_memory_task_readiness" ], "not_tested": [ - "memory_summary_top_of_mind_behavior", "proactive_brief_readiness" ] }, @@ -288,7 +288,7 @@ { "stage_id": "memory_summary_top_of_mind_behavior", "stage_name": "Memory summary and top-of-mind behavior", - "dependent_issue": "XY-926", + "dependent_issue": "XY-952", "evidence_class": "fixture_backed", "baseline_commands": [ { @@ -303,6 +303,10 @@ } ], "post_stage_commands": [ + { + "command": "cargo make real-world-memory-summary", + "required_artifact": 
"tmp/real-world-memory/memory-summary/report.json" + }, { "command": "cargo make real-world-memory-knowledge", "required_artifact": "tmp/real-world-memory/knowledge-report.json" @@ -317,6 +321,8 @@ } ], "evidence_files": [ + "docs/spec/system_memory_summary_v1.md", + "apps/elf-eval/fixtures/real_world_memory/memory_summary/", "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", "apps/elf-eval/fixtures/real_world_memory/knowledge/", "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/" @@ -329,10 +335,18 @@ "not_encoded": 1 }, "baseline_basis": "Knowledge and core/archival fixtures pass, but live knowledge compilation and top-of-mind product behavior are not encoded.", - "comparison_judgment": "not_tested", + "post_stage_counts": { + "pass": 9, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "post_stage_basis": "XY-952 adds one fixture-backed memory_summary job with top-of-mind, background, stale, superseded, tombstone, and derived project-profile entries, source refs, freshness metadata, rationale, and unsupported-claim flags.", + "comparison_judgment": "improved", "regression_rule": "Any stale summary, unsupported section, missing source id, or stale core block presented as current is a regression.", - "improvement_rule": "An improvement requires live top-of-mind or summary readback that remains source-linked and linted for stale/unsupported claims.", - "next_optimization_direction": "Build summaries as derived, cited, rebuildable pages or core blocks; do not replace authoritative notes with hidden summaries." 
+ "improvement_rule": "An improvement requires top-of-mind or summary readback that remains source-linked, exposes freshness and rationale, and fails stale-current or unsupported-derived claims.", + "next_optimization_direction": "Move from fixture-backed summary/source-trace readback into service-native admin readback and later live top-of-mind behavior without replacing authoritative notes with hidden summaries." }, { "stage_id": "proactive_brief_readiness", diff --git a/docs/spec/index.md b/docs/spec/index.md index 353bb63f..86c90cd8 100644 --- a/docs/spec/index.md +++ b/docs/spec/index.md @@ -35,6 +35,8 @@ Question this index answers: "what must remain true?" and storage invariants. - `system_consolidation_proposals_v1.md`: Reviewable derived consolidation run and proposal contract over immutable source evidence. +- `system_memory_summary_v1.md`: Reviewable current/background/stale/superseded/ + tombstoned/derived memory summary and source-trace contract. - `system_knowledge_pages_v1.md`: Derived project/entity/concept/issue/decision page storage, rebuild, citation, and stale-source lint contract. - `system_competitive_parity_gate_v1.md`: Docker-only adoption gate that decides diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index d0e58c5c..b371e9a5 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -69,6 +69,7 @@ runner execution. "operator_debug": {}, "encoding": {}, "memory_evolution": {}, + "memory_summary": {}, "tags": [] } ``` @@ -92,6 +93,7 @@ runner execution. | `operator_debug` | object or null | Optional for most suites; required for `operator_debugging_ux` jobs. Records trace/viewer evidence and operator workflow scoring inputs. | | `encoding` | object | Optional job-level limitation declaration. Only `not_encoded`, `blocked`, and `incomplete` statuses are allowed here. 
| | `memory_evolution` | object or null | Optional for most suites; used by `memory_evolution` jobs to report current evidence, historical evidence, stale traps, conflicts, update rationale, and temporal-validity limitations. | +| `memory_summary` | object or null | Optional for most suites; used by `memory_summary` jobs to report reviewable summary/source-trace metrics defined in `system_memory_summary_v1.md`. | | `tags` | array | Optional labels such as `private_corpus`, `synthetic`, `adapter_required`, or `no_live_claim`. | ### `corpus` @@ -538,6 +540,7 @@ Suite ids are stable public names. Each suite MUST contain at least one | `retrieval` | Measure task-relevant retrieval quality beyond top-k keyword matching. | Answer a task query with expected evidence; find alternate phrasing; avoid near-duplicate project evidence. | Expected evidence ids, allowed alternates, decoy evidence ids, trace ids when available. | answer_correctness, evidence_grounding, trap_avoidance, latency_resource. | qmd, ELF, memsearch, OpenViking. | | `memory_evolution` | Verify updates, deletes, expiry, supersession, contradiction handling, and history. | Apply a new preference; suppress a deleted memory; explain what superseded an old fact. | Before/after memory versions, ingest decision rows or adapter history, current timeline event. | lifecycle_behavior, answer_correctness, evidence_grounding, trap_avoidance. | mem0, ELF, Graphiti/Zep, Letta. | | `consolidation` | Test reviewable derived memory formation without hidden source mutation. | Produce a consolidation proposal; identify unsupported claims; discard stale synthesis. | Source inputs, derived proposal id, lineage, review state, conflict markers. | answer_correctness, evidence_grounding, uncertainty_handling, debuggability. | Claude Dreams, Gemini CLI Auto Memory, Always-On Memory Agent, ELF. | +| `memory_summary` | Test reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile memory readback. 
| Produce a current memory summary; downgrade stale memory; expose a TTL tombstone; refuse an unsupported derived profile claim. | Summary entry source refs, freshness and validity markers, source trace, inclusion/downgrade/exclusion rationale, unsupported-claim flags. | answer_correctness, evidence_grounding, lifecycle_behavior, trap_avoidance, uncertainty_handling. | OpenAI Dreaming, Claude Dreams, Always-On Memory Agent, ELF. | | `knowledge_compilation` | Compile evidence into maintained project/entity/concept pages while preserving provenance. | Build a project status page; answer from compiled truth plus timeline; lint a stale page section. | Page section sources, backlinks, timeline entries, lint evidence. | answer_correctness, evidence_grounding, workflow_helpfulness, trap_avoidance. | llm-wiki, gbrain, graphify, ELF. | | `operator_debugging_ux` | Show whether a wrong or ambiguous memory result can be debugged without raw store spelunking. | Explain why a result ranked first; inspect a trace; identify which stage dropped expected evidence. | Trace bundle, retrieval trajectory, candidate metrics, viewer or CLI readback. | debuggability, evidence_grounding, workflow_helpfulness, answer_correctness. | claude-mem, qmd, agentmemory, ELF. | | `capture_integration` | Evaluate how accurately work observations become usable memory across agents and tools. | Capture a session decision; exclude private spans; import external agent observations. | Hook/import logs, write policy audits, excluded spans, resulting note ids. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior. | agentmemory, claude-mem, memsearch, mem0. | @@ -614,6 +617,22 @@ conflict detection counts, update rationale availability, and temporal-validity `not_encoded` counts. A temporal graph validity job MUST NOT be reported as `pass` unless the runner can evaluate current-only versus historical relation facts. 
+Reports that encode `memory_summary` jobs MUST also include: + +- summary artifact count and entry count; +- source-ref coverage for included or downgraded summary entries; +- freshness-marker and rationale coverage; +- stale-current violation count for top-of-mind entries; +- derived entries missing both source refs and unsupported-claim flags; +- unsupported derived candidate count; +- unsupported derived entries included as current memory. + +A `memory_summary` job MUST NOT pass when stale, superseded, or tombstoned entries are +presented as current top-of-mind facts. A derived project-profile entry MUST NOT pass +unless it has source refs or explicit unsupported-claim flags. A derived entry with +unsupported-claim flags MUST NOT pass when it is included as current memory instead of +being excluded or downgraded for review. + Consolidation suite reports MUST also include: - proposal usefulness score, or `null` when the job has no proposal payloads; diff --git a/docs/spec/system_elf_memory_service_v2.md b/docs/spec/system_elf_memory_service_v2.md index 1d19df90..b33588e9 100644 --- a/docs/spec/system_elf_memory_service_v2.md +++ b/docs/spec/system_elf_memory_service_v2.md @@ -1115,6 +1115,23 @@ Behavior: knowledge page snippets wherever surfaced. - The detailed contract is defined in `system_knowledge_pages_v1.md`. +Admin reviewable memory summary readback: + +Behavior: +- Memory summary readback is a derived, reviewable artifact surface, not + authoritative note search and not a hidden note rewrite path. +- Summary entries must follow `elf.memory_summary/v1`, carry source refs, freshness or + validity metadata, and inclusion/downgrade/exclusion rationale for top-of-mind, + background, stale, superseded, tombstoned, and derived project-profile entries. +- Stale, superseded, or tombstoned entries must not be returned as current + top-of-mind facts. 
+- Derived project-profile entries must either cite source refs or carry explicit + unsupported-claim flags when excluded. +- Memory summaries must not call provider adapters, mutate authoritative source notes, + create Qdrant points, create search sessions, or record note hits in v1 contract + validation. +- The detailed contract is defined in `system_memory_summary_v1.md`. + POST /v2/admin/qdrant/rebuild Behavior: diff --git a/docs/spec/system_memory_summary_v1.md b/docs/spec/system_memory_summary_v1.md new file mode 100644 index 00000000..0db2fe57 --- /dev/null +++ b/docs/spec/system_memory_summary_v1.md @@ -0,0 +1,171 @@ +# Reviewable Memory Summary v1 Specification + +Purpose: Define the reviewable memory summary and source-trace contract. +Status: normative +Read this when: You are implementing, validating, or reviewing summary readback for top-of-mind, background, stale, superseded, tombstoned, or derived project-profile memory. +Not this document: Scheduled background jobs, polished viewer UI, live provider generation, or authoritative note mutation. +Defines: `elf.memory_summary/v1` summary artifacts, entries, source traces, freshness markers, and inclusion rationale. + +## Core Rule + +Memory summaries are derived readback artifacts. They must never replace, rewrite, +delete, deprecate, or silently update authoritative notes, docs, event audits, graph +facts, consolidation proposals, traces, or source pointers. + +Postgres remains the source of truth for source memory. A summary may be rebuilt, +discarded, archived, or regenerated without changing the source memory that produced +it. A summary is useful only when an operator can inspect why each entry is current, +background, stale, superseded, tombstoned, or excluded. + +## Contract Schema + +Canonical schema identifier: + +```text +elf.memory_summary/v1 +``` + +Every persisted or benchmarked summary artifact must carry +`contract_schema = "elf.memory_summary/v1"`. 
+ +## Summary Artifact + +Required fields: + +- `summary_id`: stable summary artifact id. +- `contract_schema`: `elf.memory_summary/v1`. +- `generated_at`: RFC3339 timestamp for the readback artifact. +- `tenant_id`, `project_id`, `agent_id`, and `read_profile`: context used to build the + readback. +- `entries`: non-empty array of summary entries. +- `source_trace`: source selection and exclusion metadata. + +The artifact may include provider metadata in future lanes, but v1 summary readback +does not require provider execution and must not hide source selection behind provider +state. + +## Entry Categories + +`entries[].category` must be one of: + +- `top_of_mind`: current high-priority memory that may be attached or shown first. +- `background`: current lower-priority memory that is useful context but not urgent. +- `stale`: non-current memory retained only to explain why it is stale. +- `superseded`: historical memory replaced by newer source evidence. +- `tombstone`: delete, TTL, invalidation, or suppression evidence. +- `derived_project_profile`: derived profile or project-summary entry. + +`top_of_mind` entries must have `freshness.status = "current"`. A stale, +superseded, tombstoned, historical, unsupported, or unknown entry must not be surfaced +as top-of-mind. + +## Entry Contract + +Each summary entry must include: + +- `entry_id`: stable id within the summary. +- `category`: one of the categories above. +- `text`: bounded English summary text. +- `source_refs`: source evidence ids or source-ref handles used for the entry. +- `freshness`: validity metadata. +- `rationale`: inclusion, downgrade, or exclusion rationale. +- `unsupported_claim_flags`: reviewer prompts for claims that are not supported well + enough to include as current derived memory. + +`source_refs` must be non-empty for every included or downgraded entry. 
A +`derived_project_profile` entry may have empty `source_refs` only when +`rationale.decision = "excluded"` and `unsupported_claim_flags` is non-empty. That +shape records a refused derived claim, not a usable memory entry. + +## Freshness + +`freshness` must include: + +- `status`: one of `current`, `background`, `historical`, `stale`, `superseded`, + `tombstoned`, or `unsupported`. +- `observed_at`: RFC3339 timestamp when the source was observed, or `null` when the + source is intentionally untimed. +- `valid_from`: RFC3339 timestamp or `null`. +- `valid_to`: RFC3339 timestamp or `null`. +- `last_confirmed_at`: RFC3339 timestamp or `null`. +- `superseded_by`: array of entry ids or source ids that supersede this entry. +- `tombstone_refs`: array of source ids or source-ref handles proving deletion, TTL + expiry, invalidation, or suppression. + +For `category = "superseded"`, `freshness.superseded_by` must be non-empty. +For `category = "tombstone"`, `freshness.tombstone_refs` must be non-empty and +`freshness.status` must be `tombstoned`. + +## Rationale + +`rationale` must include: + +- `decision`: one of `included`, `downgraded`, or `excluded`. +- `reason_code`: stable code for why the entry appears in its category. +- `reason`: reviewer-facing explanation. + +Allowed reason-code families: + +- `TOP_OF_MIND_*` +- `BACKGROUND_*` +- `DOWNGRADED_STALE_*` +- `SUPERSEDED_*` +- `TOMBSTONE_*` +- `DERIVED_PROFILE_*` +- `EXCLUDED_UNSUPPORTED_*` + +The rationale must say why an entry is included, downgraded, or excluded. It is not +enough to say that an entry exists. + +## Source Trace + +`source_trace` must include: + +- `selected_source_refs`: sources used for included or downgraded entries. +- `dropped_source_refs`: candidates not used in the final summary. +- `stale_source_refs`: stale source candidates and their downgrade reason. +- `superseded_source_refs`: superseded sources and the source that superseded them. 
+- `tombstone_source_refs`: tombstone or TTL invalidation sources. +- `unsupported_claim_flags`: page-level or entry-level unsupported derived claims. + +Each source trace item should preserve source status, source `updated_at` or +equivalent freshness timestamp when available, and source snapshot metadata. Empty +trace arrays are allowed only when the category is absent from the summary. + +## Readback Rules + +Summary readback must: + +- Label the artifact as derived and reviewable. +- Return entries with source refs, freshness metadata, and rationale. +- Preserve current-vs-historical truth: current facts may be top-of-mind, while old + facts must be stale, superseded, tombstoned, or excluded. +- Preserve tombstones and TTL invalidations as suppression evidence instead of + restating the deleted fact as current. +- Preserve unsupported derived candidates as reviewer prompts, not as current facts. + +Summary readback must not: + +- Present a stale, superseded, or tombstoned source as current top-of-mind memory. +- Treat a derived profile entry as authoritative source memory. +- Omit source refs from included or downgraded entries. +- Include a derived project-profile entry with neither source refs nor unsupported + claim flags. +- Claim parity with managed memory or Dreaming products from this local contract alone. + +## Benchmark Requirements + +The `memory_summary` real-world benchmark suite must fail when: + +- stale, superseded, or tombstoned entries appear as current top-of-mind facts; +- included or downgraded entries lack source refs; +- entries lack freshness or rationale metadata; +- derived project-profile entries lack both source refs and unsupported-claim flags; +- unsupported derived claims are silently included as current memory. + +Unsupported derived claims may appear only as reviewer prompts. A summary entry with +non-empty `unsupported_claim_flags` must not also be included as current memory. + +Fixture-backed evidence proves only the contract shape. 
Live top-of-mind behavior and +scheduled background generation require separate live reports before product-quality +claims are allowed.