From 40ede386711307f9cfa8d674806ee636628a4a1d Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 9 Jun 2026 23:11:57 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add real-world memory evolution benchmark cases","authority":"XY-846"} --- Makefile.toml | 52 ++ .../benchmark_conclusion_overturned.json | 263 +++++++ .../deployment_method_superseded.json | 226 ++++++ .../evolution/issue_blocked_to_done.json | 221 ++++++ ...ference_changed_current_vs_historical.json | 224 ++++++ ...elation_temporal_validity_not_encoded.json | 199 ++++++ .../src/bin/real_world_job_benchmark.rs | 668 +++++++++++++++++- .../tests/real_world_job_benchmark.rs | 197 +++++- docs/guide/benchmarking/index.md | 3 + .../benchmarking/live_baseline_benchmark.md | 11 + .../real_world_agent_memory_benchmark.md | 30 +- .../real_world_memory_evolution.md | 64 ++ .../real_world_agent_memory_benchmark_v1.md | 47 +- 13 files changed, 2167 insertions(+), 38 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/evolution/benchmark_conclusion_overturned.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/evolution/deployment_method_superseded.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/evolution/issue_blocked_to_done.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/evolution/preference_changed_current_vs_historical.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity_not_encoded.json create mode 100644 docs/guide/benchmarking/real_world_memory_evolution.md diff --git a/Makefile.toml b/Makefile.toml index 8eb6cf43..ed9a5405 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -400,6 +400,9 @@ args = [ # | real-world-memory | composite | | # | real-world-memory-json | command | | # | real-world-memory-report | command | | +# | real-world-memory-evolution | composite | | +# | real-world-memory-evolution-json | command | | +# | real-world-memory-evolution-report | command | | 
# | real-world-job-operator-ux | composite | | # | real-world-job-operator-ux-json | command | | # | real-world-job-operator-ux-report | command | | @@ -496,6 +499,55 @@ args = [ "tmp/real-world-memory/real-world-memory-report.md", ] +[tasks.real-world-memory-evolution] +workspace = false +dependencies = [ + "real-world-memory-evolution-report", +] + +[tasks.real-world-memory-evolution-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/evolution", + "--out", + "tmp/real-world-memory/evolution-report.json", + "--run-id", + "real-world-memory-evolution", + "--adapter-id", + "fixture_memory_evolution", + "--adapter-name", + "ELF fixture memory evolution", +] + +[tasks.real-world-memory-evolution-report] +workspace = false +dependencies = [ + "real-world-memory-evolution-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/evolution-report.json", + "--out", + "tmp/real-world-memory/evolution-report.md", +] + [tasks.real-world-job-operator-ux] workspace = false dependencies = [ diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/benchmark_conclusion_overturned.json b/apps/elf-eval/fixtures/real_world_memory/evolution/benchmark_conclusion_overturned.json new file mode 100644 index 00000000..0d694597 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/evolution/benchmark_conclusion_overturned.json @@ -0,0 +1,263 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-benchmark-verdict-001", + "suite": "memory_evolution", + "title": "Use the current production adoption verdict after an older conclusion changed", + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "verdict-old-not-ready", + 
"kind": "decision", + "text": "Earlier conclusion: ELF was not production ready because private corpus and restore proof were missing.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "benchmark_conclusion_overturned", + "evidence_id": "verdict-old-not-ready" + } + }, + "created_at": "2026-06-07T00:00:00Z" + }, + { + "evidence_id": "verdict-current-ready-bounded", + "kind": "decision", + "text": "Production adoption gate on 2026-06-09 says ELF is ready for personal production use with bounded caveats.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "benchmark_conclusion_overturned", + "evidence_id": "verdict-current-ready-bounded" + } + }, + "created_at": "2026-06-09T00:00:00Z" + }, + { + "evidence_id": "verdict-bounded-private-caveat", + "kind": "decision", + "text": "The private production corpus was not run; the gate records it as a bounded caveat, not a private-corpus pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "benchmark_conclusion_overturned", + "evidence_id": "verdict-bounded-private-caveat" + } + }, + "created_at": "2026-06-09T00:05:00Z" + }, + { + "evidence_id": "verdict-update-rationale", + "kind": "decision", + "text": "The verdict changed after provider-backed synthetic, stress, backfill, and restore proof evidence was recorded.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "benchmark_conclusion_overturned", + "evidence_id": "verdict-update-rationale" + } + }, + "created_at": "2026-06-09T00:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_memory_evolution", + "answer": { + "content": "The current verdict is that ELF is ready for personal production use with bounded caveats; the older not-ready conclusion is historical, and the private corpus remains an 
explicit caveat rather than a private-corpus pass.", + "claims": [ + { + "claim_id": "current_benchmark_verdict", + "text": "ELF is ready for personal production use with bounded caveats.", + "evidence_ids": [ + "verdict-current-ready-bounded", + "verdict-old-not-ready", + "verdict-update-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "benchmark_update_rationale", + "text": "The verdict changed after provider-backed benchmark and restore proof evidence was recorded.", + "evidence_ids": ["verdict-update-rationale"], + "confidence": "high" + }, + { + "claim_id": "private_corpus_caveat", + "text": "The private corpus remains a bounded caveat rather than a private-corpus pass.", + "evidence_ids": ["verdict-bounded-private-caveat"], + "confidence": "high" + } + ], + "evidence_ids": [ + "verdict-current-ready-bounded", + "verdict-bounded-private-caveat", + "verdict-update-rationale" + ], + "latency_ms": 1.5, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "verdict-not-ready", + "ts": "2026-06-07T00:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": ["verdict-old-not-ready"], + "summary": "The older verdict said ELF was not ready." + }, + { + "event_id": "verdict-ready", + "ts": "2026-06-09T00:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": [ + "verdict-current-ready-bounded", + "verdict-bounded-private-caveat", + "verdict-update-rationale" + ], + "summary": "The adoption gate changed the current verdict and preserved the private-corpus caveat." 
+ } + ], + "prompt": { + "role": "user", + "content": "What is the current benchmark adoption conclusion, and what older conclusion changed?", + "job_mode": "decide", + "constraints": ["cite_evidence", "distinguish_current_from_historical", "state_caveats"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_benchmark_verdict", + "text": "ELF is ready for personal production use with bounded caveats." + }, + { + "claim_id": "benchmark_update_rationale", + "text": "The verdict changed after provider-backed benchmark and restore proof evidence was recorded." + }, + { + "claim_id": "private_corpus_caveat", + "text": "The private corpus remains a bounded caveat rather than a private-corpus pass." + } + ], + "must_not_include": [ + "ELF is not ready for personal production use.", + "The private production corpus passed." + ], + "evidence_links": { + "current_benchmark_verdict": [ + "verdict-current-ready-bounded", + "verdict-old-not-ready", + "verdict-update-rationale" + ], + "benchmark_update_rationale": ["verdict-update-rationale"], + "private_corpus_caveat": ["verdict-bounded-private-caveat"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "verdict-current-ready-bounded", + "claim_id": "current_benchmark_verdict", + "requirement": "cite", + "quote": "ready for personal production use with bounded caveats" + }, + { + "evidence_id": "verdict-bounded-private-caveat", + "claim_id": "private_corpus_caveat", + "requirement": "cite", + "quote": "bounded caveat, not a private-corpus pass" + }, + { + "evidence_id": "verdict-update-rationale", + "claim_id": "benchmark_update_rationale", + "requirement": "explain", + "quote": "provider-backed synthetic, stress, backfill, and restore proof" + } + ], + "negative_traps": [ + { + "trap_id": "old-not-ready-verdict-current", + "type": "stale_fact", + "evidence_ids": 
["verdict-old-not-ready"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Reports the current adoption verdict and historical supersession." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the current verdict and private-corpus caveat." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites current verdict, caveat, and rationale evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not report the old not-ready verdict as current." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["verdict-current-ready-bounded"], + "historical_evidence_ids": ["verdict-old-not-ready"], + "stale_trap_ids": ["old-not-ready-verdict-current"], + "conflicts": [ + { + "conflict_id": "benchmark-verdict-overturned", + "claim_id": "current_benchmark_verdict", + "current_evidence_id": "verdict-current-ready-bounded", + "historical_evidence_id": "verdict-old-not-ready", + "resolved_by_evidence_id": "verdict-update-rationale" + } + ], + "update_rationale": { + "claim_id": "benchmark_update_rationale", + "evidence_ids": ["verdict-update-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "memory_evolution", + "reference_mem0_history", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/deployment_method_superseded.json 
b/apps/elf-eval/fixtures/real_world_memory/evolution/deployment_method_superseded.json new file mode 100644 index 00000000..f20d9f08 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/evolution/deployment_method_superseded.json @@ -0,0 +1,226 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-deploy-method-001", + "suite": "memory_evolution", + "title": "Prefer the superseding production deployment method over the old smoke path", + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "deploy-old-quickstart", + "kind": "runbook", + "text": "Old deployment method: use quickstart cargo run service terminals for local smoke only.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "deployment_method_superseded", + "evidence_id": "deploy-old-quickstart" + } + }, + "created_at": "2026-06-02T00:00:00Z" + }, + { + "evidence_id": "deploy-current-production-runbook", + "kind": "runbook", + "text": "Current single-user production operation uses Docker Compose production runbook with backup, restore, and Qdrant rebuild.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "deployment_method_superseded", + "evidence_id": "deploy-current-production-runbook" + } + }, + "created_at": "2026-06-09T00:00:00Z" + }, + { + "evidence_id": "deploy-supersession-rationale", + "kind": "decision", + "text": "Quickstart is no longer production guidance because backup, restore, rollback, and provider config handling must be explicit.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "deployment_method_superseded", + "evidence_id": "deploy-supersession-rationale" + } + }, + "created_at": "2026-06-09T00:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_memory_evolution", + "answer": { 
+ "content": "Use the Docker Compose production runbook with backup, restore, and Qdrant rebuild for production; the cargo run quickstart is only historical local-smoke guidance because production recovery handling must be explicit.", + "claims": [ + { + "claim_id": "current_deployment_method", + "text": "Use the Docker Compose production runbook with backup, restore, and Qdrant rebuild for production.", + "evidence_ids": [ + "deploy-current-production-runbook", + "deploy-old-quickstart", + "deploy-supersession-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "deployment_update_rationale", + "text": "The quickstart was superseded because production recovery handling must be explicit.", + "evidence_ids": ["deploy-supersession-rationale"], + "confidence": "high" + } + ], + "evidence_ids": [ + "deploy-current-production-runbook", + "deploy-supersession-rationale" + ], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "deploy-quickstart", + "ts": "2026-06-02T00:00:00Z", + "actor": "agent", + "action": "recorded_runbook", + "evidence_ids": ["deploy-old-quickstart"], + "summary": "The quickstart path existed for local smoke use." + }, + { + "event_id": "deploy-production-runbook", + "ts": "2026-06-09T00:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": ["deploy-current-production-runbook", "deploy-supersession-rationale"], + "summary": "The production runbook became the current production method." + } + ], + "prompt": { + "role": "user", + "content": "Which deployment path should I use for production now?", + "job_mode": "operate", + "constraints": ["cite_evidence", "distinguish_current_from_historical"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_deployment_method", + "text": "Use the Docker Compose production runbook with backup, restore, and Qdrant rebuild for production." 
+ }, + { + "claim_id": "deployment_update_rationale", + "text": "The quickstart was superseded because production recovery handling must be explicit." + } + ], + "must_not_include": [ + "Use quickstart cargo run service terminals for production." + ], + "evidence_links": { + "current_deployment_method": [ + "deploy-current-production-runbook", + "deploy-old-quickstart", + "deploy-supersession-rationale" + ], + "deployment_update_rationale": ["deploy-supersession-rationale"] + }, + "answer_type": "ops_runbook", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "deploy-current-production-runbook", + "claim_id": "current_deployment_method", + "requirement": "cite", + "quote": "Docker Compose production runbook" + }, + { + "evidence_id": "deploy-supersession-rationale", + "claim_id": "deployment_update_rationale", + "requirement": "explain", + "quote": "backup, restore, rollback" + } + ], + "negative_traps": [ + { + "trap_id": "old-quickstart-production", + "type": "stale_fact", + "evidence_ids": ["deploy-old-quickstart"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Chooses the superseding production runbook." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Answers with the current production method." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites current runbook and supersession rationale." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not turn the quickstart smoke path into production guidance." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["deploy-current-production-runbook"], + "historical_evidence_ids": ["deploy-old-quickstart"], + "stale_trap_ids": ["old-quickstart-production"], + "conflicts": [ + { + "conflict_id": "deployment-method-supersession", + "claim_id": "current_deployment_method", + "current_evidence_id": "deploy-current-production-runbook", + "historical_evidence_id": "deploy-old-quickstart", + "resolved_by_evidence_id": "deploy-supersession-rationale" + } + ], + "update_rationale": { + "claim_id": "deployment_update_rationale", + "evidence_ids": ["deploy-supersession-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "memory_evolution", + "reference_letta_core_block", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/issue_blocked_to_done.json b/apps/elf-eval/fixtures/real_world_memory/evolution/issue_blocked_to_done.json new file mode 100644 index 00000000..8fb40f85 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/evolution/issue_blocked_to_done.json @@ -0,0 +1,221 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-issue-state-001", + "suite": "memory_evolution", + "title": "Report an issue as done after an earlier blocker cleared", + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "issue-xy900-blocked", + "kind": "issue", + "text": "On 2026-06-06, XY-900 was blocked on missing real_world_job fixture/report 
implementation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "issue_blocked_to_done", + "evidence_id": "issue-xy900-blocked" + } + }, + "created_at": "2026-06-06T00:00:00Z" + }, + { + "evidence_id": "issue-xy900-done", + "kind": "issue", + "text": "On 2026-06-09, XY-900 is done after PR #200 added the real_world_job fixture/report implementation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "issue_blocked_to_done", + "evidence_id": "issue-xy900-done" + } + }, + "created_at": "2026-06-09T00:00:00Z" + }, + { + "evidence_id": "issue-xy900-resolution-rationale", + "kind": "decision", + "text": "The blocker cleared because the fixture/report runner now exists and publishes typed real-world job reports.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "issue_blocked_to_done", + "evidence_id": "issue-xy900-resolution-rationale" + } + }, + "created_at": "2026-06-09T00:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_memory_evolution", + "answer": { + "content": "XY-900 is currently done after PR #200; the earlier missing real_world_job fixture/report blocker is historical and cleared because the runner now publishes typed reports.", + "claims": [ + { + "claim_id": "current_issue_state", + "text": "XY-900 is currently done after PR #200.", + "evidence_ids": [ + "issue-xy900-done", + "issue-xy900-blocked", + "issue-xy900-resolution-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "issue_update_rationale", + "text": "The blocker cleared because the fixture/report runner now exists.", + "evidence_ids": ["issue-xy900-resolution-rationale"], + "confidence": "high" + } + ], + "evidence_ids": ["issue-xy900-done", "issue-xy900-resolution-rationale"], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + 
"output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy900-blocked", + "ts": "2026-06-06T00:00:00Z", + "actor": "agent", + "action": "hit_blocker", + "evidence_ids": ["issue-xy900-blocked"], + "summary": "The issue was blocked on missing fixture/report implementation." + }, + { + "event_id": "xy900-done", + "ts": "2026-06-09T00:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": ["issue-xy900-done", "issue-xy900-resolution-rationale"], + "summary": "The implementation landed and the blocker cleared." + } + ], + "prompt": { + "role": "user", + "content": "Is XY-900 still blocked, or is it done now?", + "job_mode": "resume", + "constraints": ["cite_evidence", "distinguish_current_from_historical"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_issue_state", + "text": "XY-900 is currently done after PR #200." + }, + { + "claim_id": "issue_update_rationale", + "text": "The blocker cleared because the fixture/report runner now exists." 
+ } + ], + "must_not_include": ["XY-900 is currently blocked."], + "evidence_links": { + "current_issue_state": [ + "issue-xy900-done", + "issue-xy900-blocked", + "issue-xy900-resolution-rationale" + ], + "issue_update_rationale": ["issue-xy900-resolution-rationale"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "issue-xy900-done", + "claim_id": "current_issue_state", + "requirement": "cite", + "quote": "XY-900 is done" + }, + { + "evidence_id": "issue-xy900-resolution-rationale", + "claim_id": "issue_update_rationale", + "requirement": "explain", + "quote": "fixture/report runner now exists" + } + ], + "negative_traps": [ + { + "trap_id": "old-issue-blocker-current", + "type": "stale_fact", + "evidence_ids": ["issue-xy900-blocked"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Reports the latest issue state rather than the historical blocker." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States that the issue is done and why." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses current completion and resolution evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not report the old blocker as current." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["issue-xy900-done"], + "historical_evidence_ids": ["issue-xy900-blocked"], + "stale_trap_ids": ["old-issue-blocker-current"], + "conflicts": [ + { + "conflict_id": "issue-state-blocked-to-done", + "claim_id": "current_issue_state", + "current_evidence_id": "issue-xy900-done", + "historical_evidence_id": "issue-xy900-blocked", + "resolved_by_evidence_id": "issue-xy900-resolution-rationale" + } + ], + "update_rationale": { + "claim_id": "issue_update_rationale", + "evidence_ids": ["issue-xy900-resolution-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "memory_evolution", + "reference_mem0_history", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/preference_changed_current_vs_historical.json b/apps/elf-eval/fixtures/real_world_memory/evolution/preference_changed_current_vs_historical.json new file mode 100644 index 00000000..bf5e93c7 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/evolution/preference_changed_current_vs_historical.json @@ -0,0 +1,224 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-preference-001", + "suite": "memory_evolution", + "title": "Apply the current user preference while preserving the historical one", + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "pref-old-terse-bullets", + "kind": "note", + "text": "On 2026-06-01, the user preferred terse bullet-only 
benchmark updates.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "preference_changed_current_vs_historical", + "evidence_id": "pref-old-terse-bullets" + } + }, + "created_at": "2026-06-01T00:00:00Z" + }, + { + "evidence_id": "pref-current-concise-rationale", + "kind": "note", + "text": "On 2026-06-08, the user changed preference to concise prose with explicit evidence before bullets.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "preference_changed_current_vs_historical", + "evidence_id": "pref-current-concise-rationale" + } + }, + "created_at": "2026-06-08T00:00:00Z" + }, + { + "evidence_id": "pref-update-rationale", + "kind": "decision", + "text": "The user said the earlier terse bullets hid rationale, so future benchmark updates should include concise rationale.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "preference_changed_current_vs_historical", + "evidence_id": "pref-update-rationale" + } + }, + "created_at": "2026-06-08T00:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_memory_evolution", + "answer": { + "content": "Use concise prose with explicit evidence before bullets; the terse bullet-only preference is historical because it hid rationale.", + "claims": [ + { + "claim_id": "current_preference", + "text": "Use concise prose with explicit evidence before bullets.", + "evidence_ids": [ + "pref-current-concise-rationale", + "pref-old-terse-bullets", + "pref-update-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "preference_update_rationale", + "text": "The preference changed because terse bullets hid rationale.", + "evidence_ids": ["pref-update-rationale"], + "confidence": "high" + } + ], + "evidence_ids": ["pref-current-concise-rationale", "pref-update-rationale"], + "latency_ms": 1.1, + "cost": { + "currency": 
"USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "preference-old", + "ts": "2026-06-01T00:00:00Z", + "actor": "user", + "action": "set_preference", + "evidence_ids": ["pref-old-terse-bullets"], + "summary": "The user initially preferred terse bullet-only benchmark updates." + }, + { + "event_id": "preference-current", + "ts": "2026-06-08T00:00:00Z", + "actor": "user", + "action": "updated_memory", + "evidence_ids": ["pref-current-concise-rationale", "pref-update-rationale"], + "summary": "The user changed the preference and gave the rationale." + } + ], + "prompt": { + "role": "user", + "content": "How should benchmark updates be written now, and what changed?", + "job_mode": "personalize", + "constraints": ["cite_evidence", "distinguish_current_from_historical"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_preference", + "text": "Use concise prose with explicit evidence before bullets." + }, + { + "claim_id": "preference_update_rationale", + "text": "The preference changed because terse bullets hid rationale." + } + ], + "must_not_include": [ + "Use terse bullet-only benchmark updates as the current preference." 
+ ], + "evidence_links": { + "current_preference": [ + "pref-current-concise-rationale", + "pref-old-terse-bullets", + "pref-update-rationale" + ], + "preference_update_rationale": ["pref-update-rationale"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "pref-current-concise-rationale", + "claim_id": "current_preference", + "requirement": "cite", + "quote": "changed preference to concise prose" + }, + { + "evidence_id": "pref-update-rationale", + "claim_id": "preference_update_rationale", + "requirement": "explain", + "quote": "terse bullets hid rationale" + } + ], + "negative_traps": [ + { + "trap_id": "old-terse-preference-current", + "type": "stale_fact", + "evidence_ids": ["pref-old-terse-bullets"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Chooses the current preference while preserving the historical version." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the current preference and update rationale." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the current preference and rationale evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not promote the stale preference as current." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["pref-current-concise-rationale"], + "historical_evidence_ids": ["pref-old-terse-bullets"], + "stale_trap_ids": ["old-terse-preference-current"], + "conflicts": [ + { + "conflict_id": "preference-style-supersession", + "claim_id": "current_preference", + "current_evidence_id": "pref-current-concise-rationale", + "historical_evidence_id": "pref-old-terse-bullets", + "resolved_by_evidence_id": "pref-update-rationale" + } + ], + "update_rationale": { + "claim_id": "preference_update_rationale", + "evidence_ids": ["pref-update-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "memory_evolution", + "reference_mem0_history", + "reference_letta_core_block", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity_not_encoded.json b/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity_not_encoded.json new file mode 100644 index 00000000..6c3a0c0f --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity_not_encoded.json @@ -0,0 +1,199 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-relation-temporal-001", + "suite": "memory_evolution", + "title": "Mark temporal relation validity as not encoded instead of faking a graph pass", + "encoding": { + "status": "not_encoded", + "reason": "ELF graph-lite currently returns bounded relation context, but this runner does not yet encode current-only versus historical 
temporal validity for relation facts.", + "follow_up": { + "title": "[ELF graph P1] Add temporal validity to graph-lite facts", + "reason": "Relation facts need valid_from and invalidated_at semantics before this job can claim a current-versus-historical graph pass." + } + }, + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "relation-old-owner", + "kind": "adapter_state", + "text": "Before 2026-06-06, Team Delta owned deployment method review.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "relation_temporal_validity_not_encoded", + "evidence_id": "relation-old-owner" + } + }, + "created_at": "2026-06-05T00:00:00Z" + }, + { + "evidence_id": "relation-current-owner", + "kind": "adapter_state", + "text": "Since 2026-06-08, Team Echo owns deployment method review.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "relation_temporal_validity_not_encoded", + "evidence_id": "relation-current-owner" + } + }, + "created_at": "2026-06-08T00:00:00Z" + }, + { + "evidence_id": "relation-owner-rationale", + "kind": "decision", + "text": "Ownership moved after single-user production runbook scope changed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "relation_temporal_validity_not_encoded", + "evidence_id": "relation-owner-rationale" + } + }, + "created_at": "2026-06-08T00:05:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "relation-old-owner", + "ts": "2026-06-05T00:00:00Z", + "actor": "agent", + "action": "recorded_relation", + "evidence_ids": ["relation-old-owner"], + "summary": "Team Delta was the historical owner." 
+ }, + { + "event_id": "relation-current-owner", + "ts": "2026-06-08T00:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": ["relation-current-owner", "relation-owner-rationale"], + "summary": "Team Echo became the current owner after the scope changed." + } + ], + "prompt": { + "role": "user", + "content": "Who currently owns deployment method review, and who owned it historically?", + "job_mode": "answer", + "constraints": ["cite_evidence", "distinguish_current_from_historical"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "relation_current_owner", + "text": "Team Echo currently owns deployment method review." + }, + { + "claim_id": "relation_historical_owner", + "text": "Team Delta owned deployment method review historically." + } + ], + "must_not_include": ["Team Delta currently owns deployment method review."], + "evidence_links": { + "relation_current_owner": [ + "relation-current-owner", + "relation-old-owner", + "relation-owner-rationale" + ], + "relation_historical_owner": ["relation-old-owner"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "relation-current-owner", + "claim_id": "relation_current_owner", + "requirement": "cite", + "quote": "Team Echo owns deployment method review" + }, + { + "evidence_id": "relation-old-owner", + "claim_id": "relation_historical_owner", + "requirement": "cite", + "quote": "Team Delta owned deployment method review" + } + ], + "negative_traps": [ + { + "trap_id": "old-owner-as-current", + "type": "stale_fact", + "evidence_ids": ["relation-old-owner"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.4, + "max_points": 1.0, + "criteria": "Requires current-only versus historical temporal validity for relation facts." 
+ }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Would identify current and historical owners separately." + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Would cite both current and historical relation evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Would not report the historical owner as current." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["Temporal relation validity is not encoded in this runner."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["relation-current-owner"], + "historical_evidence_ids": ["relation-old-owner"], + "stale_trap_ids": ["old-owner-as-current"], + "conflicts": [ + { + "conflict_id": "relation-owner-current-historical", + "claim_id": "relation_current_owner", + "current_evidence_id": "relation-current-owner", + "historical_evidence_id": "relation-old-owner", + "resolved_by_evidence_id": "relation-owner-rationale" + } + ], + "update_rationale": { + "claim_id": "relation_owner_update_rationale", + "evidence_ids": ["relation-owner-rationale"], + "available": false + }, + "temporal_validity": { + "required": true, + "encoded": false, + "follow_up": "[ELF graph P1] Add temporal validity to graph-lite facts" + } + }, + "tags": [ + "synthetic", + "memory_evolution", + "reference_graphiti_zep_temporal", + "reference_nanograph_typed_query", + "not_encoded", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 59ee9bd2..643572d5 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -108,6 +108,9 @@ 
struct RealWorldJob { operator_debug: Option, #[serde(default)] tags: Vec, + #[serde(default)] + encoding: JobEncoding, + memory_evolution: Option, } #[derive(Debug, Deserialize)] @@ -249,6 +252,57 @@ struct NegativeTrap { failure_if_used: bool, } +#[derive(Debug, Default, Deserialize)] +struct JobEncoding { + status: Option, + reason: Option, + follow_up: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct FollowUpInput { + title: String, + reason: String, +} + +#[derive(Debug, Deserialize)] +struct MemoryEvolution { + #[serde(default)] + current_evidence_ids: Vec, + #[serde(default)] + historical_evidence_ids: Vec, + #[serde(default)] + stale_trap_ids: Vec, + #[serde(default)] + conflicts: Vec, + update_rationale: Option, + temporal_validity: Option, +} + +#[derive(Debug, Deserialize)] +struct EvolutionConflict { + conflict_id: String, + claim_id: String, + current_evidence_id: String, + historical_evidence_id: String, + resolved_by_evidence_id: Option, +} + +#[derive(Debug, Deserialize)] +struct UpdateRationale { + claim_id: String, + #[serde(default)] + evidence_ids: Vec, + available: bool, +} + +#[derive(Debug, Deserialize)] +struct TemporalValidity { + required: bool, + encoded: bool, + follow_up: Option, +} + #[derive(Debug, Deserialize)] struct ScoringRubric { #[serde(default)] @@ -374,6 +428,10 @@ struct RealWorldReport { unsupported_claims: Vec, not_encoded_suites: Vec, private_corpus_redaction: PrivateCorpusRedaction, + #[serde(default)] + evolution: EvolutionSummary, + #[serde(default)] + follow_ups: Vec, } #[derive(Debug, Deserialize, Serialize)] @@ -399,6 +457,14 @@ struct ReportSummary { unsupported_claim: usize, unsupported_claim_count: usize, wrong_result_count: usize, + #[serde(default)] + stale_answer_count: usize, + #[serde(default)] + conflict_detection_count: usize, + #[serde(default)] + update_rationale_available_count: usize, + #[serde(default)] + temporal_validity_not_encoded_count: usize, mean_score: f64, mean_latency_ms: 
Option, total_cost: Option, @@ -454,6 +520,14 @@ struct SuiteReport { score_mean: Option, unsupported_claim_count: usize, wrong_result_count: usize, + #[serde(default)] + stale_answer_count: usize, + #[serde(default)] + conflict_detection_count: usize, + #[serde(default)] + update_rationale_available_count: usize, + #[serde(default)] + temporal_validity_not_encoded_count: usize, reason: String, } @@ -470,6 +544,14 @@ struct JobReport { produced_evidence: Vec, unsupported_claim_count: usize, wrong_result_count: usize, + #[serde(default)] + stale_answer_count: usize, + #[serde(default)] + conflict_detection_count: usize, + #[serde(default)] + update_rationale_available: bool, + #[serde(default)] + temporal_validity_not_encoded: bool, latency_ms: Option, cost: Option, trap_ids_used: Vec, @@ -501,6 +583,8 @@ struct JobReport { qdrant_rebuild_case: bool, #[serde(skip_serializing_if = "Option::is_none")] operator_debug: Option, + #[serde(skip_serializing_if = "Option::is_none")] + evolution: Option, } #[derive(Debug, Deserialize, Serialize)] @@ -528,6 +612,38 @@ struct UnsupportedClaimReport { evidence_ids: Vec, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct EvolutionSummary { + stale_answer_count: usize, + conflict_detection_count: usize, + update_rationale_available_count: usize, + temporal_validity_not_encoded_count: usize, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct EvolutionJobReport { + current_evidence: Vec, + historical_evidence: Vec, + stale_trap_ids_used: Vec, + stale_answer_count: usize, + conflict_count: usize, + conflict_detection_count: usize, + update_rationale_available: bool, + temporal_validity_required: bool, + temporal_validity_encoded: bool, + temporal_validity_not_encoded: bool, + #[serde(skip_serializing_if = "Option::is_none")] + follow_up: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +struct FollowUpReport { + suite_id: String, + job_id: String, + title: String, + reason: String, +} + 
#[derive(Debug, Deserialize, Serialize)] struct PrivateCorpusRedaction { policy: String, @@ -544,6 +660,7 @@ struct JobScoring { trap_ids_used: Vec, dimension_scores: Vec, reason: String, + evolution: Option, } #[derive(Debug, Default)] @@ -557,6 +674,9 @@ struct FailureCounts { operator_debug_raw_sql: usize, operator_debug_trace_gaps: usize, operator_debug_repair_unclear: usize, + stale_answers: usize, + conflict_detection_missing: usize, + update_rationale_missing: usize, } #[derive(Debug, Default)] @@ -676,6 +796,8 @@ fn validate_job(job: &RealWorldJob, path: &Path) -> Result<()> { validate_scoring_rubric(job, path)?; validate_allowed_uncertainty(job, path)?; validate_operator_debug(job, path)?; + validate_job_encoding(job, path)?; + validate_memory_evolution(job, path)?; Ok(()) } @@ -949,6 +1071,141 @@ fn validate_operator_debug(job: &RealWorldJob, path: &Path) -> Result<()> { Ok(()) } +fn validate_job_encoding(job: &RealWorldJob, path: &Path) -> Result<()> { + if let Some(status) = job.encoding.status { + if !matches!( + status, + TypedStatus::NotEncoded | TypedStatus::Blocked | TypedStatus::Incomplete + ) { + return Err(eyre::eyre!( + "{} job {} uses encoding.status {}; only not_encoded, blocked, or incomplete are allowed.", + path.display(), + job.job_id, + status_str(status) + )); + } + if job.encoding.reason.as_deref().is_none_or(|reason| reason.trim().is_empty()) { + return Err(eyre::eyre!( + "{} job {} declares encoding.status but no reason.", + path.display(), + job.job_id + )); + } + } + if let Some(follow_up) = &job.encoding.follow_up + && (follow_up.title.trim().is_empty() || follow_up.reason.trim().is_empty()) + { + return Err(eyre::eyre!( + "{} job {} has an incomplete encoding follow-up.", + path.display(), + job.job_id + )); + } + + Ok(()) +} + +fn validate_memory_evolution(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(evolution) = &job.memory_evolution else { + return Ok(()); + }; + let evidence_ids = corpus_evidence_ids(job); + 
let trap_ids = + job.negative_traps.iter().map(|trap| trap.trap_id.as_str()).collect::>(); + + for evidence_id in + evolution.current_evidence_ids.iter().chain(evolution.historical_evidence_ids.iter()) + { + ensure_known_evidence(path, &evidence_ids, evidence_id)?; + } + for trap_id in &evolution.stale_trap_ids { + if !trap_ids.contains(trap_id.as_str()) { + return Err(eyre::eyre!( + "{} job {} references unknown stale trap id {}.", + path.display(), + job.job_id, + trap_id + )); + } + } + for conflict in &evolution.conflicts { + validate_evolution_conflict(path, &evidence_ids, conflict)?; + } + + if let Some(rationale) = &evolution.update_rationale { + validate_update_rationale(path, &evidence_ids, rationale)?; + } + if let Some(temporal) = &evolution.temporal_validity { + validate_temporal_validity(job, path, temporal)?; + } + + Ok(()) +} + +fn validate_evolution_conflict( + path: &Path, + evidence_ids: &BTreeSet, + conflict: &EvolutionConflict, +) -> Result<()> { + if conflict.conflict_id.trim().is_empty() || conflict.claim_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an incomplete evolution conflict.", path.display())); + } + + ensure_known_evidence(path, evidence_ids, conflict.current_evidence_id.as_str())?; + ensure_known_evidence(path, evidence_ids, conflict.historical_evidence_id.as_str())?; + + if let Some(evidence_id) = &conflict.resolved_by_evidence_id { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + + Ok(()) +} + +fn validate_update_rationale( + path: &Path, + evidence_ids: &BTreeSet, + rationale: &UpdateRationale, +) -> Result<()> { + if rationale.claim_id.trim().is_empty() { + return Err(eyre::eyre!( + "{} has an update rationale with an empty claim_id.", + path.display() + )); + } + + for evidence_id in &rationale.evidence_ids { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + + Ok(()) +} + +fn validate_temporal_validity( + job: &RealWorldJob, + path: &Path, + temporal: &TemporalValidity, +) -> Result<()> 
{ + if temporal.follow_up.as_deref().is_some_and(|follow_up| follow_up.trim().is_empty()) { + return Err(eyre::eyre!( + "{} job {} has an empty temporal validity follow-up.", + path.display(), + job.job_id + )); + } + if temporal.required + && !temporal.encoded + && !matches!(job.encoding.status, Some(TypedStatus::NotEncoded | TypedStatus::Blocked)) + { + return Err(eyre::eyre!( + "{} job {} requires temporal validity but does not declare a not_encoded or blocked encoding status.", + path.display(), + job.job_id + )); + } + + Ok(()) +} + fn validate_optional_debug_field(path: &Path, value: Option<&str>, field: &str) -> Result<()> { if value.is_some_and(|value| value.trim().is_empty()) { return Err(eyre::eyre!("{} has empty operator_debug {field}.", path.display())); @@ -1019,6 +1276,8 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result>(); let summary = report_summary(&job_reports, &suites); + let evolution = evolution_summary(&job_reports); + let follow_ups = follow_up_reports(jobs); Ok(RealWorldReport { schema: REPORT_SCHEMA.to_string(), @@ -1033,19 +1292,48 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result JobScoring { let answer = produced_answer(job); let produced_evidence = produced_evidence_ids(answer); + let trap_ids_used = trap_ids_used(job, &produced_evidence); + + if let Some(status) = job.encoding.status { + let evolution = evolution_job_report(job, answer, &trap_ids_used, 0); + + return JobScoring { + status, + normalized_score: 0.0, + hard_fail_hits: Vec::new(), + unsupported_claims: Vec::new(), + wrong_result_count: 0, + trap_ids_used, + dimension_scores: declared_not_encoded_dimension_scores(job), + reason: job + .encoding + .reason + .clone() + .unwrap_or_else(|| "Job did not reach a runnable scoring state.".to_string()), + evolution, + }; + } + let missing_claims = missing_required_claims(job, answer); let forbidden_claims = forbidden_claim_hits(job, answer); let missing_evidence = missing_required_evidence(job, 
&produced_evidence); - let trap_ids_used = trap_ids_used(job, &produced_evidence); let mut unsupported_claims = unsupported_claims(job, answer); let operator_counts = operator_debug_failure_counts(job); let hard_fail_hits = hard_fail_hits(job, &unsupported_claims, &trap_ids_used); + let evolution = evolution_job_report(job, answer, &trap_ids_used, forbidden_claims.len()); + let stale_answers = evolution.as_ref().map_or(0, |report| report.stale_answer_count); + let conflict_detection_missing = evolution + .as_ref() + .map_or(0, |report| report.conflict_count - report.conflict_detection_count); + let update_rationale_missing = evolution.as_ref().map_or(0, update_rationale_missing_count); let counts = FailureCounts { missing_claims: missing_claims.len(), forbidden_claims: forbidden_claims.len(), @@ -1056,6 +1344,9 @@ fn score_job(job: &RealWorldJob) -> JobScoring { operator_debug_raw_sql: operator_counts.operator_debug_raw_sql, operator_debug_trace_gaps: operator_counts.operator_debug_trace_gaps, operator_debug_repair_unclear: operator_counts.operator_debug_repair_unclear, + stale_answers, + conflict_detection_missing, + update_rationale_missing, }; let dimension_scores = dimension_scores(job, &counts); let normalized_score = normalized_score(&dimension_scores); @@ -1066,7 +1357,9 @@ fn score_job(job: &RealWorldJob) -> JobScoring { + counts.operator_debug_missing + counts.operator_debug_raw_sql + counts.operator_debug_trace_gaps - + counts.operator_debug_repair_unclear; + + counts.operator_debug_repair_unclear + + counts.conflict_detection_missing + + counts.update_rationale_missing; let status = job_status( normalized_score, job.scoring_rubric.pass_threshold, @@ -1089,6 +1382,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring { trap_ids_used, dimension_scores, reason, + evolution, } } @@ -1108,6 +1402,19 @@ fn operator_debug_failure_counts(job: &RealWorldJob) -> FailureCounts { } } +fn declared_not_encoded_dimension_scores(job: &RealWorldJob) -> Vec { + 
job.scoring_rubric + .dimensions + .iter() + .map(|(dimension_id, dimension)| DimensionScoreReport { + dimension: dimension_id.clone(), + score: 0.0, + max_points: dimension.max_points, + weight: dimension.weight, + }) + .collect() +} + fn produced_answer(job: &RealWorldJob) -> &ProducedAnswer { job.corpus .adapter_response @@ -1196,6 +1503,129 @@ fn trap_ids_used(job: &RealWorldJob, produced_evidence: &BTreeSet) -> Ve .collect() } +fn evolution_job_report( + job: &RealWorldJob, + answer: &ProducedAnswer, + trap_ids_used: &[String], + forbidden_claim_count: usize, +) -> Option { + let evolution = job.memory_evolution.as_ref()?; + let stale_trap_ids_used = stale_trap_ids_used(job, evolution, trap_ids_used); + let stale_answer_count = + stale_answer_count(job, evolution, &stale_trap_ids_used, forbidden_claim_count); + let conflict_detection_count = evolution + .conflicts + .iter() + .filter(|conflict| conflict_is_detected(conflict, answer)) + .count(); + let update_rationale_available = evolution + .update_rationale + .as_ref() + .is_some_and(|rationale| update_rationale_is_available(rationale, answer)); + let temporal_validity_required = + evolution.temporal_validity.as_ref().is_some_and(|temporal| temporal.required); + let temporal_validity_encoded = + evolution.temporal_validity.as_ref().is_some_and(|temporal| temporal.encoded); + let temporal_validity_not_encoded = temporal_validity_required && !temporal_validity_encoded; + let follow_up = evolution + .temporal_validity + .as_ref() + .and_then(|temporal| temporal.follow_up.clone()) + .or_else(|| job.encoding.follow_up.as_ref().map(|follow_up| follow_up.title.clone())); + + Some(EvolutionJobReport { + current_evidence: evolution.current_evidence_ids.clone(), + historical_evidence: evolution.historical_evidence_ids.clone(), + stale_answer_count, + stale_trap_ids_used, + conflict_count: evolution.conflicts.len(), + conflict_detection_count, + update_rationale_available, + temporal_validity_required, + 
temporal_validity_encoded, + temporal_validity_not_encoded, + follow_up, + }) +} + +fn stale_answer_count( + job: &RealWorldJob, + evolution: &MemoryEvolution, + stale_trap_ids_used: &[String], + forbidden_claim_count: usize, +) -> usize { + let stale_trap_count = if evolution.stale_trap_ids.is_empty() { + job.negative_traps.iter().filter(|trap| trap.trap_type == "stale_fact").count() + } else { + evolution.stale_trap_ids.len() + }; + let stale_forbidden_claims = if stale_trap_count > 0 { forbidden_claim_count } else { 0 }; + + stale_trap_ids_used.len().max(stale_forbidden_claims) +} + +fn stale_trap_ids_used( + job: &RealWorldJob, + evolution: &MemoryEvolution, + trap_ids_used: &[String], +) -> Vec { + let declared_stale_traps = if evolution.stale_trap_ids.is_empty() { + job.negative_traps + .iter() + .filter(|trap| trap.trap_type == "stale_fact") + .map(|trap| trap.trap_id.as_str()) + .collect::>() + } else { + evolution.stale_trap_ids.iter().map(String::as_str).collect::>() + }; + + trap_ids_used + .iter() + .filter(|trap_id| declared_stale_traps.contains(trap_id.as_str())) + .cloned() + .collect() +} + +fn conflict_is_detected(conflict: &EvolutionConflict, answer: &ProducedAnswer) -> bool { + let mut required_evidence = + vec![conflict.current_evidence_id.as_str(), conflict.historical_evidence_id.as_str()]; + + if let Some(evidence_id) = &conflict.resolved_by_evidence_id { + required_evidence.push(evidence_id.as_str()); + } + + answer.claims.iter().any(|claim| { + claim.claim_id.as_deref() == Some(conflict.claim_id.as_str()) + && required_evidence + .iter() + .all(|evidence_id| claim.evidence_ids.iter().any(|id| id == evidence_id)) + }) +} + +fn update_rationale_is_available(rationale: &UpdateRationale, answer: &ProducedAnswer) -> bool { + if !rationale.available { + return false; + } + + answer.claims.iter().any(|claim| { + claim.claim_id.as_deref() == Some(rationale.claim_id.as_str()) + && !claim.evidence_ids.is_empty() + && 
rationale.evidence_ids.iter().any(|evidence_id| { + claim.evidence_ids.iter().any(|produced| produced == evidence_id) + }) + }) +} + +fn update_rationale_missing_count(report: &EvolutionJobReport) -> usize { + if report.update_rationale_available || report.temporal_validity_not_encoded { + 0 + } else if report.conflict_count > 0 { + 1 + } else { + 0 + } +} + fn unsupported_claims(job: &RealWorldJob, answer: &ProducedAnswer) -> Vec { answer.claims.iter().filter_map(|claim| unsupported_claim(job, claim)).collect() } @@ -1290,11 +1720,15 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) "answer_correctness" | "workflow_helpfulness" => counts.missing_claims > 0 || counts.forbidden_claims > 0 - || counts.operator_debug_repair_unclear > 0, + || counts.operator_debug_repair_unclear > 0 + || counts.conflict_detection_missing > 0, "evidence_grounding" => counts.missing_evidence > 0 || counts.unsupported_claims > 0, "trap_avoidance" => counts.trap_uses > 0, "uncertainty_handling" => counts.unsupported_claims > 0, - "lifecycle_behavior" => false, + "lifecycle_behavior" => + counts.stale_answers > 0 + || counts.conflict_detection_missing > 0 + || counts.update_rationale_missing > 0, "debuggability" => counts.missing_claims > 0 || counts.unsupported_claims > 0 @@ -1351,6 +1785,8 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 + counts.operator_debug_raw_sql + counts.operator_debug_trace_gaps + counts.operator_debug_repair_unclear + + counts.conflict_detection_missing + + counts.update_rationale_missing ), TypedStatus::WrongResult => format!( "Job produced {} wrong-result signal(s) and normalized_score {normalized_score:.3}.", @@ -1362,6 +1798,8 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 + counts.operator_debug_raw_sql + counts.operator_debug_trace_gaps + counts.operator_debug_repair_unclear + + counts.conflict_detection_missing + + counts.update_rationale_missing ), _ 
=> "Job did not reach a runnable scoring state.".to_string(), } @@ -1383,6 +1821,22 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { produced_evidence: produced_evidence_ids(answer).into_iter().collect(), unsupported_claim_count: scoring.unsupported_claims.len(), wrong_result_count: scoring.wrong_result_count, + stale_answer_count: scoring + .evolution + .as_ref() + .map_or(0, |report| report.stale_answer_count), + conflict_detection_count: scoring + .evolution + .as_ref() + .map_or(0, |report| report.conflict_detection_count), + update_rationale_available: scoring + .evolution + .as_ref() + .is_some_and(|report| report.update_rationale_available), + temporal_validity_not_encoded: scoring + .evolution + .as_ref() + .is_some_and(|report| report.temporal_validity_not_encoded), latency_ms: answer.latency_ms, cost: answer.cost.clone(), trap_ids_used: scoring.trap_ids_used, @@ -1401,6 +1855,7 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { redaction_leak_count: metrics.redaction_leak_count, qdrant_rebuild_case: metrics.qdrant_rebuild_case, operator_debug: job.operator_debug.clone(), + evolution: scoring.evolution, } } @@ -1530,6 +1985,10 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport { score_mean: None, unsupported_claim_count: 0, wrong_result_count: 0, + stale_answer_count: 0, + conflict_detection_count: 0, + update_rationale_available_count: 0, + temporal_validity_not_encoded_count: 0, reason: NOT_ENCODED_REASON.to_string(), }; } @@ -1538,6 +1997,12 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport { let score_sum = suite_jobs.iter().map(|job| job.normalized_score).sum::(); let unsupported_claim_count = suite_jobs.iter().map(|job| job.unsupported_claim_count).sum(); let wrong_result_count = suite_jobs.iter().map(|job| job.wrong_result_count).sum(); + let stale_answer_count = suite_jobs.iter().map(|job| job.stale_answer_count).sum(); + let conflict_detection_count = 
suite_jobs.iter().map(|job| job.conflict_detection_count).sum(); + let update_rationale_available_count = + suite_jobs.iter().filter(|job| job.update_rationale_available).count(); + let temporal_validity_not_encoded_count = + suite_jobs.iter().filter(|job| job.temporal_validity_not_encoded).count(); SuiteReport { suite_id: suite_id.to_string(), @@ -1546,6 +2011,10 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport { score_mean: Some(round3(score_sum / suite_jobs.len() as f64)), unsupported_claim_count, wrong_result_count, + stale_answer_count, + conflict_detection_count, + update_rationale_available_count, + temporal_validity_not_encoded_count, reason: suite_reason(status, suite_jobs.len()), } } @@ -1563,6 +2032,8 @@ fn aggregate_status(jobs: &[&JobReport]) -> TypedStatus { TypedStatus::Incomplete } else if statuses.contains(&TypedStatus::Blocked) { TypedStatus::Blocked + } else if statuses.contains(&TypedStatus::NotEncoded) { + TypedStatus::NotEncoded } else if statuses.contains(&TypedStatus::Pass) { TypedStatus::Pass } else { @@ -1580,7 +2051,12 @@ fn suite_reason(status: TypedStatus, encoded_job_count: usize) -> String { "At least one encoded lifecycle-scored job failed lifecycle behavior.".to_string(), TypedStatus::Incomplete => "At least one encoded job could not complete.".to_string(), TypedStatus::Blocked => "At least one encoded job is blocked.".to_string(), - TypedStatus::NotEncoded => NOT_ENCODED_REASON.to_string(), + TypedStatus::NotEncoded => + if encoded_job_count == 0 { + NOT_ENCODED_REASON.to_string() + } else { + "At least one encoded fixture declares a not_encoded limitation.".to_string() + }, } } @@ -1595,13 +2071,20 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { let scope_correct_count = jobs.iter().map(|job| job.scope_correct_count).sum(); let mut summary = ReportSummary { job_count: jobs.len(), - encoded_suite_count: suites - .iter() - .filter(|suite| suite.status != 
TypedStatus::NotEncoded) - .count(), - not_encoded: suites.iter().filter(|suite| suite.status == TypedStatus::NotEncoded).count(), + encoded_suite_count: suites.iter().filter(|suite| suite.encoded_job_count > 0).count(), + not_encoded: 0, unsupported_claim_count: jobs.iter().map(|job| job.unsupported_claim_count).sum(), wrong_result_count: jobs.iter().map(|job| job.wrong_result_count).sum(), + stale_answer_count: jobs.iter().map(|job| job.stale_answer_count).sum(), + conflict_detection_count: jobs.iter().map(|job| job.conflict_detection_count).sum(), + update_rationale_available_count: jobs + .iter() + .filter(|job| job.update_rationale_available) + .count(), + temporal_validity_not_encoded_count: jobs + .iter() + .filter(|job| job.temporal_validity_not_encoded) + .count(), mean_score: mean_score(jobs), mean_latency_ms: mean_latency(jobs), total_cost: total_cost(jobs), @@ -1659,6 +2142,34 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { summary } +fn evolution_summary(jobs: &[JobReport]) -> EvolutionSummary { + EvolutionSummary { + stale_answer_count: jobs.iter().map(|job| job.stale_answer_count).sum(), + conflict_detection_count: jobs.iter().map(|job| job.conflict_detection_count).sum(), + update_rationale_available_count: jobs + .iter() + .filter(|job| job.update_rationale_available) + .count(), + temporal_validity_not_encoded_count: jobs + .iter() + .filter(|job| job.temporal_validity_not_encoded) + .count(), + } +} + +fn follow_up_reports(jobs: &[RealWorldJob]) -> Vec { + jobs.iter() + .filter_map(|job| { + job.encoding.follow_up.as_ref().map(|follow_up| FollowUpReport { + suite_id: job.suite.clone(), + job_id: job.job_id.clone(), + title: follow_up.title.clone(), + reason: follow_up.reason.clone(), + }) + }) + .collect() +} + fn ratio(numerator: usize, denominator: usize) -> f64 { if denominator == 0 { return 0.0; @@ -1756,7 +2267,9 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { 
render_markdown_suites(&mut out, report); render_markdown_jobs(&mut out, report); render_markdown_operator_debugging(&mut out, report); + render_markdown_evolution(&mut out, report); render_markdown_unsupported_claims(&mut out, report); + render_markdown_follow_ups(&mut out, report); render_markdown_semantics(&mut out, report); out @@ -1786,14 +2299,33 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat md_inline(report.adapter.behavior.as_str()) )); out.push_str(&format!("- Jobs: `{}`\n", report.summary.job_count)); - out.push_str(&format!("- Encoded suites: `{}`\n", report.summary.encoded_suite_count)); - out.push_str(&format!("- Not-encoded suites: `{}`\n", report.not_encoded_suites.len())); - out.push_str(&format!("- Status summary: `{}` pass, `{}` wrong_result, `{}` lifecycle_fail, `{}` incomplete, `{}` blocked, `{}` unsupported_claim\n", report.summary.pass, report.summary.wrong_result, report.summary.lifecycle_fail, report.summary.incomplete, report.summary.blocked, report.summary.unsupported_claim)); + out.push_str(&format!( + "- Suites with encoded jobs: `{}`\n", + report.summary.encoded_suite_count + )); + out.push_str(&format!( + "- Suites with `not_encoded` status: `{}`\n", + report.not_encoded_suites.len() + )); + out.push_str(&format!("- Status summary: `{}` pass, `{}` wrong_result, `{}` lifecycle_fail, `{}` incomplete, `{}` blocked, `{}` not_encoded, `{}` unsupported_claim\n", report.summary.pass, report.summary.wrong_result, report.summary.lifecycle_fail, report.summary.incomplete, report.summary.blocked, report.summary.not_encoded, report.summary.unsupported_claim)); out.push_str(&format!( "- Unsupported claim count: `{}`\n", report.summary.unsupported_claim_count )); out.push_str(&format!("- Wrong-result count: `{}`\n", report.summary.wrong_result_count)); + out.push_str(&format!("- Stale-answer count: `{}`\n", report.summary.stale_answer_count)); + out.push_str(&format!( + "- Conflict detections: `{}`\n", + 
report.summary.conflict_detection_count + )); + out.push_str(&format!( + "- Update rationales available: `{}`\n", + report.summary.update_rationale_available_count + )); + out.push_str(&format!( + "- Temporal validity not encoded: `{}`\n", + report.summary.temporal_validity_not_encoded_count + )); out.push_str(&format!( "- Evidence coverage: `{}/{}` (`{:.3}`)\n", report.summary.evidence_covered_count, @@ -1850,17 +2382,21 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat fn render_markdown_suites(out: &mut String, report: &RealWorldReport) { out.push_str("## Suites\n\n"); out.push_str( - "| Suite | Status | Jobs | Score | Unsupported Claims | Wrong Results | Reason |\n", + "| Suite | Status | Jobs | Score | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | Unsupported Claims | Wrong Results | Reason |\n", ); - out.push_str("| --- | --- | ---: | ---: | ---: | ---: | --- |\n"); + out.push_str("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n"); for suite in &report.suites { out.push_str(&format!( - "| {} | `{}` | {} | `{}` | {} | {} | {} |\n", + "| {} | `{}` | {} | `{}` | {} | {} | {} | {} | {} | {} | {} |\n", md_cell(suite.suite_id.as_str()), status_str(suite.status), suite.encoded_job_count, optional_f64(suite.score_mean, ""), + suite.stale_answer_count, + suite.conflict_detection_count, + suite.update_rationale_available_count, + suite.temporal_validity_not_encoded_count, suite.unsupported_claim_count, suite.wrong_result_count, md_cell(suite.reason.as_str()) @@ -1872,8 +2408,10 @@ fn render_markdown_suites(out: &mut String, report: &RealWorldReport) { fn render_markdown_jobs(out: &mut String, report: &RealWorldReport) { out.push_str("## Jobs\n\n"); - out.push_str("| Suite | Job | Status | Score | Expected Evidence | Produced Evidence | Unsupported Claims | Wrong Results | Latency | Cost |\n"); - out.push_str("| --- | --- | --- | ---: | --- | --- | ---: | ---: | ---: | --- |\n"); + 
out.push_str("| Suite | Job | Status | Score | Expected Evidence | Produced Evidence | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost |\n"); + out.push_str( + "| --- | --- | --- | ---: | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- |\n", + ); for job in &report.jobs { let expected = job @@ -1885,13 +2423,17 @@ fn render_markdown_jobs(out: &mut String, report: &RealWorldReport) { let produced = job.produced_evidence.join(", "); out.push_str(&format!( - "| {} | {} | `{}` | `{:.3}` | `{}` | `{}` | {} | {} | `{}` | `{}` |\n", + "| {} | {} | `{}` | `{:.3}` | `{}` | `{}` | {} | {} | `{}` | `{}` | {} | {} | `{}` | `{}` |\n", md_cell(job.suite_id.as_str()), md_cell(job.job_id.as_str()), status_str(job.status), job.normalized_score, md_inline(expected.as_str()), md_inline(produced.as_str()), + job.stale_answer_count, + job.conflict_detection_count, + bool_display(job.update_rationale_available), + bool_display(job.temporal_validity_not_encoded), job.unsupported_claim_count, job.wrong_result_count, optional_f64(job.latency_ms, " ms"), @@ -1990,6 +2532,47 @@ fn ux_gap_cell(gaps: &[OperatorUxGap]) -> String { .join("
") } +fn render_markdown_evolution(out: &mut String, report: &RealWorldReport) { + out.push_str("## Memory Evolution\n\n"); + out.push_str(&format!("- Stale answers: `{}`\n", report.evolution.stale_answer_count)); + out.push_str(&format!( + "- Conflict detections: `{}`\n", + report.evolution.conflict_detection_count + )); + out.push_str(&format!( + "- Update rationales available: `{}`\n", + report.evolution.update_rationale_available_count + )); + out.push_str(&format!( + "- Temporal validity not encoded: `{}`\n\n", + report.evolution.temporal_validity_not_encoded_count + )); + out.push_str("| Suite | Job | Current Evidence | Historical Evidence | Stale Traps Used | Conflict Count | Detected | Update Rationale | Temporal Validity | Follow-up |\n"); + out.push_str("| --- | --- | --- | --- | --- | ---: | ---: | --- | --- | --- |\n"); + + for job in &report.jobs { + let Some(evolution) = &job.evolution else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | `{}` | `{}` | `{}` | {} | {} | `{}` | `{}` | {} |\n", + md_cell(job.suite_id.as_str()), + md_cell(job.job_id.as_str()), + md_inline(evolution.current_evidence.join(", ").as_str()), + md_inline(evolution.historical_evidence.join(", ").as_str()), + md_inline(evolution.stale_trap_ids_used.join(", ").as_str()), + evolution.conflict_count, + evolution.conflict_detection_count, + bool_display(evolution.update_rationale_available), + temporal_display(evolution), + md_cell(evolution.follow_up.as_deref().unwrap_or("-")) + )); + } + + out.push('\n'); +} + fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport) { out.push_str("## Unsupported Claims\n\n"); @@ -2016,6 +2599,31 @@ fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport out.push('\n'); } +fn render_markdown_follow_ups(out: &mut String, report: &RealWorldReport) { + out.push_str("## Follow-Ups\n\n"); + + if report.follow_ups.is_empty() { + out.push_str("No benchmark follow-ups were declared by encoded 
jobs.\n\n"); + + return; + } + + out.push_str("| Suite | Job | Follow-up | Reason |\n"); + out.push_str("| --- | --- | --- | --- |\n"); + + for follow_up in &report.follow_ups { + out.push_str(&format!( + "| {} | {} | {} | {} |\n", + md_cell(follow_up.suite_id.as_str()), + md_cell(follow_up.job_id.as_str()), + md_cell(follow_up.title.as_str()), + md_cell(follow_up.reason.as_str()) + )); + } + + out.push('\n'); +} + fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { out.push_str("## Result Semantics\n\n"); out.push_str( @@ -2024,7 +2632,7 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { out.push_str("It is a real-world job fixture report, not a Docker live-baseline report.\n"); out.push_str("Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not reinterpreted as real-world suite wins.\n\n"); out.push_str( - "The summary counters report required evidence coverage, source-ref coverage, quote coverage, stale retrievals, scope violations, redaction leaks, and Qdrant rebuild case coverage across encoded jobs.\n\n", + "The summary counters report required evidence coverage, source-ref coverage, quote coverage, stale retrievals, scope violations, redaction leaks, Qdrant rebuild case coverage, stale answers, conflict detections, update rationale availability, and temporal validity gaps across encoded jobs.\n\n", ); out.push_str( "- `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule.\n", @@ -2033,8 +2641,8 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { "- `wrong_result`: a job completed but missed required answer or evidence expectations.\n", ); out.push_str("- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links.\n"); - out.push_str("- `not_encoded`: a suite has no checked-in real_world_job fixture, so no pass/fail claim is allowed.\n\n"); - 
out.push_str("## Not-Encoded Suites\n\n"); + out.push_str("- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed.\n\n"); + out.push_str("## Suites With `not_encoded` Status\n\n"); if report.not_encoded_suites.is_empty() { out.push_str("All declared suites have at least one encoded job.\n"); @@ -2079,6 +2687,22 @@ fn optional_f64(value: Option, suffix: &str) -> String { value.map(|value| format!("{value:.3}{suffix}")).unwrap_or_else(|| "-".to_string()) } +fn bool_display(value: bool) -> &'static str { + if value { "true" } else { "false" } +} + +fn temporal_display(evolution: &EvolutionJobReport) -> &'static str { + if evolution.temporal_validity_not_encoded { + "not_encoded" + } else if evolution.temporal_validity_encoded { + "encoded" + } else if evolution.temporal_validity_required { + "required" + } else { + "-" + } +} + fn cost_display(cost: Option<&CostReport>) -> String { let Some(cost) = cost else { return "-".to_string(); diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 8c53299c..db644110 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -23,6 +23,10 @@ fn real_world_memory_fixture_dir() -> PathBuf { Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join("real_world_memory") } +fn evolution_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("evolution") +} + fn operator_debug_fixture_dir() -> PathBuf { fixture_root().join("operator_debugging_ux") } @@ -61,6 +65,15 @@ fn find_by_field<'a>(items: &'a [Value], field: &str, expected: &str) -> Result< .ok_or_else(|| eyre::eyre!("missing item with {field} = {expected}")) } +fn set_json_pointer(value: &mut Value, pointer: &str, replacement: Value) -> Result<()> { + let target = + value.pointer_mut(pointer).ok_or_else(|| eyre::eyre!("missing JSON pointer {pointer}"))?; + + *target = 
replacement; + + Ok(()) +} + #[test] fn smoke_fixture_produces_typed_json_report() -> Result<()> { let report = run_json_report()?; @@ -189,10 +202,24 @@ fn generated_json_report_renders_markdown() -> Result<()> { fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Result<()> { let report = run_json_report_from(real_world_memory_fixture_dir())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(4)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(9)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(8)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(1) + ); assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(1)); @@ -205,22 +232,27 @@ fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Resu report.pointer("/summary/qdrant_rebuild_pass_count").and_then(Value::as_u64), Some(1) ); - assert_eq!(report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), Some(8)); - 
assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(8)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); - assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), + Some(19) + ); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(17)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.895)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.895)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.895)); let suites = array_at(&report, "/suites")?; - for suite_id in - ["trust_source_of_truth", "memory_evolution", "capture_integration", "personalization"] - { + for suite_id in ["trust_source_of_truth", "capture_integration", "personalization"] { let suite = find_by_field(suites, "/suite_id", suite_id)?; assert_eq!(suite.pointer("/status").and_then(Value::as_str), Some("pass")); } + let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?; + + assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + let jobs = array_at(&report, "/jobs")?; let rebuild = find_by_field(jobs, "/job_id", "trust-sot-rebuild-001")?; let redaction = find_by_field(jobs, "/job_id", "capture-redaction-exclusion-001")?; @@ -234,6 +266,115 @@ fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Resu Ok(()) } +#[test] +fn memory_evolution_fixtures_report_temporal_and_staleness_metrics() -> Result<()> { + let report = run_json_report_from(evolution_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + 
assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/evolution/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(1) + ); + + let suites = array_at(&report, "/suites")?; + let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?; + + assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!( + memory_evolution.pointer("/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(1) + ); + + let jobs = array_at(&report, "/jobs")?; + let relation_job = find_by_field(jobs, "/job_id", "memory-evolution-relation-temporal-001")?; + + assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!( + relation_job.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool), + Some(true) + ); + + let follow_ups = array_at(&report, "/follow_ups")?; + + assert_eq!(follow_ups.len(), 1); + assert_eq!( + follow_ups + .first() + .and_then(|follow_up| follow_up.pointer("/title")) + .and_then(Value::as_str), + Some("[ELF graph P1] Add temporal validity to graph-lite facts") + ); + + Ok(()) +} + +#[test] +fn memory_evolution_counts_stale_answer_when_old_fact_is_answered_as_current() -> Result<()> { + let fixture_path = + 
evolution_fixture_dir().join("preference_changed_current_vs_historical.json"); + let mut fixture = serde_json::from_str::(&fs::read_to_string(fixture_path)?)?; + + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/content", + Value::String( + "Use terse bullet-only benchmark updates as the current preference.".to_string(), + ), + )?; + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!(["pref-old-terse-bullets"]), + )?; + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/claims", + serde_json::json!([ + { + "claim_id": "current_preference", + "text": "Use terse bullet-only benchmark updates as the current preference.", + "evidence_ids": ["pref-old-terse-bullets"], + "confidence": "high" + } + ]), + )?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-memory-stale-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("stale_preference.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + + assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(job.pointer("/evolution/stale_answer_count").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + #[test] fn operator_debug_json_report_renders_markdown_links() -> Result<()> { let report = run_json_report_from(operator_debug_fixture_dir())?; @@ -271,3 +412,39 @@ fn operator_debug_json_report_renders_markdown_links() -> Result<()> { Ok(()) } + +#[test] +fn memory_evolution_report_renders_markdown_counters() -> Result<()> { + let report = run_json_report_from(evolution_fixture_dir())?; + let temp_dir = 
+ env::temp_dir().join(format!("elf-real-world-memory-evolution-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("evolution-report.json"); + let markdown_path = temp_dir.join("evolution-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("## Memory Evolution")); + assert!(markdown.contains("Temporal validity not encoded: `1`")); + assert!(markdown.contains("[ELF graph P1] Add temporal validity to graph-lite facts")); + + Ok(()) +} diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index dbd0a907..2829e253 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -39,6 +39,9 @@ cleanup, use `docs/guide/single_user_production.md`. step counts, dropped-candidate visibility, and repair-action clarity. - `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world agent memory benchmark contract, including suite taxonomy and typed report states. +- `real_world_memory_evolution.md`: run and interpret the checked-in memory evolution + jobs for current facts, historical facts, stale traps, conflicts, update rationales, + and temporal graph limitations. 
## Update Rules diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index 6af7fe8f..e5a05968 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -321,6 +321,17 @@ The trust/personalization fixture set lives under coverage, source-ref coverage, quote coverage, stale retrievals, scope correctness, redaction leaks, and Qdrant rebuild coverage. +The memory evolution suite is a separate checked-in real-world job fixture set: + +```sh +cargo make real-world-memory-evolution +``` + +It lives under `apps/elf-eval/fixtures/real_world_memory/evolution/` and reports +stale-answer count, conflict detection count, update rationale availability, temporal +validity gaps, and unsupported claims. Its relation-temporal fixture is deliberately +`not_encoded` until graph-lite temporal validity is implemented. + ## Clean Up ```sh diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index b354af1d..6f9539b4 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -139,16 +139,36 @@ The suite currently encodes: - `trust_source_of_truth`: evidence binding, source refs, and Qdrant rebuild from Postgres-held chunk embeddings before answering. -- `memory_evolution`: TTL/delete suppression for a stale deleted fact. +- `memory_evolution`: TTL/delete suppression plus current-versus-historical preference, + issue status, deployment method, benchmark conclusion, and temporal relation cases. - `capture_integration`: write-policy audit behavior for redaction/private exclusion. - `personalization`: scoped stable preference correction without temporary or cross-project preference leakage. 
The generated report includes evidence coverage, source-ref coverage, quote coverage, -unsupported-claim count, stale retrieval count, scope correctness, redaction leak -count, and Qdrant rebuild case/pass counts. The fixtures include negative traps for -unsupported prior claims, stale deleted facts, cross-project preference leakage, and -private/redacted text leakage. +unsupported-claim count, stale retrieval count, stale-answer count, conflict detection +count, update rationale availability, temporal validity `not_encoded` count, scope +correctness, redaction leak count, and Qdrant rebuild case/pass counts. The fixtures +include negative traps for unsupported prior claims, stale deleted facts, stale +historical facts, cross-project preference leakage, and private/redacted text leakage. + +Narrow memory evolution increment: + +```sh +cargo make real-world-memory-evolution +``` + +Artifacts: + +```text +tmp/real-world-memory/evolution-report.json +tmp/real-world-memory/evolution-report.md +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/evolution/` and reports only +the cases added for current-versus-historical interpretation and temporal staleness. +The relation temporal-validity fixture is deliberately `not_encoded` and declares the +graph follow-up instead of claiming a fake graph pass. Operator debugging UX increment: diff --git a/docs/guide/benchmarking/real_world_memory_evolution.md b/docs/guide/benchmarking/real_world_memory_evolution.md new file mode 100644 index 00000000..69d31d58 --- /dev/null +++ b/docs/guide/benchmarking/real_world_memory_evolution.md @@ -0,0 +1,64 @@ +# Real-World Memory Evolution Benchmark + +Goal: Run and interpret the checked-in memory evolution real-world job fixtures. +Read this when: You need to test current facts, historical facts, stale facts, +conflicts, corrected memories, and temporal validity limitations. 
+Inputs: `apps/elf-eval/fixtures/real_world_memory/evolution/`, +`apps/elf-eval/src/bin/real_world_job_benchmark.rs`, and `Makefile.toml`. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, +`docs/guide/benchmarking/real_world_agent_memory_benchmark.md`, and +`docs/guide/research/comparison_external_projects.md`. +Outputs: `tmp/real-world-memory/evolution-report.json` and +`tmp/real-world-memory/evolution-report.md`. + +## Scope + +This suite is part of the real-world job benchmark family. It is not a Docker +live-baseline retrieval matrix and does not claim private production readiness. + +The checked-in fixture set covers: + +- User preference supersession, using mem0-style memory history and Letta-style + current operating memory as reference patterns. +- Issue state evolution from blocked to done. +- Production deployment guidance superseding a local smoke quickstart. +- Benchmark adoption verdict reversal with a bounded private-corpus caveat. +- Relation fact current-versus-historical ownership, declared `not_encoded` + because temporal graph validity is not yet implemented in the runner. + +The relation case borrows from Graphiti/Zep temporal validity and nanograph typed +query ergonomics. It intentionally does not fake a pass for graph temporal behavior. +The report declares the follow-up `[ELF graph P1] Add temporal validity to graph-lite +facts`. + +## Run + +```sh +cargo make real-world-memory-evolution +``` + +Generated artifacts: + +```text +tmp/real-world-memory/evolution-report.json +tmp/real-world-memory/evolution-report.md +``` + +## Metrics + +The runner reports memory evolution counters at summary, suite, and job levels: + +- `stale_answer_count`: stale negative traps or stale-current forbidden claims used + by produced answers. +- `conflict_detection_count`: current-versus-historical conflicts detected with + both current and historical evidence.
+- `update_rationale_available_count`: jobs where the produced answer cites the + update rationale. +- `temporal_validity_not_encoded_count`: jobs that require temporal graph validity + but are deliberately declared `not_encoded`. +- `unsupported_claim_count`: existing real-world job unsupported claim counter. + +Runnable jobs should have `stale_answer_count = 0`, nonzero conflict detection, and +an update rationale when the fixture provides one. A temporal validity gap should +remain `not_encoded` until graph-lite facts can model current-only and historical +relation validity. diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 5b65c0d0..8b7552a7 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -67,6 +67,8 @@ runner execution. "scoring_rubric": {}, "allowed_uncertainty": {}, "operator_debug": {}, + "encoding": {}, + "memory_evolution": {}, "tags": [] } ``` @@ -88,6 +90,8 @@ runner execution. | `scoring_rubric` | object | Dimensions, weights, thresholds, and hard-fail rules for this job. | | `allowed_uncertainty` | object | Explicit uncertainty language and fallback behavior accepted for the job. | | `operator_debug` | object or null | Optional for most suites; required for `operator_debugging_ux` jobs. Records trace/viewer evidence and operator workflow scoring inputs. | +| `encoding` | object | Optional job-level limitation declaration. Only `not_encoded`, `blocked`, and `incomplete` statuses are allowed here. | +| `memory_evolution` | object or null | Optional for most suites; used by `memory_evolution` jobs to report current evidence, historical evidence, stale traps, conflicts, update rationale, and temporal-validity limitations. | | `tags` | array | Optional labels such as `private_corpus`, `synthetic`, `adapter_required`, or `no_live_claim`. 
| ### `corpus` @@ -194,6 +198,41 @@ Trap types: Each trap MUST include `trap_id`, `type`, `evidence_ids`, and `failure_if_used`. +### `encoding` + +`encoding` declares a fixture that is intentionally not scored as a runnable pass +because the benchmark capability is not encoded or cannot run yet. + +Allowed `status` values: + +- `not_encoded`: the fixture documents a capability gap and must not claim pass. +- `blocked`: required adapter, corpus, or system support is missing. +- `incomplete`: fixture execution cannot reach a complete scored state. + +When `status` is present, `reason` MUST be a non-empty explanation. `follow_up` is +optional, but when present it MUST include non-empty `title` and `reason` fields. + +### `memory_evolution` + +`memory_evolution` is used by jobs that test whether an answer distinguishes current +facts, historical facts, stale facts, conflicts, corrected memories, and missing +temporal validity support. + +Fields: + +- `current_evidence_ids`: evidence ids that support the current answer. +- `historical_evidence_ids`: evidence ids that are historically true but not current + answers unless the prompt asks for history. +- `stale_trap_ids`: negative trap ids that represent stale answers. +- `conflicts`: array of conflicts with `conflict_id`, `claim_id`, + `current_evidence_id`, `historical_evidence_id`, and optional + `resolved_by_evidence_id`. +- `update_rationale`: optional object with `claim_id`, `evidence_ids`, and + `available` to show whether the answer can explain why the memory changed. +- `temporal_validity`: optional object with `required`, `encoded`, and optional + `follow_up`. When `required = true` and `encoded = false`, the job MUST declare + `encoding.status = "not_encoded"` or `encoding.status = "blocked"`. + ### `operator_debug` `operator_debug` is required when `suite = "operator_debugging_ux"` and optional @@ -326,7 +365,8 @@ Suite status rules: no higher-risk `unsupported_claim` is present. 
- A suite is `unsupported_claim` when any hard-fail unsupported claim occurs. - A suite is `incomplete` or `blocked` when required jobs cannot run for those reasons. -- A suite is `not_encoded` when no job in that suite is implemented. +- A suite is `not_encoded` when no job in that suite is implemented, or when an + encoded fixture declares a job-level capability gap that prevents a suite pass claim. Reports MUST include: @@ -337,6 +377,11 @@ Reports MUST include: - explicit `not_encoded` suite list; - private-corpus redaction policy when private fixtures are used. +Reports that encode `memory_evolution` jobs SHOULD also include stale-answer counts, +conflict detection counts, update rationale availability, and temporal-validity +`not_encoded` counts. A temporal graph validity job MUST NOT be reported as `pass` +until the runner can evaluate current-only versus historical relation facts. + ## Claim Rules - A project MAY claim a suite pass only for suites with encoded jobs and a published