From 40ede386711307f9cfa8d674806ee636628a4a1d Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Tue, 9 Jun 2026 23:11:57 +0800
Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add real-world memory
 evolution benchmark cases","authority":"XY-846"}

---
 Makefile.toml                                 |  52 ++
 .../benchmark_conclusion_overturned.json      | 263 +++++++
 .../deployment_method_superseded.json         | 226 ++++++
 .../evolution/issue_blocked_to_done.json      | 221 ++++++
 ...ference_changed_current_vs_historical.json | 224 ++++++
 ...elation_temporal_validity_not_encoded.json | 199 ++++++
 .../src/bin/real_world_job_benchmark.rs       | 668 +++++++++++++++++-
 .../tests/real_world_job_benchmark.rs         | 197 +++++-
 docs/guide/benchmarking/index.md              |   3 +
 .../benchmarking/live_baseline_benchmark.md   |  11 +
 .../real_world_agent_memory_benchmark.md      |  30 +-
 .../real_world_memory_evolution.md            |  64 ++
 .../real_world_agent_memory_benchmark_v1.md   |  47 +-
 13 files changed, 2167 insertions(+), 38 deletions(-)
 create mode 100644 apps/elf-eval/fixtures/real_world_memory/evolution/benchmark_conclusion_overturned.json
 create mode 100644 apps/elf-eval/fixtures/real_world_memory/evolution/deployment_method_superseded.json
 create mode 100644 apps/elf-eval/fixtures/real_world_memory/evolution/issue_blocked_to_done.json
 create mode 100644 apps/elf-eval/fixtures/real_world_memory/evolution/preference_changed_current_vs_historical.json
 create mode 100644 apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity_not_encoded.json
 create mode 100644 docs/guide/benchmarking/real_world_memory_evolution.md

diff --git a/Makefile.toml b/Makefile.toml
index 8eb6cf43..ed9a5405 100644
--- a/Makefile.toml
+++ b/Makefile.toml
@@ -400,6 +400,9 @@ args = [
 # | real-world-memory                | composite |     |
 # | real-world-memory-json           | command   |     |
 # | real-world-memory-report         | command   |     |
+# | real-world-memory-evolution      | composite |     |
+# | real-world-memory-evolution-json | command   |     |
+# | real-world-memory-evolution-report | command |     |
 # | real-world-job-operator-ux       | composite |     |
 # | real-world-job-operator-ux-json  | command   |     |
 # | real-world-job-operator-ux-report | command  |     |
@@ -496,6 +499,55 @@ args = [
 	"tmp/real-world-memory/real-world-memory-report.md",
 ]
 
+[tasks.real-world-memory-evolution]  # composite entry point: no command of its own, only dependencies
+workspace = false
+dependencies = [
+	"real-world-memory-evolution-report",  # the report task lists the -json task as its own dependency
+]
+
+[tasks.real-world-memory-evolution-json]  # benchmark `run` over the evolution fixtures; --out is the JSON report path
+workspace = false
+command = "cargo"
+args = [
+	"run",
+	"-p",
+	"elf-eval",
+	"--bin",
+	"real_world_job_benchmark",
+	"--",  # arguments after this are passed to the benchmark binary, not to cargo
+	"run",
+	"--fixtures",
+	"apps/elf-eval/fixtures/real_world_memory/evolution",  # directory of the memory_evolution fixture JSON files
+	"--out",
+	"tmp/real-world-memory/evolution-report.json",  # read back by real-world-memory-evolution-report via --report
+	"--run-id",
+	"real-world-memory-evolution",
+	"--adapter-id",
+	"fixture_memory_evolution",  # same value as adapter_response.adapter_id in the evolution fixtures
+	"--adapter-name",
+	"ELF fixture memory evolution",
+]
+
+[tasks.real-world-memory-evolution-report]  # benchmark `publish`: JSON report in (--report), Markdown path out (--out)
+workspace = false
+dependencies = [
+	"real-world-memory-evolution-json",  # listed first because it is given the --report path below as its --out
+]
+command = "cargo"
+args = [
+	"run",
+	"-p",
+	"elf-eval",
+	"--bin",
+	"real_world_job_benchmark",
+	"--",  # arguments after this are passed to the benchmark binary, not to cargo
+	"publish",
+	"--report",
+	"tmp/real-world-memory/evolution-report.json",  # must stay identical to the -json task's --out path
+	"--out",
+	"tmp/real-world-memory/evolution-report.md",
+]
+
 [tasks.real-world-job-operator-ux]
 workspace = false
 dependencies = [
diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/benchmark_conclusion_overturned.json b/apps/elf-eval/fixtures/real_world_memory/evolution/benchmark_conclusion_overturned.json
new file mode 100644
index 00000000..0d694597
--- /dev/null
+++ b/apps/elf-eval/fixtures/real_world_memory/evolution/benchmark_conclusion_overturned.json
@@ -0,0 +1,263 @@
+{
+  "schema": "elf.real_world_job/v1",
+  "job_id": "memory-evolution-benchmark-verdict-001",
+  "suite": "memory_evolution",
+  "title": "Use the current production adoption verdict after an older conclusion changed",
+  "corpus": {
+    "corpus_id": "real-world-memory-evolution-2026-06-09",
+    "profile": "synthetic",
+    "items": [
+      {
+        "evidence_id": "verdict-old-not-ready",
+        "kind": "decision",
+        "text": "Earlier conclusion: ELF was not production ready because private corpus and restore proof were missing.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "benchmark_conclusion_overturned",
+            "evidence_id": "verdict-old-not-ready"
+          }
+        },
+        "created_at": "2026-06-07T00:00:00Z"
+      },
+      {
+        "evidence_id": "verdict-current-ready-bounded",
+        "kind": "decision",
+        "text": "Production adoption gate on 2026-06-09 says ELF is ready for personal production use with bounded caveats.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "benchmark_conclusion_overturned",
+            "evidence_id": "verdict-current-ready-bounded"
+          }
+        },
+        "created_at": "2026-06-09T00:00:00Z"
+      },
+      {
+        "evidence_id": "verdict-bounded-private-caveat",
+        "kind": "decision",
+        "text": "The private production corpus was not run; the gate records it as a bounded caveat, not a private-corpus pass.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "benchmark_conclusion_overturned",
+            "evidence_id": "verdict-bounded-private-caveat"
+          }
+        },
+        "created_at": "2026-06-09T00:05:00Z"
+      },
+      {
+        "evidence_id": "verdict-update-rationale",
+        "kind": "decision",
+        "text": "The verdict changed after provider-backed synthetic, stress, backfill, and restore proof evidence was recorded.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "benchmark_conclusion_overturned",
+            "evidence_id": "verdict-update-rationale"
+          }
+        },
+        "created_at": "2026-06-09T00:10:00Z"
+      }
+    ],
+    "adapter_response": {
+      "adapter_id": "fixture_memory_evolution",
+      "answer": {
+        "content": "The current verdict is that ELF is ready for personal production use with bounded caveats; the older not-ready conclusion is historical, and the private corpus remains an explicit caveat rather than a private-corpus pass.",
+        "claims": [
+          {
+            "claim_id": "current_benchmark_verdict",
+            "text": "ELF is ready for personal production use with bounded caveats.",
+            "evidence_ids": [
+              "verdict-current-ready-bounded",
+              "verdict-old-not-ready",
+              "verdict-update-rationale"
+            ],
+            "confidence": "high"
+          },
+          {
+            "claim_id": "benchmark_update_rationale",
+            "text": "The verdict changed after provider-backed benchmark and restore proof evidence was recorded.",
+            "evidence_ids": ["verdict-update-rationale"],
+            "confidence": "high"
+          },
+          {
+            "claim_id": "private_corpus_caveat",
+            "text": "The private corpus remains a bounded caveat rather than a private-corpus pass.",
+            "evidence_ids": ["verdict-bounded-private-caveat"],
+            "confidence": "high"
+          }
+        ],
+        "evidence_ids": [
+          "verdict-current-ready-bounded",
+          "verdict-bounded-private-caveat",
+          "verdict-update-rationale"
+        ],
+        "latency_ms": 1.5,
+        "cost": {
+          "currency": "USD",
+          "amount": 0.0,
+          "input_tokens": 0,
+          "output_tokens": 0
+        }
+      }
+    }
+  },
+  "timeline": [
+    {
+      "event_id": "verdict-not-ready",
+      "ts": "2026-06-07T00:00:00Z",
+      "actor": "agent",
+      "action": "made_decision",
+      "evidence_ids": ["verdict-old-not-ready"],
+      "summary": "The older verdict said ELF was not ready."
+    },
+    {
+      "event_id": "verdict-ready",
+      "ts": "2026-06-09T00:00:00Z",
+      "actor": "agent",
+      "action": "updated_memory",
+      "evidence_ids": [
+        "verdict-current-ready-bounded",
+        "verdict-bounded-private-caveat",
+        "verdict-update-rationale"
+      ],
+      "summary": "The adoption gate changed the current verdict and preserved the private-corpus caveat."
+    }
+  ],
+  "prompt": {
+    "role": "user",
+    "content": "What is the current benchmark adoption conclusion, and what older conclusion changed?",
+    "job_mode": "decide",
+    "constraints": ["cite_evidence", "distinguish_current_from_historical", "state_caveats"]
+  },
+  "expected_answer": {
+    "must_include": [
+      {
+        "claim_id": "current_benchmark_verdict",
+        "text": "ELF is ready for personal production use with bounded caveats."
+      },
+      {
+        "claim_id": "benchmark_update_rationale",
+        "text": "The verdict changed after provider-backed benchmark and restore proof evidence was recorded."
+      },
+      {
+        "claim_id": "private_corpus_caveat",
+        "text": "The private corpus remains a bounded caveat rather than a private-corpus pass."
+      }
+    ],
+    "must_not_include": [
+      "ELF is not ready for personal production use.",
+      "The private production corpus passed."
+    ],
+    "evidence_links": {
+      "current_benchmark_verdict": [
+        "verdict-current-ready-bounded",
+        "verdict-old-not-ready",
+        "verdict-update-rationale"
+      ],
+      "benchmark_update_rationale": ["verdict-update-rationale"],
+      "private_corpus_caveat": ["verdict-bounded-private-caveat"]
+    },
+    "answer_type": "decision_record",
+    "accepted_alternates": [],
+    "requires_caveat": false,
+    "requires_refusal": false
+  },
+  "required_evidence": [
+    {
+      "evidence_id": "verdict-current-ready-bounded",
+      "claim_id": "current_benchmark_verdict",
+      "requirement": "cite",
+      "quote": "ready for personal production use with bounded caveats"
+    },
+    {
+      "evidence_id": "verdict-bounded-private-caveat",
+      "claim_id": "private_corpus_caveat",
+      "requirement": "cite",
+      "quote": "bounded caveat, not a private-corpus pass"
+    },
+    {
+      "evidence_id": "verdict-update-rationale",
+      "claim_id": "benchmark_update_rationale",
+      "requirement": "explain",
+      "quote": "provider-backed synthetic, stress, backfill, and restore proof"
+    }
+  ],
+  "negative_traps": [
+    {
+      "trap_id": "old-not-ready-verdict-current",
+      "type": "stale_fact",
+      "evidence_ids": ["verdict-old-not-ready"],
+      "failure_if_used": false
+    }
+  ],
+  "scoring_rubric": {
+    "dimensions": {
+      "lifecycle_behavior": {
+        "weight": 0.35,
+        "max_points": 1.0,
+        "criteria": "Reports the current adoption verdict and historical supersession."
+      },
+      "answer_correctness": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "States the current verdict and private-corpus caveat."
+      },
+      "evidence_grounding": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Cites current verdict, caveat, and rationale evidence."
+      },
+      "trap_avoidance": {
+        "weight": 0.15,
+        "max_points": 1.0,
+        "criteria": "Does not report the old not-ready verdict as current."
+      }
+    },
+    "pass_threshold": 0.8,
+    "hard_fail_rules": [
+      "unsupported high-confidence claim about a required decision or fact",
+      "use of a negative trap marked failure_if_used = true"
+    ]
+  },
+  "allowed_uncertainty": {
+    "can_answer_unknown": false,
+    "acceptable_phrases": ["The fixture does not provide that evidence."],
+    "fallback_action": "state_blocker"
+  },
+  "memory_evolution": {
+    "current_evidence_ids": ["verdict-current-ready-bounded"],
+    "historical_evidence_ids": ["verdict-old-not-ready"],
+    "stale_trap_ids": ["old-not-ready-verdict-current"],
+    "conflicts": [
+      {
+        "conflict_id": "benchmark-verdict-overturned",
+        "claim_id": "current_benchmark_verdict",
+        "current_evidence_id": "verdict-current-ready-bounded",
+        "historical_evidence_id": "verdict-old-not-ready",
+        "resolved_by_evidence_id": "verdict-update-rationale"
+      }
+    ],
+    "update_rationale": {
+      "claim_id": "benchmark_update_rationale",
+      "evidence_ids": ["verdict-update-rationale"],
+      "available": true
+    },
+    "temporal_validity": {
+      "required": false,
+      "encoded": false,
+      "follow_up": null
+    }
+  },
+  "tags": [
+    "synthetic",
+    "memory_evolution",
+    "reference_mem0_history",
+    "no_live_claim"
+  ]
+}
diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/deployment_method_superseded.json b/apps/elf-eval/fixtures/real_world_memory/evolution/deployment_method_superseded.json
new file mode 100644
index 00000000..f20d9f08
--- /dev/null
+++ b/apps/elf-eval/fixtures/real_world_memory/evolution/deployment_method_superseded.json
@@ -0,0 +1,226 @@
+{
+  "schema": "elf.real_world_job/v1",
+  "job_id": "memory-evolution-deploy-method-001",
+  "suite": "memory_evolution",
+  "title": "Prefer the superseding production deployment method over the old smoke path",
+  "corpus": {
+    "corpus_id": "real-world-memory-evolution-2026-06-09",
+    "profile": "synthetic",
+    "items": [
+      {
+        "evidence_id": "deploy-old-quickstart",
+        "kind": "runbook",
+        "text": "Old deployment method: use quickstart cargo run service terminals for local smoke only.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "deployment_method_superseded",
+            "evidence_id": "deploy-old-quickstart"
+          }
+        },
+        "created_at": "2026-06-02T00:00:00Z"
+      },
+      {
+        "evidence_id": "deploy-current-production-runbook",
+        "kind": "runbook",
+        "text": "Current single-user production operation uses Docker Compose production runbook with backup, restore, and Qdrant rebuild.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "deployment_method_superseded",
+            "evidence_id": "deploy-current-production-runbook"
+          }
+        },
+        "created_at": "2026-06-09T00:00:00Z"
+      },
+      {
+        "evidence_id": "deploy-supersession-rationale",
+        "kind": "decision",
+        "text": "Quickstart is no longer production guidance because backup, restore, rollback, and provider config handling must be explicit.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "deployment_method_superseded",
+            "evidence_id": "deploy-supersession-rationale"
+          }
+        },
+        "created_at": "2026-06-09T00:10:00Z"
+      }
+    ],
+    "adapter_response": {
+      "adapter_id": "fixture_memory_evolution",
+      "answer": {
+        "content": "Use the Docker Compose production runbook with backup, restore, and Qdrant rebuild for production; the cargo run quickstart is only historical local-smoke guidance because production recovery handling must be explicit.",
+        "claims": [
+          {
+            "claim_id": "current_deployment_method",
+            "text": "Use the Docker Compose production runbook with backup, restore, and Qdrant rebuild for production.",
+            "evidence_ids": [
+              "deploy-current-production-runbook",
+              "deploy-old-quickstart",
+              "deploy-supersession-rationale"
+            ],
+            "confidence": "high"
+          },
+          {
+            "claim_id": "deployment_update_rationale",
+            "text": "The quickstart was superseded because production recovery handling must be explicit.",
+            "evidence_ids": ["deploy-supersession-rationale"],
+            "confidence": "high"
+          }
+        ],
+        "evidence_ids": [
+          "deploy-current-production-runbook",
+          "deploy-supersession-rationale"
+        ],
+        "latency_ms": 1.4,
+        "cost": {
+          "currency": "USD",
+          "amount": 0.0,
+          "input_tokens": 0,
+          "output_tokens": 0
+        }
+      }
+    }
+  },
+  "timeline": [
+    {
+      "event_id": "deploy-quickstart",
+      "ts": "2026-06-02T00:00:00Z",
+      "actor": "agent",
+      "action": "recorded_runbook",
+      "evidence_ids": ["deploy-old-quickstart"],
+      "summary": "The quickstart path existed for local smoke use."
+    },
+    {
+      "event_id": "deploy-production-runbook",
+      "ts": "2026-06-09T00:00:00Z",
+      "actor": "agent",
+      "action": "updated_memory",
+      "evidence_ids": ["deploy-current-production-runbook", "deploy-supersession-rationale"],
+      "summary": "The production runbook became the current production method."
+    }
+  ],
+  "prompt": {
+    "role": "user",
+    "content": "Which deployment path should I use for production now?",
+    "job_mode": "operate",
+    "constraints": ["cite_evidence", "distinguish_current_from_historical"]
+  },
+  "expected_answer": {
+    "must_include": [
+      {
+        "claim_id": "current_deployment_method",
+        "text": "Use the Docker Compose production runbook with backup, restore, and Qdrant rebuild for production."
+      },
+      {
+        "claim_id": "deployment_update_rationale",
+        "text": "The quickstart was superseded because production recovery handling must be explicit."
+      }
+    ],
+    "must_not_include": [
+      "Use quickstart cargo run service terminals for production."
+    ],
+    "evidence_links": {
+      "current_deployment_method": [
+        "deploy-current-production-runbook",
+        "deploy-old-quickstart",
+        "deploy-supersession-rationale"
+      ],
+      "deployment_update_rationale": ["deploy-supersession-rationale"]
+    },
+    "answer_type": "ops_runbook",
+    "accepted_alternates": [],
+    "requires_caveat": false,
+    "requires_refusal": false
+  },
+  "required_evidence": [
+    {
+      "evidence_id": "deploy-current-production-runbook",
+      "claim_id": "current_deployment_method",
+      "requirement": "cite",
+      "quote": "Docker Compose production runbook"
+    },
+    {
+      "evidence_id": "deploy-supersession-rationale",
+      "claim_id": "deployment_update_rationale",
+      "requirement": "explain",
+      "quote": "backup, restore, rollback"
+    }
+  ],
+  "negative_traps": [
+    {
+      "trap_id": "old-quickstart-production",
+      "type": "stale_fact",
+      "evidence_ids": ["deploy-old-quickstart"],
+      "failure_if_used": false
+    }
+  ],
+  "scoring_rubric": {
+    "dimensions": {
+      "lifecycle_behavior": {
+        "weight": 0.35,
+        "max_points": 1.0,
+        "criteria": "Chooses the superseding production runbook."
+      },
+      "answer_correctness": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Answers with the current production method."
+      },
+      "evidence_grounding": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Cites current runbook and supersession rationale."
+      },
+      "trap_avoidance": {
+        "weight": 0.15,
+        "max_points": 1.0,
+        "criteria": "Does not turn the quickstart smoke path into production guidance."
+      }
+    },
+    "pass_threshold": 0.8,
+    "hard_fail_rules": [
+      "unsupported high-confidence claim about a required decision or fact",
+      "use of a negative trap marked failure_if_used = true"
+    ]
+  },
+  "allowed_uncertainty": {
+    "can_answer_unknown": false,
+    "acceptable_phrases": ["The fixture does not provide that evidence."],
+    "fallback_action": "state_blocker"
+  },
+  "memory_evolution": {
+    "current_evidence_ids": ["deploy-current-production-runbook"],
+    "historical_evidence_ids": ["deploy-old-quickstart"],
+    "stale_trap_ids": ["old-quickstart-production"],
+    "conflicts": [
+      {
+        "conflict_id": "deployment-method-supersession",
+        "claim_id": "current_deployment_method",
+        "current_evidence_id": "deploy-current-production-runbook",
+        "historical_evidence_id": "deploy-old-quickstart",
+        "resolved_by_evidence_id": "deploy-supersession-rationale"
+      }
+    ],
+    "update_rationale": {
+      "claim_id": "deployment_update_rationale",
+      "evidence_ids": ["deploy-supersession-rationale"],
+      "available": true
+    },
+    "temporal_validity": {
+      "required": false,
+      "encoded": false,
+      "follow_up": null
+    }
+  },
+  "tags": [
+    "synthetic",
+    "memory_evolution",
+    "reference_letta_core_block",
+    "no_live_claim"
+  ]
+}
diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/issue_blocked_to_done.json b/apps/elf-eval/fixtures/real_world_memory/evolution/issue_blocked_to_done.json
new file mode 100644
index 00000000..8fb40f85
--- /dev/null
+++ b/apps/elf-eval/fixtures/real_world_memory/evolution/issue_blocked_to_done.json
@@ -0,0 +1,221 @@
+{
+  "schema": "elf.real_world_job/v1",
+  "job_id": "memory-evolution-issue-state-001",
+  "suite": "memory_evolution",
+  "title": "Report an issue as done after an earlier blocker cleared",
+  "corpus": {
+    "corpus_id": "real-world-memory-evolution-2026-06-09",
+    "profile": "synthetic",
+    "items": [
+      {
+        "evidence_id": "issue-xy900-blocked",
+        "kind": "issue",
+        "text": "On 2026-06-06, XY-900 was blocked on missing real_world_job fixture/report implementation.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "issue_blocked_to_done",
+            "evidence_id": "issue-xy900-blocked"
+          }
+        },
+        "created_at": "2026-06-06T00:00:00Z"
+      },
+      {
+        "evidence_id": "issue-xy900-done",
+        "kind": "issue",
+        "text": "On 2026-06-09, XY-900 is done after PR #200 added the real_world_job fixture/report implementation.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "issue_blocked_to_done",
+            "evidence_id": "issue-xy900-done"
+          }
+        },
+        "created_at": "2026-06-09T00:00:00Z"
+      },
+      {
+        "evidence_id": "issue-xy900-resolution-rationale",
+        "kind": "decision",
+        "text": "The blocker cleared because the fixture/report runner now exists and publishes typed real-world job reports.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "issue_blocked_to_done",
+            "evidence_id": "issue-xy900-resolution-rationale"
+          }
+        },
+        "created_at": "2026-06-09T00:05:00Z"
+      }
+    ],
+    "adapter_response": {
+      "adapter_id": "fixture_memory_evolution",
+      "answer": {
+        "content": "XY-900 is currently done after PR #200; the earlier missing real_world_job fixture/report blocker is historical and cleared because the runner now publishes typed reports.",
+        "claims": [
+          {
+            "claim_id": "current_issue_state",
+            "text": "XY-900 is currently done after PR #200.",
+            "evidence_ids": [
+              "issue-xy900-done",
+              "issue-xy900-blocked",
+              "issue-xy900-resolution-rationale"
+            ],
+            "confidence": "high"
+          },
+          {
+            "claim_id": "issue_update_rationale",
+            "text": "The blocker cleared because the fixture/report runner now exists.",
+            "evidence_ids": ["issue-xy900-resolution-rationale"],
+            "confidence": "high"
+          }
+        ],
+        "evidence_ids": ["issue-xy900-done", "issue-xy900-resolution-rationale"],
+        "latency_ms": 1.3,
+        "cost": {
+          "currency": "USD",
+          "amount": 0.0,
+          "input_tokens": 0,
+          "output_tokens": 0
+        }
+      }
+    }
+  },
+  "timeline": [
+    {
+      "event_id": "xy900-blocked",
+      "ts": "2026-06-06T00:00:00Z",
+      "actor": "agent",
+      "action": "hit_blocker",
+      "evidence_ids": ["issue-xy900-blocked"],
+      "summary": "The issue was blocked on missing fixture/report implementation."
+    },
+    {
+      "event_id": "xy900-done",
+      "ts": "2026-06-09T00:00:00Z",
+      "actor": "agent",
+      "action": "updated_memory",
+      "evidence_ids": ["issue-xy900-done", "issue-xy900-resolution-rationale"],
+      "summary": "The implementation landed and the blocker cleared."
+    }
+  ],
+  "prompt": {
+    "role": "user",
+    "content": "Is XY-900 still blocked, or is it done now?",
+    "job_mode": "resume",
+    "constraints": ["cite_evidence", "distinguish_current_from_historical"]
+  },
+  "expected_answer": {
+    "must_include": [
+      {
+        "claim_id": "current_issue_state",
+        "text": "XY-900 is currently done after PR #200."
+      },
+      {
+        "claim_id": "issue_update_rationale",
+        "text": "The blocker cleared because the fixture/report runner now exists."
+      }
+    ],
+    "must_not_include": ["XY-900 is currently blocked."],
+    "evidence_links": {
+      "current_issue_state": [
+        "issue-xy900-done",
+        "issue-xy900-blocked",
+        "issue-xy900-resolution-rationale"
+      ],
+      "issue_update_rationale": ["issue-xy900-resolution-rationale"]
+    },
+    "answer_type": "resume_summary",
+    "accepted_alternates": [],
+    "requires_caveat": false,
+    "requires_refusal": false
+  },
+  "required_evidence": [
+    {
+      "evidence_id": "issue-xy900-done",
+      "claim_id": "current_issue_state",
+      "requirement": "cite",
+      "quote": "XY-900 is done"
+    },
+    {
+      "evidence_id": "issue-xy900-resolution-rationale",
+      "claim_id": "issue_update_rationale",
+      "requirement": "explain",
+      "quote": "fixture/report runner now exists"
+    }
+  ],
+  "negative_traps": [
+    {
+      "trap_id": "old-issue-blocker-current",
+      "type": "stale_fact",
+      "evidence_ids": ["issue-xy900-blocked"],
+      "failure_if_used": false
+    }
+  ],
+  "scoring_rubric": {
+    "dimensions": {
+      "lifecycle_behavior": {
+        "weight": 0.35,
+        "max_points": 1.0,
+        "criteria": "Reports the latest issue state rather than the historical blocker."
+      },
+      "answer_correctness": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "States that the issue is done and why."
+      },
+      "evidence_grounding": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Uses current completion and resolution evidence."
+      },
+      "trap_avoidance": {
+        "weight": 0.15,
+        "max_points": 1.0,
+        "criteria": "Does not report the old blocker as current."
+      }
+    },
+    "pass_threshold": 0.8,
+    "hard_fail_rules": [
+      "unsupported high-confidence claim about a required decision or fact",
+      "use of a negative trap marked failure_if_used = true"
+    ]
+  },
+  "allowed_uncertainty": {
+    "can_answer_unknown": false,
+    "acceptable_phrases": ["The fixture does not provide that evidence."],
+    "fallback_action": "state_blocker"
+  },
+  "memory_evolution": {
+    "current_evidence_ids": ["issue-xy900-done"],
+    "historical_evidence_ids": ["issue-xy900-blocked"],
+    "stale_trap_ids": ["old-issue-blocker-current"],
+    "conflicts": [
+      {
+        "conflict_id": "issue-state-blocked-to-done",
+        "claim_id": "current_issue_state",
+        "current_evidence_id": "issue-xy900-done",
+        "historical_evidence_id": "issue-xy900-blocked",
+        "resolved_by_evidence_id": "issue-xy900-resolution-rationale"
+      }
+    ],
+    "update_rationale": {
+      "claim_id": "issue_update_rationale",
+      "evidence_ids": ["issue-xy900-resolution-rationale"],
+      "available": true
+    },
+    "temporal_validity": {
+      "required": false,
+      "encoded": false,
+      "follow_up": null
+    }
+  },
+  "tags": [
+    "synthetic",
+    "memory_evolution",
+    "reference_mem0_history",
+    "no_live_claim"
+  ]
+}
diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/preference_changed_current_vs_historical.json b/apps/elf-eval/fixtures/real_world_memory/evolution/preference_changed_current_vs_historical.json
new file mode 100644
index 00000000..bf5e93c7
--- /dev/null
+++ b/apps/elf-eval/fixtures/real_world_memory/evolution/preference_changed_current_vs_historical.json
@@ -0,0 +1,224 @@
+{
+  "schema": "elf.real_world_job/v1",
+  "job_id": "memory-evolution-preference-001",
+  "suite": "memory_evolution",
+  "title": "Apply the current user preference while preserving the historical one",
+  "corpus": {
+    "corpus_id": "real-world-memory-evolution-2026-06-09",
+    "profile": "synthetic",
+    "items": [
+      {
+        "evidence_id": "pref-old-terse-bullets",
+        "kind": "note",
+        "text": "On 2026-06-01, the user preferred terse bullet-only benchmark updates.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "preference_changed_current_vs_historical",
+            "evidence_id": "pref-old-terse-bullets"
+          }
+        },
+        "created_at": "2026-06-01T00:00:00Z"
+      },
+      {
+        "evidence_id": "pref-current-concise-rationale",
+        "kind": "note",
+        "text": "On 2026-06-08, the user changed preference to concise prose with explicit evidence before bullets.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "preference_changed_current_vs_historical",
+            "evidence_id": "pref-current-concise-rationale"
+          }
+        },
+        "created_at": "2026-06-08T00:00:00Z"
+      },
+      {
+        "evidence_id": "pref-update-rationale",
+        "kind": "decision",
+        "text": "The user said the earlier terse bullets hid rationale, so future benchmark updates should include concise rationale.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "preference_changed_current_vs_historical",
+            "evidence_id": "pref-update-rationale"
+          }
+        },
+        "created_at": "2026-06-08T00:05:00Z"
+      }
+    ],
+    "adapter_response": {
+      "adapter_id": "fixture_memory_evolution",
+      "answer": {
+        "content": "Use concise prose with explicit evidence before bullets; the terse bullet-only preference is historical because it hid rationale.",
+        "claims": [
+          {
+            "claim_id": "current_preference",
+            "text": "Use concise prose with explicit evidence before bullets.",
+            "evidence_ids": [
+              "pref-current-concise-rationale",
+              "pref-old-terse-bullets",
+              "pref-update-rationale"
+            ],
+            "confidence": "high"
+          },
+          {
+            "claim_id": "preference_update_rationale",
+            "text": "The preference changed because terse bullets hid rationale.",
+            "evidence_ids": ["pref-update-rationale"],
+            "confidence": "high"
+          }
+        ],
+        "evidence_ids": ["pref-current-concise-rationale", "pref-update-rationale"],
+        "latency_ms": 1.1,
+        "cost": {
+          "currency": "USD",
+          "amount": 0.0,
+          "input_tokens": 0,
+          "output_tokens": 0
+        }
+      }
+    }
+  },
+  "timeline": [
+    {
+      "event_id": "preference-old",
+      "ts": "2026-06-01T00:00:00Z",
+      "actor": "user",
+      "action": "set_preference",
+      "evidence_ids": ["pref-old-terse-bullets"],
+      "summary": "The user initially preferred terse bullet-only benchmark updates."
+    },
+    {
+      "event_id": "preference-current",
+      "ts": "2026-06-08T00:00:00Z",
+      "actor": "user",
+      "action": "updated_memory",
+      "evidence_ids": ["pref-current-concise-rationale", "pref-update-rationale"],
+      "summary": "The user changed the preference and gave the rationale."
+    }
+  ],
+  "prompt": {
+    "role": "user",
+    "content": "How should benchmark updates be written now, and what changed?",
+    "job_mode": "personalize",
+    "constraints": ["cite_evidence", "distinguish_current_from_historical"]
+  },
+  "expected_answer": {
+    "must_include": [
+      {
+        "claim_id": "current_preference",
+        "text": "Use concise prose with explicit evidence before bullets."
+      },
+      {
+        "claim_id": "preference_update_rationale",
+        "text": "The preference changed because terse bullets hid rationale."
+      }
+    ],
+    "must_not_include": [
+      "Use terse bullet-only benchmark updates as the current preference."
+    ],
+    "evidence_links": {
+      "current_preference": [
+        "pref-current-concise-rationale",
+        "pref-old-terse-bullets",
+        "pref-update-rationale"
+      ],
+      "preference_update_rationale": ["pref-update-rationale"]
+    },
+    "answer_type": "direct_answer",
+    "accepted_alternates": [],
+    "requires_caveat": false,
+    "requires_refusal": false
+  },
+  "required_evidence": [
+    {
+      "evidence_id": "pref-current-concise-rationale",
+      "claim_id": "current_preference",
+      "requirement": "cite",
+      "quote": "changed preference to concise prose"
+    },
+    {
+      "evidence_id": "pref-update-rationale",
+      "claim_id": "preference_update_rationale",
+      "requirement": "explain",
+      "quote": "terse bullets hid rationale"
+    }
+  ],
+  "negative_traps": [
+    {
+      "trap_id": "old-terse-preference-current",
+      "type": "stale_fact",
+      "evidence_ids": ["pref-old-terse-bullets"],
+      "failure_if_used": false
+    }
+  ],
+  "scoring_rubric": {
+    "dimensions": {
+      "lifecycle_behavior": {
+        "weight": 0.35,
+        "max_points": 1.0,
+        "criteria": "Chooses the current preference while preserving the historical version."
+      },
+      "answer_correctness": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "States the current preference and update rationale."
+      },
+      "evidence_grounding": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Cites the current preference and rationale evidence."
+      },
+      "trap_avoidance": {
+        "weight": 0.15,
+        "max_points": 1.0,
+        "criteria": "Does not promote the stale preference as current."
+      }
+    },
+    "pass_threshold": 0.8,
+    "hard_fail_rules": [
+      "unsupported high-confidence claim about a required decision or fact",
+      "use of a negative trap marked failure_if_used = true"
+    ]
+  },
+  "allowed_uncertainty": {
+    "can_answer_unknown": false,
+    "acceptable_phrases": ["The fixture does not provide that evidence."],
+    "fallback_action": "state_blocker"
+  },
+  "memory_evolution": {
+    "current_evidence_ids": ["pref-current-concise-rationale"],
+    "historical_evidence_ids": ["pref-old-terse-bullets"],
+    "stale_trap_ids": ["old-terse-preference-current"],
+    "conflicts": [
+      {
+        "conflict_id": "preference-style-supersession",
+        "claim_id": "current_preference",
+        "current_evidence_id": "pref-current-concise-rationale",
+        "historical_evidence_id": "pref-old-terse-bullets",
+        "resolved_by_evidence_id": "pref-update-rationale"
+      }
+    ],
+    "update_rationale": {
+      "claim_id": "preference_update_rationale",
+      "evidence_ids": ["pref-update-rationale"],
+      "available": true
+    },
+    "temporal_validity": {
+      "required": false,
+      "encoded": false,
+      "follow_up": null
+    }
+  },
+  "tags": [
+    "synthetic",
+    "memory_evolution",
+    "reference_mem0_history",
+    "reference_letta_core_block",
+    "no_live_claim"
+  ]
+}
diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity_not_encoded.json b/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity_not_encoded.json
new file mode 100644
index 00000000..6c3a0c0f
--- /dev/null
+++ b/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity_not_encoded.json
@@ -0,0 +1,199 @@
+{
+  "schema": "elf.real_world_job/v1",
+  "job_id": "memory-evolution-relation-temporal-001",
+  "suite": "memory_evolution",
+  "title": "Mark temporal relation validity as not encoded instead of faking a graph pass",
+  "encoding": {
+    "status": "not_encoded",
+    "reason": "ELF graph-lite currently returns bounded relation context, but this runner does not yet encode current-only versus historical temporal validity for relation facts.",
+    "follow_up": {
+      "title": "[ELF graph P1] Add temporal validity to graph-lite facts",
+      "reason": "Relation facts need valid_from and invalidated_at semantics before this job can claim a current-versus-historical graph pass."
+    }
+  },
+  "corpus": {
+    "corpus_id": "real-world-memory-evolution-2026-06-09",
+    "profile": "synthetic",
+    "items": [
+      {
+        "evidence_id": "relation-old-owner",
+        "kind": "adapter_state",
+        "text": "Before 2026-06-06, Team Delta owned deployment method review.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "relation_temporal_validity_not_encoded",
+            "evidence_id": "relation-old-owner"
+          }
+        },
+        "created_at": "2026-06-05T00:00:00Z"
+      },
+      {
+        "evidence_id": "relation-current-owner",
+        "kind": "adapter_state",
+        "text": "Since 2026-06-08, Team Echo owns deployment method review.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "relation_temporal_validity_not_encoded",
+            "evidence_id": "relation-current-owner"
+          }
+        },
+        "created_at": "2026-06-08T00:00:00Z"
+      },
+      {
+        "evidence_id": "relation-owner-rationale",
+        "kind": "decision",
+        "text": "Ownership moved after single-user production runbook scope changed.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "relation_temporal_validity_not_encoded",
+            "evidence_id": "relation-owner-rationale"
+          }
+        },
+        "created_at": "2026-06-08T00:05:00Z"
+      }
+    ]
+  },
+  "timeline": [
+    {
+      "event_id": "relation-old-owner",
+      "ts": "2026-06-05T00:00:00Z",
+      "actor": "agent",
+      "action": "recorded_relation",
+      "evidence_ids": ["relation-old-owner"],
+      "summary": "Team Delta was the historical owner."
+    },
+    {
+      "event_id": "relation-current-owner",
+      "ts": "2026-06-08T00:00:00Z",
+      "actor": "agent",
+      "action": "updated_memory",
+      "evidence_ids": ["relation-current-owner", "relation-owner-rationale"],
+      "summary": "Team Echo became the current owner after the scope changed."
+    }
+  ],
+  "prompt": {
+    "role": "user",
+    "content": "Who currently owns deployment method review, and who owned it historically?",
+    "job_mode": "answer",
+    "constraints": ["cite_evidence", "distinguish_current_from_historical"]
+  },
+  "expected_answer": {
+    "must_include": [
+      {
+        "claim_id": "relation_current_owner",
+        "text": "Team Echo currently owns deployment method review."
+      },
+      {
+        "claim_id": "relation_historical_owner",
+        "text": "Team Delta owned deployment method review historically."
+      }
+    ],
+    "must_not_include": ["Team Delta currently owns deployment method review."],
+    "evidence_links": {
+      "relation_current_owner": [
+        "relation-current-owner",
+        "relation-old-owner",
+        "relation-owner-rationale"
+      ],
+      "relation_historical_owner": ["relation-old-owner"]
+    },
+    "answer_type": "direct_answer",
+    "accepted_alternates": [],
+    "requires_caveat": false,
+    "requires_refusal": false
+  },
+  "required_evidence": [
+    {
+      "evidence_id": "relation-current-owner",
+      "claim_id": "relation_current_owner",
+      "requirement": "cite",
+      "quote": "Team Echo owns deployment method review"
+    },
+    {
+      "evidence_id": "relation-old-owner",
+      "claim_id": "relation_historical_owner",
+      "requirement": "cite",
+      "quote": "Team Delta owned deployment method review"
+    }
+  ],
+  "negative_traps": [
+    {
+      "trap_id": "old-owner-as-current",
+      "type": "stale_fact",
+      "evidence_ids": ["relation-old-owner"],
+      "failure_if_used": false
+    }
+  ],
+  "scoring_rubric": {
+    "dimensions": {
+      "lifecycle_behavior": {
+        "weight": 0.4,
+        "max_points": 1.0,
+        "criteria": "Requires current-only versus historical temporal validity for relation facts."
+      },
+      "answer_correctness": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Would identify current and historical owners separately."
+      },
+      "evidence_grounding": {
+        "weight": 0.2,
+        "max_points": 1.0,
+        "criteria": "Would cite both current and historical relation evidence."
+      },
+      "trap_avoidance": {
+        "weight": 0.15,
+        "max_points": 1.0,
+        "criteria": "Would not report the historical owner as current."
+      }
+    },
+    "pass_threshold": 0.8,
+    "hard_fail_rules": [
+      "unsupported high-confidence claim about a required decision or fact",
+      "use of a negative trap marked failure_if_used = true"
+    ]
+  },
+  "allowed_uncertainty": {
+    "can_answer_unknown": true,
+    "acceptable_phrases": ["Temporal relation validity is not encoded in this runner."],
+    "fallback_action": "state_blocker"
+  },
+  "memory_evolution": {
+    "current_evidence_ids": ["relation-current-owner"],
+    "historical_evidence_ids": ["relation-old-owner"],
+    "stale_trap_ids": ["old-owner-as-current"],
+    "conflicts": [
+      {
+        "conflict_id": "relation-owner-current-historical",
+        "claim_id": "relation_current_owner",
+        "current_evidence_id": "relation-current-owner",
+        "historical_evidence_id": "relation-old-owner",
+        "resolved_by_evidence_id": "relation-owner-rationale"
+      }
+    ],
+    "update_rationale": {
+      "claim_id": "relation_owner_update_rationale",
+      "evidence_ids": ["relation-owner-rationale"],
+      "available": false
+    },
+    "temporal_validity": {
+      "required": true,
+      "encoded": false,
+      "follow_up": "[ELF graph P1] Add temporal validity to graph-lite facts"
+    }
+  },
+  "tags": [
+    "synthetic",
+    "memory_evolution",
+    "reference_graphiti_zep_temporal",
+    "reference_nanograph_typed_query",
+    "not_encoded",
+    "no_live_claim"
+  ]
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs
index 59ee9bd2..643572d5 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs
@@ -108,6 +108,9 @@ struct RealWorldJob {
 	operator_debug: Option<OperatorDebugEvidence>,
 	#[serde(default)]
 	tags: Vec<String>,
+	#[serde(default)]
+	encoding: JobEncoding,
+	memory_evolution: Option<MemoryEvolution>,
 }
 
 #[derive(Debug, Deserialize)]
@@ -249,6 +252,57 @@ struct NegativeTrap {
 	failure_if_used: bool,
 }
 
+#[derive(Debug, Default, Deserialize)]
+struct JobEncoding { // Optional per-job `encoding` block; every field defaults to None.
+	status: Option<TypedStatus>, // Validation accepts only not_encoded, blocked, or incomplete.
+	reason: Option<String>, // Must be non-blank whenever `status` is set.
+	follow_up: Option<FollowUpInput>, // When present, title and reason must both be non-blank.
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize)]
+struct FollowUpInput { // Follow-up entry declared under a job's `encoding` block.
+	title: String, // Fallback follow_up text for the per-job evolution report.
+	reason: String, // Validation rejects a blank title or a blank reason.
+}
+
+#[derive(Debug, Deserialize)]
+struct MemoryEvolution { // Optional per-job `memory_evolution` fixture block.
+	#[serde(default)]
+	current_evidence_ids: Vec<String>, // Each id is passed to `ensure_known_evidence` in validation.
+	#[serde(default)]
+	historical_evidence_ids: Vec<String>, // Each id is passed to `ensure_known_evidence` in validation.
+	#[serde(default)]
+	stale_trap_ids: Vec<String>, // Must name negative traps; when empty, `stale_fact` traps are used.
+	#[serde(default)]
+	conflicts: Vec<EvolutionConflict>,
+	update_rationale: Option<UpdateRationale>,
+	temporal_validity: Option<TemporalValidity>,
+}
+
+#[derive(Debug, Deserialize)]
+struct EvolutionConflict { // One declared conflict between current and historical evidence.
+	conflict_id: String, // Validation rejects a blank value.
+	claim_id: String, // Validation rejects a blank value; matched against produced claim ids.
+	current_evidence_id: String,
+	historical_evidence_id: String,
+	resolved_by_evidence_id: Option<String>, // When set, detection also requires this id to be cited.
+}
+
+#[derive(Debug, Deserialize)]
+struct UpdateRationale { // Claim id and evidence ids declared for the update rationale.
+	claim_id: String, // Validation rejects a blank value.
+	#[serde(default)]
+	evidence_ids: Vec<String>, // A matching claim must cite at least one of these ids.
+	available: bool, // When false, the rationale is never reported as available.
+}
+
+#[derive(Debug, Deserialize)]
+struct TemporalValidity { // Declares whether temporal validity is required and whether it is encoded.
+	required: bool, // required && !encoded demands encoding.status not_encoded or blocked.
+	encoded: bool,
+	follow_up: Option<String>, // Must be non-blank when present; preferred report follow_up text.
+}
+
 #[derive(Debug, Deserialize)]
 struct ScoringRubric {
 	#[serde(default)]
@@ -374,6 +428,10 @@ struct RealWorldReport {
 	unsupported_claims: Vec<UnsupportedClaimReport>,
 	not_encoded_suites: Vec<String>,
 	private_corpus_redaction: PrivateCorpusRedaction,
+	#[serde(default)]
+	evolution: EvolutionSummary,
+	#[serde(default)]
+	follow_ups: Vec<FollowUpReport>,
 }
 
 #[derive(Debug, Deserialize, Serialize)]
@@ -399,6 +457,14 @@ struct ReportSummary {
 	unsupported_claim: usize,
 	unsupported_claim_count: usize,
 	wrong_result_count: usize,
+	#[serde(default)]
+	stale_answer_count: usize,
+	#[serde(default)]
+	conflict_detection_count: usize,
+	#[serde(default)]
+	update_rationale_available_count: usize,
+	#[serde(default)]
+	temporal_validity_not_encoded_count: usize,
 	mean_score: f64,
 	mean_latency_ms: Option<f64>,
 	total_cost: Option<CostReport>,
@@ -454,6 +520,14 @@ struct SuiteReport {
 	score_mean: Option<f64>,
 	unsupported_claim_count: usize,
 	wrong_result_count: usize,
+	#[serde(default)]
+	stale_answer_count: usize,
+	#[serde(default)]
+	conflict_detection_count: usize,
+	#[serde(default)]
+	update_rationale_available_count: usize,
+	#[serde(default)]
+	temporal_validity_not_encoded_count: usize,
 	reason: String,
 }
 
@@ -470,6 +544,14 @@ struct JobReport {
 	produced_evidence: Vec<String>,
 	unsupported_claim_count: usize,
 	wrong_result_count: usize,
+	#[serde(default)]
+	stale_answer_count: usize,
+	#[serde(default)]
+	conflict_detection_count: usize,
+	#[serde(default)]
+	update_rationale_available: bool,
+	#[serde(default)]
+	temporal_validity_not_encoded: bool,
 	latency_ms: Option<f64>,
 	cost: Option<CostReport>,
 	trap_ids_used: Vec<String>,
@@ -501,6 +583,8 @@ struct JobReport {
 	qdrant_rebuild_case: bool,
 	#[serde(skip_serializing_if = "Option::is_none")]
 	operator_debug: Option<OperatorDebugEvidence>,
+	#[serde(skip_serializing_if = "Option::is_none")]
+	evolution: Option<EvolutionJobReport>,
 }
 
 #[derive(Debug, Deserialize, Serialize)]
@@ -528,6 +612,38 @@ struct UnsupportedClaimReport {
 	evidence_ids: Vec<String>,
 }
 
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+struct EvolutionSummary { // Stored in the report's `evolution` field; built by `evolution_summary`.
+	stale_answer_count: usize,
+	conflict_detection_count: usize,
+	update_rationale_available_count: usize,
+	temporal_validity_not_encoded_count: usize,
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize)]
+struct EvolutionJobReport { // Per-job evolution result produced by `evolution_job_report`.
+	current_evidence: Vec<String>, // Copy of the fixture's current_evidence_ids.
+	historical_evidence: Vec<String>, // Copy of the fixture's historical_evidence_ids.
+	stale_trap_ids_used: Vec<String>, // Used trap ids that count as stale traps.
+	stale_answer_count: usize,
+	conflict_count: usize, // Number of conflicts the fixture declares.
+	conflict_detection_count: usize, // Declared conflicts accepted by `conflict_is_detected`.
+	update_rationale_available: bool,
+	temporal_validity_required: bool,
+	temporal_validity_encoded: bool,
+	temporal_validity_not_encoded: bool, // required && !encoded.
+	#[serde(skip_serializing_if = "Option::is_none")]
+	follow_up: Option<String>, // Left out of serialized output when None.
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct FollowUpReport { // Entry of the report's `follow_ups` list; built by `follow_up_reports`.
+	suite_id: String,
+	job_id: String,
+	title: String,
+	reason: String,
+}
+
 #[derive(Debug, Deserialize, Serialize)]
 struct PrivateCorpusRedaction {
 	policy: String,
@@ -544,6 +660,7 @@ struct JobScoring {
 	trap_ids_used: Vec<String>,
 	dimension_scores: Vec<DimensionScoreReport>,
 	reason: String,
+	evolution: Option<EvolutionJobReport>,
 }
 
 #[derive(Debug, Default)]
@@ -557,6 +674,9 @@ struct FailureCounts {
 	operator_debug_raw_sql: usize,
 	operator_debug_trace_gaps: usize,
 	operator_debug_repair_unclear: usize,
+	stale_answers: usize,
+	conflict_detection_missing: usize,
+	update_rationale_missing: usize,
 }
 
 #[derive(Debug, Default)]
@@ -676,6 +796,8 @@ fn validate_job(job: &RealWorldJob, path: &Path) -> Result<()> {
 	validate_scoring_rubric(job, path)?;
 	validate_allowed_uncertainty(job, path)?;
 	validate_operator_debug(job, path)?;
+	validate_job_encoding(job, path)?;
+	validate_memory_evolution(job, path)?;
 
 	Ok(())
 }
@@ -949,6 +1071,141 @@ fn validate_operator_debug(job: &RealWorldJob, path: &Path) -> Result<()> {
 	Ok(())
 }
 
+/// Checks the optional `encoding` block: a declared status must be one of
+/// not_encoded, blocked, or incomplete and carry a non-blank reason, and a
+/// follow-up, when present, needs a non-blank title and reason.
+fn validate_job_encoding(job: &RealWorldJob, path: &Path) -> Result<()> {
+	let encoding = &job.encoding;
+
+	if let Some(status) = encoding.status {
+		let allowed = matches!(
+			status,
+			TypedStatus::NotEncoded | TypedStatus::Blocked | TypedStatus::Incomplete
+		);
+		if !allowed {
+			return Err(eyre::eyre!(
+				"{} job {} uses encoding.status {}; only not_encoded, blocked, or incomplete are allowed.",
+				path.display(),
+				job.job_id,
+				status_str(status)
+			));
+		}
+		let reason_is_blank = encoding.reason.as_deref().is_none_or(|text| text.trim().is_empty());
+		if reason_is_blank {
+			return Err(eyre::eyre!("{} job {} declares encoding.status but no reason.", path.display(), job.job_id));
+		}
+	}
+	if let Some(follow_up) = encoding.follow_up.as_ref() {
+		let incomplete = follow_up.title.trim().is_empty() || follow_up.reason.trim().is_empty();
+		if incomplete {
+			return Err(eyre::eyre!("{} job {} has an incomplete encoding follow-up.", path.display(), job.job_id));
+		}
+	}
+
+	Ok(())
+}
+
+/// Cross-checks the optional `memory_evolution` block against the job's corpus
+/// evidence ids and negative trap ids; jobs without the block pass unchanged.
+fn validate_memory_evolution(job: &RealWorldJob, path: &Path) -> Result<()> {
+	let evolution = match job.memory_evolution.as_ref() {
+		Some(evolution) => evolution,
+		None => return Ok(()),
+	};
+	let known_evidence = corpus_evidence_ids(job);
+	let known_traps: BTreeSet<&str> =
+		job.negative_traps.iter().map(|trap| trap.trap_id.as_str()).collect();
+
+	for id in evolution.current_evidence_ids.iter().chain(&evolution.historical_evidence_ids) {
+		ensure_known_evidence(path, &known_evidence, id)?;
+	}
+	if let Some(unknown) =
+		evolution.stale_trap_ids.iter().find(|id| !known_traps.contains(id.as_str()))
+	{
+		return Err(eyre::eyre!(
+			"{} job {} references unknown stale trap id {}.",
+			path.display(),
+			job.job_id,
+			unknown
+		));
+	}
+	for conflict in &evolution.conflicts {
+		validate_evolution_conflict(path, &known_evidence, conflict)?;
+	}
+	if let Some(rationale) = evolution.update_rationale.as_ref() {
+		validate_update_rationale(path, &known_evidence, rationale)?;
+	}
+	if let Some(temporal) = evolution.temporal_validity.as_ref() {
+		validate_temporal_validity(job, path, temporal)?;
+	}
+
+	Ok(())
+}
+
+/// Requires non-blank conflict and claim ids, then passes the current, historical,
+/// and optional resolving evidence ids (in that order) to `ensure_known_evidence`.
+fn validate_evolution_conflict(
+	path: &Path,
+	evidence_ids: &BTreeSet<String>,
+	conflict: &EvolutionConflict,
+) -> Result<()> {
+	let has_blank_id = [&conflict.conflict_id, &conflict.claim_id].iter().any(|id| id.trim().is_empty());
+	if has_blank_id {
+		return Err(eyre::eyre!("{} has an incomplete evolution conflict.", path.display()));
+	}
+	let referenced = [Some(&conflict.current_evidence_id), Some(&conflict.historical_evidence_id), conflict.resolved_by_evidence_id.as_ref()];
+	for evidence_id in referenced.into_iter().flatten() {
+		ensure_known_evidence(path, evidence_ids, evidence_id)?;
+	}
+
+	Ok(())
+}
+
+/// Rejects an update rationale whose claim id is blank, then passes each of its
+/// evidence ids to `ensure_known_evidence`.
+fn validate_update_rationale(
+	path: &Path,
+	evidence_ids: &BTreeSet<String>,
+	rationale: &UpdateRationale,
+) -> Result<()> {
+	let claim_is_blank = rationale.claim_id.trim().is_empty();
+	if claim_is_blank {
+		return Err(eyre::eyre!("{} has an update rationale with an empty claim_id.", path.display()));
+	}
+
+	for id in rationale.evidence_ids.iter() {
+		ensure_known_evidence(path, evidence_ids, id)?;
+	}
+
+	Ok(())
+}
+
+/// Rejects a blank temporal-validity follow-up, and rejects a job that requires
+/// temporal validity without encoding it unless its encoding status is
+/// not_encoded or blocked.
+fn validate_temporal_validity(
+	job: &RealWorldJob,
+	path: &Path,
+	temporal: &TemporalValidity,
+) -> Result<()> {
+	let blank_follow_up = matches!(temporal.follow_up.as_deref(), Some(text) if text.trim().is_empty());
+	if blank_follow_up {
+		return Err(eyre::eyre!("{} job {} has an empty temporal validity follow-up.", path.display(), job.job_id));
+	}
+	let declared_unrunnable =
+		matches!(job.encoding.status, Some(TypedStatus::NotEncoded | TypedStatus::Blocked));
+	let missing_encoding = temporal.required && !temporal.encoded;
+	if missing_encoding && !declared_unrunnable {
+		return Err(eyre::eyre!(
+			"{} job {} requires temporal validity but does not declare a not_encoded or blocked encoding status.",
+			path.display(),
+			job.job_id
+		));
+	}
+
+	Ok(())
+}
+
 fn validate_optional_debug_field(path: &Path, value: Option<&str>, field: &str) -> Result<()> {
 	if value.is_some_and(|value| value.trim().is_empty()) {
 		return Err(eyre::eyre!("{} has empty operator_debug {field}.", path.display()));
@@ -1019,6 +1276,8 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result<RealWorldReport
 		.map(|suite| suite.suite_id.clone())
 		.collect::<Vec<_>>();
 	let summary = report_summary(&job_reports, &suites);
+	let evolution = evolution_summary(&job_reports);
+	let follow_ups = follow_up_reports(jobs);
 
 	Ok(RealWorldReport {
 		schema: REPORT_SCHEMA.to_string(),
@@ -1033,19 +1292,48 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result<RealWorldReport
 		unsupported_claims,
 		not_encoded_suites,
 		private_corpus_redaction: private_corpus_redaction(jobs),
+		evolution,
+		follow_ups,
 	})
 }
 
 fn score_job(job: &RealWorldJob) -> JobScoring {
 	let answer = produced_answer(job);
 	let produced_evidence = produced_evidence_ids(answer);
+	let trap_ids_used = trap_ids_used(job, &produced_evidence);
+
+	if let Some(status) = job.encoding.status {
+		let evolution = evolution_job_report(job, answer, &trap_ids_used, 0);
+
+		return JobScoring {
+			status,
+			normalized_score: 0.0,
+			hard_fail_hits: Vec::new(),
+			unsupported_claims: Vec::new(),
+			wrong_result_count: 0,
+			trap_ids_used,
+			dimension_scores: declared_not_encoded_dimension_scores(job),
+			reason: job
+				.encoding
+				.reason
+				.clone()
+				.unwrap_or_else(|| "Job did not reach a runnable scoring state.".to_string()),
+			evolution,
+		};
+	}
+
 	let missing_claims = missing_required_claims(job, answer);
 	let forbidden_claims = forbidden_claim_hits(job, answer);
 	let missing_evidence = missing_required_evidence(job, &produced_evidence);
-	let trap_ids_used = trap_ids_used(job, &produced_evidence);
 	let mut unsupported_claims = unsupported_claims(job, answer);
 	let operator_counts = operator_debug_failure_counts(job);
 	let hard_fail_hits = hard_fail_hits(job, &unsupported_claims, &trap_ids_used);
+	let evolution = evolution_job_report(job, answer, &trap_ids_used, forbidden_claims.len());
+	let stale_answers = evolution.as_ref().map_or(0, |report| report.stale_answer_count);
+	let conflict_detection_missing = evolution
+		.as_ref()
+		.map_or(0, |report| report.conflict_count - report.conflict_detection_count);
+	let update_rationale_missing = evolution.as_ref().map_or(0, update_rationale_missing_count);
 	let counts = FailureCounts {
 		missing_claims: missing_claims.len(),
 		forbidden_claims: forbidden_claims.len(),
@@ -1056,6 +1344,9 @@ fn score_job(job: &RealWorldJob) -> JobScoring {
 		operator_debug_raw_sql: operator_counts.operator_debug_raw_sql,
 		operator_debug_trace_gaps: operator_counts.operator_debug_trace_gaps,
 		operator_debug_repair_unclear: operator_counts.operator_debug_repair_unclear,
+		stale_answers,
+		conflict_detection_missing,
+		update_rationale_missing,
 	};
 	let dimension_scores = dimension_scores(job, &counts);
 	let normalized_score = normalized_score(&dimension_scores);
@@ -1066,7 +1357,9 @@ fn score_job(job: &RealWorldJob) -> JobScoring {
 		+ counts.operator_debug_missing
 		+ counts.operator_debug_raw_sql
 		+ counts.operator_debug_trace_gaps
-		+ counts.operator_debug_repair_unclear;
+		+ counts.operator_debug_repair_unclear
+		+ counts.conflict_detection_missing
+		+ counts.update_rationale_missing;
 	let status = job_status(
 		normalized_score,
 		job.scoring_rubric.pass_threshold,
@@ -1089,6 +1382,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring {
 		trap_ids_used,
 		dimension_scores,
 		reason,
+		evolution,
 	}
 }
 
@@ -1108,6 +1402,19 @@ fn operator_debug_failure_counts(job: &RealWorldJob) -> FailureCounts {
 	}
 }
 
+/// Builds one zero-score entry per rubric dimension, copying the dimension's
+/// id, max_points, and weight from the job's scoring rubric.
+fn declared_not_encoded_dimension_scores(job: &RealWorldJob) -> Vec<DimensionScoreReport> {
+	let mut reports = Vec::new();
+
+	for (id, rubric) in job.scoring_rubric.dimensions.iter() {
+		let (dimension, max_points, weight) = (id.clone(), rubric.max_points, rubric.weight);
+		reports.push(DimensionScoreReport { dimension, score: 0.0, max_points, weight });
+	}
+
+	reports
+}
+
 fn produced_answer(job: &RealWorldJob) -> &ProducedAnswer {
 	job.corpus
 		.adapter_response
@@ -1196,6 +1503,129 @@ fn trap_ids_used(job: &RealWorldJob, produced_evidence: &BTreeSet<String>) -> Ve
 		.collect()
 }
 
+/// Summarizes a job's `memory_evolution` block against the produced answer;
+/// returns `None` when the job declares no such block.
+fn evolution_job_report(
+	job: &RealWorldJob,
+	answer: &ProducedAnswer,
+	trap_ids_used: &[String],
+	forbidden_claim_count: usize,
+) -> Option<EvolutionJobReport> {
+	let evolution = job.memory_evolution.as_ref()?;
+	let temporal = evolution.temporal_validity.as_ref();
+	let required = temporal.is_some_and(|validity| validity.required);
+	let encoded = temporal.is_some_and(|validity| validity.encoded);
+	let stale_traps = stale_trap_ids_used(job, evolution, trap_ids_used);
+	let stale_answers =
+		stale_answer_count(job, evolution, &stale_traps, forbidden_claim_count);
+	let detected = evolution
+		.conflicts
+		.iter()
+		.filter(|conflict| conflict_is_detected(conflict, answer))
+		.count();
+	let rationale_available = match evolution.update_rationale.as_ref() {
+		Some(rationale) => update_rationale_is_available(rationale, answer),
+		None => false,
+	};
+	// The temporal-validity follow-up wins; the encoding follow-up title is the fallback.
+	let follow_up = match temporal.and_then(|validity| validity.follow_up.clone()) {
+		Some(title) => Some(title),
+		None => job.encoding.follow_up.as_ref().map(|input| input.title.clone()),
+	};
+
+	Some(EvolutionJobReport {
+		current_evidence: evolution.current_evidence_ids.clone(),
+		historical_evidence: evolution.historical_evidence_ids.clone(),
+		stale_answer_count: stale_answers,
+		stale_trap_ids_used: stale_traps,
+		conflict_count: evolution.conflicts.len(),
+		conflict_detection_count: detected,
+		update_rationale_available: rationale_available,
+		temporal_validity_required: required,
+		temporal_validity_encoded: encoded,
+		temporal_validity_not_encoded: required && !encoded,
+		follow_up,
+	})
+}
+
+/// Returns the larger of the used stale-trap count and the forbidden-claim
+/// count; forbidden claims only count when the job has at least one stale trap.
+fn stale_answer_count(
+	job: &RealWorldJob,
+	evolution: &MemoryEvolution,
+	stale_trap_ids_used: &[String],
+	forbidden_claim_count: usize,
+) -> usize {
+	let has_stale_trap = !evolution.stale_trap_ids.is_empty()
+		|| job.negative_traps.iter().any(|trap| trap.trap_type == "stale_fact");
+	if !has_stale_trap {
+		return stale_trap_ids_used.len();
+	}
+	stale_trap_ids_used.len().max(forbidden_claim_count)
+}
+
+/// Filters `trap_ids_used` down to stale traps: the ids in
+/// `evolution.stale_trap_ids` when any are declared, otherwise the job's
+/// negative traps whose type is `stale_fact`. Input order is preserved.
+fn stale_trap_ids_used(
+	job: &RealWorldJob,
+	evolution: &MemoryEvolution,
+	trap_ids_used: &[String],
+) -> Vec<String> {
+	let mut stale_ids: BTreeSet<&str> = evolution.stale_trap_ids.iter().map(String::as_str).collect();
+	if stale_ids.is_empty() {
+		let fallback = job.negative_traps.iter().filter(|trap| trap.trap_type == "stale_fact");
+		stale_ids.extend(fallback.map(|trap| trap.trap_id.as_str()));
+	}
+	let mut used = Vec::new();
+	for trap_id in trap_ids_used {
+		if stale_ids.contains(trap_id.as_str()) {
+			used.push(trap_id.clone());
+		}
+	}
+	used
+}
+
+/// True when some claim carrying the conflict's claim id cites the current, the
+/// historical, and (if declared) the resolving evidence id.
+fn conflict_is_detected(conflict: &EvolutionConflict, answer: &ProducedAnswer) -> bool {
+	let needed = [
+		Some(conflict.current_evidence_id.as_str()),
+		Some(conflict.historical_evidence_id.as_str()),
+		conflict.resolved_by_evidence_id.as_deref(),
+	];
+	let cites_all = |cited: &[String]| {
+		needed.iter().flatten().all(|want| cited.iter().any(|have| have.as_str() == *want))
+	};
+	let claim_id = Some(conflict.claim_id.as_str());
+
+	answer.claims.iter().any(|claim| claim.claim_id.as_deref() == claim_id && cites_all(claim.evidence_ids.as_slice()))
+}
+
+/// True when the fixture marks the rationale available and some claim with the
+/// rationale's claim id cites at least one of the rationale's evidence ids.
+fn update_rationale_is_available(rationale: &UpdateRationale, answer: &ProducedAnswer) -> bool {
+	let wanted_claim = Some(rationale.claim_id.as_str());
+	let cites_rationale = |cited: &[String]| {
+		!cited.is_empty() && cited.iter().any(|id| rationale.evidence_ids.contains(id))
+	};
+
+	rationale.available
+		&& answer.claims.iter().any(|claim| {
+			claim.claim_id.as_deref() == wanted_claim && cites_rationale(claim.evidence_ids.as_slice())
+		})
+}
+
+/// Returns 1 when the report has conflicts but neither an available update
+/// rationale nor the temporal-validity-not-encoded flag; otherwise 0.
+fn update_rationale_missing_count(report: &EvolutionJobReport) -> usize {
+	let excused = report.update_rationale_available || report.temporal_validity_not_encoded;
+	let has_conflicts = report.conflict_count > 0;
+	let rationale_missing = has_conflicts && !excused;
+
+	usize::from(rationale_missing)
+}
+
 fn unsupported_claims(job: &RealWorldJob, answer: &ProducedAnswer) -> Vec<UnsupportedClaimReport> {
 	answer.claims.iter().filter_map(|claim| unsupported_claim(job, claim)).collect()
 }
@@ -1290,11 +1720,15 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts)
 		"answer_correctness" | "workflow_helpfulness" =>
 			counts.missing_claims > 0
 				|| counts.forbidden_claims > 0
-				|| counts.operator_debug_repair_unclear > 0,
+				|| counts.operator_debug_repair_unclear > 0
+				|| counts.conflict_detection_missing > 0,
 		"evidence_grounding" => counts.missing_evidence > 0 || counts.unsupported_claims > 0,
 		"trap_avoidance" => counts.trap_uses > 0,
 		"uncertainty_handling" => counts.unsupported_claims > 0,
-		"lifecycle_behavior" => false,
+		"lifecycle_behavior" =>
+			counts.stale_answers > 0
+				|| counts.conflict_detection_missing > 0
+				|| counts.update_rationale_missing > 0,
 		"debuggability" =>
 			counts.missing_claims > 0
 				|| counts.unsupported_claims > 0
@@ -1351,6 +1785,8 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64
 				+ counts.operator_debug_raw_sql
 				+ counts.operator_debug_trace_gaps
 				+ counts.operator_debug_repair_unclear
+				+ counts.conflict_detection_missing
+				+ counts.update_rationale_missing
 		),
 		TypedStatus::WrongResult => format!(
 			"Job produced {} wrong-result signal(s) and normalized_score {normalized_score:.3}.",
@@ -1362,6 +1798,8 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64
 				+ counts.operator_debug_raw_sql
 				+ counts.operator_debug_trace_gaps
 				+ counts.operator_debug_repair_unclear
+				+ counts.conflict_detection_missing
+				+ counts.update_rationale_missing
 		),
 		_ => "Job did not reach a runnable scoring state.".to_string(),
 	}
@@ -1383,6 +1821,22 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport {
 		produced_evidence: produced_evidence_ids(answer).into_iter().collect(),
 		unsupported_claim_count: scoring.unsupported_claims.len(),
 		wrong_result_count: scoring.wrong_result_count,
+		stale_answer_count: scoring
+			.evolution
+			.as_ref()
+			.map_or(0, |report| report.stale_answer_count),
+		conflict_detection_count: scoring
+			.evolution
+			.as_ref()
+			.map_or(0, |report| report.conflict_detection_count),
+		update_rationale_available: scoring
+			.evolution
+			.as_ref()
+			.is_some_and(|report| report.update_rationale_available),
+		temporal_validity_not_encoded: scoring
+			.evolution
+			.as_ref()
+			.is_some_and(|report| report.temporal_validity_not_encoded),
 		latency_ms: answer.latency_ms,
 		cost: answer.cost.clone(),
 		trap_ids_used: scoring.trap_ids_used,
@@ -1401,6 +1855,7 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport {
 		redaction_leak_count: metrics.redaction_leak_count,
 		qdrant_rebuild_case: metrics.qdrant_rebuild_case,
 		operator_debug: job.operator_debug.clone(),
+		evolution: scoring.evolution,
 	}
 }
 
@@ -1530,6 +1985,10 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport {
 			score_mean: None,
 			unsupported_claim_count: 0,
 			wrong_result_count: 0,
+			stale_answer_count: 0,
+			conflict_detection_count: 0,
+			update_rationale_available_count: 0,
+			temporal_validity_not_encoded_count: 0,
 			reason: NOT_ENCODED_REASON.to_string(),
 		};
 	}
@@ -1538,6 +1997,12 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport {
 	let score_sum = suite_jobs.iter().map(|job| job.normalized_score).sum::<f64>();
 	let unsupported_claim_count = suite_jobs.iter().map(|job| job.unsupported_claim_count).sum();
 	let wrong_result_count = suite_jobs.iter().map(|job| job.wrong_result_count).sum();
+	let stale_answer_count = suite_jobs.iter().map(|job| job.stale_answer_count).sum();
+	let conflict_detection_count = suite_jobs.iter().map(|job| job.conflict_detection_count).sum();
+	let update_rationale_available_count =
+		suite_jobs.iter().filter(|job| job.update_rationale_available).count();
+	let temporal_validity_not_encoded_count =
+		suite_jobs.iter().filter(|job| job.temporal_validity_not_encoded).count();
 
 	SuiteReport {
 		suite_id: suite_id.to_string(),
@@ -1546,6 +2011,10 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport {
 		score_mean: Some(round3(score_sum / suite_jobs.len() as f64)),
 		unsupported_claim_count,
 		wrong_result_count,
+		stale_answer_count,
+		conflict_detection_count,
+		update_rationale_available_count,
+		temporal_validity_not_encoded_count,
 		reason: suite_reason(status, suite_jobs.len()),
 	}
 }
@@ -1563,6 +2032,8 @@ fn aggregate_status(jobs: &[&JobReport]) -> TypedStatus {
 		TypedStatus::Incomplete
 	} else if statuses.contains(&TypedStatus::Blocked) {
 		TypedStatus::Blocked
+	} else if statuses.contains(&TypedStatus::NotEncoded) {
+		TypedStatus::NotEncoded
 	} else if statuses.contains(&TypedStatus::Pass) {
 		TypedStatus::Pass
 	} else {
@@ -1580,7 +2051,12 @@ fn suite_reason(status: TypedStatus, encoded_job_count: usize) -> String {
 			"At least one encoded lifecycle-scored job failed lifecycle behavior.".to_string(),
 		TypedStatus::Incomplete => "At least one encoded job could not complete.".to_string(),
 		TypedStatus::Blocked => "At least one encoded job is blocked.".to_string(),
-		TypedStatus::NotEncoded => NOT_ENCODED_REASON.to_string(),
+		TypedStatus::NotEncoded =>
+			if encoded_job_count == 0 {
+				NOT_ENCODED_REASON.to_string()
+			} else {
+				"At least one encoded fixture declares a not_encoded limitation.".to_string()
+			},
 	}
 }
 
@@ -1595,13 +2071,20 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary {
 	let scope_correct_count = jobs.iter().map(|job| job.scope_correct_count).sum();
 	let mut summary = ReportSummary {
 		job_count: jobs.len(),
-		encoded_suite_count: suites
-			.iter()
-			.filter(|suite| suite.status != TypedStatus::NotEncoded)
-			.count(),
-		not_encoded: suites.iter().filter(|suite| suite.status == TypedStatus::NotEncoded).count(),
+		encoded_suite_count: suites.iter().filter(|suite| suite.encoded_job_count > 0).count(),
+		not_encoded: 0,
 		unsupported_claim_count: jobs.iter().map(|job| job.unsupported_claim_count).sum(),
 		wrong_result_count: jobs.iter().map(|job| job.wrong_result_count).sum(),
+		stale_answer_count: jobs.iter().map(|job| job.stale_answer_count).sum(),
+		conflict_detection_count: jobs.iter().map(|job| job.conflict_detection_count).sum(),
+		update_rationale_available_count: jobs
+			.iter()
+			.filter(|job| job.update_rationale_available)
+			.count(),
+		temporal_validity_not_encoded_count: jobs
+			.iter()
+			.filter(|job| job.temporal_validity_not_encoded)
+			.count(),
 		mean_score: mean_score(jobs),
 		mean_latency_ms: mean_latency(jobs),
 		total_cost: total_cost(jobs),
@@ -1659,6 +2142,34 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary {
 	summary
 }
 
+fn evolution_summary(jobs: &[JobReport]) -> EvolutionSummary {
+	// Per-job counters are summed; boolean flags are tallied as the number of jobs
+	// that set them.
+	let flagged = |flag: fn(&JobReport) -> bool| jobs.iter().filter(|job| flag(job)).count();
+	let stale_answer_count = jobs.iter().map(|job| job.stale_answer_count).sum();
+	let conflict_detection_count = jobs.iter().map(|job| job.conflict_detection_count).sum();
+
+	EvolutionSummary {
+		stale_answer_count,
+		conflict_detection_count,
+		update_rationale_available_count: flagged(|job| job.update_rationale_available),
+		temporal_validity_not_encoded_count: flagged(|job| job.temporal_validity_not_encoded),
+	}
+}
+
+fn follow_up_reports(jobs: &[RealWorldJob]) -> Vec<FollowUpReport> {
+	// Jobs without a declared follow-up contribute nothing; output order follows `jobs`.
+	let report_for = |job: &RealWorldJob| {
+		let declared = job.encoding.follow_up.as_ref()?;
+		let (suite_id, job_id) = (job.suite.clone(), job.job_id.clone());
+		let (title, reason) = (declared.title.clone(), declared.reason.clone());
+
+		Some(FollowUpReport { suite_id, job_id, title, reason })
+	};
+
+	jobs.iter().filter_map(report_for).collect()
+}
+
 fn ratio(numerator: usize, denominator: usize) -> f64 {
 	if denominator == 0 {
 		return 0.0;
@@ -1756,7 +2267,9 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String {
 	render_markdown_suites(&mut out, report);
 	render_markdown_jobs(&mut out, report);
 	render_markdown_operator_debugging(&mut out, report);
+	render_markdown_evolution(&mut out, report);
 	render_markdown_unsupported_claims(&mut out, report);
+	render_markdown_follow_ups(&mut out, report);
 	render_markdown_semantics(&mut out, report);
 
 	out
@@ -1786,14 +2299,33 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat
 		md_inline(report.adapter.behavior.as_str())
 	));
 	out.push_str(&format!("- Jobs: `{}`\n", report.summary.job_count));
-	out.push_str(&format!("- Encoded suites: `{}`\n", report.summary.encoded_suite_count));
-	out.push_str(&format!("- Not-encoded suites: `{}`\n", report.not_encoded_suites.len()));
-	out.push_str(&format!("- Status summary: `{}` pass, `{}` wrong_result, `{}` lifecycle_fail, `{}` incomplete, `{}` blocked, `{}` unsupported_claim\n", report.summary.pass, report.summary.wrong_result, report.summary.lifecycle_fail, report.summary.incomplete, report.summary.blocked, report.summary.unsupported_claim));
+	out.push_str(&format!(
+		"- Suites with encoded jobs: `{}`\n",
+		report.summary.encoded_suite_count
+	));
+	out.push_str(&format!(
+		"- Suites with `not_encoded` status: `{}`\n",
+		report.not_encoded_suites.len()
+	));
+	out.push_str(&format!("- Status summary: `{}` pass, `{}` wrong_result, `{}` lifecycle_fail, `{}` incomplete, `{}` blocked, `{}` not_encoded, `{}` unsupported_claim\n", report.summary.pass, report.summary.wrong_result, report.summary.lifecycle_fail, report.summary.incomplete, report.summary.blocked, report.summary.not_encoded, report.summary.unsupported_claim));
 	out.push_str(&format!(
 		"- Unsupported claim count: `{}`\n",
 		report.summary.unsupported_claim_count
 	));
 	out.push_str(&format!("- Wrong-result count: `{}`\n", report.summary.wrong_result_count));
+	out.push_str(&format!("- Stale-answer count: `{}`\n", report.summary.stale_answer_count));
+	out.push_str(&format!(
+		"- Conflict detections: `{}`\n",
+		report.summary.conflict_detection_count
+	));
+	out.push_str(&format!(
+		"- Update rationales available: `{}`\n",
+		report.summary.update_rationale_available_count
+	));
+	out.push_str(&format!(
+		"- Temporal validity not encoded: `{}`\n",
+		report.summary.temporal_validity_not_encoded_count
+	));
 	out.push_str(&format!(
 		"- Evidence coverage: `{}/{}` (`{:.3}`)\n",
 		report.summary.evidence_covered_count,
@@ -1850,17 +2382,21 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat
 fn render_markdown_suites(out: &mut String, report: &RealWorldReport) {
 	out.push_str("## Suites\n\n");
 	out.push_str(
-		"| Suite | Status | Jobs | Score | Unsupported Claims | Wrong Results | Reason |\n",
+		"| Suite | Status | Jobs | Score | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | Unsupported Claims | Wrong Results | Reason |\n",
 	);
-	out.push_str("| --- | --- | ---: | ---: | ---: | ---: | --- |\n");
+	out.push_str("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n");
 
 	for suite in &report.suites {
 		out.push_str(&format!(
-			"| {} | `{}` | {} | `{}` | {} | {} | {} |\n",
+			"| {} | `{}` | {} | `{}` | {} | {} | {} | {} | {} | {} | {} |\n",
 			md_cell(suite.suite_id.as_str()),
 			status_str(suite.status),
 			suite.encoded_job_count,
 			optional_f64(suite.score_mean, ""),
+			suite.stale_answer_count,
+			suite.conflict_detection_count,
+			suite.update_rationale_available_count,
+			suite.temporal_validity_not_encoded_count,
 			suite.unsupported_claim_count,
 			suite.wrong_result_count,
 			md_cell(suite.reason.as_str())
@@ -1872,8 +2408,10 @@ fn render_markdown_suites(out: &mut String, report: &RealWorldReport) {
 
 fn render_markdown_jobs(out: &mut String, report: &RealWorldReport) {
 	out.push_str("## Jobs\n\n");
-	out.push_str("| Suite | Job | Status | Score | Expected Evidence | Produced Evidence | Unsupported Claims | Wrong Results | Latency | Cost |\n");
-	out.push_str("| --- | --- | --- | ---: | --- | --- | ---: | ---: | ---: | --- |\n");
+	out.push_str("| Suite | Job | Status | Score | Expected Evidence | Produced Evidence | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost |\n");
+	out.push_str(
+		"| --- | --- | --- | ---: | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- |\n",
+	);
 
 	for job in &report.jobs {
 		let expected = job
@@ -1885,13 +2423,17 @@ fn render_markdown_jobs(out: &mut String, report: &RealWorldReport) {
 		let produced = job.produced_evidence.join(", ");
 
 		out.push_str(&format!(
-			"| {} | {} | `{}` | `{:.3}` | `{}` | `{}` | {} | {} | `{}` | `{}` |\n",
+			"| {} | {} | `{}` | `{:.3}` | `{}` | `{}` | {} | {} | `{}` | `{}` | {} | {} | `{}` | `{}` |\n",
 			md_cell(job.suite_id.as_str()),
 			md_cell(job.job_id.as_str()),
 			status_str(job.status),
 			job.normalized_score,
 			md_inline(expected.as_str()),
 			md_inline(produced.as_str()),
+			job.stale_answer_count,
+			job.conflict_detection_count,
+			bool_display(job.update_rationale_available),
+			bool_display(job.temporal_validity_not_encoded),
 			job.unsupported_claim_count,
 			job.wrong_result_count,
 			optional_f64(job.latency_ms, " ms"),
@@ -1990,6 +2532,47 @@ fn ux_gap_cell(gaps: &[OperatorUxGap]) -> String {
 		.join("<br>")
 }
 
+fn render_markdown_evolution(out: &mut String, report: &RealWorldReport) {
+	// Writes the report-level evolution totals as a bullet list, then one table row
+	// for each job that carries an evolution report; other jobs are left out.
+	let totals = &report.evolution;
+
+	out.push_str("## Memory Evolution\n\n");
+	out.push_str(&format!("- Stale answers: `{}`\n", totals.stale_answer_count));
+	out.push_str(&format!("- Conflict detections: `{}`\n", totals.conflict_detection_count));
+	out.push_str(&format!(
+		"- Update rationales available: `{}`\n",
+		totals.update_rationale_available_count
+	));
+	out.push_str(&format!(
+		"- Temporal validity not encoded: `{}`\n\n",
+		totals.temporal_validity_not_encoded_count
+	));
+	out.push_str("| Suite | Job | Current Evidence | Historical Evidence | Stale Traps Used | Conflict Count | Detected | Update Rationale | Temporal Validity | Follow-up |\n");
+	out.push_str("| --- | --- | --- | --- | --- | ---: | ---: | --- | --- | --- |\n");
+
+	let evolving_jobs =
+		report.jobs.iter().filter_map(|job| job.evolution.as_ref().map(|evo| (job, evo)));
+
+	for (job, evolution) in evolving_jobs {
+		out.push_str(&format!(
+			"| {} | {} | `{}` | `{}` | `{}` | {} | {} | `{}` | `{}` | {} |\n",
+			md_cell(job.suite_id.as_str()),
+			md_cell(job.job_id.as_str()),
+			md_inline(evolution.current_evidence.join(", ").as_str()),
+			md_inline(evolution.historical_evidence.join(", ").as_str()),
+			md_inline(evolution.stale_trap_ids_used.join(", ").as_str()),
+			evolution.conflict_count,
+			evolution.conflict_detection_count,
+			bool_display(evolution.update_rationale_available),
+			temporal_display(evolution),
+			md_cell(evolution.follow_up.as_deref().unwrap_or("-"))
+		));
+	}
+
+	out.push('\n');
+}
+
 fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport) {
 	out.push_str("## Unsupported Claims\n\n");
 
@@ -2016,6 +2599,31 @@ fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport
 	out.push('\n');
 }
 
+fn render_markdown_follow_ups(out: &mut String, report: &RealWorldReport) {
+	// An empty follow-up list gets an explicit sentence rather than a header-only
+	// table; otherwise each declared follow-up becomes one table row.
+	out.push_str("## Follow-Ups\n\n");
+
+	if report.follow_ups.is_empty() {
+		out.push_str("No benchmark follow-ups were declared by encoded jobs.\n\n");
+	} else {
+		out.push_str("| Suite | Job | Follow-up | Reason |\n");
+		out.push_str("| --- | --- | --- | --- |\n");
+
+		for entry in report.follow_ups.iter() {
+			out.push_str(&format!(
+				"| {} | {} | {} | {} |\n",
+				md_cell(entry.suite_id.as_str()),
+				md_cell(entry.job_id.as_str()),
+				md_cell(entry.title.as_str()),
+				md_cell(entry.reason.as_str())
+			));
+		}
+
+		out.push('\n');
+	}
+}
+
 fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) {
 	out.push_str("## Result Semantics\n\n");
 	out.push_str(
@@ -2024,7 +2632,7 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) {
 	out.push_str("It is a real-world job fixture report, not a Docker live-baseline report.\n");
 	out.push_str("Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not reinterpreted as real-world suite wins.\n\n");
 	out.push_str(
-		"The summary counters report required evidence coverage, source-ref coverage, quote coverage, stale retrievals, scope violations, redaction leaks, and Qdrant rebuild case coverage across encoded jobs.\n\n",
+		"The summary counters report required evidence coverage, source-ref coverage, quote coverage, stale retrievals, scope violations, redaction leaks, Qdrant rebuild case coverage, stale answers, conflict detections, update rationale availability, and temporal validity gaps across encoded jobs.\n\n",
 	);
 	out.push_str(
 		"- `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule.\n",
@@ -2033,8 +2641,8 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) {
 		"- `wrong_result`: a job completed but missed required answer or evidence expectations.\n",
 	);
 	out.push_str("- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links.\n");
-	out.push_str("- `not_encoded`: a suite has no checked-in real_world_job fixture, so no pass/fail claim is allowed.\n\n");
-	out.push_str("## Not-Encoded Suites\n\n");
+	out.push_str("- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed.\n\n");
+	out.push_str("## Suites With `not_encoded` Status\n\n");
 
 	if report.not_encoded_suites.is_empty() {
 		out.push_str("All declared suites have at least one encoded job.\n");
@@ -2079,6 +2687,22 @@ fn optional_f64(value: Option<f64>, suffix: &str) -> String {
 	value.map(|value| format!("{value:.3}{suffix}")).unwrap_or_else(|| "-".to_string())
 }
 
+fn bool_display(value: bool) -> &'static str {
+	if value { "true" } else { "false" }
+}
+
+/// Table label for a job's temporal-validity state; `not_encoded` wins, then `encoded`.
+fn temporal_display(evolution: &EvolutionJobReport) -> &'static str {
+	if evolution.temporal_validity_not_encoded {
+		return "not_encoded";
+	}
+	if evolution.temporal_validity_encoded {
+		return "encoded";
+	}
+
+	if evolution.temporal_validity_required { "required" } else { "-" }
+}
+
 fn cost_display(cost: Option<&CostReport>) -> String {
 	let Some(cost) = cost else {
 		return "-".to_string();
diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
index 8c53299c..db644110 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -23,6 +23,10 @@ fn real_world_memory_fixture_dir() -> PathBuf {
 	Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join("real_world_memory")
 }
 
+fn evolution_fixture_dir() -> PathBuf {
+	real_world_memory_fixture_dir().join("evolution")
+}
+
 fn operator_debug_fixture_dir() -> PathBuf {
 	fixture_root().join("operator_debugging_ux")
 }
@@ -61,6 +65,15 @@ fn find_by_field<'a>(items: &'a [Value], field: &str, expected: &str) -> Result<
 		.ok_or_else(|| eyre::eyre!("missing item with {field} = {expected}"))
 }
 
+fn set_json_pointer(value: &mut Value, pointer: &str, replacement: Value) -> Result<()> {
+	match value.pointer_mut(pointer) {
+		Some(slot) => *slot = replacement,
+		None => return Err(eyre::eyre!("missing JSON pointer {pointer}")),
+	}
+
+	Ok(())
+}
+
 #[test]
 fn smoke_fixture_produces_typed_json_report() -> Result<()> {
 	let report = run_json_report()?;
@@ -189,10 +202,24 @@ fn generated_json_report_renders_markdown() -> Result<()> {
 fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Result<()> {
 	let report = run_json_report_from(real_world_memory_fixture_dir())?;
 
-	assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(4));
-	assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4));
+	assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(9));
+	assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(8));
+	assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1));
 	assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0));
 	assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(0));
+	assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0));
+	assert_eq!(
+		report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64),
+		Some(4)
+	);
+	assert_eq!(
+		report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64),
+		Some(4)
+	);
+	assert_eq!(
+		report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64),
+		Some(1)
+	);
 	assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0));
 	assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(1));
 	assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(1));
@@ -205,22 +232,27 @@ fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Resu
 		report.pointer("/summary/qdrant_rebuild_pass_count").and_then(Value::as_u64),
 		Some(1)
 	);
-	assert_eq!(report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), Some(8));
-	assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(8));
-	assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0));
-	assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0));
-	assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0));
+	assert_eq!(
+		report.pointer("/summary/evidence_required_count").and_then(Value::as_u64),
+		Some(19)
+	);
+	assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(17));
+	assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.895));
+	assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.895));
+	assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.895));
 
 	let suites = array_at(&report, "/suites")?;
 
-	for suite_id in
-		["trust_source_of_truth", "memory_evolution", "capture_integration", "personalization"]
-	{
+	for suite_id in ["trust_source_of_truth", "capture_integration", "personalization"] {
 		let suite = find_by_field(suites, "/suite_id", suite_id)?;
 
 		assert_eq!(suite.pointer("/status").and_then(Value::as_str), Some("pass"));
 	}
 
+	let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?;
+
+	assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("not_encoded"));
+
 	let jobs = array_at(&report, "/jobs")?;
 	let rebuild = find_by_field(jobs, "/job_id", "trust-sot-rebuild-001")?;
 	let redaction = find_by_field(jobs, "/job_id", "capture-redaction-exclusion-001")?;
@@ -234,6 +266,115 @@ fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Resu
 	Ok(())
 }
 
+#[test]
+fn memory_evolution_fixtures_report_temporal_and_staleness_metrics() -> Result<()> {
+	let report = run_json_report_from(evolution_fixture_dir())?;
+
+	assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5));
+	assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1));
+	assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4));
+	assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1));
+	assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0));
+	assert_eq!(
+		report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64),
+		Some(4)
+	);
+	assert_eq!(
+		report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64),
+		Some(4)
+	);
+	assert_eq!(
+		report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64),
+		Some(1)
+	);
+	assert_eq!(
+		report.pointer("/evolution/temporal_validity_not_encoded_count").and_then(Value::as_u64),
+		Some(1)
+	);
+
+	let suites = array_at(&report, "/suites")?;
+	let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?;
+
+	assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("not_encoded"));
+	assert_eq!(
+		memory_evolution.pointer("/temporal_validity_not_encoded_count").and_then(Value::as_u64),
+		Some(1)
+	);
+
+	let jobs = array_at(&report, "/jobs")?;
+	let relation_job = find_by_field(jobs, "/job_id", "memory-evolution-relation-temporal-001")?;
+
+	assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("not_encoded"));
+	assert_eq!(
+		relation_job.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool),
+		Some(true)
+	);
+
+	let follow_ups = array_at(&report, "/follow_ups")?;
+
+	assert_eq!(follow_ups.len(), 1);
+	assert_eq!(
+		follow_ups
+			.first()
+			.and_then(|follow_up| follow_up.pointer("/title"))
+			.and_then(Value::as_str),
+		Some("[ELF graph P1] Add temporal validity to graph-lite facts")
+	);
+
+	Ok(())
+}
+
+#[test]
+fn memory_evolution_counts_stale_answer_when_old_fact_is_answered_as_current() -> Result<()> {
+	let fixture_path =
+		evolution_fixture_dir().join("preference_changed_current_vs_historical.json");
+	let mut fixture = serde_json::from_str::<Value>(&fs::read_to_string(fixture_path)?)?;
+
+	set_json_pointer(
+		&mut fixture,
+		"/corpus/adapter_response/answer/content",
+		Value::String(
+			"Use terse bullet-only benchmark updates as the current preference.".to_string(),
+		),
+	)?;
+	set_json_pointer(
+		&mut fixture,
+		"/corpus/adapter_response/answer/evidence_ids",
+		serde_json::json!(["pref-old-terse-bullets"]),
+	)?;
+	set_json_pointer(
+		&mut fixture,
+		"/corpus/adapter_response/answer/claims",
+		serde_json::json!([
+			{
+				"claim_id": "current_preference",
+				"text": "Use terse bullet-only benchmark updates as the current preference.",
+				"evidence_ids": ["pref-old-terse-bullets"],
+				"confidence": "high"
+			}
+		]),
+	)?;
+
+	let temp_dir =
+		env::temp_dir().join(format!("elf-real-world-memory-stale-test-{}", process::id()));
+
+	fs::create_dir_all(&temp_dir)?;
+	fs::write(temp_dir.join("stale_preference.json"), serde_json::to_vec_pretty(&fixture)?)?;
+
+	let report = run_json_report_from(temp_dir)?;
+
+	assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(1));
+	assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1));
+
+	let jobs = array_at(&report, "/jobs")?;
+	let job = find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?;
+
+	assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result"));
+	assert_eq!(job.pointer("/evolution/stale_answer_count").and_then(Value::as_u64), Some(1));
+
+	Ok(())
+}
+
 #[test]
 fn operator_debug_json_report_renders_markdown_links() -> Result<()> {
 	let report = run_json_report_from(operator_debug_fixture_dir())?;
@@ -271,3 +412,39 @@ fn operator_debug_json_report_renders_markdown_links() -> Result<()> {
 
 	Ok(())
 }
+
+#[test]
+fn memory_evolution_report_renders_markdown_counters() -> Result<()> {
+	let report = run_json_report_from(evolution_fixture_dir())?;
+	let temp_dir =
+		env::temp_dir().join(format!("elf-real-world-memory-evolution-test-{}", process::id()));
+
+	fs::create_dir_all(&temp_dir)?;
+
+	let report_path = temp_dir.join("evolution-report.json");
+	let markdown_path = temp_dir.join("evolution-report.md");
+
+	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
+
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("publish")
+		.arg("--report")
+		.arg(&report_path)
+		.arg("--out")
+		.arg(&markdown_path)
+		.output()?;
+
+	assert!(
+		output.status.success(),
+		"real_world_job publisher failed: {}",
+		String::from_utf8_lossy(&output.stderr),
+	);
+
+	let markdown = fs::read_to_string(markdown_path)?;
+
+	assert!(markdown.contains("## Memory Evolution"));
+	assert!(markdown.contains("Temporal validity not encoded: `1`"));
+	assert!(markdown.contains("[ELF graph P1] Add temporal validity to graph-lite facts"));
+
+	Ok(())
+}
diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md
index dbd0a907..2829e253 100644
--- a/docs/guide/benchmarking/index.md
+++ b/docs/guide/benchmarking/index.md
@@ -39,6 +39,9 @@ cleanup, use `docs/guide/single_user_production.md`.
   step counts, dropped-candidate visibility, and repair-action clarity.
 - `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world
   agent memory benchmark contract, including suite taxonomy and typed report states.
+- `real_world_memory_evolution.md`: run and interpret the checked-in memory evolution
+  jobs for current facts, historical facts, stale traps, conflicts, update rationales,
+  and temporal graph limitations.
 
 ## Update Rules
 
diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md
index 6af7fe8f..e5a05968 100644
--- a/docs/guide/benchmarking/live_baseline_benchmark.md
+++ b/docs/guide/benchmarking/live_baseline_benchmark.md
@@ -321,6 +321,17 @@ The trust/personalization fixture set lives under
 coverage, source-ref coverage, quote coverage, stale retrievals, scope correctness,
 redaction leaks, and Qdrant rebuild coverage.
 
+The memory evolution suite is a separate checked-in real-world job fixture set:
+
+```sh
+cargo make real-world-memory-evolution
+```
+
+It lives under `apps/elf-eval/fixtures/real_world_memory/evolution/` and reports
+stale-answer count, conflict detection count, update rationale availability, temporal
+validity gaps, and unsupported claims. Its relation-temporal fixture is deliberately
+`not_encoded` until graph-lite temporal validity is implemented.
+
 ## Clean Up
 
 ```sh
diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md
index b354af1d..6f9539b4 100644
--- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md
+++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md
@@ -139,16 +139,36 @@ The suite currently encodes:
 
 - `trust_source_of_truth`: evidence binding, source refs, and Qdrant rebuild from
   Postgres-held chunk embeddings before answering.
-- `memory_evolution`: TTL/delete suppression for a stale deleted fact.
+- `memory_evolution`: TTL/delete suppression plus current-versus-historical preference,
+  issue status, deployment method, benchmark conclusion, and temporal relation cases.
 - `capture_integration`: write-policy audit behavior for redaction/private exclusion.
 - `personalization`: scoped stable preference correction without temporary or
   cross-project preference leakage.
 
 The generated report includes evidence coverage, source-ref coverage, quote coverage,
-unsupported-claim count, stale retrieval count, scope correctness, redaction leak
-count, and Qdrant rebuild case/pass counts. The fixtures include negative traps for
-unsupported prior claims, stale deleted facts, cross-project preference leakage, and
-private/redacted text leakage.
+unsupported-claim count, stale retrieval count, stale-answer count, conflict detection
+count, update rationale availability, temporal validity `not_encoded` count, scope
+correctness, redaction leak count, and Qdrant rebuild case/pass counts. The fixtures
+include negative traps for unsupported prior claims, stale deleted facts, stale
+historical facts, cross-project preference leakage, and private/redacted text leakage.
+
+Narrow memory evolution increment:
+
+```sh
+cargo make real-world-memory-evolution
+```
+
+Artifacts:
+
+```text
+tmp/real-world-memory/evolution-report.json
+tmp/real-world-memory/evolution-report.md
+```
+
+This parses `apps/elf-eval/fixtures/real_world_memory/evolution/` and reports only
+the cases added for current-versus-historical interpretation and temporal staleness.
+The relation temporal-validity fixture is deliberately `not_encoded` and declares the
+graph follow-up instead of claiming a fake graph pass.
 
 Operator debugging UX increment:
 
diff --git a/docs/guide/benchmarking/real_world_memory_evolution.md b/docs/guide/benchmarking/real_world_memory_evolution.md
new file mode 100644
index 00000000..69d31d58
--- /dev/null
+++ b/docs/guide/benchmarking/real_world_memory_evolution.md
@@ -0,0 +1,64 @@
+# Real-World Memory Evolution Benchmark
+
+Goal: Run and interpret the checked-in memory evolution real-world job fixtures.
+Read this when: You need to test current facts, historical facts, stale facts,
+conflicts, corrected memories, and temporal validity limitations.
+Inputs: `apps/elf-eval/fixtures/real_world_memory/evolution/`,
+`apps/elf-eval/src/bin/real_world_job_benchmark.rs`, and `Makefile.toml`.
+Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`,
+`docs/guide/benchmarking/real_world_agent_memory_benchmark.md`, and
+`docs/guide/research/comparison_external_projects.md`.
+Outputs: `tmp/real-world-memory/evolution-report.json` and
+`tmp/real-world-memory/evolution-report.md`.
+
+## Scope
+
+This suite is part of the real-world job benchmark family. It is not a Docker
+live-baseline retrieval matrix and does not claim private production readiness.
+
+The checked-in fixture set covers:
+
+- User preference supersession, using mem0-style memory history and Letta-style
+  current operating memory as reference patterns.
+- Issue state evolution from blocked to done.
+- Production deployment guidance superseding a local smoke quickstart.
+- Benchmark adoption verdict reversal with a bounded private-corpus caveat.
+- Relation fact current-versus-historical ownership, declared as `not_encoded`
+  because temporal graph validity is not yet implemented in the runner.
+
+The relation case borrows from Graphiti/Zep temporal validity and nanograph typed
+query ergonomics. It intentionally does not fake a pass for graph temporal behavior.
+The report declares the follow-up `[ELF graph P1] Add temporal validity to graph-lite
+facts`.
+
+## Run
+
+```sh
+cargo make real-world-memory-evolution
+```
+
+Generated artifacts:
+
+```text
+tmp/real-world-memory/evolution-report.json
+tmp/real-world-memory/evolution-report.md
+```
+
+## Metrics
+
+The runner reports memory evolution counters at summary, suite, and job levels:
+
+- `stale_answer_count`: stale negative traps or stale-current forbidden claims used
+  by produced answers.
+- `conflict_detection_count`: current-versus-historical conflicts detected with
+  both current and historical evidence.
+- `update_rationale_available_count`: jobs where the produced answer cites the
+  update rationale.
+- `temporal_validity_not_encoded_count`: jobs that require temporal graph validity
+  but are deliberately declared `not_encoded`.
+- `unsupported_claim_count`: existing real-world job unsupported claim counter.
+
+Runnable jobs should have `stale_answer_count = 0`, nonzero conflict detection, and
+an update rationale when the fixture provides one. A temporal validity gap should
+remain `not_encoded` until graph-lite facts can model current-only and historical
+relation validity.
diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md
index 5b65c0d0..8b7552a7 100644
--- a/docs/spec/real_world_agent_memory_benchmark_v1.md
+++ b/docs/spec/real_world_agent_memory_benchmark_v1.md
@@ -67,6 +67,8 @@ runner execution.
   "scoring_rubric": {},
   "allowed_uncertainty": {},
   "operator_debug": {},
+  "encoding": {},
+  "memory_evolution": {},
   "tags": []
 }
 ```
@@ -88,6 +90,8 @@ runner execution.
 | `scoring_rubric` | object | Dimensions, weights, thresholds, and hard-fail rules for this job. |
 | `allowed_uncertainty` | object | Explicit uncertainty language and fallback behavior accepted for the job. |
 | `operator_debug` | object or null | Optional for most suites; required for `operator_debugging_ux` jobs. Records trace/viewer evidence and operator workflow scoring inputs. |
+| `encoding` | object | Optional job-level limitation declaration. Only `not_encoded`, `blocked`, and `incomplete` statuses are allowed here. |
+| `memory_evolution` | object or null | Optional for most suites; used by `memory_evolution` jobs to report current evidence, historical evidence, stale traps, conflicts, update rationale, and temporal-validity limitations. |
 | `tags` | array | Optional labels such as `private_corpus`, `synthetic`, `adapter_required`, or `no_live_claim`. |
 
 ### `corpus`
@@ -194,6 +198,41 @@ Trap types:
 
 Each trap MUST include `trap_id`, `type`, `evidence_ids`, and `failure_if_used`.
 
+### `encoding`
+
+`encoding` declares a fixture that is intentionally not scored as a runnable pass
+because the benchmark capability is not encoded or cannot run yet.
+
+Allowed `status` values:
+
+- `not_encoded`: the fixture documents a capability gap and must not claim pass.
+- `blocked`: required adapter, corpus, or system support is missing.
+- `incomplete`: fixture execution cannot reach a complete scored state.
+
+When `status` is present, `reason` MUST be a non-empty explanation. `follow_up` is
+optional, but when present it MUST include non-empty `title` and `reason` fields.
+
+### `memory_evolution`
+
+`memory_evolution` is used by jobs that test whether an answer distinguishes current
+facts, historical facts, stale facts, conflicts, corrected memories, and missing
+temporal validity support.
+
+Fields:
+
+- `current_evidence_ids`: evidence ids that support the current answer.
+- `historical_evidence_ids`: evidence ids for facts that were once true but are not
+  current answers unless the prompt asks for history.
+- `stale_trap_ids`: negative trap ids that represent stale answers.
+- `conflicts`: array of conflicts with `conflict_id`, `claim_id`,
+  `current_evidence_id`, `historical_evidence_id`, and optional
+  `resolved_by_evidence_id`.
+- `update_rationale`: optional object with `claim_id`, `evidence_ids`, and
+  `available` to show whether the answer can explain why the memory changed.
+- `temporal_validity`: optional object with `required`, `encoded`, and optional
+  `follow_up`. When `required = true` and `encoded = false`, the job MUST declare
+  `encoding.status = "not_encoded"` or `encoding.status = "blocked"`.
+
 ### `operator_debug`
 
 `operator_debug` is required when `suite = "operator_debugging_ux"` and optional
@@ -326,7 +365,8 @@ Suite status rules:
   no higher-risk `unsupported_claim` is present.
 - A suite is `unsupported_claim` when any hard-fail unsupported claim occurs.
 - A suite is `incomplete` or `blocked` when required jobs cannot run for those reasons.
-- A suite is `not_encoded` when no job in that suite is implemented.
+- A suite is `not_encoded` when no job in that suite is implemented, or when an
+  encoded fixture declares a job-level capability gap that prevents a suite pass claim.
 
 Reports MUST include:
 
@@ -337,6 +377,11 @@ Reports MUST include:
 - explicit `not_encoded` suite list;
 - private-corpus redaction policy when private fixtures are used.
 
+Reports that cover `memory_evolution` jobs SHOULD also include stale-answer counts,
+conflict detection counts, update rationale availability, and temporal-validity
+`not_encoded` counts. A temporal graph validity job MUST NOT be reported as `pass`
+until the runner can evaluate current-only versus historical relation facts.
+
 ## Claim Rules
 
 - A project MAY claim a suite pass only for suites with encoded jobs and a published