From 9c109860c30f7bbc9e2b94bfbf8d9ab7248ddad8 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Tue, 16 Jun 2026 01:28:12 +0800
Subject: [PATCH 1/2] {"schema":"decodex/commit/1","summary":"Add
 Dreaming-readiness stage benchmark ledger","authority":"XY-951"}

---
 .../tests/real_world_job_benchmark.rs         | 164 +++++++
 ...6-06-16-dreaming-readiness-stage-ledger.md | 114 +++++
 docs/guide/benchmarking/index.md              |   5 +
 ...06-16-dreaming-readiness-stage-ledger.json | 454 ++++++++++++++++++
 4 files changed, 737 insertions(+)
 create mode 100644 docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md
 create mode 100644 docs/research/2026-06-16-dreaming-readiness-stage-ledger.json
diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
index a71a7c81..ad52e8c5 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -182,6 +182,21 @@ fn temporal_history_competitor_gap_json_path() -> Result<PathBuf> {
 		.join("2026-06-11-temporal-history-competitor-gap-report.json"))
 }
 
+fn dreaming_readiness_stage_ledger_json_path() -> Result<PathBuf> {
+	Ok(workspace_root()?
+		.join("docs")
+		.join("research")
+		.join("2026-06-16-dreaming-readiness-stage-ledger.json"))
+}
+
+fn dreaming_readiness_stage_ledger_markdown_path() -> Result<PathBuf> {
+	Ok(workspace_root()?
+		.join("docs")
+		.join("guide")
+		.join("benchmarking")
+		.join("2026-06-16-dreaming-readiness-stage-ledger.md"))
+}
+
 fn competitor_strength_matrix_path() -> Result<PathBuf> {
 	Ok(workspace_root()?
 		.join("docs")
@@ -3665,6 +3680,155 @@ fn mem0_delete_audit_probe_requires_explicit_delete_history_event() -> Result<()
 	Ok(())
 }
 
+#[test]
+fn dreaming_readiness_stage_ledger_preserves_gate_shape() -> Result<()> {
+	let ledger = serde_json::from_str::<Value>(&fs::read_to_string(
+		dreaming_readiness_stage_ledger_json_path()?,
+	)?)?;
+	let markdown = fs::read_to_string(dreaming_readiness_stage_ledger_markdown_path()?)?;
+	let stages = array_at(&ledger, "/stage_gates")?;
+
+	assert_dreaming_readiness_ledger_header(&ledger)?;
+	assert_dreaming_readiness_stage_shape(&ledger, stages)?;
+	assert_dreaming_readiness_baseline_counts(&ledger, stages)?;
+	assert_dreaming_readiness_markdown_boundaries(&markdown);
+
+	Ok(())
+}
+
+fn assert_dreaming_readiness_ledger_header(ledger: &Value) -> Result<()> {
+	assert_eq!(
+		ledger.pointer("/schema").and_then(Value::as_str),
+		Some("elf.dreaming_readiness_stage_ledger/v1")
+	);
+	assert_eq!(ledger.pointer("/authority").and_then(Value::as_str), Some("XY-951"));
+
+	for term in ["improved", "regressed", "unchanged", "blocked", "not_tested"] {
+		assert!(array_contains_str(ledger, "/judgment_terms", term)?);
+	}
+	for term in ["pass", "wrong_result", "blocked", "not_tested", "not_encoded"] {
+		assert!(array_contains_str(ledger, "/count_fields", term)?);
+	}
+
+	Ok(())
+}
+
+fn assert_dreaming_readiness_stage_shape(ledger: &Value, stages: &[Value]) -> Result<()> {
+	assert_eq!(stages.len(), 8);
+
+	for stage_id in [
+		"current_vs_historical_correctness",
+		"preference_evolution",
+		"deletion_ttl_tombstone_behavior",
+		"reviewable_consolidation",
+		"memory_summary_top_of_mind_behavior",
+		"proactive_brief_readiness",
+		"scheduled_memory_task_readiness",
+		"final_competitor_retest_status",
+	] {
+		find_by_field(stages, "/stage_id", stage_id)?;
+	}
+	for stage in stages {
+		let stage_id =
+			stage.pointer("/stage_id").and_then(Value::as_str).unwrap_or("<missing stage_id>");
+
+		assert!(
+			!array_at(stage, "/baseline_commands")?.is_empty(),
+			"{stage_id} missing baseline commands"
+		);
+		assert!(
+			!array_at(stage, "/post_stage_commands")?.is_empty(),
+			"{stage_id} missing post-stage commands"
+		);
+		assert!(
+			!array_at(stage, "/evidence_files")?.is_empty(),
+			"{stage_id} missing evidence files"
+		);
+
+		for count_field in ["pass", "wrong_result", "blocked", "not_tested", "not_encoded"] {
+			let pointer = format!("/baseline_counts/{count_field}");
+
+			assert!(
+				stage.pointer(&pointer).and_then(Value::as_u64).is_some(),
+				"{stage_id} missing {pointer}"
+			);
+		}
+
+		let judgment = stage
+			.pointer("/comparison_judgment")
+			.and_then(Value::as_str)
+			.ok_or_else(|| eyre::eyre!("{stage_id} missing comparison_judgment"))?;
+
+		assert!(array_contains_str(ledger, "/judgment_terms", judgment)?);
+	}
+
+	Ok(())
+}
+
+fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) -> Result<()> {
+	let current = find_by_field(stages, "/stage_id", "current_vs_historical_correctness")?;
+
+	assert_eq!(current.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1));
+	assert_eq!(current.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5));
+	assert_eq!(current.pointer("/comparison_judgment").and_then(Value::as_str), Some("unchanged"));
+	assert!(
+		current
+			.pointer("/baseline_basis")
+			.and_then(Value::as_str)
+			.is_some_and(|basis| basis.contains("five current-vs-historical jobs"))
+	);
+
+	let preference = find_by_field(stages, "/stage_id", "preference_evolution")?;
+
+	assert_eq!(
+		preference.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64),
+		Some(1)
+	);
+
+	let tombstone = find_by_field(stages, "/stage_id", "deletion_ttl_tombstone_behavior")?;
+
+	assert_eq!(tombstone.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1));
+
+	let consolidation = find_by_field(stages, "/stage_id", "reviewable_consolidation")?;
+
+	assert_eq!(
+		consolidation.pointer("/comparison_judgment").and_then(Value::as_str),
+		Some("not_tested")
+	);
+	assert_eq!(
+		consolidation.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64),
+		Some(1)
+	);
+
+	let scheduled = find_by_field(stages, "/stage_id", "scheduled_memory_task_readiness")?;
+
+	assert_eq!(scheduled.pointer("/comparison_judgment").and_then(Value::as_str), Some("blocked"));
+	assert_eq!(scheduled.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(1));
+
+	let retest = find_by_field(stages, "/stage_id", "final_competitor_retest_status")?;
+
+	assert_eq!(retest.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(22));
+	assert_eq!(retest.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5));
+	assert_eq!(retest.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(2));
+	assert_eq!(retest.pointer("/baseline_counts/not_tested").and_then(Value::as_u64), Some(11));
+	assert_eq!(retest.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), Some(11));
+	assert!(array_at(ledger, "/summary/improved")?.is_empty());
+	assert!(array_at(ledger, "/summary/regressed")?.is_empty());
+	assert!(array_contains_str(ledger, "/summary/unchanged", "current_vs_historical_correctness")?);
+	assert!(array_contains_str(ledger, "/summary/blocked", "scheduled_memory_task_readiness")?);
+	assert!(array_contains_str(ledger, "/summary/not_tested", "proactive_brief_readiness")?);
+
+	Ok(())
+}
+
+fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) {
+	assert!(markdown.contains("`improved`: none"));
+	assert!(markdown.contains("`regressed`: none"));
+	assert!(markdown.contains("live `memory_evolution` is not solved until"));
+	assert!(markdown.contains("XY-905"));
+	assert!(markdown.contains("Do not claim this ledger fixes temporal reconciliation"));
+}
+
 #[test]
 fn knowledge_json_report_renders_markdown_metrics() -> Result<()> {
 	let report = run_json_report_from(knowledge_fixture_dir())?;
diff --git a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md
new file mode 100644
index 00000000..8d299867
--- /dev/null
+++ b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md
@@ -0,0 +1,114 @@
+# Dreaming-Readiness Stage Ledger - June 16, 2026
+
+Goal: Define the Decodex benchmark gate for Dreaming-inspired ELF memory-system
+optimization stages.
+Read this when: You are starting or finishing a staged memory improvement lane and
+need the baseline command matrix, typed evidence status, and report shape required
+before claiming the stage improved.
+Inputs: `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`, the June 11
+competitor-strength, temporal-history, and iteration-direction reports, the
+consolidation proposal spec, and the checked-in real-world fixture suites.
+Outputs: A stage-by-stage ledger that downstream issues can update with
+`improved`, `regressed`, `unchanged`, `blocked`, or `not_tested` judgments.
+
+## Executive Judgment
+
+This ledger does not claim a new product win. It creates the gate that later product
+lanes must pass before they can claim a Dreaming or competitor-inspired stage is done.
+
+Current baseline:
+
+- `improved`: none.
+- `regressed`: none.
+- `unchanged`: current-vs-historical correctness, preference evolution,
+  deletion/TTL/tombstone behavior, and the final competitor retest baseline.
+- `blocked`: scheduled-memory-task readiness.
+- `not_tested`: reviewable consolidation beyond fixtures, memory-summary/top-of-mind
+  live behavior, and proactive brief readiness.
+
+The important known loss is preserved: live `memory_evolution` is not solved until
+XY-905 changes behavior and reruns the live gate. The current ELF live adapter passes
+only the delete/TTL tombstone job and keeps five current-vs-historical jobs as
+`wrong_result`.
+
+## Ledger Rules
+
+- Every downstream Dreaming or competitor-improvement stage must write a post-stage
+  JSON report and Markdown summary before claiming phase completion.
+- The report must compare against the baseline counts in
+  `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`.
+- The comparison judgment must be one of `improved`, `regressed`, `unchanged`,
+  `blocked`, or `not_tested`.
+- Typed non-pass labels stay typed. Do not collapse `wrong_result`, `blocked`,
+  `not_tested`, `not_encoded`, `incomplete`, `lifecycle_fail`, `unsupported`, or
+  `non_goal` into a single pass/fail label.
+- Fixture-backed evidence proves benchmark shape only. It does not prove live product
+  behavior.
+- Private-corpus and provider-backed gates remain typed blocked unless an operator
+  supplies explicit inputs; those boundaries are tied to XY-930.
+
+## Stage Command Matrix
+
+| Stage | Baseline command(s) | Required post-stage command(s) | Current counts | Judgment | Next optimization direction |
+| --- | --- | --- | --- | --- | --- |
+| Current-vs-historical correctness | `cargo make real-world-memory-evolution`; `cargo make real-world-memory-live-adapters` | Same commands; publish post-stage JSON and Markdown evidence | `pass=1`, `wrong_result=5`, `blocked=0`, `not_tested=0` | `unchanged` | XY-905 must make live answers cite current, historical, rationale, and tombstone evidence instead of only retrieving snippets. |
+| Preference evolution and correction history | `cargo make real-world-memory-evolution`; `cargo make real-world-memory-live-adapters`; `cargo make openmemory-ui-export-readback` | Same commands; include mem0/OpenMemory boundary evidence | `pass=0`, `wrong_result=1`, `blocked=0`, `not_tested=0` | `unchanged` | Preserve current and superseded preferences with rationale evidence; do not claim ELF beats mem0/OpenMemory history until measured. |
+| Deletion, TTL, and tombstone behavior | `cargo make real-world-memory`; `cargo make real-world-memory-live-adapters` | Same commands | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0` | `unchanged` | Preserve the current tombstone pass while repairing adjacent temporal-history wrong results. |
+| Reviewable consolidation | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=1` | `not_tested` | Keep Dreaming output derived and reviewable with lineage, confidence, unsupported-claim flags, apply/defer/discard audit, and no source mutation. |
+| Memory summary and top-of-mind behavior | `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival` | Same commands plus `cargo make real-world-memory-live-adapters` | `pass=8`, `wrong_result=0`, `blocked=0`, `not_tested=1` | `not_tested` | Build summaries as cited, rebuildable derived pages or core blocks; do not turn hidden summaries into authoritative memory. |
+| Proactive brief readiness | `cargo make real-world-first-generation-oss`; `cargo make real-world-job-operator-ux` | Same commands plus `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1` | `not_tested` | Add direct proactive-brief fixtures before any pass claim; briefs must be source-linked and repairable. |
+| Scheduled memory task readiness | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=1`, `not_tested=0` | `blocked` | Scheduled runs are future work; start with queued derived proposal runs and keep operator review mandatory. |
+| Final competitor retest status | `cargo make real-world-memory-live-adapters`; `cargo make real-world-first-generation-oss`; `cargo make real-world-memory-graph-rag`; `cargo make openmemory-ui-export-readback`; `cargo make baseline-production-private-addendum` when operator input exists | Same commands; private/provider commands may remain typed blocked under XY-930 | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11` | `unchanged` | Rerun the relevant competitor matrix after each optimization and update improved/regressed/unchanged/blocked/not-tested buckets. |
+
+## Evidence Anchors
+
+| Stage | Evidence file(s) |
+| --- | --- |
+| Current-vs-historical correctness | `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` |
+| Preference evolution and correction history | `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json` |
+| Deletion, TTL, and tombstone behavior | `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md` |
+| Reviewable consolidation | `docs/spec/system_consolidation_proposals_v1.md`; `apps/elf-eval/fixtures/real_world_memory/consolidation/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` |
+| Memory summary and top-of-mind behavior | `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` |
+| Proactive brief readiness | `docs/research/2026-06-08-agent-memory-selection.json`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` |
+| Scheduled memory task readiness | `docs/spec/system_consolidation_proposals_v1.md`; `docs/research/2026-06-08-agent-memory-selection.json` |
+| Final competitor retest status | `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/research/2026-06-11-competitor-strength-adoption-report.json`; `docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` |
+
+## Report Shape For Downstream Issues
+
+Downstream stage reports should use the same fields as the JSON ledger:
+
+- `stage_id`
+- `baseline_commands`
+- `post_stage_commands`
+- `evidence_files`
+- `baseline_counts`
+- `post_stage_counts`
+- `comparison_judgment`
+- `regression_rule`
+- `improvement_rule`
+- `next_optimization_direction`
+
+If a stage cannot run because credentials, private corpus, provider setup, or a
+product surface is absent, record `blocked` or `not_tested` with the concrete blocker.
+Do not silently drop the stage from the report.
+
+## Claim Boundaries
+
+Allowed:
+
+- The Dreaming-readiness gate exists and names required stage commands and evidence
+  files.
+- The current baseline preserves typed non-pass states and the known live
+  memory-evolution loss.
+- Fixture-backed consolidation, knowledge, and core/archival jobs can be used as
+  regression guards for report shape.
+
+Not allowed:
+
+- Do not claim this ledger fixes temporal reconciliation, preference history,
+  consolidation, proactive briefs, scheduled tasks, or competitor adapters.
+- Do not claim ELF has full-suite live real-world pass evidence.
+- Do not claim private-corpus or provider-backed production quality without the
+  operator-owned inputs required by XY-930.
+- Do not claim fixture-only or smoke-only evidence proves broad competitor
+  superiority.
diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md
index b2292476..991dd2f9 100644
--- a/docs/guide/benchmarking/index.md
+++ b/docs/guide/benchmarking/index.md
@@ -110,6 +110,11 @@ cleanup, use `docs/guide/single_user_production.md`.
   personalization, and export-readback comparison with normalized
   win/tie/loss/not-tested/blocked/non-goal outcomes and explicit hosted/UI/graph
   non-claims.
+- `2026-06-16-dreaming-readiness-stage-ledger.md`: XY-951 stage-gate ledger for
+  Dreaming-inspired memory improvements, with the required current baseline,
+  post-stage command matrix, typed improved/regressed/unchanged/blocked/not-tested
+  buckets, and machine-readable companion file
+  `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`.
 - `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world
   agent memory benchmark contract, including suite taxonomy, typed report states,
   knowledge-compilation fixture tasks, and the production-ops fixture target.
diff --git a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json
new file mode 100644
index 00000000..9e43f1be
--- /dev/null
+++ b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json
@@ -0,0 +1,454 @@
+{
+  "schema": "elf.dreaming_readiness_stage_ledger/v1",
+  "ledger_id": "xy-951-dreaming-readiness-stage-ledger-2026-06-16",
+  "authority": "XY-951",
+  "created_at": "2026-06-16T00:00:00Z",
+  "purpose": "Define the benchmark evidence gate that every Dreaming-inspired ELF optimization stage must update before claiming completion.",
+  "source_evidence_cutoff": "Checked-in benchmark and research evidence through 2026-06-11; no new live/provider/private benchmark pass is claimed by this ledger.",
+  "typed_status_terms": [
+    "pass",
+    "wrong_result",
+    "blocked",
+    "not_tested",
+    "not_encoded",
+    "incomplete",
+    "lifecycle_fail",
+    "unsupported",
+    "non_goal"
+  ],
+  "judgment_terms": [
+    "improved",
+    "regressed",
+    "unchanged",
+    "blocked",
+    "not_tested"
+  ],
+  "count_fields": [
+    "pass",
+    "wrong_result",
+    "blocked",
+    "not_tested",
+    "not_encoded"
+  ],
+  "gate_rules": [
+    "Every downstream Dreaming or competitor-improvement stage must write a post-stage JSON report and Markdown summary before claiming phase completion.",
+    "Post-stage reports must compare against this ledger's baseline counts and set exactly one comparison_judgment: improved, regressed, unchanged, blocked, or not_tested.",
+    "Typed non-pass states must remain typed; blocked, not_tested, not_encoded, incomplete, lifecycle_fail, unsupported, and wrong_result must not be collapsed into a generic fail or hidden under pass.",
+    "Fixture-backed evidence may prove benchmark shape but must not be promoted into live_real_world product quality.",
+    "Private-corpus and provider-backed production gates remain typed blocked unless the operator supplies explicit inputs; those blockers are tracked under XY-930.",
+    "The live memory_evolution loss remains open until XY-905 changes behavior and reruns the live gate."
+  ],
+  "summary": {
+    "improved": [],
+    "regressed": [],
+    "unchanged": [
+      "current_vs_historical_correctness",
+      "preference_evolution",
+      "deletion_ttl_tombstone_behavior",
+      "final_competitor_retest_status"
+    ],
+    "blocked": [
+      "scheduled_memory_task_readiness"
+    ],
+    "not_tested": [
+      "reviewable_consolidation",
+      "memory_summary_top_of_mind_behavior",
+      "proactive_brief_readiness"
+    ]
+  },
+  "stage_gates": [
+    {
+      "stage_id": "current_vs_historical_correctness",
+      "stage_name": "Current-vs-historical correctness",
+      "dependent_issue": "XY-905",
+      "evidence_class": "live_real_world",
+      "baseline_commands": [
+        {
+          "command": "cargo make real-world-memory-evolution",
+          "artifact": "tmp/real-world-memory/evolution-report.json",
+          "purpose": "Fixture gate for current facts, historical facts, conflicts, and update rationales."
+        },
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "artifact": "tmp/real-world-memory/live-adapters/",
+          "purpose": "Live ELF/qmd real-world adapter gate for the memory_evolution suite."
+        }
+      ],
+      "post_stage_commands": [
+        {
+          "command": "cargo make real-world-memory-evolution",
+          "required_artifact": "tmp/real-world-memory/evolution-report.json"
+        },
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "required_artifact": "tmp/real-world-memory/live-adapters/"
+        }
+      ],
+      "evidence_files": [
+        "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md",
+        "docs/research/2026-06-11-temporal-history-competitor-gap-report.json",
+        "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md"
+      ],
+      "baseline_counts": {
+        "pass": 1,
+        "wrong_result": 5,
+        "blocked": 0,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "baseline_basis": "ELF live service adapter memory_evolution suite: one delete/TTL job passes and five current-vs-historical jobs are wrong_result.",
+      "comparison_judgment": "unchanged",
+      "regression_rule": "Any new wrong_result, missed evidence, or loss of the delete/TTL pass is a regression.",
+      "improvement_rule": "An improvement requires fewer live ELF wrong_result jobs without increasing blocked/not_tested counts.",
+      "next_optimization_direction": "Implement current/historical/rationale/tombstone answer and trace selection before claiming temporal memory is solved."
+    },
+    {
+      "stage_id": "preference_evolution",
+      "stage_name": "Preference evolution and correction history",
+      "dependent_issue": "XY-905",
+      "evidence_class": "live_real_world",
+      "baseline_commands": [
+        {
+          "command": "cargo make real-world-memory-evolution",
+          "artifact": "tmp/real-world-memory/evolution-report.json",
+          "purpose": "Fixture gate for the preference-change job."
+        },
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "artifact": "tmp/real-world-memory/live-adapters/",
+          "purpose": "Live adapter gate for memory-evolution-preference-001."
+        },
+        {
+          "command": "cargo make openmemory-ui-export-readback",
+          "artifact": "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md",
+          "purpose": "External comparison boundary for mem0/OpenMemory preference correction and export-style history."
+        }
+      ],
+      "post_stage_commands": [
+        {
+          "command": "cargo make real-world-memory-evolution",
+          "required_artifact": "tmp/real-world-memory/evolution-report.json"
+        },
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "required_artifact": "tmp/real-world-memory/live-adapters/"
+        },
+        {
+          "command": "cargo make openmemory-ui-export-readback",
+          "required_artifact": "tmp/live-baseline/"
+        }
+      ],
+      "evidence_files": [
+        "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md",
+        "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md",
+        "docs/research/2026-06-11-temporal-history-competitor-gap-report.json"
+      ],
+      "baseline_counts": {
+        "pass": 0,
+        "wrong_result": 1,
+        "blocked": 0,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "baseline_basis": "ELF live memory-evolution-preference-001 is wrong_result; mem0 local OSS preference correction history is measured as an ELF loss.",
+      "comparison_judgment": "unchanged",
+      "regression_rule": "Any loss of fixture preference correctness or any new blocked/not_tested live preference gate is a regression.",
+      "improvement_rule": "An improvement requires live preference correction history to pass while preserving old preference history as historical evidence.",
+      "next_optimization_direction": "Add explicit preference correction history and answer fields that name the current preference, the superseded preference, and the rationale evidence."
+    },
+    {
+      "stage_id": "deletion_ttl_tombstone_behavior",
+      "stage_name": "Deletion, TTL, and tombstone behavior",
+      "dependent_issue": "XY-905",
+      "evidence_class": "live_real_world",
+      "baseline_commands": [
+        {
+          "command": "cargo make real-world-memory",
+          "artifact": "tmp/real-world-memory/real-world-memory-report.json",
+          "purpose": "Aggregate fixture gate containing memory-evolution-delete-ttl-001."
+        },
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "artifact": "tmp/real-world-memory/live-adapters/",
+          "purpose": "Live adapter gate for tombstone behavior."
+        }
+      ],
+      "post_stage_commands": [
+        {
+          "command": "cargo make real-world-memory",
+          "required_artifact": "tmp/real-world-memory/real-world-memory-report.json"
+        },
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "required_artifact": "tmp/real-world-memory/live-adapters/"
+        }
+      ],
+      "evidence_files": [
+        "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md",
+        "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md"
+      ],
+      "baseline_counts": {
+        "pass": 1,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "baseline_basis": "ELF live memory-evolution-delete-ttl-001 passes with tombstone and current-plan evidence; qmd misses the tombstone.",
+      "comparison_judgment": "unchanged",
+      "regression_rule": "Losing tombstone evidence, returning stale deleted content, or failing the aggregate fixture is a regression.",
+      "improvement_rule": "This stage is already pass for ELF; improvement requires preserving the pass while reducing adjacent memory_evolution wrong_result counts.",
+      "next_optimization_direction": "Keep tombstone and TTL invalidation evidence answerable as temporal reconciliation is repaired."
+    },
+    {
+      "stage_id": "reviewable_consolidation",
+      "stage_name": "Reviewable consolidation",
+      "dependent_issue": "XY-926",
+      "evidence_class": "fixture_backed",
+      "baseline_commands": [
+        {
+          "command": "cargo make real-world-memory-consolidation",
+          "artifact": "tmp/real-world-memory/consolidation/report.json",
+          "purpose": "Fixture gate for review actions, lineage, unsupported claims, contradiction, and source immutability."
+        }
+      ],
+      "post_stage_commands": [
+        {
+          "command": "cargo make real-world-memory-consolidation",
+          "required_artifact": "tmp/real-world-memory/consolidation/report.json"
+        },
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "required_artifact": "tmp/real-world-memory/live-adapters/"
+        }
+      ],
+      "evidence_files": [
+        "docs/spec/system_consolidation_proposals_v1.md",
+        "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md",
+        "apps/elf-eval/fixtures/real_world_memory/consolidation/"
+      ],
+      "baseline_counts": {
+        "pass": 4,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 1,
+        "not_encoded": 1
+      },
+      "baseline_basis": "Consolidation fixtures pass, but live consolidation proposal generation and review-action scoring are not encoded.",
+      "comparison_judgment": "not_tested",
+      "regression_rule": "Any source mutation, missing lineage, or collapse of review actions into an automatic rewrite is a regression.",
+      "improvement_rule": "An improvement requires live or service-backed consolidation scoring without provider hidden state and without mutating authoritative sources.",
+      "next_optimization_direction": "Keep Dreaming output derived and reviewable: proposal lineage, confidence, unsupported-claim flags, apply/defer/discard audit, and immutable source snapshots."
+    },
+    {
+      "stage_id": "memory_summary_top_of_mind_behavior",
+      "stage_name": "Memory summary and top-of-mind behavior",
+      "dependent_issue": "XY-926",
+      "evidence_class": "fixture_backed",
+      "baseline_commands": [
+        {
+          "command": "cargo make real-world-memory-knowledge",
+          "artifact": "tmp/real-world-memory/knowledge-report.json",
+          "purpose": "Fixture gate for derived knowledge pages, citations, stale-source lint, and repair guidance."
+        },
+        {
+          "command": "cargo make real-world-memory-core-archival",
+          "artifact": "tmp/real-world-memory/core-archival/report.json",
+          "purpose": "Fixture gate for always-attached core block attachment, scope, provenance, stale-core detection, and archival fallback."
+        }
+      ],
+      "post_stage_commands": [
+        {
+          "command": "cargo make real-world-memory-knowledge",
+          "required_artifact": "tmp/real-world-memory/knowledge-report.json"
+        },
+        {
+          "command": "cargo make real-world-memory-core-archival",
+          "required_artifact": "tmp/real-world-memory/core-archival/report.json"
+        },
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "required_artifact": "tmp/real-world-memory/live-adapters/"
+        }
+      ],
+      "evidence_files": [
+        "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md",
+        "apps/elf-eval/fixtures/real_world_memory/knowledge/",
+        "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/"
+      ],
+      "baseline_counts": {
+        "pass": 8,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 1,
+        "not_encoded": 1
+      },
+      "baseline_basis": "Knowledge and core/archival fixtures pass, but live knowledge compilation and top-of-mind product behavior are not encoded.",
+      "comparison_judgment": "not_tested",
+      "regression_rule": "Any stale summary, unsupported section, missing source id, or stale core block presented as current is a regression.",
+      "improvement_rule": "An improvement requires live top-of-mind or summary readback that remains source-linked and linted for stale/unsupported claims.",
+      "next_optimization_direction": "Build summaries as derived, cited, rebuildable pages or core blocks; do not replace authoritative notes with hidden summaries."
+    },
+    {
+      "stage_id": "proactive_brief_readiness",
+      "stage_name": "Proactive brief readiness",
+      "dependent_issue": "XY-926",
+      "evidence_class": "not_encoded",
+      "baseline_commands": [
+        {
+          "command": "cargo make real-world-first-generation-oss",
+          "artifact": "tmp/real-world-memory/first-generation-oss/report.json",
+          "purpose": "Regression guard for claude-mem progressive-disclosure and retrieval-repair reference behavior."
+        },
+        {
+          "command": "cargo make real-world-job-operator-ux",
+          "artifact": "tmp/real-world-job/real-world-job-operator-ux-report.json",
+          "purpose": "Regression guard for operator-facing trace and repair-action clarity."
+        }
+      ],
+      "post_stage_commands": [
+        {
+          "command": "cargo make real-world-first-generation-oss",
+          "required_artifact": "tmp/real-world-memory/first-generation-oss/report.json"
+        },
+        {
+          "command": "cargo make real-world-job-operator-ux",
+          "required_artifact": "tmp/real-world-job/real-world-job-operator-ux-report.json"
+        },
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "required_artifact": "tmp/real-world-memory/live-adapters/"
+        }
+      ],
+      "evidence_files": [
+        "docs/research/2026-06-08-agent-memory-selection.json",
+        "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md",
+        "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md"
+      ],
+      "baseline_counts": {
+        "pass": 0,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 1,
+        "not_encoded": 1
+      },
+      "baseline_basis": "No direct proactive-brief real_world_job suite exists; adjacent progressive-disclosure and operator-debug fixtures are reference guards only.",
+      "comparison_judgment": "not_tested",
+      "regression_rule": "A proactive brief that is uncited, leaks excluded content, or cannot explain source selection is a regression.",
+      "improvement_rule": "An improvement requires a direct proactive-brief fixture or live adapter report with cited source ids and typed non-pass handling.",
+      "next_optimization_direction": "Add proactive briefs only as source-linked derived output with repair guidance and no secret or excluded-span leakage."
+    },
+    {
+      "stage_id": "scheduled_memory_task_readiness",
+      "stage_name": "Scheduled memory task readiness",
+      "dependent_issue": "XY-926",
+      "evidence_class": "blocked",
+      "baseline_commands": [
+        {
+          "command": "cargo make real-world-memory-consolidation",
+          "artifact": "tmp/real-world-memory/consolidation/report.json",
+          "purpose": "Current closest fixture gate for deterministic fixture/manual consolidation runs."
+        }
+      ],
+      "post_stage_commands": [
+        {
+          "command": "cargo make real-world-memory-consolidation",
+          "required_artifact": "tmp/real-world-memory/consolidation/report.json"
+        },
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "required_artifact": "tmp/real-world-memory/live-adapters/"
+        }
+      ],
+      "evidence_files": [
+        "docs/spec/system_consolidation_proposals_v1.md",
+        "docs/research/2026-06-08-agent-memory-selection.json"
+      ],
+      "baseline_counts": {
+        "pass": 0,
+        "wrong_result": 0,
+        "blocked": 1,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "baseline_basis": "The consolidation spec permits fixture and manual job_kind only; scheduled is explicitly future work and no scheduled-memory-task benchmark is encoded.",
+      "comparison_judgment": "blocked",
+      "regression_rule": "Adding scheduled tasks without reviewable output, immutable source snapshots, and explicit operator review is a regression.",
+      "improvement_rule": "An improvement requires a scheduled-task fixture or live report that keeps task output reviewable and records provider/private boundaries as typed blockers.",
+      "next_optimization_direction": "Model scheduled tasks as queued derived proposal runs first; do not allow a scheduler to mutate authoritative memory silently."
+    },
+    {
+      "stage_id": "final_competitor_retest_status",
+      "stage_name": "Final competitor retest status",
+      "dependent_issue": "XY-951",
+      "evidence_class": "live_real_world",
+      "baseline_commands": [
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "artifact": "tmp/real-world-memory/live-adapters/",
+          "purpose": "Full encoded ELF/qmd live real-world sweep."
+        },
+        {
+          "command": "cargo make real-world-first-generation-oss",
+          "artifact": "tmp/real-world-memory/first-generation-oss/report.json",
+          "purpose": "First-generation OSS prompt fixture and typed blocker slice."
+        },
+        {
+          "command": "cargo make real-world-memory-graph-rag",
+          "artifact": "tmp/real-world-memory/graph-rag/report.json",
+          "purpose": "Representative graph/RAG typed non-pass fixture slice."
+        },
+        {
+          "command": "cargo make openmemory-ui-export-readback",
+          "artifact": "tmp/live-baseline/",
+          "purpose": "mem0/OpenMemory local OSS history and export-readback boundary."
+        },
+        {
+          "command": "cargo make baseline-production-private-addendum",
+          "artifact": "tmp/live-baseline/private-production-addendum.md",
+          "purpose": "Private-corpus addendum; remains blocked unless an operator-owned manifest is supplied."
+        }
+      ],
+      "post_stage_commands": [
+        {
+          "command": "cargo make real-world-memory-live-adapters",
+          "required_artifact": "tmp/real-world-memory/live-adapters/"
+        },
+        {
+          "command": "cargo make real-world-first-generation-oss",
+          "required_artifact": "tmp/real-world-memory/first-generation-oss/report.json"
+        },
+        {
+          "command": "cargo make real-world-memory-graph-rag",
+          "required_artifact": "tmp/real-world-memory/graph-rag/report.json"
+        },
+        {
+          "command": "cargo make openmemory-ui-export-readback",
+          "required_artifact": "tmp/live-baseline/"
+        },
+        {
+          "command": "cargo make baseline-production-private-addendum",
+          "required_artifact": "tmp/live-baseline/private-production-addendum.md"
+        }
+      ],
+      "evidence_files": [
+        "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md",
+        "docs/research/2026-06-11-competitor-strength-adoption-report.json",
+        "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md",
+        "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md"
+      ],
+      "baseline_counts": {
+        "pass": 22,
+        "wrong_result": 5,
+        "blocked": 2,
+        "not_tested": 11,
+        "not_encoded": 11
+      },
+      "baseline_basis": "ELF full live real-world sweep: 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs. The not_encoded jobs are represented as not_tested for this stage gate while preserving the raw not_encoded count.",
+      "comparison_judgment": "unchanged",
+      "regression_rule": "Any higher wrong_result/blocked/not_tested count, missing typed blocker, or unsupported broad competitor win claim is a regression.",
+      "improvement_rule": "An improvement requires reduced live wrong_result or not_tested counts with no weakened evidence-class boundary and no private/provider claim without inputs.",
+      "next_optimization_direction": "Rerun the full relevant competitor matrix after each product optimization and update the Markdown/JSON ledger with improved, regressed, unchanged, blocked, and not_tested buckets."
+    }
+  ]
+}

From 70faad0c6c93b9cd930c470840725d0aa5583d1b Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Tue, 16 Jun 2026 09:02:36 +0800
Subject: [PATCH 2/2] {"schema":"decodex/commit/1","summary":"Refresh XY-951
 review gate after stale Devin suite","authority":"XY-951"}