From 9c109860c30f7bbc9e2b94bfbf8d9ab7248ddad8 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 16 Jun 2026 01:28:12 +0800 Subject: [PATCH 1/2] {"schema":"decodex/commit/1","summary":"Add Dreaming-readiness stage benchmark ledger","authority":"XY-951"} --- .../tests/real_world_job_benchmark.rs | 164 +++++++ ...6-06-16-dreaming-readiness-stage-ledger.md | 114 +++++ docs/guide/benchmarking/index.md | 5 + ...06-16-dreaming-readiness-stage-ledger.json | 454 ++++++++++++++++++ 4 files changed, 737 insertions(+) create mode 100644 docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md create mode 100644 docs/research/2026-06-16-dreaming-readiness-stage-ledger.json diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index a71a7c81..ad52e8c5 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -182,6 +182,21 @@ fn temporal_history_competitor_gap_json_path() -> Result { .join("2026-06-11-temporal-history-competitor-gap-report.json")) } +fn dreaming_readiness_stage_ledger_json_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-16-dreaming-readiness-stage-ledger.json")) +} + +fn dreaming_readiness_stage_ledger_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-16-dreaming-readiness-stage-ledger.md")) +} + fn competitor_strength_matrix_path() -> Result { Ok(workspace_root()? 
.join("docs") @@ -3665,6 +3680,155 @@ fn mem0_delete_audit_probe_requires_explicit_delete_history_event() -> Result<() Ok(()) } +#[test] +fn dreaming_readiness_stage_ledger_preserves_gate_shape() -> Result<()> { + let ledger = serde_json::from_str::(&fs::read_to_string( + dreaming_readiness_stage_ledger_json_path()?, + )?)?; + let markdown = fs::read_to_string(dreaming_readiness_stage_ledger_markdown_path()?)?; + let stages = array_at(&ledger, "/stage_gates")?; + + assert_dreaming_readiness_ledger_header(&ledger)?; + assert_dreaming_readiness_stage_shape(&ledger, stages)?; + assert_dreaming_readiness_baseline_counts(&ledger, stages)?; + assert_dreaming_readiness_markdown_boundaries(&markdown); + + Ok(()) +} + +fn assert_dreaming_readiness_ledger_header(ledger: &Value) -> Result<()> { + assert_eq!( + ledger.pointer("/schema").and_then(Value::as_str), + Some("elf.dreaming_readiness_stage_ledger/v1") + ); + assert_eq!(ledger.pointer("/authority").and_then(Value::as_str), Some("XY-951")); + + for term in ["improved", "regressed", "unchanged", "blocked", "not_tested"] { + assert!(array_contains_str(ledger, "/judgment_terms", term)?); + } + for term in ["pass", "wrong_result", "blocked", "not_tested", "not_encoded"] { + assert!(array_contains_str(ledger, "/count_fields", term)?); + } + + Ok(()) +} + +fn assert_dreaming_readiness_stage_shape(ledger: &Value, stages: &[Value]) -> Result<()> { + assert_eq!(stages.len(), 8); + + for stage_id in [ + "current_vs_historical_correctness", + "preference_evolution", + "deletion_ttl_tombstone_behavior", + "reviewable_consolidation", + "memory_summary_top_of_mind_behavior", + "proactive_brief_readiness", + "scheduled_memory_task_readiness", + "final_competitor_retest_status", + ] { + find_by_field(stages, "/stage_id", stage_id)?; + } + for stage in stages { + let stage_id = + stage.pointer("/stage_id").and_then(Value::as_str).unwrap_or(""); + + assert!( + !array_at(stage, "/baseline_commands")?.is_empty(), + "{stage_id} missing 
baseline commands" + ); + assert!( + !array_at(stage, "/post_stage_commands")?.is_empty(), + "{stage_id} missing post-stage commands" + ); + assert!( + !array_at(stage, "/evidence_files")?.is_empty(), + "{stage_id} missing evidence files" + ); + + for count_field in ["pass", "wrong_result", "blocked", "not_tested"] { + let pointer = format!("/baseline_counts/{count_field}"); + + assert!( + stage.pointer(&pointer).and_then(Value::as_u64).is_some(), + "{stage_id} missing {pointer}" + ); + } + + let judgment = stage + .pointer("/comparison_judgment") + .and_then(Value::as_str) + .ok_or_else(|| eyre::eyre!("{stage_id} missing comparison_judgment"))?; + + assert!(array_contains_str(ledger, "/judgment_terms", judgment)?); + } + + Ok(()) +} + +fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) -> Result<()> { + let current = find_by_field(stages, "/stage_id", "current_vs_historical_correctness")?; + + assert_eq!(current.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1)); + assert_eq!(current.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5)); + assert_eq!(current.pointer("/comparison_judgment").and_then(Value::as_str), Some("unchanged")); + assert!( + current + .pointer("/baseline_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("five current-vs-historical jobs")) + ); + + let preference = find_by_field(stages, "/stage_id", "preference_evolution")?; + + assert_eq!( + preference.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), + Some(1) + ); + + let tombstone = find_by_field(stages, "/stage_id", "deletion_ttl_tombstone_behavior")?; + + assert_eq!(tombstone.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1)); + + let consolidation = find_by_field(stages, "/stage_id", "reviewable_consolidation")?; + + assert_eq!( + consolidation.pointer("/comparison_judgment").and_then(Value::as_str), + Some("not_tested") + ); + assert_eq!( + 
consolidation.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), + Some(1) + ); + + let scheduled = find_by_field(stages, "/stage_id", "scheduled_memory_task_readiness")?; + + assert_eq!(scheduled.pointer("/comparison_judgment").and_then(Value::as_str), Some("blocked")); + assert_eq!(scheduled.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(1)); + + let retest = find_by_field(stages, "/stage_id", "final_competitor_retest_status")?; + + assert_eq!(retest.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(22)); + assert_eq!(retest.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5)); + assert_eq!(retest.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(2)); + assert_eq!(retest.pointer("/baseline_counts/not_tested").and_then(Value::as_u64), Some(11)); + assert_eq!(retest.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), Some(11)); + assert!(array_at(ledger, "/summary/improved")?.is_empty()); + assert!(array_at(ledger, "/summary/regressed")?.is_empty()); + assert!(array_contains_str(ledger, "/summary/unchanged", "current_vs_historical_correctness")?); + assert!(array_contains_str(ledger, "/summary/blocked", "scheduled_memory_task_readiness")?); + assert!(array_contains_str(ledger, "/summary/not_tested", "proactive_brief_readiness")?); + + Ok(()) +} + +fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) { + assert!(markdown.contains("`improved`: none")); + assert!(markdown.contains("`regressed`: none")); + assert!(markdown.contains("live `memory_evolution` is not solved until")); + assert!(markdown.contains("XY-905")); + assert!(markdown.contains("Do not claim this ledger fixes temporal reconciliation")); +} + #[test] fn knowledge_json_report_renders_markdown_metrics() -> Result<()> { let report = run_json_report_from(knowledge_fixture_dir())?; diff --git a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md 
b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md new file mode 100644 index 00000000..8d299867 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md @@ -0,0 +1,114 @@ +# Dreaming-Readiness Stage Ledger - June 16, 2026 + +Goal: Define the Decodex benchmark gate for Dreaming-inspired ELF memory-system +optimization stages. +Read this when: You are starting or finishing a staged memory improvement lane and +need the baseline command matrix, typed evidence status, and report shape required +before claiming the stage improved. +Inputs: `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`, the June 11 +competitor-strength, temporal-history, and iteration-direction reports, the +consolidation proposal spec, and the checked-in real-world fixture suites. +Outputs: A stage-by-stage ledger that downstream issues can update with +`improved`, `regressed`, `unchanged`, `blocked`, or `not_tested` judgments. + +## Executive Judgment + +This ledger does not claim a new product win. It creates the gate later product lanes +must pass before they can claim a Dreaming or competitor-inspired stage is done. + +Current baseline: + +- `improved`: none. +- `regressed`: none. +- `unchanged`: current-vs-historical correctness, preference evolution, + deletion/TTL/tombstone behavior, and the final competitor retest baseline. +- `blocked`: scheduled-memory-task readiness. +- `not_tested`: reviewable consolidation beyond fixtures, memory-summary/top-of-mind + live behavior, and proactive brief readiness. + +The important known loss is preserved: live `memory_evolution` is not solved until +XY-905 changes behavior and reruns the live gate. The current ELF live adapter passes +only the delete/TTL tombstone job and keeps five current-vs-historical jobs as +`wrong_result`. 
+ +## Ledger Rules + +- Every downstream Dreaming or competitor-improvement stage must write a post-stage + JSON report and Markdown summary before claiming phase completion. +- The report must compare against the baseline counts in + `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`. +- The comparison judgment must be one of `improved`, `regressed`, `unchanged`, + `blocked`, or `not_tested`. +- Typed non-pass labels stay typed. Do not collapse `wrong_result`, `blocked`, + `not_tested`, `not_encoded`, `incomplete`, `lifecycle_fail`, `unsupported`, or + `non_goal` into a single pass/fail label. +- Fixture-backed evidence proves benchmark shape only. It does not prove live product + behavior. +- Private-corpus and provider-backed gates remain typed blocked unless an operator + supplies explicit inputs; those boundaries are tied to XY-930. + +## Stage Command Matrix + +| Stage | Baseline command(s) | Required post-stage command(s) | Current counts | Judgment | Next optimization direction | +| --- | --- | --- | --- | --- | --- | +| Current-vs-historical correctness | `cargo make real-world-memory-evolution`; `cargo make real-world-memory-live-adapters` | Same commands; publish post-stage JSON and Markdown evidence | `pass=1`, `wrong_result=5`, `blocked=0`, `not_tested=0` | `unchanged` | XY-905 must make live answers cite current, historical, rationale, and tombstone evidence instead of only retrieving snippets. | +| Preference evolution and correction history | `cargo make real-world-memory-evolution`; `cargo make real-world-memory-live-adapters`; `cargo make openmemory-ui-export-readback` | Same commands; include mem0/OpenMemory boundary evidence | `pass=0`, `wrong_result=1`, `blocked=0`, `not_tested=0` | `unchanged` | Preserve current and superseded preferences with rationale evidence; do not claim ELF beats mem0/OpenMemory history until measured. 
| +| Deletion, TTL, and tombstone behavior | `cargo make real-world-memory`; `cargo make real-world-memory-live-adapters` | Same commands | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0` | `unchanged` | Preserve the current tombstone pass while repairing adjacent temporal-history wrong results. | +| Reviewable consolidation | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=1` | `not_tested` | Keep Dreaming output derived and reviewable with lineage, confidence, unsupported-claim flags, apply/defer/discard audit, and no source mutation. | +| Memory summary and top-of-mind behavior | `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival` | Same commands plus `cargo make real-world-memory-live-adapters` | `pass=8`, `wrong_result=0`, `blocked=0`, `not_tested=1` | `not_tested` | Build summaries as cited, rebuildable derived pages or core blocks; do not turn hidden summaries into authoritative memory. | +| Proactive brief readiness | `cargo make real-world-first-generation-oss`; `cargo make real-world-job-operator-ux` | Same commands plus `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1` | `not_tested` | Add direct proactive-brief fixtures before any pass claim; briefs must be source-linked and repairable. | +| Scheduled memory task readiness | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=1`, `not_tested=0` | `blocked` | Scheduled runs are future work; start with queued derived proposal runs and keep operator review mandatory. 
| +| Final competitor retest status | `cargo make real-world-memory-live-adapters`; `cargo make real-world-first-generation-oss`; `cargo make real-world-memory-graph-rag`; `cargo make openmemory-ui-export-readback`; `cargo make baseline-production-private-addendum` when operator input exists | Same commands; private/provider commands may remain typed blocked under XY-930 | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11` | `unchanged` | Rerun the relevant competitor matrix after each optimization and update improved/regressed/unchanged/blocked/not-tested buckets. | + +## Evidence Anchors + +| Stage | Evidence file(s) | +| --- | --- | +| Current-vs-historical correctness | `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Preference evolution and correction history | `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json` | +| Deletion, TTL, and tombstone behavior | `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md` | +| Reviewable consolidation | `docs/spec/system_consolidation_proposals_v1.md`; `apps/elf-eval/fixtures/real_world_memory/consolidation/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Memory summary and top-of-mind behavior | `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Proactive brief readiness | `docs/research/2026-06-08-agent-memory-selection.json`; 
`docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Scheduled memory task readiness | `docs/spec/system_consolidation_proposals_v1.md`; `docs/research/2026-06-08-agent-memory-selection.json` | +| Final competitor retest status | `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/research/2026-06-11-competitor-strength-adoption-report.json`; `docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | + +## Report Shape For Downstream Issues + +Downstream stage reports should use the same fields as the JSON ledger: + +- `stage_id` +- `baseline_commands` +- `post_stage_commands` +- `evidence_files` +- `baseline_counts` +- `post_stage_counts` +- `comparison_judgment` +- `regression_rule` +- `improvement_rule` +- `next_optimization_direction` + +If a stage cannot run because credentials, private corpus, provider setup, or a +product surface is absent, record `blocked` or `not_tested` with the concrete blocker. +Do not silently drop the stage from the report. + +## Claim Boundaries + +Allowed: + +- The Dreaming-readiness gate exists and names required stage commands and evidence + files. +- The current baseline preserves typed non-pass states and the known live + memory-evolution loss. +- Fixture-backed consolidation, knowledge, and core/archival jobs can be used as + regression guards for report shape. + +Not allowed: + +- Do not claim this ledger fixes temporal reconciliation, preference history, + consolidation, proactive briefs, scheduled tasks, or competitor adapters. +- Do not claim ELF has full-suite live real-world pass evidence. +- Do not claim private-corpus or provider-backed production quality without the + operator-owned inputs required by XY-930. +- Do not claim fixture-only or smoke-only evidence proves broad competitor + superiority.
diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index b2292476..991dd2f9 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -110,6 +110,11 @@ cleanup, use `docs/guide/single_user_production.md`. personalization, and export-readback comparison with normalized win/tie/loss/not-tested/blocked/non-goal outcomes and explicit hosted/UI/graph non-claims. +- `2026-06-16-dreaming-readiness-stage-ledger.md`: XY-951 stage-gate ledger for + Dreaming-inspired memory improvements, with the required current baseline, + post-stage command matrix, typed improved/regressed/unchanged/blocked/not-tested + buckets, and machine-readable companion file + `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`. - `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world agent memory benchmark contract, including suite taxonomy, typed report states, knowledge-compilation fixture tasks, and the production-ops fixture target. 
diff --git a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json new file mode 100644 index 00000000..9e43f1be --- /dev/null +++ b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json @@ -0,0 +1,454 @@ +{ + "schema": "elf.dreaming_readiness_stage_ledger/v1", + "ledger_id": "xy-951-dreaming-readiness-stage-ledger-2026-06-16", + "authority": "XY-951", + "created_at": "2026-06-16T00:00:00Z", + "purpose": "Define the benchmark evidence gate that every Dreaming-inspired ELF optimization stage must update before claiming completion.", + "source_evidence_cutoff": "Checked-in benchmark and research evidence through 2026-06-11; no new live/provider/private benchmark pass is claimed by this ledger.", + "typed_status_terms": [ + "pass", + "wrong_result", + "blocked", + "not_tested", + "not_encoded", + "incomplete", + "lifecycle_fail", + "unsupported", + "non_goal" + ], + "judgment_terms": [ + "improved", + "regressed", + "unchanged", + "blocked", + "not_tested" + ], + "count_fields": [ + "pass", + "wrong_result", + "blocked", + "not_tested", + "not_encoded" + ], + "gate_rules": [ + "Every downstream Dreaming or competitor-improvement stage must write a post-stage JSON report and Markdown summary before claiming phase completion.", + "Post-stage reports must compare against this ledger's baseline counts and set exactly one comparison_judgment: improved, regressed, unchanged, blocked, or not_tested.", + "Typed non-pass states must remain typed; blocked, not_tested, not_encoded, incomplete, lifecycle_fail, unsupported, non_goal, and wrong_result must not be collapsed into a generic fail or hidden under pass.", + "Fixture-backed evidence may prove benchmark shape but must not be promoted into live_real_world product quality.", + "Private-corpus and provider-backed production gates remain typed blocked unless the operator supplies explicit inputs; those blockers are tracked under XY-930.", + "The live
memory_evolution loss remains open until XY-905 changes behavior and reruns the live gate." + ], + "summary": { + "improved": [], + "regressed": [], + "unchanged": [ + "current_vs_historical_correctness", + "preference_evolution", + "deletion_ttl_tombstone_behavior", + "final_competitor_retest_status" + ], + "blocked": [ + "scheduled_memory_task_readiness" + ], + "not_tested": [ + "reviewable_consolidation", + "memory_summary_top_of_mind_behavior", + "proactive_brief_readiness" + ] + }, + "stage_gates": [ + { + "stage_id": "current_vs_historical_correctness", + "stage_name": "Current-vs-historical correctness", + "dependent_issue": "XY-905", + "evidence_class": "live_real_world", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-evolution", + "artifact": "tmp/real-world-memory/evolution-report.json", + "purpose": "Fixture gate for current facts, historical facts, conflicts, and update rationales." + }, + { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/", + "purpose": "Live ELF/qmd real-world adapter gate for the memory_evolution suite." 
+ } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-evolution", + "required_artifact": "tmp/real-world-memory/evolution-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "evidence_files": [ + "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/research/2026-06-11-temporal-history-competitor-gap-report.json", + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + ], + "baseline_counts": { + "pass": 1, + "wrong_result": 5, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "baseline_basis": "ELF live service adapter memory_evolution suite: one delete/TTL job passes and five current-vs-historical jobs are wrong_result.", + "comparison_judgment": "unchanged", + "regression_rule": "Any new wrong_result, missed evidence, or loss of the delete/TTL pass is a regression.", + "improvement_rule": "An improvement requires fewer live ELF wrong_result jobs without increasing blocked/not_tested counts.", + "next_optimization_direction": "Implement current/historical/rationale/tombstone answer and trace selection before claiming temporal memory is solved." + }, + { + "stage_id": "preference_evolution", + "stage_name": "Preference evolution and correction history", + "dependent_issue": "XY-905", + "evidence_class": "live_real_world", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-evolution", + "artifact": "tmp/real-world-memory/evolution-report.json", + "purpose": "Fixture gate for the preference-change job." + }, + { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/", + "purpose": "Live adapter gate for memory-evolution-preference-001." 
+ }, + { + "command": "cargo make openmemory-ui-export-readback", + "artifact": "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", + "purpose": "External comparison boundary for mem0/OpenMemory preference correction and export-style history." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-evolution", + "required_artifact": "tmp/real-world-memory/evolution-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + }, + { + "command": "cargo make openmemory-ui-export-readback", + "required_artifact": "tmp/live-baseline/" + } + ], + "evidence_files": [ + "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", + "docs/research/2026-06-11-temporal-history-competitor-gap-report.json" + ], + "baseline_counts": { + "pass": 0, + "wrong_result": 1, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "baseline_basis": "ELF live memory-evolution-preference-001 is wrong_result; mem0 local OSS preference correction history is measured as an ELF loss.", + "comparison_judgment": "unchanged", + "regression_rule": "Any loss of fixture preference correctness or any new blocked/not_tested live preference gate is a regression.", + "improvement_rule": "An improvement requires live preference correction history to pass while preserving old preference history as historical evidence.", + "next_optimization_direction": "Add explicit preference correction history and answer fields that name the current preference, the superseded preference, and the rationale evidence." 
+ }, + { + "stage_id": "deletion_ttl_tombstone_behavior", + "stage_name": "Deletion, TTL, and tombstone behavior", + "dependent_issue": "XY-905", + "evidence_class": "live_real_world", + "baseline_commands": [ + { + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json", + "purpose": "Aggregate fixture gate containing memory-evolution-delete-ttl-001." + }, + { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/", + "purpose": "Live adapter gate for tombstone behavior." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory", + "required_artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "evidence_files": [ + "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" + ], + "baseline_counts": { + "pass": 1, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "baseline_basis": "ELF live memory-evolution-delete-ttl-001 passes with tombstone and current-plan evidence; qmd misses the tombstone.", + "comparison_judgment": "unchanged", + "regression_rule": "Losing tombstone evidence, returning stale deleted content, or failing the aggregate fixture is a regression.", + "improvement_rule": "This stage is already pass for ELF; improvement requires preserving the pass while reducing adjacent memory_evolution wrong_result counts.", + "next_optimization_direction": "Keep tombstone and TTL invalidation evidence answerable as temporal reconciliation is repaired." 
+ }, + { + "stage_id": "reviewable_consolidation", + "stage_name": "Reviewable consolidation", + "dependent_issue": "XY-926", + "evidence_class": "fixture_backed", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-consolidation", + "artifact": "tmp/real-world-memory/consolidation/report.json", + "purpose": "Fixture gate for review actions, lineage, unsupported claims, contradiction, and source immutability." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-consolidation", + "required_artifact": "tmp/real-world-memory/consolidation/report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "evidence_files": [ + "docs/spec/system_consolidation_proposals_v1.md", + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "apps/elf-eval/fixtures/real_world_memory/consolidation/" + ], + "baseline_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 0, + "not_tested": 1, + "not_encoded": 1 + }, + "baseline_basis": "Consolidation fixtures pass, but live consolidation proposal generation and review-action scoring are not encoded.", + "comparison_judgment": "not_tested", + "regression_rule": "Any source mutation, missing lineage, or collapse of review actions into an automatic rewrite is a regression.", + "improvement_rule": "An improvement requires live or service-backed consolidation scoring without provider hidden state and without mutating authoritative sources.", + "next_optimization_direction": "Keep Dreaming output derived and reviewable: proposal lineage, confidence, unsupported-claim flags, apply/defer/discard audit, and immutable source snapshots." 
+ }, + { + "stage_id": "memory_summary_top_of_mind_behavior", + "stage_name": "Memory summary and top-of-mind behavior", + "dependent_issue": "XY-926", + "evidence_class": "fixture_backed", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-knowledge", + "artifact": "tmp/real-world-memory/knowledge-report.json", + "purpose": "Fixture gate for derived knowledge pages, citations, stale-source lint, and repair guidance." + }, + { + "command": "cargo make real-world-memory-core-archival", + "artifact": "tmp/real-world-memory/core-archival/report.json", + "purpose": "Fixture gate for always-attached core block attachment, scope, provenance, stale-core detection, and archival fallback." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-knowledge", + "required_artifact": "tmp/real-world-memory/knowledge-report.json" + }, + { + "command": "cargo make real-world-memory-core-archival", + "required_artifact": "tmp/real-world-memory/core-archival/report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "evidence_files": [ + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "apps/elf-eval/fixtures/real_world_memory/knowledge/", + "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/" + ], + "baseline_counts": { + "pass": 8, + "wrong_result": 0, + "blocked": 0, + "not_tested": 1, + "not_encoded": 1 + }, + "baseline_basis": "Knowledge and core/archival fixtures pass, but live knowledge compilation and top-of-mind product behavior are not encoded.", + "comparison_judgment": "not_tested", + "regression_rule": "Any stale summary, unsupported section, missing source id, or stale core block presented as current is a regression.", + "improvement_rule": "An improvement requires live top-of-mind or summary readback that remains source-linked and linted for stale/unsupported claims.", + 
"next_optimization_direction": "Build summaries as derived, cited, rebuildable pages or core blocks; do not replace authoritative notes with hidden summaries." + }, + { + "stage_id": "proactive_brief_readiness", + "stage_name": "Proactive brief readiness", + "dependent_issue": "XY-926", + "evidence_class": "not_encoded", + "baseline_commands": [ + { + "command": "cargo make real-world-first-generation-oss", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json", + "purpose": "Regression guard for claude-mem progressive-disclosure and retrieval-repair reference behavior." + }, + { + "command": "cargo make real-world-job-operator-ux", + "artifact": "tmp/real-world-job/real-world-job-operator-ux-report.json", + "purpose": "Regression guard for operator-facing trace and repair-action clarity." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-first-generation-oss", + "required_artifact": "tmp/real-world-memory/first-generation-oss/report.json" + }, + { + "command": "cargo make real-world-job-operator-ux", + "required_artifact": "tmp/real-world-job/real-world-job-operator-ux-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "evidence_files": [ + "docs/research/2026-06-08-agent-memory-selection.json", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md", + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + ], + "baseline_counts": { + "pass": 0, + "wrong_result": 0, + "blocked": 0, + "not_tested": 1, + "not_encoded": 1 + }, + "baseline_basis": "No direct proactive-brief real_world_job suite exists; adjacent progressive-disclosure and operator-debug fixtures are reference guards only.", + "comparison_judgment": "not_tested", + "regression_rule": "A proactive brief that is uncited, leaks excluded content, or cannot explain source selection is a regression.", + 
"improvement_rule": "An improvement requires a direct proactive-brief fixture or live adapter report with cited source ids and typed non-pass handling.", + "next_optimization_direction": "Add proactive briefs only as source-linked derived output with repair guidance and no secret or excluded-span leakage." + }, + { + "stage_id": "scheduled_memory_task_readiness", + "stage_name": "Scheduled memory task readiness", + "dependent_issue": "XY-926", + "evidence_class": "blocked", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-consolidation", + "artifact": "tmp/real-world-memory/consolidation/report.json", + "purpose": "Current closest fixture gate for deterministic fixture/manual consolidation runs." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-consolidation", + "required_artifact": "tmp/real-world-memory/consolidation/report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "evidence_files": [ + "docs/spec/system_consolidation_proposals_v1.md", + "docs/research/2026-06-08-agent-memory-selection.json" + ], + "baseline_counts": { + "pass": 0, + "wrong_result": 0, + "blocked": 1, + "not_tested": 0, + "not_encoded": 0 + }, + "baseline_basis": "The consolidation spec permits fixture and manual job_kind only; scheduled is explicitly future work and no scheduled-memory-task benchmark is encoded.", + "comparison_judgment": "blocked", + "regression_rule": "Adding scheduled tasks without reviewable output, immutable source snapshots, and explicit operator review is a regression.", + "improvement_rule": "An improvement requires a scheduled-task fixture or live report that keeps task output reviewable and records provider/private boundaries as typed blockers.", + "next_optimization_direction": "Model scheduled tasks as queued derived proposal runs first; do not allow a scheduler to mutate authoritative memory silently." 
+ }, + { + "stage_id": "final_competitor_retest_status", + "stage_name": "Final competitor retest status", + "dependent_issue": "XY-951", + "evidence_class": "live_real_world", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/", + "purpose": "Full encoded ELF/qmd live real-world sweep." + }, + { + "command": "cargo make real-world-first-generation-oss", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json", + "purpose": "First-generation OSS prompt fixture and typed blocker slice." + }, + { + "command": "cargo make real-world-memory-graph-rag", + "artifact": "tmp/real-world-memory/graph-rag/report.json", + "purpose": "Representative graph/RAG typed non-pass fixture slice." + }, + { + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/", + "purpose": "mem0/OpenMemory local OSS history and export-readback boundary." + }, + { + "command": "cargo make baseline-production-private-addendum", + "artifact": "tmp/live-baseline/private-production-addendum.md", + "purpose": "Private-corpus addendum; remains blocked unless an operator-owned manifest is supplied." 
+ } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + }, + { + "command": "cargo make real-world-first-generation-oss", + "required_artifact": "tmp/real-world-memory/first-generation-oss/report.json" + }, + { + "command": "cargo make real-world-memory-graph-rag", + "required_artifact": "tmp/real-world-memory/graph-rag/report.json" + }, + { + "command": "cargo make openmemory-ui-export-readback", + "required_artifact": "tmp/live-baseline/" + }, + { + "command": "cargo make baseline-production-private-addendum", + "required_artifact": "tmp/live-baseline/private-production-addendum.md" + } + ], + "evidence_files": [ + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "docs/research/2026-06-11-competitor-strength-adoption-report.json", + "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" + ], + "baseline_counts": { + "pass": 22, + "wrong_result": 5, + "blocked": 2, + "not_tested": 11, + "not_encoded": 11 + }, + "baseline_basis": "ELF full live real-world sweep: 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs. 
The not_encoded jobs are represented as not_tested for this stage gate while preserving the raw not_encoded count.", + "comparison_judgment": "unchanged", + "regression_rule": "Any higher wrong_result/blocked/not_tested count, missing typed blocker, or unsupported broad competitor win claim is a regression.", + "improvement_rule": "An improvement requires reduced live wrong_result or not_tested counts with no weakened evidence-class boundary and no private/provider claim without inputs.", + "next_optimization_direction": "Rerun the full relevant competitor matrix after each product optimization and update the Markdown/JSON ledger with improved, regressed, unchanged, blocked, and not_tested buckets." + } + ] +} From 70faad0c6c93b9cd930c470840725d0aa5583d1b Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 16 Jun 2026 09:02:36 +0800 Subject: [PATCH 2/2] {"schema":"decodex/commit/1","summary":"Refresh XY-951 review gate after stale Devin suite","authority":"XY-951"}