Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions apps/elf-eval/tests/real_world_job_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,21 @@ fn temporal_history_competitor_gap_json_path() -> Result<PathBuf> {
.join("2026-06-11-temporal-history-competitor-gap-report.json"))
}

/// Builds the path to the machine-readable Dreaming-readiness stage ledger:
/// `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`, joined onto
/// the value returned by `workspace_root`.
///
/// # Errors
///
/// Propagates whatever error `workspace_root` reports.
fn dreaming_readiness_stage_ledger_json_path() -> Result<PathBuf> {
    let research_dir = workspace_root()?.join("docs").join("research");

    Ok(research_dir.join("2026-06-16-dreaming-readiness-stage-ledger.json"))
}

/// Builds the path to the human-readable Dreaming-readiness stage ledger:
/// `docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md`,
/// joined onto the value returned by `workspace_root`.
///
/// # Errors
///
/// Propagates whatever error `workspace_root` reports.
fn dreaming_readiness_stage_ledger_markdown_path() -> Result<PathBuf> {
    let benchmarking_dir = workspace_root()?.join("docs").join("guide").join("benchmarking");

    Ok(benchmarking_dir.join("2026-06-16-dreaming-readiness-stage-ledger.md"))
}

fn competitor_strength_matrix_path() -> Result<PathBuf> {
Ok(workspace_root()?
.join("docs")
Expand Down Expand Up @@ -3665,6 +3680,155 @@ fn mem0_delete_audit_probe_requires_explicit_delete_history_event() -> Result<()
Ok(())
}

/// Reads the ledger JSON and its Markdown companion from the paths given by the
/// two `dreaming_readiness_stage_ledger_*_path` helpers, then runs the header,
/// per-stage shape, baseline-count, and Markdown-boundary checks in that order.
#[test]
fn dreaming_readiness_stage_ledger_preserves_gate_shape() -> Result<()> {
    // Read and parse the JSON ledger before touching the Markdown file so the
    // first failure reported is always the JSON one.
    let ledger_json = fs::read_to_string(dreaming_readiness_stage_ledger_json_path()?)?;
    let ledger: Value = serde_json::from_str(&ledger_json)?;
    let markdown_path = dreaming_readiness_stage_ledger_markdown_path()?;
    let markdown = fs::read_to_string(markdown_path)?;
    let stage_gates = array_at(&ledger, "/stage_gates")?;

    assert_dreaming_readiness_ledger_header(&ledger)?;
    assert_dreaming_readiness_stage_shape(&ledger, stage_gates)?;
    assert_dreaming_readiness_baseline_counts(&ledger, stage_gates)?;
    assert_dreaming_readiness_markdown_boundaries(&markdown);

    Ok(())
}

/// Checks the top-level header of the Dreaming-readiness ledger.
///
/// Asserts that `/schema` is `elf.dreaming_readiness_stage_ledger/v1`, that
/// `/authority` is `XY-951`, and that `/judgment_terms` and `/count_fields`
/// each contain every term this test suite relies on.
///
/// # Errors
///
/// Propagates any error returned by `array_contains_str`.
///
/// # Panics
///
/// Panics when a header value differs or a required term is absent. Each
/// message names the pointer and the term so the failing entry is identifiable
/// without re-running under a debugger.
fn assert_dreaming_readiness_ledger_header(ledger: &Value) -> Result<()> {
    assert_eq!(
        ledger.pointer("/schema").and_then(Value::as_str),
        Some("elf.dreaming_readiness_stage_ledger/v1"),
        "unexpected ledger /schema"
    );
    assert_eq!(
        ledger.pointer("/authority").and_then(Value::as_str),
        Some("XY-951"),
        "unexpected ledger /authority"
    );

    for term in ["improved", "regressed", "unchanged", "blocked", "not_tested"] {
        // Without a message the panic only shows the expression text, which
        // mentions the loop variable but not its value.
        assert!(
            array_contains_str(ledger, "/judgment_terms", term)?,
            "/judgment_terms missing {term}"
        );
    }
    for term in ["pass", "wrong_result", "blocked", "not_tested", "not_encoded"] {
        assert!(
            array_contains_str(ledger, "/count_fields", term)?,
            "/count_fields missing {term}"
        );
    }

    Ok(())
}

/// Checks that the ledger's stage gates have the expected set and shape.
///
/// The number of stages must equal the number of required stage ids, and every
/// required id must be found via `find_by_field` on `/stage_id`. Each stage
/// must then have:
///
/// - non-empty `/baseline_commands`, `/post_stage_commands`, and
///   `/evidence_files` arrays;
/// - `/baseline_counts/{pass,wrong_result,blocked,not_tested}` values that
///   read as `u64`;
/// - a string `/comparison_judgment` that appears in the ledger's
///   `/judgment_terms`.
///
/// # Errors
///
/// Returns an error when a stage has no string `/comparison_judgment`, and
/// propagates errors from `find_by_field`, `array_at`, and
/// `array_contains_str`.
///
/// # Panics
///
/// Panics when any of the shape assertions above fails; per-stage messages
/// include the stage id.
fn assert_dreaming_readiness_stage_shape(ledger: &Value, stages: &[Value]) -> Result<()> {
    // The expected stage count is derived from this list so the two cannot
    // drift apart when a stage is added or removed.
    const REQUIRED_STAGE_IDS: &[&str] = &[
        "current_vs_historical_correctness",
        "preference_evolution",
        "deletion_ttl_tombstone_behavior",
        "reviewable_consolidation",
        "memory_summary_top_of_mind_behavior",
        "proactive_brief_readiness",
        "scheduled_memory_task_readiness",
        "final_competitor_retest_status",
    ];

    assert_eq!(stages.len(), REQUIRED_STAGE_IDS.len(), "unexpected number of stage gates");

    for stage_id in REQUIRED_STAGE_IDS.iter().copied() {
        // Only presence matters here; the lookup error is the failure signal.
        find_by_field(stages, "/stage_id", stage_id)?;
    }
    for stage in stages {
        // Fallback label keeps the failure messages readable even when the
        // stage id itself is the missing piece.
        let stage_id =
            stage.pointer("/stage_id").and_then(Value::as_str).unwrap_or("<missing stage_id>");

        assert!(
            !array_at(stage, "/baseline_commands")?.is_empty(),
            "{stage_id} missing baseline commands"
        );
        assert!(
            !array_at(stage, "/post_stage_commands")?.is_empty(),
            "{stage_id} missing post-stage commands"
        );
        assert!(
            !array_at(stage, "/evidence_files")?.is_empty(),
            "{stage_id} missing evidence files"
        );

        for count_field in ["pass", "wrong_result", "blocked", "not_tested"] {
            let pointer = format!("/baseline_counts/{count_field}");

            assert!(
                stage.pointer(&pointer).and_then(Value::as_u64).is_some(),
                "{stage_id} missing {pointer}"
            );
        }

        let judgment = stage
            .pointer("/comparison_judgment")
            .and_then(Value::as_str)
            .ok_or_else(|| eyre::eyre!("{stage_id} missing comparison_judgment"))?;

        assert!(
            array_contains_str(ledger, "/judgment_terms", judgment)?,
            "{stage_id} has comparison_judgment {judgment} not listed in /judgment_terms"
        );
    }

    Ok(())
}

fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) -> Result<()> {
let current = find_by_field(stages, "/stage_id", "current_vs_historical_correctness")?;

assert_eq!(current.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1));
assert_eq!(current.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5));
assert_eq!(current.pointer("/comparison_judgment").and_then(Value::as_str), Some("unchanged"));
assert!(
current
.pointer("/baseline_basis")
.and_then(Value::as_str)
.is_some_and(|basis| basis.contains("five current-vs-historical jobs"))
);

let preference = find_by_field(stages, "/stage_id", "preference_evolution")?;

assert_eq!(
preference.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64),
Some(1)
);

let tombstone = find_by_field(stages, "/stage_id", "deletion_ttl_tombstone_behavior")?;

assert_eq!(tombstone.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1));

let consolidation = find_by_field(stages, "/stage_id", "reviewable_consolidation")?;

assert_eq!(
consolidation.pointer("/comparison_judgment").and_then(Value::as_str),
Some("not_tested")
);
assert_eq!(
consolidation.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64),
Some(1)
);

let scheduled = find_by_field(stages, "/stage_id", "scheduled_memory_task_readiness")?;

assert_eq!(scheduled.pointer("/comparison_judgment").and_then(Value::as_str), Some("blocked"));
assert_eq!(scheduled.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(1));

let retest = find_by_field(stages, "/stage_id", "final_competitor_retest_status")?;

assert_eq!(retest.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(22));
assert_eq!(retest.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5));
assert_eq!(retest.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(2));
assert_eq!(retest.pointer("/baseline_counts/not_tested").and_then(Value::as_u64), Some(11));
assert_eq!(retest.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), Some(11));
assert!(array_at(ledger, "/summary/improved")?.is_empty());
assert!(array_at(ledger, "/summary/regressed")?.is_empty());
assert!(array_contains_str(ledger, "/summary/unchanged", "current_vs_historical_correctness")?);
assert!(array_contains_str(ledger, "/summary/blocked", "scheduled_memory_task_readiness")?);
assert!(array_contains_str(ledger, "/summary/not_tested", "proactive_brief_readiness")?);

Ok(())
}

/// Asserts that the Markdown ledger still contains each required phrase: the
/// empty `improved`/`regressed` buckets, the `memory_evolution` caveat, the
/// `XY-905` reference, and the opening of the "not allowed" claim boundary.
///
/// # Panics
///
/// Panics on the first phrase, in list order, that `markdown` does not contain.
fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) {
    let required_phrases = [
        "`improved`: none",
        "`regressed`: none",
        "live `memory_evolution` is not solved until",
        "XY-905",
        "Do not claim this ledger fixes temporal reconciliation",
    ];

    for phrase in required_phrases {
        // The message mirrors the default `assert!` output for a literal
        // argument so a failure still names the missing phrase.
        assert!(
            markdown.contains(phrase),
            "assertion failed: markdown.contains({phrase:?})"
        );
    }
}

#[test]
fn knowledge_json_report_renders_markdown_metrics() -> Result<()> {
let report = run_json_report_from(knowledge_fixture_dir())?;
Expand Down
114 changes: 114 additions & 0 deletions docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Dreaming-Readiness Stage Ledger - June 16, 2026

Goal: Define the Decodex benchmark gate for Dreaming-inspired ELF memory-system
optimization stages.
Read this when: You are starting or finishing a staged memory improvement lane and
need the baseline command matrix, typed evidence status, and report shape required
before claiming the stage improved.
Inputs: `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`, the June 11
competitor-strength, temporal-history, and iteration-direction reports, the
consolidation proposal spec, and the checked-in real-world fixture suites.
Outputs: A stage-by-stage ledger that downstream issues can update with
`improved`, `regressed`, `unchanged`, `blocked`, or `not_tested` judgments.

## Executive Judgment

This ledger does not claim a new product win. It creates the gate later product lanes
must pass before they can claim a Dreaming or competitor-inspired stage is done.

Current baseline:

- `improved`: none.
- `regressed`: none.
- `unchanged`: current-vs-historical correctness, preference evolution,
deletion/TTL/tombstone behavior, and the final competitor retest baseline.
- `blocked`: scheduled-memory-task readiness.
- `not_tested`: reviewable consolidation beyond fixtures, memory-summary/top-of-mind
live behavior, and proactive brief readiness.

The important known loss is preserved: live `memory_evolution` is not solved until
XY-905 changes behavior and reruns the live gate. The current ELF live adapter passes
only the delete/TTL tombstone job and keeps five current-vs-historical jobs as
`wrong_result`.

## Ledger Rules

- Every downstream Dreaming or competitor-improvement stage must write a post-stage
JSON report and Markdown summary before claiming phase completion.
- The report must compare against the baseline counts in
`docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`.
- The comparison judgment must be one of `improved`, `regressed`, `unchanged`,
`blocked`, or `not_tested`.
- Typed non-pass labels stay typed. Do not collapse `wrong_result`, `blocked`,
`not_tested`, `not_encoded`, `incomplete`, `lifecycle_fail`, `unsupported`, or
`non_goal` into a single pass/fail label.
- Fixture-backed evidence proves benchmark shape only. It does not prove live product
behavior.
- Private-corpus and provider-backed gates remain typed as `blocked` unless an operator
supplies explicit inputs; those boundaries are tied to XY-930.

## Stage Command Matrix

| Stage | Baseline command(s) | Required post-stage command(s) | Current counts | Judgment | Next optimization direction |
| --- | --- | --- | --- | --- | --- |
| Current-vs-historical correctness | `cargo make real-world-memory-evolution`; `cargo make real-world-memory-live-adapters` | Same commands; publish post-stage JSON and Markdown evidence | `pass=1`, `wrong_result=5`, `blocked=0`, `not_tested=0` | `unchanged` | XY-905 must make live answers cite current, historical, rationale, and tombstone evidence instead of only retrieving snippets. |
| Preference evolution and correction history | `cargo make real-world-memory-evolution`; `cargo make real-world-memory-live-adapters`; `cargo make openmemory-ui-export-readback` | Same commands; include mem0/OpenMemory boundary evidence | `pass=0`, `wrong_result=1`, `blocked=0`, `not_tested=0` | `unchanged` | Preserve current and superseded preferences with rationale evidence; do not claim ELF beats mem0/OpenMemory history until measured. |
| Deletion, TTL, and tombstone behavior | `cargo make real-world-memory`; `cargo make real-world-memory-live-adapters` | Same commands | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0` | `unchanged` | Preserve the current tombstone pass while repairing adjacent temporal-history wrong results. |
| Reviewable consolidation | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=1` | `not_tested` | Keep Dreaming output derived and reviewable with lineage, confidence, unsupported-claim flags, apply/defer/discard audit, and no source mutation. |
| Memory summary and top-of-mind behavior | `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival` | Same commands plus `cargo make real-world-memory-live-adapters` | `pass=8`, `wrong_result=0`, `blocked=0`, `not_tested=1` | `not_tested` | Build summaries as cited, rebuildable derived pages or core blocks; do not turn hidden summaries into authoritative memory. |
| Proactive brief readiness | `cargo make real-world-first-generation-oss`; `cargo make real-world-job-operator-ux` | Same commands plus `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1` | `not_tested` | Add direct proactive-brief fixtures before any pass claim; briefs must be source-linked and repairable. |
| Scheduled memory task readiness | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=1`, `not_tested=0` | `blocked` | Scheduled runs are future work; start with queued derived proposal runs and keep operator review mandatory. |
| Final competitor retest status | `cargo make real-world-memory-live-adapters`; `cargo make real-world-first-generation-oss`; `cargo make real-world-memory-graph-rag`; `cargo make openmemory-ui-export-readback`; `cargo make baseline-production-private-addendum` when operator input exists | Same commands; private/provider commands may remain typed blocked under XY-930 | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11` | `unchanged` | Rerun the relevant competitor matrix after each optimization and update improved/regressed/unchanged/blocked/not-tested buckets. |

## Evidence Anchors

| Stage | Evidence file(s) |
| --- | --- |
| Current-vs-historical correctness | `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` |
| Preference evolution and correction history | `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json` |
| Deletion, TTL, and tombstone behavior | `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md` |
| Reviewable consolidation | `docs/spec/system_consolidation_proposals_v1.md`; `apps/elf-eval/fixtures/real_world_memory/consolidation/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` |
| Memory summary and top-of-mind behavior | `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` |
| Proactive brief readiness | `docs/research/2026-06-08-agent-memory-selection.json`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` |
| Scheduled memory task readiness | `docs/spec/system_consolidation_proposals_v1.md`; `docs/research/2026-06-08-agent-memory-selection.json` |
| Final competitor retest status | `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/research/2026-06-11-competitor-strength-adoption-report.json`; `docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` |

## Report Shape For Downstream Issues

Downstream stage reports should use the same fields as the JSON ledger:

- `stage_id`
- `baseline_commands`
- `post_stage_commands`
- `evidence_files`
- `baseline_counts`
- `post_stage_counts`
- `comparison_judgment`
- `regression_rule`
- `improvement_rule`
- `next_optimization_direction`

If a stage cannot run because credentials, private corpus, provider setup, or a
product surface is absent, record `blocked` or `not_tested` with the concrete blocker.
Do not silently drop the stage from the report.

## Claim Boundaries

Allowed:

- The Dreaming-readiness gate exists and names required stage commands and evidence
files.
- The current baseline preserves typed non-pass states and the known live
memory-evolution loss.
- Fixture-backed consolidation, knowledge, and core/archival jobs can be used as
regression guards for report shape.

Not allowed:

- Do not claim this ledger fixes temporal reconciliation, preference history,
consolidation, proactive briefs, scheduled tasks, or competitor adapters.
- Do not claim ELF has full-suite live real-world pass evidence.
- Do not claim private-corpus or provider-backed production quality without the
operator-owned inputs required by XY-930.
- Do not claim fixture-only or smoke-only evidence proves broad competitor
superiority.
5 changes: 5 additions & 0 deletions docs/guide/benchmarking/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ cleanup, use `docs/guide/single_user_production.md`.
personalization, and export-readback comparison with normalized
win/tie/loss/not-tested/blocked/non-goal outcomes and explicit hosted/UI/graph
non-claims.
- `2026-06-16-dreaming-readiness-stage-ledger.md`: XY-951 stage-gate ledger for
Dreaming-inspired memory improvements, with the required current baseline,
post-stage command matrix, typed improved/regressed/unchanged/blocked/not-tested
buckets, and machine-readable companion file
`docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`.
- `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world
agent memory benchmark contract, including suite taxonomy, typed report states,
knowledge-compilation fixture tasks, and the production-ops fixture target.
Expand Down
Loading