From 6a39c5eeac85c45e67ecf968386d34f8ac02084a Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 16 Jun 2026 23:03:41 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add proactive brief benchmark scoring","authority":"XY-953"} --- Makefile.toml | 52 ++ README.md | 19 +- .../memory_projects_manifest.json | 12 +- .../proactive_brief/daily_project_brief.json | 267 +++++++ .../private_corpus_refresh_blocked.json | 124 +++ .../proactive_brief/resume_work_brief.json | 251 ++++++ .../proactive_brief/stale_decision_audit.json | 218 +++++ .../stale_plan_preference_warning.json | 316 ++++++++ .../src/bin/real_world_job_benchmark.rs | 752 +++++++++++++++++- .../tests/real_world_job_benchmark.rs | 362 ++++++++- ...-11-competitor-strength-adoption-report.md | 3 +- ...-11-competitor-strength-evidence-matrix.md | 16 +- ...on-direction-from-competitor-benchmarks.md | 21 +- ...6-06-16-dreaming-readiness-stage-ledger.md | 28 +- ...26-06-16-proactive-brief-scoring-report.md | 100 +++ docs/guide/benchmarking/index.md | 4 + .../real_world_agent_memory_benchmark.md | 12 +- ...1-competitor-strength-adoption-report.json | 7 +- ...06-16-dreaming-readiness-stage-ledger.json | 54 +- ...-06-16-proactive-brief-scoring-report.json | 131 +++ 20 files changed, 2646 insertions(+), 103 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/proactive_brief/daily_project_brief.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/proactive_brief/private_corpus_refresh_blocked.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/proactive_brief/resume_work_brief.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_decision_audit.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_plan_preference_warning.json create mode 100644 docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md create mode 100644 docs/research/2026-06-16-proactive-brief-scoring-report.json diff --git a/Makefile.toml b/Makefile.toml index 1cc9d93b..04068ebb 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -421,6 +421,9 @@ args = [ # | real-world-memory-summary | composite | | # | real-world-memory-summary-json | command | | # | real-world-memory-summary-report | command | | +# | real-world-memory-proactive-brief | composite | | +# | real-world-memory-proactive-brief-json | command | | +# | real-world-memory-proactive-brief-report | command | | # | real-world-memory-live-consolidation | command | | # | real-world-job-operator-ux | composite | | # | real-world-job-operator-ux-json | command | | @@ -883,6 +886,55 @@ args = [ "tmp/real-world-memory/memory-summary/report.md", ] +[tasks.real-world-memory-proactive-brief] +workspace = false +dependencies = [ + "real-world-memory-proactive-brief-report", +] + +[tasks.real-world-memory-proactive-brief-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/proactive_brief", + "--out", + "tmp/real-world-memory/proactive-brief/report.json", + "--run-id", + "real-world-memory-proactive-brief", + "--adapter-id", + "fixture_proactive_brief", + "--adapter-name", + "ELF proactive brief fixture", +] + +[tasks.real-world-memory-proactive-brief-report] +workspace = false +dependencies = [ + "real-world-memory-proactive-brief-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/proactive-brief/report.json", + "--out", + "tmp/real-world-memory/proactive-brief/report.md", +] + [tasks.real-world-memory-live-consolidation] workspace = false command = "bash" diff --git a/README.md b/README.md index 982fb341..f52c4bc3 100644 --- a/README.md +++ b/README.md @@ -152,17 +152,20 @@ provider-backed ELF evidence was required. its pinned Docker local embedding path and is reported as `wrong_result` when same-corpus evidence terms are missed; claude-mem and OpenViking non-retrieval coverage remain typed non-pass states. -- Real-world agent memory aggregate after XY-952: 50 fixture-backed - jobs across 14 suites, 45 pass, 0 incomplete, 5 blocked, 0 wrong-result, +- Real-world agent memory aggregate after XY-953: 55 fixture-backed + jobs across 15 suites, 49 pass, 0 incomplete, 6 blocked, 0 wrong-result, 0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are production-ops operator boundaries plus blocked OpenViking staged trajectory, - hierarchy selection, and recursive/context expansion measurement gates, not - hidden benchmark wins. The `core_archival_memory` suite passes 6 fixture jobs for - core block attachment, scope, provenance, stale-core detection, archival fallback, - and project-decision recovery; it does not create an ELF-over-Letta claim. The new + hierarchy selection, recursive/context expansion measurement gates, and the + private-corpus refresh blocker tied to XY-930, not hidden benchmark wins. The + `core_archival_memory` suite passes 6 fixture jobs for core block attachment, scope, + provenance, stale-core detection, archival fallback, and project-decision recovery; + it does not create an ELF-over-Letta claim. The `memory_summary` fixture passes 1 source-trace job for reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile entries; it - does not create a managed-memory parity claim. + does not create a managed-memory parity claim. The new `proactive_brief` fixture + scores 5 jobs, with 4 pass and 1 blocked private-corpus case; it does not create + Pulse or hosted managed-memory parity. - Full-suite live real-world adapter sweep after XY-926: ELF and qmd emit Docker-isolated `live_real_world` records for all 55 checked-in jobs across 13 suites through `cargo make real-world-memory-live-adapters`. Both keep the original @@ -268,6 +271,7 @@ Detailed evidence and interpretation: - [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md) - [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) - [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) +- [Proactive Brief Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/guide/single_user_production.md) - Benchmark contract: @@ -349,6 +353,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md) - [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) - [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) +- [Proactive Brief Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md) - [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md) diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index f4286e24..e1802f44 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -29,7 +29,7 @@ }, "run": { "status": "blocked", - "evidence": "The current fixture set reports 50 jobs across 14 suites: 45 pass, 0 incomplete, 5 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", + "evidence": "The current fixture set reports 55 jobs across 15 suites: 49 pass, 0 incomplete, 6 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", "command": "cargo make real-world-memory", "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, @@ -86,6 +86,16 @@ "status": "pass", "evidence": "Proposal-only consolidation fixtures are encoded and passing without source mutation." }, + { + "suite_id": "memory_summary", + "status": "pass", + "evidence": "The source-trace memory summary fixture is encoded and passing with freshness, rationale, tombstone, and unsupported-claim guards." + }, + { + "suite_id": "proactive_brief", + "status": "blocked", + "evidence": "The proactive brief suite scores 4 passing source-linked suggestions and 1 typed private-corpus refresh blocker tied to XY-930." + }, { "suite_id": "knowledge_compilation", "status": "pass", diff --git a/apps/elf-eval/fixtures/real_world_memory/proactive_brief/daily_project_brief.json b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/daily_project_brief.json new file mode 100644 index 00000000..b31ef1c6 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/daily_project_brief.json @@ -0,0 +1,267 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "proactive-daily-project-brief-001", + "suite": "proactive_brief", + "title": "Generate a daily project brief from current project memory", + "corpus": { + "corpus_id": "real-world-memory-proactive-brief-2026-06-16", + "profile": "synthetic", + "items": [ + { + "evidence_id": "daily-current-validation-gate", + "kind": "decision", + "text": "Current project decision: before review handoff, the ELF lane must run the proactive brief fixture command and targeted real_world_job_benchmark tests.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "daily_project_brief", + "evidence_id": "daily-current-validation-gate" + }, + "locator": { + "quote": "run the proactive brief fixture command" + } + }, + "created_at": "2026-06-16T04:00:00Z" + }, + { + "evidence_id": "daily-current-ledger-update", + "kind": "plan", + "text": "Current plan: update the XY-951 Dreaming-readiness stage ledger with the proactive brief benchmark delta and next optimization direction.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "daily_project_brief", + "evidence_id": "daily-current-ledger-update" + }, + "locator": { + "quote": "update the XY-951 Dreaming-readiness stage ledger" + } + }, + "created_at": "2026-06-16T04:05:00Z" + }, + { + "evidence_id": "daily-old-parity-trap", + "kind": "note", + "text": "Stale note: fixture-only proactive briefs prove parity with OpenAI Pulse and hosted managed products.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "daily_project_brief", + "evidence_id": "daily-old-parity-trap" + } + }, + "created_at": "2026-06-15T10:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_proactive_brief", + "answer": { + "content": "Daily brief: run the proactive brief benchmark command, keep the XY-951 ledger update next, and do not claim Pulse or hosted managed-product parity from fixture-only evidence.", + "claims": [ + { + "claim_id": "daily_validation_gate", + "text": "The next validation step is the proactive brief fixture command plus targeted real_world_job_benchmark tests.", + "evidence_ids": ["daily-current-validation-gate"], + "confidence": "high" + }, + { + "claim_id": "daily_ledger_update", + "text": "The XY-951 stage ledger must record the proactive brief benchmark delta.", + "evidence_ids": ["daily-current-ledger-update"], + "confidence": "high" + } + ], + "evidence_ids": ["daily-current-validation-gate", "daily-current-ledger-update"], + "proactive_briefs": [ + { + "brief_id": "brief-daily-project-2026-06-16", + "contract_schema": "elf.proactive_project_brief/v1", + "generated_at": "2026-06-16T04:30:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-953-fixture-agent", + "read_profile": "private_plus_project", + "brief_kind": "daily_project_brief", + "suggestions": [ + { + "suggestion_id": "daily-run-proactive-gate", + "suggestion_kind": "daily_project_brief", + "title": "Run the proactive brief benchmark gate", + "body": "Run the proactive brief fixture command before claiming the lane is validation-ready, then update the XY-951 ledger.", + "evidence_refs": ["daily-current-validation-gate", "daily-current-ledger-update"], + "freshness": { + "status": "current", + "observed_at": "2026-06-16T04:05:00Z", + "valid_from": "2026-06-16T04:00:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-16T04:30:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "action": { + "decision": "recommend", + "reason_code": "RECOMMEND_CURRENT_EVIDENCE_BOUND_BRIEF", + "reason": "Both source refs are current project-memory items and no tombstone or supersession source is selected." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "daily-current-validation-gate", + "status": "current", + "reason": "Required validation command source." + }, + { + "evidence_id": "daily-current-ledger-update", + "status": "current", + "reason": "Required ledger update source." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [ + { + "evidence_id": "daily-old-parity-trap", + "status": "stale", + "reason": "Fixture-only evidence cannot prove Pulse parity." + } + ], + "superseded_source_refs": [], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + } + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "daily-gate-recorded", + "ts": "2026-06-16T04:00:00Z", + "actor": "agent", + "action": "recorded_current_gate", + "evidence_ids": ["daily-current-validation-gate"], + "summary": "The current validation gate was recorded as proactive fixture plus targeted tests." + }, + { + "event_id": "daily-ledger-action-recorded", + "ts": "2026-06-16T04:05:00Z", + "actor": "agent", + "action": "recorded_ledger_action", + "evidence_ids": ["daily-current-ledger-update"], + "summary": "The ledger update remained the next optimization artifact." + } + ], + "prompt": { + "role": "user", + "content": "Generate a daily project brief with only source-linked current recommendations.", + "job_mode": "proactive_brief", + "constraints": [ + "cite_evidence", + "mark_currentness", + "include_action_rationale", + "do_not_claim_pulse_parity" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "daily_validation_gate", + "text": "The next validation step is the proactive brief fixture command plus targeted real_world_job_benchmark tests." + }, + { + "claim_id": "daily_ledger_update", + "text": "The XY-951 stage ledger must record the proactive brief benchmark delta." + } + ], + "must_not_include": [ + "fixture-only proactive briefs prove parity with OpenAI Pulse", + "fixture-only proactive briefs prove hosted managed-product parity" + ], + "evidence_links": { + "daily_validation_gate": ["daily-current-validation-gate"], + "daily_ledger_update": ["daily-current-ledger-update"] + }, + "answer_type": "proactive_project_brief", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "daily-current-validation-gate", + "claim_id": "daily_validation_gate", + "requirement": "cite", + "quote": "proactive brief fixture command" + }, + { + "evidence_id": "daily-current-ledger-update", + "claim_id": "daily_ledger_update", + "requirement": "cite", + "quote": "XY-951 Dreaming-readiness stage ledger" + } + ], + "negative_traps": [ + { + "trap_id": "daily-fixture-parity-trap", + "type": "stale_fact", + "evidence_ids": ["daily-old-parity-trap"], + "failure_if_used": true + } + ], + "proactive_brief": { + "required_suggestion_kinds": ["daily_project_brief"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Brief names the current validation and ledger actions." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Every proactive suggestion has evidence refs." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "The stale Pulse parity trap is not selected." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The suggestion is actionable for the current lane." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Freshness and currentness markers prevent stale facts from being current recommendations." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No hosted parity claim is supported by this fixture."], + "fallback_action": "defer_unsupported_claim" + }, + "tags": ["synthetic", "proactive_brief", "daily_project_brief", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/proactive_brief/private_corpus_refresh_blocked.json b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/private_corpus_refresh_blocked.json new file mode 100644 index 00000000..92ab7305 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/private_corpus_refresh_blocked.json @@ -0,0 +1,124 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "proactive-private-corpus-refresh-blocked-001", + "suite": "proactive_brief", + "title": "Block private-corpus refresh suggestions when no operator manifest exists", + "corpus": { + "corpus_id": "real-world-memory-proactive-private-refresh-2026-06-16", + "profile": "private_sanitized", + "items": [ + { + "evidence_id": "private-refresh-no-manifest", + "kind": "blocker", + "text": "Private corpus refresh blocker: no operator-owned private production corpus manifest is checked in or available to this fixture.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_corpus_refresh_blocked", + "evidence_id": "private-refresh-no-manifest" + }, + "locator": { + "quote": "no operator-owned private production corpus manifest" + } + }, + "created_at": "2026-06-16T06:20:00Z" + }, + { + "evidence_id": "xy930-private-input-boundary", + "kind": "issue", + "text": "XY-930 owns private-corpus and credentialed production gates when operator inputs exist; fixture-only output must not claim a private-corpus refresh pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_corpus_refresh_blocked", + "evidence_id": "xy930-private-input-boundary" + }, + "locator": { + "quote": "XY-930 owns private-corpus and credentialed production gates" + } + }, + "created_at": "2026-06-16T06:21:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "private-refresh-blocked-recorded", + "ts": "2026-06-16T06:20:00Z", + "actor": "agent", + "action": "recorded_private_input_blocker", + "evidence_ids": ["private-refresh-no-manifest", "xy930-private-input-boundary"], + "summary": "The private-corpus refresh suggestion remains blocked until operator-owned inputs exist." + } + ], + "prompt": { + "role": "user", + "content": "Suggest a private-corpus refresh when private inputs exist.", + "job_mode": "proactive_brief", + "constraints": [ + "do_not_require_private_corpus", + "block_without_operator_manifest", + "tie_blocker_to_xy930" + ] + }, + "expected_answer": { + "must_include": [], + "must_not_include": [ + "private-corpus refresh passed", + "private production corpus quality is proven" + ], + "evidence_links": {}, + "answer_type": "proactive_project_brief", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": true + }, + "required_evidence": [], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The fixture records the private-corpus refresh as blocked." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The blocker points to the missing manifest and XY-930 boundary." + }, + "uncertainty_handling": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "No private-corpus pass is claimed." + }, + "lifecycle_behavior": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The blocked state remains typed and not hidden as pass." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "No operator-owned private production corpus manifest is available." + ], + "fallback_action": "block_private_refresh_until_xy930_inputs_exist" + }, + "encoding": { + "status": "blocked", + "reason": "No operator-owned private production corpus manifest is available; private-corpus refresh suggestions stay blocked under XY-930.", + "follow_up": { + "title": "XY-930 private-corpus input gate", + "reason": "Run private-corpus and credentialed production gates only when operator-owned inputs exist." + } + }, + "tags": ["private_sanitized", "proactive_brief", "private_corpus_refresh", "xy930_blocked"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/proactive_brief/resume_work_brief.json b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/resume_work_brief.json new file mode 100644 index 00000000..64cebd93 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/resume_work_brief.json @@ -0,0 +1,251 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "proactive-resume-work-brief-001", + "suite": "proactive_brief", + "title": "Generate a resume-work brief from current handoff memory", + "corpus": { + "corpus_id": "real-world-memory-proactive-brief-2026-06-16", + "profile": "synthetic", + "items": [ + { + "evidence_id": "resume-current-handoff", + "kind": "handoff", + "text": "Current handoff: continue in branch y/elf-xy-953, add proactive brief fixtures and scoring, then run the proactive brief benchmark command.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resume_work_brief", + "evidence_id": "resume-current-handoff" + }, + "locator": { + "quote": "add proactive brief fixtures and scoring" + } + }, + "created_at": "2026-06-16T05:00:00Z" + }, + { + "evidence_id": "resume-current-validation", + "kind": "plan", + "text": "Current validation plan: run cargo make real-world-memory-proactive-brief and targeted elf-eval real_world_job_benchmark tests before phase completion.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resume_work_brief", + "evidence_id": "resume-current-validation" + }, + "locator": { + "quote": "real-world-memory-proactive-brief" + } + }, + "created_at": "2026-06-16T05:03:00Z" + }, + { + "evidence_id": "resume-stale-validation", + "kind": "note", + "text": "Stale handoff: only run the work_resume smoke and skip proactive brief scoring.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resume_work_brief", + "evidence_id": "resume-stale-validation" + } + }, + "created_at": "2026-06-15T05:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_proactive_brief", + "answer": { + "content": "Resume brief: stay on y/elf-xy-953, finish proactive brief fixture/scoring work, and validate with cargo make real-world-memory-proactive-brief plus targeted elf-eval tests.", + "claims": [ + { + "claim_id": "resume_current_handoff", + "text": "The current resume point is branch y/elf-xy-953 with proactive brief fixture and scoring work.", + "evidence_ids": ["resume-current-handoff"], + "confidence": "high" + }, + { + "claim_id": "resume_validation", + "text": "The validation plan includes cargo make real-world-memory-proactive-brief.", + "evidence_ids": ["resume-current-validation"], + "confidence": "high" + } + ], + "evidence_ids": ["resume-current-handoff", "resume-current-validation"], + "proactive_briefs": [ + { + "brief_id": "brief-resume-work-2026-06-16", + "contract_schema": "elf.proactive_project_brief/v1", + "generated_at": "2026-06-16T05:30:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-953-fixture-agent", + "read_profile": "private_plus_project", + "brief_kind": "resume_work", + "suggestions": [ + { + "suggestion_id": "resume-continue-proactive-brief", + "suggestion_kind": "resume_work", + "title": "Continue proactive brief scoring", + "body": "Continue the XY-953 fixture and runner scoring work on y/elf-xy-953, then run the proactive brief benchmark command.", + "evidence_refs": ["resume-current-handoff", "resume-current-validation"], + "freshness": { + "status": "current", + "observed_at": "2026-06-16T05:03:00Z", + "valid_from": "2026-06-16T05:00:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-16T05:30:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "action": { + "decision": "recommend", + "reason_code": "RECOMMEND_CURRENT_HANDOFF", + "reason": "The current handoff and validation plan agree on the same proactive brief work." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "resume-current-handoff", + "status": "current", + "reason": "Current work handoff." + }, + { + "evidence_id": "resume-current-validation", + "status": "current", + "reason": "Current validation command." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [ + { + "evidence_id": "resume-stale-validation", + "status": "stale", + "reason": "The proactive brief lane now has a direct command." + } + ], + "superseded_source_refs": [], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + } + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "resume-handoff-recorded", + "ts": "2026-06-16T05:00:00Z", + "actor": "agent", + "action": "recorded_handoff", + "evidence_ids": ["resume-current-handoff"], + "summary": "The current handoff pointed at proactive brief scoring." + } + ], + "prompt": { + "role": "user", + "content": "Generate a resume-work brief that identifies the current next action and validation command.", + "job_mode": "proactive_brief", + "constraints": ["cite_evidence", "mark_currentness", "include_action_rationale"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "resume_current_handoff", + "text": "The current resume point is branch y/elf-xy-953 with proactive brief fixture and scoring work." + }, + { + "claim_id": "resume_validation", + "text": "The validation plan includes cargo make real-world-memory-proactive-brief." + } + ], + "must_not_include": ["skip proactive brief scoring"], + "evidence_links": { + "resume_current_handoff": ["resume-current-handoff"], + "resume_validation": ["resume-current-validation"] + }, + "answer_type": "proactive_project_brief", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "resume-current-handoff", + "claim_id": "resume_current_handoff", + "requirement": "cite", + "quote": "proactive brief fixtures and scoring" + }, + { + "evidence_id": "resume-current-validation", + "claim_id": "resume_validation", + "requirement": "cite", + "quote": "cargo make real-world-memory-proactive-brief" + } + ], + "negative_traps": [ + { + "trap_id": "resume-stale-validation-trap", + "type": "stale_fact", + "evidence_ids": ["resume-stale-validation"], + "failure_if_used": true + } + ], + "proactive_brief": { + "required_suggestion_kinds": ["resume_work"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Brief identifies the current handoff and validation command." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The resume suggestion carries evidence refs." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "The stale validation trap is not used." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The brief gives a concrete resume action." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Currentness markers keep stale handoff content out." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No current handoff evidence is available."], + "fallback_action": "defer_resume_brief" + }, + "tags": ["synthetic", "proactive_brief", "resume_work", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_decision_audit.json b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_decision_audit.json new file mode 100644 index 00000000..5cb30dc6 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_decision_audit.json @@ -0,0 +1,218 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "proactive-stale-decision-audit-001", + "suite": "proactive_brief", + "title": "Warn about a stale project decision before suggesting work", + "corpus": { + "corpus_id": "real-world-memory-proactive-brief-2026-06-16", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stale-decision-old-gate", + "kind": "decision", + "text": "Historical decision: use only cargo make real-world-job-operator-ux to evaluate proactive readiness.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_decision_audit", + "evidence_id": "stale-decision-old-gate" + } + }, + "created_at": "2026-06-15T09:00:00Z" + }, + { + "evidence_id": "stale-decision-new-gate", + "kind": "decision", + "text": "Current decision: proactive readiness must use the direct real-world-memory-proactive-brief suite before any proactive brief pass claim.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_decision_audit", + "evidence_id": "stale-decision-new-gate" + }, + "locator": { + "quote": "direct real-world-memory-proactive-brief suite" + } + }, + "created_at": "2026-06-16T05:40:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_proactive_brief", + "answer": { + "content": "Stale decision audit: defer the old operator-ux-only readiness decision and use the direct real-world-memory-proactive-brief suite for any proactive pass claim.", + "claims": [ + { + "claim_id": "stale_decision_replaced", + "text": "The operator-ux-only proactive readiness decision is superseded by the direct proactive brief suite.", + "evidence_ids": ["stale-decision-old-gate", "stale-decision-new-gate"], + "confidence": "high" + } + ], + "evidence_ids": ["stale-decision-old-gate", "stale-decision-new-gate"], + "proactive_briefs": [ + { + "brief_id": "brief-stale-decision-audit-2026-06-16", + "contract_schema": "elf.proactive_project_brief/v1", + "generated_at": "2026-06-16T05:45:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-953-fixture-agent", + "read_profile": "private_plus_project", + "brief_kind": "stale_decision_audit", + "suggestions": [ + { + "suggestion_id": "audit-old-operator-ux-only-gate", + "suggestion_kind": "stale_decision_audit", + "title": "Defer the old operator-ux-only readiness gate", + "body": "Do not use the old operator-ux-only decision as current readiness evidence; it is superseded by the direct proactive brief suite.", + "evidence_refs": ["stale-decision-old-gate", "stale-decision-new-gate"], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-16T05:40:00Z", + "valid_from": "2026-06-15T09:00:00Z", + "valid_to": "2026-06-16T05:40:00Z", + "last_confirmed_at": "2026-06-16T05:45:00Z", + "superseded_by": ["stale-decision-new-gate"], + "tombstone_refs": [] + }, + "action": { + "decision": "defer", + "reason_code": "DEFER_SUPERSEDED_DECISION", + "reason": "The old decision is retained as history and must not be used as the current proactive-readiness gate." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "stale-decision-new-gate", + "status": "current", + "reason": "Current proactive-readiness gate." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [], + "superseded_source_refs": [ + { + "evidence_id": "stale-decision-old-gate", + "status": "superseded", + "reason": "Replaced by the direct proactive brief suite.", + "superseded_by": "stale-decision-new-gate" + } + ], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + } + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "stale-decision-replaced", + "ts": "2026-06-16T05:40:00Z", + "actor": "agent", + "action": "superseded_decision", + "evidence_ids": ["stale-decision-old-gate", "stale-decision-new-gate"], + "summary": "The direct proactive brief suite superseded the old operator-ux-only readiness gate." + } + ], + "prompt": { + "role": "user", + "content": "Audit stale project decisions before generating proactive suggestions.", + "job_mode": "proactive_brief", + "constraints": ["cite_evidence", "mark_currentness", "include_defer_reason"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "stale_decision_replaced", + "text": "The operator-ux-only proactive readiness decision is superseded by the direct proactive brief suite." + } + ], + "must_not_include": ["use only cargo make real-world-job-operator-ux to evaluate proactive readiness"], + "evidence_links": { + "stale_decision_replaced": ["stale-decision-old-gate", "stale-decision-new-gate"] + }, + "answer_type": "proactive_project_brief", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "stale-decision-old-gate", + "claim_id": "stale_decision_replaced", + "requirement": "cite", + "quote": "Historical decision" + }, + { + "evidence_id": "stale-decision-new-gate", + "claim_id": "stale_decision_replaced", + "requirement": "cite", + "quote": "direct real-world-memory-proactive-brief suite" + } + ], + "negative_traps": [ + { + "trap_id": "stale-decision-current-trap", + "type": "stale_fact", + "evidence_ids": ["stale-decision-old-gate"], + "failure_if_used": false + } + ], + "proactive_brief": { + "required_suggestion_kinds": ["stale_decision_audit"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Brief identifies the superseded decision." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The stale-decision warning cites old and new evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "The old decision is not presented as current." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The warning gives a defer reason." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Supersession markers are present." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No superseding decision is available."], + "fallback_action": "defer_stale_decision" + }, + "tags": ["synthetic", "proactive_brief", "stale_decision_audit", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_plan_preference_warning.json b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_plan_preference_warning.json new file mode 100644 index 00000000..11338f90 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_plan_preference_warning.json @@ -0,0 +1,316 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "proactive-stale-plan-preference-warning-001", + "suite": "proactive_brief", + "title": "Reject stale plan and preference suggestions after TTL invalidation", + "corpus": { + "corpus_id": "real-world-memory-proactive-brief-2026-06-16", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stale-plan-old", + "kind": "plan", + "text": "Old plan: publish the proactive brief report without running the new proactive brief benchmark.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_plan_preference_warning", + "evidence_id": "stale-plan-old" + } + }, + "created_at": "2026-06-15T12:00:00Z" + }, + { + "evidence_id": "stale-plan-ttl", + "kind": "trace", + "text": "TTL invalidation: the old proactive report publishing plan expired at 2026-06-16T00:00:00Z and must not be recommended as current work.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_plan_preference_warning", + "evidence_id": "stale-plan-ttl" + }, + "locator": { + "quote": "must not be recommended as current work" + } + }, + "created_at": "2026-06-16T00:00:00Z" + }, + { + "evidence_id": "current-plan-run-gate", + "kind": "plan", + "text": "Current plan: run the proactive brief benchmark command and targeted tests before publishing the report.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_plan_preference_warning", + "evidence_id": "current-plan-run-gate" + }, + "locator": { + "quote": "run the proactive brief benchmark command" + } + }, + "created_at": "2026-06-16T06:00:00Z" + }, + { + "evidence_id": "old-preference-long-brief", + "kind": "preference", + "text": "Historical preference: write long proactive briefs with broad product comparisons.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_plan_preference_warning", + "evidence_id": "old-preference-long-brief" + } + }, + "created_at": "2026-06-15T12:05:00Z" + }, + { + "evidence_id": "current-preference-concise-brief", + "kind": "preference", + "text": "Current preference: proactive briefs should be concise, evidence-linked, and avoid broad hosted-product parity claims.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_plan_preference_warning", + "evidence_id": "current-preference-concise-brief" + }, + "locator": { + "quote": "concise, evidence-linked" + } + }, + "created_at": "2026-06-16T06:02:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_proactive_brief", + "answer": { + "content": "Stale plan/preference warning: reject the expired publish-first plan, use the current run-gate plan, and prefer concise evidence-linked briefs without broad hosted-product parity claims.", + "claims": [ + { + "claim_id": "stale_plan_rejected", + "text": "The expired publish-first plan must not be recommended as current work.", + "evidence_ids": ["stale-plan-ttl"], + "confidence": "high" + }, + { + "claim_id": "current_preference_concise", + "text": "The current brief preference is concise and evidence-linked.", + "evidence_ids": ["current-preference-concise-brief"], + "confidence": "high" + } + ], + "evidence_ids": ["stale-plan-ttl", "current-plan-run-gate", "current-preference-concise-brief"], + "proactive_briefs": [ + { + "brief_id": "brief-stale-plan-preference-2026-06-16", + "contract_schema": "elf.proactive_project_brief/v1", + "generated_at": "2026-06-16T06:10:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-953-fixture-agent", + "read_profile": "private_plus_project", + "brief_kind": "stale_plan_preference_warning", + "suggestions": [ + { + "suggestion_id": "reject-expired-publish-first-plan", + "suggestion_kind": "stale_plan_preference_warning", + "title": "Reject the expired publish-first plan", + "body": "Do not publish the proactive report before running the new proactive brief benchmark; the old plan expired under TTL.", + "evidence_refs": ["stale-plan-old", "stale-plan-ttl", "current-plan-run-gate"], + "freshness": { + "status": "tombstoned", + "observed_at": "2026-06-16T00:00:00Z", + "valid_from": "2026-06-15T12:00:00Z", + "valid_to": "2026-06-16T00:00:00Z", + "last_confirmed_at": "2026-06-16T06:10:00Z", + "superseded_by": ["current-plan-run-gate"], + "tombstone_refs": ["stale-plan-ttl"] + }, + "action": { + "decision": "reject", + "reason_code": "REJECT_TTL_INVALIDATED_PLAN", + "reason": "The old publish-first plan has explicit TTL invalidation and a current replacement plan exists." + }, + "unsupported_claim_flags": [] + }, + { + "suggestion_id": "defer-long-comparison-preference", + "suggestion_kind": "stale_plan_preference_warning", + "title": "Defer long product-comparison prose", + "body": "Use concise evidence-linked proactive briefs and avoid broad hosted-product parity claims.", + "evidence_refs": ["old-preference-long-brief", "current-preference-concise-brief"], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-16T06:02:00Z", + "valid_from": "2026-06-15T12:05:00Z", + "valid_to": "2026-06-16T06:02:00Z", + "last_confirmed_at": "2026-06-16T06:10:00Z", + "superseded_by": ["current-preference-concise-brief"], + "tombstone_refs": [] + }, + "action": { + "decision": "defer", + "reason_code": "DEFER_SUPERSEDED_PREFERENCE", + "reason": "The old long-comparison preference is superseded by a concise evidence-linked preference." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "current-plan-run-gate", + "status": "current", + "reason": "Replacement current plan." + }, + { + "evidence_id": "current-preference-concise-brief", + "status": "current", + "reason": "Replacement current preference." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [], + "superseded_source_refs": [ + { + "evidence_id": "old-preference-long-brief", + "status": "superseded", + "reason": "Replaced by concise evidence-linked preference.", + "superseded_by": "current-preference-concise-brief" + } + ], + "tombstone_source_refs": [ + { + "evidence_id": "stale-plan-ttl", + "status": "tombstoned", + "reason": "TTL invalidation for old publish-first plan." + } + ], + "unsupported_claim_flags": [] + } + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "stale-plan-ttl-recorded", + "ts": "2026-06-16T00:00:00Z", + "actor": "agent", + "action": "ttl_invalidated_plan", + "evidence_ids": ["stale-plan-old", "stale-plan-ttl"], + "summary": "The old publish-first plan expired and must not be current work." + } + ], + "prompt": { + "role": "user", + "content": "Warn me about stale plans or preferences before making proactive suggestions.", + "job_mode": "proactive_brief", + "constraints": ["cite_evidence", "mark_tombstones", "include_reject_or_defer_reason"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "stale_plan_rejected", + "text": "The expired publish-first plan must not be recommended as current work." + }, + { + "claim_id": "current_preference_concise", + "text": "The current brief preference is concise and evidence-linked." + } + ], + "must_not_include": [ + "publish the proactive brief report without running the new proactive brief benchmark", + "write long proactive briefs with broad product comparisons" + ], + "evidence_links": { + "stale_plan_rejected": ["stale-plan-old", "stale-plan-ttl", "current-plan-run-gate"], + "current_preference_concise": [ + "old-preference-long-brief", + "current-preference-concise-brief" + ] + }, + "answer_type": "proactive_project_brief", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "stale-plan-ttl", + "claim_id": "stale_plan_rejected", + "requirement": "cite", + "quote": "must not be recommended as current work" + }, + { + "evidence_id": "current-preference-concise-brief", + "claim_id": "current_preference_concise", + "requirement": "cite", + "quote": "concise, evidence-linked" + } + ], + "negative_traps": [ + { + "trap_id": "ttl-plan-current-trap", + "type": "stale_fact", + "evidence_ids": ["stale-plan-old"], + "failure_if_used": false + } + ], + "proactive_brief": { + "required_suggestion_kinds": ["stale_plan_preference_warning"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Brief rejects the expired plan and names current preference." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Every stale warning carries source refs." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "TTL-invalidated content is not current." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The warning gives reject and defer rationale." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "TTL tombstone and supersession markers are preserved." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No TTL invalidation evidence is available."], + "fallback_action": "defer_stale_plan_warning" + }, + "tags": ["synthetic", "proactive_brief", "stale_plan_preference_warning", "fixture_backed"] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 2038b5c5..d93398c7 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -50,6 +50,7 @@ const SUITES: &[&str] = &[ "memory_evolution", "consolidation", "memory_summary", + "proactive_brief", "knowledge_compilation", "operator_debugging_ux", "capture_integration", @@ -150,6 +151,7 @@ struct RealWorldJob { encoding: JobEncoding, memory_evolution: Option, memory_summary: Option, + proactive_brief: Option, } #[derive(Debug, Deserialize)] @@ -363,6 +365,12 @@ struct MemorySummaryExpectation { required_categories: Vec, } +#[derive(Debug, Deserialize)] +struct ProactiveBriefExpectation { + #[serde(default)] + required_suggestion_kinds: Vec, +} + #[derive(Debug, Deserialize)] struct ScoringRubric { #[serde(default)] @@ -405,6 +413,8 @@ struct ProducedAnswer { pages: Vec, #[serde(default)] memory_summaries: Vec, + #[serde(default)] + proactive_briefs: Vec, #[serde(skip_serializing_if = "Option::is_none")] latency_ms: Option, #[serde(skip_serializing_if = "Option::is_none")] @@ -554,6 +564,42 @@ struct MemorySummarySourceTraceItem { superseded_by: Option, } +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ProactiveBriefArtifact { + brief_id: String, + contract_schema: String, + generated_at: String, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + brief_kind: String, + #[serde(default)] + suggestions: Vec, + source_trace: MemorySummarySourceTrace, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ProactiveSuggestion { + suggestion_id: String, + suggestion_kind: String, + title: String, + body: String, + #[serde(default)] + evidence_refs: Vec, + freshness: MemorySummaryFreshness, + action: ProactiveSuggestionAction, + #[serde(default)] + unsupported_claim_flags: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ProactiveSuggestionAction { + decision: String, + reason_code: String, + reason: String, +} + #[derive(Clone, Debug, Deserialize)] struct ConsolidationFixture { #[serde(default)] @@ -1035,6 +1081,8 @@ struct ReportSummary { #[serde(skip_serializing_if = "Option::is_none")] memory_summary: Option, #[serde(skip_serializing_if = "Option::is_none")] + proactive_brief: Option, + #[serde(skip_serializing_if = "Option::is_none")] knowledge: Option, } @@ -1084,6 +1132,38 @@ struct MemorySummaryReport { source_trace_tombstone_count: usize, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ProactiveBriefSummaryReport { + job_count: usize, + brief_count: usize, + suggestion_count: usize, + required_suggestion_kind_count: usize, + covered_required_suggestion_kind_count: usize, + missing_required_suggestion_kind_count: usize, + evidence_ref_required_count: usize, + evidence_ref_suggestion_count: usize, + evidence_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + action_rationale_count: usize, + action_rationale_coverage: f64, + recommended_count: usize, + deferred_count: usize, + rejected_count: usize, + current_suggestion_count: usize, + non_current_suggestion_count: usize, + stale_warning_count: usize, + invalid_current_suggestion_count: usize, + untraced_suggestion_count: usize, + unsupported_current_suggestion_count: usize, + tombstone_violation_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] struct KnowledgeSummary { job_count: usize, @@ -1160,6 +1240,8 @@ struct JobReport { knowledge: Option, #[serde(skip_serializing_if = "Option::is_none")] memory_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + proactive_brief: Option, trap_ids_used: Vec, dimension_scores: Vec, reason: String, @@ -1322,6 +1404,37 @@ struct MemorySummaryJobMetrics { source_trace_tombstone_count: usize, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ProactiveBriefJobMetrics { + brief_count: usize, + suggestion_count: usize, + required_suggestion_kind_count: usize, + covered_required_suggestion_kind_count: usize, + missing_required_suggestion_kind_count: usize, + evidence_ref_required_count: usize, + evidence_ref_suggestion_count: usize, + evidence_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + action_rationale_count: usize, + action_rationale_coverage: f64, + recommended_count: usize, + deferred_count: usize, + rejected_count: usize, + current_suggestion_count: usize, + non_current_suggestion_count: usize, + stale_warning_count: usize, + invalid_current_suggestion_count: usize, + untraced_suggestion_count: usize, + unsupported_current_suggestion_count: usize, + tombstone_violation_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] struct EvolutionSummary { stale_answer_count: usize, @@ -1388,6 +1501,7 @@ struct JobScoring { evolution: Option, consolidation: Option, memory_summary: Option, + proactive_brief: Option, } #[derive(Debug, Default)] @@ -1416,6 +1530,13 @@ struct FailureCounts { memory_summary_missing_rationale: usize, memory_summary_missing_categories: usize, memory_summary_unsupported_current_entries: usize, + proactive_brief_invalid_current_suggestions: usize, + proactive_brief_untraced_suggestions: usize, + proactive_brief_missing_freshness: usize, + proactive_brief_missing_action_rationale: usize, + proactive_brief_missing_kinds: usize, + proactive_brief_unsupported_current_suggestions: usize, + proactive_brief_tombstone_violations: usize, untraced_page_sections: usize, missed_stale_findings: usize, rebuild_failures: usize, @@ -1544,6 +1665,7 @@ fn validate_job(job: &RealWorldJob, path: &Path) -> Result<()> { validate_job_encoding(job, path)?; validate_memory_evolution(job, path)?; validate_memory_summary_expectation(job, path)?; + validate_proactive_brief_expectation(job, path)?; validate_trace_explainability(job, path)?; Ok(()) @@ -1823,6 +1945,9 @@ fn validate_adapter_response(job: &RealWorldJob, path: &Path) -> Result<()> { for summary in &adapter_response.answer.memory_summaries { validate_memory_summary_artifact(summary, path, &evidence_ids)?; } + for brief in &adapter_response.answer.proactive_briefs { + validate_proactive_brief_artifact(brief, path, &evidence_ids)?; + } if job.suite == "memory_summary" && adapter_response.answer.memory_summaries.is_empty() @@ -1833,6 +1958,15 @@ fn validate_adapter_response(job: &RealWorldJob, path: &Path) -> Result<()> { path.display() )); } + if job.suite == "proactive_brief" + && adapter_response.answer.proactive_briefs.is_empty() + && job.encoding.status.is_none() + { + return Err(eyre::eyre!( + "{} proactive_brief jobs must provide adapter_response.answer.proactive_briefs.", + path.display() + )); + } Ok(()) } @@ -2041,6 +2175,112 @@ fn validate_memory_summary_source_trace( Ok(()) } +fn validate_proactive_brief_artifact( + brief: &ProactiveBriefArtifact, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if brief.brief_id.trim().is_empty() + || brief.contract_schema != "elf.proactive_project_brief/v1" + || brief.generated_at.trim().is_empty() + || brief.tenant_id.trim().is_empty() + || brief.project_id.trim().is_empty() + || brief.agent_id.trim().is_empty() + || brief.read_profile.trim().is_empty() + || brief.brief_kind.trim().is_empty() + || brief.suggestions.is_empty() + { + return Err(eyre::eyre!("{} has an incomplete proactive brief.", path.display())); + } + + validate_optional_rfc3339(&brief.generated_at, path, brief.brief_id.as_str())?; + + for suggestion in &brief.suggestions { + validate_proactive_suggestion(suggestion, path, evidence_ids)?; + } + + validate_memory_summary_source_trace(&brief.source_trace, path, evidence_ids)?; + + Ok(()) +} + +fn validate_proactive_suggestion( + suggestion: &ProactiveSuggestion, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if suggestion.suggestion_id.trim().is_empty() + || suggestion.suggestion_kind.trim().is_empty() + || suggestion.title.trim().is_empty() + || suggestion.body.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete proactive suggestion.", path.display())); + } + if !is_proactive_suggestion_kind(suggestion.suggestion_kind.as_str()) { + return Err(eyre::eyre!( + "{} has unknown proactive suggestion kind {}.", + path.display(), + suggestion.suggestion_kind + )); + } + if !is_memory_summary_freshness_status(suggestion.freshness.status.as_str()) { + return Err(eyre::eyre!( + "{} has unknown proactive freshness status {}.", + path.display(), + suggestion.freshness.status + )); + } + if !is_proactive_action_decision(suggestion.action.decision.as_str()) { + return Err(eyre::eyre!( + "{} has unknown proactive action decision {}.", + path.display(), + suggestion.action.decision + )); + } + if suggestion.action.reason_code.trim().is_empty() || suggestion.action.reason.trim().is_empty() + { + return Err(eyre::eyre!("{} has incomplete proactive action rationale.", path.display())); + } + + for evidence_id in &suggestion.evidence_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for evidence_id in &suggestion.freshness.tombstone_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for flag in &suggestion.unsupported_claim_flags { + if !flag.is_object() { + return Err(eyre::eyre!( + "{} proactive unsupported-claim flags must be JSON objects.", + path.display() + )); + } + } + + validate_optional_summary_time( + path, + suggestion.freshness.observed_at.as_deref(), + suggestion.suggestion_id.as_str(), + )?; + validate_optional_summary_time( + path, + suggestion.freshness.valid_from.as_deref(), + suggestion.suggestion_id.as_str(), + )?; + validate_optional_summary_time( + path, + suggestion.freshness.valid_to.as_deref(), + suggestion.suggestion_id.as_str(), + )?; + validate_optional_summary_time( + path, + suggestion.freshness.last_confirmed_at.as_deref(), + suggestion.suggestion_id.as_str(), + )?; + + Ok(()) +} + fn validate_optional_summary_time(path: &Path, value: Option<&str>, id: &str) -> Result<()> { if let Some(value) = value { validate_optional_rfc3339(value, path, id)?; @@ -2076,6 +2316,21 @@ fn is_memory_summary_rationale_decision(decision: &str) -> bool { matches!(decision, "included" | "downgraded" | "excluded") } +fn is_proactive_suggestion_kind(kind: &str) -> bool { + matches!( + kind, + "daily_project_brief" + | "resume_work" + | "stale_decision_audit" + | "stale_plan_preference_warning" + | "private_corpus_refresh" + ) +} + +fn is_proactive_action_decision(decision: &str) -> bool { + matches!(decision, "recommend" | "defer" | "reject") +} + fn validate_scoring_rubric(job: &RealWorldJob, path: &Path) -> Result<()> { if !(0.0..=1.0).contains(&job.scoring_rubric.pass_threshold) { return Err(eyre::eyre!("{} has invalid pass_threshold.", path.display())); @@ -2278,6 +2533,31 @@ fn validate_memory_summary_expectation(job: &RealWorldJob, path: &Path) -> Resul Ok(()) } +fn validate_proactive_brief_expectation(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(brief) = &job.proactive_brief else { + if job.suite == "proactive_brief" && job.encoding.status.is_none() { + return Err(eyre::eyre!( + "{} proactive_brief jobs must provide proactive_brief expectations.", + path.display() + )); + } + + return Ok(()); + }; + + for kind in &brief.required_suggestion_kinds { + if !is_proactive_suggestion_kind(kind.as_str()) { + return Err(eyre::eyre!( + "{} proactive_brief expectation references unknown suggestion kind {}.", + path.display(), + kind + )); + } + } + + Ok(()) +} + fn validate_evolution_conflict( path: &Path, evidence_ids: &BTreeSet, @@ -2543,10 +2823,12 @@ fn score_job(job: &RealWorldJob) -> JobScoring { let missing_evidence = missing_required_evidence(job, &produced_evidence); let knowledge = knowledge_metrics(job, answer); let memory_summary = memory_summary_metrics(job, answer); + let proactive_brief = proactive_brief_metrics(job, answer); let mut unsupported_claims = unsupported_claims(job, answer); unsupported_claims.extend(unsupported_page_claims(answer)); unsupported_claims.extend(unsupported_memory_summary_claims(job, answer)); + unsupported_claims.extend(unsupported_proactive_suggestions(job, answer)); let operator_counts = operator_debug_failure_counts(job); let latency_violations = latency_violations(job, answer); @@ -2557,7 +2839,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring { .as_ref() .map_or(0, |report| report.conflict_count - report.conflict_detection_count); let update_rationale_missing = evolution.as_ref().map_or(0, update_rationale_missing_count); - let counts = FailureCounts { + let mut counts = FailureCounts { missing_claims: missing_claims.len(), forbidden_claims: forbidden_claims.len(), missing_evidence: missing_evidence.len(), @@ -2576,31 +2858,18 @@ fn score_job(job: &RealWorldJob) -> JobScoring { review_action_failures: review_action_failures(consolidation.as_ref()), source_mutations: consolidation.as_ref().map_or(0, |report| report.source_mutation_count), blocking_executable_gaps: blocking_executable_gaps(consolidation.as_ref()), - memory_summary_invalid_current_entries: memory_summary - .as_ref() - .map_or(0, |metrics| metrics.invalid_top_of_mind_count), - memory_summary_untraced_entries: memory_summary - .as_ref() - .map_or(0, |metrics| metrics.untraced_entry_count), - memory_summary_missing_freshness: memory_summary.as_ref().map_or(0, |metrics| { - metrics.entry_count.saturating_sub(metrics.freshness_marker_count) - }), - memory_summary_missing_rationale: memory_summary - .as_ref() - .map_or(0, |metrics| metrics.entry_count.saturating_sub(metrics.rationale_count)), - memory_summary_missing_categories: memory_summary - .as_ref() - .map_or(0, |metrics| metrics.missing_required_category_count), - memory_summary_unsupported_current_entries: memory_summary - .as_ref() - .map_or(0, |metrics| metrics.unsupported_current_entry_count), untraced_page_sections: knowledge .as_ref() .map_or(0, |metrics| metrics.untraced_section_count), missed_stale_findings: knowledge.as_ref().map_or(0, missed_stale_finding_count), rebuild_failures: knowledge.as_ref().map_or(0, |metrics| metrics.rebuild_failure_count), page_usefulness_failures: knowledge.as_ref().map_or(0, page_usefulness_failure_count), + ..FailureCounts::default() }; + + apply_memory_summary_failure_counts(&mut counts, memory_summary.as_ref()); + apply_proactive_brief_failure_counts(&mut counts, proactive_brief.as_ref()); + let dimension_scores = dimension_scores(job, &counts); let normalized_score = normalized_score(&dimension_scores); let wrong_result_count = wrong_result_count(&counts); @@ -2632,9 +2901,48 @@ fn score_job(job: &RealWorldJob) -> JobScoring { evolution, consolidation, memory_summary, + proactive_brief, } } +fn apply_memory_summary_failure_counts( + counts: &mut FailureCounts, + metrics: Option<&MemorySummaryJobMetrics>, +) { + let Some(metrics) = metrics else { + return; + }; + + counts.memory_summary_invalid_current_entries = metrics.invalid_top_of_mind_count; + counts.memory_summary_untraced_entries = metrics.untraced_entry_count; + counts.memory_summary_missing_freshness = + metrics.entry_count.saturating_sub(metrics.freshness_marker_count); + counts.memory_summary_missing_rationale = + metrics.entry_count.saturating_sub(metrics.rationale_count); + counts.memory_summary_missing_categories = metrics.missing_required_category_count; + counts.memory_summary_unsupported_current_entries = metrics.unsupported_current_entry_count; +} + +fn apply_proactive_brief_failure_counts( + counts: &mut FailureCounts, + metrics: Option<&ProactiveBriefJobMetrics>, +) { + let Some(metrics) = metrics else { + return; + }; + + counts.proactive_brief_invalid_current_suggestions = metrics.invalid_current_suggestion_count; + counts.proactive_brief_untraced_suggestions = metrics.untraced_suggestion_count; + counts.proactive_brief_missing_freshness = + metrics.suggestion_count.saturating_sub(metrics.freshness_marker_count); + counts.proactive_brief_missing_action_rationale = + metrics.suggestion_count.saturating_sub(metrics.action_rationale_count); + counts.proactive_brief_missing_kinds = metrics.missing_required_suggestion_kind_count; + counts.proactive_brief_unsupported_current_suggestions = + metrics.unsupported_current_suggestion_count; + counts.proactive_brief_tombstone_violations = metrics.tombstone_violation_count; +} + fn score_declared_job( job: &RealWorldJob, status: TypedStatus, @@ -2659,6 +2967,7 @@ fn score_declared_job( evolution, consolidation, memory_summary: None, + proactive_brief: None, } } @@ -2682,6 +2991,13 @@ fn wrong_result_count(counts: &FailureCounts) -> usize { + counts.memory_summary_missing_rationale + counts.memory_summary_missing_categories + counts.memory_summary_unsupported_current_entries + + counts.proactive_brief_invalid_current_suggestions + + counts.proactive_brief_untraced_suggestions + + counts.proactive_brief_missing_freshness + + counts.proactive_brief_missing_action_rationale + + counts.proactive_brief_missing_kinds + + counts.proactive_brief_unsupported_current_suggestions + + counts.proactive_brief_tombstone_violations + counts.untraced_page_sections + counts.missed_stale_findings + counts.rebuild_failures @@ -2736,6 +3052,7 @@ fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { evidence_ids: Vec::new(), pages: Vec::new(), memory_summaries: Vec::new(), + proactive_briefs: Vec::new(), latency_ms: None, cost: None, trace_explainability: None, @@ -2748,6 +3065,11 @@ fn produced_evidence_ids(answer: &ProducedAnswer) -> BTreeSet { for claim in &answer.claims { evidence.extend(claim.evidence_ids.iter().cloned()); } + for brief in &answer.proactive_briefs { + for suggestion in &brief.suggestions { + evidence.extend(suggestion.evidence_refs.iter().cloned()); + } + } evidence } @@ -3413,6 +3735,219 @@ fn unsupported_memory_summary_claims( .collect() } +fn proactive_brief_metrics( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Option { + if answer.proactive_briefs.is_empty() { + return None; + } + + let mut metrics = ProactiveBriefJobMetrics { + brief_count: answer.proactive_briefs.len(), + required_suggestion_kind_count: job + .proactive_brief + .as_ref() + .map_or(0, |brief| brief.required_suggestion_kinds.len()), + ..ProactiveBriefJobMetrics::default() + }; + let mut suggestion_kinds = BTreeSet::new(); + + for brief in &answer.proactive_briefs { + accumulate_proactive_brief_metrics(brief, &mut metrics, &mut suggestion_kinds); + } + + let covered_required_suggestion_kind_count = job.proactive_brief.as_ref().map_or(0, |brief| { + brief + .required_suggestion_kinds + .iter() + .filter(|kind| suggestion_kinds.contains(*kind)) + .count() + }); + + metrics.covered_required_suggestion_kind_count = covered_required_suggestion_kind_count; + metrics.missing_required_suggestion_kind_count = metrics + .required_suggestion_kind_count + .saturating_sub(covered_required_suggestion_kind_count); + metrics.evidence_ref_coverage = + ratio(metrics.evidence_ref_suggestion_count, metrics.evidence_ref_required_count); + metrics.freshness_coverage = ratio(metrics.freshness_marker_count, metrics.suggestion_count); + metrics.action_rationale_coverage = + ratio(metrics.action_rationale_count, metrics.suggestion_count); + + Some(metrics) +} + +fn accumulate_proactive_brief_metrics( + brief: &ProactiveBriefArtifact, + metrics: &mut ProactiveBriefJobMetrics, + suggestion_kinds: &mut BTreeSet, +) { + metrics.source_trace_selected_count += brief.source_trace.selected_source_refs.len(); + metrics.source_trace_dropped_count += brief.source_trace.dropped_source_refs.len(); + metrics.source_trace_stale_count += brief.source_trace.stale_source_refs.len(); + metrics.source_trace_superseded_count += brief.source_trace.superseded_source_refs.len(); + metrics.source_trace_tombstone_count += brief.source_trace.tombstone_source_refs.len(); + + let non_current_refs = memory_summary_non_current_trace_refs(&brief.source_trace); + let tombstone_refs = proactive_tombstone_trace_refs(&brief.source_trace); + + for suggestion in &brief.suggestions { + metrics.suggestion_count += 1; + metrics.evidence_ref_required_count += 1; + + suggestion_kinds.insert(suggestion.suggestion_kind.clone()); + + if suggestion.evidence_refs.is_empty() { + metrics.untraced_suggestion_count += 1; + } else { + metrics.evidence_ref_suggestion_count += 1; + } + if proactive_suggestion_has_freshness(suggestion) { + metrics.freshness_marker_count += 1; + } + if proactive_suggestion_has_action_rationale(suggestion) { + metrics.action_rationale_count += 1; + } + + accumulate_proactive_action_decision(suggestion.action.decision.as_str(), metrics); + + if suggestion.freshness.status == "current" { + metrics.current_suggestion_count += 1; + } else { + metrics.non_current_suggestion_count += 1; + } + if proactive_suggestion_is_stale_warning(suggestion) { + metrics.stale_warning_count += 1; + } + if proactive_suggestion_is_invalid_current(suggestion, &non_current_refs) { + metrics.invalid_current_suggestion_count += 1; + } + if proactive_suggestion_is_unsupported_current(suggestion) { + metrics.unsupported_current_suggestion_count += 1; + } + if proactive_suggestion_is_tombstone_violation(suggestion, &tombstone_refs) { + metrics.tombstone_violation_count += 1; + } + } +} + +fn proactive_tombstone_trace_refs(trace: &MemorySummarySourceTrace) -> BTreeSet<&str> { + trace.tombstone_source_refs.iter().map(|item| item.evidence_id.as_str()).collect() +} + +fn accumulate_proactive_action_decision(decision: &str, metrics: &mut ProactiveBriefJobMetrics) { + match decision { + "recommend" => metrics.recommended_count += 1, + "defer" => metrics.deferred_count += 1, + "reject" => metrics.rejected_count += 1, + _ => {}, + } +} + +fn proactive_suggestion_has_freshness(suggestion: &ProactiveSuggestion) -> bool { + if suggestion.freshness.status.trim().is_empty() { + return false; + } + + match suggestion.freshness.status.as_str() { + "superseded" => !suggestion.freshness.superseded_by.is_empty(), + "tombstoned" => !suggestion.freshness.tombstone_refs.is_empty(), + _ => true, + } +} + +fn proactive_suggestion_has_action_rationale(suggestion: &ProactiveSuggestion) -> bool { + !suggestion.action.decision.trim().is_empty() + && !suggestion.action.reason_code.trim().is_empty() + && !suggestion.action.reason.trim().is_empty() +} + +fn proactive_suggestion_is_stale_warning(suggestion: &ProactiveSuggestion) -> bool { + matches!( + suggestion.suggestion_kind.as_str(), + "stale_decision_audit" | "stale_plan_preference_warning" + ) && suggestion.freshness.status != "current" +} + +fn proactive_suggestion_is_invalid_current( + suggestion: &ProactiveSuggestion, + non_current_refs: &BTreeSet<&str>, +) -> bool { + suggestion.freshness.status == "current" + && (!suggestion.freshness.superseded_by.is_empty() + || !suggestion.freshness.tombstone_refs.is_empty() + || suggestion + .evidence_refs + .iter() + .any(|evidence_id| non_current_refs.contains(evidence_id.as_str()))) +} + +fn proactive_suggestion_is_unsupported_current(suggestion: &ProactiveSuggestion) -> bool { + !suggestion.unsupported_claim_flags.is_empty() + && (suggestion.action.decision == "recommend" || suggestion.freshness.status == "current") +} + +fn proactive_suggestion_is_tombstone_violation( + suggestion: &ProactiveSuggestion, + tombstone_refs: &BTreeSet<&str>, +) -> bool { + suggestion.freshness.status == "current" + && (!suggestion.freshness.tombstone_refs.is_empty() + || suggestion + .evidence_refs + .iter() + .any(|evidence_id| tombstone_refs.contains(evidence_id.as_str()))) +} + +fn unsupported_proactive_suggestions( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Vec { + answer + .proactive_briefs + .iter() + .flat_map(|brief| { + brief.suggestions.iter().filter_map(|suggestion| { + if suggestion.evidence_refs.is_empty() { + return Some(proactive_unsupported_claim_report( + job, + brief, + suggestion, + "proactive suggestion has no evidence refs", + )); + } + if proactive_suggestion_is_unsupported_current(suggestion) { + return Some(proactive_unsupported_claim_report( + job, + brief, + suggestion, + "unsupported proactive claim is still recommended or marked current", + )); + } + + None + }) + }) + .collect() +} + +fn proactive_unsupported_claim_report( + job: &RealWorldJob, + brief: &ProactiveBriefArtifact, + suggestion: &ProactiveSuggestion, + reason: &str, +) -> UnsupportedClaimReport { + UnsupportedClaimReport { + suite_id: job.suite.clone(), + job_id: job.job_id.clone(), + claim_id: Some(format!("{}:{}", brief.brief_id, suggestion.suggestion_id)), + claim_text: bounded_text(suggestion.body.as_str(), 240), + reason: reason.to_string(), + evidence_ids: suggestion.evidence_refs.clone(), + } +} + fn hard_fail_hits( job: &RealWorldJob, unsupported_claims: &[UnsupportedClaimReport], @@ -3488,19 +4023,28 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) || counts.memory_summary_invalid_current_entries > 0 || counts.memory_summary_missing_categories > 0 || counts.memory_summary_unsupported_current_entries > 0 + || counts.proactive_brief_invalid_current_suggestions > 0 + || counts.proactive_brief_missing_kinds > 0 + || counts.proactive_brief_unsupported_current_suggestions > 0 + || counts.proactive_brief_tombstone_violations > 0 || counts.page_usefulness_failures > 0, "evidence_grounding" => counts.missing_evidence > 0 || counts.unsupported_claims > 0 || counts.lineage_failures > 0 || counts.memory_summary_untraced_entries > 0 + || counts.proactive_brief_untraced_suggestions > 0 || counts.untraced_page_sections > 0, "trap_avoidance" => counts.trap_uses > 0 || counts.memory_summary_invalid_current_entries > 0 + || counts.proactive_brief_invalid_current_suggestions > 0 + || counts.proactive_brief_tombstone_violations > 0 || counts.missed_stale_findings > 0, "uncertainty_handling" => - counts.unsupported_claims > 0 || counts.memory_summary_unsupported_current_entries > 0, + counts.unsupported_claims > 0 + || counts.memory_summary_unsupported_current_entries > 0 + || counts.proactive_brief_unsupported_current_suggestions > 0, "lifecycle_behavior" => counts.stale_answers > 0 || counts.conflict_detection_missing > 0 @@ -3510,6 +4054,11 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) || counts.memory_summary_missing_freshness > 0 || counts.memory_summary_missing_rationale > 0 || counts.memory_summary_unsupported_current_entries > 0 + || counts.proactive_brief_invalid_current_suggestions > 0 + || counts.proactive_brief_missing_freshness > 0 + || counts.proactive_brief_missing_action_rationale > 0 + || counts.proactive_brief_unsupported_current_suggestions > 0 + || counts.proactive_brief_tombstone_violations > 0 || counts.rebuild_failures > 0, "source_immutability" => counts.source_mutations > 0, "proposal_usefulness" => counts.proposal_usefulness_failures > 0, @@ -3681,6 +4230,7 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { trace_explainability: answer.trace_explainability.clone(), knowledge: scoring.knowledge, memory_summary: scoring.memory_summary, + proactive_brief: scoring.proactive_brief, trap_ids_used: scoring.trap_ids_used, dimension_scores: scoring.dimension_scores, reason: scoring.reason, @@ -4183,6 +4733,7 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { .sum(), consolidation: consolidation_summary(jobs), memory_summary: memory_summary_summary(jobs), + proactive_brief: proactive_brief_summary(jobs), knowledge: knowledge_summary(jobs), ..ReportSummary::default() }; @@ -4392,6 +4943,100 @@ fn memory_summary_summary(jobs: &[JobReport]) -> Option { }) } +fn proactive_brief_summary(jobs: &[JobReport]) -> Option { + let proactive_jobs = + jobs.iter().filter_map(|job| job.proactive_brief.as_ref()).collect::>(); + + if proactive_jobs.is_empty() { + return None; + } + + let job_count = proactive_jobs.len(); + let suggestion_count = + proactive_jobs.iter().map(|metrics| metrics.suggestion_count).sum::(); + let evidence_ref_required_count = + proactive_jobs.iter().map(|metrics| metrics.evidence_ref_required_count).sum(); + let evidence_ref_suggestion_count = + proactive_jobs.iter().map(|metrics| metrics.evidence_ref_suggestion_count).sum(); + let freshness_marker_count = + proactive_jobs.iter().map(|metrics| metrics.freshness_marker_count).sum(); + let action_rationale_count = + proactive_jobs.iter().map(|metrics| metrics.action_rationale_count).sum(); + + Some(ProactiveBriefSummaryReport { + job_count, + brief_count: proactive_jobs.iter().map(|metrics| metrics.brief_count).sum(), + suggestion_count, + required_suggestion_kind_count: proactive_jobs + .iter() + .map(|metrics| metrics.required_suggestion_kind_count) + .sum(), + covered_required_suggestion_kind_count: proactive_jobs + .iter() + .map(|metrics| metrics.covered_required_suggestion_kind_count) + .sum(), + missing_required_suggestion_kind_count: proactive_jobs + .iter() + .map(|metrics| metrics.missing_required_suggestion_kind_count) + .sum(), + evidence_ref_required_count, + evidence_ref_suggestion_count, + evidence_ref_coverage: ratio(evidence_ref_suggestion_count, evidence_ref_required_count), + freshness_marker_count, + freshness_coverage: ratio(freshness_marker_count, suggestion_count), + action_rationale_count, + action_rationale_coverage: ratio(action_rationale_count, suggestion_count), + recommended_count: proactive_jobs.iter().map(|metrics| metrics.recommended_count).sum(), + deferred_count: proactive_jobs.iter().map(|metrics| metrics.deferred_count).sum(), + rejected_count: proactive_jobs.iter().map(|metrics| metrics.rejected_count).sum(), + current_suggestion_count: proactive_jobs + .iter() + .map(|metrics| metrics.current_suggestion_count) + .sum(), + non_current_suggestion_count: proactive_jobs + .iter() + .map(|metrics| metrics.non_current_suggestion_count) + .sum(), + stale_warning_count: proactive_jobs.iter().map(|metrics| metrics.stale_warning_count).sum(), + invalid_current_suggestion_count: proactive_jobs + .iter() + .map(|metrics| metrics.invalid_current_suggestion_count) + .sum(), + untraced_suggestion_count: proactive_jobs + .iter() + .map(|metrics| metrics.untraced_suggestion_count) + .sum(), + unsupported_current_suggestion_count: proactive_jobs + .iter() + .map(|metrics| metrics.unsupported_current_suggestion_count) + .sum(), + tombstone_violation_count: proactive_jobs + .iter() + .map(|metrics| metrics.tombstone_violation_count) + .sum(), + source_trace_selected_count: proactive_jobs + .iter() + .map(|metrics| metrics.source_trace_selected_count) + .sum(), + source_trace_dropped_count: proactive_jobs + .iter() + .map(|metrics| metrics.source_trace_dropped_count) + .sum(), + source_trace_stale_count: proactive_jobs + .iter() + .map(|metrics| metrics.source_trace_stale_count) + .sum(), + source_trace_superseded_count: proactive_jobs + .iter() + .map(|metrics| metrics.source_trace_superseded_count) + .sum(), + source_trace_tombstone_count: proactive_jobs + .iter() + .map(|metrics| metrics.source_trace_tombstone_count) + .sum(), + }) +} + fn knowledge_summary(jobs: &[JobReport]) -> Option { let knowledge_jobs = jobs.iter().filter_map(|job| job.knowledge.as_ref()).collect::>(); @@ -5103,6 +5748,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { render_markdown_trace_explainability(&mut out, report); render_markdown_consolidation(&mut out, report); render_markdown_memory_summary(&mut out, report); + render_markdown_proactive_brief(&mut out, report); render_markdown_knowledge(&mut out, report); render_markdown_unsupported_claims(&mut out, report); render_markdown_follow_ups(&mut out, report); @@ -5449,6 +6095,30 @@ fn render_markdown_optional_summary_metrics(out: &mut String, summary: &ReportSu memory_summary.unsupported_current_entry_count )); } + if let Some(proactive) = &summary.proactive_brief { + out.push_str(&format!( + "- Proactive brief suggestions: `{}` across `{}` artifact(s)\n", + proactive.suggestion_count, proactive.brief_count + )); + out.push_str(&format!( + "- Proactive evidence-ref coverage: `{}/{}` (`{:.3}`)\n", + proactive.evidence_ref_suggestion_count, + proactive.evidence_ref_required_count, + proactive.evidence_ref_coverage + )); + out.push_str(&format!( + "- Proactive freshness/action rationale coverage: `{:.3}` / `{:.3}`\n", + proactive.freshness_coverage, proactive.action_rationale_coverage + )); + out.push_str(&format!( + "- Proactive stale/currentness violations: `{}` invalid current, `{}` tombstone violation(s)\n", + proactive.invalid_current_suggestion_count, proactive.tombstone_violation_count + )); + out.push_str(&format!( + "- Proactive rejected/deferred suggestions: `{}` rejected, `{}` deferred\n", + proactive.rejected_count, proactive.deferred_count + )); + } } fn render_markdown_quality_summary(out: &mut String, report: &RealWorldReport) { @@ -5922,6 +6592,47 @@ fn render_markdown_memory_summary(out: &mut String, report: &RealWorldReport) { out.push('\n'); } +fn render_markdown_proactive_brief(out: &mut String, report: &RealWorldReport) { + let proactive_jobs = + report.jobs.iter().filter(|job| job.proactive_brief.is_some()).collect::>(); + + if proactive_jobs.is_empty() { + return; + } + + out.push_str("## Proactive Brief Metrics\n\n"); + out.push_str("| Job | Briefs | Suggestions | Kinds | Evidence Coverage | Freshness | Action Rationale | Invalid Current | Untraced | Unsupported Current | Tombstone Violations | Rejected | Deferred |\n"); + out.push_str( + "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n", + ); + + for job in proactive_jobs { + let Some(metrics) = &job.proactive_brief else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | {} | `{}/{}` | `{:.3}` | `{:.3}` | `{:.3}` | {} | {} | {} | {} | {} | {} |\n", + md_cell(job.job_id.as_str()), + metrics.brief_count, + metrics.suggestion_count, + metrics.covered_required_suggestion_kind_count, + metrics.required_suggestion_kind_count, + metrics.evidence_ref_coverage, + metrics.freshness_coverage, + metrics.action_rationale_coverage, + metrics.invalid_current_suggestion_count, + metrics.untraced_suggestion_count, + metrics.unsupported_current_suggestion_count, + metrics.tombstone_violation_count, + metrics.rejected_count, + metrics.deferred_count + )); + } + + out.push('\n'); +} + fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport) { out.push_str("## Unsupported Claims\n\n"); @@ -5993,6 +6704,7 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { out.push_str("- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed.\n\n"); out.push_str("For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. Flagged unsupported summaries are counted separately from hidden unsupported claims.\n\n"); out.push_str("For `memory_summary` jobs, summary artifacts are derived review surfaces. Top-of-mind entries must be current, included or downgraded entries must carry source refs, and derived project-profile entries must either cite sources or be explicitly flagged as unsupported.\n\n"); + out.push_str("For `proactive_brief` jobs, brief artifacts are fixture-scored derived outputs, not scheduled UI behavior. Every suggestion must carry evidence refs, freshness/currentness metadata, and an action rationale; stale, superseded, or tombstoned sources must not be presented as current recommendations.\n\n"); out.push_str("## Suites With `not_encoded` Status\n\n"); if report.not_encoded_suites.is_empty() { diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 60c020c8..37e99898 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -60,6 +60,10 @@ fn memory_summary_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("memory_summary") } +fn proactive_brief_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("proactive_brief") +} + fn knowledge_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("knowledge") } @@ -701,13 +705,13 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(21) + Some(22) ); assert_eq!( report .pointer("/external_adapters/summary/suite_status_counts/pass") .and_then(Value::as_u64), - Some(26) + Some(27) ); assert_eq!( report @@ -1022,11 +1026,12 @@ fn assert_elf_fixture_adapter_record(adapter: &Value) -> Result<()> { assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert!(adapter.pointer("/run/evidence").and_then(Value::as_str).is_some_and(|evidence| { - evidence.contains("50 jobs across 14 suites") - && evidence.contains("45 pass") - && evidence.contains("5 blocked") + evidence.contains("55 jobs across 15 suites") + && evidence.contains("49 pass") + && evidence.contains("6 blocked") && evidence.contains("core_archival_memory") && evidence.contains("memory_summary") + && evidence.contains("proactive_brief") && evidence.contains("context_trajectory") })); @@ -2231,7 +2236,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(50)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(55)); Ok(()) } @@ -2685,11 +2690,11 @@ fn assert_current_report_text_boundaries( comparison_external_projects .contains("Benchmark-grounded for local same-corpus retrieval, reindex/update/delete") ); - assert!(iteration_direction.contains("| Jobs | `50` |")); - assert!(iteration_direction.contains("| Encoded suites | `14` |")); - assert!(iteration_direction.contains("| Pass | `45` |")); - assert!(iteration_direction.contains("| Evidence coverage | `115/115` |")); - assert!(iteration_direction.contains("| Expected evidence recall | `107/107` |")); + assert!(iteration_direction.contains("| Jobs | `55` |")); + assert!(iteration_direction.contains("| Encoded suites | `15` |")); + assert!(iteration_direction.contains("| Pass | `49` |")); + assert!(iteration_direction.contains("| Evidence coverage | `123/123` |")); + assert!(iteration_direction.contains("| Expected evidence recall | `115/115` |")); for stale_phrase in [ "same live sweep shape as ELF", @@ -2700,7 +2705,12 @@ fn assert_current_report_text_boundaries( "The qmd live real-world slice covers representative jobs only", "| Jobs | `40` |", "| Encoded suites | `11` |", + "| Jobs | `50` |", + "| Encoded suites | `14` |", "| Pass | `38` |", + "| Pass | `45` |", + "| Evidence coverage | `115/115` |", + "| Expected evidence recall | `107/107` |", "history/UI/hosted/graph behavior remains", "current local adapter is incomplete/wrong-result", "current adapter is incomplete/invalid-result", @@ -3672,14 +3682,14 @@ fn assert_measurement_audit_adapter_status_counts(markdown: &str) { fn assert_iteration_direction_current_measurement_counts(markdown: &str) { for expected in [ - "| Jobs | `50` |", - "| Encoded suites | `14` |", - "| Blocked | `5` |", - "| Mean score | `0.900` |", - "| Evidence coverage | `115/115` |", - "| Source-ref coverage | `115/115` |", - "| Quote coverage | `115/115` |", - "| Expected evidence recall | `107/107` |", + "| Jobs | `55` |", + "| Encoded suites | `15` |", + "| Blocked | `6` |", + "| Mean score | `0.891` |", + "| Evidence coverage | `123/123` |", + "| Source-ref coverage | `123/123` |", + "| Quote coverage | `123/123` |", + "| Expected evidence recall | `115/115` |", "| `blocked` | `7` |", "| `not_encoded` | `5` |", "`live_baseline_only`, `fixture_backed`, and `research_gate`", @@ -3690,9 +3700,14 @@ fn assert_iteration_direction_current_measurement_counts(markdown: &str) { for stale in [ "| Jobs | `40` |", "| Encoded suites | `11` |", + "| Jobs | `50` |", + "| Encoded suites | `14` |", "| Mean score | `0.950` |", + "| Mean score | `0.900` |", "| Evidence coverage | `88/88` |", + "| Evidence coverage | `115/115` |", "| Expected evidence recall | `80/80` |", + "| Expected evidence recall | `107/107` |", "| `blocked` | `5` |", "| `not_encoded` | `7` |", "`live_baseline_only` plus `research_gate`", @@ -4123,13 +4138,15 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) - "/summary/improved", "memory_summary_top_of_mind_behavior" )?); + assert!(array_contains_str(ledger, "/summary/improved", "proactive_brief_readiness")?); assert!(array_at(ledger, "/summary/regressed")?.is_empty()); assert!(array_contains_str(ledger, "/summary/unchanged", "deletion_ttl_tombstone_behavior")?); assert!(array_contains_str(ledger, "/summary/unchanged", "final_competitor_retest_status")?); assert!(array_contains_str(ledger, "/summary/blocked", "scheduled_memory_task_readiness")?); - assert!(array_contains_str(ledger, "/summary/not_tested", "proactive_brief_readiness")?); + assert!(array_at(ledger, "/summary/not_tested")?.is_empty()); assert_dreaming_memory_summary_stage(stages)?; + assert_dreaming_proactive_brief_stage(stages)?; Ok(()) } @@ -4157,13 +4174,60 @@ fn assert_dreaming_memory_summary_stage(stages: &[Value]) -> Result<()> { Ok(()) } +fn assert_dreaming_proactive_brief_stage(stages: &[Value]) -> Result<()> { + let proactive_stage = find_by_field(stages, "/stage_id", "proactive_brief_readiness")?; + + assert_eq!( + proactive_stage.pointer("/comparison_judgment").and_then(Value::as_str), + Some("improved") + ); + assert_eq!(proactive_stage.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); + assert_eq!( + proactive_stage.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + proactive_stage.pointer("/post_stage_counts/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + proactive_stage.pointer("/post_stage_counts/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + proactive_stage + .pointer("/post_stage_counts/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + proactive_stage + .pointer("/post_stage_counts/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); + assert!( + proactive_stage + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("five proactive_brief fixture jobs") + && basis.contains("typed private-corpus refresh blocker")) + ); + + Ok(()) +} + fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) { assert!( markdown.contains("`improved`: current-vs-historical correctness, preference evolution") && markdown.contains("reviewable") - && markdown.contains("consolidation, and memory-summary/top-of-mind fixture readback") + && markdown.contains("proactive brief") ); assert!(markdown.contains("memory-summary/top-of-mind fixture readback")); + assert!(markdown.contains("XY-953 adds a direct `proactive_brief` suite")); + assert!(markdown.contains( + "Do not claim fixture-backed proactive brief scoring proves OpenAI Pulse parity" + )); assert!(markdown.contains("`regressed`: none")); assert!(markdown.contains("the XY-905 run passes all six memory-evolution jobs")); assert!(markdown.contains("XY-952 adds a reviewable `elf.memory_summary/v1`")); @@ -4474,6 +4538,207 @@ fn memory_summary_fixture_fails_tombstone_entries_without_tombstone_refs() -> Re Ok(()) } +#[test] +fn proactive_brief_fixtures_score_source_linked_suggestions() -> Result<()> { + let report = run_json_report_from(proactive_brief_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/proactive_brief/brief_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/suggestion_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/invalid_current_suggestion_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/rejected_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/deferred_count").and_then(Value::as_u64), + Some(2) + ); + + let suites = array_at(&report, "/suites")?; + let proactive = find_by_field(suites, "/suite_id", "proactive_brief")?; + + assert_eq!(proactive.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(proactive.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + + let jobs = array_at(&report, "/jobs")?; + let daily = find_by_field(jobs, "/job_id", "proactive-daily-project-brief-001")?; + let private = find_by_field(jobs, "/job_id", "proactive-private-corpus-refresh-blocked-001")?; + + assert_eq!(daily.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + daily.pointer("/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!(private.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + report + .pointer("/follow_ups/0/title") + .and_then(Value::as_str) + .is_some_and(|title| title.contains("XY-930")) + ); + + Ok(()) +} + +#[test] +fn proactive_brief_markdown_renders_source_and_freshness_metrics() -> Result<()> { + let report = run_json_report_from(proactive_brief_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-proactive-brief-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("proactive-brief-report.json"); + let markdown_path = temp_dir.join("proactive-brief-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Proactive Brief Metrics")); + assert!(markdown.contains("proactive-daily-project-brief-001")); + assert!(markdown.contains("Proactive evidence-ref coverage")); + assert!(markdown.contains("Invalid Current")); + assert!(markdown.contains("Tombstone Violations")); + + Ok(()) +} + +#[test] +fn proactive_brief_fixture_fails_unsupported_suggestions() -> Result<()> { + let fixture_path = proactive_brief_fixture_dir().join("daily_project_brief.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["evidence_refs"] = + Value::Array(Vec::new()); + + let temp_dir = + env::temp_dir().join(format!("elf-proactive-unsupported-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("unsupported_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "proactive-daily-project-brief-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); + assert_eq!( + job.pointer("/proactive_brief/untraced_suggestion_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn proactive_brief_fixture_fails_stale_decisions_presented_current() -> Result<()> { + let fixture_path = proactive_brief_fixture_dir().join("stale_decision_audit.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["freshness"] + ["status"] = Value::String("current".to_string()); + + let temp_dir = + env::temp_dir().join(format!("elf-proactive-stale-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("stale_current_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "proactive-stale-decision-audit-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/proactive_brief/invalid_current_suggestion_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn proactive_brief_fixture_fails_tombstone_ttl_violations() -> Result<()> { + let fixture_path = proactive_brief_fixture_dir().join("stale_plan_preference_warning.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["freshness"] + ["status"] = Value::String("current".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["action"] + ["decision"] = Value::String("recommend".to_string()); + + let temp_dir = env::temp_dir().join(format!("elf-proactive-tombstone-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("tombstone_current_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "proactive-stale-plan-preference-warning-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/proactive_brief/tombstone_violation_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + #[test] fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { let report = run_json_report_from(production_ops_fixture_dir())?; @@ -4633,12 +4898,12 @@ fn assert_root_knowledge_summary(report: &Value) { } fn assert_root_aggregate_summary(report: &Value) { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(50)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(14)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(45)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(55)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(15)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(49)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(6)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); @@ -4678,11 +4943,11 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(115) + Some(123) ); assert_eq!( report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), - Some(115) + Some(123) ); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); @@ -4723,6 +4988,44 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_root_knowledge_summary(report); + assert_root_proactive_brief_summary(report); +} + +fn assert_root_proactive_brief_summary(report: &Value) { + assert_eq!( + report.pointer("/summary/proactive_brief/job_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/suggestion_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/invalid_current_suggestion_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); } fn assert_root_aggregate_suites(report: &Value) -> Result<()> { @@ -4773,6 +5076,11 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + let proactive = find_by_field(suites, "/suite_id", "proactive_brief")?; + + assert_eq!(proactive.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(proactive.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + let context_trajectory = find_by_field(suites, "/suite_id", "context_trajectory")?; assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 35786e4f..c893db22 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -88,8 +88,9 @@ results, or lifecycle failures into one aggregate leaderboard. | Command or run | Artifact | Supported claim | | --- | --- | --- | -| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` plus XY-952 fixture update | ELF fixture aggregate covers 50 jobs across 14 suites with 45 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates, including 6 passing `core_archival_memory` jobs and 1 passing `memory_summary` source-trace job. | +| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` plus XY-952 and XY-953 fixture updates | ELF fixture aggregate covers 55 jobs across 15 suites with 49 pass and 6 blocked production-ops, private-corpus, or OpenViking context-trajectory measurement gates, including 6 passing `core_archival_memory` jobs, 1 passing `memory_summary` source-trace job, and 4 passing `proactive_brief` suggestion jobs plus 1 private-corpus blocker. | | `cargo make real-world-memory-summary` | `tmp/real-world-memory/memory-summary/report.json` | The memory summary fixture scores reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile entries with source refs, freshness metadata, rationale, and unsupported-claim flags; this is fixture-backed contract evidence, not managed-memory parity. | +| `cargo make real-world-memory-proactive-brief` | `tmp/real-world-memory/proactive-brief/report.json` and `2026-06-16-proactive-brief-scoring-report.md` | The proactive brief fixture scores daily project brief, resume-work brief, stale decision audit, stale plan/preference warning, and private-corpus refresh blocker scenarios with evidence refs, freshness/currentness markers, action rationale, and stale/tombstone guards; this is fixture-backed contract evidence, not Pulse or hosted managed-memory parity. | | `cargo make real-world-memory-core-archival` | `tmp/real-world-memory/core-archival/report.json` | ELF core-block behavior is scored separately from archival note search for attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-capture-write-policy-live-report.md` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, while agentmemory and claude-mem capture breadth are blocked until durable hook/viewer evidence exists. | diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index fea85347..80b7620e 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -31,13 +31,15 @@ Current boundary: live pass. The fresh ELF sweep produced 40 jobs with 22 pass, 5 wrong_result, 0 incomplete, 2 blocked, and 11 not_encoded; the fresh qmd sweep produced 17 pass, 6 wrong_result, 0 incomplete, 2 blocked, and 15 not_encoded. -- ELF fixture evidence is strong: `cargo make real-world-memory` reports 50 jobs - across 14 suites with 45 pass and 5 blocked production-ops or OpenViking - context-trajectory measurement gates. The `core_archival_memory` suite contributes - 6 fixture-only passes for ELF core-block behavior; it does not create an - ELF-over-Letta claim. The `memory_summary` suite contributes one fixture-backed - source-trace pass; it does not create managed-memory parity evidence. This proves - the fixture contract, not live-service parity. +- ELF fixture evidence is strong: `cargo make real-world-memory` reports 55 jobs + across 15 suites with 49 pass and 6 blocked production-ops, private-corpus, or + OpenViking context-trajectory measurement gates. The `core_archival_memory` suite + contributes 6 fixture-only passes for ELF core-block behavior; it does not create + an ELF-over-Letta claim. The `memory_summary` suite contributes one fixture-backed + source-trace pass; it does not create managed-memory parity evidence. The + `proactive_brief` suite contributes four fixture-backed source-linked suggestion + passes and one private-corpus blocker; it does not create Pulse or hosted + managed-memory parity. This proves the fixture contract, not live-service parity. - qmd is the strongest measured local retrieval-debug comparison, but the current evidence still separates its same-corpus/live-retrieval strengths from the full-suite live non-pass sweep. diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index f919f5d7..7c03cb74 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -44,23 +44,26 @@ The strongest current statement is: | Metric | Value | | --- | ---: | -| Jobs | `50` | -| Encoded suites | `14` | -| Pass | `45` | -| Blocked | `5` | +| Jobs | `55` | +| Encoded suites | `15` | +| Pass | `49` | +| Blocked | `6` | | Wrong result | `0` | | Lifecycle fail | `0` | | Incomplete | `0` | | Not encoded | `0` | | Unsupported claim | `0` | -| Mean score | `0.900` | -| Evidence coverage | `115/115` | -| Source-ref coverage | `115/115` | -| Quote coverage | `115/115` | -| Expected evidence recall | `107/107` | +| Mean score | `0.891` | +| Evidence coverage | `123/123` | +| Source-ref coverage | `123/123` | +| Quote coverage | `123/123` | +| Expected evidence recall | `115/115` | This proves the fixture contract is broad and well controlled. It does not prove that every live adapter or every competitor runtime passes those scenarios. +The new `proactive_brief` fixture slice contributes four passing evidence-linked +suggestion jobs and one typed private-corpus blocker tied to XY-930; it does not +prove Pulse or hosted managed-memory parity. ### Live Real-World Sweep diff --git a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md index e5b9c128..0835990f 100644 --- a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md +++ b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md @@ -8,7 +8,8 @@ report shape required before claiming the stage improved. Inputs: `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`, the June 11 competitor-strength, temporal-history, and iteration-direction reports, the XY-905 June 16 live temporal reconciliation report, the consolidation proposal spec, the -memory summary spec, and the checked-in real-world fixture suites. +memory summary spec, the XY-953 proactive brief scoring report, and the checked-in +real-world fixture suites. Outputs: A stage-by-stage ledger that downstream issues can update with `improved`, `regressed`, `unchanged`, `blocked`, or `not_tested` judgments. @@ -21,12 +22,13 @@ and now includes the XY-905 post-stage result for live temporal reconciliation. Current stage status: - `improved`: current-vs-historical correctness, preference evolution, reviewable - consolidation, and memory-summary/top-of-mind fixture readback. + consolidation, memory-summary/top-of-mind fixture readback, and proactive brief + fixture scoring. - `regressed`: none. - `unchanged`: deletion/TTL/tombstone behavior and the final competitor retest baseline. - `blocked`: scheduled-memory-task readiness. -- `not_tested`: proactive brief readiness. +- `not_tested`: none. The known live `memory_evolution` loss is now repaired for the encoded ELF live adapter slice: the XY-905 run passes all six memory-evolution jobs and reports @@ -45,6 +47,12 @@ that distinguishes current top-of-mind, background, stale, superseded, tombstone derived project-profile entries. It does not prove live top-of-mind product behavior or parity with managed memory products. +Proactive brief readiness is improved only at the fixture-backed benchmark level: +XY-953 adds a direct `proactive_brief` suite with daily project brief, resume-work +brief, stale decision audit, stale plan/preference warning, and private-corpus refresh +blocker scenarios. It does not prove OpenAI Pulse parity, hosted managed-memory +parity, background scheduling, or private-corpus production quality. + ## Ledger Rules - Every downstream Dreaming or competitor-improvement stage must write a post-stage @@ -70,7 +78,7 @@ parity with managed memory products. | Deletion, TTL, and tombstone behavior | `cargo make real-world-memory`; `cargo make real-world-memory-live-adapters` | Same commands | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `unchanged` | Extend tombstone and TTL readback beyond the single encoded job into update/delete/recreate history cases. | | Reviewable consolidation | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Keep Dreaming output derived and reviewable, and add direct competitor/reference runners only when they emit comparable source ids, confidence, unsupported-claim flags, and review audit artifacts. | | Memory summary and top-of-mind behavior | `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival` | `cargo make real-world-memory-summary`; `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival`; `cargo make real-world-memory-live-adapters` | `pass=8`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=9`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Move from fixture-backed summary/source-trace readback into service-native admin readback and later live top-of-mind behavior; do not turn hidden summaries into authoritative memory. | -| Proactive brief readiness | `cargo make real-world-first-generation-oss`; `cargo make real-world-job-operator-ux` | Same commands plus `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | not run by XY-905 | `not_tested` | Add direct proactive-brief fixtures before any pass claim; briefs must be source-linked and repairable. | +| Proactive brief readiness | `cargo make real-world-first-generation-oss`; `cargo make real-world-job-operator-ux` | `cargo make real-world-memory-proactive-brief`; `cargo make real-world-memory`; `cargo test -p elf-eval --test real_world_job_benchmark -- --test-threads=1` | `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0`; evidence-ref/freshness/rationale coverage `1.000`; invalid-current and tombstone violations `0` | `improved` | Move from fixture-backed proactive brief scoring into service-native generated brief readback and later live adapter materialization; keep scheduling and private-corpus refresh behind owned lanes and operator inputs. | | Scheduled memory task readiness | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0` | not run by XY-905 | `blocked` | Scheduled runs are future work; start with queued derived proposal runs and keep operator review mandatory. | | Final competitor retest status | `cargo make real-world-memory-live-adapters`; `cargo make real-world-first-generation-oss`; `cargo make real-world-memory-graph-rag`; `cargo make openmemory-ui-export-readback`; `cargo make baseline-production-private-addendum` when operator input exists | Same commands; private/provider commands may remain typed blocked under XY-930 | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11`, `not_encoded=11` | partial XY-905 evidence: ELF live adapter `pass=40`, `wrong_result=0`, `blocked=5`, `not_encoded=10` | `unchanged` | Rerun the broader competitor matrix after each optimization; the XY-905 live adapter improvement does not replace private/provider or external competitor gates. | @@ -83,7 +91,7 @@ parity with managed memory products. | Deletion, TTL, and tombstone behavior | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md` | | Reviewable consolidation | `docs/spec/system_consolidation_proposals_v1.md`; `apps/elf-eval/fixtures/real_world_memory/consolidation/`; `docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md`; `docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json` | | Memory summary and top-of-mind behavior | `docs/spec/system_memory_summary_v1.md`; `apps/elf-eval/fixtures/real_world_memory/memory_summary/`; `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | -| Proactive brief readiness | `docs/research/2026-06-08-agent-memory-selection.json`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | +| Proactive brief readiness | `docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md`; `docs/research/2026-06-16-proactive-brief-scoring-report.json`; `apps/elf-eval/fixtures/real_world_memory/proactive_brief/`; `docs/research/2026-06-08-agent-memory-selection.json`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | | Scheduled memory task readiness | `docs/spec/system_consolidation_proposals_v1.md`; `docs/research/2026-06-08-agent-memory-selection.json` | | Final competitor retest status | `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/research/2026-06-11-competitor-strength-adoption-report.json`; `docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | @@ -116,6 +124,9 @@ Allowed: memory-evolution improvement. - The current ledger records the XY-952 fixture-backed memory-summary/source-trace contract improvement. +- The current ledger records the XY-953 fixture-backed proactive brief scoring + improvement with source refs, freshness/currentness markers, reject/defer rationale, + and typed private-corpus blocking. - Fixture-backed knowledge and core/archival jobs can be used as regression guards for report shape. - Reviewable consolidation now has ELF live service-backed proposal scoring evidence, @@ -124,8 +135,11 @@ Allowed: Not allowed: - Do not claim this ledger proves preference history against mem0/OpenMemory, - live top-of-mind behavior, proactive briefs, scheduled tasks, private-corpus gates, - hosted memory, broad consolidation superiority, or competitor adapters. + live top-of-mind behavior, live proactive brief behavior, scheduled tasks, + private-corpus gates, hosted memory, broad consolidation superiority, or competitor + adapters. +- Do not claim fixture-backed proactive brief scoring proves OpenAI Pulse parity or + hosted managed-memory parity. - Do not claim ELF has full-suite live real-world pass evidence. - Do not claim private-corpus or provider-backed production quality without the operator-owned inputs required by XY-930. diff --git a/docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md b/docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md new file mode 100644 index 00000000..255c544d --- /dev/null +++ b/docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md @@ -0,0 +1,100 @@ +# Proactive Brief Scoring Report - June 16, 2026 + +Purpose: Publish the XY-953 fixture-backed proactive project brief scoring result. +Status: benchmark report +Read this when: You need the current proactive-brief fixture evidence, stage-ledger +delta, and claim boundaries. +Not this document: A scheduler design, morning-dashboard UI, private-corpus run, or +hosted managed-memory comparison. +Source: `docs/research/2026-06-16-proactive-brief-scoring-report.json`. + +## Summary + +`cargo make real-world-memory-proactive-brief` now scores a direct +`proactive_brief` fixture suite. The suite has 5 jobs: 4 pass, 1 blocked, 0 +wrong_result, and 0 unsupported-claim results. + +The four runnable jobs produce 5 suggestions across daily project brief, +resume-work brief, stale decision audit, and stale plan/preference warning scenarios. +Suggestion evidence-ref coverage is `5/5`; freshness/currentness coverage is `1.000`; +action-rationale coverage is `1.000`. The suite records 2 recommendations, 2 defers, +and 1 rejection, with 0 invalid-current suggestions and 0 tombstone violations. + +The private-corpus refresh scenario remains a typed blocker tied to XY-930 because no +operator-owned private production corpus manifest is available. This is intentional: +the benchmark must not require private corpus access and must not turn missing private +inputs into a fixture pass. + +## Fixture Results + +| Job | Status | Suggestion kind | Decision | Evidence and freshness outcome | +| --- | --- | --- | --- | --- | +| `proactive-daily-project-brief-001` | `pass` | `daily_project_brief` | `recommend` | Current source refs selected; stale Pulse-parity trap dropped. | +| `proactive-resume-work-brief-001` | `pass` | `resume_work` | `recommend` | Current handoff and validation refs selected; stale branch trap dropped. | +| `proactive-stale-decision-audit-001` | `pass` | `stale_decision_audit` | `defer` | Superseded decision is surfaced as stale, not current. | +| `proactive-stale-plan-preference-warning-001` | `pass` | `stale_plan_preference_warning` | `defer`, `reject` | Expired, superseded, and tombstoned sources are warning inputs, not current recommendations. | +| `proactive-private-corpus-refresh-blocked-001` | `blocked` | `private_corpus_refresh` | blocked | Private-corpus refresh stays blocked until XY-930 operator inputs exist. | + +## Aggregate Delta + +The root fixture aggregate after XY-953 is: + +| Metric | Value | +| --- | ---: | +| Jobs | `55` | +| Encoded suites | `15` | +| Pass | `49` | +| Blocked | `6` | +| Wrong result | `0` | +| Incomplete | `0` | +| Not encoded | `0` | +| Unsupported claim count | `0` | +| Evidence coverage | `123/123` | +| Source-ref coverage | `123/123` | +| Quote coverage | `123/123` | +| Expected evidence recall | `1.000` | +| Mean score | `0.891` | + +XY-951 stage-ledger delta for `proactive_brief_readiness`: + +| Baseline | After XY-953 | Judgment | +| --- | --- | --- | +| `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0` | `improved` | + +## Regression Guards + +The proactive scorer fails or downgrades output when a suggestion: + +- lacks evidence refs, +- lacks freshness/currentness markers, +- lacks a reject/defer/recommend rationale, +- presents stale, superseded, expired, or tombstoned evidence as current, +- ignores TTL invalidations or tombstones, +- carries unsupported current-suggestion flags, +- or claims private-corpus, Pulse, or hosted managed-memory parity from fixture-only + output. + +## Claim Boundaries + +Allowed: + +- ELF now has fixture-backed proactive brief scoring for project briefs and stale + context warnings. +- Passing proactive suggestions include evidence refs, freshness/currentness markers, + and action rationale. +- The private-corpus refresh case is encoded as a typed blocker tied to XY-930. + +Not allowed: + +- Do not claim OpenAI Pulse parity. +- Do not claim hosted managed-memory parity. +- Do not claim scheduler, morning-dashboard, or background execution behavior. +- Do not claim private-corpus refresh quality without operator-owned inputs. +- Do not treat proactive suggestions as authoritative notes; they are derived, + source-linked output that must remain reviewable. + +## Next Direction + +Move from fixture-backed proactive brief scoring into service-native generated brief +readback and later live adapter materialization. Scheduling and private-corpus refresh +remain owned by their separate lanes and operator-input gates. diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index c6d926a5..9c8449f0 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -119,6 +119,10 @@ cleanup, use `docs/guide/single_user_production.md`. post-stage command matrix, typed improved/regressed/unchanged/blocked/not-tested buckets, and machine-readable companion file `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`. +- `2026-06-16-proactive-brief-scoring-report.md`: XY-953 fixture-backed proactive + project brief scoring report with source refs, freshness/currentness markers, + reject/defer rationale, stale/tombstone guards, and the private-corpus blocker tied + to XY-930. - `2026-06-16-live-temporal-reconciliation-report.md`: XY-905 live temporal reconciliation follow-up showing ELF live `memory_evolution` moving from `pass=1`, `wrong_result=5` to `pass=6`, `wrong_result=0`, with trace/readback diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 2527bb5c..84640e02 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -229,15 +229,17 @@ research gates. Its `external_adapters` report section distinguishes: - `research_gate`: checked-in source/setup/runtime/resource/retry metadata for a future adapter path, not fixture-backed or live execution evidence. -Current fixture state: `cargo make real-world-memory` covers 50 jobs across 14 suites, -with 45 pass and 5 blocked. The `core_archival_memory` suite contributes six passing +Current fixture state: `cargo make real-world-memory` covers 55 jobs across 15 suites, +with 49 pass and 6 blocked. The `core_archival_memory` suite contributes six passing fixture jobs for core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery. The `memory_summary` suite contributes one passing fixture-backed source-trace job for reviewable current, background, stale, superseded, tombstoned, and derived project-profile entries. The -blocked jobs are production-ops operator boundaries plus the XY-928 OpenViking -`context_trajectory` gates for staged retrieval, hierarchy selection, and recursive -context expansion. +`proactive_brief` suite contributes four passing source-linked proactive suggestions +and one typed private-corpus refresh blocker tied to XY-930. The blocked jobs are +production-ops operator boundaries, the private-corpus refresh blocker, plus the +XY-928 OpenViking `context_trajectory` gates for staged retrieval, hierarchy +selection, and recursive context expansion. Current live-adapter state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full checked-in suite sweep through `cargo make real-world-memory-live-adapters`. Each adapter diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 6384763e..83e8d854 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -40,13 +40,18 @@ { "command": "cargo make real-world-memory", "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "claim": "ELF fixture aggregate covers 50 jobs across 14 suites with 45 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates, including 6 passing core_archival_memory jobs and 1 passing memory_summary source-trace job." + "claim": "ELF fixture aggregate covers 55 jobs across 15 suites with 49 pass and 6 blocked production-ops, private-corpus, or OpenViking context-trajectory measurement gates, including 6 passing core_archival_memory jobs, 1 passing memory_summary source-trace job, and 4 passing proactive_brief suggestion jobs plus 1 private-corpus blocker." }, { "command": "cargo make real-world-memory-summary", "artifact": "tmp/real-world-memory/memory-summary/report.json", "claim": "The memory summary fixture scores reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile entries with source refs, freshness metadata, rationale, and unsupported-claim flags; this is fixture-backed contract evidence, not managed-memory parity." }, + { + "command": "cargo make real-world-memory-proactive-brief", + "artifact": "tmp/real-world-memory/proactive-brief/report.json", + "claim": "The proactive brief fixture scores daily project brief, resume-work brief, stale decision audit, stale plan/preference warning, and private-corpus refresh blocker scenarios with evidence refs, freshness/currentness markers, action rationale, and stale/tombstone guards; this is fixture-backed contract evidence, not Pulse or hosted managed-memory parity." + }, { "command": "cargo make real-world-memory-core-archival", "artifact": "tmp/real-world-memory/core-archival/report.json", diff --git a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json index 1ba0eef5..1737c065 100644 --- a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json +++ b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json @@ -4,7 +4,7 @@ "authority": "XY-951", "created_at": "2026-06-16T00:00:00Z", "purpose": "Define the benchmark evidence gate that every Dreaming-inspired ELF optimization stage must update before claiming completion.", - "source_evidence_cutoff": "Checked-in benchmark and research evidence through the XY-905 live temporal reconciliation run, XY-934 live consolidation proposal scoring run, and XY-952 fixture-backed memory summary/source-trace contract on 2026-06-16; no private-corpus or provider-backed production pass is claimed by this ledger.", + "source_evidence_cutoff": "Checked-in benchmark and research evidence through the XY-905 live temporal reconciliation run, XY-934 live consolidation proposal scoring run, XY-952 fixture-backed memory summary/source-trace contract, and XY-953 fixture-backed proactive brief scoring on 2026-06-16; no private-corpus or provider-backed production pass is claimed by this ledger.", "typed_status_terms": [ "pass", "wrong_result", @@ -37,14 +37,16 @@ "Fixture-backed evidence may prove benchmark shape but must not be promoted into live_real_world product quality.", "Private-corpus and provider-backed production gates remain typed blocked unless the operator supplies explicit inputs; those blockers are tracked under XY-930.", "The XY-905 post-stage live memory_evolution result is a narrow temporal reconciliation improvement only; it must not be converted into private-corpus, hosted memory, or broad competitor superiority claims.", - "The XY-934 live consolidation result is a narrow ELF self-check only; it must not be converted into broad managed dreaming, Always-On Memory Agent, qmd, agentmemory, or llm-wiki superiority claims without comparable contained runners." + "The XY-934 live consolidation result is a narrow ELF self-check only; it must not be converted into broad managed dreaming, Always-On Memory Agent, qmd, agentmemory, or llm-wiki superiority claims without comparable contained runners.", + "The XY-953 proactive brief result is fixture-backed benchmark-shape evidence only; it must not be converted into OpenAI Pulse, hosted managed-memory, scheduler, or private-corpus parity claims." ], "summary": { "improved": [ "current_vs_historical_correctness", "preference_evolution", "reviewable_consolidation", - "memory_summary_top_of_mind_behavior" + "memory_summary_top_of_mind_behavior", + "proactive_brief_readiness" ], "regressed": [], "unchanged": [ @@ -54,9 +56,7 @@ "blocked": [ "scheduled_memory_task_readiness" ], - "not_tested": [ - "proactive_brief_readiness" - ] + "not_tested": [] }, "stage_gates": [ { @@ -351,8 +351,8 @@ { "stage_id": "proactive_brief_readiness", "stage_name": "Proactive brief readiness", - "dependent_issue": "XY-926", - "evidence_class": "not_encoded", + "dependent_issue": "XY-953", + "evidence_class": "fixture_backed", "baseline_commands": [ { "command": "cargo make real-world-first-generation-oss", @@ -367,19 +367,22 @@ ], "post_stage_commands": [ { - "command": "cargo make real-world-first-generation-oss", - "required_artifact": "tmp/real-world-memory/first-generation-oss/report.json" + "command": "cargo make real-world-memory-proactive-brief", + "required_artifact": "tmp/real-world-memory/proactive-brief/report.json" }, { - "command": "cargo make real-world-job-operator-ux", - "required_artifact": "tmp/real-world-job/real-world-job-operator-ux-report.json" + "command": "cargo make real-world-memory", + "required_artifact": "tmp/real-world-memory/real-world-memory-report.json" }, { - "command": "cargo make real-world-memory-live-adapters", - "required_artifact": "tmp/real-world-memory/live-adapters/" + "command": "cargo test -p elf-eval --test real_world_job_benchmark -- --test-threads=1", + "required_artifact": "test output" } ], "evidence_files": [ + "docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md", + "docs/research/2026-06-16-proactive-brief-scoring-report.json", + "apps/elf-eval/fixtures/real_world_memory/proactive_brief/", "docs/research/2026-06-08-agent-memory-selection.json", "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md", "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" @@ -392,10 +395,25 @@ "not_encoded": 1 }, "baseline_basis": "No direct proactive-brief real_world_job suite exists; adjacent progressive-disclosure and operator-debug fixtures are reference guards only.", - "comparison_judgment": "not_tested", - "regression_rule": "A proactive brief that is uncited, leaks excluded content, or cannot explain source selection is a regression.", - "improvement_rule": "An improvement requires a direct proactive-brief fixture or live adapter report with cited source ids and typed non-pass handling.", - "next_optimization_direction": "Add proactive briefs only as source-linked derived output with repair guidance and no secret or excluded-span leakage." + "post_stage_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 1, + "not_tested": 0, + "not_encoded": 0, + "suggestions": 5, + "evidence_ref_coverage": 1.0, + "freshness_coverage": 1.0, + "action_rationale_coverage": 1.0, + "invalid_current_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0 + }, + "post_stage_basis": "XY-953 adds five proactive_brief fixture jobs: daily project brief, resume-work brief, stale decision audit, stale plan/preference warning, and a typed private-corpus refresh blocker tied to XY-930. The four runnable jobs pass with five evidence-linked suggestions, freshness/currentness markers, action rationale, stale/superseded/tombstone source traces, and no unsupported-current or tombstone violations.", + "comparison_judgment": "improved", + "regression_rule": "A proactive brief that is uncited, lacks freshness/currentness metadata, omits reject/defer rationale, presents stale or tombstoned facts as current, ignores TTL invalidations, leaks excluded content, or claims Pulse/private-corpus parity is a regression.", + "improvement_rule": "An improvement requires direct proactive-brief fixture or live adapter evidence with cited source ids, freshness/currentness markers, reject/defer rationale, and typed non-pass handling for unavailable private inputs.", + "next_optimization_direction": "Move from fixture-backed proactive brief scoring into service-native generated brief readback and later live adapter materialization; keep scheduling and private-corpus refresh behind their owned lanes and operator inputs." }, { "stage_id": "scheduled_memory_task_readiness", diff --git a/docs/research/2026-06-16-proactive-brief-scoring-report.json b/docs/research/2026-06-16-proactive-brief-scoring-report.json new file mode 100644 index 00000000..e81a72d9 --- /dev/null +++ b/docs/research/2026-06-16-proactive-brief-scoring-report.json @@ -0,0 +1,131 @@ +{ + "schema": "elf.proactive_brief_scoring_report/v1", + "issue": "XY-953", + "created_at": "2026-06-16T14:33:01Z", + "purpose": "Record fixture-backed proactive project brief scoring without claiming scheduler, private-corpus, OpenAI Pulse, or hosted managed-memory parity.", + "evidence_class": "fixture_backed", + "commands": [ + { + "command": "cargo make real-world-memory-proactive-brief", + "status": "pass", + "artifact": "tmp/real-world-memory/proactive-brief/report.json", + "markdown_artifact": "tmp/real-world-memory/proactive-brief/report.md" + }, + { + "command": "cargo make real-world-memory", + "status": "pass", + "artifact": "tmp/real-world-memory/real-world-memory-report.json", + "markdown_artifact": "tmp/real-world-memory/real-world-memory-report.md" + } + ], + "proactive_brief_summary": { + "job_count": 5, + "pass": 4, + "blocked": 1, + "wrong_result": 0, + "unsupported_claim_count": 0, + "evidence_required_count": 8, + "evidence_covered_count": 8, + "expected_evidence_recall": 1.0, + "suggestion_count": 5, + "evidence_ref_coverage": 1.0, + "freshness_coverage": 1.0, + "action_rationale_coverage": 1.0, + "recommended_count": 2, + "deferred_count": 2, + "rejected_count": 1, + "current_suggestion_count": 2, + "non_current_suggestion_count": 3, + "stale_warning_count": 3, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 7, + "source_trace_stale_count": 2, + "source_trace_superseded_count": 2, + "source_trace_tombstone_count": 1 + }, + "root_fixture_summary_after_xy953": { + "job_count": 55, + "encoded_suite_count": 15, + "pass": 49, + "wrong_result": 0, + "incomplete": 0, + "blocked": 6, + "not_encoded": 0, + "unsupported_claim_count": 0, + "evidence_required_count": 123, + "evidence_covered_count": 123, + "expected_evidence_recall": 1.0, + "source_ref_coverage": 1.0, + "quote_coverage": 1.0, + "mean_score": 0.891 + }, + "scenario_results": [ + { + "job_id": "proactive-daily-project-brief-001", + "status": "pass", + "suggestion_kind": "daily_project_brief", + "decision": "recommend", + "evidence_refs": 2, + "freshness_status": "current" + }, + { + "job_id": "proactive-resume-work-brief-001", + "status": "pass", + "suggestion_kind": "resume_work", + "decision": "recommend", + "evidence_refs": 2, + "freshness_status": "current" + }, + { + "job_id": "proactive-stale-decision-audit-001", + "status": "pass", + "suggestion_kind": "stale_decision_audit", + "decision": "defer", + "evidence_refs": 2, + "freshness_status": "superseded" + }, + { + "job_id": "proactive-stale-plan-preference-warning-001", + "status": "pass", + "suggestion_kind": "stale_plan_preference_warning", + "decisions": ["defer", "reject"], + "evidence_refs": 5, + "freshness_statuses": ["expired", "superseded", "tombstoned"] + }, + { + "job_id": "proactive-private-corpus-refresh-blocked-001", + "status": "blocked", + "suggestion_kind": "private_corpus_refresh", + "blocker": "No operator-owned private production corpus manifest is available; private-corpus refresh suggestions stay blocked under XY-930." + } + ], + "stage_ledger_delta": { + "stage_id": "proactive_brief_readiness", + "baseline_counts": { + "pass": 0, + "wrong_result": 0, + "blocked": 0, + "not_tested": 1, + "not_encoded": 1 + }, + "post_stage_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 1, + "not_tested": 0, + "not_encoded": 0 + }, + "comparison_judgment": "improved", + "next_optimization_direction": "Move from fixture-backed proactive brief scoring into service-native generated brief readback and later live adapter materialization; keep scheduling and private-corpus refresh behind their owned lanes and operator inputs." + }, + "claim_boundaries": [ + "Do not claim OpenAI Pulse parity from this fixture-backed report.", + "Do not claim hosted managed-memory parity from this fixture-backed report.", + "Do not claim background scheduling or a morning-dashboard UI.", + "Do not claim private-corpus refresh quality without operator-owned inputs under XY-930.", + "Treat proactive briefs as derived output that must remain source-linked and reviewable." + ] +}