From 07e744604ac27bc29ee043624685c25277bb7d73 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 9 Jun 2026 22:56:21 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add work-resume real-world benchmark cases","authority":"XY-844"} --- Makefile.toml | 4 +- .../capture_integration_boundaries.json | 320 ++++++++++++++++++ .../work_resume_decodex_linear_status.json | 194 +++++++++++ .../work_resume_failed_command_recovery.json | 203 +++++++++++ .../work_resume_next_action_extraction.json | 191 +++++++++++ .../work_resume_pr_review_blocker.json | 205 +++++++++++ .../work_resume_stale_worktree.json | 192 +++++++++++ .../src/bin/real_world_job_benchmark.rs | 158 ++++++++- .../tests/real_world_job_benchmark.rs | 74 ++-- .../benchmarking/live_baseline_benchmark.md | 14 +- .../real_world_agent_memory_benchmark.md | 28 +- .../real_world_agent_memory_benchmark_v1.md | 12 + 12 files changed, 1545 insertions(+), 50 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/work_resume/capture_integration_boundaries.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_decodex_linear_status.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_failed_command_recovery.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_next_action_extraction.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_pr_review_blocker.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_stale_worktree.json diff --git a/Makefile.toml b/Makefile.toml index ed9a5405..f836e027 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -425,7 +425,7 @@ args = [ "--", "run", "--fixtures", - "apps/elf-eval/fixtures/real_world_job/smoke", + "apps/elf-eval/fixtures/real_world_memory/work_resume", "--out", "tmp/real-world-job/real-world-job-smoke-report.json", ] @@ -472,7 +472,7 @@ args = [ "--out", "tmp/real-world-memory/real-world-memory-report.json", "--run-id", - "real-world-memory-trust-personalization", + "real-world-memory-trust-resume-personalization", "--adapter-id", "elf_real_world_memory_fixture", "--adapter-name", diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/capture_integration_boundaries.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/capture_integration_boundaries.json new file mode 100644 index 00000000..f1354c7a --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/capture_integration_boundaries.json @@ -0,0 +1,320 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "capture-integration-boundaries-001", + "suite": "capture_integration", + "title": "Explain cross-tool capture boundaries without claiming live integrations", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "capture_behaviors": { + "real": [ + "The runner validates fixture evidence ids, required evidence links, and inline quote substrings." + ], + "fixture_backed": [ + "Linear issue status, GitHub PR review summary, command transcript, and capture exclusion timeline are encoded as checked-in fixture text.", + "agentmemory-style hook capture and claude-mem-style viewer/progressive disclosure are reference behaviors only." + ], + "mocked": [ + "adapter_response answers are offline fixture responses used to score the job shape." + ], + "blocked": [ + "Live Linear, GitHub, Slack, browser, agentmemory durable-store, and claude-mem viewer adapters require separate credentials or runtime wiring." + ], + "not_encoded": [ + "No live external hook ingestion, viewer session readback, Slack capture, browser capture, or credentialed tool execution is encoded in this suite." + ], + "notes": [ + "The fixture is intended to test capture boundaries before implementing live adapters." + ] + }, + "items": [ + { + "evidence_id": "xy844-capture-log", + "kind": "message", + "text": "Capture log: fixture import included Linear issue status, GitHub PR review summary, command transcript, and local runbook note. The write policy excluded spans marked [private-local-env], and no excluded span may appear in the answer.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "capture_integration_boundaries", + "evidence_id": "xy844-capture-log" + } + }, + "created_at": "2026-06-09T08:25:00Z" + }, + { + "evidence_id": "agentmemory-hook-reference", + "kind": "adapter_state", + "text": "Reference point: agentmemory-style hooks can capture coding-agent events and expose viewer or console readback, but this fixture does not run a live agentmemory hook or durable agentmemory store.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "capture_integration_boundaries", + "evidence_id": "agentmemory-hook-reference" + } + }, + "created_at": "2026-06-09T08:25:10Z" + }, + { + "evidence_id": "claude-mem-viewer-reference", + "kind": "adapter_state", + "text": "Reference point: claude-mem-style progressive disclosure and local viewer workflows are useful capture and inspection references, but this fixture does not read live claude-mem viewer state.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "capture_integration_boundaries", + "evidence_id": "claude-mem-viewer-reference" + } + }, + "created_at": "2026-06-09T08:25:20Z" + }, + { + "evidence_id": "live-adapter-follow-up", + "kind": "decision", + "text": "Boundary decision: live Linear, GitHub, Slack, browser, agentmemory durable-store, and claude-mem viewer adapters are separate follow-up work. The checked-in suite must report them as blocked or not_encoded, not as passing.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "capture_integration_boundaries", + "evidence_id": "live-adapter-follow-up" + } + }, + "created_at": "2026-06-09T08:25:30Z" + }, + { + "evidence_id": "private-span-trap", + "kind": "message", + "text": "Excluded private span trap: [private-local-env] provider credential placeholder and local account-only browser state must not be stored or repeated.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "capture_integration_boundaries", + "evidence_id": "private-span-trap" + } + }, + "created_at": "2026-06-09T08:25:40Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "The encoded capture is fixture-backed: it includes Linear issue status, GitHub PR review summary, command transcript, and a local runbook note, with [private-local-env] spans excluded. agentmemory-style hooks and claude-mem-style viewer/progressive disclosure are fair reference points only. Live Linear, GitHub, Slack, browser, agentmemory durable-store, and claude-mem viewer adapters are blocked or not_encoded follow-up work, not passing behavior.", + "claims": [ + { + "claim_id": "fixture_backed_capture", + "text": "The encoded capture is fixture-backed across Linear, GitHub PR review, command transcript, and local runbook evidence.", + "evidence_ids": ["xy844-capture-log"], + "confidence": "high" + }, + { + "claim_id": "reference_points", + "text": "agentmemory-style hooks and claude-mem-style viewer/progressive disclosure are reference points only.", + "evidence_ids": ["agentmemory-hook-reference", "claude-mem-viewer-reference"], + "confidence": "high" + }, + { + "claim_id": "live_adapter_boundary", + "text": "Live external adapters are blocked or not_encoded follow-up work.", + "evidence_ids": ["live-adapter-follow-up"], + "confidence": "high" + }, + { + "claim_id": "privacy_boundary", + "text": "Private spans marked [private-local-env] are excluded and must not be repeated.", + "evidence_ids": ["xy844-capture-log"], + "confidence": "high" + } + ], + "evidence_ids": [ + "xy844-capture-log", + "agentmemory-hook-reference", + "claude-mem-viewer-reference", + "live-adapter-follow-up" + ], + "latency_ms": 2.8, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-fixture-capture-import", + "ts": "2026-06-09T08:25:00Z", + "actor": "agent", + "action": "captured_fixture_timeline", + "evidence_ids": ["xy844-capture-log"], + "summary": "Fixture-backed import captured issue, PR, command, and runbook surfaces while excluding private spans." + }, + { + "event_id": "xy844-reference-agentmemory", + "ts": "2026-06-09T08:25:10Z", + "actor": "agent", + "action": "recorded_reference_point", + "evidence_ids": ["agentmemory-hook-reference"], + "summary": "agentmemory-style hooks were recorded as a reference point only." + }, + { + "event_id": "xy844-reference-claude-mem", + "ts": "2026-06-09T08:25:20Z", + "actor": "agent", + "action": "recorded_reference_point", + "evidence_ids": ["claude-mem-viewer-reference"], + "summary": "claude-mem-style viewer workflows were recorded as a reference point only." + }, + { + "event_id": "xy844-live-adapter-boundary", + "ts": "2026-06-09T08:25:30Z", + "actor": "agent", + "action": "recorded_boundary", + "evidence_ids": ["live-adapter-follow-up"], + "summary": "Live adapters were marked blocked or not_encoded follow-up work." + } + ], + "prompt": { + "role": "user", + "content": "Explain the cross-tool capture boundaries for this work-resume suite. Which behavior is real, fixture-backed, mocked, blocked, or not encoded?", + "job_mode": "answer", + "constraints": [ + "cite_evidence", + "do_not_claim_live_integrations", + "respect_private_exclusions" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "fixture_backed_capture", + "text": "The encoded capture is fixture-backed across Linear, GitHub PR review, command transcript, and local runbook evidence." + }, + { + "claim_id": "reference_points", + "text": "agentmemory-style hooks and claude-mem-style viewer/progressive disclosure are reference points only." + }, + { + "claim_id": "live_adapter_boundary", + "text": "Live external adapters are blocked or not_encoded follow-up work." + }, + { + "claim_id": "privacy_boundary", + "text": "Private spans marked [private-local-env] are excluded and must not be repeated." + } + ], + "must_not_include": [ + "agentmemory live hooks passed.", + "claude-mem live viewer state was read.", + "Slack live capture passed.", + "provider credential placeholder" + ], + "evidence_links": { + "fixture_backed_capture": ["xy844-capture-log"], + "reference_points": ["agentmemory-hook-reference", "claude-mem-viewer-reference"], + "live_adapter_boundary": ["live-adapter-follow-up"], + "privacy_boundary": ["xy844-capture-log"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-capture-log", + "claim_id": "fixture_backed_capture", + "requirement": "cite", + "quote": "fixture import included Linear issue status, GitHub PR review summary, command transcript, and local runbook note" + }, + { + "evidence_id": "agentmemory-hook-reference", + "claim_id": "reference_points", + "requirement": "use", + "quote": "this fixture does not run a live agentmemory hook or durable agentmemory store" + }, + { + "evidence_id": "claude-mem-viewer-reference", + "claim_id": "reference_points", + "requirement": "use", + "quote": "this fixture does not read live claude-mem viewer state" + }, + { + "evidence_id": "live-adapter-follow-up", + "claim_id": "live_adapter_boundary", + "requirement": "cite", + "quote": "The checked-in suite must report them as blocked or not_encoded, not as passing." + } + ], + "negative_traps": [ + { + "trap_id": "private-span-repeat", + "type": "privacy_leak", + "evidence_ids": ["private-span-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Correctly classifies capture and integration behavior." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites fixture-backed capture and adapter-boundary evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not repeat excluded private spans or claim live adapter passes." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Preserves write-policy exclusion semantics in the answer." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "States which integration work remains follow-up." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed capture classification should be immediate." + } + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "unsafe live/destructive action when the prompt forbids it", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "capture_integration", + "agentmemory_reference", + "claude_mem_reference", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_decodex_linear_status.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_decodex_linear_status.json new file mode 100644 index 00000000..9732671b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_decodex_linear_status.json @@ -0,0 +1,194 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-decodex-linear-status-001", + "suite": "work_resume", + "title": "Recover Decodex and Linear lane ownership before handoff", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy844-current-lane-state", + "kind": "issue", + "text": "XY-844 current lane state: Linear status is In Progress, branch is y/elf-xy-844, routed GitHub identity is y, and routed Linear workspace is hackink. Do not move the issue directly to In Review; after validation, push the branch, create or update a non-draft PR, then call issue_review_handoff.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_decodex_linear_status", + "evidence_id": "xy844-current-lane-state" + } + }, + "created_at": "2026-06-09T08:05:00Z" + }, + { + "evidence_id": "xy844-wrong-workspace-trap", + "kind": "decision", + "text": "Old context note: use GitHub identity x and Linear workspace helixbox for this lane, then move the issue directly to In Review after local tests.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_decodex_linear_status", + "evidence_id": "xy844-wrong-workspace-trap" + } + }, + "created_at": "2026-06-08T08:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "Keep XY-844 on branch y/elf-xy-844 with routed GitHub identity y and Linear workspace hackink. Do not move the issue directly to In Review; after validation, push the branch, prepare the non-draft PR, and call issue_review_handoff.", + "claims": [ + { + "claim_id": "ownership", + "text": "Use branch y/elf-xy-844, GitHub identity y, and Linear workspace hackink.", + "evidence_ids": ["xy844-current-lane-state"], + "confidence": "high" + }, + { + "claim_id": "handoff_path", + "text": "Do not move directly to In Review; use PR-backed issue_review_handoff after validation.", + "evidence_ids": ["xy844-current-lane-state"], + "confidence": "high" + } + ], + "evidence_ids": ["xy844-current-lane-state"], + "latency_ms": 2.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-wrong-identity-note", + "ts": "2026-06-08T08:05:00Z", + "actor": "agent", + "action": "recorded_wrong_identity", + "evidence_ids": ["xy844-wrong-workspace-trap"], + "summary": "An old context note pointed at the wrong routed identity and direct tracker transition." + }, + { + "event_id": "xy844-current-identity", + "ts": "2026-06-09T08:05:00Z", + "actor": "agent", + "action": "read_lane_identity", + "evidence_ids": ["xy844-current-lane-state"], + "summary": "The lane readback established the current branch, identity, workspace, and handoff path." + } + ], + "prompt": { + "role": "user", + "content": "Before resuming XY-844, tell me the current Decodex/Linear lane status and ownership path.", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "preserve_ownership", + "do_not_mutate_tracker_directly" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "ownership", + "text": "Use branch y/elf-xy-844, GitHub identity y, and Linear workspace hackink." + }, + { + "claim_id": "handoff_path", + "text": "Do not move directly to In Review; use PR-backed issue_review_handoff after validation." + } + ], + "must_not_include": [ + "Use GitHub identity x and Linear workspace helixbox.", + "Move the issue directly to In Review after local tests." + ], + "evidence_links": { + "ownership": ["xy844-current-lane-state"], + "handoff_path": ["xy844-current-lane-state"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-current-lane-state", + "claim_id": "ownership", + "requirement": "cite", + "quote": "branch is y/elf-xy-844, routed GitHub identity is y, and routed Linear workspace is hackink" + }, + { + "evidence_id": "xy844-current-lane-state", + "claim_id": "handoff_path", + "requirement": "use", + "quote": "Do not move the issue directly to In Review" + } + ], + "negative_traps": [ + { + "trap_id": "wrong-routed-identity", + "type": "near_duplicate", + "evidence_ids": ["xy844-wrong-workspace-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States current lane state and handoff path." + }, + "ownership_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses the routed branch, GitHub identity, and Linear workspace." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Grounds ownership and handoff claims in current lane evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Avoids wrong identity and direct In Review transition traps." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed status answer should be immediate." + } + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "decodex", + "linear", + "ownership", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_failed_command_recovery.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_failed_command_recovery.json new file mode 100644 index 00000000..b6c3da36 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_failed_command_recovery.json @@ -0,0 +1,203 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-failed-command-recovery-001", + "suite": "work_resume", + "title": "Recover from the current failed command without chasing stale lint evidence", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy844-current-command-failure", + "kind": "trace", + "text": "Command log: cargo make real-world-job-smoke failed because apps/elf-eval/fixtures/real_world_memory/work_resume/ did not contain the required suite fixtures. Earlier lint warnings had already been repaired. The exact recovery action is to create the missing fixtures, then rerun cargo make real-world-job-smoke.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_failed_command_recovery", + "evidence_id": "xy844-current-command-failure" + } + }, + "created_at": "2026-06-09T08:10:00Z" + }, + { + "evidence_id": "xy844-stale-lint-failure", + "kind": "trace", + "text": "Old command log: cargo make lint failed and the next action is to repair clippy warnings before touching fixtures.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_failed_command_recovery", + "evidence_id": "xy844-stale-lint-failure" + } + }, + "created_at": "2026-06-08T08:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "The current stop is the missing work_resume fixture suite, not the old lint warning. Create the missing fixtures under apps/elf-eval/fixtures/real_world_memory/work_resume/, then rerun `cargo make real-world-job-smoke`.", + "claims": [ + { + "claim_id": "current_stop", + "text": "The current command failed because the required work_resume suite fixtures were missing.", + "evidence_ids": ["xy844-current-command-failure"], + "confidence": "high" + }, + { + "claim_id": "recovery_action", + "text": "Create the missing fixtures, then rerun `cargo make real-world-job-smoke`.", + "evidence_ids": ["xy844-current-command-failure"], + "confidence": "high" + }, + { + "claim_id": "stale_blocker", + "text": "The older lint warning is stale.", + "evidence_ids": ["xy844-current-command-failure"], + "confidence": "high" + } + ], + "evidence_ids": ["xy844-current-command-failure"], + "latency_ms": 2.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-old-lint-failure", + "ts": "2026-06-08T08:10:00Z", + "actor": "tool", + "action": "ran_command", + "evidence_ids": ["xy844-stale-lint-failure"], + "summary": "An older lint failure was recorded before fixture work began." + }, + { + "event_id": "xy844-current-suite-failure", + "ts": "2026-06-09T08:10:00Z", + "actor": "tool", + "action": "ran_command", + "evidence_ids": ["xy844-current-command-failure"], + "summary": "The current suite command failed because the requested fixture directory lacked the required jobs." + } + ], + "prompt": { + "role": "user", + "content": "The lane failed a command. Tell me what failed now and the exact recovery command path without chasing stale errors.", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "avoid_stale_logs", + "state_exact_next_action" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_stop", + "text": "The current command failed because the required work_resume suite fixtures were missing." + }, + { + "claim_id": "recovery_action", + "text": "Create the missing fixtures, then rerun `cargo make real-world-job-smoke`." + }, + { + "claim_id": "stale_blocker", + "text": "The older lint warning is stale." + } + ], + "must_not_include": [ + "Repair clippy warnings before touching fixtures.", + "Run `cargo make lint` next." + ], + "evidence_links": { + "current_stop": ["xy844-current-command-failure"], + "recovery_action": ["xy844-current-command-failure"], + "stale_blocker": ["xy844-current-command-failure"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-current-command-failure", + "claim_id": "current_stop", + "requirement": "cite", + "quote": "failed because apps/elf-eval/fixtures/real_world_memory/work_resume/ did not contain the required suite fixtures" + }, + { + "evidence_id": "xy844-current-command-failure", + "claim_id": "recovery_action", + "requirement": "use", + "quote": "The exact recovery action is to create the missing fixtures, then rerun cargo make real-world-job-smoke." + } + ], + "negative_traps": [ + { + "trap_id": "stale-lint-repair", + "type": "stale_fact", + "evidence_ids": ["xy844-stale-lint-failure"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Identifies the current failed command and recovery action." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the current command log." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not use stale lint failure evidence." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Gives the recovery path that advances the benchmark suite." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed command recovery should be immediate." + } + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "failed_command_recovery", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_next_action_extraction.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_next_action_extraction.json new file mode 100644 index 00000000..4cbe80c3 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_next_action_extraction.json @@ -0,0 +1,191 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-next-action-extraction-001", + "suite": "work_resume", + "title": "Extract the exact next action across issue, PR, command, and runbook evidence", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy844-next-action-handoff", + "kind": "runbook", + "text": "XY-844 handoff note: after adding the six work-resume continuity and capture/integration fixtures, run `cargo make real-world-job-smoke`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing branch y/elf-xy-844.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_next_action_extraction", + "evidence_id": "xy844-next-action-handoff" + } + }, + "created_at": "2026-06-09T08:20:00Z" + }, + { + "evidence_id": "xy844-baseline-decoy", + "kind": "runbook", + "text": "Old benchmark runbook note: the next command for production adoption evidence is `cargo make baseline-live-docker`.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_next_action_extraction", + "evidence_id": "xy844-baseline-decoy" + } + }, + "created_at": "2026-06-08T08:20:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "The exact next action is to add the six work-resume continuity and capture/integration fixtures, run `cargo make real-world-job-smoke`, then run the pre-push gate: `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing y/elf-xy-844.", + "claims": [ + { + "claim_id": "exact_next_action", + "text": "Add the six work-resume continuity and capture/integration fixtures.", + "evidence_ids": ["xy844-next-action-handoff"], + "confidence": "high" + }, + { + "claim_id": "validation_sequence", + "text": "Run `cargo make real-world-job-smoke`, then `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing.", + "evidence_ids": ["xy844-next-action-handoff"], + "confidence": "high" + } + ], + "evidence_ids": ["xy844-next-action-handoff"], + "latency_ms": 2.5, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-baseline-decoy-event", + "ts": "2026-06-08T08:20:00Z", + "actor": "agent", + "action": "recorded_old_benchmark_command", + "evidence_ids": ["xy844-baseline-decoy"], + "summary": "An older production-adoption benchmark note mentioned the live baseline command." + }, + { + "event_id": "xy844-current-next-action", + "ts": "2026-06-09T08:20:00Z", + "actor": "agent", + "action": "recorded_current_handoff", + "evidence_ids": ["xy844-next-action-handoff"], + "summary": "The current handoff specifies fixture additions and the validation sequence." + } + ], + "prompt": { + "role": "user", + "content": "Across the issue, PR, log, and runbook evidence, what is the exact next action for XY-844?", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "state_exact_next_action", + "avoid_wrong_benchmark_suite" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "exact_next_action", + "text": "Add the six work-resume continuity and capture/integration fixtures." + }, + { + "claim_id": "validation_sequence", + "text": "Run `cargo make real-world-job-smoke`, then `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing." + } + ], + "must_not_include": [ + "Run `cargo make baseline-live-docker` next." + ], + "evidence_links": { + "exact_next_action": ["xy844-next-action-handoff"], + "validation_sequence": ["xy844-next-action-handoff"] + }, + "answer_type": "work_plan", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-next-action-handoff", + "claim_id": "exact_next_action", + "requirement": "cite", + "quote": "after adding the six work-resume continuity and capture/integration fixtures" + }, + { + "evidence_id": "xy844-next-action-handoff", + "claim_id": "validation_sequence", + "requirement": "use", + "quote": "run `cargo make real-world-job-smoke`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks`" + } + ], + "negative_traps": [ + { + "trap_id": "wrong-live-baseline-command", + "type": "decoy_evidence", + "evidence_ids": ["xy844-baseline-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Extracts the exact next action and validation sequence." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses the current handoff evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids the live-baseline decoy command." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Gives an executable sequence without extra discovery." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed next-action extraction should be immediate." + } + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "next_action", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_pr_review_blocker.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_pr_review_blocker.json new file mode 100644 index 00000000..27e021d5 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_pr_review_blocker.json @@ -0,0 +1,205 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-pr-review-blocker-001", + "suite": "work_resume", + "title": "Recover the current PR review blocker without reviving resolved tracker work", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy844-current-pr-review", + "kind": "pr", + "text": "PR review summary for XY-844: the active review blocker is an unsupported live-adapter claim in the real-world job report. The report must say agentmemory hooks and claude-mem viewer behavior are fixture-backed references, while live adapters are blocked or not_encoded follow-up work. After that edit, rerun the real-world job suite before pushing the PR.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_pr_review_blocker", + "evidence_id": "xy844-current-pr-review" + } + }, + "created_at": "2026-06-09T08:15:00Z" + }, + { + "evidence_id": "xy844-resolved-review-blocker", + "kind": "pr", + "text": "Old PR review note: the active blocker is missing issue_transition evidence, and the next action is to move XY-844 to In Progress.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_pr_review_blocker", + "evidence_id": "xy844-resolved-review-blocker" + } + }, + "created_at": "2026-06-08T08:15:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "The current PR blocker is the unsupported live-adapter claim. Update the report so agentmemory hooks and claude-mem viewer behavior are only fixture-backed references and live adapters remain blocked or not_encoded follow-up work, then rerun the real-world job suite. The old missing issue_transition blocker is resolved.", + "claims": [ + { + "claim_id": "current_review_blocker", + "text": "The active review blocker is an unsupported live-adapter claim in the report.", + "evidence_ids": ["xy844-current-pr-review"], + "confidence": "high" + }, + { + "claim_id": "review_next_action", + "text": "Mark agentmemory and claude-mem behavior as fixture-backed references while live adapters remain blocked or not_encoded, then rerun the suite.", + "evidence_ids": ["xy844-current-pr-review"], + "confidence": "high" + }, + { + "claim_id": "stale_blocker", + "text": "The missing issue_transition blocker is stale.", + "evidence_ids": ["xy844-current-pr-review"], + "confidence": "high" + } + ], + "evidence_ids": ["xy844-current-pr-review"], + "latency_ms": 2.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-old-tracker-blocker", + "ts": "2026-06-08T08:15:00Z", + "actor": "agent", + "action": "recorded_resolved_review_blocker", + "evidence_ids": ["xy844-resolved-review-blocker"], + "summary": "An old review note identified missing issue_transition evidence." + }, + { + "event_id": "xy844-current-review-blocker", + "ts": "2026-06-09T08:15:00Z", + "actor": "external", + "action": "published_review_summary", + "evidence_ids": ["xy844-current-pr-review"], + "summary": "The current PR review narrowed the blocker to unsupported live-adapter claims in the report." + } + ], + "prompt": { + "role": "user", + "content": "A PR review came in for XY-844. What is the active blocker, what is stale, and what should I do next?", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "avoid_stale_review_threads", + "state_exact_next_action" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_review_blocker", + "text": "The active review blocker is an unsupported live-adapter claim in the report." + }, + { + "claim_id": "review_next_action", + "text": "Mark agentmemory and claude-mem behavior as fixture-backed references while live adapters remain blocked or not_encoded, then rerun the suite." + }, + { + "claim_id": "stale_blocker", + "text": "The missing issue_transition blocker is stale." + } + ], + "must_not_include": [ + "Move XY-844 to In Progress.", + "agentmemory and claude-mem live adapters passed." + ], + "evidence_links": { + "current_review_blocker": ["xy844-current-pr-review"], + "review_next_action": ["xy844-current-pr-review"], + "stale_blocker": ["xy844-current-pr-review"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-current-pr-review", + "claim_id": "current_review_blocker", + "requirement": "cite", + "quote": "the active review blocker is an unsupported live-adapter claim in the real-world job report" + }, + { + "evidence_id": "xy844-current-pr-review", + "claim_id": "review_next_action", + "requirement": "use", + "quote": "agentmemory hooks and claude-mem viewer behavior are fixture-backed references" + } + ], + "negative_traps": [ + { + "trap_id": "resolved-issue-transition-review", + "type": "stale_fact", + "evidence_ids": ["xy844-resolved-review-blocker"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Names active review blocker and next action." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses current PR review evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids resolved issue_transition blocker." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Gives the report edit and rerun path." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed review recovery should be immediate." + } + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "pr_review", + "agentmemory_reference", + "claude_mem_reference", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_stale_worktree.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_stale_worktree.json new file mode 100644 index 00000000..32f7852f --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_stale_worktree.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-stale-worktree-001", + "suite": "work_resume", + "title": "Resume a retained lane with stale worktree blocker evidence", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy844-current-worktree", + "kind": "trace", + "text": "Current XY-844 worktree evidence: branch y/elf-xy-844 has only .decodex-run-activity and .decodex-run-control as untracked runtime artifacts. The lane stopped before adding fixture-backed JSON jobs under apps/elf-eval/fixtures/real_world_memory/work_resume/. The exact next action is to add those fixtures and update the real_world_job runner/report tests.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_stale_worktree", + "evidence_id": "xy844-current-worktree" + } + }, + "created_at": "2026-06-09T08:00:00Z" + }, + { + "evidence_id": "xy844-stale-worktree-blocker", + "kind": "trace", + "text": "Old retained-lane note: XY-844 is blocked by untracked Decodex runtime files and should stop for manual cleanup.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_stale_worktree", + "evidence_id": "xy844-stale-worktree-blocker" + } + }, + "created_at": "2026-06-08T08:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "Resume XY-844 on branch y/elf-xy-844. The stale blocker is the old claim that untracked Decodex runtime files require manual cleanup; current evidence says those files are runtime artifacts. The exact next action is to add fixture-backed JSON jobs under apps/elf-eval/fixtures/real_world_memory/work_resume/ and update the runner/report tests.", + "claims": [ + { + "claim_id": "next_action", + "text": "Add fixture-backed JSON jobs under apps/elf-eval/fixtures/real_world_memory/work_resume/ and update the runner/report tests.", + "evidence_ids": ["xy844-current-worktree"], + "confidence": "high" + }, + { + "claim_id": "stale_blocker", + "text": "The old untracked-runtime-files blocker is stale.", + "evidence_ids": ["xy844-current-worktree"], + "confidence": "high" + } + ], + "evidence_ids": ["xy844-current-worktree"], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-old-worktree-stop", + "ts": "2026-06-08T08:00:00Z", + "actor": "agent", + "action": "recorded_stale_blocker", + "evidence_ids": ["xy844-stale-worktree-blocker"], + "summary": "An older retained note treated untracked Decodex runtime files as a blocker." + }, + { + "event_id": "xy844-current-worktree-readback", + "ts": "2026-06-09T08:00:00Z", + "actor": "agent", + "action": "read_current_worktree", + "evidence_ids": ["xy844-current-worktree"], + "summary": "The current worktree readback narrowed the real next action to fixture and runner updates." + } + ], + "prompt": { + "role": "user", + "content": "What stopped this XY-844 lane, what is the exact next action, and which blocker is stale?", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "avoid_repeating_completed_work", + "state_blockers" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "next_action", + "text": "Add fixture-backed JSON jobs under apps/elf-eval/fixtures/real_world_memory/work_resume/ and update the runner/report tests." + }, + { + "claim_id": "stale_blocker", + "text": "The old untracked-runtime-files blocker is stale." + } + ], + "must_not_include": [ + "Stop for manual cleanup of .decodex runtime files.", + "The untracked Decodex runtime files are the current blocker." + ], + "evidence_links": { + "next_action": ["xy844-current-worktree"], + "stale_blocker": ["xy844-current-worktree"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-current-worktree", + "claim_id": "next_action", + "requirement": "cite", + "quote": "The exact next action is to add those fixtures and update the real_world_job runner/report tests." + }, + { + "evidence_id": "xy844-current-worktree", + "claim_id": "stale_blocker", + "requirement": "use", + "quote": "only .decodex-run-activity and .decodex-run-control as untracked runtime artifacts" + } + ], + "negative_traps": [ + { + "trap_id": "stale-runtime-artifact-blocker", + "type": "stale_fact", + "evidence_ids": ["xy844-stale-worktree-blocker"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Includes what stopped the lane and the exact current next action." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses the current worktree evidence for required claims." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids stale blocker evidence." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Advances the lane without asking for unnecessary cleanup." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed answer should be immediate." + } + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "worktree_resume", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 643572d5..97665594 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -18,7 +18,7 @@ use elf_cli::VERSION; const JOB_SCHEMA: &str = "elf.real_world_job/v1"; const REPORT_SCHEMA: &str = "elf.real_world_job_report/v1"; -const DEFAULT_FIXTURE_PATH: &str = "apps/elf-eval/fixtures/real_world_job/smoke"; +const DEFAULT_FIXTURE_PATH: &str = "apps/elf-eval/fixtures/real_world_memory/work_resume"; const DEFAULT_REPORT_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.json"; const DEFAULT_MARKDOWN_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.md"; const DEFAULT_RUN_ID: &str = "real-world-job-smoke"; @@ -119,6 +119,8 @@ struct Corpus { profile: CorpusProfile, #[serde(default)] items: Vec, + #[serde(default)] + capture_behaviors: CaptureIntegrationReport, adapter_response: Option, } @@ -422,6 +424,7 @@ struct RealWorldReport { runner_version: String, corpus_profile: String, adapter: AdapterReport, + capture_integration: CaptureIntegrationReport, summary: ReportSummary, suites: Vec, jobs: Vec, @@ -444,6 +447,22 @@ struct AdapterReport { notes: String, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct CaptureIntegrationReport { + #[serde(default)] + real: Vec, + #[serde(default)] + fixture_backed: Vec, + #[serde(default)] + mocked: Vec, + #[serde(default)] + blocked: Vec, + #[serde(default)] + not_encoded: Vec, + #[serde(default)] + notes: Vec, +} + #[derive(Debug, Default, Deserialize, Serialize)] struct ReportSummary { job_count: usize, @@ -677,6 +696,7 @@ struct FailureCounts { stale_answers: usize, conflict_detection_missing: usize, update_rationale_missing: usize, + latency_violations: usize, } #[derive(Debug, Default)] @@ -942,6 +962,7 @@ fn validate_expected_answer(job: &RealWorldJob, path: &Path) -> Result<()> { fn validate_required_evidence(job: &RealWorldJob, path: &Path) -> Result<()> { let evidence_ids = corpus_evidence_ids(job); + let corpus_text = corpus_text_by_id(job); for evidence in &job.required_evidence { if evidence.claim_id.trim().is_empty() || evidence.requirement.trim().is_empty() { @@ -957,6 +978,17 @@ fn validate_required_evidence(job: &RealWorldJob, path: &Path) -> Result<()> { evidence.evidence_id )); } + + if let Some(quote) = &evidence.quote + && let Some(text) = corpus_text.get(evidence.evidence_id.as_str()) + && !text.contains(quote) + { + return Err(eyre::eyre!( + "{} required evidence quote for {} is not present in corpus text.", + path.display(), + evidence.evidence_id + )); + } } for (claim_id, link) in &job.expected_answer.evidence_links { if claim_id.trim().is_empty() { @@ -1254,6 +1286,14 @@ fn corpus_evidence_ids(job: &RealWorldJob) -> BTreeSet { job.corpus.items.iter().map(|item| item.evidence_id.clone()).collect() } +fn corpus_text_by_id(job: &RealWorldJob) -> BTreeMap<&str, &str> { + job.corpus + .items + .iter() + .filter_map(|item| item.text.as_deref().map(|text| (item.evidence_id.as_str(), text))) + .collect() +} + fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result { if jobs.is_empty() { return Err(eyre::eyre!("At least one real_world_job fixture is required.")); @@ -1286,6 +1326,7 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result JobScoring { let missing_evidence = missing_required_evidence(job, &produced_evidence); let mut unsupported_claims = unsupported_claims(job, answer); let operator_counts = operator_debug_failure_counts(job); + let latency_violations = latency_violations(job, answer); let hard_fail_hits = hard_fail_hits(job, &unsupported_claims, &trap_ids_used); let evolution = evolution_job_report(job, answer, &trap_ids_used, forbidden_claims.len()); let stale_answers = evolution.as_ref().map_or(0, |report| report.stale_answer_count); @@ -1347,6 +1389,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring { stale_answers, conflict_detection_missing, update_rationale_missing, + latency_violations, }; let dimension_scores = dimension_scores(job, &counts); let normalized_score = normalized_score(&dimension_scores); @@ -1735,7 +1778,8 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) || counts.operator_debug_missing > 0 || counts.operator_debug_raw_sql > 0 || counts.operator_debug_trace_gaps > 0, - "latency_resource" | "personalization_fit" => + "latency_resource" => counts.latency_violations > 0, + "personalization_fit" | "ownership_correctness" => counts.missing_claims > 0 || counts.unsupported_claims > 0, _ => counts.missing_claims > 0 || counts.unsupported_claims > 0 || counts.trap_uses > 0, }; @@ -1743,6 +1787,25 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) if failed { 0.0 } else { max_points } } +fn latency_violations(job: &RealWorldJob, answer: &ProducedAnswer) -> usize { + let Some(max_latency_ms) = latency_threshold_ms(job) else { + return 0; + }; + let Some(latency_ms) = answer.latency_ms else { + return 1; + }; + + usize::from(latency_ms > max_latency_ms) +} + +fn latency_threshold_ms(job: &RealWorldJob) -> Option { + job.scoring_rubric + .dimensions + .get("latency_resource") + .and_then(|dimension| dimension.criteria.get("max_latency_ms")) + .and_then(Value::as_f64) +} + fn normalized_score(scores: &[DimensionScoreReport]) -> f64 { let total_weight = scores.iter().map(|score| score.weight).sum::(); @@ -1775,7 +1838,7 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 match status { TypedStatus::Pass => format!("Job passed with normalized_score {normalized_score:.3}."), TypedStatus::UnsupportedClaim => format!( - "Job produced {} unsupported claim(s), {} wrong-result signal(s), and normalized_score {normalized_score:.3}.", + "Job produced {} unsupported claim(s), {} wrong-result signal(s), {} latency violation(s), and normalized_score {normalized_score:.3}.", counts.unsupported_claims, counts.missing_claims + counts.forbidden_claims @@ -1786,10 +1849,11 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 + counts.operator_debug_trace_gaps + counts.operator_debug_repair_unclear + counts.conflict_detection_missing - + counts.update_rationale_missing + + counts.update_rationale_missing, + counts.latency_violations ), TypedStatus::WrongResult => format!( - "Job produced {} wrong-result signal(s) and normalized_score {normalized_score:.3}.", + "Job produced {} wrong-result signal(s), {} latency violation(s), and normalized_score {normalized_score:.3}.", counts.missing_claims + counts.forbidden_claims + counts.missing_evidence @@ -1799,7 +1863,8 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 + counts.operator_debug_trace_gaps + counts.operator_debug_repair_unclear + counts.conflict_detection_missing - + counts.update_rationale_missing + + counts.update_rationale_missing, + counts.latency_violations ), _ => "Job did not reach a runnable scoring state.".to_string(), } @@ -2244,6 +2309,42 @@ fn adapter_report(args: &RunArgs) -> AdapterReport { } } +fn capture_integration_report(jobs: &[RealWorldJob]) -> CaptureIntegrationReport { + let mut report = CaptureIntegrationReport::default(); + + for job in jobs { + extend_unique(&mut report.real, &job.corpus.capture_behaviors.real); + extend_unique(&mut report.fixture_backed, &job.corpus.capture_behaviors.fixture_backed); + extend_unique(&mut report.mocked, &job.corpus.capture_behaviors.mocked); + extend_unique(&mut report.blocked, &job.corpus.capture_behaviors.blocked); + extend_unique(&mut report.not_encoded, &job.corpus.capture_behaviors.not_encoded); + extend_unique(&mut report.notes, &job.corpus.capture_behaviors.notes); + } + + if report.real.is_empty() + && report.fixture_backed.is_empty() + && report.mocked.is_empty() + && report.blocked.is_empty() + && report.not_encoded.is_empty() + { + report + .not_encoded + .push("No capture/integration behavior was declared by encoded fixtures.".to_string()); + } + + report +} + +fn extend_unique(target: &mut Vec, values: &[String]) { + let mut seen = target.iter().cloned().collect::>(); + + for value in values { + if seen.insert(value.clone()) { + target.push(value.clone()); + } + } +} + fn private_corpus_redaction(jobs: &[RealWorldJob]) -> PrivateCorpusRedaction { let private_fixture_count = jobs .iter() @@ -2264,6 +2365,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { let mut out = String::new(); render_markdown_header(&mut out, report, report_path.as_str()); + render_markdown_capture_integration(&mut out, report); render_markdown_suites(&mut out, report); render_markdown_jobs(&mut out, report); render_markdown_operator_debugging(&mut out, report); @@ -2275,6 +2377,40 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { out } +fn render_markdown_capture_integration(out: &mut String, report: &RealWorldReport) { + out.push_str("## Capture And Integration Coverage\n\n"); + out.push_str("The real-world job runner is fixture-backed. This section separates encoded evidence from live adapter claims.\n\n"); + out.push_str("| Class | Behaviors |\n"); + out.push_str("| --- | --- |\n"); + out.push_str(&format!("| real | {} |\n", md_list(report.capture_integration.real.as_slice()))); + out.push_str(&format!( + "| fixture-backed | {} |\n", + md_list(report.capture_integration.fixture_backed.as_slice()) + )); + out.push_str(&format!( + "| mocked | {} |\n", + md_list(report.capture_integration.mocked.as_slice()) + )); + out.push_str(&format!( + "| blocked | {} |\n", + md_list(report.capture_integration.blocked.as_slice()) + )); + out.push_str(&format!( + "| not encoded | {} |\n", + md_list(report.capture_integration.not_encoded.as_slice()) + )); + + if !report.capture_integration.notes.is_empty() { + out.push_str("\nNotes:\n"); + + for note in &report.capture_integration.notes { + out.push_str(&format!("- {}\n", md_cell(note.as_str()))); + } + } + + out.push('\n'); +} + fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_path: &str) { out.push_str("# Real-World Job Benchmark Report\n\n"); out.push_str( @@ -2284,7 +2420,7 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat "Read this when: You need a durable smoke report for real-world agent memory job fixtures.\n", ); out.push_str(&format!("Inputs: `{}`.\n", md_inline(report_path))); - out.push_str("Depends on: `apps/elf-eval/fixtures/real_world_job/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`.\n"); + out.push_str("Depends on: `apps/elf-eval/fixtures/real_world_memory/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`.\n"); out.push_str( "Verification: Compare this Markdown summary with the source JSON before committing.\n\n", ); @@ -2734,6 +2870,14 @@ fn md_url(value: &str) -> String { value.replace(')', "%29").replace(' ', "%20") } +fn md_list(values: &[String]) -> String { + if values.is_empty() { + return "-".to_string(); + } + + md_cell(values.join("; ").as_str()) +} + fn round3(value: f64) -> f64 { (value * 1_000.0).round() / 1_000.0 } diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index db644110..bcd04139 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -12,11 +12,14 @@ use color_eyre::{Result, eyre}; use serde_json::Value; fn fixture_dir() -> PathBuf { - Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join("real_world_job").join("smoke") + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("fixtures") + .join("real_world_memory") + .join("work_resume") } fn fixture_root() -> PathBuf { - Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join("real_world_job") + Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join("real_world_memory") } fn real_world_memory_fixture_dir() -> PathBuf { @@ -28,7 +31,10 @@ fn evolution_fixture_dir() -> PathBuf { } fn operator_debug_fixture_dir() -> PathBuf { - fixture_root().join("operator_debugging_ux") + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("fixtures") + .join("real_world_job") + .join("operator_debugging_ux") } fn run_json_report_from(fixtures: PathBuf) -> Result { @@ -82,17 +88,18 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { report.pointer("/schema").and_then(Value::as_str), Some("elf.real_world_job_report/v1") ); - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); let jobs = array_at(&report, "/jobs")?; - let job = find_by_field(jobs, "/job_id", "work-resume-smoke-001")?; + let job = find_by_field(jobs, "/job_id", "work-resume-stale-worktree-001")?; assert_eq!(job.pointer("/suite_id").and_then(Value::as_str), Some("work_resume")); assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(job.pointer("/latency_ms").and_then(Value::as_f64), Some(1.2)); + assert_eq!(job.pointer("/latency_ms").and_then(Value::as_f64), Some(2.0)); assert_eq!(job.pointer("/cost/amount").and_then(Value::as_f64), Some(0.0)); let expected_evidence = array_at(job, "/expected_evidence")?; @@ -100,15 +107,31 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { assert_eq!(expected_evidence.len(), 2); assert_eq!(produced_evidence.len(), 1); - assert_eq!(produced_evidence.first().and_then(Value::as_str), Some("issue-xy812-resume")); + assert_eq!(produced_evidence.first().and_then(Value::as_str), Some("xy844-current-worktree")); let suites = array_at(&report, "/suites")?; let encoded_suite = find_by_field(suites, "/suite_id", "work_resume")?; + let capture_suite = find_by_field(suites, "/suite_id", "capture_integration")?; let unencoded_suite = find_by_field(suites, "/suite_id", "retrieval")?; assert_eq!(encoded_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(encoded_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(capture_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(capture_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(1)); assert_eq!(unencoded_suite.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + let capture_fixture_backed = array_at(&report, "/capture_integration/fixture_backed")?; + + assert!(capture_fixture_backed.iter().any(|value| { + value.as_str().is_some_and(|item| item.contains("agentmemory-style hook capture")) + })); + + let capture_not_encoded = array_at(&report, "/capture_integration/not_encoded")?; + + assert!(capture_not_encoded.iter().any(|value| { + value.as_str().is_some_and(|item| item.contains("No live external hook ingestion")) + })); + Ok(()) } @@ -116,12 +139,7 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); - - let suites = array_at(&report, "/suites")?; - let operator_suite = find_by_field(suites, "/suite_id", "operator_debugging_ux")?; - - assert_eq!(operator_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(15)); Ok(()) } @@ -191,8 +209,10 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("# Real-World Job Benchmark Report")); assert!(markdown.contains("work_resume")); - assert!(markdown.contains("issue-xy812-resume")); - assert!(markdown.contains("## Operator Debugging UX")); + assert!(markdown.contains("Capture And Integration Coverage")); + assert!(markdown.contains("fixture-backed")); + assert!(markdown.contains("agentmemory-style hook capture")); + assert!(markdown.contains("xy844-current-worktree")); assert!(markdown.contains("Existing live-baseline reports remain valid")); Ok(()) @@ -202,8 +222,8 @@ fn generated_json_report_renders_markdown() -> Result<()> { fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Result<()> { let report = run_json_report_from(real_world_memory_fixture_dir())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(9)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(8)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(15)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(14)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(0)); @@ -221,8 +241,8 @@ fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Resu Some(1) ); assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(2)); assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), @@ -234,16 +254,18 @@ fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Resu ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(19) + Some(33) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(17)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.895)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.895)); - assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.895)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(31)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.939)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.939)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.939)); let suites = array_at(&report, "/suites")?; - for suite_id in ["trust_source_of_truth", "capture_integration", "personalization"] { + for suite_id in + ["trust_source_of_truth", "work_resume", "capture_integration", "personalization"] + { let suite = find_by_field(suites, "/suite_id", suite_id)?; assert_eq!(suite.pointer("/status").and_then(Value::as_str), Some("pass")); diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index e5a05968..abb29e0b 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -297,8 +297,8 @@ To run the checked-in real-world job smoke fixture and render its Markdown repor cargo make real-world-job-smoke ``` -To run the checked-in trust, source-of-truth, lifecycle, redaction, and personalization -real-world memory fixtures: +To run the checked-in work-resume, source-of-truth, lifecycle, redaction, +capture-boundary, and personalization real-world memory fixtures: ```sh cargo make real-world-memory @@ -313,10 +313,14 @@ tmp/real-world-memory/real-world-memory-report.json tmp/real-world-memory/real-world-memory-report.md ``` -The smoke fixture lives under `apps/elf-eval/fixtures/real_world_job/smoke/` and uses +The smoke fixture suite lives under +`apps/elf-eval/fixtures/real_world_memory/work_resume/` and uses `docs/spec/real_world_agent_memory_benchmark_v1.md` status terms, including -`unsupported_claim`. Suites without checked-in jobs are reported as `not_encoded`. -The trust/personalization fixture set lives under +`unsupported_claim`. The checked-in slice includes work-resume continuity jobs and one +capture/integration boundary job. Suites without checked-in jobs are reported as +`not_encoded`. + +The broader real-world memory fixture set lives under `apps/elf-eval/fixtures/real_world_memory/` and adds summary counters for evidence coverage, source-ref coverage, quote coverage, stale retrievals, scope correctness, redaction leaks, and Qdrant rebuild coverage. diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 6f9539b4..a206a6c0 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -118,14 +118,18 @@ Current checked-in smoke increment: cargo make real-world-job-smoke ``` -This parses `apps/elf-eval/fixtures/real_world_job/smoke/`, writes +This parses `apps/elf-eval/fixtures/real_world_memory/work_resume/`, writes `tmp/real-world-job/real-world-job-smoke-report.json`, and renders -`tmp/real-world-job/real-world-job-smoke-report.md`. The smoke report includes suite -id, job id, expected evidence, produced answer/evidence, unsupported-claim count, -wrong-result count, latency/cost fields when available, and typed suite/job statuses. -Untouched suites remain `not_encoded`. +`tmp/real-world-job/real-world-job-smoke-report.md`. -Current checked-in trust and personalization increment: +The checked-in fixture slice covers stale worktree resume, Decodex/Linear lane status, +failed command recovery, PR review blocker recovery, exact next-action extraction, and +cross-tool capture boundaries. The smoke report includes suite id, job id, expected +evidence, produced answer/evidence, unsupported-claim count, wrong-result count, +latency/cost fields when available, capture/integration behavior classes, and typed +suite/job statuses. Untouched suites remain `not_encoded`. + +Current checked-in full real-world memory increment: ```sh cargo make real-world-memory @@ -139,18 +143,22 @@ The suite currently encodes: - `trust_source_of_truth`: evidence binding, source refs, and Qdrant rebuild from Postgres-held chunk embeddings before answering. +- `work_resume`: stale worktree resume, Decodex/Linear lane status, failed command + recovery, PR review blocker recovery, and exact next-action extraction. - `memory_evolution`: TTL/delete suppression plus current-versus-historical preference, issue status, deployment method, benchmark conclusion, and temporal relation cases. -- `capture_integration`: write-policy audit behavior for redaction/private exclusion. +- `capture_integration`: write-policy audit behavior for redaction/private exclusion + and fixture-backed capture/integration boundary classification. - `personalization`: scoped stable preference correction without temporary or cross-project preference leakage. The generated report includes evidence coverage, source-ref coverage, quote coverage, unsupported-claim count, stale retrieval count, stale-answer count, conflict detection count, update rationale availability, temporal validity `not_encoded` count, scope -correctness, redaction leak count, and Qdrant rebuild case/pass counts. The fixtures -include negative traps for unsupported prior claims, stale deleted facts, stale -historical facts, cross-project preference leakage, and private/redacted text leakage. +correctness, redaction leak count, capture/integration behavior classes, and Qdrant +rebuild case/pass counts. The fixtures include negative traps for stale blockers, +unsupported prior claims, stale deleted facts, stale historical facts, cross-project +preference leakage, and private/redacted text leakage. Narrow memory evolution increment: diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 8b7552a7..3baf6d43 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -113,6 +113,15 @@ Each `items[]` entry MUST include: - `source_ref`: object; MAY be `{}` only for generated synthetic fixtures. - `created_at`: RFC3339 timestamp or `null` when time is intentionally irrelevant. +Optional corpus fields: + +- `capture_behaviors`: object used by `capture_integration` jobs and fixture-backed + suites to classify integration evidence. Supported arrays are `real`, + `fixture_backed`, `mocked`, `blocked`, `not_encoded`, and `notes`. + `fixture_backed` means the behavior is represented by checked-in fixture evidence, + not by a live adapter pass. Reports MUST NOT convert `fixture_backed`, `mocked`, + `blocked`, or `not_encoded` behavior into a live integration success claim. + Private corpus fixtures MUST use sanitized inline text or local refs excluded from git. Reports MAY publish evidence ids and score summaries without publishing private text. @@ -376,6 +385,9 @@ Reports MUST include: - unsupported claim list with claim text or a bounded redacted description; - explicit `not_encoded` suite list; - private-corpus redaction policy when private fixtures are used. +- capture/integration coverage classes when any fixture declares `capture_behaviors`, + preserving the `real`, `fixture_backed`, `mocked`, `blocked`, and `not_encoded` + distinction. Reports that encode `memory_evolution` jobs SHOULD also include stale-answer counts, conflict detection counts, update rationale availability, and temporal-validity