diff --git a/Makefile.toml b/Makefile.toml index f836e027..d35f6b74 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -406,6 +406,9 @@ args = [ # | real-world-job-operator-ux | composite | | # | real-world-job-operator-ux-json | command | | # | real-world-job-operator-ux-report | command | | +# | real-world-memory-retrieval | composite | | +# | real-world-memory-retrieval-json | command | | +# | real-world-memory-retrieval-report | command | | [tasks.real-world-job-smoke] workspace = false @@ -597,6 +600,55 @@ args = [ "tmp/real-world-job/real-world-job-operator-ux-report.md", ] +[tasks.real-world-memory-retrieval] +workspace = false +dependencies = [ + "real-world-memory-retrieval-report", +] + +[tasks.real-world-memory-retrieval-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/retrieval", + "--run-id", + "real-world-memory-retrieval", + "--adapter-id", + "fixture_retrieval", + "--adapter-name", + "ELF fixture retrieval cases", + "--out", + "tmp/real-world-memory/retrieval-report.json", +] + +[tasks.real-world-memory-retrieval-report] +workspace = false +dependencies = [ + "real-world-memory-retrieval-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/retrieval-report.json", + "--out", + "tmp/real-world-memory/retrieval-report.md", +] + # Meta # | task | type | cwd | diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/alternate_phrasing.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/alternate_phrasing.json new file mode 100644 index 00000000..c939fb62 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/alternate_phrasing.json @@ -0,0 +1,173 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "retrieval-alt-phrasing-001", + "suite": "retrieval", + 
"title": "Recover current handoff evidence from alternate phrasing", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy840-current-handoff", + "kind": "issue", + "text": "XY-840 trace schema lane uses branch y/elf-xy-840. Before review handoff, run `cargo make checks` after the trace schema update is complete.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "alternate_phrasing", + "evidence_id": "xy840-current-handoff" + } + }, + "created_at": "2026-06-09T01:00:00Z" + }, + { + "evidence_id": "xy840-old-handoff-trap", + "kind": "decision", + "text": "Old note: XY-840 used branch y/elf-old-840 and only needed `cargo make test` before handoff.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "alternate_phrasing", + "evidence_id": "xy840-old-handoff-trap" + } + }, + "created_at": "2026-06-08T01:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "Use branch y/elf-xy-840 for XY-840 and run `cargo make checks` before review handoff.", + "claims": [ + { + "claim_id": "branch", + "text": "Use branch y/elf-xy-840 for XY-840.", + "evidence_ids": ["xy840-current-handoff"], + "confidence": "high" + }, + { + "claim_id": "gate", + "text": "Run `cargo make checks` before review handoff.", + "evidence_ids": ["xy840-current-handoff"], + "confidence": "high" + } + ], + "evidence_ids": ["xy840-current-handoff"], + "latency_ms": 13.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy840-old-branch", + "ts": "2026-06-08T01:00:00Z", + "actor": "agent", + "action": "recorded_old_handoff", + "evidence_ids": ["xy840-old-handoff-trap"], + "summary": "An older handoff note referenced the wrong branch and a narrower gate." 
+ }, + { + "event_id": "xy840-current-handoff", + "ts": "2026-06-09T01:00:00Z", + "actor": "agent", + "action": "updated_handoff", + "evidence_ids": ["xy840-current-handoff"], + "summary": "The current handoff evidence changed the branch and validation gate." + } + ], + "prompt": { + "role": "user", + "content": "For the trace-schema handoff, which XY-840 branch and pre-review check do I need?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_stale_facts"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "branch", + "text": "Use branch y/elf-xy-840 for XY-840." + }, + { + "claim_id": "gate", + "text": "Run `cargo make checks` before review handoff." + } + ], + "must_not_include": [ + "Use branch y/elf-old-840 for XY-840.", + "Run `cargo make test` before review handoff." + ], + "evidence_links": { + "branch": ["xy840-current-handoff"], + "gate": ["xy840-current-handoff"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy840-current-handoff", + "claim_id": "branch", + "requirement": "cite", + "quote": "uses branch y/elf-xy-840" + }, + { + "evidence_id": "xy840-current-handoff", + "claim_id": "gate", + "requirement": "use", + "quote": "run `cargo make checks`" + } + ], + "negative_traps": [ + { + "trap_id": "old-xy840-handoff", + "type": "stale_fact", + "evidence_ids": ["xy840-old-handoff-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Returns the current branch and pre-review check." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the current handoff evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Ignores the stale branch and test-only gate." 
+ }, + "latency_resource": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Reports bounded fixture latency and no cost." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "retrieval", "alternate_phrasing", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/current_vs_obsolete.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/current_vs_obsolete.json new file mode 100644 index 00000000..d6880447 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/current_vs_obsolete.json @@ -0,0 +1,148 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "retrieval-current-vs-obsolete-001", + "suite": "retrieval", + "title": "Select current benchmark context over obsolete live-baseline claims", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "obsolete-live-baseline-win", + "kind": "decision", + "text": "Obsolete draft: top-k live baseline results alone prove real-world job suite wins.", + "source_ref": {}, + "created_at": "2026-06-08T04:00:00Z" + }, + { + "evidence_id": "current-real-world-boundary", + "kind": "decision", + "text": "Current policy: live-baseline reports remain valid for Docker retrieval and lifecycle checks, but they are not real-world job suite wins. 
Real-world job reports must be published separately.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "current_vs_obsolete", + "evidence_id": "current-real-world-boundary" + } + }, + "created_at": "2026-06-09T04:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "Use the current boundary: live-baseline reports stay valid for Docker retrieval and lifecycle evidence, but they are not real-world job suite wins; publish real-world job reports separately.", + "claims": [ + { + "claim_id": "current_boundary", + "text": "Live-baseline reports are not real-world job suite wins.", + "evidence_ids": ["current-real-world-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["current-real-world-boundary"], + "latency_ms": 15.7, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "obsolete-draft", + "ts": "2026-06-08T04:00:00Z", + "actor": "agent", + "action": "recorded_obsolete_policy", + "evidence_ids": ["obsolete-live-baseline-win"], + "summary": "A draft conflated live-baseline retrieval checks with real-world job wins." + }, + { + "event_id": "current-boundary", + "ts": "2026-06-09T04:00:00Z", + "actor": "agent", + "action": "updated_policy", + "evidence_ids": ["current-real-world-boundary"], + "summary": "The current policy separates live-baseline evidence from real-world job suite claims." + } + ], + "prompt": { + "role": "user", + "content": "Can I cite the live-baseline pass as a real-world job suite win?", + "job_mode": "answer", + "constraints": ["cite_evidence", "use_current_policy", "avoid_obsolete_context"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_boundary", + "text": "Live-baseline reports are not real-world job suite wins." 
+ } + ], + "must_not_include": [ + "Top-k live baseline results alone prove real-world job suite wins." + ], + "evidence_links": { + "current_boundary": ["current-real-world-boundary"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "current-real-world-boundary", + "claim_id": "current_boundary", + "requirement": "cite", + "quote": "they are not real-world job suite wins" + } + ], + "negative_traps": [ + { + "trap_id": "obsolete-suite-win", + "type": "stale_fact", + "evidence_ids": ["obsolete-live-baseline-win"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Answers with the current claim boundary." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the current policy evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids the obsolete top-k claim." + }, + "uncertainty_handling": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Does not hedge when sufficient current evidence exists." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "retrieval", "current_vs_obsolete", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/distractor_heavy.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/distractor_heavy.json new file mode 100644 index 00000000..819844b4 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/distractor_heavy.json @@ -0,0 +1,200 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "retrieval-distractor-heavy-001", + "suite": "retrieval", + "title": "Find provider stress evidence in a distractor-heavy corpus", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "elf-provider-stress-target", + "kind": "runbook", + "text": "For the ELF provider stress check, set ELF_BASELINE_PROJECTS=ELF and ELF_BASELINE_PROFILE=stress with provider embeddings. 
The expected report is the live baseline Docker report.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "distractor_heavy", + "evidence_id": "elf-provider-stress-target" + } + }, + "created_at": "2026-06-09T02:00:00Z" + }, + { + "evidence_id": "qmd-smoke-distractor", + "kind": "adapter_state", + "text": "qmd smoke uses a local collection and should not be described as the ELF provider stress run.", + "source_ref": {}, + "created_at": "2026-06-09T02:01:00Z" + }, + { + "evidence_id": "mem0-stress-distractor", + "kind": "adapter_state", + "text": "mem0 local FastEmbed stress evidence is not encoded for this provider profile.", + "source_ref": {}, + "created_at": "2026-06-09T02:02:00Z" + }, + { + "evidence_id": "openviking-install-distractor", + "kind": "adapter_state", + "text": "OpenViking local embedding install failure is an incomplete adapter state, not a provider stress pass.", + "source_ref": {}, + "created_at": "2026-06-09T02:03:00Z" + }, + { + "evidence_id": "private-manifest-distractor", + "kind": "runbook", + "text": "The private production manifest guard fails closed when ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is unset.", + "source_ref": {}, + "created_at": "2026-06-09T02:04:00Z" + }, + { + "evidence_id": "backfill-distractor", + "kind": "runbook", + "text": "The backfill profile defaults to ELF only and records resumable import evidence.", + "source_ref": {}, + "created_at": "2026-06-09T02:05:00Z" + }, + { + "evidence_id": "scale-distractor", + "kind": "runbook", + "text": "The scale profile has generated distractor notes but is not the provider stress profile.", + "source_ref": {}, + "created_at": "2026-06-09T02:06:00Z" + }, + { + "evidence_id": "smoke-distractor", + "kind": "runbook", + "text": "The smoke profile is the default quick matrix and should not be used as the stress command.", + "source_ref": {}, + "created_at": "2026-06-09T02:07:00Z" + }, + { + "evidence_id": 
"agentmemory-distractor", + "kind": "adapter_state", + "text": "agentmemory same-corpus retrieval passed with mocked storage but lifecycle did not pass.", + "source_ref": {}, + "created_at": "2026-06-09T02:08:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "Target ELF only with the stress profile: set ELF_BASELINE_PROJECTS=ELF and ELF_BASELINE_PROFILE=stress with provider embeddings.", + "claims": [ + { + "claim_id": "stress_target", + "text": "Set ELF_BASELINE_PROJECTS=ELF and ELF_BASELINE_PROFILE=stress.", + "evidence_ids": ["elf-provider-stress-target"], + "confidence": "high" + } + ], + "evidence_ids": ["elf-provider-stress-target"], + "latency_ms": 22.8, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "provider-stress-requested", + "ts": "2026-06-09T02:00:00Z", + "actor": "operator", + "action": "requested_provider_stress", + "evidence_ids": ["elf-provider-stress-target"], + "summary": "The operator requested the ELF provider stress profile, not a smoke or external adapter run." + } + ], + "prompt": { + "role": "user", + "content": "Which profile and project selector should I use for the provider-backed ELF stress run?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_adapter_parity_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "stress_target", + "text": "Set ELF_BASELINE_PROJECTS=ELF and ELF_BASELINE_PROFILE=stress." + } + ], + "must_not_include": [ + "Set ELF_BASELINE_PROJECTS=qmd.", + "Use the smoke profile as the stress run.", + "OpenViking passed the provider stress profile." 
+ ], + "evidence_links": { + "stress_target": ["elf-provider-stress-target"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "elf-provider-stress-target", + "claim_id": "stress_target", + "requirement": "cite", + "quote": "set ELF_BASELINE_PROJECTS=ELF and ELF_BASELINE_PROFILE=stress" + } + ], + "negative_traps": [ + { + "trap_id": "adapter-and-profile-distractors", + "type": "decoy_evidence", + "evidence_ids": [ + "qmd-smoke-distractor", + "mem0-stress-distractor", + "openviking-install-distractor", + "private-manifest-distractor", + "backfill-distractor", + "scale-distractor", + "smoke-distractor", + "agentmemory-distractor" + ], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Returns the exact ELF-only stress selector." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Uses the provider stress target evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids adapter, private-manifest, smoke, scale, and backfill decoys." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Reports bounded fixture latency and cost." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "retrieval", "distractor_heavy", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/minimal_sufficient_context.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/minimal_sufficient_context.json new file mode 100644 index 00000000..5c3e0bfe --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/minimal_sufficient_context.json @@ -0,0 +1,148 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "retrieval-minimal-context-001", + "suite": "retrieval", + "title": "Return minimal sufficient restore evidence without irrelevant context", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "restore-minimal-proof", + "kind": "runbook", + "text": "Minimal sufficient restore proof: note ingest returned ADD/remember, Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0, and search recovered the restored note.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "minimal_sufficient_context", + "evidence_id": "restore-minimal-proof" + } + }, + "created_at": "2026-06-09T05:00:00Z" + }, + { + "evidence_id": "restore-verbose-log-decoy", + "kind": "trace", + "text": "Verbose restore log includes port remapping, shell setup, and temporary script names that are not needed to answer the proof question.", + "source_ref": {}, + "created_at": "2026-06-09T05:01:00Z" + }, + { + "evidence_id": "provider-run-decoy", + "kind": "trace", + "text": "Provider stress run latency and embedding 
dimensions are unrelated to the minimal restore proof.", + "source_ref": {}, + "created_at": "2026-06-09T05:02:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "The minimal sufficient restore proof is that note ingest returned ADD/remember, Qdrant rebuild returned rebuilt_count=1 with zero missing vectors and zero errors, and search recovered the restored note.", + "claims": [ + { + "claim_id": "restore_minimal", + "text": "Note ingest returned ADD/remember, rebuild returned rebuilt_count=1 with zero missing vectors and zero errors, and search recovered the restored note.", + "evidence_ids": ["restore-minimal-proof"], + "confidence": "high" + } + ], + "evidence_ids": ["restore-minimal-proof"], + "latency_ms": 9.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "restore-proof-recorded", + "ts": "2026-06-09T05:00:00Z", + "actor": "agent", + "action": "published_restore_proof", + "evidence_ids": ["restore-minimal-proof"], + "summary": "The restore proof recorded the minimal required note ingest, rebuild, and recovered-search evidence." + } + ], + "prompt": { + "role": "user", + "content": "What is the minimal sufficient context proving the restore recovered memory?", + "job_mode": "answer", + "constraints": ["cite_evidence", "minimal_sufficient_context", "avoid_irrelevant_context"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "restore_minimal", + "text": "Note ingest returned ADD/remember, rebuild returned rebuilt_count=1 with zero missing vectors and zero errors, and search recovered the restored note." + } + ], + "must_not_include": [ + "Port remapping is required to prove restore correctness.", + "Provider stress latency is required to prove restore correctness." 
+ ], + "evidence_links": { + "restore_minimal": ["restore-minimal-proof"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "restore-minimal-proof", + "claim_id": "restore_minimal", + "requirement": "cite", + "quote": "Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0" + } + ], + "negative_traps": [ + { + "trap_id": "irrelevant-restore-context", + "type": "decoy_evidence", + "evidence_ids": ["restore-verbose-log-decoy", "provider-run-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States only the minimal restore proof." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the minimal proof evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids verbose logs and unrelated provider evidence." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps the answer compact enough for agent context use." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "retrieval", "minimal_sufficient_context", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/multi_hop_routing.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/multi_hop_routing.json new file mode 100644 index 00000000..bd2e6b8b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/multi_hop_routing.json @@ -0,0 +1,181 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "retrieval-multi-hop-routing-001", + "suite": "retrieval", + "title": "Answer a multi-hop benchmark routing question", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy845-landing-zone", + "kind": "issue", + "text": "XY-845 should add retrieval-quality real_world_job cases under apps/elf-eval/fixtures/real_world_memory/retrieval/ and extend the runner/report seams.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "multi_hop_routing", + "evidence_id": "xy845-landing-zone" + } + }, + "created_at": "2026-06-09T03:00:00Z" + }, + { + "evidence_id": "routing-reference-boundary", + "kind": "decision", + "text": "qmd and OpenViking are strong references for routing, fusion, hierarchical retrieval, and staged trajectory, but parity must not be claimed unless their adapters actually run.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "multi_hop_routing", + "evidence_id": "routing-reference-boundary" + } + }, + "created_at": 
"2026-06-09T03:01:00Z" + }, + { + "evidence_id": "ranking-tune-trap", + "kind": "decision", + "text": "Do not tune ELF ranking blindly to fixtures; ranking changes need trace and provenance evidence.", + "source_ref": {}, + "created_at": "2026-06-09T03:02:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "Add the new cases under apps/elf-eval/fixtures/real_world_memory/retrieval/ and extend the runner/report seams, while treating qmd and OpenViking only as references unless their adapters actually run.", + "claims": [ + { + "claim_id": "landing_zone", + "text": "Add retrieval-quality real_world_job cases under apps/elf-eval/fixtures/real_world_memory/retrieval/.", + "evidence_ids": ["xy845-landing-zone"], + "confidence": "high" + }, + { + "claim_id": "reference_boundary", + "text": "Treat qmd and OpenViking only as references unless their adapters actually run.", + "evidence_ids": ["routing-reference-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["xy845-landing-zone", "routing-reference-boundary"], + "latency_ms": 31.5, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "issue-route", + "ts": "2026-06-09T03:00:00Z", + "actor": "operator", + "action": "specified_landing_zone", + "evidence_ids": ["xy845-landing-zone"], + "summary": "The issue named the real_world_memory retrieval fixture path and runner/report seams." + }, + { + "event_id": "reference-boundary", + "ts": "2026-06-09T03:01:00Z", + "actor": "agent", + "action": "recorded_reference_boundary", + "evidence_ids": ["routing-reference-boundary"], + "summary": "External projects are design references, not benchmark passes without adapters." 
+ } + ], + "prompt": { + "role": "user", + "content": "How should XY-845 extend the benchmark while respecting the qmd/OpenViking reference boundary?", + "job_mode": "decide", + "constraints": ["cite_evidence", "avoid_unsupported_claims", "avoid_blind_ranking_tuning"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "landing_zone", + "text": "Add retrieval-quality real_world_job cases under apps/elf-eval/fixtures/real_world_memory/retrieval/." + }, + { + "claim_id": "reference_boundary", + "text": "Treat qmd and OpenViking only as references unless their adapters actually run." + } + ], + "must_not_include": [ + "Claim qmd parity from fixture-only output.", + "Claim OpenViking parity from fixture-only output.", + "Tune ELF ranking blindly to fixtures." + ], + "evidence_links": { + "landing_zone": ["xy845-landing-zone"], + "reference_boundary": ["routing-reference-boundary"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy845-landing-zone", + "claim_id": "landing_zone", + "requirement": "cite", + "quote": "apps/elf-eval/fixtures/real_world_memory/retrieval/" + }, + { + "evidence_id": "routing-reference-boundary", + "claim_id": "reference_boundary", + "requirement": "cite", + "quote": "parity must not be claimed unless their adapters actually run" + } + ], + "negative_traps": [ + { + "trap_id": "blind-ranking-tune", + "type": "unsafe_action", + "evidence_ids": ["ranking-tune-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Combines landing-zone and reference-boundary evidence." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites both required evidence ids." 
+ }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids blind ranking tuning and unsupported parity claims." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Gives an implementation route rather than a vague retrieval summary." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "retrieval", "multi_hop", "hierarchical_routing", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/stage_explainability_wrong_result.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/stage_explainability_wrong_result.json new file mode 100644 index 00000000..56dd2269 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/stage_explainability_wrong_result.json @@ -0,0 +1,206 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-stage-attribution-001", + "suite": "operator_debugging_ux", + "title": "Attribute a wrong result to the retrieval stage that demoted evidence", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stage-target", + "kind": "trace", + "text": "Correct trace evidence: the expected XY-845 retrieval fixture was present in recall.candidates, then demoted at rerank.score because a stale decoy shared more lexical terms.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stage_explainability_wrong_result", + "evidence_id": "stage-target" + } + }, + "created_at": "2026-06-09T06:00:00Z" + }, + { + "evidence_id": "stage-decoy", + "kind": "trace", + "text": 
"Stale decoy: XY-845 should only use top-k smoke checks; real-world retrieval fixtures are out of scope.", + "source_ref": {}, + "created_at": "2026-06-08T06:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "The trace shows the expected evidence was present in recall.candidates but demoted at rerank.score; however, the selected answer followed the stale top-k smoke-only evidence.", + "claims": [], + "evidence_ids": ["stage-decoy"], + "latency_ms": 18.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "66666666-6666-4666-8666-666666666666", + "failure_stage": "rerank.score", + "failure_reason": "Expected evidence survived candidate recall but was demoted below a stale decoy during rerank.", + "stages": [ + { + "stage_name": "rewrite.expansion", + "kept_evidence": [], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Alternate phrasing preserved the original intent." + }, + { + "stage_name": "recall.candidates", + "kept_evidence": ["stage-target", "stage-decoy"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": ["stage-decoy"], + "notes": "Candidate recall found both expected evidence and stale decoy evidence." + }, + { + "stage_name": "rerank.score", + "kept_evidence": ["stage-decoy"], + "dropped_evidence": [], + "demoted_evidence": ["stage-target"], + "distractor_evidence": ["stage-decoy"], + "notes": "The stale decoy outranked the expected evidence." + }, + { + "stage_name": "selection.final", + "kept_evidence": ["stage-decoy"], + "dropped_evidence": ["stage-target"], + "demoted_evidence": [], + "distractor_evidence": ["stage-decoy"], + "notes": "Final selection missed the required evidence." 
+ } + ] + } + } + } + }, + "timeline": [ + { + "event_id": "trace-candidate-recall", + "ts": "2026-06-09T06:00:00Z", + "actor": "agent", + "action": "inspected_trace", + "evidence_ids": ["stage-target"], + "summary": "The trace showed expected evidence at candidate recall and demotion at rerank." + }, + { + "event_id": "stale-decoy-ranked", + "ts": "2026-06-09T06:01:00Z", + "actor": "agent", + "action": "selected_wrong_context", + "evidence_ids": ["stage-decoy"], + "summary": "A stale decoy became the selected answer even though it was obsolete." + } + ], + "prompt": { + "role": "user", + "content": "Why did the wrong retrieval result happen, and which stage dropped or demoted the expected evidence?", + "job_mode": "debug", + "constraints": ["cite_evidence", "identify_retrieval_stage", "avoid_obsolete_context"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "stage_attribution", + "text": "Expected evidence was present in recall.candidates but demoted at rerank.score." + } + ], + "must_not_include": [ + "real-world retrieval fixtures are out of scope" + ], + "evidence_links": { + "stage_attribution": ["stage-target"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "stage-target", + "claim_id": "stage_attribution", + "requirement": "explain", + "quote": "present in recall.candidates, then demoted at rerank.score" + } + ], + "negative_traps": [ + { + "trap_id": "stale-top-k-only-decoy", + "type": "stale_fact", + "evidence_ids": ["stage-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Identifies the stage attribution without selecting the stale final answer." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses the expected trace evidence." 
+ }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not cite the stale top-k-only decoy." + }, + "debuggability": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Reports the stage that demoted expected evidence." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Turns the wrong result into actionable trace evidence." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "rerank_demoted_expected_evidence", + "trace_id": "66666666-6666-4666-8666-666666666666", + "viewer_url": "/viewer?trace_id=66666666-6666-4666-8666-666666666666", + "admin_trace_bundle_url": "/v2/admin/traces/66666666-6666-4666-8666-666666666666/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "The expected evidence survived recall.candidates but was demoted below a stale decoy during rerank.score.", + "steps_to_root_cause": 3, + "raw_sql_needed": false, + "dropped_candidate_visibility": "visible in trace_explainability rerank.score and selection.final stages", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Trace", "Retrieval Funnel", "Replay Candidates", "Stage Details"], + "cli_steps": [ + "open trace explainability bundle", + "compare recall.candidates with rerank.score", + "inspect selected stale decoy", + "repair rerank inputs or stale-context filtering" + ], + "trace_evidence": ["stage-target", "stage-decoy"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "trace_explainability", "wrong_result", "no_live_claim"] +} diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 97665594..d87202b7 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -346,6 +346,8 @@ struct ProducedAnswer { latency_ms: Option, #[serde(skip_serializing_if = "Option::is_none")] cost: Option, + #[serde(skip_serializing_if = "Option::is_none")] + trace_explainability: Option, } #[derive(Clone, Debug, Deserialize, Serialize)] @@ -404,6 +406,33 @@ struct OperatorUxGap { follow_up_issue: String, } +#[derive(Clone, Debug, Deserialize, Serialize)] +struct TraceExplainability { + #[serde(skip_serializing_if = "Option::is_none")] + trace_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + failure_stage: Option, + #[serde(skip_serializing_if = "Option::is_none")] + failure_reason: Option, + #[serde(default)] + stages: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct TraceStageExplainability { + stage_name: String, + #[serde(default)] + kept_evidence: Vec, + #[serde(default)] + dropped_evidence: Vec, + #[serde(default)] + demoted_evidence: Vec, + #[serde(default)] + distractor_evidence: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + notes: Option, +} + #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] enum TypedStatus { @@ -484,6 +513,13 @@ struct ReportSummary { update_rationale_available_count: usize, #[serde(default)] temporal_validity_not_encoded_count: usize, + expected_evidence_total: usize, + expected_evidence_matched: usize, + expected_evidence_recall: f64, + irrelevant_context_count: usize, + irrelevant_context_ratio: f64, + trace_explainability_count: usize, + wrong_result_stage_attribution_count: usize, mean_score: f64, mean_latency_ms: Option, total_cost: Option, @@ -547,6 +583,9 @@ struct SuiteReport { update_rationale_available_count: usize, #[serde(default)] 
temporal_validity_not_encoded_count: usize, + expected_evidence_recall: Option, + irrelevant_context_ratio: Option, + trace_explainability_count: usize, reason: String, } @@ -571,8 +610,10 @@ struct JobReport { update_rationale_available: bool, #[serde(default)] temporal_validity_not_encoded: bool, + retrieval_quality: RetrievalQualityReport, latency_ms: Option, cost: Option, + trace_explainability: Option, trap_ids_used: Vec, dimension_scores: Vec, reason: String, @@ -621,6 +662,17 @@ struct DimensionScoreReport { weight: f64, } +#[derive(Debug, Deserialize, Serialize)] +struct RetrievalQualityReport { + expected_evidence_total: usize, + expected_evidence_matched: usize, + expected_evidence_recall: f64, + produced_evidence_total: usize, + irrelevant_context_count: usize, + irrelevant_context_ratio: f64, + trap_context_count: usize, +} + #[derive(Clone, Debug, Deserialize, Serialize)] struct UnsupportedClaimReport { suite_id: String, @@ -818,6 +870,7 @@ fn validate_job(job: &RealWorldJob, path: &Path) -> Result<()> { validate_operator_debug(job, path)?; validate_job_encoding(job, path)?; validate_memory_evolution(job, path)?; + validate_trace_explainability(job, path)?; Ok(()) } @@ -1238,6 +1291,47 @@ fn validate_temporal_validity( Ok(()) } +fn validate_trace_explainability(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(trace) = job + .corpus + .adapter_response + .as_ref() + .and_then(|response| response.answer.trace_explainability.as_ref()) + else { + return Ok(()); + }; + let known = corpus_evidence_ids(job); + let stage_names = + trace.stages.iter().map(|stage| stage.stage_name.as_str()).collect::>(); + + if trace.trace_id.as_deref().is_some_and(str::is_empty) { + return Err(eyre::eyre!("{} has an empty trace_explainability trace_id.", path.display())); + } + if trace.failure_stage.as_deref().is_some_and(str::is_empty) { + return Err(eyre::eyre!( + "{} has an empty trace_explainability failure_stage.", + path.display() + )); + } + + if let 
Some(failure_stage) = trace.failure_stage.as_deref() + && !stage_names.is_empty() + && !stage_names.contains(failure_stage) + { + return Err(eyre::eyre!( + "{} trace_explainability failure_stage {} is not present in stages.", + path.display(), + failure_stage + )); + } + + for stage in &trace.stages { + validate_trace_stage(stage, &known, path)?; + } + + Ok(()) +} + fn validate_optional_debug_field(path: &Path, value: Option<&str>, field: &str) -> Result<()> { if value.is_some_and(|value| value.trim().is_empty()) { return Err(eyre::eyre!("{} has empty operator_debug {field}.", path.display())); @@ -1254,6 +1348,28 @@ fn validate_non_empty_debug_list(path: &Path, values: &[String], field: &str) -> Ok(()) } +fn validate_trace_stage( + stage: &TraceStageExplainability, + known: &BTreeSet, + path: &Path, +) -> Result<()> { + if stage.stage_name.trim().is_empty() { + return Err(eyre::eyre!("{} has a trace stage with an empty stage_name.", path.display())); + } + + for evidence_id in stage + .kept_evidence + .iter() + .chain(stage.dropped_evidence.iter()) + .chain(stage.demoted_evidence.iter()) + .chain(stage.distractor_evidence.iter()) + { + ensure_known_evidence(path, known, evidence_id)?; + } + + Ok(()) +} + fn validate_required_rfc3339(value: &str, path: &Path, id: &str) -> Result<()> { if OffsetDateTime::parse(value, &Rfc3339).is_err() { return Err(eyre::eyre!("{} has invalid RFC3339 timestamp for {}.", path.display(), id)); @@ -1477,6 +1593,7 @@ fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { evidence_ids: Vec::new(), latency_ms: None, cost: None, + trace_explainability: None, }) } @@ -1873,6 +1990,7 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { let answer = produced_answer(job); let metrics = job_metrics(job, answer); + let retrieval_quality = retrieval_quality_report(job, answer); JobReport { suite_id: job.suite.clone(), @@ -1902,8 +2020,10 @@ 
fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { .evolution .as_ref() .is_some_and(|report| report.temporal_validity_not_encoded), + retrieval_quality, latency_ms: answer.latency_ms, cost: answer.cost.clone(), + trace_explainability: answer.trace_explainability.clone(), trap_ids_used: scoring.trap_ids_used, dimension_scores: scoring.dimension_scores, reason: scoring.reason, @@ -2024,6 +2144,51 @@ fn answer_contains_corpus_item( .is_some_and(|text| !text.trim().is_empty() && answer.content.contains(text)) } +fn retrieval_quality_report(job: &RealWorldJob, answer: &ProducedAnswer) -> RetrievalQualityReport { + let expected = expected_evidence_ids(job); + let allowed = allowed_evidence_ids(job); + let produced = produced_evidence_ids(answer); + let trap_evidence = trap_evidence_ids(job); + let expected_evidence_matched = + expected.iter().filter(|evidence_id| produced.contains(evidence_id.as_str())).count(); + let irrelevant_context_count = + produced.iter().filter(|evidence_id| !allowed.contains(evidence_id.as_str())).count(); + let trap_context_count = + produced.iter().filter(|evidence_id| trap_evidence.contains(evidence_id.as_str())).count(); + + RetrievalQualityReport { + expected_evidence_total: expected.len(), + expected_evidence_matched, + expected_evidence_recall: ratio_or(expected_evidence_matched, expected.len(), 1.0), + produced_evidence_total: produced.len(), + irrelevant_context_count, + irrelevant_context_ratio: ratio_or(irrelevant_context_count, produced.len(), 0.0), + trap_context_count, + } +} + +fn expected_evidence_ids(job: &RealWorldJob) -> BTreeSet { + job.required_evidence + .iter() + .filter(|evidence| is_required_use(evidence)) + .map(|evidence| evidence.evidence_id.clone()) + .collect() +} + +fn allowed_evidence_ids(job: &RealWorldJob) -> BTreeSet { + let mut allowed = expected_evidence_ids(job); + + for link in job.expected_answer.evidence_links.values() { + allowed.extend(link.ids()); + } + + allowed +} + +fn 
trap_evidence_ids(job: &RealWorldJob) -> BTreeSet { + job.negative_traps.iter().flat_map(|trap| trap.evidence_ids.iter().cloned()).collect() +} + fn expected_evidence_report(job: &RealWorldJob) -> Vec { job.required_evidence .iter() @@ -2054,6 +2219,9 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport { conflict_detection_count: 0, update_rationale_available_count: 0, temporal_validity_not_encoded_count: 0, + expected_evidence_recall: None, + irrelevant_context_ratio: None, + trace_explainability_count: 0, reason: NOT_ENCODED_REASON.to_string(), }; } @@ -2068,6 +2236,8 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport { suite_jobs.iter().filter(|job| job.update_rationale_available).count(); let temporal_validity_not_encoded_count = suite_jobs.iter().filter(|job| job.temporal_validity_not_encoded).count(); + let trace_explainability_count = + suite_jobs.iter().filter(|job| job.trace_explainability.is_some()).count(); SuiteReport { suite_id: suite_id.to_string(), @@ -2080,6 +2250,9 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport { conflict_detection_count, update_rationale_available_count, temporal_validity_not_encoded_count, + expected_evidence_recall: Some(expected_evidence_recall_for_jobs(&suite_jobs)), + irrelevant_context_ratio: Some(irrelevant_context_ratio_for_jobs(&suite_jobs)), + trace_explainability_count, reason: suite_reason(status, suite_jobs.len()), } } @@ -2126,6 +2299,7 @@ fn suite_reason(status: TypedStatus, encoded_job_count: usize) -> String { } fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { + let job_refs = jobs.iter().collect::>(); let evidence_required_count = jobs.iter().map(|job| job.evidence_required_count).sum(); let evidence_covered_count = jobs.iter().map(|job| job.evidence_covered_count).sum(); let source_ref_required_count = jobs.iter().map(|job| job.source_ref_required_count).sum(); @@ -2150,6 +2324,31 @@ fn report_summary(jobs: &[JobReport], 
suites: &[SuiteReport]) -> ReportSummary { .iter() .filter(|job| job.temporal_validity_not_encoded) .count(), + expected_evidence_total: jobs + .iter() + .map(|job| job.retrieval_quality.expected_evidence_total) + .sum(), + expected_evidence_matched: jobs + .iter() + .map(|job| job.retrieval_quality.expected_evidence_matched) + .sum(), + expected_evidence_recall: expected_evidence_recall_for_jobs(&job_refs), + irrelevant_context_count: jobs + .iter() + .map(|job| job.retrieval_quality.irrelevant_context_count) + .sum(), + irrelevant_context_ratio: irrelevant_context_ratio_for_jobs(&job_refs), + trace_explainability_count: jobs + .iter() + .filter(|job| job.trace_explainability.is_some()) + .count(), + wrong_result_stage_attribution_count: jobs + .iter() + .filter(|job| { + job.status == TypedStatus::WrongResult + && trace_failure_stage(job.trace_explainability.as_ref()).is_some() + }) + .count(), mean_score: mean_score(jobs), mean_latency_ms: mean_latency(jobs), total_cost: total_cost(jobs), @@ -2243,6 +2442,26 @@ fn ratio(numerator: usize, denominator: usize) -> f64 { round3(numerator as f64 / denominator as f64) } +fn expected_evidence_recall_for_jobs(jobs: &[&JobReport]) -> f64 { + let total = jobs.iter().map(|job| job.retrieval_quality.expected_evidence_total).sum::(); + let matched = + jobs.iter().map(|job| job.retrieval_quality.expected_evidence_matched).sum::(); + + ratio_or(matched, total, 1.0) +} + +fn irrelevant_context_ratio_for_jobs(jobs: &[&JobReport]) -> f64 { + let total = jobs.iter().map(|job| job.retrieval_quality.produced_evidence_total).sum::(); + let irrelevant = + jobs.iter().map(|job| job.retrieval_quality.irrelevant_context_count).sum::(); + + ratio_or(irrelevant, total, 0.0) +} + +fn ratio_or(numerator: usize, denominator: usize, empty_value: f64) -> f64 { + if denominator == 0 { empty_value } else { round3(numerator as f64 / denominator as f64) } +} + fn mean_score(jobs: &[JobReport]) -> f64 { if jobs.is_empty() { return 0.0; @@ -2370,6 
+2589,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { render_markdown_jobs(&mut out, report); render_markdown_operator_debugging(&mut out, report); render_markdown_evolution(&mut out, report); + render_markdown_trace_explainability(&mut out, report); render_markdown_unsupported_claims(&mut out, report); render_markdown_follow_ups(&mut out, report); render_markdown_semantics(&mut out, report); @@ -2420,7 +2640,7 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat "Read this when: You need a durable smoke report for real-world agent memory job fixtures.\n", ); out.push_str(&format!("Inputs: `{}`.\n", md_inline(report_path))); - out.push_str("Depends on: `apps/elf-eval/fixtures/real_world_memory/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`.\n"); + out.push_str("Depends on: `apps/elf-eval/fixtures/real_world_job/`, `apps/elf-eval/fixtures/real_world_memory/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`.\n"); out.push_str( "Verification: Compare this Markdown summary with the source JSON before committing.\n\n", ); @@ -2493,6 +2713,21 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat "- Qdrant rebuild cases: `{}` encoded, `{}` pass\n", report.summary.qdrant_rebuild_case_count, report.summary.qdrant_rebuild_pass_count )); + out.push_str(&format!( + "- Expected evidence recall: `{:.3}` ({}/{})\n", + report.summary.expected_evidence_recall, + report.summary.expected_evidence_matched, + report.summary.expected_evidence_total + )); + out.push_str(&format!( + "- Irrelevant context ratio: `{:.3}` ({} irrelevant)\n", + report.summary.irrelevant_context_ratio, report.summary.irrelevant_context_count + )); + out.push_str(&format!( + "- Trace explainability: `{}` job(s), `{}` wrong-result stage attribution(s)\n", + report.summary.trace_explainability_count, + report.summary.wrong_result_stage_attribution_count + )); 
out.push_str(&format!("- Mean score: `{:.3}`\n", report.summary.mean_score)); out.push_str(&format!( "- Mean latency: `{}`\n", @@ -2518,17 +2753,20 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat fn render_markdown_suites(out: &mut String, report: &RealWorldReport) { out.push_str("## Suites\n\n"); out.push_str( - "| Suite | Status | Jobs | Score | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | Unsupported Claims | Wrong Results | Reason |\n", + "| Suite | Status | Jobs | Score | Evidence Recall | Irrelevant Context | Trace Explain | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | Unsupported Claims | Wrong Results | Reason |\n", ); - out.push_str("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n"); + out.push_str("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n"); for suite in &report.suites { out.push_str(&format!( - "| {} | `{}` | {} | `{}` | {} | {} | {} | {} | {} | {} | {} |\n", + "| {} | `{}` | {} | `{}` | `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} |\n", md_cell(suite.suite_id.as_str()), status_str(suite.status), suite.encoded_job_count, optional_f64(suite.score_mean, ""), + optional_f64(suite.expected_evidence_recall, ""), + optional_f64(suite.irrelevant_context_ratio, ""), + suite.trace_explainability_count, suite.stale_answer_count, suite.conflict_detection_count, suite.update_rationale_available_count, @@ -2544,9 +2782,9 @@ fn render_markdown_suites(out: &mut String, report: &RealWorldReport) { fn render_markdown_jobs(out: &mut String, report: &RealWorldReport) { out.push_str("## Jobs\n\n"); - out.push_str("| Suite | Job | Status | Score | Expected Evidence | Produced Evidence | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost |\n"); + out.push_str("| Suite | Job | Status | Score | Evidence Recall | Irrelevant Context | Expected Evidence 
| Produced Evidence | Trace Failure Stage | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost |\n"); out.push_str( - "| --- | --- | --- | ---: | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- |\n", + "| --- | --- | --- | ---: | ---: | ---: | --- | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- |\n", ); for job in &report.jobs { @@ -2559,13 +2797,16 @@ fn render_markdown_jobs(out: &mut String, report: &RealWorldReport) { let produced = job.produced_evidence.join(", "); out.push_str(&format!( - "| {} | {} | `{}` | `{:.3}` | `{}` | `{}` | {} | {} | `{}` | `{}` | {} | {} | `{}` | `{}` |\n", + "| {} | {} | `{}` | `{:.3}` | `{:.3}` | `{:.3}` | `{}` | `{}` | `{}` | {} | {} | `{}` | `{}` | {} | {} | `{}` | `{}` |\n", md_cell(job.suite_id.as_str()), md_cell(job.job_id.as_str()), status_str(job.status), job.normalized_score, + job.retrieval_quality.expected_evidence_recall, + job.retrieval_quality.irrelevant_context_ratio, md_inline(expected.as_str()), md_inline(produced.as_str()), + md_inline(trace_failure_stage(job.trace_explainability.as_ref()).unwrap_or("-")), job.stale_answer_count, job.conflict_detection_count, bool_display(job.update_rationale_available), @@ -2709,6 +2950,38 @@ fn render_markdown_evolution(out: &mut String, report: &RealWorldReport) { out.push('\n'); } +fn render_markdown_trace_explainability(out: &mut String, report: &RealWorldReport) { + out.push_str("## Trace Explainability\n\n"); + + let jobs = + report.jobs.iter().filter(|job| job.trace_explainability.is_some()).collect::>(); + + if jobs.is_empty() { + out.push_str("No encoded job reported trace explainability metadata.\n\n"); + + return; + } + + out.push_str("| Suite | Job | Trace | Failure Stage | Reason | Stage Evidence |\n"); + out.push_str("| --- | --- | --- | --- | --- | --- |\n"); + + for job in jobs { + let trace = job.trace_explainability.as_ref(); + + out.push_str(&format!( + "| {} | {} | 
`{}` | `{}` | {} | {} |\n", + md_cell(job.suite_id.as_str()), + md_cell(job.job_id.as_str()), + md_inline(trace.and_then(|trace| trace.trace_id.as_deref()).unwrap_or("-")), + md_inline(trace_failure_stage(trace).unwrap_or("-")), + md_cell(trace_failure_reason(trace).unwrap_or("-")), + md_cell(trace_stage_summary(trace).as_str()) + )); + } + + out.push('\n'); +} + fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport) { out.push_str("## Unsupported Claims\n\n"); @@ -2768,7 +3041,7 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { out.push_str("It is a real-world job fixture report, not a Docker live-baseline report.\n"); out.push_str("Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not reinterpreted as real-world suite wins.\n\n"); out.push_str( - "The summary counters report required evidence coverage, source-ref coverage, quote coverage, stale retrievals, scope violations, redaction leaks, Qdrant rebuild case coverage, stale answers, conflict detections, update rationale availability, and temporal validity gaps across encoded jobs.\n\n", + "The summary counters report required evidence coverage, source-ref coverage, quote coverage, expected evidence recall, irrelevant context ratio, trace explainability, stale retrievals, scope violations, redaction leaks, Qdrant rebuild case coverage, stale answers, conflict detections, update rationale availability, and temporal validity gaps across encoded jobs.\n\n", ); out.push_str( "- `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule.\n", @@ -2801,6 +3074,36 @@ fn status_str(status: TypedStatus) -> &'static str { } } +fn trace_failure_stage(trace: Option<&TraceExplainability>) -> Option<&str> { + trace.and_then(|trace| trace.failure_stage.as_deref()) +} + +fn trace_failure_reason(trace: Option<&TraceExplainability>) -> Option<&str> { + trace.and_then(|trace| 
trace.failure_reason.as_deref()) +} + +fn trace_stage_summary(trace: Option<&TraceExplainability>) -> String { + let Some(trace) = trace else { + return "-".to_string(); + }; + let stages = trace + .stages + .iter() + .map(|stage| { + format!( + "{} kept={} demoted={} dropped={} distractors={}", + stage.stage_name, + stage.kept_evidence.join("+"), + stage.demoted_evidence.join("+"), + stage.dropped_evidence.join("+"), + stage.distractor_evidence.join("+") + ) + }) + .collect::>(); + + if stages.is_empty() { "-".to_string() } else { stages.join("; ") } +} + fn write_or_print(path: Option<&Path>, content: &str) -> Result<()> { if let Some(path) = path { if let Some(parent) = path.parent() diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index bcd04139..3b09e622 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -37,6 +37,13 @@ fn operator_debug_fixture_dir() -> PathBuf { .join("operator_debugging_ux") } +fn retrieval_fixture_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("fixtures") + .join("real_world_memory") + .join("retrieval") +} + fn run_json_report_from(fixtures: PathBuf) -> Result { let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) .arg("run") @@ -139,7 +146,7 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(15)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(21)); Ok(()) } @@ -219,14 +226,24 @@ fn generated_json_report_renders_markdown() -> Result<()> { } #[test] -fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Result<()> { +fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { let report = 
run_json_report_from(real_world_memory_fixture_dir())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(15)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(14)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(21)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(19)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(3)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(0.912) + ); + assert_eq!( + report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), + Some(0.028) + ); + assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), @@ -254,18 +271,30 @@ fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Resu ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(33) + Some(41) + ); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(38)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.927)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.927)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.927)); + assert_eq!( + 
report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), + Some(1) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(31)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.939)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.939)); - assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.939)); let suites = array_at(&report, "/suites")?; - for suite_id in - ["trust_source_of_truth", "work_resume", "capture_integration", "personalization"] - { + for suite_id in [ + "trust_source_of_truth", + "work_resume", + "retrieval", + "capture_integration", + "personalization", + ] { let suite = find_by_field(suites, "/suite_id", suite_id)?; assert_eq!(suite.pointer("/status").and_then(Value::as_str), Some("pass")); @@ -275,15 +304,112 @@ fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Resu assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + let debug_suite = find_by_field(suites, "/suite_id", "operator_debugging_ux")?; + + assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + let jobs = array_at(&report, "/jobs")?; let rebuild = find_by_field(jobs, "/job_id", "trust-sot-rebuild-001")?; let redaction = find_by_field(jobs, "/job_id", "capture-redaction-exclusion-001")?; let personalization = find_by_field(jobs, "/job_id", "personalization-scoped-preference-001")?; + let stage_job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; assert_eq!(rebuild.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); assert_eq!(redaction.pointer("/redaction_leak_count").and_then(Value::as_u64), Some(0)); 
assert_eq!(personalization.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); assert_eq!(personalization.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!( + stage_job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("rerank.score") + ); + + Ok(()) +} + +#[test] +fn retrieval_fixtures_report_quality_and_trace_attribution() -> Result<()> { + let report = run_json_report_from(retrieval_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(0.857) + ); + assert_eq!( + report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), + Some(0.143) + ); + assert_eq!( + report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), + Some(1) + ); + + let suites = array_at(&report, "/suites")?; + let retrieval_suite = find_by_field(suites, "/suite_id", "retrieval")?; + let debug_suite = find_by_field(suites, "/suite_id", "operator_debugging_ux")?; + + assert_eq!(retrieval_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(retrieval_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + + let jobs = array_at(&report, "/jobs")?; + let stage_job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; + + assert_eq!(stage_job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + stage_job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + 
Some("rerank.score") + ); + assert_eq!( + stage_job.pointer("/retrieval_quality/expected_evidence_recall").and_then(Value::as_f64), + Some(0.0) + ); + assert_eq!( + stage_job.pointer("/retrieval_quality/irrelevant_context_ratio").and_then(Value::as_f64), + Some(1.0) + ); + + Ok(()) +} + +#[test] +fn retrieval_report_markdown_includes_quality_metrics() -> Result<()> { + let report = run_json_report_from(retrieval_fixture_dir())?; + let temp_dir = env::temp_dir().join(format!("elf-real-world-retrieval-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("retrieval-report.json"); + let markdown_path = temp_dir.join("retrieval-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Expected evidence recall")); + assert!(markdown.contains("Irrelevant context ratio")); + assert!(markdown.contains("Trace Explainability")); + assert!(markdown.contains("rerank.score")); Ok(()) } diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index abb29e0b..ff0d52d4 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -336,6 +336,26 @@ stale-answer count, conflict detection count, update rationale availability, tem validity gaps, and unsupported claims. Its relation-temporal fixture is deliberately `not_encoded` until graph-lite temporal validity is implemented. 
+To run the checked-in retrieval-quality real-world fixtures: + +```sh +cargo make real-world-memory-retrieval +``` + +Artifacts: + +```text +tmp/real-world-memory/retrieval-report.json +tmp/real-world-memory/retrieval-report.md +``` + +The retrieval fixtures live under +`apps/elf-eval/fixtures/real_world_memory/retrieval/` and cover alternate phrasing, +distractor-heavy corpora, multi-hop routing questions, current-versus-obsolete context +selection, minimal sufficient context, and stage-level wrong-result explainability. +The output is still an offline fixture report; qmd and OpenViking remain reference systems +unless an adapter actually runs and records typed evidence. + ## Clean Up ```sh diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index a206a6c0..8fff2a76 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -129,7 +129,7 @@ evidence, produced answer/evidence, unsupported-claim count, wrong-result count, latency/cost fields when available, capture/integration behavior classes, and typed suite/job statuses. Untouched suites remain `not_encoded`. -Current checked-in full real-world memory increment: +Current checked-in aggregate memory increment: ```sh cargo make real-world-memory @@ -139,14 +139,19 @@ This parses `apps/elf-eval/fixtures/real_world_memory/`, writes `tmp/real-world-memory/real-world-memory-report.json`, and renders `tmp/real-world-memory/real-world-memory-report.md`. -The suite currently encodes: +This command recursively parses all checked-in `real_world_memory` fixture slices, +including the retrieval-quality slice below. The suite currently encodes: - `trust_source_of_truth`: evidence binding, source refs, and Qdrant rebuild from Postgres-held chunk embeddings before answering.
- `work_resume`: stale worktree resume, Decodex/Linear lane status, failed command recovery, PR review blocker recovery, and exact next-action extraction. +- `retrieval`: alternate phrasing, distractor-heavy retrieval, multi-hop routing, + current-versus-obsolete selection, and minimal sufficient context. - `memory_evolution`: TTL/delete suppression plus current-versus-historical preference, issue status, deployment method, benchmark conclusion, and temporal relation cases. +- `operator_debugging_ux`: deliberate wrong-result trace attribution that identifies + the retrieval stage that demoted expected evidence. - `capture_integration`: write-policy audit behavior for redaction/private exclusion and fixture-backed capture/integration boundary classification. - `personalization`: scoped stable preference correction without temporary or @@ -155,10 +160,12 @@ The suite currently encodes: The generated report includes evidence coverage, source-ref coverage, quote coverage, unsupported-claim count, stale retrieval count, stale-answer count, conflict detection count, update rationale availability, temporal validity `not_encoded` count, scope -correctness, redaction leak count, capture/integration behavior classes, and Qdrant -rebuild case/pass counts. The fixtures include negative traps for stale blockers, -unsupported prior claims, stale deleted facts, stale historical facts, cross-project -preference leakage, and private/redacted text leakage. +correctness, redaction leak count, capture/integration behavior classes, Qdrant +rebuild case/pass counts, expected evidence recall, irrelevant context ratio, +latency/cost, and trace explainability counters. The fixtures include negative traps +for stale blockers, unsupported prior claims, stale deleted facts, stale historical +facts, cross-project preference leakage, private/redacted text leakage, obsolete +retrieval context, and distractor context. 
Narrow memory evolution increment: @@ -178,6 +185,22 @@ the cases added for current-versus-historical interpretation and temporal stalen The relation temporal-validity fixture is deliberately `not_encoded` and declares the graph follow-up instead of claiming a fake graph pass. +Current checked-in retrieval-quality increment: + +```sh +cargo make real-world-memory-retrieval +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/retrieval/`, writes +`tmp/real-world-memory/retrieval-report.json`, and renders +`tmp/real-world-memory/retrieval-report.md`. The fixture set covers alternate +phrasing, distractor-heavy retrieval, multi-hop routing, current-versus-obsolete +selection, minimal sufficient context, and a deliberate wrong-result trace attribution +case. Reports include expected evidence recall, irrelevant context ratio, latency/cost, +and optional trace explainability metadata. The qmd and OpenViking references in these +fixtures are design references only; no parity claim is allowed unless an external +adapter run actually provides evidence. 
+ Operator debugging UX increment: ```sh diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 3baf6d43..dafc1df0 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -381,6 +381,10 @@ Reports MUST include: - run id, runner version, corpus profile, job ids, suite ids, project adapter metadata; - per-job status, normalized score, hard-fail hits, evidence ids used, trap ids used; +- expected evidence recall and irrelevant context ratio at job, suite, and summary + levels when the runner can derive them from fixture evidence ids; +- trace explainability metadata when an adapter or fixture can identify retrieval + stages, especially for wrong-result stage attribution; - per-suite typed status and score distribution; - unsupported claim list with claim text or a bounded redacted description; - explicit `not_encoded` suite list;