Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,9 @@ args = [
# | real-world-job-operator-ux | composite | |
# | real-world-job-operator-ux-json | command | |
# | real-world-job-operator-ux-report | command | |
# | real-world-memory-retrieval | composite | |
# | real-world-memory-retrieval-json | command | |
# | real-world-memory-retrieval-report | command | |

[tasks.real-world-job-smoke]
workspace = false
Expand Down Expand Up @@ -597,6 +600,55 @@ args = [
"tmp/real-world-job/real-world-job-operator-ux-report.md",
]

# Composite entry point for the memory-retrieval benchmark: it defines no
# command of its own and only pulls in the report task through `dependencies`.
[tasks.real-world-memory-retrieval]
workspace = false
dependencies = ["real-world-memory-retrieval-report"]

# Invokes the `real_world_job_benchmark` binary of the `elf-eval` package with
# its `run` subcommand against the retrieval fixture directory and writes the
# result to `tmp/real-world-memory/retrieval-report.json`.
[tasks.real-world-memory-retrieval-json]
workspace = false
command = "cargo"
args = [
  # cargo invocation; everything after `--` is passed to the binary.
  "run", "-p", "elf-eval", "--bin", "real_world_job_benchmark",
  "--",
  # benchmark subcommand followed by its flag/value pairs.
  "run",
  "--fixtures", "apps/elf-eval/fixtures/real_world_memory/retrieval",
  "--run-id", "real-world-memory-retrieval",
  "--adapter-id", "fixture_retrieval",
  "--adapter-name", "ELF fixture retrieval cases",
  "--out", "tmp/real-world-memory/retrieval-report.json",
]

# Invokes the `publish` subcommand of `real_world_job_benchmark`, passing the
# JSON report as `--report` and the Markdown path as `--out`. Depends on the
# JSON task, whose `--out` path is the `--report` path used here.
[tasks.real-world-memory-retrieval-report]
workspace = false
dependencies = ["real-world-memory-retrieval-json"]
command = "cargo"
args = [
  # cargo invocation; everything after `--` is passed to the binary.
  "run", "-p", "elf-eval", "--bin", "real_world_job_benchmark",
  "--",
  # benchmark subcommand followed by its flag/value pairs.
  "publish",
  "--report", "tmp/real-world-memory/retrieval-report.json",
  "--out", "tmp/real-world-memory/retrieval-report.md",
]


# Meta
# | task | type | cwd |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
{
"schema": "elf.real_world_job/v1",
"job_id": "retrieval-alt-phrasing-001",
"suite": "retrieval",
"title": "Recover current handoff evidence from alternate phrasing",
"corpus": {
"corpus_id": "real-world-memory-retrieval-2026-06-09",
"profile": "synthetic",
"items": [
{
"evidence_id": "xy840-current-handoff",
"kind": "issue",
"text": "XY-840 trace schema lane uses branch y/elf-xy-840. Before review handoff, run `cargo make checks` after the trace schema update is complete.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "alternate_phrasing",
"evidence_id": "xy840-current-handoff"
}
},
"created_at": "2026-06-09T01:00:00Z"
},
{
"evidence_id": "xy840-old-handoff-trap",
"kind": "decision",
"text": "Old note: XY-840 used branch y/elf-old-840 and only needed `cargo make test` before handoff.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "alternate_phrasing",
"evidence_id": "xy840-old-handoff-trap"
}
},
"created_at": "2026-06-08T01:00:00Z"
}
],
"adapter_response": {
"adapter_id": "fixture_retrieval",
"answer": {
"content": "Use branch y/elf-xy-840 for XY-840 and run `cargo make checks` before review handoff.",
"claims": [
{
"claim_id": "branch",
"text": "Use branch y/elf-xy-840 for XY-840.",
"evidence_ids": ["xy840-current-handoff"],
"confidence": "high"
},
{
"claim_id": "gate",
"text": "Run `cargo make checks` before review handoff.",
"evidence_ids": ["xy840-current-handoff"],
"confidence": "high"
}
],
"evidence_ids": ["xy840-current-handoff"],
"latency_ms": 13.4,
"cost": {
"currency": "USD",
"amount": 0.0,
"input_tokens": 0,
"output_tokens": 0
}
}
}
},
"timeline": [
{
"event_id": "xy840-old-branch",
"ts": "2026-06-08T01:00:00Z",
"actor": "agent",
"action": "recorded_old_handoff",
"evidence_ids": ["xy840-old-handoff-trap"],
"summary": "An older handoff note referenced the wrong branch and a narrower gate."
},
{
"event_id": "xy840-current-handoff",
"ts": "2026-06-09T01:00:00Z",
"actor": "agent",
"action": "updated_handoff",
"evidence_ids": ["xy840-current-handoff"],
"summary": "The current handoff evidence changed the branch and validation gate."
}
],
"prompt": {
"role": "user",
"content": "For the trace-schema handoff, which XY-840 branch and pre-review check do I need?",
"job_mode": "answer",
"constraints": ["cite_evidence", "avoid_stale_facts"]
},
"expected_answer": {
"must_include": [
{
"claim_id": "branch",
"text": "Use branch y/elf-xy-840 for XY-840."
},
{
"claim_id": "gate",
"text": "Run `cargo make checks` before review handoff."
}
],
"must_not_include": [
"Use branch y/elf-old-840 for XY-840.",
"Run `cargo make test` before review handoff."
],
"evidence_links": {
"branch": ["xy840-current-handoff"],
"gate": ["xy840-current-handoff"]
},
"answer_type": "direct_answer",
"accepted_alternates": [],
"requires_caveat": false,
"requires_refusal": false
},
"required_evidence": [
{
"evidence_id": "xy840-current-handoff",
"claim_id": "branch",
"requirement": "cite",
"quote": "uses branch y/elf-xy-840"
},
{
"evidence_id": "xy840-current-handoff",
"claim_id": "gate",
"requirement": "use",
"quote": "run `cargo make checks`"
}
],
"negative_traps": [
{
"trap_id": "old-xy840-handoff",
"type": "stale_fact",
"evidence_ids": ["xy840-old-handoff-trap"],
"failure_if_used": true
}
],
"scoring_rubric": {
"dimensions": {
"answer_correctness": {
"weight": 0.35,
"max_points": 1.0,
"criteria": "Returns the current branch and pre-review check."
},
"evidence_grounding": {
"weight": 0.3,
"max_points": 1.0,
"criteria": "Cites the current handoff evidence."
},
"trap_avoidance": {
"weight": 0.2,
"max_points": 1.0,
"criteria": "Ignores the stale branch and test-only gate."
},
"latency_resource": {
"weight": 0.15,
"max_points": 1.0,
"criteria": "Reports bounded fixture latency and no cost."
}
},
"pass_threshold": 0.8,
"hard_fail_rules": [
"unsupported high-confidence claim about a required decision or fact",
"use of a negative trap marked failure_if_used = true"
]
},
"allowed_uncertainty": {
"can_answer_unknown": false,
"acceptable_phrases": ["The fixture does not provide that evidence."],
"fallback_action": "state_blocker"
},
"tags": ["synthetic", "retrieval", "alternate_phrasing", "no_live_claim"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
{
"schema": "elf.real_world_job/v1",
"job_id": "retrieval-current-vs-obsolete-001",
"suite": "retrieval",
"title": "Select current benchmark context over obsolete live-baseline claims",
"corpus": {
"corpus_id": "real-world-memory-retrieval-2026-06-09",
"profile": "synthetic",
"items": [
{
"evidence_id": "obsolete-live-baseline-win",
"kind": "decision",
"text": "Obsolete draft: top-k live baseline results alone prove real-world job suite wins.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "current_vs_obsolete",
"evidence_id": "obsolete-live-baseline-win"
}
},
"created_at": "2026-06-08T04:00:00Z"
},
{
"evidence_id": "current-real-world-boundary",
"kind": "decision",
"text": "Current policy: live-baseline reports remain valid for Docker retrieval and lifecycle checks, but they are not real-world job suite wins. Real-world job reports must be published separately.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "current_vs_obsolete",
"evidence_id": "current-real-world-boundary"
}
},
"created_at": "2026-06-09T04:00:00Z"
}
],
"adapter_response": {
"adapter_id": "fixture_retrieval",
"answer": {
"content": "Use the current boundary: live-baseline reports stay valid for Docker retrieval and lifecycle evidence, but they are not real-world job suite wins; publish real-world job reports separately.",
"claims": [
{
"claim_id": "current_boundary",
"text": "Live-baseline reports are not real-world job suite wins.",
"evidence_ids": ["current-real-world-boundary"],
"confidence": "high"
}
],
"evidence_ids": ["current-real-world-boundary"],
"latency_ms": 15.7,
"cost": {
"currency": "USD",
"amount": 0.0,
"input_tokens": 0,
"output_tokens": 0
}
}
}
},
"timeline": [
{
"event_id": "obsolete-draft",
"ts": "2026-06-08T04:00:00Z",
"actor": "agent",
"action": "recorded_obsolete_policy",
"evidence_ids": ["obsolete-live-baseline-win"],
"summary": "A draft conflated live-baseline retrieval checks with real-world job wins."
},
{
"event_id": "current-boundary",
"ts": "2026-06-09T04:00:00Z",
"actor": "agent",
"action": "updated_policy",
"evidence_ids": ["current-real-world-boundary"],
"summary": "The current policy separates live-baseline evidence from real-world job suite claims."
}
],
"prompt": {
"role": "user",
"content": "Can I cite the live-baseline pass as a real-world job suite win?",
"job_mode": "answer",
"constraints": ["cite_evidence", "use_current_policy", "avoid_obsolete_context"]
},
"expected_answer": {
"must_include": [
{
"claim_id": "current_boundary",
"text": "Live-baseline reports are not real-world job suite wins."
}
],
"must_not_include": [
"Top-k live baseline results alone prove real-world job suite wins."
],
"evidence_links": {
"current_boundary": ["current-real-world-boundary"]
},
"answer_type": "direct_answer",
"accepted_alternates": [],
"requires_caveat": false,
"requires_refusal": false
},
"required_evidence": [
{
"evidence_id": "current-real-world-boundary",
"claim_id": "current_boundary",
"requirement": "cite",
"quote": "they are not real-world job suite wins"
}
],
"negative_traps": [
{
"trap_id": "obsolete-suite-win",
"type": "stale_fact",
"evidence_ids": ["obsolete-live-baseline-win"],
"failure_if_used": true
}
],
"scoring_rubric": {
"dimensions": {
"answer_correctness": {
"weight": 0.35,
"max_points": 1.0,
"criteria": "Answers with the current claim boundary."
},
"evidence_grounding": {
"weight": 0.3,
"max_points": 1.0,
"criteria": "Cites the current policy evidence."
},
"trap_avoidance": {
"weight": 0.25,
"max_points": 1.0,
"criteria": "Avoids the obsolete top-k claim."
},
"uncertainty_handling": {
"weight": 0.1,
"max_points": 1.0,
"criteria": "Does not hedge when sufficient current evidence exists."
}
},
"pass_threshold": 0.8,
"hard_fail_rules": [
"unsupported high-confidence claim about a required decision or fact",
"use of a negative trap marked failure_if_used = true"
]
},
"allowed_uncertainty": {
"can_answer_unknown": false,
"acceptable_phrases": ["The fixture does not provide that evidence."],
"fallback_action": "state_blocker"
},
"tags": ["synthetic", "retrieval", "current_vs_obsolete", "no_live_claim"]
}
Loading