Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion apps/elf-api/static/viewer.html
Original file line number Diff line number Diff line change
Expand Up @@ -1463,13 +1463,14 @@ <h2>Recent Traces</h2>
}
return section("Relation Context", [
table(
["Rank", "Scope", "Subject", "Predicate", "Object", "Evidence Notes"],
["Rank", "Scope", "Subject", "Predicate", "Object", "Temporal", "Evidence Notes"],
relations.map(({ item, context }) => [
item.rank,
context.scope,
getPath(context, ["subject", "canonical"]) || "none",
context.predicate,
getPath(context, ["object", "entity", "canonical"]) || getPath(context, ["object", "value"]) || "none",
context.temporal_status || "current",
(context.evidence_note_ids || []).join(", ")
])
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,8 @@
"schema": "elf.real_world_job/v1",
"job_id": "memory-evolution-relation-temporal-001",
"suite": "memory_evolution",
"title": "Mark temporal relation validity as not encoded instead of faking a graph pass",
"encoding": {
"status": "not_encoded",
"reason": "ELF graph-lite currently returns bounded relation context, but this runner does not yet encode current-only versus historical temporal validity for relation facts.",
"follow_up": {
"title": "[ELF graph P1] Add temporal validity to graph-lite facts",
"reason": "Relation facts need valid_from and invalidated_at semantics before this job can claim a current-versus-historical graph pass."
}
},
"title": "Distinguish current and historical relation validity in graph-lite context",
"encoding": {},
"corpus": {
"corpus_id": "real-world-memory-evolution-2026-06-09",
"profile": "synthetic",
Expand All @@ -23,7 +16,7 @@
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "relation_temporal_validity_not_encoded",
"fixture": "relation_temporal_validity",
"evidence_id": "relation-old-owner"
}
},
Expand All @@ -37,7 +30,7 @@
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "relation_temporal_validity_not_encoded",
"fixture": "relation_temporal_validity",
"evidence_id": "relation-current-owner"
}
},
Expand All @@ -51,13 +44,49 @@
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "relation_temporal_validity_not_encoded",
"fixture": "relation_temporal_validity",
"evidence_id": "relation-owner-rationale"
}
},
"created_at": "2026-06-08T00:05:00Z"
}
]
],
"adapter_response": {
"adapter_id": "fixture_memory_evolution",
"answer": {
"content": "Team Echo currently owns deployment method review. Team Delta owned deployment method review historically. The ownership moved after the single-user production runbook scope changed.",
"claims": [
{
"claim_id": "relation_current_owner",
"text": "Team Echo currently owns deployment method review.",
"evidence_ids": [
"relation-current-owner",
"relation-old-owner",
"relation-owner-rationale"
],
"confidence": "high"
},
{
"claim_id": "relation_historical_owner",
"text": "Team Delta owned deployment method review historically.",
"evidence_ids": ["relation-old-owner"],
"confidence": "high"
},
{
"claim_id": "relation_owner_update_rationale",
"text": "Ownership moved after single-user production runbook scope changed.",
"evidence_ids": ["relation-owner-rationale"],
"confidence": "high"
}
],
"evidence_ids": [
"relation-current-owner",
"relation-old-owner",
"relation-owner-rationale"
]
},
"consolidation": null
}
},
"timeline": [
{
Expand Down Expand Up @@ -101,7 +130,8 @@
"relation-old-owner",
"relation-owner-rationale"
],
"relation_historical_owner": ["relation-old-owner"]
"relation_historical_owner": ["relation-old-owner"],
"relation_owner_update_rationale": ["relation-owner-rationale"]
},
"answer_type": "direct_answer",
"accepted_alternates": [],
Expand Down Expand Up @@ -160,9 +190,9 @@
]
},
"allowed_uncertainty": {
"can_answer_unknown": true,
"acceptable_phrases": ["Temporal relation validity is not encoded in this runner."],
"fallback_action": "state_blocker"
"can_answer_unknown": false,
"acceptable_phrases": [],
"fallback_action": "score_temporal_relation_behavior"
},
"memory_evolution": {
"current_evidence_ids": ["relation-current-owner"],
Expand All @@ -180,20 +210,19 @@
"update_rationale": {
"claim_id": "relation_owner_update_rationale",
"evidence_ids": ["relation-owner-rationale"],
"available": false
"available": true
},
"temporal_validity": {
"required": true,
"encoded": false,
"follow_up": "[ELF graph P1] Add temporal validity to graph-lite facts"
"encoded": true
}
},
"tags": [
"synthetic",
"memory_evolution",
"reference_graphiti_zep_temporal",
"reference_nanograph_typed_query",
"not_encoded",
"graph_temporal_encoded",
"no_live_claim"
]
}
61 changes: 31 additions & 30 deletions apps/elf-eval/tests/real_world_job_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -689,16 +689,16 @@ fn assert_root_knowledge_summary(report: &Value) {

fn assert_root_aggregate_summary(report: &Value) {
assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(38));
assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(34));
assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(35));
assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0));
assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1));
assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2));
assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1));
assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0));
assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0));
assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0));
assert_eq!(
report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64),
Some(0.973)
Some(1.0)
);
assert_eq!(
report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64),
Expand All @@ -708,15 +708,15 @@ fn assert_root_aggregate_summary(report: &Value) {
assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0));
assert_eq!(
report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64),
Some(6)
Some(7)
);
assert_eq!(
report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64),
Some(9)
Some(10)
);
assert_eq!(
report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64),
Some(1)
Some(0)
);
assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0));
assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(2));
Expand All @@ -734,10 +734,10 @@ fn assert_root_aggregate_summary(report: &Value) {
report.pointer("/summary/evidence_required_count").and_then(Value::as_u64),
Some(82)
);
assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(80));
assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.976));
assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.976));
assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.976));
assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(82));
assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0));
assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0));
assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0));
assert_eq!(
report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64),
Some(1)
Expand Down Expand Up @@ -777,6 +777,7 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> {
"consolidation",
"knowledge_compilation",
"operator_debugging_ux",
"memory_evolution",
] {
let suite = find_by_field(suites, "/suite_id", suite_id)?;

Expand All @@ -785,7 +786,7 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> {

let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?;

assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("not_encoded"));
assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("pass"));

let project_decisions = find_by_field(suites, "/suite_id", "project_decisions")?;

Expand All @@ -812,6 +813,7 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> {
let rebuild = find_by_field(jobs, "/job_id", "trust-sot-rebuild-001")?;
let redaction = find_by_field(jobs, "/job_id", "capture-redaction-exclusion-001")?;
let personalization = find_by_field(jobs, "/job_id", "personalization-scoped-preference-001")?;
let relation_job = find_by_field(jobs, "/job_id", "memory-evolution-relation-temporal-001")?;
let stage_job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?;
let production_restore =
find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?;
Expand All @@ -825,6 +827,7 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> {
assert_eq!(personalization.pointer("/scope_check_count").and_then(Value::as_u64), Some(1));
assert_eq!(personalization.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1));
assert_eq!(stage_job.pointer("/status").and_then(Value::as_str), Some("pass"));
assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("pass"));
assert_eq!(
stage_job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str),
Some("rerank.score")
Expand Down Expand Up @@ -992,54 +995,51 @@ fn memory_evolution_fixtures_report_temporal_and_staleness_metrics() -> Result<(

assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5));
assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1));
assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4));
assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1));
assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5));
assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0));
assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0));
assert_eq!(
report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64),
Some(4)
Some(5)
);
assert_eq!(
report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64),
Some(4)
Some(5)
);
assert_eq!(
report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64),
Some(1)
Some(0)
);
assert_eq!(
report.pointer("/evolution/temporal_validity_not_encoded_count").and_then(Value::as_u64),
Some(1)
Some(0)
);

let suites = array_at(&report, "/suites")?;
let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?;

assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("not_encoded"));
assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("pass"));
assert_eq!(
memory_evolution.pointer("/temporal_validity_not_encoded_count").and_then(Value::as_u64),
Some(1)
Some(0)
);

let jobs = array_at(&report, "/jobs")?;
let relation_job = find_by_field(jobs, "/job_id", "memory-evolution-relation-temporal-001")?;

assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("not_encoded"));
assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("pass"));
assert_eq!(
relation_job.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool),
Some(false)
);
assert_eq!(
relation_job.pointer("/evolution/temporal_validity_encoded").and_then(Value::as_bool),
Some(true)
);

let follow_ups = array_at(&report, "/follow_ups")?;

assert_eq!(follow_ups.len(), 1);
assert_eq!(
follow_ups
.first()
.and_then(|follow_up| follow_up.pointer("/title"))
.and_then(Value::as_str),
Some("[ELF graph P1] Add temporal validity to graph-lite facts")
);
assert!(follow_ups.is_empty());

Ok(())
}
Expand Down Expand Up @@ -1163,8 +1163,9 @@ fn memory_evolution_report_renders_markdown_counters() -> Result<()> {
let markdown = fs::read_to_string(markdown_path)?;

assert!(markdown.contains("## Memory Evolution"));
assert!(markdown.contains("Temporal validity not encoded: `1`"));
assert!(markdown.contains("[ELF graph P1] Add temporal validity to graph-lite facts"));
assert!(markdown.contains("Temporal validity not encoded: `0`"));
assert!(markdown.contains("| memory_evolution | memory-evolution-relation-temporal-001"));
assert!(markdown.contains("`encoded`"));

Ok(())
}
Expand Down
4 changes: 2 additions & 2 deletions docs/guide/benchmarking/live_baseline_benchmark.md
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,8 @@ cargo make real-world-memory-evolution

It lives under `apps/elf-eval/fixtures/real_world_memory/evolution/` and reports
stale-answer count, conflict detection count, update rationale availability, temporal
validity gaps, and unsupported claims. Its relation-temporal fixture is deliberately
`not_encoded` until graph-lite temporal validity is implemented.
validity encoding, and unsupported claims. Its relation-temporal fixture is encoded as
a normal pass/fail check for current versus historical graph-lite relation context.

To run the checked-in retrieval-quality real-world fixtures:

Expand Down
6 changes: 3 additions & 3 deletions docs/guide/benchmarking/real_world_agent_memory_benchmark.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ including the retrieval-quality slice below. The suite currently encodes:

The generated report includes evidence coverage, source-ref coverage, quote coverage,
unsupported-claim count, stale retrieval count, stale-answer count, conflict detection
count, update rationale availability, temporal validity `not_encoded` count, scope
count, update rationale availability, temporal validity encoding count, scope
correctness, redaction leak count, capture/integration behavior classes, Qdrant
rebuild case/pass counts, expected evidence recall, irrelevant context ratio,
latency/cost, answer-type plus caveat/refusal/uncertainty flags, and trace
Expand Down Expand Up @@ -262,8 +262,8 @@ tmp/real-world-memory/evolution-report.md

This parses `apps/elf-eval/fixtures/real_world_memory/evolution/` and reports only
the cases added for current-versus-historical interpretation and temporal staleness.
The relation temporal-validity fixture is deliberately `not_encoded` and declares the
graph follow-up instead of claiming a fake graph pass.
The relation temporal-validity fixture is encoded and scores current owner,
historical owner, update rationale, and stale-owner trap behavior.

Current checked-in retrieval-quality increment:

Expand Down
20 changes: 10 additions & 10 deletions docs/guide/benchmarking/real_world_memory_evolution.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

Goal: Run and interpret the checked-in memory evolution real-world job fixtures.
Read this when: You need to test current facts, historical facts, stale facts,
conflicts, corrected memories, and temporal validity limitations.
conflicts, corrected memories, and temporal relation validity.
Inputs: `apps/elf-eval/fixtures/real_world_memory/evolution/`,
`apps/elf-eval/src/bin/real_world_job_benchmark.rs`, and `Makefile.toml`.
Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`,
Expand All @@ -23,13 +23,12 @@ The checked-in fixture set covers:
- Issue state evolution from blocked to done.
- Production deployment guidance superseding a local smoke quickstart.
- Benchmark adoption verdict reversal with a bounded private-corpus caveat.
- Relation fact current-versus-historical ownership, encoded as `not_encoded`
because temporal graph validity is not yet implemented in the runner.
- Relation fact current-versus-historical ownership with graph-lite temporal
validity encoded as a normal pass/fail fixture.

The relation case borrows from Graphiti/Zep temporal validity and nanograph typed
query ergonomics. It intentionally does not fake a pass for graph temporal behavior.
The report declares the follow-up `[ELF graph P1] Add temporal validity to graph-lite
facts`.
query ergonomics while preserving ELF's Postgres source-of-truth and evidence-link
requirements.

## Run

Expand All @@ -55,10 +54,11 @@ The runner reports memory evolution counters at summary, suite, and job levels:
- `update_rationale_available_count`: jobs where the produced answer cites the
update rationale.
- `temporal_validity_not_encoded_count`: jobs that require temporal graph validity
but are deliberately declared `not_encoded`.
but are deliberately declared `not_encoded`; this should be `0` for the checked-in
evolution fixture set.
- `unsupported_claim_count`: existing real-world job unsupported claim counter.

Runnable jobs should have `stale_answer_count = 0`, nonzero conflict detection, and
an update rationale when the fixture provides one. A temporal validity gap should
remain `not_encoded` until graph-lite facts can model current-only and historical
relation validity.
an update rationale when the fixture provides one. The relation temporal-validity job
should report temporal validity as encoded and pass only when current and historical
relation evidence are distinguished.
Loading