diff --git a/apps/elf-api/static/viewer.html b/apps/elf-api/static/viewer.html index 05de83af..752e0c6f 100644 --- a/apps/elf-api/static/viewer.html +++ b/apps/elf-api/static/viewer.html @@ -1463,13 +1463,14 @@

Recent Traces

} return section("Relation Context", [ table( - ["Rank", "Scope", "Subject", "Predicate", "Object", "Evidence Notes"], + ["Rank", "Scope", "Subject", "Predicate", "Object", "Temporal", "Evidence Notes"], relations.map(({ item, context }) => [ item.rank, context.scope, getPath(context, ["subject", "canonical"]) || "none", context.predicate, getPath(context, ["object", "entity", "canonical"]) || getPath(context, ["object", "value"]) || "none", + context.temporal_status || "current", (context.evidence_note_ids || []).join(", ") ]) ) diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity_not_encoded.json b/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity.json similarity index 73% rename from apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity_not_encoded.json rename to apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity.json index 6c3a0c0f..e3a50717 100644 --- a/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity_not_encoded.json +++ b/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity.json @@ -2,15 +2,8 @@ "schema": "elf.real_world_job/v1", "job_id": "memory-evolution-relation-temporal-001", "suite": "memory_evolution", - "title": "Mark temporal relation validity as not encoded instead of faking a graph pass", - "encoding": { - "status": "not_encoded", - "reason": "ELF graph-lite currently returns bounded relation context, but this runner does not yet encode current-only versus historical temporal validity for relation facts.", - "follow_up": { - "title": "[ELF graph P1] Add temporal validity to graph-lite facts", - "reason": "Relation facts need valid_from and invalidated_at semantics before this job can claim a current-versus-historical graph pass." 
- } - }, + "title": "Distinguish current and historical relation validity in graph-lite context", + "encoding": {}, "corpus": { "corpus_id": "real-world-memory-evolution-2026-06-09", "profile": "synthetic", @@ -23,7 +16,7 @@ "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": { - "fixture": "relation_temporal_validity_not_encoded", + "fixture": "relation_temporal_validity", "evidence_id": "relation-old-owner" } }, @@ -37,7 +30,7 @@ "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": { - "fixture": "relation_temporal_validity_not_encoded", + "fixture": "relation_temporal_validity", "evidence_id": "relation-current-owner" } }, @@ -51,13 +44,49 @@ "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": { - "fixture": "relation_temporal_validity_not_encoded", + "fixture": "relation_temporal_validity", "evidence_id": "relation-owner-rationale" } }, "created_at": "2026-06-08T00:05:00Z" } - ] + ], + "adapter_response": { + "adapter_id": "fixture_memory_evolution", + "answer": { + "content": "Team Echo currently owns deployment method review. Team Delta owned deployment method review historically. 
The ownership moved after the single-user production runbook scope changed.", + "claims": [ + { + "claim_id": "relation_current_owner", + "text": "Team Echo currently owns deployment method review.", + "evidence_ids": [ + "relation-current-owner", + "relation-old-owner", + "relation-owner-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "relation_historical_owner", + "text": "Team Delta owned deployment method review historically.", + "evidence_ids": ["relation-old-owner"], + "confidence": "high" + }, + { + "claim_id": "relation_owner_update_rationale", + "text": "Ownership moved after single-user production runbook scope changed.", + "evidence_ids": ["relation-owner-rationale"], + "confidence": "high" + } + ], + "evidence_ids": [ + "relation-current-owner", + "relation-old-owner", + "relation-owner-rationale" + ] + }, + "consolidation": null + } }, "timeline": [ { @@ -101,7 +130,8 @@ "relation-old-owner", "relation-owner-rationale" ], - "relation_historical_owner": ["relation-old-owner"] + "relation_historical_owner": ["relation-old-owner"], + "relation_owner_update_rationale": ["relation-owner-rationale"] }, "answer_type": "direct_answer", "accepted_alternates": [], @@ -160,9 +190,9 @@ ] }, "allowed_uncertainty": { - "can_answer_unknown": true, - "acceptable_phrases": ["Temporal relation validity is not encoded in this runner."], - "fallback_action": "state_blocker" + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "score_temporal_relation_behavior" }, "memory_evolution": { "current_evidence_ids": ["relation-current-owner"], @@ -180,12 +210,11 @@ "update_rationale": { "claim_id": "relation_owner_update_rationale", "evidence_ids": ["relation-owner-rationale"], - "available": false + "available": true }, "temporal_validity": { "required": true, - "encoded": false, - "follow_up": "[ELF graph P1] Add temporal validity to graph-lite facts" + "encoded": true } }, "tags": [ @@ -193,7 +222,7 @@ "memory_evolution", 
"reference_graphiti_zep_temporal", "reference_nanograph_typed_query", - "not_encoded", + "graph_temporal_encoded", "no_live_claim" ] } diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 496237d7..eb1d38ca 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -689,16 +689,16 @@ fn assert_root_knowledge_summary(report: &Value) { fn assert_root_aggregate_summary(report: &Value) { assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(38)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(34)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(35)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); - assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), - Some(0.973) + Some(1.0) ); assert_eq!( report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), @@ -708,15 +708,15 @@ fn assert_root_aggregate_summary(report: &Value) { assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), - Some(6) + Some(7) ); assert_eq!( report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), - Some(9) + Some(10) ); assert_eq!( 
report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), - Some(1) + Some(0) ); assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(2)); @@ -734,10 +734,10 @@ fn assert_root_aggregate_summary(report: &Value) { report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), Some(82) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(80)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.976)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.976)); - assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.976)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(82)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!( report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), Some(1) @@ -777,6 +777,7 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { "consolidation", "knowledge_compilation", "operator_debugging_ux", + "memory_evolution", ] { let suite = find_by_field(suites, "/suite_id", suite_id)?; @@ -785,7 +786,7 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?; - assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("pass")); let project_decisions = find_by_field(suites, "/suite_id", "project_decisions")?; @@ 
-812,6 +813,7 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { let rebuild = find_by_field(jobs, "/job_id", "trust-sot-rebuild-001")?; let redaction = find_by_field(jobs, "/job_id", "capture-redaction-exclusion-001")?; let personalization = find_by_field(jobs, "/job_id", "personalization-scoped-preference-001")?; + let relation_job = find_by_field(jobs, "/job_id", "memory-evolution-relation-temporal-001")?; let stage_job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; let production_restore = find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; @@ -825,6 +827,7 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { assert_eq!(personalization.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); assert_eq!(personalization.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); assert_eq!(stage_job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!( stage_job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), Some("rerank.score") @@ -992,54 +995,51 @@ fn memory_evolution_fixtures_report_temporal_and_staleness_metrics() -> Result<( assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); - assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), - Some(4) + Some(5) ); assert_eq!( 
report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), - Some(4) + Some(5) ); assert_eq!( report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), - Some(1) + Some(0) ); assert_eq!( report.pointer("/evolution/temporal_validity_not_encoded_count").and_then(Value::as_u64), - Some(1) + Some(0) ); let suites = array_at(&report, "/suites")?; let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?; - assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!( memory_evolution.pointer("/temporal_validity_not_encoded_count").and_then(Value::as_u64), - Some(1) + Some(0) ); let jobs = array_at(&report, "/jobs")?; let relation_job = find_by_field(jobs, "/job_id", "memory-evolution-relation-temporal-001")?; - assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!( relation_job.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + relation_job.pointer("/evolution/temporal_validity_encoded").and_then(Value::as_bool), Some(true) ); let follow_ups = array_at(&report, "/follow_ups")?; - assert_eq!(follow_ups.len(), 1); - assert_eq!( - follow_ups - .first() - .and_then(|follow_up| follow_up.pointer("/title")) - .and_then(Value::as_str), - Some("[ELF graph P1] Add temporal validity to graph-lite facts") - ); + assert!(follow_ups.is_empty()); Ok(()) } @@ -1163,8 +1163,9 @@ fn memory_evolution_report_renders_markdown_counters() -> Result<()> { let markdown = fs::read_to_string(markdown_path)?; assert!(markdown.contains("## Memory Evolution")); - assert!(markdown.contains("Temporal validity not encoded: `1`")); - assert!(markdown.contains("[ELF graph P1] Add temporal validity to 
graph-lite facts")); + assert!(markdown.contains("Temporal validity not encoded: `0`")); + assert!(markdown.contains("| memory_evolution | memory-evolution-relation-temporal-001")); + assert!(markdown.contains("`encoded`")); Ok(()) } diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index 3b4f9137..8e8b22cf 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -353,8 +353,8 @@ cargo make real-world-memory-evolution It lives under `apps/elf-eval/fixtures/real_world_memory/evolution/` and reports stale-answer count, conflict detection count, update rationale availability, temporal -validity gaps, and unsupported claims. Its relation-temporal fixture is deliberately -`not_encoded` until graph-lite temporal validity is implemented. +validity encoding, and unsupported claims. Its relation-temporal fixture is encoded as +a normal pass/fail check for current versus historical graph-lite relation context. To run the checked-in retrieval-quality real-world fixtures: diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index e0cc5c26..388a4c28 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -166,7 +166,7 @@ including the retrieval-quality slice below. 
The suite currently encodes: The generated report includes evidence coverage, source-ref coverage, quote coverage, unsupported-claim count, stale retrieval count, stale-answer count, conflict detection -count, update rationale availability, temporal validity `not_encoded` count, scope +count, update rationale availability, temporal validity encoding count, scope correctness, redaction leak count, capture/integration behavior classes, Qdrant rebuild case/pass counts, expected evidence recall, irrelevant context ratio, latency/cost, answer-type plus caveat/refusal/uncertainty flags, and trace @@ -262,8 +262,8 @@ tmp/real-world-memory/evolution-report.md This parses `apps/elf-eval/fixtures/real_world_memory/evolution/` and reports only the cases added for current-versus-historical interpretation and temporal staleness. -The relation temporal-validity fixture is deliberately `not_encoded` and declares the -graph follow-up instead of claiming a fake graph pass. +The relation temporal-validity fixture is encoded and scores current owner, +historical owner, update rationale, and stale-owner trap behavior. Current checked-in retrieval-quality increment: diff --git a/docs/guide/benchmarking/real_world_memory_evolution.md b/docs/guide/benchmarking/real_world_memory_evolution.md index 69d31d58..718b09aa 100644 --- a/docs/guide/benchmarking/real_world_memory_evolution.md +++ b/docs/guide/benchmarking/real_world_memory_evolution.md @@ -2,7 +2,7 @@ Goal: Run and interpret the checked-in memory evolution real-world job fixtures. Read this when: You need to test current facts, historical facts, stale facts, -conflicts, corrected memories, and temporal validity limitations. +conflicts, corrected memories, and temporal relation validity. Inputs: `apps/elf-eval/fixtures/real_world_memory/evolution/`, `apps/elf-eval/src/bin/real_world_job_benchmark.rs`, and `Makefile.toml`. 
Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, @@ -23,13 +23,12 @@ The checked-in fixture set covers: - Issue state evolution from blocked to done. - Production deployment guidance superseding a local smoke quickstart. - Benchmark adoption verdict reversal with a bounded private-corpus caveat. -- Relation fact current-versus-historical ownership, encoded as `not_encoded` - because temporal graph validity is not yet implemented in the runner. +- Relation fact current-versus-historical ownership with graph-lite temporal + validity encoded as a normal pass/fail fixture. The relation case borrows from Graphiti/Zep temporal validity and nanograph typed -query ergonomics. It intentionally does not fake a pass for graph temporal behavior. -The report declares the follow-up `[ELF graph P1] Add temporal validity to graph-lite -facts`. +query ergonomics while preserving ELF's Postgres source-of-truth and evidence-link +requirements. ## Run @@ -55,10 +54,11 @@ The runner reports memory evolution counters at summary, suite, and job levels: - `update_rationale_available_count`: jobs where the produced answer cites the update rationale. - `temporal_validity_not_encoded_count`: jobs that require temporal graph validity - but are deliberately declared `not_encoded`. + but are deliberately declared `not_encoded`; this should be `0` for the checked-in + evolution fixture set. - `unsupported_claim_count`: existing real-world job unsupported claim counter. Runnable jobs should have `stale_answer_count = 0`, nonzero conflict detection, and -an update rationale when the fixture provides one. A temporal validity gap should -remain `not_encoded` until graph-lite facts can model current-only and historical -relation validity. +an update rationale when the fixture provides one. The relation temporal-validity job +should report temporal validity as encoded and pass only when current and historical +relation evidence are distinguished. 
diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md index 9d8ae4f1..baaef043 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/guide/research/comparison_external_projects.md @@ -96,7 +96,7 @@ Project-to-suite map: | graphify | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Deterministic code extraction, LLM-assisted graph building, honesty tags, graph reports, and assistant hooks are strong references for graph-compressed navigation over large corpora. | Generate graph/report artifacts from the benchmark corpus, require answers to use graph structure plus source evidence, and prove rebuild behavior after corpus edits. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for graph-navigation reference. | ELF is stronger as a memory service; graphify is the reference for rebuildable graph reports and pre-search guidance. | | Letta | `rw.core-archival`, `rw.operator-continuity` | Core memory blocks, archival memory, and shared/read-only memory blocks map directly to always-loaded operating context versus retrievable memory. | Build a multi-agent job where core blocks must be attached/detached/shared read-only, while archival memory is retrieved separately and audited. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for memory-semantics reference. | ELF has scoped notes but not first-class core/archival block ergonomics; Letta is the reference dimension. | | LangGraph | `rw.replay-regression`, `rw.resume-evidence` | Thread checkpoints, durable execution, replay, fork, and time travel define a strong model for debugging agent-state and memory-regression behavior. | Run an agent job with memory reads across checkpoints, replay/fork the thread after a stale-memory failure, and verify side-effect boundaries. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for replay workflow reference. 
| ELF traces are useful but do not replace full agent checkpoint replay; LangGraph is the reference for replay-regression jobs. | -| Graphiti / Zep | `rw.graph-temporal`, `rw.resume-evidence` | Temporal entities, relations, fact triples, validity windows, and graph search directly target stale/contradictory factual memory. | Add fact triples with validity changes, query current and historical answers, and score invalidation/append behavior under contradiction traps. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium-high for temporal-graph dimension. | ELF graph-lite is not yet stronger on temporal graph validity; Graphiti/Zep is the reference dimension. | +| Graphiti / Zep | `rw.graph-temporal`, `rw.resume-evidence` | Temporal entities, relations, fact triples, validity windows, and graph search directly target stale/contradictory factual memory. | Add fact triples with validity changes, query current and historical answers, and score invalidation/append behavior under contradiction traps. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium-high for temporal-graph dimension. | ELF graph-lite covers evidence-linked validity windows and current/historical relation context; Graphiti/Zep remains the reference for broader temporal graph workflows. | | nanograph | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema and typed query ergonomics are relevant to making ELF graph-lite interactions inspectable and hard to misuse. | Define typed graph schemas and queries for the same fact set, then score developer-visible validation, query shape, and explainability rather than retrieval quality alone. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for DX reference, low for memory-system comparison. | ELF should borrow typed graph ergonomics without treating nanograph as a full memory backend. | Pending watch items remain D0. 
Keep them out of benchmark strength claims until current @@ -117,7 +117,7 @@ evidence is gathered: | Progressive disclosure UX | claude-mem, OpenViking | ELF has L0/L1/L2 shaping and traces, but the operator workflow still needs better search-session navigation. | | Entity-scoped history and managed ecosystem reach | mem0/OpenMemory | ELF has ingest decisions and versions, but not the same hosted option, SDK reach, or first-class memory history surface. | | Core memory versus archival memory | Letta | ELF scopes notes well, but lacks attachable/read-only core memory blocks as a distinct user-facing layer. | -| Temporal graph validity | Graphiti/Zep | ELF graph-lite persists relation context, but temporal invalidation/current-vs-historical graph behavior is not the reference yet. | +| Temporal graph validity | Graphiti/Zep | ELF graph-lite now persists validity windows and labels current versus historical relation context, while Graphiti/Zep remains the broader reference for temporal graph workflows. | | Agent replay and forkable regression debugging | LangGraph | ELF traces are replay evidence for retrieval, not full persisted agent-state replay with side-effect boundaries. | | Derived knowledge pages and lint/repair loops | llm-wiki, gbrain | ELF does not yet ship rebuildable entity/project pages with unsupported-claim lint as a first-class workflow. | | Scheduled consolidation as a product surface | Always-On Memory Agent | ELF's target should be reviewable derived consolidation, but the scheduling/operator-control workflow is not implemented. 
| diff --git a/docs/guide/research/external_memory_improvement_plan.md b/docs/guide/research/external_memory_improvement_plan.md index bd37e8fc..508bfab2 100644 --- a/docs/guide/research/external_memory_improvement_plan.md +++ b/docs/guide/research/external_memory_improvement_plan.md @@ -15,7 +15,7 @@ The objective position is: - Better than the tested alternatives on evidence-bound writes, deterministic ingestion boundaries, source-of-truth discipline, rebuildable indexing, multi-tenant service shape, and the current encoded Docker benchmark. - Comparable to the best tested alternative, qmd, on local retrieval quality under the smoke scenario, but ELF has a stronger service/provenance model while qmd has stronger local retrieval-debug ergonomics. - Behind agentmemory, claude-mem/OpenMemory-style tools, and some managed-memory products on operator UX, visible memory inspection, and turn-by-turn operational comfort. -- Behind Graphiti/Zep, Letta, and mem0-style systems on some memory semantics: temporal graph validity, explicit memory history, core-vs-archival blocks, and reviewable memory evolution. +- Behind Graphiti/Zep, Letta, and mem0-style systems on some broader memory semantics: temporal graph workflows beyond graph-lite relation context, explicit memory history, core-vs-archival blocks, and reviewable memory evolution. - Not yet proven on large private personal corpus migration, repeated batch backfill, cold-start persistence across every adapter, or long-running unattended production operation. So the answer is not "ELF is universally better." The current evidence supports "ELF is the better foundation for this repo's desired high-trust, evidence-linked memory system, and it can become the better personal-production choice if the P0 work lands and is benchmarked." @@ -84,7 +84,7 @@ Use these terms in future benchmark reports and Linear issues: | `wrong_result` | The system completed but returned an incorrect memory or missed the expected evidence. 
| mem0/memsearch/claude-mem smoke retrieval mismatch. | | `lifecycle_fail` | Retrieval may work, but update/delete/cold-start/persistence behavior is wrong or incomplete. | agentmemory adapter passing retrieval but not lifecycle. | | `incomplete` | The benchmark could not reach the behavioral check due to install/runtime/dependency failure. | OpenViking local embedding install failure in Docker. | -| `not_encoded` | Capability is not currently covered by the benchmark, so no pass/fail claim is allowed. | Viewer quality, batch backfill UX, graph temporal validity. | +| `not_encoded` | Capability is not currently covered by the benchmark, so no pass/fail claim is allowed. | Viewer quality and batch backfill UX. | | `blocked` | A safe test cannot run without external credentials, manual setup, or a dependency outside the issue scope. | Private corpus evaluation before sanitized corpus exists. | ## Priority Program @@ -319,21 +319,21 @@ Adopt from: Implementation shape: -- Add valid_from, valid_to or invalidated_at semantics for relation facts. -- Keep append-only relation history. -- Add APIs for current facts vs historical facts. -- Extend search relation_context to respect temporal validity. +- Use `valid_from` and `valid_to` semantics for relation facts. +- Keep append-only relation history and supersession evidence. +- Expose current versus historical temporal status in graph query and search relation context. +- Keep broader typed graph query ergonomics scoped to XY-70. Acceptance: - Contradictory facts do not overwrite silently. -- Search can choose current-only or historical relation context. -- Tests cover invalidation and old-state replay. +- Search relation context labels current and historical facts. +- Tests cover invalidation, current readback, and old-state replay. Linear mapping: - Existing related: XY-70 covers graph-lite typed schema/query. -- New issue required: `[ELF graph P1] Add temporal validity to graph-lite facts`. 
+- Focused implementation issue: XY-863 `[ELF graph P1] Add temporal validity to graph-lite relation context`. #### P1.4 Memory History and Evolution API @@ -518,7 +518,7 @@ Linear mapping: | 5 | P0 | Make external adapters lifecycle-durable and fail-typed | New, follows XY-801 | yes | fair external comparison | | 6 | P1 | Implement reviewable consolidation worker and proposal review flow | follows XY-800 | partly | knowledge pages | | 7 | P1 | Split XY-286 into derived page storage, rebuild, lint, and viewer/search integration | XY-286 | partly | durable knowledge layer | -| 8 | P1 | Add temporal validity to graph-lite facts | follows/relates XY-70 | yes | time-aware relation context | +| 8 | P1 | Add temporal validity to graph-lite relation context | XY-863, follows/relates XY-70 | yes | time-aware relation context | | 9 | P1 | Add memory history and evolution readback API | New | yes | lifecycle auditability | | 10 | P1 | Add scoped core memory blocks with archival separation | New | yes | agent operating context | | 11 | P1 | Add staged search trajectory profiles | New or XY-27 follow-up | after XY-27 | advanced retrieval tuning | diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 67bdba04..5660f322 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -382,6 +382,8 @@ Fields: - `temporal_validity`: optional object with `required`, `encoded`, and optional `follow_up`. When `required = true` and `encoded = false`, the job MUST declare `encoding.status = "not_encoded"` or `encoding.status = "blocked"`. + When `encoded = true`, the job is scored normally and must include concrete + produced evidence for current and historical validity behavior. 
### `operator_debug` @@ -547,7 +549,7 @@ Reports MUST include: Reports that encode `memory_evolution` jobs SHOULD also include stale-answer counts, conflict detection counts, update rationale availability, and temporal-validity `not_encoded` counts. A temporal graph validity job MUST NOT be reported as `pass` -until the runner can evaluate current-only versus historical relation facts. +unless the runner can evaluate current-only versus historical relation facts. Consolidation suite reports MUST also include: diff --git a/docs/spec/system_elf_memory_service_v2.md b/docs/spec/system_elf_memory_service_v2.md index d103944a..8c484d07 100644 --- a/docs/spec/system_elf_memory_service_v2.md +++ b/docs/spec/system_elf_memory_service_v2.md @@ -1071,6 +1071,7 @@ Response: }, "valid_from": "...", "valid_to": null, + "temporal_status": "current|historical|future", "evidence_note_ids": ["uuid", "uuid"] } ] @@ -1084,6 +1085,9 @@ Notes: - `relation_context` is omitted unless `search.graph_context.enabled` is true. - When present, relation context is evidence-bound and bounded by `search.graph_context.max_facts_per_item` and `search.graph_context.max_evidence_notes_per_fact`. +- `relation_context.temporal_status` is derived from the graph fact validity window at the search read timestamp. + Historical facts may be returned when they are evidence-linked to a selected note; they must be labeled + `historical` instead of being presented as current. - It is included wherever `SearchExplain` is returned, including admin trace surfaces (`/v2/admin/traces/*` and `/v2/admin/trace-items/*`), in addition to search responses. - Admin trace endpoints validate `tenant_id` + `project_id` only for access control. 
They are intended for @@ -1657,6 +1661,7 @@ Response: "predicate_id": "uuid|null", "valid_from": "...", "valid_to": "...|null", + "temporal_status": "current|historical|future", "object": { "entity": { "entity_id": "uuid", diff --git a/docs/spec/system_graph_memory_postgres_v1.md b/docs/spec/system_graph_memory_postgres_v1.md index afe8f0c9..92012ae0 100644 --- a/docs/spec/system_graph_memory_postgres_v1.md +++ b/docs/spec/system_graph_memory_postgres_v1.md @@ -194,6 +194,11 @@ Supersession rule (write-time): - An active fact is defined by: `valid_from <= now AND (valid_to IS NULL OR valid_to > now)`. - Active duplicate prevention is enforced by partial unique indexes. - When ingestion reintroduces a note equivalent to an existing active fact, the system reuses the existing fact row and appends additional evidence rows for the new note instead of creating another active duplicate fact row. +- Graph read APIs should expose relation temporal state derived from the validity window: + - `current` when `valid_from <= read_at AND (valid_to IS NULL OR valid_to > read_at)`. + - `historical` when `valid_to <= read_at`. + - `future` when `valid_from > read_at`. +- Search relation context may include historical facts when they are evidence-linked to a returned note, but it must label them as historical instead of silently treating them as current. ============================================================ 7. CALL EXAMPLES diff --git a/packages/elf-service/src/graph.rs b/packages/elf-service/src/graph.rs index 4302063a..8b187100 100644 --- a/packages/elf-service/src/graph.rs +++ b/packages/elf-service/src/graph.rs @@ -1,11 +1,25 @@ //! Graph retrieval and mutation APIs. +use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use uuid::Uuid; use crate::{ElfService, Error, Result}; use elf_storage::graph; +/// Temporal state for a graph relation fact relative to a read timestamp. 
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum RelationTemporalStatus { + /// The fact's validity window starts after the read timestamp. + Future, + /// The fact is valid at the read timestamp. + #[default] + Current, + /// The fact was invalidated before or at the read timestamp. + Historical, +} + #[allow(dead_code)] pub(crate) struct GraphUpsertFactArgs<'a> { pub tenant_id: &'a str, @@ -56,3 +70,18 @@ impl ElfService { Ok(fact_id) } } + +pub(crate) fn relation_temporal_status( + valid_from: OffsetDateTime, + valid_to: Option<OffsetDateTime>, + read_at: OffsetDateTime, +) -> RelationTemporalStatus { + if valid_from > read_at { + return RelationTemporalStatus::Future; + } + if valid_to.is_some_and(|valid_to| valid_to <= read_at) { + return RelationTemporalStatus::Historical; + } + + RelationTemporalStatus::Current +} diff --git a/packages/elf-service/src/graph_query.rs b/packages/elf-service/src/graph_query.rs index f949aa83..75e37d73 100644 --- a/packages/elf-service/src/graph_query.rs +++ b/packages/elf-service/src/graph_query.rs @@ -10,6 +10,7 @@ use uuid::Uuid; use crate::{ ElfService, Error, Result, access::{self, ORG_PROJECT_ID}, + graph::RelationTemporalStatus, search, }; use elf_storage::{graph, models::GraphEntity}; @@ -188,6 +189,8 @@ pub struct GraphQueryFact { #[serde(with = "crate::time_serde::option")] /// End of the fact validity window, if superseded. pub valid_to: Option<OffsetDateTime>, + /// Temporal state for the fact relative to the service read timestamp. + pub temporal_status: RelationTemporalStatus, /// Object payload for the fact. pub object: GraphQueryObject, /// Evidence note identifiers supporting the fact.
@@ -328,6 +331,7 @@ impl ElfService { .map(|item| format!("{}:{}", item.scope, item.space_owner_agent_id)) .collect(); let predicate_id = predicate.as_ref().map(|predicate| predicate.id); + let read_at = OffsetDateTime::now_utc(); let rows = fetch_graph_query_rows( &mut conn, GraphQueryRowsFetchParams { @@ -367,6 +371,11 @@ impl ElfService { predicate_id: row.predicate_id, valid_from: row.valid_from, valid_to: row.valid_to, + temporal_status: crate::graph::relation_temporal_status( + row.valid_from, + row.valid_to, + read_at, + ), object, evidence_note_ids: row.evidence_note_ids, } @@ -696,6 +705,7 @@ mod tests { use crate::{ ELF_GRAPH_QUERY_SCHEMA_V1, Error, GraphQueryFact, GraphQueryObject, GraphQueryObjectEntity, + graph::RelationTemporalStatus, graph_query::{self, GraphQueryEntityRef, GraphQueryRequest, OffsetDateTime}, }; @@ -737,6 +747,7 @@ mod tests { predicate_id: None, valid_from: OffsetDateTime::from_unix_timestamp(1).expect("valid timestamp"), valid_to: None, + temporal_status: RelationTemporalStatus::Current, object: GraphQueryObject { entity: Some(GraphQueryObjectEntity { entity_id: Uuid::from_u128(100), @@ -755,6 +766,7 @@ mod tests { predicate_id: None, valid_from: OffsetDateTime::from_unix_timestamp(2).expect("valid timestamp"), valid_to: None, + temporal_status: RelationTemporalStatus::Current, object: GraphQueryObject { entity: Some(GraphQueryObjectEntity { entity_id: Uuid::from_u128(101), @@ -773,6 +785,7 @@ mod tests { predicate_id: None, valid_from: OffsetDateTime::from_unix_timestamp(3).expect("valid timestamp"), valid_to: None, + temporal_status: RelationTemporalStatus::Current, object: GraphQueryObject { entity: None, value: Some("office".to_string()) }, evidence_note_ids: vec![], }, diff --git a/packages/elf-service/src/lib.rs b/packages/elf-service/src/lib.rs index 55f98c4d..4378befc 100644 --- a/packages/elf-service/src/lib.rs +++ b/packages/elf-service/src/lib.rs @@ -52,6 +52,7 @@ pub use self::{ TextPositionSelector, TextQuoteSelector, 
}, error::{Error, Result}, + graph::RelationTemporalStatus, graph_query::{ ELF_GRAPH_QUERY_SCHEMA_V1, GraphQueryEntity, GraphQueryEntityRef, GraphQueryExplain, GraphQueryFact, GraphQueryObject, GraphQueryObjectEntity, GraphQueryPredicate, diff --git a/packages/elf-service/src/search.rs b/packages/elf-service/src/search.rs index 1325c00e..efbbccb3 100644 --- a/packages/elf-service/src/search.rs +++ b/packages/elf-service/src/search.rs @@ -24,6 +24,7 @@ use uuid::Uuid; use crate::{ ElfService, Result, access::{self, ORG_PROJECT_ID}, + graph::RelationTemporalStatus, ranking_explain_v2::{self, SEARCH_RANKING_EXPLAIN_SCHEMA_V2, TraceTermsArgs}, }; use elf_config::{Config, SearchCache}; @@ -69,7 +70,8 @@ WITH selected_facts AS ( object_entity.kind AS object_kind, gf.object_value, gf.valid_from, - gf.valid_to + gf.valid_to, + (gf.valid_from <= $4 AND (gf.valid_to IS NULL OR gf.valid_to > $4)) AS is_current FROM unnest($7::uuid[]) AS snc(selected_note_id) JOIN graph_fact_evidence gfe ON gfe.note_id = snc.selected_note_id @@ -90,8 +92,12 @@ WITH selected_facts AS ( OR gf.scope = ANY($6::text[]) ) AND gf.valid_from <= $4 - AND (gf.valid_to IS NULL OR gf.valid_to > $4) - ORDER BY snc.selected_note_id, gf.fact_id, gf.valid_from DESC, gf.fact_id ASC + ORDER BY + snc.selected_note_id, + gf.fact_id, + (gf.valid_from <= $4 AND (gf.valid_to IS NULL OR gf.valid_to > $4)) DESC, + gf.valid_from DESC, + gf.fact_id ASC ), ranked_facts AS ( SELECT @@ -107,9 +113,10 @@ ranked_facts AS ( object_value, valid_from, valid_to, + is_current, ROW_NUMBER() OVER ( PARTITION BY selected_note_id - ORDER BY valid_from DESC, fact_id ASC + ORDER BY is_current DESC, valid_from DESC, fact_id ASC ) AS fact_rank FROM selected_facts ), @@ -127,6 +134,7 @@ bounded_facts AS ( object_value, valid_from, valid_to, + is_current, fact_rank FROM ranked_facts WHERE fact_rank <= $9 @@ -145,6 +153,7 @@ evidence_ranked AS ( bf.object_value, bf.valid_from, bf.valid_to, + bf.is_current, bf.fact_rank, e.note_id AS 
evidence_note_id, e.created_at AS evidence_created_at, @@ -170,6 +179,7 @@ fact_contexts AS ( object_value, valid_from, valid_to, + is_current, fact_rank, ARRAY_AGG(evidence_note_id ORDER BY evidence_created_at ASC, evidence_note_id ASC) AS evidence_note_ids FROM evidence_ranked @@ -187,6 +197,7 @@ fact_contexts AS ( object_value, valid_from, valid_to, + is_current, fact_rank ) SELECT @@ -202,6 +213,7 @@ SELECT object_value, valid_from, valid_to, + is_current, evidence_note_ids FROM fact_contexts ORDER BY note_id, fact_rank @@ -336,6 +348,9 @@ pub struct SearchExplainRelationContext { /// End of the fact validity window, if superseded. pub valid_to: Option, #[serde(default)] + /// Temporal state for the fact relative to the search read timestamp. + pub temporal_status: RelationTemporalStatus, + #[serde(default)] /// Evidence note identifiers supporting the fact. pub evidence_note_ids: Vec, } @@ -1208,6 +1223,7 @@ struct SearchRelationContextRow { object_value: Option, valid_from: OffsetDateTime, valid_to: Option, + is_current: bool, evidence_note_ids: Vec, } @@ -4745,6 +4761,11 @@ WHERE note_id = ANY($1::uuid[]) object, valid_from: row.valid_from, valid_to: row.valid_to, + temporal_status: if row.is_current { + RelationTemporalStatus::Current + } else { + RelationTemporalStatus::Historical + }, evidence_note_ids: row.evidence_note_ids, }, ); diff --git a/packages/elf-service/tests/acceptance/chunk_search.rs b/packages/elf-service/tests/acceptance/chunk_search.rs index 422ad36a..867ba014 100644 --- a/packages/elf-service/tests/acceptance/chunk_search.rs +++ b/packages/elf-service/tests/acceptance/chunk_search.rs @@ -15,8 +15,9 @@ use uuid::Uuid; use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; use elf_config::ProviderConfig; use elf_service::{ - BoxFuture, ElfService, NoteFetchResponse, PayloadLevel, Providers, RerankProvider, Result, - SearchDetailsRequest, SearchRequest, SearchTimelineRequest, TraceTrajectoryGetRequest, + BoxFuture, 
ElfService, NoteFetchResponse, PayloadLevel, Providers, RelationTemporalStatus, + RerankProvider, Result, SearchDetailsRequest, SearchRequest, SearchTimelineRequest, + TraceTrajectoryGetRequest, }; use elf_storage::qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME}; use elf_testkit::TestDatabase; @@ -585,7 +586,7 @@ async fn setup_graph_context_test( async fn seed_relation_context_fixture( service: &ElfService, embedding_version: &str, -) -> (Uuid, Uuid) { +) -> (Uuid, Uuid, Uuid) { let now = OffsetDateTime::now_utc(); let note_id = Uuid::new_v4(); let note_id_2 = Uuid::new_v4(); @@ -630,7 +631,7 @@ async fn seed_relation_context_fixture( predicate_id, "Bob", older_fact_valid_from, - None, + Some(newer_fact_valid_from), ) .await; insert_graph_fact_evidence( @@ -666,7 +667,7 @@ async fn seed_relation_context_fixture( ) .await; - (note_id, newer_fact_id) + (note_id, newer_fact_id, older_fact_id) } #[tokio::test] @@ -769,12 +770,74 @@ async fn search_raw_quick_includes_relation_context_and_respects_fact_bounds() { "Expected the most recent fact after truncation." ); assert_eq!(relation_context[0].object.value.as_deref(), Some("Carol")); + assert_eq!(relation_context[0].temporal_status, RelationTemporalStatus::Current); + assert!(relation_context[0].valid_to.is_none()); assert_eq!(relation_context[0].evidence_note_ids.len(), 1); assert_eq!(relation_context[0].evidence_note_ids[0], note_id); context.test_db.cleanup().await.expect("Failed to cleanup test database."); } +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. 
Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn search_raw_quick_marks_historical_relation_context() { + let providers = build_providers(StubRerank); + let Some(context) = setup_graph_context_test( + "search_raw_quick_marks_historical_relation_context", + providers, + 2, + 2, + ) + .await + else { + return; + }; + let fixture = seed_relation_context_fixture(&context.service, &context.embedding_version).await; + let older_fact_id = fixture.2; + let response = context + .service + .search_raw_quick(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "Alice".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search failed."); + let item = response.items.first().expect("Expected search result."); + let relation_context = item + .explain + .relation_context + .as_ref() + .expect("Expected relation context in search explain."); + + assert_eq!( + relation_context.len(), + 2, + "Expected current and historical relation facts in context.", + ); + assert_eq!(relation_context[0].temporal_status, RelationTemporalStatus::Current); + + let historical = relation_context + .iter() + .find(|context| context.fact_id == older_fact_id) + .expect("Expected historical fact in relation context."); + + assert_eq!(historical.object.value.as_deref(), Some("Bob")); + assert_eq!(historical.temporal_status, RelationTemporalStatus::Historical); + assert!(historical.valid_to.is_some()); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} + #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. 
Set ELF_PG_DSN and ELF_QDRANT_URL to run."] async fn search_stitches_adjacent_chunks() { diff --git a/packages/elf-service/tests/acceptance/graph_ingestion.rs b/packages/elf-service/tests/acceptance/graph_ingestion.rs index 639c9096..511c2195 100644 --- a/packages/elf-service/tests/acceptance/graph_ingestion.rs +++ b/packages/elf-service/tests/acceptance/graph_ingestion.rs @@ -13,7 +13,8 @@ use elf_config::EmbeddingProviderConfig; use elf_domain::memory_policy::MemoryPolicyDecision; use elf_service::{ AddEventRequest, AddNoteInput, AddNoteRequest, BoxFuture, ElfService, EmbeddingProvider, - EventMessage, NoteOp, Providers, Result, StructuredFields, + EventMessage, GraphQueryEntityRef, GraphQueryPredicateRef, GraphQueryRequest, NoteOp, + Providers, RelationTemporalStatus, Result, StructuredFields, }; const TEST_TENANT: &str = "t"; @@ -153,6 +154,21 @@ fn duplicate_fact_attaches_multiple_evidence_request() -> AddNoteRequest { } } +fn works_at_graph_query_request(as_of: OffsetDateTime) -> GraphQueryRequest { + GraphQueryRequest { + tenant_id: TEST_TENANT.to_string(), + project_id: TEST_PROJECT.to_string(), + agent_id: "a".to_string(), + read_profile: "private_only".to_string(), + subject: GraphQueryEntityRef::Surface { surface: "Alice".to_string() }, + predicate: Some(GraphQueryPredicateRef::Surface { surface: "works at".to_string() }), + scopes: Some(vec![TEST_SCOPE.to_string()]), + as_of: Some(as_of), + limit: Some(10), + explain: Some(true), + } +} + async fn graph_fact_id(pool: &PgPool) -> Uuid { sqlx::query_scalar( "\ @@ -478,8 +494,9 @@ async fn add_note_single_predicate_supersedes_conflicting_fact() { acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); - add_fact_note(&service, "employment-a", "Alice works at Initech.", "works at", "Initech").await; - + let old_note_id = + add_fact_note(&service, "employment-a", "Alice works at Initech.", "works at", "Initech") + .await; let fact_a = graph_fact_row(&service.db.pool, "works 
at", "Initech").await; let predicate_id = fact_a.predicate_id.expect("Expected predicate_id."); @@ -510,6 +527,27 @@ async fn add_note_single_predicate_supersedes_conflicting_fact() { assert_eq!(active_after.as_deref(), Some("Globex")); + let historical_replay = service + .graph_query(works_at_graph_query_request(t_before)) + .await + .expect("historical graph query failed."); + + assert_eq!(historical_replay.facts.len(), 1); + assert_eq!(historical_replay.facts[0].object.value.as_deref(), Some("Initech")); + assert_eq!(historical_replay.facts[0].valid_to, Some(fact_b.valid_from)); + assert_eq!(historical_replay.facts[0].temporal_status, RelationTemporalStatus::Historical); + assert_eq!(historical_replay.facts[0].evidence_note_ids, vec![old_note_id]); + + let current_readback = service + .graph_query(works_at_graph_query_request(t_after)) + .await + .expect("current graph query failed."); + + assert_eq!(current_readback.facts.len(), 1); + assert_eq!(current_readback.facts[0].object.value.as_deref(), Some("Globex")); + assert_eq!(current_readback.facts[0].temporal_status, RelationTemporalStatus::Current); + assert_eq!(current_readback.facts[0].evidence_note_ids, vec![note_id]); + let supersession_count = supersession_count(&service.db.pool, fact_a.fact_id, fact_b.fact_id, note_id).await;