diff --git a/README.md b/README.md index 414723df..f9ef9e1b 100644 --- a/README.md +++ b/README.md @@ -149,10 +149,11 @@ provider-backed ELF evidence was required. mem0, OpenViking, and claude-mem remained typed non-pass states. OpenViking now reaches its pinned Docker local embedding path and is reported as `wrong_result` when same-corpus evidence terms are missed; setup failures remain `incomplete`. -- Real-world agent memory aggregate after the P1 benchmark batch: 40 fixture-backed - jobs across 11 suites, 38 pass, 0 incomplete, 2 blocked, 0 wrong-result, - 0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are - production-ops operator boundaries, not hidden benchmark wins. +- Real-world agent memory aggregate after XY-928: 43 fixture-backed jobs across + 12 suites, 38 pass, 0 incomplete, 5 blocked, 0 wrong-result, 0 not-encoded, and + 0 unsupported-claim results. The remaining non-pass jobs are production-ops + operator boundaries plus blocked OpenViking staged trajectory, hierarchy selection, + and recursive/context expansion measurement gates, not hidden benchmark wins. - Full-suite live real-world adapter sweep after XY-899: ELF and qmd emit Docker-isolated `live_real_world` records for all 40 encoded jobs across 11 suites through `cargo make real-world-memory-live-adapters`. 
Both keep the original diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 10acb39e..c6074d60 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -29,7 +29,7 @@ }, "run": { "status": "blocked", - "evidence": "The current fixture set reports 40 jobs, 38 pass, 0 incomplete, 2 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim.", + "evidence": "The current fixture set reports 43 jobs, 38 pass, 0 incomplete, 5 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim.", "command": "cargo make real-world-memory", "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, @@ -110,6 +110,11 @@ "suite_id": "personalization", "status": "pass", "evidence": "The scoped preference fixture is encoded and passing." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked until same-corpus evidence ids and staged artifacts are materialized." } ], "evidence": [ @@ -126,7 +131,7 @@ ], "notes": [ "This adapter record exists to keep ELF fixture results separate from live external adapter results.", - "The remaining non-pass ELF fixture states are production-ops operator boundaries: provider credentials and an operator-owned private corpus manifest.", + "The remaining non-pass ELF fixture states are production-ops operator boundaries plus OpenViking context-trajectory measurement gates.", "Use elf_live_real_world for service-runtime real_world_job evidence; this fixture-backed record must not imply live-service behavior." 
] }, @@ -1189,7 +1194,7 @@ }, "run": { "status": "wrong_result", - "evidence": "The adapter reached same-corpus add_resource/find, but returned 0 of 3 expected evidence-term matches in the smoke run.", + "evidence": "The adapter reached same-corpus add_resource/find and now exposes expected/matched/missing evidence ids, but returned 0 of 3 expected evidence-term matches in the smoke run.", "artifact": "tmp/live-baseline/live-baseline-report.json" }, "result": { @@ -1210,8 +1215,8 @@ }, { "capability": "context_trajectory", - "status": "not_encoded", - "evidence": "OpenViking staged/hierarchical retrieval is a reference dimension but is not encoded as a real_world_job run." + "status": "blocked", + "evidence": "OpenViking staged/hierarchical retrieval is now encoded as blocked context_trajectory fixtures until same-corpus expected evidence ids match and staged artifacts are materialized." }, { "capability": "real_world_job_adapter", @@ -1231,9 +1236,9 @@ "evidence": "Hierarchical context resume scenarios are not encoded for OpenViking." }, { - "suite_id": "operator_debugging_ux", - "status": "not_encoded", - "evidence": "Stage trajectory readback is not encoded in this runner." + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked behind same-corpus evidence output and staged artifact readback." } ], "evidence": [ @@ -1266,11 +1271,11 @@ ] }, "notes": [ - "Record OpenViking as wrong_result now that the pinned Docker local embedding path reaches add_resource/find but misses expected evidence." + "Record OpenViking as wrong_result now that the pinned Docker local embedding path reaches add_resource/find but misses expected evidence; keep context_trajectory as blocked until staged artifacts exist." 
], "follow_up": { - "title": "Fix OpenViking evidence-bearing same-corpus retrieval output", - "reason": "The current adapter reaches add_resource/find but must return evidence-bearing content before real-world job suites can be scored." + "title": "Fix OpenViking evidence-bearing same-corpus retrieval output and materialize staged artifacts", + "reason": "The current adapter reaches add_resource/find and exposes expected evidence ids, but must match evidence ids and return stage/hierarchy/recursive artifacts before trajectory quality can be scored." } }, { @@ -1481,7 +1486,7 @@ "evidence_class": "research_gate", "docker_default": true, "host_global_installs_required": false, - "overall_status": "not_encoded", + "overall_status": "blocked", "setup": { "status": "pass", "evidence": "The default pinned OpenViking local embedding dependency path reaches runtime in Docker.", @@ -1489,12 +1494,12 @@ "artifact": "tmp/live-baseline/OpenViking.log" }, "run": { - "status": "not_encoded", - "evidence": "The XY-899 strength-profile report records staged retrieval, hierarchy selection, recursive/context expansion, and missed-term evidence as typed not_tested or wrong_result states; no new live trajectory adapter artifact is claimed." + "status": "blocked", + "evidence": "The XY-928 context_trajectory fixtures encode staged retrieval, hierarchy selection, and recursive/context expansion as blocked; no live trajectory adapter artifact is claimed." 
}, "result": { - "status": "not_encoded", - "evidence": "No OpenViking deep context-trajectory result is claimed from the current wrong-result smoke run; the XY-899 report preserves the trajectory surfaces as not_tested.", + "status": "blocked", + "evidence": "No OpenViking deep context-trajectory result is claimed from the current wrong-result smoke run; the XY-928 fixtures preserve trajectory surfaces as blocked/not_tested.", "artifact": "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json" }, "capabilities": [ @@ -1505,8 +1510,8 @@ }, { "capability": "hierarchical_context_trajectory", - "status": "not_encoded", - "evidence": "Stage trajectory scoring remains not encoded until the smoke adapter returns evidence-bearing same-corpus output instead of the current wrong_result missed-term evidence." + "status": "blocked", + "evidence": "Stage trajectory scoring is encoded as blocked until the smoke adapter returns evidence-bearing same-corpus output and selected hierarchy/expansion artifacts." }, { "capability": "host_global_install_boundary", @@ -1517,13 +1522,13 @@ "suites": [ { "suite_id": "retrieval", - "status": "not_encoded", - "evidence": "Deep retrieval scoring is deferred until the smoke adapter returns evidence-bearing same-corpus output." + "status": "wrong_result", + "evidence": "Same-corpus retrieval is still the precondition and remains wrong_result in the live baseline." }, { - "suite_id": "work_resume", - "status": "not_encoded", - "evidence": "No OpenViking resume or context trajectory real_world_job run is encoded." + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion jobs are encoded as blocked fixtures." 
}, { "suite_id": "operator_debugging_ux", @@ -1557,12 +1562,12 @@ "retry_guidance": [ "Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first.", "Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform.", - "Fix evidence-bearing same-corpus output before adding context-trajectory real_world_job scoring for hierarchical retrieval." + "Fix evidence-bearing same-corpus output and materialize selected hierarchy/expansion artifacts before converting blocked context_trajectory fixtures into scored jobs." ], - "research_depth": "D2 reviewed; local embedding setup pinned; deep profile not encoded" + "research_depth": "D2 reviewed; local embedding setup pinned; blocked fixtures encoded" }, "notes": [ - "OpenViking remains a context-trajectory reference, but this gate prevents a smoke wrong_result from becoming a deep-profile claim." + "OpenViking remains a context-trajectory reference, but this gate prevents a smoke wrong_result or blocked fixture from becoming a deep-profile win claim." 
] }, { diff --git a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_hierarchy_selection_blocked.json b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_hierarchy_selection_blocked.json new file mode 100644 index 00000000..96e48c4e --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_hierarchy_selection_blocked.json @@ -0,0 +1,261 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "context-trajectory-openviking-hierarchy-selection-001", + "suite": "context_trajectory", + "title": "Gate OpenViking hierarchy selection scoring on materialized hierarchy output", + "encoding": { + "status": "blocked", + "reason": "OpenViking hierarchy selection is encoded as a benchmark job, but scoring is blocked until the adapter emits selected hierarchy nodes with evidence ids after the same-corpus precondition passes.", + "follow_up": { + "title": "Materialize OpenViking selected hierarchy nodes", + "reason": "The context-trajectory adapter must return selected parent, child, and resource nodes with evidence ids before hierarchy quality can be scored against ELF." 
+ } + }, + "corpus": { + "corpus_id": "real-world-memory-context-trajectory-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "hierarchy-selection-output-contract", + "kind": "adapter_state", + "text": "A scored OpenViking hierarchy selection job must report the selected parent context, selected child context, final resource evidence ids, and the rejected sibling or decoy context.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_hierarchy_selection_blocked", + "evidence_id": "hierarchy-selection-output-contract" + }, + "locator": { + "quote": "selected parent context, selected child context, final resource evidence ids" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "same-corpus-before-hierarchy", + "kind": "adapter_state", + "text": "Hierarchy selection remains blocked until OpenViking same-corpus retrieval covers every expected evidence id instead of only reaching setup and returning wrong_result.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_hierarchy_selection_blocked", + "evidence_id": "same-corpus-before-hierarchy" + }, + "locator": { + "quote": "covers every expected evidence id" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "hierarchy-comparison-requires-elf-equivalent", + "kind": "runbook", + "text": "ELF hierarchy or trace behavior may be compared only if the same hierarchy-selection scenario is encoded and produces comparable selected-node and rejected-node evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_hierarchy_selection_blocked", + "evidence_id": "hierarchy-comparison-requires-elf-equivalent" + }, + "locator": { + "quote": "same hierarchy-selection scenario is encoded" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, 
+ { + "evidence_id": "hierarchy-design-win-decoy", + "kind": "adapter_state", + "text": "Decoy: OpenViking should win hierarchy selection solely because its design uses viking:// hierarchy paths.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_hierarchy_selection_blocked", + "evidence_id": "hierarchy-design-win-decoy" + } + }, + "created_at": "2026-06-10T00:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_context_trajectory", + "answer": { + "content": "OpenViking hierarchy selection is blocked until selected hierarchy nodes and evidence ids are materialized. OpenViking's hierarchy design remains a reference, not a scored win, tie, or loss, until comparable output exists.", + "claims": [ + { + "claim_id": "hierarchy_selection_blocked", + "text": "OpenViking hierarchy selection is blocked until selected hierarchy nodes and evidence ids are materialized.", + "evidence_ids": [ + "hierarchy-selection-output-contract", + "same-corpus-before-hierarchy" + ], + "confidence": "high" + }, + { + "claim_id": "design_reference_not_score", + "text": "OpenViking's hierarchy design remains a reference, not a scored win, tie, or loss, until comparable output exists.", + "evidence_ids": ["hierarchy-comparison-requires-elf-equivalent"], + "confidence": "high" + } + ], + "evidence_ids": [ + "hierarchy-selection-output-contract", + "same-corpus-before-hierarchy", + "hierarchy-comparison-requires-elf-equivalent" + ], + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "hierarchy-output-contract-recorded", + "ts": "2026-06-11T00:00:00Z", + "actor": "agent", + "action": "encoded_output_contract", + "evidence_ids": ["hierarchy-selection-output-contract"], + "summary": "The fixture records the minimum hierarchy readback needed before scoring." 
+ }, + { + "event_id": "hierarchy-precondition-blocked", + "ts": "2026-06-11T00:01:00Z", + "actor": "agent", + "action": "blocked_scoring", + "evidence_ids": ["same-corpus-before-hierarchy"], + "summary": "The benchmark blocks hierarchy selection scoring until same-corpus evidence ids match." + }, + { + "event_id": "hierarchy-comparison-gated", + "ts": "2026-06-11T00:02:00Z", + "actor": "agent", + "action": "preserved_claim_boundary", + "evidence_ids": ["hierarchy-comparison-requires-elf-equivalent"], + "summary": "The benchmark requires comparable ELF and OpenViking hierarchy artifacts before any win/tie/loss." + } + ], + "prompt": { + "role": "user", + "content": "Can the benchmark score OpenViking hierarchy selection quality against ELF?", + "job_mode": "answer", + "constraints": [ + "cite_evidence", + "preserve_typed_status", + "separate_design_reference_from_scored_output" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "hierarchy_selection_blocked", + "text": "OpenViking hierarchy selection is blocked until selected hierarchy nodes and evidence ids are materialized." + }, + { + "claim_id": "design_reference_not_score", + "text": "OpenViking's hierarchy design remains a reference, not a scored win, tie, or loss, until comparable output exists." 
+ } + ], + "must_not_include": [ + "OpenViking wins hierarchy selection", + "ELF wins hierarchy selection" + ], + "evidence_links": { + "hierarchy_selection_blocked": [ + "hierarchy-selection-output-contract", + "same-corpus-before-hierarchy" + ], + "design_reference_not_score": ["hierarchy-comparison-requires-elf-equivalent"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "hierarchy-selection-output-contract", + "claim_id": "hierarchy_selection_blocked", + "requirement": "cite", + "quote": "selected parent context, selected child context, final resource evidence ids" + }, + { + "evidence_id": "same-corpus-before-hierarchy", + "claim_id": "hierarchy_selection_blocked", + "requirement": "cite", + "quote": "covers every expected evidence id" + }, + { + "evidence_id": "hierarchy-comparison-requires-elf-equivalent", + "claim_id": "design_reference_not_score", + "requirement": "cite", + "quote": "same hierarchy-selection scenario is encoded" + } + ], + "negative_traps": [ + { + "trap_id": "hierarchy-design-win-decoy", + "type": "unsupported_prior", + "evidence_ids": ["hierarchy-design-win-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States hierarchy selection is blocked until output is materialized." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites hierarchy output requirements and same-corpus precondition evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not convert design references into scored wins or losses." + }, + "uncertainty_handling": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps the hierarchy comparison caveated as blocked." 
+ }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Names the selected-node artifact needed next." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "Hierarchy selection is blocked.", + "Comparable selected-node evidence is missing." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "external_adapter", + "openviking", + "context_trajectory", + "hierarchy_selection", + "blocked", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_recursive_expansion_blocked.json b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_recursive_expansion_blocked.json new file mode 100644 index 00000000..16b41a45 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_recursive_expansion_blocked.json @@ -0,0 +1,261 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "context-trajectory-openviking-recursive-expansion-001", + "suite": "context_trajectory", + "title": "Gate OpenViking recursive context expansion on materialized expansion paths", + "encoding": { + "status": "blocked", + "reason": "OpenViking recursive/context expansion is encoded as a benchmark job, but scoring is blocked until the adapter materializes expansion paths and same-corpus evidence ids are correct.", + "follow_up": { + "title": "Materialize OpenViking recursive context expansion paths", + "reason": "The adapter must emit the seed context, expanded child contexts, final evidence ids, and pruned branches before recursive expansion quality can be scored." 
+ } + }, + "corpus": { + "corpus_id": "real-world-memory-context-trajectory-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "recursive-expansion-output-contract", + "kind": "adapter_state", + "text": "A scored recursive/context expansion job must report the seed context, expanded child contexts, final evidence ids, and pruned branches for the same user prompt.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_recursive_expansion_blocked", + "evidence_id": "recursive-expansion-output-contract" + }, + "locator": { + "quote": "seed context, expanded child contexts, final evidence ids, and pruned branches" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "recursive-same-corpus-gate", + "kind": "adapter_state", + "text": "Recursive/context expansion scoring stays blocked until same-corpus retrieval returns the expected evidence ids and the recursive path output is materialized.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_recursive_expansion_blocked", + "evidence_id": "recursive-same-corpus-gate" + }, + "locator": { + "quote": "same-corpus retrieval returns the expected evidence ids" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "recursive-elf-comparison-gate", + "kind": "runbook", + "text": "ELF recursive or trace expansion may be compared only where the same recursive/context expansion scenario is encoded and both sides publish expansion-path artifacts.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_recursive_expansion_blocked", + "evidence_id": "recursive-elf-comparison-gate" + }, + "locator": { + "quote": "both sides publish expansion-path artifacts" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": 
"recursive-expansion-win-decoy", + "kind": "adapter_state", + "text": "Decoy: ELF should be scored as tying OpenViking recursive expansion because both systems have trace-related documentation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_recursive_expansion_blocked", + "evidence_id": "recursive-expansion-win-decoy" + } + }, + "created_at": "2026-06-10T00:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_context_trajectory", + "answer": { + "content": "OpenViking recursive/context expansion is blocked until expansion paths and expected evidence ids are materialized. No ELF tie, win, or loss is allowed until both systems publish comparable expansion-path artifacts for the same scenario.", + "claims": [ + { + "claim_id": "recursive_expansion_blocked", + "text": "OpenViking recursive/context expansion is blocked until expansion paths and expected evidence ids are materialized.", + "evidence_ids": [ + "recursive-expansion-output-contract", + "recursive-same-corpus-gate" + ], + "confidence": "high" + }, + { + "claim_id": "recursive_comparison_not_scored", + "text": "No ELF tie, win, or loss is allowed until both systems publish comparable expansion-path artifacts for the same scenario.", + "evidence_ids": ["recursive-elf-comparison-gate"], + "confidence": "high" + } + ], + "evidence_ids": [ + "recursive-expansion-output-contract", + "recursive-same-corpus-gate", + "recursive-elf-comparison-gate" + ], + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "recursive-output-contract-recorded", + "ts": "2026-06-11T00:00:00Z", + "actor": "agent", + "action": "encoded_output_contract", + "evidence_ids": ["recursive-expansion-output-contract"], + "summary": "The fixture records the recursive expansion artifact needed before scoring." 
+ }, + { + "event_id": "recursive-scoring-blocked", + "ts": "2026-06-11T00:01:00Z", + "actor": "agent", + "action": "blocked_scoring", + "evidence_ids": ["recursive-same-corpus-gate"], + "summary": "The benchmark blocks recursive expansion scoring until expected evidence ids and expansion paths are available." + }, + { + "event_id": "recursive-comparison-gated", + "ts": "2026-06-11T00:02:00Z", + "actor": "agent", + "action": "preserved_claim_boundary", + "evidence_ids": ["recursive-elf-comparison-gate"], + "summary": "The benchmark requires comparable expansion-path artifacts before any ELF comparison." + } + ], + "prompt": { + "role": "user", + "content": "Can the benchmark score OpenViking recursive context expansion against ELF?", + "job_mode": "answer", + "constraints": [ + "cite_evidence", + "preserve_typed_status", + "do_not_claim_tie_without_comparable_artifacts" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "recursive_expansion_blocked", + "text": "OpenViking recursive/context expansion is blocked until expansion paths and expected evidence ids are materialized." + }, + { + "claim_id": "recursive_comparison_not_scored", + "text": "No ELF tie, win, or loss is allowed until both systems publish comparable expansion-path artifacts for the same scenario." 
+ } + ], + "must_not_include": [ + "ELF ties OpenViking recursive expansion", + "OpenViking recursive expansion passed" + ], + "evidence_links": { + "recursive_expansion_blocked": [ + "recursive-expansion-output-contract", + "recursive-same-corpus-gate" + ], + "recursive_comparison_not_scored": ["recursive-elf-comparison-gate"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "recursive-expansion-output-contract", + "claim_id": "recursive_expansion_blocked", + "requirement": "cite", + "quote": "seed context, expanded child contexts, final evidence ids, and pruned branches" + }, + { + "evidence_id": "recursive-same-corpus-gate", + "claim_id": "recursive_expansion_blocked", + "requirement": "cite", + "quote": "same-corpus retrieval returns the expected evidence ids" + }, + { + "evidence_id": "recursive-elf-comparison-gate", + "claim_id": "recursive_comparison_not_scored", + "requirement": "cite", + "quote": "both sides publish expansion-path artifacts" + } + ], + "negative_traps": [ + { + "trap_id": "recursive-expansion-trace-doc-decoy", + "type": "unsupported_prior", + "evidence_ids": ["recursive-expansion-win-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States recursive/context expansion is blocked, not tied or passed." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites expansion-path and same-corpus evidence gates." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not convert documentation or trace presence into a scored tie." + }, + "uncertainty_handling": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps the recursive expansion comparison caveated as blocked." 
+ }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Names expansion-path artifacts required next." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "Recursive expansion is blocked.", + "Comparable expansion-path artifacts are missing." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "external_adapter", + "openviking", + "context_trajectory", + "recursive_expansion", + "blocked", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_staged_retrieval_blocked.json b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_staged_retrieval_blocked.json new file mode 100644 index 00000000..b27fedb6 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_staged_retrieval_blocked.json @@ -0,0 +1,260 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "context-trajectory-openviking-staged-retrieval-001", + "suite": "context_trajectory", + "title": "Gate OpenViking staged retrieval trajectory on evidence-bearing same-corpus output", + "encoding": { + "status": "blocked", + "reason": "OpenViking staged retrieval trajectory is encoded as a benchmark job, but scoring is blocked until same-corpus output returns expected evidence ids and comparable staged artifacts exist.", + "follow_up": { + "title": "Run OpenViking staged trajectory after same-corpus evidence passes", + "reason": "The adapter must first publish matched expected evidence ids for every same-corpus query, then emit stage-level context trajectory output that can be compared with the equivalent ELF trace/session trajectory." 
+ } + }, + "corpus": { + "corpus_id": "real-world-memory-context-trajectory-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "openviking-evidence-id-output-contract", + "kind": "adapter_state", + "text": "The OpenViking Docker baseline must emit expected_evidence_ids, matched_evidence_ids, and missing_evidence_ids for every same-corpus query before staged trajectory scoring is allowed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "repo_file/v1", + "ref": { + "path": "scripts/live-baseline-benchmark.sh" + }, + "locator": { + "symbol": "project_openviking" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "openviking-same-corpus-precondition-blocked", + "kind": "adapter_state", + "text": "OpenViking staged retrieval trajectory remains blocked while same-corpus retrieval is wrong_result or while matched_evidence_ids does not cover every expected evidence id.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_staged_retrieval_blocked", + "evidence_id": "openviking-same-corpus-precondition-blocked" + }, + "locator": { + "quote": "same-corpus retrieval is wrong_result" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "elf-comparison-requires-comparable-trajectory", + "kind": "runbook", + "text": "ELF trace or search-session trajectory may be compared only after the same context-trajectory scenario is encoded and both systems publish comparable stage artifacts.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_staged_retrieval_blocked", + "evidence_id": "elf-comparison-requires-comparable-trajectory" + }, + "locator": { + "quote": "both systems publish comparable stage artifacts" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "trajectory-win-decoy", + "kind": "adapter_state", + "text": "Decoy: ELF 
should be scored as winning staged trajectory because OpenViking same-corpus retrieval is currently wrong_result.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_staged_retrieval_blocked", + "evidence_id": "trajectory-win-decoy" + } + }, + "created_at": "2026-06-10T00:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_context_trajectory", + "answer": { + "content": "OpenViking staged retrieval trajectory is blocked until same-corpus output matches expected evidence ids. No ELF win, tie, or loss is allowed until both systems publish comparable stage artifacts for the same context-trajectory scenario.", + "claims": [ + { + "claim_id": "staged_trajectory_blocked", + "text": "OpenViking staged retrieval trajectory is blocked until same-corpus output matches expected evidence ids.", + "evidence_ids": [ + "openviking-evidence-id-output-contract", + "openviking-same-corpus-precondition-blocked" + ], + "confidence": "high" + }, + { + "claim_id": "elf_comparison_not_scored", + "text": "No ELF win, tie, or loss is allowed until both systems publish comparable stage artifacts for the same context-trajectory scenario.", + "evidence_ids": ["elf-comparison-requires-comparable-trajectory"], + "confidence": "high" + } + ], + "evidence_ids": [ + "openviking-evidence-id-output-contract", + "openviking-same-corpus-precondition-blocked", + "elf-comparison-requires-comparable-trajectory" + ], + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "openviking-evidence-id-contract-added", + "ts": "2026-06-11T00:00:00Z", + "actor": "agent", + "action": "encoded_output_contract", + "evidence_ids": ["openviking-evidence-id-output-contract"], + "summary": "The OpenViking baseline output contract now names expected, matched, and missing evidence ids per query." 
+ }, + { + "event_id": "staged-trajectory-blocked", + "ts": "2026-06-11T00:01:00Z", + "actor": "agent", + "action": "blocked_scoring", + "evidence_ids": ["openviking-same-corpus-precondition-blocked"], + "summary": "The staged trajectory benchmark remains blocked behind same-corpus evidence-bearing output." + }, + { + "event_id": "elf-comparison-gated", + "ts": "2026-06-11T00:02:00Z", + "actor": "agent", + "action": "preserved_claim_boundary", + "evidence_ids": ["elf-comparison-requires-comparable-trajectory"], + "summary": "The benchmark does not compare ELF trajectory output until both sides emit comparable artifacts." + } + ], + "prompt": { + "role": "user", + "content": "Can the benchmark score OpenViking staged retrieval trajectory against ELF now?", + "job_mode": "debug", + "constraints": [ + "cite_evidence", + "preserve_typed_status", + "do_not_claim_elf_win_without_comparable_artifacts" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "staged_trajectory_blocked", + "text": "OpenViking staged retrieval trajectory is blocked until same-corpus output matches expected evidence ids." + }, + { + "claim_id": "elf_comparison_not_scored", + "text": "No ELF win, tie, or loss is allowed until both systems publish comparable stage artifacts for the same context-trajectory scenario." 
+ } + ], + "must_not_include": [ + "ELF wins staged trajectory", + "OpenViking staged trajectory passed" + ], + "evidence_links": { + "staged_trajectory_blocked": [ + "openviking-evidence-id-output-contract", + "openviking-same-corpus-precondition-blocked" + ], + "elf_comparison_not_scored": ["elf-comparison-requires-comparable-trajectory"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "openviking-evidence-id-output-contract", + "claim_id": "staged_trajectory_blocked", + "requirement": "cite", + "quote": "expected_evidence_ids, matched_evidence_ids, and missing_evidence_ids" + }, + { + "evidence_id": "openviking-same-corpus-precondition-blocked", + "claim_id": "staged_trajectory_blocked", + "requirement": "cite", + "quote": "same-corpus retrieval is wrong_result" + }, + { + "evidence_id": "elf-comparison-requires-comparable-trajectory", + "claim_id": "elf_comparison_not_scored", + "requirement": "cite", + "quote": "both systems publish comparable stage artifacts" + } + ], + "negative_traps": [ + { + "trap_id": "trajectory-win-from-precondition-decoy", + "type": "unsupported_prior", + "evidence_ids": ["trajectory-win-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States the staged trajectory job is blocked, not won or passed." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the evidence-id output contract and comparable-artifact gate." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids converting the same-corpus wrong_result into an ELF trajectory win." + }, + "debuggability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Identifies the blocked precondition and next artifact needed." 
+ }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Gives a concrete next benchmark gate." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "The staged trajectory score is blocked.", + "Comparable stage artifacts are missing." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "external_adapter", + "openviking", + "context_trajectory", + "staged_retrieval", + "blocked", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index a167d2bd..efd4a34a 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -54,6 +54,7 @@ const SUITES: &[&str] = &[ "capture_integration", "production_ops", "personalization", + "context_trajectory", ]; #[derive(Debug, Parser)] diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index dee50e09..9b39fd6a 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -64,6 +64,10 @@ fn production_ops_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("production_ops") } +fn context_trajectory_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("context_trajectory") +} + fn workspace_root() -> Result { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let root = manifest_dir @@ -524,13 +528,13 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/overall_status_counts/blocked") .and_then(Value::as_u64), - Some(5) + Some(6) ); assert_eq!( report .pointer("/external_adapters/summary/overall_status_counts/not_encoded") 
.and_then(Value::as_u64), - Some(7) + Some(6) ); assert_eq!( report @@ -548,7 +552,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(13) + Some(16) ); assert_eq!( report @@ -698,8 +702,8 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { let qmd_deep = find_by_field(adapters, "/adapter_id", "qmd_deep_profile_gate")?; let openviking_deep = find_by_field(adapters, "/adapter_id", "openviking_deep_profile_gate")?; - assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); - assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert_elf_fixture_adapter_record(elf)?; + assert_eq!( elf_live.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world") @@ -773,6 +777,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { assert_graphiti_zep_adapter(graphiti_zep); assert_graphify_adapter(graphify)?; + assert_qmd_deep_profile_gate(qmd_deep); assert_eq!( qmd_deep.pointer("/capabilities/2/status").and_then(Value::as_str), @@ -797,6 +802,30 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { Ok(()) } +fn assert_elf_fixture_adapter_record(adapter: &Value) -> Result<()> { + assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + + let suites = array_at(adapter, "/suites")?; + let context_trajectory = find_by_field(suites, "/suite_id", "context_trajectory")?; + + assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + adapter + .pointer("/notes/1") + .and_then(Value::as_str) + .is_some_and(|note| note.contains("OpenViking context-trajectory measurement gates")) + ); + + Ok(()) +} + +fn 
assert_qmd_deep_profile_gate(adapter: &Value) { + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(adapter.pointer("/run/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(adapter.pointer("/result/status").and_then(Value::as_str), Some("not_encoded")); +} + fn assert_qmd_live_baseline_record(adapter: &Value) { let result_evidence = adapter.pointer("/result/evidence").and_then(Value::as_str); let retrieval_evidence = adapter.pointer("/suites/0/evidence").and_then(Value::as_str); @@ -921,9 +950,10 @@ fn assert_operator_debug_live_adapter_records(elf: &Value, qmd: &Value) -> Resul fn assert_openviking_deep_profile_gate(adapter: &Value) { let trajectory_evidence = adapter.pointer("/capabilities/1/evidence").and_then(Value::as_str); + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert!(trajectory_evidence.is_some_and(|evidence| { evidence.contains("evidence-bearing same-corpus output") - && evidence.contains("wrong_result missed-term evidence") + && evidence.contains("selected hierarchy/expansion artifacts") && !evidence.contains("setup reaches runnable OpenViking APIs") })); } @@ -1524,7 +1554,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(40)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(43)); Ok(()) } @@ -1864,6 +1894,9 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { measurement_audit .contains("qmd live fails 6/6 jobs after missing the delete/TTL tombstone evidence") ); + + assert_measurement_audit_adapter_status_counts(&measurement_audit); + assert!( competitor_matrix .contains("broader live suites remain `wrong_result`, `blocked`, or `not_encoded`") @@ 
-2214,13 +2247,13 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { ); assert_eq!( openviking.pointer("/unsupported_or_blocked_status/state").and_then(Value::as_str), - Some("not_encoded") + Some("blocked") ); assert!( openviking .pointer("/unsupported_or_blocked_status/details") .and_then(Value::as_str) - .is_some_and(|details| details.contains("same-corpus output misses expected evidence")) + .is_some_and(|details| details.contains("encoded as blocked fixtures")) ); assert!( openviking @@ -2286,6 +2319,16 @@ fn assert_competitor_strength_matrix_manifest_counts(matrix: &Value) { matrix.pointer("/manifest_summary/overall_status_counts/pass").and_then(Value::as_u64), Some(4) ); + assert_eq!( + matrix.pointer("/manifest_summary/overall_status_counts/blocked").and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + matrix + .pointer("/manifest_summary/overall_status_counts/not_encoded") + .and_then(Value::as_u64), + Some(6) + ); assert_eq!( matrix .pointer("/manifest_summary/overall_status_counts/wrong_result") @@ -2535,9 +2578,10 @@ fn assert_openviking_strength_profile(report: &Value) -> Result<()> { assert_eq!(openviking_scenarios.len(), 6); assert_eq!( trajectory.pointer("/evidence_class").and_then(Value::as_str), - Some("research_gate") + Some("fixture_backed") ); - assert_eq!(trajectory.pointer("/result_type").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(trajectory.pointer("/result_type").and_then(Value::as_str), Some("blocked")); + assert_eq!(trajectory.pointer("/openviking_status").and_then(Value::as_str), Some("blocked")); assert_eq!(local_embed_setup.pointer("/result_type").and_then(Value::as_str), Some("pass")); assert_eq!( local_embed_setup.pointer("/elf_outcome").and_then(Value::as_str), @@ -2552,11 +2596,11 @@ fn assert_openviking_strength_profile(report: &Value) -> Result<()> { ); assert_eq!(missed_terms.pointer("/result_type").and_then(Value::as_str), Some("wrong_result")); 
assert_eq!(missed_terms.pointer("/elf_outcome").and_then(Value::as_str), Some("not_tested")); - assert_eq!(hierarchy.pointer("/result_type").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(hierarchy.pointer("/result_type").and_then(Value::as_str), Some("blocked")); assert_eq!(hierarchy.pointer("/elf_outcome").and_then(Value::as_str), Some("not_tested")); assert_eq!( recursive_expansion.pointer("/result_type").and_then(Value::as_str), - Some("not_encoded") + Some("blocked") ); assert_eq!( recursive_expansion.pointer("/elf_outcome").and_then(Value::as_str), @@ -2580,17 +2624,17 @@ fn assert_strength_profile_json_claim_boundaries(report: &Value) -> Result<()> { assert!(array_contains_str( report, "/claim_boundaries", - "ELF does not beat OpenViking on context trajectory; OpenViking trajectory strengths remain not_tested behind a wrong_result same-corpus output precondition." + "ELF does not beat OpenViking on context trajectory; OpenViking trajectory strengths remain blocked/not_tested behind a wrong_result same-corpus output precondition and missing staged artifacts." )?); assert!(array_contains_str( report, "/claim_boundaries", - "Research_gate records are follow-up gates, not pass evidence." + "Research_gate and blocked fixture records are follow-up gates, not pass evidence." )?); assert!(array_contains_str( report, "/claim_boundaries", - "Missing equivalent surfaces are encoded as unsupported or not_encoded rather than fake losses." + "Missing equivalent surfaces are encoded as unsupported, blocked, or not_encoded rather than fake losses." 
)?); Ok(()) @@ -2613,7 +2657,7 @@ fn assert_strength_profile_markdown_boundaries(markdown: &str) { "Do not claim ELF beats OpenViking on staged retrieval, hierarchy, or recursive" )); assert!(markdown.contains( - "Do not turn `research_gate`, `not_encoded`, or `unsupported` surfaces into wins" + "Do not turn `research_gate`, `blocked`, `not_encoded`, or `unsupported` surfaces" )); assert!(markdown.contains("no pass evidence is claimed")); assert!(markdown.contains("typed `wrong_result` state")); @@ -2639,26 +2683,72 @@ fn assert_operator_facing_strength_profile_boundaries( assert!( benchmarking_index.contains("separates qmd retrieval quality from debug/replay ergonomics") ); - assert!(benchmarking_index.contains("preserves OpenViking context-trajectory")); + assert!(benchmarking_index.contains("preserves XY-928 OpenViking")); assert!( benchmarking_index - .contains("surfaces as `not_tested` until staged/hierarchical evidence is encoded") + .contains("context-trajectory surfaces as blocked/not-tested until scored staged") ); assert!( iteration_direction .contains("ELF and qmd are tied on the encoded live retrieval, work-resume, and") ); assert!(iteration_direction.contains("ELF does not yet beat qmd's local retrieval-debug")); - assert!( - iteration_direction - .contains("ELF beats OpenViking on context trajectory. That scenario is not encoded.") - ); + + assert_iteration_direction_current_measurement_counts(iteration_direction); + + assert!(iteration_direction.contains( + "ELF beats OpenViking on context trajectory. 
The scenario is encoded as blocked" + )); assert!( iteration_direction .contains("Do not promote a reference project into a win/loss claim until") ); } +fn assert_measurement_audit_adapter_status_counts(markdown: &str) { + for expected in [ + "| `blocked` | `6` |", + "| `not_encoded` | `6` |", + "The generated JSON report emits `external_project_count: 16`", + ] { + assert!(markdown.contains(expected), "missing measurement audit text: {expected}"); + } + for stale in ["| `blocked` | `5` |", "| `not_encoded` | `7` |"] { + assert!(!markdown.contains(stale), "stale measurement audit text: {stale}"); + } +} + +fn assert_iteration_direction_current_measurement_counts(markdown: &str) { + for expected in [ + "| Jobs | `43` |", + "| Encoded suites | `12` |", + "| Blocked | `5` |", + "| Mean score | `0.884` |", + "| Evidence coverage | `97/97` |", + "| Source-ref coverage | `97/97` |", + "| Quote coverage | `97/97` |", + "| Expected evidence recall | `89/89` |", + "| `blocked` | `6` |", + "| `not_encoded` | `6` |", + "`live_baseline_only`, `fixture_backed`, and `research_gate`", + "`blocked` for fixture-backed trajectory gates", + ] { + assert!(markdown.contains(expected), "missing iteration-direction text: {expected}"); + } + for stale in [ + "| Jobs | `40` |", + "| Encoded suites | `11` |", + "| Mean score | `0.950` |", + "| Evidence coverage | `88/88` |", + "| Expected evidence recall | `80/80` |", + "| `blocked` | `5` |", + "| `not_encoded` | `7` |", + "`live_baseline_only` plus `research_gate`", + ] { + assert!(!markdown.contains(stale), "stale iteration-direction text: {stale}"); + } +} + #[test] fn generated_json_report_renders_markdown() -> Result<()> { let report = run_json_report()?; @@ -2981,6 +3071,46 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { Ok(()) } +#[test] +fn context_trajectory_fixtures_report_blocked_openviking_gates() -> Result<()> { + let report = run_json_report_from(context_trajectory_fixture_dir())?; + + 
assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + + let suites = array_at(&report, "/suites")?; + let context = find_by_field(suites, "/suite_id", "context_trajectory")?; + + assert_eq!(context.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(context.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); + + let jobs = array_at(&report, "/jobs")?; + let staged = + find_by_field(jobs, "/job_id", "context-trajectory-openviking-staged-retrieval-001")?; + let hierarchy = + find_by_field(jobs, "/job_id", "context-trajectory-openviking-hierarchy-selection-001")?; + let recursive = + find_by_field(jobs, "/job_id", "context-trajectory-openviking-recursive-expansion-001")?; + + assert_eq!(staged.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(hierarchy.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(recursive.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + staged.pointer("/reason").and_then(Value::as_str).is_some_and( + |reason| reason.contains("same-corpus output returns expected evidence ids") + ) + ); + + Ok(()) +} + fn assert_root_knowledge_summary(report: &Value) { assert_eq!(report.pointer("/summary/knowledge/job_count").and_then(Value::as_u64), Some(2)); assert_eq!(report.pointer("/summary/knowledge/page_count").and_then(Value::as_u64), Some(4)); @@ -2991,11 +3121,12 @@ fn assert_root_knowledge_summary(report: &Value) { } fn assert_root_aggregate_summary(report: &Value) 
{ - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(40)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(43)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(12)); assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(38)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(5)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); @@ -3035,9 +3166,9 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(88) + Some(97) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(88)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(97)); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); @@ -3108,6 +3239,11 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + let context_trajectory = find_by_field(suites, "/suite_id", "context_trajectory")?; + + 
assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(context_trajectory.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); + Ok(()) } diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 041418f4..000e7dd1 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -39,7 +39,9 @@ The remaining caveats are material: - Several competitor strengths remain `not_tested` or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival - memory, and graph/RAG navigation remain unproven. mem0 local OSS preference history + memory, and graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged + trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures + behind same-corpus evidence output and missing staged artifacts. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction history scenario. The XY-923 follow-up also scores qmd's immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, @@ -73,7 +75,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Command or run | Artifact | Supported claim | | --- | --- | --- | -| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 40 jobs across 11 suites with 38 pass and 2 blocked production-ops operator boundaries. 
| +| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 43 jobs across 12 suites with 38 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-capture-write-policy-live-report.md` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, agentmemory is blocked, and claude-mem is untested for capture breadth. | | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/summary.json` | The narrow live operator-debug slice scores ELF as pass and qmd as wrong_result: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; both systems expose replay commands and repair-action guidance. | @@ -101,7 +103,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Production ops, restore, backfill, and rebuild | `win` | `live_baseline_only`, `blocked` | ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence. | XY-930 | | Private corpus and provider boundaries | `blocked` | `blocked` | Private production profile fails closed without an operator-owned manifest; provider-backed production-ops gates require explicit credentials. | XY-930 | | Personalization and scoped preferences | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `not_encoded` | ELF and qmd both pass the single encoded live personalization job. 
mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss. | XY-927 | -| Context trajectory and hierarchical retrieval | `not_tested` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | OpenViking reaches the pinned Docker local embedding path but misses expected same-corpus evidence; staged trajectory/hierarchy scoring is not encoded. | XY-928 | +| Context trajectory and hierarchical retrieval | `not_tested` | `fixture_backed`, `live_baseline_only`, `research_gate`, `wrong_result`, `blocked` | OpenViking reaches the pinned Docker local embedding path and now exposes expected/matched/missing evidence ids, but same-corpus evidence is still wrong_result; staged trajectory, hierarchy selection, and recursive expansion are encoded as blocked fixtures, not scored comparisons. | XY-928 | | Core-vs-archival memory | `not_tested` | `research_gate`, `not_encoded` | ELF has core block semantics in the service contract, but comparable core-vs-archival jobs and a contained Letta export path are not encoded. | XY-927 | | Graph/RAG navigation and citations | `not_tested` | `smoke_only`, `research_gate`, `blocked`, `wrong_result`, `not_encoded` | Graph/RAG smokes produce scored or typed non-pass adapter reports where possible, but broad graph/RAG navigation and citation quality are not tested. | XY-929 | @@ -116,7 +118,7 @@ results, or lifecycle failures into one aggregate leaderboard. | XY-926 | P1 | Backlog | Live consolidation and knowledge-page suites; broad operator-debugging remains dependent on OpenMemory and claude-mem UI runners. | | XY-933 | P1 | Live ELF self-check encoded | Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked or untested. | | XY-927 | P1 | Backlog | Letta-style core-vs-archival memory comparison. 
| -| XY-928 | P1 | Backlog | OpenViking context-trajectory and hierarchy benchmark. | +| XY-928 | P1 | Encoded blocked fixtures | OpenViking context-trajectory and hierarchy benchmark is encoded but blocked until evidence-bearing same-corpus and staged artifacts exist. | | XY-929 | P2 | Backlog | Graph/RAG adapters beyond scored smokes. | | XY-930 | P1 | Backlog | Private-corpus and credentialed production gates after operator inputs exist. | | XY-906 | Ops | Todo | Decodex registered-project review-config schema drift blocks Decodex loading of ELF. | diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index d042d0ec..c2cdc983 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -29,8 +29,9 @@ Current boundary: live pass. The fresh ELF sweep produced 40 jobs with 22 pass, 5 wrong_result, 0 incomplete, 2 blocked, and 11 not_encoded; the fresh qmd sweep produced 17 pass, 6 wrong_result, 0 incomplete, 2 blocked, and 15 not_encoded. -- ELF fixture evidence is strong: `cargo make real-world-memory` reports 40 jobs - across 11 suites with 38 pass and 2 blocked production-ops operator boundaries. +- ELF fixture evidence is strong: `cargo make real-world-memory` reports 43 jobs + across 12 suites with 38 pass and 5 blocked production-ops or OpenViking + context-trajectory measurement gates. That proves the fixture contract, not live-service parity. - qmd is the strongest measured local retrieval-debug comparison, but the current evidence still separates its same-corpus/live-retrieval strengths from the full-suite @@ -77,7 +78,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | agentmemory | Coding-agent continuity, MCP/REST packaging, viewer workflow, and durable cross-agent memory lifecycle. | `live_baseline_only`. 
| `lifecycle_fail`: `ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `blocked`: durable cold-start, capture-hook persistence, and real-world adapter coverage are missing; current Docker baseline uses a process-local StateKV Map and in-memory index. | Durable local adapter with update, delete, cold-start reload, work_resume, capture/write-policy, and lifecycle-staleness jobs. | Cross-agent hooks, packaging, continuity scenarios, and viewer affordances. | | mem0/OpenMemory | Memory lifecycle, personalization, hosted/OpenMemory UI ergonomics, and optional graph memory. | `live_baseline_only`. | `pass`: fresh scoped run `cargo make openmemory-ui-export-readback`, `tmp/live-baseline/live-baseline-report.json`, with mem0 `8/8` local SDK checks passing; `blocked`: OpenMemory export-helper setup probe emits `tmp/live-baseline/mem0-openmemory-ui-export.json` with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`. | `blocked`: OpenMemory UI/export cannot be compared until a compose/import path loads the same corpus into the product app; `unsupported`: hosted Platform export; `not_encoded`: optional graph memory and real-world prompt adapter coverage. | Add a Docker-contained OpenMemory product app import/export path, then score browser/API readback separately from SDK `get_all`; keep hosted Platform and graph memory opt-in/non-goal unless explicitly enabled. | Entity-scoped history, lifecycle surfaces, async update ergonomics, and OpenMemory inspection UX. | | memsearch | Markdown-first canonical store with rebuildable local index and practical hybrid retrieval. | `live_baseline_only`. | `pass`: fresh scoped run `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`, with memsearch `4/4` local checks passing. 
| `not_encoded`: real-world source-of-truth, retrieval, and memory-evolution prompt adapters are not encoded; TTL/expiry is unsupported by the current CLI path. | Score source-of-truth and retrieval-debug real-world jobs over the canonical Markdown store; keep TTL/expiry as unsupported unless a comparable path exists. | Canonical markdown store, local reindex clarity, and user-inspectable source files. | -| OpenViking | Filesystem-like context trajectory, hierarchical retrieval, and staged context loading. | `live_baseline_only`; supporting `research_gate`. | `wrong_result`: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `not_encoded`: hierarchical context trajectory is not encoded; same-corpus output still misses expected evidence. | Make evidence-bearing same-corpus output pass, then score staged trajectory and hierarchy expansion. | `viking://`-style context model, trajectory readback, and staged retrieval planning. | +| OpenViking | Filesystem-like context trajectory, hierarchical retrieval, and staged context loading. | `live_baseline_only`; supporting `fixture_backed` and `research_gate`. | `wrong_result`: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`; `blocked`: checked-in `context_trajectory` fixtures cover staged retrieval, hierarchy selection, and recursive/context expansion gates. | `blocked`: hierarchical context trajectory is encoded but blocked until same-corpus evidence ids match and staged artifacts are materialized. | Make evidence-bearing same-corpus output pass, then score staged trajectory and hierarchy expansion. | `viking://`-style context model, trajectory readback, and staged retrieval planning. | | claude-mem | Progressive disclosure, automatic capture loop, repository-local lifecycle, and local viewer workflow. | `live_baseline_only`. 
| `wrong_result`: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `not_encoded`: progressive-disclosure and hook/viewer capture real-world jobs are not encoded. | Durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, and progressive-disclosure jobs. | Progressive disclosure, automatic capture review loops, and local viewer/operator comfort. | | RAGFlow | Full RAG application workflow with document, chunk, and reference evidence handles. | `research_gate`. | `blocked`: `ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make ragflow-docker-smoke`, `tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json`. | `blocked`: Docker resource envelope and adapter output mapping still need proof. | XY-885 tiny Docker evidence-smoke adapter mapping `reference.chunks` to scored evidence. | Document/chunk references, resource-envelope reporting, and RAG app evidence handles. | | LightRAG | Lightweight graph/RAG context export with source file-path citation shape. | `research_gate`. | `blocked`: `ELF_LIGHTRAG_CONTEXT_START=1 cargo make lightrag-docker-context-smoke`, `tmp/real-world-memory/lightrag-context/summary.json`. | `blocked`: Docker service setup and context export are not proven. | XY-886 Docker context-export adapter with explicit provider config and source citation mapping. | Context-only query modes, graph-aware retrieval layout, and file-path citation readback. | @@ -105,7 +106,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Capture/write policy | Fixture capture_integration passes; ELF live capture_integration passes 4/4 with zero redaction leaks, source ids, write-policy audit, and evidence binding. | agentmemory, claude-mem. | agentmemory capture is `blocked` by mocked/in-memory storage; claude-mem hook/viewer capture is `not_encoded`. 
| Run durable agentmemory and claude-mem capture-hook jobs proving redaction, exclusion, evidence binding, source ids, and no secret leakage. | | Production ops | Fixture production_ops has 4 pass and 2 blocked; live production_ops is `blocked`; production adoption has provider/backfill/restore evidence. | ELF production gate, qmd, RAG/RAGFlow resource gates. | qmd live production_ops is `blocked`; RAG/resource gates are `research_gate` `blocked`. | Rerun private-corpus and credentialed gates only when operator-owned manifest and credentials exist. | | Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory and Letta personalization are `not_encoded`. | Encode scoped preference readback for mem0/OpenMemory and Letta before personalization superiority claims. | -| Context trajectory | ELF has trace direction but no comparable staged trajectory scenario. | OpenViking. | OpenViking setup is pinned, same-corpus retrieval is `wrong_result`, and hierarchy trajectory is `not_encoded`. | Make OpenViking evidence-bearing retrieval pass, then score staged context trajectory outputs. | +| Context trajectory | ELF has trace direction but no comparable staged trajectory scenario. | OpenViking. | OpenViking setup is pinned, same-corpus retrieval is `wrong_result`, and staged/hierarchy/recursive trajectory jobs are encoded as `blocked`. | Make OpenViking evidence-bearing retrieval pass, then score staged context trajectory outputs. | | Core-vs-archival memory | ELF core-block semantics exist in the service contract, but comparative benchmark coverage is not encoded here. | Letta. | Letta is `research_gate` `not_encoded` until contained export proof exists. | Add ELF core-block versus archival-search jobs; compare Letta only after contained export proof. | | Graph/RAG navigation | ELF relation context is not enough to claim graph/RAG navigation parity. | RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify. 
| RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain `research_gate` blocked/incomplete without explicit setup; graphify has only a tiny scored smoke `wrong_result`. | Run larger contained graph/RAG adapters with evidence-linked outputs before any ELF graph/RAG win, tie, or loss claim. | @@ -121,7 +122,7 @@ now explicit: | agentmemory/claude-mem capture-hook breadth | Follow-up after XY-933 | yes | Docker-contained hook/viewer capture path with durable artifacts. | Source ids, redaction/exclusion audit, evidence-bound output, and typed blocker reporting. | | mem0/OpenMemory history and UI coverage | New adapter repair issue | yes | Comparable local OSS path for history/UI/readback evidence. | Preference/entity history, deletion audit readback, personalization, OpenMemory inspection/export, and optional graph-context jobs. | | memsearch source-of-truth real-world coverage | New adapter repair issue | yes | Real-world prompt adapter over the canonical Markdown store. | Source-of-truth rebuild/reload jobs and retrieval-debug jobs that preserve baseline reindex/update/delete evidence without converting it into suite pass claims. | -| OpenViking context trajectory | New benchmark issue after evidence output fix | yes | Evidence-bearing same-corpus retrieval output. | Hierarchical expansion, staged trajectory, and resume/retrieval evidence jobs. | +| OpenViking context trajectory | XY-928 encoded blocked fixtures | yes | Evidence-bearing same-corpus retrieval output and staged artifacts. | Hierarchical expansion, staged trajectory, recursive/context expansion, and comparable ELF trace/session evidence jobs. | | claude-mem progressive disclosure | New adapter issue | yes | Durable repository path and progressive-disclosure output contract. | Work resume, operator debugging, capture/write-policy, and progressive disclosure jobs. | | RAGFlow evidence smoke | XY-885 | yes | Resource envelope accepted for tiny Docker smoke. 
| `reference.chunks` to benchmark evidence mapping. | | LightRAG context export | XY-886 | yes | Docker service setup and explicit provider config. | Retrieved context export and source file-path citations. | diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index 5948ba26..55ce3ed4 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -44,18 +44,20 @@ The strongest current statement is: | Metric | Value | | --- | ---: | -| Jobs | `40` | -| Encoded suites | `11` | +| Jobs | `43` | +| Encoded suites | `12` | | Pass | `38` | -| Blocked | `2` | +| Blocked | `5` | | Wrong result | `0` | | Lifecycle fail | `0` | | Incomplete | `0` | | Not encoded | `0` | | Unsupported claim | `0` | -| Mean score | `0.950` | -| Evidence coverage | `88/88` | -| Expected evidence recall | `80/80` | +| Mean score | `0.884` | +| Evidence coverage | `97/97` | +| Source-ref coverage | `97/97` | +| Quote coverage | `97/97` | +| Expected evidence recall | `89/89` | This proves the fixture contract is broad and well controlled. It does not prove that every live adapter or every competitor runtime passes those scenarios. @@ -116,8 +118,8 @@ Overall adapter statuses: | `pass` | `4` | | `wrong_result` | `6` | | `lifecycle_fail` | `1` | -| `blocked` | `5` | -| `not_encoded` | `7` | +| `blocked` | `6` | +| `not_encoded` | `6` | The ledger is intentionally not a leaderboard. It prevents fixture evidence, same-corpus checks, research gates, and live real-world runs from being collapsed into @@ -151,7 +153,7 @@ one misleading score. | agentmemory | `live_baseline_only`; current status is `lifecycle_fail`; capture breadth comparison is blocked by process-local StateKV Map and in-memory index. 
| Coding-agent continuity, hooks, MCP/REST packaging, viewer/console observability. | Borrow capture breadth and continuity UX, but require durable lifecycle and capture artifact proof before claims. | | mem0/OpenMemory | `live_baseline_only`; basic local smoke now passes, while entity/preference history, hosted ecosystem, graph memory, and OpenMemory UI remain untested locally. | Entity-scoped memory, lifecycle/history surfaces, hosted ecosystem, OpenMemory UI. | Add entity/preference history and UI readback patterns, while keeping hosted claims out of local OSS benchmarks. | | memsearch | `live_baseline_only`; canonical Markdown reindex/reload smoke now passes, while real-world source-of-truth prompts remain unencoded. | Markdown-first canonical store and local reindex clarity. | Borrow local inspectability and canonical-file ergonomics, not file-as-authority semantics. | -| OpenViking | `live_baseline_only` plus `research_gate`; current status is `wrong_result`. | Filesystem-like context model, hierarchy, staged context trajectory. | Add staged retrieval and trajectory scoring after same-corpus evidence output is correct. | +| OpenViking | `live_baseline_only`, `fixture_backed`, and `research_gate`; current status is `wrong_result` for same-corpus evidence and `blocked` for fixture-backed trajectory gates. | Filesystem-like context model, hierarchy, staged context trajectory. | Add staged retrieval and trajectory scoring after same-corpus evidence output is correct. | | claude-mem | `live_baseline_only`; current status is `wrong_result`; hook/viewer capture breadth is not encoded. | Progressive disclosure, automatic capture, local viewer workflow. | Borrow progressive disclosure and viewer comfort; benchmark capture and operator-debugging live paths before claims. | | RAGFlow | `research_gate`; current status is `blocked`. | Full RAG application workflow with document/chunk/reference handles. 
| Use as a resource-aware RAG adapter benchmark, not as a current ELF competitor win/loss. | | LightRAG | `research_gate`; current status is `blocked`. | Lightweight graph/RAG context export and source-path citation shape. | Borrow context-export ideas for graph/RAG navigation after Docker proof. | @@ -240,7 +242,8 @@ These are needed for broad credibility but should not block personal production 2. OpenViking context trajectory - Current state: setup is pinned, same-corpus retrieval is `wrong_result`, and - staged trajectory is `not_encoded`. + staged trajectory, hierarchy selection, and recursive/context expansion are + encoded as `blocked` fixtures. - Benchmark gate: evidence-bearing retrieval pass, then staged hierarchy/trajectory scoring. @@ -261,7 +264,8 @@ Do not claim: - ELF has full-suite live real-world pass evidence. It does not. - ELF has private-corpus production quality proof. The private profile currently fails closed without an operator-owned manifest. -- ELF beats OpenViking on context trajectory. That scenario is not encoded. +- ELF beats OpenViking on context trajectory. The scenario is encoded as blocked, not + scored. - ELF beats mem0/OpenMemory on hosted memory, entity history, UI, or optional graph memory. Those scenarios are not encoded; the operator-debug win is only against qmd on a narrow trace/replay slice. diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index e34534d2..efd546a1 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -5,9 +5,10 @@ not comparable, and which measurement reports should guide future ELF iteration. Read this when: You need to answer whether ELF has enough empirical evidence to claim a win, tie, loss, or non-claim against tracked memory, RAG, graph, and agent-continuity projects. 
-Inputs: Fresh local runs of `cargo make real-world-memory` and -`cargo make real-world-memory-live-adapters` in the current XY-933 lane after live -capture/write-policy scoring, plus +Inputs: A fresh local `cargo make real-world-memory` run in the current XY-928 lane +after OpenViking context-trajectory fixture encoding, the retained XY-933 +`cargo make real-world-memory-live-adapters` evidence after live capture/write-policy +scoring, plus `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, `2026-06-11-competitor-strength-evidence-matrix.md`, and `2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md`. @@ -22,8 +23,10 @@ tracked project's strongest scenario. What is proven today: -- ELF has a strong fixture-backed real-world benchmark contract: 40 jobs, 38 pass, - 2 blocked operator boundaries, and no wrong results in the fixture aggregate. +- ELF has a strong fixture-backed real-world benchmark contract: 43 jobs, 38 pass, + 5 blocked operator or measurement-gate boundaries, and no wrong results in the + fixture aggregate. The added XY-928 `context_trajectory` jobs are blocked + OpenViking staged/hierarchy/recursive gates, not ELF wins. - ELF and qmd have comparable full-suite live real-world sweeps, but neither has a full-suite live pass. ELF is five passes ahead in the fresh aggregate because qmd misses the memory-evolution delete/TTL tombstone job and the capture/write-policy @@ -50,12 +53,13 @@ production," but the competitiveness objective remains open. ## Fresh Runs -These commands were run in the current XY-933 lane after live capture/write-policy -scoring: +The fixture command was refreshed in the current XY-928 lane after the OpenViking +context-trajectory fixtures were added. 
The live-adapter command records the retained +XY-933 evidence after live capture/write-policy scoring: | Command | Result | Runtime | | --- | --- | ---: | -| `cargo make real-world-memory` | pass | 7.11 seconds | +| `cargo make real-world-memory` | pass | 11.09 seconds | | `cargo make real-world-memory-live-adapters` | pass | 137.66 seconds | The live adapter run emitted repeated Qdrant client/server compatibility warnings, but @@ -69,21 +73,21 @@ failure. | Metric | Value | | --- | ---: | -| Jobs | `40` | -| Encoded suites | `11` | +| Jobs | `43` | +| Encoded suites | `12` | | Pass | `38` | -| Blocked | `2` | +| Blocked | `5` | | Wrong result | `0` | | Lifecycle fail | `0` | | Incomplete | `0` | | Not encoded | `0` | | Unsupported claim | `0` | -| Mean score | `0.950` | -| Mean latency | `4.244 ms` | -| Expected evidence recall | `80/80` | -| Evidence coverage | `88/88` | -| Source-ref coverage | `88/88` | -| Quote coverage | `88/88` | +| Mean score | `0.884` | +| Mean latency | `3.940 ms` | +| Expected evidence recall | `89/89` | +| Evidence coverage | `97/97` | +| Source-ref coverage | `97/97` | +| Quote coverage | `97/97` | This proves fixture contract breadth and scoring behavior. It does not prove every live adapter or competitor runtime can complete those jobs. @@ -146,8 +150,8 @@ The checked-in manifest records 23 adapter records across 17 unique project name | `pass` | `4` | | `wrong_result` | `6` | | `lifecycle_fail` | `1` | -| `blocked` | `5` | -| `not_encoded` | `7` | +| `blocked` | `6` | +| `not_encoded` | `6` | The generated JSON report emits `external_project_count: 16`, matching the unique non-ELF project-name count from the manifest. The companion audit JSON separately @@ -157,12 +161,12 @@ records `unique_project_names: 17` for the full project list including ELF. 
| Project | Best current evidence | Current measured state | Strongest unproven scenario | Next measurement before claim | | --- | --- | --- | --- | --- | -| ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 2 blocked operator boundaries; live full sweep is `wrong_result`; live capture/write-policy and narrow operator-debug slices pass. | Full live memory evolution, live consolidation, live knowledge pages, live production ops, competitor capture hooks, and broader operator UI runners. | Memory-evolution diagnostic report, then consolidation/knowledge reports plus agentmemory/claude-mem capture and OpenMemory/claude-mem UI runners. | +| ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 5 blocked operator or measurement-gate boundaries; live full sweep is `wrong_result`; live capture/write-policy and narrow operator-debug slices pass. | Full live memory evolution, live consolidation, live knowledge pages, live production ops, competitor capture hooks, OpenViking staged trajectory artifacts, and broader operator UI runners. | Memory-evolution diagnostic report, then consolidation/knowledge reports plus agentmemory/claude-mem capture, OpenViking staged trajectory artifacts, and OpenMemory/claude-mem UI runners. | | qmd | `live_real_world` plus `live_baseline_only` | Fresh full sweep is five passes behind ELF because qmd misses the delete/TTL tombstone job and keeps capture/write-policy jobs typed `not_encoded`; same-corpus baseline passes; narrow operator-debug live slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | Deep retrieval-debug ergonomics and trace replay beyond the narrow operator-debug slice. | qmd/ELF deep retrieval-debug profile with expansion, fusion, rerank, and dropped-candidate traces. 
| | agentmemory | `live_baseline_only` | `lifecycle_fail`; capture comparison is `blocked` because the Docker baseline uses a process-local StateKV Map and in-memory index, with no durable local session/capture path for source ids, exclusions, write-policy audit, or evidence-bound output. | Durable coding-agent continuity and capture hooks. | Durable lifecycle and work-resume/capture adapter report. | | mem0/OpenMemory | `live_baseline_only` | Basic local smoke now passes; history/UI/hosted/graph behavior remains `not_encoded`. | Entity history, lifecycle UI, OpenMemory inspection. | Entity-history, deletion-audit, and UI/export readback report. | | memsearch | `live_baseline_only` | Basic canonical Markdown reindex/reload smoke now passes; real-world prompt coverage remains `not_encoded`. | Markdown canonical store and local reindex clarity. | Source-of-truth and retrieval-debug real-world adapter report. | -| OpenViking | `live_baseline_only` plus `research_gate` | Same-corpus retrieval is `wrong_result`; trajectory is `not_encoded`. | Hierarchical staged context trajectory. | Evidence-bearing retrieval fix, then staged trajectory report. | +| OpenViking | `live_baseline_only` plus `fixture_backed` and `research_gate` | Same-corpus retrieval is `wrong_result`; staged retrieval, hierarchy selection, and recursive/context expansion are encoded as blocked fixtures. | Hierarchical staged context trajectory. | Evidence-bearing retrieval fix, then materialized staged trajectory report. | | claude-mem | `live_baseline_only` | `wrong_result`; capture breadth is `not_encoded` because hooks, timeline, observations, viewer capture, and automatic capture review were not run against real-world jobs. | Progressive disclosure and automatic capture review. | Work-resume, operator-debugging, and capture/write-policy report. | | RAGFlow | `research_gate` | `blocked`. | RAG app workflow with document/chunk references. 
| Tiny Docker evidence-smoke with `reference.chunks` mapped to evidence ids. | | LightRAG | `research_gate` | `blocked`. | Graph/RAG context export with source-path citations. | Docker context-export report with explicit provider config and source citation mapping. | diff --git a/docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md b/docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md index 99b1260a..693ce98d 100644 --- a/docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md +++ b/docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md @@ -8,7 +8,8 @@ Inputs: The June 11 retrieval-debug, memory-evolution, and temporal-history repo the real-world benchmark spec, the external adapter manifest, and `scripts/real-world-live-adapters.sh`. Outputs: Scenario-level win/tie/loss/not-tested judgments, qmd wrong-result -diagnosis taxonomy, OpenViking typed trajectory blockers, and claim boundaries. +diagnosis taxonomy, OpenViking typed trajectory blockers, blocked context-trajectory +jobs, and claim boundaries. Machine-readable companion: `docs/research/2026-06-11-qmd-openviking-strength-profile-report.json`. @@ -38,11 +39,13 @@ The measured OpenViking judgment is split by surface: embedding path reaches `add_resource`/`find`, but the OpenViking smoke remains `wrong_result` because expected evidence terms are missed while ELF passes the equivalent retrieval precondition. -- Context trajectory strengths: `not_tested`. The current OpenViking wrong-result - smoke is not a scored staged-trajectory comparison. +- Context trajectory strengths: `blocked` / `not_tested`. The OpenViking + same-corpus artifact now exposes expected, matched, and missing evidence ids, and + the staged retrieval, hierarchy selection, and recursive/context expansion jobs are + encoded as blocked fixtures. 
- Staged retrieval, hierarchy selection, and recursive/context expansion remain - `research_gate` / `not_encoded`; no ELF win, tie, or loss is claimed against those - strengths. + unscored until OpenViking returns evidence-bearing same-corpus output and comparable + stage artifacts; no ELF win, tie, or loss is claimed against those strengths. ## qmd Scenario Outcomes @@ -85,16 +88,17 @@ diagnosis evidence, not as a broad ELF-over-qmd claim. | --- | --- | --- | --- | --- | | Docker local embedding setup | `live_baseline_only` | `pass` | `not_tested` | none | | Same-corpus evidence-bearing retrieval precondition | `live_baseline_only` | `wrong_result` | `elf_win` | `output_missed_expected_terms` | -| Staged retrieval trajectory | `research_gate` | `not_encoded` | `not_tested` | `needs_evidence_bearing_same_corpus_output` | -| Hierarchy selection | `research_gate` | `not_encoded` | `not_tested` | `hierarchy_output_not_scored` | -| Recursive/context expansion | `research_gate` | `not_encoded` | `not_tested` | `recursive_expansion_not_materialized` | +| Staged retrieval trajectory | `fixture_backed` | `blocked` | `not_tested` | `needs_evidence_bearing_same_corpus_output` | +| Hierarchy selection | `fixture_backed` | `blocked` | `not_tested` | `hierarchy_output_not_scored` | +| Recursive/context expansion | `fixture_backed` | `blocked` | `not_tested` | `recursive_expansion_not_materialized` | | Missed expected terms evidence | `live_baseline_only` | `wrong_result` | `not_tested` | `retrieval_wrong_result` | Summary: OpenViking profile outcomes are `1` ELF win, `0` ties, `0` ELF losses, and `5` not-tested scenarios. The single win is only the same-corpus evidence-bearing -precondition. The current smoke wrong-result is useful typed failure evidence, but it -is not a second comparative win and not a scored staged-trajectory comparison, so -context-trajectory strengths remain not tested. +precondition. 
The current smoke wrong-result is useful typed failure evidence, and the +three context-trajectory fixtures make the staged, hierarchy, and recursive jobs +visible as blocked work. They are not scored staged-trajectory comparisons, so +context-trajectory strengths remain not tested for win/tie/loss claims. ## Claim Boundaries @@ -105,8 +109,10 @@ Allowed: transparency artifact ergonomics; query transparency and replayability are observed but not scored as comparative ELF wins or losses. - qmd expansion/fusion/rerank superiority is untested. -- OpenViking's Docker local embedding setup reaches runtime, but context trajectory - remains untested because evidence-bearing same-corpus retrieval is not passing. +- OpenViking's Docker local embedding setup reaches runtime, and the baseline output + now exposes expected/matched/missing evidence ids, but context trajectory remains + blocked because evidence-bearing same-corpus retrieval is not passing and staged + artifacts are not materialized. - ELF currently wins only the equivalent OpenViking same-corpus retrieval precondition surface, not OpenViking's staged trajectory strengths. @@ -116,8 +122,8 @@ Not allowed: - Do not claim qmd's debug ergonomics are equivalent to retrieval quality. - Do not claim ELF beats OpenViking on staged retrieval, hierarchy, or recursive context expansion. -- Do not turn `research_gate`, `not_encoded`, or `unsupported` surfaces into wins or - losses. +- Do not turn `research_gate`, `blocked`, `not_encoded`, or `unsupported` surfaces + into wins or losses. ## Validation Hook diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index ed78742a..34fbe8b1 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -72,8 +72,9 @@ cleanup, use `docs/guide/single_user_production.md`. optimization directions. 
- `2026-06-11-qmd-openviking-strength-profile-report.md`: XY-899 strength-profile report that separates qmd retrieval quality from debug/replay ergonomics, records - qmd wrong-result diagnosis classes, and preserves OpenViking context-trajectory - surfaces as `not_tested` until staged/hierarchical evidence is encoded. + qmd wrong-result diagnosis classes, and preserves XY-928 OpenViking + context-trajectory surfaces as blocked/not-tested until scored staged, + hierarchical, and recursive evidence exists. - `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md`: XY-923 trace-level replay and wrong-result diagnostics report that scores qmd top-10/replay artifact ergonomics against ELF trace/admin surfaces while keeping retrieval correctness, diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 052c5638..c15cc912 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -58,6 +58,7 @@ compile knowledge, and state honest uncertainty. | Capture/integration | Accuracy of hooks, imports, exclusions, and write policies. | Capture a session decision while excluding private spans. | | Production ops | Backfill, restore, cold start, resource, and bounded-failure behavior. | Resume interrupted import without duplicate source notes. | | Personalization | Scoped preferences without cross-tenant leakage. | Apply the user's current preference and ignore another project's note. | +| Context trajectory | Staged context trajectory, hierarchy selection, and recursive expansion. | Block OpenViking trajectory scoring until same-corpus evidence ids and comparable stage artifacts exist. | ## External Reference Mapping @@ -164,6 +165,9 @@ including the retrieval-quality slice below. The suite currently encodes: classification, and provider credential boundary `blocked` classification. 
- `personalization`: scoped stable preference correction without temporary or cross-project preference leakage. +- `context_trajectory`: OpenViking staged retrieval, hierarchy selection, and + recursive/context expansion jobs encoded as `blocked` until same-corpus expected + evidence ids and comparable stage artifacts are available. The generated report includes evidence coverage, source-ref coverage, quote coverage, unsupported-claim count, stale retrieval count, stale-answer count, conflict detection @@ -221,7 +225,12 @@ research gates. Its `external_adapters` report section distinguishes: - `research_gate`: checked-in source/setup/runtime/resource/retry metadata for a future adapter path, not fixture-backed or live execution evidence. -Current state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full +Current fixture state: `cargo make real-world-memory` covers 43 jobs across 12 suites, +with 38 pass and 5 blocked. The blocked jobs are production-ops operator boundaries +plus the XY-928 OpenViking `context_trajectory` gates for staged retrieval, hierarchy +selection, and recursive/context expansion. + +Current live-adapter state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full encoded-suite sweep through `cargo make real-world-memory-live-adapters`. Each adapter materializes generated runtime answers for 40 jobs across 11 suites before scoring. The original targeted `work_resume`, `retrieval`, and `project_decisions` slice still @@ -237,7 +246,10 @@ storage for lifecycle proof and capture breadth. mem0/OpenMemory, memsearch, and claude-mem currently retain wrong-result, not-encoded, or incomplete live-baseline states for the checked-in adapter evidence. OpenViking now reaches its pinned Docker local embedding setup but remains a same-corpus `wrong_result` until it returns -evidence-bearing retrieval output. The expanded RAG and graph-memory records for +evidence-bearing retrieval output. 
The checked-in `context_trajectory` fixtures keep +OpenViking staged retrieval, hierarchy selection, and recursive/context expansion +blocked until same-corpus evidence ids match and staged artifacts are materialized. +The expanded RAG and graph-memory records for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper qmd/OpenViking profiles are `research_gate` records until their Docker-isolated adapter runs are implemented. These typed states describe diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 670cf16f..5426b5cb 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked or not encoded. 
XY-933 adds an ELF live capture/write-policy self-check, but agentmemory capture breadth is blocked by mocked/in-memory storage and claude-mem hook/viewer capture remains untested." + "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked or not encoded. XY-933 adds an ELF live capture/write-policy self-check, but agentmemory capture breadth is blocked by mocked/in-memory storage and claude-mem hook/viewer capture remains untested." ] }, "evidence_class_terms": [ @@ -39,7 +39,7 @@ { "command": "cargo make real-world-memory", "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "claim": "ELF fixture aggregate covers 40 jobs across 11 suites with 38 pass and 2 blocked production-ops operator boundaries." + "claim": "ELF fixture aggregate covers 43 jobs across 12 suites with 38 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates." 
}, { "command": "cargo make real-world-memory-live-adapters", @@ -351,12 +351,13 @@ "title": "Context trajectory and hierarchical retrieval", "outcome": "not_tested", "evidence_classes": [ + "fixture_backed", "live_baseline_only", "research_gate", "wrong_result", - "not_encoded" + "blocked" ], - "measured_claim": "OpenViking reaches the pinned Docker local embedding path but misses expected same-corpus evidence, and staged trajectory/hierarchy scoring is not encoded.", + "measured_claim": "OpenViking reaches the pinned Docker local embedding path and now exposes expected/matched/missing evidence ids, but same-corpus evidence is still wrong_result; staged trajectory, hierarchy selection, and recursive expansion are encoded as blocked fixtures, not scored comparisons.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" ], @@ -451,7 +452,7 @@ "issue": "XY-928", "priority": "P1", "state": "Backlog", - "gap": "OpenViking context-trajectory and hierarchy benchmark." + "gap": "OpenViking context-trajectory and hierarchy benchmark is encoded but blocked until evidence-bearing same-corpus output and staged artifacts exist." 
}, { "issue": "XY-929", diff --git a/docs/research/2026-06-11-measurement-coverage-audit.json b/docs/research/2026-06-11-measurement-coverage-audit.json index e55042c4..fd210705 100644 --- a/docs/research/2026-06-11-measurement-coverage-audit.json +++ b/docs/research/2026-06-11-measurement-coverage-audit.json @@ -1,14 +1,14 @@ { "schema": "elf.benchmark_measurement_coverage_audit/v2", "run_id": "2026-06-11-measurement-coverage-audit", - "source_revision": "current XY-933 lane after live capture/write-policy scoring", + "source_revision": "current XY-928 lane rebased after live capture/write-policy scoring", "created_at": "2026-06-11", "scope": "ELF memory-system competitiveness measurement coverage, external competitor comparison evidence, and next report directions", "commands": [ { "command": "cargo make real-world-memory", "status": "pass", - "runtime_seconds": 7.11, + "runtime_seconds": 11.09, "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, { @@ -19,21 +19,21 @@ } ], "fixture_aggregate": { - "job_count": 40, - "encoded_suite_count": 11, + "job_count": 43, + "encoded_suite_count": 12, "pass": 38, "wrong_result": 0, "lifecycle_fail": 0, "incomplete": 0, - "blocked": 2, + "blocked": 5, "not_encoded": 0, "unsupported_claim": 0, - "mean_score": 0.95, - "mean_latency_ms": 4.244, - "expected_evidence_total": 80, - "expected_evidence_matched": 80, - "evidence_required_count": 88, - "evidence_covered_count": 88 + "mean_score": 0.884, + "mean_latency_ms": 3.94, + "expected_evidence_total": 89, + "expected_evidence_matched": 89, + "evidence_required_count": 97, + "evidence_covered_count": 97 }, "live_real_world_adapters": [ { @@ -197,12 +197,13 @@ "pass": 4, "wrong_result": 6, "lifecycle_fail": 1, - "blocked": 5, - "not_encoded": 7 + "blocked": 6, + "not_encoded": 6 }, "xy900_update_note": "XY-900 promotes graphify from research_gate/blocked to a tiny scored live_real_world wrong_result smoke; broad graph/RAG quality remains unproven.", 
"xy932_update_note": "XY-932 adds narrow ELF/qmd operator-debug live_real_world records: ELF pass and qmd wrong_result for trace hydration/candidate-drop visibility, with OpenMemory and claude-mem UI still unmeasured.", - "xy933_update_note": "XY-933 adds live ELF capture/write-policy scoring: ELF passes 4/4 capture_integration jobs with zero redaction leaks, qmd remains not_encoded, agentmemory comparison is blocked by mocked/in-memory storage, and claude-mem capture hooks remain not_encoded." + "xy933_update_note": "XY-933 adds live ELF capture/write-policy scoring: ELF passes 4/4 capture_integration jobs with zero redaction leaks, qmd remains not_encoded, agentmemory comparison is blocked by mocked/in-memory storage, and claude-mem capture hooks remain not_encoded.", + "xy928_update_note": "XY-928 adds three blocked context_trajectory fixtures for OpenViking staged retrieval, hierarchy selection, and recursive/context expansion; no trajectory win/tie/loss is claimed." }, "claim_boundary": { "elf_vs_qmd": "near_tie_with_narrow_delete_ttl_elf_lead_not_overall_win", diff --git a/docs/research/2026-06-11-qmd-openviking-strength-profile-report.json b/docs/research/2026-06-11-qmd-openviking-strength-profile-report.json index d8d966d6..decee8e7 100644 --- a/docs/research/2026-06-11-qmd-openviking-strength-profile-report.json +++ b/docs/research/2026-06-11-qmd-openviking-strength-profile-report.json @@ -54,8 +54,8 @@ }, "openviking": { "overall_outcome": "not_tested", - "overall_rationale": "OpenViking context-trajectory strengths remain not_tested; ELF has only one same-corpus retrieval precondition win.", - "claim": "ELF has one measured win on the same-corpus evidence-bearing precondition where OpenViking currently returns wrong_result. ELF does not have a measured win, tie, or loss against OpenViking context-trajectory strengths because staged trajectory, hierarchy selection, and recursive expansion remain research-gate/not_encoded." 
+ "overall_rationale": "OpenViking context-trajectory strengths remain blocked/not_tested; ELF has only one same-corpus retrieval precondition win.", + "claim": "ELF has one measured win on the same-corpus evidence-bearing precondition where OpenViking currently returns wrong_result. ELF does not have a measured win, tie, or loss against OpenViking context-trajectory strengths because staged trajectory, hierarchy selection, and recursive expansion are encoded as blocked fixtures until scored staged output exists." } }, "qmd_strength_profile": { @@ -317,35 +317,35 @@ { "scenario_id": "openviking-staged-retrieval-trajectory", "surface": "staged retrieval trajectory", - "evidence_class": "research_gate", - "result_type": "not_encoded", - "openviking_status": "not_encoded", + "evidence_class": "fixture_backed", + "result_type": "blocked", + "openviking_status": "blocked", "elf_equivalent_status": "not_encoded", "elf_outcome": "not_tested", "typed_blocker": "needs_evidence_bearing_same_corpus_output", - "evidence": "No stage trajectory scoring is claimed until OpenViking returns evidence-bearing same-corpus output." + "evidence": "The context_trajectory fixture context-trajectory-openviking-staged-retrieval-001 is encoded as blocked until OpenViking returns evidence-bearing same-corpus output and comparable staged artifacts." }, { "scenario_id": "openviking-hierarchy-selection", "surface": "hierarchy selection", - "evidence_class": "research_gate", - "result_type": "not_encoded", - "openviking_status": "not_encoded", + "evidence_class": "fixture_backed", + "result_type": "blocked", + "openviking_status": "blocked", "elf_equivalent_status": "unsupported", "elf_outcome": "not_tested", "typed_blocker": "hierarchy_output_not_scored", - "evidence": "The viking:// hierarchy model remains a reference strength, but no real_world_job output scores hierarchy selection." 
+ "evidence": "The context_trajectory fixture context-trajectory-openviking-hierarchy-selection-001 is encoded as blocked until selected hierarchy nodes and evidence ids are materialized." }, { "scenario_id": "openviking-recursive-context-expansion", "surface": "recursive/context expansion", - "evidence_class": "research_gate", - "result_type": "not_encoded", - "openviking_status": "not_encoded", + "evidence_class": "fixture_backed", + "result_type": "blocked", + "openviking_status": "blocked", "elf_equivalent_status": "not_encoded", "elf_outcome": "not_tested", "typed_blocker": "recursive_expansion_not_materialized", - "evidence": "Recursive/context expansion remains unmaterialized in the Docker adapter; no pass/fail quality claim is allowed." + "evidence": "The context_trajectory fixture context-trajectory-openviking-recursive-expansion-001 is encoded as blocked until expansion paths and expected evidence ids are materialized." }, { "scenario_id": "openviking-missed-expected-terms-evidence", @@ -369,8 +369,8 @@ "claim_boundaries": [ "ELF does not broadly beat qmd; it ties encoded retrieval and lifecycle correctness, keeps qmd query transparency as not_tested for comparative scoring, and leaves replayability not_tested.", "qmd expansion, fusion, and rerank superiority remains not_tested because the current qmd paths use --no-rerank and do not score internals.", - "ELF does not beat OpenViking on context trajectory; OpenViking trajectory strengths remain not_tested behind a wrong_result same-corpus output precondition.", - "Research_gate records are follow-up gates, not pass evidence.", - "Missing equivalent surfaces are encoded as unsupported or not_encoded rather than fake losses." 
+ "ELF does not beat OpenViking on context trajectory; OpenViking trajectory strengths remain blocked/not_tested behind a wrong_result same-corpus output precondition and missing staged artifacts.", + "Research_gate and blocked fixture records are follow-up gates, not pass evidence.", + "Missing equivalent surfaces are encoded as unsupported, blocked, or not_encoded rather than fake losses." ] } diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index 528fc057..b2760325 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -30,8 +30,8 @@ }, "overall_status_counts": { "lifecycle_fail": 1, - "blocked": 5, - "not_encoded": 7, + "blocked": 6, + "not_encoded": 6, "pass": 4, "wrong_result": 6 } @@ -188,6 +188,7 @@ "current_evidence_class": "live_baseline_only", "supporting_evidence_classes": [ "live_baseline_only", + "fixture_backed", "research_gate" ], "measured_status": "wrong_result", @@ -196,9 +197,9 @@ "artifact": "tmp/live-baseline/live-baseline-report.json" }, "unsupported_or_blocked_status": { - "state": "not_encoded", - "typed_reason": "hierarchical_context_trajectory_not_encoded", - "details": "Pinned Docker local embedding setup reaches add_resource/find, but same-corpus output misses expected evidence and trajectory jobs are not encoded." + "state": "blocked", + "typed_reason": "hierarchical_context_trajectory_blocked", + "details": "Pinned Docker local embedding setup reaches add_resource/find, but same-corpus output misses expected evidence; staged retrieval, hierarchy selection, and recursive/context expansion jobs are encoded as blocked fixtures." 
}, "benchmark_before_claim": "First make evidence-bearing same-corpus output pass, then run a context-trajectory suite that scores staged retrieval paths and hierarchy expansion.", "borrow_if_stronger": "Borrow the viking-style filesystem context model, trajectory readback, and staged retrieval planning." @@ -529,7 +530,7 @@ "scenario": "context trajectory", "current_elf_evidence": "ELF has trace and trajectory directions, but staged context trajectory is not yet a comparable live scenario.", "strongest_competitor_or_reference": "OpenViking", - "current_competitor_evidence": "OpenViking Docker setup is pinned, same-corpus retrieval is wrong_result, and hierarchical trajectory is research_gate not_encoded.", + "current_competitor_evidence": "OpenViking Docker setup is pinned, same-corpus retrieval is wrong_result, and hierarchical trajectory jobs are fixture-backed blocked gates.", "current_state": "OpenViking remains the strongest design reference, but not a measured live winner.", "next_measurement": "Make OpenViking same-corpus evidence-bearing retrieval pass, then score hierarchical expansion and staged context trajectory outputs." }, diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 3416f3f7..cfa15fed 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -537,6 +537,7 @@ Suite ids are stable public names. Each suite MUST contain at least one | `capture_integration` | Evaluate how accurately work observations become usable memory across agents and tools. | Capture a session decision; exclude private spans; import external agent observations. | Hook/import logs, write policy audits, excluded spans, resulting note ids. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior. | agentmemory, claude-mem, memsearch, mem0. 
| | `production_ops` | Prove safe operation under backup, restore, backfill, cold start, resource, and credential boundaries. | Resume interrupted import; restore from backup; report missing private manifest as bounded caveat. | Command/report artifacts, resource envelope, checkpoint state, failure guard evidence. | lifecycle_behavior, latency_resource, uncertainty_handling, evidence_grounding. | ELF, qmd, memsearch, LangGraph. | | `personalization` | Apply user/project preferences correctly without leaking across scopes or overfitting stale preferences. | Remember preferred response style; avoid using another project tenant's note; update a preference. | Scoped memory ids, preference versions, tenant/project/agent context, negative cross-scope traps. | personalization_fit, trap_avoidance, evidence_grounding, answer_correctness. | mem0, Letta, agentmemory, ELF. | +| `context_trajectory` | Measure staged context trajectory, hierarchy selection, and recursive/context expansion without converting setup or retrieval preconditions into trajectory wins. | Explain whether a staged trajectory can be scored; identify selected hierarchy nodes; report recursive expansion paths and pruned branches. | Same-corpus expected evidence ids, matched/missing evidence ids, stage artifacts, selected hierarchy nodes, expansion paths, comparable ELF trace/session artifacts when a comparison is claimed. | answer_correctness, evidence_grounding, trap_avoidance, debuggability, workflow_helpfulness. | OpenViking, ELF, qmd. 
| ## Report Semantics diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh index 0f15359f..bf5cf624 100755 --- a/scripts/live-baseline-benchmark.sh +++ b/scripts/live-baseline-benchmark.sh @@ -3054,6 +3054,18 @@ project_openviking() { "status": "not_encoded", "surface": "no restart/reopen check is encoded until local same-corpus retrieval completes" }, + "staged_retrieval_trajectory": { + "status": "blocked", + "surface": "no staged retrieval trajectory check is scored until same-corpus retrieval matches expected evidence ids" + }, + "hierarchy_selection": { + "status": "blocked", + "surface": "no hierarchy selection check is scored until same-corpus retrieval matches expected evidence ids" + }, + "recursive_context_expansion": { + "status": "blocked", + "surface": "no recursive/context expansion check is scored until same-corpus retrieval matches expected evidence ids" + }, "scale_stress_profile": { "status": "blocked", "surface": "scale/stress is blocked until smoke same-corpus retrieval returns evidence-bearing results" @@ -3135,11 +3147,42 @@ queries_path = Path(os.environ["ELF_BASELINE_QUERIES_PATH"]) top_k = int(os.environ.get("ELF_BASELINE_TOP_K", "10")) +def expected_evidence_ids(query): + ids = query.get("expected_evidence_ids") or [] + if ids: + return ids + expected_doc = query["expected_doc"] + return [expected_doc[:-3] if expected_doc.endswith(".md") else expected_doc] + + +def allowed_evidence_ids(query): + return query.get("allowed_alternate_evidence_ids") or [] + + +def result_raw(found): + return json.dumps(to_jsonable(found), ensure_ascii=False, default=str).lower() + + +def visible_evidence_ids(found, query): + raw = result_raw(found) + candidate_ids = [*expected_evidence_ids(query), *allowed_evidence_ids(query)] + visible = [] + for evidence_id in candidate_ids: + lowered = evidence_id.lower() + if lowered in raw or f"{lowered}.md" in raw: + visible.append(evidence_id) + return visible + + def 
result_matches(found, query): - raw = json.dumps(to_jsonable(found), ensure_ascii=False, default=str).lower() - return query["expected_doc"].lower() in raw and all( - term.lower() in raw for term in query["expected_terms"] - ) + raw = result_raw(found) + expected_docs = [ + query["expected_doc"], + *query.get("allowed_alternate_docs", []), + ] + has_doc = any(expected_doc.lower() in raw for expected_doc in expected_docs) + has_terms = all(term.lower() in raw for term in query["expected_terms"]) + return has_doc and has_terms client = OpenViking(path=data_path) @@ -3163,17 +3206,49 @@ try: score_threshold=0.0, level=[2], ) + matched_evidence_ids = visible_evidence_ids(found, query) + required_evidence_ids = expected_evidence_ids(query) query_results.append( { "id": query["id"], "query": query["query"], "expected_doc": query["expected_doc"], "expected_terms": query["expected_terms"], + "expected_evidence_ids": required_evidence_ids, + "allowed_alternate_evidence_ids": allowed_evidence_ids(query), + "matched_evidence_ids": matched_evidence_ids, + "missing_evidence_ids": [ + evidence_id + for evidence_id in required_evidence_ids + if evidence_id not in matched_evidence_ids + ], "matched": result_matches(found, query), "find": to_jsonable(found), } ) pass_count = sum(1 for result in query_results if result["matched"]) + evidence_total = sum(len(result["expected_evidence_ids"]) for result in query_results) + evidence_matched = sum( + len( + [ + evidence_id + for evidence_id in result["matched_evidence_ids"] + if evidence_id in result["expected_evidence_ids"] + ] + ) + for result in query_results + ) + same_corpus_output_correct = ( + pass_count == len(query_results) + and evidence_total > 0 + and evidence_matched == evidence_total + ) + trajectory_gate_status = "not_encoded" if same_corpus_output_correct else "blocked" + trajectory_gate_reason = ( + "OpenViking same-corpus retrieval matched expected evidence ids, but staged trajectory scoring is not encoded in this 
Docker adapter." + if trajectory_gate_status == "not_encoded" + else "OpenViking staged trajectory scoring is blocked until same-corpus retrieval matches expected evidence ids." + ) checks = [ { "name": "same_corpus_retrieval", @@ -3187,6 +3262,21 @@ try: "fail": len(query_results) - pass_count, }, }, + { + "name": "same_corpus_expected_evidence_ids_visible", + "status": "pass" + if all(result["expected_evidence_ids"] for result in query_results) + else "incomplete", + "reason": "OpenViking query results expose expected, matched, and missing evidence ids for every same-corpus query.", + "evidence": { + "total_queries": len(query_results), + "queries_with_expected_evidence_ids": sum( + 1 for result in query_results if result["expected_evidence_ids"] + ), + "expected_evidence_total": evidence_total, + "expected_evidence_matched": evidence_matched, + }, + }, { "name": "update_replaces_note_text", "status": "not_encoded", @@ -3205,6 +3295,40 @@ try: "reason": "OpenViking cold-start reload is not encoded until the local retrieval path is stable in Docker.", "evidence": {}, }, + { + "name": "staged_retrieval_trajectory", + "status": trajectory_gate_status, + "reason": trajectory_gate_reason, + "evidence": { + "blocked_by": "same_corpus_expected_evidence_miss" + if trajectory_gate_status == "blocked" + else None + }, + }, + { + "name": "hierarchy_selection", + "status": trajectory_gate_status, + "reason": trajectory_gate_reason.replace( + "staged trajectory", "hierarchy selection" + ), + "evidence": { + "blocked_by": "same_corpus_expected_evidence_miss" + if trajectory_gate_status == "blocked" + else None + }, + }, + { + "name": "recursive_context_expansion", + "status": trajectory_gate_status, + "reason": trajectory_gate_reason.replace( + "staged trajectory", "recursive/context expansion" + ), + "evidence": { + "blocked_by": "same_corpus_expected_evidence_miss" + if trajectory_gate_status == "blocked" + else None + }, + }, ] wrong_result_count = sum( 1 for check in 
checks if check["status"] == "wrong_result"