From 36e822c125c9d6f2b95fa3903730a4a90593ffed Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 06:28:45 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add representative graph RAG benchmark fixtures","authority":"XY-929"} --- Makefile.toml | 52 ++++ README.md | 17 +- .../graphify_graph_report_wrong_result.json | 285 ++++++++++++++++++ .../graphiti_temporal_validity_blocked.json | 197 ++++++++++++ .../graphrag_output_tables_blocked.json | 146 +++++++++ .../lightrag_context_sources_incomplete.json | 141 +++++++++ .../ragflow_reference_chunks_blocked.json | 149 +++++++++ .../memory_projects_manifest.json | 127 ++++++++ .../tests/real_world_job_benchmark.rs | 146 ++++++++- ...-11-competitor-strength-adoption-report.md | 24 +- ...1-graph-rag-scored-smoke-adapter-report.md | 66 +++- docs/guide/benchmarking/index.md | 16 +- .../real_world_agent_memory_benchmark.md | 29 +- ...1-competitor-strength-adoption-report.json | 28 +- 14 files changed, 1363 insertions(+), 60 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json create mode 100644 apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json create mode 100644 apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json create mode 100644 apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json create mode 100644 apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json diff --git a/Makefile.toml b/Makefile.toml index eba76c24..5c89f94d 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -431,6 +431,9 @@ args = [ # | real-world-memory-core-archival | composite | | # | real-world-memory-core-archival-json | command | | # | real-world-memory-core-archival-report | command | | +# | real-world-memory-graph-rag | composite | | +# | 
real-world-memory-graph-rag-json | command | | +# | real-world-memory-graph-rag-report | command | | # | real-world-memory-live-adapters | command | | [tasks.real-world-job-smoke] @@ -876,6 +879,55 @@ args = [ "tmp/real-world-memory/core-archival/report.md", ] +[tasks.real-world-memory-graph-rag] +workspace = false +dependencies = [ + "real-world-memory-graph-rag-report", +] + +[tasks.real-world-memory-graph-rag-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag", + "--out", + "tmp/real-world-memory/graph-rag/report.json", + "--run-id", + "real-world-memory-graph-rag", + "--adapter-id", + "fixture_graph_rag_external_adapters", + "--adapter-name", + "Graph/RAG representative external-adapter fixtures", +] + +[tasks.real-world-memory-graph-rag-report] +workspace = false +dependencies = [ + "real-world-memory-graph-rag-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/graph-rag/report.json", + "--out", + "tmp/real-world-memory/graph-rag/report.md", +] + [tasks.real-world-memory-live-adapters] workspace = false command = "bash" diff --git a/README.md b/README.md index 203c4da0..87e83366 100644 --- a/README.md +++ b/README.md @@ -196,13 +196,16 @@ provider-backed ELF evidence was required. These records carry source/setup/runtime/resource/retry metadata and typed `blocked`, `incomplete`, `wrong_result`, or `not_encoded` states; they are not fixture-backed or live adapter pass evidence. -- Graph/RAG scored-smoke promotion after XY-900: RAGFlow, LightRAG, GraphRAG, - Graphiti/Zep, and graphify smokes now emit scored or typed non-pass - `real_world_job` adapter reports when run. 
graphify currently reaches a tiny Docker - graph/report smoke and scores `wrong_result`; the other in-scope projects remain - typed blocked or incomplete without explicit service, resource, or provider setup. - These reports preserve the smoke-only boundary and do not create an ELF win claim - against graph/RAG strengths. +- Graph/RAG scored-smoke promotion after XY-900 and representative slice after XY-929: + RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify smokes now emit scored or + typed non-pass `real_world_job` adapter reports when run. `cargo make + real-world-memory-graph-rag` adds representative graph/RAG citation, summary, + temporal-validity, graph-report, stale-source-lint, and unsupported-claim fixtures: + RAGFlow, GraphRAG, and Graphiti/Zep are blocked; LightRAG is incomplete with + comparison blocked; graphify is `wrong_result`; llm-wiki is not_tested; gbrain is + blocked; private and hosted graph/RAG profiles are non_goal. These reports preserve + the smoke and typed non-pass boundaries and do not create an ELF win claim against + graph/RAG strengths. - mem0/OpenMemory history follow-up after XY-924 and XY-931: the local OSS mem0 adapter now passes encoded preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history. 
diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json new file mode 100644 index 00000000..bb6d9b92 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json @@ -0,0 +1,285 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "graph-rag-graphify-graph-report-001", + "suite": "knowledge_compilation", + "title": "Score graphify graph-report navigation, stale-source lint, and unsupported summaries", + "corpus": { + "corpus_id": "graph-rag-representative-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "graphify-graph-summary-output", + "kind": "adapter_artifact", + "text": "graphify representative output: graph.json links the ELF memory service node to Qdrant rebuild and graph report mapping nodes with source file references.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphify_graph_report_wrong_result", + "evidence_id": "graphify-graph-summary-output" + }, + "locator": { + "quote": "graph.json links the ELF memory service node to Qdrant rebuild and graph report mapping nodes" + } + }, + "created_at": "2026-06-11T17:20:00Z" + }, + { + "evidence_id": "graphify-source-location-output", + "kind": "adapter_artifact", + "text": "graphify source-location output: query output includes source files and line-like locations for generated corpus snippets.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphify_graph_report_wrong_result", + "evidence_id": "graphify-source-location-output" + }, + "locator": { + "quote": "query output includes source files and line-like locations" + } + }, + "created_at": "2026-06-11T17:21:00Z" + }, + { + "evidence_id": 
"graphify-derived-report-boundary", + "kind": "claim_boundary", + "text": "graphify boundary: GRAPH_REPORT.md is a derived graph/report artifact and must not be treated as authoritative ELF memory.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphify_graph_report_wrong_result", + "evidence_id": "graphify-derived-report-boundary" + }, + "locator": { + "quote": "GRAPH_REPORT.md is a derived graph/report artifact" + } + }, + "created_at": "2026-06-11T17:22:00Z" + }, + { + "evidence_id": "graphify-stale-source-trap", + "kind": "adapter_state", + "text": "Stale graphify trap: GRAPH_REPORT.md became the authoritative ELF memory store and replaces source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphify_graph_report_wrong_result", + "evidence_id": "graphify-stale-source-trap" + }, + "locator": { + "quote": "GRAPH_REPORT.md became the authoritative ELF memory store" + } + }, + "created_at": "2026-06-11T17:19:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_graph_rag_external_adapters", + "answer": { + "content": "graphify provides derived graph/report navigation: graph.json links the ELF memory service, Qdrant rebuild, and graph report mapping nodes, and query output carries source-file locations. 
The graph/report output is derived graph/report evidence, not authoritative ELF memory.", + "claims": [ + { + "claim_id": "graph_report_navigation", + "text": "graphify graph.json links the ELF memory service, Qdrant rebuild, and graph report mapping nodes.", + "evidence_ids": ["graphify-graph-summary-output"], + "confidence": "high" + }, + { + "claim_id": "source_location_citations", + "text": "graphify query output includes source files and line-like locations for generated corpus snippets.", + "evidence_ids": ["graphify-source-location-output"], + "confidence": "high" + }, + { + "claim_id": "derived_report_boundary", + "text": "GRAPH_REPORT.md is a derived graph/report artifact and must not be treated as authoritative ELF memory.", + "evidence_ids": ["graphify-derived-report-boundary"], + "confidence": "high" + } + ], + "evidence_ids": [ + "graphify-graph-summary-output", + "graphify-source-location-output", + "graphify-derived-report-boundary" + ], + "pages": [ + { + "page_id": "graphify:representative-graph-report", + "page_type": "concept", + "title": "graphify Representative Graph Report", + "path": "tmp/real-world-memory/graph-rag/graphify/GRAPH_REPORT.md", + "sections": [ + { + "section_id": "graph-summary", + "heading": "Graph Summary", + "role": "summary", + "content": "graph.json links the ELF memory service, Qdrant rebuild, and graph report mapping nodes.", + "evidence_ids": ["graphify-graph-summary-output"], + "timeline_event_ids": ["graphify-graph-output-recorded"] + }, + { + "section_id": "source-locations", + "heading": "Source Locations", + "role": "citations", + "content": "Query output includes source files and line-like locations for generated corpus snippets.", + "evidence_ids": ["graphify-source-location-output"], + "timeline_event_ids": ["graphify-source-location-recorded"] + }, + { + "section_id": "unsupported-quality-summary", + "heading": "Unsupported Quality Summary", + "role": "summary", + "content": "This fixture does not prove broad 
graph-navigation quality for graphify or an ELF-over-graphify result.", + "evidence_ids": [], + "timeline_event_ids": [], + "unsupported_reason": "The representative fixture is based on bounded graph/report output and not a broad quality evaluation." + } + ], + "backlinks": ["project:elf-memory-service", "entity:qdrant-rebuild"], + "lint_findings": [], + "rebuild": { + "first_hash": "blake3:graphify-representative-001", + "second_hash": "blake3:graphify-representative-001", + "deterministic": true, + "allowed_variance": [] + } + } + ], + "latency_ms": 4.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "graphify-stale-source-recorded", + "ts": "2026-06-11T17:19:00Z", + "actor": "agent", + "action": "recorded_stale_graph_report_claim", + "evidence_ids": ["graphify-stale-source-trap"], + "summary": "A stale trap claims graphify output became authoritative memory." + }, + { + "event_id": "graphify-graph-output-recorded", + "ts": "2026-06-11T17:20:00Z", + "actor": "adapter", + "action": "recorded_graph_summary_output", + "evidence_ids": ["graphify-graph-summary-output"], + "summary": "The representative output includes graph summary links." + }, + { + "event_id": "graphify-source-location-recorded", + "ts": "2026-06-11T17:21:00Z", + "actor": "adapter", + "action": "recorded_source_locations", + "evidence_ids": ["graphify-source-location-output"], + "summary": "The representative output includes source-file and location citations." 
+ } + ], + "prompt": { + "role": "user", + "content": "Use graphify graph/report output to explain the graph navigation path, source citations, stale-source lint state, and unsupported quality boundary.", + "job_mode": "compile", + "constraints": ["cite_graph_report", "score_stale_lint", "flag_unsupported_summary", "no_broad_quality_claim"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "graph_report_navigation", + "text": "graphify graph.json links the ELF memory service, Qdrant rebuild, and graph report mapping nodes." + }, + { + "claim_id": "source_location_citations", + "text": "graphify query output includes source files and line-like locations for generated corpus snippets." + }, + { + "claim_id": "derived_report_boundary", + "text": "GRAPH_REPORT.md is a derived graph/report artifact and must not be treated as authoritative ELF memory." + } + ], + "must_not_include": [ + "GRAPH_REPORT.md became the authoritative ELF memory store.", + "ELF beats graphify on graph-navigation quality." 
+ ], + "evidence_links": { + "graph_report_navigation": ["graphify-graph-summary-output"], + "source_location_citations": ["graphify-source-location-output"], + "derived_report_boundary": ["graphify-derived-report-boundary"] + }, + "answer_type": "compiled_knowledge", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "graphify-graph-summary-output", + "claim_id": "graph_report_navigation", + "requirement": "cite", + "quote": "graph.json links the ELF memory service node to Qdrant rebuild and graph report mapping nodes" + }, + { + "evidence_id": "graphify-source-location-output", + "claim_id": "source_location_citations", + "requirement": "cite", + "quote": "query output includes source files and line-like locations" + }, + { + "evidence_id": "graphify-derived-report-boundary", + "claim_id": "derived_report_boundary", + "requirement": "cite", + "quote": "GRAPH_REPORT.md is a derived graph/report artifact" + } + ], + "negative_traps": [ + { + "trap_id": "graphify-authoritative-report-trap", + "type": "stale_fact", + "evidence_ids": ["graphify-stale-source-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Must identify the graph/report navigation path and source citation boundary." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Must cite graph summary, source-location, and derived-report boundary evidence." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Must expose graph report, source citations, stale-source lint, and unsupported-summary handling." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Must lint the stale authoritative-report trap instead of silently missing it." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["derived graph/report evidence"], + "fallback_action": "state_bounded_graph_report_boundary" + }, + "tags": ["external_adapter", "graph_rag", "graphify", "graph_report", "stale_source_lint", "unsupported_summary"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json new file mode 100644 index 00000000..1c649e71 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json @@ -0,0 +1,197 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "graph-rag-graphiti-temporal-validity-001", + "suite": "memory_evolution", + "title": "Keep Graphiti/Zep temporal-validity scoring provider-blocked until current and historical facts return", + "encoding": { + "status": "blocked", + "reason": "Graphiti/Zep representative temporal-validity scoring requires explicit provider configuration before Docker-local Graphiti can return current, historical, and rationale facts with validity windows.", + "follow_up": { + "title": "Run Graphiti/Zep temporal-validity job with explicit provider config", + "reason": "The representative job can score only after Graphiti search output maps current and historical validity-window facts to generated evidence ids." 
+ } + }, + "corpus": { + "corpus_id": "graph-rag-representative-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "graphiti-current-fact-contract", + "kind": "adapter_contract", + "text": "Graphiti/Zep representative contract: a current fact must carry a validity window and map to the generated current evidence id.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphiti_temporal_validity_blocked", + "evidence_id": "graphiti-current-fact-contract" + }, + "locator": { + "quote": "a current fact must carry a validity window" + } + }, + "created_at": "2026-06-11T17:15:00Z" + }, + { + "evidence_id": "graphiti-historical-fact-contract", + "kind": "adapter_contract", + "text": "Graphiti/Zep representative contract: a historical fact must remain queryable as historical instead of being presented as the current fact.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphiti_temporal_validity_blocked", + "evidence_id": "graphiti-historical-fact-contract" + }, + "locator": { + "quote": "a historical fact must remain queryable as historical" + } + }, + "created_at": "2026-06-11T17:16:00Z" + }, + { + "evidence_id": "graphiti-provider-boundary", + "kind": "adapter_blocker", + "text": "Graphiti/Zep blocker: the live temporal smoke is provider-bound and must report provider_api_key_missing when explicit credentials are absent.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphiti_temporal_validity_blocked", + "evidence_id": "graphiti-provider-boundary" + }, + "locator": { + "quote": "must report provider_api_key_missing when explicit credentials are absent" + } + }, + "created_at": "2026-06-11T17:17:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "graphiti-temporal-contract-recorded", + "ts": "2026-06-11T17:15:00Z", + "actor": "agent", + 
"action": "recorded_temporal_contract", + "evidence_ids": ["graphiti-current-fact-contract", "graphiti-historical-fact-contract"], + "summary": "Graphiti/Zep representative scoring requires current and historical validity-window facts." + } + ], + "prompt": { + "role": "user", + "content": "Score Graphiti/Zep temporal validity only when current and historical facts with validity windows are returned.", + "job_mode": "answer", + "constraints": ["distinguish_current_from_historical", "cite_temporal_facts", "typed_provider_blocker"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "graphiti_temporal_contract", + "text": "Graphiti/Zep temporal scoring requires current and historical facts with validity windows." + } + ], + "must_not_include": [ + "Graphiti/Zep temporal validity passes without provider-backed output.", + "ELF beats Graphiti/Zep temporal graph memory." + ], + "evidence_links": { + "graphiti_temporal_contract": [ + "graphiti-current-fact-contract", + "graphiti-historical-fact-contract", + "graphiti-provider-boundary" + ] + }, + "answer_type": "typed_blocker", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "graphiti-current-fact-contract", + "claim_id": "graphiti_temporal_contract", + "requirement": "cite", + "quote": "a current fact must carry a validity window" + }, + { + "evidence_id": "graphiti-historical-fact-contract", + "claim_id": "graphiti_temporal_contract", + "requirement": "cite", + "quote": "a historical fact must remain queryable as historical" + }, + { + "evidence_id": "graphiti-provider-boundary", + "claim_id": "graphiti_temporal_contract", + "requirement": "explain", + "quote": "must report provider_api_key_missing when explicit credentials are absent" + } + ], + "negative_traps": [ + { + "trap_id": "graphiti-providerless-temporal-pass", + "type": "stale_fact", + "evidence_ids": ["graphiti-historical-fact-contract"], + 
"failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.4, + "max_points": 1.0, + "criteria": "Must distinguish current and historical validity windows before scoring." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Must preserve the provider-backed temporal boundary." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Must cite current, historical, and provider-boundary evidence." + }, + "trap_avoidance": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Must not report historical facts as current." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "preserve_provider_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["graphiti-current-fact-contract"], + "historical_evidence_ids": ["graphiti-historical-fact-contract"], + "stale_trap_ids": ["graphiti-providerless-temporal-pass"], + "conflicts": [ + { + "conflict_id": "graphiti-current-historical-validity", + "claim_id": "graphiti_temporal_contract", + "current_evidence_id": "graphiti-current-fact-contract", + "historical_evidence_id": "graphiti-historical-fact-contract", + "resolved_by_evidence_id": "graphiti-provider-boundary" + } + ], + "update_rationale": { + "claim_id": "graphiti_temporal_contract", + "evidence_ids": ["graphiti-provider-boundary"], + "available": true + }, + "temporal_validity": { + "required": true, + "encoded": false, + "follow_up": "Run the provider-backed Graphiti/Zep temporal smoke and map validity windows to evidence ids." 
+ } + }, + "tags": ["external_adapter", "graph_rag", "graphiti_zep", "temporal_validity", "typed_blocked"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json new file mode 100644 index 00000000..7f851b0f --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json @@ -0,0 +1,146 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "graph-rag-graphrag-output-tables-001", + "suite": "knowledge_compilation", + "title": "Score GraphRAG output-table citations only after provider-backed tables map to evidence ids", + "encoding": { + "status": "blocked", + "reason": "GraphRAG representative knowledge-synthesis scoring is blocked until an explicitly provider-backed Docker run emits output tables whose document, text-unit, community, and report identifiers map to generated evidence ids.", + "follow_up": { + "title": "Run GraphRAG representative output-table citation job with explicit provider config", + "reason": "The representative job can score graph summaries and citations only after parquet output tables and local-search context are mapped to benchmark evidence ids." 
+ } + }, + "corpus": { + "corpus_id": "graph-rag-representative-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "graphrag-output-table-contract", + "kind": "adapter_contract", + "text": "GraphRAG representative contract: score graph summaries only when documents, text_units, communities, community_reports, entities, and relationships tables map to generated evidence ids.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphrag_output_tables_blocked", + "evidence_id": "graphrag-output-table-contract" + }, + "locator": { + "quote": "documents, text_units, communities, community_reports, entities, and relationships tables" + } + }, + "created_at": "2026-06-11T17:10:00Z" + }, + { + "evidence_id": "graphrag-provider-boundary", + "kind": "adapter_blocker", + "text": "GraphRAG blocker: live indexing and local search require explicit provider configuration; missing provider configuration remains a typed blocker.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphrag_output_tables_blocked", + "evidence_id": "graphrag-provider-boundary" + }, + "locator": { + "quote": "live indexing and local search require explicit provider configuration" + } + }, + "created_at": "2026-06-11T17:11:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "graphrag-output-contract-recorded", + "ts": "2026-06-11T17:10:00Z", + "actor": "agent", + "action": "recorded_adapter_contract", + "evidence_ids": ["graphrag-output-table-contract"], + "summary": "GraphRAG representative scoring requires output tables and source ids." 
+ } + ], + "prompt": { + "role": "user", + "content": "Compile a GraphRAG graph-summary benchmark only when output tables and citations exist.", + "job_mode": "compile", + "constraints": ["cite_output_tables", "score_graph_summaries", "typed_provider_blocker"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "output_table_contract", + "text": "GraphRAG graph-summary scoring requires output tables mapped to generated evidence ids." + } + ], + "must_not_include": [ + "GraphRAG passes graph-summary quality without provider-backed output tables.", + "ELF beats GraphRAG on graph synthesis." + ], + "evidence_links": { + "output_table_contract": ["graphrag-output-table-contract", "graphrag-provider-boundary"] + }, + "answer_type": "typed_blocker", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "graphrag-output-table-contract", + "claim_id": "output_table_contract", + "requirement": "cite", + "quote": "documents, text_units, communities, community_reports, entities, and relationships tables" + }, + { + "evidence_id": "graphrag-provider-boundary", + "claim_id": "output_table_contract", + "requirement": "explain", + "quote": "live indexing and local search require explicit provider configuration" + } + ], + "negative_traps": [ + { + "trap_id": "graphrag-providerless-pass", + "type": "unsupported_claim", + "evidence_ids": ["graphrag-provider-boundary"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Must keep GraphRAG provider-backed output as a prerequisite." + }, + "evidence_grounding": { + "weight": 0.45, + "max_points": 1.0, + "criteria": "Must require output-table identifiers before citation scoring." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Must identify graph-summary and citation artifacts needed for rerun." 
+ }, + "trap_avoidance": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Must not turn a provider blocker into a graph-synthesis pass." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "preserve_provider_blocker" + }, + "tags": ["external_adapter", "graph_rag", "graphrag", "output_tables", "typed_blocked"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json new file mode 100644 index 00000000..04629878 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json @@ -0,0 +1,141 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "graph-rag-lightrag-context-sources-001", + "suite": "retrieval", + "title": "Score LightRAG context-source references only after the Docker API exports source paths", + "encoding": { + "status": "incomplete", + "reason": "LightRAG representative context-source scoring is incomplete when the opt-in Docker API service is not started or does not export context, references, or file paths for the generated corpus.", + "follow_up": { + "title": "Run LightRAG context-source export with the Docker service profile", + "reason": "The representative job can score source references after /query only_need_context returns generated file paths or content that maps to evidence ids." 
+ } + }, + "corpus": { + "corpus_id": "graph-rag-representative-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "lightrag-context-output-contract", + "kind": "adapter_contract", + "text": "LightRAG representative contract: score context navigation only when /query context export returns generated source file paths, source snippets, or reference content mapped to benchmark evidence ids.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "lightrag_context_sources_incomplete", + "evidence_id": "lightrag-context-output-contract" + }, + "locator": { + "quote": "/query context export returns generated source file paths, source snippets, or reference content" + } + }, + "created_at": "2026-06-11T17:05:00Z" + }, + { + "evidence_id": "lightrag-service-boundary", + "kind": "adapter_blocker", + "text": "LightRAG boundary: missing or unreachable Docker API service is an incomplete setup state, not evidence of graph-RAG citation quality.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "lightrag_context_sources_incomplete", + "evidence_id": "lightrag-service-boundary" + }, + "locator": { + "quote": "missing or unreachable Docker API service is an incomplete setup state" + } + }, + "created_at": "2026-06-11T17:06:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "lightrag-context-contract-recorded", + "ts": "2026-06-11T17:05:00Z", + "actor": "agent", + "action": "recorded_adapter_contract", + "evidence_ids": ["lightrag-context-output-contract"], + "summary": "LightRAG context-source scoring needs context export with generated source mappings." 
+ } + ], + "prompt": { + "role": "user", + "content": "Score LightRAG source-reference navigation only when context export is available.", + "job_mode": "answer", + "constraints": ["cite_source_paths", "typed_incomplete_setup", "no_graph_rag_quality_claim"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "context_source_contract", + "text": "LightRAG context-source scoring requires exported context or references mapped to evidence ids." + } + ], + "must_not_include": [ + "LightRAG passes representative graph-RAG navigation.", + "ELF beats LightRAG on source-reference navigation." + ], + "evidence_links": { + "context_source_contract": ["lightrag-context-output-contract", "lightrag-service-boundary"] + }, + "answer_type": "typed_incomplete", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "lightrag-context-output-contract", + "claim_id": "context_source_contract", + "requirement": "cite", + "quote": "/query context export returns generated source file paths, source snippets, or reference content" + }, + { + "evidence_id": "lightrag-service-boundary", + "claim_id": "context_source_contract", + "requirement": "explain", + "quote": "missing or unreachable Docker API service is an incomplete setup state" + } + ], + "negative_traps": [ + { + "trap_id": "lightrag-context-pass-claim", + "type": "unsupported_claim", + "evidence_ids": ["lightrag-service-boundary"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Must preserve incomplete setup status when the API does not export context." + }, + "evidence_grounding": { + "weight": 0.5, + "max_points": 1.0, + "criteria": "Must require generated source paths or content mappings before scoring." 
+ }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Must not treat service reachability as graph-RAG quality." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "preserve_incomplete_setup_state" + }, + "tags": ["external_adapter", "graph_rag", "lightrag", "context_sources", "typed_incomplete"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json new file mode 100644 index 00000000..5121966a --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json @@ -0,0 +1,149 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "graph-rag-ragflow-reference-chunks-001", + "suite": "retrieval", + "title": "Keep RAGFlow reference-chunk citation scoring blocked until live chunks map to evidence ids", + "encoding": { + "status": "blocked", + "reason": "RAGFlow reference-chunk citation scoring requires an explicit Docker resource opt-in plus a local API key before returned reference chunks can be mapped to generated evidence ids.", + "follow_up": { + "title": "Run RAGFlow reference-chunk citation job with Docker resource opt-in", + "reason": "The representative job can score only after the RAGFlow smoke returns reference chunks containing document, chunk, and content fields for the generated public corpus." 
+ } + }, + "corpus": { + "corpus_id": "graph-rag-representative-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "ragflow-reference-chunk-contract", + "kind": "adapter_contract", + "text": "RAGFlow representative contract: score only when returned reference chunks include generated document ids, chunk ids, content, and document metadata that map to benchmark evidence ids.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "ragflow_reference_chunks_blocked", + "evidence_id": "ragflow-reference-chunk-contract" + }, + "locator": { + "quote": "returned reference chunks include generated document ids, chunk ids, content, and document metadata" + } + }, + "created_at": "2026-06-11T17:00:00Z" + }, + { + "evidence_id": "ragflow-resource-boundary", + "kind": "adapter_blocker", + "text": "RAGFlow blocker: the checked-in smoke remains typed blocked until Docker resource-envelope opt-in and explicit local API configuration are present.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "ragflow_reference_chunks_blocked", + "evidence_id": "ragflow-resource-boundary" + }, + "locator": { + "quote": "Docker resource-envelope opt-in and explicit local API configuration" + } + }, + "created_at": "2026-06-11T17:01:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "ragflow-reference-contract-recorded", + "ts": "2026-06-11T17:00:00Z", + "actor": "agent", + "action": "recorded_adapter_contract", + "evidence_ids": ["ragflow-reference-chunk-contract"], + "summary": "RAGFlow can be scored only from generated reference chunks with stable evidence mapping." + }, + { + "event_id": "ragflow-blocker-recorded", + "ts": "2026-06-11T17:01:00Z", + "actor": "agent", + "action": "recorded_typed_blocker", + "evidence_ids": ["ragflow-resource-boundary"], + "summary": "RAGFlow representative scoring remains blocked by resource and API setup." 
+ } + ], + "prompt": { + "role": "user", + "content": "Score RAGFlow citation quality only if reference chunks from the generated corpus are available.", + "job_mode": "answer", + "constraints": ["cite_chunk_references", "preserve_typed_blocker", "no_smoke_to_quality_claim"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "reference_chunk_contract", + "text": "RAGFlow citation scoring requires returned reference chunks mapped to generated evidence ids." + } + ], + "must_not_include": [ + "RAGFlow passes broad citation quality.", + "ELF beats RAGFlow on RAG citation quality." + ], + "evidence_links": { + "reference_chunk_contract": ["ragflow-reference-chunk-contract", "ragflow-resource-boundary"] + }, + "answer_type": "typed_blocker", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "ragflow-reference-chunk-contract", + "claim_id": "reference_chunk_contract", + "requirement": "cite", + "quote": "returned reference chunks include generated document ids, chunk ids, content, and document metadata" + }, + { + "evidence_id": "ragflow-resource-boundary", + "claim_id": "reference_chunk_contract", + "requirement": "explain", + "quote": "Docker resource-envelope opt-in and explicit local API configuration" + } + ], + "negative_traps": [ + { + "trap_id": "ragflow-smoke-quality-win", + "type": "unsupported_claim", + "evidence_ids": ["ragflow-resource-boundary"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Must preserve the blocked citation-scoring boundary." + }, + "evidence_grounding": { + "weight": 0.5, + "max_points": 1.0, + "criteria": "Must require reference chunk ids and document metadata before scoring." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Must not convert the smoke contract into a broad RAGFlow quality claim." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "preserve_typed_blocker" + }, + "tags": ["external_adapter", "graph_rag", "ragflow", "reference_chunks", "typed_blocked"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index e7cd237f..f5ccdf80 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -1797,6 +1797,27 @@ "evidence": "Resource envelope and service startup retry guidance must be documented first." } ], + "scenarios": [ + { + "scenario_id": "reference_chunk_citation_mapping", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for RAGFlow reference-chunk citation scoring. 
The job must remain blocked until returned reference chunks include generated document ids, chunk ids, content, and document metadata mapped to benchmark evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "private_or_large_corpus_ragflow_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Private corpus, large-corpus, and hosted RAGFlow quality are outside the generated-public Docker representative lane and must not be inferred from smoke reports.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], "evidence": [ { "kind": "source", @@ -1920,6 +1941,27 @@ "evidence": "The smoke records context/source mappings, but full trace or viewer diagnostics are not mapped to benchmark scoring." } ], + "scenarios": [ + { + "scenario_id": "context_source_reference_mapping", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative incomplete fixture for LightRAG context/source-reference scoring. 
The job cannot score until the opt-in Docker API exports generated source file paths, snippets, or reference content.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "graph_rag_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "LightRAG graph-RAG navigation quality remains not_tested beyond the context-source output contract; no ELF win, tie, or loss is claimed.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], "evidence": [ { "kind": "source", @@ -2058,6 +2100,27 @@ "evidence": "GraphRAG update/delete/current-versus-historical behavior is not encoded by the smoke." } ], + "scenarios": [ + { + "scenario_id": "output_table_citation_mapping", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for GraphRAG output-table citation scoring. 
The job requires provider-backed Docker output tables whose document, text-unit, community, report, entity, and relationship identifiers map to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + }, + { + "scenario_id": "graph_summary_synthesis_quality", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "GraphRAG graph-summary synthesis quality remains not_tested until provider-backed output tables and local-search context are scored beyond the smoke contract.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], "evidence": [ { "kind": "source", @@ -2196,6 +2259,27 @@ "evidence": "The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations." } ], + "scenarios": [ + { + "scenario_id": "temporal_validity_window_mapping", + "suite_id": "memory_evolution", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for Graphiti/Zep temporal-validity scoring. 
The job remains blocked until provider-backed Docker output maps current and historical validity-window facts to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json" + }, + { + "scenario_id": "hosted_zep_temporal_memory", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted Zep service behavior is outside the Docker-local representative lane; no hosted-service result is used as ELF win/loss evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], "evidence": [ { "kind": "source", @@ -2618,6 +2702,17 @@ "evidence": "Resume answers from wiki pages are not encoded." } ], + "scenarios": [ + { + "scenario_id": "wiki_page_citation_lint", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "llm-wiki remains a knowledge-workflow reference. No Docker-contained plugin or file-based page materializer emits cited wiki sections for scoring.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], "evidence": [ { "kind": "source", @@ -2692,6 +2787,17 @@ "evidence": "Operator continuity through brain pages is not encoded." 
} ], + "scenarios": [ + { + "scenario_id": "compiled_truth_timeline_export", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "gbrain compiled-truth and timeline scoring remains blocked until a Docker-local brain repository and database setup emits current-truth pages with source timeline evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], "evidence": [ { "kind": "source", @@ -2796,6 +2902,27 @@ "evidence": "Resume answers from graph context are not encoded." } ], + "scenarios": [ + { + "scenario_id": "graph_report_navigation_lint", + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-929 adds a representative graphify fixture that scores graph report navigation, source-location citations, stale-source lint, and unsupported-summary handling as wrong_result because stale-source lint is still missing. 
This remains graphify non-pass evidence, not an ELF victory claim.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json" + }, + { + "scenario_id": "broad_graph_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Broad graph-navigation, codebase, multimodal, and private-corpus quality remain not_tested; the graphify evidence is bounded to generated graph/report artifacts.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], "evidence": [ { "kind": "source", diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index c1e541bb..a71a7c81 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -72,6 +72,13 @@ fn context_trajectory_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("context_trajectory") } +fn graph_rag_external_fixture_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("fixtures") + .join("real_world_external_adapters") + .join("graph_rag") +} + fn workspace_root() -> Result { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let root = manifest_dir @@ -510,7 +517,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(22) + Some(34) ); assert_eq!( report @@ -522,7 +529,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") .and_then(Value::as_u64), - Some(11) + Some(16) ); let adapters = array_at(&report, "/external_adapters/adapters")?; @@ -680,25 +687,25 @@ fn 
assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/unsupported") .and_then(Value::as_u64), - Some(2) + Some(3) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_status_counts/blocked") .and_then(Value::as_u64), - Some(8) + Some(12) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_status_counts/incomplete") .and_then(Value::as_u64), - Some(0) + Some(1) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_status_counts/wrong_result") .and_then(Value::as_u64), - Some(5) + Some(6) ); assert_eq!( report @@ -716,7 +723,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/not_encoded") .and_then(Value::as_u64), - Some(6) + Some(11) ); assert_eq!( report @@ -740,7 +747,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(23) + Some(35) ); assert_eq!( report @@ -764,19 +771,19 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") .and_then(Value::as_u64), - Some(12) + Some(17) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_outcome_counts/blocked") .and_then(Value::as_u64), - Some(8) + Some(13) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_outcome_counts/non_goal") .and_then(Value::as_u64), - Some(3) + Some(5) ); } @@ -838,6 +845,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { assert_graph_rag_research_gate_records(ragflow, lightrag, graphrag); assert_graphiti_zep_adapter(graphiti_zep); assert_graphify_adapter(graphify)?; + assert_graph_rag_representative_scenarios(ragflow, lightrag, graphrag, graphiti_zep, graphify)?; 
assert_letta_core_archival_gate(letta)?; assert_qmd_deep_profile_gate(qmd_deep); @@ -1367,6 +1375,63 @@ fn assert_graphify_adapter(adapter: &Value) -> Result<()> { Ok(()) } +fn assert_graph_rag_representative_scenarios( + ragflow: &Value, + lightrag: &Value, + graphrag: &Value, + graphiti_zep: &Value, + graphify: &Value, +) -> Result<()> { + let ragflow_scenarios = array_at(ragflow, "/scenarios")?; + let lightrag_scenarios = array_at(lightrag, "/scenarios")?; + let graphrag_scenarios = array_at(graphrag, "/scenarios")?; + let graphiti_scenarios = array_at(graphiti_zep, "/scenarios")?; + let graphify_scenarios = array_at(graphify, "/scenarios")?; + let ragflow_chunk = + find_by_field(ragflow_scenarios, "/scenario_id", "reference_chunk_citation_mapping")?; + let lightrag_context = + find_by_field(lightrag_scenarios, "/scenario_id", "context_source_reference_mapping")?; + let graphrag_tables = + find_by_field(graphrag_scenarios, "/scenario_id", "output_table_citation_mapping")?; + let graphiti_temporal = + find_by_field(graphiti_scenarios, "/scenario_id", "temporal_validity_window_mapping")?; + let graphify_lint = + find_by_field(graphify_scenarios, "/scenario_id", "graph_report_navigation_lint")?; + + assert_eq!( + ragflow_chunk.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!(lightrag_context.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!( + lightrag_context.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!( + graphrag_tables.pointer("/artifact").and_then(Value::as_str), + Some( + "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + ) + ); + assert_eq!( + graphiti_temporal.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!(graphify_lint.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + 
graphify_lint.pointer("/comparison_outcome").and_then(Value::as_str), + Some("not_tested") + ); + assert!( + graphify_lint + .pointer("/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("not an ELF victory claim")) + ); + + Ok(()) +} + #[test] fn graphify_generated_manifest_keeps_retrieval_unscored() -> Result<()> { let manifest = serde_json::json!({ @@ -1481,6 +1546,61 @@ fn graphify_generated_manifest_keeps_retrieval_unscored() -> Result<()> { Ok(()) } +#[test] +fn graph_rag_representative_fixtures_report_typed_non_pass_states() -> Result<()> { + let report = run_json_report_from(graph_rag_external_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); + assert_eq!( + report.pointer("/summary/knowledge/citation_coverage").and_then(Value::as_f64), + Some(0.667) + ); + assert_eq!( + report.pointer("/summary/knowledge/stale_claim_detection").and_then(Value::as_f64), + Some(0.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(1) + ); + + let jobs = array_at(&report, "/jobs")?; + let ragflow = find_by_field(jobs, "/job_id", "graph-rag-ragflow-reference-chunks-001")?; + let lightrag = find_by_field(jobs, "/job_id", "graph-rag-lightrag-context-sources-001")?; + let graphrag = find_by_field(jobs, "/job_id", "graph-rag-graphrag-output-tables-001")?; + let graphiti = find_by_field(jobs, "/job_id", "graph-rag-graphiti-temporal-validity-001")?; + let graphify = find_by_field(jobs, 
"/job_id", "graph-rag-graphify-graph-report-001")?; + + assert_eq!(ragflow.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(lightrag.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(graphrag.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(graphiti.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(graphify.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + graphify.pointer("/knowledge/stale_claim_detection").and_then(Value::as_f64), + Some(0.0) + ); + assert_eq!( + graphify.pointer("/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + graphiti.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool), + Some(true) + ); + assert!(array_contains_str(graphify, "/produced_evidence", "graphify-source-location-output")?); + + Ok(()) +} + #[test] fn live_adapter_aggregate_forwards_graph_rag_smoke_controls() -> Result<()> { let makefile = fs::read_to_string( @@ -3346,9 +3466,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("xy844-current-worktree")); assert!(markdown.contains("Existing live-baseline reports remain valid")); assert!(markdown.contains("### Adapter Scenario Judgments")); - assert!(markdown.contains("ELF scenario positions: `wins=10, ties=11, loses=1, untested=23`")); + assert!(markdown.contains("ELF scenario positions: `wins=10, ties=11, loses=1, untested=35`")); assert!(markdown.contains( - "Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=12, blocked=8, non_goal=3`" + "Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=17, blocked=13, non_goal=5`" )); assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); diff --git 
a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 2d99e670..fee7cda8 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -40,8 +40,11 @@ The remaining caveats are material: - Several competitor strengths remain `not_tested` or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival - memory, and graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged - trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures + memory, and broad graph/RAG navigation remain unproven. XY-929 adds a + representative graph/RAG fixture slice with typed blockers, one incomplete LightRAG + job, and one graphify wrong_result job, but it does not create any broad graph/RAG + win, tie, or loss claim. XY-928 encodes OpenViking staged trajectory, hierarchy + selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. XY-927 adds fixture-only `core_archival_memory` coverage, but Letta scenario rows remain blocked or `not_tested` until the selected contained export/readback path exists. @@ -75,6 +78,7 @@ results, or lifecycle failures into one aggregate leaderboard. | `smoke_only` | A tiny setup or output-shape smoke ran. | | `research_gate` | Source/setup/resource/output-contract evidence exists only as research. | | `blocked` | A credential, private input, provider, or setup boundary is missing. | +| `incomplete` | Setup reached a partial adapter path but did not reach the behavioral scoring surface. | | `unsupported` | The project shape is not comparable for the scenario. | | `not_encoded` | The benchmark does not yet cover the scenario. 
| | `wrong_result` | The system ran but produced the wrong memory answer or evidence. | @@ -94,6 +98,7 @@ results, or lifecycle failures into one aggregate leaderboard. | `cargo make openmemory-ui-export-readback` | `2026-06-11-mem0-openmemory-history-ui-export-report.md` | mem0 local OSS passes preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history; OpenMemory export-helper setup emits a separate blocked artifact with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`, and hosted Platform export remains non-goal. | | `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. | | `cargo make graphify-docker-graph-report-smoke` | `2026-06-11-graph-rag-scored-smoke-adapter-report.md` | graphify reaches tiny Docker graph/report scoring but remains wrong_result. | +| `cargo make real-world-memory-graph-rag` | `tmp/real-world-memory/graph-rag/report.json` | Representative graph/RAG fixtures produce typed non-pass reports: RAGFlow, GraphRAG, and Graphiti/Zep blocked; LightRAG incomplete with comparison blocked; graphify wrong_result; llm-wiki not_tested; gbrain blocked; private/hosted profiles non_goal. | | `cargo make baseline-production-synthetic`, `cargo make baseline-backfill-docker`, backup/restore, Qdrant rebuild proof | `2026-06-10-production-adoption-refresh.md` | ELF has provider synthetic, stress, backfill, restore, and rebuild evidence; private-corpus proof is blocked by missing operator-owned manifest. 
| | `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` plus ELF trace-bundle and qmd CLI replay commands | `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md` | Retrieval correctness remains tied, but qmd wins current immediate top-10/replay artifact ergonomics; ELF trace/admin surfaces are useful but not yet hydrated into the default stress artifact. | @@ -108,7 +113,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 | | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. | XY-905 | | Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 | -| Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded; graphify reaches a tiny scored smoke and remains wrong_result. 
| XY-926, XY-929 | +| Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `blocked`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. The XY-929 graph/RAG representative slice scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references. | XY-926, XY-929 | | Operator debugging/viewer UX | `win` | `fixture_backed`, `live_real_world`, `blocked`, `not_encoded` | ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, but claude-mem viewer/operator workflows and OpenMemory UI/export remain blocked, so this is not a broad viewer-product superiority claim. | XY-926 | | Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains `not_encoded`; agentmemory and claude-mem hook-capture comparisons remain `blocked` until Docker-contained hook observations and write-policy/viewer readback artifacts exist, so no broad capture-hook superiority claim is allowed. | XY-933, XY-925 | | Production ops, restore, backfill, and rebuild | `win` | `live_baseline_only`, `blocked` | ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence. | XY-930 | @@ -116,7 +121,7 @@ results, or lifecycle failures into one aggregate leaderboard. 
| Personalization and scoped preferences | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `not_encoded` | ELF and qmd both pass the single encoded live personalization job. mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss. | XY-927 | | Context trajectory and hierarchical retrieval | `not_tested` | `fixture_backed`, `live_baseline_only`, `research_gate`, `wrong_result`, `blocked` | OpenViking reaches the pinned Docker local embedding path and now exposes expected/matched/missing evidence ids, but same-corpus evidence is still wrong_result; staged trajectory, hierarchy selection, and recursive expansion are encoded as blocked fixtures, not scored comparisons. | XY-928 | | Core-vs-archival memory | `blocked` | `fixture_backed`, `research_gate`, `blocked`, `not_encoded` | ELF now has 6 fixture-backed `core_archival_memory` jobs that score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search. Letta remains blocked or not tested until its contained export/readback artifact maps core and archival source ids. | XY-927 | -| Graph/RAG navigation and citations | `not_tested` | `smoke_only`, `research_gate`, `blocked`, `wrong_result`, `not_encoded` | Graph/RAG smokes produce scored or typed non-pass adapter reports where possible, but broad graph/RAG navigation and citation quality are not tested. | XY-929 | +| Graph/RAG navigation and citations | `not_tested` | `smoke_only`, `research_gate`, `blocked`, `incomplete`, `wrong_result`, `not_encoded` | `cargo make real-world-memory-graph-rag` adds representative citation, graph-summary, temporal-validity, graph-report, stale-source-lint, and unsupported-claim fixtures. 
The slice is typed non-pass: RAGFlow, GraphRAG, and Graphiti/Zep are blocked; LightRAG is incomplete with comparison blocked; graphify is wrong_result; llm-wiki is not_tested; gbrain is blocked. Broad graph/RAG navigation and citation quality remain not_tested. | XY-929 | ## Follow-Up Queue @@ -130,7 +135,7 @@ results, or lifecycle failures into one aggregate leaderboard. | XY-933 | P1 | Live ELF self-check encoded | Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked. | | XY-927 | P1 | Fixture encoded; Letta export blocked | ELF core-vs-archival fixture coverage is encoded; a contained Letta export/readback adapter remains future work before win/tie/loss claims. | | XY-928 | P1 | Encoded blocked fixtures | OpenViking context-trajectory and hierarchy benchmark is encoded but blocked until evidence-bearing same-corpus and staged artifacts exist. | -| XY-929 | P2 | Backlog | Graph/RAG adapters beyond scored smokes. | +| XY-929 | P2 | Representative fixture slice encoded; live contracts still blocked or typed non-pass | Graph/RAG adapters now have representative citation/navigation/lint fixtures, but live evidence-linked output contracts are still blocked, incomplete, wrong_result, not_tested, or non_goal. | | XY-930 | P1 | Backlog | Private-corpus and credentialed production gates after operator inputs exist. | | XY-906 | Ops | Todo | Decodex registered-project review-config schema drift blocks Decodex loading of ELF. | @@ -152,7 +157,7 @@ results, or lifecycle failures into one aggregate leaderboard. - ELF has a live temporal reconciliation loss against the benchmark expectation: five memory-evolution jobs remain `wrong_result`. - Most competitor strengths outside qmd retrieval are `not_tested`, `blocked`, - `smoke_only`, or `research_gate`. + `incomplete`, `smoke_only`, or `research_gate`. 
## Claims Not Allowed @@ -169,7 +174,8 @@ results, or lifecycle failures into one aggregate leaderboard. current comparison is blocked for their hook/viewer capture paths. - Do not claim ELF beats OpenViking on staged context trajectory. - Do not claim ELF beats Letta on core-vs-archival memory. -- Do not claim graph/RAG parity from smoke-only evidence. +- Do not claim graph/RAG parity from smoke-only or typed non-pass representative + evidence. - Do not promote `fixture_backed`, `live_baseline_only`, `smoke_only`, - `research_gate`, `blocked`, `wrong_result`, `lifecycle_fail`, `unsupported`, or - `not_encoded` states into a generic pass/fail score. + `research_gate`, `blocked`, `incomplete`, `wrong_result`, `lifecycle_fail`, + `unsupported`, or `not_encoded` states into a generic pass/fail score. diff --git a/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md b/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md index e970ea94..542e0839 100644 --- a/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md +++ b/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md @@ -1,15 +1,15 @@ # Graph/RAG Scored Smoke Adapter Report - June 11, 2026 -Goal: Record the XY-900 promotion of graph/RAG Docker smokes into scored -`real_world_job` adapter evidence without upgrading smoke evidence into broad quality -claims. +Goal: Record the XY-900 promotion of graph/RAG Docker smokes and the XY-929 +representative fixture slice into scored or typed `real_world_job` adapter evidence +without upgrading smoke or typed non-pass evidence into broad quality claims. Read this when: You need to decide whether ELF currently wins, ties, loses, or remains untested against RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify graph/RAG strengths. -Inputs: `memory_projects_manifest.json`, the graph/RAG smoke commands in -`Makefile.toml`, and the generated smoke report contracts. 
-Outputs: Scored-smoke status, claim boundary, blocker taxonomy, and next measurement -gate for each in-scope project. +Inputs: `memory_projects_manifest.json`, the graph/RAG smoke and representative +fixture commands in `Makefile.toml`, and the generated report contracts. +Outputs: Scored-smoke status, representative typed non-pass status, claim boundary, +blocker taxonomy, and next measurement gate for each in-scope project. ## Verdict @@ -29,6 +29,12 @@ typed `blocked` before live execution because `ELF_GRAPHITI_ZEP_SMOKE_START=1` a without provider credentials, the blocker remains `provider_api_key_missing`; no hosted Zep service or unrecorded provider credentials are used or implied. +XY-929 adds a representative external-adapter fixture slice for graph/RAG navigation, +citations, graph summaries, temporal validity, graph reports, stale-source lint, and +unsupported-claim handling. The slice intentionally remains typed non-pass: 5 jobs, +0 pass, 3 blocked, 1 incomplete, and 1 wrong_result. It strengthens the reporting +contract, not the quality claim. 
+ ## Scored Smoke Status | Project | Scored scenario | Command | Current scored status | Claim boundary | @@ -51,6 +57,46 @@ Each promoted smoke now writes a generated fixture and scored report: | Graphiti/Zep | `tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json` and `.md` | | graphify | `tmp/real-world-memory/graphify-smoke/graphify-report.json` and `.md` | +## Representative Fixture Slice + +Run the representative graph/RAG slice separately from the heavyweight live adapter +sweep: + +```sh +cargo make real-world-memory-graph-rag +``` + +Artifacts: + +```text +tmp/real-world-memory/graph-rag/report.json +tmp/real-world-memory/graph-rag/report.md +``` + +Current focused report summary: + +| Metric | Value | +| --- | --- | +| Jobs | 5 | +| Pass | 0 | +| Blocked | 3 | +| Incomplete | 1 | +| Wrong result | 1 | +| Temporal validity not encoded | 1 | + +Representative job outcomes: + +| Project | Representative contract | Job status | ELF outcome | Boundary | +| --- | --- | --- | --- | --- | +| RAGFlow | Reference chunks must map generated document ids, chunk ids, content, and document metadata to benchmark evidence ids. | `blocked` | `blocked` | Resource/API setup and returned reference chunks are still missing. | +| LightRAG | Context/source export must expose generated file paths, snippets, or reference content mapped to evidence ids. | `incomplete` | `blocked` | The opt-in Docker API export is not available by default, so comparison remains blocked. | +| GraphRAG | Output tables must map documents, text units, communities, reports, entities, and relationships to generated evidence ids. | `blocked` | `blocked` | Provider-backed Docker output tables are required before citation or synthesis scoring can pass. | +| Graphiti/Zep | Current and historical graph facts must carry validity windows and evidence ids. | `blocked` | `blocked` | Temporal validity is not encoded without provider-backed current/historical output. 
| +| graphify | `graph.json`, source-location report sections, unsupported-claim lint, and stale-source lint are scored. | `wrong_result` | `not_tested` | The representative job reaches scoring but misses stale-source/answer requirements; no ELF victory or graphify quality conclusion follows. | +| llm-wiki | Citation-bearing wiki/page generation with stale-source and unsupported-claim lint. | `not_encoded` | `not_tested` | No contained output contract exists yet. | +| gbrain | Compiled-truth or timeline export with evidence-linked page sections. | `blocked` | `blocked` | Docker-local setup and export readback remain missing. | +| Private, hosted, or large-corpus graph/RAG profiles | Provider, private data, or hosted service behavior. | `not_encoded` | `non_goal` | These profiles are outside the generated public representative lane unless explicitly authorized. | + The aggregate live-adapter sweep can include these reports through explicit opt-in flags. These flags include an adapter in the aggregate report; provider-backed, service-started, or resource-heavy live attempts still require the adapter-specific @@ -85,6 +131,8 @@ Allowed: - Say the in-scope graph/RAG smokes now produce scored `real_world_job` adapter reports or typed non-pass reports. +- Say the XY-929 representative slice produces typed non-pass reports that set claim + boundaries for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify, llm-wiki, and gbrain. - Say graph/RAG quality remains untested where live output has not mapped to generated evidence ids or where scored output remains typed non-pass. - Say graphify reached a tiny Docker graph/report smoke and currently scores @@ -96,7 +144,9 @@ Allowed: Not allowed: - Do not call a smoke pass a broad RAG, graph, temporal, or production-quality pass. +- Do not call a representative blocked, incomplete, wrong_result, or not_encoded job a + broad RAG, graph, temporal, or production-quality result. 
- Do not claim ELF beats Graphiti/Zep, RAGFlow, LightRAG, GraphRAG, or graphify on - their graph/RAG strengths from these smoke reports. + their graph/RAG strengths from these smoke or representative non-pass reports. - Do not use hosted/cloud-only results, host-global installs, private corpora, or unrecorded credentials as evidence for this lane. diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index e2eb3469..b2292476 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -90,15 +90,17 @@ cleanup, use `docs/guide/single_user_production.md`. source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator surfaces. - `2026-06-11-graph-rag-scored-smoke-adapter-report.md`: XY-900 graph/RAG - scored-smoke adapter report that promotes RAGFlow, LightRAG, GraphRAG, - Graphiti/Zep, and graphify smoke contracts into scored or typed non-pass - `real_world_job` adapter reports without converting smoke evidence into quality - claims. + scored-smoke adapter report, updated by XY-929 with a representative + graph/RAG fixture slice, that keeps RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, + graphify, llm-wiki, and gbrain outputs as scored or typed non-pass + `real_world_job` evidence without converting smoke or representative + non-pass evidence into quality claims. - `2026-06-11-competitor-strength-adoption-report.md`: XY-901 final competitor-strength adoption report, updated by XY-927 with fixture-backed - core-vs-archival coverage and a blocked Letta export/readback boundary, plus the - bounded personal-production decision, scenario-level win/tie/loss/not-tested - matrix, claim boundaries, and optimization issue queue. + core-vs-archival coverage and by XY-929 with representative graph/RAG + typed non-pass fixtures, plus the bounded personal-production decision, + scenario-level win/tie/loss/not-tested matrix, claim boundaries, and + optimization issue queue. 
- `2026-06-11-capture-write-policy-live-report.md`: XY-933 live capture/write-policy report that scores ELF redaction, exclusions, source ids, evidence binding, and no secret leakage while preserving typed blocked/untested boundaries for agentmemory diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 0e097230..81693524 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -263,12 +263,29 @@ returns evidence-bearing retrieval output. The checked-in `context_trajectory` fixtures keep OpenViking staged retrieval, hierarchy selection, and recursive/context expansion blocked until same-corpus evidence ids match and staged artifacts are materialized. -The expanded RAG and graph-memory records for -RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, -gbrain, graphify, and deeper qmd/OpenViking profiles are `research_gate` records until -their Docker-isolated adapter runs are implemented. These typed states describe -benchmark coverage; do not convert setup weight, missing research, or unencoded suites -into broad project quality rankings. +The expanded RAG and graph-memory records for RAGFlow, LightRAG, GraphRAG, +Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper +qmd/OpenViking profiles stay `research_gate`, typed non-pass, or not-encoded records +until Docker-contained or provider-backed evidence-linked outputs exist. 
XY-929 adds a +focused representative slice for graph/RAG navigation, citation mapping, graph +summaries, temporal validity, graph reports, stale-source lint, and unsupported-claim +handling: + +```sh +cargo make real-world-memory-graph-rag +``` + +Artifacts: + +```text +tmp/real-world-memory/graph-rag/report.json +tmp/real-world-memory/graph-rag/report.md +``` + +This slice is allowed to report blocked, incomplete, wrong_result, not_tested, and +non_goal outcomes. These typed states describe benchmark coverage; do not convert setup +weight, missing research, smoke output, or representative non-pass fixtures into broad +project quality rankings. To run the full live adapter sweep for ELF and qmd: diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 5d4aa7ad..c918eab9 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. XY-927 adds six ELF fixture-backed core_archival_memory jobs, but Letta scenario rows remain blocked or not_tested until the selected contained export/readback path exists. 
XY-925 adds fixture-backed first-generation OSS prompt coverage and typed blockers for agentmemory durable continuity, memsearch Markdown source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator surfaces without creating live external real-world suite passes. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export remains blocked and claude-mem viewer workflows remain blocked until Docker-contained hook/viewer evidence exists. XY-933 adds an ELF live capture/write-policy self-check, but agentmemory and claude-mem hook-capture breadth remain blocked until Docker-contained hook/viewer evidence exists." + "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and broad graph/RAG navigation remain unproven. XY-929 adds a representative graph/RAG fixture slice with typed blockers, one incomplete LightRAG job, and one graphify wrong_result job, but it does not create any broad graph/RAG win, tie, or loss claim. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. XY-927 adds six ELF fixture-backed core_archival_memory jobs, but Letta scenario rows remain blocked or not_tested until the selected contained export/readback path exists. 
XY-925 adds fixture-backed first-generation OSS prompt coverage and typed blockers for agentmemory durable continuity, memsearch Markdown source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator surfaces without creating live external real-world suite passes. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export remains blocked and claude-mem viewer workflows remain blocked until Docker-contained hook/viewer evidence exists. XY-933 adds an ELF live capture/write-policy self-check, but agentmemory and claude-mem hook-capture breadth remain blocked until Docker-contained hook/viewer evidence exists." ] }, "evidence_class_terms": [ @@ -22,6 +22,7 @@ "smoke_only", "research_gate", "blocked", + "incomplete", "unsupported", "not_encoded", "wrong_result", @@ -86,6 +87,11 @@ "artifact": "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md", "claim": "graphify reaches tiny Docker graph/report scoring but remains wrong_result; broad graph/RAG quality is not tested." }, + { + "command": "cargo make real-world-memory-graph-rag", + "artifact": "tmp/real-world-memory/graph-rag/report.json", + "claim": "Representative graph/RAG fixtures produce typed non-pass reports: RAGFlow, GraphRAG, and Graphiti/Zep blocked; LightRAG incomplete with comparison blocked; graphify wrong_result; llm-wiki not_tested; gbrain blocked; private and hosted profiles non_goal." 
+ }, { "command": "cargo make baseline-production-synthetic, cargo make baseline-backfill-docker, backup/restore plus Qdrant rebuild proof", "artifact": "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md", @@ -243,9 +249,10 @@ "live_real_world", "wrong_result", "research_gate", + "blocked", "not_encoded" ], - "measured_claim": "ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. graphify reaches a tiny scored smoke and remains wrong_result.", + "measured_claim": "ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. The XY-929 graph/RAG representative slice scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md" @@ -254,7 +261,7 @@ "XY-926", "XY-929" ], - "caveat": "llm-wiki, gbrain, GraphRAG, and graphify remain references until representative citation/lint jobs are scored." + "caveat": "GraphRAG, graphify, llm-wiki, and gbrain remain references until contained citation, graph-report, and lint jobs produce passable evidence-linked output." }, { "scenario_id": "operator_debugging_viewer_ux", @@ -409,17 +416,18 @@ "smoke_only", "research_gate", "blocked", + "incomplete", "wrong_result", "not_encoded" ], - "measured_claim": "Graph/RAG smokes now produce scored or typed non-pass adapter reports where possible, but broad graph/RAG navigation and citation quality are not tested.", + "measured_claim": "cargo make real-world-memory-graph-rag adds representative citation, graph-summary, temporal-validity, graph-report, stale-source-lint, and unsupported-claim fixtures. The slice is typed non-pass: RAGFlow, GraphRAG, and Graphiti/Zep are blocked; LightRAG is incomplete with comparison blocked; graphify is wrong_result; llm-wiki is not_tested; gbrain is blocked. 
Broad graph/RAG navigation and citation quality remain not_tested.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md" ], "follow_up_issues": [ "XY-929" ], - "caveat": "RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, llm-wiki, and gbrain remain blocked, research_gate, or not_encoded; graphify only has a tiny wrong_result smoke." + "caveat": "RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, llm-wiki, gbrain, and graphify have no broad quality proof; private, hosted, and large-corpus graph/RAG behavior remains non_goal unless explicitly authorized." } ], "follow_up_queue": [ @@ -474,8 +482,8 @@ { "issue": "XY-929", "priority": "P2", - "state": "Backlog", - "gap": "Graph/RAG adapters beyond scored smokes." + "state": "Representative fixture slice encoded; live contracts still blocked or typed non-pass", + "gap": "Graph/RAG adapters now have representative citation/navigation/lint fixtures, but live evidence-linked output contracts are still blocked, incomplete, wrong_result, not_tested, or non_goal." 
}, { "issue": "XY-930", @@ -497,7 +505,7 @@ "ELF ties qmd on encoded live retrieval, work_resume, project_decisions, and personalization slices.", "ELF fixture-backed core_archival_memory coverage passes attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery jobs separately from archival search.", "ELF has a live temporal reconciliation loss against the benchmark expectation: five memory_evolution jobs remain wrong_result.", - "Most competitor strengths outside qmd retrieval are not_tested, blocked, smoke_only, or research_gate.", + "Most competitor strengths outside qmd retrieval are not_tested, blocked, incomplete, smoke_only, or research_gate.", "ELF has a narrow live operator-debug win over qmd for trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence, with replay-command availability and repair-action clarity tied.", "ELF live capture/write-policy self-checks pass for redaction, exclusions, source ids, evidence binding, and no secret leakage." ], @@ -507,8 +515,8 @@ "Do not claim ELF beats mem0/OpenMemory on preference history, UI/export, hosted behavior, or graph memory. 
The local OSS correction-history scenario is currently an ELF loss, while OpenMemory UI/export is a measured setup blocker and hosted behavior plus graph memory remain outside measured local OSS evidence.", "Do not claim ELF beats OpenViking on staged context trajectory.", "Do not claim ELF beats Letta on core-vs-archival memory.", - "Do not claim graph/RAG parity from smoke-only evidence.", - "Do not promote fixture-backed, live_baseline_only, smoke_only, research_gate, blocked, wrong_result, lifecycle_fail, unsupported, or not_encoded states into a generic pass/fail score.", + "Do not claim graph/RAG parity from smoke-only or typed non-pass representative evidence.", + "Do not promote fixture-backed, live_baseline_only, smoke_only, research_gate, blocked, incomplete, wrong_result, lifecycle_fail, unsupported, or not_encoded states into a generic pass/fail score.", "Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice.", "Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth; the current comparison is blocked for their hook/viewer capture paths." ]