From 2707f7c344d7d5f26f73104b85c6f8c1f79e0b37 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 10 Jun 2026 16:03:45 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add expanded RAG and graph-memory adapter research gates","authority":"XY-834"} --- README.md | 14 +- .../memory_projects_manifest.json | 1010 +++++++++++++++++ .../src/bin/real_world_job_benchmark.rs | 120 +- .../tests/real_world_job_benchmark.rs | 63 +- ...2026-06-10-real-world-comparison-report.md | 17 +- .../benchmarking/live_baseline_benchmark.md | 4 +- .../real_world_agent_memory_benchmark.md | 15 +- .../research/comparison_external_projects.md | 23 +- .../external_memory_improvement_plan.md | 2 + .../research/research_projects_inventory.md | 10 +- .../real_world_agent_memory_benchmark_v1.md | 39 +- 11 files changed, 1273 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 4fc5cf10..e306299d 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,12 @@ with the production embedding provider path, `Qwen3-Embedding-8B`, and `retrieval`, and `project_decisions` jobs through `cargo make real-world-memory-live-adapters`. This does not imply full-suite live-service parity, broad adapter parity, or private-corpus production proof. +- Expanded adapter-pack coverage after XY-834: the real-world external adapter + manifest now includes `research_gate` records for RAGFlow, LightRAG, GraphRAG, + Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper + qmd/OpenViking profiles. These records carry source/setup/runtime/resource/retry + metadata and typed `blocked`, `incomplete`, or `not_encoded` states; they are not + fixture-backed or live adapter pass evidence. 
- The benchmark runner and report publisher are checked in and Docker-isolated: `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`, `cargo make baseline-production-private-addendum`, @@ -174,10 +180,10 @@ Detailed evidence and interpretation: [Real-World Agent Memory Benchmark v1](docs/spec/real_world_agent_memory_benchmark_v1.md). This contract defines job-level suites for agent work. `cargo make real-world-memory` now reports fixture-backed ELF evidence plus the external adapter coverage manifest - for ELF, qmd, agentmemory, mem0/OpenMemory, claude-mem, memsearch, and OpenViking. - The report still distinguishes fixture-backed and live-baseline-only evidence from - true live real-world adapter runs; only the targeted ELF and qmd live adapter slice - currently executes `real_world_job` prompts and scoring. + for the first memory-project set plus expanded RAG and graph-memory research gates. + The report still distinguishes fixture-backed, live-baseline-only, research-gate, + and true live real-world adapter evidence; only the targeted ELF and qmd live + adapter slice currently executes `real_world_job` prompts and scoring. Evidence-backed position after the June 10 real-world report: diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 8b9f0f61..9ee1acb6 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -734,6 +734,1016 @@ "notes": [ "claude-mem remains a UX reference; current Docker evidence is not a real-world progressive-disclosure pass." 
] + }, + { + "adapter_id": "qmd_deep_profile_gate", + "project": "qmd", + "adapter_kind": "docker_cli_deep_profile_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "pass", + "evidence": "qmd already has a Docker CLI live-baseline adapter; this gate records the deeper profile extension before a separate scaled run is claimed.", + "command": "ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "not_encoded", + "evidence": "No expanded qmd stress or real_world_job deep-profile artifact is checked in for this adapter-pack gate." + }, + "result": { + "status": "not_encoded", + "evidence": "qmd deep retrieval-debug evidence remains a planned profile, not a new pass claim." + }, + "capabilities": [ + { + "capability": "stress_profile_retrieval_debug", + "status": "not_encoded", + "evidence": "The stress command path exists, but this adapter-pack gate has not published a deep qmd profile result." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "The qmd live real-world slice covers representative jobs only; expanded retrieval-debug suites need their own materialized adapter run." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "Repository-supported qmd benchmark runs must stay inside docker-compose.baseline.yml and must not require host-global installs." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "A deeper stress retrieval-debug report is not checked in for this gate." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd query planning and score readback are not yet scored as operator-debugging real_world_job outputs." 
+ } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/tobi/qmd", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "qmd repository", + "url": "https://github.com/tobi/qmd", + "evidence": "Official qmd source for local hybrid search, CLI setup, and query behavior." + } + ], + "setup_path": "Use the existing Docker baseline qmd install, collection add, update, embed, and query flow with scale or stress profiles.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container with project files and caches inside Docker volumes.", + "resource_expectation": "CPU local embedding and rerank costs scale with corpus size; record elapsed time and qmd log artifacts before claims.", + "retry_guidance": [ + "Run qmd stress profile in Docker and publish the artifact path.", + "Map qmd JSON output to retrieval-debug real_world_job scoring before suite claims." + ], + "research_depth": "D2 reviewed; deep profile not encoded" + }, + "notes": [ + "This gate deepens qmd planning without changing the existing qmd pass evidence from the smoke live baseline." 
+ ] + }, + { + "adapter_id": "openviking_deep_profile_gate", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_deep_profile_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "incomplete", + "setup": { + "status": "incomplete", + "evidence": "OpenViking deep-profile work is blocked at the same Docker local-embedding dependency boundary as the current live-baseline adapter.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "incomplete", + "evidence": "The adapter cannot fairly exercise hierarchical trajectory behavior until add_resource/find reaches execution in Docker." + }, + "result": { + "status": "incomplete", + "evidence": "No OpenViking deep context-trajectory result is claimed from a setup-blocked run." + }, + "capabilities": [ + { + "capability": "docker_local_embed_setup", + "status": "incomplete", + "evidence": "The local embedding setup must be pinned before deep profile runs can execute." + }, + { + "capability": "hierarchical_context_trajectory", + "status": "not_encoded", + "evidence": "Stage trajectory scoring is not encoded until setup reaches runnable OpenViking APIs." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "The adapter pack must not ask operators to install OpenViking dependencies globally on the host." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "incomplete", + "evidence": "Same-corpus retrieval setup remains incomplete in Docker." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "No OpenViking resume or context trajectory real_world_job run is encoded." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Trajectory readback is a reference feature but not a scored adapter output." 
+ } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/volcengine/OpenViking/", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "incomplete" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + } + ], + "setup_path": "Pin a Docker-compatible local embedding path, then run OpenViking add_resource/find before any deep profile scoring.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker.", + "resource_expectation": "Local embedding builds can be native-toolchain and model heavy; record build logs, model cache size, and elapsed time.", + "retry_guidance": [ + "Pin or prebuild the local embedding dependency in the baseline image.", + "Only then add context-trajectory real_world_job scoring for hierarchical retrieval." + ], + "research_depth": "D2 reviewed; runtime setup incomplete" + }, + "notes": [ + "OpenViking remains a context-trajectory reference, but this gate prevents setup failure from becoming a quality judgment." + ] + }, + { + "adapter_id": "ragflow_research_gate", + "project": "RAGFlow", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "RAGFlow remains a large RAG system watch item; D1/D2 research must prove a Docker-safe corpus ingest and query path before adapter implementation." + }, + "run": { + "status": "not_encoded", + "evidence": "No RAGFlow real_world_job or live-baseline adapter is encoded." 
+ }, + "result": { + "status": "blocked", + "evidence": "No quality result is claimed until deployability, resource envelope, and output mapping are researched." + }, + "capabilities": [ + { + "capability": "d1_d2_research_before_adapter", + "status": "blocked", + "evidence": "The inventory marks RAGFlow as D0 pending deep dive." + }, + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The adapter must size the multi-service Docker setup and avoid host-global installs before running." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No job prompt, answer, evidence, or trap mapping is implemented." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "Corpus ingestion, query output, and evidence citation mapping need D1/D2 research." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "RAGFlow knowledge output is not mapped to real_world_job page or citation scoring." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Resource envelope and service startup retry guidance must be documented first." + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/infiniflow/ragflow", + "status": "real" + }, + { + "kind": "source", + "ref": "https://ragflow.io/docs/", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "RAGFlow repository", + "url": "https://github.com/infiniflow/ragflow", + "evidence": "Official source for RAGFlow service code and Docker Compose setup." + }, + { + "label": "RAGFlow docs", + "url": "https://ragflow.io/docs/", + "evidence": "Official deployment and setup documentation." 
+ } + ], + "setup_path": "Research the official Docker deployment, corpus ingest API, query API, and artifact export before adding a runner.", + "runtime_boundary": "Future runs must use docker-compose.baseline.yml or a nested Docker-isolated service profile without host-global installs.", + "resource_expectation": "Large multi-service RAG stack; record CPU/GPU mode, memory, disk, startup time, and provider credential needs before scoring.", + "retry_guidance": [ + "Complete a D1/D2 setup and API deep dive.", + "Prototype a tiny Docker smoke that reaches ingest and query before adding quality checks." + ], + "research_depth": "D0 watch item; D1/D2 required" + }, + "follow_up": { + "title": "[ELF benchmark adapter] Research RAGFlow Docker adapter feasibility", + "reason": "The project is too large to score fairly without setup, resource, and API mapping research." + } + }, + { + "adapter_id": "lightrag_research_gate", + "project": "LightRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "LightRAG requires D1/D2 research on Docker setup, LLM/embedding configuration, persistence, and context output before adapter implementation." + }, + "run": { + "status": "not_encoded", + "evidence": "No LightRAG real_world_job adapter is encoded." + }, + "result": { + "status": "blocked", + "evidence": "No graph-RAG quality claim is allowed until a Docker-safe adapter reaches query output." + }, + "capabilities": [ + { + "capability": "graph_augmented_rag_setup", + "status": "blocked", + "evidence": "The inventory marks LightRAG as D0 pending deep dive." + }, + { + "capability": "retrieved_context_export", + "status": "blocked", + "evidence": "The adapter must prove it can extract evidence-bearing retrieved contexts for scoring." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No LightRAG fixture materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "Graph/vector retrieval output mapping needs research." + }, + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Stale/corrected fact update behavior is not yet audited." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Trace or context-debug output is not mapped to benchmark scoring." + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LightRAG repository", + "url": "https://github.com/HKUDS/LightRAG", + "evidence": "Official source for LightRAG server, Docker, and retrieval modes." + }, + { + "label": "LightRAG Docker docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "evidence": "Official Docker deployment reference." + } + ], + "setup_path": "Research Docker Compose with explicit LLM, embedding, rerank, and storage configuration before adding a benchmark runner.", + "runtime_boundary": "Docker-only service profile with generated corpus mounted as container-local input.", + "resource_expectation": "Graph extraction and local model choices may dominate runtime; record backend choices, cache sizes, and provider needs.", + "retry_guidance": [ + "Run a tiny Docker ingest/query smoke with deterministic or local providers.", + "Verify returned contexts can be mapped to required evidence IDs." 
+ ], + "research_depth": "D0 watch item; D1/D2 required" + }, + "follow_up": { + "title": "[ELF benchmark adapter] Research LightRAG graph-RAG adapter feasibility", + "reason": "Graph extraction, persistence, and context output must be understood before fair scoring." + } + }, + { + "adapter_id": "graphrag_research_gate", + "project": "GraphRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "GraphRAG indexing cost and source-citation mapping require D1/D2 research before adapter implementation." + }, + "run": { + "status": "not_encoded", + "evidence": "No GraphRAG real_world_job adapter is encoded." + }, + "result": { + "status": "blocked", + "evidence": "No graph-navigation or knowledge-synthesis result is claimed from docs-only research." + }, + "capabilities": [ + { + "capability": "indexing_resource_envelope", + "status": "blocked", + "evidence": "Official docs warn that indexing can be expensive; the benchmark must start small and record costs." + }, + { + "capability": "source_citation_mapping", + "status": "blocked", + "evidence": "The adapter must map graph summaries and query output back to benchmark evidence IDs." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No GraphRAG materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "blocked", + "evidence": "Community summaries and graph reports need source coverage checks before scoring." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "Query output and expected-evidence mapping are not researched." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Indexing resource envelope is not established." 
+ } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/microsoft/graphrag", + "status": "real" + }, + { + "kind": "source", + "ref": "https://microsoft.github.io/graphrag/", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "GraphRAG repository", + "url": "https://github.com/microsoft/graphrag", + "evidence": "Official Microsoft GraphRAG source and setup reference." + }, + { + "label": "GraphRAG docs", + "url": "https://microsoft.github.io/graphrag/", + "evidence": "Official documentation for indexing and querying." + } + ], + "setup_path": "Research a tiny CLI index/query path with explicit model configuration and source mapping.", + "runtime_boundary": "Docker-only Python CLI run with generated corpus and container-local artifacts.", + "resource_expectation": "Indexing may be expensive; record model calls, cache size, elapsed time, and maximum corpus size used.", + "retry_guidance": [ + "Complete D1/D2 indexing and query-output research.", + "Add a cost-bounded smoke before any scale or quality claim." + ], + "research_depth": "D0 watch item; D1/D2 required" + }, + "follow_up": { + "title": "[ELF benchmark adapter] Research GraphRAG cost-bounded adapter path", + "reason": "Indexing cost, graph summaries, and citation guarantees need proof before scoring." + } + }, + { + "adapter_id": "graphiti_zep_research_gate", + "project": "Graphiti/Zep", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "Graphiti/Zep is D1 reviewed as a temporal graph-memory reference, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No temporal graph fact add/query job is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No current-versus-historical real_world_job pass is claimed." 
+ }, + "capabilities": [ + { + "capability": "temporal_graph_memory", + "status": "not_encoded", + "evidence": "Temporal fact validity is a reference dimension but not an executable adapter output." + }, + { + "capability": "docker_graph_store_setup", + "status": "blocked", + "evidence": "A safe local graph store, embedding, and LLM configuration must be documented before execution." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No Graphiti/Zep materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Current/historical fact validity jobs are not encoded for Graphiti/Zep." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Hybrid graph retrieval output is not mapped to evidence IDs." + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/getzep/graphiti", + "status": "real" + }, + { + "kind": "source", + "ref": "https://www.getzep.com/platform/graphiti/", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Graphiti repository", + "url": "https://github.com/getzep/graphiti", + "evidence": "Official open-source temporal context graph engine." + }, + { + "label": "Zep Graphiti overview", + "url": "https://www.getzep.com/platform/graphiti/", + "evidence": "Official product documentation for temporal context graph behavior." + } + ], + "setup_path": "Define a Docker-local graph store and provider configuration, then encode add/query current-versus-historical fact jobs.", + "runtime_boundary": "Docker-only service or SDK run with graph store state under benchmark artifacts.", + "resource_expectation": "Requires graph store plus LLM/embedding configuration; record service startup, storage size, and provider boundaries.", + "retry_guidance": [ + "Prototype a tiny temporal fact add/query run.", + "Map valid_at/invalid_at evidence to memory_evolution scoring." 
+ ], + "research_depth": "D1 reviewed; adapter not encoded" + } + }, + { + "adapter_id": "letta_research_gate", + "project": "Letta", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "Letta is D1 reviewed as a core/archival memory reference, but no Docker real_world_job adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No Letta core block, archival memory, or shared-memory job is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No Letta personalization or project-decision suite result is claimed." + }, + "capabilities": [ + { + "capability": "core_archival_memory", + "status": "not_encoded", + "evidence": "Core blocks and archival memory are reference semantics but not scored." + }, + { + "capability": "docker_embedding_configuration", + "status": "blocked", + "evidence": "Docker setup requires explicit embedding configuration before archival retrieval can be tested." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No Letta materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Core memory preference application is not encoded for Letta." + }, + { + "suite_id": "project_decisions", + "status": "not_encoded", + "evidence": "Archival memory decision retrieval is not encoded for Letta." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Agent resumption through Letta memory blocks is not encoded." 
+ } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/letta-ai/letta", + "status": "real" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/guides/docker/", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Letta repository", + "url": "https://github.com/letta-ai/letta", + "evidence": "Official source for Letta stateful agents and memory." + }, + { + "label": "Letta Docker docs", + "url": "https://docs.letta.com/guides/docker/", + "evidence": "Official Docker deployment guide and embedding configuration boundary." + } + ], + "setup_path": "Define Docker server setup, embedding model configuration, and a core/archival memory fixture flow.", + "runtime_boundary": "Docker-only Letta server or CLI flow with benchmark-created agents and no host-global state.", + "resource_expectation": "Embedding model and agent server state must be explicit; record storage and provider boundaries.", + "retry_guidance": [ + "Create a tiny Docker agent with archival memory search.", + "Score core-versus-archival retrieval only after source evidence can be exported." + ], + "research_depth": "D1 reviewed; adapter not encoded" + } + }, + { + "adapter_id": "langgraph_research_gate", + "project": "LangGraph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "LangGraph is D1 reviewed as a replay/checkpoint reference, not a direct memory backend adapter." + }, + "run": { + "status": "not_encoded", + "evidence": "No checkpoint replay real_world_job harness is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No production-ops or resume suite result is claimed." 
+ }, + "capabilities": [ + { + "capability": "checkpoint_replay_regression", + "status": "not_encoded", + "evidence": "Replay/fork behavior needs an agent graph harness before scoring." + }, + { + "capability": "standalone_memory_backend", + "status": "unsupported", + "evidence": "LangGraph persistence is an agent-state/checkpoint layer, not a drop-in memory retrieval backend." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No LangGraph benchmark materializer exists." + } + ], + "suites": [ + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Checkpoint recovery and replay regression are not encoded." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume from checkpoint with memory reads is not encoded." + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://docs.langchain.com/oss/python/langgraph/persistence", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LangGraph persistence docs", + "url": "https://docs.langchain.com/oss/python/langgraph/persistence", + "evidence": "Official documentation for checkpoints, replay, fork, and persistence behavior." + } + ], + "setup_path": "Build a tiny LangGraph agent with a checkpointer and explicit memory read/write steps before scoring.", + "runtime_boundary": "Docker-only Python harness with checkpoint store under the artifact directory.", + "resource_expectation": "Small runtime expected, but LLM calls and side effects must be stubbed or deterministic before replay claims.", + "retry_guidance": [ + "Encode one replay/fork failure recovery job.", + "Keep LangGraph classified as replay reference unless memory retrieval is actually exercised." 
+ ], + "research_depth": "D1 reviewed; adapter not encoded" + } + }, + { + "adapter_id": "nanograph_research_gate", + "project": "nanograph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "nanograph is D1 reviewed as typed graph DX, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No typed graph schema/query real_world_job run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No graph temporal or retrieval-debug result is claimed." + }, + "capabilities": [ + { + "capability": "typed_graph_schema", + "status": "not_encoded", + "evidence": "Schema-as-code and typed query ergonomics need a benchmark harness." + }, + { + "capability": "memory_backend_comparison", + "status": "unsupported", + "evidence": "nanograph is a graph database reference, not a complete agent memory service." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No nanograph materializer exists." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Typed current/historical fact jobs are not encoded." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Typed query explainability is not scored." + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nanograph/nanograph", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "nanograph repository", + "url": "https://github.com/nanograph/nanograph", + "evidence": "Official source for on-device typed property graph behavior." 
+ } + ], + "setup_path": "Build or install nanograph inside Docker and load a typed graph fixture from generated corpus facts.", + "runtime_boundary": "Docker-only CLI run with graph folder under benchmark artifacts.", + "resource_expectation": "Light local graph runtime expected; record binary build/install time and graph artifact size.", + "retry_guidance": [ + "Define a minimal schema for memory_evolution facts.", + "Score typed query output only if it cites fixture evidence IDs." + ], + "research_depth": "D1 reviewed; adapter not encoded" + } + }, + { + "adapter_id": "llm_wiki_research_gate", + "project": "llm-wiki", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "llm-wiki is D1 reviewed as a knowledge-compilation reference, but no plugin or generated-page adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No llm-wiki corpus-to-page run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge page citation or lint result is claimed." + }, + "capabilities": [ + { + "capability": "knowledge_page_compilation", + "status": "not_encoded", + "evidence": "Wiki generation and citation lint are not executed by the runner." + }, + { + "capability": "live_service_runtime", + "status": "unsupported", + "evidence": "llm-wiki is a plugin/workflow reference rather than a service adapter." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No page materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Corpus-to-wiki output is not encoded." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from wiki pages are not encoded." 
+ } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nvk/llm-wiki", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "llm-wiki repository", + "url": "https://github.com/nvk/llm-wiki", + "evidence": "Official source for the LLM Wiki plugin and knowledge-base workflow." + } + ], + "setup_path": "Research plugin bootstrap inside a Docker-contained Codex or file-based harness, then materialize page artifacts.", + "runtime_boundary": "Docker-only plugin or fixture materializer; no user-global Codex plugin install.", + "resource_expectation": "LLM generation cost depends on page build; record provider boundary and generated artifact size.", + "retry_guidance": [ + "Prototype a fixture-only page build with explicit citations.", + "Do not score until generated sections can be mapped to evidence IDs." + ], + "research_depth": "D1 reviewed; adapter not encoded" + } + }, + { + "adapter_id": "gbrain_research_gate", + "project": "gbrain", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "gbrain is D1 reviewed as a compiled-truth and timeline reference, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No gbrain brain-repo import or compiled-truth run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge-synthesis or operator-continuity result is claimed." + }, + "capabilities": [ + { + "capability": "compiled_truth_timeline", + "status": "not_encoded", + "evidence": "Compiled truth plus timeline output is a reference pattern but not scored." + }, + { + "capability": "postgres_backed_brain_repo", + "status": "blocked", + "evidence": "A Docker-local brain repo and Postgres setup path must be proven before execution." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No gbrain materializer exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Compiled truth and timeline pages are not scored." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Operator continuity through brain pages is not encoded." + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "gbrain repository", + "url": "https://github.com/garrytan/gbrain", + "evidence": "Official source for brain repo and retrieval workflow." + }, + { + "label": "compiled truth guide", + "url": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "evidence": "Official guide for compiled truth plus timeline behavior." + } + ], + "setup_path": "Create a Docker-local brain repo fixture, run import/sync, and export compiled truth plus timeline evidence.", + "runtime_boundary": "Docker-only repository and database state with no operator-owned brain repo.", + "resource_expectation": "Postgres-backed sync and embedding choices must be explicit; record DB size and import time.", + "retry_guidance": [ + "Prototype a tiny brain repo with one current-truth page and timeline.", + "Score only if compiled truth cites the source timeline evidence." 
+ ], + "research_depth": "D1 reviewed; adapter not encoded" + } + }, + { + "adapter_id": "graphify_research_gate", + "project": "graphify", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "graphify is D1 reviewed as a graph-navigation reference, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No graphify graph/report build is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No graph-navigation or knowledge-compilation result is claimed." + }, + "capabilities": [ + { + "capability": "graph_report_generation", + "status": "not_encoded", + "evidence": "Graph reports and assistant query flows are not executed by the runner." + }, + { + "capability": "multimodal_code_graph", + "status": "not_encoded", + "evidence": "Multimodal graph extraction is a reference capability but not scored." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No graphify materializer exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Graph report citation and lint behavior are not scored." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Graph-guided query output is not mapped to required evidence." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from graph context are not encoded." + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/safishamsi/graphify", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "graphify repository", + "url": "https://github.com/safishamsi/graphify", + "evidence": "Official source for graphify graph extraction and query workflow." 
+ } + ], + "setup_path": "Install graphify inside Docker, build a graph/report from a generated corpus, and export query evidence.", + "runtime_boundary": "Docker-only CLI or skill run over mounted benchmark corpus.", + "resource_expectation": "Graph build cost scales with corpus and model choices; record build time, graph size, and generated report size.", + "retry_guidance": [ + "Start with a generated public code/document corpus.", + "Score graph-guided answers only when report nodes cite source evidence IDs." + ], + "research_depth": "D1 reviewed; adapter not encoded" + } } ] } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index c80f749c..e987986b 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -686,6 +686,8 @@ struct ExternalAdapterReport { suites: Vec, #[serde(default)] evidence: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + execution_metadata: Option<AdapterExecutionMetadata>, #[serde(default)] notes: Vec, #[serde(skip_serializing_if = "Option::is_none")] @@ -724,6 +726,26 @@ struct AdapterEvidencePointer { status: AdapterCoverageStatus, } +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterExecutionMetadata { + #[serde(default)] + sources: Vec<AdapterSource>, + setup_path: String, + runtime_boundary: String, + resource_expectation: String, + #[serde(default)] + retry_guidance: Vec<String>, + #[serde(skip_serializing_if = "Option::is_none")] + research_depth: Option<String>, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterSource { + label: String, + url: String, + evidence: String, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] struct ExternalAdapterSummary { adapter_count: usize, @@ -733,6 +755,8 @@ struct ExternalAdapterSummary { fixture_backed_count: usize, live_baseline_only_count: usize, live_real_world_count: usize, + #[serde(default)] + research_gate_count: usize, overall_status_counts: AdapterStatusCounts, 
capability_status_counts: AdapterStatusCounts, suite_status_counts: AdapterStatusCounts, @@ -3719,7 +3743,7 @@ fn validate_external_adapter(path: &Path, adapter: &ExternalAdapterReport) -> Re } if !matches!( adapter.evidence_class.as_str(), - "fixture_backed" | "live_baseline_only" | "live_real_world" + "fixture_backed" | "live_baseline_only" | "live_real_world" | "research_gate" ) { return Err(eyre::eyre!( "{} adapter {} has unsupported evidence_class {}.", @@ -3740,6 +3764,7 @@ fn validate_external_adapter(path: &Path, adapter: &ExternalAdapterReport) -> Re validate_adapter_capabilities(path, adapter)?; validate_adapter_suites(path, adapter)?; validate_adapter_evidence(path, adapter)?; + validate_adapter_execution_metadata(path, adapter)?; if let Some(follow_up) = &adapter.follow_up && (follow_up.title.trim().is_empty() || follow_up.reason.trim().is_empty()) @@ -3822,6 +3847,40 @@ fn validate_adapter_evidence(path: &Path, adapter: &ExternalAdapterReport) -> Re Ok(()) } +fn validate_adapter_execution_metadata(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + let Some(metadata) = &adapter.execution_metadata else { + return Ok(()); + }; + + if metadata.setup_path.trim().is_empty() + || metadata.runtime_boundary.trim().is_empty() + || metadata.resource_expectation.trim().is_empty() + || metadata.retry_guidance.iter().any(|guidance| guidance.trim().is_empty()) + || metadata.sources.is_empty() + { + return Err(eyre::eyre!( + "{} adapter {} has incomplete execution metadata.", + path.display(), + adapter.adapter_id + )); + } + + for source in &metadata.sources { + if source.label.trim().is_empty() + || source.url.trim().is_empty() + || source.evidence.trim().is_empty() + { + return Err(eyre::eyre!( + "{} adapter {} has incomplete source metadata.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + fn external_adapter_summary(adapters: &[ExternalAdapterReport]) -> ExternalAdapterSummary { let mut summary = ExternalAdapterSummary { 
adapter_count: adapters.len(), @@ -3846,6 +3905,7 @@ fn accumulate_adapter_summary( summary.fixture_backed_count += usize::from(adapter.evidence_class == "fixture_backed"); summary.live_baseline_only_count += usize::from(adapter.evidence_class == "live_baseline_only"); summary.live_real_world_count += usize::from(adapter.evidence_class == "live_real_world"); + summary.research_gate_count += usize::from(adapter.evidence_class == "research_gate"); increment_adapter_status_count(&mut summary.overall_status_counts, adapter.overall_status); @@ -4013,10 +4073,11 @@ fn render_markdown_external_adapters(out: &mut String, report: &RealWorldReport) summary.host_global_install_required_count )); out.push_str(&format!( - "- Evidence classes: `{}` fixture-backed, `{}` live-baseline-only, `{}` live real-world\n", + "- Evidence classes: `{}` fixture-backed, `{}` live-baseline-only, `{}` live real-world, `{}` research-gate\n", summary.fixture_backed_count, summary.live_baseline_only_count, - summary.live_real_world_count + summary.live_real_world_count, + summary.research_gate_count )); out.push_str(&format!( "- Overall statuses: `{}`\n", @@ -4065,9 +4126,43 @@ fn render_markdown_external_adapters(out: &mut String, report: &RealWorldReport) } } + render_markdown_adapter_execution_metadata(out, report.external_adapters.adapters.as_slice()); + out.push('\n'); } +fn render_markdown_adapter_execution_metadata( + out: &mut String, + adapters: &[ExternalAdapterReport], +) { + let mut wrote_header = false; + + for adapter in adapters { + let Some(metadata) = &adapter.execution_metadata else { + continue; + }; + + if !wrote_header { + out.push_str("\n### Adapter Execution Metadata\n\n"); + out.push_str("| Adapter | Sources | Setup Path | Runtime Boundary | Resource Expectation | Retry Guidance | Research Depth |\n"); + out.push_str("| --- | --- | --- | --- | --- | --- | --- |\n"); + + wrote_header = true; + } + + out.push_str(&format!( + "| `{}` | {} | {} | {} | {} | {} | {} |\n", + 
md_inline(adapter.adapter_id.as_str()), + adapter_sources_cell(metadata.sources.as_slice()), + md_cell(metadata.setup_path.as_str()), + md_cell(metadata.runtime_boundary.as_str()), + md_cell(metadata.resource_expectation.as_str()), + md_list(metadata.retry_guidance.as_slice()), + md_cell(metadata.research_depth.as_deref().unwrap_or("not recorded")) + )); + } +} + fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_path: &str) { out.push_str("# Real-World Job Benchmark Report\n\n"); out.push_str( @@ -4728,6 +4823,25 @@ fn adapter_evidence_cell(adapter: &ExternalAdapterReport) -> String { format!("setup: `{}`
result: `{}`", md_inline(setup), md_inline(result)) } +fn adapter_sources_cell(sources: &[AdapterSource]) -> String { + if sources.is_empty() { + return "`none`".to_string(); + } + + sources + .iter() + .map(|source| { + format!( + "[{}]({}): {}", + md_cell(source.label.as_str()), + md_url(source.url.as_str()), + md_cell(source.evidence.as_str()) + ) + }) + .collect::<Vec<_>>() + .join("
") +} + fn trace_failure_stage(trace: Option<&TraceExplainability>) -> Option<&str> { trace.and_then(|trace| trace.failure_stage.as_deref()) } diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 1f9fb61b..45ac5b1f 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -122,12 +122,16 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), - Some(9) + Some(21) ); assert_eq!( report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), Some(2) ); + assert_eq!( + report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), + Some(12) + ); let jobs = array_at(&report, "/jobs")?; let job = find_by_field(jobs, "/job_id", "work-resume-stale-worktree-001")?; @@ -174,6 +178,13 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> { let report = run_json_report_from(real_world_memory_fixture_dir())?; + assert_external_adapter_manifest_summary(&report); + assert_external_adapter_manifest_records(&report)?; + + Ok(()) +} + +fn assert_external_adapter_manifest_summary(report: &Value) { assert_eq!( report.pointer("/external_adapters/schema").and_then(Value::as_str), Some("elf.real_world_external_adapter_report/v1") @@ -194,11 +205,11 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> ); assert_eq!( report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), - Some(9) + Some(21) ); assert_eq!( report.pointer("/external_adapters/summary/external_project_count").and_then(Value::as_u64), - Some(7) + Some(19) ); assert_eq!( 
report.pointer("/external_adapters/summary/fixture_backed_count").and_then(Value::as_u64), @@ -214,6 +225,10 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), Some(2) ); + assert_eq!( + report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), + Some(12) + ); assert_eq!( report .pointer("/external_adapters/summary/overall_status_counts/pass") @@ -236,7 +251,19 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> report .pointer("/external_adapters/summary/overall_status_counts/incomplete") .and_then(Value::as_u64), - Some(2) + Some(3) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/blocked") + .and_then(Value::as_u64), + Some(3) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/not_encoded") + .and_then(Value::as_u64), + Some(8) ); assert_eq!( report @@ -244,20 +271,30 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> .and_then(Value::as_u64), Some(2) ); + assert_eq!( + report + .pointer("/external_adapters/summary/capability_status_counts/unsupported") + .and_then(Value::as_u64), + Some(5) + ); assert_eq!( report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(3) + Some(10) ); +} - let adapters = array_at(&report, "/external_adapters/adapters")?; +fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { + let adapters = array_at(report, "/external_adapters/adapters")?; let elf = find_by_field(adapters, "/adapter_id", "elf_real_world_memory_fixture")?; let elf_live = find_by_field(adapters, "/adapter_id", "elf_live_real_world")?; let qmd = find_by_field(adapters, "/adapter_id", "qmd_live_baseline")?; let qmd_live = find_by_field(adapters, "/adapter_id", "qmd_live_real_world")?; let agentmemory = 
find_by_field(adapters, "/adapter_id", "agentmemory_live_baseline")?; let openviking = find_by_field(adapters, "/adapter_id", "openviking_live_baseline")?; + let ragflow = find_by_field(adapters, "/adapter_id", "ragflow_research_gate")?; + let qmd_deep = find_by_field(adapters, "/adapter_id", "qmd_deep_profile_gate")?; assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("incomplete")); @@ -280,6 +317,20 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> Some("mocked") ); assert_eq!(openviking.pointer("/overall_status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(ragflow.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); + assert_eq!(ragflow.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + ragflow.pointer("/execution_metadata/research_depth").and_then(Value::as_str), + Some("D0 watch item; D1/D2 required") + ); + assert_eq!( + ragflow.pointer("/execution_metadata/sources/0/url").and_then(Value::as_str), + Some("https://github.com/infiniflow/ragflow") + ); + assert_eq!( + qmd_deep.pointer("/capabilities/2/status").and_then(Value::as_str), + Some("unsupported") + ); Ok(()) } diff --git a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md index e35aee54..490fecfb 100644 --- a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md +++ b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md @@ -100,7 +100,7 @@ Suite-level outcomes: The real-world runner loads `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. -That manifest is an evidence ledger, not a leaderboard. It keeps three evidence classes +That manifest is an evidence ledger, not a leaderboard. 
It keeps four evidence classes separate: | Evidence class | Count | Meaning | @@ -108,6 +108,7 @@ separate: | `fixture_backed` | 1 | ELF fixture scoring through checked-in real-world jobs. | | `live_baseline_only` | 6 | Docker same-corpus/lifecycle evidence from the live-baseline runner only. | | `live_real_world` | 2 | Targeted ELF and qmd adapters execute representative `real_world_job` prompts and scoring. | +| `research_gate` | 12 | Source/setup/runtime/resource/retry metadata for future adapter paths; not fixture-backed or live execution evidence. | Adapter-level status after refreshing the manifest: @@ -122,10 +123,16 @@ Adapter-level status after refreshing the manifest: | memsearch | `live_baseline_only` | `wrong_result` | Markdown-first design remains a source-of-truth ergonomics reference. | Same-corpus retrieval was not a clean pass and real-world suites are incomplete/not encoded. | | OpenViking | `live_baseline_only` | `incomplete` | Hierarchical context trajectory remains a reference direction. | Docker local-embedding setup must be pinned before fair retrieval or real-world jobs can run. | | claude-mem | `live_baseline_only` | `wrong_result` | Progressive disclosure and local viewer remain UX references. | Current Docker evidence is not a clean same-corpus pass and progressive disclosure jobs are not encoded. | +| qmd deep profile | `research_gate` | `not_encoded` | The stress-profile command path and source metadata are recorded for a future deeper retrieval-debug run. | No expanded qmd stress artifact or broader real-world suite pass is checked in. | +| OpenViking deep profile | `research_gate` | `incomplete` | The deeper context-trajectory gate inherits the current Docker local-embedding setup blocker. | No hierarchical trajectory suite result is claimed. | +| RAGFlow, LightRAG, GraphRAG | `research_gate` | `blocked` | Official sources and setup/resource/retry expectations are recorded. 
| D1/D2 research, Docker runtime proof, and evidence-output mapping are required before adapter implementation. | +| Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify | `research_gate` | `not_encoded` | D1/D2-inspired adapter directions have source/setup/runtime/resource/retry metadata. | No Docker-isolated `real_world_job` adapter has run for these projects. | -External summary counters: `9` adapter records, `7` external project records, `9` Docker-default, -`0` host-global-install requirements, `2` live real-world adapters, `3` external -wrong-result overall states, `1` lifecycle-fail state, and `1` external incomplete state. +External summary counters: `21` adapter records, `19` non-ELF adapter records, +`21` Docker-default, `0` host-global-install requirements, `2` live real-world +adapters, and `12` research-gate records. Overall adapter statuses are `3` pass, +`3` wrong_result, `1` lifecycle_fail, `3` incomplete, `3` blocked, and +`8` not_encoded. ## Remaining Gaps @@ -144,6 +151,8 @@ report: | memsearch same-corpus and real-world coverage | `wrong_result` / `incomplete` | Fix Docker same-corpus retrieval/reindex evidence before scoring Markdown-first real-world jobs. | | OpenViking Docker local embedding path | `incomplete` | `[ELF benchmark adapter] Pin OpenViking Docker local embedding dependency path`. | | claude-mem durable/progressive-disclosure adapter | `wrong_result` / `not_encoded` | Add durable local repository and progressive-disclosure job coverage before UX parity claims. | +| RAGFlow, LightRAG, and GraphRAG adapter feasibility | `blocked` research gates | Run D1/D2 research on setup, resource envelope, corpus ingest, query output, source mapping, and Docker retry path before implementation. | +| Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, and graphify adapters | `not_encoded` research gates | Implement only after a scoped Docker path can emit evidence-linked outputs for the relevant real-world suites. 
| ## Adoption Implications diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index e71ade85..3b6a1997 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -355,7 +355,9 @@ by default and records live-baseline-only external adapter evidence under `external_adapters`; those records preserve the typed setup/run evidence but still leave real-world suites as `not_encoded`, `blocked`, `incomplete`, `wrong_result`, or `lifecycle_fail` until an adapter actually executes `real_world_job` prompts and -scoring. +scoring. The same manifest can also contain `research_gate` records for future adapter +packs; those records provide source/setup/runtime/resource/retry guidance but are not +live-baseline evidence. The targeted live real-world adapter slice for ELF and qmd is separate from the same-corpus live baseline: diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index d721a24d..61872397 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -208,9 +208,8 @@ The report also loads the checked-in external adapter coverage manifest by defau apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json ``` -That manifest records the first memory-project set: ELF, qmd, agentmemory, -mem0/OpenMemory, claude-mem, memsearch, and OpenViking. Its `external_adapters` -report section distinguishes: +That manifest records the first memory-project set plus expanded RAG and graph-memory +research gates. Its `external_adapters` report section distinguishes: - `fixture_backed`: checked-in real-world fixture scoring, such as the ELF fixture response path. @@ -218,6 +217,8 @@ report section distinguishes: a real-world suite win. 
- `live_real_world`: external adapters that actually execute `real_world_job` prompts and scoring. +- `research_gate`: checked-in source/setup/runtime/resource/retry metadata for a + future adapter path, not fixture-backed or live execution evidence. Current state: the targeted `elf_live_real_world` and `qmd_live_real_world` adapter slice is encoded through `cargo make real-world-memory-live-adapters`. It materializes @@ -228,8 +229,12 @@ record is not a real-world suite win. agentmemory is blocked on durable upstream storage for lifecycle proof. mem0/OpenMemory, memsearch, and claude-mem currently retain wrong-result or incomplete live-baseline states for the checked-in adapter evidence. OpenViking is incomplete until its local embedding setup is reliable inside -Docker. These typed states describe benchmark coverage; do not treat them as broad -project quality rankings. +Docker. The expanded RAG and graph-memory records for RAGFlow, LightRAG, GraphRAG, +Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper +qmd/OpenViking profiles are `research_gate` records until their Docker-isolated +adapter runs are implemented. These typed states describe benchmark coverage; do not +convert setup weight, missing research, or unencoded suites into broad project quality +rankings. To run the targeted live adapter slice for ELF and qmd: diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md index a61030a6..8e549544 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/guide/research/comparison_external_projects.md @@ -63,9 +63,13 @@ projects only have `live_baseline_only` Docker retrieval/lifecycle evidence, whi capabilities are `mocked`, `blocked`, `unsupported`, `incomplete`, `wrong_result`, or `lifecycle_fail`, and which real-world suites remain `not_encoded`. 
The manifest now includes targeted `live_real_world` records for ELF and qmd through -`cargo make real-world-memory-live-adapters`; other external projects remain -live-baseline-only, incomplete, blocked, or not encoded until their own -`real_world_job` adapters run. +`cargo make real-world-memory-live-adapters`; it also includes `research_gate` records +for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, +llm-wiki, gbrain, graphify, and deeper qmd/OpenViking profiles. Research gates carry +source/setup/runtime/resource/retry metadata for future adapter work, but they are not +fixture-backed, live-baseline-only, or live-real-world evidence. Other external +projects remain live-baseline-only, incomplete, blocked, or not encoded until their +own `real_world_job` adapters run. Benchmark suite labels: @@ -102,8 +106,9 @@ Project-to-suite map: | Graphiti / Zep | `rw.graph-temporal`, `rw.resume-evidence` | Temporal entities, relations, fact triples, validity windows, and graph search directly target stale/contradictory factual memory. | Add fact triples with validity changes, query current and historical answers, and score invalidation/append behavior under contradiction traps. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium-high for temporal-graph dimension. | ELF graph-lite covers evidence-linked validity windows and current/historical relation context; Graphiti/Zep remains the reference for broader temporal graph workflows. | | nanograph | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema and typed query ergonomics are relevant to making ELF graph-lite interactions inspectable and hard to misuse. | Define typed graph schemas and queries for the same fact set, then score developer-visible validation, query shape, and explainability rather than retrieval quality alone. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for DX reference, low for memory-system comparison. 
| ELF should borrow typed graph ergonomics without treating nanograph as a full memory backend. | -Pending watch items remain D0. Keep them out of benchmark strength claims until current -evidence is gathered: +Pending watch items remain D0 even when they have checked-in `research_gate` adapter +records. Keep them out of benchmark strength claims until current D1/D2 evidence is +gathered and a Docker-isolated adapter actually runs: | Watch item | Candidate suite if promoted | Minimum evidence needed before adapter or quality claims | | ---------- | --------------------------- | ------------------------------------------------------- | @@ -282,7 +287,7 @@ Capability notes: - [gbrain](https://github.com/garrytan/gbrain): Strong operational knowledge-brain shape with primary-home routing, `compiled_truth` + timeline pages, and explicit maintenance/enrichment workflows. Trade-off: page-first ontology and personal-brain workflow assumptions would over-couple ELF core to one UI/content model if copied directly. - [Always-On Memory Agent](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent): Strong always-on ingest/consolidate/query loop with multimodal inbox, timer-driven consolidation, simple SQLite persistence, and a lightweight dashboard/API. Trade-off: memory formation is LLM-first, so it does not preserve ELF-style deterministic write boundaries or evidence-bound fact contracts. - [graphify](https://github.com/safishamsi/graphify): Strong multimodal graph compression with deterministic AST extraction for code, explicit `EXTRACTED`/`INFERRED`/`AMBIGUOUS` relation tagging, and always-on assistant hooks. Trade-off: it is closer to a graph-guided corpus understanding skill than a multi-tenant memory service, so its graph artifact should be treated as a derived operator surface rather than a source-of-truth memory backend. 
-- [nanograph](https://github.com/aaltshuler/nanograph): Strong typed schema + typed query developer ergonomics. Trade-off: focuses on graph-first DX patterns rather than ELF's evidence-bound notes + multi-tenant service contract. +- [nanograph](https://github.com/nanograph/nanograph): Strong typed schema + typed query developer ergonomics. Trade-off: focuses on graph-first DX patterns rather than ELF's evidence-bound notes + multi-tenant service contract. ## nanograph Snapshot (New) @@ -293,9 +298,9 @@ Snapshot date for this subsection: March 4, 2026. Primary references: -- [nanograph](https://github.com/aaltshuler/nanograph) -- [Schema docs](https://github.com/aaltshuler/nanograph/blob/main/docs/user/schema.md) -- [Query docs](https://github.com/aaltshuler/nanograph/blob/main/docs/user/queries.md) +- [nanograph](https://github.com/nanograph/nanograph) +- [Schema docs](https://github.com/nanograph/nanograph/blob/main/docs/user/schema.md) +- [Query docs](https://github.com/nanograph/nanograph/blob/main/docs/user/queries.md) ## LLM Wiki And Operational Brain Snapshot (New) diff --git a/docs/guide/research/external_memory_improvement_plan.md b/docs/guide/research/external_memory_improvement_plan.md index 508bfab2..2e2e53a8 100644 --- a/docs/guide/research/external_memory_improvement_plan.md +++ b/docs/guide/research/external_memory_improvement_plan.md @@ -229,6 +229,8 @@ Implementation shape: - Replace mock/in-memory external adapters with durable local modes where feasible. - For every external adapter, mark which behaviors are real, mocked, unsupported, or blocked. +- For expanded RAG and graph-memory systems, use `research_gate` records until D1/D2 + research, resource sizing, and Docker runtime boundaries are proven. - Add lifecycle checks: update, delete/expire, cold-start reload, and same-corpus retrieval. - Keep failures typed with the terms in this document. 
- Use `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` diff --git a/docs/guide/research/research_projects_inventory.md b/docs/guide/research/research_projects_inventory.md index c84ddab6..23c6f565 100644 --- a/docs/guide/research/research_projects_inventory.md +++ b/docs/guide/research/research_projects_inventory.md @@ -6,7 +6,7 @@ Inputs: Existing research notes, open architecture questions, and tracked adopti Depends on: `docs/guide/research/comparison_external_projects.md`. Outputs: A current inventory of reviewed and pending external projects. -Last updated: June 9, 2026. +Last updated: June 10, 2026. ## Legend @@ -34,10 +34,10 @@ Last updated: June 9, 2026. | [Letta](https://github.com/letta-ai/letta) | D1 | Reviewed | `rw.core-archival`, `rw.operator-continuity` | Core vs archival memory split, shared blocks | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | | [LangGraph](https://docs.langchain.com/oss/python/langgraph/persistence) | D1 | Reviewed | `rw.replay-regression`, `rw.resume-evidence` | Checkpoint/replay mindset for quality regression workflows | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | | [Graphiti / Zep](https://help.getzep.com/graphiti/core-concepts/temporal-awareness) | D1 | Reviewed | `rw.graph-temporal`, `rw.resume-evidence` | Temporal fact validity model for graph-like memory evolution | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [nanograph](https://github.com/aaltshuler/nanograph) | D1 | Reviewed | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema + typed query ergonomics for graph-lite developer experience | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` 
| -| [RAGFlow](https://github.com/infiniflow/ragflow) | D0 | Watch item; pending deep dive | Candidate `rw.resume-evidence`, `rw.graph-navigation`, `rw.retrieval-debug`; no strength claim | Potential framework integration discussion; not yet audited to adoption level | Discussion history only; see watch-item evidence requirements in `docs/guide/research/comparison_external_projects.md` | -| [LightRAG](https://github.com/HKUDS/LightRAG) | D0 | Watch item; pending deep dive | Candidate `rw.graph-navigation`, `rw.graph-temporal`, `rw.retrieval-debug`; no strength claim | Graph-augmented RAG strategy relevance; not yet audited to adoption level | Discussion history only; see watch-item evidence requirements in `docs/guide/research/comparison_external_projects.md` | -| [GraphRAG](https://www.microsoft.com/en-us/research/project/graphrag/) | D0 | Watch item; pending deep dive | Candidate `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.retrieval-debug`; no strength claim | Graph-based retrieval concepts; not yet audited to implementation decision level | Discussion history only; see watch-item evidence requirements in `docs/guide/research/comparison_external_projects.md` | +| [nanograph](https://github.com/nanograph/nanograph) | D1 | Reviewed; research gate added | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema + typed query ergonomics for graph-lite developer experience | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| [RAGFlow](https://github.com/infiniflow/ragflow) | D0 | Research gate added; D1/D2 still required before adapter | Candidate `rw.resume-evidence`, `rw.graph-navigation`, `rw.retrieval-debug`; no strength claim | Potential framework integration discussion; not yet audited to adoption level | 
`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; see watch-item evidence requirements in `docs/guide/research/comparison_external_projects.md` | +| [LightRAG](https://github.com/HKUDS/LightRAG) | D0 | Research gate added; D1/D2 still required before adapter | Candidate `rw.graph-navigation`, `rw.graph-temporal`, `rw.retrieval-debug`; no strength claim | Graph-augmented RAG strategy relevance; not yet audited to adoption level | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; see watch-item evidence requirements in `docs/guide/research/comparison_external_projects.md` | +| [GraphRAG](https://github.com/microsoft/graphrag) | D0 | Research gate added; D1/D2 still required before adapter | Candidate `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.retrieval-debug`; no strength claim | Graph-based retrieval concepts; not yet audited to implementation decision level | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; see watch-item evidence requirements in `docs/guide/research/comparison_external_projects.md` | ## June 2026 Activity Snapshot diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index b48a0f97..bb0a4b82 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -162,9 +162,9 @@ Each `adapters[]` record MUST include: - `adapter_id`: stable id unique within the manifest. - `project`: display name such as `qmd`, `agentmemory`, or `mem0/OpenMemory`. - `adapter_kind`: local execution shape, for example `docker_cli_same_corpus`, - `docker_sdk_same_corpus`, or `offline_fixture_response`. -- `evidence_class`: one of `fixture_backed`, `live_baseline_only`, or - `live_real_world`. + `docker_sdk_same_corpus`, `offline_fixture_response`, or `research_gate`. 
+- `evidence_class`: one of `fixture_backed`, `live_baseline_only`, + `live_real_world`, or `research_gate`. - `docker_default`: boolean. - `host_global_installs_required`: boolean. - `overall_status`: one adapter status from the table below. @@ -177,6 +177,30 @@ Each `adapters[]` record MUST include: - `evidence`: array of evidence pointers with `kind`, `ref`, and `status`. - `notes`: optional bounded explanatory strings. - `follow_up`: optional `title` and `reason`. +- `execution_metadata`: optional object used by expanded adapter packs and research + gates. When present, it MUST include `sources`, `setup_path`, + `runtime_boundary`, `resource_expectation`, and `retry_guidance`. It MAY include + `research_depth`. + +The `research_gate` evidence class means the adapter record is a checked-in gating record +for future implementation, not a benchmark execution result. It is used when a project +needs D1/D2 research, resource sizing, credentials, Docker runtime proof, or source +mapping before a fair adapter can run. A `research_gate` record MUST NOT be counted as +fixture-backed, live-baseline-only, or live-real-world evidence. + +`execution_metadata.sources[]` entries MUST include: + +- `label`: short source label. +- `url`: official source, docs, or repository URL. +- `evidence`: bounded description of why the source matters. + +Remaining `execution_metadata` fields: + +- `setup_path`: intended setup path or the setup blocker to resolve. +- `runtime_boundary`: Docker/service/CLI/process boundary expected for safe runs. +- `resource_expectation`: expected resource or credential envelope, including unknowns. +- `retry_guidance`: one or more concrete next checks before claiming pass/fail. +- `research_depth`: optional `D0`, `D1`, or `D2` research state. 
Adapter coverage status terms: @@ -198,7 +222,8 @@ metadata, per-adapter records, and summary counters for: - adapter count, external project count, Docker-default count, host-global-install count; -- `fixture_backed`, `live_baseline_only`, and `live_real_world` evidence classes; +- `fixture_backed`, `live_baseline_only`, `live_real_world`, and `research_gate` + evidence classes; - overall adapter statuses; - capability coverage statuses; - real-world suite coverage statuses. @@ -542,9 +567,9 @@ Reports MUST include: preserving the `real`, `fixture_backed`, `mocked`, `blocked`, and `not_encoded` distinction. - external adapter coverage when an external adapter manifest is loaded, preserving - `fixture_backed`, `live_baseline_only`, `live_real_world`, `real`, `mocked`, - `unsupported`, `blocked`, `incomplete`, `wrong_result`, `lifecycle_fail`, `pass`, - and `not_encoded` distinctions. + `fixture_backed`, `live_baseline_only`, `live_real_world`, `research_gate`, + `real`, `mocked`, `unsupported`, `blocked`, `incomplete`, `wrong_result`, + `lifecycle_fail`, `pass`, and `not_encoded` distinctions. Reports that encode `memory_evolution` jobs SHOULD also include stale-answer counts, conflict detection counts, update rationale availability, and temporal-validity