diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 9ee1acb6..beea373a 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -918,7 +918,7 @@ "overall_status": "blocked", "setup": { "status": "blocked", - "evidence": "RAGFlow remains a large RAG system watch item; D1/D2 research must prove a Docker-safe corpus ingest and query path before adapter implementation." + "evidence": "XY-882 marks RAGFlow as an adapter_candidate, but the runner still needs a Docker-safe tiny-corpus ingest/query smoke before any live adapter claim." }, "run": { "status": "not_encoded", @@ -930,9 +930,9 @@ }, "capabilities": [ { - "capability": "d1_d2_research_before_adapter", - "status": "blocked", - "evidence": "The inventory marks RAGFlow as D0 pending deep dive." + "capability": "adapter_candidate_verdict", + "status": "not_encoded", + "evidence": "XY-882 completed D1/D2 feasibility research and marks RAGFlow adapter_candidate; no adapter run is encoded." }, { "capability": "docker_service_setup", @@ -985,20 +985,25 @@ "label": "RAGFlow docs", "url": "https://ragflow.io/docs/", "evidence": "Official deployment and setup documentation." + }, + { + "label": "RAGFlow HTTP API reference", + "url": "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", + "evidence": "Official reference for OpenAI-compatible responses with reference chunks and document metadata." 
} ], - "setup_path": "Research the official Docker deployment, corpus ingest API, query API, and artifact export before adding a runner.", + "setup_path": "Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API.", "runtime_boundary": "Future runs must use docker-compose.baseline.yml or a nested Docker-isolated service profile without host-global installs.", "resource_expectation": "Large multi-service RAG stack; record CPU/GPU mode, memory, disk, startup time, and provider credential needs before scoring.", "retry_guidance": [ - "Complete a D1/D2 setup and API deep dive.", - "Prototype a tiny Docker smoke that reaches ingest and query before adding quality checks." + "Start with CPU mode and a generated tiny text corpus.", + "Record image pull/build size, expanded disk use, startup time, vm.max_map_count handling, and provider boundaries before scoring." ], - "research_depth": "D0 watch item; D1/D2 required" + "research_depth": "D2 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded" }, "follow_up": { - "title": "[ELF benchmark adapter] Research RAGFlow Docker adapter feasibility", - "reason": "The project is too large to score fairly without setup, resource, and API mapping research." + "title": "[ELF benchmark adapter] Implement RAGFlow Docker evidence-smoke adapter", + "reason": "Created as XY-885. XY-882 found a Docker boundary and reference-chunk output contract; implementation must prove a tiny ingest/query run before any quality claim." } }, { @@ -1011,7 +1016,7 @@ "overall_status": "blocked", "setup": { "status": "blocked", - "evidence": "LightRAG requires D1/D2 research on Docker setup, LLM/embedding configuration, persistence, and context output before adapter implementation." + "evidence": "XY-882 marks LightRAG as an adapter_candidate, but the runner still needs a Docker context-export adapter before any live result." 
}, "run": { "status": "not_encoded", @@ -1024,8 +1029,8 @@ "capabilities": [ { "capability": "graph_augmented_rag_setup", - "status": "blocked", - "evidence": "The inventory marks LightRAG as D0 pending deep dive." + "status": "not_encoded", + "evidence": "XY-882 completed setup/output feasibility research; graph-augmented RAG execution is still not encoded." }, { "capability": "retrieved_context_export", @@ -1078,20 +1083,30 @@ "label": "LightRAG Docker docs", "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", "evidence": "Official Docker deployment reference." + }, + { + "label": "LightRAG API server docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md", + "evidence": "Official query-mode and context-output reference." + }, + { + "label": "LightRAG core programming docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md", + "evidence": "Official source-id and file-path citation reference." } ], - "setup_path": "Research Docker Compose with explicit LLM, embedding, rerank, and storage configuration before adding a benchmark runner.", + "setup_path": "Implement Docker Compose with explicit LLM, embedding, rerank, storage, workspace, and data-volume configuration, then export context-only query output.", "runtime_boundary": "Docker-only service profile with generated corpus mounted as container-local input.", "resource_expectation": "Graph extraction and local model choices may dominate runtime; record backend choices, cache sizes, and provider needs.", "retry_guidance": [ "Run a tiny Docker ingest/query smoke with deterministic or local providers.", "Verify returned contexts can be mapped to required evidence IDs." 
], - "research_depth": "D0 watch item; D1/D2 required" + "research_depth": "D2 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded" }, "follow_up": { - "title": "[ELF benchmark adapter] Research LightRAG graph-RAG adapter feasibility", - "reason": "Graph extraction, persistence, and context output must be understood before fair scoring." + "title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter", + "reason": "Created as XY-886. XY-882 found a Docker service path and context/source mapping contract; implementation must prove evidence export before scoring." } }, { @@ -1104,7 +1119,7 @@ "overall_status": "blocked", "setup": { "status": "blocked", - "evidence": "GraphRAG indexing cost and source-citation mapping require D1/D2 research before adapter implementation." + "evidence": "XY-882 marks GraphRAG as an adapter_candidate, but indexing cost and source mapping still need a cost-bounded Docker implementation before live scoring." }, "run": { "status": "not_encoded", @@ -1118,7 +1133,7 @@ { "capability": "indexing_resource_envelope", "status": "blocked", - "evidence": "Official docs warn that indexing can be expensive; the benchmark must start small and record costs." + "evidence": "XY-882 requires the first adapter to start with a tiny corpus and record indexing cost before any scale or quality claim." }, { "capability": "source_citation_mapping", @@ -1171,20 +1186,25 @@ "label": "GraphRAG docs", "url": "https://microsoft.github.io/graphrag/", "evidence": "Official documentation for indexing and querying." + }, + { + "label": "GraphRAG output tables", + "url": "https://microsoft.github.io/graphrag/index/outputs/", + "evidence": "Official output schema with document, text unit, community, and relationship identifiers." 
} ], - "setup_path": "Research a tiny CLI index/query path with explicit model configuration and source mapping.", + "setup_path": "Implement a tiny CLI/API index/query path with explicit model configuration and source mapping from parquet output tables.", "runtime_boundary": "Docker-only Python CLI run with generated corpus and container-local artifacts.", "resource_expectation": "Indexing may be expensive; record model calls, cache size, elapsed time, and maximum corpus size used.", "retry_guidance": [ - "Complete D1/D2 indexing and query-output research.", - "Add a cost-bounded smoke before any scale or quality claim." + "Add a cost-bounded smoke before any scale or quality claim.", + "Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs." ], - "research_depth": "D0 watch item; D1/D2 required" + "research_depth": "D2 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded" }, "follow_up": { - "title": "[ELF benchmark adapter] Research GraphRAG cost-bounded adapter path", - "reason": "Indexing cost, graph summaries, and citation guarantees need proof before scoring." + "title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter", + "reason": "Created as XY-887. XY-882 found a Docker-bounded CLI/API path and output-table evidence handles; implementation must stay tiny and cost-recorded." } }, { @@ -1197,7 +1217,7 @@ "overall_status": "not_encoded", "setup": { "status": "not_encoded", - "evidence": "Graphiti/Zep is D1 reviewed as a temporal graph-memory reference, but no Docker adapter is implemented." + "evidence": "XY-882 marks Graphiti/Zep as an adapter_candidate, but no Docker temporal graph adapter is implemented." }, "run": { "status": "not_encoded", @@ -1211,7 +1231,7 @@ { "capability": "temporal_graph_memory", "status": "not_encoded", - "evidence": "Temporal fact validity is a reference dimension but not an executable adapter output." 
+ "evidence": "Temporal fact validity has a scoped adapter candidate path, but no executable adapter output is encoded." }, { "capability": "docker_graph_store_setup", @@ -1259,16 +1279,30 @@ "label": "Zep Graphiti overview", "url": "https://www.getzep.com/platform/graphiti/", "evidence": "Official product documentation for temporal context graph behavior." + }, + { + "label": "Graphiti quick start", + "url": "https://help.getzep.com/graphiti/getting-started/quick-start", + "evidence": "Official setup, episode ingest, and search output reference." + }, + { + "label": "Graphiti FalkorDB configuration", + "url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", + "evidence": "Official Docker-local FalkorDB setup reference." } ], - "setup_path": "Define a Docker-local graph store and provider configuration, then encode add/query current-versus-historical fact jobs.", + "setup_path": "Implement a Docker-local FalkorDB or Neo4j graph store and provider configuration, then encode add/query current-versus-historical fact jobs.", "runtime_boundary": "Docker-only service or SDK run with graph store state under benchmark artifacts.", "resource_expectation": "Requires graph store plus LLM/embedding configuration; record service startup, storage size, and provider boundaries.", "retry_guidance": [ "Prototype a tiny temporal fact add/query run.", "Map valid_at/invalid_at evidence to memory_evolution scoring." ], - "research_depth": "D1 reviewed; adapter not encoded" + "research_depth": "D1 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded" + }, + "follow_up": { + "title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter", + "reason": "Created as XY-888. XY-882 found a Docker-local graph-store path and fact/validity-window output contract for memory_evolution scoring." 
} }, { @@ -1357,7 +1391,7 @@ "Create a tiny Docker agent with archival memory search.", "Score core-versus-archival retrieval only after source evidence can be exported." ], - "research_depth": "D1 reviewed; adapter not encoded" + "research_depth": "D1 feasibility verdict: research_only (XY-882); core/archival reference, adapter not encoded" } }, { @@ -1431,7 +1465,7 @@ "Encode one replay/fork failure recovery job.", "Keep LangGraph classified as replay reference unless memory retrieval is actually exercised." ], - "research_depth": "D1 reviewed; adapter not encoded" + "research_depth": "D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded" } }, { @@ -1505,7 +1539,7 @@ "Define a minimal schema for memory_evolution facts.", "Score typed query output only if it cites fixture evidence IDs." ], - "research_depth": "D1 reviewed; adapter not encoded" + "research_depth": "D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded" } }, { @@ -1579,7 +1613,7 @@ "Prototype a fixture-only page build with explicit citations.", "Do not score until generated sections can be mapped to evidence IDs." ], - "research_depth": "D1 reviewed; adapter not encoded" + "research_depth": "D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded" } }, { @@ -1663,7 +1697,7 @@ "Prototype a tiny brain repo with one current-truth page and timeline.", "Score only if compiled truth cites the source timeline evidence." ], - "research_depth": "D1 reviewed; adapter not encoded" + "research_depth": "D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven" } }, { @@ -1676,7 +1710,7 @@ "overall_status": "not_encoded", "setup": { "status": "not_encoded", - "evidence": "graphify is D1 reviewed as a graph-navigation reference, but no Docker adapter is implemented." 
+ "evidence": "XY-882 marks graphify as an adapter_candidate for a Docker-only CLI/materializer path, but no adapter is implemented." }, "run": { "status": "not_encoded", @@ -1690,7 +1724,7 @@ { "capability": "graph_report_generation", "status": "not_encoded", - "evidence": "Graph reports and assistant query flows are not executed by the runner." + "evidence": "Graph reports and query output have a candidate scoring path, but they are not executed by the runner." }, { "capability": "multimodal_code_graph", @@ -1733,16 +1767,25 @@ "label": "graphify repository", "url": "https://github.com/safishamsi/graphify", "evidence": "Official source for graphify graph extraction and query workflow." + }, + { + "label": "graphify README", + "url": "https://github.com/safishamsi/graphify/blob/v3/README.md", + "evidence": "Official CLI, output artifact, query, and source-location contract." } ], - "setup_path": "Install graphify inside Docker, build a graph/report from a generated corpus, and export query evidence.", - "runtime_boundary": "Docker-only CLI or skill run over mounted benchmark corpus.", + "setup_path": "Install graphify inside Docker, build a graph/report from a generated corpus, and export query evidence without installing host-global assistant hooks.", + "runtime_boundary": "Docker-only CLI/materializer run over mounted benchmark corpus.", "resource_expectation": "Graph build cost scales with corpus and model choices; record build time, graph size, and generated report size.", "retry_guidance": [ "Start with a generated public code/document corpus.", "Score graph-guided answers only when report nodes cite source evidence IDs." ], - "research_depth": "D1 reviewed; adapter not encoded" + "research_depth": "D1 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded" + }, + "follow_up": { + "title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", + "reason": "Created as XY-889. 
XY-882 found a Docker-only CLI/materializer path and source-file/source-location output contract." } } ] diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 45ac5b1f..48461dd4 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -321,7 +321,9 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { assert_eq!(ragflow.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert_eq!( ragflow.pointer("/execution_metadata/research_depth").and_then(Value::as_str), - Some("D0 watch item; D1/D2 required") + Some( + "D2 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded" + ) ); assert_eq!( ragflow.pointer("/execution_metadata/sources/0/url").and_then(Value::as_str), diff --git a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md index 490fecfb..0b91ce4e 100644 --- a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md +++ b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md @@ -110,6 +110,22 @@ separate: | `live_real_world` | 2 | Targeted ELF and qmd adapters execute representative `real_world_job` prompts and scoring. | | `research_gate` | 12 | Source/setup/runtime/resource/retry metadata for future adapter paths; not fixture-backed or live execution evidence. | +XY-882 added D1/D2 feasibility verdicts inside the research-gate lane. 
RAGFlow +([XY-885](https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter)), +LightRAG +([XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter)), +GraphRAG +([XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter)), +Graphiti/Zep +([XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter)), +and graphify +([XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter)) +are now adapter implementation candidates because they have scoped Docker boundaries +and evidence-linked output contracts. Letta, LangGraph, nanograph, and llm-wiki remain +`research_only`; gbrain remains `blocked` until a Docker-local brain repo and database +path is proven. These verdicts do not change any record into live adapter pass +evidence. + Adapter-level status after refreshing the manifest: | Project | Evidence class | Overall status | What is proven | What is not proven | @@ -125,8 +141,8 @@ Adapter-level status after refreshing the manifest: | claude-mem | `live_baseline_only` | `wrong_result` | Progressive disclosure and local viewer remain UX references. | Current Docker evidence is not a clean same-corpus pass and progressive disclosure jobs are not encoded. | | qmd deep profile | `research_gate` | `not_encoded` | The stress-profile command path and source metadata are recorded for a future deeper retrieval-debug run. | No expanded qmd stress artifact or broader real-world suite pass is checked in. | | OpenViking deep profile | `research_gate` | `incomplete` | The deeper context-trajectory gate inherits the current Docker local-embedding setup blocker. | No hierarchical trajectory suite result is claimed. 
| -| RAGFlow, LightRAG, GraphRAG | `research_gate` | `blocked` | Official sources and setup/resource/retry expectations are recorded. | D1/D2 research, Docker runtime proof, and evidence-output mapping are required before adapter implementation. | -| Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify | `research_gate` | `not_encoded` | D1/D2-inspired adapter directions have source/setup/runtime/resource/retry metadata. | No Docker-isolated `real_world_job` adapter has run for these projects. | +| RAGFlow, LightRAG, GraphRAG | `research_gate` | `blocked` | Official sources, setup/resource/retry expectations, and XY-882 adapter-candidate verdicts are recorded. | Docker runtime proof and `real_world_job` evidence-output mapping are still required before any live adapter claim. | +| Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify | `research_gate` | `not_encoded` | XY-882 records Graphiti/Zep and graphify as adapter candidates, Letta/LangGraph/nanograph/llm-wiki as research-only, and gbrain as blocked. | No Docker-isolated `real_world_job` adapter has run for these projects. | External summary counters: `21` adapter records, `19` non-ELF adapter records, `21` Docker-default, `0` host-global-install requirements, `2` live real-world @@ -151,8 +167,9 @@ report: | memsearch same-corpus and real-world coverage | `wrong_result` / `incomplete` | Fix Docker same-corpus retrieval/reindex evidence before scoring Markdown-first real-world jobs. | | OpenViking Docker local embedding path | `incomplete` | `[ELF benchmark adapter] Pin OpenViking Docker local embedding dependency path`. | | claude-mem durable/progressive-disclosure adapter | `wrong_result` / `not_encoded` | Add durable local repository and progressive-disclosure job coverage before UX parity claims. 
| -| RAGFlow, LightRAG, and GraphRAG adapter feasibility | `blocked` research gates | Run D1/D2 research on setup, resource envelope, corpus ingest, query output, source mapping, and Docker retry path before implementation. | -| Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, and graphify adapters | `not_encoded` research gates | Implement only after a scoped Docker path can emit evidence-linked outputs for the relevant real-world suites. | +| RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify adapters | `research_gate` adapter candidates | Follow-up issues [XY-885](https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter), [XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter), [XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter), [XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter), and [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter) must run only Docker-contained adapter smokes that emit evidence-linked outputs before any live result claim. | +| Letta, LangGraph, nanograph, and llm-wiki adapters | `research_only` research gates | Keep as architecture or workflow references until a contained output contract is selected. | +| gbrain adapter | `blocked` research gate | Revisit only after a Docker-local brain repo and database path can be proven without operator-owned state. 
| ## Adoption Implications diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md index 8e549544..0d297ec2 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/guide/research/comparison_external_projects.md @@ -71,6 +71,11 @@ fixture-backed, live-baseline-only, or live-real-world evidence. Other external projects remain live-baseline-only, incomplete, blocked, or not encoded until their own `real_world_job` adapters run. +XY-882 adds D1/D2 feasibility verdicts for the RAG and graph-memory research gates. +`adapter_candidate` means an implementation follow-up is justified because a scoped +Docker boundary and evidence-linked output contract exist. It does not mean a Docker +adapter has run, and it does not change the `research_gate` evidence class. + Benchmark suite labels: | Suite | Real-world job shape | @@ -106,15 +111,20 @@ Project-to-suite map: | Graphiti / Zep | `rw.graph-temporal`, `rw.resume-evidence` | Temporal entities, relations, fact triples, validity windows, and graph search directly target stale/contradictory factual memory. | Add fact triples with validity changes, query current and historical answers, and score invalidation/append behavior under contradiction traps. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium-high for temporal-graph dimension. | ELF graph-lite covers evidence-linked validity windows and current/historical relation context; Graphiti/Zep remains the reference for broader temporal graph workflows. | | nanograph | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema and typed query ergonomics are relevant to making ELF graph-lite interactions inspectable and hard to misuse. | Define typed graph schemas and queries for the same fact set, then score developer-visible validation, query shape, and explainability rather than retrieval quality alone. | Docs-grounded D1; no benchmark adapter evidence. 
Confidence: medium for DX reference, low for memory-system comparison. | ELF should borrow typed graph ergonomics without treating nanograph as a full memory backend. | -Pending watch items remain D0 even when they have checked-in `research_gate` adapter -records. Keep them out of benchmark strength claims until current D1/D2 evidence is -gathered and a Docker-isolated adapter actually runs: - -| Watch item | Candidate suite if promoted | Minimum evidence needed before adapter or quality claims | -| ---------- | --------------------------- | ------------------------------------------------------- | -| RAGFlow | `rw.resume-evidence`, `rw.graph-navigation`, `rw.retrieval-debug` | D1/D2 deep dive on deployability, corpus ingestion, graph/RAG retrieval path, API/CLI outputs, and Docker resource envelope. | -| LightRAG | `rw.graph-navigation`, `rw.graph-temporal`, `rw.retrieval-debug` | D1/D2 deep dive on graph extraction/update semantics, local persistence, query output, and whether stale/corrected facts can be tested fairly. | -| GraphRAG | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.retrieval-debug` | D1/D2 deep dive on indexing cost, graph summaries, update/rebuild behavior, source citation guarantees, and task-level output inspectability. | +XY-882 feasibility verdicts for RAG and graph-memory gates: + +| Project | Verdict | Docker boundary | Evidence-linked output contract | Follow-up | +| ------- | ------- | --------------- | ------------------------------- | --------- | +| RAGFlow | `adapter_candidate` | Official Docker Compose path, but the first adapter must use a tiny CPU corpus and record the 4 CPU / 16 GB RAM / 50 GB disk envelope, image size, `vm.max_map_count`, provider needs, and retry behavior. | OpenAI-compatible and agent completion responses can include `reference.chunks` with chunk id, document id/name, metadata, dataset id, positions, and similarity fields. 
| [XY-885](https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter); no live pass claim. | +| LightRAG | `adapter_candidate` | Docker Compose server with explicit LLM, embedding, rerank, storage, workspace, and data-volume configuration. | Context-only query modes can return the context prepared for the LLM; core APIs can insert documents with ids and source file paths. | [XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter); no live pass claim. | +| GraphRAG | `adapter_candidate` | Cost-bounded Docker Python CLI/API run over a generated tiny corpus with container-local parquet artifacts. | Output tables contain generated UUIDs, human-readable ids, source documents, text units, community reports, and text-unit links for graph summaries and relationships. | [XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter); no live pass claim. | +| Graphiti / Zep | `adapter_candidate` | Docker-local FalkorDB or Neo4j plus Python SDK runner with provider config captured under benchmark artifacts. | Search results and fact triples expose UUIDs, fact text, and validity windows (`valid_at` / `invalid_at`) that map to memory-evolution scoring. | [XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter); no live pass claim. | +| graphify | `adapter_candidate` | Docker-only CLI/materializer using `pip install graphifyy` over a mounted corpus; host-global assistant hooks are out of scope. | `graph.json`, `GRAPH_REPORT.md`, and graph query output include edge types, confidence tags, source files, and source locations. | [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter); no live pass claim. 
| +| Letta | `research_only` | Docker server exists, but current docs require explicit embedding configuration and steer Letta Code evaluation toward non-Docker local/frontier-model exploration. | Core/archival memory and shared blocks remain useful semantics, but no contained evidence export is selected for this adapter batch. | No implementation issue. | +| LangGraph | `research_only` | A Docker harness is possible, but the project is an agent-state/checkpoint framework rather than a standalone memory adapter. | Store search and checkpoints are references for replay-regression jobs, not a direct external memory output contract here. | No implementation issue. | +| nanograph | `research_only` | Official positioning is one CLI / one folder / no server / no Docker. | Typed schema, query, CDC, and search ergonomics remain graph-lite DX inspiration. | No implementation issue. | +| llm-wiki | `research_only` | Plugin or instruction-file workflow would require a contained harness before scoring; host-global plugin installs are not proof. | Wiki compile/query/lint/audit workflows are derived-knowledge references, not current adapter outputs. | No implementation issue. | +| gbrain | `blocked` | A Docker-local brain repo and database setup path was not proven in this lane. | Compiled truth, timeline, and source attribution are strong, but not enough for implementation without contained setup proof. | No implementation issue until Docker setup is proven. | ## Where ELF Is Not Yet The Reference @@ -129,7 +139,7 @@ gathered and a Docker-isolated adapter actually runs: | Agent replay and forkable regression debugging | LangGraph | ELF traces are replay evidence for retrieval, not full persisted agent-state replay with side-effect boundaries. | | Derived knowledge pages and lint/repair loops | llm-wiki, gbrain | ELF does not yet ship rebuildable entity/project pages with unsupported-claim lint as a first-class workflow. 
| | Scheduled consolidation as a product surface | Always-On Memory Agent | ELF's target should be reviewable derived consolidation, but the scheduling/operator-control workflow is not implemented. | -| Graph-compressed navigation over large corpora | graphify, GraphRAG/LightRAG watch items | ELF relation context is bounded and evidence-linked, but broader graph report/navigation workflows remain future work. | +| Graph-compressed navigation over large corpora | graphify, GraphRAG/LightRAG adapter candidates | ELF relation context is bounded and evidence-linked, but broader graph report/navigation workflows remain future work. | ## June 2026 Agentmemory And Dreaming Refresh @@ -400,6 +410,20 @@ Snapshot date for this subsection: February 17, 2026. ## Extended Source Map +- RAGFlow: + - https://ragflow.io/docs/ + - https://github.com/infiniflow/ragflow/blob/main/docker/README.md + - https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md +- LightRAG: + - https://github.com/HKUDS/LightRAG + - https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/DockerDeployment.md + - https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/LightRAG-API-Server.md + - https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/ProgramingWithCore.md +- GraphRAG: + - https://microsoft.github.io/graphrag/ + - https://microsoft.github.io/graphrag/index/inputs/ + - https://microsoft.github.io/graphrag/index/outputs/ + - https://microsoft.github.io/graphrag/query/local_search/ - mem0: - https://docs.mem0.ai/platform/features/entity-scoped-memory - https://docs.mem0.ai/platform/features/graph-memory diff --git a/docs/guide/research/research_projects_inventory.md b/docs/guide/research/research_projects_inventory.md index 23c6f565..960fcfec 100644 --- a/docs/guide/research/research_projects_inventory.md +++ b/docs/guide/research/research_projects_inventory.md @@ -27,17 +27,36 @@ Last updated: June 10, 2026. 
| [qmd](https://github.com/tobi/qmd) | D2 | Reviewed | `rw.retrieval-debug`, `rw.lifecycle-staleness`, `rw.resume-evidence` | Retrieval routing, weighted fusion, and local-first explainability | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | | [claude-mem](https://github.com/thedotmack/claude-mem) | D2 | Reviewed | `rw.operator-continuity`, `rw.resume-evidence`, `rw.retrieval-debug` | Progressive disclosure and strong operator workflow | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | | [OpenViking](https://github.com/volcengine/OpenViking) | D2 | Reviewed | `rw.context-trajectory`, `rw.resume-evidence`, `rw.retrieval-debug` | Filesystem context paradigm, hierarchical retrieval, trajectory observability | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [llm-wiki](https://github.com/nvk/llm-wiki) | D1 | Reviewed | `rw.knowledge-synthesis`, `rw.resume-evidence` | LLM-maintained wiki pattern, topic-scoped knowledge bases, query-save and lint workflows | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [gbrain](https://github.com/garrytan/gbrain) | D1 | Reviewed | `rw.knowledge-synthesis`, `rw.operator-continuity` | Operational knowledge brain, `compiled_truth` + timeline pages, enrichment and maintenance loops | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | +| [llm-wiki](https://github.com/nvk/llm-wiki) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.knowledge-synthesis`, `rw.resume-evidence` | LLM-maintained wiki pattern, topic-scoped knowledge bases, query-save and lint workflows | `docs/guide/research/comparison_external_projects.md`; 
`docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [gbrain](https://github.com/garrytan/gbrain) | D1 | Reviewed; XY-882 verdict `blocked` | `rw.knowledge-synthesis`, `rw.operator-continuity` | Operational knowledge brain, `compiled_truth` + timeline pages, enrichment and maintenance loops; blocked on Docker-local brain repo and database proof | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | | [Always-On Memory Agent](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent) | D1 | Reviewed | `rw.consolidation-review`, `rw.operator-continuity` | Always-on multimodal ingest + scheduled consolidation loop with simple local ops surface | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [graphify](https://github.com/safishamsi/graphify) | D1 | Reviewed | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Multimodal graph compression, deterministic code extraction, and always-on graph-guided assistant workflow | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [Letta](https://github.com/letta-ai/letta) | D1 | Reviewed | `rw.core-archival`, `rw.operator-continuity` | Core vs archival memory split, shared blocks | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [LangGraph](https://docs.langchain.com/oss/python/langgraph/persistence) | D1 | Reviewed | `rw.replay-regression`, `rw.resume-evidence` | Checkpoint/replay mindset for quality regression workflows | 
`docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [Graphiti / Zep](https://help.getzep.com/graphiti/core-concepts/temporal-awareness) | D1 | Reviewed | `rw.graph-temporal`, `rw.resume-evidence` | Temporal fact validity model for graph-like memory evolution | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [nanograph](https://github.com/nanograph/nanograph) | D1 | Reviewed; research gate added | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema + typed query ergonomics for graph-lite developer experience | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | -| [RAGFlow](https://github.com/infiniflow/ragflow) | D0 | Research gate added; D1/D2 still required before adapter | Candidate `rw.resume-evidence`, `rw.graph-navigation`, `rw.retrieval-debug`; no strength claim | Potential framework integration discussion; not yet audited to adoption level | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; see watch-item evidence requirements in `docs/guide/research/comparison_external_projects.md` | -| [LightRAG](https://github.com/HKUDS/LightRAG) | D0 | Research gate added; D1/D2 still required before adapter | Candidate `rw.graph-navigation`, `rw.graph-temporal`, `rw.retrieval-debug`; no strength claim | Graph-augmented RAG strategy relevance; not yet audited to adoption level | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; see watch-item evidence requirements in `docs/guide/research/comparison_external_projects.md` | -| [GraphRAG](https://github.com/microsoft/graphrag) | D0 | Research gate added; D1/D2 still required before adapter | Candidate 
`rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.retrieval-debug`; no strength claim | Graph-based retrieval concepts; not yet audited to implementation decision level | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; see watch-item evidence requirements in `docs/guide/research/comparison_external_projects.md` | +| [graphify](https://github.com/safishamsi/graphify) | D1 | Reviewed; XY-882 verdict `adapter_candidate` | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Multimodal graph compression, deterministic code extraction, and graph/report outputs with source-file/source-location references | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [Letta](https://github.com/letta-ai/letta) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.core-archival`, `rw.operator-continuity` | Core vs archival memory split, shared blocks; not an implementation candidate until a supported contained server path can export evidence | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [LangGraph](https://docs.langchain.com/oss/python/langgraph/persistence) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.replay-regression`, `rw.resume-evidence` | Checkpoint/replay mindset for quality regression workflows; not a standalone memory backend adapter | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [Graphiti / Zep](https://help.getzep.com/graphiti/core-concepts/temporal-awareness) | D1 | Reviewed; XY-882 verdict `adapter_candidate` | `rw.graph-temporal`, 
`rw.resume-evidence` | Temporal fact validity model with Docker-local graph-store options and UUID/fact/validity-window output | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [nanograph](https://github.com/nanograph/nanograph) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema + typed query ergonomics for graph-lite developer experience; official shape is no server/no Docker | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [RAGFlow](https://github.com/infiniflow/ragflow) | D2 feasibility gate | Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.resume-evidence`, `rw.graph-navigation`, `rw.retrieval-debug`; no live strength claim | Docker setup is resource-heavy but documented; API references expose document/chunk evidence handles for a tiny-corpus adapter smoke | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [LightRAG](https://github.com/HKUDS/LightRAG) | D2 feasibility gate | Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.graph-navigation`, `rw.graph-temporal`, `rw.retrieval-debug`; no live strength claim | Docker compose path, context-only query modes, and source file-path citation shape support an implementation follow-up | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [GraphRAG](https://github.com/microsoft/graphrag) | D2 feasibility gate 
| Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.retrieval-debug`; no live strength claim | Cost-bounded CLI/API path and parquet output tables expose document, text-unit, and graph-summary handles for evidence mapping | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | + +## June 10, 2026 Adapter Feasibility Verdicts + +XY-882 resolved the D1/D2 feasibility gate for the RAG and graph-memory +`research_gate` records. These verdicts do not change any project into live adapter +evidence; they only decide whether an implementation follow-up is justified. + +| Project | Verdict | Follow-up rule | +| ------- | ------- | -------------- | +| RAGFlow | `adapter_candidate` | Follow-up issue: [XY-885](https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter), a tiny Docker evidence-smoke adapter that records the resource envelope and maps `reference.chunks` to benchmark evidence. | +| LightRAG | `adapter_candidate` | Follow-up issue: [XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter), a Docker context-export adapter using explicit LLM/embedding config and source file-path citations. | +| GraphRAG | `adapter_candidate` | Follow-up issue: [XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter), a cost-bounded Docker CLI/API adapter over a tiny corpus and parquet output tables. | +| Graphiti / Zep | `adapter_candidate` | Follow-up issue: [XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter), a Docker-local temporal graph adapter that scores current/historical fact validity. 
| +| graphify | `adapter_candidate` | Follow-up issue: [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter), a Docker-only CLI/materializer adapter over `graph.json` and `GRAPH_REPORT.md`; host-global assistant hooks remain out of scope. | +| Letta | `research_only` | Keep as a core/archival memory reference until a supported contained path can export archival-memory evidence for scoring. | +| LangGraph | `research_only` | Keep as a checkpoint/replay regression reference, not a standalone external memory adapter. | +| nanograph | `research_only` | Keep as typed graph DX inspiration; official shape is no server/no Docker. | +| llm-wiki | `research_only` | Keep as a derived knowledge-page workflow reference; host-global plugin installs are not adapter proof. | +| gbrain | `blocked` | Revisit only after a Docker-local brain repo and database path can be proven without operator-owned state. | ## June 2026 Activity Snapshot @@ -73,6 +92,7 @@ replacing ELF's evidence-bound service contract. 
- Current June 2026 research runs: - `docs/research/2026-06-08-agent-memory-selection.json` - `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` + - `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` ## Notes diff --git a/docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json b/docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json new file mode 100644 index 00000000..9f42812b --- /dev/null +++ b/docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json @@ -0,0 +1,348 @@ +{ + "schema": "research-run/2", + "run_id": "2026-06-10-xy-882-rag-graph-adapter-feasibility", + "question": "Which RAG and graph-memory research gates should become Docker-bounded adapter implementation candidates for ELF real-world benchmarks?", + "success_criteria": [ + "Give RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, and graphify one explicit verdict: adapter_candidate, research_only, blocked, or reject.", + "Separate setup/resource feasibility from product quality; heavy setup is not treated as a quality failure.", + "Require adapter_candidate projects to have both a Docker-contained path and an evidence-linked output contract.", + "Keep all researched projects in the research_gate evidence class until a Docker adapter executes real_world_job scoring." + ], + "constraints": [ + "Do not implement adapters in this issue.", + "Do not use host-global installs as proof.", + "Do not claim live adapter pass evidence from source or docs review.", + "Create implementation follow-ups only for adapter candidates with a scoped Docker boundary and evidence-linked output." 
+ ], + "stop_rule": "Stop when every target project has a verdict, adapter candidates have scoped follow-up issue titles, and the docs/manifest still label these records as research gates rather than live evidence.", + "primary_hypothesis": "RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify have enough Docker-bounded setup and evidence-output shape to justify implementation follow-ups; Letta, LangGraph, nanograph, and llm-wiki remain research-only references; gbrain remains blocked until a Docker-local brain repo/database path is proven.", + "rival_hypotheses": [ + "All projects should remain research-only because none has executed in the benchmark runner.", + "All projects with official Docker or CLI instructions should become adapter candidates.", + "RAGFlow should be rejected because its official resource envelope is large." + ], + "falsifiers": [ + "If a candidate cannot run without host-global state, it is not an adapter implementation candidate for this benchmark lane.", + "If a candidate cannot emit source IDs, document IDs, file locations, citations, or equivalent evidence handles, it cannot support real_world_job scoring.", + "If a project is a useful architecture reference but not a standalone memory/retrieval output path, it should remain research_only." + ], + "coverage": { + "mode": "primary_source_docs_and_existing_repo_contracts", + "min_source_families": 4 + }, + "events": [ + { + "seq": 1, + "type": "probe_completed", + "remaining_option_count": 4, + "independent_option_questions": [ + "Does the project expose a Docker-contained setup path?", + "Does the project expose corpus ingest and query output that can map back to source evidence?", + "Is the project a direct adapter candidate, a reference-only design input, blocked by missing Docker proof, or rejected?" 
+ ], + "external_slices": [ + "RAGFlow", + "LightRAG", + "GraphRAG", + "Graphiti/Zep", + "Letta", + "LangGraph", + "nanograph", + "llm-wiki", + "gbrain", + "graphify" + ] + }, + { + "seq": 2, + "type": "evidence_recorded", + "evidence": [ + { + "id": "E1", + "kind": "contract", + "summary": "The real-world benchmark spec defines research_gate records as source/setup/runtime/resource/retry metadata for future implementation; research gates must not count as fixture-backed, live-baseline, or live-real-world evidence.", + "source_family": "repo_spec", + "source_locator": "docs/spec/real_world_agent_memory_benchmark_v1.md" + }, + { + "id": "E2", + "kind": "setup", + "summary": "RAGFlow official quickstart documents Docker startup, 4 CPU / 16 GB RAM / 50 GB disk prerequisites, x86/Nvidia support, image-size caveats, dataset creation, chunk visibility, and citation-backed retrieval testing.", + "source_family": "upstream_docs", + "source_locator": "https://ragflow.io/docs/" + }, + { + "id": "E3", + "kind": "output_contract", + "summary": "RAGFlow HTTP API can include reference metadata and returns reference chunks containing chunk id, content, document id, document name, document metadata, dataset id, positions, and similarity scores.", + "source_family": "upstream_docs", + "source_locator": "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md" + }, + { + "id": "E4", + "kind": "setup", + "summary": "LightRAG Docker docs describe docker compose startup, generated compose files, persistent data paths, environment-driven LLM and embedding configuration, and optional Docker-local vLLM embedding/rerank services.", + "source_family": "upstream_docs", + "source_locator": "https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/DockerDeployment.md" + }, + { + "id": "E5", + "kind": "output_contract", + "summary": "LightRAG supports query prefixes including context-only modes, can return the context prepared for the LLM, supports 
inserting documents with stable ids, and traces sources through file_paths.", + "source_family": "upstream_docs", + "source_locator": "https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/LightRAG-API-Server.md" + }, + { + "id": "E6", + "kind": "output_contract", + "summary": "GraphRAG writes parquet output tables with UUIDs and human-readable ids; communities and reports carry text_unit_ids, and text_units carry raw text plus document ids and relationship/entity ids.", + "source_family": "upstream_docs", + "source_locator": "https://microsoft.github.io/graphrag/index/outputs/" + }, + { + "id": "E7", + "kind": "setup", + "summary": "GraphRAG input and query docs describe a CLI/API indexing and local-search path over structured documents, raw text chunks, graph data, and query context builders.", + "source_family": "upstream_docs", + "source_locator": "https://microsoft.github.io/graphrag/" + }, + { + "id": "E8", + "kind": "output_contract", + "summary": "Graphiti/Zep requires Python plus Neo4j or FalkorDB, supports Docker-local FalkorDB, adds episodes or fact triples, and search results include UUID, fact text, valid_at, and invalid_at fields.", + "source_family": "upstream_docs", + "source_locator": "https://help.getzep.com/graphiti/getting-started/quick-start" + }, + { + "id": "E9", + "kind": "boundary", + "summary": "Letta remains a strong core/archival memory reference, but Docker use needs explicit embedding configuration and the current docs steer new Letta Code users away from Docker-first evaluation.", + "source_family": "upstream_docs", + "source_locator": "https://docs.letta.com/guides/docker/" + }, + { + "id": "E10", + "kind": "boundary", + "summary": "LangGraph persistence provides checkpoints, replay, stores, and semantic memory search, but it is an agent-state framework rather than a standalone external memory service adapter.", + "source_family": "upstream_docs", + "source_locator": "https://docs.langchain.com/oss/python/langgraph/persistence" 
+ }, + { + "id": "E11", + "kind": "boundary", + "summary": "nanograph documents one CLI, one folder, schema-as-code, no server, no cloud, and no Docker; this makes it a graph-lite DX reference rather than a Docker adapter candidate for this lane.", + "source_family": "upstream_docs", + "source_locator": "https://www.nanograph.io/" + }, + { + "id": "E12", + "kind": "boundary", + "summary": "llm-wiki ships as agent plugins or portable instructions with wiki query, compile, lint, audit, and output workflows; it is a derived knowledge workflow reference, not a service adapter candidate without a contained plugin harness.", + "source_family": "upstream_docs", + "source_locator": "https://github.com/nvk/llm-wiki" + }, + { + "id": "E13", + "kind": "boundary", + "summary": "gbrain has strong compiled-truth, append-only timeline, and source attribution contracts, but this lane did not prove a Docker-local brain repository and database setup path.", + "source_family": "upstream_docs", + "source_locator": "https://raw.githubusercontent.com/garrytan/gbrain/master/docs/guides/compiled-truth.md" + }, + { + "id": "E14", + "kind": "output_contract", + "summary": "graphify can run over a folder, produces graph.html, GRAPH_REPORT.md, graph.json, and cache artifacts, and query output includes node labels, edge types, confidence tags, source files, and source locations.", + "source_family": "upstream_docs", + "source_locator": "https://raw.githubusercontent.com/safishamsi/graphify/v3/README.md" + } + ] + }, + { + "seq": 3, + "type": "project_verdicts_recorded", + "verdicts": [ + { + "project": "RAGFlow", + "verdict": "adapter_candidate", + "supporting_evidence_ids": [ + "E2", + "E3" + ], + "docker_boundary": "Nested Docker service profile or baseline compose service using official RAGFlow Docker Compose, capped to a tiny corpus and CPU mode first.", + "output_contract": "Map RAGFlow reference.chunks fields to real_world_job expected evidence ids.", + "follow_up_title": "[ELF benchmark 
adapter] Implement RAGFlow Docker evidence-smoke adapter", + "follow_up_issue": "XY-885", + "follow_up_url": "https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter" + }, + { + "project": "LightRAG", + "verdict": "adapter_candidate", + "supporting_evidence_ids": [ + "E4", + "E5" + ], + "docker_boundary": "Docker Compose LightRAG server with explicit LLM, embedding, rerank, and data-volume configuration.", + "output_contract": "Use context-only query modes and file_paths-backed citations for evidence scoring.", + "follow_up_title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter", + "follow_up_issue": "XY-886", + "follow_up_url": "https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter" + }, + { + "project": "GraphRAG", + "verdict": "adapter_candidate", + "supporting_evidence_ids": [ + "E6", + "E7" + ], + "docker_boundary": "Cost-bounded Docker Python CLI/API run over a generated tiny corpus with container-local parquet artifacts.", + "output_contract": "Map documents, text_units, communities, and community_reports output tables back to source evidence ids.", + "follow_up_title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter", + "follow_up_issue": "XY-887", + "follow_up_url": "https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter" + }, + { + "project": "Graphiti/Zep", + "verdict": "adapter_candidate", + "supporting_evidence_ids": [ + "E8" + ], + "docker_boundary": "Docker-local FalkorDB or Neo4j plus Python SDK runner with provider configuration explicit in benchmark artifacts.", + "output_contract": "Score UUID, fact, valid_at, and invalid_at search output against memory_evolution current/historical evidence.", + "follow_up_title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter", + "follow_up_issue": "XY-888", + 
"follow_up_url": "https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter" + }, + { + "project": "Letta", + "verdict": "research_only", + "supporting_evidence_ids": [ + "E9" + ], + "reason": "Keep as core/archival memory semantics reference; do not create an implementation issue until a supported, contained server path can export archival evidence for scoring." + }, + { + "project": "LangGraph", + "verdict": "research_only", + "supporting_evidence_ids": [ + "E10" + ], + "reason": "Keep as checkpoint/replay regression reference; it is not a standalone external memory adapter candidate in this benchmark lane." + }, + { + "project": "nanograph", + "verdict": "research_only", + "supporting_evidence_ids": [ + "E11" + ], + "reason": "Keep as typed graph DX inspiration; official positioning is no server/no Docker and no real_world_job evidence contract is proven." + }, + { + "project": "llm-wiki", + "verdict": "research_only", + "supporting_evidence_ids": [ + "E12" + ], + "reason": "Keep as derived knowledge-page workflow inspiration; no host-global plugin install may be used as adapter proof." + }, + { + "project": "gbrain", + "verdict": "blocked", + "supporting_evidence_ids": [ + "E13" + ], + "reason": "The evidence contract is strong, but a Docker-local brain repo and database path must be proven before an implementation issue is safe." 
+ }, + { + "project": "graphify", + "verdict": "adapter_candidate", + "supporting_evidence_ids": [ + "E14" + ], + "docker_boundary": "Docker-only CLI/materializer run using pip-installed graphifyy over mounted benchmark corpus, with no assistant global hook install.", + "output_contract": "Score graph.json query output and GRAPH_REPORT.md source-file/source-location references against expected evidence.", + "follow_up_title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", + "follow_up_issue": "XY-889", + "follow_up_url": "https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter" + } + ] + }, + { + "seq": 4, + "type": "tradeoffs_recorded", + "tradeoffs": [ + { + "id": "T1", + "summary": "RAGFlow is resource-heavy, but the official Docker and reference chunk output make it an adapter candidate as long as the follow-up starts with a tiny corpus and records resource bounds instead of making a quality claim.", + "supporting_evidence_ids": [ + "E2", + "E3" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T2", + "summary": "LightRAG and GraphRAG can become adapter candidates because both expose bounded ingest/query paths and source mapping, but their first adapter issues must remain cost-bounded.", + "supporting_evidence_ids": [ + "E4", + "E5", + "E6", + "E7" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T3", + "summary": "Graphiti/Zep is a stronger adapter candidate than generic graph-memory references because it can emit temporal facts with validity windows and run against Docker-local graph stores.", + "supporting_evidence_ids": [ + "E8" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T4", + "summary": "Letta, LangGraph, nanograph, and llm-wiki should still inform ELF design, but creating adapter implementation issues now would blur reference workflows with executable memory-service evidence.", + "supporting_evidence_ids": [ + "E9", + "E10", + "E11", + "E12" + 
], + "disconfirming_evidence_ids": [] + }, + { + "id": "T5", + "summary": "gbrain has a good citation and current-truth/timeline contract, but the missing Docker-local brain repo/database setup keeps it blocked rather than adapter_candidate.", + "supporting_evidence_ids": [ + "E13" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T6", + "summary": "graphify is an adapter candidate only if implemented as an isolated CLI/materializer over generated corpus artifacts, not as a host-global assistant hook install.", + "supporting_evidence_ids": [ + "E14" + ], + "disconfirming_evidence_ids": [] + } + ] + }, + { + "seq": 5, + "type": "challenge_recorded", + "summary": "The main risk is that adapter_candidate could be read as benchmark evidence. The mitigation is to keep evidence_class=research_gate, keep overall status non-pass, and state that follow-up implementation issues must still run Docker and real_world_job scoring before any live evidence claim.", + "resolved": true + }, + { + "seq": 6, + "type": "finalized_decision_ready", + "confidence": "medium", + "decision": "Create implementation follow-ups only for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify. Keep Letta, LangGraph, nanograph, and llm-wiki as research_only references. Keep gbrain blocked pending a Docker-local brain repo/database proof. Do not change any research_gate record into live evidence until an adapter executes inside Docker and emits evidence-linked outputs.", + "missing_evidence": [ + "No Docker adapter was implemented or executed in this lane.", + "No host-global install was used as proof.", + "Provider credentials and private corpora remain out of scope." + ] + } + ] +}