Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -918,7 +918,7 @@
"overall_status": "blocked",
"setup": {
"status": "blocked",
"evidence": "RAGFlow remains a large RAG system watch item; D1/D2 research must prove a Docker-safe corpus ingest and query path before adapter implementation."
"evidence": "XY-882 marks RAGFlow as an adapter_candidate, but the runner still needs a Docker-safe tiny-corpus ingest/query smoke before any live adapter claim."
},
"run": {
"status": "not_encoded",
Expand All @@ -930,9 +930,9 @@
},
"capabilities": [
{
"capability": "d1_d2_research_before_adapter",
"status": "blocked",
"evidence": "The inventory marks RAGFlow as D0 pending deep dive."
"capability": "adapter_candidate_verdict",
"status": "not_encoded",
"evidence": "XY-882 completed D1/D2 feasibility research and marks RAGFlow adapter_candidate; no adapter run is encoded."
},
{
"capability": "docker_service_setup",
Expand Down Expand Up @@ -985,20 +985,25 @@
"label": "RAGFlow docs",
"url": "https://ragflow.io/docs/",
"evidence": "Official deployment and setup documentation."
},
{
"label": "RAGFlow HTTP API reference",
"url": "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md",
"evidence": "Official reference for OpenAI-compatible responses with reference chunks and document metadata."
}
],
"setup_path": "Research the official Docker deployment, corpus ingest API, query API, and artifact export before adding a runner.",
"setup_path": "Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API.",
"runtime_boundary": "Future runs must use docker-compose.baseline.yml or a nested Docker-isolated service profile without host-global installs.",
"resource_expectation": "Large multi-service RAG stack; record CPU/GPU mode, memory, disk, startup time, and provider credential needs before scoring.",
"retry_guidance": [
"Complete a D1/D2 setup and API deep dive.",
"Prototype a tiny Docker smoke that reaches ingest and query before adding quality checks."
"Start with CPU mode and a generated tiny text corpus.",
"Record image pull/build size, expanded disk use, startup time, vm.max_map_count handling, and provider boundaries before scoring."
],
"research_depth": "D0 watch item; D1/D2 required"
"research_depth": "D2 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded"
},
"follow_up": {
"title": "[ELF benchmark adapter] Research RAGFlow Docker adapter feasibility",
"reason": "The project is too large to score fairly without setup, resource, and API mapping research."
"title": "[ELF benchmark adapter] Implement RAGFlow Docker evidence-smoke adapter",
"reason": "Created as XY-885. XY-882 found a Docker boundary and reference-chunk output contract; implementation must prove a tiny ingest/query run before any quality claim."
}
},
{
Expand All @@ -1011,7 +1016,7 @@
"overall_status": "blocked",
"setup": {
"status": "blocked",
"evidence": "LightRAG requires D1/D2 research on Docker setup, LLM/embedding configuration, persistence, and context output before adapter implementation."
"evidence": "XY-882 marks LightRAG as an adapter_candidate, but the runner still needs a Docker context-export adapter before any live result."
},
"run": {
"status": "not_encoded",
Expand All @@ -1024,8 +1029,8 @@
"capabilities": [
{
"capability": "graph_augmented_rag_setup",
"status": "blocked",
"evidence": "The inventory marks LightRAG as D0 pending deep dive."
"status": "not_encoded",
"evidence": "XY-882 completed setup/output feasibility research; graph-augmented RAG execution is still not encoded."
},
{
"capability": "retrieved_context_export",
Expand Down Expand Up @@ -1078,20 +1083,30 @@
"label": "LightRAG Docker docs",
"url": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md",
"evidence": "Official Docker deployment reference."
},
{
"label": "LightRAG API server docs",
"url": "https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md",
"evidence": "Official query-mode and context-output reference."
},
{
"label": "LightRAG core programming docs",
"url": "https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md",
"evidence": "Official source-id and file-path citation reference."
}
],
"setup_path": "Research Docker Compose with explicit LLM, embedding, rerank, and storage configuration before adding a benchmark runner.",
"setup_path": "Implement Docker Compose with explicit LLM, embedding, rerank, storage, workspace, and data-volume configuration, then export context-only query output.",
"runtime_boundary": "Docker-only service profile with generated corpus mounted as container-local input.",
"resource_expectation": "Graph extraction and local model choices may dominate runtime; record backend choices, cache sizes, and provider needs.",
"retry_guidance": [
"Run a tiny Docker ingest/query smoke with deterministic or local providers.",
"Verify returned contexts can be mapped to required evidence IDs."
],
"research_depth": "D0 watch item; D1/D2 required"
"research_depth": "D2 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded"
},
"follow_up": {
"title": "[ELF benchmark adapter] Research LightRAG graph-RAG adapter feasibility",
"reason": "Graph extraction, persistence, and context output must be understood before fair scoring."
"title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter",
"reason": "Created as XY-886. XY-882 found a Docker service path and context/source mapping contract; implementation must prove evidence export before scoring."
}
},
{
Expand All @@ -1104,7 +1119,7 @@
"overall_status": "blocked",
"setup": {
"status": "blocked",
"evidence": "GraphRAG indexing cost and source-citation mapping require D1/D2 research before adapter implementation."
"evidence": "XY-882 marks GraphRAG as an adapter_candidate, but indexing cost and source mapping still need a cost-bounded Docker implementation before live scoring."
},
"run": {
"status": "not_encoded",
Expand All @@ -1118,7 +1133,7 @@
{
"capability": "indexing_resource_envelope",
"status": "blocked",
"evidence": "Official docs warn that indexing can be expensive; the benchmark must start small and record costs."
"evidence": "XY-882 requires the first adapter to start with a tiny corpus and record indexing cost before any scale or quality claim."
},
{
"capability": "source_citation_mapping",
Expand Down Expand Up @@ -1171,20 +1186,25 @@
"label": "GraphRAG docs",
"url": "https://microsoft.github.io/graphrag/",
"evidence": "Official documentation for indexing and querying."
},
{
"label": "GraphRAG output tables",
"url": "https://microsoft.github.io/graphrag/index/outputs/",
"evidence": "Official output schema with document, text unit, community, and relationship identifiers."
}
],
"setup_path": "Research a tiny CLI index/query path with explicit model configuration and source mapping.",
"setup_path": "Implement a tiny CLI/API index/query path with explicit model configuration and source mapping from parquet output tables.",
"runtime_boundary": "Docker-only Python CLI run with generated corpus and container-local artifacts.",
"resource_expectation": "Indexing may be expensive; record model calls, cache size, elapsed time, and maximum corpus size used.",
"retry_guidance": [
"Complete D1/D2 indexing and query-output research.",
"Add a cost-bounded smoke before any scale or quality claim."
"Add a cost-bounded smoke before any scale or quality claim.",
"Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs."
],
"research_depth": "D0 watch item; D1/D2 required"
"research_depth": "D2 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded"
},
"follow_up": {
"title": "[ELF benchmark adapter] Research GraphRAG cost-bounded adapter path",
"reason": "Indexing cost, graph summaries, and citation guarantees need proof before scoring."
"title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter",
"reason": "Created as XY-887. XY-882 found a Docker-bounded CLI/API path and output-table evidence handles; implementation must stay tiny and cost-recorded."
}
},
{
Expand All @@ -1197,7 +1217,7 @@
"overall_status": "not_encoded",
"setup": {
"status": "not_encoded",
"evidence": "Graphiti/Zep is D1 reviewed as a temporal graph-memory reference, but no Docker adapter is implemented."
"evidence": "XY-882 marks Graphiti/Zep as an adapter_candidate, but no Docker temporal graph adapter is implemented."
},
"run": {
"status": "not_encoded",
Expand All @@ -1211,7 +1231,7 @@
{
"capability": "temporal_graph_memory",
"status": "not_encoded",
"evidence": "Temporal fact validity is a reference dimension but not an executable adapter output."
"evidence": "Temporal fact validity has a scoped adapter candidate path, but no executable adapter output is encoded."
},
{
"capability": "docker_graph_store_setup",
Expand Down Expand Up @@ -1259,16 +1279,30 @@
"label": "Zep Graphiti overview",
"url": "https://www.getzep.com/platform/graphiti/",
"evidence": "Official product documentation for temporal context graph behavior."
},
{
"label": "Graphiti quick start",
"url": "https://help.getzep.com/graphiti/getting-started/quick-start",
"evidence": "Official setup, episode ingest, and search output reference."
},
{
"label": "Graphiti FalkorDB configuration",
"url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration",
"evidence": "Official Docker-local FalkorDB setup reference."
}
],
"setup_path": "Define a Docker-local graph store and provider configuration, then encode add/query current-versus-historical fact jobs.",
"setup_path": "Implement a Docker-local FalkorDB or Neo4j graph store and provider configuration, then encode add/query current-versus-historical fact jobs.",
"runtime_boundary": "Docker-only service or SDK run with graph store state under benchmark artifacts.",
"resource_expectation": "Requires graph store plus LLM/embedding configuration; record service startup, storage size, and provider boundaries.",
"retry_guidance": [
"Prototype a tiny temporal fact add/query run.",
"Map valid_at/invalid_at evidence to memory_evolution scoring."
],
"research_depth": "D1 reviewed; adapter not encoded"
"research_depth": "D1 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded"
},
"follow_up": {
"title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter",
"reason": "Created as XY-888. XY-882 found a Docker-local graph-store path and fact/validity-window output contract for memory_evolution scoring."
}
},
{
Expand Down Expand Up @@ -1357,7 +1391,7 @@
"Create a tiny Docker agent with archival memory search.",
"Score core-versus-archival retrieval only after source evidence can be exported."
],
"research_depth": "D1 reviewed; adapter not encoded"
"research_depth": "D1 feasibility verdict: research_only (XY-882); core/archival reference, adapter not encoded"
}
},
{
Expand Down Expand Up @@ -1431,7 +1465,7 @@
"Encode one replay/fork failure recovery job.",
"Keep LangGraph classified as replay reference unless memory retrieval is actually exercised."
],
"research_depth": "D1 reviewed; adapter not encoded"
"research_depth": "D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded"
}
},
{
Expand Down Expand Up @@ -1505,7 +1539,7 @@
"Define a minimal schema for memory_evolution facts.",
"Score typed query output only if it cites fixture evidence IDs."
],
"research_depth": "D1 reviewed; adapter not encoded"
"research_depth": "D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded"
}
},
{
Expand Down Expand Up @@ -1579,7 +1613,7 @@
"Prototype a fixture-only page build with explicit citations.",
"Do not score until generated sections can be mapped to evidence IDs."
],
"research_depth": "D1 reviewed; adapter not encoded"
"research_depth": "D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded"
}
},
{
Expand Down Expand Up @@ -1663,7 +1697,7 @@
"Prototype a tiny brain repo with one current-truth page and timeline.",
"Score only if compiled truth cites the source timeline evidence."
],
"research_depth": "D1 reviewed; adapter not encoded"
"research_depth": "D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven"
}
},
{
Expand All @@ -1676,7 +1710,7 @@
"overall_status": "not_encoded",
"setup": {
"status": "not_encoded",
"evidence": "graphify is D1 reviewed as a graph-navigation reference, but no Docker adapter is implemented."
"evidence": "XY-882 marks graphify as an adapter_candidate for a Docker-only CLI/materializer path, but no adapter is implemented."
},
"run": {
"status": "not_encoded",
Expand All @@ -1690,7 +1724,7 @@
{
"capability": "graph_report_generation",
"status": "not_encoded",
"evidence": "Graph reports and assistant query flows are not executed by the runner."
"evidence": "Graph reports and query output have a candidate scoring path, but they are not executed by the runner."
},
{
"capability": "multimodal_code_graph",
Expand Down Expand Up @@ -1733,16 +1767,25 @@
"label": "graphify repository",
"url": "https://github.com/safishamsi/graphify",
"evidence": "Official source for graphify graph extraction and query workflow."
},
{
"label": "graphify README",
"url": "https://github.com/safishamsi/graphify/blob/v3/README.md",
"evidence": "Official CLI, output artifact, query, and source-location contract."
}
],
"setup_path": "Install graphify inside Docker, build a graph/report from a generated corpus, and export query evidence.",
"runtime_boundary": "Docker-only CLI or skill run over mounted benchmark corpus.",
"setup_path": "Install graphify inside Docker, build a graph/report from a generated corpus, and export query evidence without installing host-global assistant hooks.",
"runtime_boundary": "Docker-only CLI/materializer run over mounted benchmark corpus.",
"resource_expectation": "Graph build cost scales with corpus and model choices; record build time, graph size, and generated report size.",
"retry_guidance": [
"Start with a generated public code/document corpus.",
"Score graph-guided answers only when report nodes cite source evidence IDs."
],
"research_depth": "D1 reviewed; adapter not encoded"
"research_depth": "D1 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded"
},
"follow_up": {
"title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter",
"reason": "Created as XY-889. XY-882 found a Docker-only CLI/materializer path and source-file/source-location output contract."
}
}
]
Expand Down
4 changes: 3 additions & 1 deletion apps/elf-eval/tests/real_world_job_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,9 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> {
assert_eq!(ragflow.pointer("/overall_status").and_then(Value::as_str), Some("blocked"));
assert_eq!(
ragflow.pointer("/execution_metadata/research_depth").and_then(Value::as_str),
Some("D0 watch item; D1/D2 required")
Some(
"D2 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded"
)
);
assert_eq!(
ragflow.pointer("/execution_metadata/sources/0/url").and_then(Value::as_str),
Expand Down
Loading