diff --git a/Makefile.toml b/Makefile.toml index 8348b19f..5d570b77 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -811,7 +811,7 @@ workspace = false command = "bash" args = [ "-lc", - "set -euo pipefail; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner bash scripts/real-world-live-adapters.sh", + "set -euo pipefail; lightrag_start=\"$(printenv ELF_LIGHTRAG_CONTEXT_START || true)\"; graphiti_start=\"$(printenv ELF_GRAPHITI_ZEP_SMOKE_START || true)\"; status=0; if [ \"$lightrag_start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile lightrag up -d lightrag; fi; if [ \"$graphiti_start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile graphiti-zep up -d graphiti-falkordb; fi; docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW -e ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHITI_ZEP -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHIFY -e ELF_RAGFLOW_SMOKE_START -e ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE -e ELF_RAGFLOW_SMOKE_ALLOW_ARM -e ELF_RAGFLOW_SMOKE_PULL_IMAGE -e ELF_RAGFLOW_SMOKE_CLEANUP -e ELF_RAGFLOW_SMOKE_DEVICE -e ELF_RAGFLOW_API_PORT -e ELF_RAGFLOW_API_BASE -e ELF_RAGFLOW_API_KEY -e RAGFLOW_API_KEY -e ELF_RAGFLOW_SMOKE_STARTUP_ATTEMPTS -e ELF_RAGFLOW_SMOKE_STARTUP_INTERVAL_SECONDS -e ELF_RAGFLOW_SMOKE_COMPOSE_TIMEOUT_SECONDS -e ELF_RAGFLOW_REPO_URL -e ELF_RAGFLOW_REF -e ELF_RAGFLOW_IMAGE -e ELF_RAGFLOW_COMPOSE_PROJECT -e ELF_LIGHTRAG_CONTEXT_START -e ELF_LIGHTRAG_API_BASE -e ELF_LIGHTRAG_ADAPTER_ID -e ELF_LIGHTRAG_ADAPTER_NAME -e ELF_LIGHTRAG_STARTUP_ATTEMPTS -e ELF_LIGHTRAG_STARTUP_INTERVAL_SECONDS -e ELF_LIGHTRAG_INDEX_ATTEMPTS -e ELF_LIGHTRAG_INDEX_INTERVAL_SECONDS -e ELF_GRAPHRAG_SMOKE_RUN -e ELF_GRAPHRAG_SMOKE_WORK_DIR -e ELF_GRAPHRAG_SMOKE_INSTALL -e ELF_GRAPHRAG_VERSION -e ELF_GRAPHRAG_PACKAGE -e ELF_GRAPHRAG_REF -e ELF_GRAPHRAG_CHAT_MODEL -e ELF_GRAPHRAG_EMBEDDING_MODEL 
-e ELF_GRAPHRAG_API_BASE -e ELF_GRAPHRAG_API_KEY -e ELF_GRAPHRAG_INDEX_METHOD -e ELF_GRAPHRAG_QUERY_METHOD -e ELF_GRAPHRAG_TIMEOUT_SECONDS -e ELF_GRAPHRAG_MAX_DOCS -e ELF_GRAPHRAG_MAX_INPUT_CHARS -e ELF_GRAPHITI_ZEP_SMOKE_START -e ELF_GRAPHITI_ZEP_SMOKE_RUN -e ELF_GRAPHITI_ZEP_SMOKE_WORK_DIR -e ELF_GRAPHITI_ZEP_SMOKE_INSTALL -e ELF_GRAPHITI_ZEP_VERSION -e ELF_GRAPHITI_ZEP_PACKAGE -e ELF_GRAPHITI_ZEP_REF -e ELF_GRAPHITI_ZEP_API_BASE -e ELF_GRAPHITI_ZEP_API_KEY -e ELF_GRAPHITI_ZEP_LLM_MODEL -e ELF_GRAPHITI_ZEP_EMBEDDING_MODEL -e ELF_GRAPHITI_ZEP_FALKORDB_HOST -e ELF_GRAPHITI_ZEP_FALKORDB_PORT -e ELF_GRAPHITI_ZEP_FALKORDB_DATABASE -e ELF_GRAPHITI_ZEP_TIMEOUT_SECONDS -e ELF_GRAPHITI_ZEP_STARTUP_ATTEMPTS -e ELF_GRAPHITI_ZEP_STARTUP_INTERVAL_SECONDS -e ELF_GRAPHIFY_SMOKE_RUN -e ELF_GRAPHIFY_SMOKE_WORK_DIR -e ELF_GRAPHIFY_SMOKE_INSTALL -e ELF_GRAPHIFY_PACKAGE -e ELF_GRAPHIFY_REF -e ELF_GRAPHIFY_TIMEOUT_SECONDS -e ELF_GRAPHIFY_QUERY_BUDGET baseline-runner bash scripts/real-world-live-adapters.sh || status=$?; if [ \"$lightrag_start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile lightrag stop lightrag lightrag-mock-provider >/dev/null 2>&1 || true; fi; if [ \"$graphiti_start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile graphiti-zep stop graphiti-falkordb >/dev/null 2>&1 || true; fi; exit \"$status\"", ] diff --git a/README.md b/README.md index b4032dde..550c3587 100644 --- a/README.md +++ b/README.md @@ -161,10 +161,18 @@ provider-backed ELF evidence was required. 1 incomplete, 2 blocked, and 12 not_encoded jobs. - Expanded adapter-pack coverage after XY-834: the real-world external adapter manifest now includes `research_gate` records for RAGFlow, LightRAG, GraphRAG, - Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper - qmd/OpenViking profiles. 
These records carry source/setup/runtime/resource/retry - metadata and typed `blocked`, `incomplete`, `wrong_result`, or `not_encoded` states; - they are not fixture-backed or live adapter pass evidence. + Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, and deeper + qmd/OpenViking profiles, while graphify now has a scored tiny Docker smoke record. + These records carry source/setup/runtime/resource/retry metadata and typed + `blocked`, `incomplete`, `wrong_result`, or `not_encoded` states; they are not + fixture-backed or live adapter pass evidence. +- Graph/RAG scored-smoke promotion after XY-900: RAGFlow, LightRAG, GraphRAG, + Graphiti/Zep, and graphify smokes now emit scored or typed non-pass + `real_world_job` adapter reports when run. graphify currently reaches a tiny Docker + graph/report smoke and scores `wrong_result`; the other in-scope projects remain + typed blocked or incomplete without explicit service, resource, or provider setup. + These reports preserve the smoke-only boundary and do not create an ELF win claim + against graph/RAG strengths. 
- The benchmark runner and report publisher are checked in and Docker-isolated: `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`, `cargo make baseline-production-private-addendum`, @@ -183,6 +191,7 @@ Detailed evidence and interpretation: - [Real-World Comparison Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md) - [Live Real-World Adapter Sweep Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md) - [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md) +- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/guide/single_user_production.md) - Benchmark contract: @@ -254,6 +263,9 @@ Detailed comparison, mechanism-level analysis, and source map: - [Real-World Comparison Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md) - [Live Real-World Adapter Sweep Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md) - [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md) +- [Competitor Strength Evidence Matrix - June 11, 2026](docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md) +- [Temporal History Competitor Gap Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md) +- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md) 
- [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md) @@ -263,7 +275,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [Real-World Benchmark Dimension Research Run](docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json) - [RAG/Graph Adapter Feasibility Research Run](docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json) -Latest real-world benchmark report: June 10, 2026. Latest external research refresh: +Latest real-world benchmark report: June 11, 2026. Latest external research refresh: June 10, 2026. ## Documentation diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 152b1f15..b1d3014e 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -1,6 +1,6 @@ { "schema": "elf.real_world_external_adapter_manifest/v1", - "manifest_id": "real-world-memory-project-adapters-2026-06-10", + "manifest_id": "real-world-memory-project-adapters-2026-06-11", "docker_isolation": { "default": true, "compose_file": "docker-compose.baseline.yml", @@ -1085,7 +1085,7 @@ "overall_status": "blocked", "setup": { "status": "blocked", - "evidence": "XY-885 adds a Docker-safe tiny-corpus evidence smoke command. 
The checked-in manifest remains a research gate until a generated artifact reaches RAGFlow query output.", + "evidence": "XY-900 promotes the Docker-safe tiny-corpus evidence smoke into a generated real_world_job report while the checked-in row remains smoke-only research_gate evidence.", "command": "cargo make ragflow-docker-smoke", "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" }, @@ -1097,8 +1097,8 @@ }, "result": { "status": "blocked", - "evidence": "No quality result is claimed from the checked-in research gate. Generated smoke artifacts may become live_real_world only after RAGFlow returns reference chunks mapped to generated evidence ids.", - "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" + "evidence": "The smoke now emits ragflow-report.json and ragflow-report.md from one generated retrieval job. Pass or wrong_result is allowed only when returned reference chunks map to generated evidence ids; resource, setup, and API-key limits remain typed blockers.", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json" }, "capabilities": [ { @@ -1113,15 +1113,20 @@ }, { "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "One generated retrieval job is scored from the smoke artifact or typed blocked when resource, service, or local API-key boundaries stop execution." + }, + { + "capability": "quality_or_scale_claim", "status": "not_encoded", - "evidence": "The smoke maps RAGFlow reference chunks to generated evidence ids, but broad real_world_job scoring and quality claims remain not encoded." + "evidence": "The scored smoke does not claim broad RAGFlow quality, private corpus behavior, scale, or comparative ranking." } ], "suites": [ { "suite_id": "retrieval", "status": "blocked", - "evidence": "The generated smoke can exercise tiny corpus ingest and retrieval-reference mapping, but the checked-in record stays blocked until a live artifact reaches query output." 
+ "evidence": "The generated retrieval smoke is scored as pass, wrong_result, blocked, or incomplete by ragflow-report.json; the checked-in row remains blocked until live reference chunks map to evidence ids." }, { "suite_id": "knowledge_compilation", @@ -1144,6 +1149,16 @@ "kind": "source", "ref": "https://ragflow.io/docs/", "status": "real" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.md", + "status": "blocked" } ], "execution_metadata": { @@ -1172,8 +1187,12 @@ "Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.", "Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids." ], - "research_depth": "D2 feasibility verdict plus XY-885 evidence-smoke implementation; checked-in record remains research_gate unless a generated artifact reaches query output" + "research_depth": "D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output" }, + "notes": [ + "Status class: smoke-only scored adapter path with typed resource/setup/API-key blockers.", + "Do not interpret ragflow-report.json as broad RAGFlow quality evidence unless reference chunks map to generated evidence ids." + ], "follow_up": { "title": "[ELF benchmark adapter] Implement RAGFlow Docker evidence-smoke adapter", "reason": "Created as XY-885. XY-882 found a Docker boundary and reference-chunk output contract; implementation must prove a tiny ingest/query run before any quality claim." @@ -1189,7 +1208,7 @@ "overall_status": "blocked", "setup": { "status": "blocked", - "evidence": "XY-886 adds a Docker-profile context-export smoke command. 
The checked-in manifest remains a research gate until a generated artifact reaches LightRAG context/source output.", + "evidence": "XY-886 adds a Docker-profile context-export smoke command, and XY-900 keeps its generated retrieval fixtures scored through real_world_job_benchmark. The checked-in row remains smoke-only research_gate evidence.", "command": "cargo make lightrag-docker-context-smoke", "artifact": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json" }, @@ -1201,7 +1220,7 @@ }, "result": { "status": "blocked", - "evidence": "No graph-RAG quality result is claimed from the checked-in research gate. Generated smoke artifacts may become live_real_world only after LightRAG returns context or references mapped to generated evidence ids.", + "evidence": "The smoke emits lightrag-report.json and lightrag-report.md over generated retrieval jobs. Pass or wrong_result is allowed only when returned context, references, or file paths map to generated evidence ids.", "artifact": "tmp/real-world-memory/lightrag-context/lightrag-report.json" }, "capabilities": [ @@ -1263,6 +1282,11 @@ "kind": "artifact", "ref": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json", "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-report.md", + "status": "blocked" } ], "execution_metadata": { @@ -1296,8 +1320,12 @@ "Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.", "Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids." 
], - "research_depth": "D2 feasibility plus XY-886 context-export implementation; checked-in record remains research_gate unless a generated artifact reaches query output" + "research_depth": "D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output" }, + "notes": [ + "Status class: smoke-only scored adapter path with typed service/setup blockers.", + "Do not interpret lightrag-report.json as broad graph-RAG quality evidence unless generated source/context mappings score as pass." + ], "follow_up": { "title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter", "reason": "Created as XY-886. XY-882 found a Docker service path and context/source mapping contract; implementation must prove evidence export before scoring." @@ -1313,7 +1341,7 @@ "overall_status": "blocked", "setup": { "status": "blocked", - "evidence": "XY-887 adds a Docker-safe generated-corpus GraphRAG smoke command. The checked-in manifest remains a research gate until a generated artifact reaches GraphRAG parquet output.", + "evidence": "XY-900 promotes the Docker-safe generated-corpus GraphRAG smoke into a scored knowledge_compilation report while the checked-in row remains smoke-only research_gate evidence.", "command": "cargo make graphrag-docker-smoke", "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json" }, @@ -1325,8 +1353,8 @@ }, "result": { "status": "blocked", - "evidence": "No graph-navigation or knowledge-synthesis result is claimed from the checked-in research gate. Generated smoke artifacts may become live_real_world only after GraphRAG output tables map to generated evidence ids.", - "artifact": "tmp/real-world-memory/graphrag-smoke/memory_projects_manifest.graphrag-smoke.json" + "evidence": "The smoke now emits graphrag-report.json and graphrag-report.md from one generated knowledge_compilation job. 
Pass or wrong_result is allowed only when GraphRAG output tables map to generated evidence ids.", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-report.json" }, "capabilities": [ { @@ -1342,7 +1370,7 @@ { "capability": "real_world_job_adapter", "status": "blocked", - "evidence": "The smoke writes a generated real_world_job fixture for the tiny corpus, but the checked-in record stays blocked until live GraphRAG output maps to expected evidence ids." + "evidence": "The smoke writes a generated real_world_job fixture and scored report; provider/setup limits remain blocked until live GraphRAG output maps to expected evidence ids." }, { "capability": "quality_or_scale_claim", @@ -1392,6 +1420,11 @@ "kind": "artifact", "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json", "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-report.md", + "status": "blocked" } ], "execution_metadata": { @@ -1430,8 +1463,12 @@ "Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.", "Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs." ], - "research_depth": "D2 feasibility plus XY-887 Docker smoke implementation; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output" + "research_depth": "D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output" }, + "notes": [ + "Status class: smoke-only scored adapter path with typed provider/setup blockers.", + "Do not interpret graphrag-report.json as broad graph-navigation or knowledge-synthesis quality evidence unless output tables map to generated evidence ids." + ], "follow_up": { "title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter", "reason": "Created as XY-887. 
XY-882 found a Docker-bounded CLI/API path and output-table evidence handles; implementation must stay tiny and cost-recorded." @@ -1447,7 +1484,7 @@ "overall_status": "blocked", "setup": { "status": "blocked", - "evidence": "XY-888 adds a Docker-contained Graphiti/Zep temporal smoke command. The checked-in manifest remains a research gate until a generated artifact reaches Graphiti search output.", + "evidence": "XY-900 promotes the Docker-contained Graphiti/Zep temporal smoke into a scored memory_evolution report while the checked-in row remains smoke-only research_gate evidence.", "command": "cargo make graphiti-zep-docker-temporal-smoke", "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json" }, @@ -1459,8 +1496,8 @@ }, "result": { "status": "blocked", - "evidence": "No temporal graph quality result is claimed from the checked-in research gate. Generated smoke artifacts may become live_real_world only after Graphiti/Zep returns UUID, fact, valid_at, and invalid_at output mapped to generated memory_evolution evidence ids.", - "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json" + "evidence": "The smoke now emits graphiti-zep-report.json and graphiti-zep-report.md from one generated memory_evolution job. The default blocker is live-run opt-in disabled; when ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 are set without provider credentials, the blocker is provider_api_key_missing. No hosted Zep service or unrecorded credentials are used.", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json" }, "capabilities": [ { @@ -1476,7 +1513,7 @@ { "capability": "real_world_job_adapter", "status": "blocked", - "evidence": "The generated smoke fixture maps Graphiti/Zep temporal fact output to memory_evolution expected evidence ids when search output is available." 
+ "evidence": "The generated temporal-validity fixture is scored or typed blocked; live quality evidence requires Graphiti/Zep search output mapped to current and historical evidence ids." }, { "capability": "quality_or_scale_claim", @@ -1521,6 +1558,11 @@ "kind": "artifact", "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json", "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.md", + "status": "blocked" } ], "execution_metadata": { @@ -1559,8 +1601,12 @@ "Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.", "Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass." ], - "research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" + "research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" }, + "notes": [ + "Status class: smoke-only scored adapter path with typed live-run opt-in, provider, and setup blockers.", + "Graphiti/Zep remains the temporal-validity reference; do not claim ELF-over-Graphiti/Zep until provider-backed temporal output maps to scored evidence ids." + ], "follow_up": { "title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter", "reason": "Created as XY-888. XY-882 found a Docker-local graph-store path and fact/validity-window output contract for memory_evolution scoring." 
@@ -1962,45 +2008,45 @@ } }, { - "adapter_id": "graphify_research_gate", + "adapter_id": "graphify_docker_smoke", "project": "graphify", - "adapter_kind": "research_gate", - "evidence_class": "research_gate", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", "docker_default": true, "host_global_installs_required": false, - "overall_status": "blocked", + "overall_status": "wrong_result", "setup": { - "status": "blocked", - "evidence": "XY-889 adds a Docker-only graph/report smoke command. The checked-in manifest remains a research gate until a generated artifact reaches graphify graph/report output.", + "status": "pass", + "evidence": "XY-900 validation reached the Docker-only graph/report smoke setup inside the baseline runner without host-global assistant hooks.", "command": "cargo make graphify-docker-graph-report-smoke", "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" }, "run": { - "status": "blocked", - "evidence": "The smoke installs graphify in a container-local venv, runs over a generated public corpus, and records typed setup/runtime failure if graph/report build or query output is unavailable.", + "status": "pass", + "evidence": "The smoke installed graphify in a container-local venv, ran over a generated public corpus, and produced graph/report/query output for scoring.", "command": "cargo make graphify-docker-graph-report-smoke", "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" }, "result": { - "status": "blocked", - "evidence": "No graph-navigation or knowledge-compilation quality result is claimed from the checked-in research gate. 
Generated smoke artifacts may become live_real_world only after graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids.", - "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" + "status": "wrong_result", + "evidence": "The smoke emits graphify-report.json and graphify-report.md from one generated knowledge_compilation job. The current scored report maps evidence ids but remains wrong_result because the scoring rubric still records a wrong-result signal.", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" }, "capabilities": [ { "capability": "docker_cli_boundary", - "status": "blocked", + "status": "pass", "evidence": "The smoke uses docker-compose.baseline.yml baseline-runner, a container-local Python venv, and isolated assistant config paths; it does not install host-global assistant hooks." }, { "capability": "graph_report_generation", - "status": "blocked", - "evidence": "The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, command logs, build time, graph size, and report size when build succeeds." + "status": "pass", + "evidence": "The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, command logs, build time, graph size, and report size." }, { "capability": "real_world_job_adapter", - "status": "blocked", - "evidence": "The smoke maps node labels, edge types, confidence tags, source files, source locations, report text, and query output to generated real_world_job evidence ids when graphify reaches output." + "status": "wrong_result", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; current knowledge_compilation scoring is wrong_result, not pass." 
}, { "capability": "multimodal_code_graph", @@ -2016,13 +2062,13 @@ "suites": [ { "suite_id": "knowledge_compilation", - "status": "blocked", - "evidence": "The generated smoke can exercise graph/report evidence mapping for one generated knowledge-compilation fixture, but the checked-in record stays blocked until a live artifact reaches graph/report output." + "status": "wrong_result", + "evidence": "The generated smoke exercised graph/report evidence mapping for one generated knowledge-compilation fixture and scored wrong_result with mean_score 0.75." }, { "suite_id": "retrieval", "status": "blocked", - "evidence": "Graph-guided query output is mapped only for the generated smoke when available; broad retrieval quality scoring remains unclaimed." + "evidence": "Graph-guided query output is present only as support for the generated knowledge_compilation smoke; broad retrieval quality scoring remains unclaimed." }, { "suite_id": "work_resume", @@ -2039,12 +2085,17 @@ { "kind": "command", "ref": "cargo make graphify-docker-graph-report-smoke", - "status": "blocked" + "status": "wrong_result" }, { "kind": "artifact", "ref": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json", - "status": "blocked" + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-report.md", + "status": "wrong_result" } ], "execution_metadata": { @@ -2068,8 +2119,12 @@ "Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.", "Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids." 
], - "research_depth": "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation; checked-in record remains research_gate unless a generated artifact reaches graphify output" + "research_depth": "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result" }, + "notes": [ + "Status class: live Docker scored smoke with a current wrong_result outcome.", + "Do not interpret graphify-report.json as broad graph-navigation or knowledge-compilation quality evidence; the tiny smoke is scored and currently non-pass." + ], "follow_up": { "title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", "reason": "Created as XY-889. XY-882 found a Docker-only CLI/materializer path and source-file/source-location output contract." diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index e987986b..d0482174 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -3882,9 +3882,13 @@ fn validate_adapter_execution_metadata(path: &Path, adapter: &ExternalAdapterRep } fn external_adapter_summary(adapters: &[ExternalAdapterReport]) -> ExternalAdapterSummary { + let external_projects = adapters + .iter() + .filter_map(|adapter| (adapter.project != "ELF").then_some(adapter.project.as_str())) + .collect::<BTreeSet<_>>(); let mut summary = ExternalAdapterSummary { adapter_count: adapters.len(), - external_project_count: adapters.iter().filter(|adapter| adapter.project != "ELF").count(), + external_project_count: external_projects.len(), ..ExternalAdapterSummary::default() }; diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index b8f14a81..99c3a7ad 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ 
b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -126,11 +126,11 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { ); assert_eq!( report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), - Some(2) + Some(3) ); assert_eq!( report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), - Some(12) + Some(11) ); let jobs = array_at(&report, "/jobs")?; @@ -191,7 +191,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { ); assert_eq!( report.pointer("/external_adapters/manifest_id").and_then(Value::as_str), - Some("real-world-memory-project-adapters-2026-06-10") + Some("real-world-memory-project-adapters-2026-06-11") ); assert_eq!( report.pointer("/external_adapters/docker_isolation/default").and_then(Value::as_bool), @@ -209,7 +209,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { ); assert_eq!( report.pointer("/external_adapters/summary/external_project_count").and_then(Value::as_u64), - Some(19) + Some(16) ); assert_eq!( report.pointer("/external_adapters/summary/fixture_backed_count").and_then(Value::as_u64), @@ -223,11 +223,11 @@ fn assert_external_adapter_manifest_summary(report: &Value) { ); assert_eq!( report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), - Some(2) + Some(3) ); assert_eq!( report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), - Some(12) + Some(11) ); assert_eq!( report @@ -239,7 +239,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/overall_status_counts/wrong_result") .and_then(Value::as_u64), - Some(6) + Some(7) ); assert_eq!( report @@ -257,7 +257,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/overall_status_counts/blocked") .and_then(Value::as_u64), - Some(6) + Some(5) ); assert_eq!( report @@ -281,7 +281,7 @@ fn 
assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(11) + Some(10) ); } @@ -300,7 +300,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { let lightrag = find_by_field(adapters, "/adapter_id", "lightrag_research_gate")?; let graphrag = find_by_field(adapters, "/adapter_id", "graphrag_research_gate")?; let graphiti_zep = find_by_field(adapters, "/adapter_id", "graphiti_zep_research_gate")?; - let graphify = find_by_field(adapters, "/adapter_id", "graphify_research_gate")?; + let graphify = find_by_field(adapters, "/adapter_id", "graphify_docker_smoke")?; let qmd_deep = find_by_field(adapters, "/adapter_id", "qmd_deep_profile_gate")?; assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); @@ -336,7 +336,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { assert_eq!( ragflow.pointer("/execution_metadata/research_depth").and_then(Value::as_str), Some( - "D2 feasibility verdict plus XY-885 evidence-smoke implementation; checked-in record remains research_gate unless a generated artifact reaches query output" + "D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output" ) ); assert_eq!( @@ -345,7 +345,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { ); assert_eq!( ragflow.pointer("/result/artifact").and_then(Value::as_str), - Some("tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json") + Some("tmp/real-world-memory/ragflow-smoke/ragflow-report.json") ); assert_eq!( ragflow.pointer("/execution_metadata/sources/0/url").and_then(Value::as_str), @@ -373,7 +373,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { 
assert_eq!(graphrag.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded")); assert_graphiti_zep_adapter(graphiti_zep); - assert_graphify_adapter(graphify); + assert_graphify_adapter(graphify)?; assert_eq!( qmd_deep.pointer("/capabilities/2/status").and_then(Value::as_str), @@ -427,14 +427,17 @@ fn assert_graphiti_zep_adapter(adapter: &Value) { assert_eq!( adapter.pointer("/execution_metadata/research_depth").and_then(Value::as_str), Some( - "D2 feasibility plus XY-888 Docker temporal smoke implementation; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" + "D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" ) ); } -fn assert_graphify_adapter(adapter: &Value) { - assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); - assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); +fn assert_graphify_adapter(adapter: &Value) -> Result<()> { + assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(adapter.pointer("/setup/status").and_then(Value::as_str), Some("pass")); + assert_eq!(adapter.pointer("/run/status").and_then(Value::as_str), Some("pass")); + assert_eq!(adapter.pointer("/result/status").and_then(Value::as_str), Some("wrong_result")); assert_eq!( adapter.pointer("/setup/command").and_then(Value::as_str), Some("cargo make graphify-docker-graph-report-smoke") @@ -443,15 +446,178 @@ fn assert_graphify_adapter(adapter: &Value) { adapter.pointer("/suites/0/suite_id").and_then(Value::as_str), Some("knowledge_compilation") ); - assert_eq!(adapter.pointer("/suites/0/status").and_then(Value::as_str), Some("blocked")); + 
assert_eq!(adapter.pointer("/suites/0/status").and_then(Value::as_str), Some("wrong_result")); assert_eq!(adapter.pointer("/suites/1/suite_id").and_then(Value::as_str), Some("retrieval")); assert_eq!(adapter.pointer("/suites/1/status").and_then(Value::as_str), Some("blocked")); assert_eq!( adapter.pointer("/execution_metadata/research_depth").and_then(Value::as_str), Some( - "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation; checked-in record remains research_gate unless a generated artifact reaches graphify output" + "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result" ) ); + + let capabilities = array_at(adapter, "/capabilities")?; + let quality = find_by_field(capabilities, "/capability", "quality_or_scale_claim")?; + + assert_eq!(quality.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert!(array_at(adapter, "/notes")?.iter().any(|note| { + note.as_str().is_some_and(|text| text.contains("tiny smoke") && text.contains("non-pass")) + })); + + Ok(()) +} + +#[test] +fn graphify_generated_manifest_keeps_retrieval_unscored() -> Result<()> { + let manifest = serde_json::json!({ + "schema": "elf.real_world_external_adapter_manifest/v1", + "manifest_id": "graphify-generated-manifest-test", + "docker_isolation": { + "default": true, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/graphify-docker-graph-report-smoke.py", + "artifact_dir": "tmp/real-world-memory/graphify-smoke", + "host_global_installs_required": false, + "notes": ["Synthetic graphify generated-manifest regression test."] + }, + "adapters": [{ + "adapter_id": "graphify_docker_smoke", + "project": "graphify", + "adapter_kind": "docker_cli_graph_report_smoke", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + 
"overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "setup evidence", + "command": "cargo make graphify-docker-graph-report-smoke", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" + }, + "run": { + "status": "pass", + "evidence": "run evidence", + "command": "cargo make graphify-docker-graph-report-smoke", + "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" + }, + "result": { + "status": "wrong_result", + "evidence": "result evidence", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" + }, + "capabilities": [{ + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "No broad graph quality claim." + }], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "evidence": "Only the generated graph/report evidence-mapping job is represented." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The smoke uses graphify query output only to support source mapping; broad retrieval quality is not scored." + } + ], + "evidence": [], + "execution_metadata": { + "setup_path": "cargo make graphify-docker-graph-report-smoke", + "runtime_boundary": "Docker-only generated graph/report smoke.", + "resource_expectation": "Tiny generated corpus only.", + "retry_guidance": [], + "sources": [{ + "label": "graphify", + "url": "https://github.com/safishamsi/graphify", + "evidence": "Synthetic generated-manifest regression source." 
+ }], + "research_depth": "Generated smoke manifest path" + }, + "notes": ["tiny smoke non-pass"] + }] + }); + let temp_dir = + env::temp_dir().join(format!("elf-real-world-graphify-manifest-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let manifest_path = temp_dir.join("manifest.json"); + let report_path = temp_dir.join("report.json"); + + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(fixture_dir()) + .arg("--out") + .arg(&report_path) + .arg("--external-adapter-manifest") + .arg(&manifest_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job runner failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let report: Value = serde_json::from_slice(&fs::read(&report_path)?)?; + let adapters = array_at(&report, "/external_adapters/adapters")?; + let graphify = find_by_field(adapters, "/adapter_id", "graphify_docker_smoke")?; + let suites = array_at(graphify, "/suites")?; + let retrieval = find_by_field(suites, "/suite_id", "retrieval")?; + + assert_eq!(retrieval.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + retrieval + .pointer("/evidence") + .and_then(Value::as_str) + .is_some_and(|text| { text.contains("broad retrieval quality is not scored") }) + ); + + Ok(()) +} + +#[test] +fn live_adapter_aggregate_forwards_graph_rag_smoke_controls() -> Result<()> { + let makefile = fs::read_to_string( + Path::new(env!("CARGO_MANIFEST_DIR")).join("..").join("..").join("Makefile.toml"), + )?; + + for env_name in [ + "ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW", + "ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG", + "ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG", + "ELF_REAL_WORLD_LIVE_ENABLE_GRAPHITI_ZEP", + "ELF_REAL_WORLD_LIVE_ENABLE_GRAPHIFY", + "ELF_RAGFLOW_SMOKE_START", + "ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE", + "ELF_GRAPHRAG_SMOKE_RUN", + "ELF_GRAPHRAG_API_KEY", + 
"ELF_GRAPHITI_ZEP_SMOKE_START", + "ELF_GRAPHITI_ZEP_SMOKE_RUN", + "ELF_GRAPHITI_ZEP_API_KEY", + "ELF_GRAPHIFY_SMOKE_RUN", + ] { + assert!( + makefile.contains(&format!("-e {env_name}")), + "real-world-memory-live-adapters must forward {env_name}", + ); + } + + assert!( + makefile.contains("--profile lightrag up -d lightrag"), + "aggregate task should start LightRAG profile when ELF_LIGHTRAG_CONTEXT_START=1", + ); + assert!( + makefile.contains("--profile graphiti-zep up -d graphiti-falkordb"), + "aggregate task should start Graphiti/Zep profile when ELF_GRAPHITI_ZEP_SMOKE_START=1", + ); + + Ok(()) } fn assert_live_sweep_record(adapter: &Value) -> Result<()> { diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index 1802eaf5..97dcfb32 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -41,10 +41,10 @@ Current boundary: ## Current Ledger Summary -The current manifest has 21 adapter records across 17 projects. Evidence-class counts: -1 `fixture_backed`, 6 `live_baseline_only`, 2 `live_real_world`, and 12 -`research_gate`. Overall adapter-status counts: 1 `pass`, 6 `wrong_result`, 1 -`lifecycle_fail`, 6 `blocked`, and 7 `not_encoded`. +The current manifest has 21 adapter records across 16 external projects plus ELF. +Evidence-class counts: 1 `fixture_backed`, 6 `live_baseline_only`, 3 +`live_real_world`, and 11 `research_gate`. Overall adapter-status counts: 1 `pass`, +7 `wrong_result`, 1 `lifecycle_fail`, 5 `blocked`, and 7 `not_encoded`. ## State Taxonomy @@ -87,7 +87,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | nanograph | Typed graph schema and query ergonomics for graph-lite developer experience. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. 
| `unsupported`: not a memory backend comparison target. | Non-goal for direct win/loss unless a contained memory-backed target emerges; measure ELF graph-lite DX instead. | Typed relation schema, query ergonomics, and small graph developer experience. | | llm-wiki | LLM-maintained wiki or knowledge-page workflow with query-save and lint loops. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: no live service runtime for adapter proof. | Select contained plugin or instruction harness, then score knowledge pages for citations, unsupported claims, rebuild, and stale-source lint. | Maintained wiki workflows, page lint, query-save loops, and topic-scoped navigation. | | gbrain | Operational knowledge brain with compiled_truth pages, timelines, enrichment, and maintenance loops. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `blocked`: Docker-local brain repo and database path are missing. | Prove Docker-local repository/database setup, then encode compiled_truth/timeline and operator-continuity jobs. | Compiled truth pages, timeline maintenance, and human-operable knowledge-brain navigation. | -| graphify | Graph-compressed navigation with `graph.json` and `GRAPH_REPORT` evidence outputs. | `research_gate`. | `blocked`: `cargo make graphify-docker-graph-report-smoke`, `tmp/real-world-memory/graphify-smoke/graphify-smoke.json`. | `blocked`: Docker CLI graph/report generation is not proven; host-global assistant hooks are out of scope. | XY-889 Docker-only graph/report adapter over `graph.json` and `GRAPH_REPORT.md`. | Graph compression, source-location graph reports, and navigation hints for large code or document spaces. | +| graphify | Graph-compressed navigation with `graph.json` and `GRAPH_REPORT` evidence outputs. | Scored tiny `live_real_world` smoke; not broad graph-quality proof. 
| `wrong_result`: `cargo make graphify-docker-graph-report-smoke`, `tmp/real-world-memory/graphify-smoke/graphify-report.json`. | `not_encoded`: broad graph navigation, multimodal, private-corpus, and large-corpus quality remain outside the tiny smoke. | Expand beyond the generated smoke only after graph/report output maps to scored evidence on representative graph/RAG jobs. | Graph compression, source-location graph reports, and navigation hints for large code or document spaces. | ## Scenario Matrix @@ -99,14 +99,14 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Source-of-truth | Fixture and live trust_source_of_truth pass. | memsearch. | memsearch canonical-store evidence exists, but source-of-truth is `incomplete` and retrieval is `wrong_result`. | Fix memsearch reindex/retrieval evidence and score source-of-truth rebuild/reload jobs. | | Temporal/current-vs-historical memory | Fixture memory_evolution passes; live memory_evolution is `wrong_result`. | Graphiti/Zep, mem0/OpenMemory. | Graphiti/Zep is `research_gate` `blocked`; mem0/OpenMemory is `wrong_result`. | Fix ELF/qmd live memory_evolution evidence links and run XY-888. | | Consolidation | Fixture consolidation passes; live consolidation is `not_encoded`. | agentmemory, managed-memory references, llm-wiki. | No manifest project has live consolidation scoring. | Run reviewable consolidation proposal generation with source refs, unsupported-claim flags, and audit transitions. | -| Knowledge pages | Fixture knowledge_compilation passes; live knowledge_compilation is `not_encoded`. | llm-wiki, gbrain, GraphRAG, graphify. | llm-wiki and gbrain are `research_gate` `not_encoded` or `blocked`; GraphRAG and graphify are `blocked`. | Encode live derived-page rebuild/lint scoring and run contained knowledge/RAG adapters only after setup proof. | +| Knowledge pages | Fixture knowledge_compilation passes; live knowledge_compilation is `not_encoded`. | llm-wiki, gbrain, GraphRAG, graphify. 
| llm-wiki and gbrain are `research_gate` `not_encoded` or `blocked`; GraphRAG is `blocked`; graphify has a tiny scored smoke `wrong_result`. | Encode live derived-page rebuild/lint scoring and run contained knowledge/RAG adapters only after setup proof. | | Operator debugging | Fixture operator_debugging_ux passes; live operator_debugging_ux is `not_encoded`. | qmd, claude-mem, OpenMemory. | qmd has debug strengths but operator_debugging_ux is `not_encoded`; claude-mem and OpenMemory UX are `not_encoded`. | Score trace hydration, stage attribution, raw-SQL avoidance, and repair-action clarity through live artifacts. | | Capture/write policy | Fixture capture_integration passes; live capture_integration is `not_encoded`. | agentmemory, claude-mem. | agentmemory capture is `blocked`; claude-mem capture is `not_encoded`. | Run live capture/write-policy jobs proving redaction, exclusion, evidence binding, and no secret leakage. | | Production ops | Fixture production_ops has 4 pass and 2 blocked; live production_ops is `incomplete`; production adoption has provider/backfill/restore evidence. | ELF production gate, qmd, RAG/RAGFlow resource gates. | qmd live production_ops is `incomplete`; RAG/resource gates are `research_gate` `blocked`. | Rerun private-corpus and credentialed gates only when operator-owned manifest and credentials exist. | | Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory and Letta personalization are `not_encoded`. | Encode scoped preference readback for mem0/OpenMemory and Letta before personalization superiority claims. | | Context trajectory | ELF has trace direction but no comparable staged trajectory scenario. | OpenViking. | OpenViking setup is pinned, same-corpus retrieval is `wrong_result`, and hierarchy trajectory is `not_encoded`. | Make OpenViking evidence-bearing retrieval pass, then score staged context trajectory outputs. 
| | Core-vs-archival memory | ELF core-block semantics exist in the service contract, but comparative benchmark coverage is not encoded here. | Letta. | Letta is `research_gate` `not_encoded` until contained export proof exists. | Add ELF core-block versus archival-search jobs; compare Letta only after contained export proof. | -| Graph/RAG navigation | ELF relation context is not enough to claim graph/RAG navigation parity. | RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify. | All named RAG/graph projects are `research_gate` `blocked` or `not_encoded`. | Run XY-885 through XY-889 Docker-contained adapters with evidence-linked outputs. | +| Graph/RAG navigation | ELF relation context is not enough to claim graph/RAG navigation parity. | RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify. | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain `research_gate` blocked/incomplete without explicit setup; graphify has only a tiny scored smoke `wrong_result`. | Run larger contained graph/RAG adapters with evidence-linked outputs before any ELF graph/RAG win, tie, or loss claim. | ## Parallelizable Benchmark Follow-Ups @@ -125,7 +125,7 @@ now explicit: | LightRAG context export | XY-886 | yes | Docker service setup and explicit provider config. | Retrieved context export and source file-path citations. | | GraphRAG cost-bounded adapter | XY-887 | yes | Tiny corpus cost/resource envelope. | Document, text-unit, graph-summary, and citation output tables. | | Graphiti/Zep temporal graph adapter | XY-888 | yes | Docker-local graph store setup. | Current/historical/future fact validity and evidence ids. | -| graphify graph report adapter | XY-889 | yes | Docker CLI graph/report generation proof. | `graph.json` and `GRAPH_REPORT` evidence for graph navigation and knowledge synthesis. | +| graphify graph report adapter | XY-889 plus post-XY-900 expansion | yes | Representative graph/RAG jobs beyond the tiny scored smoke. 
| `graph.json` and `GRAPH_REPORT` evidence mapped to scored graph navigation and knowledge synthesis ids. | | Private corpus and credentialed production ops | Operator-owned benchmark gates | no | Sanitized private manifest and routed provider credentials. | Private-corpus retrieval quality and credentialed production-ops evidence. | | Letta, LangGraph, nanograph, llm-wiki direct adapters | Research-only until output contract | no | Contained evidence export or non-memory-backend comparability contract. | Run only after each has a comparable output contract; otherwise keep as product-reference evidence. | diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index d581b76c..12ee4bc1 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -101,17 +101,17 @@ The current adapter manifest records 21 adapter records across 17 projects: | --- | ---: | --- | | `fixture_backed` | `1` | ELF real-world fixture scoring. | | `live_baseline_only` | `6` | Docker same-corpus or lifecycle evidence without real-world job scoring. | -| `live_real_world` | `2` | ELF and qmd full-suite live sweeps. | -| `research_gate` | `12` | Source/setup/resource/output-contract evidence only. | +| `live_real_world` | `3` | ELF and qmd full-suite live sweeps plus graphify's tiny scored Docker smoke. | +| `research_gate` | `11` | Source/setup/resource/output-contract evidence only. | Overall adapter statuses: | Status | Count | | --- | ---: | | `pass` | `1` | -| `wrong_result` | `6` | +| `wrong_result` | `7` | | `lifecycle_fail` | `1` | -| `blocked` | `6` | +| `blocked` | `5` | | `not_encoded` | `7` | The ledger is intentionally not a leaderboard. It prevents fixture evidence, @@ -135,7 +135,7 @@ one misleading score. 
| Personalization | ELF live personalization passes; mem0/OpenMemory and Letta are not encoded. | Add entity-scoped preference history and UI readback before claiming stronger personalization. | | Context trajectory | Not comparable yet; OpenViking remains the reference. | Score staged retrieval, hierarchy expansion, and trajectory readback. | | Core-vs-archival | Product gap, not a measured comparison yet. | Borrow Letta's core memory block shape with explicit scope, provenance, and read-only attachment. | -| Graph/RAG navigation | Research gates only. | Run RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify adapters only when Docker outputs map to evidence ids. | +| Graph/RAG navigation | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain research gates; graphify has a tiny scored `wrong_result` smoke. | Run larger contained graph/RAG adapters before any broad graph-navigation claim. | ## Project Guidance Matrix @@ -157,7 +157,7 @@ one misleading score. | nanograph | `research_gate`; current status is `not_encoded` or `unsupported` as a full memory backend. | Typed graph schema and query ergonomics. | Borrow graph-lite DX and typed relation query ideas. | | llm-wiki | `research_gate`; current status is `not_encoded`. | Maintained wiki pages, query-save, lint, and repair loops. | Use as a reference for rebuildable, cited knowledge pages. | | gbrain | `research_gate`; current status is `not_encoded` and setup-blocked. | Compiled truth pages, timelines, and human-operable knowledge navigation. | Borrow current-truth plus timeline presentation after Docker-local setup proof exists. | -| graphify | `research_gate`; current status is `blocked`. | `graph.json`, `GRAPH_REPORT`, source-location graph navigation. | Borrow graph-compressed navigation only after Docker graph/report output maps to evidence ids. | +| graphify | `live_real_world`; tiny scored smoke is `wrong_result`. | `graph.json`, `GRAPH_REPORT`, source-location graph navigation. 
| Treat the tiny smoke as bounded non-pass evidence and expand only after representative graph/RAG jobs map to evidence ids. | ## Optimization Direction @@ -223,8 +223,8 @@ These improve day-to-day usefulness while preserving ELF's evidence-bound core. These are needed for broad credibility but should not block personal production use. 1. RAG and graph adapters - - Current state: RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify are - adapter candidates, but still `research_gate`. + - Current state: RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain typed + research gates; graphify has a tiny scored `wrong_result` smoke. - Benchmark gate: Docker-contained adapters must emit evidence-linked outputs before any live pass claim. @@ -253,7 +253,8 @@ Do not claim: memory. Those scenarios are not encoded. - ELF beats Letta on core-vs-archival memory. That scenario is not encoded. - ELF beats RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, or graphify on graph/RAG - navigation. Current evidence is research-gate or blocked. + navigation. Current evidence is research-gate or blocked except graphify's tiny + non-pass smoke. ## Suggested Report Cadence diff --git a/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md b/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md new file mode 100644 index 00000000..e970ea94 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md @@ -0,0 +1,102 @@ +# Graph/RAG Scored Smoke Adapter Report - June 11, 2026 + +Goal: Record the XY-900 promotion of graph/RAG Docker smokes into scored +`real_world_job` adapter evidence without upgrading smoke evidence into broad quality +claims. +Read this when: You need to decide whether ELF currently wins, ties, loses, or remains +untested against RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify graph/RAG +strengths. 
+Inputs: `memory_projects_manifest.json`, the graph/RAG smoke commands in +`Makefile.toml`, and the generated smoke report contracts. +Outputs: Scored-smoke status, claim boundary, blocker taxonomy, and next measurement +gate for each in-scope project. + +## Verdict + +XY-900 promotes the in-scope Docker smokes into scored adapter evidence where the smoke +already has enough generated evidence ids to evaluate a bounded job. This is still +smoke-only evidence. + +Current graph/RAG quality comparison remains mostly untested. ELF cannot claim a win, +tie, or loss against the in-scope graph/RAG strengths from smoke evidence alone. +`graphify` is the current exception only in the narrow sense that its Docker smoke +reaches graph/report output and scores one tiny `knowledge_compilation` job as +`wrong_result`; that is a bounded graphify non-pass, not an ELF victory claim. + +Graphiti/Zep remains the temporal-validity reference. The default checked-in smoke is +typed `blocked` before live execution because `ELF_GRAPHITI_ZEP_SMOKE_START=1` and +`ELF_GRAPHITI_ZEP_SMOKE_RUN=1` are not set. When that live path is explicitly enabled +without provider credentials, the blocker remains `provider_api_key_missing`; no +hosted Zep service or unrecorded provider credentials are used or implied. + +## Scored Smoke Status + +| Project | Scored scenario | Command | Current scored status | Claim boundary | +| --- | --- | --- | --- | --- | +| RAGFlow | `retrieval`: reference chunks mapped to generated evidence ids | `cargo make ragflow-docker-smoke` | `blocked` or `incomplete` by execution boundary | Smoke-only. No RAGFlow quality claim until returned reference chunks map to `ragflow-smoke-anchor`. | +| LightRAG | `retrieval`: context/source export mapped to fixture evidence ids | `cargo make lightrag-docker-context-smoke` | `incomplete` when the API service is not started | Smoke-only. No graph-RAG quality claim until context or references map to generated evidence ids. 
| +| GraphRAG | `knowledge_compilation`: output tables mapped to generated evidence ids | `cargo make graphrag-docker-smoke` | `blocked` | Smoke-only. No graph-navigation or synthesis claim until output tables map to generated evidence ids. | +| Graphiti/Zep | `memory_evolution`: current and historical validity facts | `cargo make graphiti-zep-docker-temporal-smoke` | `blocked` before live opt-in; `provider_api_key_missing` when live path is enabled without explicit credentials | Provider-bound. No ELF-over-Graphiti/Zep claim until temporal output maps to scored evidence ids. | +| graphify | `knowledge_compilation`: `graph.json`, `GRAPH_REPORT.md`, and query output mapping | `cargo make graphify-docker-graph-report-smoke` | `wrong_result` after setup/run pass | Scored tiny smoke. The graph/report output maps to evidence ids, but the job remains non-pass; no broad graph-navigation quality claim follows. | + +## Artifact Contract + +Each promoted smoke now writes a generated fixture and scored report: + +| Project | Generated report | +| --- | --- | +| RAGFlow | `tmp/real-world-memory/ragflow-smoke/ragflow-report.json` and `.md` | +| LightRAG | `tmp/real-world-memory/lightrag-context/lightrag-report.json` and `.md` | +| GraphRAG | `tmp/real-world-memory/graphrag-smoke/graphrag-report.json` and `.md` | +| Graphiti/Zep | `tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json` and `.md` | +| graphify | `tmp/real-world-memory/graphify-smoke/graphify-report.json` and `.md` | + +The aggregate live-adapter sweep can include these reports through explicit opt-in +flags. 
These flags include an adapter in the aggregate report; provider-backed, +service-started, or resource-heavy live attempts still require the adapter-specific +controls listed by each smoke task: + +- `ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW=1` +- `ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG=1` +- `ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG=1` +- `ELF_REAL_WORLD_LIVE_ENABLE_GRAPHITI_ZEP=1` +- `ELF_REAL_WORLD_LIVE_ENABLE_GRAPHIFY=1` + +Default `cargo make real-world-memory-live-adapters` still runs ELF and qmd only. That +keeps heavyweight services, provider-backed runs, and graph/report installs out of the +default sweep unless explicitly requested. + +## Typed Limits + +Resource, runtime, provider, and setup limits remain first-class report states: + +- `blocked`: live execution requires explicit resource opt-in, provider credentials, + a Docker service profile, or a generated output that is not yet available. +- `incomplete`: setup or service reachability failed before the behavioral check. +- `wrong_result`: the smoke reached scoring but failed required answer or rubric + signals, including unmapped evidence where applicable. +- `pass`: the smoke reached output and all required generated evidence ids mapped. +- `not_encoded`: broad quality, scale, private corpus, hosted-service behavior, and + non-smoke suites remain outside the current adapter. + +## Claim Rules + +Allowed: + +- Say the in-scope graph/RAG smokes now produce scored `real_world_job` adapter reports + or typed non-pass reports. +- Say graph/RAG quality remains untested where live output has not mapped to generated + evidence ids or where scored output remains typed non-pass. +- Say graphify reached a tiny Docker graph/report smoke and currently scores + `wrong_result`. +- Say Graphiti/Zep remains blocked by default live-run opt-in, and provider-blocked + when that live path is explicitly enabled without credentials; it remains the + temporal-validity reference. 
+ +Not allowed: + +- Do not call a smoke pass a broad RAG, graph, temporal, or production-quality pass. +- Do not claim ELF beats Graphiti/Zep, RAGFlow, LightRAG, GraphRAG, or graphify on + their graph/RAG strengths from these smoke reports. +- Do not use hosted/cloud-only results, host-global installs, private corpora, or + unrecorded credentials as evidence for this lane. diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index 862395b4..9daa9eb6 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -119,21 +119,20 @@ The checked-in manifest records 21 adapter records across 17 unique project name | --- | ---: | --- | | `fixture_backed` | `1` | ELF fixture scoring only. | | `live_baseline_only` | `6` | Docker same-corpus or lifecycle evidence without real-world job scoring. | -| `live_real_world` | `2` | ELF and qmd live real-world sweeps. | -| `research_gate` | `12` | Setup, source, resource, or output-contract gate only. | +| `live_real_world` | `3` | ELF and qmd live real-world sweeps plus graphify's tiny scored Docker smoke. | +| `research_gate` | `11` | Setup, source, resource, or output-contract gate only. | | Overall status | Adapter records | | --- | ---: | | `pass` | `1` | -| `wrong_result` | `6` | +| `wrong_result` | `7` | | `lifecycle_fail` | `1` | -| `blocked` | `6` | +| `blocked` | `5` | | `not_encoded` | `7` | -The generated JSON report also emits `external_project_count: 19`, while the unique -project-name count from the manifest is 17. The runner currently computes that field -as adapter records whose project is not `ELF`, not as unique external project names. -Interpret the unique manifest project list as the project coverage count. 
+The generated JSON report emits `external_project_count: 16`, matching the unique +non-ELF project-name count from the manifest. The full project-name count remains 17 +when ELF is included. ## Project Coverage @@ -155,7 +154,7 @@ Interpret the unique manifest project list as the project coverage count. | nanograph | `research_gate` | `not_encoded`; full memory backend is unsupported. | Typed graph schema and query ergonomics. | Typed relation query report only if evidence ids can be emitted. | | llm-wiki | `research_gate` | `not_encoded`. | Wiki/page generation, query-save, lint and repair loops. | Contained page-generation report with citation and unsupported-claim lint. | | gbrain | `research_gate` | `not_encoded`; setup path is blocked. | Compiled truth pages, timelines, and brain navigation. | Docker-local brain repo setup proof, then compiled-truth/timeline report. | -| graphify | `research_gate` | `blocked`. | Graph-compressed navigation with `graph.json` and `GRAPH_REPORT`. | Docker graph/report output report mapped to benchmark evidence ids. | +| graphify | `live_real_world` | Tiny scored smoke is `wrong_result`. | Graph-compressed navigation with `graph.json` and `GRAPH_REPORT`. | Expand beyond the generated smoke only after graph/report output maps to scored evidence on representative graph/RAG jobs. | ## Scenario Coverage And Claims @@ -174,7 +173,7 @@ Interpret the unique manifest project list as the project coverage count. | Personalization | ELF and qmd live pass one scoped preference job. | Narrow encoded pass only. | mem0/OpenMemory and Letta entity/preference history comparison. | | Context trajectory | Not comparable. | No claim. | OpenViking staged hierarchy/trajectory scoring. | | Core-vs-archival memory | Not comparable. | No claim. | Letta contained export and ELF core-block benchmark. | -| Graph/RAG navigation | Research gates and blocked adapters only. | No claim. | RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify Docker reports. 
| +| Graph/RAG navigation | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain typed research gates; graphify has a tiny scored `wrong_result` smoke. | No graph/RAG parity claim; only graphify's bounded non-pass smoke can be cited. | Larger contained RAG/graph adapters with evidence-linked outputs before any ELF graph/RAG win, tie, or loss claim. | ## Next Measurement Reports @@ -214,9 +213,9 @@ Order these by decision value, not implementation convenience: - Output: Docker-contained artifacts mapped to evidence ids, or typed setup and resource blockers. -Before publishing the next aggregate report, clarify or rename the generated -`external_project_count` field so readers do not confuse non-ELF adapter records with -unique external projects. +Before publishing the next aggregate report, keep the generated `external_project_count` +field tied to unique non-ELF project names so readers do not confuse adapter records +with unique external projects. ## Fail Criteria diff --git a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md index d48a02fa..dd86fde4 100644 --- a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md +++ b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md @@ -132,7 +132,7 @@ the right snippets. | Core-vs-archival memory | Letta core memory blocks versus archival memory | Research-only, no contained live output | Not comparable. Borrow design only. | | Context trajectory | OpenViking staged context and hierarchy | Existing adapter remains not encoded or wrong_result for trajectory | Not comparable. Need staged trajectory benchmark. | | Capture and continuity | agentmemory, claude-mem hooks/viewers | Existing adapters are baseline-only and undermeasured | Not comparable. Need capture/write-policy and work-resume adapters. 
| -| Knowledge pages and graph/RAG navigation | llm-wiki, gbrain, graphify, RAGFlow, LightRAG, GraphRAG | Research-gate or blocked adapter state | Not comparable. Need Docker-contained evidence-linked adapters. | +| Knowledge pages and graph/RAG navigation | llm-wiki, gbrain, graphify, RAGFlow, LightRAG, GraphRAG | llm-wiki/gbrain/GraphRAG/RAGFlow/LightRAG remain research-gate or blocked; graphify has a tiny scored `wrong_result` smoke | Not comparable for graph/RAG parity. Need larger Docker-contained evidence-linked adapters. | | Production operation discipline | ELF backfill, restore, typed gates | Existing production adoption reports plus current benchmark discipline | ELF has the strongest measured local production-operation story, with private/provider gates still typed blocked. | ## What ELF Should Borrow diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index e7b0cded..20de8c2d 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -70,6 +70,11 @@ cleanup, use `docs/guide/single_user_production.md`. records Graphiti/Zep and Letta claim boundaries, and turns qmd, mem0/OpenMemory, Graphiti/Zep, Letta, and adjacent project strengths into benchmark-gated ELF optimization directions. +- `2026-06-11-graph-rag-scored-smoke-adapter-report.md`: XY-900 graph/RAG + scored-smoke adapter report that promotes RAGFlow, LightRAG, GraphRAG, + Graphiti/Zep, and graphify smoke contracts into scored or typed non-pass + `real_world_job` adapter reports without converting smoke evidence into quality + claims. - `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world agent memory benchmark contract, including suite taxonomy, typed report states, knowledge-compilation fixture tasks, and the production-ops fixture target. 
diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md index f9540823..05e12a0d 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/guide/research/comparison_external_projects.md @@ -106,7 +106,7 @@ Project-to-suite map: | llm-wiki | `rw.knowledge-synthesis`, `rw.resume-evidence` | Query/save/lint flows and topic-scoped wiki pages are a useful reference for turning retrieved memory into maintained project knowledge. | Run a corpus-to-wiki job, ask resume/decision questions, require page citations back to source memory, then mutate a stale source and prove lint/repair catches it. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for derived-knowledge fit. | ELF is not yet stronger on derived knowledge pages; llm-wiki should inform rebuildable, evidence-cited dossiers rather than core storage. | | gbrain | `rw.knowledge-synthesis`, `rw.operator-continuity` | `compiled_truth`, timeline sections, backlinks, primary-home routing, and enrichment workflows model a living operational brain for project work. | Build or update pages from the real-world corpus, require current-truth plus timeline answers, and prove enrichment/backlink maintenance does not hide unsupported claims. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for operator knowledge UX. | ELF should keep source notes authoritative; gbrain is a reference for presentation, enrichment, and maintenance loops. | | Always-On Memory Agent | `rw.consolidation-review`, `rw.operator-continuity` | The file/API/dashboard ingest loop and timer-based consolidation show how background memory formation becomes a user-visible product surface. | Run scheduled consolidation on a fixed corpus, record source rows and output insights, then score whether consolidation is reviewable, repeatable, and bounded against unsupported claims. | Docs-grounded D1; no benchmark adapter evidence. 
Confidence: medium for consolidation workflow reference. | ELF should borrow scheduling and operator controls while keeping deterministic writes and reviewable derived outputs. | -| graphify | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Deterministic code extraction, LLM-assisted graph building, honesty tags, graph reports, and assistant hooks are strong references for graph-compressed navigation over large corpora. | Generate graph/report artifacts from the benchmark corpus, require answers to use graph structure plus source evidence, and prove rebuild behavior after corpus edits. | Implementation-backed research gate: `cargo make graphify-docker-graph-report-smoke` records a Docker-only generated-corpus graph/report artifact; checked-in manifest remains blocked/research_gate and does not claim broad graph quality or rebuild strength. Confidence: medium for adapter feasibility, low for production-quality graph navigation. | ELF is stronger as a memory service; graphify is now a runnable reference for derived graph reports and pre-search guidance, but not yet a stronger end-to-end memory system. | +| graphify | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Deterministic code extraction, LLM-assisted graph building, honesty tags, graph reports, and assistant hooks are strong references for graph-compressed navigation over large corpora. | Generate graph/report artifacts from the benchmark corpus, require answers to use graph structure plus source evidence, and prove rebuild behavior after corpus edits. | Scored tiny `live_real_world` smoke: `cargo make graphify-docker-graph-report-smoke` records a Docker-only generated-corpus graph/report artifact and currently scores `wrong_result`; the checked-in manifest does not claim broad graph quality, rebuild strength, or production-quality graph navigation. Confidence: medium for adapter feasibility, low for production-quality graph navigation. 
| ELF is stronger as a memory service; graphify is now a runnable reference for derived graph reports and pre-search guidance, but not yet a stronger end-to-end memory system. | | Letta | `rw.core-archival`, `rw.operator-continuity` | Core memory blocks, archival memory, and shared/read-only memory blocks map directly to always-loaded operating context versus retrievable memory. | Build a multi-agent job where core blocks must be attached/detached/shared read-only, while archival memory is retrieved separately and audited. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for memory-semantics reference. | ELF has scoped notes but not first-class core/archival block ergonomics; Letta is the reference dimension. | | LangGraph | `rw.replay-regression`, `rw.resume-evidence` | Thread checkpoints, durable execution, replay, fork, and time travel define a strong model for debugging agent-state and memory-regression behavior. | Run an agent job with memory reads across checkpoints, replay/fork the thread after a stale-memory failure, and verify side-effect boundaries. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for replay workflow reference. | ELF traces are useful but do not replace full agent checkpoint replay; LangGraph is the reference for replay-regression jobs. | | Graphiti / Zep | `rw.graph-temporal`, `rw.resume-evidence` | Temporal entities, relations, fact triples, validity windows, and graph search directly target stale/contradictory factual memory. | Add fact triples with validity changes, query current and historical answers, and score invalidation/append behavior under contradiction traps. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium-high for temporal-graph dimension. | ELF graph-lite covers evidence-linked validity windows and current/historical relation context; Graphiti/Zep remains the reference for broader temporal graph workflows. 
| @@ -120,7 +120,7 @@ XY-882 feasibility verdicts for RAG and graph-memory gates: | LightRAG | `adapter_candidate` | Docker Compose server with explicit LLM, embedding, rerank, storage, workspace, and data-volume configuration. | Context-only query modes can return the context prepared for the LLM; core APIs can insert documents with ids and source file paths. | [XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter); no live pass claim. | | GraphRAG | `adapter_candidate` | Cost-bounded Docker Python CLI/API run over a generated tiny corpus with container-local parquet artifacts. | Output tables contain generated UUIDs, human-readable ids, source documents, text units, community reports, and text-unit links for graph summaries and relationships. | [XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter); no live pass claim. | | Graphiti / Zep | `adapter_candidate` | Docker-local FalkorDB or Neo4j plus Python SDK runner with provider config captured under benchmark artifacts. | Search results and fact triples expose UUIDs, fact text, and validity windows (`valid_at` / `invalid_at`) that map to memory-evolution scoring. | [XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter); no live pass claim. | -| graphify | `adapter_candidate` | Docker-only CLI/materializer using `pip install graphifyy` over a mounted corpus; host-global assistant hooks are out of scope. | `graph.json`, `GRAPH_REPORT.md`, and graph query output include edge types, confidence tags, source files, and source locations. | [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter) adds `cargo make graphify-docker-graph-report-smoke`; generated artifacts may carry live status, while the checked-in research-gate record avoids broad quality claims. 
| +| graphify | `adapter_candidate` | Docker-only CLI/materializer using `pip install graphifyy` over a mounted corpus; host-global assistant hooks are out of scope. | `graph.json`, `GRAPH_REPORT.md`, and graph query output include edge types, confidence tags, source files, and source locations. | [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter) adds `cargo make graphify-docker-graph-report-smoke`; XY-900 promotes the tiny generated smoke to scored `live_real_world` `wrong_result` evidence while still avoiding broad quality claims. | | Letta | `research_only` | Docker server exists, but current docs require explicit embedding configuration and steer Letta Code evaluation toward non-Docker local/frontier-model exploration. | Core/archival memory and shared blocks remain useful semantics, but no contained evidence export is selected for this adapter batch. | No implementation issue. | | LangGraph | `research_only` | A Docker harness is possible, but the project is an agent-state/checkpoint framework rather than a standalone memory adapter. | Store search and checkpoints are references for replay-regression jobs, not a direct external memory output contract here. | No implementation issue. | | nanograph | `research_only` | Official positioning is one CLI / one folder / no server / no Docker. | Typed schema, query, CDC, and search ergonomics remain graph-lite DX inspiration. | No implementation issue. | diff --git a/docs/guide/research/research_projects_inventory.md b/docs/guide/research/research_projects_inventory.md index a76a0d4f..2f1cb9c0 100644 --- a/docs/guide/research/research_projects_inventory.md +++ b/docs/guide/research/research_projects_inventory.md @@ -6,7 +6,7 @@ Inputs: Existing research notes, open architecture questions, and tracked adopti Depends on: `docs/guide/research/comparison_external_projects.md`. Outputs: A current inventory of reviewed and pending external projects. 
-Last updated: June 10, 2026. +Last updated: June 11, 2026. ## Legend @@ -43,7 +43,9 @@ Last updated: June 10, 2026. XY-882 resolved the D1/D2 feasibility gate for the RAG and graph-memory `research_gate` records. These verdicts do not change any project into live adapter -evidence; they only decide whether an implementation follow-up is justified. +evidence by themselves; they only decide whether an implementation follow-up is +justified. XY-900 later promotes graphify's generated-corpus Docker smoke into a +scored tiny `live_real_world` non-pass record, but not broad graph-quality proof. | Project | Verdict | Follow-up rule | | ------- | ------- | -------------- | @@ -51,7 +53,7 @@ evidence; they only decide whether an implementation follow-up is justified. | LightRAG | `adapter_candidate` | Follow-up issue: [XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter), a Docker context-export adapter using explicit LLM/embedding config and source file-path citations. | | GraphRAG | `adapter_candidate` | Follow-up issue: [XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter), a cost-bounded Docker CLI/API adapter over a tiny corpus and parquet output tables. | | Graphiti / Zep | `adapter_candidate` | Follow-up issue: [XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter), a Docker-local temporal graph adapter that scores current/historical fact validity. | -| graphify | `adapter_candidate` | Follow-up issue: [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter), a Docker-only CLI/materializer adapter over `graph.json` and `GRAPH_REPORT.md`; host-global assistant hooks remain out of scope. The checked-in manifest remains a research gate, while generated smoke artifacts may carry live status. 
| +| graphify | `adapter_candidate` | Follow-up issue: [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter), a Docker-only CLI/materializer adapter over `graph.json` and `GRAPH_REPORT.md`; host-global assistant hooks remain out of scope. XY-900 promotes the checked-in graphify row to a scored tiny Docker smoke with `wrong_result`; it is still not broad graph-navigation quality proof. | | Letta | `research_only` | Keep as a core/archival memory reference until a supported contained path can export archival-memory evidence for scoring. | | LangGraph | `research_only` | Keep as a checkpoint/replay regression reference, not a standalone external memory adapter. | | nanograph | `research_only` | Keep as typed graph DX inspiration; official shape is no server/no Docker. | diff --git a/docs/research/2026-06-11-measurement-coverage-audit.json b/docs/research/2026-06-11-measurement-coverage-audit.json index b04d86ef..0019110a 100644 --- a/docs/research/2026-06-11-measurement-coverage-audit.json +++ b/docs/research/2026-06-11-measurement-coverage-audit.json @@ -83,20 +83,21 @@ "adapter_ledger": { "adapter_records": 21, "unique_project_names": 17, - "external_project_count_note": "The generated report field external_project_count currently counts non-ELF adapter records, not unique external project names.", + "external_project_count_note": "At audit commit 286af8b, the generated report field external_project_count counted non-ELF adapter records, not unique external project names; XY-900 later repaired the runner to report unique non-ELF project names.", "evidence_class_counts": { "fixture_backed": 1, "live_baseline_only": 6, - "live_real_world": 2, - "research_gate": 12 + "live_real_world": 3, + "research_gate": 11 }, "overall_status_counts": { "pass": 1, - "wrong_result": 6, + "wrong_result": 7, "lifecycle_fail": 1, - "blocked": 6, + "blocked": 5, "not_encoded": 7 - } + }, + "xy900_update_note": "XY-900 
promotes graphify from research_gate/blocked to a tiny scored live_real_world wrong_result smoke; broad graph/RAG quality remains unproven." }, "claim_boundary": { "elf_vs_qmd": "tie_on_current_encoded_live_real_world_shape_not_overall_win", diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index b847ecc7..893caf9b 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -25,14 +25,14 @@ "evidence_class_counts": { "fixture_backed": 1, "live_baseline_only": 6, - "live_real_world": 2, - "research_gate": 12 + "live_real_world": 3, + "research_gate": 11 }, "overall_status_counts": { "pass": 1, - "wrong_result": 6, + "wrong_result": 7, "lifecycle_fail": 1, - "blocked": 6, + "blocked": 5, "not_encoded": 7 } }, @@ -406,21 +406,21 @@ { "project": "graphify", "strongest_user_facing_scenario": "Graph-compressed navigation with graph.json and GRAPH_REPORT evidence outputs.", - "current_evidence_class": "research_gate", + "current_evidence_class": "live_real_world", "supporting_evidence_classes": [ - "research_gate" + "live_real_world" ], - "measured_status": "blocked", + "measured_status": "wrong_result", "proof": { "command": "cargo make graphify-docker-graph-report-smoke", - "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" }, "unsupported_or_blocked_status": { - "state": "blocked", - "typed_reason": "docker_cli_graph_report_generation_not_proven", - "details": "Adapter candidate, but graph report generation and real-world scoring are still blocked; host-global assistant hooks are out of scope." 
+ "state": "not_encoded", + "typed_reason": "broad_graph_navigation_not_encoded", + "details": "The tiny generated graph/report smoke scores wrong_result; broad graph navigation, rebuild behavior, private-corpus, and large-corpus quality remain not encoded." }, - "benchmark_before_claim": "Run XY-889 Docker-only graph/report adapter over graph.json and GRAPH_REPORT.md, then score graph navigation and knowledge-synthesis evidence.", + "benchmark_before_claim": "Expand beyond the tiny generated smoke and score representative graph/RAG navigation jobs before any broad graphify quality or ELF comparison claim.", "borrow_if_stronger": "Borrow graph compression, source-location graph reports, and navigation hints for large code or document spaces." } ], @@ -484,9 +484,9 @@ "scenario": "knowledge pages", "current_elf_evidence": "ELF fixture-backed knowledge_compilation passes, but live_real_world knowledge_compilation is not_encoded.", "strongest_competitor_or_reference": "llm-wiki, gbrain, GraphRAG, graphify", - "current_competitor_evidence": "llm-wiki and gbrain are research_gate not_encoded or blocked; GraphRAG and graphify are research_gate blocked.", - "current_state": "No live knowledge-page competitor result exists; ELF has only fixture-backed derived-page evidence.", - "next_measurement": "Encode live knowledge-page rebuild/lint scoring for ELF and run contained llm-wiki, gbrain, GraphRAG, or graphify adapters only after setup proof exists." 
+ "current_competitor_evidence": "llm-wiki and gbrain are research_gate not_encoded or blocked; GraphRAG remains research_gate blocked; graphify has a tiny live_real_world wrong_result smoke.", + "current_state": "No live knowledge-page competitor pass exists; graphify has only bounded non-pass tiny-smoke evidence and ELF has fixture-backed derived-page evidence.", + "next_measurement": "Encode live knowledge-page rebuild/lint scoring for ELF and run larger contained llm-wiki, gbrain, GraphRAG, or graphify adapters only after setup proof exists." }, { "scenario_id": "operator_debugging", @@ -547,9 +547,9 @@ "scenario": "graph/RAG navigation", "current_elf_evidence": "ELF relation context and graph-lite work are not enough to claim graph/RAG navigation parity.", "strongest_competitor_or_reference": "RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify", - "current_competitor_evidence": "All named RAG/graph projects are research_gate blocked or not_encoded, with adapter-candidate follow-ups for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify.", - "current_state": "No RAG/graph project has live_real_world pass evidence; research gates define follow-up adapter work only.", - "next_measurement": "Run XY-885 through XY-889 Docker-contained adapters and require evidence-linked outputs before any graph/RAG navigation claim." + "current_competitor_evidence": "RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain research_gate blocked or incomplete; graphify has a tiny live_real_world wrong_result smoke.", + "current_state": "No RAG/graph project has live_real_world pass evidence; graphify supplies only bounded non-pass tiny-smoke evidence.", + "next_measurement": "Run larger Docker-contained adapters and require evidence-linked outputs before any graph/RAG navigation claim." 
} ], "parallelizable_followups": [ @@ -625,10 +625,10 @@ }, { "workstream": "graphify graph report adapter", - "issue_or_candidate": "XY-889", + "issue_or_candidate": "XY-889 plus post-XY-900 expansion", "parallelizable": true, - "blocked_by": "Docker CLI graph/report generation proof.", - "measurement": "graph.json and GRAPH_REPORT evidence for graph navigation and knowledge synthesis." + "blocked_by": "Representative graph/RAG navigation and quality proof beyond the tiny generated smoke.", + "measurement": "Graph/report evidence over representative graph/RAG jobs, with graph.json and GRAPH_REPORT outputs mapped to scored evidence ids." }, { "workstream": "Private corpus and credentialed production ops", diff --git a/scripts/graphify-docker-graph-report-smoke.py b/scripts/graphify-docker-graph-report-smoke.py index da1555a3..0035a1b9 100755 --- a/scripts/graphify-docker-graph-report-smoke.py +++ b/scripts/graphify-docker-graph-report-smoke.py @@ -10,7 +10,7 @@ import subprocess import sys import time -from dataclasses import dataclass +from dataclasses import dataclass, replace from datetime import datetime, timezone from pathlib import Path from typing import Any @@ -33,6 +33,8 @@ ) ) SUMMARY_OUT = Path(os.environ.get("ELF_GRAPHIFY_SMOKE_SUMMARY_OUT", REPORT_DIR / "summary.json")) +REPORT_JSON = Path(os.environ.get("ELF_GRAPHIFY_SMOKE_REPORT_JSON", REPORT_DIR / "graphify-report.json")) +REPORT_MD = Path(os.environ.get("ELF_GRAPHIFY_SMOKE_REPORT_MD", REPORT_DIR / "graphify-report.md")) FIXTURE_DIR = REPORT_DIR / "graphify-fixtures" CORPUS_DIR = WORK_DIR / "generated-public-corpus" OUTPUT_CAPTURE_DIR = REPORT_DIR / "graphify-out" @@ -120,7 +122,14 @@ def mkdirs() -> None: for path in (REPORT_DIR, WORK_DIR, FIXTURE_DIR, OUTPUT_CAPTURE_DIR, LOG_DIR): path.mkdir(parents=True, exist_ok=True) - for path in (OUT, MANIFEST_OUT, SUMMARY_OUT, REPORT_DIR / "generated-corpus.csv"): + for path in ( + OUT, + MANIFEST_OUT, + SUMMARY_OUT, + REPORT_JSON, + REPORT_MD, + REPORT_DIR 
/ "generated-corpus.csv", + ): if path.exists(): path.unlink() @@ -132,6 +141,136 @@ def write_json(path: Path, payload: Any) -> None: path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") +def run_scored_report(fixture_path: Path, manifest_path: Path, status: StatusState) -> dict[str, Any]: + """Score the generated graphify fixture through the real-world job runner.""" + + run_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + str(fixture_path), + "--out", + str(REPORT_JSON), + "--run-id", + "real-world-memory-live-graphify", + "--adapter-id", + "graphify_docker_smoke", + "--adapter-name", + "graphify Docker graph/report smoke adapter", + "--adapter-behavior", + "docker_cli_graph_report_smoke", + "--adapter-storage-status", + status.setup, + "--adapter-runtime-status", + status.overall, + "--adapter-notes", + "Generated by the graphify Docker graph/report smoke; pass or wrong_result requires graph.json, GRAPH_REPORT.md, and query output mapped to generated evidence ids, while setup/runtime limits remain typed.", + "--external-adapter-manifest", + str(manifest_path), + ] + publish_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + str(REPORT_JSON), + "--out", + str(REPORT_MD), + ] + + subprocess.run(run_cmd, cwd=ROOT_DIR, check=True) + subprocess.run(publish_cmd, cwd=ROOT_DIR, check=True) + + report = json.loads(REPORT_JSON.read_text(encoding="utf-8")) + + return { + "json": rel(REPORT_JSON), + "markdown": rel(REPORT_MD), + "summary": report.get("summary", {}), + "suites": report.get("suites", []), + } + + +def scored_benchmark(report: dict[str, Any] | None) -> dict[str, Any]: + """Extract the post-score benchmark status from a real_world_job report.""" + + if report is None: + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": 
"pending", + "reason": "The smoke materialization was written before benchmark scoring completed.", + } + + summary = report.get("summary", {}) + counts = { + status: int(summary.get(status, 0) or 0) + for status in ( + "pass", + "wrong_result", + "lifecycle_fail", + "incomplete", + "blocked", + "not_encoded", + ) + } + status = next((name for name, count in counts.items() if name != "pass" and count > 0), "pass") + + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": status, + "counts": counts, + "job_count": int(summary.get("job_count", 0) or 0), + "mean_score": summary.get("mean_score"), + "evidence_coverage": summary.get("evidence_coverage"), + } + + +def status_with_scored_result(status: StatusState, report: dict[str, Any]) -> StatusState: + """Return a manifest status that follows the scored real_world_job outcome.""" + + scored = scored_benchmark(report) + scored_status = scored.get("status") + if scored_status not in { + "pass", + "wrong_result", + "lifecycle_fail", + "incomplete", + "blocked", + "not_encoded", + }: + return status + + manifest_status = replace(status) + manifest_status.result = str(scored_status) + manifest_status.overall = str(scored_status) + + if scored_status == "pass": + manifest_status.failure_class = "" + manifest_status.failure_reason = "" + elif scored_status == "wrong_result": + manifest_status.failure_class = "scored_benchmark_wrong_result" + manifest_status.failure_reason = ( + "The graphify smoke materialized graph/report evidence, but the scored " + "real_world_job outcome is wrong_result; inspect graphify-report.json for " + "wrong-result signals." 
+ ) + + return manifest_status + + def dir_size(path: Path) -> int: """Return total file size for a directory or file.""" @@ -932,8 +1071,9 @@ def write_fixture(corpus: list[CorpusItem], status: StatusState, mapped_ids: lis "hard_fail_rules": [], }, "allowed_uncertainty": { - "phrases": ["tiny generated corpus", "derived graph/report adapter"], - "fallback": "Report typed failure when graphify output cannot be mapped to evidence ids.", + "can_answer_unknown": False, + "acceptable_phrases": ["tiny generated corpus", "derived graph/report adapter"], + "fallback_action": "state_blocker", }, "operator_debug": None, "encoding": {}, @@ -960,6 +1100,7 @@ def write_materialization( command_records: list[CommandRecord], mappings: dict[str, Any], started_at: float, + report: dict[str, Any] | None = None, ) -> dict[str, Any]: """Write the primary smoke artifact.""" @@ -975,6 +1116,7 @@ def write_materialization( "adapter_id": "graphify_docker_smoke", "evidence_class": status.evidence_class, "status": { + "source": "smoke_materialization", "setup": status.setup, "run": status.run, "result": status.result, @@ -982,6 +1124,7 @@ def write_materialization( "failure_class": status.failure_class, "failure_reason": status.failure_reason, }, + "scored_benchmark": scored_benchmark(report), "artifacts": { "generated_corpus_csv": rel(corpus_csv), "generated_corpus_dir": rel(CORPUS_DIR), @@ -992,6 +1135,8 @@ def write_materialization( "query_output": query_record.stdout_artifact if query_record else None, "manifest": rel(MANIFEST_OUT), "summary": rel(SUMMARY_OUT), + "scored_report_json": rel(REPORT_JSON), + "scored_report_markdown": rel(REPORT_MD), }, "docker_boundary": { "compose_file": "docker-compose.baseline.yml", @@ -1110,7 +1255,7 @@ def write_manifest(status: StatusState) -> dict[str, Any]: }, { "suite_id": "retrieval", - "status": status.result if status.result in {"pass", "wrong_result"} else status.run, + "status": "blocked", "evidence": "The smoke uses graphify query output 
only to support source mapping; broad retrieval quality is not scored.", }, { @@ -1164,7 +1309,7 @@ def write_manifest(status: StatusState) -> dict[str, Any]: "research_depth": "D1 feasibility plus XY-889 Docker graph/report smoke implementation; generated artifact decides live evidence class.", }, "notes": [ - "The checked-in manifest record remains research_gate; generated smoke artifacts carry live status.", + "The checked-in manifest carries the current graphify status; generated smoke artifacts carry the run-specific live status.", "graphify output is treated as a derived graph/report adapter, not an authoritative ELF memory store.", ], } @@ -1175,7 +1320,7 @@ def write_manifest(status: StatusState) -> dict[str, Any]: return manifest -def write_summary(materialization: dict[str, Any], manifest: dict[str, Any]) -> None: +def write_summary(materialization: dict[str, Any], manifest: dict[str, Any], report: dict[str, Any]) -> None: """Write a small summary artifact.""" write_json( @@ -1185,12 +1330,20 @@ def write_summary(materialization: dict[str, Any], manifest: dict[str, Any]) -> "generated_at": utc_now(), "adapter_id": "graphify_docker_smoke", "evidence_class": materialization["evidence_class"], + "status_boundary": { + "materialization": "setup/run/evidence-mapping state emitted by the smoke runner", + "manifest": "external adapter declaration consumed by the scorer", + "scored_benchmark": "post-score real_world_job outcome; use this for quality status", + }, + "scored_benchmark": materialization["scored_benchmark"], "materialization": materialization, "manifest": { "json": rel(MANIFEST_OUT), + "status_source": "external_adapter_manifest_score_aligned", "summary": manifest["adapters"][0]["overall_status"], "suites": manifest["adapters"][0]["suites"], }, + "report": report, }, ) @@ -1305,7 +1458,22 @@ def main() -> int: started_at, ) manifest = write_manifest(status) - write_summary(materialization, manifest) + report = run_scored_report(fixture_path, 
MANIFEST_OUT, status) + manifest_status = status_with_scored_result(status, report) + if manifest_status.overall != status.overall or manifest_status.result != status.result: + manifest = write_manifest(manifest_status) + report = run_scored_report(fixture_path, MANIFEST_OUT, manifest_status) + materialization = write_materialization( + status, + corpus, + fixture_path, + corpus_csv, + command_records, + mappings, + started_at, + report, + ) + write_summary(materialization, manifest, report) print(f"graphify smoke artifact: {OUT}") print(f"graphify smoke manifest: {MANIFEST_OUT}") print(f"graphify smoke summary: {SUMMARY_OUT}") diff --git a/scripts/graphiti-zep-docker-temporal-smoke.py b/scripts/graphiti-zep-docker-temporal-smoke.py index 56c63eec..5ba1cc34 100644 --- a/scripts/graphiti-zep-docker-temporal-smoke.py +++ b/scripts/graphiti-zep-docker-temporal-smoke.py @@ -34,6 +34,12 @@ ) ) SUMMARY_OUT = Path(os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_SUMMARY_OUT", REPORT_DIR / "summary.json")) +REPORT_JSON = Path( + os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_REPORT_JSON", REPORT_DIR / "graphiti-zep-report.json") +) +REPORT_MD = Path( + os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_REPORT_MD", REPORT_DIR / "graphiti-zep-report.md") +) FIXTURE_DIR = REPORT_DIR / "graphiti-zep-fixtures" LOG_DIR = REPORT_DIR / "logs" @@ -127,6 +133,103 @@ def write_json(path: Path, payload: Any) -> None: path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") +def run_scored_report(fixture_path: Path, manifest_path: Path, status: StatusState) -> dict[str, Any]: + """Score the generated temporal smoke fixture through the real-world job runner.""" + + run_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + str(fixture_path), + "--out", + str(REPORT_JSON), + "--run-id", + "real-world-memory-live-graphiti-zep", + "--adapter-id", + "graphiti_zep_temporal_smoke", + "--adapter-name", + "Graphiti/Zep 
Docker temporal smoke adapter", + "--adapter-behavior", + "docker_python_falkordb_temporal_smoke", + "--adapter-storage-status", + status.setup, + "--adapter-runtime-status", + status.overall, + "--adapter-notes", + "Generated by the Graphiti/Zep Docker temporal smoke; pass or wrong_result requires current and historical validity-window facts mapped to generated evidence ids, while provider/setup limits remain typed.", + "--external-adapter-manifest", + str(manifest_path), + ] + publish_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + str(REPORT_JSON), + "--out", + str(REPORT_MD), + ] + + subprocess.run(run_cmd, cwd=ROOT_DIR, check=True) + subprocess.run(publish_cmd, cwd=ROOT_DIR, check=True) + + report = json.loads(REPORT_JSON.read_text(encoding="utf-8")) + + return { + "json": rel(REPORT_JSON), + "markdown": rel(REPORT_MD), + "summary": report.get("summary", {}), + "suites": report.get("suites", []), + } + + +def scored_benchmark(report: dict[str, Any] | None) -> dict[str, Any]: + """Extract the post-score benchmark status from a real_world_job report.""" + + if report is None: + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": "pending", + "reason": "The smoke materialization was written before benchmark scoring completed.", + } + + summary = report.get("summary", {}) + counts = { + status: int(summary.get(status, 0) or 0) + for status in ( + "pass", + "wrong_result", + "lifecycle_fail", + "incomplete", + "blocked", + "not_encoded", + ) + } + status = next((name for name, count in counts.items() if name != "pass" and count > 0), "pass") + + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": status, + "counts": counts, + "job_count": int(summary.get("job_count", 0) or 0), + "mean_score": summary.get("mean_score"), + "evidence_coverage": summary.get("evidence_coverage"), + } 
+ + def command_available(command: str) -> bool: """Return whether a command is on PATH.""" @@ -775,7 +878,7 @@ def write_fixture(facts: list[dict[str, Any]], status: StatusState, mapping: dic "tags": ["external_adapter", "generated_public", "memory_evolution", "reference_graphiti_zep_temporal"], } - if status.result in {"blocked", "incomplete", "wrong_result"}: + if status.result in {"blocked", "incomplete", "not_encoded"}: fixture["encoding"] = {"status": status.result, "reason": status.failure_reason} write_json(fixture_path, fixture) @@ -792,6 +895,7 @@ def write_materialization( search_results: list[dict[str, Any]], mapping: dict[str, Any], started_at: float, + report: dict[str, Any] | None = None, ) -> dict[str, Any]: """Write the primary smoke artifact.""" @@ -803,6 +907,16 @@ def write_materialization( "adapter_id": "graphiti_zep_temporal_smoke", "project": "Graphiti/Zep", "status": status.overall, + "materialization_status": { + "source": "smoke_materialization", + "setup": status.setup, + "run": status.run, + "result": status.result, + "overall": status.overall, + "failure_class": status.failure_class, + "failure_reason": status.failure_reason, + }, + "scored_benchmark": scored_benchmark(report), "evidence_class": status.evidence_class, "failure": { "class": status.failure_class or None, @@ -813,6 +927,8 @@ def write_materialization( "manifest": rel(MANIFEST_OUT), "summary": rel(SUMMARY_OUT), "fixture": rel(fixture_path), + "scored_report_json": rel(REPORT_JSON), + "scored_report_markdown": rel(REPORT_MD), }, "docker_boundary": { "compose_file": "docker-compose.baseline.yml", @@ -1008,7 +1124,7 @@ def write_manifest(status: StatusState) -> dict[str, Any]: return manifest -def write_summary(materialization: dict[str, Any], manifest: dict[str, Any]) -> None: +def write_summary(materialization: dict[str, Any], manifest: dict[str, Any], report: dict[str, Any]) -> None: """Write a small summary artifact.""" write_json( @@ -1018,12 +1134,20 @@ def 
write_summary(materialization: dict[str, Any], manifest: dict[str, Any]) -> "generated_at": utc_now(), "adapter_id": "graphiti_zep_temporal_smoke", "evidence_class": materialization["evidence_class"], + "status_boundary": { + "materialization": "setup/run/evidence-mapping state emitted by the smoke runner", + "manifest": "external adapter declaration consumed by the scorer", + "scored_benchmark": "post-score real_world_job outcome; use this for quality status", + }, + "scored_benchmark": materialization["scored_benchmark"], "materialization": materialization, "manifest": { "json": rel(MANIFEST_OUT), + "status_source": "external_adapter_manifest_pre_score", "summary": manifest["adapters"][0]["overall_status"], "suites": manifest["adapters"][0]["suites"], }, + "report": report, }, ) @@ -1141,7 +1265,19 @@ def main() -> int: started_at, ) manifest = write_manifest(status) - write_summary(materialization, manifest) + report = run_scored_report(fixture_path, MANIFEST_OUT, status) + materialization = write_materialization( + status, + facts, + fixture_path, + command_records, + inserted, + search_results, + mapping, + started_at, + report, + ) + write_summary(materialization, manifest, report) print(f"Graphiti/Zep smoke artifact: {OUT}") print(f"Graphiti/Zep smoke manifest: {MANIFEST_OUT}") print(f"Graphiti/Zep smoke summary: {SUMMARY_OUT}") diff --git a/scripts/graphrag-docker-smoke.py b/scripts/graphrag-docker-smoke.py index 69942e45..02be1560 100755 --- a/scripts/graphrag-docker-smoke.py +++ b/scripts/graphrag-docker-smoke.py @@ -34,6 +34,8 @@ ) ) SUMMARY_OUT = Path(os.environ.get("ELF_GRAPHRAG_SMOKE_SUMMARY_OUT", REPORT_DIR / "summary.json")) +REPORT_JSON = Path(os.environ.get("ELF_GRAPHRAG_SMOKE_REPORT_JSON", REPORT_DIR / "graphrag-report.json")) +REPORT_MD = Path(os.environ.get("ELF_GRAPHRAG_SMOKE_REPORT_MD", REPORT_DIR / "graphrag-report.md")) FIXTURE_DIR = REPORT_DIR / "graphrag-fixtures" OUTPUT_CAPTURE_DIR = REPORT_DIR / "graphrag-output" LOG_DIR = REPORT_DIR / 
"logs" @@ -55,7 +57,7 @@ INDEX_METHOD = os.environ.get("ELF_GRAPHRAG_INDEX_METHOD", "fast") QUERY_METHOD = os.environ.get("ELF_GRAPHRAG_QUERY_METHOD", "local") TIMEOUT_SECONDS = int(os.environ.get("ELF_GRAPHRAG_TIMEOUT_SECONDS", "900")) -MAX_DOCS = max(1, min(int(os.environ.get("ELF_GRAPHRAG_MAX_DOCS", "2")), 3)) +MAX_DOCS = max(1, min(int(os.environ.get("ELF_GRAPHRAG_MAX_DOCS", "3")), 3)) MAX_INPUT_CHARS = max(400, min(int(os.environ.get("ELF_GRAPHRAG_MAX_INPUT_CHARS", "2400")), 6000)) TABLES = ( @@ -127,6 +129,103 @@ def write_json(path: Path, payload: Any) -> None: path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") +def run_scored_report(fixture_path: Path, manifest_path: Path, status: StatusState) -> dict[str, Any]: + """Score the generated smoke fixture through the real-world job runner.""" + + run_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + str(fixture_path), + "--out", + str(REPORT_JSON), + "--run-id", + "real-world-memory-live-graphrag", + "--adapter-id", + "graphrag_docker_smoke", + "--adapter-name", + "GraphRAG Docker smoke adapter", + "--adapter-behavior", + "docker_python_cli_api_smoke", + "--adapter-storage-status", + status.setup, + "--adapter-runtime-status", + status.overall, + "--adapter-notes", + "Generated by the cost-bounded GraphRAG Docker smoke; pass or wrong_result requires GraphRAG output tables mapped to generated evidence ids, while provider/setup limits remain typed.", + "--external-adapter-manifest", + str(manifest_path), + ] + publish_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + str(REPORT_JSON), + "--out", + str(REPORT_MD), + ] + + subprocess.run(run_cmd, cwd=ROOT_DIR, check=True) + subprocess.run(publish_cmd, cwd=ROOT_DIR, check=True) + + report = json.loads(REPORT_JSON.read_text(encoding="utf-8")) + + return { + "json": 
rel(REPORT_JSON), + "markdown": rel(REPORT_MD), + "summary": report.get("summary", {}), + "suites": report.get("suites", []), + } + + +def scored_benchmark(report: dict[str, Any] | None) -> dict[str, Any]: + """Extract the post-score benchmark status from a real_world_job report.""" + + if report is None: + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": "pending", + "reason": "The smoke materialization was written before benchmark scoring completed.", + } + + summary = report.get("summary", {}) + counts = { + status: int(summary.get(status, 0) or 0) + for status in ( + "pass", + "wrong_result", + "lifecycle_fail", + "incomplete", + "blocked", + "not_encoded", + ) + } + status = next((name for name, count in counts.items() if name != "pass" and count > 0), "pass") + + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": status, + "counts": counts, + "job_count": int(summary.get("job_count", 0) or 0), + "mean_score": summary.get("mean_score"), + "evidence_coverage": summary.get("evidence_coverage"), + } + + def dir_size(path: Path) -> int: """Return total file size for a directory or file.""" @@ -310,6 +409,9 @@ def write_fixture(corpus: list[dict[str, str]], status: StatusState, mapped_ids: fixture_path = FIXTURE_DIR / "knowledge" / "graphrag_tiny_corpus.json" expected_ids = [item["evidence_id"] for item in corpus if item["evidence_id"] != "graphrag-smoke-stale-trap"] used_ids = [item for item in mapped_ids if item in expected_ids] + stale_trap_ids = [ + item["evidence_id"] for item in corpus if item["evidence_id"] == "graphrag-smoke-stale-trap" + ] response = { "adapter_id": "graphrag_docker_smoke", "answer": { @@ -416,10 +518,12 @@ def write_fixture(corpus: list[dict[str, str]], status: StatusState, mapped_ids: { "trap_id": "retired-zenith-ledger", "type": "stale_fact", - "evidence_ids": ["graphrag-smoke-stale-trap"], + "evidence_ids": stale_trap_ids, 
"failure_if_used": True, } - ], + ] + if stale_trap_ids + else [], "scoring_rubric": { "dimensions": { "answer_correctness": { @@ -447,8 +551,9 @@ def write_fixture(corpus: list[dict[str, str]], status: StatusState, mapped_ids: "hard_fail_rules": [], }, "allowed_uncertainty": { - "phrases": ["tiny generated corpus", "smoke only"], - "fallback": "Report typed failure when GraphRAG output identifiers cannot be mapped.", + "can_answer_unknown": False, + "acceptable_phrases": ["tiny generated corpus", "smoke only"], + "fallback_action": "state_blocker", }, "operator_debug": None, "encoding": {}, @@ -971,6 +1076,7 @@ def write_materialization( mappings: list[dict[str, Any]], mapped_ids: list[str], started_at: float, + report: dict[str, Any] | None = None, ) -> dict[str, Any]: """Write the primary smoke artifact.""" @@ -985,6 +1091,7 @@ def write_materialization( "adapter_id": "graphrag_docker_smoke", "evidence_class": status.evidence_class, "status": { + "source": "smoke_materialization", "setup": status.setup, "run": status.run, "result": status.result, @@ -992,12 +1099,15 @@ def write_materialization( "failure_class": status.failure_class, "failure_reason": status.failure_reason, }, + "scored_benchmark": scored_benchmark(report), "artifacts": { "generated_corpus_csv": rel(corpus_csv), "generated_fixture": rel(fixture_path), "graph_output_dir": rel(OUTPUT_CAPTURE_DIR), "manifest": rel(MANIFEST_OUT), "summary": rel(SUMMARY_OUT), + "scored_report_json": rel(REPORT_JSON), + "scored_report_markdown": rel(REPORT_MD), }, "docker_boundary": { "compose_file": "docker-compose.baseline.yml", @@ -1199,7 +1309,7 @@ def write_manifest(status: StatusState) -> dict[str, Any]: return manifest -def write_summary(materialization: dict[str, Any], manifest: dict[str, Any]) -> None: +def write_summary(materialization: dict[str, Any], manifest: dict[str, Any], report: dict[str, Any]) -> None: """Write a small summary artifact.""" write_json( @@ -1209,12 +1319,20 @@ def 
write_summary(materialization: dict[str, Any], manifest: dict[str, Any]) -> "generated_at": utc_now(), "adapter_id": "graphrag_docker_smoke", "evidence_class": materialization["evidence_class"], + "status_boundary": { + "materialization": "setup/run/evidence-mapping state emitted by the smoke runner", + "manifest": "external adapter declaration consumed by the scorer", + "scored_benchmark": "post-score real_world_job outcome; use this for quality status", + }, + "scored_benchmark": materialization["scored_benchmark"], "materialization": materialization, "manifest": { "json": rel(MANIFEST_OUT), + "status_source": "external_adapter_manifest_pre_score", "summary": manifest["adapters"][0]["overall_status"], "suites": manifest["adapters"][0]["suites"], }, + "report": report, }, ) @@ -1328,7 +1446,19 @@ def main() -> int: started_at, ) manifest = write_manifest(status) - write_summary(materialization, manifest) + report = run_scored_report(fixture_path, MANIFEST_OUT, status) + materialization = write_materialization( + status, + corpus, + fixture_path, + corpus_csv, + command_records, + mappings, + mapped_ids, + started_at, + report, + ) + write_summary(materialization, manifest, report) print(f"GraphRAG smoke artifact: {OUT}") print(f"GraphRAG smoke manifest: {MANIFEST_OUT}") print(f"GraphRAG smoke summary: {SUMMARY_OUT}") diff --git a/scripts/lightrag-docker-context-smoke.sh b/scripts/lightrag-docker-context-smoke.sh index feac9054..6e4d302e 100644 --- a/scripts/lightrag-docker-context-smoke.sh +++ b/scripts/lightrag-docker-context-smoke.sh @@ -66,13 +66,49 @@ cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ jq -n \ --slurpfile materialization "${REPORT_DIR}/lightrag-materialization.json" \ --slurpfile report "${REPORT_DIR}/lightrag-report.json" \ - '{ + 'def count($key): ($report[0].summary[$key] // 0); + def scored_status: + if count("wrong_result") > 0 then "wrong_result" + elif count("lifecycle_fail") > 0 then "lifecycle_fail" + elif 
count("incomplete") > 0 then "incomplete" + elif count("blocked") > 0 then "blocked" + elif count("not_encoded") > 0 then "not_encoded" + elif count("pass") > 0 then "pass" + else "not_encoded" + end; + { schema: "elf.lightrag_context_export_smoke/v1", generated_at: (now | todateiso8601), artifact_dir: (env.ELF_LIGHTRAG_CONTEXT_REPORT_DIR // "tmp/real-world-memory/lightrag-context"), fixture_dir: (env.ELF_LIGHTRAG_CONTEXT_FIXTURES // "apps/elf-eval/fixtures/real_world_memory/retrieval"), adapter_id: (env.ELF_LIGHTRAG_ADAPTER_ID // "lightrag_live_real_world"), - evidence_class: "live_real_world_when_materialization_passes", + evidence_class: ( + if ($materialization[0].status == "pass" or $materialization[0].status == "wrong_result") then + "live_real_world" + else + "research_gate" + end + ), + status_boundary: { + materialization: "API reachability, ingest, context export, and evidence-mapping state emitted by the adapter", + report: "post-score real_world_job outcome; use this for quality status" + }, + scored_benchmark: { + schema: "elf.scored_benchmark_status/v1", + source: "real_world_job_benchmark", + status: scored_status, + counts: { + pass: count("pass"), + wrong_result: count("wrong_result"), + lifecycle_fail: count("lifecycle_fail"), + incomplete: count("incomplete"), + blocked: count("blocked"), + not_encoded: count("not_encoded") + }, + job_count: ($report[0].summary.job_count // 0), + mean_score: ($report[0].summary.mean_score // null), + evidence_coverage: ($report[0].summary.evidence_coverage // null) + }, materialization: $materialization[0], report: { json: "tmp/real-world-memory/lightrag-context/lightrag-report.json", diff --git a/scripts/ragflow-docker-evidence-smoke.sh b/scripts/ragflow-docker-evidence-smoke.sh index e19e54ed..95cd50f5 100755 --- a/scripts/ragflow-docker-evidence-smoke.sh +++ b/scripts/ragflow-docker-evidence-smoke.sh @@ -5,6 +5,12 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" ARTIFACT_DIR="${ELF_RAGFLOW_SMOKE_ARTIFACT_DIR:-${ROOT_DIR}/tmp/real-world-memory/ragflow-smoke}" OUT="${ELF_RAGFLOW_SMOKE_OUT:-${ARTIFACT_DIR}/ragflow-smoke.json}" MANIFEST_OUT="${ELF_RAGFLOW_SMOKE_MANIFEST_OUT:-${ARTIFACT_DIR}/memory_projects_manifest.ragflow-smoke.json}" +SUMMARY_OUT="${ELF_RAGFLOW_SMOKE_SUMMARY_OUT:-${ARTIFACT_DIR}/summary.json}" +FIXTURE_DIR="${ELF_RAGFLOW_SMOKE_FIXTURE_DIR:-${ARTIFACT_DIR}/ragflow-fixtures}" +FIXTURE_PATH="${ELF_RAGFLOW_SMOKE_FIXTURE_PATH:-${FIXTURE_DIR}/retrieval/ragflow_evidence_smoke.json}" +REPORT_JSON="${ELF_RAGFLOW_SMOKE_REPORT_JSON:-${ARTIFACT_DIR}/ragflow-report.json}" +REPORT_MD="${ELF_RAGFLOW_SMOKE_REPORT_MD:-${ARTIFACT_DIR}/ragflow-report.md}" +SCORED_BENCHMARK="${ELF_RAGFLOW_SMOKE_SCORED_BENCHMARK:-${ARTIFACT_DIR}/scored-benchmark.json}" WORK_DIR="${ELF_RAGFLOW_SMOKE_WORK_DIR:-${ARTIFACT_DIR}/work}" RAGFLOW_REPO_URL="${ELF_RAGFLOW_REPO_URL:-https://github.com/infiniflow/ragflow.git}" RAGFLOW_REF="${ELF_RAGFLOW_REF:-v0.25.6}" @@ -28,7 +34,18 @@ DOCUMENT_NAME="${RUN_ID}.txt" EVIDENCE_TOKEN="ELF_RAGFLOW_SMOKE_TOKEN_${RUN_ID}" CORPUS_TEXT="RAGFlow smoke evidence ${EVIDENCE_TOKEN}: the ELF adapter maps returned reference chunks to the ragflow-smoke-anchor evidence id." 
-mkdir -p "${ARTIFACT_DIR}" "${WORK_DIR}" "$(dirname "${OUT}")" "$(dirname "${MANIFEST_OUT}")" +mkdir -p \ + "${ARTIFACT_DIR}" \ + "${WORK_DIR}" \ + "$(dirname "${OUT}")" \ + "$(dirname "${MANIFEST_OUT}")" \ + "$(dirname "${SUMMARY_OUT}")" \ + "$(dirname "${FIXTURE_PATH}")" \ + "$(dirname "${REPORT_JSON}")" \ + "$(dirname "${REPORT_MD}")" \ + "$(dirname "${SCORED_BENCHMARK}")" + +rm -f "${OUT}" "${MANIFEST_OUT}" "${SUMMARY_OUT}" "${REPORT_JSON}" "${REPORT_MD}" "${SCORED_BENCHMARK}" DOCKER_INFO="${ARTIFACT_DIR}/docker-info.json" IMAGE_INSPECT="${ARTIFACT_DIR}/ragflow-image-inspect.json" @@ -495,11 +512,52 @@ cleanup_stack() { ) >"${COMPOSE_DOWN_LOG}" 2>&1 || true } +write_scored_benchmark() { + if [[ -s "${REPORT_JSON}" ]]; then + jq 'def count($key): (.summary[$key] // 0); + def scored_status: + if count("wrong_result") > 0 then "wrong_result" + elif count("lifecycle_fail") > 0 then "lifecycle_fail" + elif count("incomplete") > 0 then "incomplete" + elif count("blocked") > 0 then "blocked" + elif count("not_encoded") > 0 then "not_encoded" + elif count("pass") > 0 then "pass" + else "not_encoded" + end; + { + schema: "elf.scored_benchmark_status/v1", + source: "real_world_job_benchmark", + status: scored_status, + counts: { + pass: count("pass"), + wrong_result: count("wrong_result"), + lifecycle_fail: count("lifecycle_fail"), + incomplete: count("incomplete"), + blocked: count("blocked"), + not_encoded: count("not_encoded") + }, + job_count: (.summary.job_count // 0), + mean_score: (.summary.mean_score // null), + evidence_coverage: (.summary.evidence_coverage // null) + }' "${REPORT_JSON}" >"${SCORED_BENCHMARK}" + else + jq -n '{ + schema: "elf.scored_benchmark_status/v1", + source: "real_world_job_benchmark", + status: "pending", + reason: "The smoke materialization was written before benchmark scoring completed." 
+ }' >"${SCORED_BENCHMARK}" + fi +} + write_artifact() { - local generated_at out_rel manifest_rel docker_status git_status curl_status jq_status + local generated_at out_rel manifest_rel fixture_rel report_json_rel report_md_rel docker_status git_status curl_status jq_status generated_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" out_rel="$(relative_path "${OUT}")" manifest_rel="$(relative_path "${MANIFEST_OUT}")" + fixture_rel="$(relative_path "${FIXTURE_PATH}")" + report_json_rel="$(relative_path "${REPORT_JSON}")" + report_md_rel="$(relative_path "${REPORT_MD}")" docker_status="$(optional_command_status docker)" git_status="$(optional_command_status git)" curl_status="$(optional_command_status curl)" @@ -519,6 +577,9 @@ write_artifact() { --arg failure_reason "${FAILURE_REASON}" \ --arg out_rel "${out_rel}" \ --arg manifest_rel "${manifest_rel}" \ + --arg fixture_rel "${fixture_rel}" \ + --arg report_json_rel "${report_json_rel}" \ + --arg report_md_rel "${report_md_rel}" \ --arg artifact_dir "$(relative_path "${ARTIFACT_DIR}")" \ --arg work_dir "$(relative_path "${WORK_DIR}")" \ --arg repo_url "${RAGFLOW_REPO_URL}" \ @@ -569,6 +630,7 @@ write_artifact() { --slurpfile document_response "${DOCUMENT_RESPONSE}" \ --slurpfile chunk_response "${CHUNK_RESPONSE}" \ --slurpfile retrieval_response "${RETRIEVAL_RESPONSE}" \ + --slurpfile scored_benchmark "${SCORED_BENCHMARK}" \ --slurpfile startup_attempts <(jq -s '.' 
"${STARTUP_ATTEMPTS_JSONL}") \ '{ schema: $schema, @@ -577,6 +639,8 @@ write_artifact() { adapter_id: $adapter_id, evidence_class: $evidence_class, overall_status: $overall_status, + status_source: "smoke_materialization", + scored_benchmark: $scored_benchmark[0], no_quality_claim: true, failure: ( if $failure_class == "" then null @@ -589,6 +653,9 @@ write_artifact() { artifacts: { smoke: $out_rel, external_adapter_manifest: $manifest_rel, + generated_fixture: $fixture_rel, + scored_report_json: $report_json_rel, + scored_report_markdown: $report_md_rel, artifact_dir: $artifact_dir, work_dir: $work_dir }, @@ -893,6 +960,228 @@ write_manifest() { }' >"${MANIFEST_OUT}" } +write_fixture() { + local result_status reason + result_status="$(json_status "${RESULT_STATUS}")" + reason="${FAILURE_REASON}" + + jq -n \ + --arg run_id "${RUN_ID}" \ + --arg evidence_id "${EVIDENCE_ID}" \ + --arg evidence_token "${EVIDENCE_TOKEN}" \ + --arg corpus_text "${CORPUS_TEXT}" \ + --arg result_status "${result_status}" \ + --arg failure_reason "${reason}" \ + '{ + schema: "elf.real_world_job/v1", + job_id: "ragflow-evidence-smoke-001", + suite: "retrieval", + title: "Map RAGFlow reference chunks to generated evidence", + corpus: { + corpus_id: "ragflow-generated-public-smoke", + profile: "generated_public", + items: [ + { + evidence_id: $evidence_id, + kind: "document", + text: $corpus_text, + source_ref: { + schema: "source_ref/v1", + resolver: "ragflow_smoke/v1", + ref: { + run_id: $run_id, + evidence_token: $evidence_token + } + }, + created_at: "2026-06-10T00:00:00Z" + } + ], + adapter_response: { + adapter_id: "ragflow_docker_evidence_smoke", + answer: { + content: ( + if $result_status == "pass" then + "RAGFlow returned reference chunks that map to the generated ragflow-smoke-anchor evidence id." 
+ else + "" + end + ), + claims: ( + if $result_status == "pass" then + [ + { + claim_id: "ragflow_reference_mapping", + text: "RAGFlow reference chunks map to the generated ragflow-smoke-anchor evidence id.", + evidence_ids: [$evidence_id], + confidence: "derived_from_ragflow_reference_chunk_mapping" + } + ] + else + [] + end + ), + evidence_ids: (if $result_status == "pass" then [$evidence_id] else [] end), + latency_ms: 0.0, + cost: { + currency: "USD", + amount: 0.0, + input_tokens: 0, + output_tokens: 0 + } + } + } + }, + timeline: [ + { + event_id: "ragflow-smoke-corpus-generated", + ts: "2026-06-10T00:00:00Z", + actor: "system", + action: "generated_public_corpus", + evidence_ids: [$evidence_id], + summary: "The RAGFlow smoke generated a tiny public corpus for reference chunk mapping." + } + ], + prompt: { + role: "user", + content: "Which RAGFlow smoke evidence token maps to the generated reference chunk?", + job_mode: "answer", + constraints: ["cite_evidence", "avoid_broad_quality_claims"] + }, + expected_answer: { + must_include: [ + { + claim_id: "ragflow_reference_mapping", + text: "RAGFlow reference chunks map to the generated ragflow-smoke-anchor evidence id." + } + ], + must_not_include: ["RAGFlow passed a broad graph/RAG quality benchmark."], + evidence_links: { + ragflow_reference_mapping: [$evidence_id] + }, + answer_type: "direct_answer", + accepted_alternates: [], + requires_caveat: true, + requires_refusal: false + }, + required_evidence: [ + { + evidence_id: $evidence_id, + claim_id: "ragflow_reference_mapping", + requirement: "cite", + quote: "ragflow-smoke-anchor evidence id" + } + ], + negative_traps: [], + scoring_rubric: { + dimensions: { + answer_correctness: { + weight: 0.3, + max_points: 1.0, + criteria: "States the generated evidence mapping without broad quality claims." + }, + evidence_grounding: { + weight: 0.45, + max_points: 1.0, + criteria: "Maps returned RAGFlow reference chunks to the generated evidence id." 
+ }, + trap_avoidance: { + weight: 0.15, + max_points: 1.0, + criteria: "Does not claim broad RAGFlow quality from the tiny smoke." + }, + latency_resource: { + weight: 0.1, + max_points: 1.0, + criteria: "Records setup, resource, provider, and reference-mapping boundaries." + } + }, + pass_threshold: 0.75, + hard_fail_rules: [] + }, + allowed_uncertainty: { + can_answer_unknown: false, + acceptable_phrases: ["tiny generated corpus", "reference chunk smoke only"], + fallback_action: "state_blocker" + }, + operator_debug: null, + encoding: {}, + memory_evolution: null, + tags: ["external_adapter", "generated_public", "ragflow", "no_live_claim"] + } + | if ["blocked", "incomplete", "not_encoded"] | index($result_status) then + .encoding = {status: $result_status, reason: $failure_reason} + else + . + end' >"${FIXTURE_PATH}" +} + +write_scored_report() { + ( + cd "${ROOT_DIR}" + cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${FIXTURE_PATH}" \ + --out "${REPORT_JSON}" \ + --run-id real-world-memory-live-ragflow \ + --adapter-id ragflow_docker_evidence_smoke \ + --adapter-name "RAGFlow Docker evidence smoke adapter" \ + --adapter-behavior docker_service_evidence_smoke \ + --adapter-storage-status "$(json_status "${SETUP_STATUS}")" \ + --adapter-runtime-status "$(json_status "${OVERALL_STATUS}")" \ + --adapter-notes "Generated by the RAGFlow Docker evidence smoke; pass or wrong_result requires reference chunks mapped to generated evidence ids, while resource/setup/API-key limits remain typed." 
\ + --external-adapter-manifest "${MANIFEST_OUT}" + cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report "${REPORT_JSON}" \ + --out "${REPORT_MD}" + ) +} + +write_summary() { + jq -n \ + --slurpfile materialization "${OUT}" \ + --slurpfile manifest "${MANIFEST_OUT}" \ + --slurpfile report "${REPORT_JSON}" \ + '{ + schema: "elf.ragflow_docker_smoke_summary/v1", + generated_at: (now | todateiso8601), + adapter_id: "ragflow_docker_evidence_smoke", + evidence_class: $materialization[0].evidence_class, + status_boundary: { + materialization: "setup/run/evidence-mapping state emitted by the smoke runner", + manifest: "external adapter declaration consumed by the scorer", + scored_benchmark: "post-score real_world_job outcome; use this for quality status" + }, + scored_benchmark: $materialization[0].scored_benchmark, + materialization: $materialization[0], + manifest: { + json: ($materialization[0].artifacts.external_adapter_manifest // "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json"), + status_source: "external_adapter_manifest_pre_score", + summary: $manifest[0].adapters[0].overall_status, + suites: $manifest[0].adapters[0].suites + }, + report: { + json: ($materialization[0].artifacts.scored_report_json // "tmp/real-world-memory/ragflow-smoke/ragflow-report.json"), + markdown: ($materialization[0].artifacts.scored_report_markdown // "tmp/real-world-memory/ragflow-smoke/ragflow-report.md"), + summary: $report[0].summary, + suites: $report[0].suites + } + }' >"${SUMMARY_OUT}" +} + +write_outputs() { + write_scored_benchmark + write_artifact + write_manifest + write_fixture + write_scored_report + write_scored_benchmark + write_artifact + write_summary + echo "RAGFlow smoke artifact: ${OUT}" + echo "RAGFlow smoke manifest: ${MANIFEST_OUT}" + echo "RAGFlow smoke report: ${REPORT_JSON}" + echo "RAGFlow smoke summary: ${SUMMARY_OUT}" +} + for cmd in jq curl; do required_command "${cmd}" done @@ -904,10 +1193,7 @@ if ! 
command -v docker >/dev/null 2>&1; then RESULT_STATUS="incomplete" FAILURE_CLASS="docker_cli_missing" FAILURE_REASON="Docker CLI is required for the RAGFlow evidence smoke." - write_artifact - write_manifest - echo "RAGFlow smoke artifact: ${OUT}" - echo "RAGFlow smoke manifest: ${MANIFEST_OUT}" + write_outputs exit 0 fi @@ -917,10 +1203,7 @@ if ! capture_docker_info; then RESULT_STATUS="incomplete" FAILURE_CLASS="docker_unavailable" FAILURE_REASON="Docker is installed but docker info failed; RAGFlow Docker setup was not attempted." - write_artifact - write_manifest - echo "RAGFlow smoke artifact: ${OUT}" - echo "RAGFlow smoke manifest: ${MANIFEST_OUT}" + write_outputs exit 0 fi @@ -935,26 +1218,17 @@ if [[ "${ARCH}" != "x86_64" && "${ARCH}" != "amd64" && "${ALLOW_ARM}" != "1" ]]; RESULT_STATUS="blocked" FAILURE_CLASS="unsupported_ragflow_docker_architecture" FAILURE_REASON="Official RAGFlow quickstart supports x86 CPU and Nvidia GPU Docker images; set ELF_RAGFLOW_SMOKE_ALLOW_ARM=1 only for an explicitly built ARM image path." - write_artifact - write_manifest - echo "RAGFlow smoke artifact: ${OUT}" - echo "RAGFlow smoke manifest: ${MANIFEST_OUT}" + write_outputs exit 0 fi if [[ "${START_RAGFLOW}" != "1" ]]; then - write_artifact - write_manifest - echo "RAGFlow smoke artifact: ${OUT}" - echo "RAGFlow smoke manifest: ${MANIFEST_OUT}" + write_outputs exit 0 fi if [[ "${ACCEPT_RESOURCE_ENVELOPE}" != "1" ]]; then - write_artifact - write_manifest - echo "RAGFlow smoke artifact: ${OUT}" - echo "RAGFlow smoke manifest: ${MANIFEST_OUT}" + write_outputs exit 0 fi @@ -964,10 +1238,7 @@ if ! command -v git >/dev/null 2>&1; then RESULT_STATUS="incomplete" FAILURE_CLASS="git_missing_for_ragflow_source" FAILURE_REASON="git is required to fetch the official RAGFlow Docker Compose files for this smoke." 
- write_artifact - write_manifest - echo "RAGFlow smoke artifact: ${OUT}" - echo "RAGFlow smoke manifest: ${MANIFEST_OUT}" + write_outputs exit 0 fi @@ -1004,8 +1275,4 @@ if [[ "${SETUP_STATUS}" == "pass" ]]; then fi cleanup_stack -write_artifact -write_manifest - -echo "RAGFlow smoke artifact: ${OUT}" -echo "RAGFlow smoke manifest: ${MANIFEST_OUT}" +write_outputs diff --git a/scripts/real-world-live-adapters.sh b/scripts/real-world-live-adapters.sh index 3cd5ab31..7c87667c 100755 --- a/scripts/real-world-live-adapters.sh +++ b/scripts/real-world-live-adapters.sh @@ -28,11 +28,12 @@ rm -rf "${REPORT_DIR:?}/elf-fixtures" \ "${REPORT_DIR:?}/elf-report.md" \ "${REPORT_DIR:?}/qmd-report.json" \ "${REPORT_DIR:?}/qmd-report.md" \ - "${REPORT_DIR:?}/lightrag" \ - "${REPORT_DIR:?}/graphrag" \ - "${REPORT_DIR:?}/graphiti-zep" \ - "${REPORT_DIR:?}/graphify" \ - "${REPORT_DIR:?}/summary.json" + "${REPORT_DIR:?}/ragflow" \ + "${REPORT_DIR:?}/lightrag" \ + "${REPORT_DIR:?}/graphrag" \ + "${REPORT_DIR:?}/graphiti-zep" \ + "${REPORT_DIR:?}/graphify" \ + "${REPORT_DIR:?}/summary.json" cd "${ROOT_DIR}" @@ -79,6 +80,11 @@ cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ --report "${REPORT_DIR}/qmd-report.json" \ --out "${REPORT_DIR}/qmd-report.md" +if [[ "${ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW:-0}" == "1" ]]; then + ELF_RAGFLOW_SMOKE_ARTIFACT_DIR="${REPORT_DIR}/ragflow" \ + bash scripts/ragflow-docker-evidence-smoke.sh +fi + if [[ "${ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG:-0}" == "1" ]]; then ELF_LIGHTRAG_CONTEXT_REPORT_DIR="${REPORT_DIR}/lightrag" \ ELF_LIGHTRAG_CONTEXT_FIXTURES="${ELF_LIGHTRAG_CONTEXT_FIXTURES:-${FIXTURE_DIR}/retrieval}" \ @@ -106,11 +112,34 @@ jq -n \ --slurpfile elf_report "${REPORT_DIR}/elf-report.json" \ --slurpfile qmd_report "${REPORT_DIR}/qmd-report.json" \ '{ - schema: "elf.real_world_live_adapter_sweep/v1", - generated_at: (now | todateiso8601), - artifact_dir: (env.ELF_REAL_WORLD_LIVE_REPORT_DIR // "tmp/real-world-memory/live-adapters"), - 
fixture_dir: (env.ELF_REAL_WORLD_LIVE_FIXTURES // "apps/elf-eval/fixtures/real_world_memory"), - adapters: [ + schema: "elf.real_world_live_adapter_sweep/v1", + generated_at: (now | todateiso8601), + artifact_dir: (env.ELF_REAL_WORLD_LIVE_REPORT_DIR // "tmp/real-world-memory/live-adapters"), + fixture_dir: (env.ELF_REAL_WORLD_LIVE_FIXTURES // "apps/elf-eval/fixtures/real_world_memory"), + graph_rag_smoke_controls: { + inclusion_flags: { + ragflow: (env.ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW // "0"), + lightrag: (env.ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG // "0"), + graphrag: (env.ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG // "0"), + graphiti_zep: (env.ELF_REAL_WORLD_LIVE_ENABLE_GRAPHITI_ZEP // "0"), + graphify: (env.ELF_REAL_WORLD_LIVE_ENABLE_GRAPHIFY // "0") + }, + live_attempt_boundary: "Inclusion flags only add smoke adapters to this aggregate sweep. Provider, service-start, and resource-heavy live attempts still require each adapter-specific control.", + service_start_controls: { + lightrag: (env.ELF_LIGHTRAG_CONTEXT_START // "0"), + graphiti_zep: (env.ELF_GRAPHITI_ZEP_SMOKE_START // "0") + }, + provider_or_resource_controls_forwarded: [ + "ELF_RAGFLOW_SMOKE_START", + "ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE", + "ELF_GRAPHRAG_SMOKE_RUN", + "ELF_GRAPHRAG_API_KEY", + "ELF_GRAPHITI_ZEP_SMOKE_RUN", + "ELF_GRAPHITI_ZEP_API_KEY", + "ELF_GRAPHIFY_SMOKE_RUN" + ] + }, + adapters: [ { adapter_id: "elf_live_real_world", evidence_class: "live_real_world", @@ -136,15 +165,33 @@ jq -n \ ] }' >"${REPORT_DIR}/summary.json" +if [[ -f "${REPORT_DIR}/ragflow/summary.json" ]]; then + jq \ + --slurpfile ragflow_summary "${REPORT_DIR}/ragflow/summary.json" \ + '.adapters += [ + { + adapter_id: $ragflow_summary[0].adapter_id, + evidence_class: $ragflow_summary[0].evidence_class, + status_boundary: $ragflow_summary[0].status_boundary, + scored_benchmark: $ragflow_summary[0].scored_benchmark, + materialization: $ragflow_summary[0].materialization, + report: $ragflow_summary[0].report + } + ]' 
"${REPORT_DIR}/summary.json" >"${REPORT_DIR}/summary.json.tmp" + mv "${REPORT_DIR}/summary.json.tmp" "${REPORT_DIR}/summary.json" +fi + if [[ -f "${REPORT_DIR}/lightrag/summary.json" ]]; then jq \ --slurpfile lightrag_summary "${REPORT_DIR}/lightrag/summary.json" \ '.adapters += [ { - adapter_id: $lightrag_summary[0].adapter_id, - evidence_class: $lightrag_summary[0].evidence_class, - materialization: $lightrag_summary[0].materialization, - report: $lightrag_summary[0].report + adapter_id: $lightrag_summary[0].adapter_id, + evidence_class: $lightrag_summary[0].evidence_class, + status_boundary: $lightrag_summary[0].status_boundary, + scored_benchmark: $lightrag_summary[0].scored_benchmark, + materialization: $lightrag_summary[0].materialization, + report: $lightrag_summary[0].report } ]' "${REPORT_DIR}/summary.json" >"${REPORT_DIR}/summary.json.tmp" mv "${REPORT_DIR}/summary.json.tmp" "${REPORT_DIR}/summary.json" @@ -155,15 +202,12 @@ if [[ -f "${REPORT_DIR}/graphrag/summary.json" ]]; then --slurpfile graphrag_summary "${REPORT_DIR}/graphrag/summary.json" \ '.adapters += [ { - adapter_id: $graphrag_summary[0].adapter_id, - evidence_class: $graphrag_summary[0].evidence_class, - materialization: $graphrag_summary[0].materialization, - report: { - json: "tmp/real-world-memory/live-adapters/graphrag/graphrag-smoke.json", - markdown: null, - summary: $graphrag_summary[0].materialization.status, - suites: $graphrag_summary[0].manifest.suites - } + adapter_id: $graphrag_summary[0].adapter_id, + evidence_class: $graphrag_summary[0].evidence_class, + status_boundary: $graphrag_summary[0].status_boundary, + scored_benchmark: $graphrag_summary[0].scored_benchmark, + materialization: $graphrag_summary[0].materialization, + report: $graphrag_summary[0].report } ]' "${REPORT_DIR}/summary.json" >"${REPORT_DIR}/summary.json.tmp" mv "${REPORT_DIR}/summary.json.tmp" "${REPORT_DIR}/summary.json" @@ -174,15 +218,12 @@ if [[ -f "${REPORT_DIR}/graphiti-zep/summary.json" ]]; then 
--slurpfile graphiti_summary "${REPORT_DIR}/graphiti-zep/summary.json" \ '.adapters += [ { - adapter_id: $graphiti_summary[0].adapter_id, - evidence_class: $graphiti_summary[0].evidence_class, - materialization: $graphiti_summary[0].materialization, - report: { - json: "tmp/real-world-memory/live-adapters/graphiti-zep/graphiti-zep-smoke.json", - markdown: null, - summary: $graphiti_summary[0].materialization.status, - suites: $graphiti_summary[0].manifest.suites - } + adapter_id: $graphiti_summary[0].adapter_id, + evidence_class: $graphiti_summary[0].evidence_class, + status_boundary: $graphiti_summary[0].status_boundary, + scored_benchmark: $graphiti_summary[0].scored_benchmark, + materialization: $graphiti_summary[0].materialization, + report: $graphiti_summary[0].report } ]' "${REPORT_DIR}/summary.json" >"${REPORT_DIR}/summary.json.tmp" mv "${REPORT_DIR}/summary.json.tmp" "${REPORT_DIR}/summary.json" @@ -193,15 +234,12 @@ if [[ -f "${REPORT_DIR}/graphify/summary.json" ]]; then --slurpfile graphify_summary "${REPORT_DIR}/graphify/summary.json" \ '.adapters += [ { - adapter_id: $graphify_summary[0].adapter_id, - evidence_class: $graphify_summary[0].evidence_class, - materialization: $graphify_summary[0].materialization, - report: { - json: "tmp/real-world-memory/live-adapters/graphify/graphify-smoke.json", - markdown: null, - summary: $graphify_summary[0].materialization.status, - suites: $graphify_summary[0].manifest.suites - } + adapter_id: $graphify_summary[0].adapter_id, + evidence_class: $graphify_summary[0].evidence_class, + status_boundary: $graphify_summary[0].status_boundary, + scored_benchmark: $graphify_summary[0].scored_benchmark, + materialization: $graphify_summary[0].materialization, + report: $graphify_summary[0].report } ]' "${REPORT_DIR}/summary.json" >"${REPORT_DIR}/summary.json.tmp" mv "${REPORT_DIR}/summary.json.tmp" "${REPORT_DIR}/summary.json" @@ -212,19 +250,31 @@ echo " ${REPORT_DIR}/elf-report.json" echo " ${REPORT_DIR}/elf-report.md" 
echo " ${REPORT_DIR}/qmd-report.json" echo " ${REPORT_DIR}/qmd-report.md" +if [[ -f "${REPORT_DIR}/ragflow/summary.json" ]]; then + echo " ${REPORT_DIR}/ragflow/ragflow-report.json" + echo " ${REPORT_DIR}/ragflow/ragflow-report.md" + echo " ${REPORT_DIR}/ragflow/summary.json" +fi if [[ -f "${REPORT_DIR}/lightrag/summary.json" ]]; then echo " ${REPORT_DIR}/lightrag/lightrag-report.json" echo " ${REPORT_DIR}/lightrag/lightrag-report.md" + echo " ${REPORT_DIR}/lightrag/summary.json" fi if [[ -f "${REPORT_DIR}/graphrag/summary.json" ]]; then + echo " ${REPORT_DIR}/graphrag/graphrag-report.json" + echo " ${REPORT_DIR}/graphrag/graphrag-report.md" echo " ${REPORT_DIR}/graphrag/graphrag-smoke.json" echo " ${REPORT_DIR}/graphrag/summary.json" fi if [[ -f "${REPORT_DIR}/graphiti-zep/summary.json" ]]; then + echo " ${REPORT_DIR}/graphiti-zep/graphiti-zep-report.json" + echo " ${REPORT_DIR}/graphiti-zep/graphiti-zep-report.md" echo " ${REPORT_DIR}/graphiti-zep/graphiti-zep-smoke.json" echo " ${REPORT_DIR}/graphiti-zep/summary.json" fi if [[ -f "${REPORT_DIR}/graphify/summary.json" ]]; then + echo " ${REPORT_DIR}/graphify/graphify-report.json" + echo " ${REPORT_DIR}/graphify/graphify-report.md" echo " ${REPORT_DIR}/graphify/graphify-smoke.json" echo " ${REPORT_DIR}/graphify/summary.json" fi