Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -821,9 +821,10 @@ args = [
# | real-world-memory-knowledge | composite | |
# | real-world-memory-knowledge-json | command | |
# | real-world-memory-knowledge-report | command | |
# | ragflow-docker-smoke | command | |
# | lightrag-docker-context-smoke | command | |
# | graphrag-docker-smoke | command | |
# | ragflow-docker-smoke | command | |
# | lightrag-docker-context-smoke | command | |
# | graphrag-docker-smoke | command | |
# | graphiti-zep-docker-temporal-smoke | command | |

[tasks.ragflow-docker-smoke]
workspace = false
Expand All @@ -848,6 +849,14 @@ args = [
"set -euo pipefail; docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_GRAPHRAG_SMOKE_RUN -e ELF_GRAPHRAG_SMOKE_REPORT_DIR -e ELF_GRAPHRAG_SMOKE_WORK_DIR -e ELF_GRAPHRAG_SMOKE_INSTALL -e ELF_GRAPHRAG_VERSION -e ELF_GRAPHRAG_PACKAGE -e ELF_GRAPHRAG_REF -e ELF_GRAPHRAG_CHAT_MODEL -e ELF_GRAPHRAG_EMBEDDING_MODEL -e ELF_GRAPHRAG_API_BASE -e ELF_GRAPHRAG_API_KEY -e ELF_GRAPHRAG_INDEX_METHOD -e ELF_GRAPHRAG_QUERY_METHOD -e ELF_GRAPHRAG_TIMEOUT_SECONDS -e ELF_GRAPHRAG_MAX_DOCS -e ELF_GRAPHRAG_MAX_INPUT_CHARS baseline-runner python3 scripts/graphrag-docker-smoke.py",
]

# Graphiti/Zep temporal smoke task: runs scripts/graphiti-zep-docker-temporal-smoke.py inside
# the baseline-runner service of docker-compose.baseline.yml. The whole flow is a single bash
# script handed to `bash -lc`, so statement order inside the string matters.
[tasks.graphiti-zep-docker-temporal-smoke]
workspace = false
command = "bash"
args = [
"-lc",
# Script steps, in order:
#   1. `set -euo pipefail`, then read ELF_GRAPHITI_ZEP_SMOKE_START via printenv; `|| true`
#      keeps an unset variable from aborting the script and leaves `start` empty.
#      NOTE(review): printenv is presumably used instead of a shell `${VAR:-}` default so that
#      cargo-make does not substitute the variable inside args - confirm.
#   2. If start is "1", bring up graphiti-falkordb (graphiti-zep profile) detached. A failure
#      here aborts the script because of `set -e`, before the smoke run starts.
#   3. Run the smoke script in baseline-runner (`run --build --rm`). Each bare `-e NAME`
#      names a variable without assigning a value here; presumably Docker forwards the value
#      from the invoking environment - confirm against the compose `run` reference.
#   4. `|| status=$?` records a failing exit code instead of letting `set -e` end the script,
#      so the cleanup step below still runs.
#   5. If start is "1", stop graphiti-falkordb; its output is discarded and a failure to
#      stop is ignored (`|| true`).
#   6. Exit with the status recorded from the smoke run (0 when it succeeded).
"set -euo pipefail; start=\"$(printenv ELF_GRAPHITI_ZEP_SMOKE_START || true)\"; status=0; if [ \"$start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile graphiti-zep up -d graphiti-falkordb; fi; docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_GRAPHITI_ZEP_SMOKE_RUN -e ELF_GRAPHITI_ZEP_SMOKE_REPORT_DIR -e ELF_GRAPHITI_ZEP_SMOKE_WORK_DIR -e ELF_GRAPHITI_ZEP_SMOKE_INSTALL -e ELF_GRAPHITI_ZEP_VERSION -e ELF_GRAPHITI_ZEP_PACKAGE -e ELF_GRAPHITI_ZEP_REF -e ELF_GRAPHITI_ZEP_API_BASE -e ELF_GRAPHITI_ZEP_API_KEY -e ELF_GRAPHITI_ZEP_LLM_MODEL -e ELF_GRAPHITI_ZEP_EMBEDDING_MODEL -e ELF_GRAPHITI_ZEP_FALKORDB_HOST -e ELF_GRAPHITI_ZEP_FALKORDB_PORT -e ELF_GRAPHITI_ZEP_FALKORDB_DATABASE -e ELF_GRAPHITI_ZEP_TIMEOUT_SECONDS -e ELF_GRAPHITI_ZEP_STARTUP_ATTEMPTS -e ELF_GRAPHITI_ZEP_STARTUP_INTERVAL_SECONDS baseline-runner python3 scripts/graphiti-zep-docker-temporal-smoke.py || status=$?; if [ \"$start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile graphiti-zep stop graphiti-falkordb >/dev/null 2>&1 || true; fi; exit \"$status\"",
]

[tasks.real-world-memory-knowledge]
workspace = false
dependencies = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1419,46 +1419,61 @@
"evidence_class": "research_gate",
"docker_default": true,
"host_global_installs_required": false,
"overall_status": "not_encoded",
"overall_status": "blocked",
"setup": {
"status": "not_encoded",
"evidence": "XY-882 marks Graphiti/Zep as an adapter_candidate, but no Docker temporal graph adapter is implemented."
"status": "blocked",
"evidence": "XY-888 adds a Docker-contained Graphiti/Zep temporal smoke command. The checked-in manifest remains a research gate until a generated artifact reaches Graphiti search output.",
"command": "cargo make graphiti-zep-docker-temporal-smoke",
"artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json"
},
"run": {
"status": "not_encoded",
"evidence": "No temporal graph fact add/query job is encoded."
"status": "blocked",
"evidence": "The default smoke records a typed setup/runtime failure if live execution is not explicitly enabled. Set ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration to start Docker-local FalkorDB and run Graphiti.",
"command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke",
"artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json"
},
"result": {
"status": "not_encoded",
"evidence": "No current-versus-historical real_world_job pass is claimed."
"status": "blocked",
"evidence": "No temporal graph quality result is claimed from the checked-in research gate. Generated smoke artifacts may become live_real_world only after Graphiti/Zep returns UUID, fact, valid_at, and invalid_at output mapped to generated memory_evolution evidence ids.",
"artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json"
},
"capabilities": [
{
"capability": "temporal_graph_memory",
"status": "not_encoded",
"evidence": "Temporal fact validity has a scoped adapter candidate path, but no executable adapter output is encoded."
"status": "blocked",
"evidence": "The smoke materializes generated current, historical, and rationale facts with validity windows, but the checked-in record stays blocked until a live artifact maps search output."
},
{
"capability": "docker_graph_store_setup",
"status": "blocked",
"evidence": "A safe local graph store, embedding, and LLM configuration must be documented before execution."
"evidence": "The task uses a Docker Compose graphiti-zep profile for FalkorDB and a container-local Python venv; no host-global graph database or hosted Zep service is used."
},
{
"capability": "real_world_job_adapter",
"status": "blocked",
"evidence": "The generated smoke fixture maps Graphiti/Zep temporal fact output to memory_evolution expected evidence ids when search output is available."
},
{
"capability": "quality_or_scale_claim",
"status": "not_encoded",
"evidence": "No Graphiti/Zep materializer or scorer mapping exists."
"evidence": "The smoke does not claim broad graph-memory quality, managed Zep service behavior, private-corpus behavior, or large-corpus performance."
}
],
"suites": [
{
"suite_id": "memory_evolution",
"status": "not_encoded",
"evidence": "Current/historical fact validity jobs are not encoded for Graphiti/Zep."
"status": "blocked",
"evidence": "Generated current/historical relation facts are encoded, but the checked-in manifest stays blocked until the Docker smoke returns validity-window search output."
},
{
"suite_id": "retrieval",
"status": "not_encoded",
"evidence": "Hybrid graph retrieval output is not mapped to evidence IDs."
"evidence": "Hybrid graph retrieval reachability is not scored beyond the temporal search smoke."
},
{
"suite_id": "production_ops",
"status": "not_encoded",
"evidence": "The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations."
}
],
"evidence": [
Expand All @@ -1471,6 +1486,16 @@
"kind": "source",
"ref": "https://www.getzep.com/platform/graphiti/",
"status": "real"
},
{
"kind": "command",
"ref": "cargo make graphiti-zep-docker-temporal-smoke",
"status": "blocked"
},
{
"kind": "artifact",
"ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json",
"status": "blocked"
}
],
"execution_metadata": {
Expand All @@ -1494,16 +1519,22 @@
"label": "Graphiti FalkorDB configuration",
"url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration",
"evidence": "Official Docker-local FalkorDB setup reference."
},
{
"label": "Graphiti fact triples",
"url": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples",
"evidence": "Official manual fact-triple ingest contract."
}
],
"setup_path": "Implement a Docker-local FalkorDB or Neo4j graph store and provider configuration, then encode add/query current-versus-historical fact jobs.",
"runtime_boundary": "Docker-only service or SDK run with graph store state under benchmark artifacts.",
"resource_expectation": "Requires graph store plus LLM/embedding configuration; record service startup, storage size, and provider boundaries.",
"setup_path": "Run cargo make graphiti-zep-docker-temporal-smoke for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.",
"runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.",
"resource_expectation": "Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring.",
"retry_guidance": [
"Prototype a tiny temporal fact add/query run.",
"Map valid_at/invalid_at evidence to memory_evolution scoring."
"Run cargo make graphiti-zep-docker-temporal-smoke first to produce a typed blocked artifact.",
"Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.",
"Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass."
],
"research_depth": "D1 feasibility verdict: adapter_candidate (XY-882); research_gate only, adapter not encoded"
"research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output"
},
"follow_up": {
"title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter",
Expand Down
33 changes: 30 additions & 3 deletions apps/elf-eval/tests/real_world_job_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,13 +257,13 @@ fn assert_external_adapter_manifest_summary(report: &Value) {
report
.pointer("/external_adapters/summary/overall_status_counts/blocked")
.and_then(Value::as_u64),
Some(4)
Some(5)
);
assert_eq!(
report
.pointer("/external_adapters/summary/overall_status_counts/not_encoded")
.and_then(Value::as_u64),
Some(9)
Some(8)
);
assert_eq!(
report
Expand All @@ -281,7 +281,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) {
report
.pointer("/external_adapters/summary/suite_status_counts/blocked")
.and_then(Value::as_u64),
Some(8)
Some(9)
);
}

Expand All @@ -296,6 +296,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> {
let ragflow = find_by_field(adapters, "/adapter_id", "ragflow_research_gate")?;
let lightrag = find_by_field(adapters, "/adapter_id", "lightrag_research_gate")?;
let graphrag = find_by_field(adapters, "/adapter_id", "graphrag_research_gate")?;
let graphiti_zep = find_by_field(adapters, "/adapter_id", "graphiti_zep_research_gate")?;
let qmd_deep = find_by_field(adapters, "/adapter_id", "qmd_deep_profile_gate")?;

assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed"));
Expand Down Expand Up @@ -363,6 +364,32 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> {
Some("cargo make graphrag-docker-smoke")
);
assert_eq!(graphrag.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded"));
assert_eq!(
graphiti_zep.pointer("/evidence_class").and_then(Value::as_str),
Some("research_gate")
);
assert_eq!(graphiti_zep.pointer("/overall_status").and_then(Value::as_str), Some("blocked"));
assert_eq!(
graphiti_zep.pointer("/setup/command").and_then(Value::as_str),
Some("cargo make graphiti-zep-docker-temporal-smoke")
);
assert_eq!(
graphiti_zep.pointer("/run/command").and_then(Value::as_str),
Some(
"ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke"
)
);
assert_eq!(
graphiti_zep.pointer("/suites/0/suite_id").and_then(Value::as_str),
Some("memory_evolution")
);
assert_eq!(graphiti_zep.pointer("/suites/0/status").and_then(Value::as_str), Some("blocked"));
assert_eq!(
graphiti_zep.pointer("/execution_metadata/research_depth").and_then(Value::as_str),
Some(
"D2 feasibility plus XY-888 Docker temporal smoke implementation; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output"
)
);
assert_eq!(
qmd_deep.pointer("/capabilities/2/status").and_then(Value::as_str),
Some("unsupported")
Expand Down
8 changes: 8 additions & 0 deletions docker-compose.baseline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ services:
- elf-live-baseline-lightrag-inputs:/app/data/inputs
- elf-live-baseline-lightrag-prompts:/app/data/prompts

# FalkorDB service for the Graphiti/Zep smoke. It is assigned to the graphiti-zep profile,
# so Compose leaves it out unless that profile is enabled or the service is targeted explicitly.
graphiti-falkordb:
profiles:
- graphiti-zep
# Image can be overridden with ELF_GRAPHITI_ZEP_FALKORDB_IMAGE; the fallback is
# falkordb/falkordb:edge.
# NOTE(review): "edge" looks like a moving tag - consider pinning a version for
# reproducible runs; confirm against the FalkorDB image tags.
image: ${ELF_GRAPHITI_ZEP_FALKORDB_IMAGE:-falkordb/falkordb:edge}
# Named volume elf-live-baseline-graphiti-falkordb is mounted at /data in the container.
volumes:
- elf-live-baseline-graphiti-falkordb:/data

baseline-runner:
build:
context: .
Expand Down Expand Up @@ -149,6 +156,7 @@ services:
volumes:
elf-live-baseline-cargo-git:
elf-live-baseline-cargo-registry:
elf-live-baseline-graphiti-falkordb:
elf-live-baseline-huggingface-cache:
elf-live-baseline-lightrag-inputs:
elf-live-baseline-lightrag-prompts:
Expand Down
Loading