diff --git a/README.md b/README.md index 564a3be7..dde8c179 100644 --- a/README.md +++ b/README.md @@ -141,10 +141,11 @@ with the production embedding provider path, `Qwen3-Embedding-8B`, and search recovered the restored note. - Fresh all-project smoke run: ELF and qmd passed every encoded check. agentmemory passed same-corpus retrieval but failed lifecycle/cold-start coverage. memsearch, - mem0, OpenViking, and claude-mem remained `incomplete` or wrong-result typed states; - those states are reported as limitations, not hidden as proof. + mem0, OpenViking, and claude-mem remained typed non-pass states. OpenViking now + reaches its pinned Docker local embedding path and is reported as `wrong_result` + when same-corpus evidence terms are missed; setup failures remain `incomplete`. - Real-world agent memory aggregate after the P1 benchmark batch: 38 fixture-backed - jobs across 11 suites, 35 pass, 1 incomplete, 2 blocked, 0 wrong-result, + jobs across 11 suites, 36 pass, 0 incomplete, 2 blocked, 0 wrong-result, 0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are production-ops operator boundaries, not hidden benchmark wins. - Full-suite live real-world adapter sweep after XY-880: ELF and qmd now emit @@ -157,8 +158,8 @@ with the production embedding provider path, `Qwen3-Embedding-8B`, and manifest now includes `research_gate` records for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper qmd/OpenViking profiles. These records carry source/setup/runtime/resource/retry - metadata and typed `blocked`, `incomplete`, or `not_encoded` states; they are not - fixture-backed or live adapter pass evidence. + metadata and typed `blocked`, `incomplete`, `wrong_result`, or `not_encoded` states; + they are not fixture-backed or live adapter pass evidence. 
- The benchmark runner and report publisher are checked in and Docker-isolated: `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`, `cargo make baseline-production-private-addendum`, diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 0c0c0a69..e49d67ae 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -20,7 +20,7 @@ "evidence_class": "fixture_backed", "docker_default": true, "host_global_installs_required": false, - "overall_status": "incomplete", + "overall_status": "blocked", "setup": { "status": "pass", "evidence": "The checked-in real_world_memory fixtures parse and score through the ELF fixture runner.", @@ -28,13 +28,13 @@ "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, "run": { - "status": "incomplete", - "evidence": "The current fixture set reports 38 jobs, 35 pass, 1 incomplete, 2 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim.", + "status": "blocked", + "evidence": "The current fixture set reports 38 jobs, 36 pass, 0 incomplete, 2 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim.", "command": "cargo make real-world-memory", "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, "result": { - "status": "incomplete", + "status": "blocked", "evidence": "This is fixture-backed ELF scoring, not a live external adapter result.", "artifact": "tmp/real-world-memory/real-world-memory-report.md" }, @@ -103,8 +103,8 @@ }, { "suite_id": "production_ops", - "status": "incomplete", - "evidence": "Production-ops fixtures encode restore, Qdrant rebuild, backfill resume, resource-envelope interpretation, plus typed incomplete and blocked operator boundaries." 
+ "status": "blocked", + "evidence": "Production-ops fixtures encode restore, Qdrant rebuild, backfill resume, resource-envelope interpretation, OpenViking wrong-result classification, plus typed blocked operator boundaries." }, { "suite_id": "personalization", @@ -126,7 +126,7 @@ ], "notes": [ "This adapter record exists to keep ELF fixture results separate from live external adapter results.", - "The remaining non-pass ELF fixture states are production-ops operator boundaries: a Docker local-embedding dependency, provider credentials, and an operator-owned private corpus manifest.", + "The remaining non-pass ELF fixture states are production-ops operator boundaries: provider credentials and an operator-owned private corpus manifest.", "Use elf_live_real_world for service-runtime real_world_job evidence; this fixture-backed record must not imply live-service behavior." ] }, @@ -714,28 +714,33 @@ "evidence_class": "live_baseline_only", "docker_default": true, "host_global_installs_required": false, - "overall_status": "incomplete", + "overall_status": "wrong_result", "setup": { - "status": "incomplete", - "evidence": "OpenViking local-embed setup can fail in Docker while building or importing local embedding dependencies.", + "status": "pass", + "evidence": "OpenViking local-embed setup installed and imported pinned llama-cpp-python==0.3.28 from the CPU wheel index in Docker.", "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", "artifact": "tmp/live-baseline/OpenViking.log" }, "run": { - "status": "incomplete", - "evidence": "The adapter cannot reliably reach same-corpus add_resource/find behavior until local embedding setup is pinned for Docker.", + "status": "wrong_result", + "evidence": "The adapter reached same-corpus add_resource/find, but returned 0 of 3 expected evidence-term matches in the smoke run.", "artifact": "tmp/live-baseline/live-baseline-report.json" }, "result": { - "status": "incomplete", - "evidence": "No 
real_world_job OpenViking adapter is encoded; current blocker is dependency setup, not a quality claim.", + "status": "wrong_result", + "evidence": "The current OpenViking Docker evidence is a behavioral wrong_result, not a local embedding setup blocker and not a real_world_job pass.", "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" }, "capabilities": [ { "capability": "local_embed_setup", - "status": "incomplete", - "evidence": "Docker local embedding dependency setup is not reliable in the current adapter." + "status": "pass", + "evidence": "Docker local embedding dependency setup is pinned to llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu and reached import/runtime in the smoke run." + }, + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "OpenViking add_resource/find returned resources but missed expected evidence-term matches for every smoke query." }, { "capability": "context_trajectory", @@ -751,8 +756,8 @@ "suites": [ { "suite_id": "retrieval", - "status": "incomplete", - "evidence": "The local embedding install blocker prevents a fair retrieval job run." + "status": "wrong_result", + "evidence": "The Docker-local setup reached add_resource/find, but the retrieval check returned 0/3 expected evidence-term matches." }, { "suite_id": "work_resume", @@ -769,15 +774,37 @@ { "kind": "runner", "ref": "scripts/live-baseline-benchmark.sh", - "status": "incomplete" + "status": "wrong_result" } ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + }, + { + "label": "llama-cpp-python CPU wheel index", + "url": "https://abetlen.github.io/llama-cpp-python/whl/cpu", + "evidence": "Official prebuilt CPU wheel index used by the Docker-local embedding pin." 
+ } + ], + "setup_path": "Run ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker. The runner installs llama-cpp-python==0.3.28 with --only-binary llama-cpp-python from the CPU wheel index before OpenViking add_resource/find.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host-global OpenViking, llama-cpp-python, or model service install is required.", + "resource_expectation": "Local embedding setup may download a CPU wheel and model assets; record OpenViking.log, elapsed time, and cache size before claiming adapter quality.", + "retry_guidance": [ + "Use the default pinned CPU wheel path first.", + "Override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the default wheel is unavailable for the Docker platform.", + "Treat install/import failure as incomplete, not wrong_result; treat add_resource/find evidence misses as wrong_result." + ] + }, "notes": [ - "Record OpenViking as incomplete until Docker-compatible local embeddings are pinned; do not treat setup weight as a negative quality result." + "Record OpenViking as wrong_result now that the pinned Docker local embedding path reaches add_resource/find but misses expected evidence." ], "follow_up": { - "title": "[ELF benchmark adapter] Pin OpenViking Docker local embedding dependency path", - "reason": "The current adapter must reach add_resource/find before real-world job suites can be scored." + "title": "Fix OpenViking evidence-bearing same-corpus retrieval output", + "reason": "The current adapter reaches add_resource/find but must return evidence-bearing content before real-world job suites can be scored." 
} }, { @@ -940,26 +967,26 @@ "evidence_class": "research_gate", "docker_default": true, "host_global_installs_required": false, - "overall_status": "incomplete", + "overall_status": "not_encoded", "setup": { - "status": "incomplete", - "evidence": "OpenViking deep-profile work is blocked at the same Docker local-embedding dependency boundary as the current live-baseline adapter.", + "status": "pass", + "evidence": "The default pinned OpenViking local embedding dependency path reaches runtime in Docker.", "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", "artifact": "tmp/live-baseline/OpenViking.log" }, "run": { - "status": "incomplete", - "evidence": "The adapter cannot fairly exercise hierarchical trajectory behavior until add_resource/find reaches execution in Docker." + "status": "not_encoded", + "evidence": "The adapter cannot fairly exercise hierarchical trajectory behavior until same-corpus add_resource/find returns evidence-bearing results." }, "result": { - "status": "incomplete", - "evidence": "No OpenViking deep context-trajectory result is claimed from a setup-blocked run." + "status": "not_encoded", + "evidence": "No OpenViking deep context-trajectory result is claimed from the current wrong-result smoke run." }, "capabilities": [ { "capability": "docker_local_embed_setup", - "status": "incomplete", - "evidence": "The local embedding setup must be pinned before deep profile runs can execute." + "status": "pass", + "evidence": "The local embedding setup is pinned and reaches import/runtime in Docker." }, { "capability": "hierarchical_context_trajectory", @@ -975,8 +1002,8 @@ "suites": [ { "suite_id": "retrieval", - "status": "incomplete", - "evidence": "Same-corpus retrieval setup remains incomplete in Docker." + "status": "not_encoded", + "evidence": "Deep retrieval scoring is deferred until the smoke adapter returns evidence-bearing same-corpus output." 
}, { "suite_id": "work_resume", @@ -998,7 +1025,7 @@ { "kind": "runner", "ref": "scripts/live-baseline-benchmark.sh", - "status": "incomplete" + "status": "wrong_result" } ], "execution_metadata": { @@ -1009,17 +1036,18 @@ "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." } ], - "setup_path": "Pin a Docker-compatible local embedding path, then run OpenViking add_resource/find before any deep profile scoring.", + "setup_path": "Use the pinned Docker local embedding path from scripts/live-baseline-benchmark.sh, then run OpenViking add_resource/find before any deep profile scoring.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker.", - "resource_expectation": "Local embedding builds can be native-toolchain and model heavy; record build logs, model cache size, and elapsed time.", + "resource_expectation": "Local embedding setup can download CPU wheels and model assets; record build/import logs, model cache size, and elapsed time.", "retry_guidance": [ - "Pin or prebuild the local embedding dependency in the baseline image.", - "Only then add context-trajectory real_world_job scoring for hierarchical retrieval." + "Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first.", + "Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform.", + "Fix evidence-bearing same-corpus output before adding context-trajectory real_world_job scoring for hierarchical retrieval." ], - "research_depth": "D2 reviewed; runtime setup incomplete" + "research_depth": "D2 reviewed; local embedding setup pinned; deep profile not encoded" }, "notes": [ - "OpenViking remains a context-trajectory reference, but this gate prevents setup failure from becoming a quality judgment." 
+ "OpenViking remains a context-trajectory reference, but this gate prevents a smoke wrong_result from becoming a deep-profile claim." ] }, { diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/cold_start_missing_dependency_incomplete.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/cold_start_missing_dependency_incomplete.json index 8fcbfc39..5ff0912d 100644 --- a/apps/elf-eval/fixtures/real_world_memory/production_ops/cold_start_missing_dependency_incomplete.json +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/cold_start_missing_dependency_incomplete.json @@ -2,35 +2,62 @@ "schema": "elf.real_world_job/v1", "job_id": "production-ops-cold-start-dependency-001", "suite": "production_ops", - "title": "Preserve cold-start dependency failure as incomplete instead of pass", - "encoding": { - "status": "incomplete", - "reason": "The fixture records a cold-start dependency failure path that could not reach the behavioral check; this must remain incomplete rather than a silent pass.", - "follow_up": { - "title": "[ELF benchmark P0] Pin Docker-compatible local embedding dependency for cold-start adapter checks", - "reason": "The adapter cannot fairly test cold-start recovery until its local embedding dependency can build or import in Docker." 
- } - }, + "title": "Report pinned OpenViking cold-start path reaching behavioral wrong-result", + "encoding": {}, "corpus": { "corpus_id": "real-world-memory-production-ops-2026-06-10", "profile": "external_adapter", "items": [ { - "evidence_id": "local-embed-install-failure", + "evidence_id": "pinned-local-embed-runtime-reached", "kind": "adapter_state", - "text": "OpenViking cold-start check could not run because the Docker platform could not build or import llama-cpp-python for the local embedding path; the adapter status is incomplete with retrieval_status=local_embed_install_failed.", + "text": "The pinned OpenViking Docker local embedding path installed and imported llama-cpp-python==0.3.28, then reached OpenViking add_resource/find in the baseline runner.", "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": { "fixture": "cold_start_missing_dependency_incomplete", - "evidence_id": "local-embed-install-failure" + "evidence_id": "pinned-local-embed-runtime-reached" }, "locator": { - "quote": "could not build or import llama-cpp-python" + "quote": "installed and imported llama-cpp-python==0.3.28" } }, - "created_at": "2026-06-09T08:38:14Z" + "created_at": "2026-06-10T08:38:58Z" + }, + { + "evidence_id": "pinned-local-embed-retry", + "kind": "runbook", + "text": "The Docker retry path is ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker; the runner installs llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu with --only-binary llama-cpp-python before OpenViking add_resource/find.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "cold_start_missing_dependency_incomplete", + "evidence_id": "pinned-local-embed-retry" + }, + "locator": { + "quote": "llama-cpp-python==0.3.28" + } + }, + "created_at": "2026-06-10T00:00:00Z" + }, + { + "evidence_id": "openviking-wrong-result-behavior", + "kind": "adapter_state", + "text": "OpenViking now
records status=wrong_result and retrieval_status=retrieval_wrong_result because add_resource/find returned 0 of 3 expected evidence-term matches after the pinned local embedding setup succeeded.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "cold_start_missing_dependency_incomplete", + "evidence_id": "openviking-wrong-result-behavior" + }, + "locator": { + "quote": "status=wrong_result" + } + }, + "created_at": "2026-06-10T08:38:58Z" }, { "evidence_id": "typed-incomplete-policy", @@ -52,7 +79,7 @@ { "evidence_id": "dependency-pass-decoy", "kind": "adapter_state", - "text": "Decoy: missing local embedding dependency should be reported as pass because no retrieval mismatch occurred.", + "text": "Decoy: the pinned OpenViking run should be reported as pass because the dependency installed even though retrieval missed expected evidence terms.", "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", @@ -67,16 +94,36 @@ "adapter_response": { "adapter_id": "fixture_production_ops", "answer": { - "content": "The cold-start dependency failure is incomplete, not pass. The adapter could not build or import llama-cpp-python, so the behavioral cold-start check did not run and needs dependency repair before a pass claim.", + "content": "The pinned OpenViking Docker local embedding path reached add_resource/find. OpenViking now reports wrong_result/retrieval_wrong_result because all three smoke queries missed expected evidence terms. 
If the pinned llama-cpp-python install or import fails on another Docker platform, classify that setup boundary as incomplete, not pass.", "claims": [ { - "claim_id": "cold_start_dependency_incomplete", - "text": "The cold-start dependency failure is incomplete, not pass.", - "evidence_ids": ["local-embed-install-failure", "typed-incomplete-policy"], + "claim_id": "pinned_openviking_runtime_reached", + "text": "The pinned OpenViking Docker local embedding path reached add_resource/find.", + "evidence_ids": [ + "pinned-local-embed-runtime-reached", + "pinned-local-embed-retry" + ], + "confidence": "high" + }, + { + "claim_id": "openviking_wrong_result_after_runtime", + "text": "OpenViking now reports wrong_result/retrieval_wrong_result because all three smoke queries missed expected evidence terms.", + "evidence_ids": ["openviking-wrong-result-behavior"], + "confidence": "high" + }, + { + "claim_id": "setup_failure_stays_incomplete", + "text": "If the pinned llama-cpp-python install or import fails on another Docker platform, classify that setup boundary as incomplete, not pass.", + "evidence_ids": ["typed-incomplete-policy"], "confidence": "high" } ], - "evidence_ids": ["local-embed-install-failure", "typed-incomplete-policy"], + "evidence_ids": [ + "pinned-local-embed-runtime-reached", + "pinned-local-embed-retry", + "openviking-wrong-result-behavior", + "typed-incomplete-policy" + ], "latency_ms": 1.8, "cost": { "currency": "USD", @@ -89,12 +136,28 @@ }, "timeline": [ { - "event_id": "cold-start-dependency-failed", - "ts": "2026-06-09T08:38:14Z", + "event_id": "pinned-local-embed-runtime-reached", + "ts": "2026-06-10T08:38:58Z", + "actor": "tool", + "action": "reached_behavior_check", + "evidence_ids": ["pinned-local-embed-runtime-reached"], + "summary": "The pinned local embedding dependency installed and imported, and OpenViking add_resource/find executed." 
+ }, + { + "event_id": "pinned-local-embed-retry-recorded", + "ts": "2026-06-10T00:00:00Z", + "actor": "agent", + "action": "recorded_retry_path", + "evidence_ids": ["pinned-local-embed-retry"], + "summary": "The fixture records the Docker-local pinned llama-cpp-python retry command and wheel index." + }, + { + "event_id": "openviking-wrong-result-recorded", + "ts": "2026-06-10T08:38:58Z", "actor": "tool", - "action": "hit_dependency_failure", - "evidence_ids": ["local-embed-install-failure"], - "summary": "The cold-start adapter path stopped before behavioral scoring because a native dependency could not build or import." + "action": "classified_behavior", + "evidence_ids": ["openviking-wrong-result-behavior"], + "summary": "The OpenViking adapter reached retrieval behavior and missed all expected evidence-term checks." }, { "event_id": "typed-incomplete-retained", @@ -107,20 +170,33 @@ ], "prompt": { "role": "user", - "content": "How should the production-ops suite classify a cold-start check that cannot run because a dependency is missing?", + "content": "How should the production-ops suite classify the OpenViking cold-start local embedding path after the pinned Docker retry reaches add_resource/find but misses expected evidence?", "job_mode": "operate", "constraints": ["cite_evidence", "preserve_typed_status", "do_not_claim_pass"] }, "expected_answer": { "must_include": [ { - "claim_id": "cold_start_dependency_incomplete", - "text": "The cold-start dependency failure is incomplete, not pass." + "claim_id": "pinned_openviking_runtime_reached", + "text": "The pinned OpenViking Docker local embedding path reached add_resource/find." + }, + { + "claim_id": "openviking_wrong_result_after_runtime", + "text": "OpenViking now reports wrong_result/retrieval_wrong_result because all three smoke queries missed expected evidence terms." 
+ }, + { + "claim_id": "setup_failure_stays_incomplete", + "text": "If the pinned llama-cpp-python install or import fails on another Docker platform, classify that setup boundary as incomplete, not pass." } ], - "must_not_include": ["reported as pass"], + "must_not_include": ["reported as pass", "dependency failure is incomplete, not pass"], "evidence_links": { - "cold_start_dependency_incomplete": ["local-embed-install-failure", "typed-incomplete-policy"] + "pinned_openviking_runtime_reached": [ + "pinned-local-embed-runtime-reached", + "pinned-local-embed-retry" + ], + "openviking_wrong_result_after_runtime": ["openviking-wrong-result-behavior"], + "setup_failure_stays_incomplete": ["typed-incomplete-policy"] }, "answer_type": "direct_answer", "accepted_alternates": [], @@ -129,14 +205,26 @@ }, "required_evidence": [ { - "evidence_id": "local-embed-install-failure", - "claim_id": "cold_start_dependency_incomplete", + "evidence_id": "pinned-local-embed-runtime-reached", + "claim_id": "pinned_openviking_runtime_reached", + "requirement": "cite", + "quote": "installed and imported llama-cpp-python==0.3.28" + }, + { + "evidence_id": "pinned-local-embed-retry", + "claim_id": "pinned_openviking_runtime_reached", + "requirement": "cite", + "quote": "llama-cpp-python==0.3.28" + }, + { + "evidence_id": "openviking-wrong-result-behavior", + "claim_id": "openviking_wrong_result_after_runtime", "requirement": "cite", - "quote": "could not build or import llama-cpp-python" + "quote": "status=wrong_result" }, { "evidence_id": "typed-incomplete-policy", - "claim_id": "cold_start_dependency_incomplete", + "claim_id": "setup_failure_stays_incomplete", "requirement": "cite", "quote": "Use incomplete when install, import, build" } @@ -154,17 +242,17 @@ "lifecycle_behavior": { "weight": 0.35, "max_points": 1.0, - "criteria": "Would test cold-start behavior only after dependency setup succeeds." 
+ "criteria": "Distinguishes dependency setup reaching runtime from the remaining behavioral retrieval result." }, "evidence_grounding": { "weight": 0.3, "max_points": 1.0, - "criteria": "Cites dependency failure and typed-incomplete policy." + "criteria": "Cites the pinned runtime success, wrong-result behavior, and typed-incomplete fallback policy." }, "uncertainty_handling": { "weight": 0.2, "max_points": 1.0, - "criteria": "States that no pass claim is allowed." + "criteria": "States that setup failure would remain incomplete, but the current reached-runtime result is wrong_result." }, "trap_avoidance": { "weight": 0.15, @@ -180,8 +268,8 @@ }, "allowed_uncertainty": { "can_answer_unknown": true, - "acceptable_phrases": ["incomplete, not pass"], - "fallback_action": "state_blocker" + "acceptable_phrases": ["wrong_result/retrieval_wrong_result"], + "fallback_action": "state_current_wrong_result" }, - "tags": ["external_adapter", "production_ops", "cold_start", "dependency_boundary", "no_live_claim"] + "tags": ["external_adapter", "production_ops", "cold_start", "dependency_boundary", "wrong_result", "no_live_claim"] } diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 414e28fa..fe994564 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -239,7 +239,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/overall_status_counts/wrong_result") .and_then(Value::as_u64), - Some(5) + Some(6) ); assert_eq!( report @@ -251,19 +251,19 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/overall_status_counts/incomplete") .and_then(Value::as_u64), - Some(3) + Some(0) ); assert_eq!( report .pointer("/external_adapters/summary/overall_status_counts/blocked") .and_then(Value::as_u64), - Some(3) + Some(4) ); assert_eq!( report 
.pointer("/external_adapters/summary/overall_status_counts/not_encoded") .and_then(Value::as_u64), - Some(8) + Some(9) ); assert_eq!( report @@ -281,7 +281,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(10) + Some(11) ); } @@ -297,7 +297,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { let qmd_deep = find_by_field(adapters, "/adapter_id", "qmd_deep_profile_gate")?; assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); - assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert_eq!( elf_live.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world") @@ -320,7 +320,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { agentmemory.pointer("/capabilities/1/status").and_then(Value::as_str), Some("mocked") ); - assert_eq!(openviking.pointer("/overall_status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(openviking.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); assert_eq!(ragflow.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); assert_eq!(ragflow.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert_eq!( @@ -733,8 +733,8 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { let report = run_json_report_from(production_ops_fixture_dir())?; assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + 
assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); @@ -750,7 +750,7 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { let suites = array_at(&report, "/suites")?; let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; - assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); let jobs = array_at(&report, "/jobs")?; @@ -766,7 +766,7 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { assert_eq!(restore.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); assert_eq!(private_manifest.pointer("/status").and_then(Value::as_str), Some("blocked")); assert_eq!(credentials.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(dependency.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(dependency.pointer("/status").and_then(Value::as_str), Some("pass")); Ok(()) } @@ -782,9 +782,9 @@ fn assert_root_knowledge_summary(report: &Value) { fn assert_root_aggregate_summary(report: &Value) { assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(38)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(35)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(36)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); + 
assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); @@ -825,9 +825,9 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(82) + Some(84) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(82)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(84)); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); @@ -895,7 +895,7 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; - assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); Ok(()) diff --git a/docker-compose.baseline.yml b/docker-compose.baseline.yml index 1495166a..5793f66c 100644 --- a/docker-compose.baseline.yml +++ b/docker-compose.baseline.yml @@ -62,6 +62,8 @@ services: ELF_BASELINE_BACKFILL_RESUME_PROBE: ${ELF_BASELINE_BACKFILL_RESUME_PROBE:-} ELF_BASELINE_MAX_ELF_RSS_KB: ${ELF_BASELINE_MAX_ELF_RSS_KB:-1500000} ELF_BASELINE_MAX_ELF_SECONDS: ${ELF_BASELINE_MAX_ELF_SECONDS:-600} + ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX: ${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX:-} + 
ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION: ${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION:-} ELF_BASELINE_PROFILE: ${ELF_BASELINE_PROFILE:-smoke} ELF_BASELINE_PROJECTS: ${ELF_BASELINE_PROJECTS:-all} ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST: ${ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST:-} diff --git a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md index 632f1536..2868b4b8 100644 --- a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md +++ b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md @@ -10,10 +10,11 @@ and the live-baseline reports linked from this guide. Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, `docs/guide/benchmarking/real_world_agent_memory_benchmark.md`, and `docs/guide/benchmarking/live_baseline_benchmark.md`. -Verification: The commands listed below were run from branch `y/elf-xy-865`. The -generated reports used runner version -`0.2.0-89d30dc04a854771f2a62f607e1d13498ccb3073-aarch64-apple-darwin`; the working -tree also contained the adapter manifest refresh recorded here. +Verification: The original commands listed below were run from branch `y/elf-xy-865`. +XY-881 refreshed `cargo make real-world-memory`, `cargo make real-world-memory-production-ops`, +and `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker` from branch +`y/elf-xy-881`. Tables below include that refresh where the OpenViking cold-start +dependency boundary is discussed. Postscript: XY-880 superseded the live-adapter state in this report for ELF and qmd. The successor evidence is @@ -43,14 +44,14 @@ paths remain typed `blocked` boundaries, not passes. 
| Command | Generated artifact | Run ID | Generated at | | --- | --- | --- | --- | -| `cargo make real-world-memory` | `tmp/real-world-memory/real-world-memory-report.{json,md}` | `real-world-memory` | `2026-06-10T04:21:32.545027Z` | +| `cargo make real-world-memory` | `tmp/real-world-memory/real-world-memory-report.{json,md}` | `real-world-memory` | `2026-06-10T08:47:44.086502Z` | | `cargo make real-world-memory-project-decisions` | `tmp/real-world-memory/project-decisions/report.{json,md}` | `real-world-memory-project-decisions` | `2026-06-10T04:21:52.403238Z` | -| `cargo make real-world-memory-production-ops` | `tmp/real-world-memory/production-ops-report.{json,md}` | `real-world-memory-production-ops` | `2026-06-10T04:21:59.520163Z` | +| `cargo make real-world-memory-production-ops` | `tmp/real-world-memory/production-ops-report.{json,md}` | `real-world-memory-production-ops` | `2026-06-10T08:47:18.205778Z` | | `cargo make real-world-memory-evolution` | `tmp/real-world-memory/evolution-report.{json,md}` | `real-world-memory-evolution` | `2026-06-10T04:22:06.325152Z` | | `cargo make real-world-job-operator-ux` | `tmp/real-world-job/real-world-job-operator-ux-report.{json,md}` | `real-world-job-operator-ux` | `2026-06-10T04:22:12.28938Z` | -All generated reports used runner version -`0.2.0-89d30dc04a854771f2a62f607e1d13498ccb3073-aarch64-apple-darwin`. +The refreshed real-world-memory reports used runner version +`0.2.0-a8b25d00880bd3cf04707c3b2b328cd20a585396-aarch64-apple-darwin`. 
## Aggregate Result @@ -59,18 +60,18 @@ suites: | Metric | Value | | --- | ---: | -| Pass | `35` | -| Incomplete | `1` | +| Pass | `36` | +| Incomplete | `0` | | Blocked | `2` | | Wrong result | `0` | | Lifecycle fail | `0` | | Not encoded | `0` | | Unsupported claim | `0` | -| Mean score | `0.921` | -| Evidence coverage | `82/82` (`1.000`) | -| Source-ref coverage | `82/82` (`1.000`) | -| Quote coverage | `82/82` (`1.000`) | -| Expected evidence recall | `75/75` (`1.000`) | +| Mean score | `0.947` | +| Evidence coverage | `84/84` (`1.000`) | +| Source-ref coverage | `84/84` (`1.000`) | +| Quote coverage | `84/84` (`1.000`) | +| Expected evidence recall | `77/77` (`1.000`) | | Redaction leaks | `0` | | Scope violations | `0` | | Temporal validity gaps | `0` | @@ -89,7 +90,7 @@ Suite-level outcomes: | `knowledge_compilation` | 2 | `pass` | `1.000` | Derived page fixtures passed with citation/rebuild checks. | | `operator_debugging_ux` | 1 | `pass` | `1.000` | Aggregate stage-attribution fixture passed. | | `capture_integration` | 2 | `pass` | `1.000` | Redaction and capture-boundary fixtures passed. | -| `production_ops` | 6 | `incomplete` | `0.500` | Three jobs passed, one is a typed dependency `incomplete`, and two are typed operator `blocked`. | +| `production_ops` | 6 | `blocked` | `0.667` | Four jobs passed, including the pinned OpenViking cold-start classification, and two operator-owned boundaries remain `blocked`. | | `personalization` | 1 | `pass` | `1.000` | Scoped preference correction passed. | ## Focused P1 Slices @@ -99,7 +100,7 @@ Suite-level outcomes: | `cargo make real-world-memory-project-decisions` | 5 | `5` pass | Current decision, historical/reversed decision, validation gate, tradeoff rationale, and private-manifest caveat all passed. | | `cargo make real-world-memory-evolution` | 5 | `5` pass | Temporal relation validity is now encoded and passing; stale answers `0`, conflict detections `5`, update rationales `5`. 
| | `cargo make real-world-job-operator-ux` | 5 | `5` pass | Dropped evidence, rerank promotion, provider latency, rebuild change, and misleading relation-context debug cases passed with raw SQL needed `0`. | -| `cargo make real-world-memory-production-ops` | 6 | `3` pass, `1` incomplete, `2` blocked | Restore/Qdrant rebuild, interrupted backfill resume, and resource envelope passed; local embedding dependency, provider credentials, and private manifest remain typed non-pass boundaries. | +| `cargo make real-world-memory-production-ops` | 6 | `4` pass, `0` incomplete, `2` blocked | Restore/Qdrant rebuild, interrupted backfill resume, resource envelope, and pinned OpenViking cold-start classification passed; provider credentials and private manifest remain typed non-pass boundaries. | ## External Adapter Evidence @@ -112,7 +113,7 @@ separate: | --- | ---: | --- | | `fixture_backed` | 1 | ELF fixture scoring through checked-in real-world jobs. | | `live_baseline_only` | 6 | Docker same-corpus/lifecycle evidence from the live-baseline runner only. | -| `live_real_world` | 2 | Targeted ELF and qmd adapters execute representative `real_world_job` prompts and scoring. | +| `live_real_world` | 2 | ELF and qmd adapters execute the full encoded-suite `real_world_job` sweep with typed non-pass states preserved. | | `research_gate` | 12 | Source/setup/runtime/resource/retry metadata for future adapter paths; not fixture-backed or live execution evidence. | XY-882 added D1/D2 feasibility verdicts inside the research-gate lane. RAGFlow @@ -135,25 +136,28 @@ Adapter-level status after refreshing the manifest: | Project | Evidence class | Overall status | What is proven | What is not proven | | --- | --- | --- | --- | --- | -| ELF | `fixture_backed` | `incomplete` | Fixture-backed real-world scoring passes 10 of 11 suites, with production-ops typed boundaries preserved. | Fixture-backed scoring is not live-service behavior; cite `elf_live_real_world` for the targeted live slice. 
| -| ELF | `live_real_world` | `pass` | The targeted Docker slice materializes real_world_job answers through ElfService, worker indexing, and search_raw for work_resume, retrieval, and project_decisions. | This is not yet a full 11-suite live-service run or private-corpus proof. | -| qmd | `live_baseline_only` | `pass` | Docker same-corpus retrieval, update, delete, and cold-start live-baseline checks pass. | Same-corpus checks are not real-world job scoring; cite `qmd_live_real_world` for the targeted live slice. | -| qmd | `live_real_world` | `pass` | The targeted Docker slice indexes real_world_job corpora through qmd collection add/update/embed/query and scores generated answers. | This is not yet broad RAG/graph adapter coverage or full-suite external parity. | +| ELF | `fixture_backed` | `blocked` | Fixture-backed real-world scoring passes every non-operator-owned suite and preserves the production-ops credential/private-manifest boundaries. | Fixture-backed scoring is not live-service behavior; cite `elf_live_real_world` for service-runtime sweep evidence. | +| ELF | `live_real_world` | `wrong_result` | The Docker live sweep materializes all encoded real_world_job records through ElfService, worker indexing, and search_raw; the original targeted answer-retrieval slice still passes. | This is not a full-suite live pass or private-corpus proof; typed wrong_result, incomplete, blocked, and not_encoded states remain visible. | +| qmd | `live_baseline_only` | `pass` | Docker same-corpus retrieval, update, delete, and cold-start live-baseline checks pass. | Same-corpus checks are not real-world job scoring; cite `qmd_live_real_world` for service-runtime sweep evidence. | +| qmd | `live_real_world` | `wrong_result` | The Docker live sweep indexes the encoded real_world_job corpora through qmd collection add/update/embed/query and preserves per-suite scoring evidence. 
| This is not a full-suite live pass or broad RAG/graph adapter coverage; typed wrong_result, incomplete, blocked, and not_encoded states remain visible. | | agentmemory | `live_baseline_only` | `lifecycle_fail` | Same-corpus retrieval can run through current adapter. | Durable storage/cold-start lifecycle and real-world suites are blocked by the current in-memory adapter path. | | mem0/OpenMemory | `live_baseline_only` | `wrong_result` | Local OSS setup is represented separately from hosted/OpenMemory claims. | Same-corpus retrieval was not a clean pass and no real-world job adapter is encoded. | | memsearch | `live_baseline_only` | `wrong_result` | Markdown-first design remains a source-of-truth ergonomics reference. | Same-corpus retrieval was not a clean pass and real-world suites are incomplete/not encoded. | -| OpenViking | `live_baseline_only` | `incomplete` | Hierarchical context trajectory remains a reference direction. | Docker local-embedding setup must be pinned before fair retrieval or real-world jobs can run. | +| OpenViking | `live_baseline_only` | `wrong_result` | The Docker local-embedding setup is pinned and reaches `add_resource`/`find`. | The same-corpus smoke still misses expected evidence terms; no real-world job adapter or context-trajectory suite is claimed. | | claude-mem | `live_baseline_only` | `wrong_result` | Progressive disclosure and local viewer remain UX references. | Current Docker evidence is not a clean same-corpus pass and progressive disclosure jobs are not encoded. | | qmd deep profile | `research_gate` | `not_encoded` | The stress-profile command path and source metadata are recorded for a future deeper retrieval-debug run. | No expanded qmd stress artifact or broader real-world suite pass is checked in. | -| OpenViking deep profile | `research_gate` | `incomplete` | The deeper context-trajectory gate inherits the current Docker local-embedding setup blocker. | No hierarchical trajectory suite result is claimed. 
| +| OpenViking deep profile | `research_gate` | `not_encoded` | The deeper context-trajectory gate can reuse the pinned Docker local-embedding setup path. | No hierarchical trajectory suite result is claimed until evidence-bearing same-corpus output is fixed. | | RAGFlow, LightRAG, GraphRAG | `research_gate` | `blocked` | Official sources, setup/resource/retry expectations, and XY-882 adapter-candidate verdicts are recorded. | Docker runtime proof and real_world_job evidence-output mapping are still required before any live adapter claim. | | Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify | `research_gate` | `not_encoded` | XY-882 records Graphiti/Zep and graphify as adapter candidates, Letta/LangGraph/nanograph/llm-wiki as research-only, and gbrain as blocked. | No Docker-isolated `real_world_job` adapter has run for these projects. | External summary counters: `21` adapter records, `19` non-ELF adapter records, `21` Docker-default, `0` host-global-install requirements, `2` live real-world -adapters, and `12` research-gate records. Overall adapter statuses are `3` pass, -`3` wrong_result, `1` lifecycle_fail, `3` incomplete, `3` blocked, and -`8` not_encoded. +adapters, and `12` research-gate records. Overall adapter statuses are `1` pass, +`6` wrong_result, `1` lifecycle_fail, `0` incomplete, `4` blocked, and +`9` not_encoded. +Real-world suite statuses are tracked separately as `20` pass, `3` wrong_result, +`7` incomplete, `11` blocked, and `40` not_encoded, so a setup boundary is not hidden +behind an aggregate status. ## Remaining Gaps @@ -162,15 +166,15 @@ report: | Gap | Status | Follow-up or non-goal | | --- | --- | --- | -| ELF production-ops cold-start dependency fixture | `incomplete` | `[ELF benchmark P0] Pin Docker-compatible local embedding dependency for cold-start adapter checks`. 
| +| ELF production-ops cold-start dependency fixture | `pass` | XY-881 pins the Docker OpenViking local embedding path and preserves setup failures as `incomplete` if the wheel/import boundary fails on another platform. | | ELF provider-backed production-ops gate | `blocked` | Run only with routed operator credentials; credentials were not supplied for this report. | | ELF private production corpus | `blocked` | Supply an operator-owned sanitized private manifest; private-corpus checks were a non-goal without that manifest. | -| Full ELF live-service real-world sweep | `not_encoded` beyond targeted slice | Expand `elf_live_real_world` beyond representative work_resume, retrieval, and project_decisions jobs before claiming full live-service suite coverage. | -| Full qmd real-world job sweep | `not_encoded` beyond targeted slice | Expand `qmd_live_real_world` beyond the representative targeted slice before claiming broad real-world suite parity. | +| Full ELF live-service real-world sweep | `wrong_result` | XY-880 expanded `elf_live_real_world` to the full encoded suite corpus; the result is intentionally typed non-pass rather than a full-suite live pass. | +| Full qmd real-world job sweep | `wrong_result` | XY-880 expanded `qmd_live_real_world` to the full encoded suite corpus; the result is intentionally typed non-pass rather than broad real-world suite parity. | | agentmemory durable lifecycle | `lifecycle_fail` / `blocked` | `[ELF benchmark P0] Make agentmemory adapter lifecycle-durable and fail-typed`. | | mem0/OpenMemory same-corpus and real-world coverage | `wrong_result` / `not_encoded` | Add/fix a local OSS adapter before claiming lifecycle, personalization, or OpenMemory UI parity. | | memsearch same-corpus and real-world coverage | `wrong_result` / `incomplete` | Fix Docker same-corpus retrieval/reindex evidence before scoring Markdown-first real-world jobs. 
| -| OpenViking Docker local embedding path | `incomplete` | `[ELF benchmark adapter] Pin OpenViking Docker local embedding dependency path`. | +| OpenViking Docker local embedding path | `wrong_result` | The pinned dependency path reaches `add_resource`/`find`; the remaining follow-up is evidence-bearing retrieval output, not setup. | | claude-mem durable/progressive-disclosure adapter | `wrong_result` / `not_encoded` | Add durable local repository and progressive-disclosure job coverage before UX parity claims. | | RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify adapters | `research_gate` adapter candidates | Follow-up issues [XY-885](https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter), [XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter), [XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter), [XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter), and [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter) must run only Docker-contained adapter smokes that emit evidence-linked outputs before any live result claim. | | Letta, LangGraph, nanograph, and llm-wiki adapters | `research_only` research gates | Keep as architecture or workflow references until a contained output contract is selected. | diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index d757b304..d1d08e6d 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -157,11 +157,18 @@ Current deeper checks: stress default is a bounded 60-second signal. 
OpenViking attempts the official `.[local-embed]` path plus `OpenViking.add_resource` -and `OpenViking.find`. If the Docker platform cannot build or import -`llama-cpp-python`, the project is recorded as `incomplete` with +and `OpenViking.find`. The Docker runner first pins the local embedding dependency to +`llama-cpp-python==0.3.28` from the official CPU wheel index +`https://abetlen.github.io/llama-cpp-python/whl/cpu` and installs it with +`--only-binary llama-cpp-python`. Override +`ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION` or +`ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX` only when the pinned wheel is +unavailable for the Docker platform. If the pinned wheel cannot install or import, the +project is recorded as `incomplete` with `retrieval_status = "local_embed_install_failed"` rather than as a retrieval failure. -The adapter metadata includes retry guidance to pin or provide a Docker-compatible -local embedding dependency before scaling the OpenViking profile. +When the pinned dependency reaches `add_resource`/`find`, evidence misses are recorded +as `wrong_result`/`retrieval_wrong_result`. This local dependency check is separate +from provider-backed ELF/Qwen3 embedding evidence. ## Checked-In Reports diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 77277c5a..e4745d72 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -158,9 +158,9 @@ including the retrieval-quality slice below. The suite currently encodes: - `capture_integration`: write-policy audit behavior for redaction/private exclusion and fixture-backed capture/integration boundary classification. 
- `production_ops`: interrupted generated backfill resume, backup/restore plus - cold-start readback, resource-envelope interpretation, missing dependency - `incomplete` classification, missing private manifest `blocked` classification, and - provider credential boundary `blocked` classification. + cold-start readback, resource-envelope interpretation, pinned OpenViking local + embedding runtime/wrong-result classification, missing private manifest `blocked` + classification, and provider credential boundary `blocked` classification. - `personalization`: scoped stable preference correction without temporary or cross-project preference leakage. @@ -170,7 +170,7 @@ count, update rationale availability, temporal validity encoding count, scope correctness, redaction leak count, capture/integration behavior classes, Qdrant rebuild case/pass counts, expected evidence recall, irrelevant context ratio, latency/cost, answer-type plus caveat/refusal/uncertainty flags, and trace -explainability counters, production-ops blocked/incomplete job states, and +explainability counters, production-ops blocked/wrong-result job states, and private-corpus redaction policy. The fixtures include negative traps for stale blockers, unsupported prior claims, stale deleted facts, stale historical facts, cross-project preference leakage, private/redacted text leakage, obsolete retrieval @@ -232,8 +232,9 @@ remain `not_encoded` for this live adapter path. qmd still also keeps its separa record is not a real-world suite win. agentmemory is blocked on durable upstream storage for lifecycle proof. mem0/OpenMemory, memsearch, and claude-mem currently retain wrong-result or incomplete live-baseline states for the checked-in adapter -evidence. OpenViking is incomplete until its local embedding setup is reliable inside -Docker. The expanded RAG and graph-memory records for RAGFlow, LightRAG, GraphRAG, +evidence. 
OpenViking now reaches its pinned Docker local embedding setup but remains a +same-corpus `wrong_result` until it returns evidence-bearing retrieval output. The +expanded RAG and graph-memory records for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper qmd/OpenViking profiles are `research_gate` records until their Docker-isolated adapter runs are implemented. These typed states describe benchmark coverage; do not @@ -392,8 +393,12 @@ interpretation. The same slice deliberately keeps non-pass boundaries typed. A missing private production manifest is `blocked`, unavailable provider credentials are `blocked`, and -a cold-start adapter dependency failure is `incomplete`. These states are evidence for -operator caveats, not proof of private-corpus or provider-backed production success. +the OpenViking cold-start dependency fixture now records a pinned Docker-local +embedding path that reaches `OpenViking.add_resource` and `OpenViking.find` but returns +`wrong_result` evidence for the smoke queries. If the pinned wheel cannot install or +import on a Docker platform, that setup boundary remains `incomplete`. These states +are evidence for operator caveats, not proof of private-corpus, provider-backed +production, or external-adapter quality success. 
This suite does not run private corpus data, does not require or publish credentials, does not perform live Docker restore/backfill work, and does not reinterpret older diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md index a13f33e5..f969544c 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/guide/research/comparison_external_projects.md @@ -102,7 +102,7 @@ Project-to-suite map: | claude-mem | `rw.operator-continuity`, `rw.resume-evidence`, `rw.retrieval-debug` | Progressive-disclosure search, auto-capture hooks, local viewer, and observation/timeline workflows are directly aligned with real agent resumption jobs. | Exercise a real local repository with hook-driven capture, then evaluate `search -> timeline -> observations` behavior after restart; do not rely on mocked storage. | Docs-grounded for progressive disclosure/viewer; current benchmark adapter evidence is incomplete/wrong-result and mostly not encoded for lifecycle. Confidence: medium for product reference, low for current adapter claims. | ELF has stronger provenance and service boundaries, but claude-mem remains a reference for operator workflow and progressive disclosure UX. | | mem0 / OpenMemory | `rw.lifecycle-staleness`, `rw.graph-temporal`, `rw.operator-continuity`, `rw.resume-evidence` | Entity-scoped memory, memory history, expiration, hosted/OSS surfaces, OpenMemory UI, and optional graph memory make it the broadest lifecycle and ecosystem comparison target. | Separate OSS local FastEmbed/Qdrant evidence from hosted Platform claims; prove add/update/delete/history, entity-scoped retrieval, expiration exclusion, OpenMemory UI readback, and optional graph context on the same corpus. | Docs-grounded for lifecycle/entity/graph/UI claims; current local adapter is incomplete/wrong-result for same-corpus retrieval and delete remains not encoded. 
Confidence: medium for suite fit, low for current adapter quality. | ELF is stronger on deterministic evidence-bound writes; mem0/OpenMemory is the reference for ecosystem reach, entity-scoped history, hosted option, and optional graph UX. | | memsearch | `rw.lifecycle-staleness`, `rw.retrieval-debug`, `rw.resume-evidence` | Markdown as canonical memory plus incremental/content-addressed reindexing is a useful model for source transparency and rebuildable derived indexes. | Index a real-world Markdown corpus, mutate/delete files, rerun index/search from fresh processes, and record Milvus mode so Lite/Server/Cloud behavior is not conflated. | Docs-grounded for architecture; current adapter is incomplete/invalid-result, so no pass/fail quality claim is allowed. Confidence: medium for design pattern, low for current adapter evidence. | ELF already owns source-of-truth plus rebuildable index at service level; memsearch remains a reference for simple local canonical-store ergonomics. | -| OpenViking | `rw.context-trajectory`, `rw.resume-evidence`, `rw.retrieval-debug` | `viking://` context organization, intent analysis, hierarchical retrieval, staged find/search behavior, and session compression are relevant to multi-hop agent context jobs. | Pin or provide a Docker-compatible local embedding path, then evaluate `add_resource`/`find`/`search` over multi-stage jobs with stage output, hierarchy, and session memory evidence. | Docs-grounded for mechanism; current benchmark adapter is incomplete due local embedding install failure. Confidence: medium for architecture reference, low for runnable adapter quality. | ELF has first-class traces and evidence-bound notes, but OpenViking is the reference for hierarchical context trajectory and filesystem-like organization. 
| +| OpenViking | `rw.context-trajectory`, `rw.resume-evidence`, `rw.retrieval-debug` | `viking://` context organization, intent analysis, hierarchical retrieval, staged find/search behavior, and session compression are relevant to multi-hop agent context jobs. | Use the pinned Docker local embedding path, then evaluate `add_resource`/`find`/`search` over multi-stage jobs with stage output, hierarchy, and session memory evidence. | Docs-grounded for mechanism; current benchmark adapter reaches local embedding setup and `add_resource`/`find`, but remains `wrong_result` because same-corpus evidence terms are missed. Confidence: medium for architecture reference, low for runnable adapter quality. | ELF has first-class traces and evidence-bound notes, but OpenViking is the reference for hierarchical context trajectory and filesystem-like organization. | | llm-wiki | `rw.knowledge-synthesis`, `rw.resume-evidence` | Query/save/lint flows and topic-scoped wiki pages are a useful reference for turning retrieved memory into maintained project knowledge. | Run a corpus-to-wiki job, ask resume/decision questions, require page citations back to source memory, then mutate a stale source and prove lint/repair catches it. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for derived-knowledge fit. | ELF is not yet stronger on derived knowledge pages; llm-wiki should inform rebuildable, evidence-cited dossiers rather than core storage. | | gbrain | `rw.knowledge-synthesis`, `rw.operator-continuity` | `compiled_truth`, timeline sections, backlinks, primary-home routing, and enrichment workflows model a living operational brain for project work. | Build or update pages from the real-world corpus, require current-truth plus timeline answers, and prove enrichment/backlink maintenance does not hide unsupported claims. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for operator knowledge UX. 
| ELF should keep source notes authoritative; gbrain is a reference for presentation, enrichment, and maintenance loops. | | Always-On Memory Agent | `rw.consolidation-review`, `rw.operator-continuity` | The file/API/dashboard ingest loop and timer-based consolidation show how background memory formation becomes a user-visible product surface. | Run scheduled consolidation on a fixed corpus, record source rows and output insights, then score whether consolidation is reviewable, repeatable, and bounded against unsupported claims. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for consolidation workflow reference. | ELF should borrow scheduling and operator controls while keeping deterministic writes and reviewable derived outputs. | diff --git a/docs/guide/research/external_memory_improvement_plan.md b/docs/guide/research/external_memory_improvement_plan.md index 2e2e53a8..6ad45be2 100644 --- a/docs/guide/research/external_memory_improvement_plan.md +++ b/docs/guide/research/external_memory_improvement_plan.md @@ -33,7 +33,10 @@ Current encoded result: - ELF and qmd passed every encoded smoke check. - agentmemory passed same-corpus retrieval but failed or could not complete lifecycle checks. - mem0, memsearch, and claude-mem returned wrong same-corpus retrieval results in the encoded smoke. -- OpenViking was incomplete because its local embedding dependency could not complete inside the Docker runner. +- OpenViking was incomplete in the June 9 run because its local embedding dependency + could not complete inside the Docker runner. XY-881 later pinned the Docker path to + a CPU `llama-cpp-python` wheel and moved the current OpenViking state to + `wrong_result` when `add_resource`/`find` misses expected evidence terms. What this proves: @@ -83,7 +86,7 @@ Use these terms in future benchmark reports and Linear issues: | `pass` | Encoded check completed and returned expected result. | ELF same-corpus retrieval and lifecycle checks pass. 
| | `wrong_result` | The system completed but returned an incorrect memory or missed the expected evidence. | mem0/memsearch/claude-mem smoke retrieval mismatch. | | `lifecycle_fail` | Retrieval may work, but update/delete/cold-start/persistence behavior is wrong or incomplete. | agentmemory adapter passing retrieval but not lifecycle. | -| `incomplete` | The benchmark could not reach the behavioral check due to install/runtime/dependency failure. | OpenViking local embedding install failure in Docker. | +| `incomplete` | The benchmark could not reach the behavioral check due to install/runtime/dependency failure. | A pinned local embedding wheel/import failure before OpenViking `add_resource`/`find`. | | `not_encoded` | Capability is not currently covered by the benchmark, so no pass/fail claim is allowed. | Viewer quality and batch backfill UX. | | `blocked` | A safe test cannot run without external credentials, manual setup, or a dependency outside the issue scope. | Private corpus evaluation before sanitized corpus exists. | @@ -240,7 +243,9 @@ Implementation shape: Acceptance: - agentmemory adapter either passes durable lifecycle checks or is explicitly marked blocked with evidence. -- OpenViking incomplete state records a pinned dependency failure and retry path. +- OpenViking records a pinned Docker local embedding retry path; install/import + failure remains `incomplete`, while evidence misses after `add_resource`/`find` + are `wrong_result`. - qmd smoke pass remains covered and gains scale/stress profiles. - Real-world reports include adapter coverage counters before any external adapter is allowed to claim a real-world suite pass. 
diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh index 63f62465..d6f96758 100755 --- a/scripts/live-baseline-benchmark.sh +++ b/scripts/live-baseline-benchmark.sh @@ -2431,23 +2431,28 @@ project_openviking() { local config_path="${REPORT_DIR}/${project}-ov.conf" local result_path="${REPORT_DIR}/${project}-search.json" local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-openviking.py" - local local_embed_failure_pattern="llama-cpp-python|target specific option mismatch|failed-wheel-build-for-install|Failed building wheel|Failed to build llama-cpp-python|No module named 'llama_cpp'|Local embedding is enabled but 'llama-cpp-python' is not installed" + local constraints_path="${REPORT_DIR}/${project}-constraints.txt" + local llama_cpp_python_version="${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION:-0.3.28}" + local llama_cpp_python_index="${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX:-https://abetlen.github.io/llama-cpp-python/whl/cpu}" + local local_embed_failure_pattern="target specific option mismatch|failed-wheel-build-for-install|Failed building wheel for llama-cpp-python|Failed to build llama-cpp-python|Could not build wheels for llama-cpp-python|No module named 'llama_cpp'|Local embedding is enabled but 'llama-cpp-python' is not installed|No matching distribution found|Could not find a version that satisfies|not a supported wheel" + local local_embed_install_reason="OpenViking local-embed install failed in Docker for pinned llama-cpp-python==${llama_cpp_python_version} from the CPU wheel index, so same-corpus local retrieval could not be run" + local local_embed_command_summary="pip install -e .; openviking/ov --help; pip install llama-cpp-python==${llama_cpp_python_version} --extra-index-url ${llama_cpp_python_index} --only-binary llama-cpp-python; pip install -e .[local-embed]; OpenViking.add_resource/find" local head mkdir -p "${home}" - cat >"${REPORT_DIR}/${project}-adapter.json" <<'JSON' + cat 
>"${REPORT_DIR}/${project}-adapter.json" < '${constraints_path}' && .venv/bin/pip install --extra-index-url '${llama_cpp_python_index}' --only-binary llama-cpp-python -c '${constraints_path}' 'llama-cpp-python==${llama_cpp_python_version}' && .venv/bin/pip install --extra-index-url '${llama_cpp_python_index}' --only-binary llama-cpp-python -c '${constraints_path}' -e '.[local-embed]' && .venv/bin/python - <<'PY' +import llama_cpp + +print('llama_cpp_import_ok', getattr(llama_cpp, '__version__', 'unknown')) +PY"; then if rg -q "${local_embed_failure_pattern}" "${log_path}"; then - json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local-embed install failed in Docker while building llama-cpp-python for aarch64, so same-corpus local retrieval could not be run" "${project}.log" "pip install -e .; openviking/ov --help; pip install -e .[local-embed]" + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "${local_embed_install_reason}" "${project}.log" "${local_embed_command_summary}" return fi - json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local-embed install failed in Docker, so same-corpus local retrieval could not be run" "${project}.log" "pip install -e .; openviking/ov --help; pip install -e .[local-embed]" + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "${local_embed_install_reason}" "${project}.log" "${local_embed_command_summary}" return fi if rg -q "${local_embed_failure_pattern}" "${log_path}"; then - json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local-embed install returned success but the log contains llama-cpp-python build/import failure, so same-corpus local retrieval could not be run" "${project}.log" "pip install -e .; openviking/ov --help; pip install -e .[local-embed]" + json_record "${project}" "${repo}" "${head}" 
"incomplete" "local_embed_install_failed" "OpenViking pinned local-embed install returned success but the log contains llama-cpp-python wheel/import failure, so same-corpus local retrieval could not be run" "${project}.log" "${local_embed_command_summary}" return fi @@ -2682,11 +2691,11 @@ PY jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" fi if rg -q "${local_embed_failure_pattern}" "${log_path}"; then - json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local add_resource/find hit llama-cpp-python build/import failure, so same-corpus local retrieval could not be run" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local add_resource/find hit pinned llama-cpp-python wheel/import failure, so same-corpus local retrieval could not be run" "${project}.log" "${local_embed_command_summary}" return fi if [[ ! -s "${result_path}" ]] || ! jq -e . 
"${result_path}" >/dev/null 2>&1; then - json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "OpenViking local add_resource/find returned success but did not write a valid result JSON" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "OpenViking local add_resource/find returned success but did not write a valid result JSON" "${project}.log" "${local_embed_command_summary}" return fi if jq -e --argjson query_count "${QUERY_COUNT}" ' @@ -2701,19 +2710,19 @@ PY else retrieval_status="retrieval_wrong_result" fi - json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" + json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "${local_embed_command_summary}" return fi - json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "OpenViking local add_resource/find did not produce a valid benchmark result" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" + json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "OpenViking local add_resource/find did not produce a valid benchmark result" "${project}.log" "${local_embed_command_summary}" return fi if rg -q "${local_embed_failure_pattern}" "${log_path}"; then - json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local add_resource/find failed because llama-cpp-python was unavailable in Docker" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local add_resource/find 
failed because pinned llama-cpp-python was unavailable in Docker" "${project}.log" "${local_embed_command_summary}" return fi - json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "OpenViking local-embed installed, but same-corpus add_resource/find failed in Docker" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "OpenViking pinned local-embed installed, but same-corpus add_resource/find failed in Docker" "${project}.log" "${local_embed_command_summary}" } project_claude_mem() {