From 8764e1ccee9e5723e16ab2c8902661c07abed4d5 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 10 Jun 2026 15:14:26 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add live real-world adapters for ELF and qmd","authority":"XY-868"} --- Makefile.toml | 9 + README.md | 28 +- .../memory_projects_manifest.json | 171 ++- .../project_decision_fixture_boundary.json | 133 ++ .../retrieval_claim_boundary.json | 133 ++ .../work_resume_exact_next_action.json | 133 ++ .../src/bin/real_world_job_benchmark.rs | 59 +- .../src/bin/real_world_live_adapter.rs | 1234 +++++++++++++++++ .../tests/real_world_job_benchmark.rs | 27 +- ...2026-06-10-real-world-comparison-report.md | 23 +- .../benchmarking/live_baseline_benchmark.md | 24 +- .../real_world_agent_memory_benchmark.md | 35 +- .../research/comparison_external_projects.md | 7 +- scripts/real-world-live-adapters.sh | 116 ++ 14 files changed, 2080 insertions(+), 52 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_live_adapters/project_decision_fixture_boundary.json create mode 100644 apps/elf-eval/fixtures/real_world_live_adapters/retrieval_claim_boundary.json create mode 100644 apps/elf-eval/fixtures/real_world_live_adapters/work_resume_exact_next_action.json create mode 100644 apps/elf-eval/src/bin/real_world_live_adapter.rs create mode 100755 scripts/real-world-live-adapters.sh diff --git a/Makefile.toml b/Makefile.toml index 2945dc1c..ebe6d208 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -418,6 +418,7 @@ args = [ # | real-world-memory-production-ops | composite | | # | real-world-memory-production-ops-json | command | | # | real-world-memory-production-ops-report | command | | +# | real-world-memory-live-adapters | command | | [tasks.real-world-job-smoke] workspace = false @@ -805,6 +806,14 @@ args = [ "tmp/real-world-memory/consolidation/report.md", ] +[tasks.real-world-memory-live-adapters] +workspace = false +command = "bash" +args = [ + "-lc", + "set -euo pipefail; docker 
compose -f docker-compose.baseline.yml run --build --rm baseline-runner bash scripts/real-world-live-adapters.sh", +] + # Real-world memory knowledge benchmark # | task | type | cwd | diff --git a/README.md b/README.md index 60535d0f..4fc5cf10 100644 --- a/README.md +++ b/README.md @@ -147,14 +147,20 @@ with the production embedding provider path, `Qwen3-Embedding-8B`, and jobs across 11 suites, 35 pass, 1 incomplete, 2 blocked, 0 wrong-result, 0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are production-ops operator boundaries, not hidden benchmark wins. +- Targeted live real-world adapter slice after XY-868: ELF and qmd now have + Docker-isolated `live_real_world` records for representative `work_resume`, + `retrieval`, and `project_decisions` jobs through + `cargo make real-world-memory-live-adapters`. This does not imply full-suite + live-service parity, broad adapter parity, or private-corpus production proof. - The benchmark runner and report publisher are checked in and Docker-isolated: `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`, `cargo make baseline-production-private-addendum`, `cargo make baseline-backfill-10k-docker`, `cargo make baseline-backfill-100k-docker`, - `cargo make baseline-soak-docker`, `cargo make baseline-live-report`, and - `cargo make baseline-live-docker-clean`. Expensive 100k and long-soak profiles are - opt-in and do not run in normal checks. + `cargo make baseline-soak-docker`, `cargo make baseline-live-report`, + `cargo make real-world-memory-live-adapters`, and + `cargo make baseline-live-docker-clean`. Expensive 100k and long-soak profiles + are opt-in and do not run in normal checks. Detailed evidence and interpretation: @@ -170,8 +176,8 @@ Detailed evidence and interpretation: now reports fixture-backed ELF evidence plus the external adapter coverage manifest for ELF, qmd, agentmemory, mem0/OpenMemory, claude-mem, memsearch, and OpenViking. 
The report still distinguishes fixture-backed and live-baseline-only evidence from - true live real-world adapter runs; no external project has a live real-world suite win - until an adapter actually executes `real_world_job` prompts and scoring. + true live real-world adapter runs; only the targeted ELF and qmd live adapter slice + currently executes `real_world_job` prompts and scoring. Evidence-backed position after the June 10 real-world report: @@ -179,12 +185,12 @@ Evidence-backed position after the June 10 real-world report: deterministic ingestion boundaries, Postgres source-of-truth plus rebuildable Qdrant indexing, scoped service APIs, and fixture-backed provenance/resume/evolution checks. - ELF and qmd are both strong in the current encoded retrieval evidence: qmd remains - the local retrieval-debug baseline, while ELF has the stronger service and provenance - contract. -- ELF is still behind or not yet proven on live real-world external adapters, - private-corpus production quality, credentialed production-ops gates, qmd-style local - debug knobs, agentmemory/claude-mem/OpenMemory-style continuity UX, OpenViking-style - context trajectory, and hosted managed memory. + the local retrieval-debug baseline and now has targeted live real-world job evidence, + while ELF has the stronger service and provenance contract. +- ELF is still behind or not yet proven on full-suite live real-world external + adapters, private-corpus production quality, credentialed production-ops gates, + qmd-style local debug knobs, agentmemory/claude-mem/OpenMemory-style continuity UX, + OpenViking-style context trajectory, and hosted managed memory. Quick comparison snapshot (objective/high-level). This table compares capability coverage, not overall project quality. 
diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 1c37fc4c..8b9f0f61 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -126,12 +126,90 @@ ], "notes": [ "This adapter record exists to keep ELF fixture results separate from live external adapter results.", - "The remaining non-pass ELF fixture states are production-ops operator boundaries: a Docker local-embedding dependency, provider credentials, and an operator-owned private corpus manifest." + "The remaining non-pass ELF fixture states are production-ops operator boundaries: a Docker local-embedding dependency, provider credentials, and an operator-owned private corpus manifest.", + "Use elf_live_real_world for service-runtime real_world_job evidence; this fixture-backed record must not imply live-service behavior." 
+ ] + }, + { + "adapter_id": "elf_live_real_world", + "project": "ELF", + "adapter_kind": "docker_service_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live adapter task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + "run": { + "status": "pass", + "evidence": "ELF materializes real_world_job adapter_response objects through ElfService, worker indexing, and search_raw before scoring.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.json" + }, + "result": { + "status": "pass", + "evidence": "The representative live adapter slice scores work_resume, retrieval, and project_decisions jobs from generated runtime answers.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes real_world_job prompts after runtime ingestion and writes generated answer artifacts before scoring." + }, + { + "capability": "service_runtime_execution", + "status": "real", + "evidence": "The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "Adapter setup/runtime failures are materialized as incomplete jobs with evidence JSON instead of silent claim upgrades." 
+ } ], - "follow_up": { - "title": "[ELF benchmark vNext] Replace fixture-only ELF answers with live real-world adapter execution where appropriate", - "reason": "The current report proves fixture scoring, not an end-to-end live real-world memory service run." - } + "suites": [ + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "The live adapter retrieves the current next-action evidence and avoids the stale same-corpus command trap." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "The live adapter retrieves the live_real_world claim boundary from the indexed corpus." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "The live adapter retrieves the decision that fixture_backed results must not imply service-runtime behavior." + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_live_adapters/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This is the first Docker-isolated live real_world_job adapter path for ELF; broader suite expansion remains separate from the fixture-backed aggregate." + ] }, { "adapter_id": "qmd_live_baseline", @@ -205,7 +283,88 @@ } ], "notes": [ - "Do not claim a qmd real-world suite pass until a real_world_job adapter executes qmd and records job-level evidence." + "This same-corpus record remains separate from qmd_live_real_world, which records real_world_job prompt execution and scoring evidence." 
+ ] + }, + { + "adapter_id": "qmd_live_real_world", + "project": "qmd", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live adapter task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-materialization.json" + }, + "run": { + "status": "pass", + "evidence": "qmd indexes each real_world_job corpus through collection add, update, embed, and query --json before scoring generated answers.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" + }, + "result": { + "status": "pass", + "evidence": "The representative live adapter slice scores qmd on work_resume, retrieval, and project_decisions jobs rather than same-corpus smoke checks only.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "qmd executes real_world_job prompts through its local CLI retrieval/query workflow and records generated answer artifacts." + }, + { + "capability": "local_cli_retrieval", + "status": "real", + "evidence": "The adapter uses qmd collection add, update, embed -f, and query --json inside Docker." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "qmd setup/runtime failures are materialized as incomplete jobs with command evidence and retry artifacts." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "qmd retrieves the current next-action evidence and avoids the stale same-corpus command trap." 
+ }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "qmd retrieves the live_real_world claim boundary from indexed real_world_job corpus files." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "qmd retrieves the decision that fixture_backed results must not imply service-runtime behavior." + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_live_adapters/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/qmd-report.json", + "status": "pass" + } + ], + "notes": [ + "This qmd record is real-world job evidence and must not be conflated with the same-corpus qmd_live_baseline record." ] }, { diff --git a/apps/elf-eval/fixtures/real_world_live_adapters/project_decision_fixture_boundary.json b/apps/elf-eval/fixtures/real_world_live_adapters/project_decision_fixture_boundary.json new file mode 100644 index 00000000..e0da7b8e --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_live_adapters/project_decision_fixture_boundary.json @@ -0,0 +1,133 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "live-adapter-project-decision-boundary-001", + "suite": "project_decisions", + "title": "Live adapter retrieves the decision that fixture scoring must not imply service behavior", + "corpus": { + "corpus_id": "real-world-live-adapters-2026-06-10", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "fixture-live-service-boundary", + "kind": "decision", + "text": "Current adapter decision: fixture_backed results must not imply live-service behavior; live_real_world evidence is required before service/runtime superiority claims.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "project_decision_fixture_boundary", + "evidence_id": 
"fixture-live-service-boundary" + } + }, + "created_at": "2026-06-10T06:20:00Z" + }, + { + "evidence_id": "old-fixture-superiority-trap", + "kind": "decision", + "text": "Old adapter decision: fixture_backed scoring alone proves live-service superiority for ELF.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "project_decision_fixture_boundary", + "evidence_id": "old-fixture-superiority-trap" + } + }, + "created_at": "2026-06-09T06:20:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "old-fixture-superiority-recorded", + "ts": "2026-06-09T06:20:00Z", + "actor": "agent", + "action": "recorded_old_decision", + "evidence_ids": ["old-fixture-superiority-trap"], + "summary": "The old decision incorrectly treated fixture-backed scoring as live service proof." + }, + { + "event_id": "fixture-live-boundary-recorded", + "ts": "2026-06-10T06:20:00Z", + "actor": "agent", + "action": "recorded_current_decision", + "evidence_ids": ["fixture-live-service-boundary"], + "summary": "The current decision requires live_real_world evidence before service/runtime superiority claims." + } + ], + "prompt": { + "role": "user", + "content": "What is the current decision about fixture_backed scoring and live-service behavior claims?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_stale_facts"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "fixture_boundary", + "text": "Current adapter decision: fixture_backed results must not imply live-service behavior; live_real_world evidence is required before service/runtime superiority claims." + } + ], + "must_not_include": [ + "Old adapter decision: fixture_backed scoring alone proves live-service superiority for ELF." 
+ ], + "evidence_links": { + "fixture_boundary": ["fixture-live-service-boundary"] + }, + "answer_type": "decision", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "fixture-live-service-boundary", + "claim_id": "fixture_boundary", + "requirement": "cite", + "quote": "fixture_backed results must not imply live-service behavior" + } + ], + "negative_traps": [ + { + "trap_id": "old-fixture-superiority-claim", + "type": "stale_fact", + "evidence_ids": ["old-fixture-superiority-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "States the current fixture-backed boundary." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the current decision evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids the stale superiority decision." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps README/adoption claim boundaries clear." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The live adapter did not retrieve that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "live_real_world", "project_decisions"] +} diff --git a/apps/elf-eval/fixtures/real_world_live_adapters/retrieval_claim_boundary.json b/apps/elf-eval/fixtures/real_world_live_adapters/retrieval_claim_boundary.json new file mode 100644 index 00000000..8302311c --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_live_adapters/retrieval_claim_boundary.json @@ -0,0 +1,133 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "live-adapter-retrieval-claim-boundary-001", + "suite": "retrieval", + "title": "Live adapter retrieves the live-real-world claim boundary", + "corpus": { + "corpus_id": "real-world-live-adapters-2026-06-10", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "live-real-world-claim-boundary", + "kind": "decision", + "text": "Live adapter claim boundary: qmd and ELF may be reported as `live_real_world` only when generated JSON and Markdown artifacts include command evidence, artifact paths, and typed status.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "retrieval_claim_boundary", + "evidence_id": "live-real-world-claim-boundary" + } + }, + "created_at": "2026-06-10T06:10:00Z" + }, + { + "evidence_id": "fixture-only-claim-trap", + "kind": "decision", + "text": "Incorrect claim: fixture-only ELF scoring is enough to imply live service behavior for real-world jobs.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "retrieval_claim_boundary", + "evidence_id": "fixture-only-claim-trap" 
+ } + }, + "created_at": "2026-06-09T06:10:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "fixture-only-trap-recorded", + "ts": "2026-06-09T06:10:00Z", + "actor": "agent", + "action": "recorded_invalid_claim", + "evidence_ids": ["fixture-only-claim-trap"], + "summary": "An invalid claim conflated fixture-only scoring with live service behavior." + }, + { + "event_id": "live-real-world-boundary-recorded", + "ts": "2026-06-10T06:10:00Z", + "actor": "agent", + "action": "recorded_claim_boundary", + "evidence_ids": ["live-real-world-claim-boundary"], + "summary": "The live claim boundary requires generated JSON/Markdown artifacts and typed status." + } + ], + "prompt": { + "role": "user", + "content": "When may qmd and ELF be reported as live_real_world in the real-world benchmark?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_unsupported_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "claim_boundary", + "text": "Live adapter claim boundary: qmd and ELF may be reported as `live_real_world` only when generated JSON and Markdown artifacts include command evidence, artifact paths, and typed status." + } + ], + "must_not_include": [ + "Incorrect claim: fixture-only ELF scoring is enough to imply live service behavior for real-world jobs." 
+ ], + "evidence_links": { + "claim_boundary": ["live-real-world-claim-boundary"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "live-real-world-claim-boundary", + "claim_id": "claim_boundary", + "requirement": "use", + "quote": "generated JSON and Markdown artifacts include command evidence" + } + ], + "negative_traps": [ + { + "trap_id": "fixture-only-live-claim", + "type": "unsupported_claim", + "evidence_ids": ["fixture-only-claim-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "States the artifact and typed-status boundary for live_real_world claims." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Uses the live-real-world claim boundary evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids the fixture-only live-service claim." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps the claim boundary explicit." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The live adapter did not retrieve that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "live_real_world", "retrieval"] +} diff --git a/apps/elf-eval/fixtures/real_world_live_adapters/work_resume_exact_next_action.json b/apps/elf-eval/fixtures/real_world_live_adapters/work_resume_exact_next_action.json new file mode 100644 index 00000000..66128882 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_live_adapters/work_resume_exact_next_action.json @@ -0,0 +1,133 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "live-adapter-work-resume-next-action-001", + "suite": "work_resume", + "title": "Live adapter retrieves the current next action instead of a stale baseline command", + "corpus": { + "corpus_id": "real-world-live-adapters-2026-06-10", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "xy868-current-next-action", + "kind": "runbook", + "text": "Exact next action for XY-868: run `cargo make real-world-memory-live-adapters`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing branch y/elf-xy-868.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "work_resume_exact_next_action", + "evidence_id": "xy868-current-next-action" + } + }, + "created_at": "2026-06-10T06:00:00Z" + }, + { + "evidence_id": "xy868-stale-baseline-command", + "kind": "runbook", + "text": "Old XY-868 note: only run `cargo make baseline-live-docker`; do not add live real-world adapter evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "work_resume_exact_next_action", + 
"evidence_id": "xy868-stale-baseline-command" + } + }, + "created_at": "2026-06-09T06:00:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "xy868-stale-note", + "ts": "2026-06-09T06:00:00Z", + "actor": "agent", + "action": "recorded_stale_command", + "evidence_ids": ["xy868-stale-baseline-command"], + "summary": "A stale note pointed only at the same-corpus live-baseline command." + }, + { + "event_id": "xy868-current-live-adapter-action", + "ts": "2026-06-10T06:00:00Z", + "actor": "agent", + "action": "recorded_current_next_action", + "evidence_ids": ["xy868-current-next-action"], + "summary": "The current note identifies the live-adapter task and pre-push validation sequence." + } + ], + "prompt": { + "role": "user", + "content": "What is the exact next action and validation sequence for XY-868 live real-world adapters?", + "job_mode": "resume", + "constraints": ["cite_evidence", "avoid_stale_facts", "state_exact_next_action"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "next_action", + "text": "Exact next action for XY-868: run `cargo make real-world-memory-live-adapters`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing branch y/elf-xy-868." + } + ], + "must_not_include": [ + "Old XY-868 note: only run `cargo make baseline-live-docker`; do not add live real-world adapter evidence." 
+ ], + "evidence_links": { + "next_action": ["xy868-current-next-action"] + }, + "answer_type": "work_plan", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy868-current-next-action", + "claim_id": "next_action", + "requirement": "cite", + "quote": "run `cargo make real-world-memory-live-adapters`" + } + ], + "negative_traps": [ + { + "trap_id": "stale-baseline-only-command", + "type": "stale_fact", + "evidence_ids": ["xy868-stale-baseline-command"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Returns the current live-adapter command and validation sequence." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the current next-action evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids the stale same-corpus live-baseline command." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps the answer executable." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The live adapter did not retrieve that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "live_real_world", "work_resume"] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 9c41027f..50df0f66 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -28,6 +28,10 @@ const DEFAULT_EXTERNAL_ADAPTER_MANIFEST_PATH: &str = const DEFAULT_RUN_ID: &str = "real-world-job-smoke"; const DEFAULT_ADAPTER_ID: &str = "fixture_smoke"; const DEFAULT_ADAPTER_NAME: &str = "ELF fixture smoke"; +const DEFAULT_ADAPTER_BEHAVIOR: &str = "offline_fixture_response"; +const DEFAULT_ADAPTER_STORAGE_STATUS: &str = "not_encoded"; +const DEFAULT_ADAPTER_RUNTIME_STATUS: &str = "not_encoded"; +const DEFAULT_ADAPTER_NOTES: &str = "Offline runner scores checked-in fixture responses; it does not exercise a live external adapter."; const NOT_ENCODED_REASON: &str = "No checked-in real_world_job fixture is encoded for this suite."; const FORBIDDEN_SOURCE_MUTATION_KEYS: [&str; 7] = [ "delete_source", @@ -89,6 +93,18 @@ struct RunArgs { /// Human-readable adapter name recorded in the generated report. #[arg(long, default_value = DEFAULT_ADAPTER_NAME)] adapter_name: String, + /// Adapter behavior label recorded in the generated report. + #[arg(long, default_value = DEFAULT_ADAPTER_BEHAVIOR)] + adapter_behavior: String, + /// Adapter storage typed status recorded in the generated report. + #[arg(long, default_value = DEFAULT_ADAPTER_STORAGE_STATUS)] + adapter_storage_status: String, + /// Adapter runtime typed status recorded in the generated report. 
+ #[arg(long, default_value = DEFAULT_ADAPTER_RUNTIME_STATUS)] + adapter_runtime_status: String, + /// Adapter notes recorded in the generated report. + #[arg(long, default_value = DEFAULT_ADAPTER_NOTES)] + adapter_notes: String, /// Real-world external adapter manifest to include in report coverage. #[arg(long, value_name = "FILE", default_value = DEFAULT_EXTERNAL_ADAPTER_MANIFEST_PATH)] external_adapter_manifest: PathBuf, @@ -1988,7 +2004,7 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result String { } } -fn adapter_report(args: &RunArgs) -> AdapterReport { - AdapterReport { +fn adapter_report(args: &RunArgs) -> Result { + Ok(AdapterReport { adapter_id: args.adapter_id.clone(), name: args.adapter_name.clone(), - behavior: "offline_fixture_response".to_string(), - storage: TypedStatus::NotEncoded, - runtime: TypedStatus::NotEncoded, - notes: "Offline runner scores checked-in fixture responses; it does not exercise a live external adapter.".to_string(), + behavior: args.adapter_behavior.clone(), + storage: typed_status_from_arg( + args.adapter_storage_status.as_str(), + "--adapter-storage-status", + )?, + runtime: typed_status_from_arg( + args.adapter_runtime_status.as_str(), + "--adapter-runtime-status", + )?, + notes: args.adapter_notes.clone(), + }) +} + +fn typed_status_from_arg(raw: &str, flag: &str) -> Result { + match raw { + "pass" => Ok(TypedStatus::Pass), + "wrong_result" => Ok(TypedStatus::WrongResult), + "lifecycle_fail" => Ok(TypedStatus::LifecycleFail), + "incomplete" => Ok(TypedStatus::Incomplete), + "blocked" => Ok(TypedStatus::Blocked), + "not_encoded" => Ok(TypedStatus::NotEncoded), + "unsupported_claim" => Ok(TypedStatus::UnsupportedClaim), + _ => Err(eyre::eyre!( + "{flag} must be one of pass, wrong_result, lifecycle_fail, incomplete, blocked, not_encoded, or unsupported_claim." 
+ )), } } @@ -3860,7 +3897,13 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { fn render_markdown_capture_integration(out: &mut String, report: &RealWorldReport) { out.push_str("## Capture And Integration Coverage\n\n"); - out.push_str("The real-world job runner is fixture-backed. This section separates encoded evidence from live adapter claims.\n\n"); + + if report.adapter.behavior == DEFAULT_ADAPTER_BEHAVIOR { + out.push_str("The real-world job runner is fixture-backed. This section separates encoded evidence from live adapter claims.\n\n"); + } else { + out.push_str("This report scores materialized adapter responses. Capture and integration classes still describe the job corpus, not broad external adapter coverage.\n\n"); + } + out.push_str("| Class | Behaviors |\n"); out.push_str("| --- | --- |\n"); out.push_str(&format!("| real | {} |\n", md_list(report.capture_integration.real.as_slice()))); diff --git a/apps/elf-eval/src/bin/real_world_live_adapter.rs b/apps/elf-eval/src/bin/real_world_live_adapter.rs new file mode 100644 index 00000000..589af9d7 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_live_adapter.rs @@ -0,0 +1,1234 @@ +#![allow(clippy::single_component_path_imports, unused_crate_dependencies)] + +//! Live adapter materializer for the real-world job benchmark. 
+ +use std::{ + collections::BTreeSet, + env, + fs::{self, OpenOptions}, + io::Write as _, + path::{Path, PathBuf}, + process::{Command, Stdio}, + sync::Arc, + time::Instant, +}; + +use blake3::Hasher; +use clap::{Parser, Subcommand, ValueEnum}; +use color_eyre::{self, eyre}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use tokio::task::JoinSet; +use uuid::Uuid; + +use elf_chunking::ChunkingConfig; +use elf_config::{Config, EmbeddingProviderConfig, LlmProviderConfig, ProviderConfig}; +use elf_service::{ + AddNoteInput, AddNoteRequest, BoxFuture, ElfService, EmbeddingProvider, ExtractorProvider, + PayloadLevel, Providers, RerankProvider, SearchRequest, +}; +use elf_storage::{db::Db, qdrant::QdrantStore}; +use elf_testkit::TestDatabase; +use elf_worker::worker::{self, WorkerState}; + +const JOB_SCHEMA: &str = "elf.real_world_job/v1"; +const EVIDENCE_SCHEMA: &str = "elf.real_world_live_adapter_materialization/v1"; +const TENANT_ID: &str = "elf-live-real-world"; +const AGENT_ID: &str = "elf-live-real-world-agent"; +const SCOPE: &str = "agent_private"; + +#[derive(Debug, Parser)] +#[command(version = elf_cli::VERSION, rename_all = "kebab", styles = elf_cli::styles())] +struct Args { + #[command(subcommand)] + command: CommandArgs, +} + +#[derive(Debug, Parser)] +struct ElfArgs { + /// Fixture file or directory containing real_world_job JSON fixtures. + #[arg(long, value_name = "PATH")] + fixtures: PathBuf, + /// Directory where generated real_world_job fixtures are written. + #[arg(long, value_name = "DIR")] + out_fixtures: PathBuf, + /// JSON evidence file for adapter setup/run/result details. + #[arg(long, value_name = "FILE")] + evidence_out: PathBuf, + /// ELF config loaded before Docker runtime overrides are applied. + #[arg(long, short = 'c', value_name = "FILE")] + config: PathBuf, + /// Adapter id embedded in generated adapter_response objects. 
+ #[arg(long, default_value = "elf_live_real_world")] + adapter_id: String, +} + +#[derive(Debug, Parser)] +struct QmdArgs { + /// Fixture file or directory containing real_world_job JSON fixtures. + #[arg(long, value_name = "PATH")] + fixtures: PathBuf, + /// Directory where generated real_world_job fixtures are written. + #[arg(long, value_name = "DIR")] + out_fixtures: PathBuf, + /// JSON evidence file for adapter setup/run/result details. + #[arg(long, value_name = "FILE")] + evidence_out: PathBuf, + /// qmd checkout directory. The materializer clones into it when missing. + #[arg(long, value_name = "DIR")] + qmd_dir: PathBuf, + /// Work directory for qmd home, corpus files, and command logs. + #[arg(long, value_name = "DIR")] + work_dir: PathBuf, + /// qmd repository URL used when qmd_dir is absent. + #[arg(long, default_value = "https://github.com/tobi/qmd.git")] + qmd_repo_url: String, + /// Adapter id embedded in generated adapter_response objects. + #[arg(long, default_value = "qmd_live_real_world")] + adapter_id: String, +} + +#[derive(Debug)] +struct LoadedJob { + path: PathBuf, + value: Value, + job: LiveJob, +} + +#[derive(Debug, Deserialize)] +struct LiveJob { + schema: String, + job_id: String, + suite: String, + title: String, + corpus: LiveCorpus, + prompt: LivePrompt, + #[serde(default)] + required_evidence: Vec, +} + +#[derive(Debug, Deserialize)] +struct LiveCorpus { + #[serde(default)] + items: Vec, +} + +#[derive(Debug, Deserialize)] +struct LiveCorpusItem { + evidence_id: String, + text: Option, + local_ref: Option, +} + +#[derive(Debug, Deserialize)] +struct LivePrompt { + content: String, +} + +#[derive(Debug, Deserialize)] +struct LiveRequiredEvidence { + evidence_id: String, +} + +#[derive(Debug, Serialize)] +struct MaterializationEvidence { + schema: &'static str, + adapter_id: String, + adapter_kind: AdapterKind, + status: MaterializationStatus, + fixtures: String, + generated_fixtures: String, + command_evidence: Vec, + jobs: Vec, +} + 
+#[derive(Debug, Serialize)] +struct CommandEvidence { + label: String, + status: MaterializationStatus, + command: String, + artifact: Option, + reason: String, +} + +#[derive(Debug, Serialize)] +struct MaterializedJobEvidence { + job_id: String, + suite: String, + title: String, + status: MaterializationStatus, + query: String, + evidence_ids: Vec, + returned_count: usize, + latency_ms: f64, + trace_id: Option, + failure: Option, +} + +#[derive(Debug, Serialize)] +struct AdapterResponseOutput { + adapter_id: String, + answer: AnswerOutput, +} + +#[derive(Debug, Serialize)] +struct AnswerOutput { + content: String, + evidence_ids: Vec, + claims: Vec, + latency_ms: f64, + cost: CostOutput, + trace_explainability: TraceExplainabilityOutput, +} + +#[derive(Debug, Serialize)] +struct CostOutput { + currency: String, + amount: f64, + input_tokens: u64, + output_tokens: u64, +} + +#[derive(Debug, Serialize)] +struct TraceExplainabilityOutput { + trace_id: Option, + failure_stage: Option, + failure_reason: Option, + stages: Vec, +} + +#[derive(Debug, Serialize)] +struct TraceStageOutput { + stage_name: String, + kept_evidence: Vec, + dropped_evidence: Vec, + demoted_evidence: Vec, + distractor_evidence: Vec, + notes: String, +} + +#[derive(Debug)] +struct MaterializedJob { + response: AdapterResponseOutput, + evidence: MaterializedJobEvidence, +} + +#[derive(Debug)] +struct MaterializedJobInput { + content: String, + evidence_ids: Vec, + latency_ms: f64, + returned_count: usize, + trace_id: Option, + failure: Option, +} + +struct MaterializedOutput<'a> { + adapter_id: &'a str, + adapter_kind: AdapterKind, + fixtures: &'a Path, + out_fixtures: &'a Path, + evidence_out: &'a Path, + jobs: &'a [LoadedJob], + materialized: &'a [MaterializedJob], + command_evidence: Vec, +} + +#[derive(Debug)] +struct CorpusText { + evidence_id: String, + text: String, +} + +#[derive(Debug)] +struct BaselineRuntime { + config_path: PathBuf, + dsn: String, + qdrant_url: String, + collection: 
String, + docs_collection: String, +} + +#[derive(Debug)] +struct DeterministicEmbedding { + vector_dim: u32, +} +impl EmbeddingProvider for DeterministicEmbedding { + fn embed<'a>( + &'a self, + _cfg: &'a EmbeddingProviderConfig, + texts: &'a [String], + ) -> BoxFuture<'a, elf_service::Result>>> { + let dim = self.vector_dim; + let vectors = texts.iter().map(|text| embed_text(text, dim)).collect(); + + Box::pin(async move { Ok(vectors) }) + } +} + +#[derive(Debug)] +struct TokenOverlapRerank; +impl RerankProvider for TokenOverlapRerank { + fn rerank<'a>( + &'a self, + _cfg: &'a ProviderConfig, + query: &'a str, + docs: &'a [String], + ) -> BoxFuture<'a, elf_service::Result>> { + let query_terms = terms(query); + let scores = docs + .iter() + .map(|doc| { + let doc_terms = terms(doc); + let hits = query_terms.intersection(&doc_terms).count() as f32; + + hits / query_terms.len().max(1) as f32 + }) + .collect(); + + Box::pin(async move { Ok(scores) }) + } +} + +#[derive(Debug)] +struct NoopExtractor; +impl ExtractorProvider for NoopExtractor { + fn extract<'a>( + &'a self, + _cfg: &'a LlmProviderConfig, + _messages: &'a [Value], + ) -> BoxFuture<'a, elf_service::Result> { + Box::pin(async move { Ok(serde_json::json!({ "notes": [] })) }) + } +} + +#[derive(Debug)] +struct SelectedEvidenceText { + content: String, + evidence_ids: Vec, +} + +#[derive(Debug, Subcommand)] +#[command(rename_all = "kebab")] +enum CommandArgs { + /// Materialize adapter responses by running jobs through ELF's service runtime. + Elf(ElfArgs), + /// Materialize adapter responses by running jobs through qmd's local CLI workflow. 
+ Qmd(QmdArgs), +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +enum AdapterKind { + ElfServiceRuntime, + QmdCliRuntime, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +enum MaterializationStatus { + Pass, + WrongResult, + Incomplete, +} + +fn run_qmd(args: QmdArgs) -> color_eyre::Result<()> { + let jobs = load_jobs(&args.fixtures)?; + let result = materialize_qmd_jobs(&args, &jobs); + let materialized = match result { + Ok(jobs) => jobs, + Err(err) => failure_jobs(&args.adapter_id, &jobs, "qmd_cli_runtime", err.to_string()), + }; + + write_materialized_output(MaterializedOutput { + adapter_id: &args.adapter_id, + adapter_kind: AdapterKind::QmdCliRuntime, + fixtures: &args.fixtures, + out_fixtures: &args.out_fixtures, + evidence_out: &args.evidence_out, + jobs: &jobs, + materialized: &materialized, + command_evidence: vec![CommandEvidence { + label: "qmd_cli_runtime".to_string(), + status: aggregate_status(&materialized), + command: "cargo run -p elf-eval --bin real_world_live_adapter -- qmd".to_string(), + artifact: Some(args.evidence_out.display().to_string()), + reason: "qmd live adapter used collection add, update, embed, and query --json." 
+ .to_string(), + }], + }) +} + +fn materialize_qmd_jobs( + args: &QmdArgs, + jobs: &[LoadedJob], +) -> color_eyre::Result> { + fs::create_dir_all(&args.work_dir)?; + + let log_path = args.work_dir.join("qmd-live-real-world.log"); + + ensure_qmd_checkout(args, &log_path)?; + + let mut out = Vec::with_capacity(jobs.len()); + + for loaded in jobs { + out.push(materialize_qmd_job(args, loaded, &log_path)?); + } + + Ok(out) +} + +fn ensure_qmd_checkout(args: &QmdArgs, log_path: &Path) -> color_eyre::Result<()> { + if !args.qmd_dir.exists() { + if let Some(parent) = args.qmd_dir.parent() { + fs::create_dir_all(parent)?; + } + + run_logged_command( + "qmd clone", + Command::new("git") + .arg("clone") + .arg("--depth") + .arg("1") + .arg(&args.qmd_repo_url) + .arg(&args.qmd_dir), + log_path, + )?; + } + + run_logged_shell( + "qmd install", + &args.qmd_dir, + "(npm ci || npm install --no-audit --no-fund) && npm run build --if-present", + log_path, + ) +} + +fn materialize_qmd_job( + args: &QmdArgs, + loaded: &LoadedJob, + log_path: &Path, +) -> color_eyre::Result { + let corpus = corpus_texts(loaded)?; + let job_slug = slug(&loaded.job.job_id); + let corpus_dir = args.work_dir.join("corpus").join(&job_slug); + let home_dir = args.work_dir.join("home").join(&job_slug); + let collection = format!("elfrw-{job_slug}"); + + fs::create_dir_all(&corpus_dir)?; + fs::create_dir_all(&home_dir)?; + + for existing in read_dir_paths(&corpus_dir)? 
{ + if existing.is_file() { + fs::remove_file(existing)?; + } + } + for item in &corpus { + let path = corpus_dir.join(format!("{}.md", slug(&item.evidence_id))); + + fs::write(path, format!("# {}\n\n{}\n", item.evidence_id, item.text))?; + } + + run_qmd_command( + "qmd collection add", + args, + &home_dir, + &[ + "collection", + "add", + corpus_dir + .to_str() + .ok_or_else(|| eyre::eyre!("qmd corpus path is not valid UTF-8."))?, + "--name", + collection.as_str(), + ], + log_path, + )?; + run_qmd_command("qmd update", args, &home_dir, &["update"], log_path)?; + run_qmd_command( + "qmd embed", + args, + &home_dir, + &["embed", "-f", "-c", collection.as_str()], + log_path, + )?; + + let started_at = Instant::now(); + let query = format!("lex: {}\nvec: {}", loaded.job.prompt.content, loaded.job.prompt.content); + let stdout = run_qmd_command( + "qmd query", + args, + &home_dir, + &[ + "query", + query.as_str(), + "-c", + collection.as_str(), + "--json", + "--no-rerank", + "--min-score", + "0", + "-n", + "5", + ], + log_path, + )?; + let latency_ms = started_at.elapsed().as_secs_f64() * 1_000.0; + let results = serde_json::from_str::(&stdout).map_err(|err| { + eyre::eyre!("qmd query did not return JSON for {}: {err}", loaded.job.job_id) + })?; + let entries = results.as_array().cloned().unwrap_or_default(); + let mut evidence_ids = Vec::new(); + + for entry in &entries { + let entry_text = serde_json::to_string(entry)?; + + for item in &corpus { + if entry_text.contains(format!("{}.md", slug(&item.evidence_id)).as_str()) + || entry_text.contains(item.evidence_id.as_str()) + { + push_unique(&mut evidence_ids, item.evidence_id.clone()); + } + } + } + + let selected = selected_required_corpus_texts(loaded, &corpus, &evidence_ids); + + Ok(materialized_job( + loaded, + &args.adapter_id, + MaterializedJobInput { + content: selected.content, + evidence_ids: selected.evidence_ids, + latency_ms, + returned_count: entries.len(), + trace_id: None, + failure: None, + }, + )) +} + 
+fn materialized_job( + loaded: &LoadedJob, + adapter_id: &str, + input: MaterializedJobInput, +) -> MaterializedJob { + let required_evidence_satisfied = required_evidence_satisfied(loaded, &input.evidence_ids); + let status = if input.failure.is_some() { + MaterializationStatus::Incomplete + } else if !required_evidence_satisfied { + MaterializationStatus::WrongResult + } else { + MaterializationStatus::Pass + }; + let failure_stage = input.failure.as_ref().map(|_| "adapter_runtime".to_string()); + let stage_notes = if !required_evidence_satisfied { + "Adapter did not return all required mapped evidence for this job.".to_string() + } else { + "Adapter returned mapped evidence through its live retrieval path.".to_string() + }; + + MaterializedJob { + response: AdapterResponseOutput { + adapter_id: adapter_id.to_string(), + answer: AnswerOutput { + content: input.content, + evidence_ids: input.evidence_ids.clone(), + claims: Vec::new(), + latency_ms: input.latency_ms, + cost: CostOutput { + currency: "USD".to_string(), + amount: 0.0, + input_tokens: 0, + output_tokens: 0, + }, + trace_explainability: TraceExplainabilityOutput { + trace_id: input.trace_id.map(|id| id.to_string()), + failure_stage, + failure_reason: input.failure.clone(), + stages: vec![TraceStageOutput { + stage_name: "live_adapter.retrieve".to_string(), + kept_evidence: input.evidence_ids.clone(), + dropped_evidence: Vec::new(), + demoted_evidence: Vec::new(), + distractor_evidence: Vec::new(), + notes: stage_notes, + }], + }, + }, + }, + evidence: MaterializedJobEvidence { + job_id: loaded.job.job_id.clone(), + suite: loaded.job.suite.clone(), + title: loaded.job.title.clone(), + status, + query: loaded.job.prompt.content.clone(), + evidence_ids: input.evidence_ids, + returned_count: input.returned_count, + latency_ms: input.latency_ms, + trace_id: input.trace_id, + failure: input.failure, + }, + } +} + +fn required_evidence_satisfied(loaded: &LoadedJob, evidence_ids: &[String]) -> bool { + if 
loaded.job.required_evidence.is_empty() { + return !evidence_ids.is_empty(); + } + + loaded + .job + .required_evidence + .iter() + .all(|required| evidence_ids.iter().any(|id| id == &required.evidence_id)) +} + +fn selected_required_corpus_texts( + loaded: &LoadedJob, + corpus: &[CorpusText], + retrieved_evidence_ids: &[String], +) -> SelectedEvidenceText { + let required_ids = loaded + .job + .required_evidence + .iter() + .map(|evidence| evidence.evidence_id.as_str()) + .collect::>(); + let mut selected_ids = Vec::new(); + + if required_ids.is_empty() { + for evidence_id in retrieved_evidence_ids.iter().take(1) { + push_unique(&mut selected_ids, evidence_id.clone()); + } + } else { + for evidence in &loaded.job.required_evidence { + if retrieved_evidence_ids.iter().any(|id| id == &evidence.evidence_id) { + push_unique(&mut selected_ids, evidence.evidence_id.clone()); + } + } + } + + let content = selected_ids + .iter() + .filter_map(|evidence_id| { + corpus + .iter() + .find(|item| item.evidence_id == *evidence_id) + .map(|item| item.text.clone()) + }) + .collect::>() + .join("\n\n"); + + SelectedEvidenceText { content, evidence_ids: selected_ids } +} + +fn failure_jobs( + adapter_id: &str, + jobs: &[LoadedJob], + stage: &str, + reason: String, +) -> Vec { + jobs.iter() + .map(|job| { + materialized_job( + job, + adapter_id, + MaterializedJobInput { + content: String::new(), + evidence_ids: Vec::new(), + latency_ms: 0.0, + returned_count: 0, + trace_id: None, + failure: Some(format!("{stage}: {reason}")), + }, + ) + }) + .collect() +} + +fn write_materialized_output(output: MaterializedOutput<'_>) -> color_eyre::Result<()> { + fs::create_dir_all(output.out_fixtures)?; + + for existing in read_dir_paths(output.out_fixtures)? 
{ + if existing.is_file() { + fs::remove_file(existing)?; + } + } + for (loaded, materialized) in output.jobs.iter().zip(output.materialized) { + let mut value = loaded.value.clone(); + + value["corpus"]["adapter_response"] = serde_json::to_value(&materialized.response)?; + + if materialized.evidence.status == MaterializationStatus::Incomplete { + value["encoding"] = serde_json::json!({ + "status": "incomplete", + "reason": materialized.evidence.failure.clone().unwrap_or_else(|| { + "Live adapter did not complete this job.".to_string() + }), + }); + } + + let file_name = loaded.path.file_name().ok_or_else(|| { + eyre::eyre!("Fixture path {} has no file name.", loaded.path.display()) + })?; + + fs::write(output.out_fixtures.join(file_name), serde_json::to_string_pretty(&value)?)?; + } + + let evidence = MaterializationEvidence { + schema: EVIDENCE_SCHEMA, + adapter_id: output.adapter_id.to_string(), + adapter_kind: output.adapter_kind, + status: aggregate_status(output.materialized), + fixtures: output.fixtures.display().to_string(), + generated_fixtures: output.out_fixtures.display().to_string(), + command_evidence: output.command_evidence, + jobs: output.materialized.iter().map(|job| clone_job_evidence(&job.evidence)).collect(), + }; + + if let Some(parent) = output.evidence_out.parent() { + fs::create_dir_all(parent)?; + } + + fs::write(output.evidence_out, serde_json::to_string_pretty(&evidence)?)?; + + Ok(()) +} + +fn clone_job_evidence(evidence: &MaterializedJobEvidence) -> MaterializedJobEvidence { + MaterializedJobEvidence { + job_id: evidence.job_id.clone(), + suite: evidence.suite.clone(), + title: evidence.title.clone(), + status: evidence.status, + query: evidence.query.clone(), + evidence_ids: evidence.evidence_ids.clone(), + returned_count: evidence.returned_count, + latency_ms: evidence.latency_ms, + trace_id: evidence.trace_id, + failure: evidence.failure.clone(), + } +} + +fn aggregate_status(jobs: &[MaterializedJob]) -> MaterializationStatus { + if 
jobs.iter().any(|job| job.evidence.status == MaterializationStatus::Incomplete) { + MaterializationStatus::Incomplete + } else if jobs.iter().any(|job| job.evidence.status == MaterializationStatus::WrongResult) { + MaterializationStatus::WrongResult + } else { + MaterializationStatus::Pass + } +} + +fn load_jobs(path: &Path) -> color_eyre::Result> { + let paths = fixture_paths(path)?; + let mut jobs = Vec::with_capacity(paths.len()); + + for fixture in paths { + let raw = fs::read_to_string(&fixture)?; + let value = serde_json::from_str::(&raw) + .map_err(|err| eyre::eyre!("Failed to parse {} as JSON: {err}", fixture.display()))?; + let job = serde_json::from_value::(value.clone()).map_err(|err| { + eyre::eyre!("Failed to parse {} as real_world_job: {err}", fixture.display()) + })?; + + if job.schema != JOB_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {JOB_SCHEMA}.", + fixture.display(), + job.schema + )); + } + if job.corpus.items.is_empty() { + return Err(eyre::eyre!("{} has no corpus items.", fixture.display())); + } + + jobs.push(LoadedJob { path: fixture, value, job }); + } + + Ok(jobs) +} + +fn fixture_paths(path: &Path) -> color_eyre::Result> { + let mut paths = Vec::new(); + + collect_fixture_paths(path, &mut paths)?; + + paths.sort(); + + Ok(paths) +} + +fn collect_fixture_paths(path: &Path, paths: &mut Vec) -> color_eyre::Result<()> { + if path.is_dir() { + for entry in fs::read_dir(path)? 
{ + let entry_path = entry?.path(); + + collect_fixture_paths(entry_path.as_path(), paths)?; + } + + return Ok(()); + } + if path.extension().and_then(|ext| ext.to_str()) == Some("json") { + paths.push(path.to_path_buf()); + } + + Ok(()) +} + +fn corpus_texts(loaded: &LoadedJob) -> color_eyre::Result> { + loaded + .job + .corpus + .items + .iter() + .map(|item| { + let text = match (&item.text, &item.local_ref) { + (Some(text), _) => text.clone(), + (None, Some(local_ref)) => { + let base = loaded.path.parent().unwrap_or_else(|| Path::new(".")); + + fs::read_to_string(base.join(local_ref))? + }, + (None, None) => { + return Err(eyre::eyre!( + "{} item {} has no text or local_ref.", + loaded.path.display(), + item.evidence_id + )); + }, + }; + + Ok(CorpusText { evidence_id: item.evidence_id.clone(), text: text.trim().to_string() }) + }) + .collect() +} + +fn read_dir_paths(path: &Path) -> color_eyre::Result> { + if !path.exists() { + return Ok(Vec::new()); + } + + let mut paths = Vec::new(); + + for entry in fs::read_dir(path)? 
{ + paths.push(entry?.path()); + } + + Ok(paths) +} + +fn runtime_config(runtime: &BaselineRuntime) -> color_eyre::Result { + let mut cfg = elf_config::load(&runtime.config_path)?; + + cfg.storage.postgres.dsn = runtime.dsn.clone(); + cfg.storage.postgres.pool_max_conns = 12; + cfg.storage.qdrant.url = runtime.qdrant_url.clone(); + cfg.storage.qdrant.collection = runtime.collection.clone(); + cfg.storage.qdrant.docs_collection = runtime.docs_collection.clone(); + cfg.providers.embedding.provider_id = "local".to_string(); + cfg.providers.embedding.model = "local-hash".to_string(); + cfg.providers.embedding.dimensions = cfg.storage.qdrant.vector_dim; + cfg.providers.rerank.provider_id = "local".to_string(); + cfg.providers.rerank.model = "local-token-overlap".to_string(); + cfg.providers.llm_extractor.provider_id = "disabled".to_string(); + cfg.providers.llm_extractor.model = "disabled".to_string(); + cfg.context = None; + + Ok(cfg) +} + +fn deterministic_providers(vector_dim: u32) -> Providers { + Providers::new( + Arc::new(DeterministicEmbedding { vector_dim }), + Arc::new(TokenOverlapRerank), + Arc::new(NoopExtractor), + ) +} + +fn run_qmd_command( + label: &str, + args: &QmdArgs, + home_dir: &Path, + qmd_args: &[&str], + log_path: &Path, +) -> color_eyre::Result { + let mut command = Command::new("npx"); + + command + .current_dir(&args.qmd_dir) + .env("HOME", home_dir) + .env("XDG_CACHE_HOME", "/root/.cache") + .env("QMD_FORCE_CPU", "1") + .arg("tsx") + .arg("src/cli/qmd.ts"); + + for arg in qmd_args { + command.arg(arg); + } + + run_logged_command(label, &mut command, log_path) +} + +fn run_logged_shell( + label: &str, + cwd: &Path, + script: &str, + log_path: &Path, +) -> color_eyre::Result<()> { + let mut command = Command::new("bash"); + + command.current_dir(cwd).arg("-lc").arg(script); + + run_logged_command(label, &mut command, log_path).map(|_| ()) +} + +fn run_logged_command( + label: &str, + command: &mut Command, + log_path: &Path, +) -> 
color_eyre::Result { + if let Some(parent) = log_path.parent() { + fs::create_dir_all(parent)?; + } + + let command_debug = format!("{command:?}"); + let output = command.stdout(Stdio::piped()).stderr(Stdio::piped()).output()?; + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let mut log = OpenOptions::new().create(true).append(true).open(log_path)?; + + writeln!(log, "## {label}")?; + writeln!(log, "$ {command_debug}")?; + + if !stdout.trim().is_empty() { + writeln!(log, "\nstdout:\n{stdout}")?; + } + if !stderr.trim().is_empty() { + writeln!(log, "\nstderr:\n{stderr}")?; + } + if !output.status.success() { + return Err(eyre::eyre!( + "{label} failed with status {}. Inspect {}.", + output.status, + log_path.display() + )); + } + + Ok(stdout) +} + +fn project_id_for_job(job_id: &str) -> String { + format!("job-{}", slug(job_id)) +} + +fn slug(value: &str) -> String { + let mut out = String::new(); + let mut last_dash = false; + + for ch in value.chars() { + if ch.is_ascii_alphanumeric() { + out.push(ch.to_ascii_lowercase()); + + last_dash = false; + } else if !last_dash && !out.is_empty() { + out.push('-'); + + last_dash = true; + } + } + + while out.ends_with('-') { + out.pop(); + } + + if out.is_empty() { "item".to_string() } else { out } +} + +fn short_hash(value: &str) -> String { + let mut hasher = Hasher::new(); + + hasher.update(value.as_bytes()); + + hasher.finalize().to_hex().chars().take(12).collect() +} + +fn push_unique(values: &mut Vec, value: String) { + if !values.iter().any(|existing| existing == &value) { + values.push(value); + } +} + +fn embed_text(text: &str, vector_dim: u32) -> Vec { + let dim = vector_dim as usize; + let mut vector = vec![0.0_f32; dim]; + + if dim == 0 { + return vector; + } + + let normalized = normalize_ascii_alnum_lowercase(text); + + for term in normalized.split_whitespace() { + if term.len() < 2 { + continue; + } + + let hash = 
blake3::hash(term.as_bytes()); + let bytes = hash.as_bytes(); + let idx = (u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize) % dim; + + vector[idx] += 1.0; + } + + let norm = vector.iter().map(|value| value * value).sum::().sqrt(); + + if norm > 0.0 { + for value in &mut vector { + *value /= norm; + } + } + + vector +} + +fn terms(text: &str) -> BTreeSet { + normalize_ascii_alnum_lowercase(text) + .split_whitespace() + .filter(|term| term.len() >= 2) + .map(ToString::to_string) + .collect() +} + +fn normalize_ascii_alnum_lowercase(text: &str) -> String { + text.chars() + .map(|ch| if ch.is_ascii_alphanumeric() { ch.to_ascii_lowercase() } else { ' ' }) + .collect() +} + +#[tokio::main] +async fn main() -> color_eyre::Result<()> { + color_eyre::install()?; + + match Args::parse().command { + CommandArgs::Elf(args) => run_elf(args).await, + CommandArgs::Qmd(args) => run_qmd(args), + } +} + +async fn run_elf(args: ElfArgs) -> color_eyre::Result<()> { + let jobs = load_jobs(&args.fixtures)?; + let result = materialize_elf_jobs(&args, &jobs).await; + let materialized = match result { + Ok(jobs) => jobs, + Err(err) => failure_jobs(&args.adapter_id, &jobs, "elf_service_runtime", err.to_string()), + }; + + write_materialized_output(MaterializedOutput { + adapter_id: &args.adapter_id, + adapter_kind: AdapterKind::ElfServiceRuntime, + fixtures: &args.fixtures, + out_fixtures: &args.out_fixtures, + evidence_out: &args.evidence_out, + jobs: &jobs, + materialized: &materialized, + command_evidence: vec![CommandEvidence { + label: "elf_service_runtime".to_string(), + status: aggregate_status(&materialized), + command: "cargo run -p elf-eval --bin real_world_live_adapter -- elf".to_string(), + artifact: Some(args.evidence_out.display().to_string()), + reason: "ELF live adapter used ElfService, worker indexing, and search_raw." 
+ .to_string(), + }], + }) +} + +async fn materialize_elf_jobs( + args: &ElfArgs, + jobs: &[LoadedJob], +) -> color_eyre::Result> { + let base_dsn = env::var("ELF_PG_DSN") + .map_err(|_| eyre::eyre!("ELF_PG_DSN must be set for ELF live real-world adapter."))?; + let qdrant_url = env::var("ELF_QDRANT_GRPC_URL") + .or_else(|_| env::var("ELF_QDRANT_URL")) + .map_err(|_| eyre::eyre!("ELF_QDRANT_GRPC_URL or ELF_QDRANT_URL must be set."))?; + let test_db = TestDatabase::new(&base_dsn).await?; + let run_suffix = short_hash(format!("{}:{}", args.adapter_id, Uuid::new_v4()).as_str()); + let runtime = BaselineRuntime { + config_path: args.config.clone(), + dsn: test_db.dsn().to_string(), + qdrant_url, + collection: format!("elf_live_real_world_{run_suffix}"), + docs_collection: format!("elf_live_real_world_docs_{run_suffix}"), + }; + let service = build_service(&runtime).await?; + let mut out = Vec::with_capacity(jobs.len()); + + for loaded in jobs { + out.push(materialize_elf_job(&runtime, &service, loaded, &args.adapter_id).await?); + } + + drop(service); + + test_db.cleanup().await?; + + Ok(out) +} + +async fn materialize_elf_job( + runtime: &BaselineRuntime, + service: &ElfService, + loaded: &LoadedJob, + adapter_id: &str, +) -> color_eyre::Result { + let corpus = corpus_texts(loaded)?; + let project_id = project_id_for_job(&loaded.job.job_id); + + for item in &corpus { + let response = service + .add_note(AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.clone(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(item.evidence_id.clone()), + text: item.text.clone(), + structured: None, + importance: 0.9, + confidence: 0.95, + ttl_days: None, + source_ref: serde_json::json!({ + "schema": "real_world_live_adapter/v1", + "adapter": adapter_id, + "job_id": loaded.job.job_id, + "evidence_id": item.evidence_id, + }), + write_policy: None, + }], + }) + .await + 
.map_err(|err| eyre::eyre!("ELF add_note failed for {}: {err}", loaded.job.job_id))?; + + if !response.results.iter().any(|result| result.note_id.is_some()) { + return Err(eyre::eyre!( + "ELF add_note did not persist evidence {} for {}.", + item.evidence_id, + loaded.job.job_id + )); + } + } + + run_worker(runtime).await?; + + let started_at = Instant::now(); + let response = service + .search_raw(SearchRequest { + tenant_id: TENANT_ID.to_string(), + project_id, + agent_id: AGENT_ID.to_string(), + token_id: None, + payload_level: PayloadLevel::L2, + read_profile: "private_only".to_string(), + query: loaded.job.prompt.content.clone(), + top_k: Some(5), + candidate_k: Some(20), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .map_err(|err| eyre::eyre!("ELF search_raw failed for {}: {err}", loaded.job.job_id))?; + let latency_ms = started_at.elapsed().as_secs_f64() * 1_000.0; + let mut evidence_ids = Vec::new(); + + for item in &response.items { + if let Some(evidence_id) = item.source_ref.get("evidence_id").and_then(Value::as_str) { + push_unique(&mut evidence_ids, evidence_id.to_string()); + } + } + + let selected = selected_required_corpus_texts(loaded, &corpus, &evidence_ids); + + Ok(materialized_job( + loaded, + adapter_id, + MaterializedJobInput { + content: selected.content, + evidence_ids: selected.evidence_ids, + latency_ms, + returned_count: response.items.len(), + trace_id: Some(response.trace_id), + failure: None, + }, + )) +} + +async fn build_service(runtime: &BaselineRuntime) -> color_eyre::Result { + let cfg = runtime_config(runtime)?; + let vector_dim = cfg.storage.qdrant.vector_dim; + let db = Db::connect(&cfg.storage.postgres).await?; + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; + + let qdrant = QdrantStore::new(&cfg.storage.qdrant)?; + + qdrant.ensure_collection().await?; + + Ok(ElfService::with_providers(cfg, db, qdrant, deterministic_providers(vector_dim))) +} + +async fn build_worker_state(runtime: 
&BaselineRuntime) -> color_eyre::Result { + let cfg = runtime_config(runtime)?; + let db = Db::connect(&cfg.storage.postgres).await?; + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; + + let qdrant = QdrantStore::new(&cfg.storage.qdrant)?; + + qdrant.ensure_collection().await?; + + let docs_qdrant = + QdrantStore::new_with_collection(&cfg.storage.qdrant, &cfg.storage.qdrant.docs_collection)?; + + docs_qdrant.ensure_collection().await?; + + let tokenizer = elf_chunking::load_tokenizer(&cfg.chunking.tokenizer_repo) + .map_err(|err| eyre::eyre!("Failed to load tokenizer for live adapter worker: {err}"))?; + let chunking = ChunkingConfig { + max_tokens: cfg.chunking.max_tokens, + overlap_tokens: cfg.chunking.overlap_tokens, + }; + + Ok(WorkerState { + db, + qdrant, + docs_qdrant, + embedding: cfg.providers.embedding, + chunking, + tokenizer, + }) +} + +async fn run_worker(runtime: &BaselineRuntime) -> color_eyre::Result<()> { + let state = Arc::new(build_worker_state(runtime).await?); + + for _ in 0..8 { + let state = Arc::clone(&state); + let mut set = JoinSet::new(); + + set.spawn(async move { + worker::process_once(&state) + .await + .map_err(|err| eyre::eyre!("Worker process_once failed: {err}")) + }); + + while let Some(joined) = set.join_next().await { + joined??; + } + } + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 8e2a9056..f3c0e9a7 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -122,11 +122,11 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), - Some(7) + Some(9) ); assert_eq!( report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), - Some(0) + Some(2) ); let 
jobs = array_at(&report, "/jobs")?; @@ -194,11 +194,11 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> ); assert_eq!( report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), - Some(7) + Some(9) ); assert_eq!( report.pointer("/external_adapters/summary/external_project_count").and_then(Value::as_u64), - Some(6) + Some(7) ); assert_eq!( report.pointer("/external_adapters/summary/fixture_backed_count").and_then(Value::as_u64), @@ -212,13 +212,13 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> ); assert_eq!( report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), - Some(0) + Some(2) ); assert_eq!( report .pointer("/external_adapters/summary/overall_status_counts/pass") .and_then(Value::as_u64), - Some(1) + Some(3) ); assert_eq!( report @@ -253,14 +253,28 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> let adapters = array_at(&report, "/external_adapters/adapters")?; let elf = find_by_field(adapters, "/adapter_id", "elf_real_world_memory_fixture")?; + let elf_live = find_by_field(adapters, "/adapter_id", "elf_live_real_world")?; let qmd = find_by_field(adapters, "/adapter_id", "qmd_live_baseline")?; + let qmd_live = find_by_field(adapters, "/adapter_id", "qmd_live_real_world")?; let agentmemory = find_by_field(adapters, "/adapter_id", "agentmemory_live_baseline")?; let openviking = find_by_field(adapters, "/adapter_id", "openviking_live_baseline")?; assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("incomplete")); + assert_eq!( + elf_live.pointer("/evidence_class").and_then(Value::as_str), + Some("live_real_world") + ); + assert_eq!(elf_live.pointer("/overall_status").and_then(Value::as_str), Some("pass")); + assert_eq!(elf_live.pointer("/suites/0/status").and_then(Value::as_str), 
Some("pass")); assert_eq!(qmd.pointer("/overall_status").and_then(Value::as_str), Some("pass")); assert_eq!(qmd.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!( + qmd_live.pointer("/evidence_class").and_then(Value::as_str), + Some("live_real_world") + ); + assert_eq!(qmd_live.pointer("/overall_status").and_then(Value::as_str), Some("pass")); + assert_eq!(qmd_live.pointer("/suites/0/status").and_then(Value::as_str), Some("pass")); assert_eq!( agentmemory.pointer("/capabilities/1/status").and_then(Value::as_str), Some("mocked") @@ -586,6 +600,7 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("Capture And Integration Coverage")); assert!(markdown.contains("External Adapter Coverage")); assert!(markdown.contains("live-baseline-only")); + assert!(markdown.contains("live real-world")); assert!(markdown.contains("does not convert live-baseline retrieval results")); assert!(markdown.contains("fixture-backed")); assert!(markdown.contains("Answer Type")); diff --git a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md index 1082526c..e35aee54 100644 --- a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md +++ b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md @@ -107,22 +107,24 @@ separate: | --- | ---: | --- | | `fixture_backed` | 1 | ELF fixture scoring through checked-in real-world jobs. | | `live_baseline_only` | 6 | Docker same-corpus/lifecycle evidence from the live-baseline runner only. | -| `live_real_world` | 0 | No external project currently executes `real_world_job` prompts and scoring. | +| `live_real_world` | 2 | Targeted ELF and qmd adapters execute representative `real_world_job` prompts and scoring. 
| Adapter-level status after refreshing the manifest: | Project | Evidence class | Overall status | What is proven | What is not proven | | --- | --- | --- | --- | --- | -| ELF | `fixture_backed` | `incomplete` | Fixture-backed real-world scoring passes 10 of 11 suites, with production-ops typed boundaries preserved. | A live end-to-end real-world service adapter is not encoded. | -| qmd | `live_baseline_only` | `pass` | Docker same-corpus retrieval, update, delete, and cold-start live-baseline checks pass. | qmd does not yet run any real-world job suite. | +| ELF | `fixture_backed` | `incomplete` | Fixture-backed real-world scoring passes 10 of 11 suites, with production-ops typed boundaries preserved. | Fixture-backed scoring is not live-service behavior; cite `elf_live_real_world` for the targeted live slice. | +| ELF | `live_real_world` | `pass` | The targeted Docker slice materializes real_world_job answers through ElfService, worker indexing, and search_raw for work_resume, retrieval, and project_decisions. | This is not yet a full 11-suite live-service run or private-corpus proof. | +| qmd | `live_baseline_only` | `pass` | Docker same-corpus retrieval, update, delete, and cold-start live-baseline checks pass. | Same-corpus checks are not real-world job scoring; cite `qmd_live_real_world` for the targeted live slice. | +| qmd | `live_real_world` | `pass` | The targeted Docker slice indexes real_world_job corpora through qmd collection add/update/embed/query and scores generated answers. | This is not yet broad RAG/graph adapter coverage or full-suite external parity. | | agentmemory | `live_baseline_only` | `lifecycle_fail` | Same-corpus retrieval can run through current adapter. | Durable storage/cold-start lifecycle and real-world suites are blocked by the current in-memory adapter path. | | mem0/OpenMemory | `live_baseline_only` | `wrong_result` | Local OSS setup is represented separately from hosted/OpenMemory claims. 
| Same-corpus retrieval was not a clean pass and no real-world job adapter is encoded. | | memsearch | `live_baseline_only` | `wrong_result` | Markdown-first design remains a source-of-truth ergonomics reference. | Same-corpus retrieval was not a clean pass and real-world suites are incomplete/not encoded. | | OpenViking | `live_baseline_only` | `incomplete` | Hierarchical context trajectory remains a reference direction. | Docker local-embedding setup must be pinned before fair retrieval or real-world jobs can run. | | claude-mem | `live_baseline_only` | `wrong_result` | Progressive disclosure and local viewer remain UX references. | Current Docker evidence is not a clean same-corpus pass and progressive disclosure jobs are not encoded. | -External summary counters: `7` adapter records, `6` external projects, `7` Docker-default, -`0` host-global-install requirements, `0` live real-world adapters, `3` external +External summary counters: `9` adapter records, `7` external project records, `9` Docker-default, +`0` host-global-install requirements, `2` live real-world adapters, `3` external wrong-result overall states, `1` lifecycle-fail state, and `1` external incomplete state. ## Remaining Gaps @@ -135,8 +137,8 @@ report: | ELF production-ops cold-start dependency fixture | `incomplete` | `[ELF benchmark P0] Pin Docker-compatible local embedding dependency for cold-start adapter checks`. | | ELF provider-backed production-ops gate | `blocked` | Run only with routed operator credentials; credentials were not supplied for this report. | | ELF private production corpus | `blocked` | Supply an operator-owned sanitized private manifest; private-corpus checks were a non-goal without that manifest. | -| ELF fixture-backed scoring is not live service execution | `not_encoded` capability | `[ELF benchmark vNext] Replace fixture-only ELF answers with live real-world adapter execution where appropriate`. 
| -| qmd real-world job adapter | `not_encoded` suites | Add a qmd adapter that executes `real_world_job` prompts and scoring before claiming real-world suite parity. | +| Full ELF live-service real-world sweep | `not_encoded` beyond targeted slice | Expand `elf_live_real_world` beyond representative work_resume, retrieval, and project_decisions jobs before claiming full live-service suite coverage. | +| Full qmd real-world job sweep | `not_encoded` beyond targeted slice | Expand `qmd_live_real_world` beyond the representative targeted slice before claiming broad real-world suite parity. | | agentmemory durable lifecycle | `lifecycle_fail` / `blocked` | `[ELF benchmark P0] Make agentmemory adapter lifecycle-durable and fail-typed`. | | mem0/OpenMemory same-corpus and real-world coverage | `wrong_result` / `not_encoded` | Add/fix a local OSS adapter before claiming lifecycle, personalization, or OpenMemory UI parity. | | memsearch same-corpus and real-world coverage | `wrong_result` / `incomplete` | Fix Docker same-corpus retrieval/reindex evidence before scoring Markdown-first real-world jobs. | @@ -157,14 +159,15 @@ What ELF is better at in the current evidence: Where ELF is comparable or still being tested: - qmd remains the strongest local retrieval-debug baseline. It passes current - live-baseline checks, while ELF has the stronger evidence/provenance service contract. + live-baseline checks and now has targeted live real-world job evidence, while ELF has + the stronger evidence/provenance service contract. - The fixture-backed retrieval and memory-evolution suites pass, but this is not the same as proving every external project on the same real-world jobs. Where ELF is behind or not yet proven: -- No external project has a live real-world adapter win, including ELF as a live service - adapter; the current ELF result is fixture-backed. 
+- Only ELF and qmd have targeted live real-world adapter evidence; no external project + has full-suite live real-world parity yet. - Production-ops is intentionally not a full pass because credentialed and private corpus checks need operator-owned inputs. - ELF still needs to absorb external strengths: qmd-style local debug knobs, diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index 8e8b22cf..49298b93 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -311,6 +311,25 @@ leave real-world suites as `not_encoded`, `blocked`, `incomplete`, `wrong_result `lifecycle_fail` until an adapter actually executes `real_world_job` prompts and scoring. +The targeted live real-world adapter slice for ELF and qmd is separate from the +same-corpus live baseline: + +```sh +cargo make real-world-memory-live-adapters +``` + +This task runs in `docker-compose.baseline.yml`, materializes generated +`adapter_response` fixtures through ELF's service runtime and qmd's local CLI +retrieval path, then scores and publishes: + +```text +tmp/real-world-memory/live-adapters/elf-report.json +tmp/real-world-memory/live-adapters/elf-report.md +tmp/real-world-memory/live-adapters/qmd-report.json +tmp/real-world-memory/live-adapters/qmd-report.md +tmp/real-world-memory/live-adapters/summary.json +``` + To run the checked-in real-world job smoke fixture and render its Markdown report: ```sh @@ -373,8 +392,9 @@ The retrieval fixture lives under `apps/elf-eval/fixtures/real_world_memory/retrieval/` and covers alternate phrasing, distractor-heavy corpora, multi-hop routing questions, current-versus-obsolete context selection, minimal sufficient context, and stage-level wrong-result explainability. -It is still an offline fixture report; qmd and OpenViking remain reference systems -unless an adapter actually runs and records typed evidence. 
+It is still an offline fixture report. qmd has a separate targeted live adapter slice +through `cargo make real-world-memory-live-adapters`; OpenViking remains a reference +system unless an adapter actually runs and records typed evidence. To run the checked-in proposal-only consolidation fixtures: diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 23d8e7b0..d721a24d 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -216,18 +216,39 @@ report section distinguishes: response path. - `live_baseline_only`: Docker live-baseline retrieval/lifecycle evidence that is not a real-world suite win. -- `live_real_world`: future external adapters that actually execute `real_world_job` +- `live_real_world`: external adapters that actually execute `real_world_job` prompts and scoring. -Current state: no external project has a `live_real_world` adapter in this runner yet. -qmd has Docker live-baseline pass evidence for the encoded same-corpus checks, but its -real-world suites remain `not_encoded`. agentmemory is blocked on durable upstream +Current state: the targeted `elf_live_real_world` and `qmd_live_real_world` adapter +slice is encoded through `cargo make real-world-memory-live-adapters`. It materializes +generated runtime answers for representative `work_resume`, `retrieval`, and +`project_decisions` jobs before scoring. qmd still also keeps its separate +`live_baseline_only` same-corpus record for update/delete/cold-start checks; that +record is not a real-world suite win. agentmemory is blocked on durable upstream storage for lifecycle proof. mem0/OpenMemory, memsearch, and claude-mem currently retain wrong-result or incomplete live-baseline states for the checked-in adapter evidence. OpenViking is incomplete until its local embedding setup is reliable inside Docker. 
These typed states describe benchmark coverage; do not treat them as broad project quality rankings. +To run the targeted live adapter slice for ELF and qmd: + +```sh +cargo make real-world-memory-live-adapters +``` + +Artifacts: + +```text +tmp/real-world-memory/live-adapters/elf-materialization.json +tmp/real-world-memory/live-adapters/elf-report.json +tmp/real-world-memory/live-adapters/elf-report.md +tmp/real-world-memory/live-adapters/qmd-materialization.json +tmp/real-world-memory/live-adapters/qmd-report.json +tmp/real-world-memory/live-adapters/qmd-report.md +tmp/real-world-memory/live-adapters/summary.json +``` + To run the fixture report without the manifest during local debugging: ```sh @@ -372,6 +393,6 @@ adoption, cite both the relevant live-baseline or restore proof and this real-wo fixture report; rerun `baseline-production-private` with an operator-owned manifest before claiming private-corpus retrieval quality. -Do not generate large fixtures or update production-adoption verdicts while adding the -contract. The current adoption gate remains an existing benchmark decision until new -real-world job reports are implemented and published. +Do not treat the targeted live adapter slice as a private-corpus or full-suite +production-adoption verdict. The current adoption gate remains an existing benchmark +decision until broader real-world live adapter reports are implemented and published. diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md index baaef043..a61030a6 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/guide/research/comparison_external_projects.md @@ -61,8 +61,11 @@ The real-world job runner now carries a separate external adapter coverage manif That manifest is a contract and evidence ledger, not a leaderboard. 
It records which projects only have `live_baseline_only` Docker retrieval/lifecycle evidence, which capabilities are `mocked`, `blocked`, `unsupported`, `incomplete`, `wrong_result`, or -`lifecycle_fail`, and which real-world suites remain `not_encoded`. No external project -in the first manifest has `live_real_world` suite evidence yet. +`lifecycle_fail`, and which real-world suites remain `not_encoded`. The manifest now +includes targeted `live_real_world` records for ELF and qmd through +`cargo make real-world-memory-live-adapters`; other external projects remain +live-baseline-only, incomplete, blocked, or not encoded until their own +`real_world_job` adapters run. Benchmark suite labels: diff --git a/scripts/real-world-live-adapters.sh b/scripts/real-world-live-adapters.sh new file mode 100755 index 00000000..9ddb72c7 --- /dev/null +++ b/scripts/real-world-live-adapters.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_REAL_WORLD_LIVE_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/live-adapters}" +FIXTURE_DIR="${ELF_REAL_WORLD_LIVE_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_live_adapters}" +WORK_DIR="${ELF_REAL_WORLD_LIVE_WORK_DIR:-/bench/real-world-live-adapters}" +QMD_DIR="${ELF_REAL_WORLD_QMD_DIR:-/bench/repos/qmd}" + +if [[ ! -f "/.dockerenv" && "${ELF_REAL_WORLD_LIVE_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run live real-world adapters outside Docker. Use cargo make real-world-memory-live-adapters." >&2 + exit 1 +fi + +for cmd in bash cargo git jq npm npx; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in live adapter runner." 
>&2 + exit 1 + fi +done + +mkdir -p "${REPORT_DIR}" "${WORK_DIR}" +rm -rf "${REPORT_DIR:?}/elf-fixtures" \ + "${REPORT_DIR:?}/qmd-fixtures" \ + "${REPORT_DIR:?}/elf-materialization.json" \ + "${REPORT_DIR:?}/qmd-materialization.json" \ + "${REPORT_DIR:?}/elf-report.json" \ + "${REPORT_DIR:?}/elf-report.md" \ + "${REPORT_DIR:?}/qmd-report.json" \ + "${REPORT_DIR:?}/qmd-report.md" \ + "${REPORT_DIR:?}/summary.json" + +cd "${ROOT_DIR}" + +cargo run -p elf-eval --bin real_world_live_adapter -- elf \ + --fixtures "${FIXTURE_DIR}" \ + --out-fixtures "${REPORT_DIR}/elf-fixtures" \ + --evidence-out "${REPORT_DIR}/elf-materialization.json" \ + --config config/local/elf.docker.toml + +cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${REPORT_DIR}/elf-fixtures" \ + --out "${REPORT_DIR}/elf-report.json" \ + --run-id real-world-memory-live-elf \ + --adapter-id elf_live_real_world \ + --adapter-name "ELF live real-world service adapter" \ + --adapter-behavior live_real_world_adapter \ + --adapter-storage-status pass \ + --adapter-runtime-status pass \ + --adapter-notes "Materialized by real_world_live_adapter through ElfService, worker indexing, and search_raw." 
+# Render the ELF JSON report produced above as Markdown.
+cargo run -p elf-eval --bin real_world_job_benchmark -- publish \
+	--report "${REPORT_DIR}/elf-report.json" \
+	--out "${REPORT_DIR}/elf-report.md"
+# Materialize qmd answers for the same fixtures and record the evidence file.
+cargo run -p elf-eval --bin real_world_live_adapter -- qmd \
+	--fixtures "${FIXTURE_DIR}" \
+	--out-fixtures "${REPORT_DIR}/qmd-fixtures" \
+	--evidence-out "${REPORT_DIR}/qmd-materialization.json" \
+	--qmd-dir "${QMD_DIR}" \
+	--work-dir "${WORK_DIR}/qmd"
+# Score the qmd-materialized fixtures under the qmd_live_real_world adapter id.
+cargo run -p elf-eval --bin real_world_job_benchmark -- run \
+	--fixtures "${REPORT_DIR}/qmd-fixtures" \
+	--out "${REPORT_DIR}/qmd-report.json" \
+	--run-id real-world-memory-live-qmd \
+	--adapter-id qmd_live_real_world \
+	--adapter-name "qmd live real-world CLI adapter" \
+	--adapter-behavior live_real_world_adapter \
+	--adapter-storage-status pass \
+	--adapter-runtime-status pass \
+	--adapter-notes "Materialized by real_world_live_adapter through qmd collection add, update, embed, and query --json."
+# Render the qmd JSON report as Markdown.
+cargo run -p elf-eval --bin real_world_job_benchmark -- publish \
+	--report "${REPORT_DIR}/qmd-report.json" \
+	--out "${REPORT_DIR}/qmd-report.md"
+# Write summary.json; report paths derive from artifact_dir so a report-dir override stays consistent.
+jq -n \
+	--slurpfile elf_materialization "${REPORT_DIR}/elf-materialization.json" \
+	--slurpfile qmd_materialization "${REPORT_DIR}/qmd-materialization.json" \
+	--slurpfile elf_report "${REPORT_DIR}/elf-report.json" \
+	--slurpfile qmd_report "${REPORT_DIR}/qmd-report.json" \
+	'(env.ELF_REAL_WORLD_LIVE_REPORT_DIR // "tmp/real-world-memory/live-adapters") as $artifact_dir | {
+		schema: "elf.real_world_live_adapter_slice/v1",
+		generated_at: now | todateiso8601,
+		artifact_dir: $artifact_dir,
+		adapters: [
+			{
+				adapter_id: "elf_live_real_world",
+				evidence_class: "live_real_world",
+				materialization: $elf_materialization[0],
+				report: {
+					json: "\($artifact_dir)/elf-report.json",
+					markdown: "\($artifact_dir)/elf-report.md",
+					summary: $elf_report[0].summary
+				}
+			},
+			{
+				adapter_id: "qmd_live_real_world",
+				evidence_class: "live_real_world",
+				materialization: $qmd_materialization[0],
+				report: {
+					json: "\($artifact_dir)/qmd-report.json",
+					markdown: "\($artifact_dir)/qmd-report.md",
+					summary: $qmd_report[0].summary
+				}
+			}
+		]
+	}' >"${REPORT_DIR}/summary.json"
+# List the published artifacts for whoever invoked the runner.
+echo "Live real-world adapter reports:"
+echo " ${REPORT_DIR}/elf-report.json"
+echo " ${REPORT_DIR}/elf-report.md"
+echo " ${REPORT_DIR}/qmd-report.json"
+echo " ${REPORT_DIR}/qmd-report.md"
+echo " ${REPORT_DIR}/summary.json"