hack-ink · yvette-carlisle · Jun 10, 2026 · Jun 10, 2026
diff --git a/Makefile.toml b/Makefile.toml
@@ -418,6 +418,7 @@ args = [
 # | real-world-memory-production-ops        | composite |     |
 # | real-world-memory-production-ops-json   | command   |     |
 # | real-world-memory-production-ops-report | command   |     |
+# | real-world-memory-live-adapters         | command   |     |
 
 [tasks.real-world-job-smoke]
 workspace = false
@@ -805,6 +806,14 @@ args = [
 	"tmp/real-world-memory/consolidation/report.md",
 ]
 
+[tasks.real-world-memory-live-adapters]  # Runs scripts/real-world-live-adapters.sh inside the baseline-runner service of docker-compose.baseline.yml.
+workspace = false  # NOTE(review): presumably keeps cargo-make from fanning this task out per workspace member — confirm.
+command = "bash"
+args = [
+	"-lc",  # -l starts bash as a login shell; -c executes the next element as the command string.
+	"set -euo pipefail; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner bash scripts/real-world-live-adapters.sh",  # Fail fast on errors/unset vars; --build builds the image before the run, --rm removes the container afterwards.
+]
+
 
 # Real-world memory knowledge benchmark
 # | task                           | type      | cwd |

diff --git a/README.md b/README.md
@@ -147,14 +147,20 @@ with the production embedding provider path, `Qwen3-Embedding-8B`, and
   jobs across 11 suites, 35 pass, 1 incomplete, 2 blocked, 0 wrong-result,
   0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are
   production-ops operator boundaries, not hidden benchmark wins.
+- Targeted live real-world adapter slice after XY-868: ELF and qmd now have
+  Docker-isolated `live_real_world` records for representative `work_resume`,
+  `retrieval`, and `project_decisions` jobs through
+  `cargo make real-world-memory-live-adapters`. This does not imply full-suite
+  live-service parity, broad adapter parity, or private-corpus production proof.
 - The benchmark runner and report publisher are checked in and Docker-isolated:
   `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`,
   `cargo make baseline-production-private-addendum`,
   `cargo make baseline-backfill-10k-docker`,
   `cargo make baseline-backfill-100k-docker`,
-  `cargo make baseline-soak-docker`, `cargo make baseline-live-report`, and
-  `cargo make baseline-live-docker-clean`. Expensive 100k and long-soak profiles are
-  opt-in and do not run in normal checks.
+  `cargo make baseline-soak-docker`, `cargo make baseline-live-report`,
+  `cargo make real-world-memory-live-adapters`, and
+  `cargo make baseline-live-docker-clean`. Expensive 100k and long-soak profiles
+  are opt-in and do not run in normal checks.
 
 Detailed evidence and interpretation:
 
@@ -170,21 +176,21 @@ Detailed evidence and interpretation:
   now reports fixture-backed ELF evidence plus the external adapter coverage manifest
   for ELF, qmd, agentmemory, mem0/OpenMemory, claude-mem, memsearch, and OpenViking.
   The report still distinguishes fixture-backed and live-baseline-only evidence from
-  true live real-world adapter runs; no external project has a live real-world suite win
-  until an adapter actually executes `real_world_job` prompts and scoring.
+  true live real-world adapter runs; only the targeted ELF and qmd live adapter slice
+  currently executes `real_world_job` prompts and scoring.
 
 Evidence-backed position after the June 10 real-world report:
 
 - ELF is better evidenced than the tested alternatives on evidence-bound writes,
   deterministic ingestion boundaries, Postgres source-of-truth plus rebuildable Qdrant
   indexing, scoped service APIs, and fixture-backed provenance/resume/evolution checks.
 - ELF and qmd are both strong in the current encoded retrieval evidence: qmd remains
-  the local retrieval-debug baseline, while ELF has the stronger service and provenance
-  contract.
-- ELF is still behind or not yet proven on live real-world external adapters,
-  private-corpus production quality, credentialed production-ops gates, qmd-style local
-  debug knobs, agentmemory/claude-mem/OpenMemory-style continuity UX, OpenViking-style
-  context trajectory, and hosted managed memory.
+  the local retrieval-debug baseline, ELF has the stronger service and provenance
+  contract, and both now have targeted live real-world job evidence.
+- ELF is still behind or not yet proven on full-suite live real-world external
+  adapters, private-corpus production quality, credentialed production-ops gates,
+  qmd-style local debug knobs, agentmemory/claude-mem/OpenMemory-style continuity UX,
+  OpenViking-style context trajectory, and hosted managed memory.
 
 Quick comparison snapshot (objective/high-level).
 This table compares capability coverage, not overall project quality.

diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json
@@ -126,12 +126,90 @@
       ],
       "notes": [
         "This adapter record exists to keep ELF fixture results separate from live external adapter results.",
-        "The remaining non-pass ELF fixture states are production-ops operator boundaries: a Docker local-embedding dependency, provider credentials, and an operator-owned private corpus manifest."
+        "The remaining non-pass ELF fixture states are production-ops operator boundaries: a Docker local-embedding dependency, provider credentials, and an operator-owned private corpus manifest.",
+        "Use elf_live_real_world for service-runtime real_world_job evidence; this fixture-backed record must not imply live-service behavior."
+      ]
+    },
+    {
+      "adapter_id": "elf_live_real_world",
+      "project": "ELF",
+      "adapter_kind": "docker_service_real_world_job",
+      "evidence_class": "live_real_world",
+      "docker_default": true,
+      "host_global_installs_required": false,
+      "overall_status": "pass",
+      "setup": {
+        "status": "pass",
+        "evidence": "The live adapter task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.",
+        "command": "cargo make real-world-memory-live-adapters",
+        "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json"
+      },
+      "run": {
+        "status": "pass",
+        "evidence": "ELF materializes real_world_job adapter_response objects through ElfService, worker indexing, and search_raw before scoring.",
+        "command": "cargo make real-world-memory-live-adapters",
+        "artifact": "tmp/real-world-memory/live-adapters/elf-report.json"
+      },
+      "result": {
+        "status": "pass",
+        "evidence": "The representative live adapter slice scores work_resume, retrieval, and project_decisions jobs from generated runtime answers.",
+        "command": "cargo make real-world-memory-live-adapters",
+        "artifact": "tmp/real-world-memory/live-adapters/elf-report.md"
+      },
+      "capabilities": [
+        {
+          "capability": "real_world_job_adapter",
+          "status": "pass",
+          "evidence": "The adapter executes real_world_job prompts after runtime ingestion and writes generated answer artifacts before scoring."
+        },
+        {
+          "capability": "service_runtime_execution",
+          "status": "real",
+          "evidence": "The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker."
+        },
+        {
+          "capability": "typed_failure_reporting",
+          "status": "pass",
+          "evidence": "Adapter setup/runtime failures are materialized as incomplete jobs with evidence JSON instead of silent claim upgrades."
+        }
       ],
-      "follow_up": {
-        "title": "[ELF benchmark vNext] Replace fixture-only ELF answers with live real-world adapter execution where appropriate",
-        "reason": "The current report proves fixture scoring, not an end-to-end live real-world memory service run."
-      }
+      "suites": [
+        {
+          "suite_id": "work_resume",
+          "status": "pass",
+          "evidence": "The live adapter retrieves the current next-action evidence and avoids the stale same-corpus command trap."
+        },
+        {
+          "suite_id": "retrieval",
+          "status": "pass",
+          "evidence": "The live adapter retrieves the live_real_world claim boundary from the indexed corpus."
+        },
+        {
+          "suite_id": "project_decisions",
+          "status": "pass",
+          "evidence": "The live adapter retrieves the decision that fixture_backed results must not imply live-service behavior."
+        }
+      ],
+      "evidence": [
+        {
+          "kind": "fixture_dir",
+          "ref": "apps/elf-eval/fixtures/real_world_live_adapters/",
+          "status": "real"
+        },
+        {
+          "kind": "command",
+          "ref": "cargo make real-world-memory-live-adapters",
+          "status": "pass"
+        },
+        {
+          "kind": "artifact",
+          "ref": "tmp/real-world-memory/live-adapters/elf-report.json",
+          "status": "pass"
+        }
+      ],
+      "notes": [
+        "This is the first Docker-isolated live real_world_job adapter path for ELF; broader suite expansion remains separate from the fixture-backed aggregate."
+      ]
     },
     {
       "adapter_id": "qmd_live_baseline",
@@ -205,7 +283,88 @@
         }
       ],
       "notes": [
-        "Do not claim a qmd real-world suite pass until a real_world_job adapter executes qmd and records job-level evidence."
+        "This same-corpus record remains separate from qmd_live_real_world, which records real_world_job prompt execution and scoring evidence."
+      ]
+    },
+    {
+      "adapter_id": "qmd_live_real_world",
+      "project": "qmd",
+      "adapter_kind": "docker_cli_real_world_job",
+      "evidence_class": "live_real_world",
+      "docker_default": true,
+      "host_global_installs_required": false,
+      "overall_status": "pass",
+      "setup": {
+        "status": "pass",
+        "evidence": "The live adapter task clones and installs qmd inside the baseline Docker container when the checkout is absent.",
+        "command": "cargo make real-world-memory-live-adapters",
+        "artifact": "tmp/real-world-memory/live-adapters/qmd-materialization.json"
+      },
+      "run": {
+        "status": "pass",
+        "evidence": "qmd indexes each real_world_job corpus through collection add, update, embed, and query --json before scoring generated answers.",
+        "command": "cargo make real-world-memory-live-adapters",
+        "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json"
+      },
+      "result": {
+        "status": "pass",
+        "evidence": "The representative live adapter slice scores qmd on work_resume, retrieval, and project_decisions jobs rather than same-corpus smoke checks only.",
+        "command": "cargo make real-world-memory-live-adapters",
+        "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md"
+      },
+      "capabilities": [
+        {
+          "capability": "real_world_job_adapter",
+          "status": "pass",
+          "evidence": "qmd executes real_world_job prompts through its local CLI retrieval/query workflow and records generated answer artifacts."
+        },
+        {
+          "capability": "local_cli_retrieval",
+          "status": "real",
+          "evidence": "The adapter uses qmd collection add, update, embed -f, and query --json inside Docker."
+        },
+        {
+          "capability": "typed_failure_reporting",
+          "status": "pass",
+          "evidence": "qmd setup/runtime failures are materialized as incomplete jobs with command evidence and retry artifacts."
+        }
+      ],
+      "suites": [
+        {
+          "suite_id": "work_resume",
+          "status": "pass",
+          "evidence": "qmd retrieves the current next-action evidence and avoids the stale same-corpus command trap."
+        },
+        {
+          "suite_id": "retrieval",
+          "status": "pass",
+          "evidence": "qmd retrieves the live_real_world claim boundary from indexed real_world_job corpus files."
+        },
+        {
+          "suite_id": "project_decisions",
+          "status": "pass",
+          "evidence": "qmd retrieves the decision that fixture_backed results must not imply live-service behavior."
+        }
+      ],
+      "evidence": [
+        {
+          "kind": "fixture_dir",
+          "ref": "apps/elf-eval/fixtures/real_world_live_adapters/",
+          "status": "real"
+        },
+        {
+          "kind": "command",
+          "ref": "cargo make real-world-memory-live-adapters",
+          "status": "pass"
+        },
+        {
+          "kind": "artifact",
+          "ref": "tmp/real-world-memory/live-adapters/qmd-report.json",
+          "status": "pass"
+        }
+      ],
+      "notes": [
+        "This qmd record is real-world job evidence and must not be conflated with the same-corpus qmd_live_baseline record."
       ]
     },
     {

diff --git a/apps/elf-eval/fixtures/real_world_live_adapters/project_decision_fixture_boundary.json b/apps/elf-eval/fixtures/real_world_live_adapters/project_decision_fixture_boundary.json
@@ -0,0 +1,133 @@
+{
+  "schema": "elf.real_world_job/v1",
+  "job_id": "live-adapter-project-decision-boundary-001",
+  "suite": "project_decisions",
+  "title": "Live adapter retrieves the decision that fixture scoring must not imply service behavior",
+  "corpus": {
+    "corpus_id": "real-world-live-adapters-2026-06-10",
+    "profile": "external_adapter",
+    "items": [
+      {
+        "evidence_id": "fixture-live-service-boundary",
+        "kind": "decision",
+        "text": "Current adapter decision: fixture_backed results must not imply live-service behavior; live_real_world evidence is required before service/runtime superiority claims.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_live_adapter_fixture/v1",
+          "ref": {
+            "fixture": "project_decision_fixture_boundary",
+            "evidence_id": "fixture-live-service-boundary"
+          }
+        },
+        "created_at": "2026-06-10T06:20:00Z"
+      },
+      {
+        "evidence_id": "old-fixture-superiority-trap",
+        "kind": "decision",
+        "text": "Old adapter decision: fixture_backed scoring alone proves live-service superiority for ELF.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_live_adapter_fixture/v1",
+          "ref": {
+            "fixture": "project_decision_fixture_boundary",
+            "evidence_id": "old-fixture-superiority-trap"
+          }
+        },
+        "created_at": "2026-06-09T06:20:00Z"
+      }
+    ]
+  },
+  "timeline": [
+    {
+      "event_id": "old-fixture-superiority-recorded",
+      "ts": "2026-06-09T06:20:00Z",
+      "actor": "agent",
+      "action": "recorded_old_decision",
+      "evidence_ids": ["old-fixture-superiority-trap"],
+      "summary": "The old decision incorrectly treated fixture-backed scoring as live service proof."
+    },
+    {
+      "event_id": "fixture-live-boundary-recorded",
+      "ts": "2026-06-10T06:20:00Z",
+      "actor": "agent",
+      "action": "recorded_current_decision",
+      "evidence_ids": ["fixture-live-service-boundary"],
+      "summary": "The current decision requires live_real_world evidence before service/runtime superiority claims."
+    }
+  ],
+  "prompt": {
+    "role": "user",
+    "content": "What is the current decision about fixture_backed scoring and live-service behavior claims?",
+    "job_mode": "answer",
+    "constraints": ["cite_evidence", "avoid_stale_facts"]
+  },
+  "expected_answer": {
+    "must_include": [
+      {
+        "claim_id": "fixture_boundary",
+        "text": "Current adapter decision: fixture_backed results must not imply live-service behavior; live_real_world evidence is required before service/runtime superiority claims."
+      }
+    ],
+    "must_not_include": [
+      "Old adapter decision: fixture_backed scoring alone proves live-service superiority for ELF."
+    ],
+    "evidence_links": {
+      "fixture_boundary": ["fixture-live-service-boundary"]
+    },
+    "answer_type": "decision",
+    "accepted_alternates": [],
+    "requires_caveat": false,
+    "requires_refusal": false
+  },
+  "required_evidence": [
+    {
+      "evidence_id": "fixture-live-service-boundary",
+      "claim_id": "fixture_boundary",
+      "requirement": "cite",
+      "quote": "fixture_backed results must not imply live-service behavior"
+    }
+  ],
+  "negative_traps": [
+    {
+      "trap_id": "old-fixture-superiority-claim",
+      "type": "stale_fact",
+      "evidence_ids": ["old-fixture-superiority-trap"],
+      "failure_if_used": true
+    }
+  ],
+  "scoring_rubric": {
+    "dimensions": {
+      "answer_correctness": {
+        "weight": 0.35,
+        "max_points": 1.0,
+        "criteria": "States the current fixture-backed boundary."
+      },
+      "evidence_grounding": {
+        "weight": 0.3,
+        "max_points": 1.0,
+        "criteria": "Cites the current decision evidence."
+      },
+      "trap_avoidance": {
+        "weight": 0.2,
+        "max_points": 1.0,
+        "criteria": "Avoids the stale superiority decision."
+      },
+      "workflow_helpfulness": {
+        "weight": 0.15,
+        "max_points": 1.0,
+        "criteria": "Keeps README/adoption claim boundaries clear."
+      }
+    },
+    "pass_threshold": 0.8,
+    "hard_fail_rules": [
+      "unsupported high-confidence claim about a required decision or fact",
+      "use of a negative trap marked failure_if_used = true"
+    ]
+  },
+  "allowed_uncertainty": {
+    "can_answer_unknown": false,
+    "acceptable_phrases": ["The live adapter did not retrieve that evidence."],
+    "fallback_action": "state_blocker"
+  },
+  "tags": ["external_adapter", "live_real_world", "project_decisions"]
+}