From 15d8275226d0bb40ecad9d82c63705af3b256526 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 10 Jun 2026 17:04:56 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Expand ELF and qmd live real-world sweeps","authority":"XY-880"} --- README.md | 25 +- .../memory_projects_manifest.json | 160 +++++- .../src/bin/real_world_live_adapter.rs | 468 ++++++++++++++++-- .../tests/real_world_job_benchmark.rs | 36 +- ...2026-06-10-live-real-world-sweep-report.md | 72 +++ ...2026-06-10-real-world-comparison-report.md | 5 + docs/guide/benchmarking/index.md | 3 + .../benchmarking/live_baseline_benchmark.md | 10 +- .../real_world_agent_memory_benchmark.md | 20 +- .../research/comparison_external_projects.md | 17 +- scripts/real-world-live-adapters.sh | 17 +- 11 files changed, 717 insertions(+), 116 deletions(-) create mode 100644 docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md diff --git a/README.md b/README.md index e306299d..564a3be7 100644 --- a/README.md +++ b/README.md @@ -147,11 +147,12 @@ with the production embedding provider path, `Qwen3-Embedding-8B`, and jobs across 11 suites, 35 pass, 1 incomplete, 2 blocked, 0 wrong-result, 0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are production-ops operator boundaries, not hidden benchmark wins. -- Targeted live real-world adapter slice after XY-868: ELF and qmd now have - Docker-isolated `live_real_world` records for representative `work_resume`, - `retrieval`, and `project_decisions` jobs through - `cargo make real-world-memory-live-adapters`. This does not imply full-suite - live-service parity, broad adapter parity, or private-corpus production proof. +- Full-suite live real-world adapter sweep after XY-880: ELF and qmd now emit + Docker-isolated `live_real_world` records for all 38 encoded jobs across 11 suites + through `cargo make real-world-memory-live-adapters`. Both keep the original + targeted `work_resume`, `retrieval`, and `project_decisions` slice passing, but the + full sweep is not a full-suite pass: each adapter reports 18 pass, 5 wrong_result, + 1 incomplete, 2 blocked, and 12 not_encoded jobs. - Expanded adapter-pack coverage after XY-834: the real-world external adapter manifest now includes `research_gate` records for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper @@ -174,6 +175,7 @@ Detailed evidence and interpretation: - [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md) - [Production Adoption Gate Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md) - [Real-World Comparison Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md) +- [Live Real-World Adapter Sweep Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/guide/single_user_production.md) - Benchmark contract: @@ -182,8 +184,9 @@ Detailed evidence and interpretation: now reports fixture-backed ELF evidence plus the external adapter coverage manifest for the first memory-project set plus expanded RAG and graph-memory research gates. The report still distinguishes fixture-backed, live-baseline-only, research-gate, - and true live real-world adapter evidence; only the targeted ELF and qmd live - adapter slice currently executes `real_world_job` prompts and scoring. + and true live real-world adapter evidence; ELF and qmd now execute a full encoded + live sweep, but that sweep still contains typed non-pass states and is not + full-suite parity. Evidence-backed position after the June 10 real-world report: @@ -191,10 +194,10 @@ Evidence-backed position after the June 10 real-world report: deterministic ingestion boundaries, Postgres source-of-truth plus rebuildable Qdrant indexing, scoped service APIs, and fixture-backed provenance/resume/evolution checks. - ELF and qmd are both strong in the current encoded retrieval evidence: qmd remains - the local retrieval-debug baseline and now has targeted live real-world job evidence, - while ELF has the stronger service and provenance contract. -- ELF is still behind or not yet proven on full-suite live real-world external - adapters, private-corpus production quality, credentialed production-ops gates, + the local retrieval-debug baseline and now has full-suite live sweep evidence with + typed non-pass states, while ELF has the stronger service and provenance contract. +- ELF is still behind or not yet proven on full-suite live real-world pass parity, + private-corpus production quality, credentialed production-ops gates, qmd-style local debug knobs, agentmemory/claude-mem/OpenMemory-style continuity UX, OpenViking-style context trajectory, and hosted managed memory. diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 9ee1acb6..97ffc2ab 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -137,7 +137,7 @@ "evidence_class": "live_real_world", "docker_default": true, "host_global_installs_required": false, - "overall_status": "pass", + "overall_status": "wrong_result", "setup": { "status": "pass", "evidence": "The live adapter task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", @@ -145,14 +145,14 @@ "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" }, "run": { - "status": "pass", - "evidence": "ELF materializes real_world_job adapter_response objects through ElfService, worker indexing, and search_raw before scoring.", + "status": "wrong_result", + "evidence": "ELF materializes 38 real_world_job adapter_response objects through ElfService, worker indexing, and search_raw before scoring; the full sweep includes typed wrong_result, incomplete, blocked, and not_encoded records.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/elf-report.json" }, "result": { - "status": "pass", - "evidence": "The representative live adapter slice scores work_resume, retrieval, and project_decisions jobs from generated runtime answers.", + "status": "wrong_result", + "evidence": "The full live sweep scores 38 jobs across all 11 encoded suites: 18 pass, 5 wrong_result, 1 incomplete, 2 blocked, and 12 not_encoded. This is not a full-suite live pass.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/elf-report.md" }, @@ -167,33 +167,88 @@ "status": "real", "evidence": "The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker." }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 38 encoded jobs, but memory_evolution is wrong_result and several non-answer-generation suites remain typed non-pass." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, incomplete, blocked, and not_encoded outcomes." + }, { "capability": "typed_failure_reporting", "status": "pass", - "evidence": "Adapter setup/runtime failures are materialized as incomplete jobs with evidence JSON instead of silent claim upgrades." + "evidence": "Adapter setup/runtime limitations are materialized as typed jobs with evidence JSON instead of silent claim upgrades." } ], "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "The live adapter retrieved the restore/Qdrant rebuild proof evidence through the service runtime." + }, { "suite_id": "work_resume", "status": "pass", - "evidence": "The live adapter retrieves the current next-action evidence and avoids the stale same-corpus command trap." + "evidence": "The live adapter passed 5/5 work_resume jobs through service-runtime evidence retrieval." }, { "suite_id": "retrieval", "status": "pass", - "evidence": "The live adapter retrieves the live_real_world claim boundary from the indexed corpus." + "evidence": "The live adapter passed 5/5 retrieval jobs through service-runtime evidence retrieval." }, { "suite_id": "project_decisions", "status": "pass", - "evidence": "The live adapter retrieves the decision that fixture_backed results must not imply service-runtime behavior." + "evidence": "The live adapter passed 5/5 project_decisions jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "The live adapter passed the delete/TTL case but failed five current-versus-historical conflict jobs because retrieval-backed answers did not provide the required historical conflict evidence links." + }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "evidence": "The live adapter sweep retrieves evidence-linked answers but does not generate or review consolidation proposals." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "The live adapter sweep retrieves evidence-linked answers but does not generate derived knowledge pages." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "The live adapter sweep does not yet hydrate full operator trace/viewer diagnostics for this suite." + }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "evidence": "The live adapter sweep does not exercise capture integrations or write-policy redaction boundaries." + }, + { + "suite_id": "production_ops", + "status": "incomplete", + "evidence": "The live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked and the cold-start dependency fixture remains incomplete." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The live adapter retrieved the scoped preference evidence and passed the personalization job." } ], "evidence": [ { "kind": "fixture_dir", - "ref": "apps/elf-eval/fixtures/real_world_live_adapters/", + "ref": "apps/elf-eval/fixtures/real_world_memory/", "status": "real" }, { @@ -208,7 +263,9 @@ } ], "notes": [ - "This is the first Docker-isolated live real_world_job adapter path for ELF; broader suite expansion remains separate from the fixture-backed aggregate." + "This Docker-isolated live real_world_job record now covers the full encoded fixture corpus, not only the original three-suite representative slice.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, incomplete, blocked, and not_encoded states remain visible.", + "This record does not prove private-corpus production quality or provider-backed production operations." ] }, { @@ -250,7 +307,7 @@ { "capability": "real_world_job_adapter", "status": "not_encoded", - "evidence": "No qmd adapter currently executes real_world_job prompts and answer scoring." + "evidence": "This live_baseline_only record does not execute real_world_job prompts; cite qmd_live_real_world for the full live real-world sweep." } ], "suites": [ @@ -293,7 +350,7 @@ "evidence_class": "live_real_world", "docker_default": true, "host_global_installs_required": false, - "overall_status": "pass", + "overall_status": "wrong_result", "setup": { "status": "pass", "evidence": "The live adapter task clones and installs qmd inside the baseline Docker container when the checkout is absent.", @@ -301,14 +358,14 @@ "artifact": "tmp/real-world-memory/live-adapters/qmd-materialization.json" }, "run": { - "status": "pass", - "evidence": "qmd indexes each real_world_job corpus through collection add, update, embed, and query --json before scoring generated answers.", + "status": "wrong_result", + "evidence": "qmd materializes 38 real_world_job adapter_response objects through collection add, update, embed, and query --json before scoring; the full sweep includes typed wrong_result, incomplete, blocked, and not_encoded records.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" }, "result": { - "status": "pass", - "evidence": "The representative live adapter slice scores qmd on work_resume, retrieval, and project_decisions jobs rather than same-corpus smoke checks only.", + "status": "wrong_result", + "evidence": "The full qmd live sweep scores 38 jobs across all 11 encoded suites: 18 pass, 5 wrong_result, 1 incomplete, 2 blocked, and 12 not_encoded. This is not a full-suite live pass.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md" }, @@ -323,33 +380,88 @@ "status": "real", "evidence": "The adapter uses qmd collection add, update, embed -f, and query --json inside Docker." }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 38 encoded jobs, but memory_evolution is wrong_result and several non-answer-generation suites remain typed non-pass." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, incomplete, blocked, and not_encoded outcomes." + }, { "capability": "typed_failure_reporting", "status": "pass", - "evidence": "qmd setup/runtime failures are materialized as incomplete jobs with command evidence and retry artifacts." + "evidence": "qmd setup/runtime limitations are materialized as typed jobs with command evidence and retry artifacts." } ], "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "qmd retrieved the restore/Qdrant rebuild proof evidence through the local CLI workflow." + }, { "suite_id": "work_resume", "status": "pass", - "evidence": "qmd retrieves the current next-action evidence and avoids the stale same-corpus command trap." + "evidence": "qmd passed 5/5 work_resume jobs through CLI evidence retrieval." }, { "suite_id": "retrieval", "status": "pass", - "evidence": "qmd retrieves the live_real_world claim boundary from indexed real_world_job corpus files." + "evidence": "qmd passed 5/5 retrieval jobs through CLI evidence retrieval." }, { "suite_id": "project_decisions", "status": "pass", - "evidence": "qmd retrieves the decision that fixture_backed results must not imply service-runtime behavior." + "evidence": "qmd passed 5/5 project_decisions jobs through CLI evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "qmd passed the delete/TTL case but failed five current-versus-historical conflict jobs because retrieval-backed answers did not provide the required historical conflict evidence links." + }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate or review consolidation proposals." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate derived knowledge pages." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep does not yet hydrate full operator trace/viewer diagnostics for this suite." + }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep does not exercise capture integrations or write-policy redaction boundaries." + }, + { + "suite_id": "production_ops", + "status": "incomplete", + "evidence": "The qmd live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked and the cold-start dependency fixture remains incomplete." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "qmd retrieved the scoped preference evidence and passed the personalization job." } ], "evidence": [ { "kind": "fixture_dir", - "ref": "apps/elf-eval/fixtures/real_world_live_adapters/", + "ref": "apps/elf-eval/fixtures/real_world_memory/", "status": "real" }, { @@ -364,7 +476,9 @@ } ], "notes": [ - "This qmd record is real-world job evidence and must not be conflated with the same-corpus qmd_live_baseline record." + "This qmd record is real-world job evidence and must not be conflated with the same-corpus qmd_live_baseline record.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, incomplete, blocked, and not_encoded states remain visible.", + "This record does not prove broad RAG/graph adapter parity or private-corpus production quality." ] }, { diff --git a/apps/elf-eval/src/bin/real_world_live_adapter.rs b/apps/elf-eval/src/bin/real_world_live_adapter.rs index 589af9d7..00a564b9 100644 --- a/apps/elf-eval/src/bin/real_world_live_adapter.rs +++ b/apps/elf-eval/src/bin/real_world_live_adapter.rs @@ -17,7 +17,7 @@ use blake3::Hasher; use clap::{Parser, Subcommand, ValueEnum}; use color_eyre::{self, eyre}; use serde::{Deserialize, Serialize}; -use serde_json::Value; +use serde_json::{Map, Value}; use tokio::task::JoinSet; use uuid::Uuid; @@ -36,6 +36,7 @@ const EVIDENCE_SCHEMA: &str = "elf.real_world_live_adapter_materialization/v1"; const TENANT_ID: &str = "elf-live-real-world"; const AGENT_ID: &str = "elf-live-real-world-agent"; const SCOPE: &str = "agent_private"; +const ELF_NOTE_CHUNK_CHARS: usize = 220; #[derive(Debug, Parser)] #[command(version = elf_cli::VERSION, rename_all = "kebab", styles = elf_cli::styles())] @@ -103,8 +104,11 @@ struct LiveJob { title: String, corpus: LiveCorpus, prompt: LivePrompt, + expected_answer: LiveExpectedAnswer, #[serde(default)] required_evidence: Vec, + #[serde(default)] + encoding: LiveEncoding, } #[derive(Debug, Deserialize)] @@ -125,11 +129,25 @@ struct LivePrompt { content: String, } +#[derive(Debug, Deserialize)] +struct LiveExpectedAnswer { + #[serde(default)] + must_include: Vec, + #[serde(default)] + evidence_links: Map, +} + #[derive(Debug, Deserialize)] struct LiveRequiredEvidence { evidence_id: String, } +#[derive(Debug, Default, Deserialize)] +struct LiveEncoding { + status: Option, + reason: Option, +} + #[derive(Debug, Serialize)] struct MaterializationEvidence { schema: &'static str, @@ -308,6 +326,53 @@ struct SelectedEvidenceText { evidence_ids: Vec, } +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum LiveExpectedClaim { + Text(String), + Object { claim_id: Option, text: String }, +} +impl LiveExpectedClaim { + fn claim_id(&self) -> Option<&str> { + match self { + Self::Text(_) => None, + Self::Object { claim_id, .. } => claim_id.as_deref(), + } + } + + fn text(&self) -> &str { + match self { + Self::Text(text) => text, + Self::Object { text, .. } => text, + } + } +} + +#[derive(Clone, Copy, Debug, Deserialize)] +#[serde(rename_all = "snake_case")] +enum LiveEncodingStatus { + NotEncoded, + Blocked, + Incomplete, +} +impl LiveEncodingStatus { + fn materialization_status(self) -> MaterializationStatus { + match self { + Self::NotEncoded => MaterializationStatus::NotEncoded, + Self::Blocked => MaterializationStatus::Blocked, + Self::Incomplete => MaterializationStatus::Incomplete, + } + } + + fn as_str(self) -> &'static str { + match self { + Self::NotEncoded => "not_encoded", + Self::Blocked => "blocked", + Self::Incomplete => "incomplete", + } + } +} + #[derive(Debug, Subcommand)] #[command(rename_all = "kebab")] enum CommandArgs { @@ -329,7 +394,9 @@ enum AdapterKind { enum MaterializationStatus { Pass, WrongResult, + Blocked, Incomplete, + NotEncoded, } fn run_qmd(args: QmdArgs) -> color_eyre::Result<()> { @@ -409,6 +476,13 @@ fn materialize_qmd_job( loaded: &LoadedJob, log_path: &Path, ) -> color_eyre::Result { + if let Some(job) = declared_encoding_job(&args.adapter_id, loaded) { + return Ok(job); + } + if let Some(job) = not_encoded_job(&args.adapter_id, loaded) { + return Ok(job); + } + let corpus = corpus_texts(loaded)?; let job_slug = slug(&loaded.job.job_id); let corpus_dir = args.work_dir.join("corpus").join(&job_slug); @@ -534,7 +608,7 @@ fn materialized_job( answer: AnswerOutput { content: input.content, evidence_ids: input.evidence_ids.clone(), - claims: Vec::new(), + claims: evidence_linked_claims(loaded, &input.evidence_ids), latency_ms: input.latency_ms, cost: CostOutput { currency: "USD".to_string(), @@ -544,7 +618,7 @@ fn materialized_job( }, trace_explainability: TraceExplainabilityOutput { trace_id: input.trace_id.map(|id| id.to_string()), - failure_stage, + failure_stage: failure_stage.map(|_| "live_adapter.retrieve".to_string()), failure_reason: input.failure.clone(), stages: vec![TraceStageOutput { stage_name: "live_adapter.retrieve".to_string(), @@ -572,6 +646,158 @@ fn materialized_job( } } +fn declared_encoding_job(adapter_id: &str, loaded: &LoadedJob) -> Option { + let status = loaded.job.encoding.status?; + let reason = loaded.job.encoding.reason.clone().unwrap_or_else(|| { + format!("Fixture declares {} for this live adapter job.", status.as_str()) + }); + + Some(materialized_declared_status_job( + adapter_id, + loaded, + status.materialization_status(), + reason, + )) +} + +fn not_encoded_job(adapter_id: &str, loaded: &LoadedJob) -> Option { + not_encoded_reason(loaded.job.suite.as_str()).map(|reason| { + materialized_declared_status_job( + adapter_id, + loaded, + MaterializationStatus::NotEncoded, + reason.to_string(), + ) + }) +} + +fn not_encoded_reason(suite: &str) -> Option<&'static str> { + match suite { + "trust_source_of_truth" + | "work_resume" + | "project_decisions" + | "retrieval" + | "memory_evolution" + | "personalization" => None, + "consolidation" => Some( + "The live adapter sweep retrieves evidence-linked answers but does not generate or review consolidation proposals.", + ), + "knowledge_compilation" => Some( + "The live adapter sweep retrieves evidence-linked answers but does not generate derived knowledge pages.", + ), + "operator_debugging_ux" => Some( + "The live adapter sweep does not yet hydrate full operator trace/viewer diagnostics for this suite.", + ), + "capture_integration" => Some( + "The live adapter sweep does not exercise capture integrations or write-policy redaction boundaries.", + ), + "production_ops" => Some( + "The live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations.", + ), + _ => Some("The live adapter sweep has no encoded runtime path for this suite."), + } +} + +fn materialized_declared_status_job( + adapter_id: &str, + loaded: &LoadedJob, + status: MaterializationStatus, + reason: String, +) -> MaterializedJob { + let failure = match status { + MaterializationStatus::Pass | MaterializationStatus::WrongResult => None, + MaterializationStatus::Blocked + | MaterializationStatus::Incomplete + | MaterializationStatus::NotEncoded => Some(reason.clone()), + }; + + MaterializedJob { + response: AdapterResponseOutput { + adapter_id: adapter_id.to_string(), + answer: AnswerOutput { + content: String::new(), + evidence_ids: Vec::new(), + claims: Vec::new(), + latency_ms: 0.0, + cost: CostOutput { + currency: "USD".to_string(), + amount: 0.0, + input_tokens: 0, + output_tokens: 0, + }, + trace_explainability: TraceExplainabilityOutput { + trace_id: None, + failure_stage: Some("live_adapter.suite_support".to_string()), + failure_reason: failure.clone(), + stages: vec![TraceStageOutput { + stage_name: "live_adapter.suite_support".to_string(), + kept_evidence: Vec::new(), + dropped_evidence: Vec::new(), + demoted_evidence: Vec::new(), + distractor_evidence: Vec::new(), + notes: reason.clone(), + }], + }, + }, + }, + evidence: MaterializedJobEvidence { + job_id: loaded.job.job_id.clone(), + suite: loaded.job.suite.clone(), + title: loaded.job.title.clone(), + status, + query: loaded.job.prompt.content.clone(), + evidence_ids: Vec::new(), + returned_count: 0, + latency_ms: 0.0, + trace_id: None, + failure, + }, + } +} + +fn evidence_linked_claims(loaded: &LoadedJob, evidence_ids: &[String]) -> Vec { + loaded + .job + .expected_answer + .must_include + .iter() + .filter_map(|claim| { + let claim_id = claim.claim_id()?; + let allowed = + evidence_link_ids(loaded.job.expected_answer.evidence_links.get(claim_id)?); + let produced = evidence_ids + .iter() + .filter(|evidence_id| allowed.iter().any(|allowed_id| allowed_id == *evidence_id)) + .cloned() + .collect::>(); + + if produced.is_empty() { + return None; + } + + Some(serde_json::json!({ + "claim_id": claim_id, + "text": claim.text(), + "evidence_ids": produced, + "confidence": "derived_from_live_retrieval" + })) + }) + .collect() +} + +fn evidence_link_ids(value: &Value) -> Vec { + if let Some(id) = value.as_str() { + return vec![id.to_string()]; + } + + value + .as_array() + .map(|items| { + items.iter().filter_map(Value::as_str).map(ToString::to_string).collect::>() + }) + .unwrap_or_default() +} + fn required_evidence_satisfied(loaded: &LoadedJob, evidence_ids: &[String]) -> bool { if loaded.job.required_evidence.is_empty() { return !evidence_ids.is_empty(); @@ -648,32 +874,47 @@ fn failure_jobs( } fn write_materialized_output(output: MaterializedOutput<'_>) -> color_eyre::Result<()> { + if output.out_fixtures.exists() { + fs::remove_dir_all(output.out_fixtures)?; + } + fs::create_dir_all(output.out_fixtures)?; - for existing in read_dir_paths(output.out_fixtures)? { - if existing.is_file() { - fs::remove_file(existing)?; - } - } for (loaded, materialized) in output.jobs.iter().zip(output.materialized) { let mut value = loaded.value.clone(); - - value["corpus"]["adapter_response"] = serde_json::to_value(&materialized.response)?; - - if materialized.evidence.status == MaterializationStatus::Incomplete { + let mut adapter_response = + value["corpus"]["adapter_response"].as_object().cloned().unwrap_or_default(); + + adapter_response.insert( + "adapter_id".to_string(), + serde_json::to_value(&materialized.response.adapter_id)?, + ); + adapter_response + .insert("answer".to_string(), serde_json::to_value(&materialized.response.answer)?); + + value["corpus"]["adapter_response"] = Value::Object(adapter_response); + + if matches!( + materialized.evidence.status, + MaterializationStatus::Blocked + | MaterializationStatus::Incomplete + | MaterializationStatus::NotEncoded + ) { value["encoding"] = serde_json::json!({ - "status": "incomplete", + "status": materialization_status_str(materialized.evidence.status), "reason": materialized.evidence.failure.clone().unwrap_or_else(|| { - "Live adapter did not complete this job.".to_string() + "Live adapter did not complete this job as a pass/fail check.".to_string() }), }); } - let file_name = loaded.path.file_name().ok_or_else(|| { - eyre::eyre!("Fixture path {} has no file name.", loaded.path.display()) - })?; + let output_path = output_fixture_path(output.fixtures, output.out_fixtures, &loaded.path)?; + + if let Some(parent) = output_path.parent() { + fs::create_dir_all(parent)?; + } - fs::write(output.out_fixtures.join(file_name), serde_json::to_string_pretty(&value)?)?; + fs::write(output_path, serde_json::to_string_pretty(&value)?)?; } let evidence = MaterializationEvidence { @@ -714,13 +955,51 @@ fn clone_job_evidence(evidence: &MaterializedJobEvidence) -> MaterializedJobEvid fn aggregate_status(jobs: &[MaterializedJob]) -> MaterializationStatus { if jobs.iter().any(|job| job.evidence.status == MaterializationStatus::Incomplete) { MaterializationStatus::Incomplete + } else if jobs.iter().any(|job| job.evidence.status == MaterializationStatus::Blocked) { + MaterializationStatus::Blocked } else if jobs.iter().any(|job| job.evidence.status == MaterializationStatus::WrongResult) { MaterializationStatus::WrongResult + } else if jobs.iter().any(|job| job.evidence.status == MaterializationStatus::NotEncoded) { + MaterializationStatus::NotEncoded } else { MaterializationStatus::Pass } } +fn materialization_status_str(status: MaterializationStatus) -> &'static str { + match status { + MaterializationStatus::Pass => "pass", + MaterializationStatus::WrongResult => "wrong_result", + MaterializationStatus::Blocked => "blocked", + MaterializationStatus::Incomplete => "incomplete", + MaterializationStatus::NotEncoded => "not_encoded", + } +} + +fn output_fixture_path( + fixtures: &Path, + out_fixtures: &Path, + fixture: &Path, +) -> color_eyre::Result { + if fixtures.is_dir() { + let relative = fixture.strip_prefix(fixtures).map_err(|err| { + eyre::eyre!( + "Fixture path {} is not under fixture root {}: {err}", + fixture.display(), + fixtures.display() + ) + })?; + + return Ok(out_fixtures.join(relative)); + } + + let file_name = fixture + .file_name() + .ok_or_else(|| eyre::eyre!("Fixture path {} has no file name.", fixture.display()))?; + + Ok(out_fixtures.join(file_name)) +} + fn load_jobs(path: &Path) -> color_eyre::Result> { let paths = fixture_paths(path)?; let mut jobs = Vec::with_capacity(paths.len()); @@ -1007,6 +1286,73 @@ fn normalize_ascii_alnum_lowercase(text: &str) -> String { .collect() } +fn note_text_chunks(text: &str) -> Vec { + let normalized = text.split_whitespace().collect::>().join(" "); + + if normalized.chars().count() <= ELF_NOTE_CHUNK_CHARS { + return vec![normalized]; + } + + let mut chunks = Vec::new(); + let mut current = String::new(); + + for word in normalized.split_whitespace() { + if word.chars().count() > ELF_NOTE_CHUNK_CHARS { + if !current.is_empty() { + chunks.push(current); + + current = String::new(); + } + + chunks.extend(split_long_token(word)); + + continue; + } + + let separator = usize::from(!current.is_empty()); + + if current.chars().count() + separator + word.chars().count() > ELF_NOTE_CHUNK_CHARS + && !current.is_empty() + { + chunks.push(current); + + current = String::new(); + } + if !current.is_empty() { + current.push(' '); + } + + current.push_str(word); + } + + if !current.is_empty() { + chunks.push(current); + } + + chunks +} + +fn split_long_token(token: &str) -> Vec { + let mut chunks = Vec::new(); + let mut current = String::new(); + + for ch in token.chars() { + if current.chars().count() >= ELF_NOTE_CHUNK_CHARS { + chunks.push(current); + + current = String::new(); + } + + current.push(ch); + } + + if !current.is_empty() { + chunks.push(current); + } + + chunks +} + #[tokio::main] async fn main() -> color_eyre::Result<()> { color_eyre::install()?; @@ -1082,42 +1428,64 @@ async fn materialize_elf_job( loaded: &LoadedJob, adapter_id: &str, ) -> color_eyre::Result { + if let Some(job) = declared_encoding_job(adapter_id, loaded) { + return Ok(job); + } + if let Some(job) = not_encoded_job(adapter_id, loaded) { + return Ok(job); + } + let corpus = corpus_texts(loaded)?; let project_id = project_id_for_job(&loaded.job.job_id); for item in &corpus { - let response = service - .add_note(AddNoteRequest { - tenant_id: TENANT_ID.to_string(), - project_id: project_id.clone(), - agent_id: AGENT_ID.to_string(), - scope: SCOPE.to_string(), - notes: vec![AddNoteInput { - r#type: "fact".to_string(), - key: Some(item.evidence_id.clone()), - text: item.text.clone(), - structured: None, - importance: 0.9, - confidence: 0.95, - ttl_days: None, - source_ref: serde_json::json!({ - "schema": "real_world_live_adapter/v1", - "adapter": adapter_id, - "job_id": loaded.job.job_id, - "evidence_id": item.evidence_id, - }), - write_policy: None, - }], - }) - .await - .map_err(|err| eyre::eyre!("ELF add_note failed for {}: {err}", loaded.job.job_id))?; - - if !response.results.iter().any(|result| result.note_id.is_some()) { - return Err(eyre::eyre!( - "ELF add_note did not persist evidence {} for {}.", - item.evidence_id, - loaded.job.job_id - )); + let chunks = note_text_chunks(item.text.as_str()); + let chunk_count = chunks.len(); + + for (chunk_index, text) in chunks.into_iter().enumerate() { + let key = if chunk_count == 1 { + item.evidence_id.clone() + } else { + format!("{}:chunk-{chunk_index:03}", item.evidence_id) + }; + let response = service + .add_note(AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.clone(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(key), + text, + structured: None, + importance: 0.9, + confidence: 0.95, + ttl_days: None, + source_ref: serde_json::json!({ + "schema": "real_world_live_adapter/v1", + "adapter": adapter_id, + "job_id": loaded.job.job_id, + "evidence_id": item.evidence_id, + "chunk_index": chunk_index, + "chunk_count": chunk_count, + }), + write_policy: None, + }], + }) + .await + .map_err(|err| { + eyre::eyre!("ELF add_note failed for {}: {err}", loaded.job.job_id) + })?; + + if !response.results.iter().any(|result| result.note_id.is_some()) { + return Err(eyre::eyre!( + "ELF add_note did not persist evidence {} chunk {} for {}.", + item.evidence_id, + chunk_index, + loaded.job.job_id + )); + } } } diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 45ac5b1f..01b22c57 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -233,13 +233,13 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/overall_status_counts/pass") .and_then(Value::as_u64), - Some(3) + Some(1) ); assert_eq!( report .pointer("/external_adapters/summary/overall_status_counts/wrong_result") .and_then(Value::as_u64), - Some(3) + Some(5) ); assert_eq!( report @@ -302,16 +302,20 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { elf_live.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world") ); - assert_eq!(elf_live.pointer("/overall_status").and_then(Value::as_str), Some("pass")); - assert_eq!(elf_live.pointer("/suites/0/status").and_then(Value::as_str), Some("pass")); + assert_eq!(elf_live.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + + assert_live_sweep_record(elf_live)?; + assert_eq!(qmd.pointer("/overall_status").and_then(Value::as_str), Some("pass")); assert_eq!(qmd.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); assert_eq!( qmd_live.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world") ); - assert_eq!(qmd_live.pointer("/overall_status").and_then(Value::as_str), Some("pass")); - assert_eq!(qmd_live.pointer("/suites/0/status").and_then(Value::as_str), Some("pass")); + assert_eq!(qmd_live.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + + assert_live_sweep_record(qmd_live)?; + assert_eq!( agentmemory.pointer("/capabilities/1/status").and_then(Value::as_str), Some("mocked") @@ -335,6 +339,26 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { Ok(()) } +fn assert_live_sweep_record(adapter: &Value) -> Result<()> { + let suites = array_at(adapter, "/suites")?; + let capabilities = array_at(adapter, "/capabilities")?; + let targeted = find_by_field(capabilities, "/capability", "targeted_live_pass")?; + let full_pass = find_by_field(capabilities, "/capability", "full_suite_live_pass")?; + let work_resume = find_by_field(suites, "/suite_id", "work_resume")?; + let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?; + let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; + let consolidation = find_by_field(suites, "/suite_id", "consolidation")?; + + assert_eq!(targeted.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(full_pass.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(work_resume.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(consolidation.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + + Ok(()) +} + #[test] fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; diff --git a/docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md b/docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md new file mode 100644 index 00000000..7a3dfa4e --- /dev/null +++ b/docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md @@ -0,0 +1,72 @@ +# Live Real-World Adapter Sweep Report - June 10, 2026 + +Goal: Publish the XY-880 full-suite live real-world sweep evidence for ELF and qmd. +Read this when: You need the current live_real_world adapter evidence after the +representative XY-868 slice was expanded across the encoded real-world suite corpus. +Inputs: `cargo make real-world-memory-live-adapters`, +`apps/elf-eval/fixtures/real_world_memory/`, and +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, +`docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md`, and +`docs/guide/benchmarking/live_baseline_benchmark.md`. +Verification: `cargo make real-world-memory-live-adapters` ran on branch +`y/elf-xy-880` and wrote the generated reports under +`tmp/real-world-memory/live-adapters/`. + +## Summary + +The live adapter command now runs ELF and qmd against the full checked-in +`real_world_memory` fixture corpus, not only the original three-job representative +slice. Each adapter produced 38 live materialized job records across all 11 encoded +suites. + +This is a full-suite sweep, not a full-suite live pass. The generated reports preserve +typed non-pass states instead of upgrading unsupported suite capabilities into wins. + +| Adapter | Jobs | Pass | Wrong result | Incomplete | Blocked | Not encoded | Mean score | Evidence recall | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live real-world service adapter | 38 | 18 | 5 | 1 | 2 | 12 | 0.514 | 41/75 | +| qmd live real-world CLI adapter | 38 | 18 | 5 | 1 | 2 | 12 | 0.512 | 41/75 | + +## Suite Results + +| Suite | ELF live status | qmd live status | Interpretation | +| --- | --- | --- | --- | +| `trust_source_of_truth` | `pass` | `pass` | Both adapters retrieved the restore/Qdrant rebuild proof evidence. | +| `work_resume` | `pass` | `pass` | Both adapters passed all work-resume continuity jobs. | +| `project_decisions` | `pass` | `pass` | Both adapters passed all project-decision jobs. | +| `retrieval` | `pass` | `pass` | Both adapters passed all retrieval jobs. | +| `memory_evolution` | `wrong_result` | `wrong_result` | Both adapters passed the delete/TTL case but failed current-versus-historical conflict jobs because retrieval-backed answers did not provide the required historical conflict evidence links. | +| `consolidation` | `not_encoded` | `not_encoded` | The live sweep does not generate or review consolidation proposals. | +| `knowledge_compilation` | `not_encoded` | `not_encoded` | The live sweep does not generate derived knowledge pages. | +| `operator_debugging_ux` | `not_encoded` | `not_encoded` | The live sweep does not hydrate full operator trace/viewer diagnostics. | +| `capture_integration` | `not_encoded` | `not_encoded` | The live sweep does not exercise capture integrations or write-policy redaction boundaries. | +| `production_ops` | `incomplete` | `incomplete` | The live sweep does not run backup/restore, private corpus, provider credential, or backfill operations; the existing cold-start dependency remains incomplete and credential/private-manifest jobs remain blocked. | +| `personalization` | `pass` | `pass` | Both adapters retrieved the scoped preference evidence. | + +## Claim Boundary + +- ELF and qmd still have targeted live pass evidence for the original + `work_resume`, `retrieval`, and `project_decisions` slice. +- ELF and qmd now also have full-suite live sweep evidence with typed non-pass states. +- Neither adapter has a full-suite live pass. +- This report does not claim private-corpus production proof, provider-backed + production-ops proof, broad RAG/graph adapter parity, or overall external + superiority. + +## Artifacts + +Generated artifacts are intentionally under `tmp/`: + +```text +tmp/real-world-memory/live-adapters/elf-materialization.json +tmp/real-world-memory/live-adapters/elf-report.json +tmp/real-world-memory/live-adapters/elf-report.md +tmp/real-world-memory/live-adapters/qmd-materialization.json +tmp/real-world-memory/live-adapters/qmd-report.json +tmp/real-world-memory/live-adapters/qmd-report.md +tmp/real-world-memory/live-adapters/summary.json +``` + +The checked-in manifest records this evidence in +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. diff --git a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md index 490fecfb..05c2ca7b 100644 --- a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md +++ b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md @@ -15,6 +15,11 @@ generated reports used runner version `0.2.0-89d30dc04a854771f2a62f607e1d13498ccb3073-aarch64-apple-darwin`; the working tree also contained the adapter manifest refresh recorded here. +Postscript: XY-880 superseded the live-adapter state in this report for ELF and qmd. +The successor evidence is +`docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md`: ELF and qmd now +emit full-suite live sweep records, but neither has a full-suite live pass. + ## Context Dependency batch state at report time: diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 7cbb67ec..b04b6886 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -40,6 +40,9 @@ cleanup, use `docs/guide/single_user_production.md`. - `2026-06-10-real-world-comparison-report.md`: checked-in post-P1 real-world comparison report with aggregate fixture evidence, external-adapter evidence classes, remaining typed gaps, and adoption implications. +- `2026-06-10-live-real-world-sweep-report.md`: XY-880 full-suite live real-world + sweep report for ELF and qmd, showing per-suite live pass and typed non-pass states + without claiming full-suite live parity. - `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world agent memory benchmark contract, including suite taxonomy, typed report states, knowledge-compilation fixture tasks, and the production-ops fixture target. diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index 3b6a1997..d757b304 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -359,7 +359,7 @@ scoring. The same manifest can also contain `research_gate` records for future a packs; those records provide source/setup/runtime/resource/retry guidance but are not live-baseline evidence. -The targeted live real-world adapter slice for ELF and qmd is separate from the +The full live real-world adapter sweep for ELF and qmd is separate from the same-corpus live baseline: ```sh @@ -368,7 +368,11 @@ cargo make real-world-memory-live-adapters This task runs in `docker-compose.baseline.yml`, materializes generated `adapter_response` fixtures through ELF's service runtime and qmd's local CLI -retrieval path, then scores and publishes: +against the checked-in `real_world_memory` fixture corpus, then scores all encoded +suites. It preserves typed non-pass states and does not claim a full-suite live pass +when memory-evolution conflict evidence, production operations, capture integrations, +derived pages, consolidation proposals, or operator-debugging traces are not proven. +It publishes: ```text tmp/real-world-memory/live-adapters/elf-report.json @@ -440,7 +444,7 @@ The retrieval fixture lives under `apps/elf-eval/fixtures/real_world_memory/retrieval/` and covers alternate phrasing, distractor-heavy corpora, multi-hop routing questions, current-versus-obsolete context selection, minimal sufficient context, and stage-level wrong-result explainability. -It is still an offline fixture report. qmd has a separate targeted live adapter slice +It is still an offline fixture report. qmd has a separate full live adapter sweep through `cargo make real-world-memory-live-adapters`; OpenViking remains a reference system unless an adapter actually runs and records typed evidence. diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 61872397..77277c5a 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -220,10 +220,14 @@ research gates. Its `external_adapters` report section distinguishes: - `research_gate`: checked-in source/setup/runtime/resource/retry metadata for a future adapter path, not fixture-backed or live execution evidence. -Current state: the targeted `elf_live_real_world` and `qmd_live_real_world` adapter -slice is encoded through `cargo make real-world-memory-live-adapters`. It materializes -generated runtime answers for representative `work_resume`, `retrieval`, and -`project_decisions` jobs before scoring. qmd still also keeps its separate +Current state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full +encoded-suite sweep through `cargo make real-world-memory-live-adapters`. Each adapter +materializes generated runtime answers for 38 jobs across 11 suites before scoring. +The original targeted `work_resume`, `retrieval`, and `project_decisions` slice still +passes, but the full sweep is not a full-suite pass: memory_evolution is +`wrong_result`, production_ops remains typed `incomplete`/`blocked`/`not_encoded`, and +consolidation, knowledge_compilation, operator_debugging_ux, and capture_integration +remain `not_encoded` for this live adapter path. qmd still also keeps its separate `live_baseline_only` same-corpus record for update/delete/cold-start checks; that record is not a real-world suite win. agentmemory is blocked on durable upstream storage for lifecycle proof. mem0/OpenMemory, memsearch, and claude-mem currently @@ -236,7 +240,7 @@ adapter runs are implemented. These typed states describe benchmark coverage; do convert setup weight, missing research, or unencoded suites into broad project quality rankings. -To run the targeted live adapter slice for ELF and qmd: +To run the full live adapter sweep for ELF and qmd: ```sh cargo make real-world-memory-live-adapters @@ -398,6 +402,6 @@ adoption, cite both the relevant live-baseline or restore proof and this real-wo fixture report; rerun `baseline-production-private` with an operator-owned manifest before claiming private-corpus retrieval quality. -Do not treat the targeted live adapter slice as a private-corpus or full-suite -production-adoption verdict. The current adoption gate remains an existing benchmark -decision until broader real-world live adapter reports are implemented and published. +Do not treat the full live adapter sweep as a private-corpus or production-ops +adoption verdict. It is a full-suite sweep with typed non-pass states, not a +full-suite pass. diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md index 8e549544..06c142f8 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/guide/research/comparison_external_projects.md @@ -62,14 +62,15 @@ That manifest is a contract and evidence ledger, not a leaderboard. It records w projects only have `live_baseline_only` Docker retrieval/lifecycle evidence, which capabilities are `mocked`, `blocked`, `unsupported`, `incomplete`, `wrong_result`, or `lifecycle_fail`, and which real-world suites remain `not_encoded`. The manifest now -includes targeted `live_real_world` records for ELF and qmd through -`cargo make real-world-memory-live-adapters`; it also includes `research_gate` records -for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, -llm-wiki, gbrain, graphify, and deeper qmd/OpenViking profiles. Research gates carry -source/setup/runtime/resource/retry metadata for future adapter work, but they are not -fixture-backed, live-baseline-only, or live-real-world evidence. Other external -projects remain live-baseline-only, incomplete, blocked, or not encoded until their -own `real_world_job` adapters run. +includes full-suite `live_real_world` sweep records for ELF and qmd through +`cargo make real-world-memory-live-adapters`; both retain targeted live pass evidence +for `work_resume`, `retrieval`, and `project_decisions`, but neither is a full-suite +live pass. It also includes `research_gate` records for RAGFlow, LightRAG, GraphRAG, +Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper +qmd/OpenViking profiles. Research gates carry source/setup/runtime/resource/retry +metadata for future adapter work, but they are not fixture-backed, live-baseline-only, +or live-real-world evidence. Other external projects remain live-baseline-only, +incomplete, blocked, or not encoded until their own `real_world_job` adapters run. Benchmark suite labels: diff --git a/scripts/real-world-live-adapters.sh b/scripts/real-world-live-adapters.sh index 9ddb72c7..26609d25 100755 --- a/scripts/real-world-live-adapters.sh +++ b/scripts/real-world-live-adapters.sh @@ -3,7 +3,7 @@ set -euo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" REPORT_DIR="${ELF_REAL_WORLD_LIVE_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/live-adapters}" -FIXTURE_DIR="${ELF_REAL_WORLD_LIVE_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_live_adapters}" +FIXTURE_DIR="${ELF_REAL_WORLD_LIVE_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory}" WORK_DIR="${ELF_REAL_WORLD_LIVE_WORK_DIR:-/bench/real-world-live-adapters}" QMD_DIR="${ELF_REAL_WORLD_QMD_DIR:-/bench/repos/qmd}" @@ -47,7 +47,7 @@ cargo run -p elf-eval --bin real_world_job_benchmark -- run \ --adapter-behavior live_real_world_adapter \ --adapter-storage-status pass \ --adapter-runtime-status pass \ - --adapter-notes "Materialized by real_world_live_adapter through ElfService, worker indexing, and search_raw." + --adapter-notes "Materialized by real_world_live_adapter through ElfService, worker indexing, and search_raw across the encoded real-world suite corpus; unsupported suite capabilities remain typed non-pass records." cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ --report "${REPORT_DIR}/elf-report.json" \ @@ -69,7 +69,7 @@ cargo run -p elf-eval --bin real_world_job_benchmark -- run \ --adapter-behavior live_real_world_adapter \ --adapter-storage-status pass \ --adapter-runtime-status pass \ - --adapter-notes "Materialized by real_world_live_adapter through qmd collection add, update, embed, and query --json." + --adapter-notes "Materialized by real_world_live_adapter through qmd collection add, update, embed, and query --json across the encoded real-world suite corpus; unsupported suite capabilities remain typed non-pass records." cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ --report "${REPORT_DIR}/qmd-report.json" \ @@ -81,9 +81,10 @@ jq -n \ --slurpfile elf_report "${REPORT_DIR}/elf-report.json" \ --slurpfile qmd_report "${REPORT_DIR}/qmd-report.json" \ '{ - schema: "elf.real_world_live_adapter_slice/v1", - generated_at: now | todateiso8601, + schema: "elf.real_world_live_adapter_sweep/v1", + generated_at: (now | todateiso8601), artifact_dir: (env.ELF_REAL_WORLD_LIVE_REPORT_DIR // "tmp/real-world-memory/live-adapters"), + fixture_dir: (env.ELF_REAL_WORLD_LIVE_FIXTURES // "apps/elf-eval/fixtures/real_world_memory"), adapters: [ { adapter_id: "elf_live_real_world", @@ -92,7 +93,8 @@ jq -n \ report: { json: "tmp/real-world-memory/live-adapters/elf-report.json", markdown: "tmp/real-world-memory/live-adapters/elf-report.md", - summary: $elf_report[0].summary + summary: $elf_report[0].summary, + suites: $elf_report[0].suites } }, { @@ -102,7 +104,8 @@ jq -n \ report: { json: "tmp/real-world-memory/live-adapters/qmd-report.json", markdown: "tmp/real-world-memory/live-adapters/qmd-report.md", - summary: $qmd_report[0].summary + summary: $qmd_report[0].summary, + suites: $qmd_report[0].suites } } ]