diff --git a/Cargo.lock b/Cargo.lock index ccd3b168..f9ffbcfc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -964,12 +964,16 @@ dependencies = [ name = "elf-eval" version = "0.2.0" dependencies = [ + "blake3", "clap", "color-eyre", + "elf-chunking", "elf-cli", "elf-config", "elf-service", "elf-storage", + "elf-testkit", + "elf-worker", "serde", "serde_json", "sqlx", diff --git a/Makefile.toml b/Makefile.toml index 832f0c7e..3cf5f17c 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -293,6 +293,41 @@ args = [ ] +# Live external baseline benchmark +# | task | type | cwd | +# | -------------------------- | ------- | --- | +# | baseline-live-docker | command | | +# | baseline-live-report | command | | +# | baseline-live-docker-clean | command | | + +[tasks.baseline-live-docker] +workspace = false +command = "bash" +args = [ + "-lc", + "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", +] + +[tasks.baseline-live-report] +workspace = false +command = "bash" +args = [ + "scripts/live-baseline-report-to-md.sh", +] + +[tasks.baseline-live-docker-clean] +workspace = false +command = "docker" +args = [ + "compose", + "-f", + "docker-compose.baseline.yml", + "down", + "-v", + "--remove-orphans", +] + + # Meta # | task | type | cwd | # | ------ | --------- | --- | diff --git a/README.md b/README.md index cd17b656..173714aa 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,29 @@ flowchart TB ## Comparison +### Checked-In Live Benchmark Snapshot + +The June 9, 2026 Docker-only live baseline uses the same generated corpus and query +manifest across ELF and the external memory projects below. ELF was run with the +production embedding provider path, `Qwen3-Embedding-8B`, and 4096-dimensional +embeddings. 
+ +- ELF production-provider stress run: 480 documents, 16 queries, `8/8` encoded checks, + `retrieval_pass`, and `pass` in 1163 seconds. +- All-project smoke run: ELF and qmd passed every encoded check. agentmemory passed + same-corpus retrieval but failed or could not complete lifecycle checks. mem0, + memsearch, and claude-mem returned wrong same-corpus retrieval results in the encoded + smoke. OpenViking was `incomplete` because its local embedding dependency could not + complete in the Docker runner. +- The benchmark runner and report publisher are checked in and Docker-isolated: + `cargo make baseline-live-docker`, `cargo make baseline-live-report`, and + `cargo make baseline-live-docker-clean`. + +Detailed evidence and interpretation: + +- [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md) +- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) + Quick comparison snapshot (objective/high-level). This table compares capability coverage, not overall project quality. @@ -153,6 +176,8 @@ Project signature strengths (what each does especially well): Detailed comparison, mechanism-level analysis, and source map: +- [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md) +- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Detailed External Comparison](docs/guide/research/comparison_external_projects.md) - [Research Projects Inventory](docs/guide/research/research_projects_inventory.md) - [Agent Memory Selection Research Run](docs/research/2026-06-08-agent-memory-selection.json) @@ -163,6 +188,7 @@ Latest external research refresh: June 8, 2026. 
- Start here: `docs/index.md` - Operational guide index: `docs/guide/index.md` +- Benchmarking guides and reports: `docs/guide/benchmarking/index.md` - Research index: `docs/guide/research/index.md` - Specifications: `docs/spec/index.md` - System contract: `docs/spec/system_elf_memory_service_v2.md` diff --git a/apps/elf-eval/Cargo.toml b/apps/elf-eval/Cargo.toml index ec438112..149e81f5 100644 --- a/apps/elf-eval/Cargo.toml +++ b/apps/elf-eval/Cargo.toml @@ -6,6 +6,7 @@ name = "elf-eval" version = "0.2.0" [dependencies] +blake3 = { workspace = true } clap = { workspace = true } color-eyre = { workspace = true } serde = { workspace = true } @@ -17,10 +18,13 @@ tracing = { workspace = true } tracing-subscriber = { workspace = true } uuid = { workspace = true } -elf-cli = { workspace = true } -elf-config = { workspace = true } -elf-service = { workspace = true } -elf-storage = { workspace = true } +elf-chunking = { workspace = true } +elf-cli = { workspace = true } +elf-config = { workspace = true } +elf-service = { workspace = true } +elf-storage = { workspace = true } +elf-testkit = { workspace = true } +elf-worker = { workspace = true } [build-dependencies] vergen-gitcl = { workspace = true } diff --git a/apps/elf-eval/src/bin/live_baseline_elf.rs b/apps/elf-eval/src/bin/live_baseline_elf.rs new file mode 100644 index 00000000..75c9b83e --- /dev/null +++ b/apps/elf-eval/src/bin/live_baseline_elf.rs @@ -0,0 +1,1661 @@ +#![allow(clippy::single_component_path_imports, unused_crate_dependencies)] + +//! Docker live-baseline runner for ELF's own same-corpus retrieval path. 
+ +use std::{ + collections::{BTreeMap, HashSet}, + env, fs, + path::{Path, PathBuf}, + process::Command, + sync::Arc, + time::{Duration, Instant}, +}; + +use clap::Parser; +use color_eyre::{Report, eyre}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use tokio::{task::JoinSet, time}; +use uuid::Uuid; + +use elf_chunking::ChunkingConfig; +use elf_config::{Config, EmbeddingProviderConfig, LlmProviderConfig, ProviderConfig}; +use elf_service::{ + AddNoteInput, AddNoteRequest, BoxFuture, DeleteRequest, ElfService, EmbeddingProvider, + ExtractorProvider, PayloadLevel, Providers, RerankProvider, SearchRequest, UpdateRequest, +}; +use elf_storage::{db::Db, qdrant::QdrantStore}; +use elf_testkit::TestDatabase; +use elf_worker::worker::{self, WorkerState}; + +const TENANT_ID: &str = "elf-live-baseline"; +const PROJECT_ID: &str = "shared-corpus"; +const AGENT_ID: &str = "elf-bench-agent"; +const SCOPE: &str = "agent_private"; + +#[derive(Debug, Parser)] +#[command(version = elf_cli::VERSION, rename_all = "kebab", styles = elf_cli::styles())] +struct Args { + /// Base ELF config to load before Docker runtime overrides are applied. + #[arg(long, short = 'c', value_name = "FILE")] + config: PathBuf, + + /// Directory containing the generated benchmark corpus markdown files. + #[arg(long, value_name = "DIR")] + corpus: PathBuf, + + /// Query manifest generated by the live-baseline harness. + #[arg(long, value_name = "FILE")] + queries: PathBuf, + + /// Write ELF result JSON to this file. 
+ #[arg(long, value_name = "FILE")] + out: PathBuf, +} + +#[derive(Debug, Deserialize)] +struct QueryManifest { + queries: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct QueryCase { + id: String, + query: String, + expected_doc: String, + expected_terms: Vec, +} + +#[derive(Debug)] +struct CorpusNote { + key: String, + title: String, + text: String, + source_doc: String, +} + +#[derive(Debug)] +struct BaselineRuntime { + config_path: PathBuf, + dsn: String, + qdrant_url: String, + collection: String, + docs_collection: String, +} + +#[derive(Debug, Serialize)] +struct WorkerRunEvidence { + label: String, + expected_note_count: usize, + iterations: usize, + before: BTreeMap, + after: BTreeMap, + chunk_rows: i64, + chunk_embedding_rows: i64, + failed_jobs: Vec, +} + +#[derive(Debug, Serialize)] +struct FailedOutboxJob { + note_id: Uuid, + note_key: Option, + op: String, + attempts: i32, + last_error: Option, +} + +#[derive(Debug, Serialize)] +struct ResourceEnvelopeEvidence { + elapsed_seconds: f64, + max_elapsed_seconds: f64, + rss_kb: Option, + max_rss_kb: u64, +} + +#[derive(Debug, Serialize)] +struct EmbeddingRuntimeReport { + mode: EmbeddingMode, + provider_id: String, + model: String, + dimensions: u32, + timeout_ms: u64, + api_base: String, + path: String, +} + +#[derive(Debug, Serialize)] +struct SoakConfig { + target_seconds: u64, + write_rounds: usize, + probe_interval_millis: u64, +} + +#[derive(Debug, Serialize)] +struct ElfBaselineReport { + schema: &'static str, + status: &'static str, + retrieval_status: &'static str, + reason: String, + head: String, + embedding: EmbeddingRuntimeReport, + indexing: IndexingReport, + summary: QuerySummary, + check_summary: CheckSummary, + checks: Vec, + queries: Vec, +} + +#[derive(Debug, Serialize)] +struct IndexingReport { + note_count: usize, + rebuild_rebuilt_count: u64, + rebuild_missing_vector_count: u64, + rebuild_error_count: u64, +} + +#[derive(Debug, Serialize)] +struct QuerySummary { + 
total: usize, + pass: usize, + fail: usize, +} + +#[derive(Debug, Serialize)] +struct CheckSummary { + total: usize, + pass: usize, + fail: usize, + incomplete: usize, +} + +#[derive(Debug, Serialize)] +struct CheckResult { + name: &'static str, + status: &'static str, + reason: String, + evidence: Value, +} + +#[derive(Debug, Serialize)] +struct QueryResult { + id: String, + query: String, + expected_doc: String, + expected_terms: Vec, + matched: bool, + matched_terms: Vec, + top_note_key: Option, + top_snippet: Option, + returned_count: usize, +} + +#[derive(Debug)] +struct DeterministicEmbedding { + vector_dim: u32, +} +impl EmbeddingProvider for DeterministicEmbedding { + fn embed<'a>( + &'a self, + _cfg: &'a EmbeddingProviderConfig, + texts: &'a [String], + ) -> BoxFuture<'a, elf_service::Result>>> { + let dim = self.vector_dim; + let vectors = texts.iter().map(|text| embed_text(text, dim)).collect(); + + Box::pin(async move { Ok(vectors) }) + } +} + +#[derive(Debug)] +struct TokenOverlapRerank; +impl RerankProvider for TokenOverlapRerank { + fn rerank<'a>( + &'a self, + _cfg: &'a ProviderConfig, + query: &'a str, + docs: &'a [String], + ) -> BoxFuture<'a, elf_service::Result>> { + let query_terms = terms(query); + let scores = docs + .iter() + .map(|doc| { + let doc_terms = terms(doc); + let hits = query_terms.intersection(&doc_terms).count() as f32; + + hits / query_terms.len().max(1) as f32 + }) + .collect(); + + Box::pin(async move { Ok(scores) }) + } +} + +#[derive(Debug)] +struct NoopExtractor; +impl ExtractorProvider for NoopExtractor { + fn extract<'a>( + &'a self, + _cfg: &'a LlmProviderConfig, + _messages: &'a [Value], + ) -> BoxFuture<'a, elf_service::Result> { + Box::pin(async move { Ok(serde_json::json!({ "notes": [] })) }) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +enum EmbeddingMode { + Local, + Provider, +} + +fn runtime_config(runtime: &BaselineRuntime) -> color_eyre::Result { + let 
embedding_mode = embedding_mode()?; + let mut cfg = elf_config::load(&runtime.config_path)?; + + cfg.storage.postgres.dsn = runtime.dsn.clone(); + cfg.storage.postgres.pool_max_conns = 12; + cfg.storage.qdrant.url = runtime.qdrant_url.clone(); + cfg.storage.qdrant.collection = runtime.collection.clone(); + cfg.storage.qdrant.docs_collection = runtime.docs_collection.clone(); + + if embedding_mode == EmbeddingMode::Provider { + apply_provider_embedding_overrides(&mut cfg)?; + + cfg.storage.qdrant.vector_dim = cfg.providers.embedding.dimensions; + } else { + cfg.providers.embedding.provider_id = "local".to_string(); + cfg.providers.embedding.model = "local-hash".to_string(); + cfg.providers.embedding.dimensions = cfg.storage.qdrant.vector_dim; + } + + cfg.providers.rerank.provider_id = "local".to_string(); + cfg.providers.rerank.model = "local-token-overlap".to_string(); + cfg.providers.llm_extractor.provider_id = "disabled".to_string(); + cfg.providers.llm_extractor.model = "disabled".to_string(); + cfg.context = None; + + Ok(cfg) +} + +fn deterministic_providers(vector_dim: u32) -> Providers { + Providers::new( + Arc::new(DeterministicEmbedding { vector_dim }), + Arc::new(TokenOverlapRerank), + Arc::new(NoopExtractor), + ) +} + +fn embedding_mode() -> color_eyre::Result { + let raw = env::var("ELF_BASELINE_ELF_EMBEDDING_MODE") + .unwrap_or_else(|_| "local".to_string()) + .to_ascii_lowercase(); + + match raw.as_str() { + "local" | "deterministic" => Ok(EmbeddingMode::Local), + "provider" | "production" => Ok(EmbeddingMode::Provider), + _ => Err(eyre::eyre!( + "Unsupported ELF_BASELINE_ELF_EMBEDDING_MODE={raw:?}; use local or provider." 
+ )), + } +} + +fn apply_provider_embedding_overrides(cfg: &mut Config) -> color_eyre::Result<()> { + apply_env_string( + &mut cfg.providers.embedding.provider_id, + &[ + "ELF_BASELINE_ELF_EMBEDDING_PROVIDER_ID", + "QWEN_EMBEDDING_PROVIDER_ID", + "EMBEDDING_PROVIDER_ID", + ], + ); + apply_env_string( + &mut cfg.providers.embedding.api_base, + &[ + "ELF_BASELINE_ELF_EMBEDDING_API_BASE", + "QWEN_EMBEDDING_API_BASE", + "DASHSCOPE_API_BASE", + "EMBEDDING_API_BASE", + ], + ); + apply_env_string( + &mut cfg.providers.embedding.api_key, + &[ + "ELF_BASELINE_ELF_EMBEDDING_API_KEY", + "QWEN_API_KEY", + "DASHSCOPE_API_KEY", + "EMBEDDING_API_KEY", + ], + ); + apply_env_string( + &mut cfg.providers.embedding.path, + &["ELF_BASELINE_ELF_EMBEDDING_PATH", "QWEN_EMBEDDING_PATH", "EMBEDDING_PATH"], + ); + apply_env_string( + &mut cfg.providers.embedding.model, + &["ELF_BASELINE_ELF_EMBEDDING_MODEL", "QWEN_EMBEDDING_MODEL", "EMBEDDING_MODEL"], + ); + + if let Some(dimensions) = env_u32(&[ + "ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS", + "QWEN_EMBEDDING_DIMENSIONS", + "DASHSCOPE_EMBEDDING_DIMENSIONS", + "EMBEDDING_DIMENSIONS", + ]) { + cfg.providers.embedding.dimensions = dimensions; + } + if let Some(timeout_ms) = env_u64(&[ + "ELF_BASELINE_ELF_EMBEDDING_TIMEOUT_MS", + "QWEN_EMBEDDING_TIMEOUT_MS", + "EMBEDDING_TIMEOUT_MS", + ]) { + cfg.providers.embedding.timeout_ms = timeout_ms; + } else { + cfg.providers.embedding.timeout_ms = cfg.providers.embedding.timeout_ms.max(30_000); + } + + if cfg.providers.embedding.provider_id == "local" { + if env_string(&["ELF_BASELINE_ELF_EMBEDDING_API_KEY", "QWEN_API_KEY"]).is_some() { + cfg.providers.embedding.provider_id = "qwen".to_string(); + } else if env_string(&["DASHSCOPE_API_KEY"]).is_some() { + cfg.providers.embedding.provider_id = "dashscope".to_string(); + } else if env_string(&["EMBEDDING_API_KEY"]).is_some() { + cfg.providers.embedding.provider_id = "provider".to_string(); + } + } + if cfg.providers.embedding.provider_id == "local" { + 
return Err(eyre::eyre!( + "Provider embedding mode requires a non-local provider id or QWEN_API_KEY/DASHSCOPE_API_KEY/EMBEDDING_API_KEY." + )); + } + if cfg.providers.embedding.api_base.trim().is_empty() + || cfg.providers.embedding.api_base == "http://127.0.0.1" + { + return Err(eyre::eyre!( + "Provider embedding mode requires ELF_BASELINE_ELF_EMBEDDING_API_BASE, QWEN_EMBEDDING_API_BASE, DASHSCOPE_API_BASE, or EMBEDDING_API_BASE." + )); + } + if cfg.providers.embedding.api_key.trim().is_empty() + || cfg.providers.embedding.api_key == "local-dev-placeholder" + { + return Err(eyre::eyre!( + "Provider embedding mode requires ELF_BASELINE_ELF_EMBEDDING_API_KEY, QWEN_API_KEY, DASHSCOPE_API_KEY, or EMBEDDING_API_KEY." + )); + } + if cfg.providers.embedding.model == "local-hash" + || cfg.providers.embedding.model.trim().is_empty() + { + return Err(eyre::eyre!( + "Provider embedding mode requires ELF_BASELINE_ELF_EMBEDDING_MODEL, QWEN_EMBEDDING_MODEL, or EMBEDDING_MODEL." + )); + } + if cfg.providers.embedding.dimensions == 0 { + return Err(eyre::eyre!( + "Provider embedding dimensions must be greater than zero; set ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS, QWEN_EMBEDDING_DIMENSIONS, DASHSCOPE_EMBEDDING_DIMENSIONS, or EMBEDDING_DIMENSIONS." 
+ )); + } + + Ok(()) +} + +fn embedding_runtime_report(cfg: &Config) -> EmbeddingRuntimeReport { + EmbeddingRuntimeReport { + mode: embedding_mode().unwrap_or(EmbeddingMode::Local), + provider_id: cfg.providers.embedding.provider_id.clone(), + model: cfg.providers.embedding.model.clone(), + dimensions: cfg.providers.embedding.dimensions, + timeout_ms: cfg.providers.embedding.timeout_ms, + api_base: cfg.providers.embedding.api_base.clone(), + path: cfg.providers.embedding.path.clone(), + } +} + +fn apply_env_string(target: &mut String, names: &[&str]) { + if let Some(value) = env_string(names) { + *target = value; + } +} + +fn env_string(names: &[&str]) -> Option { + names.iter().find_map(|name| { + env::var(name).ok().map(|value| value.trim().to_string()).filter(|value| !value.is_empty()) + }) +} + +fn env_u32(names: &[&str]) -> Option { + env_string(names).and_then(|value| value.parse::().ok()) +} + +fn env_u64(names: &[&str]) -> Option { + env_string(names).and_then(|value| value.parse::().ok()) +} + +fn load_corpus_notes(corpus_dir: &Path) -> color_eyre::Result> { + let mut paths = fs::read_dir(corpus_dir)? + .map(|entry| entry.map(|entry| entry.path())) + .collect::>>()?; + + paths.retain(|path| { + path.extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| ext.eq_ignore_ascii_case("md")) + }); + paths.sort(); + + let mut out = Vec::with_capacity(paths.len()); + + for path in paths { + let source_doc = path + .file_name() + .and_then(|name| name.to_str()) + .ok_or_else(|| { + eyre::eyre!("Corpus path has no valid UTF-8 file name: {}", path.display()) + })? 
+ .to_string(); + let raw = fs::read_to_string(&path)?; + let title = title_from_markdown(&raw, &source_doc); + let text = raw + .lines() + .filter(|line| !line.trim_start().starts_with('#')) + .collect::>() + .join(" ") + .split_whitespace() + .collect::>() + .join(" "); + + out.push(CorpusNote { key: key_for_doc(&source_doc), title, text, source_doc }); + } + + if out.is_empty() { + return Err(eyre::eyre!("No markdown corpus files found in {}.", corpus_dir.display())); + } + + Ok(out) +} + +fn load_queries(path: &PathBuf) -> color_eyre::Result { + let raw = fs::read_to_string(path)?; + + Ok(serde_json::from_str(&raw)?) +} + +fn worker_max_iterations(note_count: usize) -> usize { + env::var("ELF_BASELINE_WORKER_MAX_ITERATIONS") + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or_else(|| note_count.saturating_mul(3).saturating_add(32)) +} + +fn outbox_done(counts: &BTreeMap, expected_note_count: usize) -> bool { + let done = counts.get("DONE").copied().unwrap_or_default(); + let expected = i64::try_from(expected_note_count).unwrap_or(i64::MAX); + let pending = counts.get("PENDING").copied().unwrap_or_default(); + let failed = counts.get("FAILED").copied().unwrap_or_default(); + let claimed = counts.get("CLAIMED").copied().unwrap_or_default(); + + done >= expected && pending == 0 && failed == 0 && claimed == 0 +} + +fn retrieval_check(query_results: &[QueryResult]) -> CheckResult { + let pass_count = query_results.iter().filter(|result| result.matched).count(); + let fail_count = query_results.len().saturating_sub(pass_count); + + CheckResult { + name: "same_corpus_retrieval", + status: if fail_count == 0 { "pass" } else { "fail" }, + reason: if fail_count == 0 { + "All same-corpus retrieval queries returned expected evidence.".to_string() + } else { + format!("{fail_count} same-corpus retrieval query case(s) missed expected evidence.") + }, + evidence: serde_json::json!({ + "total": query_results.len(), + "pass": pass_count, + "fail": fail_count, + }), + 
} +} + +fn worker_indexing_check(evidence: WorkerRunEvidence) -> CheckResult { + let pass = outbox_done(&evidence.after, evidence.expected_note_count) + && evidence.chunk_rows >= i64::try_from(evidence.expected_note_count).unwrap_or(i64::MAX) + && evidence.chunk_embedding_rows >= evidence.chunk_rows; + + CheckResult { + name: "async_worker_indexing_e2e", + status: if pass { "pass" } else { "fail" }, + reason: if pass { + "ELF worker processed corpus outbox jobs into persisted chunks and embeddings." + .to_string() + } else { + "ELF worker did not fully process corpus outbox jobs into searchable chunks." + .to_string() + }, + evidence: serde_json::json!(evidence), + } +} + +fn concurrent_note_count() -> usize { + if let Ok(value) = env::var("ELF_BASELINE_CONCURRENT_NOTES") + && let Ok(parsed) = value.parse::() + { + return parsed.max(1); + } + + match env::var("ELF_BASELINE_PROFILE").as_deref() { + Ok("stress") => 32, + Ok("scale" | "full") => 16, + _ => 4, + } +} + +fn concurrent_add_request(index: usize) -> AddNoteRequest { + let marker = concurrent_marker(index); + + AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(format!("concurrent_{index:03}")), + text: format!( + "Concurrent benchmark note {index:03} records marker `{marker}` for write race validation." 
+ ), + structured: None, + importance: 0.91, + confidence: 0.96, + ttl_days: None, + source_ref: serde_json::json!({ + "source": "ELF live baseline concurrent write check", + "document": format!("concurrent-{index:03}.md"), + }), + write_policy: None, + }], + } +} + +fn concurrent_query_case(index: usize) -> QueryCase { + let marker = concurrent_marker(index); + + QueryCase { + id: format!("concurrent-{index:03}"), + query: format!("Find the concurrent benchmark note containing marker {marker}."), + expected_doc: format!("concurrent-{index:03}.md"), + expected_terms: vec![marker], + } +} + +fn concurrent_marker(index: usize) -> String { + format!("concurrency-{}-{index:03}", marker_word(index)) +} + +fn soak_config() -> SoakConfig { + let profile = env::var("ELF_BASELINE_PROFILE").ok(); + let (default_seconds, default_rounds) = match profile.as_deref() { + Some("stress") => (60, 6), + Some("scale" | "full") => (15, 3), + _ => (0, 0), + }; + + SoakConfig { + target_seconds: parse_env_u64("ELF_BASELINE_SOAK_SECONDS").unwrap_or(default_seconds), + write_rounds: parse_env_usize("ELF_BASELINE_SOAK_ROUNDS").unwrap_or(default_rounds), + probe_interval_millis: parse_env_u64("ELF_BASELINE_SOAK_PROBE_INTERVAL_MS") + .unwrap_or(1_000) + .max(100), + } +} + +fn parse_env_u64(name: &str) -> Option { + env::var(name).ok()?.parse::().ok() +} + +fn parse_env_usize(name: &str) -> Option { + env::var(name).ok()?.parse::().ok() +} + +fn soak_add_request(index: usize) -> AddNoteRequest { + let marker = soak_marker(index); + let (topic, detail) = soak_topic(index); + + AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(format!("soak_{index:03}")), + text: format!( + "Soak benchmark note {index:03} covers {topic}. {detail} It records stability marker `{marker}` for repeated worker and search probes." 
+ ), + structured: None, + importance: 0.92, + confidence: 0.97, + ttl_days: None, + source_ref: serde_json::json!({ + "source": "ELF live baseline soak stability check", + "document": format!("soak-{index:03}.md"), + }), + write_policy: None, + }], + } +} + +fn soak_query_case(index: usize) -> QueryCase { + let marker = soak_marker(index); + let (topic, _) = soak_topic(index); + + QueryCase { + id: format!("soak-{index:03}"), + query: format!("Find the soak benchmark note about {topic} containing marker {marker}."), + expected_doc: format!("soak-{index:03}.md"), + expected_terms: vec![marker], + } +} + +fn soak_marker(index: usize) -> String { + format!("soak-stability-{}-{index:03}", marker_word(index)) +} + +fn marker_word(index: usize) -> &'static str { + const WORDS: &[&str] = &[ + "aurora", "banyan", "cobalt", "delta", "ember", "fennel", "granite", "harbor", "indigo", + "jasper", "keystone", "lantern", "meridian", "nebula", "onyx", "prairie", "quartz", + "raven", "solstice", "topaz", "umbra", "verdant", "willow", "xenon", "yarrow", "zephyr", + "atlas", "beacon", "citadel", "drift", "equinox", "forge", + ]; + + WORDS[index % WORDS.len()] +} + +fn soak_topic(index: usize) -> (&'static str, &'static str) { + const TOPICS: &[(&str, &str)] = &[ + ( + "release rollback fencing", + "The rollback controller waits for a signed deploy fence before the next canary.", + ), + ( + "invoice export batching", + "The exporter groups invoice CSV rows by merchant ledger before upload.", + ), + ("search shard warming", "The search router warms tenant shard caches before rank probes."), + ( + "incident pager routing", + "The incident desk routes page ownership through the release captain.", + ), + ( + "backup restore rehearsal", + "The restore rehearsal checks WAL freshness before dry-run recovery.", + ), + ( + "feature flag expiry", + "The flag sweeper archives expired toggles before deleting rollout rules.", + ), + ( + "support queue triage", + "The support classifier separates 
billing tickets from access tickets.", + ), + ( + "analytics job watermark", + "The analytics worker stores a warehouse watermark after each import.", + ), + ]; + + TOPICS[index % TOPICS.len()] +} + +fn concurrency_probe_indexes(note_count: usize) -> Vec { + let mut indexes = vec![0, note_count / 2, note_count.saturating_sub(1)]; + + indexes.sort_unstable(); + indexes.dedup(); + + indexes +} + +fn resource_envelope_check(elapsed_seconds: f64) -> CheckResult { + let max_elapsed_seconds = env::var("ELF_BASELINE_MAX_ELF_SECONDS") + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or(600.0); + let max_rss_kb = env::var("ELF_BASELINE_MAX_ELF_RSS_KB") + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or(1_500_000); + let rss_kb = current_rss_kb(); + let pass = elapsed_seconds <= max_elapsed_seconds && rss_kb.is_none_or(|rss| rss <= max_rss_kb); + + CheckResult { + name: "resource_envelope", + status: if pass { "pass" } else { "fail" }, + reason: if pass { + "ELF live-baseline runtime stayed within the configured local resource envelope." 
+ .to_string() + } else { + "ELF live-baseline runtime exceeded the configured local resource envelope.".to_string() + }, + evidence: serde_json::json!(ResourceEnvelopeEvidence { + elapsed_seconds, + max_elapsed_seconds, + rss_kb, + max_rss_kb, + }), + } +} + +fn current_rss_kb() -> Option { + let status = fs::read_to_string("/proc/self/status").ok()?; + + status.lines().find_map(|line| { + let rest = line.strip_prefix("VmHWM:")?.trim(); + let value = rest.split_whitespace().next()?; + + value.parse::().ok() + }) +} + +fn incomplete_check(name: &'static str, reason: &str) -> CheckResult { + CheckResult { + name, + status: "incomplete", + reason: reason.to_string(), + evidence: serde_json::json!({}), + } +} + +fn summarize_checks(checks: &[CheckResult]) -> CheckSummary { + CheckSummary { + total: checks.len(), + pass: checks.iter().filter(|check| check.status == "pass").count(), + fail: checks.iter().filter(|check| check.status == "fail").count(), + incomplete: checks.iter().filter(|check| check.status == "incomplete").count(), + } +} + +fn title_from_markdown(raw: &str, source_doc: &str) -> String { + raw.lines() + .find_map(|line| line.trim_start().strip_prefix("# ")) + .map(str::trim) + .filter(|title| !title.is_empty()) + .map(str::to_string) + .unwrap_or_else(|| source_doc.to_string()) +} + +fn key_for_doc(doc: &str) -> String { + let stem = Path::new(doc).file_stem().and_then(|stem| stem.to_str()).unwrap_or(doc); + let mut key = String::with_capacity(stem.len()); + let mut last_was_separator = false; + + for ch in stem.chars() { + if ch.is_ascii_alphanumeric() { + key.push(ch.to_ascii_lowercase()); + + last_was_separator = false; + } else if !last_was_separator && !key.is_empty() { + key.push('_'); + + last_was_separator = true; + } + } + + if key.ends_with('_') { + key.pop(); + } + + if key.is_empty() { "doc".to_string() } else { key } +} + +fn embed_text(text: &str, vector_dim: u32) -> Vec { + let dim = vector_dim as usize; + let mut vector = vec![0.0_f32; 
dim]; + + if dim == 0 { + return vector; + } + + let normalized = normalize_ascii_alnum_lowercase(text); + + for term in normalized.split_whitespace() { + if term.len() < 2 { + continue; + } + + let hash = blake3::hash(term.as_bytes()); + let bytes = hash.as_bytes(); + let idx = (u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize) % dim; + let sign = if bytes[4] & 1 == 0 { 1.0 } else { -1.0 }; + + vector[idx] += sign; + } + + if vector.iter().all(|value| *value == 0.0) { + let hash = blake3::hash(text.as_bytes()); + let bytes = hash.as_bytes(); + let idx = (u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize) % dim; + + vector[idx] = 1.0; + } + + let norm = vector.iter().map(|value| value * value).sum::().sqrt(); + + if norm > 0.0 { + for value in &mut vector { + *value /= norm; + } + } + + vector +} + +fn normalize_ascii_alnum_lowercase(text: &str) -> String { + let mut normalized = String::with_capacity(text.len()); + + for ch in text.chars() { + if ch.is_ascii_alphanumeric() { + normalized.push(ch.to_ascii_lowercase()); + } else { + normalized.push(' '); + } + } + + normalized +} + +fn terms(text: &str) -> HashSet { + text.split(|ch: char| !ch.is_ascii_alphanumeric()) + .map(str::trim) + .filter(|term| !term.is_empty()) + .map(str::to_ascii_lowercase) + .collect() +} + +fn distinctive_terms(text: &str, limit: usize) -> Vec { + let stop_words = [ + "the", "and", "for", "with", "that", "this", "from", "into", "must", "uses", "after", + "before", "query", "memory", "note", + ]; + let stop_words = stop_words.into_iter().collect::>(); + let mut out = Vec::new(); + + for raw in text.split(|ch: char| !ch.is_ascii_alphanumeric()) { + let term = raw.trim(); + + if term.len() < 5 { + continue; + } + + let lowered = term.to_ascii_lowercase(); + + if stop_words.contains(lowered.as_str()) || out.iter().any(|existing| existing == term) { + continue; + } + + out.push(term.to_string()); + + if out.len() >= limit { + break; + } + } + + out +} 
+ +fn contains_case_insensitive(haystack: &str, needle: &str) -> bool { + haystack.to_ascii_lowercase().contains(&needle.to_ascii_lowercase()) +} + +fn git_head() -> color_eyre::Result { + if let Ok(head) = env::var("ELF_BASELINE_ELF_HEAD") { + let head = head.trim(); + + if !head.is_empty() { + return Ok(head.to_string()); + } + } + + let output = Command::new("git").args(["rev-parse", "HEAD"]).output()?; + + if !output.status.success() { + return Err(eyre::eyre!("git rev-parse HEAD failed.")); + } + + Ok(String::from_utf8(output.stdout)?.trim().to_string()) +} + +#[tokio::main] +async fn main() -> color_eyre::Result<()> { + color_eyre::install()?; + + let args = Args::parse(); + let out = args.out.clone(); + let report = run(args).await?; + let raw = serde_json::to_string_pretty(&report)?; + + fs::write(out, raw)?; + + Ok(()) +} + +async fn run(args: Args) -> color_eyre::Result { + let started_at = Instant::now(); + let base_dsn = env::var("ELF_PG_DSN") + .map_err(|_| eyre::eyre!("ELF_PG_DSN must be set for live ELF baseline."))?; + let qdrant_url = env::var("ELF_QDRANT_GRPC_URL") + .or_else(|_| env::var("ELF_QDRANT_URL")) + .map_err(|_| eyre::eyre!("ELF_QDRANT_GRPC_URL or ELF_QDRANT_URL must be set."))?; + let test_db = TestDatabase::new(&base_dsn).await?; + let collection = test_db.collection_name("elf_live_baseline_notes"); + let docs_collection = test_db.collection_name("elf_live_baseline_docs"); + let runtime = BaselineRuntime { + config_path: args.config.clone(), + dsn: test_db.dsn().to_string(), + qdrant_url, + collection, + docs_collection, + }; + let service = Arc::new(build_service(&runtime).await?); + let notes = load_corpus_notes(&args.corpus)?; + let note_ids = add_notes(&service, ¬es).await?; + let initial_worker = + run_worker_until_indexed(&runtime, &service, ¬e_ids, "corpus_upsert").await?; + let rebuild = service.rebuild_qdrant().await?; + let query_manifest = load_queries(&args.queries)?; + let query_results = run_queries(&service, 
query_manifest.queries).await?; + let pass_count = query_results.iter().filter(|result| result.matched).count(); + let fail_count = query_results.len().saturating_sub(pass_count); + let retrieval_status = + if fail_count == 0 { "retrieval_pass" } else { "retrieval_wrong_result" }; + let mut checks = vec![retrieval_check(&query_results), worker_indexing_check(initial_worker)]; + + checks.extend(run_lifecycle_checks(&runtime, &service, ¬es, ¬e_ids).await?); + checks.push(run_concurrent_write_check(&runtime, Arc::clone(&service)).await?); + + if let Some(soak_check) = run_soak_stability_check(&runtime, Arc::clone(&service)).await? { + checks.push(soak_check); + } + + checks.push(resource_envelope_check(started_at.elapsed().as_secs_f64())); + + let check_summary = summarize_checks(&checks); + let status = + if check_summary.fail == 0 && check_summary.incomplete == 0 { "pass" } else { "fail" }; + let reason = if status == "pass" { + "ELF added the corpus, rebuilt Qdrant, and returned expected evidence for every query" + .to_string() + } else { + format!( + "ELF failed {} live-baseline check(s) and left {} incomplete check(s)", + check_summary.fail, check_summary.incomplete + ) + }; + let report = ElfBaselineReport { + schema: "elf.live_baseline.elf_result/v1", + status, + retrieval_status, + reason, + head: git_head().unwrap_or_else(|_| "unknown".to_string()), + embedding: embedding_runtime_report(&service.cfg), + indexing: IndexingReport { + note_count: notes.len(), + rebuild_rebuilt_count: rebuild.rebuilt_count, + rebuild_missing_vector_count: rebuild.missing_vector_count, + rebuild_error_count: rebuild.error_count, + }, + summary: QuerySummary { total: query_results.len(), pass: pass_count, fail: fail_count }, + check_summary, + checks, + queries: query_results, + }; + + drop(service); + + test_db.cleanup().await?; + + Ok(report) +} + +async fn build_service(runtime: &BaselineRuntime) -> color_eyre::Result { + let cfg = runtime_config(runtime)?; + let embedding_mode 
= embedding_mode()?; + let vector_dim = cfg.storage.qdrant.vector_dim; + let db = Db::connect(&cfg.storage.postgres).await?; + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; + + let qdrant = QdrantStore::new(&cfg.storage.qdrant)?; + + qdrant.ensure_collection().await?; + + if embedding_mode == EmbeddingMode::Provider { + Ok(ElfService::new(cfg, db, qdrant)) + } else { + Ok(ElfService::with_providers(cfg, db, qdrant, deterministic_providers(vector_dim))) + } +} + +async fn build_worker_state(runtime: &BaselineRuntime) -> color_eyre::Result { + let cfg = runtime_config(runtime)?; + let db = Db::connect(&cfg.storage.postgres).await?; + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; + + let qdrant = QdrantStore::new(&cfg.storage.qdrant)?; + + qdrant.ensure_collection().await?; + + let docs_qdrant = + QdrantStore::new_with_collection(&cfg.storage.qdrant, &cfg.storage.qdrant.docs_collection)?; + + docs_qdrant.ensure_collection().await?; + + let tokenizer = elf_chunking::load_tokenizer(&cfg.chunking.tokenizer_repo) + .map_err(|err| eyre::eyre!("Failed to load tokenizer for live baseline worker: {err}"))?; + let chunking = ChunkingConfig { + max_tokens: cfg.chunking.max_tokens, + overlap_tokens: cfg.chunking.overlap_tokens, + }; + + Ok(WorkerState { + db, + qdrant, + docs_qdrant, + embedding: cfg.providers.embedding, + chunking, + tokenizer, + }) +} + +async fn add_notes(service: &ElfService, notes: &[CorpusNote]) -> color_eyre::Result> { + let request = AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: notes + .iter() + .map(|note| AddNoteInput { + r#type: "fact".to_string(), + key: Some(note.key.clone()), + text: note.text.clone(), + structured: None, + importance: 0.9, + confidence: 0.95, + ttl_days: None, + source_ref: serde_json::json!({ + "source": "ELF live baseline corpus", + "title": note.title, + "document": note.source_doc, + }), 
+ write_policy: None, + }) + .collect(), + }; + let response = service.add_note(request).await?; + let mut ids = Vec::with_capacity(response.results.len()); + + for result in response.results { + let note_id = + result.note_id.ok_or_else(|| eyre::eyre!("ELF add_note did not return a note_id."))?; + + ids.push(note_id); + } + + Ok(ids) +} + +async fn run_worker_until_indexed( + runtime: &BaselineRuntime, + service: &ElfService, + note_ids: &[Uuid], + label: &str, +) -> color_eyre::Result { + let state = build_worker_state(runtime).await?; + let before = outbox_status_counts(service, note_ids).await?; + let max_iterations = worker_max_iterations(note_ids.len()); + let mut iterations = 0_usize; + + while iterations < max_iterations { + let after = outbox_status_counts(service, note_ids).await?; + + if outbox_done(&after, note_ids.len()) { + let (chunk_rows, chunk_embedding_rows) = chunk_counts(service, note_ids).await?; + let failed_jobs = failed_outbox_jobs(service, note_ids).await?; + + return Ok(WorkerRunEvidence { + label: label.to_string(), + expected_note_count: note_ids.len(), + iterations, + before, + after, + chunk_rows, + chunk_embedding_rows, + failed_jobs, + }); + } + + worker::process_once(&state).await?; + + iterations += 1; + } + + let after = outbox_status_counts(service, note_ids).await?; + let (chunk_rows, chunk_embedding_rows) = chunk_counts(service, note_ids).await?; + let failed_jobs = failed_outbox_jobs(service, note_ids).await?; + + Ok(WorkerRunEvidence { + label: label.to_string(), + expected_note_count: note_ids.len(), + iterations, + before, + after, + chunk_rows, + chunk_embedding_rows, + failed_jobs, + }) +} + +async fn outbox_status_counts( + service: &ElfService, + note_ids: &[Uuid], +) -> color_eyre::Result> { + if note_ids.is_empty() { + return Ok(BTreeMap::new()); + } + + let rows = sqlx::query_as::<_, (String, i64)>( + "\ +SELECT status, COUNT(*)::bigint +FROM indexing_outbox +WHERE note_id = ANY($1) +GROUP BY status +ORDER BY 
status", + ) + .bind(note_ids) + .fetch_all(&service.db.pool) + .await?; + + Ok(rows.into_iter().collect()) +} + +async fn chunk_counts(service: &ElfService, note_ids: &[Uuid]) -> color_eyre::Result<(i64, i64)> { + if note_ids.is_empty() { + return Ok((0, 0)); + } + + let chunk_rows = sqlx::query_scalar::<_, i64>( + "\ +SELECT COUNT(*)::bigint +FROM memory_note_chunks +WHERE note_id = ANY($1)", + ) + .bind(note_ids) + .fetch_one(&service.db.pool) + .await?; + let chunk_embedding_rows = sqlx::query_scalar::<_, i64>( + "\ +SELECT COUNT(*)::bigint +FROM memory_note_chunks c +JOIN note_chunk_embeddings e ON e.chunk_id = c.chunk_id +WHERE c.note_id = ANY($1)", + ) + .bind(note_ids) + .fetch_one(&service.db.pool) + .await?; + + Ok((chunk_rows, chunk_embedding_rows)) +} + +async fn failed_outbox_jobs( + service: &ElfService, + note_ids: &[Uuid], +) -> color_eyre::Result> { + if note_ids.is_empty() { + return Ok(Vec::new()); + } + + let rows = sqlx::query_as::<_, (Uuid, Option, String, i32, Option)>( + "\ +SELECT o.note_id, n.key, o.op, o.attempts, o.last_error +FROM indexing_outbox o +LEFT JOIN memory_notes n ON n.note_id = o.note_id +WHERE o.note_id = ANY($1) + AND o.status = 'FAILED' +ORDER BY n.key NULLS LAST, o.note_id", + ) + .bind(note_ids) + .fetch_all(&service.db.pool) + .await?; + + Ok(rows + .into_iter() + .map(|(note_id, note_key, op, attempts, last_error)| FailedOutboxJob { + note_id, + note_key, + op, + attempts, + last_error, + }) + .collect()) +} + +async fn run_queries( + service: &ElfService, + queries: Vec, +) -> color_eyre::Result> { + let mut out = Vec::with_capacity(queries.len()); + + for case in queries { + out.push(run_single_query(service, case).await?); + } + + Ok(out) +} + +async fn run_single_query( + service: &ElfService, + case: QueryCase, +) -> color_eyre::Result { + let top_k = env::var("ELF_BASELINE_TOP_K") + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or(10); + let response = service + .search_raw(SearchRequest { + 
tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + token_id: None, + payload_level: PayloadLevel::default(), + read_profile: "private_only".to_string(), + query: case.query.clone(), + top_k: Some(top_k), + candidate_k: Some(top_k.max(20).saturating_mul(4)), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await?; + let top = response.items.first(); + let top_text = top.map(|item| item.snippet.clone()).unwrap_or_default(); + let matched_terms = case + .expected_terms + .iter() + .filter(|term| contains_case_insensitive(&top_text, term)) + .cloned() + .collect::>(); + let top_key = top.and_then(|item| item.key.clone()); + let expected_key = key_for_doc(&case.expected_doc); + let matched = matched_terms.len() == case.expected_terms.len() + || top_key.as_deref().is_some_and(|key| key == expected_key); + + Ok(QueryResult { + id: case.id, + query: case.query, + expected_doc: case.expected_doc, + expected_terms: case.expected_terms, + matched, + matched_terms, + top_note_key: top_key, + top_snippet: top.map(|item| item.snippet.clone()), + returned_count: response.items.len(), + }) +} + +async fn run_lifecycle_checks( + runtime: &BaselineRuntime, + service: &ElfService, + notes: &[CorpusNote], + note_ids: &[Uuid], +) -> color_eyre::Result> { + let Some(update_note) = notes.first() else { + return Ok(vec![incomplete_check( + "update_replaces_note_text", + "Corpus has no note to update.", + )]); + }; + let Some(update_note_id) = note_ids.first().copied() else { + return Ok(vec![incomplete_check( + "update_replaces_note_text", + "ELF add_note returned no note_id for lifecycle update.", + )]); + }; + let Some(delete_note) = notes.get(1) else { + return Ok(vec![incomplete_check( + "delete_suppresses_retrieval", + "Corpus has no note to delete.", + )]); + }; + let Some(delete_note_id) = note_ids.get(1).copied() else { + return Ok(vec![incomplete_check( + "delete_suppresses_retrieval", + "ELF add_note 
returned no note_id for lifecycle delete.", + )]); + }; + let Some(recovery_note) = notes.get(2) else { + return Ok(vec![incomplete_check( + "cold_start_recovery_search", + "Corpus has no stable note for recovery search.", + )]); + }; + + Ok(vec![ + run_update_replacement_check(runtime, service, update_note, update_note_id).await?, + run_delete_suppression_check(runtime, service, delete_note, delete_note_id).await?, + run_cold_start_recovery_check(runtime, service, recovery_note).await?, + ]) +} + +async fn run_update_replacement_check( + runtime: &BaselineRuntime, + service: &ElfService, + update_note: &CorpusNote, + update_note_id: Uuid, +) -> color_eyre::Result { + let update_text = "\ + Rotated auth middleware validates JWT tokens with key id `kid-v4` under \ + `RotatedJwtKeyPlan`. It still requires tenant scope `project_shared` for deployment \ + operations after the emergency key rotation." + .to_string(); + let update_response = service + .update(UpdateRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + note_id: update_note_id, + text: Some(update_text.clone()), + importance: None, + confidence: None, + ttl_days: None, + }) + .await?; + let update_worker = + run_worker_until_indexed(runtime, service, &[update_note_id], "lifecycle_update").await?; + let update_query = run_single_query( + service, + QueryCase { + id: "lifecycle-update-new-marker".to_string(), + query: "Which rotated JWT key id does the auth middleware require?".to_string(), + expected_doc: update_note.source_doc.clone(), + expected_terms: vec!["kid-v4".to_string(), "RotatedJwtKeyPlan".to_string()], + }, + ) + .await?; + let old_marker_absent = update_query + .top_snippet + .as_deref() + .is_some_and(|snippet| !contains_case_insensitive(snippet, "kid-v3")); + let update_pass = update_query.matched + && old_marker_absent + && outbox_done(&update_worker.after, update_worker.expected_note_count); + + Ok(CheckResult { + name: 
"update_replaces_note_text", + status: if update_pass { "pass" } else { "fail" }, + reason: if update_pass { + "Service update plus worker indexing returned the new marker and removed the old marker from the top snippet.".to_string() + } else { + "Service update plus worker indexing did not produce a clean search result for the replacement marker.".to_string() + }, + evidence: serde_json::json!({ + "note_id": update_note_id, + "op": update_response.op, + "worker": update_worker, + "query": update_query, + "old_marker_absent": old_marker_absent, + }), + }) +} + +async fn run_delete_suppression_check( + runtime: &BaselineRuntime, + service: &ElfService, + delete_note: &CorpusNote, + delete_note_id: Uuid, +) -> color_eyre::Result { + let delete_response = service + .delete(DeleteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + note_id: delete_note_id, + }) + .await?; + let delete_worker = + run_worker_until_indexed(runtime, service, &[delete_note_id], "lifecycle_delete").await?; + let delete_query = run_single_query( + service, + QueryCase { + id: "lifecycle-delete-suppresses-note".to_string(), + query: delete_note.text.clone(), + expected_doc: delete_note.source_doc.clone(), + expected_terms: distinctive_terms(&delete_note.text, 2), + }, + ) + .await?; + let delete_pass = !delete_query.matched + && outbox_done(&delete_worker.after, delete_worker.expected_note_count); + + Ok(CheckResult { + name: "delete_suppresses_retrieval", + status: if delete_pass { "pass" } else { "fail" }, + reason: if delete_pass { + "Service delete suppressed the deleted note from subsequent search results.".to_string() + } else { + "Deleted note was still retrievable after service delete and worker indexing." 
+ .to_string() + }, + evidence: serde_json::json!({ + "note_id": delete_note_id, + "op": delete_response.op, + "worker": delete_worker, + "query": delete_query, + }), + }) +} + +async fn run_cold_start_recovery_check( + runtime: &BaselineRuntime, + service: &ElfService, + recovery_note: &CorpusNote, +) -> color_eyre::Result { + let recovery_service = build_service(runtime).await?; + let recovery_query = run_single_query( + &recovery_service, + QueryCase { + id: "lifecycle-cold-start-recovery".to_string(), + query: recovery_note.text.clone(), + expected_doc: recovery_note.source_doc.clone(), + expected_terms: distinctive_terms(&recovery_note.text, 2), + }, + ) + .await?; + let outbox_counts = pending_outbox_counts(service).await?; + + Ok(CheckResult { + name: "cold_start_recovery_search", + status: if recovery_query.matched { "pass" } else { "fail" }, + reason: if recovery_query.matched { + "A newly constructed service over the same Postgres and Qdrant stores retrieved persisted evidence.".to_string() + } else { + "A newly constructed service over the same stores could not retrieve persisted evidence.".to_string() + }, + evidence: serde_json::json!({ + "query": recovery_query, + "pending_outbox_by_op": outbox_counts, + "note": recovery_note.source_doc, + }), + }) +} + +async fn pending_outbox_counts(service: &ElfService) -> color_eyre::Result> { + let rows = sqlx::query_as::<_, (String, i64)>( + "\ +SELECT op, COUNT(*)::bigint +FROM indexing_outbox +WHERE status = 'PENDING' +GROUP BY op +ORDER BY op", + ) + .fetch_all(&service.db.pool) + .await?; + + Ok(rows.into_iter().collect()) +} + +async fn run_concurrent_write_check( + runtime: &BaselineRuntime, + service: Arc, +) -> color_eyre::Result { + let note_count = concurrent_note_count(); + let mut set = JoinSet::new(); + + for index in 0..note_count { + let request = concurrent_add_request(index); + let service_ref = Arc::clone(&service); + + set.spawn(async move { + let response = 
service_ref.add_note(request).await?; + let note_id = response + .results + .first() + .and_then(|result| result.note_id) + .ok_or_else(|| eyre::eyre!("Concurrent add_note did not return a note_id."))?; + + Ok::(note_id) + }); + } + + let mut note_ids = Vec::with_capacity(note_count); + + while let Some(joined) = set.join_next().await { + note_ids.push(joined??); + } + + let worker_evidence = + run_worker_until_indexed(runtime, &service, &note_ids, "concurrent_upsert").await?; + let probe_indexes = concurrency_probe_indexes(note_count); + let mut query_results = Vec::new(); + + for index in probe_indexes { + query_results.push(run_single_query(&service, concurrent_query_case(index)).await?); + } + + let pass_count = query_results.iter().filter(|result| result.matched).count(); + let pass = outbox_done(&worker_evidence.after, worker_evidence.expected_note_count) + && pass_count == query_results.len(); + + Ok(CheckResult { + name: "concurrent_write_search_e2e", + status: if pass { "pass" } else { "fail" }, + reason: if pass { + "Concurrent add_note calls were indexed by the worker and remained searchable." + .to_string() + } else { + "Concurrent add_note calls did not all become searchable after worker indexing."
+ .to_string() + }, + evidence: serde_json::json!({ + "note_count": note_count, + "worker": worker_evidence, + "query_summary": { + "total": query_results.len(), + "pass": pass_count, + "fail": query_results.len().saturating_sub(pass_count), + }, + "queries": query_results, + }), + }) +} + +async fn run_soak_stability_check( + runtime: &BaselineRuntime, + service: Arc, +) -> color_eyre::Result> { + let config = soak_config(); + + if config.target_seconds == 0 && config.write_rounds == 0 { + return Ok(None); + } + + let target_duration = Duration::from_secs(config.target_seconds); + let started_at = Instant::now(); + let write_rounds = config.write_rounds.max(if config.target_seconds > 0 { 1 } else { 0 }); + let mut note_ids = Vec::with_capacity(write_rounds); + let mut worker_runs = Vec::with_capacity(write_rounds); + let mut query_results = Vec::new(); + + for index in 0..write_rounds { + let response = service.add_note(soak_add_request(index)).await?; + let note_id = response + .results + .first() + .and_then(|result| result.note_id) + .ok_or_else(|| eyre::eyre!("Soak add_note did not return a note_id."))?; + + note_ids.push(note_id); + worker_runs + .push(run_worker_until_indexed(runtime, &service, &[note_id], "soak_upsert").await?); + query_results.push(run_single_query(&service, soak_query_case(index)).await?); + + if config.target_seconds > 0 && write_rounds > 1 { + let target_elapsed = target_duration.mul_f64((index + 1) as f64 / write_rounds as f64); + + if started_at.elapsed() < target_elapsed { + time::sleep(target_elapsed.saturating_sub(started_at.elapsed())).await; + } + } + } + + let mut probe_index = 0; + + while started_at.elapsed() < target_duration { + let index = probe_index % write_rounds; + + query_results.push(run_single_query(&service, soak_query_case(index)).await?); + + probe_index += 1; + + let sleep_for = Duration::from_millis(config.probe_interval_millis) + .min(target_duration.saturating_sub(started_at.elapsed())); + + if 
!sleep_for.is_zero() { + time::sleep(sleep_for).await; + } + } + + let elapsed_seconds = started_at.elapsed().as_secs_f64(); + let pass_count = query_results.iter().filter(|result| result.matched).count(); + let query_fail_count = query_results.len().saturating_sub(pass_count); + let worker_pass = + worker_runs.iter().all(|run| outbox_done(&run.after, run.expected_note_count)); + let duration_pass = target_duration.is_zero() || started_at.elapsed() >= target_duration; + let pass = worker_pass && duration_pass && query_fail_count == 0; + let failed_queries = query_results.iter().filter(|result| !result.matched).collect::>(); + + Ok(Some(CheckResult { + name: "soak_stability_e2e", + status: if pass { "pass" } else { "fail" }, + reason: if pass { + "ELF sustained repeated write, worker indexing, and search probes for the configured soak window.".to_string() + } else { + "ELF did not sustain the configured soak write/search window without a failed worker or retrieval probe.".to_string() + }, + evidence: serde_json::json!({ + "config": config, + "elapsed_seconds": elapsed_seconds, + "duration_met": duration_pass, + "worker_pass": worker_pass, + "write_note_ids": note_ids, + "worker_runs": worker_runs, + "query_summary": { + "total": query_results.len(), + "pass": pass_count, + "fail": query_fail_count, + }, + "failed_queries": failed_queries, + }), + })) +} diff --git a/apps/elf-worker/src/worker.rs b/apps/elf-worker/src/worker.rs index 27f3a1ab..823094a5 100644 --- a/apps/elf-worker/src/worker.rs +++ b/apps/elf-worker/src/worker.rs @@ -253,6 +253,15 @@ pub async fn run_worker(state: WorkerState) -> Result<()> { } } +/// Processes at most one due job from each worker-owned queue. 
+pub async fn process_once(state: &WorkerState) -> Result<()> { + process_indexing_outbox_once(state).await?; + process_doc_indexing_outbox_once(state).await?; + process_trace_outbox_once(state).await?; + + Ok(()) +} + fn is_not_found_error(err: &QdrantError) -> bool { let message = err.to_string().to_lowercase(); let point_not_found = diff --git a/docker-compose.baseline.yml b/docker-compose.baseline.yml new file mode 100644 index 00000000..ac7e9762 --- /dev/null +++ b/docker-compose.baseline.yml @@ -0,0 +1,97 @@ +name: elf-live-baseline + +services: + postgres: + image: pgvector/pgvector:pg18 + environment: + POSTGRES_DB: postgres + POSTGRES_PASSWORD: elf_dev_password + POSTGRES_USER: elf_dev + healthcheck: + test: + - CMD-SHELL + - pg_isready -U elf_dev -d postgres + interval: 2s + timeout: 5s + retries: 30 + volumes: + - elf-live-baseline-postgres-data:/var/lib/postgresql + + qdrant: + image: qdrant/qdrant:v1.16.3 + volumes: + - elf-live-baseline-qdrant-data:/qdrant/storage + + baseline-runner: + build: + context: . 
+ dockerfile: docker/baseline/Dockerfile + depends_on: + postgres: + condition: service_healthy + qdrant: + condition: service_started + environment: + CARGO_HOME: /usr/local/cargo + ELF_BASELINE_ELF_HEAD: ${ELF_BASELINE_ELF_HEAD:-unknown} + DASHSCOPE_API_BASE: ${DASHSCOPE_API_BASE:-} + DASHSCOPE_API_KEY: ${DASHSCOPE_API_KEY:-} + DASHSCOPE_EMBEDDING_DIMENSIONS: ${DASHSCOPE_EMBEDDING_DIMENSIONS:-} + EMBEDDING_API_BASE: ${EMBEDDING_API_BASE:-} + EMBEDDING_API_KEY: ${EMBEDDING_API_KEY:-} + EMBEDDING_DIMENSIONS: ${EMBEDDING_DIMENSIONS:-} + EMBEDDING_MODEL: ${EMBEDDING_MODEL:-} + EMBEDDING_PATH: ${EMBEDDING_PATH:-} + EMBEDDING_PROVIDER_ID: ${EMBEDDING_PROVIDER_ID:-} + EMBEDDING_TIMEOUT_MS: ${EMBEDDING_TIMEOUT_MS:-} + ELF_BASELINE_CONCURRENT_NOTES: ${ELF_BASELINE_CONCURRENT_NOTES:-} + ELF_BASELINE_ELF_EMBEDDING_API_BASE: ${ELF_BASELINE_ELF_EMBEDDING_API_BASE:-} + ELF_BASELINE_ELF_EMBEDDING_API_KEY: ${ELF_BASELINE_ELF_EMBEDDING_API_KEY:-} + ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS: ${ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS:-} + ELF_BASELINE_ELF_EMBEDDING_MODE: ${ELF_BASELINE_ELF_EMBEDDING_MODE:-local} + ELF_BASELINE_ELF_EMBEDDING_MODEL: ${ELF_BASELINE_ELF_EMBEDDING_MODEL:-} + ELF_BASELINE_ELF_EMBEDDING_PATH: ${ELF_BASELINE_ELF_EMBEDDING_PATH:-} + ELF_BASELINE_ELF_EMBEDDING_PROVIDER_ID: ${ELF_BASELINE_ELF_EMBEDDING_PROVIDER_ID:-} + ELF_BASELINE_ELF_EMBEDDING_TIMEOUT_MS: ${ELF_BASELINE_ELF_EMBEDDING_TIMEOUT_MS:-} + ELF_BASELINE_MAX_ELF_RSS_KB: ${ELF_BASELINE_MAX_ELF_RSS_KB:-1500000} + ELF_BASELINE_MAX_ELF_SECONDS: ${ELF_BASELINE_MAX_ELF_SECONDS:-600} + ELF_BASELINE_PROFILE: ${ELF_BASELINE_PROFILE:-smoke} + ELF_BASELINE_PROJECTS: ${ELF_BASELINE_PROJECTS:-all} + ELF_BASELINE_REPORT_DIR: /workspace/tmp/live-baseline + ELF_BASELINE_SCALE_DOCS: ${ELF_BASELINE_SCALE_DOCS:-120} + ELF_BASELINE_SOAK_PROBE_INTERVAL_MS: ${ELF_BASELINE_SOAK_PROBE_INTERVAL_MS:-} + ELF_BASELINE_SOAK_ROUNDS: ${ELF_BASELINE_SOAK_ROUNDS:-} + ELF_BASELINE_SOAK_SECONDS: ${ELF_BASELINE_SOAK_SECONDS:-} + 
ELF_BASELINE_STRESS_DOCS: ${ELF_BASELINE_STRESS_DOCS:-480} + ELF_BASELINE_TOP_K: ${ELF_BASELINE_TOP_K:-10} + QWEN_API_KEY: ${QWEN_API_KEY:-} + QWEN_EMBEDDING_API_BASE: ${QWEN_EMBEDDING_API_BASE:-} + QWEN_EMBEDDING_DIMENSIONS: ${QWEN_EMBEDDING_DIMENSIONS:-} + QWEN_EMBEDDING_MODEL: ${QWEN_EMBEDDING_MODEL:-} + QWEN_EMBEDDING_PATH: ${QWEN_EMBEDDING_PATH:-} + QWEN_EMBEDDING_PROVIDER_ID: ${QWEN_EMBEDDING_PROVIDER_ID:-} + QWEN_EMBEDDING_TIMEOUT_MS: ${QWEN_EMBEDDING_TIMEOUT_MS:-} + ELF_PG_DSN: postgres://elf_dev:elf_dev_password@postgres:5432/postgres + ELF_QDRANT_GRPC_URL: http://qdrant:6334 + ELF_QDRANT_HTTP_URL: http://qdrant:6333 + RUSTUP_HOME: /usr/local/rustup + volumes: + - elf-live-baseline-npm-cache:/root/.npm + - elf-live-baseline-pip-cache:/root/.cache/pip + - elf-live-baseline-huggingface-cache:/root/.cache/huggingface + - elf-live-baseline-qmd-cache:/root/.cache/qmd + - elf-live-baseline-cargo-git:/usr/local/cargo/git + - elf-live-baseline-cargo-registry:/usr/local/cargo/registry + - elf-live-baseline-target:/workspace/target + - ./tmp/live-baseline:/workspace/tmp/live-baseline + +volumes: + elf-live-baseline-cargo-git: + elf-live-baseline-cargo-registry: + elf-live-baseline-huggingface-cache: + elf-live-baseline-npm-cache: + elf-live-baseline-pip-cache: + elf-live-baseline-postgres-data: + elf-live-baseline-qmd-cache: + elf-live-baseline-qdrant-data: + elf-live-baseline-target: diff --git a/docker/baseline/Dockerfile b/docker/baseline/Dockerfile new file mode 100644 index 00000000..1384eb15 --- /dev/null +++ b/docker/baseline/Dockerfile @@ -0,0 +1,37 @@ +FROM node:22-bookworm + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bash \ + build-essential \ + ca-certificates \ + clang \ + cmake \ + curl \ + git \ + jq \ + libssl-dev \ + pkg-config \ + python3 \ + python3-dev \ + python3-pip \ + python3-venv \ + ripgrep \ + sqlite3 \ + && rm -rf /var/lib/apt/lists/* + +ENV CARGO_HOME=/usr/local/cargo +ENV RUSTUP_HOME=/usr/local/rustup +ENV 
PATH=/usr/local/cargo/bin:$PATH + +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \ + | sh -s -- -y --profile minimal --default-toolchain stable \ + && chmod -R a+w "${CARGO_HOME}" "${RUSTUP_HOME}" + +RUN npm install -g bun pnpm tsx + +WORKDIR /workspace + +COPY . /workspace + +CMD ["bash", "scripts/live-baseline-benchmark.sh"] diff --git a/docs/guide/benchmarking/2026-06-09-live-baseline-report.md b/docs/guide/benchmarking/2026-06-09-live-baseline-report.md new file mode 100644 index 00000000..bbfb55ae --- /dev/null +++ b/docs/guide/benchmarking/2026-06-09-live-baseline-report.md @@ -0,0 +1,203 @@ +# Live Baseline Benchmark Report - 2026-06-09 + +Goal: Preserve the checked-in evidence snapshot behind the README benchmark claims. +Read this when: You need the June 9, 2026 live baseline result, pass/fail reasons, or +the next benchmark iteration backlog. +Inputs: Docker-only benchmark reports generated by `cargo make baseline-live-docker`. +Depends on: `docs/guide/benchmarking/live_baseline_benchmark.md`, +`docker-compose.baseline.yml`, `scripts/live-baseline-benchmark.sh`, and +`scripts/live-baseline-report-to-md.sh`. +Verification: Re-run the commands in this report and compare +`tmp/live-baseline/live-baseline-report.json`. + +## Executive Summary + +- ELF passed the production-provider stress run with `Qwen3-Embedding-8B`, + 4096-dimensional embeddings, 480 documents, 16 queries, and `8/8` encoded checks. +- In the all-project smoke comparison, ELF and qmd passed every encoded check. + agentmemory passed same-corpus retrieval but failed or could not complete lifecycle + checks. mem0, memsearch, and claude-mem returned wrong same-corpus retrieval results + in the encoded smoke. OpenViking was incomplete because its local embedding dependency + could not complete in the Docker runner. +- Under the encoded service-style benchmark checks, ELF passed all ELF checks that were + run. 
Under the encoded local CLI smoke checks, qmd passed all qmd checks that were + run. +- This report records results for the checked-in Docker benchmark contract. It does not + evaluate dimensions that are not encoded in the runner. + +## ELF Production-Provider Stress Run + +| Field | Value | +| --- | --- | +| Run ID | `live-baseline-20260609010854` | +| Generated at | `2026-06-09T01:28:17Z` | +| Project filter | `ELF` | +| Corpus profile | `stress` | +| Documents | `480` | +| Queries | `16` | +| Verdict | `pass` | +| Same-corpus summary | `1/1 pass` | +| Full check summary | `8/8 pass` | +| Elapsed | `1163` seconds | +| Embedding mode | `provider` | +| Embedding model | `Qwen3-Embedding-8B` | +| Embedding dimensions | `4096` | +| Embedding API path | `https://ai.gitee.com/v1/embeddings` | +| Timeout | `30000` ms | + +Encoded checks covered: + +- same-corpus retrieval for all 16 stress queries; +- worker indexing for the 480-document corpus; +- update replacement; +- delete suppression; +- cold-start recovery over the same stores; +- concurrent write/search behavior; +- stress-profile soak behavior; +- resource envelope under the configured stress threshold. 
+ +Re-run command: + +```sh +set -a +source .env +set +a + +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +ELF_BASELINE_PROJECTS=ELF \ +ELF_BASELINE_PROFILE=stress \ +ELF_BASELINE_MAX_ELF_SECONDS=1800 \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +cargo make baseline-live-docker +``` + +## All-Project Smoke Comparison + +| Field | Value | +| --- | --- | +| Run ID | `live-baseline-20260609022837` | +| Generated at | `2026-06-09T02:42:37Z` | +| Project filter | `all` | +| Corpus profile | `smoke` | +| Documents | `3` | +| Queries | `3` | +| Aggregate verdict | `fail` | +| Project summary | `2 pass`, `4 fail`, `1 incomplete` | +| Same-corpus summary | `3 pass`, `3 fail`, `1 incomplete` | +| Full check summary | `17 pass`, `4 fail`, `4 incomplete` | + +The aggregate verdict is `fail` because the top-level report only passes when every +selected project passes every encoded project check. + +| Project | Status | Retrieval | Checks | Elapsed | Interpretation | +| --- | --- | --- | --- | --- | --- | +| ELF | `pass` | `retrieval_pass` | `7/7` | `57s` | Service-backed provider run passed retrieval, worker indexing, lifecycle, recovery, and concurrency checks. | +| qmd | `pass` | `retrieval_pass` | `4/4` | `53s` | Local CLI hybrid retrieval baseline passed retrieval, update, delete, and cold-start checks. | +| agentmemory | `fail` | `retrieval_pass` | `2/4` | `38s` | Retrieval passed, but update replacement failed because the old marker remained searchable; cold-start is incomplete in the current in-memory adapter. | +| memsearch | `fail` | `retrieval_wrong_result` | `2/4` | `169s` | Local search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. | +| mem0 | `fail` | `retrieval_wrong_result` | `2/4` | `41s` | Local add/search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. 
| +| OpenViking | `incomplete` | `local_embed_install_failed` | `0/1` | `385s` | The local embed install path hit a `llama-cpp-python` build/import failure in Docker, so retrieval was not evaluated. | +| claude-mem | `fail` | `retrieval_wrong_result` | `0/1` | `97s` | Same-corpus repository search ran but did not return expected evidence. | + +Re-run command: + +```sh +set -a +source .env +set +a + +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +ELF_BASELINE_PROFILE=smoke \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +cargo make baseline-live-docker +``` + +## Pass, Fail, And Incomplete Rules + +- `pass`: the project installed and every encoded retrieval, lifecycle, recovery, and + resource check for the selected corpus profile passed. +- `fail`: clone, install, import, build, retrieval, update, delete, recovery, + concurrency, soak, resource-envelope, or another declared project check failed. +- `incomplete`: the project partially ran, but the encoded check could not be completed + without extra provider keys, host integration, native dependency support, durable + runtime wiring, or a project-specific command mapping not yet encoded in the runner. + +`incomplete` is not a pass. It means the benchmark needs more wiring before making a +quality claim for that project. + +## Interpretation + +The benchmark is intentionally stricter than a feature checklist. It exercises whether a +project can ingest the same corpus, return expected evidence for the same queries, and +preserve basic lifecycle behavior under the runner's encoded contract. 
+ +ELF checks covered in this run: + +- production-provider embeddings through the same service path used by ELF; +- Postgres source-of-truth with Qdrant as a rebuildable derived index; +- worker-produced chunks and embeddings, not direct in-memory fixture shortcuts; +- explicit update, delete, cold-start, concurrency, soak, and resource checks; +- report metadata that records corpus profile, document count, query count, project + status, check summaries, elapsed seconds, and embedding configuration. + +qmd was the external project that passed every encoded smoke check. agentmemory passed +same-corpus retrieval, failed update replacement, and has incomplete cold-start coverage +because the current adapter uses an in-memory SDK/KV mock. mem0, memsearch, and +claude-mem failed the encoded smoke retrieval. OpenViking was not retrieval-evaluated +because the Docker local embedding install path did not complete. + +## Speed And Production Stance + +The 480-document ELF stress run took 1163 seconds, roughly 19.4 minutes, or about 2.4 +seconds per document end-to-end. That includes the service path, provider embedding +calls, worker indexing, Qdrant rebuild/search, lifecycle checks, soak, and container +overhead. Whether that is acceptable depends on the production workflow: it is a +cold/backfill measurement, not an interactive-ingest target. + +Throughput work should focus on: + +- micro-batching provider embedding requests; +- multiple outbox worker lanes with leases or `FOR UPDATE SKIP LOCKED`; +- batch Qdrant upserts; +- a bulk import mode that defers or relaxes semantic deduplication; +- vector handoff so an ingest-time embedding can be reused by the worker. + +## Next Benchmark Iterations + +- Add a sanitized private corpus that reflects real coding-agent memory cases. +- Add scale/stress matrix runs for qmd and the other external projects once their smoke + adapters are stable. 
+- Split elapsed time into install, ingest, embedding, indexing, query, and lifecycle + phases. +- Add recall@k, MRR, and false-positive measurements instead of only pass/fail expected + evidence checks. +- Add a batch-loading benchmark for ELF after provider micro-batching and parallel + worker lanes land. +- Deepen external lifecycle checks for OpenViking and claude-mem after their local + runtime paths can complete in Docker. + +## Publish Workflow + +Generate a fresh aggregate JSON: + +```sh +cargo make baseline-live-docker +``` + +Convert the latest JSON report into Markdown: + +```sh +ELF_BASELINE_MARKDOWN_REPORT=docs/guide/benchmarking/YYYY-MM-DD-live-baseline-report.md \ +cargo make baseline-live-report +``` + +Clean Docker-owned state: + +```sh +cargo make baseline-live-docker-clean +``` + +The only host report directory is `tmp/live-baseline/`. Raw generated JSON stays there +and is not committed by default. diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md new file mode 100644 index 00000000..4493e306 --- /dev/null +++ b/docs/guide/benchmarking/index.md @@ -0,0 +1,34 @@ +# Benchmarking Guide Index + +Goal: Route agents to live benchmark runbooks, report publication steps, and checked-in +benchmark evidence. +Read this when: You need to run, publish, interpret, or extend ELF benchmark evidence +against external memory systems. +Inputs: The benchmark question, selected corpus profile, and whether you need a runbook +or a saved evidence snapshot. +Depends on: `docs/index.md`, `docs/guide/index.md`, and `docs/governance.md`. +Outputs: The smallest benchmarking guide or report needed to continue. + +## Use This Index When + +- You need to run the live Docker-only benchmark matrix. +- You need to publish a Markdown report from a generated benchmark JSON report. +- You need the checked-in benchmark evidence behind README claims. +- You need to extend the benchmark matrix with new projects, profiles, or lifecycle + checks. 
+ +## Guides And Reports + +- `live_baseline_benchmark.md`: run, clean up, publish, and interpret the live + Docker-only benchmark matrix. +- `2026-06-09-live-baseline-report.md`: checked-in evidence snapshot for the June 9, + 2026 ELF production-provider stress run and all-project smoke comparison. + +## Update Rules + +- Add a dated report when a new run changes README-level claims. +- Keep generated raw JSON under `tmp/live-baseline/`; commit only reviewed Markdown + summaries and durable scripts. +- Link the newest decision-relevant report from README and this index. +- When benchmark semantics change, update `live_baseline_benchmark.md` and the + relevant spec before publishing a new result. diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md new file mode 100644 index 00000000..b61b1e2b --- /dev/null +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -0,0 +1,217 @@ +# Live Baseline Benchmark + +Goal: Run Docker-isolated, current-HEAD baseline checks against ELF and the external memory projects compared with ELF. +Read this when: You need evidence about which external projects actually run against a shared benchmark corpus. +Preconditions: Docker and Docker Compose are available on the host. +Depends on: `docker-compose.baseline.yml`, `scripts/live-baseline-benchmark.sh`, and `docs/spec/system_competitive_parity_gate_v1.md`. +Verification: `cargo make baseline-live-docker` writes `tmp/live-baseline/live-baseline-report.json`; `cargo make baseline-live-report` can render that JSON into a checked-in Markdown report. 
+ +## Scope + +The runner covers ELF plus the six external projects in the README comparison table: + +- ELF +- agentmemory +- OpenViking +- mem0 +- qmd +- claude-mem +- memsearch + +For ELF, the runner uses Docker-owned Postgres and Qdrant, writes the shared corpus +through `add_note`, drains the worker indexing outbox into persisted chunks and +embeddings, rebuilds Qdrant from the worker-produced chunk tables, and verifies +`search_raw` against the shared query manifest. It also runs ELF service lifecycle +checks for note update, note delete, cold-start recovery, concurrent writes, +configurable soak stability, and a local resource envelope over the same Docker-owned +stores. By default these checks use the deterministic local embedding provider. Set +`ELF_BASELINE_ELF_EMBEDDING_MODE=provider` to run ELF through the configured +production embedding provider instead. + +For external projects, the runner clones current upstream `main` inside Docker, records +the exact commit SHA, reads the same generated corpus and query manifest, and runs a +same-corpus retrieval adapter when the project exposes a local API or CLI that can run +without provider keys. + +Corpus profiles: + +- `smoke`: default, 3 documents and 3 query cases. +- `scale`: 120 documents by default, 8 query cases, and generated distractor notes + that make the check closer to a production retrieval benchmark. +- `stress`: 480 documents by default, 16 query cases, and alternate phrasings for + every needle query. + +Use `ELF_BASELINE_SCALE_DOCS` and `ELF_BASELINE_STRESS_DOCS` to raise or lower the +generated corpus sizes. +Use `ELF_BASELINE_CONCURRENT_NOTES`, `ELF_BASELINE_MAX_ELF_SECONDS`, and +`ELF_BASELINE_MAX_ELF_RSS_KB` to tune ELF's concurrent-write and resource-envelope +checks. +Use `ELF_BASELINE_SOAK_SECONDS`, `ELF_BASELINE_SOAK_ROUNDS`, and +`ELF_BASELINE_SOAK_PROBE_INTERVAL_MS` to tune ELF's repeated write/search soak +window. 
The smoke profile does not run soak by default; the scale/full profiles run a +short 15-second soak by default, and the stress profile runs a 60-second soak by +default. +Use `ELF_BASELINE_ELF_EMBEDDING_MODE=provider` plus +`ELF_BASELINE_ELF_EMBEDDING_API_BASE`, `ELF_BASELINE_ELF_EMBEDDING_API_KEY`, +`ELF_BASELINE_ELF_EMBEDDING_MODEL`, and +`ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS` to run ELF with a production embedding API. +The runner also accepts `QWEN_API_KEY`, `QWEN_EMBEDDING_API_BASE`, +`QWEN_EMBEDDING_MODEL`, `QWEN_EMBEDDING_DIMENSIONS`, and `QWEN_EMBEDDING_PATH` for +Qwen-compatible embedding configuration. Generic aliases `EMBEDDING_API_BASE`, +`EMBEDDING_API_KEY`, `EMBEDDING_MODEL`, `EMBEDDING_DIMENSIONS`, +`EMBEDDING_PROVIDER_ID`, `EMBEDDING_PATH`, and `EMBEDDING_TIMEOUT_MS` are also +supported. Provider-mode runs default to a 30-second embedding timeout unless an +explicit timeout env var is set. For Qwen3 production embedding runs, use +`Qwen3-Embedding-8B` with `EMBEDDING_DIMENSIONS=4096`. The aggregate report records +ELF's embedding mode, provider id, model, dimensions, timeout, API base, and path; it +never records the API key. + +Current external same-corpus adapters: + +- agentmemory: writes every corpus document through `mem::remember`, queries through + `mem::search`, exercises `mem::forget` delete suppression, and probes + superseding by writing a revised memory through `mem::remember`. The current + adapter uses an in-memory SDK/KV mock, so cold-start recovery is recorded as + `incomplete` until a durable agentmemory runtime is wired into the harness. +- qmd: adds the corpus as a collection, embeds it locally, and runs structured hybrid + `query --json` for every query case. It also rewrites and deletes corpus files, + then reruns `qmd update`, `qmd embed -f`, and fresh `qmd query` processes. +- memsearch: indexes the corpus with the local ONNX embedder and runs CLI search. 
+ It also rewrites and deletes corpus files, then reruns `memsearch index` and + fresh `memsearch search` processes. +- mem0: writes the corpus with `infer=false` and searches local FastEmbed + Qdrant + path storage. It also runs public `Memory.update`, `Memory.delete`, and a new + `Memory.from_config` over the same local paths. No LLM inference is required. +- claude-mem: writes every corpus document into the SQLite memory repository and runs + repository search for every query case. + +Current deeper checks: + +- ELF: same-corpus retrieval through worker-produced chunks, async worker indexing + completion, service update replacement through the worker, service delete + suppression through the worker, cold-start search recovery after constructing a + fresh service over the same Postgres and Qdrant stores, concurrent write/search E2E, + configurable repeated write/search soak stability, and a configurable local resource + envelope. +- qmd, memsearch, and mem0: same-corpus retrieval, update replacement, delete + suppression, and cold-start search recovery through their local public API or CLI + surfaces. +- agentmemory: same-corpus retrieval and delete suppression are exercised; update + replacement is probed through superseding `mem::remember`; cold-start recovery is + `incomplete` because the current adapter runs against an in-memory SDK/KV mock. +- claude-mem and OpenViking: same-corpus retrieval only when their local runtime path + can complete. Update, delete, and recovery checks are not yet encoded for these two + adapters. +- Concurrent write, soak stability, and resource-envelope checks are currently encoded + for ELF. They are not yet encoded for the external adapters. Multi-hour production + soak is still operator-controlled through `ELF_BASELINE_SOAK_SECONDS`; the checked-in + stress default is a bounded 60-second signal. + +OpenViking attempts the official `.[local-embed]` path plus `OpenViking.add_resource` +and `OpenViking.find`. 
If the Docker platform cannot build or import +`llama-cpp-python`, the project is recorded as `incomplete` with +`retrieval_status = "local_embed_install_failed"` rather than as a retrieval failure. + +## Checked-In Reports + +- `docs/guide/benchmarking/2026-06-09-live-baseline-report.md`: June 9, 2026 + production-provider ELF stress run and all-project smoke comparison. + +## Run + +```sh +cargo make baseline-live-docker +``` + +To run the scale or stress profile: + +```sh +ELF_BASELINE_PROFILE=scale cargo make baseline-live-docker +ELF_BASELINE_PROFILE=scale ELF_BASELINE_SCALE_DOCS=240 cargo make baseline-live-docker +ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker +``` + +To iterate on one or more project adapters without rerunning the full matrix: + +```sh +ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker +ELF_BASELINE_PROJECTS=ELF,memsearch cargo make baseline-live-docker +``` + +The only host artifact is: + +```text +tmp/live-baseline/ +``` + +That directory contains the aggregate report, per-project logs, and the shared query +fixture used by the run. The aggregate report records `corpus.profile`, +`corpus.document_count`, and `corpus.query_count` so smoke, scale, and stress runs are +not confused. Each project record includes `elapsed_seconds` for rough local runtime +comparison. ELF project records also include an `embedding` summary so deterministic +local and production-provider runs are not confused. Each project record also includes +`checks` and `check_summary`; the aggregate `full_check_summary` is the +adoption-relevant multi-check count. + +## Publish A Markdown Report + +After a run writes `tmp/live-baseline/live-baseline-report.json`, render a durable +Markdown summary: + +```sh +cargo make baseline-live-report +``` + +By default the task prints Markdown to stdout.
To write a checked-in report: + +```sh +ELF_BASELINE_MARKDOWN_REPORT=docs/guide/benchmarking/YYYY-MM-DD-live-baseline-report.md \ +cargo make baseline-live-report +``` + +The publisher summarizes one generated aggregate JSON report. For a combined report +that compares multiple runs, use the generated Markdown as input evidence and then add +the interpretation manually under `docs/guide/benchmarking/`. + +## Clean Up + +```sh +cargo make baseline-live-docker-clean +``` + +This removes Docker-managed Postgres, Qdrant, npm, pip, cargo, and target volumes used +by the live baseline runner. It does not remove the host report directory. + +## Result Semantics + +- `pass`: the project installed and every encoded check for that project passed in the + selected corpus profile. +- `fail`: clone, install, import, build, retrieval, or another declared check failed. +- `incomplete`: the project installed or partially ran, but a declared check could not + be completed without extra provider keys, agent-host integration, native dependency + support, durable runtime wiring, or a project-specific command mapping not yet + encoded in the runner. + +The top-level `verdict` is intentionally stricter than the per-project `status`: it +only returns `pass` when every selected project has `status = "pass"` and +`retrieval_status = "retrieval_pass"`. The `same_corpus_summary` field is the +retrieval count and does not treat lifecycle failures as retrieval failures. For +multi-check comparisons, read `full_check_summary` and each project's `checks`. + +`incomplete` is not a pass. Treat it as evidence that more benchmark wiring is needed. + +## Failure Conditions + +A project status should be `fail` when any declared project check completes and proves +the project did not meet the selected benchmark contract. 
Examples: + +- clone, install, import, or build returns a non-zero result; +- same-corpus retrieval runs but does not return the expected evidence; +- update replacement leaves superseded evidence searchable; +- delete suppression leaves deleted evidence searchable; +- cold-start recovery cannot find data that should persist; +- concurrent, soak, or resource-envelope checks exceed their declared threshold. + +Use `incomplete` instead of `fail` only when the runner cannot execute the declared +check fairly because adapter wiring, provider credentials, native dependency support, +or durable runtime integration is missing. diff --git a/docs/guide/index.md b/docs/guide/index.md index c221adcc..9fc8ace2 100644 --- a/docs/guide/index.md +++ b/docs/guide/index.md @@ -62,6 +62,8 @@ Then structure the body for execution: ## Guide subfolders +- `docs/guide/benchmarking/` for live benchmark runbooks, report publication steps, + and checked-in benchmark evidence. - `docs/guide/competitive_parity_testing.md` for running the Docker-only adoption gate against external memory-system baselines. - `docs/guide/development/` for repository-development workflows. diff --git a/packages/elf-providers/src/lib.rs b/packages/elf-providers/src/lib.rs index b3ea4ac3..a8adbf90 100644 --- a/packages/elf-providers/src/lib.rs +++ b/packages/elf-providers/src/lib.rs @@ -8,7 +8,7 @@ mod error; pub use error::{Error, Result}; -use reqwest::header::{AUTHORIZATION, HeaderMap, HeaderName}; +use reqwest::header::{ACCEPT_ENCODING, AUTHORIZATION, HeaderMap, HeaderName, HeaderValue}; use serde_json::{Map, Value}; /// Builds authenticated request headers for provider API calls. 
@@ -16,6 +16,7 @@ pub fn auth_headers(api_key: &str, default_headers: &Map) -> Resu let mut headers = HeaderMap::new(); headers.insert(AUTHORIZATION, format!("Bearer {api_key}").parse()?); + headers.insert(ACCEPT_ENCODING, HeaderValue::from_static("identity")); for (key, value) in default_headers { let Some(raw) = value.as_str() else { diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh new file mode 100755 index 00000000..fbb56b05 --- /dev/null +++ b/scripts/live-baseline-benchmark.sh @@ -0,0 +1,2144 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_BASELINE_REPORT_DIR:-${ROOT_DIR}/tmp/live-baseline}" +WORK_DIR="${ELF_BASELINE_WORK_DIR:-/bench}" +REPOS_DIR="${WORK_DIR}/repos" +CORPUS_DIR="${WORK_DIR}/corpus" +HOME_DIR="${WORK_DIR}/home" +RECORDS="${REPORT_DIR}/project-records.jsonl" +REPORT="${REPORT_DIR}/live-baseline-report.json" +RUN_ID="${ELF_BASELINE_RUN_ID:-live-baseline-$(date +%Y%m%d%H%M%S)}" +PROJECT_FILTER="${ELF_BASELINE_PROJECTS:-all}" +CORPUS_PROFILE="${ELF_BASELINE_PROFILE:-smoke}" +SCALE_DOC_COUNT="${ELF_BASELINE_SCALE_DOCS:-120}" +STRESS_DOC_COUNT="${ELF_BASELINE_STRESS_DOCS:-480}" +QUERY_TOP_K="${ELF_BASELINE_TOP_K:-10}" +CURRENT_PROJECT_STARTED_AT="" + +if [[ ! -f "/.dockerenv" && "${ELF_BASELINE_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run live baseline benchmark outside Docker. Use cargo make baseline-live-docker." >&2 + exit 1 +fi + +for cmd in bash cargo git jq node npm python3 rg timeout; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in baseline runner." 
>&2 + exit 1 + fi +done + +generate_corpus() { + python3 - "${CORPUS_PROFILE}" "${SCALE_DOC_COUNT}" "${STRESS_DOC_COUNT}" "${CORPUS_DIR}" "${REPORT_DIR}/queries.json" <<'PY' +import json +import sys +from pathlib import Path + +profile, scale_doc_count_raw, stress_doc_count_raw, corpus_dir_raw, queries_path_raw = sys.argv[1:] +corpus_dir = Path(corpus_dir_raw) +queries_path = Path(queries_path_raw) +scale_doc_count = int(scale_doc_count_raw) +stress_doc_count = int(stress_doc_count_raw) + +anchors = [ + { + "name": "auth-memory.md", + "title": "Auth Memory", + "body": "The API auth middleware validates JWT tokens with key id `kid-v3`. The middleware rejects tokens older than 15 minutes and requires tenant scope `project_shared` for deployment operations.", + "query": "Which JWT key id does the auth middleware require?", + "alternate_query": "Find the auth note that mentions key id kid-v3 and tenant scope.", + "terms": ["kid-v3", "auth middleware"], + }, + { + "name": "database-memory.md", + "title": "Database Memory", + "body": "The invoice list N+1 query was fixed by eager loading invoice lines through `InvoiceLineBatcher`. Do not reintroduce per-row SQL calls in invoice rendering.", + "query": "How was the invoice list N+1 query fixed?", + "alternate_query": "Find the invoice rendering memory about InvoiceLineBatcher and N+1 prevention.", + "terms": ["InvoiceLineBatcher", "N+1"], + }, + { + "name": "deploy-memory.md", + "title": "Deploy Memory", + "body": "Production deploys must run Docker-isolated parity checks first. 
The cleanup command must remove Postgres, Qdrant, npm, pip, cargo, and target volumes before adoption.", + "query": "What must be cleaned up after Docker parity checks?", + "alternate_query": "Find the deploy checklist that mentions Postgres, Qdrant, and cleanup volumes.", + "terms": ["Postgres", "Qdrant", "volumes"], + }, + { + "name": "retention-memory.md", + "title": "Retention Memory", + "body": "The retention worker uses `RetentionSweepPlan` before deletion and writes a tombstone ledger entry named `ledger-retain-77` for every expired note.", + "query": "Which plan does the retention worker use before deletion?", + "alternate_query": "Find the retention note with ledger-retain-77 tombstone handling.", + "terms": ["RetentionSweepPlan", "ledger-retain-77"], + }, + { + "name": "incident-memory.md", + "title": "Incident Memory", + "body": "During canary incidents, `CanaryTraceGate` must stay enabled until the rollback window closes and the release captain records marker `incident-green-42`.", + "query": "Which gate stays enabled during canary incidents?", + "alternate_query": "Find the canary incident memory with incident-green-42.", + "terms": ["CanaryTraceGate", "incident-green-42"], + }, + { + "name": "billing-memory.md", + "title": "Billing Memory", + "body": "Billing replay uses `UsageAccumulator` with idempotency key `bill-run-42` so duplicate metering events do not create extra invoices.", + "query": "Which accumulator and idempotency key protect billing replay?", + "alternate_query": "Find the billing replay note with bill-run-42.", + "terms": ["UsageAccumulator", "bill-run-42"], + }, + { + "name": "search-memory.md", + "title": "Search Memory", + "body": "Search fanout routes tenant scoped reads through `SemanticShardRouter`; every shard label must include the prefix `tenant_scope` before merge ranking.", + "query": "Which router handles tenant scoped search fanout?", + "alternate_query": "Find the tenant_scope shard routing memory.", + "terms": 
["SemanticShardRouter", "tenant_scope"], + }, + { + "name": "recovery-memory.md", + "title": "Recovery Memory", + "body": "Disaster recovery requires `SnapshotRestoreFence` and a WAL checkpoint named `wal-green-17` before accepting new writes after restore.", + "query": "Which fence is required before accepting writes after restore?", + "alternate_query": "Find the disaster recovery note with wal-green-17.", + "terms": ["SnapshotRestoreFence", "wal-green-17"], + }, +] + +if profile == "smoke": + docs = anchors[:3] +elif profile in {"scale", "full"}: + docs = list(anchors) + target_count = max(scale_doc_count, len(anchors)) +elif profile == "stress": + docs = list(anchors) + target_count = max(stress_doc_count, len(anchors)) +else: + raise SystemExit(f"unsupported ELF_BASELINE_PROFILE={profile!r}") + +if profile in {"scale", "full", "stress"}: + topics = [ + "scheduler dry run budget window", + "operator dashboard cache refresh", + "import packet normalization lane", + "workspace role synchronization", + "trace export sampling policy", + "background compaction checkpoint", + "local fixture replay validation", + "notification queue dampening", + ] + for idx in range(1, target_count - len(anchors) + 1): + topic = topics[idx % len(topics)] + docs.append( + { + "name": f"distractor-{idx:03d}.md", + "title": f"Distractor Memory {idx:03d}", + "body": ( + f"This operational note covers {topic}. " + f"It intentionally uses ordinary maintenance vocabulary for lane {idx:03d}, " + f"checkpoint batch {1000 + idx}, and reviewer group {idx % 9}. " + "It should not answer the benchmark needle queries." 
+ ), + } + ) + +for existing in corpus_dir.glob("*.md"): + existing.unlink() + +for doc in docs: + (corpus_dir / doc["name"]).write_text( + f"# {doc['title']}\n\n{doc['body']}\n", encoding="utf-8" + ) + +query_docs = anchors[: (3 if profile == "smoke" else len(anchors))] +queries = [] +for doc in query_docs: + base_id = doc["name"].replace("-memory.md", "").replace(".md", "") + queries.append( + { + "id": f"q-{base_id}", + "query": doc["query"], + "expected_doc": doc["name"], + "expected_terms": doc["terms"], + } + ) + if profile == "stress": + queries.append( + { + "id": f"q-{base_id}-alt", + "query": doc["alternate_query"], + "expected_doc": doc["name"], + "expected_terms": doc["terms"], + } + ) + +queries_path.write_text( + json.dumps( + { + "schema": "elf.live_baseline.queries/v1", + "profile": profile, + "document_count": len(docs), + "queries": queries, + }, + indent=2, + ) + + "\n", + encoding="utf-8", +) +PY +} + +rm -rf "${WORK_DIR}" +mkdir -p "${REPORT_DIR}" +find "${REPORT_DIR}" -maxdepth 1 -type f -delete +mkdir -p "${REPOS_DIR}" "${CORPUS_DIR}" "${HOME_DIR}" +: >"${RECORDS}" + +generate_corpus +DOCUMENT_COUNT="$(find "${CORPUS_DIR}" -maxdepth 1 -type f -name '*.md' | wc -l | tr -d ' ')" +QUERY_COUNT="$(jq '.queries | length' "${REPORT_DIR}/queries.json")" + +json_record() { + local project="$1" + local repo="$2" + local head="$3" + local status="$4" + local retrieval_status="$5" + local reason="$6" + local log_path="$7" + local command_summary="$8" + local finished_at + local elapsed_seconds + local checks_path + finished_at="$(date +%s)" + elapsed_seconds=0 + if [[ -n "${CURRENT_PROJECT_STARTED_AT}" ]]; then + elapsed_seconds=$((finished_at - CURRENT_PROJECT_STARTED_AT)) + fi + checks_path="${REPORT_DIR}/${project}-checks.json" + + if [[ -s "${checks_path}" ]] && jq -e '.checks and .check_summary' "${checks_path}" >/dev/null 2>&1; then + jq -nc \ + --arg project "${project}" \ + --arg repo "${repo}" \ + --arg head "${head}" \ + --arg status 
"${status}" \ + --arg retrieval_status "${retrieval_status}" \ + --arg reason "${reason}" \ + --arg log_path "${log_path}" \ + --arg command_summary "${command_summary}" \ + --argjson elapsed_seconds "${elapsed_seconds}" \ + --slurpfile checks "${checks_path}" \ + '{ + project: $project, + repo: $repo, + head: $head, + status: $status, + retrieval_status: $retrieval_status, + reason: $reason, + log_path: $log_path, + command_summary: $command_summary, + elapsed_seconds: $elapsed_seconds, + embedding: ($checks[0].embedding // null), + check_summary: $checks[0].check_summary, + checks: $checks[0].checks + }' >>"${RECORDS}" + else + jq -nc \ + --arg project "${project}" \ + --arg repo "${repo}" \ + --arg head "${head}" \ + --arg status "${status}" \ + --arg retrieval_status "${retrieval_status}" \ + --arg reason "${reason}" \ + --arg log_path "${log_path}" \ + --arg command_summary "${command_summary}" \ + --argjson elapsed_seconds "${elapsed_seconds}" \ + '{ + project: $project, + repo: $repo, + head: $head, + status: $status, + retrieval_status: $retrieval_status, + reason: $reason, + log_path: $log_path, + command_summary: $command_summary, + elapsed_seconds: $elapsed_seconds, + check_summary: { + total: 1, + pass: (if $retrieval_status == "retrieval_pass" then 1 else 0 end), + fail: (if $status == "fail" then 1 else 0 end), + incomplete: (if $retrieval_status != "retrieval_pass" and $status != "fail" then 1 else 0 end) + }, + checks: [ + { + name: "same_corpus_retrieval", + status: (if $retrieval_status == "retrieval_pass" then "pass" elif $status == "fail" then "fail" else "incomplete" end), + reason: $reason, + evidence: { + retrieval_status: $retrieval_status, + log_path: $log_path, + command_summary: $command_summary + } + } + ] + }' >>"${RECORDS}" + fi +} + +run_cmd() { + local label="$1" + local timeout_seconds="$2" + local log_path="$3" + shift 3 + + { + echo "## ${label}" + echo "## started_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "## command=$*" + } 
>>"${log_path}" + + # Capture the exit status with `||`. Reading `$?` after a failed `if ...; then ...; fi` + # (no else branch) or after `local` always yields 0, which made every failure look + # like success to callers. + local code=0 + timeout "${timeout_seconds}" bash -lc "$*" >>"${log_path}" 2>&1 || code=$? + echo "## exit=${code}" >>"${log_path}" + return "${code}" +} + +# Shallow-clones `repo` into ${REPOS_DIR}/<project>, logging to `log_path`. +# Prints the cloned HEAD SHA and returns 0 on success; prints `clone_failed` and +# returns 1 when the clone command fails. +clone_project() { + local project="$1" + local repo="$2" + local log_path="$3" + local target="${REPOS_DIR}/${project}" + + if run_cmd "${project}: clone" 180 "${log_path}" "git clone --depth 1 '${repo}' '${target}'"; then + git -C "${target}" rev-parse HEAD + return 0 + fi + + echo "clone_failed" + return 1 +} + +finish_report() { + jq -s \ + --arg schema "elf.live_baseline.report/v1" \ + --arg run_id "${RUN_ID}" \ + --arg project_filter "${PROJECT_FILTER}" \ + --arg corpus_profile "${CORPUS_PROFILE}" \ + --argjson document_count "${DOCUMENT_COUNT}" \ + --argjson query_count "${QUERY_COUNT}" \ + --arg generated_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + '{ + schema: $schema, + run_id: $run_id, + generated_at: $generated_at, + docker_only: true, + project_filter: $project_filter, + corpus: { + profile: $corpus_profile, + document_count: $document_count, + query_count: $query_count, + path: "generated in Docker under /bench/corpus", + query_file: "tmp/live-baseline/queries.json" + }, + verdict: ( + if length == 0 then "incomplete" + elif any(.[]; .status == "fail") then "fail" + elif all(.[]; .status == "pass" and .retrieval_status == "retrieval_pass") then "pass" + else "incomplete" + end + ), + summary: { + total: length, + pass: ([.[] | select(.status == "pass")] | length), + fail: ([.[] | select(.status == "fail")] | length), + incomplete: ([.[] | select(.status == "incomplete")] | length) + }, + same_corpus_summary: { + total: length, + pass: ([.[] | select(.retrieval_status == "retrieval_pass")] | length), + fail: ([.[] | select(.retrieval_status != "retrieval_pass" and .status == "fail")] | length), + incomplete: ([.[] | select(.retrieval_status != "retrieval_pass" and .status != "fail")] | length) + }, + full_check_summary: { +
total: ([.[] | .check_summary.total // 0] | add // 0), + pass: ([.[] | .check_summary.pass // 0] | add // 0), + fail: ([.[] | .check_summary.fail // 0] | add // 0), + incomplete: ([.[] | .check_summary.incomplete // 0] | add // 0) + }, + projects: . + }' "${RECORDS}" >"${REPORT}" +} + +project_enabled() { + local project="$1" + + if [[ -z "${PROJECT_FILTER}" || "${PROJECT_FILTER}" == "all" ]]; then + return 0 + fi + + for selected in ${PROJECT_FILTER//,/ }; do + if [[ "${selected}" == "${project}" ]]; then + return 0 + fi + done + + return 1 +} + +run_project() { + local project="$1" + local fn="$2" + + if project_enabled "${project}"; then + CURRENT_PROJECT_STARTED_AT="$(date +%s)" + "${fn}" + CURRENT_PROJECT_STARTED_AT="" + fi +} + +project_elf() { + local project="ELF" + local repo="local:/workspace" + local log_path="${REPORT_DIR}/${project}.log" + local result_path="${REPORT_DIR}/${project}-result.json" + local head + head="${ELF_BASELINE_ELF_HEAD:-}" + if [[ -z "${head}" ]]; then + head="$(git -C "${ROOT_DIR}" rev-parse HEAD 2>>"${log_path}" || echo "unknown")" + fi + + if run_cmd "${project}: same-corpus retrieval" 1200 "${log_path}" \ + "cd '${ROOT_DIR}' && cargo run -p elf-eval --bin live_baseline_elf -- --config config/local/elf.docker.toml --corpus '${CORPUS_DIR}' --queries '${REPORT_DIR}/queries.json' --out '${result_path}'"; then + if [[ -s "${result_path}" ]] && jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then + jq '{embedding, check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + if [[ -s "${result_path}" ]] && jq -e --argjson document_count "${DOCUMENT_COUNT}" --argjson query_count "${QUERY_COUNT}" ' + .schema == "elf.live_baseline.elf_result/v1" and + .status == "pass" and + .summary.total == $query_count and + .summary.fail == 0 and + .check_summary.fail == 0 and + .check_summary.incomplete == 0 and + .indexing.note_count == $document_count and + .indexing.rebuild_rebuilt_count >= 
$document_count and + .indexing.rebuild_error_count == 0 + ' "${result_path}" >/dev/null; then + json_record "${project}" "${repo}" "${head}" "pass" "retrieval_pass" \ + "$(jq -r '.reason' "${result_path}")" \ + "${project}.log" "add_note; worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability" + return + fi + + if [[ -s "${result_path}" ]] && jq -e '.schema == "elf.live_baseline.elf_result/v1"' "${result_path}" >/dev/null 2>&1; then + json_record "${project}" "${repo}" "${head}" "$(jq -r '.status // "fail"' "${result_path}")" \ + "$(jq -r '.retrieval_status // "retrieval_failed"' "${result_path}")" \ + "$(jq -r '.reason // "ELF result did not satisfy live baseline pass criteria"' "${result_path}")" \ + "${project}.log" "add_note; worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability" + return + fi + + json_record "${project}" "${repo}" "${head}" "fail" "runtime_failed" \ + "ELF command completed but did not write a valid live-baseline result; inspect ELF.log for the runtime error" \ + "${project}.log" "add_note; worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability" + return + fi + + json_record "${project}" "${repo}" "${head}" "fail" "runtime_failed" \ + "ELF same-corpus retrieval command failed in Docker" \ + "${project}.log" "add_note; worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability" +} + +project_agentmemory() { + local project="agentmemory" + local repo="https://github.com/rohitg00/agentmemory.git" + local log_path="${REPORT_DIR}/${project}.log" + local result_path="${REPORT_DIR}/${project}-search.json" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-agentmemory.ts" + local head + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + json_record "${project}" "${repo}" "${head}" "fail" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if run_cmd "${project}: install/build" 300 
"${log_path}" \ + "cd '${REPOS_DIR}/${project}' && (npm ci || npm install --no-audit --no-fund) && npm run build --if-present"; then + cat >"${driver_path}" <<'TS' +import { readFileSync, readdirSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; +import { registerRememberFunction } from "./src/functions/remember.js"; +import { + getSearchIndex, + registerSearchFunction, + setEmbeddingProvider, + setVectorIndex, +} from "./src/functions/search.js"; + +function mockKV() { + const store = new Map>(); + return { + get: async (scope: string, key: string): Promise => + (store.get(scope)?.get(key) as T) ?? null, + set: async (scope: string, key: string, data: T): Promise => { + if (!store.has(scope)) store.set(scope, new Map()); + store.get(scope)!.set(key, data); + return data; + }, + delete: async (scope: string, key: string): Promise => { + store.get(scope)?.delete(key); + }, + list: async (scope: string): Promise => { + const entries = store.get(scope); + return entries ? (Array.from(entries.values()) as T[]) : []; + }, + }; +} + +function mockSdk() { + const functions = new Map(); + return { + registerFunction: (idOrOpts: string | { id: string }, handler: Function) => { + const id = typeof idOrOpts === "string" ? idOrOpts : idOrOpts.id; + functions.set(id, handler); + }, + registerTrigger: () => {}, + trigger: async ( + idOrInput: string | { function_id: string; payload: unknown }, + data?: unknown, + ) => { + const id = typeof idOrInput === "string" ? idOrInput : idOrInput.function_id; + const payload = typeof idOrInput === "string" ? 
data : idOrInput.payload; + const fn = functions.get(id); + if (!fn) { + if (id === "mem::cascade-update") return { success: true }; + throw new Error(`No function: ${id}`); + } + return fn(payload); + }, + }; +} + +type QueryCase = { + id: string; + query: string; + expected_doc: string; + expected_terms: string[]; +}; + +const outPath = process.argv[2]; +const corpusPath = process.argv[3]; +const queriesPath = process.argv[4]; +if (!outPath || !corpusPath || !queriesPath) { + throw new Error("output path, corpus path, and query path are required"); +} + +const sdk = mockSdk(); +const kv = mockKV(); +getSearchIndex().clear(); +setVectorIndex(null); +setEmbeddingProvider(null); +registerRememberFunction(sdk as never, kv as never); +registerSearchFunction(sdk as never, kv as never); + +function plainText(markdown: string): string { + return markdown + .split(/\r?\n/) + .filter((line) => !line.trimStart().startsWith("#")) + .join(" ") + .replace(/\s+/g, " ") + .trim(); +} + +function conceptsFor(file: string): string[] { + return file + .replace(/\.md$/i, "") + .split(/[^A-Za-z0-9]+/) + .map((part) => part.toLowerCase()) + .filter(Boolean); +} + +function queryMatches(result: unknown, query: QueryCase): boolean { + const results = (result as { results?: unknown[] }).results ?? []; + return results.some((entry) => { + const entryJson = JSON.stringify(entry); + const entryText = entryJson.toLowerCase(); + const files = + (entry as { observation?: { files?: string[] } }).observation?.files ?? []; + return ( + files.includes(query.expected_doc) && + query.expected_terms.every((term) => + entryText.includes(term.toLowerCase()), + ) + ); + }); +} + +function resultEntries(result: unknown): unknown[] { + return (result as { results?: unknown[] }).results ?? 
[]; +} + +function makeCheck( + name: string, + status: "pass" | "fail" | "incomplete", + reason: string, + evidence: unknown, +) { + return { name, status, reason, evidence }; +} + +function summarizeChecks(checks: Array<{ status: string }>) { + return { + total: checks.length, + pass: checks.filter((check) => check.status === "pass").length, + fail: checks.filter((check) => check.status === "fail").length, + incomplete: checks.filter((check) => check.status === "incomplete").length, + }; +} + +async function runSearch(query: QueryCase) { + return sdk.trigger("mem::search", { + query: query.query, + limit: topK, + format: "full", + project: "elfbench", + }); +} + +const docs = readdirSync(corpusPath) + .filter((file) => file.endsWith(".md")) + .sort() + .map((file) => ({ + content: plainText(readFileSync(join(corpusPath, file), "utf8")), + concepts: conceptsFor(file), + files: [file], + })); +const queries = JSON.parse(readFileSync(queriesPath, "utf8")).queries as QueryCase[]; + +const writes = []; +const memoryIdsBySource = new Map(); +for (const doc of docs) { + const write = await sdk.trigger("mem::remember", { + content: doc.content, + type: "fact", + concepts: doc.concepts, + files: doc.files, + project: "elfbench", + agentId: "elf-baseline", + }); + writes.push({ source: doc.files[0], result: write }); + const memoryId = (write as { memory?: { id?: string } }).memory?.id; + if (memoryId) memoryIdsBySource.set(doc.files[0], memoryId); +} + +const queryResults = []; +const topK = Number(process.env.ELF_BASELINE_TOP_K ?? "10"); +for (const query of queries) { + const result = await runSearch(query); + queryResults.push({ + id: query.id, + query: query.query, + expected_doc: query.expected_doc, + expected_terms: query.expected_terms, + matched: queryMatches(result, query), + result, + }); +} + +const pass = queryResults.filter((result) => result.matched).length; +const checks = [ + makeCheck( + "same_corpus_retrieval", + pass === queryResults.length ? 
"pass" : "fail", + pass === queryResults.length + ? "agentmemory mem::remember/mem::search returned expected evidence for every query." + : "agentmemory mem::remember/mem::search missed one or more expected results.", + { + total: queryResults.length, + pass, + fail: queryResults.length - pass, + }, + ), +]; + +const authId = memoryIdsBySource.get("auth-memory.md"); +if (!authId) { + checks.push( + makeCheck( + "update_replaces_note_text", + "incomplete", + "The auth memory id was not returned by mem::remember, so supersede/update could not be exercised.", + { source: "auth-memory.md" }, + ), + ); +} else { + const updateRemember = await sdk.trigger("mem::remember", { + content: + "The API auth middleware validates JWT tokens with key id `kid-v4` under `RotatedJwtKeyPlan`. The middleware rejects tokens older than 15 minutes and requires tenant scope `project_shared` for deployment operations.", + type: "fact", + concepts: conceptsFor("auth-memory.md"), + files: ["auth-memory.md"], + project: "elfbench", + agentId: "elf-baseline", + }); + const updateQuery: QueryCase = { + id: "lifecycle-update-new-marker", + query: "Which rotated JWT key id does the auth middleware require?", + expected_doc: "auth-memory.md", + expected_terms: ["kid-v4", "RotatedJwtKeyPlan"], + }; + const updateResult = await runSearch(updateQuery); + const updateMatched = queryMatches(updateResult, updateQuery); + const oldMarkerAbsent = resultEntries(updateResult) + .filter((entry) => { + const files = + (entry as { observation?: { files?: string[] } }).observation?.files ?? []; + return files.includes("auth-memory.md"); + }) + .every((entry) => !JSON.stringify(entry).toLowerCase().includes("kid-v3")); + checks.push( + makeCheck( + "update_replaces_note_text", + updateMatched && oldMarkerAbsent ? "pass" : "fail", + updateMatched && oldMarkerAbsent + ? "agentmemory mem::remember supersede returned the new marker and did not return the old marker for the updated file." 
+ : "agentmemory mem::remember supersede did not cleanly replace the searchable auth memory text.", + { + memory_id: authId, + update_result: updateRemember, + matched_new_marker: updateMatched, + old_marker_absent: oldMarkerAbsent, + result: updateResult, + }, + ), + ); +} + +const deleteQuery = queries.find( + (query) => + query.expected_doc !== "auth-memory.md" && + query.expected_doc !== "database-memory.md" && + memoryIdsBySource.has(query.expected_doc), +); +if (!deleteQuery) { + checks.push( + makeCheck( + "delete_suppresses_retrieval", + "incomplete", + "No non-update, non-recovery memory id was available, so mem::forget could not be exercised.", + { available_sources: Array.from(memoryIdsBySource.keys()).sort() }, + ), + ); +} else { + const deleteId = memoryIdsBySource.get(deleteQuery.expected_doc)!; + const deleteResult = await sdk.trigger("mem::forget", { memoryId: deleteId }); + const searchAfterDelete = await runSearch(deleteQuery); + const deletedStillMatched = queryMatches(searchAfterDelete, deleteQuery); + checks.push( + makeCheck( + "delete_suppresses_retrieval", + deletedStillMatched ? "fail" : "pass", + deletedStillMatched + ? "agentmemory mem::forget returned success but the deleted memory was still searchable." 
+ : "agentmemory mem::forget suppressed the deleted memory from subsequent search.", + { + memory_id: deleteId, + source: deleteQuery.expected_doc, + query: deleteQuery, + delete_result: deleteResult, + deleted_still_matched: deletedStillMatched, + result: searchAfterDelete, + }, + ), + ); +} + +checks.push( + makeCheck( + "cold_start_recovery_search", + "incomplete", + "This adapter runs agentmemory against an in-memory SDK/KV mock; no durable store is available in the harness to prove cold-start recovery.", + { + adapter_storage: "mock StateKV Map", + required_next_step: "wire an agentmemory persistent KV/index path or hosted runtime for restart testing", + }, + ), +); + +const checkSummary = summarizeChecks(checks); + +writeFileSync( + outPath, + JSON.stringify( + { + schema: "elf.live_baseline.agentmemory_result/v1", + corpus: { + document_count: docs.length, + query_count: queries.length, + }, + writes, + summary: { + total: queryResults.length, + pass, + fail: queryResults.length - pass, + }, + check_summary: checkSummary, + checks, + queries: queryResults, + }, + null, + 2, + ), +); +TS + if run_cmd "${project}: same-corpus remember/search" 240 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && npx tsx '${driver_path}' '${result_path}' '${CORPUS_DIR}' '${REPORT_DIR}/queries.json'"; then + if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then + jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + if jq -e --argjson query_count "${QUERY_COUNT}" --argjson document_count "${DOCUMENT_COUNT}" ' + .schema == "elf.live_baseline.agentmemory_result/v1" and + .corpus.document_count == $document_count and + .summary.total == $query_count and + .summary.fail == 0 and + .check_summary.fail == 0 and + .check_summary.incomplete == 0 + ' "${result_path}" >/dev/null; then + json_record "${project}" "${repo}" "${head}" "pass" "retrieval_pass" "agentmemory mem::remember/mem::search found expected evidence and 
lifecycle checks passed" "${project}.log" "npm install/build; mem::remember/mem::forget/mem::search" + return + fi + if jq -e --argjson query_count "${QUERY_COUNT}" --argjson document_count "${DOCUMENT_COUNT}" ' + .schema == "elf.live_baseline.agentmemory_result/v1" and + .corpus.document_count == $document_count and + .summary.total == $query_count and + .summary.fail == 0 and + .check_summary.fail == 0 + ' "${result_path}" >/dev/null; then + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_pass" "agentmemory same-corpus retrieval passed, but one or more lifecycle checks could not be completed in the in-memory harness" "${project}.log" "npm install/build; mem::remember/mem::forget/mem::search" + return + fi + if jq -e --argjson query_count "${QUERY_COUNT}" --argjson document_count "${DOCUMENT_COUNT}" ' + .schema == "elf.live_baseline.agentmemory_result/v1" and + .corpus.document_count == $document_count and + .summary.total == $query_count and + .summary.fail == 0 + ' "${result_path}" >/dev/null; then + json_record "${project}" "${repo}" "${head}" "fail" "retrieval_pass" "agentmemory same-corpus retrieval passed, but one or more lifecycle checks failed" "${project}.log" "npm install/build; mem::remember/mem::forget/mem::search" + return + fi + json_record "${project}" "${repo}" "${head}" "fail" "retrieval_wrong_result" "agentmemory same-corpus search ran but did not return expected evidence" "${project}.log" "npm install/build; mem::remember; mem::search" + return + fi + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "agentmemory install/build passed but same-corpus remember/search failed" "${project}.log" "npm install/build; mem::remember; mem::search" + return + fi + + json_record "${project}" "${repo}" "${head}" "fail" "not_run" "install/build failed" "${project}.log" "npm install/build" +} + +project_qmd() { + local project="qmd" + local repo="https://github.com/tobi/qmd.git" + local 
log_path="${REPORT_DIR}/${project}.log" + local query_result_path="${REPORT_DIR}/${project}-query.json" + local status_path="${REPORT_DIR}/${project}-status.txt" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-qmd.mjs" + local home="${HOME_DIR}/${project}" + local head + mkdir -p "${home}" + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + json_record "${project}" "${repo}" "${head}" "fail" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if ! run_cmd "${project}: install/build" 300 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && (npm ci || npm install --no-audit --no-fund) && npm run build --if-present"; then + json_record "${project}" "${repo}" "${head}" "fail" "not_run" "install/build failed" "${project}.log" "npm install/build" + return + fi + + cat >"${driver_path}" <<'JS' +import { execFileSync } from "node:child_process"; +import { existsSync, readFileSync, unlinkSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; + +const outPath = process.argv[2]; +const queriesPath = process.argv[3]; +const corpusPath = process.argv[4]; +if (!outPath || !queriesPath || !corpusPath) { + throw new Error("output path, query path, and corpus path are required"); +} + +const queries = JSON.parse(readFileSync(queriesPath, "utf8")).queries; +const topK = process.env.ELF_BASELINE_TOP_K ?? "10"; + +function resultMatches(results, query) { + if (!Array.isArray(results)) return false; + return results.some((entry) => { + const entryText = JSON.stringify(entry).toLowerCase(); + const file = String(entry.file ?? 
""); + return ( + file.includes(query.expected_doc) && + query.expected_terms.every((term) => + entryText.includes(String(term).toLowerCase()), + ) + ); + }); +} + +function qmdQuery(queryText) { + const structuredQuery = `lex: ${queryText}\nvec: ${queryText}`; + const stdout = execFileSync( + "npx", + [ + "tsx", + "src/cli/qmd.ts", + "query", + structuredQuery, + "-c", + "elfbench", + "--json", + "--no-rerank", + "--min-score", + "0", + "-n", + topK, + ], + { encoding: "utf8", env: process.env }, + ); + return JSON.parse(stdout); +} + +function runQueryCase(query) { + const results = qmdQuery(query.query); + return { + id: query.id, + query: query.query, + expected_doc: query.expected_doc, + expected_terms: query.expected_terms, + matched: resultMatches(results, query), + results, + }; +} + +function makeCheck(name, status, reason, evidence) { + return { name, status, reason, evidence }; +} + +function summarizeChecks(checks) { + return { + total: checks.length, + pass: checks.filter((check) => check.status === "pass").length, + fail: checks.filter((check) => check.status === "fail").length, + incomplete: checks.filter((check) => check.status === "incomplete").length, + }; +} + +function runQmd(args) { + return execFileSync("npx", ["tsx", "src/cli/qmd.ts", ...args], { + encoding: "utf8", + env: process.env, + }); +} + +function syncCollection({ embed = false } = {}) { + runQmd(["update"]); + if (embed) { + runQmd(["embed", "-f", "-c", "elfbench"]); + } +} + +const queryResults = queries.map((query) => runQueryCase(query)); +const pass = queryResults.filter((result) => result.matched).length; +const checks = [ + makeCheck( + "same_corpus_retrieval", + pass === queryResults.length ? "pass" : "fail", + pass === queryResults.length + ? "qmd structured hybrid query returned expected evidence for every query." 
+ : "qmd structured hybrid query missed one or more expected results.", + { + total: queryResults.length, + pass, + fail: queryResults.length - pass, + }, + ), +]; + +const authPath = join(corpusPath, "auth-memory.md"); +if (!existsSync(authPath)) { + checks.push( + makeCheck( + "update_replaces_note_text", + "incomplete", + "The auth corpus file was missing, so qmd update could not be exercised.", + { source: "auth-memory.md" }, + ), + ); +} else { + writeFileSync( + authPath, + "# Auth Memory\n\nRotated auth middleware validates JWT tokens with key id `kid-v4` under `RotatedJwtKeyPlan`. It still requires tenant scope `project_shared` for deployment operations after the emergency key rotation.\n", + ); + syncCollection({ embed: true }); + const updateQuery = { + id: "lifecycle-update-new-marker", + query: "Which rotated JWT key id does the auth middleware require?", + expected_doc: "auth-memory.md", + expected_terms: ["kid-v4", "RotatedJwtKeyPlan"], + }; + const updateResults = qmdQuery(updateQuery.query); + const updateMatched = resultMatches(updateResults, updateQuery); + // resultMatches tolerates a non-array payload; guard here as well so such a payload fails this check instead of throwing before the result file is written. + const oldMarkerAbsent = (Array.isArray(updateResults) ? updateResults : []) + .filter((entry) => String(entry.file ?? "").includes("auth-memory.md")) + .every((entry) => !JSON.stringify(entry).toLowerCase().includes("kid-v3")); + checks.push( + makeCheck( + "update_replaces_note_text", + updateMatched && oldMarkerAbsent ? "pass" : "fail", + updateMatched && oldMarkerAbsent + ? "qmd update/embed returned the new marker and did not return the old marker for the updated file." 
+ : "qmd update/embed did not cleanly replace the searchable auth file text.", + { + source: "auth-memory.md", + matched_new_marker: updateMatched, + old_marker_absent: oldMarkerAbsent, + results: updateResults, + }, + ), + ); +} + +const deleteQuery = queries.find( + (query) => + query.expected_doc !== "auth-memory.md" && + query.expected_doc !== "database-memory.md" && + existsSync(join(corpusPath, query.expected_doc)), +); +if (!deleteQuery) { + checks.push( + makeCheck( + "delete_suppresses_retrieval", + "incomplete", + "No non-update, non-recovery corpus file was available, so qmd delete could not be exercised.", + { available_docs: queries.map((query) => query.expected_doc) }, + ), + ); +} else { + unlinkSync(join(corpusPath, deleteQuery.expected_doc)); + syncCollection(); + const deleteResults = qmdQuery(deleteQuery.query); + const deletedStillMatched = resultMatches(deleteResults, deleteQuery); + checks.push( + makeCheck( + "delete_suppresses_retrieval", + deletedStillMatched ? "fail" : "pass", + deletedStillMatched + ? "qmd update marked the deleted file removed, but it was still searchable." + : "qmd update suppressed the deleted file from subsequent search.", + { + source: deleteQuery.expected_doc, + query: deleteQuery, + deleted_still_matched: deletedStillMatched, + results: deleteResults, + }, + ), + ); +} + +const recoveryQuery = { + id: "lifecycle-cold-start-recovery", + query: + "The invoice list N+1 query was fixed by eager loading invoice lines through `InvoiceLineBatcher`. Do not reintroduce per-row SQL calls in invoice rendering.", + expected_doc: "database-memory.md", + expected_terms: ["InvoiceLineBatcher", "N+1"], +}; +const recoveryResults = qmdQuery(recoveryQuery.query); +const recoveryMatched = resultMatches(recoveryResults, recoveryQuery); +checks.push( + makeCheck( + "cold_start_recovery_search", + recoveryMatched ? "pass" : "fail", + recoveryMatched + ? 
"A fresh qmd query process reopened the persisted index and retrieved expected evidence." + : "A fresh qmd query process did not retrieve expected persisted evidence.", + { + expected_doc: recoveryQuery.expected_doc, + matched: recoveryMatched, + results: recoveryResults, + }, + ), +); + +const checkSummary = summarizeChecks(checks); +writeFileSync( + outPath, + JSON.stringify( + { + schema: "elf.live_baseline.qmd_result/v1", + summary: { + total: queryResults.length, + pass, + fail: queryResults.length - pass, + }, + check_summary: checkSummary, + checks, + queries: queryResults, + }, + null, + 2, + ), +); +JS + + # The driver rewrites auth-memory.md and deletes one corpus file, so index a private copy and leave the shared CORPUS_DIR unchanged for other adapters and reruns. + if run_cmd "${project}: embedded retrieval" 900 "${log_path}" \ + "export HOME='${home}'; export XDG_CACHE_HOME='/root/.cache'; export QMD_FORCE_CPU=1; rm -rf '${home}/corpus' && cp -R '${CORPUS_DIR}' '${home}/corpus' && cd '${REPOS_DIR}/${project}' && npx tsx src/cli/qmd.ts collection add '${home}/corpus' --name elfbench && npx tsx src/cli/qmd.ts update && npx tsx src/cli/qmd.ts embed -f -c elfbench && npx tsx src/cli/qmd.ts status > '${status_path}' && node '${driver_path}' '${query_result_path}' '${REPORT_DIR}/queries.json' '${home}/corpus'"; then + if jq -e '.checks and .check_summary' "${query_result_path}" >/dev/null 2>&1; then + jq '{check_summary, checks}' "${query_result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + if jq -e --argjson query_count "${QUERY_COUNT}" ' + .schema == "elf.live_baseline.qmd_result/v1" and + .summary.total == $query_count and + .summary.fail == 0 and + .check_summary.fail == 0 and + .check_summary.incomplete == 0 + ' "${query_result_path}" >/dev/null; then + json_record "${project}" "${repo}" "${head}" "pass" "retrieval_pass" "qmd embedded structured hybrid query found expected evidence and lifecycle checks passed" "${project}.log" "collection add; update; 
embed -f; query --json" + elif jq -e --argjson query_count "${QUERY_COUNT}" ' + .schema == "elf.live_baseline.qmd_result/v1" and + .summary.total == $query_count and + .summary.fail == 0 + ' "${query_result_path}" >/dev/null; 
then + json_record "${project}" "${repo}" "${head}" "fail" "retrieval_pass" "qmd same-corpus retrieval passed, but one or more update/delete/recovery checks failed or were incomplete" "${project}.log" "collection add; update; embed -f; query --json" + elif ! rg -q "Embedded [1-9][0-9]* chunks" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "embedding_required" "qmd indexed the corpus, but no successful embedding completion was observed" "${project}.log" "collection add; update; embed -f; query --json" + elif ! jq -e '.schema == "elf.live_baseline.qmd_result/v1"' "${query_result_path}" >/dev/null 2>&1; then + json_record "${project}" "${repo}" "${head}" "fail" "invalid_json_result" "qmd query command completed, but did not produce parseable JSON results" "${project}.log" "collection add; update; embed -f; search/query --json" + else + json_record "${project}" "${repo}" "${head}" "fail" "retrieval_wrong_result" "qmd embedded retrieval ran but did not return expected evidence" "${project}.log" "collection add; update; embed -f; search/query --json" + fi + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "qmd install passed but embedded retrieval command failed" "${project}.log" "collection add; update; embed -f; search/query --json" +} + +project_memsearch() { + local project="memsearch" + local repo="https://github.com/zilliztech/memsearch.git" + local log_path="${REPORT_DIR}/${project}.log" + local home="${HOME_DIR}/${project}" + local result_path="${REPORT_DIR}/${project}-search.json" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-memsearch.py" + local head + mkdir -p "${home}" + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + json_record "${project}" "${repo}" "${head}" "fail" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if ! 
run_cmd "${project}: install" 420 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && python3 -m venv .venv && .venv/bin/pip install --upgrade pip && .venv/bin/pip install -e '.[local,onnx]'"; then + json_record "${project}" "${repo}" "${head}" "fail" "not_run" "pip install failed" "${project}.log" "pip install -e .[local,onnx]" + return + fi + + cat >"${driver_path}" <<'PY' +import json +import os +import subprocess +from pathlib import Path + +out_path = Path(os.environ["ELF_MEMSEARCH_RESULT_PATH"]) +queries_path = Path(os.environ["ELF_BASELINE_QUERIES_PATH"]) +corpus_path = Path(os.environ["ELF_BASELINE_CORPUS_PATH"]) +top_k = os.environ.get("ELF_BASELINE_TOP_K", "10") +queries = json.loads(queries_path.read_text())["queries"] + + +def run_memsearch(args): + return subprocess.run( + ["memsearch", *args], + check=True, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ).stdout + + +def index_corpus(): + return run_memsearch(["index", str(corpus_path)]) + + +def search_output(query_text): + return run_memsearch(["search", query_text, "--top-k", top_k]) + + +def output_matches(output, query): + lowered = output.lower() + matched = query["expected_doc"] in output and all( + term.lower() in lowered for term in query["expected_terms"] + ) + if not matched: + matched = all(term.lower() in lowered for term in query["expected_terms"]) + return matched + + +def make_check(name, status, reason, evidence): + return { + "name": name, + "status": status, + "reason": reason, + "evidence": evidence, + } + + +def summarize_checks(checks): + return { + "total": len(checks), + "pass": sum(1 for check in checks if check["status"] == "pass"), + "fail": sum(1 for check in checks if check["status"] == "fail"), + "incomplete": sum(1 for check in checks if check["status"] == "incomplete"), + } + + +query_results = [] +for query in queries: + output = search_output(query["query"]) + matched = output_matches(output, query) + query_results.append( + { + "id": query["id"], 
+ "query": query["query"], + "expected_doc": query["expected_doc"], + "expected_terms": query["expected_terms"], + "matched": matched, + "output": output, + } + ) + +pass_count = sum(1 for result in query_results if result["matched"]) +checks = [ + make_check( + "same_corpus_retrieval", + "pass" if pass_count == len(query_results) else "fail", + "memsearch search returned expected evidence for every query." + if pass_count == len(query_results) + else "memsearch search missed one or more expected results.", + { + "total": len(query_results), + "pass": pass_count, + "fail": len(query_results) - pass_count, + }, + ) +] + +auth_path = corpus_path / "auth-memory.md" +if not auth_path.exists(): + checks.append( + make_check( + "update_replaces_note_text", + "incomplete", + "The auth corpus file was missing, so memsearch update could not be exercised.", + {"source": "auth-memory.md"}, + ) + ) +else: + auth_path.write_text( + "# Auth Memory\n\nRotated auth middleware validates JWT tokens with key id `kid-v4` under `RotatedJwtKeyPlan`. It still requires tenant scope `project_shared` for deployment operations after the emergency key rotation.\n" + ) + update_index_output = index_corpus() + update_query = { + "id": "lifecycle-update-new-marker", + "query": "Which rotated JWT key id does the auth middleware require?", + "expected_doc": "auth-memory.md", + "expected_terms": ["kid-v4", "RotatedJwtKeyPlan"], + } + update_output = search_output(update_query["query"]) + update_matched = output_matches(update_output, update_query) + old_marker_absent = "kid-v3" not in update_output.lower() + checks.append( + make_check( + "update_replaces_note_text", + "pass" if update_matched and old_marker_absent else "fail", + "memsearch re-index returned the new marker and did not return the old marker for the updated file." 
+ if update_matched and old_marker_absent + else "memsearch re-index did not cleanly replace the searchable auth file text.", + { + "source": "auth-memory.md", + "matched_new_marker": update_matched, + "old_marker_absent": old_marker_absent, + "index_output": update_index_output, + "output": update_output, + }, + ) + ) + +delete_query = next( + ( + query + for query in queries + if query["expected_doc"] not in {"auth-memory.md", "database-memory.md"} + and (corpus_path / query["expected_doc"]).exists() + ), + None, +) +if delete_query is None: + checks.append( + make_check( + "delete_suppresses_retrieval", + "incomplete", + "No non-update, non-recovery corpus file was available, so memsearch delete could not be exercised.", + {"available_docs": [query["expected_doc"] for query in queries]}, + ) + ) +else: + (corpus_path / delete_query["expected_doc"]).unlink() + delete_index_output = index_corpus() + delete_output = search_output(delete_query["query"]) + deleted_still_matched = output_matches(delete_output, delete_query) + checks.append( + make_check( + "delete_suppresses_retrieval", + "fail" if deleted_still_matched else "pass", + "memsearch index removed the deleted file from subsequent search." + if not deleted_still_matched + else "memsearch index returned success but the deleted file was still searchable.", + { + "source": delete_query["expected_doc"], + "query": delete_query, + "deleted_still_matched": deleted_still_matched, + "index_output": delete_index_output, + "output": delete_output, + }, + ) + ) + +recovery_query = { + "id": "lifecycle-cold-start-recovery", + "query": "The invoice list N+1 query was fixed by eager loading invoice lines through `InvoiceLineBatcher`. 
Do not reintroduce per-row SQL calls in invoice rendering.", + "expected_doc": "database-memory.md", + "expected_terms": ["InvoiceLineBatcher", "N+1"], +} +recovery_output = search_output(recovery_query["query"]) +recovery_matched = output_matches(recovery_output, recovery_query) +checks.append( + make_check( + "cold_start_recovery_search", + "pass" if recovery_matched else "fail", + "A fresh memsearch CLI process reopened the local Milvus index and retrieved persisted evidence." + if recovery_matched + else "A fresh memsearch CLI process did not retrieve expected persisted evidence.", + { + "expected_doc": recovery_query["expected_doc"], + "matched": recovery_matched, + "output": recovery_output, + }, + ) +) + +check_summary = summarize_checks(checks) +out_path.write_text( + json.dumps( + { + "schema": "elf.live_baseline.memsearch_result/v1", + "summary": { + "total": len(query_results), + "pass": pass_count, + "fail": len(query_results) - pass_count, + }, + "check_summary": check_summary, + "checks": checks, + "queries": query_results, + }, + indent=2, + ) +) +PY + + # The driver rewrites auth-memory.md and deletes one corpus file, so index a private copy and leave the shared CORPUS_DIR unchanged for other adapters and reruns. + if run_cmd "${project}: cli retrieval attempt" 240 "${log_path}" \ + "export HOME='${home}'; export ELF_MEMSEARCH_RESULT_PATH='${result_path}'; export ELF_BASELINE_QUERIES_PATH='${REPORT_DIR}/queries.json'; export ELF_BASELINE_CORPUS_PATH='${home}/corpus'; rm -rf '${home}/corpus' && cp -R '${CORPUS_DIR}' '${home}/corpus' && cd '${REPOS_DIR}/${project}' && source .venv/bin/activate && memsearch --help && memsearch config set embedding.provider onnx && memsearch index '${home}/corpus' && python '${driver_path}'"; then + if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then + jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + if jq -e --argjson query_count "${QUERY_COUNT}" ' + 
.schema == "elf.live_baseline.memsearch_result/v1" and + .summary.total == $query_count and + .summary.fail == 0 and + .check_summary.fail == 0 and + .check_summary.incomplete == 0 + ' "${result_path}" >/dev/null; then + json_record 
"${project}" "${repo}" "${head}" "pass" "retrieval_pass" "memsearch indexed the corpus and returned expected evidence and lifecycle checks passed" "${project}.log" "config; index; search" + elif jq -e --argjson query_count "${QUERY_COUNT}" ' + .schema == "elf.live_baseline.memsearch_result/v1" and + .summary.total == $query_count and + .summary.fail == 0 + ' "${result_path}" >/dev/null; then + json_record "${project}" "${repo}" "${head}" "fail" "retrieval_pass" "memsearch same-corpus retrieval passed, but one or more update/delete/recovery checks failed or were incomplete" "${project}.log" "config; index; search" + else + json_record "${project}" "${repo}" "${head}" "fail" "retrieval_wrong_result" "memsearch search ran but did not return expected evidence" "${project}.log" "config; index; search" + fi + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "memsearch installed, but the current CLI retrieval command failed" "${project}.log" "memsearch --help; config; index; search" +} + +project_mem0() { + local project="mem0" + local repo="https://github.com/mem0ai/mem0.git" + local log_path="${REPORT_DIR}/${project}.log" + local result_path="${REPORT_DIR}/${project}-search.json" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-mem0.py" + local home="${HOME_DIR}/${project}" + local head + mkdir -p "${home}" + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + json_record "${project}" "${repo}" "${head}" "fail" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if ! run_cmd "${project}: install/import" 420 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && python3 -m venv .venv && .venv/bin/pip install --upgrade pip && .venv/bin/pip install -e . 
fastembed ollama && .venv/bin/python - <<'PY' +from mem0 import Memory +print('mem0 Memory import ok:', Memory) +PY"; then + json_record "${project}" "${repo}" "${head}" "fail" "not_run" "pip install or import failed" "${project}.log" "pip install -e . fastembed ollama; import Memory" + return + fi + + cat >"${driver_path}" <<'PY' +import gc +import json +import os +from pathlib import Path + +os.environ.setdefault("MEM0_TELEMETRY", "false") + +from mem0 import Memory + +out_path = Path(os.environ["ELF_MEM0_RESULT_PATH"]) +base = Path(os.environ["ELF_MEM0_HOME"]) +corpus_path = Path(os.environ["ELF_BASELINE_CORPUS_PATH"]) +queries_path = Path(os.environ["ELF_BASELINE_QUERIES_PATH"]) +top_k = int(os.environ.get("ELF_BASELINE_TOP_K", "10")) + +config = { + "vector_store": { + "provider": "qdrant", + "config": { + "collection_name": "elfbench", + "path": str(base / "qdrant"), + "embedding_model_dims": 384, + }, + }, + "embedder": { + "provider": "fastembed", + "config": { + "model": "BAAI/bge-small-en-v1.5", + "embedding_dims": 384, + }, + }, + "llm": { + "provider": "ollama", + "config": { + "model": "llama3.1:8b", + "ollama_base_url": "http://127.0.0.1:11434", + }, + }, + "history_db_path": str(base / "history.db"), + "version": "v1.1", +} + +memory = Memory.from_config(config) + +def plain_text(markdown: str) -> str: + return " ".join( + line.strip() + for line in markdown.splitlines() + if not line.lstrip().startswith("#") + ).strip() + + +docs = [ + (plain_text(path.read_text()), path.name) + for path in sorted(corpus_path.glob("*.md")) +] +queries = json.loads(queries_path.read_text())["queries"] + +adds = [] +memory_ids_by_source = {} +for text, source in docs: + added = memory.add( + text, + user_id="elf-bench", + metadata={"source": source}, + infer=False, + ) + adds.append({"source": source, "result": added}) + results = added.get("results", []) if isinstance(added, dict) else [] + if results and isinstance(results[0], dict) and results[0].get("id"): + 
memory_ids_by_source[source] = results[0]["id"] + + +def result_entries(search): + return search.get("results", []) if isinstance(search, dict) else [] + + +def search_memory(memory_instance, query_text): + return memory_instance.search( + query_text, + filters={"user_id": "elf-bench"}, + top_k=top_k, + threshold=0.0, + ) + + +def matches_expected(search, expected_doc, expected_terms): + for entry in result_entries(search): + entry_text = json.dumps(entry, default=str).lower() + source = ((entry.get("metadata") or {}).get("source") or "") + if source == expected_doc and all( + term.lower() in entry_text for term in expected_terms + ): + return True + return False + + +def query_result(query, search): + return { + "id": query["id"], + "query": query["query"], + "expected_doc": query["expected_doc"], + "expected_terms": query["expected_terms"], + "matched": matches_expected( + search, + query["expected_doc"], + query["expected_terms"], + ), + "search": search, + } + + +def make_check(name, status, reason, evidence): + return { + "name": name, + "status": status, + "reason": reason, + "evidence": evidence, + } + + +def summarize_checks(checks): + return { + "total": len(checks), + "pass": sum(1 for check in checks if check["status"] == "pass"), + "fail": sum(1 for check in checks if check["status"] == "fail"), + "incomplete": sum(1 for check in checks if check["status"] == "incomplete"), + } + +query_results = [] +for query in queries: + query_results.append(query_result(query, search_memory(memory, query["query"]))) + +pass_count = sum(1 for result in query_results if result["matched"]) +checks = [ + make_check( + "same_corpus_retrieval", + "pass" if pass_count == len(query_results) else "fail", + "mem0 local FastEmbed/Qdrant search returned expected evidence for every query." 
+ if pass_count == len(query_results) + else "mem0 local FastEmbed/Qdrant search missed one or more expected results.", + { + "total": len(query_results), + "pass": pass_count, + "fail": len(query_results) - pass_count, + }, + ) +] + +auth_id = memory_ids_by_source.get("auth-memory.md") +if not auth_id: + checks.append( + make_check( + "update_replaces_note_text", + "incomplete", + "The auth memory id was not returned by mem0 add(), so update could not be exercised.", + {"source": "auth-memory.md"}, + ) + ) +else: + update_text = ( + "Rotated auth middleware validates JWT tokens with key id `kid-v4` " + "under `RotatedJwtKeyPlan`. It still requires tenant scope " + "`project_shared` for deployment operations after the emergency key rotation." + ) + update_result = memory.update( + auth_id, + update_text, + metadata={"source": "auth-memory.md", "lifecycle": "updated"}, + ) + update_search = search_memory( + memory, + "Which rotated JWT key id does the auth middleware require?", + ) + update_matched = matches_expected( + update_search, + "auth-memory.md", + ["kid-v4", "RotatedJwtKeyPlan"], + ) + old_marker_absent = all( + "kid-v3" not in json.dumps(entry, default=str).lower() + for entry in result_entries(update_search) + if entry.get("id") == auth_id + or ((entry.get("metadata") or {}).get("source") == "auth-memory.md") + ) + checks.append( + make_check( + "update_replaces_note_text", + "pass" if update_matched and old_marker_absent else "fail", + "mem0 update() returned the new marker and did not return the old marker for the updated memory." 
+ if update_matched and old_marker_absent + else "mem0 update() did not cleanly replace the searchable auth memory text.", + { + "memory_id": auth_id, + "update_result": update_result, + "matched_new_marker": update_matched, + "old_marker_absent": old_marker_absent, + "search": update_search, + }, + ) + ) + +delete_query = next( + ( + query + for query in queries + if query["expected_doc"] in memory_ids_by_source + and query["expected_doc"] not in {"auth-memory.md", "database-memory.md"} + ), + None, +) +if delete_query is None: + checks.append( + make_check( + "delete_suppresses_retrieval", + "incomplete", + "No non-update, non-recovery memory id was available, so delete could not be exercised.", + {"available_sources": sorted(memory_ids_by_source)}, + ) + ) +else: + delete_source = delete_query["expected_doc"] + delete_id = memory_ids_by_source[delete_source] + delete_result = memory.delete(delete_id) + delete_search = search_memory( + memory, + delete_query["query"], + ) + deleted_still_matched = matches_expected( + delete_search, + delete_source, + delete_query["expected_terms"], + ) + checks.append( + make_check( + "delete_suppresses_retrieval", + "pass" if not deleted_still_matched else "fail", + "mem0 delete() suppressed the deleted memory from subsequent search." + if not deleted_still_matched + else "mem0 delete() returned success but the deleted memory was still searchable.", + { + "memory_id": delete_id, + "source": delete_source, + "query": delete_query, + "delete_result": delete_result, + "deleted_still_matched": deleted_still_matched, + "search": delete_search, + }, + ) + ) + +del memory +gc.collect() +reopened_memory = Memory.from_config(config) +recovery_search = search_memory( + reopened_memory, + "The invoice list N+1 query was fixed by eager loading invoice lines through `InvoiceLineBatcher`. 
Do not reintroduce per-row SQL calls in invoice rendering.", +) +recovery_matched = matches_expected( + recovery_search, + "database-memory.md", + ["InvoiceLineBatcher", "N+1"], +) +checks.append( + make_check( + "cold_start_recovery_search", + "pass" if recovery_matched else "fail", + "A newly constructed mem0 Memory over the same local Qdrant/history paths retrieved persisted evidence." + if recovery_matched + else "A newly constructed mem0 Memory over the same local Qdrant/history paths did not retrieve persisted evidence.", + { + "expected_doc": "database-memory.md", + "matched": recovery_matched, + "search": recovery_search, + }, + ) +) + +check_summary = summarize_checks(checks) + +out_path.write_text( + json.dumps( + { + "schema": "elf.live_baseline.mem0_result/v1", + "config": { + "embedder": "fastembed:BAAI/bge-small-en-v1.5", + "vector_store": "qdrant:path", + "infer": False, + }, + "corpus": { + "document_count": len(docs), + "query_count": len(queries), + }, + "adds": adds, + "summary": { + "total": len(query_results), + "pass": pass_count, + "fail": len(query_results) - pass_count, + }, + "check_summary": check_summary, + "checks": checks, + "queries": query_results, + }, + indent=2, + default=str, + ) +) +PY + + if run_cmd "${project}: local fastembed add/search" 900 "${log_path}" \ + "export HOME='${home}'; export ELF_MEM0_HOME='${home}'; export ELF_MEM0_RESULT_PATH='${result_path}'; export ELF_BASELINE_CORPUS_PATH='${CORPUS_DIR}'; export ELF_BASELINE_QUERIES_PATH='${REPORT_DIR}/queries.json'; export MEM0_TELEMETRY=false; cd '${REPOS_DIR}/${project}' && source .venv/bin/activate && python '${driver_path}'"; then + if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then + jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + if jq -e --argjson query_count "${QUERY_COUNT}" --argjson document_count "${DOCUMENT_COUNT}" ' + .schema == "elf.live_baseline.mem0_result/v1" and + 
.corpus.document_count == $document_count and + .summary.total == $query_count and + .summary.fail == 0 and + .check_summary.fail == 0 and + .check_summary.incomplete == 0 + ' "${result_path}" >/dev/null; then + json_record "${project}" "${repo}" "${head}" "pass" "retrieval_pass" "mem0 infer=false local fastembed/Qdrant search found expected evidence and lifecycle checks passed" "${project}.log" "pip install -e . fastembed ollama; Memory.from_config; add/update/delete/search" + return + fi + if jq -e --argjson query_count "${QUERY_COUNT}" --argjson document_count "${DOCUMENT_COUNT}" ' + .schema == "elf.live_baseline.mem0_result/v1" and + .corpus.document_count == $document_count and + .summary.total == $query_count and + .summary.fail == 0 + ' "${result_path}" >/dev/null; then + json_record "${project}" "${repo}" "${head}" "fail" "retrieval_pass" "mem0 same-corpus retrieval passed, but one or more update/delete/recovery checks failed or were incomplete" "${project}.log" "pip install -e . fastembed ollama; Memory.from_config; add/update/delete/search" + return + fi + json_record "${project}" "${repo}" "${head}" "fail" "retrieval_wrong_result" "mem0 local add/search ran but did not return expected evidence" "${project}.log" "pip install -e . fastembed ollama; Memory.from_config; add infer=false; search" + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "mem0 installed and imported, but local fastembed/Qdrant add/search failed" "${project}.log" "pip install -e . 
fastembed ollama; Memory.from_config; add infer=false; search" +} + +project_openviking() { + local project="OpenViking" + local repo="https://github.com/volcengine/OpenViking.git" + local log_path="${REPORT_DIR}/${project}.log" + local home="${HOME_DIR}/${project}" + local config_path="${REPORT_DIR}/${project}-ov.conf" + local result_path="${REPORT_DIR}/${project}-search.json" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-openviking.py" + local local_embed_failure_pattern="llama-cpp-python|target specific option mismatch|failed-wheel-build-for-install|Failed building wheel|Failed to build llama-cpp-python|No module named 'llama_cpp'|Local embedding is enabled but 'llama-cpp-python' is not installed" + local head + mkdir -p "${home}" + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + json_record "${project}" "${repo}" "${head}" "fail" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if ! run_cmd "${project}: install/help" 600 "${log_path}" \ + "export HOME='${home}'; cd '${REPOS_DIR}/${project}' && python3 -m venv .venv && .venv/bin/pip install --upgrade pip && .venv/bin/pip install maturin && .venv/bin/pip install -e . 
&& (.venv/bin/openviking language en || .venv/bin/ov language en) && (.venv/bin/openviking --help || .venv/bin/ov --help)"; then + json_record "${project}" "${repo}" "${head}" "fail" "not_run" "pip install or CLI help failed" "${project}.log" "pip install -e .; openviking/ov --help" + return + fi + + if rg -q "ERROR: Failed building editable|Failed to build openviking|error: failed-wheel-build-for-install|CMake Error" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "fail" "partial_install" "OpenViking install/help returned success but the build log contains native build errors" "${project}.log" "pip install -e .; openviking/ov --help" + return + fi + + cat >"${config_path}" <"${driver_path}" <<'PY' +import json +import os +from pathlib import Path + +from openviking import OpenViking + + +def to_jsonable(value): + if hasattr(value, "to_dict"): + return value.to_dict() + if hasattr(value, "model_dump"): + return value.model_dump() + if isinstance(value, list): + return [to_jsonable(item) for item in value] + if isinstance(value, dict): + return {key: to_jsonable(item) for key, item in value.items()} + return value + + +out_path = Path(os.environ["ELF_OPENVIKING_RESULT_PATH"]) +data_path = os.environ["ELF_OPENVIKING_DATA_PATH"] +corpus_path = os.environ["ELF_OPENVIKING_CORPUS_PATH"] +queries_path = Path(os.environ["ELF_BASELINE_QUERIES_PATH"]) +top_k = int(os.environ.get("ELF_BASELINE_TOP_K", "10")) + + +def result_matches(found, query): + raw = json.dumps(to_jsonable(found), ensure_ascii=False, default=str).lower() + return query["expected_doc"].lower() in raw and all( + term.lower() in raw for term in query["expected_terms"] + ) + + +client = OpenViking(path=data_path) +client.initialize() +try: + queries = json.loads(queries_path.read_text())["queries"] + added = client.add_resource( + corpus_path, + to="viking://resources/elfbench", + wait=True, + timeout=240, + build_index=True, + summarize=False, + ) + query_results = [] + for query in 
queries: + found = client.find( + query["query"], + target_uri="viking://resources/elfbench", + limit=top_k, + score_threshold=0.0, + level=[2], + ) + query_results.append( + { + "id": query["id"], + "query": query["query"], + "expected_doc": query["expected_doc"], + "expected_terms": query["expected_terms"], + "matched": result_matches(found, query), + "find": to_jsonable(found), + } + ) + pass_count = sum(1 for result in query_results if result["matched"]) + out_path.write_text( + json.dumps( + { + "schema": "elf.live_baseline.openviking_result/v1", + "config": { + "embedder": "local:bge-small-zh-v1.5-f16", + "vector_store": "local", + "mode": "OpenViking.add_resource/find", + }, + "add": to_jsonable(added), + "summary": { + "total": len(query_results), + "pass": pass_count, + "fail": len(query_results) - pass_count, + }, + "queries": query_results, + }, + ensure_ascii=False, + indent=2, + default=str, + ) + ) +finally: + client.close() +PY + + if ! run_cmd "${project}: install local embedding extras" 900 "${log_path}" \ + "export HOME='${home}'; cd '${REPOS_DIR}/${project}' && .venv/bin/pip install -e '.[local-embed]'"; then + if rg -q "${local_embed_failure_pattern}" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local-embed install failed in Docker while building llama-cpp-python for aarch64, so same-corpus local retrieval could not be run" "${project}.log" "pip install -e .; openviking/ov --help; pip install -e .[local-embed]" + return + fi + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local-embed install failed in Docker, so same-corpus local retrieval could not be run" "${project}.log" "pip install -e .; openviking/ov --help; pip install -e .[local-embed]" + return + fi + + if rg -q "${local_embed_failure_pattern}" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" 
"OpenViking local-embed install returned success but the log contains llama-cpp-python build/import failure, so same-corpus local retrieval could not be run" "${project}.log" "pip install -e .; openviking/ov --help; pip install -e .[local-embed]" + return + fi + + if run_cmd "${project}: local add/find" 900 "${log_path}" \ + "export HOME='${home}'; export OPENVIKING_CONFIG_FILE='${config_path}'; export ELF_OPENVIKING_DATA_PATH='${home}/data'; export ELF_OPENVIKING_CORPUS_PATH='${CORPUS_DIR}'; export ELF_OPENVIKING_RESULT_PATH='${result_path}'; export ELF_BASELINE_QUERIES_PATH='${REPORT_DIR}/queries.json'; cd '${REPOS_DIR}/${project}' && source .venv/bin/activate && python '${driver_path}'"; then + if rg -q "${local_embed_failure_pattern}" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local add_resource/find hit llama-cpp-python build/import failure, so same-corpus local retrieval could not be run" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" + return + fi + if [[ ! -s "${result_path}" ]] || ! jq -e . 
"${result_path}" >/dev/null 2>&1; then + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "OpenViking local add_resource/find returned success but did not write a valid result JSON" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" + return + fi + if jq -e --argjson query_count "${QUERY_COUNT}" ' + .schema == "elf.live_baseline.openviking_result/v1" and + .summary.total == $query_count and + .summary.fail == 0 + ' "${result_path}" >/dev/null; then + json_record "${project}" "${repo}" "${head}" "pass" "retrieval_pass" "OpenViking local add_resource/find found expected evidence for every query" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" + return + fi + json_record "${project}" "${repo}" "${head}" "fail" "retrieval_wrong_result" "OpenViking local add_resource/find ran but did not return expected evidence" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" + return + fi + + if rg -q "${local_embed_failure_pattern}" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local add_resource/find failed because llama-cpp-python was unavailable in Docker" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "OpenViking local-embed installed, but same-corpus add_resource/find failed in Docker" "${project}.log" "pip install -e .[local-embed]; OpenViking.add_resource/find" +} + +project_claude_mem() { + local project="claude-mem" + local repo="https://github.com/thedotmack/claude-mem.git" + local log_path="${REPORT_DIR}/${project}.log" + local result_path="${REPORT_DIR}/${project}-search.json" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-claude-mem.ts" + local head + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + 
json_record "${project}" "${repo}" "${head}" "fail" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if ! run_cmd "${project}: install/build" 420 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && (npm ci || npm install --no-audit --no-fund) && npm run build --if-present"; then + json_record "${project}" "${repo}" "${head}" "fail" "not_run" "npm install/build failed" "${project}.log" "npm install/build" + return + fi + + cat >"${driver_path}" <<'TS' +import { readFileSync, readdirSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; +import { Database } from "bun:sqlite"; +import { MemoryItemsRepository } from "./src/storage/sqlite/memory-items.ts"; +import { ProjectsRepository } from "./src/storage/sqlite/projects.ts"; + +const outPath = Bun.argv[2]; +const corpusPath = Bun.argv[3]; +const queriesPath = Bun.argv[4]; +if (!outPath || !corpusPath || !queriesPath) { + throw new Error("output path, corpus path, and query path are required"); +} + +type QueryCase = { + id: string; + query: string; + expected_doc: string; + expected_terms: string[]; +}; + +function plainText(markdown: string): string { + return markdown + .split(/\r?\n/) + .filter((line) => !line.trimStart().startsWith("#")) + .join(" ") + .replace(/\s+/g, " ") + .trim(); +} + +function titleFrom(markdown: string, file: string): string { + const heading = markdown + .split(/\r?\n/) + .find((line) => line.trimStart().startsWith("# ")); + return heading ? heading.replace(/^#\s+/, "").trim() : file; +} + +function conceptsFor(file: string): string[] { + return file + .replace(/\.md$/i, "") + .split(/[^A-Za-z0-9]+/) + .map((part) => part.toLowerCase()) + .filter(Boolean); +} + +function resultMatches(results: unknown[], query: QueryCase): boolean { + return results.some((entry) => { + const files = (entry as { filesRead?: string[] }).filesRead ?? 
[]; + const entryText = JSON.stringify(entry).toLowerCase(); + return ( + files.includes(query.expected_doc) && + query.expected_terms.every((term) => + entryText.includes(term.toLowerCase()), + ) + ); + }); +} + +const db = new Database(":memory:"); +db.run("PRAGMA foreign_keys = ON"); + +try { + const projects = new ProjectsRepository(db); + const memories = new MemoryItemsRepository(db); + const project = projects.create({ + name: "elfbench", + slug: "elfbench", + rootPath: "/bench/corpus", + metadata: { source: "elf-live-baseline" }, + }); + + const docs = readdirSync(corpusPath) + .filter((file) => file.endsWith(".md")) + .sort() + .map((file) => { + const raw = readFileSync(join(corpusPath, file), "utf8"); + return { + title: titleFrom(raw, file), + text: plainText(raw), + concepts: conceptsFor(file), + file, + }; + }); + const queries = JSON.parse(readFileSync(queriesPath, "utf8")).queries as QueryCase[]; + const topK = Number(process.env.ELF_BASELINE_TOP_K ?? "10"); + + const created = docs.map((doc) => + memories.create({ + projectId: project.id, + kind: "manual", + type: "fact", + title: doc.title, + text: doc.text, + narrative: doc.text, + facts: [doc.text], + concepts: doc.concepts, + filesRead: [doc.file], + metadata: { source: doc.file }, + }), + ); + + const queryResults = queries.map((query) => { + const results = memories.search(project.id, query.query, topK); + return { + id: query.id, + query: query.query, + expected_doc: query.expected_doc, + expected_terms: query.expected_terms, + matched: resultMatches(results, query), + results, + }; + }); + const pass = queryResults.filter((result) => result.matched).length; + + writeFileSync( + outPath, + JSON.stringify( + { + schema: "elf.live_baseline.claude_mem_result/v1", + corpus: { + document_count: docs.length, + query_count: queries.length, + }, + created, + summary: { + total: queryResults.length, + pass, + fail: queryResults.length - pass, + }, + queries: queryResults, + }, + null, + 2, + ), + ); 
+} finally { + db.close(); +} +TS + + if run_cmd "${project}: same-corpus sqlite search" 300 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && bun '${driver_path}' '${result_path}' '${CORPUS_DIR}' '${REPORT_DIR}/queries.json'"; then + if jq -e --argjson query_count "${QUERY_COUNT}" --argjson document_count "${DOCUMENT_COUNT}" ' + .schema == "elf.live_baseline.claude_mem_result/v1" and + .corpus.document_count == $document_count and + .summary.total == $query_count and + .summary.fail == 0 + ' "${result_path}" >/dev/null; then + json_record "${project}" "${repo}" "${head}" "pass" "retrieval_pass" "claude-mem SQLite memory repository search found expected evidence for every query" "${project}.log" "npm install/build; MemoryItemsRepository.create/search" + return + fi + json_record "${project}" "${repo}" "${head}" "fail" "retrieval_wrong_result" "claude-mem same-corpus search ran but did not return expected evidence" "${project}.log" "npm install/build; MemoryItemsRepository.create/search" + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "claude-mem built, but same-corpus SQLite search did not pass in Docker" "${project}.log" "npm install/build; MemoryItemsRepository.create/search" +} + +run_project "ELF" project_elf +run_project "agentmemory" project_agentmemory +run_project "qmd" project_qmd +run_project "memsearch" project_memsearch +run_project "mem0" project_mem0 +run_project "OpenViking" project_openviking +run_project "claude-mem" project_claude_mem +finish_report + +jq . 
"${REPORT}" +echo "Live baseline report: ${REPORT}" + +if [[ "${ELF_BASELINE_STRICT:-0}" == "1" ]]; then + jq -e '.verdict == "pass"' "${REPORT}" >/dev/null +fi diff --git a/scripts/live-baseline-report-to-md.sh b/scripts/live-baseline-report-to-md.sh new file mode 100755 index 00000000..651f29b4 --- /dev/null +++ b/scripts/live-baseline-report-to-md.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT="${1:-${ELF_BASELINE_REPORT:-${ROOT_DIR}/tmp/live-baseline/live-baseline-report.json}}" +OUT="${2:-${ELF_BASELINE_MARKDOWN_REPORT:-}}" + +if ! command -v jq >/dev/null 2>&1; then + echo "Missing jq; cannot render live baseline Markdown report." >&2 + exit 1 +fi + +if [[ ! -f "${REPORT}" ]]; then + echo "Missing report: ${REPORT}" >&2 + exit 1 +fi + +render_report() { + jq -r --arg report_path "${REPORT}" ' + def dash: + if . == null then "-" else tostring end; + def md: + dash | gsub("\\|"; "\\|") | gsub("\n"; " "); + def checks: + ((.check_summary.pass // 0 | tostring) + "/" + (.check_summary.total // 0 | tostring)); + + "# Live Baseline Benchmark Report", + "", + "Goal: Publish a Markdown summary for one generated live baseline aggregate report.", + "Read this when: You need a durable, reviewable summary of a live baseline JSON report.", + ("Inputs: `" + $report_path + "`."), + "Depends on: `scripts/live-baseline-benchmark.sh` and `docs/guide/benchmarking/live_baseline_benchmark.md`.", + "Verification: Compare this Markdown summary with the source JSON before committing.", + "", + "## Summary", + "", + ("- Run ID: `" + (.run_id | md) + "`"), + ("- Generated at: `" + (.generated_at | md) + "`"), + ("- Verdict: `" + (.verdict | md) + "`"), + ("- Project filter: `" + (.project_filter | md) + "`"), + ("- Corpus profile: `" + (.corpus.profile | md) + "`"), + ("- Documents: `" + (.corpus.document_count | tostring) + "`"), + ("- Queries: `" + (.corpus.query_count | tostring) + "`"), + ("- 
Project summary: `" + (.summary.pass | tostring) + " pass`, `" + (.summary.fail | tostring) + " fail`, `" + (.summary.incomplete | tostring) + " incomplete`"), + ("- Same-corpus summary: `" + (.same_corpus_summary.pass | tostring) + " pass`, `" + (.same_corpus_summary.fail | tostring) + " fail`, `" + (.same_corpus_summary.incomplete | tostring) + " incomplete`"), + ("- Full check summary: `" + (.full_check_summary.pass | tostring) + "/" + (.full_check_summary.total | tostring) + " pass`"), + "", + "## Projects", + "", + "| Project | Status | Retrieval | Checks | Elapsed | Reason |", + "| --- | --- | --- | --- | --- | --- |", + ( + .projects[] + | "| " + (.project | md) + + " | `" + (.status | md) + "`" + + " | `" + (.retrieval_status | md) + "`" + + " | `" + checks + "`" + + " | `" + (.elapsed_seconds | tostring) + "s`" + + " | " + (.reason | md) + " |" + ), + "", + ( + [.projects[] | select(.embedding != null)] as $embedded + | if ($embedded | length) > 0 then + "## Embedding", + "", + "| Project | Mode | Provider | Model | Dimensions | Timeout | API Base | Path |", + "| --- | --- | --- | --- | --- | --- | --- | --- |", + ( + $embedded[] + | "| " + (.project | md) + + " | `" + (.embedding.mode | md) + "`" + + " | `" + (.embedding.provider_id | md) + "`" + + " | `" + (.embedding.model | md) + "`" + + " | `" + (.embedding.dimensions | tostring) + "`" + + " | `" + (.embedding.timeout_ms | tostring) + "ms`" + + " | `" + (.embedding.api_base | md) + "`" + + " | `" + (.embedding.path | md) + "` |" + ), + "" + else empty end + ), + "## Result Semantics", + "", + "- `pass`: every encoded check for the selected project and profile passed.", + "- `fail`: clone, install, import, build, retrieval, lifecycle, recovery, concurrency, soak, resource-envelope, or another declared check failed.", + "- `incomplete`: the encoded check could not complete without extra provider keys, host integration, native dependency support, durable runtime wiring, or more adapter work.", + "", + 
"`incomplete` is not a pass; treat it as benchmark wiring debt." + ' "${REPORT}" +} + +if [[ -n "${OUT}" ]]; then + mkdir -p "$(dirname "${OUT}")" + render_report >"${OUT}" + echo "Wrote ${OUT}" +else + render_report +fi