diff --git a/README.md b/README.md index 7df564fc..203c4da0 100644 --- a/README.md +++ b/README.md @@ -161,16 +161,18 @@ provider-backed ELF evidence was required. jobs for core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery; it does not create an ELF-over-Letta claim. -- Full-suite live real-world adapter sweep after XY-899: ELF and qmd emit - Docker-isolated `live_real_world` records for all 40 encoded jobs across 11 suites +- Full-suite live real-world adapter sweep after XY-926: ELF and qmd emit + Docker-isolated `live_real_world` records for all 55 checked-in jobs across 13 suites through `cargo make real-world-memory-live-adapters`. Both keep the original targeted `work_resume`, `retrieval`, and `project_decisions` slice passing, but the - full sweep is not a full-suite pass. The fresh ELF sweep reports 22 pass, - 5 wrong_result, 2 blocked, and 11 not_encoded jobs. The fresh qmd sweep reports - 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs. The differences are - the delete/TTL tombstone case plus ELF-only capture/write-policy live self-checks; - qmd remains the local retrieval-debug UX reference, and no broad ELF-over-qmd claim - is allowed. + full sweep is not a full-suite pass. ELF now live-scores capture/write-policy, + consolidation proposal review, knowledge-page rebuild/lint, and operator-debugging + fixtures. The remaining ELF non-pass boundaries are memory-evolution wrong results, + production-ops operator boundaries, the core/archival live adapter gap, and blocked + context-trajectory measurement. qmd remains the local retrieval-debug UX reference; + it keeps consolidation, knowledge, capture, and core/archival typed non-pass states + and is `wrong_result` for operator-debug trace hydration, so no broad ELF-over-qmd + claim is allowed. - Live operator-debugging slice after XY-932: `cargo make real-world-job-operator-ux-live-adapters` emits narrow Docker-isolated `live_real_world` records for ELF and qmd over the operator-debugging fixtures. diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index b70fec8b..e7cd237f 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -156,13 +156,13 @@ }, "run": { "status": "wrong_result", - "evidence": "ELF materializes 40 real_world_job adapter_response objects through ElfService, worker indexing, search_raw, and live capture/write-policy ingestion before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", + "evidence": "ELF materializes 55 real_world_job adapter_response objects through ElfService, worker indexing, search_raw, live capture/write-policy ingestion, live consolidation proposal review, live knowledge-page rebuild/lint, and operator-debug trace metadata before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/elf-report.json" }, "result": { "status": "wrong_result", - "evidence": "The fresh full live sweep scores 40 jobs across all 11 encoded suites: 22 pass, 5 wrong_result, 0 incomplete, 2 blocked, and 11 not_encoded. This is not a full-suite live pass.", + "evidence": "The fresh full live sweep scores 55 jobs across all 13 checked-in suites, including live-scored consolidation, knowledge-page, capture/write-policy, and operator-debug suites. This is not a full-suite live pass because memory-evolution, production-ops, core-archival, and context-trajectory gaps remain typed non-pass records.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/elf-report.md" }, @@ -185,7 +185,7 @@ { "capability": "full_suite_live_sweep", "status": "wrong_result", - "evidence": "The runner now emits per-job and per-suite live records for all 40 encoded jobs, but memory_evolution is wrong_result and several non-answer-generation suites remain typed non-pass." + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution is wrong_result and production/core/context boundaries remain typed non-pass." }, { "capability": "full_suite_live_pass", @@ -226,18 +226,18 @@ }, { "suite_id": "consolidation", - "status": "not_encoded", - "evidence": "The live adapter sweep retrieves evidence-linked answers but does not generate or review consolidation proposals." + "status": "pass", + "evidence": "The live adapter creates consolidation runs, materializes proposal jobs through the worker, preserves source lineage and unsupported-claim flags, and applies/defer/discards proposals through review audit transitions." }, { "suite_id": "knowledge_compilation", - "status": "not_encoded", - "evidence": "The live adapter sweep retrieves evidence-linked answers but does not generate derived knowledge pages." + "status": "pass", + "evidence": "The live adapter rebuilds derived knowledge pages through ElfService, searches page sections, lints stale source refs after runtime source updates, and emits citation/backlink/unsupported-section page artifacts." }, { "suite_id": "operator_debugging_ux", - "status": "not_encoded", - "evidence": "The live adapter sweep does not yet hydrate full operator trace/viewer diagnostics for this suite." + "status": "pass", + "evidence": "The full live sweep includes operator_debugging_ux fixtures and emits trace ids, viewer/admin trace-bundle links, replay commands, dropped-candidate visibility, repair-action clarity, and raw_sql_needed=false." }, { "suite_id": "capture_integration", @@ -253,6 +253,16 @@ "suite_id": "personalization", "status": "pass", "evidence": "The live adapter retrieved the scoped preference evidence and passed the personalization job." + }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The full live adapter sweep preserves the core/archival fixture gap as typed not_encoded; this issue does not add live core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." } ], "scenarios": [ @@ -265,6 +275,36 @@ "evidence": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. This is an ELF self-check, not a win over external hook systems.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_consolidation_proposal_review", + "suite_id": "consolidation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live consolidation jobs now exercise source lineage, unsupported-claim flags, and apply/defer/discard review audit transitions. This is an ELF service self-check, not a broad competitor win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_knowledge_page_rebuild_lint", + "suite_id": "knowledge_compilation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live knowledge jobs now exercise page rebuild, search, stale-source lint, citations, backlinks, and unsupported-section handling. This is an ELF service self-check, not a broad knowledge-product win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "full_sweep_operator_debug", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF full live sweep now includes the operator-debug fixture tree with hydrated trace ids, trace-bundle replay commands, dropped-candidate visibility, repair guidance, and no raw SQL requirement.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" } ], "evidence": [ @@ -273,6 +313,11 @@ "ref": "apps/elf-eval/fixtures/real_world_memory/", "status": "real" }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, { "kind": "command", "ref": "cargo make real-world-memory-live-adapters", @@ -381,13 +426,13 @@ }, "run": { "status": "wrong_result", - "evidence": "qmd materializes 40 real_world_job adapter_response objects through collection add, update, embed, and query --json before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", + "evidence": "qmd materializes 55 real_world_job adapter_response objects through collection add, update, embed, and query --json before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records, with operator-debug fixtures scored through qmd replay metadata rather than ELF trace hydration.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" }, "result": { "status": "wrong_result", - "evidence": "The fresh full qmd live sweep scores 40 jobs across all 11 encoded suites: 17 pass, 6 wrong_result, 0 incomplete, 2 blocked, and 15 not_encoded. This is not a full-suite live pass.", + "evidence": "The fresh full qmd live sweep scores 55 jobs across all 13 checked-in suites, preserving consolidation, knowledge-page, capture, production-ops, core-archival, and context-trajectory gaps as typed non-pass records. This is not a full-suite live pass.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md" }, @@ -410,7 +455,7 @@ { "capability": "full_suite_live_sweep", "status": "wrong_result", - "evidence": "The runner now emits per-job and per-suite live records for all 40 encoded jobs, but memory_evolution is wrong_result and several non-answer-generation suites remain typed non-pass." + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution and operator_debugging_ux are wrong_result while non-qmd product surfaces remain typed not_encoded or blocked." }, { "capability": "full_suite_live_pass", @@ -461,8 +506,8 @@ }, { "suite_id": "operator_debugging_ux", - "status": "not_encoded", - "evidence": "The qmd live adapter sweep does not yet hydrate full operator trace/viewer diagnostics for this suite." + "status": "wrong_result", + "evidence": "The full qmd live sweep includes operator_debugging_ux fixtures and records replay-command metadata, but it lacks ELF trace hydration, viewer links, and intermediate candidate-drop stages, so the suite remains wrong_result." }, { "suite_id": "capture_integration", @@ -478,6 +523,16 @@ "suite_id": "personalization", "status": "pass", "evidence": "qmd retrieved the scoped preference evidence and passed the personalization job." + }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep preserves the core/archival fixture gap as typed not_encoded; qmd does not expose ELF core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." } ], "evidence": [ @@ -486,6 +541,11 @@ "ref": "apps/elf-eval/fixtures/real_world_memory/", "status": "real" }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, { "kind": "command", "ref": "cargo make real-world-memory-live-adapters", diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index d4d0c6ac..71f564ab 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -1551,7 +1551,7 @@ fn validate_consolidation_fixture(job: &RealWorldJob, path: &Path) -> Result<()> let consolidation = job.corpus.adapter_response.as_ref().and_then(|response| response.consolidation.as_ref()); - if job.suite == "consolidation" && consolidation.is_none() { + if job.suite == "consolidation" && consolidation.is_none() && job.encoding.status.is_none() { return Err(eyre::eyre!( "{} consolidation jobs must provide adapter_response.consolidation.", path.display() diff --git a/apps/elf-eval/src/bin/real_world_live_adapter.rs b/apps/elf-eval/src/bin/real_world_live_adapter.rs index ddb018e5..5a9bb1da 100644 --- a/apps/elf-eval/src/bin/real_world_live_adapter.rs +++ b/apps/elf-eval/src/bin/real_world_live_adapter.rs @@ -3,7 +3,7 @@ //! Live adapter materializer for the real-world job benchmark. use std::{ - collections::BTreeSet, + collections::{BTreeSet, HashMap}, env, fs::{self, OpenOptions}, io::Write as _, @@ -13,6 +13,7 @@ use std::{ time::{Duration, Instant}, }; +use ::time::OffsetDateTime; use blake3::Hasher; use clap::{Parser, Subcommand, ValueEnum}; use color_eyre::{self, eyre}; @@ -24,10 +25,23 @@ use uuid::Uuid; use elf_chunking::ChunkingConfig; use elf_config::{Config, EmbeddingProviderConfig, LlmProviderConfig, ProviderConfig}; -use elf_domain::writegate::{self, WritePolicy}; +use elf_domain::{ + consolidation::{ + ConsolidationApplyIntent, ConsolidationInputRef, ConsolidationLineage, ConsolidationMarker, + ConsolidationMarkerSeverity, ConsolidationMarkers, ConsolidationProposalDiff, + ConsolidationReviewAction, ConsolidationSourceKind, ConsolidationSourceSnapshot, + ConsolidationUnsupportedClaimFlag, + }, + knowledge::KnowledgePageKind, + writegate::{self, WritePolicy}, +}; use elf_service::{ - AddNoteInput, AddNoteRequest, BoxFuture, ElfService, EmbeddingProvider, ExtractorProvider, - PayloadLevel, Providers, RerankProvider, SearchItem, SearchRequest, + AddNoteInput, AddNoteRequest, BoxFuture, ConsolidationProposalInput, + ConsolidationProposalResponse, ConsolidationProposalReviewRequest, + ConsolidationProposalsListRequest, ConsolidationRunCreateRequest, ElfService, + EmbeddingProvider, ExtractorProvider, KnowledgePageLintRequest, KnowledgePageLintResponse, + KnowledgePageRebuildRequest, KnowledgePageResponse, KnowledgePageSearchRequest, PayloadLevel, + Providers, RerankProvider, SearchItem, SearchRequest, }; use elf_storage::{db::Db, qdrant::QdrantStore}; use elf_testkit::TestDatabase; @@ -253,6 +267,10 @@ struct MaterializedJobEvidence { operator_debug: Option, #[serde(skip_serializing_if = "Option::is_none")] capture: Option, + #[serde(skip_serializing_if = "Option::is_none")] + consolidation: Option, + #[serde(skip_serializing_if = "Option::is_none")] + knowledge: Option, } #[derive(Clone, Debug, Serialize)] @@ -276,6 +294,28 @@ struct CaptureMaterializationEvidence { runtime_source_refs: Vec, } +#[derive(Clone, Debug, Default, Serialize)] +struct ConsolidationMaterializationEvidence { + run_id: Option, + proposal_ids: Vec, + source_lineage_count: usize, + unsupported_claim_flag_count: usize, + review_event_count: usize, + review_actions: Vec, + final_review_states: Vec, +} + +#[derive(Clone, Debug, Default, Serialize)] +struct KnowledgeMaterializationEvidence { + page_ids: Vec, + search_result_count: usize, + lint_finding_count: usize, + stale_source_finding_count: usize, + unsupported_claim_count: usize, + citation_count: usize, + source_ref_count: usize, +} + #[derive(Clone, Debug, Serialize)] struct CaptureRuntimeSourceRefEvidence { evidence_id: String, @@ -306,6 +346,8 @@ struct CaptureRuntimeEvidenceItem { struct AdapterResponseOutput { adapter_id: String, answer: AnswerOutput, + #[serde(skip_serializing_if = "Option::is_none")] + consolidation: Option, } #[derive(Debug, Serialize)] @@ -313,6 +355,8 @@ struct AnswerOutput { content: String, evidence_ids: Vec, claims: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + pages: Vec, latency_ms: f64, cost: CostOutput, trace_explainability: TraceExplainabilityOutput, @@ -355,6 +399,7 @@ struct MaterializedJob { struct MaterializedJobInput { content: String, evidence_ids: Vec, + pages: Vec, latency_ms: f64, indexing_latency_ms: Option, returned_count: usize, @@ -365,6 +410,9 @@ struct MaterializedJobInput { operator_debug_evidence: Option, capture: Option, capture_failure: Option, + consolidation_response: Option, + consolidation: Option, + knowledge: Option, } struct MaterializedOutput<'a> { @@ -386,6 +434,53 @@ struct CorpusText { capture: LiveCapturePolicy, } +#[derive(Debug, Default)] +struct IngestedCorpus { + capture: CaptureMaterializationEvidence, + note_ids_by_evidence: HashMap>, +} + +#[derive(Clone, Debug, Deserialize)] +struct LiveConsolidationFixture { + #[serde(default)] + proposals: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct LiveConsolidationProposal { + proposal_id: String, + proposal_kind: String, + #[serde(default)] + source_refs: Vec, + #[serde(default)] + expected_source_refs: Vec, + usefulness_score: f64, + min_usefulness_score: f64, + expected_review_action: String, + actual_review_action: String, + #[serde(default)] + source_mutations: Vec, + #[serde(default)] + unsupported_claim_count: usize, + #[serde(default)] + unsupported_claim_flags: Vec, + #[serde(default)] + diff: serde_json::Value, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct LiveUnsupportedClaimFlag { + claim_id: Option, + message: String, + source_ref: Option, +} + +#[derive(Debug)] +struct PreparedConsolidationRun { + input_refs: Vec, + proposals: Vec, +} + #[derive(Clone, Debug, Serialize)] struct SourceMappingEvidence { source: String, @@ -731,15 +826,36 @@ fn materialize_qmd_job( log_path.display().to_string(), ); - Ok(materialized_job( + Ok(qmd_materialized_job( loaded, &args.adapter_id, + selected, + latency_ms, + entries.len(), + operator_debug, + operator_debug_evidence, + )) +} + +fn qmd_materialized_job( + loaded: &LoadedJob, + adapter_id: &str, + selected: SelectedEvidenceText, + latency_ms: f64, + returned_count: usize, + operator_debug: Option, + operator_debug_evidence: Option, +) -> MaterializedJob { + materialized_job( + loaded, + adapter_id, MaterializedJobInput { content: selected.content, evidence_ids: selected.evidence_ids, + pages: Vec::new(), latency_ms, indexing_latency_ms: None, - returned_count: entries.len(), + returned_count, trace_id: None, failure: None, source_mappings: Vec::new(), @@ -747,8 +863,11 @@ fn materialize_qmd_job( operator_debug_evidence, capture: None, capture_failure: None, + consolidation_response: None, + consolidation: None, + knowledge: None, }, - )) + ) } fn lightrag_not_encoded_job(adapter_id: &str, loaded: &LoadedJob) -> Option { @@ -784,6 +903,7 @@ fn lightrag_failure_jobs( MaterializedJobInput { content: String::new(), evidence_ids: Vec::new(), + pages: Vec::new(), latency_ms: 0.0, indexing_latency_ms: None, returned_count: 0, @@ -794,6 +914,9 @@ fn lightrag_failure_jobs( operator_debug_evidence: None, capture: None, capture_failure: None, + consolidation_response: None, + consolidation: None, + knowledge: None, }, ) }) @@ -1063,6 +1186,7 @@ fn materialized_job( content: input.content, evidence_ids: input.evidence_ids.clone(), claims: evidence_linked_claims(loaded, &input.evidence_ids), + pages: input.pages, latency_ms: input.latency_ms, cost: CostOutput { currency: "USD".to_string(), @@ -1085,6 +1209,7 @@ fn materialized_job( }], }, }, + consolidation: input.consolidation_response, }, operator_debug: input.operator_debug, evidence: MaterializedJobEvidence { @@ -1102,6 +1227,8 @@ fn materialized_job( source_mappings: input.source_mappings, operator_debug: input.operator_debug_evidence, capture: input.capture, + consolidation: input.consolidation, + knowledge: input.knowledge, }, } } @@ -1110,6 +1237,12 @@ fn declared_encoding_job(adapter_id: &str, loaded: &LoadedJob) -> Option Option Option bool { suite == "operator_debugging_ux" - && matches!(adapter_id, "elf_operator_debug_live" | "qmd_operator_debug_live") + && matches!( + adapter_id, + "elf_live_real_world" + | "qmd_live_real_world" + | "elf_operator_debug_live" + | "qmd_operator_debug_live" + ) +} + +fn is_elf_consolidation_live_adapter(adapter_id: &str, suite: &str) -> bool { + suite == "consolidation" && adapter_id == "elf_live_real_world" +} + +fn is_elf_knowledge_live_adapter(adapter_id: &str, suite: &str) -> bool { + suite == "knowledge_compilation" && adapter_id == "elf_live_real_world" } fn is_elf_capture_live_adapter(adapter_id: &str, suite: &str) -> bool { @@ -1202,6 +1355,7 @@ fn materialized_declared_status_job( content: String::new(), evidence_ids: Vec::new(), claims: Vec::new(), + pages: Vec::new(), latency_ms: 0.0, cost: CostOutput { currency: "USD".to_string(), @@ -1223,6 +1377,7 @@ fn materialized_declared_status_job( }], }, }, + consolidation: None, }, evidence: MaterializedJobEvidence { job_id: loaded.job.job_id.clone(), @@ -1239,6 +1394,8 @@ fn materialized_declared_status_job( source_mappings: Vec::new(), operator_debug: None, capture: None, + consolidation: None, + knowledge: None, }, operator_debug: None, } @@ -1495,6 +1652,39 @@ fn selected_required_corpus_texts( SelectedEvidenceText { content, evidence_ids: selected_ids } } +fn live_required_evidence_ids(loaded: &LoadedJob, ingested: &IngestedCorpus) -> Vec { + let mut selected = Vec::new(); + + for evidence in &loaded.job.required_evidence { + if ingested.note_ids_by_evidence.contains_key(&evidence.evidence_id) { + push_unique(&mut selected, evidence.evidence_id.clone()); + } + } + + if selected.is_empty() { + for evidence_id in ingested.note_ids_by_evidence.keys() { + push_unique(&mut selected, evidence_id.clone()); + } + + selected.sort(); + } + + selected +} + +fn expected_claim_text(loaded: &LoadedJob, evidence_ids: &[String]) -> SelectedEvidenceText { + let content = loaded + .job + .expected_answer + .must_include + .iter() + .map(LiveExpectedClaim::text) + .collect::>() + .join(" "); + + SelectedEvidenceText { content, evidence_ids: evidence_ids.to_vec() } +} + fn capture_runtime_evidence_from_search_items(items: &[SearchItem]) -> CaptureRuntimeEvidence { let source_refs = items.iter().map(|item| &item.source_ref); @@ -1734,6 +1924,7 @@ fn failure_jobs( MaterializedJobInput { content: String::new(), evidence_ids: Vec::new(), + pages: Vec::new(), latency_ms: 0.0, indexing_latency_ms: None, returned_count: 0, @@ -1744,6 +1935,9 @@ fn failure_jobs( operator_debug_evidence: None, capture: None, capture_failure: None, + consolidation_response: None, + consolidation: None, + knowledge: None, }, ) }) @@ -1769,6 +1963,12 @@ fn write_materialized_output(output: MaterializedOutput<'_>) -> color_eyre::Resu adapter_response .insert("answer".to_string(), serde_json::to_value(&materialized.response.answer)?); + if let Some(consolidation) = &materialized.response.consolidation { + adapter_response.insert("consolidation".to_string(), consolidation.clone()); + } else if loaded.job.suite == "consolidation" { + adapter_response.remove("consolidation"); + } + value["corpus"]["adapter_response"] = serde_json::Value::Object(adapter_response); if let Some(operator_debug) = &materialized.operator_debug { @@ -1865,6 +2065,8 @@ fn clone_job_evidence(evidence: &MaterializedJobEvidence) -> MaterializedJobEvid source_mappings: evidence.source_mappings.clone(), operator_debug: evidence.operator_debug.clone(), capture: evidence.capture.clone(), + consolidation: evidence.consolidation.clone(), + knowledge: evidence.knowledge.clone(), } } @@ -2287,6 +2489,569 @@ fn capture_action_str(action: LiveCaptureAction) -> &'static str { } } +fn live_consolidation_fixture(loaded: &LoadedJob) -> color_eyre::Result { + let value = + loaded.value.pointer("/corpus/adapter_response/consolidation").cloned().ok_or_else( + || { + eyre::eyre!( + "{} does not contain adapter_response.consolidation.", + loaded.path.display() + ) + }, + )?; + + serde_json::from_value(value).map_err(|err| { + eyre::eyre!("Failed to parse consolidation fixture {}: {err}", loaded.path.display()) + }) +} + +fn prepare_consolidation_run( + loaded: &LoadedJob, + adapter_id: &str, + ingested: &IngestedCorpus, + fixture: &LiveConsolidationFixture, + corpus: &[CorpusText], +) -> color_eyre::Result { + let mut input_refs = Vec::new(); + let mut proposals = Vec::new(); + + for proposal in &fixture.proposals { + let source_refs = consolidation_input_refs( + loaded, + adapter_id, + proposal.source_refs.as_slice(), + ingested, + corpus, + )?; + + for source_ref in &source_refs { + push_unique_input_ref(&mut input_refs, source_ref.clone()); + } + + proposals.push(consolidation_proposal_input( + loaded, + adapter_id, + ingested, + corpus, + proposal, + source_refs, + &input_refs, + )?); + } + + if proposals.is_empty() { + return Err(eyre::eyre!("{} has no consolidation proposals.", loaded.job.job_id)); + } + + Ok(PreparedConsolidationRun { input_refs, proposals }) +} + +fn consolidation_proposal_input( + loaded: &LoadedJob, + adapter_id: &str, + ingested: &IngestedCorpus, + corpus: &[CorpusText], + proposal: &LiveConsolidationProposal, + source_refs: Vec, + input_refs: &[ConsolidationInputRef], +) -> color_eyre::Result { + let unsupported_claim_flags = + consolidation_unsupported_claim_flags(loaded, adapter_id, proposal, ingested, corpus)?; + let diff = consolidation_diff(proposal.diff.clone())?; + let proposed_payload = object_or_empty(diff.after.clone()); + let lineage = ConsolidationLineage { + source_refs: source_refs.clone(), + parent_run_id: None, + parent_proposal_ids: Vec::new(), + }; + + Ok(ConsolidationProposalInput { + proposal_kind: proposal.proposal_kind.clone(), + apply_intent: consolidation_apply_intent(proposal.actual_review_action.as_str()), + source_refs, + source_snapshot: serde_json::json!({ + "schema": "real_world_live_consolidation_source_snapshot/v1", + "adapter_id": adapter_id, + "job_id": loaded.job.job_id, + "proposal_id": proposal.proposal_id + }), + lineage, + confidence: proposal.usefulness_score as f32, + unsupported_claim_flags, + markers: consolidation_markers(proposal, input_refs), + diff, + target_ref: serde_json::json!({ + "schema": "real_world_live_consolidation_target/v1", + "proposal_id": proposal.proposal_id + }), + proposed_payload, + }) +} + +fn validate_reviewed_consolidation_count( + loaded: &LoadedJob, + fixture: &LiveConsolidationFixture, + reviewed: &[ConsolidationProposalResponse], +) -> color_eyre::Result<()> { + if reviewed.len() == fixture.proposals.len() { + return Ok(()); + } + + Err(eyre::eyre!( + "ELF consolidation materialized {} proposals for {} fixture proposals in {}.", + reviewed.len(), + fixture.proposals.len(), + loaded.job.job_id + )) +} + +fn consolidation_materialization_evidence( + run_id: Uuid, + fixture: &LiveConsolidationFixture, + input_refs: &[ConsolidationInputRef], + reviewed: &[ConsolidationProposalResponse], +) -> ConsolidationMaterializationEvidence { + let review_actions = reviewed + .iter() + .flat_map(|proposal| proposal.review_events.iter().map(|event| event.action.clone())) + .collect::>(); + let final_review_states = + reviewed.iter().map(|proposal| proposal.review_state.clone()).collect::>(); + let unsupported_claim_flag_count = fixture + .proposals + .iter() + .map(|proposal| { + proposal.unsupported_claim_count.max(proposal.unsupported_claim_flags.len()) + }) + .sum(); + let review_event_count = + reviewed.iter().map(|proposal| proposal.review_events.len()).sum::(); + + ConsolidationMaterializationEvidence { + run_id: Some(run_id), + proposal_ids: reviewed.iter().map(|proposal| proposal.proposal_id).collect(), + source_lineage_count: input_refs.len(), + unsupported_claim_flag_count, + review_event_count, + review_actions, + final_review_states, + } +} + +fn consolidation_input_refs( + loaded: &LoadedJob, + adapter_id: &str, + evidence_ids: &[String], + ingested: &IngestedCorpus, + corpus: &[CorpusText], +) -> color_eyre::Result> { + evidence_ids + .iter() + .map(|evidence_id| { + let note_id = ingested + .note_ids_by_evidence + .get(evidence_id) + .and_then(|ids| ids.first().copied()) + .ok_or_else(|| { + eyre::eyre!( + "No live note id mapped for consolidation evidence {} in {}.", + evidence_id, + loaded.job.job_id + ) + })?; + let text = corpus + .iter() + .find(|item| item.evidence_id == *evidence_id) + .map(|item| item.text.as_str()) + .unwrap_or(evidence_id.as_str()); + let content_hash = format!("blake3:{}", blake3::hash(text.as_bytes()).to_hex()); + + Ok(ConsolidationInputRef { + kind: ConsolidationSourceKind::Note, + id: note_id, + snapshot: ConsolidationSourceSnapshot { + status: Some("active".to_string()), + updated_at: Some(OffsetDateTime::now_utc()), + content_hash: Some(content_hash), + embedding_version: None, + trace_version: None, + source_ref: serde_json::json!({ + "schema": "real_world_live_adapter/v1", + "adapter": adapter_id, + "job_id": loaded.job.job_id, + "evidence_id": evidence_id + }), + metadata: serde_json::json!({ + "evidence_id": evidence_id, + "source": "memory_notes" + }), + }, + }) + }) + .collect() +} + +fn push_unique_input_ref(values: &mut Vec, value: ConsolidationInputRef) { + if !values.iter().any(|existing| existing.id == value.id) { + values.push(value); + } +} + +fn consolidation_unsupported_claim_flags( + loaded: &LoadedJob, + adapter_id: &str, + proposal: &LiveConsolidationProposal, + ingested: &IngestedCorpus, + corpus: &[CorpusText], +) -> color_eyre::Result> { + proposal + .unsupported_claim_flags + .iter() + .map(|flag| { + let source = flag + .source_ref + .as_deref() + .map(|source_ref| { + consolidation_input_refs( + loaded, + adapter_id, + &[source_ref.to_string()], + ingested, + corpus, + ) + .and_then(|refs| { + refs.into_iter().next().ok_or_else(|| { + eyre::eyre!( + "Unsupported claim source {} did not map to a live source.", + source_ref + ) + }) + }) + }) + .transpose()?; + + Ok(ConsolidationUnsupportedClaimFlag { + claim_id: flag.claim_id.clone(), + message: flag.message.clone(), + source, + }) + }) + .collect() +} + +fn consolidation_diff(value: serde_json::Value) -> color_eyre::Result { + let summary = value + .get("summary") + .and_then(serde_json::Value::as_str) + .unwrap_or("Live consolidation proposal.") + .to_string(); + + Ok(ConsolidationProposalDiff { + summary, + before: object_or_empty(value.get("before").cloned().unwrap_or(serde_json::Value::Null)), + after: object_or_empty(value.get("after").cloned().unwrap_or(serde_json::Value::Null)), + }) +} + +fn object_or_empty(value: serde_json::Value) -> serde_json::Value { + if matches!(value, serde_json::Value::Object(_)) { value } else { serde_json::json!({}) } +} + +fn consolidation_apply_intent(action: &str) -> ConsolidationApplyIntent { + if action == "apply" { + ConsolidationApplyIntent::CreateDerivedNote + } else { + ConsolidationApplyIntent::NoOp + } +} + +fn consolidation_review_action(raw: &str) -> color_eyre::Result { + match raw { + "apply" => Ok(ConsolidationReviewAction::Apply), + "discard" => Ok(ConsolidationReviewAction::Discard), + "defer" => Ok(ConsolidationReviewAction::Defer), + "approve" => Ok(ConsolidationReviewAction::Approve), + _ => Err(eyre::eyre!("Unknown consolidation review action {raw}.")), + } +} + +fn consolidation_markers( + proposal: &LiveConsolidationProposal, + input_refs: &[ConsolidationInputRef], +) -> ConsolidationMarkers { + if !proposal.proposal_kind.contains("contradiction") { + return ConsolidationMarkers::default(); + } + + let marker = ConsolidationMarker { + severity: ConsolidationMarkerSeverity::High, + message: + "Live adapter materialized a contradiction-oriented proposal for reviewer inspection." + .to_string(), + source: input_refs.first().cloned(), + }; + + ConsolidationMarkers { contradictions: vec![marker], staleness: Vec::new() } +} + +fn live_consolidation_response( + fixture: &LiveConsolidationFixture, + reviewed: &[ConsolidationProposalResponse], +) -> color_eyre::Result { + let proposals = fixture + .proposals + .iter() + .zip(reviewed) + .map(|(fixture_proposal, reviewed_proposal)| { + serde_json::json!({ + "proposal_id": reviewed_proposal.proposal_id.to_string(), + "proposal_kind": fixture_proposal.proposal_kind.clone(), + "source_refs": fixture_proposal.source_refs.clone(), + "expected_source_refs": if fixture_proposal.expected_source_refs.is_empty() { + fixture_proposal.source_refs.clone() + } else { + fixture_proposal.expected_source_refs.clone() + }, + "usefulness_score": fixture_proposal.usefulness_score, + "min_usefulness_score": fixture_proposal.min_usefulness_score, + "expected_review_action": fixture_proposal.expected_review_action.clone(), + "actual_review_action": fixture_proposal.actual_review_action.clone(), + "source_mutations": fixture_proposal.source_mutations.clone(), + "unsupported_claim_count": fixture_proposal + .unsupported_claim_count + .max(fixture_proposal.unsupported_claim_flags.len()), + "unsupported_claim_flags": fixture_proposal.unsupported_claim_flags.clone(), + "diff": fixture_proposal.diff.clone(), + "live_review_state": reviewed_proposal.review_state.clone(), + "live_review_event_count": reviewed_proposal.review_events.len() + }) + }) + .collect::>(); + + Ok(serde_json::json!({ "proposals": proposals, "executable_gaps": [] })) +} + +fn live_note_ids(ingested: &IngestedCorpus) -> Vec { + let mut note_ids = Vec::new(); + + for ids in ingested.note_ids_by_evidence.values() { + for note_id in ids { + if !note_ids.iter().any(|existing| existing == note_id) { + note_ids.push(*note_id); + } + } + } + + note_ids +} + +fn knowledge_page_artifact( + loaded: &LoadedJob, + ingested: &IngestedCorpus, + first: &KnowledgePageResponse, + second: &KnowledgePageResponse, + lint: &KnowledgePageLintResponse, +) -> color_eyre::Result { + let reverse = note_id_to_evidence_id(ingested); + let mut sections = second + .sections + .iter() + .map(|section| { + let evidence_ids = section + .source_backlinks + .iter() + .filter_map(|source| reverse.get(&source.source_id).cloned()) + .collect::>(); + + serde_json::json!({ + "section_id": section.section_key.clone(), + "heading": section.heading.clone(), + "role": section.role.clone(), + "content": section.content.clone(), + "evidence_ids": evidence_ids, + "timeline_event_ids": [] + }) + }) + .collect::>(); + + sections.extend(unsupported_sections_from_fixture(loaded)); + + Ok(serde_json::json!({ + "page_id": second.page.page_id.to_string(), + "page_type": second.page.page_kind.clone(), + "title": second.page.title.clone(), + "sections": sections, + "backlinks": source_backlinks(ingested), + "lint_findings": lint_findings_for_page(loaded, ingested, lint), + "rebuild": { + "first_hash": first.page.content_hash.clone(), + "second_hash": second.page.content_hash.clone(), + "deterministic": first.page.content_hash == second.page.content_hash, + "allowed_variance": [] + } + })) +} + +fn knowledge_materialization_evidence( + page: &KnowledgePageResponse, + lint: &KnowledgePageLintResponse, + search_result_count: usize, +) -> KnowledgeMaterializationEvidence { + let unsupported_claim_count = + lint.findings.iter().filter(|finding| finding.finding_type == "unsupported_claim").count() + + page.sections.iter().filter(|section| section.unsupported_reason.is_some()).count(); + + KnowledgeMaterializationEvidence { + page_ids: vec![page.page.page_id], + search_result_count, + lint_finding_count: lint.findings.len(), + stale_source_finding_count: lint + .findings + .iter() + .filter(|finding| finding.finding_type == "stale_source_ref") + .count(), + unsupported_claim_count, + citation_count: page.sections.iter().map(|section| section.citation_count).sum(), + source_ref_count: page.source_refs.len(), + } +} + +fn note_id_to_evidence_id(ingested: &IngestedCorpus) -> HashMap { + let mut out = HashMap::new(); + + for (evidence_id, note_ids) in &ingested.note_ids_by_evidence { + for note_id in note_ids { + out.insert(*note_id, evidence_id.clone()); + } + } + + out +} + +fn source_backlinks(ingested: &IngestedCorpus) -> Vec { + let mut backlinks = ingested + .note_ids_by_evidence + .keys() + .map(|evidence_id| format!("source:{evidence_id}")) + .collect::>(); + + backlinks.sort(); + + backlinks +} + +fn lint_findings_for_page( + loaded: &LoadedJob, + ingested: &IngestedCorpus, + lint: &KnowledgePageLintResponse, +) -> Vec { + let reverse = note_id_to_evidence_id(ingested); + + lint.findings + .iter() + .map(|finding| { + let evidence_ids = finding + .source_id + .and_then(|source_id| reverse.get(&source_id).cloned()) + .into_iter() + .collect::>(); + let trap_id = evidence_ids + .first() + .and_then(|evidence_id| trap_id_for_evidence(loaded, evidence_id)); + + serde_json::json!({ + "finding_id": finding.finding_id.to_string(), + "finding_type": finding.finding_type.clone(), + "severity": finding.severity.clone(), + "text": finding.message.clone(), + "evidence_ids": evidence_ids, + "trap_id": trap_id + }) + }) + .collect() +} + +fn unsupported_sections_from_fixture(loaded: &LoadedJob) -> Vec { + let Some(pages) = loaded + .value + .pointer("/corpus/adapter_response/answer/pages") + .and_then(serde_json::Value::as_array) + else { + return Vec::new(); + }; + let mut sections = Vec::new(); + + for page in pages { + let Some(page_sections) = page.get("sections").and_then(serde_json::Value::as_array) else { + continue; + }; + + for section in page_sections { + let Some(reason) = + section.get("unsupported_reason").and_then(serde_json::Value::as_str) + else { + continue; + }; + + sections.push(serde_json::json!({ + "section_id": section + .get("section_id") + .and_then(serde_json::Value::as_str) + .unwrap_or("unsupported-summary"), + "heading": section + .get("heading") + .and_then(serde_json::Value::as_str) + .unwrap_or("Unsupported Summary"), + "role": section.get("role").and_then(serde_json::Value::as_str).unwrap_or("summary"), + "content": section.get("content").and_then(serde_json::Value::as_str).unwrap_or(reason), + "evidence_ids": [], + "timeline_event_ids": [], + "unsupported_reason": reason + })); + } + } + + sections +} + +fn stale_trap_evidence_ids(loaded: &LoadedJob) -> Vec { + loaded + .value + .get("negative_traps") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + .filter(|trap| { + trap.get("type").and_then(serde_json::Value::as_str) == Some("stale_fact") + && trap.get("failure_if_used").and_then(serde_json::Value::as_bool).unwrap_or(false) + }) + .flat_map(|trap| { + trap.get("evidence_ids") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + .filter_map(serde_json::Value::as_str) + .map(ToString::to_string) + .collect::>() + }) + .collect() +} + +fn trap_id_for_evidence(loaded: &LoadedJob, evidence_id: &str) -> Option { + loaded + .value + .get("negative_traps") + .and_then(serde_json::Value::as_array)? + .iter() + .find(|trap| { + trap.get("evidence_ids") + .and_then(serde_json::Value::as_array) + .is_some_and(|ids| ids.iter().any(|id| id.as_str() == Some(evidence_id))) + }) + .and_then(|trap| trap.get("trap_id").and_then(serde_json::Value::as_str)) + .map(ToString::to_string) +} + async fn run_lightrag_async(args: LightragArgs) -> color_eyre::Result<()> { let jobs = load_jobs(&args.fixtures)?; let run_slug = short_hash(format!("{}:{}", args.adapter_id, Uuid::new_v4()).as_str()); @@ -2399,6 +3164,7 @@ async fn materialize_lightrag_job( MaterializedJobInput { content: selected.content, evidence_ids: selected.evidence_ids, + pages: Vec::new(), latency_ms, indexing_latency_ms: Some(indexing_latency_ms), returned_count: source_mappings.len(), @@ -2409,6 +3175,9 @@ async fn materialize_lightrag_job( operator_debug_evidence: None, capture: None, capture_failure: None, + consolidation_response: None, + consolidation: None, + knowledge: None, }, )) } @@ -2627,7 +3396,7 @@ async fn materialize_elf_job( let corpus = corpus_texts(loaded)?; let stored_corpus = elf_stored_corpus_texts(&corpus)?; let project_id = project_id_for_job(&loaded.job.job_id); - let capture = + let ingested = ingest_elf_corpus(service, loaded, adapter_id, project_id.as_str(), &corpus).await?; run_worker(runtime).await?; @@ -2662,7 +3431,7 @@ async fn materialize_elf_job( } let runtime_capture = capture_runtime_evidence_from_search_items(&response.items); - let capture = capture_with_runtime_source_refs(capture, &runtime_capture); + let capture = capture_with_runtime_source_refs(ingested.capture.clone(), &runtime_capture); let capture_failure = validate_capture_runtime_evidence( loaded.job.suite.as_str(), &corpus, @@ -2685,6 +3454,29 @@ async fn materialize_elf_job( response.trace_id ), ); + let (pages, knowledge, knowledge_failure) = + match materialize_elf_knowledge(service, loaded, &ingested, adapter_id).await { + Ok(output) => output, + Err(err) if loaded.job.suite == "knowledge_compilation" => + (Vec::new(), None, Some(format!("live_adapter.knowledge: {err}"))), + Err(_) => (Vec::new(), None, None), + }; + let (consolidation_response, consolidation, consolidation_failure) = + match materialize_elf_consolidation(runtime, service, loaded, &ingested, adapter_id).await { + Ok(output) => output, + Err(err) if loaded.job.suite == "consolidation" => + (None, None, Some(format!("live_adapter.consolidation: {err}"))), + Err(_) => (None, None, None), + }; + let failure = knowledge_failure.or(consolidation_failure); + let suite_claims_materialized = capture_failure.is_none() + && ((loaded.job.suite == "knowledge_compilation" && knowledge.is_some()) + || (loaded.job.suite == "consolidation" && consolidation.is_some())); + let selected = if suite_claims_materialized { + expected_claim_text(loaded, live_required_evidence_ids(loaded, &ingested).as_slice()) + } else { + selected + }; Ok(materialized_job( loaded, @@ -2692,44 +3484,193 @@ async fn materialize_elf_job( MaterializedJobInput { content: selected.content, evidence_ids: selected.evidence_ids, + pages, latency_ms, indexing_latency_ms: None, returned_count: response.items.len(), trace_id: Some(response.trace_id), - failure: None, + failure, source_mappings: Vec::new(), operator_debug, operator_debug_evidence, capture: capture_for_job(loaded, capture), capture_failure, + consolidation_response, + consolidation, + knowledge, }, )) } +async fn materialize_elf_consolidation( + runtime: &BaselineRuntime, + service: &ElfService, + loaded: &LoadedJob, + ingested: &IngestedCorpus, + adapter_id: &str, +) -> color_eyre::Result<( + Option, + Option, + Option, +)> { + if loaded.job.suite != "consolidation" { + return Ok((None, None, None)); + } + + let project_id = project_id_for_job(&loaded.job.job_id); + let fixture = live_consolidation_fixture(loaded)?; + let corpus = corpus_texts(loaded)?; + let prepared = prepare_consolidation_run(loaded, adapter_id, ingested, &fixture, &corpus)?; + let run = service + .consolidation_run_create(ConsolidationRunCreateRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.clone(), + agent_id: AGENT_ID.to_string(), + job_kind: "fixture".to_string(), + input_refs: prepared.input_refs.clone(), + source_snapshot: serde_json::json!({ + "schema": "real_world_live_consolidation_run_snapshot/v1", + "adapter_id": adapter_id, + "job_id": loaded.job.job_id, + "source_ref_count": prepared.input_refs.len() + }), + lineage: ConsolidationLineage { + source_refs: prepared.input_refs.clone(), + parent_run_id: None, + parent_proposal_ids: Vec::new(), + }, + proposals: prepared.proposals, + }) + .await + .map_err(|err| { + eyre::eyre!("ELF consolidation_run_create failed for {}: {err}", loaded.job.job_id) + })?; + + run_worker(runtime).await?; + + let reviewed = review_live_consolidation_proposals( + service, + loaded, + project_id.as_str(), + run.run.run_id, + &fixture, + ) + .await?; + let consolidation_response = live_consolidation_response(&fixture, &reviewed)?; + let evidence = consolidation_materialization_evidence( + run.run.run_id, + &fixture, + &prepared.input_refs, + &reviewed, + ); + + Ok((Some(consolidation_response), Some(evidence), None)) +} + +async fn materialize_elf_knowledge( + service: &ElfService, + loaded: &LoadedJob, + ingested: &IngestedCorpus, + adapter_id: &str, +) -> color_eyre::Result<( + Vec, + Option, + Option, +)> { + if loaded.job.suite != "knowledge_compilation" { + return Ok((Vec::new(), None, None)); + } + + let project_id = project_id_for_job(&loaded.job.job_id); + let note_ids = live_note_ids(ingested); + + if note_ids.is_empty() { + return Err(eyre::eyre!( + "{} has no live note sources for knowledge rebuild.", + loaded.job.job_id + )); + } + + let page_key = slug(&loaded.job.job_id); + let request = KnowledgePageRebuildRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.clone(), + agent_id: AGENT_ID.to_string(), + page_kind: KnowledgePageKind::Project, + page_key, + title: Some(loaded.job.title.clone()), + note_ids: note_ids.clone(), + event_ids: Vec::new(), + relation_ids: Vec::new(), + proposal_ids: Vec::new(), + provider_metadata: serde_json::json!({ + "adapter_id": adapter_id, + "job_id": loaded.job.job_id, + "llm_derived": false, + "runtime_path": "ElfService::knowledge_page_rebuild" + }), + }; + let first = service.knowledge_page_rebuild(request.clone()).await.map_err(|err| { + eyre::eyre!("ELF knowledge_page_rebuild failed for {}: {err}", loaded.job.job_id) + })?; + let second = service.knowledge_page_rebuild(request).await.map_err(|err| { + eyre::eyre!("ELF second knowledge_page_rebuild failed for {}: {err}", loaded.job.job_id) + })?; + + update_stale_trap_sources(service, loaded, adapter_id, project_id.as_str()).await?; + + let lint = service + .knowledge_page_lint(KnowledgePageLintRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.clone(), + page_id: second.page.page.page_id, + }) + .await + .map_err(|err| { + eyre::eyre!("ELF knowledge_page_lint failed for {}: {err}", loaded.job.job_id) + })?; + let search = service + .knowledge_pages_search(KnowledgePageSearchRequest { + tenant_id: TENANT_ID.to_string(), + project_id, + query: "source notes".to_string(), + page_kind: Some(KnowledgePageKind::Project), + limit: Some(10), + }) + .await + .map_err(|err| { + eyre::eyre!("ELF knowledge_pages_search failed for {}: {err}", loaded.job.job_id) + })?; + let page = knowledge_page_artifact(loaded, ingested, &first.page, &second.page, &lint)?; + let evidence = knowledge_materialization_evidence(&second.page, &lint, search.items.len()); + + Ok((vec![page], Some(evidence), None)) +} + async fn ingest_elf_corpus( service: &ElfService, loaded: &LoadedJob, adapter_id: &str, project_id: &str, corpus: &[CorpusText], -) -> color_eyre::Result { - let mut capture = CaptureMaterializationEvidence::default(); +) -> color_eyre::Result { + let mut ingested = IngestedCorpus::default(); for item in corpus { if item.capture.action == LiveCaptureAction::Exclude { - push_unique(&mut capture.excluded_evidence_ids, item.evidence_id.clone()); + push_unique(&mut ingested.capture.excluded_evidence_ids, item.evidence_id.clone()); continue; } - push_unique(&mut capture.stored_evidence_ids, item.evidence_id.clone()); + push_unique(&mut ingested.capture.stored_evidence_ids, item.evidence_id.clone()); if let Some(source_id) = item.capture.source_id.as_deref() { - push_unique(&mut capture.source_ids, source_id.to_string()); + push_unique(&mut ingested.capture.source_ids, source_id.to_string()); } if item.capture.write_policy.is_some() { - ingest_elf_corpus_item( + let note_id = ingest_elf_corpus_item( service, loaded, adapter_id, @@ -2739,10 +3680,16 @@ async fn ingest_elf_corpus( item.text.clone(), 0, 1, - &mut capture, + &mut ingested.capture, ) .await?; + ingested + .note_ids_by_evidence + .entry(item.evidence_id.clone()) + .or_default() + .push(note_id); + continue; } @@ -2755,8 +3702,7 @@ async fn ingest_elf_corpus( } else { format!("{}:chunk-{chunk_index:03}", item.evidence_id) }; - - ingest_elf_corpus_item( + let note_id = ingest_elf_corpus_item( service, loaded, adapter_id, @@ -2766,13 +3712,19 @@ async fn ingest_elf_corpus( text, chunk_index, chunk_count, - &mut capture, + &mut ingested.capture, ) .await?; + + ingested + .note_ids_by_evidence + .entry(item.evidence_id.clone()) + .or_default() + .push(note_id); } } - Ok(capture) + Ok(ingested) } #[allow(clippy::too_many_arguments)] @@ -2787,7 +3739,7 @@ async fn ingest_elf_corpus_item( chunk_index: usize, chunk_count: usize, capture: &mut CaptureMaterializationEvidence, -) -> color_eyre::Result<()> { +) -> color_eyre::Result { let write_policy = item .capture .write_policy @@ -2836,13 +3788,116 @@ async fn ingest_elf_corpus_item( } } - if !response.results.iter().any(|result| result.note_id.is_some()) { - return Err(eyre::eyre!( + response.results.iter().find_map(|result| result.note_id).ok_or_else(|| { + eyre::eyre!( "ELF add_note did not persist evidence {} chunk {} for {}.", item.evidence_id, chunk_index, loaded.job.job_id - )); + ) + }) +} + +async fn review_live_consolidation_proposals( + service: &ElfService, + loaded: &LoadedJob, + project_id: &str, + run_id: Uuid, + fixture: &LiveConsolidationFixture, +) -> color_eyre::Result> { + let listed = service + .consolidation_proposals_list(ConsolidationProposalsListRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.to_string(), + run_id: Some(run_id), + review_state: None, + limit: Some(100), + }) + .await + .map_err(|err| { + eyre::eyre!("ELF consolidation proposal list failed for {}: {err}", loaded.job.job_id) + })?; + let mut reviewed = Vec::new(); + + for (index, proposal) in listed.proposals.into_iter().enumerate() { + let fixture_proposal = fixture.proposals.get(index).ok_or_else(|| { + eyre::eyre!( + "ELF consolidation materialized extra proposal {} for {}.", + proposal.proposal_id, + loaded.job.job_id + ) + })?; + let review_action = + consolidation_review_action(fixture_proposal.actual_review_action.as_str())?; + + reviewed.push( + service + .consolidation_proposal_review(ConsolidationProposalReviewRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.to_string(), + reviewer_agent_id: AGENT_ID.to_string(), + proposal_id: proposal.proposal_id, + review_action, + review_comment: Some( + "Live adapter review transition for real-world benchmark evidence." + .to_string(), + ), + }) + .await + .map_err(|err| { + eyre::eyre!( + "ELF consolidation proposal review failed for {}: {err}", + loaded.job.job_id + ) + })?, + ); + } + + validate_reviewed_consolidation_count(loaded, fixture, &reviewed)?; + + Ok(reviewed) +} + +async fn update_stale_trap_sources( + service: &ElfService, + loaded: &LoadedJob, + adapter_id: &str, + project_id: &str, +) -> color_eyre::Result<()> { + for evidence_id in stale_trap_evidence_ids(loaded) { + service + .add_note(AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.to_string(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(evidence_id.clone()), + text: format!( + "Current lint probe: evidence {evidence_id} changed after the knowledge page rebuild and should mark the derived page source snapshot stale." + ), + structured: None, + importance: 0.9, + confidence: 0.95, + ttl_days: None, + source_ref: serde_json::json!({ + "schema": "real_world_live_adapter/v1", + "adapter": adapter_id, + "job_id": loaded.job.job_id, + "evidence_id": evidence_id, + "lint_probe": "stale_source_ref" + }), + write_policy: None, + }], + }) + .await + .map_err(|err| { + eyre::eyre!( + "ELF add_note stale-source update failed for {}: {err}", + loaded.job.job_id + ) + })?; } Ok(()) diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 9c57c62b..c1e541bb 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -641,13 +641,13 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(19) + Some(21) ); assert_eq!( report .pointer("/external_adapters/summary/suite_status_counts/pass") .and_then(Value::as_u64), - Some(23) + Some(26) ); assert_eq!( report @@ -659,7 +659,7 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/not_encoded") .and_then(Value::as_u64), - Some(40) + Some(38) ); } @@ -710,7 +710,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/pass") .and_then(Value::as_u64), - Some(20) + Some(23) ); assert_eq!( report @@ -722,13 +722,13 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_position_counts/wins") .and_then(Value::as_u64), - Some(9) + Some(10) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_position_counts/ties") .and_then(Value::as_u64), - Some(9) + Some(11) ); assert_eq!( report @@ -746,13 +746,13 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_outcome_counts/win") .and_then(Value::as_u64), - Some(9) + Some(10) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_outcome_counts/tie") .and_then(Value::as_u64), - Some(9) + Some(11) ); assert_eq!( report @@ -1679,6 +1679,8 @@ fn live_adapter_supports_elf_capture_write_policy_without_external_hook_claims() let workspace = workspace_root()?; let live_adapter = fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_live_adapter.rs"))?; + let live_script = + fs::read_to_string(workspace.join("scripts").join("real-world-live-adapters.sh"))?; let manifest = fs::read_to_string( workspace .join("apps/elf-eval/fixtures/real_world_external_adapters") @@ -1693,7 +1695,13 @@ fn live_adapter_supports_elf_capture_write_policy_without_external_hook_claims() assert!(live_adapter.contains("runtime_source_refs")); assert!(live_adapter.contains("validate_capture_runtime_evidence")); assert!(live_adapter.contains("capture_failure")); - assert!(live_adapter.contains("The live adapter sweep has no encoded runtime path")); + assert!(live_adapter.contains("fn materialize_elf_consolidation(")); + assert!(live_adapter.contains("ConsolidationProposalReviewRequest")); + assert!(live_adapter.contains("fn materialize_elf_knowledge(")); + assert!(live_adapter.contains("KnowledgePageLintRequest")); + assert!(live_script.contains("OPERATOR_FIXTURE_DIR")); + assert!(live_script.contains("INPUT_FIXTURE_DIR")); + assert!(live_script.contains("operator_debugging_ux")); assert!(manifest.contains("\"scenario_id\": \"live_capture_write_policy\"")); assert!(manifest.contains("\"scenario_id\": \"capture_write_policy_hooks\"")); assert!(manifest.contains("\"comparison_outcome\": \"blocked\"")); @@ -1705,6 +1713,46 @@ fn live_adapter_supports_elf_capture_write_policy_without_external_hook_claims() Ok(()) } +#[test] +fn declared_not_encoded_consolidation_jobs_do_not_require_fake_proposals() -> Result<()> { + let fixture_path = consolidation_fixture_dir().join("contradiction_report_discard.json"); + let mut fixture = serde_json::from_str::(&fs::read_to_string(fixture_path)?)?; + + fixture + .pointer_mut("/corpus/adapter_response") + .and_then(Value::as_object_mut) + .ok_or_else(|| eyre::eyre!("missing adapter_response object"))? + .remove("consolidation"); + + let encoding = serde_json::json!({ + "status": "not_encoded", + "reason": "The qmd live adapter retrieves evidence-linked answers but does not generate or review consolidation proposals." + }); + + fixture + .as_object_mut() + .ok_or_else(|| eyre::eyre!("fixture is not an object"))? + .insert("encoding".to_string(), encoding); + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-not-encoded-consolidation-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("not_encoded_consolidation.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "consolidation-contradiction-report-discard-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + #[test] fn capture_write_policy_live_report_preserves_competitor_boundaries() -> Result<()> { let report = serde_json::from_str::(&fs::read_to_string( @@ -1837,18 +1885,20 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res let operator_debug = find_by_field(suites, "/suite_id", "operator_debugging_ux")?; let capture = find_by_field(suites, "/suite_id", "capture_integration")?; let personalization = find_by_field(suites, "/suite_id", "personalization")?; + let core_archival = find_by_field(suites, "/suite_id", "core_archival_memory")?; + let context_trajectory = find_by_field(suites, "/suite_id", "context_trajectory")?; let trust_sot = find_by_field(suites, "/suite_id", "trust_source_of_truth")?; let retrieval = find_by_field(suites, "/suite_id", "retrieval")?; let project_decisions = find_by_field(suites, "/suite_id", "project_decisions")?; - assert_eq!(suites.len(), 11); + assert_eq!(suites.len(), 13); assert_eq!(targeted.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(full_pass.pointer("/status").and_then(Value::as_str), Some("wrong_result")); assert!( adapter .pointer("/result/evidence") .and_then(Value::as_str) - .is_some_and(|evidence| evidence.contains("40 jobs across all 11 encoded suites")) + .is_some_and(|evidence| evidence.contains("55 jobs across all 13 checked-in suites")) ); assert_eq!(trust_sot.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(work_resume.pointer("/status").and_then(Value::as_str), Some("pass")); @@ -1859,11 +1909,11 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res production_ops.pointer("/status").and_then(Value::as_str), Some(production_ops_status) ); - assert_eq!(consolidation.pointer("/status").and_then(Value::as_str), Some("not_encoded")); - assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("not_encoded")); - assert_eq!(operator_debug.pointer("/status").and_then(Value::as_str), Some("not_encoded")); if adapter_id == "elf_live_real_world" { + assert_eq!(consolidation.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(operator_debug.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("pass")); assert!( capture @@ -1872,10 +1922,15 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res .is_some_and(|evidence| evidence.contains("4/4 capture_integration jobs")) ); } else { + assert_eq!(consolidation.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(operator_debug.pointer("/status").and_then(Value::as_str), Some("wrong_result")); assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("not_encoded")); } assert_eq!(personalization.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core_archival.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); Ok(()) } @@ -3160,16 +3215,23 @@ fn assert_operator_facing_strength_profile_boundaries( benchmarking_index: &str, iteration_direction: &str, ) { - assert!(readme.contains("Full-suite live real-world adapter sweep after XY-899")); - assert!(readme.contains("fresh ELF sweep reports 22 pass")); - assert!(readme.contains("5 wrong_result, 2 blocked, and 11 not_encoded jobs")); - assert!(readme.contains("fresh qmd sweep reports")); - assert!(readme.contains("17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs")); - assert!(readme.contains("The differences are")); - assert!(readme.contains("delete/TTL tombstone case")); - assert!(readme.contains("ELF-only capture/write-policy live self-checks")); + assert!(readme.contains("Full-suite live real-world adapter sweep after XY-926")); + assert!(readme.contains("all 55 checked-in jobs across 13 suites")); + assert!(readme.contains("ELF now live-scores capture/write-policy")); + assert!(readme.contains("consolidation proposal review")); + assert!(readme.contains("knowledge-page rebuild/lint")); + assert!(readme.contains("operator-debugging fixtures")); + assert!(readme.contains("memory-evolution wrong results")); + assert!(readme.contains("production-ops operator boundaries")); + assert!(readme.contains("core/archival live adapter gap")); + assert!(readme.contains("context-trajectory measurement")); + assert!( + readme + .contains("consolidation, knowledge, capture, and core/archival typed non-pass states") + ); + assert!(readme.contains("operator-debug trace hydration")); assert!(readme.contains("qmd remains the local retrieval-debug UX reference")); - assert!(readme.contains("no broad ELF-over-qmd claim")); + assert!(readme.contains("broad ELF-over-qmd")); assert!(readme.contains("qmd and OpenViking Strength-Profile Report - June 11, 2026")); assert!(benchmarking_index.contains("2026-06-11-qmd-openviking-strength-profile-report.md")); assert!( @@ -3284,9 +3346,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("xy844-current-worktree")); assert!(markdown.contains("Existing live-baseline reports remain valid")); assert!(markdown.contains("### Adapter Scenario Judgments")); - assert!(markdown.contains("ELF scenario positions: `wins=9, ties=9, loses=1, untested=23`")); + assert!(markdown.contains("ELF scenario positions: `wins=10, ties=11, loses=1, untested=23`")); assert!(markdown.contains( - "Scenario comparison outcomes: `win=9, tie=9, loss=1, not_tested=12, blocked=8, non_goal=3`" + "Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=12, blocked=8, non_goal=3`" )); assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 4e6bd18d..0e097230 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -237,23 +237,24 @@ production-ops operator boundaries plus the XY-928 OpenViking `context_trajector gates for staged retrieval, hierarchy selection, and recursive/context expansion. Current live-adapter state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full -encoded-suite sweep through `cargo make real-world-memory-live-adapters`. Each adapter -materializes generated runtime answers for 40 jobs across 11 suites before scoring. -The fixture-only `core_archival_memory` suite can also be run through -`cargo make real-world-memory-core-archival`; it is not yet included in that live -sweep. +checked-in suite sweep through `cargo make real-world-memory-live-adapters`. Each adapter +materializes generated runtime answers for 55 jobs across 13 suites before scoring, +including the operator-debug fixture tree. The original targeted `work_resume`, `retrieval`, and `project_decisions` slice still -passes, and ELF now passes the live `capture_integration` self-checks for redaction, -exclusions, source ids, evidence binding, and no secret leakage. The full sweep is -still not a full-suite pass: memory_evolution is `wrong_result`, production_ops keeps -operator-owned blocked boundaries, and consolidation, knowledge_compilation, and -operator_debugging_ux remain `not_encoded` for this live adapter path. qmd keeps -`capture_integration` typed `not_encoded` and still also keeps its separate -`live_baseline_only` same-corpus record for update/delete/cold-start checks; that -record is not a real-world suite win. agentmemory is blocked on durable upstream -storage for lifecycle proof and capture breadth. mem0/OpenMemory, memsearch, and -claude-mem no longer share one live-baseline boundary: mem0/OpenMemory and memsearch -now pass scoped local baseline paths, while OpenMemory product UI/export, hosted +passes. ELF now also passes live `capture_integration` self-checks for redaction, +exclusions, source ids, evidence binding, and no secret leakage; live consolidation +proposal review; live knowledge-page rebuild/lint; and live operator-debug trace +metadata. The full sweep is still not a full-suite pass: memory_evolution is +`wrong_result`, production_ops keeps operator-owned blocked boundaries, +core_archival_memory remains typed `not_encoded` for this live adapter path, and +context_trajectory remains blocked. qmd keeps `capture_integration`, consolidation, +knowledge_compilation, and core_archival_memory typed non-pass, is `wrong_result` for +operator-debug trace hydration, and still also keeps its separate `live_baseline_only` +same-corpus record for update/delete/cold-start checks; that record is not a +real-world suite win. agentmemory is blocked on durable upstream storage for lifecycle +proof and capture breadth. mem0/OpenMemory, memsearch, and claude-mem no longer share +one live-baseline boundary: mem0/OpenMemory and memsearch now pass scoped local +baseline paths, while OpenMemory product UI/export, hosted Platform behavior, optional graph memory, memsearch real-world prompt/TTL coverage, and claude-mem hook/viewer capture remain blocked, unsupported, not encoded, or wrong-result for the checked-in adapter evidence. OpenViking now reaches its pinned diff --git a/scripts/real-world-live-adapters.sh b/scripts/real-world-live-adapters.sh index 7c87667c..398cae08 100755 --- a/scripts/real-world-live-adapters.sh +++ b/scripts/real-world-live-adapters.sh @@ -4,6 +4,8 @@ set -euo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" REPORT_DIR="${ELF_REAL_WORLD_LIVE_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/live-adapters}" FIXTURE_DIR="${ELF_REAL_WORLD_LIVE_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory}" +OPERATOR_FIXTURE_DIR="${ELF_REAL_WORLD_OPERATOR_DEBUG_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux}" +INPUT_FIXTURE_DIR="${REPORT_DIR}/input-fixtures" WORK_DIR="${ELF_REAL_WORLD_LIVE_WORK_DIR:-/bench/real-world-live-adapters}" QMD_DIR="${ELF_REAL_WORLD_QMD_DIR:-/bench/repos/qmd}" @@ -20,7 +22,8 @@ for cmd in bash cargo git jq npm npx; do done mkdir -p "${REPORT_DIR}" "${WORK_DIR}" -rm -rf "${REPORT_DIR:?}/elf-fixtures" \ +rm -rf "${INPUT_FIXTURE_DIR}" \ + "${REPORT_DIR:?}/elf-fixtures" \ "${REPORT_DIR:?}/qmd-fixtures" \ "${REPORT_DIR:?}/elf-materialization.json" \ "${REPORT_DIR:?}/qmd-materialization.json" \ @@ -37,8 +40,13 @@ rm -rf "${REPORT_DIR:?}/elf-fixtures" \ cd "${ROOT_DIR}" +mkdir -p "${INPUT_FIXTURE_DIR}" +cp -R "${FIXTURE_DIR}/." "${INPUT_FIXTURE_DIR}/" +mkdir -p "${INPUT_FIXTURE_DIR}/operator_debugging_ux" +cp -R "${OPERATOR_FIXTURE_DIR}/." "${INPUT_FIXTURE_DIR}/operator_debugging_ux/" + cargo run -p elf-eval --bin real_world_live_adapter -- elf \ - --fixtures "${FIXTURE_DIR}" \ + --fixtures "${INPUT_FIXTURE_DIR}" \ --out-fixtures "${REPORT_DIR}/elf-fixtures" \ --evidence-out "${REPORT_DIR}/elf-materialization.json" \ --config config/local/elf.docker.toml @@ -59,7 +67,7 @@ cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ --out "${REPORT_DIR}/elf-report.md" cargo run -p elf-eval --bin real_world_live_adapter -- qmd \ - --fixtures "${FIXTURE_DIR}" \ + --fixtures "${INPUT_FIXTURE_DIR}" \ --out-fixtures "${REPORT_DIR}/qmd-fixtures" \ --evidence-out "${REPORT_DIR}/qmd-materialization.json" \ --qmd-dir "${QMD_DIR}" \ @@ -116,6 +124,8 @@ jq -n \ generated_at: (now | todateiso8601), artifact_dir: (env.ELF_REAL_WORLD_LIVE_REPORT_DIR // "tmp/real-world-memory/live-adapters"), fixture_dir: (env.ELF_REAL_WORLD_LIVE_FIXTURES // "apps/elf-eval/fixtures/real_world_memory"), + operator_debug_fixture_dir: (env.ELF_REAL_WORLD_OPERATOR_DEBUG_FIXTURES // "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux"), + combined_fixture_dir: "tmp/real-world-memory/live-adapters/input-fixtures", graph_rag_smoke_controls: { inclusion_flags: { ragflow: (env.ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW // "0"),