From fb6e47c307f22e9e04bef753d1c620265b4828d3 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Thu, 11 Jun 2026 19:03:25 +0800 Subject: [PATCH 1/4] {"schema":"decodex/commit/1","summary":"Publish mem0 history and export evidence","authority":"XY-924"} --- README.md | 10 + .../memory_projects_manifest.json | 118 +++++-- .../src/bin/real_world_job_benchmark.rs | 90 +++++- .../tests/real_world_job_benchmark.rs | 148 +++++++-- ...-11-competitor-strength-adoption-report.md | 30 +- ...generation-oss-adapter-promotion-report.md | 6 + ...em0-openmemory-history-ui-export-report.md | 148 +++++++++ ...-temporal-history-competitor-gap-report.md | 8 + docs/guide/benchmarking/index.md | 5 + .../real_world_agent_memory_benchmark_v1.md | 17 +- scripts/live-baseline-benchmark.sh | 288 +++++++++++++++++- 11 files changed, 797 insertions(+), 71 deletions(-) create mode 100644 docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md diff --git a/README.md b/README.md index 51452873..0d3fd2ef 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,14 @@ provider-backed ELF evidence was required. typed blocked or incomplete without explicit service, resource, or provider setup. These reports preserve the smoke-only boundary and do not create an ELF win claim against graph/RAG strengths. +- mem0/OpenMemory history follow-up after XY-924: the local OSS mem0 adapter now + passes encoded preference correction history, entity-scoped personalization, local + `get_all` export-style readback, and deletion audit history in + `live-baseline-20260611105855`. The comparison records ELF as a loss on preference + correction history, ties on scoped personalization and delete audit, `not_tested` + for local SDK export-style parity, `blocked` for OpenMemory UI/export, and + `non_goal` for hosted Platform export and optional graph memory in the local OSS + lane. 
- The benchmark runner and report publisher are checked in and Docker-isolated: `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`, `cargo make baseline-production-private-addendum`, @@ -197,6 +205,7 @@ Detailed evidence and interpretation: - [qmd and OpenViking Strength-Profile Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md) - [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) +- [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/guide/single_user_production.md) - Benchmark contract: @@ -272,6 +281,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [Temporal History Competitor Gap Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md) - [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) +- [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md) - [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md) diff --git 
a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 3c023fe2..cfc54fb4 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -1,6 +1,6 @@ { "schema": "elf.real_world_external_adapter_manifest/v1", - "manifest_id": "real-world-memory-project-adapters-2026-06-11", + "manifest_id": "real-world-memory-project-adapters-2026-06-11-mem0-history", "docker_isolation": { "default": true, "compose_file": "docker-compose.baseline.yml", @@ -608,12 +608,13 @@ }, "run": { "status": "pass", - "evidence": "Fresh comparable baseline run live-baseline-20260611061612 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, and cold-start reload; mem0 passed 4/4 encoded checks.", + "evidence": "Fresh scoped baseline run live-baseline-20260611105855 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded checks.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", "artifact": "tmp/live-baseline/live-baseline-report.json" }, "result": { "status": "pass", - "evidence": "The local OSS mem0 baseline now passes basic same-corpus/update/delete/reload smoke. No real_world_job mem0/OpenMemory adapter, OpenMemory UI, hosted Platform, entity-history, or graph-memory behavior is encoded.", + "evidence": "The local OSS mem0 baseline now passes same-corpus retrieval, update/delete/reload, preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history. 
It still does not launch the OpenMemory UI, hosted Platform export flow, optional graph memory, or a real_world_job prompt adapter.", "artifact": "tmp/live-baseline/live-baseline-report.json" }, "capabilities": [ @@ -625,44 +626,69 @@ { "capability": "same_corpus_retrieval", "status": "pass", - "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." + "evidence": "Fresh scoped baseline run live-baseline-20260611105855 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." }, { "capability": "local_lifecycle_update_delete_reload", "status": "pass", - "evidence": "The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; the fresh scoped run reports 4/4 encoded checks passing." + "evidence": "The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; the fresh scoped run reports those lifecycle checks passing." + }, + { + "capability": "preference_correction_history", + "status": "pass", + "evidence": "The fresh scoped run reports preference_correction_history as pass: Memory.history preserved ADD and UPDATE records with old and current preference text, and search returned only the current correction." + }, + { + "capability": "entity_scoped_personalization", + "status": "pass", + "evidence": "The fresh scoped run reports entity_scoped_personalization as pass: user_id, agent_id, and run_id filters returned the ELF scoped preference and omitted a PubFi scoped preference." + }, + { + "capability": "local_get_all_export_readback", + "status": "pass", + "evidence": "The fresh scoped run reports local_get_all_export_readback as pass: Memory.get_all returned the current scoped preference and omitted the other scope." 
+ }, + { + "capability": "deletion_audit_history", + "status": "pass", + "evidence": "The fresh scoped run reports delete_history_audit_readback as pass: Memory.history exposed a DELETE event and search suppressed the deleted memory." }, { "capability": "openmemory_ui_readback", - "status": "not_encoded", - "evidence": "OpenMemory UI readback is not encoded in the Docker baseline or real-world job runner." + "status": "blocked", + "evidence": "The Docker live-baseline runner does not launch the OpenMemory web UI, dashboard authentication, or browser export flow. Local SDK get_all readback is measured separately and must not be reused as UI evidence." }, { "capability": "hosted_managed_memory_claims", - "status": "not_encoded", - "evidence": "Hosted mem0 Platform behavior is outside the local OSS Docker adapter and is not counted as a local pass." + "status": "unsupported", + "evidence": "Hosted mem0 Platform behavior and Platform UI export are outside the local OSS Docker adapter and are non-goals for this local evidence record." }, { "capability": "real_world_job_adapter", "status": "not_encoded", "evidence": "No mem0/OpenMemory adapter currently executes real_world_job prompts and answer scoring." + }, + { + "capability": "optional_graph_memory", + "status": "not_encoded", + "evidence": "Optional graph memory is not enabled in the default local OSS path and remains an opt-in scenario gate rather than a default pass/fail claim." } ], "suites": [ { "suite_id": "memory_evolution", "status": "not_encoded", - "evidence": "Basic local lifecycle checks now pass in Docker, but real_world_job memory-evolution prompts, preference history, deletion audit readback, and entity history are not encoded for mem0/OpenMemory." + "evidence": "Scenario-level local OSS checks now measure preference correction history and deletion audit readback, but no mem0 real_world_job memory_evolution prompt adapter is encoded." 
}, { "suite_id": "personalization", "status": "not_encoded", - "evidence": "Entity-scoped personalization is not encoded as a real_world_job adapter run." + "evidence": "Scenario-level local OSS checks now measure entity-scoped personalization, but no mem0 real_world_job personalization prompt adapter is encoded." }, { "suite_id": "operator_debugging_ux", - "status": "not_encoded", - "evidence": "OpenMemory inspection is not encoded in this runner." + "status": "blocked", + "evidence": "Local SDK get_all inspection is measured, but OpenMemory UI/export readback is blocked because the Docker runner does not launch the web UI or hosted export flow." } ], "scenarios": [ @@ -671,25 +697,77 @@ "suite_id": "memory_evolution", "status": "pass", "elf_position": "ties", - "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing 4/4 same-corpus retrieval, update, delete, and cold-start reload checks. This is a basic local lifecycle tie at the encoded smoke surface, not a claim about OpenMemory UI, hosted behavior, entity history, or graph memory.", + "comparison_outcome": "tie", + "evidence": "Prior comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing basic same-corpus retrieval, update, delete, and cold-start reload checks. 
This remains a basic local lifecycle tie at the encoded smoke surface and is not reused as history/UI evidence.", "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", "artifact": "tmp/live-baseline/live-baseline-report.json" }, { - "scenario_id": "preference_entity_history", + "scenario_id": "preference_correction_history", "suite_id": "personalization", - "status": "not_encoded", + "status": "pass", + "elf_position": "loses", + "comparison_outcome": "loss", + "evidence": "Fresh scoped baseline run live-baseline-20260611105855 reports mem0 preference_correction_history as pass. The June 11 temporal report records ELF live memory-evolution preference as wrong_result, so the current measured comparison is an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0-checks.json" + }, + { + "scenario_id": "entity_scoped_personalization", + "suite_id": "personalization", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611105855 reports mem0 entity_scoped_personalization as pass. Existing live real-world evidence records ELF and qmd passing the encoded personalization slice, so this is a measured tie on the current scoped-preference surface.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0-checks.json" + }, + { + "scenario_id": "delete_audit_readback", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611105855 reports mem0 delete_history_audit_readback as pass. 
The June 11 temporal report records ELF passing the delete/TTL tombstone job, so the current measured delete-audit comparison is a tie.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0-checks.json" + }, + { + "scenario_id": "local_get_all_export_readback", + "suite_id": "operator_debugging_ux", + "status": "pass", "elf_position": "untested", - "evidence": "mem0/OpenMemory's strongest next comparison is preference and entity-scoped history. The current local OSS Docker baseline does not inspect memory history events, correction chains, or entity-scoped readback under real_world_job scoring.", - "artifact": "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + "comparison_outcome": "not_tested", + "evidence": "Fresh scoped baseline run live-baseline-20260611105855 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0-checks.json" }, { "scenario_id": "openmemory_ui_export_readback", "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The local Docker runner does not launch OpenMemory UI/dashboard export, and hosted Platform export remains outside local OSS evidence. 
Basic lifecycle and local get_all readback are not reused as UI/export proof.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "hosted_platform_export", + "suite_id": "operator_debugging_ux", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted mem0 Platform export is explicitly outside the local OSS Docker comparison and is not counted as a local pass, loss, or blocker.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "optional_graph_memory", + "suite_id": "memory_evolution", "status": "not_encoded", "elf_position": "untested", - "evidence": "OpenMemory UI/export readback is not exercised by the local OSS Docker baseline and hosted Platform behavior remains out of scope for local OSS evidence.", - "artifact": "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + "comparison_outcome": "non_goal", + "evidence": "Optional graph memory is kept as an opt-in scenario gate. 
It is not enabled in the default mem0 local OSS run and is not part of the default pass/fail comparison.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" } ], "evidence": [ diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 7635c0bd..7f0c74e8 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -647,6 +647,17 @@ enum ElfScenarioPosition { Untested, } +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum ScenarioComparisonOutcome { + Win, + Tie, + Loss, + NotTested, + Blocked, + NonGoal, +} + #[derive(Debug, Deserialize)] struct ExternalAdapterManifest { schema: String, @@ -736,6 +747,8 @@ struct AdapterScenarioJudgment { suite_id: Option<String>, status: AdapterCoverageStatus, elf_position: ElfScenarioPosition, + #[serde(skip_serializing_if = "Option::is_none")] + comparison_outcome: Option<ScenarioComparisonOutcome>, evidence: String, #[serde(skip_serializing_if = "Option::is_none")] command: Option<String>, @@ -789,6 +802,8 @@ struct ExternalAdapterSummary { scenario_status_counts: AdapterStatusCounts, #[serde(default)] scenario_position_counts: ScenarioPositionCounts, + #[serde(default)] + scenario_outcome_counts: ScenarioOutcomeCounts, } #[derive(Clone, Debug, Default, Deserialize, Serialize)] @@ -812,6 +827,16 @@ struct ScenarioPositionCounts { untested: usize, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScenarioOutcomeCounts { + win: usize, + tie: usize, + loss: usize, + not_tested: usize, + blocked: usize, + non_goal: usize, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] struct CaptureIntegrationReport { #[serde(default)] @@ -3993,6 +4018,10 @@ fn accumulate_adapter_summary( &mut summary.scenario_position_counts, scenario.elf_position, ); + increment_scenario_outcome_count( + &mut 
summary.scenario_outcome_counts, + scenario_comparison_outcome(scenario), + ); } } @@ -4022,6 +4051,29 @@ fn increment_scenario_position_count( } } +fn scenario_comparison_outcome(scenario: &AdapterScenarioJudgment) -> ScenarioComparisonOutcome { + scenario.comparison_outcome.unwrap_or(match scenario.elf_position { + ElfScenarioPosition::Wins => ScenarioComparisonOutcome::Win, + ElfScenarioPosition::Ties => ScenarioComparisonOutcome::Tie, + ElfScenarioPosition::Loses => ScenarioComparisonOutcome::Loss, + ElfScenarioPosition::Untested => ScenarioComparisonOutcome::NotTested, + }) +} + +fn increment_scenario_outcome_count( + counts: &mut ScenarioOutcomeCounts, + outcome: ScenarioComparisonOutcome, +) { + match outcome { + ScenarioComparisonOutcome::Win => counts.win += 1, + ScenarioComparisonOutcome::Tie => counts.tie += 1, + ScenarioComparisonOutcome::Loss => counts.loss += 1, + ScenarioComparisonOutcome::NotTested => counts.not_tested += 1, + ScenarioComparisonOutcome::Blocked => counts.blocked += 1, + ScenarioComparisonOutcome::NonGoal => counts.non_goal += 1, + } +} + fn capture_integration_report(jobs: &[RealWorldJob]) -> CaptureIntegrationReport { let mut report = CaptureIntegrationReport::default(); @@ -4192,6 +4244,10 @@ fn render_markdown_external_adapters(out: &mut String, report: &RealWorldReport) "- ELF scenario positions: `{}`\n", scenario_position_counts_display(&summary.scenario_position_counts) )); + out.push_str(&format!( + "- Scenario comparison outcomes: `{}`\n", + scenario_outcome_counts_display(&summary.scenario_outcome_counts) + )); } out.push('\n'); @@ -4242,7 +4298,7 @@ fn render_markdown_adapter_scenarios(out: &mut String, adapters: &[ExternalAdapt } out.push_str("\n### Adapter Scenario Judgments\n\n"); - out.push_str("| Adapter | Scenario | Suite | Status | ELF Position | Evidence |\n"); + out.push_str("| Adapter | Scenario | Suite | Status | Outcome | Evidence |\n"); out.push_str("| --- | --- | --- | --- | --- | --- |\n"); for adapter in 
adapters { @@ -4257,7 +4313,7 @@ fn render_markdown_adapter_scenarios(out: &mut String, adapters: &[ExternalAdapt .map(|suite| format!("`{}`", md_inline(suite))) .unwrap_or_else(|| "`none`".to_string()), adapter_status_str(scenario.status), - scenario_position_str(scenario.elf_position), + scenario_comparison_outcome_str(scenario_comparison_outcome(scenario)), adapter_scenario_evidence_cell(scenario) )); } @@ -4906,12 +4962,14 @@ fn adapter_status_str(status: AdapterCoverageStatus) -> &'static str { } } -fn scenario_position_str(position: ElfScenarioPosition) -> &'static str { - match position { - ElfScenarioPosition::Wins => "wins", - ElfScenarioPosition::Ties => "ties", - ElfScenarioPosition::Loses => "loses", - ElfScenarioPosition::Untested => "untested", +fn scenario_comparison_outcome_str(outcome: ScenarioComparisonOutcome) -> &'static str { + match outcome { + ScenarioComparisonOutcome::Win => "win", + ScenarioComparisonOutcome::Tie => "tie", + ScenarioComparisonOutcome::Loss => "loss", + ScenarioComparisonOutcome::NotTested => "not_tested", + ScenarioComparisonOutcome::Blocked => "blocked", + ScenarioComparisonOutcome::NonGoal => "non_goal", } } @@ -4948,6 +5006,22 @@ fn scenario_position_counts_display(counts: &ScenarioPositionCounts) -> String { .join(", ") } +fn scenario_outcome_counts_display(counts: &ScenarioOutcomeCounts) -> String { + [ + ("win", counts.win), + ("tie", counts.tie), + ("loss", counts.loss), + ("not_tested", counts.not_tested), + ("blocked", counts.blocked), + ("non_goal", counts.non_goal), + ] + .into_iter() + .filter(|(_, count)| *count > 0) + .map(|(outcome, count)| format!("{outcome}={count}")) + .collect::<Vec<_>>() + .join(", ") +} + fn adapter_suite_cell(suites: &[AdapterSuiteCoverage]) -> String { if suites.is_empty() { return "`none`".to_string(); diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index bf0b0bbc..6ef0f0d3 100644 --- 
a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -360,13 +360,25 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report .pointer("/external_adapters/summary/scenario_position_counts/loses") .and_then(Value::as_u64), - Some(1) + Some(2) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(8) + Some(10) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/loss") + .and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") + .and_then(Value::as_u64), + Some(7) ); let adapters = array_at(&report, "/external_adapters/adapters")?; @@ -387,7 +399,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { ); assert_eq!( report.pointer("/external_adapters/manifest_id").and_then(Value::as_str), - Some("real-world-memory-project-adapters-2026-06-11") + Some("real-world-memory-project-adapters-2026-06-11-mem0-history") ); assert_eq!( report.pointer("/external_adapters/docker_isolation/default").and_then(Value::as_bool), @@ -471,13 +483,13 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/capability_status_counts/unsupported") .and_then(Value::as_u64), - Some(5) + Some(6) ); assert_eq!( report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(12) + Some(13) ); assert_eq!( report @@ -506,13 +518,13 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/unsupported") .and_then(Value::as_u64), - Some(1) + Some(2) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_status_counts/blocked") .and_then(Value::as_u64), - Some(1) + Some(2) ); assert_eq!( report @@ -536,13 +548,13 @@ fn 
assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/pass") .and_then(Value::as_u64), - Some(5) + Some(9) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_status_counts/not_encoded") .and_then(Value::as_u64), - Some(4) + Some(3) ); assert_eq!( report @@ -554,19 +566,55 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_position_counts/ties") .and_then(Value::as_u64), - Some(2) + Some(4) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_position_counts/loses") .and_then(Value::as_u64), - Some(0) + Some(1) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(9) + Some(11) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/win") + .and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/tie") + .and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/loss") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") + .and_then(Value::as_u64), + Some(8) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/blocked") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/non_goal") + .and_then(Value::as_u64), + Some(2) ); } @@ -733,14 +781,41 @@ fn assert_first_generation_adapter_records( Some("local_lifecycle_update_delete_reload") ); assert_eq!(mem0.pointer("/capabilities/2/status").and_then(Value::as_str), Some("pass")); - assert_eq!(mem0.pointer("/capabilities/4/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!( + 
mem0.pointer("/capabilities/3/capability").and_then(Value::as_str), + Some("preference_correction_history") + ); + assert_eq!(mem0.pointer("/capabilities/3/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + mem0.pointer("/capabilities/7/capability").and_then(Value::as_str), + Some("openmemory_ui_readback") + ); + assert_eq!(mem0.pointer("/capabilities/7/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + mem0.pointer("/capabilities/8/capability").and_then(Value::as_str), + Some("hosted_managed_memory_claims") + ); + assert_eq!(mem0.pointer("/capabilities/8/status").and_then(Value::as_str), Some("unsupported")); assert_eq!(mem0.pointer("/scenarios/0/status").and_then(Value::as_str), Some("pass")); assert_eq!(mem0.pointer("/scenarios/0/elf_position").and_then(Value::as_str), Some("ties")); assert_eq!( - mem0.pointer("/scenarios/2/scenario_id").and_then(Value::as_str), + mem0.pointer("/scenarios/1/scenario_id").and_then(Value::as_str), + Some("preference_correction_history") + ); + assert_eq!(mem0.pointer("/scenarios/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + mem0.pointer("/scenarios/1/comparison_outcome").and_then(Value::as_str), + Some("loss") + ); + assert_eq!( + mem0.pointer("/scenarios/5/scenario_id").and_then(Value::as_str), Some("openmemory_ui_export_readback") ); - assert_eq!(mem0.pointer("/scenarios/2/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(mem0.pointer("/scenarios/5/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + mem0.pointer("/scenarios/6/comparison_outcome").and_then(Value::as_str), + Some("non_goal") + ); assert_eq!( memsearch.pointer("/capabilities/2/capability").and_then(Value::as_str), Some("reindex_update_delete_reload") @@ -2073,7 +2148,10 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("xy844-current-worktree")); assert!(markdown.contains("Existing live-baseline reports remain valid")); 
assert!(markdown.contains("### Adapter Scenario Judgments")); - assert!(markdown.contains("ELF scenario positions: `wins=2, ties=2, untested=9`")); + assert!(markdown.contains("ELF scenario positions: `wins=2, ties=4, loses=1, untested=11`")); + assert!(markdown.contains( + "Scenario comparison outcomes: `win=2, tie=4, loss=1, not_tested=8, blocked=1, non_goal=2`" + )); assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); @@ -2101,9 +2179,21 @@ fn external_adapter_markdown_renders_nonzero_scenario_losses() -> Result<()> { "/external_adapters/summary/scenario_position_counts", serde_json::json!({ "wins": 2, - "ties": 2, - "loses": 1, - "untested": 8 + "ties": 4, + "loses": 2, + "untested": 10 + }), + )?; + set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_outcome_counts", + serde_json::json!({ + "win": 2, + "tie": 4, + "loss": 2, + "not_tested": 7, + "blocked": 1, + "non_goal": 2 }), )?; @@ -2133,9 +2223,12 @@ fn external_adapter_markdown_renders_nonzero_scenario_losses() -> Result<()> { let markdown = fs::read_to_string(markdown_path)?; - assert!(markdown.contains("ELF scenario positions: `wins=2, ties=2, loses=1, untested=8`")); + assert!(markdown.contains("ELF scenario positions: `wins=2, ties=4, loses=2, untested=10`")); assert!(markdown.contains( - "| `agentmemory_live_baseline` | `basic_same_corpus_retrieval` | `retrieval` | `pass` | `loses` |" + "Scenario comparison outcomes: `win=2, tie=4, loss=2, not_tested=7, blocked=1, non_goal=2`" + )); + assert!(markdown.contains( + "| `agentmemory_live_baseline` | `basic_same_corpus_retrieval` | `retrieval` | `pass` | `loss` |" )); Ok(()) @@ -2178,6 +2271,18 @@ fn external_adapter_markdown_omits_scenario_summary_when_manifest_has_no_scenari "untested": 0 }), )?; + set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_outcome_counts", + serde_json::json!({ + "win": 
0, + "tie": 0, + "loss": 0, + "not_tested": 0, + "blocked": 0, + "non_goal": 0 + }), + )?; let temp_dir = env::temp_dir().join(format!("elf-real-world-no-scenario-test-{}", process::id())); @@ -2208,6 +2313,7 @@ fn external_adapter_markdown_omits_scenario_summary_when_manifest_has_no_scenari assert!(markdown.contains("External Adapter Coverage")); assert!(!markdown.contains("Scenario coverage statuses:")); assert!(!markdown.contains("ELF scenario positions:")); + assert!(!markdown.contains("Scenario comparison outcomes:")); assert!(!markdown.contains("### Adapter Scenario Judgments")); Ok(()) diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 1bf607f7..db01c063 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -8,7 +8,8 @@ Inputs: `2026-06-11-measurement-coverage-audit.md`, `2026-06-11-first-generation-oss-adapter-promotion-report.md`, `2026-06-11-qmd-openviking-strength-profile-report.md`, `2026-06-11-temporal-history-competitor-gap-report.md`, -`2026-06-11-graph-rag-scored-smoke-adapter-report.md`, and +`2026-06-11-graph-rag-scored-smoke-adapter-report.md`, +`2026-06-11-mem0-openmemory-history-ui-export-report.md`, and `2026-06-10-production-adoption-refresh.md`. Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md` and the current external adapter manifest. @@ -35,11 +36,13 @@ The remaining caveats are material: exists. - Credentialed provider production-ops gates are blocked until explicit provider setup exists. -- Several competitor strengths remain `not_tested`: mem0/OpenMemory history/UI, - OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation. 
- The XY-923 follow-up now scores qmd's immediate top-10/replay artifact ergonomics - as stronger than ELF's default stress report, while expansion, fusion, rerank, and - candidate-drop diagnosis remain untested. +- Several competitor strengths remain `not_tested` or blocked: OpenMemory + UI/export, hosted mem0 Platform behavior, OpenViking trajectory, Letta + core-vs-archival memory, and graph/RAG navigation. mem0 local OSS preference + history is now measured separately and is an ELF loss on the current correction + history scenario. The XY-923 follow-up also scores qmd's immediate top-10/replay + artifact ergonomics as stronger than ELF's default stress report, while + expansion, fusion, rerank, and candidate-drop diagnosis remain untested. ## Evidence Classes @@ -67,6 +70,7 @@ results, or lifecycle failures into one aggregate leaderboard. | `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 38 jobs across 11 suites with 36 pass and 2 blocked production-ops operator boundaries. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 18 pass, 5 wrong_result, 2 blocked, and 13 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 13 not_encoded jobs. | | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `2026-06-11-first-generation-oss-adapter-promotion-report.md` | mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result. | +| `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `2026-06-11-mem0-openmemory-history-ui-export-report.md` | mem0 local OSS passes preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history; OpenMemory UI/export remains blocked and hosted Platform export remains non-goal. 
| | `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. | | `cargo make graphify-docker-graph-report-smoke` | `2026-06-11-graph-rag-scored-smoke-adapter-report.md` | graphify reaches tiny Docker graph/report scoring but remains wrong_result. | | `cargo make baseline-production-synthetic`, `cargo make baseline-backfill-docker`, backup/restore, Qdrant rebuild proof | `2026-06-10-production-adoption-refresh.md` | ELF has provider synthetic, stress, backfill, restore, and rebuild evidence; private-corpus proof is blocked by missing operator-owned manifest. | @@ -81,14 +85,14 @@ results, or lifecycle failures into one aggregate leaderboard. | Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs; Letta-style core/archival decision memory is not tested. | XY-927 | | Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 | | Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 | -| Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. 
| XY-905 | +| Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. | XY-905 | | Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 | | Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded; graphify reaches a tiny scored smoke and remains wrong_result. | XY-926, XY-929 | -| Operator debugging/viewer UX | `not_tested` | `fixture_backed`, `not_encoded`, `research_gate` | ELF fixture operator-debugging UX passes, but live trace/viewer scoring and qmd/OpenMemory/claude-mem UX comparisons are unscored. | XY-923, XY-926 | +| Operator debugging/viewer UX | `not_tested` | `fixture_backed`, `live_baseline_only`, `blocked`, `not_encoded`, `research_gate` | ELF fixture operator-debugging UX passes. mem0 local SDK `get_all` readback is measured, but OpenMemory UI/export remains blocked and must not be inferred from SDK readback. Live trace/viewer scoring and qmd/OpenMemory/claude-mem UX comparisons remain unscored. | XY-923, XY-926 | | Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_baseline_only`, `blocked`, `not_encoded` | ELF fixture capture/write-policy jobs pass, but live capture integration and agentmemory/claude-mem capture hooks are not comparable yet. 
| XY-925, XY-926 | | Production ops, restore, backfill, and rebuild | `win` | `live_baseline_only`, `blocked` | ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence. | XY-930 | | Private corpus and provider boundaries | `blocked` | `blocked` | Private production profile fails closed without an operator-owned manifest; provider-backed production-ops gates require explicit credentials. | XY-930 | -| Personalization and scoped preferences | `tie` | `fixture_backed`, `live_real_world`, `not_encoded` | ELF and qmd both pass the single encoded live personalization job; mem0/OpenMemory and Letta personalization/history are not encoded. | XY-924, XY-927 | +| Personalization and scoped preferences | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `not_encoded` | ELF and qmd both pass the single encoded live personalization job. mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss. | XY-927 | | Context trajectory and hierarchical retrieval | `not_tested` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | OpenViking reaches the pinned Docker local embedding path but misses expected same-corpus evidence; staged trajectory/hierarchy scoring is not encoded. | XY-928 | | Core-vs-archival memory | `not_tested` | `research_gate`, `not_encoded` | ELF has core block semantics in the service contract, but comparable core-vs-archival jobs and a contained Letta export path are not encoded. | XY-927 | | Graph/RAG navigation and citations | `not_tested` | `smoke_only`, `research_gate`, `blocked`, `wrong_result`, `not_encoded` | Graph/RAG smokes produce scored or typed non-pass adapter reports where possible, but broad graph/RAG navigation and citation quality are not tested. 
| XY-929 | @@ -99,7 +103,7 @@ results, or lifecycle failures into one aggregate leaderboard. | --- | --- | --- | --- | | XY-905 | P0 | Backlog | Live temporal reconciliation answer and trace contract. | | XY-923 | P0 | Backlog | qmd trace-level replay and wrong-result diagnostics. | -| XY-924 | P0 | Backlog | mem0/OpenMemory history and UI-export comparison. | +| XY-924 | P0 | Encoded local OSS history; UI/export still gated | mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export still needs a UI runner before any product-UX claim. | | XY-925 | P1 | Backlog | First-generation OSS continuity and source-store adapters. | | XY-926 | P1 | Backlog | Live operator-debugging, capture, consolidation, and knowledge-page suites. | | XY-927 | P1 | Backlog | Letta-style core-vs-archival memory comparison. | @@ -125,8 +129,10 @@ results, or lifecycle failures into one aggregate leaderboard. - Do not claim ELF broadly beats qmd. - Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win. -- Do not claim ELF beats mem0/OpenMemory on history, UI/export, hosted behavior, or - graph memory. +- Do not claim ELF beats mem0/OpenMemory on preference history, UI/export, hosted + behavior, or graph memory. The local OSS correction-history scenario is currently + an ELF loss, while OpenMemory UI/export, hosted behavior, and graph memory remain + outside measured local OSS evidence. - Do not claim ELF beats OpenViking on staged context trajectory. - Do not claim ELF beats Letta on core-vs-archival memory. - Do not claim graph/RAG parity from smoke-only evidence. 
diff --git a/docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md b/docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md index 368bbb86..63b44b2b 100644 --- a/docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md +++ b/docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md @@ -14,6 +14,12 @@ gates. This is benchmark/report evidence only. No ELF retrieval, ranking, memory-quality, or service behavior optimization is implemented here. +Update after XY-924: mem0/OpenMemory history and local SDK export-style readback are +now measured in +`2026-06-11-mem0-openmemory-history-ui-export-report.md`. The basic lifecycle result +in this report remains valid, but the mem0 history/UI rows below are historical +pre-XY-924 gaps and must not be treated as the current complete mem0 comparison. + The updated external adapter manifest now includes scenario-level judgments for the first-generation OSS memory projects. These judgments are intentionally narrower than suite passes: diff --git a/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md b/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md new file mode 100644 index 00000000..7ccef030 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md @@ -0,0 +1,148 @@ +# mem0/OpenMemory History and UI Export Report - June 11, 2026 + +Goal: Add scenario-level mem0/OpenMemory history, personalization, deletion-audit, +and export-readback evidence without promoting basic lifecycle smoke into UI or +hosted Platform claims. +Read this when: You need the current XY-924 comparison between ELF and +mem0/OpenMemory for entity-scoped history, preference correction, deletion audit, +personalization, OpenMemory inspection/export, hosted Platform export, or optional +graph memory. 
+Inputs: Fresh scoped mem0 Docker baseline run, refreshed real-world external adapter +manifest, generated real-world memory report, and the June 11 first-generation, +temporal/history, and competitor-strength reports. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, +`scripts/live-baseline-benchmark.sh`, and +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. +Outputs: Per-scenario outcomes using `win`, `tie`, `loss`, `not_tested`, `blocked`, +and `non_goal`, plus command and artifact evidence for each measured claim. + +## Executive Judgment + +The XY-924 objective is now encoded for the reproducible local OSS surface. + +mem0/OpenMemory now has fresh local OSS evidence for behavior beyond the basic +lifecycle smoke: + +- `preference_correction_history`: `pass` +- `entity_scoped_personalization`: `pass` +- `local_get_all_export_readback`: `pass` +- `delete_history_audit_readback`: `pass` + +The comparison is intentionally narrower than a hosted/OpenMemory product verdict. +The local run measures the mem0 OSS SDK and local FastEmbed/Qdrant/history paths in +Docker. It does not launch the OpenMemory web UI, does not exercise hosted mem0 +Platform export jobs, and does not enable optional graph memory. + +## Fresh Evidence + +| Command | Result | Runtime | Artifact | +| --- | --- | ---: | --- | +| `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `pass`; mem0 `8/8` encoded checks pass | 42.89 seconds wall; 41 seconds project runtime | `tmp/live-baseline/live-baseline-report.json`, `tmp/live-baseline/mem0-checks.json` | +| `cargo make real-world-memory` | `pass`; refreshed external adapter report published | 220.57 seconds | `tmp/real-world-memory/real-world-memory-report.json`, `tmp/real-world-memory/real-world-memory-report.md` | + +Fresh mem0 run id: `live-baseline-20260611105855`. 
+ +Generated external adapter summary: + +- Scenario statuses: `unsupported=2`, `blocked=2`, `wrong_result=1`, + `lifecycle_fail=1`, `pass=9`, `not_encoded=3`. +- Legacy ELF positions: `wins=2`, `ties=4`, `loses=1`, `untested=11`. +- Normalized comparison outcomes: `win=2`, `tie=4`, `loss=1`, + `not_tested=8`, `blocked=1`, `non_goal=2`. + +## Scenario Outcomes + +| Scenario | mem0/OpenMemory evidence | ELF comparison outcome | Status | Command | Artifact | +| --- | --- | --- | --- | --- | --- | +| Basic local lifecycle | mem0 passes same-corpus retrieval, update, delete, and cold-start reload in the prior first-generation baseline. | `tie` | `pass` | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `tmp/live-baseline/live-baseline-report.json` | +| Preference correction history | `Memory.history` preserves old and current preference records; search returns only the current correction. | `loss` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | +| Entity-scoped personalization | `search()` with `user_id`, `agent_id`, and `run_id` filters returns the ELF-scoped preference and omits a PubFi-scoped preference. | `tie` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | +| Delete audit readback | `Memory.history` exposes a `DELETE` event and post-delete search suppresses the deleted memory. | `tie` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | +| Local SDK export-style readback | `Memory.get_all` returns the current scoped preference and omits the other scope. | `not_tested` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | +| OpenMemory UI/export readback | No local UI/dashboard export flow is launched by the Docker runner. 
| `blocked` | `blocked` | Not run; outside current local runner. | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| Hosted mem0 Platform export | Hosted Platform export is outside local OSS evidence. | `non_goal` | `unsupported` | Not run; local OSS comparison only. | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| Optional graph memory | Graph memory is not enabled in the default local OSS run. | `non_goal` | `not_encoded` | Not run; opt-in scenario gate. | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | + +## Evidence Details + +The fresh mem0 check artifact records eight passing checks: + +- `same_corpus_retrieval` +- `update_replaces_note_text` +- `preference_correction_history` +- `entity_scoped_personalization` +- `local_get_all_export_readback` +- `delete_suppresses_retrieval` +- `delete_history_audit_readback` +- `cold_start_recovery_search` + +The `preference_correction_history` check verifies all of: + +- history is available; +- history contains the original preference; +- history contains the corrected preference; +- search contains the corrected preference; +- search omits the old preference. + +The `delete_history_audit_readback` check verifies all of: + +- history is available; +- history contains a delete event; +- search suppresses the deleted memory. + +The local SDK export-style readback check is intentionally named separately from UI +export. It only proves local `get_all` scoped readback through the OSS SDK. + +## Source And Product Boundary + +Official mem0 documentation distinguishes the OSS/self-hosted surface from hosted +Platform API paths. The OSS REST page documents CRUD/search/update/delete/reset +operations by `user_id`, `agent_id`, or `run_id`, an OpenAPI explorer at `/docs`, and +memory history endpoints. The export guide distinguishes bulk `get_all()`, semantic +search, structured exports, and Platform UI exports. 
+ +This report uses those docs only to set the claim boundary: + +- local OSS SDK `history`, `search`, and `get_all` behavior is measurable here; +- OpenMemory browser/dashboard export is not measured here; +- hosted Platform export is a `non_goal` for this local OSS lane; +- optional graph memory remains an opt-in scenario, not a default pass/fail claim. + +References: + +- Mem0 OSS REST API Server: `https://docs.mem0.ai/open-source/features/rest-api` +- Mem0 Export Stored Memories: `https://docs.mem0.ai/cookbooks/essentials/exporting-memories` + +## Claim Boundaries + +Allowed: + +- mem0/OpenMemory local OSS passes the new encoded history, correction, + personalization, deletion-audit, and local `get_all` readback checks in run + `live-baseline-20260611105855`. +- ELF currently has a measured `loss` against mem0 on the preference correction + history dimension because the June 11 temporal/history report records ELF's live + memory-evolution preference job as `wrong_result`. +- ELF and mem0 currently `tie` on the encoded entity-scoped personalization and + delete-audit surfaces. +- OpenMemory UI/export readback is `blocked` until the runner launches and inspects + the UI/export flow. +- Hosted mem0 Platform export and optional graph memory are `non_goal` for this + local OSS comparison. + +Not allowed: + +- Do not reuse the basic lifecycle pass as history, UI, hosted, or graph-memory + evidence. +- Do not claim OpenMemory UI/export quality from local SDK `get_all`. +- Do not claim hosted mem0 Platform behavior from the local OSS run. +- Do not treat optional graph memory as a default mem0 pass or ELF loss. +- Do not convert `blocked`, `unsupported`, `not_encoded`, or `non_goal` scenarios + into wins or losses. + +## Follow-Up Gate + +The next fair UI/export comparison requires a bounded runner that starts OpenMemory, +loads the same local memories, captures authenticated inspection/export readback, and +publishes a browser/API artifact. 
That is separate from the local SDK `get_all` +export-style readback added here. diff --git a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md index dd86fde4..d0749918 100644 --- a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md +++ b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md @@ -17,6 +17,14 @@ The overall goal is not complete. ELF does not yet have complete, comparable benchmark wins across all tracked memory projects and all user-important memory scenarios. +Update after XY-924: mem0/OpenMemory local OSS history and local SDK export-style +readback are now measured in +`2026-06-11-mem0-openmemory-history-ui-export-report.md`. That report records mem0 +passes for preference correction history, entity-scoped personalization, deletion +audit history, and local `get_all` readback, while keeping OpenMemory UI/export +blocked and hosted Platform export plus optional graph memory as local-lane +non-goals. + The current evidence supports a narrower judgment: - ELF remains a strong personal-production foundation because its core source of diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index efab4bb0..f6795dfb 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -92,6 +92,11 @@ cleanup, use `docs/guide/single_user_production.md`. competitor-strength adoption report with the bounded personal-production decision, scenario-level win/tie/loss/not-tested matrix, claim boundaries, and optimization issue queue. +- `2026-06-11-mem0-openmemory-history-ui-export-report.md`: XY-924 + mem0/OpenMemory local OSS history, preference-correction, deletion-audit, + personalization, and export-readback comparison with normalized + win/tie/loss/not-tested/blocked/non-goal outcomes and explicit hosted/UI/graph + non-claims. 
- `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world agent memory benchmark contract, including suite taxonomy, typed report states, knowledge-compilation fixture tasks, and the production-ops fixture target. diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index fdc2f571..5bb56574 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -175,10 +175,15 @@ Each `adapters[]` record MUST include: - `suites`: array of real-world suite coverage records with `suite_id`, `status`, and `evidence`. - `scenarios`: optional array of scenario judgment records with `scenario_id`, - optional `suite_id`, `status`, `elf_position`, `evidence`, and optional `command` - and `artifact`. `elf_position` MUST be one of `wins`, `ties`, `loses`, or - `untested`. Scenario judgments are report inputs for dimension-level comparison; - they MUST NOT convert live-baseline-only evidence into real-world suite pass claims. + optional `suite_id`, `status`, `elf_position`, optional `comparison_outcome`, + `evidence`, and optional `command` and `artifact`. `elf_position` MUST be one of + `wins`, `ties`, `loses`, or `untested`. `comparison_outcome`, when present, MUST be + one of `win`, `tie`, `loss`, `not_tested`, `blocked`, or `non_goal`. Reports SHOULD + derive `comparison_outcome` from `elf_position` when omitted, but SHOULD use the + explicit field for scenarios where the legacy ELF-relative position is less precise + than the report outcome. Scenario judgments are report inputs for dimension-level + comparison; they MUST NOT convert live-baseline-only evidence into real-world suite + pass claims. - `evidence`: array of evidence pointers with `kind`, `ref`, and `status`. - `notes`: optional bounded explanatory strings. - `follow_up`: optional `title` and `reason`. 
@@ -580,7 +585,9 @@ Reports MUST include: - external adapter coverage when an external adapter manifest is loaded, preserving `fixture_backed`, `live_baseline_only`, `live_real_world`, `research_gate`, `real`, `mocked`, `unsupported`, `blocked`, `incomplete`, `wrong_result`, - `lifecycle_fail`, `pass`, and `not_encoded` distinctions. + `lifecycle_fail`, `pass`, and `not_encoded` distinctions. Scenario summaries MUST + preserve status counts, legacy `elf_position` counts, and normalized + `comparison_outcome` counts when scenario judgments are present. Reports that encode `memory_evolution` jobs SHOULD also include stale-answer counts, conflict detection counts, update rationale availability, and temporal-validity diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh index fe607648..15365610 100755 --- a/scripts/live-baseline-benchmark.sh +++ b/scripts/live-baseline-benchmark.sh @@ -2073,6 +2073,26 @@ project_mem0() { "status": "real", "surface": "new Memory.from_config over the same local Qdrant/history paths" }, + "preference_history": { + "status": "real", + "surface": "Memory.history after a local preference correction update" + }, + "entity_scope_personalization": { + "status": "real", + "surface": "Memory.add/search with user_id, agent_id, and run_id filters" + }, + "deletion_audit": { + "status": "real", + "surface": "Memory.history after Memory.delete" + }, + "local_export_readback": { + "status": "real", + "surface": "Memory.get_all over local OSS storage for inspection/export-style readback" + }, + "openmemory_ui_export": { + "status": "blocked", + "surface": "the Docker live-baseline runner does not launch the OpenMemory web UI or hosted Platform export flow" + }, "scale_stress_profile": { "status": "incomplete", "surface": "smoke lifecycle path is encoded; scale/stress timing and resource thresholds are not yet calibrated" @@ -2170,21 +2190,103 @@ for text, source in docs: def result_entries(search): - return 
search.get("results", []) if isinstance(search, dict) else [] + if isinstance(search, dict): + for key in ("results", "memories"): + entries = search.get(key) + if isinstance(entries, list): + return entries + if isinstance(search, list): + return search + return [] -def search_memory(memory_instance, query_text): +def search_memory(memory_instance, query_text, filters=None): return memory_instance.search( query_text, - filters={"user_id": "elf-bench"}, + filters=filters or {"user_id": "elf-bench"}, top_k=top_k, threshold=0.0, ) +def json_lower(value): + return json.dumps(value, default=str).lower() + + +def contains_terms(value, terms): + text = json_lower(value) + return all(term.lower() in text for term in terms) + + +def first_memory_id(add_result): + results = add_result.get("results", []) if isinstance(add_result, dict) else [] + if results and isinstance(results[0], dict): + return results[0].get("id") + return None + + +def memory_history(memory_instance, memory_id): + if not hasattr(memory_instance, "history"): + return { + "available": False, + "history": None, + "error": "Memory.history is unavailable", + } + try: + return { + "available": True, + "history": memory_instance.history(memory_id), + "error": None, + } + except Exception as exc: + return { + "available": False, + "history": None, + "error": repr(exc), + } + + +def get_all_memories(memory_instance, filters): + if not hasattr(memory_instance, "get_all"): + return { + "available": False, + "memories": None, + "error": "Memory.get_all is unavailable", + } + try: + return { + "available": True, + "memories": memory_instance.get_all(filters=filters), + "error": None, + } + except TypeError: + try: + return { + "available": True, + "memories": memory_instance.get_all( + user_id=filters.get("user_id"), + agent_id=filters.get("agent_id"), + run_id=filters.get("run_id"), + ), + "error": None, + } + except Exception as exc: + return { + "available": False, + "memories": None, + "error": repr(exc), + } + 
except Exception as exc: + return { + "available": False, + "memories": None, + "error": repr(exc), + } + + def matches_expected(search, expected_doc, expected_terms): for entry in result_entries(search): - entry_text = json.dumps(entry, default=str).lower() + entry_text = json_lower(entry) source = ((entry.get("metadata") or {}).get("source") or "") if source == expected_doc and all( term.lower() in entry_text for term in expected_terms @@ -2304,6 +2406,152 @@ else: ) ) +history_filters = { + "user_id": "elf-history-user", + "agent_id": "elf-history-agent", + "run_id": "elf-project", +} +old_preference = ( + "Preference v1 for ELF: provide verbose tutorial explanations for every answer." +) +current_preference = ( + "Preference v2 for ELF: answer concisely with evidence-linked bullets." +) +preference_add = memory.add( + old_preference, + user_id=history_filters["user_id"], + agent_id=history_filters["agent_id"], + run_id=history_filters["run_id"], + metadata={"source": "preference-history", "kind": "preference"}, + infer=False, +) +preference_id = first_memory_id(preference_add) +if not preference_id: + checks.append( + make_check( + "preference_correction_history", + "incomplete", + "The preference memory id was not returned, so correction history could not be inspected.", + {"add_result": preference_add}, + ) + ) +else: + preference_update = memory.update( + preference_id, + current_preference, + metadata={"source": "preference-history", "kind": "preference"}, + ) + preference_history = memory_history(memory, preference_id) + preference_search = search_memory( + memory, + "How should answers be written for the ELF project?", + history_filters, + ) + history_has_old = contains_terms(preference_history["history"], ["verbose tutorial"]) + history_has_current = contains_terms( + preference_history["history"], + ["concise", "evidence-linked"], + ) + search_has_current = contains_terms( + result_entries(preference_search), + ["concise", "evidence-linked"], + ) + 
search_omits_old = "verbose tutorial" not in json_lower(result_entries(preference_search)) + if not preference_history["available"]: + preference_status = "blocked" + preference_reason = "Memory.history could not be read for the updated preference memory." + elif history_has_old and history_has_current and search_has_current and search_omits_old: + preference_status = "pass" + preference_reason = "mem0 history preserved the old and current preference while search returned only the current correction." + else: + preference_status = "lifecycle_fail" + preference_reason = "mem0 did not expose a clean preference correction chain with current-only search readback." + checks.append( + make_check( + "preference_correction_history", + preference_status, + preference_reason, + { + "memory_id": preference_id, + "add_result": preference_add, + "update_result": preference_update, + "history_available": preference_history["available"], + "history_error": preference_history["error"], + "history_has_old": history_has_old, + "history_has_current": history_has_current, + "search_has_current": search_has_current, + "search_omits_old": search_omits_old, + "history": preference_history["history"], + "search": preference_search, + }, + ) + ) + +other_scope_add = memory.add( + "Preference for PubFi: answer in long-form Chinese prose with no bullets.", + user_id=history_filters["user_id"], + agent_id=history_filters["agent_id"], + run_id="pubfi-project", + metadata={"source": "pubfi-preference", "kind": "preference"}, + infer=False, +) +entity_search = search_memory( + memory, + "What answer style preference applies here?", + history_filters, +) +entity_search_text = json_lower(result_entries(entity_search)) +entity_has_current = "evidence-linked bullets" in entity_search_text +entity_omits_other = "long-form chinese" not in entity_search_text +checks.append( + make_check( + "entity_scoped_personalization", + "pass" if entity_has_current and entity_omits_other else "lifecycle_fail", + 
"mem0 search respected user_id, agent_id, and run_id filters for the current preference scope." + if entity_has_current and entity_omits_other + else "mem0 entity-scoped search did not isolate the current preference from another run/project scope.", + { + "current_memory_id": preference_id, + "other_scope_add": other_scope_add, + "filters": history_filters, + "has_current": entity_has_current, + "omits_other_scope": entity_omits_other, + "search": entity_search, + }, + ) +) + +export_readback = get_all_memories(memory, history_filters) +export_has_current = contains_terms( + export_readback["memories"], + ["concise", "evidence-linked"], +) +export_omits_other = "long-form chinese" not in json_lower(export_readback["memories"]) +if not export_readback["available"]: + export_status = "blocked" + export_reason = "Memory.get_all could not be read for local OSS inspection/export-style evidence." +elif export_has_current and export_omits_other: + export_status = "pass" + export_reason = "mem0 get_all returned local export-style readback for the current scoped preference without the other scope." +else: + export_status = "lifecycle_fail" + export_reason = "mem0 get_all did not return the current scoped preference cleanly for local export-style readback." 
+checks.append( + make_check( + "local_get_all_export_readback", + export_status, + export_reason, + { + "available": export_readback["available"], + "error": export_readback["error"], + "filters": history_filters, + "has_current": export_has_current, + "omits_other_scope": export_omits_other, + "memories": export_readback["memories"], + }, + ) +) + delete_query = next( ( query @@ -2352,6 +2600,36 @@ else: }, ) ) + delete_history = memory_history(memory, delete_id) + delete_history_has_event = delete_history["available"] and contains_terms( + delete_history["history"], + ["delete"], + ) + if not delete_history["available"]: + delete_audit_status = "blocked" + delete_audit_reason = "Memory.history could not be read after delete, so deletion audit readback is blocked." + elif delete_history_has_event and not deleted_still_matched: + delete_audit_status = "pass" + delete_audit_reason = "mem0 history exposed a delete event and search suppressed the deleted memory." + else: + delete_audit_status = "lifecycle_fail" + delete_audit_reason = "mem0 did not expose a delete audit event while suppressing the deleted memory." + checks.append( + make_check( + "delete_history_audit_readback", + delete_audit_status, + delete_audit_reason, + { + "memory_id": delete_id, + "source": delete_source, + "history_available": delete_history["available"], + "history_error": delete_history["error"], + "history_has_delete_event": delete_history_has_event, + "deleted_still_matched": deleted_still_matched, + "history": delete_history["history"], + }, + ) + ) del memory gc.collect() @@ -2429,7 +2707,7 @@ PY else retrieval_status="retrieval_wrong_result" fi - json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "pip install -e . 
fastembed ollama; Memory.from_config; add/update/delete/search" + json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "pip install -e . fastembed ollama; Memory.from_config; add/update/delete/history/get_all/search" return fi json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "mem0 command completed, but did not produce a valid benchmark result" "${project}.log" "pip install -e . fastembed ollama; Memory.from_config; add infer=false; search" From 5af613ac567d0bff214c7ffdcbfae15dece8bd62 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Thu, 11 Jun 2026 19:15:18 +0800 Subject: [PATCH 2/4] {"schema":"decodex/commit/1","summary":"Tighten mem0 deletion-audit evidence","authority":"XY-924"} --- README.md | 2 +- .../memory_projects_manifest.json | 24 +++++++++---------- .../tests/real_world_job_benchmark.rs | 21 ++++++++++++++++ ...em0-openmemory-history-ui-export-report.md | 14 +++++------ scripts/live-baseline-benchmark.sh | 23 ++++++++++++++++-- 5 files changed, 62 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 0d3fd2ef..c79a217b 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ provider-backed ELF evidence was required. - mem0/OpenMemory history follow-up after XY-924: the local OSS mem0 adapter now passes encoded preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history in - `live-baseline-20260611105855`. The comparison records ELF as a loss on preference + `live-baseline-20260611111119`. 
The comparison records ELF as a loss on preference correction history, ties on scoped personalization and delete audit, `not_tested` for local SDK export-style parity, `blocked` for OpenMemory UI/export, and `non_goal` for hosted Platform export and optional graph memory in the local OSS diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index cfc54fb4..9812feae 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -608,7 +608,7 @@ }, "run": { "status": "pass", - "evidence": "Fresh scoped baseline run live-baseline-20260611105855 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded checks.", + "evidence": "Fresh scoped baseline run live-baseline-20260611111119 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded checks.", "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", "artifact": "tmp/live-baseline/live-baseline-report.json" }, @@ -626,7 +626,7 @@ { "capability": "same_corpus_retrieval", "status": "pass", - "evidence": "Fresh scoped baseline run live-baseline-20260611105855 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." + "evidence": "Fresh scoped baseline run live-baseline-20260611111119 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." 
}, { "capability": "local_lifecycle_update_delete_reload", @@ -708,9 +708,9 @@ "status": "pass", "elf_position": "loses", "comparison_outcome": "loss", - "evidence": "Fresh scoped baseline run live-baseline-20260611105855 reports mem0 preference_correction_history as pass. The June 11 temporal report records ELF live memory-evolution preference as wrong_result, so the current measured comparison is an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", - "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", - "artifact": "tmp/live-baseline/mem0-checks.json" + "evidence": "Fresh scoped baseline run live-baseline-20260611111119 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" }, { "scenario_id": "entity_scoped_personalization", @@ -718,9 +718,9 @@ "status": "pass", "elf_position": "ties", "comparison_outcome": "tie", - "evidence": "Fresh scoped baseline run live-baseline-20260611105855 reports mem0 entity_scoped_personalization as pass. 
Existing live real-world evidence records ELF and qmd passing the encoded personalization slice, so this is a measured tie on the current scoped-preference surface.", - "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", - "artifact": "tmp/live-baseline/mem0-checks.json" + "evidence": "Fresh scoped baseline run live-baseline-20260611111119 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" }, { "scenario_id": "delete_audit_readback", @@ -728,9 +728,9 @@ "status": "pass", "elf_position": "ties", "comparison_outcome": "tie", - "evidence": "Fresh scoped baseline run live-baseline-20260611105855 reports mem0 delete_history_audit_readback as pass. The June 11 temporal report records ELF passing the delete/TTL tombstone job, so the current measured delete-audit comparison is a tie.", - "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", - "artifact": "tmp/live-baseline/mem0-checks.json" + "evidence": "Fresh scoped baseline run live-baseline-20260611111119 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. 
The current measured delete-audit comparison is a tie.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" }, { "scenario_id": "local_get_all_export_readback", @@ -738,7 +738,7 @@ "status": "pass", "elf_position": "untested", "comparison_outcome": "not_tested", - "evidence": "Fresh scoped baseline run live-baseline-20260611105855 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", + "evidence": "Fresh scoped baseline run live-baseline-20260611111119 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", "artifact": "tmp/live-baseline/mem0-checks.json" }, diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 6ef0f0d3..402fafff 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -2319,6 +2319,27 @@ fn external_adapter_markdown_omits_scenario_summary_when_manifest_has_no_scenari Ok(()) } +#[test] +fn mem0_delete_audit_probe_requires_explicit_delete_history_event() -> Result<()> { + let script = + fs::read_to_string(workspace_root()?.join("scripts").join("live-baseline-benchmark.sh"))?; + + assert!(script.contains("def history_has_event")); + assert!(script.contains("str(entry.get(\"event\", \"\")).upper() == expected")); + assert!( + script.contains( + "history_has_event(\n delete_history[\"history\"],\n \"DELETE\"," 
+ ) + ); + assert!( + !script.contains( + "contains_terms(\n delete_history[\"history\"],\n [\"delete\"]," + ) + ); + + Ok(()) +} + #[test] fn knowledge_json_report_renders_markdown_metrics() -> Result<()> { let report = run_json_report_from(knowledge_fixture_dir())?; diff --git a/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md b/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md index 7ccef030..627465b2 100644 --- a/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md +++ b/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md @@ -37,10 +37,10 @@ Platform export jobs, and does not enable optional graph memory. | Command | Result | Runtime | Artifact | | --- | --- | ---: | --- | -| `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `pass`; mem0 `8/8` encoded checks pass | 42.89 seconds wall; 41 seconds project runtime | `tmp/live-baseline/live-baseline-report.json`, `tmp/live-baseline/mem0-checks.json` | -| `cargo make real-world-memory` | `pass`; refreshed external adapter report published | 220.57 seconds | `tmp/real-world-memory/real-world-memory-report.json`, `tmp/real-world-memory/real-world-memory-report.md` | +| `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `pass`; mem0 `8/8` encoded checks pass | 35.50 seconds wall; 33 seconds project runtime | `tmp/live-baseline/live-baseline-report.json`, `tmp/live-baseline/mem0-checks.json` | +| `cargo make real-world-memory` | `pass`; refreshed external adapter report published | 10.18 seconds | `tmp/real-world-memory/real-world-memory-report.json`, `tmp/real-world-memory/real-world-memory-report.md` | -Fresh mem0 run id: `live-baseline-20260611105855`. +Fresh mem0 run id: `live-baseline-20260611111119`. 
Generated external adapter summary: @@ -55,9 +55,9 @@ Generated external adapter summary: | Scenario | mem0/OpenMemory evidence | ELF comparison outcome | Status | Command | Artifact | | --- | --- | --- | --- | --- | --- | | Basic local lifecycle | mem0 passes same-corpus retrieval, update, delete, and cold-start reload in the prior first-generation baseline. | `tie` | `pass` | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `tmp/live-baseline/live-baseline-report.json` | -| Preference correction history | `Memory.history` preserves old and current preference records; search returns only the current correction. | `loss` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | -| Entity-scoped personalization | `search()` with `user_id`, `agent_id`, and `run_id` filters returns the ELF-scoped preference and omits a PubFi-scoped preference. | `tie` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | -| Delete audit readback | `Memory.history` exposes a `DELETE` event and post-delete search suppresses the deleted memory. | `tie` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | +| Preference correction history | `Memory.history` preserves old and current preference records; search returns only the current correction. | `loss` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| Entity-scoped personalization | `search()` with `user_id`, `agent_id`, and `run_id` filters returns the ELF-scoped preference and omits a PubFi-scoped preference. 
| `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Delete audit readback | `Memory.history` exposes a `DELETE` event and post-delete search suppresses the deleted memory. | `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | | Local SDK export-style readback | `Memory.get_all` returns the current scoped preference and omits the other scope. | `not_tested` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | | OpenMemory UI/export readback | No local UI/dashboard export flow is launched by the Docker runner. | `blocked` | `blocked` | Not run; outside current local runner. | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | | Hosted mem0 Platform export | Hosted Platform export is outside local OSS evidence. | `non_goal` | `unsupported` | Not run; local OSS comparison only. | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | @@ -119,7 +119,7 @@ Allowed: - mem0/OpenMemory local OSS passes the new encoded history, correction, personalization, deletion-audit, and local `get_all` readback checks in run - `live-baseline-20260611105855`. + `live-baseline-20260611111119`. - ELF currently has a measured `loss` against mem0 on the preference correction history dimension because the June 11 temporal/history report records ELF's live memory-evolution preference job as `wrong_result`. 
diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh index 15365610..d899677b 100755 --- a/scripts/live-baseline-benchmark.sh +++ b/scripts/live-baseline-benchmark.sh @@ -2218,6 +2218,25 @@ def contains_terms(value, terms): return all(term.lower() in text for term in terms) +def history_entries(history): + if isinstance(history, dict): + for key in ("results", "history", "memories"): + entries = history.get(key) + if isinstance(entries, list): + return entries + if isinstance(history, list): + return history + return [] + + +def history_has_event(history, expected_event): + expected = expected_event.upper() + return any( + isinstance(entry, dict) and str(entry.get("event", "")).upper() == expected + for entry in history_entries(history) + ) + + def first_memory_id(add_result): results = add_result.get("results", []) if isinstance(add_result, dict) else [] if results and isinstance(results[0], dict): @@ -2601,9 +2620,9 @@ else: ) ) delete_history = memory_history(memory, delete_id) - delete_history_has_event = delete_history["available"] and contains_terms( + delete_history_has_event = delete_history["available"] and history_has_event( delete_history["history"], - ["delete"], + "DELETE", ) if not delete_history["available"]: delete_audit_status = "blocked" From 6b405e933ad371996b779e8a027c86fe3a84871a Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Thu, 11 Jun 2026 19:25:06 +0800 Subject: [PATCH 3/4] {"schema":"decodex/commit/1","summary":"Make chunking tests offline deterministic","authority":"XY-924"} --- packages/elf-chunking/src/lib.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/packages/elf-chunking/src/lib.rs b/packages/elf-chunking/src/lib.rs index f1209da2..00c25670 100644 --- a/packages/elf-chunking/src/lib.rs +++ b/packages/elf-chunking/src/lib.rs @@ -128,10 +128,14 @@ fn overlap_tail(text: &str, overlap_tokens: u32, tokenizer: &Tokenizer) -> Strin mod tests { use 
crate::ChunkingConfig; + fn local_dev_tokenizer_path() -> std::path::PathBuf { + std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("../../config/local/tokenizer.wordlevel.json") + } + #[test] fn loads_local_dev_tokenizer_fixture() { - let path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("../../config/local/tokenizer.wordlevel.json"); + let path = local_dev_tokenizer_path(); let tokenizer = crate::load_tokenizer(path.to_str().expect("Path must be valid UTF-8")) .expect("Local dev tokenizer must load."); let cfg = ChunkingConfig { max_tokens: 10, overlap_tokens: 2 }; @@ -143,11 +147,14 @@ mod tests { #[test] fn splits_into_chunks_with_overlap() { - let cfg = ChunkingConfig { max_tokens: 10, overlap_tokens: 2 }; - let tokenizer = crate::load_tokenizer("Qwen/Qwen3-Embedding-8B").unwrap(); + let cfg = ChunkingConfig { max_tokens: 2, overlap_tokens: 1 }; + let path = local_dev_tokenizer_path(); + let tokenizer = crate::load_tokenizer(path.to_str().expect("Path must be valid UTF-8")) + .expect("Local dev tokenizer must load."); let chunks = crate::split_text("One. Two. Three. 
Four.", &cfg, &tokenizer); - assert!(!chunks.is_empty()); + assert!(chunks.len() > 1); assert!(chunks[0].text.contains("One")); + assert!(chunks.last().expect("Chunk should exist").text.contains("Four")); } } From 843f100877ded409083c5fcbac5b3f618c61e59b Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Thu, 11 Jun 2026 19:33:55 +0800 Subject: [PATCH 4/4] {"schema":"decodex/commit/1","summary":"Sync mem0 history review evidence","authority":"XY-924"} --- README.md | 2 +- .../memory_projects_manifest.json | 14 ++++---- .../tests/real_world_job_benchmark.rs | 6 ++++ ...em0-openmemory-history-ui-export-report.md | 16 ++++++---- ...-temporal-history-competitor-gap-report.md | 18 ++++++----- ...1-competitor-strength-adoption-report.json | 32 +++++++++++-------- ...emporal-history-competitor-gap-report.json | 20 ++++++++---- scripts/live-baseline-benchmark.sh | 21 ++++++++++-- 8 files changed, 86 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index c79a217b..1ec443f3 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ provider-backed ELF evidence was required. - mem0/OpenMemory history follow-up after XY-924: the local OSS mem0 adapter now passes encoded preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history in - `live-baseline-20260611111119`. The comparison records ELF as a loss on preference + `live-baseline-20260611113003`. 
The comparison records ELF as a loss on preference correction history, ties on scoped personalization and delete audit, `not_tested` for local SDK export-style parity, `blocked` for OpenMemory UI/export, and `non_goal` for hosted Platform export and optional graph memory in the local OSS diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 9812feae..7bcdef8d 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -608,7 +608,7 @@ }, "run": { "status": "pass", - "evidence": "Fresh scoped baseline run live-baseline-20260611111119 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded checks.", + "evidence": "Fresh scoped baseline run live-baseline-20260611113003 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded checks.", "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", "artifact": "tmp/live-baseline/live-baseline-report.json" }, @@ -626,7 +626,7 @@ { "capability": "same_corpus_retrieval", "status": "pass", - "evidence": "Fresh scoped baseline run live-baseline-20260611111119 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." + "evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." 
}, { "capability": "local_lifecycle_update_delete_reload", @@ -636,7 +636,7 @@ { "capability": "preference_correction_history", "status": "pass", - "evidence": "The fresh scoped run reports preference_correction_history as pass: Memory.history preserved ADD and UPDATE records with old and current preference text, and search returned only the current correction." + "evidence": "The fresh scoped run reports preference_correction_history as pass: Memory.history preserved explicit ADD and UPDATE records with old and current preference text, and search returned only the current correction." }, { "capability": "entity_scoped_personalization", @@ -708,7 +708,7 @@ "status": "pass", "elf_position": "loses", "comparison_outcome": "loss", - "evidence": "Fresh scoped baseline run live-baseline-20260611111119 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. 
The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" }, @@ -718,7 +718,7 @@ "status": "pass", "elf_position": "ties", "comparison_outcome": "tie", - "evidence": "Fresh scoped baseline run live-baseline-20260611111119 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", + "evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" }, @@ -728,7 +728,7 @@ "status": "pass", "elf_position": "ties", "comparison_outcome": "tie", - "evidence": "Fresh scoped baseline run live-baseline-20260611111119 reports mem0 delete_history_audit_readback as pass. 
ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", + "evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" }, @@ -738,7 +738,7 @@ "status": "pass", "elf_position": "untested", "comparison_outcome": "not_tested", - "evidence": "Fresh scoped baseline run live-baseline-20260611111119 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", + "evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 local_get_all_export_readback as pass. 
This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", "artifact": "tmp/live-baseline/mem0-checks.json" }, diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 402fafff..b76a1ff2 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -2326,6 +2326,12 @@ fn mem0_delete_audit_probe_requires_explicit_delete_history_event() -> Result<() assert!(script.contains("def history_has_event")); assert!(script.contains("str(entry.get(\"event\", \"\")).upper() == expected")); + assert!(script.contains( + "history_has_event(\n preference_history[\"history\"],\n \"ADD\"," + )); + assert!(script.contains( + "history_has_event(\n preference_history[\"history\"],\n \"UPDATE\"," + )); assert!( script.contains( "history_has_event(\n delete_history[\"history\"],\n \"DELETE\"," diff --git a/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md b/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md index 627465b2..91d5dc15 100644 --- a/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md +++ b/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md @@ -37,12 +37,12 @@ Platform export jobs, and does not enable optional graph memory. 
| Command | Result | Runtime | Artifact | | --- | --- | ---: | --- | -| `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `pass`; mem0 `8/8` encoded checks pass | 35.50 seconds wall; 33 seconds project runtime | `tmp/live-baseline/live-baseline-report.json`, `tmp/live-baseline/mem0-checks.json` | -| `cargo make real-world-memory` | `pass`; refreshed external adapter report published | 10.18 seconds | `tmp/real-world-memory/real-world-memory-report.json`, `tmp/real-world-memory/real-world-memory-report.md` | +| `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `pass`; mem0 `8/8` encoded checks pass | 39.17 seconds wall; 36 seconds project runtime | `tmp/live-baseline/live-baseline-report.json`, `tmp/live-baseline/mem0-checks.json` | +| `cargo make real-world-memory` | `pass`; refreshed external adapter report published | 8.88 seconds | `tmp/real-world-memory/real-world-memory-report.json`, `tmp/real-world-memory/real-world-memory-report.md` | -Fresh mem0 run id: `live-baseline-20260611111119`. +Fresh mem0 run id: `live-baseline-20260611113003`. -Generated external adapter summary: +Generated external adapter summary for all external adapter manifest rows: - Scenario statuses: `unsupported=2`, `blocked=2`, `wrong_result=1`, `lifecycle_fail=1`, `pass=9`, `not_encoded=3`. @@ -50,12 +50,15 @@ Generated external adapter summary: - Normalized comparison outcomes: `win=2`, `tie=4`, `loss=1`, `not_tested=8`, `blocked=1`, `non_goal=2`. +mem0/OpenMemory rows in this report contain eight scenarios: `loss=1`, +`tie=3`, `not_tested=1`, `blocked=1`, and `non_goal=2`. + ## Scenario Outcomes | Scenario | mem0/OpenMemory evidence | ELF comparison outcome | Status | Command | Artifact | | --- | --- | --- | --- | --- | --- | | Basic local lifecycle | mem0 passes same-corpus retrieval, update, delete, and cold-start reload in the prior first-generation baseline. 
| `tie` | `pass` | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `tmp/live-baseline/live-baseline-report.json` | -| Preference correction history | `Memory.history` preserves old and current preference records; search returns only the current correction. | `loss` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| Preference correction history | `Memory.history` exposes explicit `ADD` and `UPDATE` preference records; search returns only the current correction. | `loss` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | | Entity-scoped personalization | `search()` with `user_id`, `agent_id`, and `run_id` filters returns the ELF-scoped preference and omits a PubFi-scoped preference. | `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | | Delete audit readback | `Memory.history` exposes a `DELETE` event and post-delete search suppresses the deleted memory. 
| `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | | Local SDK export-style readback | `Memory.get_all` returns the current scoped preference and omits the other scope. | `not_tested` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | @@ -81,6 +84,7 @@ The `preference_correction_history` check verifies all of: - history is available; - history contains the original preference; - history contains the corrected preference; +- history contains explicit `ADD` and `UPDATE` events; - search contains the corrected preference; - search omits the old preference. @@ -119,7 +123,7 @@ Allowed: - mem0/OpenMemory local OSS passes the new encoded history, correction, personalization, deletion-audit, and local `get_all` readback checks in run - `live-baseline-20260611111119`. + `live-baseline-20260611113003`. - ELF currently has a measured `loss` against mem0 on the preference correction history dimension because the June 11 temporal/history report records ELF's live memory-evolution preference job as `wrong_result`. diff --git a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md index d0749918..c93ebea8 100644 --- a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md +++ b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md @@ -134,9 +134,9 @@ the right snippets. | --- | --- | --- | --- | | Basic local lifecycle | mem0 update/delete/reload | Fresh Docker baseline: ELF `8/8`, mem0 `4/4`, combined `12/12` | ELF ties or exceeds the encoded smoke surface, but does not beat OpenMemory UI/history/hosted claims. 
| | Retrieval/debug | qmd transparent CLI, expansion/fusion/rerank/replay ergonomics | ELF/qmd live adapters pass retrieval suites; previous qmd debug profile exists | ELF is not clearly stronger. qmd remains the debug-UX bar. | -| Current-vs-historical memory | Graphiti/Zep temporal validity; mem0 history surfaces | ELF/qmd live memory-evolution wrong_result; Graphiti/Zep blocked; mem0 real-world history not encoded | ELF has a measured gap. It only narrowly beats qmd's current run. | +| Current-vs-historical memory | Graphiti/Zep temporal validity; mem0 history surfaces | ELF/qmd live memory-evolution wrong_result; Graphiti/Zep blocked; mem0 local OSS preference correction history now passes, but mem0 real-world prompt history is not encoded | ELF has a measured gap. It only narrowly beats qmd's current run and loses the local OSS preference-correction history scenario to mem0. | | Delete/tombstone lifecycle | ELF production ops and qmd local replay | ELF passes delete/TTL job; qmd misses tombstone | ELF has a narrow measured win over qmd on this job. | -| Entity preference history | mem0/OpenMemory | Only basic mem0 lifecycle smoke passed | Not comparable. Need mem0/OpenMemory history and UI/export benchmark. | +| Entity preference history | mem0/OpenMemory | XY-924 local OSS run passes mem0 preference correction history and entity-scoped personalization; OpenMemory UI/export remains blocked | ELF loses the preference-correction history scenario and ties the scoped-personalization scenario; no OpenMemory UI/export claim is allowed. | | Core-vs-archival memory | Letta core memory blocks versus archival memory | Research-only, no contained live output | Not comparable. Borrow design only. | | Context trajectory | OpenViking staged context and hierarchy | Existing adapter remains not encoded or wrong_result for trajectory | Not comparable. Need staged trajectory benchmark. 
| | Capture and continuity | agentmemory, claude-mem hooks/viewers | Existing adapters are baseline-only and undermeasured | Not comparable. Need capture/write-policy and work-resume adapters. | @@ -148,7 +148,7 @@ the right snippets. | Source | Best idea to absorb | Benchmark gate before any claim | | --- | --- | --- | | Graphiti/Zep | Validity windows, `valid_at`/`invalid_at`, current/historical/future fact separation, temporal relation provenance | Provider-backed Docker temporal smoke must map current, historical, and rationale facts to scored evidence ids. | -| mem0/OpenMemory | Entity-scoped memory history, user-visible lifecycle inspection, update/delete ergonomics | mem0/OpenMemory adapter must score preference history, correction, deletion, and UI/export readback. | +| mem0/OpenMemory | Entity-scoped memory history, user-visible lifecycle inspection, update/delete ergonomics | Local OSS history, correction, deletion, and SDK `get_all` readback are now scored; UI/export readback still needs a bounded OpenMemory runner. | | Letta | Always-loaded core memory blocks separated from archival search | Add core-vs-archival jobs for attachment scope, provenance, fallback, and stale-core avoidance. | | qmd | Local replay, candidate inspection, expansion/fusion/rerank debug knobs | ELF trace artifacts must show candidate generation, rerank, dropped evidence, conflict candidates, and replay commands. | | OpenViking | Staged context trajectory and hierarchy | Encode trajectory jobs after evidence-bearing same-corpus output passes. | @@ -176,17 +176,19 @@ claim that ELF has solved temporal memory. ### P0 - mem0/OpenMemory History Comparison -The fresh mem0 pass means the next useful comparison is no longer basic update/delete. -It should move to the product behavior users actually care about: +XY-924 moves the reproducible local OSS comparison past basic update/delete into +the product behavior users actually care about: 1. 
preference history across correction events; 2. entity-scoped memory lookup and update; -3. user-visible inspection/export of memory lifecycle; +3. local SDK inspection/export-style readback of memory lifecycle; 4. deletion versus historical audit readback; 5. optional graph-memory behavior only if the OSS path is reproducible in Docker. -Target benchmark: mem0/OpenMemory and ELF both run comparable history jobs; claims are -made per scenario, not per project brand. +Target benchmark status: local OSS history jobs are now encoded with per-scenario +claims. OpenMemory UI/export readback remains blocked until a UI runner exists, and +hosted Platform export plus optional graph memory remain non-goals for the local OSS +lane. ### P0 - qmd-Level Debugging And Replay diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 9226f5ca..11871923 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested: mem0/OpenMemory history/UI, OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation. The XY-923 follow-up now scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, rerank, and candidate-drop diagnosis remain untested." + "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export, hosted mem0 Platform behavior, OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation. 
mem0 local OSS preference history is now measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up now scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, rerank, and candidate-drop diagnosis remain untested." ] }, "evidence_class_terms": [ @@ -51,6 +51,11 @@ "artifact": "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", "claim": "mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result on same-corpus retrieval." }, + { + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", + "claim": "mem0 local OSS passes preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history; OpenMemory UI/export remains blocked and hosted Platform export remains non-goal." 
+ }, { "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke", "artifact": "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", @@ -142,8 +147,8 @@ "scenario_id": "memory_evolution_temporal_history", "title": "Memory evolution and temporal history", "outcome": "loss", - "evidence_classes": ["fixture_backed", "live_real_world", "wrong_result", "blocked"], - "measured_claim": "ELF fixture memory_evolution passes, but live ELF passes only the delete/TTL job and reports five wrong_result jobs where evidence is retrieved but current-vs-historical state is not reconciled.", + "evidence_classes": ["fixture_backed", "live_real_world", "live_baseline_only", "wrong_result", "blocked"], + "measured_claim": "ELF fixture memory_evolution passes, but live ELF passes only the delete/TTL job and reports five wrong_result jobs where evidence is retrieved but current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", "docs/research/2026-06-11-temporal-history-competitor-gap-report.json" @@ -180,8 +185,8 @@ "scenario_id": "operator_debugging_viewer_ux", "title": "Operator debugging/viewer UX", "outcome": "not_tested", - "evidence_classes": ["fixture_backed", "not_encoded", "research_gate"], - "measured_claim": "ELF fixture operator-debugging UX passes, but live trace/viewer scoring is not encoded and qmd/OpenMemory/claude-mem UX comparisons are unscored.", + "evidence_classes": ["fixture_backed", "live_baseline_only", "blocked", "not_encoded", "research_gate"], + "measured_claim": "ELF fixture operator-debugging UX passes. mem0 local SDK get_all readback is measured, but OpenMemory UI/export remains blocked and must not be inferred from SDK readback. 
Live trace/viewer scoring and qmd/OpenMemory/claude-mem UX comparisons remain unscored.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" @@ -232,13 +237,14 @@ "scenario_id": "personalization_scoped_preferences", "title": "Personalization and scoped preferences", "outcome": "tie", - "evidence_classes": ["fixture_backed", "live_real_world", "not_encoded"], - "measured_claim": "ELF and qmd both pass the single encoded live personalization job. mem0/OpenMemory and Letta personalization/history are not encoded.", + "evidence_classes": ["fixture_backed", "live_real_world", "live_baseline_only", "not_encoded"], + "measured_claim": "ELF and qmd both pass the single encoded live personalization job. mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md" ], - "follow_up_issues": ["XY-924", "XY-927"], - "caveat": "The tie does not prove entity history, UI readback, or long-term preference evolution." + "follow_up_issues": ["XY-927"], + "caveat": "The tie is scoped to encoded personalization and local OSS entity filters; OpenMemory UI readback and long-term preference evolution remain separate surfaces." }, { "scenario_id": "context_trajectory_hierarchical_retrieval", @@ -294,8 +300,8 @@ { "issue": "XY-924", "priority": "P0", - "state": "Backlog", - "gap": "mem0/OpenMemory history and UI-export comparison." 
+ "state": "Encoded local OSS history; UI/export still gated", + "gap": "mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export still needs a UI runner before any product-UX claim." }, { "issue": "XY-925", @@ -351,7 +357,7 @@ "not_allowed": [ "Do not claim ELF broadly beats qmd.", "Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win.", - "Do not claim ELF beats mem0/OpenMemory on history, UI/export, hosted behavior, or graph memory.", + "Do not claim ELF beats mem0/OpenMemory on preference history, UI/export, hosted behavior, or graph memory. The local OSS correction-history scenario is currently an ELF loss, while OpenMemory UI/export, hosted behavior, and graph memory remain outside measured local OSS evidence.", "Do not claim ELF beats OpenViking on staged context trajectory.", "Do not claim ELF beats Letta on core-vs-archival memory.", "Do not claim graph/RAG parity from smoke-only evidence.", diff --git a/docs/research/2026-06-11-temporal-history-competitor-gap-report.json b/docs/research/2026-06-11-temporal-history-competitor-gap-report.json index fe95e723..d9129ec7 100644 --- a/docs/research/2026-06-11-temporal-history-competitor-gap-report.json +++ b/docs/research/2026-06-11-temporal-history-competitor-gap-report.json @@ -19,6 +19,13 @@ "runtime_seconds": 50.14, "artifact": "tmp/live-baseline/live-baseline-report.json" }, + { + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "status": "pass", + "runtime_seconds": 39.17, + "artifact": "tmp/live-baseline/mem0-checks.json", + "claim": "XY-924 local OSS mem0 history run passes preference correction history, entity-scoped personalization, local get_all readback, and deletion audit history while keeping OpenMemory UI/export blocked." 
+ }, { "command": "cargo make real-world-memory-evolution", "status": "pass", @@ -99,7 +106,7 @@ "not_measured": [ "OpenMemory UI", "hosted ecosystem behavior", - "entity history quality", + "OpenMemory UI/export quality", "optional graph memory", "real-world memory_evolution jobs" ] @@ -248,7 +255,7 @@ "scenario": "basic_local_lifecycle", "current_judgment": "elf_and_mem0_both_pass_encoded_smoke", "claim_strength": "limited_tie_or_elf_broader_smoke_surface", - "next_gate": "mem0/OpenMemory history and UI/export readback benchmark" + "next_gate": "OpenMemory UI/export readback runner; hosted Platform export and optional graph memory remain non-goals for the local OSS lane" }, { "scenario": "retrieval_debug", @@ -291,8 +298,8 @@ { "priority": "P0", "direction": "mem0_openmemory_history_comparison", - "description": "Move past basic update/delete smoke into preference history, entity memory, lifecycle inspection, deletion audit, and UI/export readback.", - "benchmark_gate": "Comparable ELF and mem0/OpenMemory history jobs with typed evidence classes." + "description": "Local OSS comparison has moved past basic update/delete smoke into preference history, entity memory, lifecycle inspection, deletion audit, and SDK export-style readback.", + "benchmark_gate": "Local OSS history jobs are encoded with per-scenario claims; OpenMemory UI/export still needs a bounded UI runner." 
}, { "priority": "P0", @@ -322,6 +329,7 @@ "claim_boundaries": { "allowed": [ "ELF+mem0 basic local lifecycle smoke passed in the fresh Docker baseline.", + "mem0 local OSS history, entity-scoped personalization, deletion audit, and SDK get_all readback are measured by the XY-924 report.", "ELF narrowly outperformed qmd on the fresh memory-evolution slice because ELF passed delete/TTL and qmd did not.", "ELF still failed five of six live memory-evolution jobs.", "Graphiti/Zep temporal smoke is typed blocked due missing explicit provider key.", @@ -330,7 +338,7 @@ "not_allowed": [ "All goals are complete.", "ELF beats all tracked memory projects.", - "ELF beats mem0/OpenMemory on UI, hosted behavior, entity history, or graph memory.", + "ELF beats mem0/OpenMemory on preference history, UI/export, hosted behavior, or graph memory.", "ELF beats Graphiti/Zep on temporal validity.", "ELF beats Letta on core-vs-archival memory.", "Fixture pass, baseline smoke pass, and live real-world pass are interchangeable evidence classes." 
@@ -338,7 +346,7 @@ }, "next_issue_directions": [ "P0 ELF live temporal reconciliation and trace contract", - "P0 mem0/OpenMemory history and UI/export readback benchmark", + "P0 OpenMemory UI/export readback runner after the local OSS history benchmark", "P0 ELF/qmd trace-level replay and wrong-result diagnosis", "P1 Letta-style core-vs-archival memory benchmark", "P2 Graphiti/Zep provider-backed temporal smoke after explicit provider credentials exist", diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh index d899677b..d1a65f31 100755 --- a/scripts/live-baseline-benchmark.sh +++ b/scripts/live-baseline-benchmark.sh @@ -2471,6 +2471,14 @@ else: preference_history["history"], ["concise", "evidence-linked"], ) + history_has_add_event = preference_history["available"] and history_has_event( + preference_history["history"], + "ADD", + ) + history_has_update_event = preference_history["available"] and history_has_event( + preference_history["history"], + "UPDATE", + ) search_has_current = contains_terms( result_entries(preference_search), ["concise", "evidence-linked"], @@ -2479,9 +2487,16 @@ else: if not preference_history["available"]: preference_status = "blocked" preference_reason = "Memory.history could not be read for the updated preference memory." - elif history_has_old and history_has_current and search_has_current and search_omits_old: + elif ( + history_has_old + and history_has_current + and history_has_add_event + and history_has_update_event + and search_has_current + and search_omits_old + ): preference_status = "pass" - preference_reason = "mem0 history preserved the old and current preference while search returned only the current correction." + preference_reason = "mem0 history preserved ADD and UPDATE preference events while search returned only the current correction." 
else: preference_status = "lifecycle_fail" preference_reason = "mem0 did not expose a clean preference correction chain with current-only search readback." @@ -2498,6 +2513,8 @@ else: "history_error": preference_history["error"], "history_has_old": history_has_old, "history_has_current": history_has_current, + "history_has_add_event": history_has_add_event, + "history_has_update_event": history_has_update_event, "search_has_current": search_has_current, "search_omits_old": search_omits_old, "history": preference_history["history"],