From e8cd8cd806562b1afcf34b3edb1da1c539dfda08 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Thu, 11 Jun 2026 20:45:51 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add OpenMemory export-helper readback probe and evidence reports","authority":"XY-931"} --- Makefile.toml | 9 + README.md | 10 +- .../memory_projects_manifest.json | 30 +- .../tests/real_world_job_benchmark.rs | 90 +++++- docker-compose.baseline.yml | 2 + ...-11-competitor-strength-adoption-report.md | 23 +- ...-11-competitor-strength-evidence-matrix.md | 4 +- ...em0-openmemory-history-ui-export-report.md | 61 ++-- ...-temporal-history-competitor-gap-report.md | 29 +- docs/guide/benchmarking/index.md | 2 +- ...1-competitor-strength-adoption-report.json | 16 +- ...emporal-history-competitor-gap-report.json | 15 +- ...-11-xy-897-competitor-strength-matrix.json | 14 +- ...-xy-931-openmemory-ui-export-readback.json | 60 ++++ scripts/live-baseline-benchmark.sh | 260 +++++++++++++++++- 15 files changed, 535 insertions(+), 90 deletions(-) create mode 100644 docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json diff --git a/Makefile.toml b/Makefile.toml index 5d570b77..86b24c7d 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -306,6 +306,7 @@ args = [ # | baseline-backfill-10k-docker | command | | # | baseline-backfill-100k-docker | command | | # | baseline-soak-docker | command | | +# | openmemory-ui-export-readback | command | | [tasks.baseline-live-docker] workspace = false @@ -342,6 +343,14 @@ args = [ "--remove-orphans", ] +[tasks.openmemory-ui-export-readback] +workspace = false +command = "bash" +args = [ + "-lc", + "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=mem0; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", +] + [tasks.baseline-production-synthetic] workspace = false command = "bash" 
diff --git a/README.md b/README.md index 1ec443f3..f4e15199 100644 --- a/README.md +++ b/README.md @@ -176,10 +176,12 @@ provider-backed ELF evidence was required. typed blocked or incomplete without explicit service, resource, or provider setup. These reports preserve the smoke-only boundary and do not create an ELF win claim against graph/RAG strengths. -- mem0/OpenMemory history follow-up after XY-924: the local OSS mem0 adapter now - passes encoded preference correction history, entity-scoped personalization, local - `get_all` export-style readback, and deletion audit history in - `live-baseline-20260611113003`. The comparison records ELF as a loss on preference +- mem0/OpenMemory history follow-up after XY-924 and XY-931: the local OSS mem0 + adapter now passes encoded preference correction history, entity-scoped + personalization, local `get_all` export-style readback, and deletion audit history. + The separate OpenMemory export-helper setup probe in `live-baseline-20260611122416` + records `blocked` with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`, so SDK `get_all` + is still not UI/export evidence. 
The comparison records ELF as a loss on preference correction history, ties on scoped personalization and delete audit, `not_tested` for local SDK export-style parity, `blocked` for OpenMemory UI/export, and `non_goal` for hosted Platform export and optional graph memory in the local OSS diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 7bcdef8d..f5eabf62 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -1,6 +1,6 @@ { "schema": "elf.real_world_external_adapter_manifest/v1", - "manifest_id": "real-world-memory-project-adapters-2026-06-11-mem0-history", + "manifest_id": "real-world-memory-project-adapters-2026-06-11-openmemory-ui-export", "docker_isolation": { "default": true, "compose_file": "docker-compose.baseline.yml", @@ -608,13 +608,13 @@ }, "run": { "status": "pass", - "evidence": "Fresh scoped baseline run live-baseline-20260611113003 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded checks.", - "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded SDK checks. 
XY-931 adds a separate OpenMemory export-helper setup probe artifact and keeps that blocked UI/export result out of the SDK check summary.", + "command": "cargo make openmemory-ui-export-readback", "artifact": "tmp/live-baseline/live-baseline-report.json" }, "result": { "status": "pass", - "evidence": "The local OSS mem0 baseline now passes same-corpus retrieval, update/delete/reload, preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history. It still does not launch the OpenMemory UI, hosted Platform export flow, optional graph memory, or a real_world_job prompt adapter.", + "evidence": "The local OSS mem0 baseline now passes same-corpus retrieval, update/delete/reload, preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history. The separate OpenMemory export-helper setup probe is blocked because Docker is unavailable inside the baseline-runner container before any product app database readback can run. It still does not claim hosted Platform export, optional graph memory, or a real_world_job prompt adapter.", "artifact": "tmp/live-baseline/live-baseline-report.json" }, "capabilities": [ @@ -626,7 +626,7 @@ { "capability": "same_corpus_retrieval", "status": "pass", - "evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." }, { "capability": "local_lifecycle_update_delete_reload", @@ -656,7 +656,7 @@ { "capability": "openmemory_ui_readback", "status": "blocked", - "evidence": "The Docker live-baseline runner does not launch the OpenMemory web UI, dashboard authentication, or browser export flow. Local SDK get_all readback is measured separately and must not be reused as UI evidence." 
+ "evidence": "XY-931 runs a bounded OpenMemory export-helper setup probe after the mem0 SDK corpus checks. The probe finds the OpenMemory tree, UI package, compose file, and export helper, then records a setup blocker because the export helper requires Docker access to a running OpenMemory container. Local SDK get_all readback is measured separately and must not be reused as UI evidence." }, { "capability": "hosted_managed_memory_claims", @@ -688,7 +688,7 @@ { "suite_id": "operator_debugging_ux", "status": "blocked", - "evidence": "Local SDK get_all inspection is measured, but OpenMemory UI/export readback is blocked because the Docker runner does not launch the web UI or hosted export flow." + "evidence": "Local SDK get_all inspection is measured, but OpenMemory UI/export readback is blocked by the XY-931 export-helper setup probe until a dedicated OpenMemory compose/import path can load the same corpus into the OpenMemory app database." } ], "scenarios": [ @@ -708,7 +708,7 @@ "status": "pass", "elf_position": "loses", "comparison_outcome": "loss", - "evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. 
The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" }, @@ -718,7 +718,7 @@ "status": "pass", "elf_position": "ties", "comparison_outcome": "tie", - "evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" }, @@ -728,7 +728,7 @@ "status": "pass", "elf_position": "ties", "comparison_outcome": "tie", - "evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 delete_history_audit_readback as pass. 
ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" }, @@ -738,7 +738,7 @@ "status": "pass", "elf_position": "untested", "comparison_outcome": "not_tested", - "evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. 
This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", "artifact": "tmp/live-baseline/mem0-checks.json" }, @@ -748,8 +748,9 @@ "status": "blocked", "elf_position": "untested", "comparison_outcome": "blocked", - "evidence": "The local Docker runner does not launch OpenMemory UI/dashboard export, and hosted Platform export remains outside local OSS evidence. Basic lifecycle and local get_all readback are not reused as UI/export proof.", - "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + "evidence": "The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/mem0-openmemory-ui-export.json" }, { "scenario_id": "hosted_platform_export", @@ -778,7 +779,8 @@ } ], "notes": [ - "Separate local OSS mem0 evidence from hosted Platform and OpenMemory UI claims." + "Separate local OSS mem0 SDK evidence from OpenMemory product UI/export claims.", + "A blocked OpenMemory export-helper setup probe is not an ELF win or loss until the product app can import and export the same local corpus." 
 ] }, { diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index b76a1ff2..fe6da046 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -137,6 +137,13 @@ fn competitor_strength_adoption_report_json_path() -> Result<PathBuf> { .join("2026-06-11-competitor-strength-adoption-report.json")) } +fn temporal_history_competitor_gap_json_path() -> Result<PathBuf> { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-11-temporal-history-competitor-gap-report.json")) +} + fn competitor_strength_matrix_path() -> Result<PathBuf> { Ok(workspace_root()? .join("docs") @@ -399,7 +406,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { ); assert_eq!( report.pointer("/external_adapters/manifest_id").and_then(Value::as_str), - Some("real-world-memory-project-adapters-2026-06-11-mem0-history") + Some("real-world-memory-project-adapters-2026-06-11-openmemory-ui-export") ); assert_eq!( report.pointer("/external_adapters/docker_isolation/default").and_then(Value::as_bool), @@ -812,6 +819,20 @@ fn assert_first_generation_adapter_records( Some("openmemory_ui_export_readback") ); assert_eq!(mem0.pointer("/scenarios/5/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + mem0.pointer("/scenarios/5/command").and_then(Value::as_str), + Some("cargo make openmemory-ui-export-readback") + ); + assert_eq!( + mem0.pointer("/scenarios/5/artifact").and_then(Value::as_str), + Some("tmp/live-baseline/mem0-openmemory-ui-export.json") + ); + assert!( + mem0.pointer("/capabilities/7/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("export-helper setup probe") + && evidence.contains("requires Docker access")) + ); assert_eq!( mem0.pointer("/scenarios/6/comparison_outcome").and_then(Value::as_str), Some("non_goal") @@ -1067,6 +1088,48 @@ fn live_adapter_aggregate_forwards_graph_rag_smoke_controls() -> Result<()> { Ok(()) }
+#[test] +fn openmemory_ui_export_probe_has_dedicated_docker_task() -> Result<()> { + let workspace_root = workspace_root()?; + let makefile = fs::read_to_string(workspace_root.join("Makefile.toml"))?; + let compose = fs::read_to_string(workspace_root.join("docker-compose.baseline.yml"))?; + let script = fs::read_to_string(workspace_root.join("scripts/live-baseline-benchmark.sh"))?; + let report = serde_json::from_str::<Value>(&fs::read_to_string( + workspace_root.join("docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json"), + )?)?; + + assert!(makefile.contains("[tasks.openmemory-ui-export-readback]")); + assert!(makefile.contains("export ELF_BASELINE_PROJECTS=mem0")); + assert!(compose.contains("ELF_MEM0_OPENMEMORY_EXPORT_USER_ID")); + assert!(compose.contains("ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER")); + assert!(script.contains("probe_mem0_openmemory_ui_export")); + assert!(script.contains("mem0-openmemory-ui-export.json")); + assert!(script.contains("DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER")); + assert!(script.contains("sdk_get_all_is_ui_export_evidence: false")); + assert!( + script.contains("SDK same-corpus retrieval and every encoded SDK behavior check passed") + ); + assert_eq!(report.pointer("/classification/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + report.pointer("/classification/reason_code").and_then(Value::as_str), + Some("DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER") + ); + assert_eq!( + report + .pointer("/same_corpus_boundary/sdk_get_all_is_ui_export_evidence") + .and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report + .pointer("/claim_boundary/elf_can_compare_against_openmemory_ui_export_after_this_run") + .and_then(Value::as_bool), + Some(false) + ); + + Ok(()) +} + fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Result<()> { let suites = array_at(adapter, "/suites")?; let capabilities = array_at(adapter, "/capabilities")?; @@ -1432,6 +1495,9 @@ fn
current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { let external_manifest = fs::read_to_string(external_adapter_manifest_path())?; let retrieval_debug_profile = serde_json::from_str::<Value>(&fs::read_to_string(retrieval_debug_profile_json_path()?)?)?; + let temporal_history = serde_json::from_str::<Value>(&fs::read_to_string( + temporal_history_competitor_gap_json_path()?, + )?)?; assert!( measurement_audit.contains( @@ -1506,6 +1572,20 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { assert_competitor_strength_matrix_json(&competitor_matrix_json)?; + let openmemory_command = find_by_field( + array_at(&temporal_history, "/commands")?, + "/command", + "cargo make openmemory-ui-export-readback", + )?; + + assert!( + openmemory_command + .pointer("/artifact") + .and_then(Value::as_str) + .is_some_and(|artifact| artifact.contains("tmp/live-baseline/mem0-checks.json") + && artifact.contains("tmp/live-baseline/mem0-openmemory-ui-export.json")) + ); + Ok(()) } @@ -1680,12 +1760,16 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { assert_eq!(mem0.pointer("/measured_status").and_then(Value::as_str), Some("pass")); assert_eq!( mem0.pointer("/unsupported_or_blocked_status/state").and_then(Value::as_str), - Some("not_encoded") + Some("blocked") + ); + assert_eq!( + mem0.pointer("/unsupported_or_blocked_status/typed_reason").and_then(Value::as_str), + Some("openmemory_export_helper_setup_blocked") ); assert!( mem0.pointer("/benchmark_before_claim") .and_then(Value::as_str) - .is_some_and(|claim| claim.contains("preference/entity history")) + .is_some_and(|claim| claim.contains("OpenMemory product app import/export")) ); assert_eq!( openviking.pointer("/current_evidence_class").and_then(Value::as_str), diff --git a/docker-compose.baseline.yml b/docker-compose.baseline.yml index 6171692c..5dc3180e 100644 --- a/docker-compose.baseline.yml +++ b/docker-compose.baseline.yml @@ -119,6 +119,8 @@ services:
ELF_BASELINE_BACKFILL_RESUME_PROBE: ${ELF_BASELINE_BACKFILL_RESUME_PROBE:-} ELF_BASELINE_MAX_ELF_RSS_KB: ${ELF_BASELINE_MAX_ELF_RSS_KB:-1500000} ELF_BASELINE_MAX_ELF_SECONDS: ${ELF_BASELINE_MAX_ELF_SECONDS:-600} + ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER: ${ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER:-} + ELF_MEM0_OPENMEMORY_EXPORT_USER_ID: ${ELF_MEM0_OPENMEMORY_EXPORT_USER_ID:-} ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX: ${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX:-} ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION: ${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION:-} ELF_BASELINE_PROFILE: ${ELF_BASELINE_PROFILE:-smoke} diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index db01c063..ec2ea8f2 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -37,12 +37,13 @@ The remaining caveats are material: - Credentialed provider production-ops gates are blocked until explicit provider setup exists. - Several competitor strengths remain `not_tested` or blocked: OpenMemory - UI/export, hosted mem0 Platform behavior, OpenViking trajectory, Letta - core-vs-archival memory, and graph/RAG navigation. mem0 local OSS preference - history is now measured separately and is an ELF loss on the current correction - history scenario. The XY-923 follow-up also scores qmd's immediate top-10/replay - artifact ergonomics as stronger than ELF's default stress report, while - expansion, fusion, rerank, and candidate-drop diagnosis remain untested. + UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform + behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival + memory, and graph/RAG navigation remain unproven. 
mem0 local OSS preference history + is measured separately and is an ELF loss on the current correction history + scenario. The XY-923 follow-up also scores qmd's immediate top-10/replay artifact + ergonomics as stronger than ELF's default stress report, while expansion, fusion, + rerank, and candidate-drop diagnosis remain untested. ## Evidence Classes @@ -70,7 +71,7 @@ results, or lifecycle failures into one aggregate leaderboard. | `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 38 jobs across 11 suites with 36 pass and 2 blocked production-ops operator boundaries. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 18 pass, 5 wrong_result, 2 blocked, and 13 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 13 not_encoded jobs. | | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `2026-06-11-first-generation-oss-adapter-promotion-report.md` | mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result. | -| `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `2026-06-11-mem0-openmemory-history-ui-export-report.md` | mem0 local OSS passes preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history; OpenMemory UI/export remains blocked and hosted Platform export remains non-goal. | +| `cargo make openmemory-ui-export-readback` | `2026-06-11-mem0-openmemory-history-ui-export-report.md` | mem0 local OSS passes preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history; OpenMemory export-helper setup emits a separate blocked artifact with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`, and hosted Platform export remains non-goal. 
| | `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. | | `cargo make graphify-docker-graph-report-smoke` | `2026-06-11-graph-rag-scored-smoke-adapter-report.md` | graphify reaches tiny Docker graph/report scoring but remains wrong_result. | | `cargo make baseline-production-synthetic`, `cargo make baseline-backfill-docker`, backup/restore, Qdrant rebuild proof | `2026-06-10-production-adoption-refresh.md` | ELF has provider synthetic, stress, backfill, restore, and rebuild evidence; private-corpus proof is blocked by missing operator-owned manifest. | @@ -88,7 +89,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. | XY-905 | | Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 | | Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded; graphify reaches a tiny scored smoke and remains wrong_result. | XY-926, XY-929 | -| Operator debugging/viewer UX | `not_tested` | `fixture_backed`, `live_baseline_only`, `blocked`, `not_encoded`, `research_gate` | ELF fixture operator-debugging UX passes. 
mem0 local SDK `get_all` readback is measured, but OpenMemory UI/export remains blocked and must not be inferred from SDK readback. Live trace/viewer scoring and qmd/OpenMemory/claude-mem UX comparisons remain unscored. | XY-923, XY-926 | +| Operator debugging/viewer UX | `not_tested` | `fixture_backed`, `live_baseline_only`, `blocked`, `not_encoded`, `research_gate` | ELF fixture operator-debugging UX passes. mem0 local SDK `get_all` readback is measured, but the XY-931 OpenMemory export-helper setup probe is blocked by missing Docker/OpenMemory product container access and must not be inferred from SDK readback. Live trace/viewer scoring and qmd/OpenMemory/claude-mem UX comparisons remain unscored. | XY-923, XY-926 | | Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_baseline_only`, `blocked`, `not_encoded` | ELF fixture capture/write-policy jobs pass, but live capture integration and agentmemory/claude-mem capture hooks are not comparable yet. | XY-925, XY-926 | | Production ops, restore, backfill, and rebuild | `win` | `live_baseline_only`, `blocked` | ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence. | XY-930 | | Private corpus and provider boundaries | `blocked` | `blocked` | Private production profile fails closed without an operator-owned manifest; provider-backed production-ops gates require explicit credentials. | XY-930 | @@ -103,7 +104,7 @@ results, or lifecycle failures into one aggregate leaderboard. | --- | --- | --- | --- | | XY-905 | P0 | Backlog | Live temporal reconciliation answer and trace contract. | | XY-923 | P0 | Backlog | qmd trace-level replay and wrong-result diagnostics. | -| XY-924 | P0 | Encoded local OSS history; UI/export still gated | mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export still needs a UI runner before any product-UX claim. 
| +| XY-924/XY-931 | P0 | Encoded local OSS history; UI/export setup blocker measured | mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export has a blocked export-helper setup probe and still needs a dedicated compose/import path before any product-UX comparison. | | XY-925 | P1 | Backlog | First-generation OSS continuity and source-store adapters. | | XY-926 | P1 | Backlog | Live operator-debugging, capture, consolidation, and knowledge-page suites. | | XY-927 | P1 | Backlog | Letta-style core-vs-archival memory comparison. | @@ -131,8 +132,8 @@ results, or lifecycle failures into one aggregate leaderboard. or retrieval-quality win. - Do not claim ELF beats mem0/OpenMemory on preference history, UI/export, hosted behavior, or graph memory. The local OSS correction-history scenario is currently - an ELF loss, while OpenMemory UI/export, hosted behavior, and graph memory remain - outside measured local OSS evidence. + an ELF loss, while OpenMemory UI/export is a measured setup blocker and hosted + behavior plus graph memory remain outside measured local OSS evidence. - Do not claim ELF beats OpenViking on staged context trajectory. - Do not claim ELF beats Letta on core-vs-archival memory. - Do not claim graph/RAG parity from smoke-only evidence. diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index c78e50f3..2043ed37 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -75,7 +75,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | ELF | Evidence-linked source-of-truth memory service with real-world fixtures and live retrieval sweeps. | `live_real_world`; supporting `fixture_backed`. 
| `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/elf-report.md`. Fixture contract: `cargo make real-world-memory`, `tmp/real-world-memory/real-world-memory-report.json`. | `blocked`: private manifest and provider credentials; broader live suites remain `wrong_result`, `blocked`, or `not_encoded`. | Full-suite live pass plus separate private-corpus and credentialed production-ops proof. | Keep borrowing qmd debug knobs, OpenViking staged trajectory, mem0 history, Letta core memory, and graph/RAG navigation. | | qmd | Local retrieval-debug workflow with transparent CLI indexing, querying, expansion, fusion, and rerank ergonomics. | `live_real_world`; supporting `live_baseline_only` and `research_gate`. | `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/qmd-report.md`; targeted retrieval suites pass. | `not_encoded`: deep profile and non-retrieval live behavior are not encoded; memory_evolution is `wrong_result`. | qmd deep retrieval/debug profile plus full-suite live replay with trace-level diagnostics. | Weighted fusion, rerank explanation, local debug knobs, and command-line replay. | | agentmemory | Coding-agent continuity, MCP/REST packaging, viewer workflow, and durable cross-agent memory lifecycle. | `live_baseline_only`. | `lifecycle_fail`: `ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `blocked`: durable cold-start and real-world adapter coverage are missing. | Durable local adapter with update, delete, cold-start reload, work_resume, capture/write-policy, and lifecycle-staleness jobs. | Cross-agent hooks, packaging, continuity scenarios, and viewer affordances. | -| mem0/OpenMemory | Memory lifecycle, personalization, hosted/OpenMemory UI ergonomics, and optional graph memory. | `live_baseline_only`. 
| `pass`: fresh scoped run `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`, with mem0 `4/4` local checks passing. | `not_encoded`: OpenMemory UI, hosted claims, entity/preference history, graph memory, and real-world personalization coverage are not encoded. | Encode memory_evolution preference/entity history, deletion audit readback, personalization, UI/export readback, and optional graph-context jobs. | Entity-scoped history, lifecycle surfaces, async update ergonomics, and OpenMemory inspection UX. | +| mem0/OpenMemory | Memory lifecycle, personalization, hosted/OpenMemory UI ergonomics, and optional graph memory. | `live_baseline_only`. | `pass`: fresh scoped run `cargo make openmemory-ui-export-readback`, `tmp/live-baseline/live-baseline-report.json`, with mem0 `8/8` local SDK checks passing; `blocked`: OpenMemory export-helper setup probe emits `tmp/live-baseline/mem0-openmemory-ui-export.json` with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`. | `blocked`: OpenMemory UI/export cannot be compared until a compose/import path loads the same corpus into the product app; `unsupported`: hosted Platform export; `not_encoded`: optional graph memory and real-world prompt adapter coverage. | Add a Docker-contained OpenMemory product app import/export path, then score browser/API readback separately from SDK `get_all`; keep hosted Platform and graph memory opt-in/non-goal unless explicitly enabled. | Entity-scoped history, lifecycle surfaces, async update ergonomics, and OpenMemory inspection UX. | | memsearch | Markdown-first canonical store with rebuildable local index and practical hybrid retrieval. | `live_baseline_only`. | `pass`: fresh scoped run `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`, with memsearch `4/4` local checks passing. 
| `not_encoded`: real-world source-of-truth, retrieval, and memory-evolution prompt adapters are not encoded; TTL/expiry is unsupported by the current CLI path. | Score source-of-truth and retrieval-debug real-world jobs over the canonical Markdown store; keep TTL/expiry as unsupported unless a comparable path exists. | Canonical markdown store, local reindex clarity, and user-inspectable source files. | | OpenViking | Filesystem-like context trajectory, hierarchical retrieval, and staged context loading. | `live_baseline_only`; supporting `research_gate`. | `wrong_result`: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `not_encoded`: hierarchical context trajectory is not encoded; same-corpus output still misses expected evidence. | Make evidence-bearing same-corpus output pass, then score staged trajectory and hierarchy expansion. | `viking://`-style context model, trajectory readback, and staged retrieval planning. | | claude-mem | Progressive disclosure, automatic capture loop, repository-local lifecycle, and local viewer workflow. | `live_baseline_only`. | `wrong_result`: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `not_encoded`: progressive-disclosure real-world jobs are not encoded. | Durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, and progressive-disclosure jobs. | Progressive disclosure, automatic capture review loops, and local viewer/operator comfort. | @@ -98,7 +98,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Work resume | Fixture and live work_resume pass. | agentmemory, claude-mem, OpenViking. | agentmemory `lifecycle_fail`, claude-mem `wrong_result`, OpenViking work_resume `not_encoded`. | Encode durable work_resume adapters or keep each blocked with lifecycle/setup evidence. | | Project decisions | Fixture and live project_decisions pass. 
| qmd, Letta. | qmd live project_decisions pass; Letta is `research_gate` `not_encoded`. | Add Letta core/archival decision jobs only after a contained export path exists. | | Source-of-truth | Fixture and live trust_source_of_truth pass. | memsearch. | memsearch canonical-store, reindex, delete, and reload smoke now passes, but source-of-truth real_world_job prompts are `not_encoded`. | Score memsearch source-of-truth rebuild/reload jobs before any suite-level win/loss claim. | -| Temporal/current-vs-historical memory | Fixture memory_evolution passes; live memory_evolution is `wrong_result`. | Graphiti/Zep, mem0/OpenMemory. | Graphiti/Zep is `research_gate` `blocked`; mem0/OpenMemory basic local lifecycle now passes, but preference/entity history, deletion audit, UI/export, and graph-memory scenarios are `not_encoded`. | Fix ELF/qmd live memory_evolution evidence links, encode mem0/OpenMemory history/UI jobs, and run XY-888. | +| Temporal/current-vs-historical memory | Fixture memory_evolution passes; live memory_evolution is `wrong_result`. | Graphiti/Zep, mem0/OpenMemory. | Graphiti/Zep is `research_gate` `blocked`; mem0/OpenMemory local OSS preference history, entity scope, deletion audit, and SDK `get_all` now pass; OpenMemory UI/export is blocked by the export-helper setup probe; graph-memory scenarios are `not_encoded`. | Fix ELF/qmd live memory_evolution evidence links, add OpenMemory product app import/export readback, and run XY-888. | | Consolidation | Fixture consolidation passes; live consolidation is `not_encoded`. | agentmemory, managed-memory references, llm-wiki. | No manifest project has live consolidation scoring. | Run reviewable consolidation proposal generation with source refs, unsupported-claim flags, and audit transitions. | | Knowledge pages | Fixture knowledge_compilation passes; live knowledge_compilation is `not_encoded`. | llm-wiki, gbrain, GraphRAG, graphify. 
| llm-wiki and gbrain are `research_gate` `not_encoded` or `blocked`; GraphRAG is `blocked`; graphify has a tiny scored smoke `wrong_result`. | Encode live derived-page rebuild/lint scoring and run contained knowledge/RAG adapters only after setup proof. | | Operator debugging | Fixture operator_debugging_ux passes; live operator_debugging_ux is `not_encoded`. | qmd, claude-mem, OpenMemory. | qmd has debug strengths but operator_debugging_ux is `not_encoded`; claude-mem and OpenMemory UX are `not_encoded`. | Score trace hydration, stage attribution, raw-SQL avoidance, and repair-action clarity through live artifacts. | diff --git a/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md b/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md index 91d5dc15..9200bb86 100644 --- a/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md +++ b/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md @@ -1,8 +1,8 @@ # mem0/OpenMemory History and UI Export Report - June 11, 2026 Goal: Add scenario-level mem0/OpenMemory history, personalization, deletion-audit, -and export-readback evidence without promoting basic lifecycle smoke into UI or -hosted Platform claims. +local SDK export-readback, and bounded OpenMemory export-helper setup evidence without +promoting basic lifecycle smoke into UI or hosted Platform claims. Read this when: You need the current XY-924 comparison between ELF and mem0/OpenMemory for entity-scoped history, preference correction, deletion audit, personalization, OpenMemory inspection/export, hosted Platform export, or optional @@ -15,10 +15,12 @@ Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. Outputs: Per-scenario outcomes using `win`, `tie`, `loss`, `not_tested`, `blocked`, and `non_goal`, plus command and artifact evidence for each measured claim. 
+Machine-readable companion: `docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json`. ## Executive Judgment -The XY-924 objective is now encoded for the reproducible local OSS surface. +The XY-924 objective is now encoded for the reproducible local OSS SDK surface, and +XY-931 adds a separate bounded OpenMemory export-helper setup probe. mem0/OpenMemory now has fresh local OSS evidence for behavior beyond the basic lifecycle smoke: @@ -27,20 +29,24 @@ lifecycle smoke: - `entity_scoped_personalization`: `pass` - `local_get_all_export_readback`: `pass` - `delete_history_audit_readback`: `pass` +- `openmemory_ui_export_readback`: `blocked` The comparison is intentionally narrower than a hosted/OpenMemory product verdict. The local run measures the mem0 OSS SDK and local FastEmbed/Qdrant/history paths in -Docker. It does not launch the OpenMemory web UI, does not exercise hosted mem0 -Platform export jobs, and does not enable optional graph memory. +Docker. The new product-UX setup probe detects the OpenMemory tree, UI package, +compose file, and export helper, then records a setup blocker: the export helper needs +Docker access to a running OpenMemory product container, while the baseline runner +only has the SDK Qdrant/history artifacts. It does not claim browser/dashboard +readback, hosted mem0 Platform export jobs, or optional graph memory. 
## Fresh Evidence | Command | Result | Runtime | Artifact | | --- | --- | ---: | --- | -| `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `pass`; mem0 `8/8` encoded checks pass | 39.17 seconds wall; 36 seconds project runtime | `tmp/live-baseline/live-baseline-report.json`, `tmp/live-baseline/mem0-checks.json` | -| `cargo make real-world-memory` | `pass`; refreshed external adapter report published | 8.88 seconds | `tmp/real-world-memory/real-world-memory-report.json`, `tmp/real-world-memory/real-world-memory-report.md` | +| `cargo make openmemory-ui-export-readback` | `pass` for SDK baseline; OpenMemory export-helper setup probe `blocked` with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER` | 35.14 seconds wall; 33 seconds project runtime | `tmp/live-baseline/live-baseline-report.json`, `tmp/live-baseline/mem0-checks.json`, `tmp/live-baseline/mem0-openmemory-ui-export.json`, `tmp/live-baseline/mem0-openmemory-export-attempt.log` | +| `cargo make real-world-memory` | `pass`; refreshed external adapter report published | 7.97 seconds | `tmp/real-world-memory/real-world-memory-report.json`, `tmp/real-world-memory/real-world-memory-report.md` | -Fresh mem0 run id: `live-baseline-20260611113003`. +Fresh mem0/OpenMemory run id: `live-baseline-20260611122416`. Generated external adapter summary for all external adapter manifest rows: @@ -62,7 +68,7 @@ mem0/OpenMemory rows in this report contain eight scenarios: `loss=1`, | Entity-scoped personalization | `search()` with `user_id`, `agent_id`, and `run_id` filters returns the ELF-scoped preference and omits a PubFi-scoped preference. 
| `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | | Delete audit readback | `Memory.history` exposes a `DELETE` event and post-delete search suppresses the deleted memory. | `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | | Local SDK export-style readback | `Memory.get_all` returns the current scoped preference and omits the other scope. | `not_tested` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | -| OpenMemory UI/export readback | No local UI/dashboard export flow is launched by the Docker runner. | `blocked` | `blocked` | Not run; outside current local runner. | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| OpenMemory UI/export readback | The bounded export-helper setup probe finds OpenMemory product files but the export helper cannot run because Docker is unavailable inside the baseline runner. It does not reach browser/dashboard readback or same-corpus product app database validation. | `blocked` | `blocked` | `cargo make openmemory-ui-export-readback` | `tmp/live-baseline/mem0-openmemory-ui-export.json`, `tmp/live-baseline/mem0-openmemory-export-attempt.log` | | Hosted mem0 Platform export | Hosted Platform export is outside local OSS evidence. | `non_goal` | `unsupported` | Not run; local OSS comparison only. 
| `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | | Optional graph memory | Graph memory is not enabled in the default local OSS run. | `non_goal` | `not_encoded` | Not run; opt-in scenario gate. | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | @@ -97,6 +103,25 @@ The `delete_history_audit_readback` check verifies all of: The local SDK export-style readback check is intentionally named separately from UI export. It only proves local `get_all` scoped readback through the OSS SDK. +The OpenMemory export-helper setup probe records: + +- OpenMemory tree present: `true`; +- UI package present: `true`; +- compose file present: `true`; +- export helper present: `true`; +- sunsetting notice present: `true`; +- SDK `get_all` status: `pass`; +- export attempt command: + `timeout 30 bash openmemory/backup-scripts/export_openmemory.sh --user-id elf-history-user --container openmemory-openmemory-mcp-1`; +- export attempt exit code: `1`; +- reason code: `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`. + +The attempt log contains `docker: command not found` before the helper reports that +`openmemory-openmemory-mcp-1` is not running. The concrete next action is to add a +dedicated OpenMemory Docker Compose profile that imports the generated mem0 corpus +into the OpenMemory app database, starts API/UI with explicit local or provider +configuration, then reruns the export helper and validates exported memories. + ## Source And Product Boundary Official mem0 documentation distinguishes the OSS/self-hosted surface from hosted @@ -108,7 +133,8 @@ search, structured exports, and Platform UI exports. 
This report uses those docs only to set the claim boundary: - local OSS SDK `history`, `search`, and `get_all` behavior is measurable here; -- OpenMemory browser/dashboard export is not measured here; +- OpenMemory browser/dashboard export is not reached here; the current evidence is a + bounded export-helper setup probe blocked by missing Docker access; - hosted Platform export is a `non_goal` for this local OSS lane; - optional graph memory remains an opt-in scenario, not a default pass/fail claim. @@ -123,14 +149,15 @@ Allowed: - mem0/OpenMemory local OSS passes the new encoded history, correction, personalization, deletion-audit, and local `get_all` readback checks in run - `live-baseline-20260611113003`. + `live-baseline-20260611122416`. - ELF currently has a measured `loss` against mem0 on the preference correction history dimension because the June 11 temporal/history report records ELF's live memory-evolution preference job as `wrong_result`. - ELF and mem0 currently `tie` on the encoded entity-scoped personalization and delete-audit surfaces. -- OpenMemory UI/export readback is `blocked` until the runner launches and inspects - the UI/export flow. +- OpenMemory UI/export readback is `blocked` by a concrete setup blocker: + `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`; ELF cannot compare against this product-UX + scenario yet. - Hosted mem0 Platform export and optional graph memory are `non_goal` for this local OSS comparison. @@ -146,7 +173,7 @@ Not allowed: ## Follow-Up Gate -The next fair UI/export comparison requires a bounded runner that starts OpenMemory, -loads the same local memories, captures authenticated inspection/export readback, and -publishes a browser/API artifact. That is separate from the local SDK `get_all` -export-style readback added here. 
+The next fair UI/export comparison requires extending the bounded runner so it starts +OpenMemory, loads the same local memories into the OpenMemory app database, captures +authenticated inspection/export readback, and publishes a browser/API artifact. That +is separate from the local SDK `get_all` export-style readback added here. diff --git a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md index c93ebea8..a9bee44c 100644 --- a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md +++ b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md @@ -17,13 +17,13 @@ The overall goal is not complete. ELF does not yet have complete, comparable benchmark wins across all tracked memory projects and all user-important memory scenarios. -Update after XY-924: mem0/OpenMemory local OSS history and local SDK export-style -readback are now measured in -`2026-06-11-mem0-openmemory-history-ui-export-report.md`. That report records mem0 +Update after XY-924 and XY-931: mem0/OpenMemory local OSS history, local SDK +export-style readback, and a bounded OpenMemory export-helper setup probe are now measured +in `2026-06-11-mem0-openmemory-history-ui-export-report.md`. That report records mem0 passes for preference correction history, entity-scoped personalization, deletion audit history, and local `get_all` readback, while keeping OpenMemory UI/export -blocked and hosted Platform export plus optional graph memory as local-lane -non-goals. +blocked by `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER` and hosted Platform export plus +optional graph memory as local-lane non-goals. The current evidence supports a narrower judgment: @@ -136,7 +136,7 @@ the right snippets. 
| Retrieval/debug | qmd transparent CLI, expansion/fusion/rerank/replay ergonomics | ELF/qmd live adapters pass retrieval suites; previous qmd debug profile exists | ELF is not clearly stronger. qmd remains the debug-UX bar. | | Current-vs-historical memory | Graphiti/Zep temporal validity; mem0 history surfaces | ELF/qmd live memory-evolution wrong_result; Graphiti/Zep blocked; mem0 local OSS preference correction history now passes, but mem0 real-world prompt history is not encoded | ELF has a measured gap. It only narrowly beats qmd's current run and loses the local OSS preference-correction history scenario to mem0. | | Delete/tombstone lifecycle | ELF production ops and qmd local replay | ELF passes delete/TTL job; qmd misses tombstone | ELF has a narrow measured win over qmd on this job. | -| Entity preference history | mem0/OpenMemory | XY-924 local OSS run passes mem0 preference correction history and entity-scoped personalization; OpenMemory UI/export remains blocked | ELF loses the preference-correction history scenario and ties the scoped-personalization scenario; no OpenMemory UI/export claim is allowed. | +| Entity preference history | mem0/OpenMemory | XY-924 local OSS run passes mem0 preference correction history and entity-scoped personalization; XY-931 OpenMemory export-helper setup probe is blocked by missing Docker/OpenMemory product container access inside the baseline runner | ELF loses the preference-correction history scenario and ties the scoped-personalization scenario; no OpenMemory UI/export claim is allowed. | | Core-vs-archival memory | Letta core memory blocks versus archival memory | Research-only, no contained live output | Not comparable. Borrow design only. | | Context trajectory | OpenViking staged context and hierarchy | Existing adapter remains not encoded or wrong_result for trajectory | Not comparable. Need staged trajectory benchmark. 
| | Capture and continuity | agentmemory, claude-mem hooks/viewers | Existing adapters are baseline-only and undermeasured | Not comparable. Need capture/write-policy and work-resume adapters. | @@ -148,7 +148,7 @@ the right snippets. | Source | Best idea to absorb | Benchmark gate before any claim | | --- | --- | --- | | Graphiti/Zep | Validity windows, `valid_at`/`invalid_at`, current/historical/future fact separation, temporal relation provenance | Provider-backed Docker temporal smoke must map current, historical, and rationale facts to scored evidence ids. | -| mem0/OpenMemory | Entity-scoped memory history, user-visible lifecycle inspection, update/delete ergonomics | Local OSS history, correction, deletion, and SDK `get_all` readback are now scored; UI/export readback still needs a bounded OpenMemory runner. | +| mem0/OpenMemory | Entity-scoped memory history, user-visible lifecycle inspection, update/delete ergonomics | Local OSS history, correction, deletion, and SDK `get_all` readback are now scored; UI/export readback has a bounded export-helper setup probe but remains blocked until OpenMemory can run with the same corpus in its product app database. | | Letta | Always-loaded core memory blocks separated from archival search | Add core-vs-archival jobs for attachment scope, provenance, fallback, and stale-core avoidance. | | qmd | Local replay, candidate inspection, expansion/fusion/rerank debug knobs | ELF trace artifacts must show candidate generation, rerank, dropped evidence, conflict candidates, and replay commands. | | OpenViking | Staged context trajectory and hierarchy | Encode trajectory jobs after evidence-bearing same-corpus output passes. | @@ -186,9 +186,10 @@ the product behavior users actually care about: 5. optional graph-memory behavior only if the OSS path is reproducible in Docker. Target benchmark status: local OSS history jobs are now encoded with per-scenario -claims. 
OpenMemory UI/export readback remains blocked until a UI runner exists, and -hosted Platform export plus optional graph memory remain non-goals for the local OSS -lane. +claims. OpenMemory UI/export readback has a bounded export-helper setup probe, but it +remains blocked until a dedicated OpenMemory compose/import path can load the same +corpus into the OpenMemory app database. Hosted Platform export plus optional graph +memory remain non-goals for the local OSS lane. ### P0 - qmd-Level Debugging And Replay @@ -261,8 +262,9 @@ Not allowed: - Do not claim all goals are complete. - Do not claim ELF beats all tracked memory projects. -- Do not claim ELF beats mem0/OpenMemory on UI, hosted behavior, entity history, or - graph memory. +- Do not claim ELF beats mem0/OpenMemory on UI/export, hosted behavior, entity + history, or graph memory. The current UI/export result is a setup blocker, not a + comparison win. - Do not claim ELF beats Graphiti/Zep on temporal validity. - Do not claim ELF beats Letta on core-vs-archival memory. - Do not treat fixture pass, baseline smoke pass, and live real-world pass as the @@ -271,7 +273,8 @@ Not allowed: ## Next Concrete Report/Issue Directions 1. Open or refine a P0 issue for ELF live temporal reconciliation and trace contract. -2. Open a P0 benchmark issue for mem0/OpenMemory history and UI/export readback. +2. Follow up the XY-931 OpenMemory UI/export blocker with a Docker Compose/import + path that loads the same corpus into the OpenMemory product app database. 3. Open a P0 benchmark issue for ELF/qmd trace-level replay and wrong-result diagnosis. 4. Open a P1 benchmark issue for Letta-style core-vs-archival memory. diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index f6795dfb..6030af7b 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -92,7 +92,7 @@ cleanup, use `docs/guide/single_user_production.md`. 
competitor-strength adoption report with the bounded personal-production decision, scenario-level win/tie/loss/not-tested matrix, claim boundaries, and optimization issue queue. -- `2026-06-11-mem0-openmemory-history-ui-export-report.md`: XY-924 +- `2026-06-11-mem0-openmemory-history-ui-export-report.md`: XY-924 plus XY-931 mem0/OpenMemory local OSS history, preference-correction, deletion-audit, personalization, and export-readback comparison with normalized win/tie/loss/not-tested/blocked/non-goal outcomes and explicit hosted/UI/graph diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 11871923..906c2659 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export, hosted mem0 Platform behavior, OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation. mem0 local OSS preference history is now measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up now scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, rerank, and candidate-drop diagnosis remain untested." + "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. 
mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up now scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, rerank, and candidate-drop diagnosis remain untested." ] }, "evidence_class_terms": [ @@ -52,9 +52,9 @@ "claim": "mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result on same-corpus retrieval." }, { - "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "command": "cargo make openmemory-ui-export-readback", "artifact": "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", - "claim": "mem0 local OSS passes preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history; OpenMemory UI/export remains blocked and hosted Platform export remains non-goal." + "claim": "mem0 local OSS passes preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history; OpenMemory export-helper setup emits a separate blocked artifact with DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER, and hosted Platform export remains non-goal." }, { "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke", @@ -186,7 +186,7 @@ "title": "Operator debugging/viewer UX", "outcome": "not_tested", "evidence_classes": ["fixture_backed", "live_baseline_only", "blocked", "not_encoded", "research_gate"], - "measured_claim": "ELF fixture operator-debugging UX passes. mem0 local SDK get_all readback is measured, but OpenMemory UI/export remains blocked and must not be inferred from SDK readback. Live trace/viewer scoring and qmd/OpenMemory/claude-mem UX comparisons remain unscored.", + "measured_claim": "ELF fixture operator-debugging UX passes. 
mem0 local SDK get_all readback is measured, but the XY-931 OpenMemory export-helper setup probe is blocked by missing Docker/OpenMemory product container access, so OpenMemory UI/export readback must not be inferred from SDK readback. Live trace/viewer scoring and qmd/OpenMemory/claude-mem UX comparisons remain unscored.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" @@ -298,10 +298,10 @@ "gap": "qmd trace-level replay and wrong-result diagnostics." }, { - "issue": "XY-924", + "issue": "XY-924/XY-931", "priority": "P0", - "state": "Encoded local OSS history; UI/export still gated", - "gap": "mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export still needs a UI runner before any product-UX claim." + "state": "Encoded local OSS history; UI/export setup blocker measured", + "gap": "mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export has a blocked export-helper setup probe and still needs a dedicated compose/import path before any product-UX comparison." }, { "issue": "XY-925", @@ -357,7 +357,7 @@ "not_allowed": [ "Do not claim ELF broadly beats qmd.", "Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win.", "Do not claim ELF beats mem0/OpenMemory on preference history, UI/export, hosted behavior, or graph memory. 
The local OSS correction-history scenario is currently an ELF loss, while OpenMemory UI/export is a measured setup blocker and hosted behavior plus graph memory remain outside measured local OSS evidence.", "Do not claim ELF beats OpenViking on staged context trajectory.", "Do not claim ELF beats Letta on core-vs-archival memory.", "Do not claim graph/RAG parity from smoke-only evidence.", diff --git a/docs/research/2026-06-11-temporal-history-competitor-gap-report.json b/docs/research/2026-06-11-temporal-history-competitor-gap-report.json index d9129ec7..cb6cd9be 100644 --- a/docs/research/2026-06-11-temporal-history-competitor-gap-report.json +++ b/docs/research/2026-06-11-temporal-history-competitor-gap-report.json @@ -20,11 +20,11 @@ "artifact": "tmp/live-baseline/live-baseline-report.json" }, { - "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "command": "cargo make openmemory-ui-export-readback", "status": "pass", - "runtime_seconds": 39.17, - "artifact": "tmp/live-baseline/mem0-checks.json", - "claim": "XY-924 local OSS mem0 history run passes preference correction history, entity-scoped personalization, local get_all readback, and deletion audit history while keeping OpenMemory UI/export blocked." + "runtime_seconds": 35.14, + "artifact": "tmp/live-baseline/mem0-checks.json; tmp/live-baseline/mem0-openmemory-ui-export.json", + "claim": "XY-924 local OSS mem0 history run passes preference correction history, entity-scoped personalization, local get_all readback, and deletion audit history; XY-931 records OpenMemory export-helper setup as blocked with DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER." 
}, { "command": "cargo make real-world-memory-evolution", @@ -255,7 +255,7 @@ "scenario": "basic_local_lifecycle", "current_judgment": "elf_and_mem0_both_pass_encoded_smoke", "claim_strength": "limited_tie_or_elf_broader_smoke_surface", - "next_gate": "OpenMemory UI/export readback runner; hosted Platform export and optional graph memory remain non-goals for the local OSS lane" + "next_gate": "OpenMemory compose/import path that loads the same corpus into the product app database; hosted Platform export and optional graph memory remain non-goals for the local OSS lane" }, { "scenario": "retrieval_debug", @@ -299,7 +299,7 @@ "priority": "P0", "direction": "mem0_openmemory_history_comparison", "description": "Local OSS comparison has moved past basic update/delete smoke into preference history, entity memory, lifecycle inspection, deletion audit, and SDK export-style readback.", - "benchmark_gate": "Local OSS history jobs are encoded with per-scenario claims; OpenMemory UI/export still needs a bounded UI runner." + "benchmark_gate": "Local OSS history jobs are encoded with per-scenario claims; OpenMemory UI/export has a bounded probe but remains blocked until a Docker-contained product app import/export path exists." 
}, { "priority": "P0", @@ -330,6 +330,7 @@ "allowed": [ "ELF+mem0 basic local lifecycle smoke passed in the fresh Docker baseline.", "mem0 local OSS history, entity-scoped personalization, deletion audit, and SDK get_all readback are measured by the XY-924 report.", + "OpenMemory UI/export readback is measured as a setup blocker by the XY-931 export-helper setup probe.", "ELF narrowly outperformed qmd on the fresh memory-evolution slice because ELF passed delete/TTL and qmd did not.", "ELF still failed five of six live memory-evolution jobs.", "Graphiti/Zep temporal smoke is typed blocked due missing explicit provider key.", @@ -346,7 +347,7 @@ }, "next_issue_directions": [ "P0 ELF live temporal reconciliation and trace contract", - "P0 OpenMemory UI/export readback runner after the local OSS history benchmark", + "P0 OpenMemory Docker compose/import path after the XY-931 UI/export setup blocker", "P0 ELF/qmd trace-level replay and wrong-result diagnosis", "P1 Letta-style core-vs-archival memory benchmark", "P2 Graphiti/Zep provider-backed temporal smoke after explicit provider credentials exist", diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index a5ed566f..a741778a 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -151,15 +151,15 @@ ], "measured_status": "pass", "proof": { - "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "command": "cargo make openmemory-ui-export-readback", "artifact": "tmp/live-baseline/live-baseline-report.json" }, "unsupported_or_blocked_status": { - "state": "not_encoded", - "typed_reason": "history_ui_hosted_graph_claims_not_encoded", - "details": "Basic local OSS same-corpus/update/delete/reload smoke now passes, but hosted/OpenMemory UI parity, entity/preference history, 
deletion-audit readback, optional graph memory, and real-world personalization coverage are not encoded." + "state": "blocked", + "typed_reason": "openmemory_export_helper_setup_blocked", + "details": "Local OSS same-corpus/update/delete/reload, entity/preference history, deletion-audit readback, and SDK get_all readback now pass. OpenMemory UI/export remains blocked by the XY-931 export-helper setup probe until a product app import/export path can load the same corpus. Hosted Platform export is unsupported in the local OSS lane, and optional graph memory plus real-world prompt adapter coverage remain not_encoded." }, - "benchmark_before_claim": "Encode memory_evolution preference/entity history, deletion audit readback, personalization, OpenMemory UI/export readback, and optional graph-context jobs.", + "benchmark_before_claim": "Add a Docker-contained OpenMemory product app import/export path, then score browser/API readback separately from SDK get_all; keep hosted Platform and graph memory opt-in or non-goal unless explicitly enabled.", "borrow_if_stronger": "Borrow entity-scoped memory history, lifecycle surfaces, async update ergonomics, and OpenMemory-style inspection UX." 
}, { @@ -466,9 +466,9 @@ "scenario": "temporal/current-vs-historical memory", "current_elf_evidence": "ELF fixture-backed memory_evolution passes, but ELF live_real_world memory_evolution is wrong_result.", "strongest_competitor_or_reference": "Graphiti/Zep, mem0/OpenMemory", - "current_competitor_evidence": "Graphiti/Zep is research_gate blocked; mem0/OpenMemory now passes basic live_baseline_only local lifecycle smoke but preference/entity history, deletion audit, UI/export, and graph-memory scenarios are not_encoded.", + "current_competitor_evidence": "Graphiti/Zep is research_gate blocked; mem0/OpenMemory local OSS preference history, entity scope, deletion audit, and SDK get_all now pass; OpenMemory UI/export is blocked by the export-helper setup probe; graph-memory scenarios are not_encoded.", "current_state": "No project has a comparable live pass for current-vs-historical evidence; ELF cannot claim live superiority yet.", - "next_measurement": "Fix ELF/qmd live memory_evolution evidence links, encode mem0/OpenMemory history and UI/export jobs, and run XY-888 Graphiti/Zep temporal graph adapter." + "next_measurement": "Fix ELF/qmd live memory_evolution evidence links, add OpenMemory product app import/export readback, and run XY-888 Graphiti/Zep temporal graph adapter." }, { "scenario_id": "consolidation", diff --git a/docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json b/docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json new file mode 100644 index 00000000..8caaa5dd --- /dev/null +++ b/docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json @@ -0,0 +1,60 @@ +{ + "schema": "elf.openmemory_ui_export_readback_report/v1", + "report_id": "xy-931-openmemory-ui-export-readback-2026-06-11", + "authority": "XY-931", + "created_at": "2026-06-11T12:24:49Z", + "goal": "Measure OpenMemory UI/export readback separately from local mem0 SDK get_all, or record a typed setup blocker with concrete evidence. 
This run records an export-helper setup blocker before browser/dashboard readback is reached.", + "command": { + "command": "cargo make openmemory-ui-export-readback", + "status": "pass", + "runtime_seconds": 35.14, + "artifact": "tmp/live-baseline/mem0-openmemory-ui-export.json" + }, + "run": { + "run_id": "live-baseline-20260611122416", + "project_filter": "mem0", + "sdk_baseline_status": "pass", + "sdk_check_summary": { + "total": 8, + "pass": 8, + "fail": 0, + "blocked": 0 + }, + "ui_export_status": "blocked", + "ui_export_reason_code": "DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER" + }, + "same_corpus_boundary": { + "sdk_result_artifact": "tmp/live-baseline/mem0-search.json", + "sdk_get_all_check_status": "pass", + "sdk_get_all_is_ui_export_evidence": false, + "openmemory_ui_export_is_separate_product_ux_scenario": true + }, + "openmemory_probe": { + "tree_present": true, + "ui_package_present": true, + "compose_file_present": true, + "export_script_present": true, + "sunsetting_notice_present": true, + "requires_openai_api_key": true, + "requires_docker_compose": true, + "export_requires_running_container": true, + "attempt": { + "command": "timeout 30 bash openmemory/backup-scripts/export_openmemory.sh --user-id elf-history-user --container openmemory-openmemory-mcp-1", + "exit_code": 1, + "log_artifact": "tmp/live-baseline/mem0-openmemory-export-attempt.log", + "output_excerpt": "openmemory/backup-scripts/export_openmemory.sh: line 52: docker: command not found\nERROR: Container 'openmemory-openmemory-mcp-1' not found/running. Pass --container if different." 
+ } + }, + "classification": { + "status": "blocked", + "reason_code": "DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER", + "reason": "The OpenMemory export helper requires Docker access, but Docker is not available inside the baseline-runner container; browser/dashboard readback is not reached.", + "next_action": "Add a dedicated OpenMemory Docker Compose profile that imports the generated mem0 corpus into the OpenMemory app database, starts the API/UI with explicit local or provider configuration, then rerun the export helper and validate the exported memories." + }, + "claim_boundary": { + "elf_can_compare_against_openmemory_ui_export_after_this_run": false, + "hosted_platform_claim": false, + "optional_graph_memory_enabled": false, + "sdk_get_all_is_ui_export_evidence": false + } +} diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh index d1a65f31..0f15359f 100755 --- a/scripts/live-baseline-benchmark.sh +++ b/scripts/live-baseline-benchmark.sh @@ -83,7 +83,11 @@ typed_status_reason() { case "${status}" in pass) - echo "${project} same-corpus retrieval and every encoded behavior check passed" + if [[ "${project}" == "mem0" ]]; then + echo "mem0 SDK same-corpus retrieval and every encoded SDK behavior check passed; OpenMemory export-helper setup probe is reported separately in adapter.behaviors.openmemory_ui_export and tmp/live-baseline/mem0-openmemory-ui-export.json" + else + echo "${project} same-corpus retrieval and every encoded behavior check passed" + fi ;; wrong_result) echo "${project} ran but returned the wrong same-corpus result or missed expected evidence" @@ -106,6 +110,254 @@ typed_status_reason() { esac } +probe_mem0_openmemory_ui_export() { + local project_repo="$1" + local sdk_result_path="$2" + local out_path="$3" + local log_path="$4" + local openmemory_dir="${project_repo}/openmemory" + local export_script="${openmemory_dir}/backup-scripts/export_openmemory.sh" + local ui_package="${openmemory_dir}/ui/package.json" + 
local compose_file="${openmemory_dir}/docker-compose.yml" + local readme_path="${openmemory_dir}/README.md" + local run_script="${openmemory_dir}/run.sh" + local api_env_example="${openmemory_dir}/api/.env.example" + local attempt_log="${REPORT_DIR}/mem0-openmemory-export-attempt.log" + local validation_path="${REPORT_DIR}/mem0-openmemory-export-validation.json" + local export_user_id="${ELF_MEM0_OPENMEMORY_EXPORT_USER_ID:-elf-history-user}" + local export_container="${ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER:-openmemory-openmemory-mcp-1}" + local export_zip="${project_repo}/memories_export_${export_user_id}.zip" + local command_display="timeout 30 bash openmemory/backup-scripts/export_openmemory.sh --user-id ${export_user_id} --container ${export_container}" + local sdk_get_all_status + local export_exit_code=0 + local openmemory_tree_present=false + local ui_package_present=false + local compose_present=false + local export_script_present=false + local sunsetting_notice_present=false + local requires_api_key=false + local requires_docker_compose=false + local export_requires_running_container=false + local status="blocked" + local comparison_outcome="blocked" + local reason_code="OPENMEMORY_CONTAINER_NOT_RUNNING" + local reason="OpenMemory export-helper setup probe could not run because no OpenMemory product container is available in the Docker baseline runner." + local next_action="Add a dedicated OpenMemory Docker Compose profile that imports the generated mem0 corpus into the OpenMemory app database, starts the API/UI with explicit local or provider configuration, then rerun the export helper and validate the exported memories." + local output_excerpt="" + local validation_json="{}" + + sdk_get_all_status="$(jq -r '[.checks[]? 
| select(.name == "local_get_all_export_readback") | .status][0] // "missing"' "${sdk_result_path}" 2>/dev/null || echo "missing")" + + [[ -d "${openmemory_dir}" ]] && openmemory_tree_present=true + [[ -f "${ui_package}" ]] && ui_package_present=true + [[ -f "${compose_file}" ]] && compose_present=true + [[ -f "${export_script}" ]] && export_script_present=true + if [[ -f "${readme_path}" ]] && grep -qi "sunsetting notice" "${readme_path}"; then + sunsetting_notice_present=true + fi + if grep -q "OPENAI_API_KEY" "${run_script}" "${api_env_example}" 2>/dev/null; then + requires_api_key=true + fi + if [[ -f "${run_script}" ]] && grep -q "docker compose" "${run_script}"; then + requires_docker_compose=true + fi + if [[ -f "${export_script}" ]] && grep -q "docker ps" "${export_script}"; then + export_requires_running_container=true + fi + + : >"${attempt_log}" + rm -f "${validation_path}" "${export_zip}" + if [[ "${openmemory_tree_present}" != "true" ]]; then + status="unsupported" + reason_code="OPENMEMORY_TREE_MISSING" + reason="The cloned mem0 repository does not contain the OpenMemory product tree, so no export-helper setup probe path is available in this revision." + elif [[ "${export_script_present}" != "true" ]]; then + status="unsupported" + reason_code="OPENMEMORY_EXPORT_SCRIPT_MISSING" + reason="The OpenMemory tree is present, but its export helper is missing, so the runner cannot attempt export-helper setup readback." + else + set +e + ( + cd "${project_repo}" + timeout 30 bash openmemory/backup-scripts/export_openmemory.sh \ + --user-id "${export_user_id}" \ + --container "${export_container}" + ) >"${attempt_log}" 2>&1 + export_exit_code=$? 
+ set -e + output_excerpt="$(head -c 4000 "${attempt_log}" || true)" + + if [[ "${export_exit_code}" -eq 0 && -s "${export_zip}" ]]; then + python3 - "${export_zip}" "${validation_path}" <<'PY' +import json +import sys +import zipfile +from pathlib import Path + +zip_path = Path(sys.argv[1]) +out_path = Path(sys.argv[2]) +result = { + "zip_present": zip_path.is_file(), + "zip_path": str(zip_path), + "memories_json_present": False, + "has_current_preference": False, + "omits_other_scope": False, + "error": None, +} + +try: + with zipfile.ZipFile(zip_path) as archive: + result["members"] = archive.namelist() + if "memories.json" in archive.namelist(): + result["memories_json_present"] = True + payload = archive.read("memories.json").decode("utf-8", "replace") + lowered = payload.lower() + result["has_current_preference"] = ( + "concise" in lowered and "evidence-linked" in lowered + ) + result["omits_other_scope"] = "long-form chinese" not in lowered +except Exception as exc: + result["error"] = repr(exc) + +out_path.write_text(json.dumps(result, indent=2) + "\n", encoding="utf-8") +PY + validation_json="$(cat "${validation_path}")" + if jq -e '.has_current_preference == true and .omits_other_scope == true' "${validation_path}" >/dev/null; then + status="pass" + reason_code="OPENMEMORY_EXPORT_READBACK_MATCHED" + reason="OpenMemory export produced a zip containing the current scoped preference and omitting the other scope." + next_action="Keep OpenMemory export-helper readback as a separate product-UX scenario from SDK get_all and rerun after any OpenMemory setup change." + else + status="blocked" + reason_code="OPENMEMORY_EXPORT_MISSING_SAME_CORPUS" + reason="OpenMemory export ran, but the exported product data did not prove readback of the same local mem0 SDK corpus." + fi + elif [[ "${export_exit_code}" -eq 124 ]]; then + status="blocked" + reason_code="OPENMEMORY_EXPORT_TIMEOUT" + reason="OpenMemory export did not complete within the bounded 30-second probe." 
+ elif grep -qi "docker.*command not found\|docker: not found\|docker not found" "${attempt_log}"; then + status="blocked" + reason_code="DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER" + reason="The OpenMemory export helper requires Docker access, but Docker is not available inside the baseline-runner container." + elif grep -qi "Container .*not found/running" "${attempt_log}"; then + status="blocked" + reason_code="OPENMEMORY_CONTAINER_NOT_RUNNING" + reason="The OpenMemory export helper requires a running OpenMemory product container, but the baseline runner only starts the mem0 SDK path." + else + status="blocked" + reason_code="OPENMEMORY_EXPORT_COMMAND_FAILED" + reason="The OpenMemory export helper failed before export-helper readback could be validated." + fi + fi + + case "${status}" in + pass) + comparison_outcome="not_tested" + ;; + blocked) + comparison_outcome="blocked" + ;; + unsupported) + comparison_outcome="non_goal" + ;; + *) + comparison_outcome="not_tested" + ;; + esac + + jq -nc \ + --arg schema "elf.live_baseline.openmemory_ui_export_probe/v1" \ + --arg run_id "${RUN_ID}" \ + --arg project "mem0/OpenMemory" \ + --arg scenario_id "openmemory_ui_export_readback" \ + --arg status "${status}" \ + --arg comparison_outcome "${comparison_outcome}" \ + --arg generated_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --arg sdk_result_artifact "tmp/live-baseline/mem0-search.json" \ + --arg sdk_get_all_status "${sdk_get_all_status}" \ + --arg export_user_id "${export_user_id}" \ + --arg export_container "${export_container}" \ + --arg command "${command_display}" \ + --arg log_artifact "tmp/live-baseline/mem0-openmemory-export-attempt.log" \ + --arg output_excerpt "${output_excerpt}" \ + --arg reason_code "${reason_code}" \ + --arg reason "${reason}" \ + --arg next_action "${next_action}" \ + --argjson exit_code "${export_exit_code}" \ + --argjson openmemory_tree_present "${openmemory_tree_present}" \ + --argjson ui_package_present "${ui_package_present}" \ + --argjson 
compose_present "${compose_present}" \ + --argjson export_script_present "${export_script_present}" \ + --argjson sunsetting_notice_present "${sunsetting_notice_present}" \ + --argjson requires_api_key "${requires_api_key}" \ + --argjson requires_docker_compose "${requires_docker_compose}" \ + --argjson export_requires_running_container "${export_requires_running_container}" \ + --argjson validation "${validation_json}" \ + '{ + schema: $schema, + run_id: $run_id, + project: $project, + scenario_id: $scenario_id, + status: $status, + comparison_outcome: $comparison_outcome, + generated_at: $generated_at, + same_corpus: { + sdk_result_artifact: $sdk_result_artifact, + sdk_get_all_check_status: $sdk_get_all_status, + sdk_history_filters: { + user_id: "elf-history-user", + agent_id: "elf-history-agent", + run_id: "elf-project" + }, + sdk_get_all_is_ui_export_evidence: false + }, + openmemory_surface: { + tree_present: $openmemory_tree_present, + ui_package_present: $ui_package_present, + compose_file_present: $compose_present, + export_script_present: $export_script_present, + sunsetting_notice_present: $sunsetting_notice_present, + requires_openai_api_key: $requires_api_key, + requires_docker_compose: $requires_docker_compose, + export_requires_running_container: $export_requires_running_container, + default_export_container: $export_container + }, + attempt: { + command: $command, + exit_code: $exit_code, + log_artifact: $log_artifact, + output_excerpt: $output_excerpt + }, + export_validation: $validation, + classification: { + status: $status, + reason_code: $reason_code, + reason: $reason, + next_action: $next_action + }, + claim_boundary: { + hosted_platform_claim: false, + optional_graph_memory_enabled: false, + sdk_get_all_is_ui_export_evidence: false + } + }' >"${out_path}" + + jq \ + --arg status "${status}" \ + --arg artifact "tmp/live-baseline/mem0-openmemory-ui-export.json" \ + '.behaviors.openmemory_ui_export.status = $status + | 
.behaviors.openmemory_ui_export.surface = + ("bounded OpenMemory export-helper setup probe recorded at " + $artifact + "; SDK get_all remains separate")' \ + "${REPORT_DIR}/mem0-adapter.json" >"${REPORT_DIR}/mem0-adapter.json.tmp" + mv "${REPORT_DIR}/mem0-adapter.json.tmp" "${REPORT_DIR}/mem0-adapter.json" + { + echo "OpenMemory UI/export probe status: ${status}" + echo "Reason code: ${reason_code}" + echo "Next action: ${next_action}" + } >>"${log_path}" +} + if [[ ! -f "/.dockerenv" && "${ELF_BASELINE_ALLOW_HOST:-0}" != "1" ]]; then echo "Refusing to run live baseline benchmark outside Docker. Use cargo make baseline-live-docker." >&2 exit 1 @@ -2039,6 +2291,7 @@ project_mem0() { local repo="https://github.com/mem0ai/mem0.git" local log_path="${REPORT_DIR}/${project}.log" local result_path="${REPORT_DIR}/${project}-search.json" + local openmemory_probe_path="${REPORT_DIR}/${project}-openmemory-ui-export.json" local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-mem0.py" local home="${HOME_DIR}/${project}" local corpus_path @@ -2091,7 +2344,7 @@ project_mem0() { }, "openmemory_ui_export": { "status": "blocked", - "surface": "the Docker live-baseline runner does not launch the OpenMemory web UI or hosted Platform export flow" + "surface": "bounded export-helper setup probe writes tmp/live-baseline/mem0-openmemory-ui-export.json; SDK get_all remains separate" }, "scale_stress_profile": { "status": "incomplete", @@ -2730,6 +2983,7 @@ PY if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" fi + probe_mem0_openmemory_ui_export "${REPOS_DIR}/${project}" "${result_path}" "${openmemory_probe_path}" "${log_path}" if jq -e --argjson query_count "${QUERY_COUNT}" --argjson document_count "${DOCUMENT_COUNT}" ' .schema == "elf.live_baseline.mem0_result/v1" and .corpus.document_count == $document_count and @@ -2743,7 +2997,7 @@ PY else 
retrieval_status="retrieval_wrong_result" fi - json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "pip install -e . fastembed ollama; Memory.from_config; add/update/delete/history/get_all/search" + json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "pip install -e . fastembed ollama; Memory.from_config; add/update/delete/history/get_all/search; OpenMemory export probe" return fi json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "mem0 command completed, but did not produce a valid benchmark result" "${project}.log" "pip install -e . fastembed ollama; Memory.from_config; add infer=false; search"