Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,41 @@ args = [
]


# Live external baseline benchmark
# | task | type | cwd |
# | -------------------------- | ------- | --- |
# | baseline-live-docker | command | |
# | baseline-live-report | command | |
# | baseline-live-docker-clean | command | |

# Runs the `baseline-runner` service from docker-compose.baseline.yml.
# Before invoking docker compose, the script exports ELF_BASELINE_ELF_HEAD as
# the current `git rev-parse HEAD` value, with "+dirty" appended when
# `git status --porcelain` prints anything.
[tasks.baseline-live-docker]
workspace = false
command = "bash"
args = [
  "-lc",
  # The line-ending backslashes fold the lines below into the single-line
  # script that bash receives; the decoded string contains no newlines.
  """\
  set -euo pipefail; \
  head="$(git rev-parse HEAD)"; \
  if [ -n "$(git status --porcelain)" ]; then head="$head+dirty"; fi; \
  export ELF_BASELINE_ELF_HEAD="$head"; \
  docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner""",
]

# Runs scripts/live-baseline-report-to-md.sh with bash.
[tasks.baseline-live-report]
workspace = false
command = "bash"
args = ["scripts/live-baseline-report-to-md.sh"]

# Tears down the compose project described by docker-compose.baseline.yml.
# Per the `docker compose down` reference, `-v` also removes the project's
# volumes and `--remove-orphans` removes containers for services that are not
# defined in the compose file.
[tasks.baseline-live-docker-clean]
workspace = false
command = "docker"
args = ["compose", "-f", "docker-compose.baseline.yml", "down", "-v", "--remove-orphans"]


# Meta
# | task | type | cwd |
# | ------ | --------- | --- |
Expand Down
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,29 @@ flowchart TB

## Comparison

### Checked-In Live Benchmark Snapshot

The June 9, 2026 Docker-only live baseline uses the same generated corpus and query
manifest across ELF and the external memory projects below. ELF was run with the
production embedding provider path, `Qwen3-Embedding-8B`, and 4096-dimensional
embeddings.

- ELF production-provider stress run: 480 documents, 16 queries, `8/8` encoded checks,
`retrieval_pass`, and `pass` in 1163 seconds.
- All-project smoke run: ELF and qmd passed every encoded check. agentmemory passed
same-corpus retrieval but failed or could not complete lifecycle checks. mem0,
memsearch, and claude-mem returned wrong same-corpus retrieval results in the encoded
smoke run. OpenViking was `incomplete` because its local embedding dependency could not
complete in the Docker runner.
- The benchmark runner and report publisher are checked in and Docker-isolated:
`cargo make baseline-live-docker`, `cargo make baseline-live-report`, and
`cargo make baseline-live-docker-clean`.

Detailed evidence and interpretation:

- [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md)
- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)

Quick comparison snapshot (objective/high-level).
This table compares capability coverage, not overall project quality.

Expand Down Expand Up @@ -153,6 +176,8 @@ Project signature strengths (what each does especially well):

Detailed comparison, mechanism-level analysis, and source map:

- [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md)
- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
- [Detailed External Comparison](docs/guide/research/comparison_external_projects.md)
- [Research Projects Inventory](docs/guide/research/research_projects_inventory.md)
- [Agent Memory Selection Research Run](docs/research/2026-06-08-agent-memory-selection.json)
Expand All @@ -163,6 +188,7 @@ Latest external research refresh: June 8, 2026.

- Start here: `docs/index.md`
- Operational guide index: `docs/guide/index.md`
- Benchmarking guides and reports: `docs/guide/benchmarking/index.md`
- Research index: `docs/guide/research/index.md`
- Specifications: `docs/spec/index.md`
- System contract: `docs/spec/system_elf_memory_service_v2.md`
Expand Down
12 changes: 8 additions & 4 deletions apps/elf-eval/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ name = "elf-eval"
version = "0.2.0"

[dependencies]
blake3 = { workspace = true }
clap = { workspace = true }
color-eyre = { workspace = true }
serde = { workspace = true }
Expand All @@ -17,10 +18,13 @@ tracing = { workspace = true }
tracing-subscriber = { workspace = true }
uuid = { workspace = true }

elf-cli = { workspace = true }
elf-config = { workspace = true }
elf-service = { workspace = true }
elf-storage = { workspace = true }
elf-chunking = { workspace = true }
elf-cli = { workspace = true }
elf-config = { workspace = true }
elf-service = { workspace = true }
elf-storage = { workspace = true }
elf-testkit = { workspace = true }
elf-worker = { workspace = true }

[build-dependencies]
vergen-gitcl = { workspace = true }
Loading