diff --git a/docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md b/docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md new file mode 100644 index 00000000..6e3af93e --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md @@ -0,0 +1,264 @@ +# ELF/qmd Retrieval-Debug Profile - June 11, 2026 + +Goal: Compare the measured retrieval-debug evidence for ELF and qmd without turning +retrieval success into a broader memory-system win claim. +Read this when: You need to decide what ELF should learn from qmd's retrieval and +debug workflow. +Inputs: Fresh local runs of `cargo make real-world-memory-live-adapters` and +`ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make +baseline-live-docker` on commit `38c586d`. +Outputs: Retrieval pass data, stress-profile data, debug artifact comparison, claim +boundaries, and ELF iteration directions. + +## Executive Judgment + +ELF and qmd are tied on the measured retrieval correctness surfaces in this report. +Both pass the encoded real-world retrieval suite and both pass the 480-document +generated-public stress baseline. + +qmd remains the better retrieval-debug product reference because its CLI baseline +emits directly inspectable top-10 JSON results with files, line numbers, snippets, and +scores for every query. ELF emits stronger service and production-operation evidence, +including trace ids, backfill checkpoints, Qdrant rebuild proof, resource envelope, +and source-of-truth semantics, but the stress baseline report does not hydrate the full +candidate list behind each ELF trace. + +So the correct claims are: + +- ELF and qmd are tied on current encoded retrieval correctness. +- ELF is stronger on source-of-truth and production-style service lifecycle evidence. +- qmd is still the simpler local retrieval-debug reference. 
+- This report does not prove qmd rerank quality, ELF rerank quality, or expansion / + fusion superiority because the qmd real-world materializer and baseline use + `--no-rerank`, and no scored expansion/fusion/rerank debug suite exists yet. + +## Fresh Runs + +| Command | Result | Runtime | +| --- | --- | ---: | +| `cargo make real-world-memory-live-adapters` | pass | 116.76 seconds | +| `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | pass | 149.41 seconds | + +The stress baseline used the generated-public profile with 480 documents and 16 +queries. The live real-world adapter sweep used the checked-in real-world memory +fixtures. + +## Real-World Retrieval Suite + +Both adapters pass the same retrieval jobs: + +| Adapter | Retrieval jobs | Pass | Expected evidence | Matched evidence | Produced evidence | Mean score | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live service adapter | `5` | `5` | `6` | `6` | `6` | `1.000` | +| qmd live CLI adapter | `5` | `5` | `6` | `6` | `6` | `1.000` | + +The five retrieval jobs are: + +| Job | ELF | qmd | +| --- | --- | --- | +| `retrieval-alt-phrasing-001` | pass | pass | +| `retrieval-current-vs-obsolete-001` | pass | pass | +| `retrieval-distractor-heavy-001` | pass | pass | +| `retrieval-minimal-context-001` | pass | pass | +| `retrieval-multi-hop-routing-001` | pass | pass | + +Full live sweep context remains a non-pass for both systems: + +| Adapter | Jobs | Pass | Wrong result | Blocked | Not encoded | Mean score | Mean latency | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live service adapter | `38` | `18` | `5` | `2` | `13` | `0.525` | `5.823 ms` | +| qmd live CLI adapter | `38` | `18` | `5` | `2` | `13` | `0.512` | `705.877 ms` | + +Do not overread the latency row. 
The ELF adapter is a service-runtime path and the qmd +adapter is a CLI materialization path; the row is useful as observed harness evidence, +not as an apples-to-apples product latency benchmark. + +## Stress Baseline + +The stress baseline result: + +| Field | Value | +| --- | ---: | +| Profile | `stress` | +| Documents | `480` | +| Queries | `16` | +| Projects | `ELF,qmd` | +| Verdict | `pass` | +| Project statuses | `2/2 pass` | +| Full checks | `13/13 pass` | +| Wrong result | `0` | +| Lifecycle fail | `0` | +| Blocked | `0` | +| Not encoded | `0` | + +### ELF Stress Result + +| Metric | Value | +| --- | ---: | +| Project elapsed | `81 s` | +| Query pass | `16/16` | +| Mean query latency | `29.808 ms` | +| p95 query latency | `31.298 ms` | +| Backfill source count | `480` | +| Backfill completed count | `480` | +| Resume attempts | `2` | +| Completed before resume | `240` | +| Completed after resume | `480` | +| Duplicate source notes | `0` | +| Qdrant rebuild scope | encoded in the pass criteria | +| Resource envelope elapsed | `71.303 s` | +| RSS | `54,724 KB` | +| Postgres database bytes | `19,338,943` | +| Estimated input tokens | `27,023` | + +ELF passed nine checks: + +| Check | Status | +| --- | --- | +| `resumable_backfill_no_duplicates` | pass | +| `same_corpus_retrieval` | pass | +| `async_worker_indexing_e2e` | pass | +| `update_replaces_note_text` | pass | +| `delete_suppresses_retrieval` | pass | +| `cold_start_recovery_search` | pass | +| `concurrent_write_search_e2e` | pass | +| `soak_stability_e2e` | pass | +| `resource_envelope` | pass | + +Every ELF stress query returned the expected evidence as the top evidence id. 
+ +### qmd Stress Result + +| Metric | Value | +| --- | ---: | +| qmd commit | `636602409c862db077f38d9006df7f0bdca17ff3` | +| Project elapsed | `66 s` | +| Same-corpus query pass | `16/16` | +| Expected doc top-1 | `16/16` | +| Mean expected-doc rank | `1.000` | +| Mean distractors in top-10 | `7.938` | +| Lifecycle checks | `4/4 pass` | + +qmd passed four checks: + +| Check | Status | Evidence | +| --- | --- | --- | +| `same_corpus_retrieval` | pass | 16/16 queries matched expected evidence. | +| `update_replaces_note_text` | pass | updated marker `kid-v4` was found and old marker was absent. | +| `delete_suppresses_retrieval` | pass | deleted `deploy-memory.md` no longer matched. | +| `cold_start_recovery_search` | pass | fresh qmd query process retrieved persisted `database-memory.md`. | + +The qmd baseline report keeps per-query top-10 JSON results. This is the most concrete +measured qmd debug advantage in this report: an operator can inspect matched files, +scores, line numbers, snippets, and distractor density directly from the artifact. 
+ +### Per-Query Stress Observations + +| Query | ELF matched top evidence | ELF latency | qmd expected rank | qmd top-10 distractors | +| --- | --- | ---: | ---: | ---: | +| `q-auth` | yes | `30.571 ms` | `1` | `6` | +| `q-auth-alt` | yes | `30.501 ms` | `1` | `7` | +| `q-database` | yes | `30.534 ms` | `1` | `8` | +| `q-database-alt` | yes | `31.281 ms` | `1` | `8` | +| `q-deploy` | yes | `29.958 ms` | `1` | `9` | +| `q-deploy-alt` | yes | `31.298 ms` | `1` | `8` | +| `q-retention` | yes | `30.434 ms` | `1` | `8` | +| `q-retention-alt` | yes | `29.194 ms` | `1` | `9` | +| `q-incident` | yes | `30.839 ms` | `1` | `7` | +| `q-incident-alt` | yes | `28.700 ms` | `1` | `9` | +| `q-billing` | yes | `30.092 ms` | `1` | `7` | +| `q-billing-alt` | yes | `28.855 ms` | `1` | `9` | +| `q-search` | yes | `29.480 ms` | `1` | `8` | +| `q-search-alt` | yes | `28.642 ms` | `1` | `7` | +| `q-recovery` | yes | `28.357 ms` | `1` | `8` | +| `q-recovery-alt` | yes | `28.188 ms` | `1` | `9` | + +## Debug Artifact Comparison + +| Debug surface | ELF evidence | qmd evidence | Current judgment | +| --- | --- | --- | --- | +| Per-query pass/fail | yes | yes | tied | +| Top expected evidence | yes, top evidence id per query | yes, expected file rank per query | tied on stress profile | +| Candidate list in report | partial: trace id, top snippet, returned count | yes: top-10 file, line, score, snippet | qmd stronger in the checked-in report artifact | +| Trace/replay surface | service trace ids exist | CLI command replay is explicit | different strengths; not directly scored | +| Update/delete/cold-start | yes, service lifecycle checks | yes, collection lifecycle checks | tied on encoded lifecycle correctness | +| Backfill/rebuild/resource envelope | yes | not represented in qmd baseline | ELF stronger | +| Rerank evidence | not scored here | not scored here; qmd path uses `--no-rerank` | non-claim | +| Expansion/fusion evidence | not scored here | structured `lex:` plus `vec:` query is 
used, but fusion internals are not scored | non-claim | +| Operator-debugging UX suite | live `not_encoded` | live `not_encoded` | non-claim | + +## What ELF Should Learn From qmd + +1. Put the ranked candidate list in the default benchmark artifact. + - The qmd artifact makes the top-10 result set immediately visible. + - ELF has trace ids, but a reader still needs another trace-hydration step to see + the candidate list and dropped/demoted candidates. + +2. Make replay commands short and local. + - qmd's measured surface is `collection add`, `update`, `embed -f`, and + `query --json`. + - ELF should keep service correctness, but benchmark reports should also emit a + concise replay command for each failed or suspicious query. + +3. Score distractor density and candidate-drop behavior. + - qmd returned the expected doc at rank 1 for every stress query, while still + returning an average of 7.938 distractor documents in the top 10. + - ELF should expose equivalent candidate-density metrics from trace candidates so + the report can distinguish "correct top result" from "clean ranked context." + +4. Separate retrieval correctness from retrieval-debug ergonomics. + - Correctness is currently tied on encoded retrieval jobs. + - Ergonomics cannot be called tied until ELF produces qmd-like immediate debug + artifacts and operator-debugging jobs are actually scored for both systems. + +## Claim Boundaries + +Allowed claims: + +- ELF and qmd both pass the encoded real-world retrieval suite. +- ELF and qmd both pass the 480-document generated-public stress same-corpus + retrieval profile. +- qmd provides stronger directly inspectable top-10 query artifacts in the current + stress baseline report. +- ELF provides stronger service lifecycle, backfill, rebuild, resource, and + source-of-truth evidence in the same stress baseline. + +Not allowed yet: + +- ELF beats qmd retrieval overall. +- qmd beats ELF as a memory system overall. +- Either system has a full live real-world suite pass. 
+- Either system has measured rerank superiority from this report. +- Either system has measured expansion/fusion superiority from this report. +- qmd operator-debugging UX is proven by the live real-world suite; it is still + `not_encoded`. + +## Next Measurement Work + +The next report should close the remaining retrieval-debug gaps before making stronger +claims: + +1. Hydrate ELF trace candidates into the stress report. + - Include kept, dropped, demoted, sparse/dense, final rank, and snippet fields. + +2. Add qmd query latency and candidate-density aggregates to the project summary. + - The raw qmd top-10 rows exist, but the summary currently lacks query latency and + candidate-density counters. + +3. Add a rerank-on qmd profile or explicitly keep qmd rerank as unmeasured. + - Current qmd materialization uses `--no-rerank`. + +4. Add a scored operator-debugging retrieval job for both systems. + - The job should ask why a result was wrong or why a distractor appeared, not only + whether the top result was correct. + +5. Add an expansion/fusion trace profile. + - Score lex-only, vec-only, hybrid, fusion, and final ranking stages separately. + +## Bottom Line + +This profile strengthens the evidence base but does not close the competitiveness +goal. Retrieval correctness is currently tied between ELF and qmd on encoded surfaces. +ELF's next useful iteration direction is not "more retrieval" in the abstract; it is +qmd-level immediate retrieval debugging while preserving ELF's stronger +source-of-truth, trace, backfill, and production-operation model. diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index fd2569df..81e90780 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -58,6 +58,9 @@ cleanup, use `docs/guide/single_user_production.md`. 
current measured ELF/qmd data, fixture evidence, external adapter ledger coverage, scenario non-claims, and the next measurement reports needed before stronger competitor claims. +- `2026-06-11-elf-qmd-retrieval-debug-profile.md`: fresh ELF/qmd retrieval-debug + profile with real-world retrieval-suite evidence, 480-document stress baseline + evidence, qmd top-10 artifact inspection, and explicit rerank/fusion non-claims. - `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world agent memory benchmark contract, including suite taxonomy, typed report states, knowledge-compilation fixture tasks, and the production-ops fixture target. diff --git a/docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json b/docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json new file mode 100644 index 00000000..fed5fed9 --- /dev/null +++ b/docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json @@ -0,0 +1,154 @@ +{ + "schema": "elf.retrieval_debug_profile_report/v1", + "run_id": "2026-06-11-elf-qmd-retrieval-debug-profile", + "commit": "38c586d", + "created_at": "2026-06-11", + "scope": "ELF versus qmd retrieval correctness, stress same-corpus behavior, and retrieval-debug artifact comparison", + "commands": [ + { + "command": "cargo make real-world-memory-live-adapters", + "status": "pass", + "runtime_seconds": 116.76, + "artifact": "tmp/real-world-memory/live-adapters/" + }, + { + "command": "ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "status": "pass", + "runtime_seconds": 149.41, + "artifact": "tmp/live-baseline/live-baseline-report.json" + } + ], + "live_real_world_retrieval": { + "elf": { + "jobs": 5, + "pass": 5, + "expected_evidence": 6, + "matched_evidence": 6, + "produced_evidence": 6, + "mean_score": 1.0 + }, + "qmd": { + "jobs": 5, + "pass": 5, + "expected_evidence": 6, + "matched_evidence": 6, + "produced_evidence": 6, + "mean_score": 1.0 + } + }, + 
"live_real_world_full_sweep_context": { + "elf": { + "job_count": 38, + "pass": 18, + "wrong_result": 5, + "blocked": 2, + "not_encoded": 13, + "mean_score": 0.525, + "mean_latency_ms": 5.823 + }, + "qmd": { + "job_count": 38, + "pass": 18, + "wrong_result": 5, + "blocked": 2, + "not_encoded": 13, + "mean_score": 0.512, + "mean_latency_ms": 705.877 + } + }, + "stress_baseline": { + "profile": "stress", + "document_count": 480, + "query_count": 16, + "verdict": "pass", + "summary": { + "projects": 2, + "pass": 2, + "fail": 0, + "full_checks": 13, + "full_checks_pass": 13 + }, + "elf": { + "head": "38c586d49167d2e4118c921765c11fbec0a60af9", + "status": "pass", + "retrieval_status": "retrieval_pass", + "elapsed_seconds": 81, + "query_pass": 16, + "query_total": 16, + "expected_top1": 16, + "latency_ms_mean": 29.80780025, + "latency_ms_p95": 31.298164, + "backfill_source_count": 480, + "backfill_completed_count": 480, + "resume_attempts": 2, + "duplicate_source_notes": 0, + "resource_elapsed_seconds": 71.303126711, + "rss_kb": 54724, + "estimated_input_tokens": 27023, + "checks": [ + "resumable_backfill_no_duplicates", + "same_corpus_retrieval", + "async_worker_indexing_e2e", + "update_replaces_note_text", + "delete_suppresses_retrieval", + "cold_start_recovery_search", + "concurrent_write_search_e2e", + "soak_stability_e2e", + "resource_envelope" + ] + }, + "qmd": { + "head": "636602409c862db077f38d9006df7f0bdca17ff3", + "status": "pass", + "retrieval_status": "retrieval_pass", + "elapsed_seconds": 66, + "query_pass": 16, + "query_total": 16, + "expected_top1": 16, + "mean_expected_rank": 1.0, + "mean_distractors_in_top10": 7.9375, + "checks": [ + "same_corpus_retrieval", + "update_replaces_note_text", + "delete_suppresses_retrieval", + "cold_start_recovery_search" + ] + }, + "per_query": [ + {"id": "q-auth", "elf_matched_top_evidence": true, "elf_latency_ms": 30.57141, "qmd_expected_rank": 1, "qmd_top10_distractors": 6}, + {"id": "q-auth-alt", 
"elf_matched_top_evidence": true, "elf_latency_ms": 30.500951, "qmd_expected_rank": 1, "qmd_top10_distractors": 7}, + {"id": "q-database", "elf_matched_top_evidence": true, "elf_latency_ms": 30.533742, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-database-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 31.280581, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-deploy", "elf_matched_top_evidence": true, "elf_latency_ms": 29.958447, "qmd_expected_rank": 1, "qmd_top10_distractors": 9}, + {"id": "q-deploy-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 31.298164, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-retention", "elf_matched_top_evidence": true, "elf_latency_ms": 30.433992, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-retention-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 29.1944, "qmd_expected_rank": 1, "qmd_top10_distractors": 9}, + {"id": "q-incident", "elf_matched_top_evidence": true, "elf_latency_ms": 30.838953, "qmd_expected_rank": 1, "qmd_top10_distractors": 7}, + {"id": "q-incident-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 28.700106, "qmd_expected_rank": 1, "qmd_top10_distractors": 9}, + {"id": "q-billing", "elf_matched_top_evidence": true, "elf_latency_ms": 30.092115, "qmd_expected_rank": 1, "qmd_top10_distractors": 7}, + {"id": "q-billing-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 28.855273, "qmd_expected_rank": 1, "qmd_top10_distractors": 9}, + {"id": "q-search", "elf_matched_top_evidence": true, "elf_latency_ms": 29.479694, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-search-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 28.641688, "qmd_expected_rank": 1, "qmd_top10_distractors": 7}, + {"id": "q-recovery", "elf_matched_top_evidence": true, "elf_latency_ms": 28.357061, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-recovery-alt", "elf_matched_top_evidence": true, 
"elf_latency_ms": 28.188227, "qmd_expected_rank": 1, "qmd_top10_distractors": 9} + ] + }, + "debug_artifact_judgment": { + "retrieval_correctness": "tie_on_encoded_surfaces", + "qmd_advantage": "direct_top10_json_results_with_file_line_score_snippet_and_distractor_visibility", + "elf_advantage": "service_lifecycle_backfill_qdrant_rebuild_resource_envelope_source_of_truth_and_trace_ids", + "unmeasured": [ + "qmd_rerank_quality", + "elf_rerank_quality", + "expansion_fusion_stage_quality", + "operator_debugging_ux_live_suite" + ] + }, + "next_measurement_work": [ + "hydrate ELF trace candidates into stress reports", + "add qmd query latency and candidate-density aggregates", + "add rerank-on qmd profile or keep rerank as unmeasured", + "add scored operator-debugging retrieval jobs for both systems", + "add expansion/fusion trace profile" + ] +}