diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index 28ac002b..b448c8c7 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -109,7 +109,7 @@ jobs: - name: Run context misranking harness run: | mkdir -p tmp - cargo make e2e + cargo make test-e2e - name: Upload harness outputs if: always() diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 31adcc87..0e409287 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -91,4 +91,4 @@ jobs: exit 1 - name: Run integration tests - run: cargo make test-all + run: cargo make test-rust-all diff --git a/.github/workflows/language.yml b/.github/workflows/language.yml index 7fd3cdcb..6385bd46 100644 --- a/.github/workflows/language.yml +++ b/.github/workflows/language.yml @@ -30,8 +30,8 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} jobs: - rust: - name: Rust checks + repo: + name: Repository checks runs-on: ubuntu-latest steps: - name: Fetch latest code @@ -72,37 +72,10 @@ jobs: with: tool: nextest - - name: Run lint - run: cargo make lint - - - name: Run Rust format checks - run: cargo make fmt-rust-check - - - name: Run tests - run: cargo make test-rust - - toml: - name: TOML checks - runs-on: ubuntu-latest - steps: - - name: Fetch latest code - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 - - - name: Set up Rust toolchain - uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 - with: - cache: true - rustflags: '' - - - name: Install cargo-make - uses: taiki-e/install-action@15449e3094499af05d8d964a1c884208e4b8b595 - with: - tool: cargo-make - - name: Install taplo uses: taiki-e/install-action@15449e3094499af05d8d964a1c884208e4b8b595 with: tool: taplo - - name: Run TOML format checks - run: cargo make fmt-toml-check + - name: Run repository checks + run: cargo make check diff --git a/.github/workflows/quality.yml 
b/.github/workflows/quality.yml index 745a0c1e..210114fb 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -59,6 +59,11 @@ jobs: cache: true rustflags: '' + - name: Install cargo-make + uses: taiki-e/install-action@15449e3094499af05d8d964a1c884208e4b8b595 + with: + tool: cargo-make + - name: Install Postgres client run: | sudo apt-get update @@ -73,39 +78,8 @@ jobs: echo "Postgres did not become ready in time." exit 1 - - name: Create schema - run: | - python3 - <<'PY' > tmp.schema.sql - from pathlib import Path - - vector_dim = 4 - root = Path(".") - sql_dir = root / "sql" - - out = [] - for raw_line in (sql_dir / "init.sql").read_text(encoding="utf-8").splitlines(): - line = raw_line.strip() - if line.startswith(r"\ir "): - rel = line[len(r"\ir ") :].strip() - out.append((sql_dir / rel).read_text(encoding="utf-8")) - else: - out.append(raw_line) - - expanded = "\n".join(out) + "\n" - print(expanded.replace("", str(vector_dim)), end="") - PY - - psql "${PG_DSN}" -v ON_ERROR_STOP=1 -f tmp.schema.sql - - - name: Load trace gate fixture - run: psql "${PG_DSN}" -v ON_ERROR_STOP=1 -f .github/fixtures/trace_gate/fixture.sql - - name: Run trace regression gate - run: | - cargo run -p elf-eval --bin trace_regression_gate -- \ - --config .github/fixtures/trace_gate/config.toml \ - --gate .github/fixtures/trace_gate/gate.json \ - --out trace_gate.report.json + run: TRACE_GATE_REPORT_PATH=trace_gate.report.json cargo make check-trace-gate - name: Upload trace gate report if: always() diff --git a/Makefile.toml b/Makefile.toml index 7513eb0d..02654763 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -1,272 +1,144 @@ # Rust workspace tasks. 
-# Lint -# | task | type | cwd | -# | ------------- | --------- | --- | -# | lint | composite | | -# | lint-fix | composite | | -# | lint-rust | command | | -# | lint-fix-rust | extend | | -# | lint-vstyle | command | | -# | lint-fix-vstyle | command | | - -[tasks.lint] -workspace = false -dependencies = [ - "lint-rust", - "lint-vstyle", -] - -[tasks.lint-fix] -workspace = false -dependencies = [ - "lint-fix-rust", - "lint-fix-vstyle", -] +# Benchmark +# | task | type | cwd | +# | ------------------------------------------ | --------- | --- | +# | baseline-backfill-100k-docker | command | | +# | baseline-backfill-10k-docker | command | | +# | baseline-backfill-docker | command | | +# | baseline-live-docker | command | | +# | baseline-live-report | command | | +# | baseline-production-private | command | | +# | baseline-production-private-addendum | command | | +# | baseline-production-synthetic | command | | +# | baseline-soak-docker | command | | +# | openmemory-ui-export-readback | command | | +# | parity-docker | command | | +# | real-world-first-generation-oss | composite | | +# | real-world-first-generation-oss-json | command | | +# | real-world-first-generation-oss-report | command | | +# | real-world-job-operator-ux | composite | | +# | real-world-job-operator-ux-json | command | | +# | real-world-job-operator-ux-live-adapters | command | | +# | real-world-job-operator-ux-report | command | | +# | real-world-memory | composite | | +# | real-world-memory-consolidation | composite | | +# | real-world-memory-consolidation-json | command | | +# | real-world-memory-consolidation-report | command | | +# | real-world-memory-core-archival | composite | | +# | real-world-memory-core-archival-json | command | | +# | real-world-memory-core-archival-report | command | | +# | real-world-memory-evolution | composite | | +# | real-world-memory-evolution-json | command | | +# | real-world-memory-evolution-report | command | | +# | real-world-memory-graph-rag | composite | | 
+# | real-world-memory-graph-rag-json | command | | +# | real-world-memory-graph-rag-report | command | | +# | real-world-memory-json | command | | +# | real-world-memory-knowledge | composite | | +# | real-world-memory-knowledge-json | command | | +# | real-world-memory-knowledge-report | command | | +# | real-world-memory-live-adapters | command | | +# | real-world-memory-live-consolidation | command | | +# | real-world-memory-proactive-brief | composite | | +# | real-world-memory-proactive-brief-json | command | | +# | real-world-memory-proactive-brief-report | command | | +# | real-world-memory-production-ops | composite | | +# | real-world-memory-production-ops-json | command | | +# | real-world-memory-production-ops-report | command | | +# | real-world-memory-project-decisions | composite | | +# | real-world-memory-project-decisions-json | command | | +# | real-world-memory-project-decisions-report | command | | +# | real-world-memory-report | command | | +# | real-world-memory-retrieval | composite | | +# | real-world-memory-retrieval-json | command | | +# | real-world-memory-retrieval-report | command | | +# | real-world-memory-scheduled | composite | | +# | real-world-memory-scheduled-json | command | | +# | real-world-memory-scheduled-report | command | | +# | real-world-memory-summary | composite | | +# | real-world-memory-summary-json | command | | +# | real-world-memory-summary-report | command | | -[tasks.lint-rust] -workspace = false -command = "cargo" -args = [ - "clippy", - "--all-features", - "--all-targets", - "--workspace", - "--", - "-D", - "clippy::all", - "-D", - "clippy::too_many_lines", - "-D", - "clippy::unwrap_used", - "-D", - "clippy::use_self", - "-D", - "clippy::wildcard_imports", - "-D", - "missing-docs", - "-D", - "unused-crate-dependencies", - "-D", - "warnings", -] - -[tasks.lint-fix-rust] -extend = "lint-rust" -args = [ - "clippy", - "--fix", - "--allow-dirty", - "--all-features", - "--all-targets", - "--workspace", - "--", - 
"-D", - "clippy::all", - "-D", - "clippy::too_many_lines", - "-D", - "clippy::unwrap_used", - "-D", - "clippy::use_self", - "-D", - "clippy::wildcard_imports", - "-D", - "missing-docs", - "-D", - "unused-crate-dependencies", - "-D", - "warnings", -] - -[tasks.lint-vstyle] +[tasks.baseline-backfill-100k-docker] workspace = false -command = "cargo" +command = "bash" args = [ - "vstyle", - "curate", - "--language", - "rust", - "--workspace", - "--all-features" + "scripts/baseline-docker.sh", + "backfill-100k", ] -[tasks.lint-fix-vstyle] +[tasks.baseline-backfill-10k-docker] workspace = false -command = "cargo" +command = "bash" args = [ - "vstyle", - "tune", - "--language", - "rust", - "--workspace", - "--all-features", - "--strict", -] - - -# Test -# | task | type | cwd | -# | --------- | --------- | --- | -# | test | composite | | -# | test-rust | command | | -# | test-all | composite | | -# | test-rust-all | command | | -# | test-integration | composite | -# | test-integration-rust | command | - -[tasks.test] -workspace = false -dependencies = [ - "test-rust", + "scripts/baseline-docker.sh", + "backfill-10k", ] -[tasks.test-rust] +[tasks.baseline-backfill-docker] workspace = false -command = "cargo" +command = "bash" args = [ - "nextest", - "run", - "--workspace", - "--all-targets", - "--all-features", + "scripts/baseline-docker.sh", + "backfill", ] -[tasks.test-all] -workspace = false -dependencies = [ - "test-rust-all", -] - -[tasks.test-rust-all] +[tasks.baseline-live-docker] workspace = false -command = "cargo" +command = "bash" args = [ - "nextest", - "run", - "--workspace", - "--all-targets", - "--all-features", - "--run-ignored", - "all", + "scripts/baseline-docker.sh", + "live", ] -[tasks.test-integration] -workspace = false -dependencies = [ - "test-integration-rust", -] - -[tasks.test-integration-rust] +[tasks.baseline-live-report] workspace = false -command = "cargo" +command = "bash" args = [ - "nextest", - "run", - "--workspace", - "--all-targets", - 
"--all-features", - "--run-ignored", - "only", -] - - -# Format -# | task | type | cwd | -# | -------------- | --------- | --- | -# | fmt | composite | | -# | fmt-check | composite | | -# | fmt-rust | command | | -# | fmt-rust-check | extend | | -# | fmt-toml | command | | -# | fmt-toml-check | extend | | - -[tasks.fmt] -workspace = false -dependencies = [ - "fmt-rust", - "fmt-toml", -] - -[tasks.fmt-check] -workspace = false -dependencies = [ - "fmt-rust-check", - "fmt-toml-check", + "scripts/live-baseline-report-to-md.sh", ] -[tasks.fmt-rust] +[tasks.baseline-production-private] workspace = false -command = "rustup" +command = "bash" args = [ - "run", - "nightly", - "cargo", - "fmt", - "--all", + "scripts/baseline-docker.sh", + "production-private", ] -[tasks.fmt-rust-check] +[tasks.baseline-production-private-addendum] workspace = false -command = "rustup" +command = "bash" args = [ - "run", - "nightly", - "cargo", - "fmt", - "--all", - "--", - "--check", + "scripts/baseline-docker.sh", + "production-private-addendum", ] -[tasks.fmt-toml] +[tasks.baseline-production-synthetic] workspace = false -command = "taplo" -args = [ - "fmt", -] - -[tasks.fmt-toml-check] -extend = "fmt-toml" +command = "bash" args = [ - "fmt", - "--check", -] - -# E2E -# | task | type | cwd | -# | ------------------------------ | --------- | --- | -# | e2e | composite | | -# | e2e-context-misranking-harness | command | | -# | e2e-consolidation-harness | command | | - -[tasks.e2e] -workspace = false -dependencies = [ - "e2e-context-misranking-harness", + "scripts/baseline-docker.sh", + "production-synthetic", ] -[tasks.e2e-context-misranking-harness] +[tasks.baseline-soak-docker] workspace = false command = "bash" args = [ - "scripts/context-misranking-harness.sh", + "scripts/baseline-docker.sh", + "soak", ] -[tasks.e2e-consolidation-harness] +[tasks.openmemory-ui-export-readback] workspace = false command = "bash" args = [ - "scripts/consolidation-harness.sh", + 
"scripts/baseline-docker.sh", + "openmemory-ui-export-readback", ] - -# Competitive parity -# | task | type | cwd | -# | ------------------- | ------- | --- | -# | parity-docker | command | | -# | parity-docker-clean | command | | - [tasks.parity-docker] workspace = false command = "docker" @@ -280,179 +152,125 @@ args = [ "parity-runner", ] -[tasks.parity-docker-clean] -workspace = false -command = "docker" -args = [ - "compose", - "-f", - "docker-compose.parity.yml", - "down", - "-v", - "--remove-orphans", -] - - -# Live external baseline benchmark -# | task | type | cwd | -# | -------------------------- | ------- | --- | -# | baseline-live-docker | command | | -# | baseline-backfill-docker | command | | -# | baseline-live-report | command | | -# | baseline-live-docker-clean | command | | -# | baseline-production-synthetic | command | | -# | baseline-production-private | command | | -# | baseline-production-private-addendum | command | | -# | baseline-backfill-10k-docker | command | | -# | baseline-backfill-100k-docker | command | | -# | baseline-soak-docker | command | | -# | openmemory-ui-export-readback | command | | - -[tasks.baseline-live-docker] +[tasks.real-world-first-generation-oss] workspace = false -command = "bash" -args = [ - "-lc", - "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", +dependencies = [ + "real-world-first-generation-oss-report", ] -[tasks.baseline-backfill-docker] +[tasks.real-world-first-generation-oss-json] workspace = false -command = "bash" +command = "cargo" args = [ - "-lc", - "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; selected_projects=\"$(printenv ELF_BASELINE_PROJECTS || true)\"; if [ -z \"$selected_projects\" ]; then selected_projects=\"ELF\"; fi; 
selected_profile=\"$(printenv ELF_BASELINE_PROFILE || true)\"; if [ -z \"$selected_profile\" ]; then selected_profile=\"backfill\"; fi; backfill_docs=\"$(printenv ELF_BASELINE_BACKFILL_DOCS || true)\"; if [ -z \"$backfill_docs\" ]; then backfill_docs=\"2000\"; fi; elf_timeout=\"$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)\"; if [ -z \"$elf_timeout\" ]; then elf_timeout=\"3600\"; fi; max_elf_seconds=\"$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)\"; if [ -z \"$max_elf_seconds\" ]; then max_elf_seconds=\"3600\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"$selected_projects\"; export ELF_BASELINE_PROFILE=\"$selected_profile\"; export ELF_BASELINE_BACKFILL_DOCS=\"$backfill_docs\"; export ELF_BASELINE_ELF_TIMEOUT_SECONDS=\"$elf_timeout\"; export ELF_BASELINE_MAX_ELF_SECONDS=\"$max_elf_seconds\"; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss", + "--out", + "tmp/real-world-memory/first-generation-oss/report.json", + "--run-id", + "first-generation-oss-continuity-source-store", + "--adapter-id", + "fixture_first_generation_oss", + "--adapter-name", + "First-generation OSS fixture coverage", ] -[tasks.baseline-live-report] +[tasks.real-world-first-generation-oss-report] workspace = false -command = "bash" -args = [ - "scripts/live-baseline-report-to-md.sh", +dependencies = [ + "real-world-first-generation-oss-json", ] - -[tasks.baseline-live-docker-clean] -workspace = false -command = "docker" +command = "cargo" args = [ - "compose", - "-f", - "docker-compose.baseline.yml", - "down", - "-v", - "--remove-orphans", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/first-generation-oss/report.json", + "--out", + 
"tmp/real-world-memory/first-generation-oss/report.md", ] -[tasks.openmemory-ui-export-readback] +[tasks.real-world-job-operator-ux] workspace = false -command = "bash" -args = [ - "-lc", - "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=mem0; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", +dependencies = [ + "real-world-job-operator-ux-report", ] -[tasks.baseline-production-synthetic] +[tasks.real-world-job-operator-ux-json] workspace = false -command = "bash" +command = "cargo" args = [ - "-lc", - "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; selected_projects=\"$(printenv ELF_BASELINE_PROJECTS || true)\"; if [ -z \"$selected_projects\" ]; then selected_projects=\"ELF\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"$selected_projects\"; export ELF_BASELINE_PROFILE=production-synthetic; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux", + "--out", + "tmp/real-world-job/real-world-job-operator-ux-report.json", + "--run-id", + "real-world-job-operator-ux", + "--adapter-id", + "fixture_operator_ux", + "--adapter-name", + "ELF operator UX fixture", ] -[tasks.baseline-production-private] +[tasks.real-world-job-operator-ux-live-adapters] workspace = false command = "bash" args = [ - "-lc", - "set -euo pipefail; manifest=\"$(printenv ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST || true)\"; if [ -z \"$manifest\" ]; then echo \"ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is required for baseline-production-private\" >&2; exit 1; fi; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; 
then head=\"$head+dirty\"; fi; selected_projects=\"$(printenv ELF_BASELINE_PROJECTS || true)\"; if [ -z \"$selected_projects\" ]; then selected_projects=\"ELF\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"$selected_projects\"; export ELF_BASELINE_PROFILE=production-private; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", + "scripts/real-world-docker.sh", + "job-operator-ux-live-adapters", ] -[tasks.baseline-production-private-addendum] +[tasks.real-world-job-operator-ux-report] workspace = false -command = "bash" -args = [ - "-lc", - "set -euo pipefail; manifest=\"$(printenv ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST || true)\"; if [ -z \"$manifest\" ]; then echo \"ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is required for baseline-production-private-addendum\" >&2; exit 1; fi; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; selected_projects=\"$(printenv ELF_BASELINE_PROJECTS || true)\"; if [ -z \"$selected_projects\" ]; then selected_projects=\"ELF\"; fi; addendum=\"$(printenv ELF_BASELINE_PRIVATE_ADDENDUM || true)\"; if [ -z \"$addendum\" ]; then addendum=\"tmp/live-baseline/private-production-addendum.md\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"$selected_projects\"; export ELF_BASELINE_PROFILE=production-private; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner; ELF_BASELINE_MARKDOWN_REPORT=\"$addendum\" cargo make baseline-live-report; echo \"Private production addendum: $addendum\"", +dependencies = [ + "real-world-job-operator-ux-json", ] - -[tasks.baseline-backfill-10k-docker] -workspace = false -command = "bash" +command = "cargo" args = [ - "-lc", - "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; backfill_docs=\"$(printenv ELF_BASELINE_BACKFILL_DOCS || true)\"; if [ -z \"$backfill_docs\" ]; then 
backfill_docs=\"10000\"; fi; elf_timeout=\"$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)\"; if [ -z \"$elf_timeout\" ]; then elf_timeout=\"14400\"; fi; max_elf_seconds=\"$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)\"; if [ -z \"$max_elf_seconds\" ]; then max_elf_seconds=\"$elf_timeout\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=ELF; export ELF_BASELINE_PROFILE=backfill; export ELF_BASELINE_BACKFILL_DOCS=\"$backfill_docs\"; export ELF_BASELINE_ELF_TIMEOUT_SECONDS=\"$elf_timeout\"; export ELF_BASELINE_MAX_ELF_SECONDS=\"$max_elf_seconds\"; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-job/real-world-job-operator-ux-report.json", + "--out", + "tmp/real-world-job/real-world-job-operator-ux-report.md", ] -[tasks.baseline-backfill-100k-docker] +[tasks.real-world-memory] workspace = false -command = "bash" -args = [ - "-lc", - "set -euo pipefail; enabled=\"$(printenv ELF_BASELINE_ENABLE_EXPENSIVE || true)\"; if [ \"$enabled\" != \"1\" ]; then echo \"ELF_BASELINE_ENABLE_EXPENSIVE=1 is required for baseline-backfill-100k-docker\" >&2; exit 1; fi; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; backfill_docs=\"$(printenv ELF_BASELINE_BACKFILL_DOCS || true)\"; if [ -z \"$backfill_docs\" ]; then backfill_docs=\"100000\"; fi; elf_timeout=\"$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)\"; if [ -z \"$elf_timeout\" ]; then elf_timeout=\"86400\"; fi; max_elf_seconds=\"$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)\"; if [ -z \"$max_elf_seconds\" ]; then max_elf_seconds=\"$elf_timeout\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=ELF; export ELF_BASELINE_PROFILE=backfill; export ELF_BASELINE_BACKFILL_DOCS=\"$backfill_docs\"; export ELF_BASELINE_ELF_TIMEOUT_SECONDS=\"$elf_timeout\"; export 
ELF_BASELINE_MAX_ELF_SECONDS=\"$max_elf_seconds\"; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", +dependencies = [ + "real-world-memory-report", ] -[tasks.baseline-soak-docker] -workspace = false -command = "bash" -args = [ - "-lc", - "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; soak_seconds=\"$(printenv ELF_BASELINE_SOAK_SECONDS || true)\"; if [ -z \"$soak_seconds\" ]; then soak_seconds=\"3600\"; fi; elf_timeout=\"$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)\"; if [ -z \"$elf_timeout\" ]; then elf_timeout=\"$((soak_seconds + 1800))\"; fi; max_elf_seconds=\"$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)\"; if [ -z \"$max_elf_seconds\" ]; then max_elf_seconds=\"$elf_timeout\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=ELF; export ELF_BASELINE_PROFILE=stress; export ELF_BASELINE_SOAK_SECONDS=\"$soak_seconds\"; export ELF_BASELINE_ELF_TIMEOUT_SECONDS=\"$elf_timeout\"; export ELF_BASELINE_MAX_ELF_SECONDS=\"$max_elf_seconds\"; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", -] - - -# Real-world job benchmark smoke -# | task | type | cwd | -# | -------------------------------------- | --------- | --- | -# | real-world-job-smoke | composite | | -# | real-world-job-smoke-json | command | | -# | real-world-job-smoke-report | command | | -# | real-world-memory | composite | | -# | real-world-memory-json | command | | -# | real-world-memory-report | command | | -# | real-world-memory-project-decisions | composite | | -# | real-world-memory-project-decisions-json | command | | -# | real-world-memory-project-decisions-report | command | | -# | real-world-memory-evolution | composite | | -# | real-world-memory-evolution-json | command | | -# | real-world-memory-evolution-report | command | | -# | real-world-memory-consolidation | composite | | -# | real-world-memory-consolidation-json | command 
| | -# | real-world-memory-consolidation-report | command | | -# | real-world-memory-summary | composite | | -# | real-world-memory-summary-json | command | | -# | real-world-memory-summary-report | command | | -# | real-world-memory-proactive-brief | composite | | -# | real-world-memory-proactive-brief-json | command | | -# | real-world-memory-proactive-brief-report | command | | -# | real-world-memory-scheduled | composite | | -# | real-world-memory-scheduled-json | command | | -# | real-world-memory-scheduled-report | command | | -# | real-world-memory-live-consolidation | command | | -# | real-world-job-operator-ux | composite | | -# | real-world-job-operator-ux-json | command | | -# | real-world-job-operator-ux-report | command | | -# | real-world-job-operator-ux-live-adapters | command | | -# | real-world-memory-retrieval | composite | | -# | real-world-memory-retrieval-json | command | | -# | real-world-memory-retrieval-report | command | | -# | real-world-memory-production-ops | composite | | -# | real-world-memory-production-ops-json | command | | -# | real-world-memory-production-ops-report | command | | -# | real-world-memory-core-archival | composite | | -# | real-world-memory-core-archival-json | command | | -# | real-world-memory-core-archival-report | command | | -# | real-world-memory-graph-rag | composite | | -# | real-world-memory-graph-rag-json | command | | -# | real-world-memory-graph-rag-report | command | | -# | real-world-memory-live-adapters | command | | - -[tasks.real-world-job-smoke] +[tasks.real-world-memory-consolidation] workspace = false dependencies = [ - "real-world-job-smoke-report", + "real-world-memory-consolidation-report", ] -[tasks.real-world-job-smoke-json] +[tasks.real-world-memory-consolidation-json] workspace = false command = "cargo" args = [ @@ -464,15 +282,21 @@ args = [ "--", "run", "--fixtures", - "apps/elf-eval/fixtures/real_world_memory/work_resume", + "apps/elf-eval/fixtures/real_world_memory/consolidation", 
"--out", - "tmp/real-world-job/real-world-job-smoke-report.json", + "tmp/real-world-memory/consolidation/report.json", + "--run-id", + "real-world-memory-consolidation", + "--adapter-id", + "fixture_consolidation", + "--adapter-name", + "ELF consolidation fixture", ] -[tasks.real-world-job-smoke-report] +[tasks.real-world-memory-consolidation-report] workspace = false dependencies = [ - "real-world-job-smoke-json", + "real-world-memory-consolidation-json", ] command = "cargo" args = [ @@ -484,18 +308,18 @@ args = [ "--", "publish", "--report", - "tmp/real-world-job/real-world-job-smoke-report.json", + "tmp/real-world-memory/consolidation/report.json", "--out", - "tmp/real-world-job/real-world-job-smoke-report.md", + "tmp/real-world-memory/consolidation/report.md", ] -[tasks.real-world-memory] +[tasks.real-world-memory-core-archival] workspace = false dependencies = [ - "real-world-memory-report", + "real-world-memory-core-archival-report", ] -[tasks.real-world-memory-json] +[tasks.real-world-memory-core-archival-json] workspace = false command = "cargo" args = [ @@ -507,21 +331,21 @@ args = [ "--", "run", "--fixtures", - "apps/elf-eval/fixtures/real_world_memory", + "apps/elf-eval/fixtures/real_world_memory/core_archival_memory", "--out", - "tmp/real-world-memory/real-world-memory-report.json", + "tmp/real-world-memory/core-archival/report.json", "--run-id", - "real-world-memory", + "real-world-memory-core-archival", "--adapter-id", - "elf_real_world_memory_fixture", + "fixture_core_archival_memory", "--adapter-name", - "ELF real-world memory fixture", + "ELF core and archival memory fixture", ] -[tasks.real-world-memory-report] +[tasks.real-world-memory-core-archival-report] workspace = false dependencies = [ - "real-world-memory-json", + "real-world-memory-core-archival-json", ] command = "cargo" args = [ @@ -533,18 +357,18 @@ args = [ "--", "publish", "--report", - "tmp/real-world-memory/real-world-memory-report.json", + 
"tmp/real-world-memory/core-archival/report.json", "--out", - "tmp/real-world-memory/real-world-memory-report.md", + "tmp/real-world-memory/core-archival/report.md", ] -[tasks.real-world-memory-project-decisions] +[tasks.real-world-memory-evolution] workspace = false dependencies = [ - "real-world-memory-project-decisions-report", + "real-world-memory-evolution-report", ] -[tasks.real-world-memory-project-decisions-json] +[tasks.real-world-memory-evolution-json] workspace = false command = "cargo" args = [ @@ -556,21 +380,21 @@ args = [ "--", "run", "--fixtures", - "apps/elf-eval/fixtures/real_world_memory/project_decisions", + "apps/elf-eval/fixtures/real_world_memory/evolution", "--out", - "tmp/real-world-memory/project-decisions/report.json", + "tmp/real-world-memory/evolution-report.json", "--run-id", - "real-world-memory-project-decisions", + "real-world-memory-evolution", "--adapter-id", - "fixture_project_decisions", + "fixture_memory_evolution", "--adapter-name", - "ELF project decision fixture", + "ELF fixture memory evolution", ] -[tasks.real-world-memory-project-decisions-report] +[tasks.real-world-memory-evolution-report] workspace = false dependencies = [ - "real-world-memory-project-decisions-json", + "real-world-memory-evolution-json", ] command = "cargo" args = [ @@ -582,18 +406,18 @@ args = [ "--", "publish", "--report", - "tmp/real-world-memory/project-decisions/report.json", + "tmp/real-world-memory/evolution-report.json", "--out", - "tmp/real-world-memory/project-decisions/report.md", + "tmp/real-world-memory/evolution-report.md", ] -[tasks.real-world-memory-evolution] +[tasks.real-world-memory-graph-rag] workspace = false dependencies = [ - "real-world-memory-evolution-report", + "real-world-memory-graph-rag-report", ] -[tasks.real-world-memory-evolution-json] +[tasks.real-world-memory-graph-rag-json] workspace = false command = "cargo" args = [ @@ -605,21 +429,21 @@ args = [ "--", "run", "--fixtures", - 
"apps/elf-eval/fixtures/real_world_memory/evolution", + "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag", "--out", - "tmp/real-world-memory/evolution-report.json", + "tmp/real-world-memory/graph-rag/report.json", "--run-id", - "real-world-memory-evolution", + "real-world-memory-graph-rag", "--adapter-id", - "fixture_memory_evolution", + "fixture_graph_rag_external_adapters", "--adapter-name", - "ELF fixture memory evolution", + "Graph/RAG representative external-adapter fixtures", ] -[tasks.real-world-memory-evolution-report] +[tasks.real-world-memory-graph-rag-report] workspace = false dependencies = [ - "real-world-memory-evolution-json", + "real-world-memory-graph-rag-json", ] command = "cargo" args = [ @@ -631,18 +455,41 @@ args = [ "--", "publish", "--report", - "tmp/real-world-memory/evolution-report.json", + "tmp/real-world-memory/graph-rag/report.json", "--out", - "tmp/real-world-memory/evolution-report.md", + "tmp/real-world-memory/graph-rag/report.md", ] -[tasks.real-world-job-operator-ux] +[tasks.real-world-memory-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory", + "--out", + "tmp/real-world-memory/real-world-memory-report.json", + "--run-id", + "real-world-memory", + "--adapter-id", + "elf_real_world_memory_fixture", + "--adapter-name", + "ELF real-world memory fixture", +] + +[tasks.real-world-memory-knowledge] workspace = false dependencies = [ - "real-world-job-operator-ux-report", + "real-world-memory-knowledge-report", ] -[tasks.real-world-job-operator-ux-json] +[tasks.real-world-memory-knowledge-json] workspace = false command = "cargo" args = [ @@ -654,21 +501,21 @@ args = [ "--", "run", "--fixtures", - "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux", + "apps/elf-eval/fixtures/real_world_memory/knowledge", "--out", - 
"tmp/real-world-job/real-world-job-operator-ux-report.json", + "tmp/real-world-memory/knowledge-report.json", "--run-id", - "real-world-job-operator-ux", + "real-world-memory-knowledge", "--adapter-id", - "fixture_operator_ux", + "fixture_knowledge", "--adapter-name", - "ELF operator UX fixture", + "ELF knowledge fixture", ] -[tasks.real-world-job-operator-ux-report] +[tasks.real-world-memory-knowledge-report] workspace = false dependencies = [ - "real-world-job-operator-ux-json", + "real-world-memory-knowledge-json", ] command = "cargo" args = [ @@ -680,26 +527,34 @@ args = [ "--", "publish", "--report", - "tmp/real-world-job/real-world-job-operator-ux-report.json", + "tmp/real-world-memory/knowledge-report.json", "--out", - "tmp/real-world-job/real-world-job-operator-ux-report.md", + "tmp/real-world-memory/knowledge-report.md", ] -[tasks.real-world-job-operator-ux-live-adapters] +[tasks.real-world-memory-live-adapters] workspace = false command = "bash" args = [ - "-lc", - "docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_OPERATOR_DEBUG_LIVE_REPORT_DIR -e ELF_OPERATOR_DEBUG_LIVE_FIXTURES -e ELF_OPERATOR_DEBUG_LIVE_WORK_DIR -e ELF_OPERATOR_DEBUG_QMD_DIR baseline-runner bash scripts/real-world-operator-debug-live-adapters.sh", + "scripts/real-world-docker.sh", + "memory-live-adapters", ] -[tasks.real-world-memory-retrieval] +[tasks.real-world-memory-live-consolidation] +workspace = false +command = "bash" +args = [ + "scripts/real-world-docker.sh", + "memory-live-consolidation", +] + +[tasks.real-world-memory-proactive-brief] workspace = false dependencies = [ - "real-world-memory-retrieval-report", + "real-world-memory-proactive-brief-report", ] -[tasks.real-world-memory-retrieval-json] +[tasks.real-world-memory-proactive-brief-json] workspace = false command = "cargo" args = [ @@ -711,21 +566,21 @@ args = [ "--", "run", "--fixtures", - "apps/elf-eval/fixtures/real_world_memory/retrieval", + 
"apps/elf-eval/fixtures/real_world_memory/proactive_brief", + "--out", + "tmp/real-world-memory/proactive-brief/report.json", "--run-id", - "real-world-memory-retrieval", + "real-world-memory-proactive-brief", "--adapter-id", - "fixture_retrieval", + "fixture_proactive_brief", "--adapter-name", - "ELF fixture retrieval cases", - "--out", - "tmp/real-world-memory/retrieval-report.json", + "ELF proactive brief fixture", ] -[tasks.real-world-memory-retrieval-report] +[tasks.real-world-memory-proactive-brief-report] workspace = false dependencies = [ - "real-world-memory-retrieval-json", + "real-world-memory-proactive-brief-json", ] command = "cargo" args = [ @@ -737,9 +592,9 @@ args = [ "--", "publish", "--report", - "tmp/real-world-memory/retrieval-report.json", + "tmp/real-world-memory/proactive-brief/report.json", "--out", - "tmp/real-world-memory/retrieval-report.md", + "tmp/real-world-memory/proactive-brief/report.md", ] [tasks.real-world-memory-production-ops] @@ -791,13 +646,13 @@ args = [ "tmp/real-world-memory/production-ops-report.md", ] -[tasks.real-world-memory-consolidation] +[tasks.real-world-memory-project-decisions] workspace = false dependencies = [ - "real-world-memory-consolidation-report", + "real-world-memory-project-decisions-report", ] -[tasks.real-world-memory-consolidation-json] +[tasks.real-world-memory-project-decisions-json] workspace = false command = "cargo" args = [ @@ -809,21 +664,21 @@ args = [ "--", "run", "--fixtures", - "apps/elf-eval/fixtures/real_world_memory/consolidation", + "apps/elf-eval/fixtures/real_world_memory/project_decisions", "--out", - "tmp/real-world-memory/consolidation/report.json", + "tmp/real-world-memory/project-decisions/report.json", "--run-id", - "real-world-memory-consolidation", + "real-world-memory-project-decisions", "--adapter-id", - "fixture_consolidation", + "fixture_project_decisions", "--adapter-name", - "ELF consolidation fixture", + "ELF project decision fixture", ] 
-[tasks.real-world-memory-consolidation-report] +[tasks.real-world-memory-project-decisions-report] workspace = false dependencies = [ - "real-world-memory-consolidation-json", + "real-world-memory-project-decisions-json", ] command = "cargo" args = [ @@ -835,44 +690,15 @@ args = [ "--", "publish", "--report", - "tmp/real-world-memory/consolidation/report.json", - "--out", - "tmp/real-world-memory/consolidation/report.md", -] - -[tasks.real-world-memory-summary] -workspace = false -dependencies = [ - "real-world-memory-summary-report", -] - -[tasks.real-world-memory-summary-json] -workspace = false -command = "cargo" -args = [ - "run", - "-p", - "elf-eval", - "--bin", - "real_world_job_benchmark", - "--", - "run", - "--fixtures", - "apps/elf-eval/fixtures/real_world_memory/memory_summary", + "tmp/real-world-memory/project-decisions/report.json", "--out", - "tmp/real-world-memory/memory-summary/report.json", - "--run-id", - "real-world-memory-summary", - "--adapter-id", - "fixture_memory_summary", - "--adapter-name", - "ELF memory summary fixture", + "tmp/real-world-memory/project-decisions/report.md", ] -[tasks.real-world-memory-summary-report] +[tasks.real-world-memory-report] workspace = false dependencies = [ - "real-world-memory-summary-json", + "real-world-memory-json", ] command = "cargo" args = [ @@ -884,18 +710,18 @@ args = [ "--", "publish", "--report", - "tmp/real-world-memory/memory-summary/report.json", + "tmp/real-world-memory/real-world-memory-report.json", "--out", - "tmp/real-world-memory/memory-summary/report.md", + "tmp/real-world-memory/real-world-memory-report.md", ] -[tasks.real-world-memory-proactive-brief] +[tasks.real-world-memory-retrieval] workspace = false dependencies = [ - "real-world-memory-proactive-brief-report", + "real-world-memory-retrieval-report", ] -[tasks.real-world-memory-proactive-brief-json] +[tasks.real-world-memory-retrieval-json] workspace = false command = "cargo" args = [ @@ -907,21 +733,21 @@ args = [ "--", "run", 
"--fixtures", - "apps/elf-eval/fixtures/real_world_memory/proactive_brief", - "--out", - "tmp/real-world-memory/proactive-brief/report.json", + "apps/elf-eval/fixtures/real_world_memory/retrieval", "--run-id", - "real-world-memory-proactive-brief", + "real-world-memory-retrieval", "--adapter-id", - "fixture_proactive_brief", + "fixture_retrieval", "--adapter-name", - "ELF proactive brief fixture", + "ELF fixture retrieval cases", + "--out", + "tmp/real-world-memory/retrieval-report.json", ] -[tasks.real-world-memory-proactive-brief-report] +[tasks.real-world-memory-retrieval-report] workspace = false dependencies = [ - "real-world-memory-proactive-brief-json", + "real-world-memory-retrieval-json", ] command = "cargo" args = [ @@ -933,9 +759,9 @@ args = [ "--", "publish", "--report", - "tmp/real-world-memory/proactive-brief/report.json", + "tmp/real-world-memory/retrieval-report.json", "--out", - "tmp/real-world-memory/proactive-brief/report.md", + "tmp/real-world-memory/retrieval-report.md", ] [tasks.real-world-memory-scheduled] @@ -987,21 +813,13 @@ args = [ "tmp/real-world-memory/scheduled/report.md", ] -[tasks.real-world-memory-live-consolidation] -workspace = false -command = "bash" -args = [ - "-lc", - "docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_CONSOLIDATION_LIVE_REPORT_DIR -e ELF_CONSOLIDATION_LIVE_FIXTURES baseline-runner bash scripts/real-world-consolidation-live-adapter.sh", -] - -[tasks.real-world-memory-core-archival] +[tasks.real-world-memory-summary] workspace = false dependencies = [ - "real-world-memory-core-archival-report", + "real-world-memory-summary-report", ] -[tasks.real-world-memory-core-archival-json] +[tasks.real-world-memory-summary-json] workspace = false command = "cargo" args = [ @@ -1013,21 +831,21 @@ args = [ "--", "run", "--fixtures", - "apps/elf-eval/fixtures/real_world_memory/core_archival_memory", + "apps/elf-eval/fixtures/real_world_memory/memory_summary", "--out", - 
"tmp/real-world-memory/core-archival/report.json", + "tmp/real-world-memory/memory-summary/report.json", "--run-id", - "real-world-memory-core-archival", + "real-world-memory-summary", "--adapter-id", - "fixture_core_archival_memory", + "fixture_memory_summary", "--adapter-name", - "ELF core and archival memory fixture", + "ELF memory summary fixture", ] -[tasks.real-world-memory-core-archival-report] +[tasks.real-world-memory-summary-report] workspace = false dependencies = [ - "real-world-memory-core-archival-json", + "real-world-memory-summary-json", ] command = "cargo" args = [ @@ -1039,233 +857,250 @@ args = [ "--", "publish", "--report", - "tmp/real-world-memory/core-archival/report.json", + "tmp/real-world-memory/memory-summary/report.json", "--out", - "tmp/real-world-memory/core-archival/report.md", + "tmp/real-world-memory/memory-summary/report.md", ] -[tasks.real-world-memory-graph-rag] +# Check +# | task | type | cwd | +# | ---------------- | --------- | --- | +# | check | composite | | +# | check-docs | command | | +# | check-rust | command | | +# | check-trace-gate | command | | + +[tasks.check] +clear = true workspace = false dependencies = [ - "real-world-memory-graph-rag-report", + "fmt-check", + "check-docs", + "check-rust", + "lint", + "test", ] -[tasks.real-world-memory-graph-rag-json] +[tasks.check-docs] workspace = false -command = "cargo" +command = "python3" args = [ - "run", - "-p", - "elf-eval", - "--bin", - "real_world_job_benchmark", - "--", - "run", - "--fixtures", - "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag", - "--out", - "tmp/real-world-memory/graph-rag/report.json", - "--run-id", - "real-world-memory-graph-rag", - "--adapter-id", - "fixture_graph_rag_external_adapters", - "--adapter-name", - "Graph/RAG representative external-adapter fixtures", + "scripts/check-docs.py", ] -[tasks.real-world-memory-graph-rag-report] +[tasks.check-rust] workspace = false -dependencies = [ - "real-world-memory-graph-rag-json", -] 
command = "cargo" args = [ - "run", - "-p", - "elf-eval", - "--bin", - "real_world_job_benchmark", - "--", - "publish", - "--report", - "tmp/real-world-memory/graph-rag/report.json", - "--out", - "tmp/real-world-memory/graph-rag/report.md", + "check", + "--workspace", + "--all-targets", + "--all-features", ] -[tasks.real-world-memory-live-adapters] +[tasks.check-trace-gate] workspace = false command = "bash" args = [ - "-lc", - "set -euo pipefail; lightrag_start=\"$(printenv ELF_LIGHTRAG_CONTEXT_START || true)\"; graphiti_start=\"$(printenv ELF_GRAPHITI_ZEP_SMOKE_START || true)\"; status=0; if [ \"$lightrag_start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile lightrag up -d lightrag; fi; if [ \"$graphiti_start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile graphiti-zep up -d graphiti-falkordb; fi; docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW -e ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHITI_ZEP -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHIFY -e ELF_RAGFLOW_SMOKE_START -e ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE -e ELF_RAGFLOW_SMOKE_ALLOW_ARM -e ELF_RAGFLOW_SMOKE_PULL_IMAGE -e ELF_RAGFLOW_SMOKE_CLEANUP -e ELF_RAGFLOW_SMOKE_DEVICE -e ELF_RAGFLOW_API_PORT -e ELF_RAGFLOW_API_BASE -e ELF_RAGFLOW_API_KEY -e RAGFLOW_API_KEY -e ELF_RAGFLOW_SMOKE_STARTUP_ATTEMPTS -e ELF_RAGFLOW_SMOKE_STARTUP_INTERVAL_SECONDS -e ELF_RAGFLOW_SMOKE_COMPOSE_TIMEOUT_SECONDS -e ELF_RAGFLOW_REPO_URL -e ELF_RAGFLOW_REF -e ELF_RAGFLOW_IMAGE -e ELF_RAGFLOW_COMPOSE_PROJECT -e ELF_LIGHTRAG_CONTEXT_START -e ELF_LIGHTRAG_API_BASE -e ELF_LIGHTRAG_ADAPTER_ID -e ELF_LIGHTRAG_ADAPTER_NAME -e ELF_LIGHTRAG_STARTUP_ATTEMPTS -e ELF_LIGHTRAG_STARTUP_INTERVAL_SECONDS -e ELF_LIGHTRAG_INDEX_ATTEMPTS -e ELF_LIGHTRAG_INDEX_INTERVAL_SECONDS -e ELF_GRAPHRAG_SMOKE_RUN -e ELF_GRAPHRAG_SMOKE_WORK_DIR -e ELF_GRAPHRAG_SMOKE_INSTALL -e ELF_GRAPHRAG_VERSION -e 
ELF_GRAPHRAG_PACKAGE -e ELF_GRAPHRAG_REF -e ELF_GRAPHRAG_CHAT_MODEL -e ELF_GRAPHRAG_EMBEDDING_MODEL -e ELF_GRAPHRAG_API_BASE -e ELF_GRAPHRAG_API_KEY -e ELF_GRAPHRAG_INDEX_METHOD -e ELF_GRAPHRAG_QUERY_METHOD -e ELF_GRAPHRAG_TIMEOUT_SECONDS -e ELF_GRAPHRAG_MAX_DOCS -e ELF_GRAPHRAG_MAX_INPUT_CHARS -e ELF_GRAPHITI_ZEP_SMOKE_START -e ELF_GRAPHITI_ZEP_SMOKE_RUN -e ELF_GRAPHITI_ZEP_SMOKE_WORK_DIR -e ELF_GRAPHITI_ZEP_SMOKE_INSTALL -e ELF_GRAPHITI_ZEP_VERSION -e ELF_GRAPHITI_ZEP_PACKAGE -e ELF_GRAPHITI_ZEP_REF -e ELF_GRAPHITI_ZEP_API_BASE -e ELF_GRAPHITI_ZEP_API_KEY -e ELF_GRAPHITI_ZEP_LLM_MODEL -e ELF_GRAPHITI_ZEP_EMBEDDING_MODEL -e ELF_GRAPHITI_ZEP_FALKORDB_HOST -e ELF_GRAPHITI_ZEP_FALKORDB_PORT -e ELF_GRAPHITI_ZEP_FALKORDB_DATABASE -e ELF_GRAPHITI_ZEP_TIMEOUT_SECONDS -e ELF_GRAPHITI_ZEP_STARTUP_ATTEMPTS -e ELF_GRAPHITI_ZEP_STARTUP_INTERVAL_SECONDS -e ELF_GRAPHIFY_SMOKE_RUN -e ELF_GRAPHIFY_SMOKE_WORK_DIR -e ELF_GRAPHIFY_SMOKE_INSTALL -e ELF_GRAPHIFY_PACKAGE -e ELF_GRAPHIFY_REF -e ELF_GRAPHIFY_TIMEOUT_SECONDS -e ELF_GRAPHIFY_QUERY_BUDGET baseline-runner bash scripts/real-world-live-adapters.sh || status=$?; if [ \"$lightrag_start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile lightrag stop lightrag lightrag-mock-provider >/dev/null 2>&1 || true; fi; if [ \"$graphiti_start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile graphiti-zep stop graphiti-falkordb >/dev/null 2>&1 || true; fi; exit \"$status\"", + "scripts/trace-gate.sh", ] +# Clean +# | task | type | cwd | +# | -------------------------- | ------- | --- | +# | clean-baseline-live-docker | command | | +# | clean-parity-docker | command | | -# Real-world memory knowledge benchmark -# | task | type | cwd | -# | ------------------------------ | --------- | --- | -# | real-world-memory-knowledge | composite | | -# | real-world-memory-knowledge-json | command | | -# | real-world-memory-knowledge-report | command | | -# | real-world-first-generation-oss | composite | | 
-# | real-world-first-generation-oss-json | command | | -# | real-world-first-generation-oss-report | command | | -# | ragflow-docker-smoke | command | | -# | lightrag-docker-context-smoke | command | | -# | graphrag-docker-smoke | command | | -# | graphiti-zep-docker-temporal-smoke | command | | -# | graphify-docker-graph-report-smoke | command | | - -[tasks.ragflow-docker-smoke] +[tasks.clean-baseline-live-docker] workspace = false -command = "bash" +command = "docker" args = [ - "scripts/ragflow-docker-evidence-smoke.sh", + "compose", + "-f", + "docker-compose.baseline.yml", + "down", + "-v", + "--remove-orphans", ] -[tasks.lightrag-docker-context-smoke] +[tasks.clean-parity-docker] workspace = false -command = "bash" +command = "docker" args = [ - "-lc", - "set -euo pipefail; start=\"$(printenv ELF_LIGHTRAG_CONTEXT_START || true)\"; status=0; if [ \"$start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile lightrag up -d lightrag; fi; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner bash scripts/lightrag-docker-context-smoke.sh || status=$?; if [ \"$start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile lightrag stop lightrag lightrag-mock-provider >/dev/null 2>&1 || true; fi; exit \"$status\"", + "compose", + "-f", + "docker-compose.parity.yml", + "down", + "-v", + "--remove-orphans", ] -[tasks.graphrag-docker-smoke] +# Format +# | task | type | cwd | +# | -------------- | --------- | --- | +# | fmt | composite | | +# | fmt-check | composite | | +# | fmt-rust | command | | +# | fmt-rust-check | extend | | +# | fmt-toml | command | | +# | fmt-toml-check | extend | | + +[tasks.fmt] workspace = false -command = "bash" -args = [ - "-lc", - "set -euo pipefail; docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_GRAPHRAG_SMOKE_RUN -e ELF_GRAPHRAG_SMOKE_REPORT_DIR -e ELF_GRAPHRAG_SMOKE_WORK_DIR -e ELF_GRAPHRAG_SMOKE_INSTALL -e ELF_GRAPHRAG_VERSION -e ELF_GRAPHRAG_PACKAGE 
-e ELF_GRAPHRAG_REF -e ELF_GRAPHRAG_CHAT_MODEL -e ELF_GRAPHRAG_EMBEDDING_MODEL -e ELF_GRAPHRAG_API_BASE -e ELF_GRAPHRAG_API_KEY -e ELF_GRAPHRAG_INDEX_METHOD -e ELF_GRAPHRAG_QUERY_METHOD -e ELF_GRAPHRAG_TIMEOUT_SECONDS -e ELF_GRAPHRAG_MAX_DOCS -e ELF_GRAPHRAG_MAX_INPUT_CHARS baseline-runner python3 scripts/graphrag-docker-smoke.py", +dependencies = [ + "fmt-rust", + "fmt-toml", +] + +[tasks.fmt-check] +workspace = false +dependencies = [ + "fmt-rust-check", + "fmt-toml-check", ] -[tasks.graphiti-zep-docker-temporal-smoke] +[tasks.fmt-rust] +workspace = false +script = "cargo +nightly fmt --all" + +[tasks.fmt-rust-check] +extend = "fmt-rust" +script = "cargo +nightly fmt --all -- --check" + +[tasks.fmt-toml] workspace = false -command = "bash" +command = "taplo" args = [ - "-lc", - "set -euo pipefail; start=\"$(printenv ELF_GRAPHITI_ZEP_SMOKE_START || true)\"; status=0; if [ \"$start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile graphiti-zep up -d graphiti-falkordb; fi; docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_GRAPHITI_ZEP_SMOKE_RUN -e ELF_GRAPHITI_ZEP_SMOKE_REPORT_DIR -e ELF_GRAPHITI_ZEP_SMOKE_WORK_DIR -e ELF_GRAPHITI_ZEP_SMOKE_INSTALL -e ELF_GRAPHITI_ZEP_VERSION -e ELF_GRAPHITI_ZEP_PACKAGE -e ELF_GRAPHITI_ZEP_REF -e ELF_GRAPHITI_ZEP_API_BASE -e ELF_GRAPHITI_ZEP_API_KEY -e ELF_GRAPHITI_ZEP_LLM_MODEL -e ELF_GRAPHITI_ZEP_EMBEDDING_MODEL -e ELF_GRAPHITI_ZEP_FALKORDB_HOST -e ELF_GRAPHITI_ZEP_FALKORDB_PORT -e ELF_GRAPHITI_ZEP_FALKORDB_DATABASE -e ELF_GRAPHITI_ZEP_TIMEOUT_SECONDS -e ELF_GRAPHITI_ZEP_STARTUP_ATTEMPTS -e ELF_GRAPHITI_ZEP_STARTUP_INTERVAL_SECONDS baseline-runner python3 scripts/graphiti-zep-docker-temporal-smoke.py || status=$?; if [ \"$start\" = \"1\" ]; then docker compose -f docker-compose.baseline.yml --profile graphiti-zep stop graphiti-falkordb >/dev/null 2>&1 || true; fi; exit \"$status\"", + "fmt", ] -[tasks.graphify-docker-graph-report-smoke] -workspace = false -command = "bash" 
+[tasks.fmt-toml-check] +extend = "fmt-toml" args = [ - "-lc", - "set -euo pipefail; docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_GRAPHIFY_SMOKE_RUN -e ELF_GRAPHIFY_SMOKE_REPORT_DIR -e ELF_GRAPHIFY_SMOKE_WORK_DIR -e ELF_GRAPHIFY_SMOKE_INSTALL -e ELF_GRAPHIFY_PACKAGE -e ELF_GRAPHIFY_REF -e ELF_GRAPHIFY_TIMEOUT_SECONDS -e ELF_GRAPHIFY_QUERY_BUDGET baseline-runner python3 scripts/graphify-docker-graph-report-smoke.py", + "fmt", + "--check", ] -[tasks.real-world-memory-knowledge] +# Lint +# | task | type | cwd | +# | ----------- | --------- | --- | +# | lint | composite | | +# | lint-rust | command | | +# | lint-vstyle | command | | + +[tasks.lint] workspace = false dependencies = [ - "real-world-memory-knowledge-report", + "lint-rust", + "lint-vstyle", ] -[tasks.real-world-memory-knowledge-json] +[tasks.lint-rust] workspace = false command = "cargo" args = [ - "run", - "-p", - "elf-eval", - "--bin", - "real_world_job_benchmark", + "clippy", + "--all-features", + "--all-targets", + "--workspace", "--", - "run", - "--fixtures", - "apps/elf-eval/fixtures/real_world_memory/knowledge", - "--out", - "tmp/real-world-memory/knowledge-report.json", - "--run-id", - "real-world-memory-knowledge", - "--adapter-id", - "fixture_knowledge", - "--adapter-name", - "ELF knowledge fixture", + "-D", + "clippy::all", + "-D", + "clippy::too_many_lines", + "-D", + "clippy::unwrap_used", + "-D", + "clippy::use_self", + "-D", + "clippy::wildcard_imports", + "-D", + "missing-docs", + "-D", + "unused-crate-dependencies", + "-D", + "warnings", ] -[tasks.real-world-memory-knowledge-report] +[tasks.lint-vstyle] workspace = false -dependencies = [ - "real-world-memory-knowledge-json", -] command = "cargo" args = [ - "run", - "-p", - "elf-eval", - "--bin", - "real_world_job_benchmark", - "--", - "publish", - "--report", - "tmp/real-world-memory/knowledge-report.json", - "--out", - "tmp/real-world-memory/knowledge-report.md", + "vstyle", + "curate", + "--language", + "rust", 
+ "--workspace", + "--all-features", ] -[tasks.real-world-first-generation-oss] +# Lint Fix +# | task | type | cwd | +# | --------------- | --------- | --- | +# | lint-fix | composite | | +# | lint-fix-rust | command | | +# | lint-fix-vstyle | command | | + +[tasks.lint-fix] workspace = false dependencies = [ - "real-world-first-generation-oss-report", + "lint-fix-rust", + "lint-fix-vstyle", ] -[tasks.real-world-first-generation-oss-json] +[tasks.lint-fix-rust] workspace = false command = "cargo" args = [ - "run", - "-p", - "elf-eval", - "--bin", - "real_world_job_benchmark", + "clippy", + "--fix", + "--allow-dirty", + "--all-features", + "--all-targets", + "--workspace", "--", - "run", - "--fixtures", - "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss", - "--out", - "tmp/real-world-memory/first-generation-oss/report.json", - "--run-id", - "first-generation-oss-continuity-source-store", - "--adapter-id", - "fixture_first_generation_oss", - "--adapter-name", - "First-generation OSS fixture coverage", + "-D", + "clippy::all", + "-D", + "clippy::too_many_lines", + "-D", + "clippy::unwrap_used", + "-D", + "clippy::use_self", + "-D", + "clippy::wildcard_imports", + "-D", + "missing-docs", + "-D", + "unused-crate-dependencies", + "-D", + "warnings", ] -[tasks.real-world-first-generation-oss-report] +[tasks.lint-fix-vstyle] workspace = false -dependencies = [ - "real-world-first-generation-oss-json", -] command = "cargo" args = [ - "run", - "-p", - "elf-eval", - "--bin", - "real_world_job_benchmark", - "--", - "publish", - "--report", - "tmp/real-world-memory/first-generation-oss/report.json", - "--out", - "tmp/real-world-memory/first-generation-oss/report.md", + "vstyle", + "tune", + "--language", + "rust", + "--workspace", + "--all-features", + "--strict", ] - -# External memory pattern radar -# | task | type | cwd | -# | ---------------------------------- | --------- | --- | -# | external-memory-radar | command | | -# | 
external-memory-radar-artifact | composite | | -# | external-memory-radar-artifact-json | command | | -# | external-memory-radar-artifact-validate | command | | -# | external-memory-radar-dry-run | composite | | -# | external-memory-radar-dry-run-json | command | | -# | external-memory-radar-dry-run-validate | command | | -# | external-memory-radar-validate | command | | +# Research +# | task | type | cwd | +# | --------------------------------------- | --------- | --- | +# | external-memory-radar | command | | +# | external-memory-radar-artifact | composite | | +# | external-memory-radar-artifact-json | command | | +# | external-memory-radar-artifact-validate | command | | +# | external-memory-radar-dry-run | composite | | +# | external-memory-radar-dry-run-json | command | | +# | external-memory-radar-dry-run-validate | command | | +# | external-memory-radar-validate | command | | [tasks.external-memory-radar] workspace = false @@ -1383,30 +1218,156 @@ args = [ "docs/research/external_memory_pattern_radar/cursor.json", ] +# Smoke +# | task | type | cwd | +# | ---------------------------------- | --------- | --- | +# | smoke-graphify-docker-graph-report | command | | +# | smoke-graphiti-zep-docker-temporal | command | | +# | smoke-graphrag-docker | command | | +# | smoke-lightrag-docker-context | command | | +# | smoke-ragflow-docker | command | | +# | smoke-real-world-job | composite | | +# | smoke-real-world-job-json | command | | +# | smoke-real-world-job-report | command | | + +[tasks.smoke-graphify-docker-graph-report] +workspace = false +command = "bash" +args = [ + "scripts/smoke-docker.sh", + "graphify-docker-graph-report", +] + +[tasks.smoke-graphiti-zep-docker-temporal] +workspace = false +command = "bash" +args = [ + "scripts/smoke-docker.sh", + "graphiti-zep-docker-temporal", +] + +[tasks.smoke-graphrag-docker] +workspace = false +command = "bash" +args = [ + "scripts/smoke-docker.sh", + "graphrag-docker", +] + +[tasks.smoke-lightrag-docker-context] 
+workspace = false +command = "bash" +args = [ + "scripts/smoke-docker.sh", + "lightrag-docker-context", +] -# Meta -# | task | type | cwd | -# | ------ | --------- | --- | -# | checks | composite | | +[tasks.smoke-ragflow-docker] +workspace = false +command = "bash" +args = [ + "scripts/ragflow-docker-evidence-smoke.sh", +] -[tasks.checks] +[tasks.smoke-real-world-job] workspace = false dependencies = [ - "lint", - "test", - "fmt-check", + "smoke-real-world-job-report", +] + +[tasks.smoke-real-world-job-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/work_resume", + "--out", + "tmp/real-world-job/real-world-job-smoke-report.json", +] + +[tasks.smoke-real-world-job-report] +workspace = false +dependencies = [ + "smoke-real-world-job-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-job/real-world-job-smoke-report.json", + "--out", + "tmp/real-world-job/real-world-job-smoke-report.md", ] +# Test +# | task | type | cwd | +# | --------------------- | --------- | --- | +# | test | composite | | +# | test-e2e | command | | +# | test-rust | command | | +# | test-rust-all | command | | +# | test-rust-integration | command | | -# Quality utilities -# | task | type | cwd | -# | --------- | ------- | --- | -# | trace-gate | command | | +[tasks.test] +clear = true +workspace = false +dependencies = [ + "test-rust", +] -[tasks.trace-gate] +[tasks.test-e2e] workspace = false command = "bash" args = [ - "-lc", - "set -euo pipefail; DSN=\"${TRACE_GATE_PG_DSN:-postgres://postgres:postgres@127.0.0.1:5432/elf}\"; psql \"${DSN}\" -v ON_ERROR_STOP=1 -f sql/init.sql; psql \"${DSN}\" -v ON_ERROR_STOP=1 -f .github/fixtures/trace_gate/fixture.sql; cargo run -p elf-eval --bin trace_regression_gate -- --config 
.github/fixtures/trace_gate/config.toml --gate .github/fixtures/trace_gate/gate.json --out tmp/trace_gate.report.json", + "scripts/context-misranking-harness.sh", +] + +[tasks.test-rust] +workspace = false +command = "cargo" +args = [ + "nextest", + "run", + "--workspace", + "--all-targets", + "--all-features", +] + +[tasks.test-rust-all] +workspace = false +command = "cargo" +args = [ + "nextest", + "run", + "--workspace", + "--all-targets", + "--all-features", + "--run-ignored", + "all", +] + +[tasks.test-rust-integration] +workspace = false +command = "cargo" +args = [ + "nextest", + "run", + "--workspace", + "--all-targets", + "--all-features", + "--run-ignored", + "only", ] diff --git a/README.md b/README.md index 13de0803..5649d0d6 100644 --- a/README.md +++ b/README.md @@ -254,7 +254,7 @@ provider-backed ELF evidence was required. `cargo make baseline-soak-docker`, `cargo make baseline-live-report`, `cargo make real-world-memory-live-adapters`, `cargo make real-world-first-generation-oss`, and - `cargo make baseline-live-docker-clean`. Expensive 100k and long-soak profiles + `cargo make clean-baseline-live-docker`. Expensive 100k and long-soak profiles are opt-in and do not run in normal checks. Detailed evidence and interpretation: @@ -390,8 +390,8 @@ self-check evidence, and fixture-backed scheduled-memory task scoring. ```sh cargo make fmt -cargo make lint -cargo make test +cargo make check +cargo make test-rust ``` For integration and E2E workflows, use `docs/guide/getting_started.md` and `docs/guide/integration-testing.md`. 
diff --git a/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json b/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json index d627b627..62873c40 100644 --- a/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json +++ b/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json @@ -13,13 +13,13 @@ "evidence_id": "pr-110-review", "category": "pr", "title": "PR 110 Review Status", - "text": "PR #110 is review-ready for the ELF viewer lane. It passed `cargo make checks` and waits for the non-draft review handoff." + "text": "PR #110 is review-ready for the ELF viewer lane. It passed `cargo make check` and waits for the non-draft review handoff." }, { "evidence_id": "worktree-xy791-repair", "category": "worktree", "title": "XY-791 Strict Config Repair", - "text": "Worktree XY-791 recovered strict-config repair after rebase. The exact gate was `cargo make fmt && cargo make lint-fix && cargo make checks`." + "text": "Worktree XY-791 recovered strict-config repair after rebase. The exact gate was `cargo make fmt && cargo make lint-fix && cargo make check`." 
}, { "evidence_id": "runbook-live-baseline", @@ -67,7 +67,7 @@ "query": "Recover the exact repair gate command for XY-791 strict config.", "expected_evidence_ids": ["worktree-xy791-repair"], "allowed_alternate_evidence_ids": ["runbook-live-baseline"], - "expected_terms": ["XY-791", "cargo make fmt && cargo make lint-fix && cargo make checks"] + "expected_terms": ["XY-791", "cargo make fmt && cargo make lint-fix && cargo make check"] }, { "query_id": "q-explain-stale-blocker", diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index afd789bc..0ba49733 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -1759,13 +1759,13 @@ "setup": { "status": "blocked", "evidence": "XY-900 promotes the Docker-safe tiny-corpus evidence smoke into a generated real_world_job report while the checked-in row remains smoke-only research_gate evidence.", - "command": "cargo make ragflow-docker-smoke", + "command": "cargo make smoke-ragflow-docker", "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" }, "run": { "status": "blocked", "evidence": "The live path requires explicit resource-envelope opt-in and a local self-hosted RAGFlow API key; setup failures stay typed in the generated smoke artifact.", - "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make ragflow-docker-smoke", + "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", "artifact": "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json" }, "result": { @@ -1877,7 +1877,7 @@ "runtime_boundary": "Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service 
boundary without host-global RAGFlow installs.", "resource_expectation": "Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring.", "retry_guidance": [ - "Run cargo make ragflow-docker-smoke first to produce a typed preflight artifact.", + "Run cargo make smoke-ragflow-docker first to produce a typed preflight artifact.", "Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.", "Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids." ], @@ -1903,13 +1903,13 @@ "setup": { "status": "blocked", "evidence": "XY-886 adds a Docker-profile context-export smoke command, and XY-900 keeps its generated retrieval fixtures scored through real_world_job_benchmark. The checked-in row remains smoke-only research_gate evidence.", - "command": "cargo make lightrag-docker-context-smoke", + "command": "cargo make smoke-lightrag-docker-context", "artifact": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json" }, "run": { "status": "blocked", "evidence": "The default smoke records a typed setup/runtime failure if the LightRAG API is unavailable; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in Docker service profile.", - "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make lightrag-docker-context-smoke", + "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context", "artifact": "tmp/real-world-memory/lightrag-context/summary.json" }, "result": { @@ -1990,7 +1990,7 @@ }, { "kind": "command", - "ref": "cargo make lightrag-docker-context-smoke", + "ref": "cargo make smoke-lightrag-docker-context", "status": "blocked" }, { @@ -2027,11 +2027,11 @@ "evidence": "Official source-id and file-path citation reference." 
} ], - "setup_path": "Run cargo make lightrag-docker-context-smoke for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export.", + "setup_path": "Run cargo make smoke-lightrag-docker-context for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes.", "resource_expectation": "The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts.", "retry_guidance": [ - "Run cargo make lightrag-docker-context-smoke first; a missing API must remain a typed incomplete artifact, not a pass claim.", + "Run cargo make smoke-lightrag-docker-context first; a missing API must remain a typed incomplete artifact, not a pass claim.", "Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.", "Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids." 
], @@ -2057,13 +2057,13 @@ "setup": { "status": "blocked", "evidence": "XY-900 promotes the Docker-safe generated-corpus GraphRAG smoke into a scored knowledge_compilation report while the checked-in row remains smoke-only research_gate evidence.", - "command": "cargo make graphrag-docker-smoke", + "command": "cargo make smoke-graphrag-docker", "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json" }, "run": { "status": "blocked", "evidence": "The default smoke records a typed blocked artifact without model calls; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration to attempt live GraphRAG index/query.", - "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make graphrag-docker-smoke", + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker", "artifact": "tmp/real-world-memory/graphrag-smoke/summary.json" }, "result": { @@ -2149,7 +2149,7 @@ }, { "kind": "command", - "ref": "cargo make graphrag-docker-smoke", + "ref": "cargo make smoke-graphrag-docker", "status": "blocked" }, { @@ -2191,11 +2191,11 @@ "evidence": "Official local-search context and graph traversal reference." 
} ], - "setup_path": "Run cargo make graphrag-docker-smoke for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt.", + "setup_path": "Run cargo make smoke-graphrag-docker for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke.", "resource_expectation": "The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries.", "retry_guidance": [ - "Run cargo make graphrag-docker-smoke first; missing provider configuration must remain a typed blocked artifact, not a pass claim.", + "Run cargo make smoke-graphrag-docker first; missing provider configuration must remain a typed blocked artifact, not a pass claim.", "Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.", "Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs." ], @@ -2221,13 +2221,13 @@ "setup": { "status": "blocked", "evidence": "XY-900 promotes the Docker-contained Graphiti/Zep temporal smoke into a scored memory_evolution report while the checked-in row remains smoke-only research_gate evidence.", - "command": "cargo make graphiti-zep-docker-temporal-smoke", + "command": "cargo make smoke-graphiti-zep-docker-temporal", "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json" }, "run": { "status": "blocked", "evidence": "The default smoke records a typed setup/runtime failure if live execution is not explicitly enabled. 
Set ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration to start Docker-local FalkorDB and run Graphiti.", - "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", "artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json" }, "result": { @@ -2308,7 +2308,7 @@ }, { "kind": "command", - "ref": "cargo make graphiti-zep-docker-temporal-smoke", + "ref": "cargo make smoke-graphiti-zep-docker-temporal", "status": "blocked" }, { @@ -2350,11 +2350,11 @@ "evidence": "Official manual fact-triple ingest contract." } ], - "setup_path": "Run cargo make graphiti-zep-docker-temporal-smoke for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", + "setup_path": "Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.", "resource_expectation": "Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring.", "retry_guidance": [ - "Run cargo make graphiti-zep-docker-temporal-smoke first to produce a typed blocked artifact.", + "Run cargo make smoke-graphiti-zep-docker-temporal first to produce a typed blocked artifact.", "Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.", "Treat missing validity 
windows or unmapped current/historical facts as wrong_result, not pass." ], @@ -2859,13 +2859,13 @@ "setup": { "status": "pass", "evidence": "XY-900 validation reached the Docker-only graph/report smoke setup inside the baseline runner without host-global assistant hooks.", - "command": "cargo make graphify-docker-graph-report-smoke", + "command": "cargo make smoke-graphify-docker-graph-report", "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" }, "run": { "status": "pass", "evidence": "The smoke installed graphify in a container-local venv, ran over a generated public corpus, and produced graph/report/query output for scoring.", - "command": "cargo make graphify-docker-graph-report-smoke", + "command": "cargo make smoke-graphify-docker-graph-report", "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" }, "result": { @@ -2946,7 +2946,7 @@ }, { "kind": "command", - "ref": "cargo make graphify-docker-graph-report-smoke", + "ref": "cargo make smoke-graphify-docker-graph-report", "status": "wrong_result" }, { @@ -2973,11 +2973,11 @@ "evidence": "Official CLI, output artifact, query, and source-location contract." 
} ], - "setup_path": "Run cargo make graphify-docker-graph-report-smoke to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks.", + "setup_path": "Run cargo make smoke-graphify-docker-graph-report to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke.", "resource_expectation": "Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior.", "retry_guidance": [ - "Run cargo make graphify-docker-graph-report-smoke first; setup/runtime failures must remain typed artifacts, not pass claims.", + "Run cargo make smoke-graphify-docker-graph-report first; setup/runtime failures must remain typed artifacts, not pass claims.", "Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.", "Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids." 
], diff --git a/apps/elf-eval/fixtures/real_world_live_adapters/work_resume_exact_next_action.json b/apps/elf-eval/fixtures/real_world_live_adapters/work_resume_exact_next_action.json index 66128882..d3dd6d44 100644 --- a/apps/elf-eval/fixtures/real_world_live_adapters/work_resume_exact_next_action.json +++ b/apps/elf-eval/fixtures/real_world_live_adapters/work_resume_exact_next_action.json @@ -10,7 +10,7 @@ { "evidence_id": "xy868-current-next-action", "kind": "runbook", - "text": "Exact next action for XY-868: run `cargo make real-world-memory-live-adapters`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing branch y/elf-xy-868.", + "text": "Exact next action for XY-868: run `cargo make real-world-memory-live-adapters`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing branch y/elf-xy-868.", "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_live_adapter_fixture/v1", @@ -65,7 +65,7 @@ "must_include": [ { "claim_id": "next_action", - "text": "Exact next action for XY-868: run `cargo make real-world-memory-live-adapters`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing branch y/elf-xy-868." + "text": "Exact next action for XY-868: run `cargo make real-world-memory-live-adapters`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing branch y/elf-xy-868." 
} ], "must_not_include": [ diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json index 084c26cb..0dde7817 100644 --- a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json @@ -24,7 +24,7 @@ { "evidence_id": "archival-current-validation-gate", "kind": "decision", - "text": "Archival decision update: before pushing a refreshed PR head, run cargo make fmt, cargo make lint-fix, and cargo make checks.", + "text": "Archival decision update: before pushing a refreshed PR head, run cargo make fmt, cargo make lint-fix, and cargo make check.", "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", @@ -33,7 +33,7 @@ "evidence_id": "archival-current-validation-gate" }, "locator": { - "quote": "cargo make fmt, cargo make lint-fix, and cargo make checks" + "quote": "cargo make fmt, cargo make lint-fix, and cargo make check" } }, "created_at": "2026-06-11T04:30:00Z" @@ -73,7 +73,7 @@ "adapter_response": { "adapter_id": "fixture_core_archival_memory", "answer": { - "content": "Treat the attached validation-gate core block as stale. The current archival decision says to run cargo make fmt, cargo make lint-fix, and cargo make checks before pushing a refreshed PR head, and the archival rationale says that evidence supersedes the core block until it is updated from source-of-truth state.", + "content": "Treat the attached validation-gate core block as stale. 
The current archival decision says to run cargo make fmt, cargo make lint-fix, and cargo make check before pushing a refreshed PR head, and the archival rationale says that evidence supersedes the core block until it is updated from source-of-truth state.", "claims": [ { "claim_id": "stale_core_detected", @@ -83,7 +83,7 @@ }, { "claim_id": "archival_current_gate", - "text": "The current archival validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks.", + "text": "The current archival validation gate is cargo make fmt, cargo make lint-fix, and cargo make check.", "evidence_ids": ["archival-current-validation-gate"], "confidence": "high" } @@ -131,7 +131,7 @@ }, { "claim_id": "archival_current_gate", - "text": "The current archival validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks." + "text": "The current archival validation gate is cargo make fmt, cargo make lint-fix, and cargo make check." } ], "must_not_include": [ @@ -151,7 +151,7 @@ "evidence_id": "archival-current-validation-gate", "claim_id": "archival_current_gate", "requirement": "cite", - "quote": "cargo make fmt, cargo make lint-fix, and cargo make checks" + "quote": "cargo make fmt, cargo make lint-fix, and cargo make check" }, { "evidence_id": "archival-supersedes-core-rationale", diff --git a/apps/elf-eval/fixtures/real_world_memory/memory_evolution/delete_ttl_staleness.json b/apps/elf-eval/fixtures/real_world_memory/memory_evolution/delete_ttl_staleness.json index d6dc98c7..e903e548 100644 --- a/apps/elf-eval/fixtures/real_world_memory/memory_evolution/delete_ttl_staleness.json +++ b/apps/elf-eval/fixtures/real_world_memory/memory_evolution/delete_ttl_staleness.json @@ -41,7 +41,7 @@ { "evidence_id": "current-benchmark-plan", "kind": "decision", - "text": "Current plan: add real_world_memory trust and personalization cases, then run cargo make real-world-job-smoke and cargo make real-world-memory.", + "text": "Current plan: add real_world_memory trust 
and personalization cases, then run cargo make smoke-real-world-job and cargo make real-world-memory.", "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", @@ -59,7 +59,7 @@ "adapter_response": { "adapter_id": "elf_real_world_memory_fixture", "answer": { - "content": "Do not use the expired Redis branch plan. The tombstone says deleted-temp-plan expired and search must suppress it after the worker DELETE outbox completes. The current plan is to add real_world_memory trust and personalization cases, then run cargo make real-world-job-smoke and cargo make real-world-memory.", + "content": "Do not use the expired Redis branch plan. The tombstone says deleted-temp-plan expired and search must suppress it after the worker DELETE outbox completes. The current plan is to add real_world_memory trust and personalization cases, then run cargo make smoke-real-world-job and cargo make real-world-memory.", "claims": [ { "claim_id": "deleted_fact_suppressed", @@ -69,7 +69,7 @@ }, { "claim_id": "current_plan", - "text": "Add real_world_memory trust and personalization cases, then run cargo make real-world-job-smoke and cargo make real-world-memory.", + "text": "Add real_world_memory trust and personalization cases, then run cargo make smoke-real-world-job and cargo make real-world-memory.", "evidence_ids": ["current-benchmark-plan"], "confidence": "high" } @@ -125,7 +125,7 @@ }, { "claim_id": "current_plan", - "text": "Add real_world_memory trust and personalization cases, then run cargo make real-world-job-smoke and cargo make real-world-memory." + "text": "Add real_world_memory trust and personalization cases, then run cargo make smoke-real-world-job and cargo make real-world-memory." 
} ], "must_not_include": [ diff --git a/apps/elf-eval/fixtures/real_world_memory/project_decisions/current_validation_gate.json b/apps/elf-eval/fixtures/real_world_memory/project_decisions/current_validation_gate.json index f3e459b1..224f44f8 100644 --- a/apps/elf-eval/fixtures/real_world_memory/project_decisions/current_validation_gate.json +++ b/apps/elf-eval/fixtures/real_world_memory/project_decisions/current_validation_gate.json @@ -24,7 +24,7 @@ { "evidence_id": "validation-gate-current-decodex", "kind": "decision", - "text": "Current validation gate: before pushing a refreshed PR head, run cargo make fmt, cargo make lint-fix, and cargo make checks.", + "text": "Current validation gate: before pushing a refreshed PR head, run cargo make fmt, cargo make lint-fix, and cargo make check.", "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_memory_fixture/v1", @@ -67,11 +67,11 @@ "adapter_response": { "adapter_id": "fixture_project_decisions", "answer": { - "content": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks before pushing a refreshed PR head. The older lint-and-test gate is historical; the current gate adds formatting, automatic lint repair, and full checks to prevent avoidable review churn before Decodex review handoff.", + "content": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make check before pushing a refreshed PR head. 
The older lint-and-test gate is historical; the current gate adds formatting, automatic lint repair, and full checks to prevent avoidable review churn before Decodex review handoff.", "claims": [ { "claim_id": "current_validation_gate", - "text": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks before pushing a refreshed PR head.", + "text": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make check before pushing a refreshed PR head.", "evidence_ids": [ "validation-gate-current-decodex", "validation-gate-old-lint-test", @@ -136,7 +136,7 @@ "must_include": [ { "claim_id": "current_validation_gate", - "text": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks before pushing a refreshed PR head." + "text": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make check before pushing a refreshed PR head." }, { "claim_id": "validation_gate_rationale", @@ -164,7 +164,7 @@ "evidence_id": "validation-gate-current-decodex", "claim_id": "current_validation_gate", "requirement": "cite", - "quote": "run cargo make fmt, cargo make lint-fix, and cargo make checks" + "quote": "run cargo make fmt, cargo make lint-fix, and cargo make check" }, { "evidence_id": "validation-gate-old-lint-test", diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/alternate_phrasing.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/alternate_phrasing.json index c939fb62..a8b4c6c5 100644 --- a/apps/elf-eval/fixtures/real_world_memory/retrieval/alternate_phrasing.json +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/alternate_phrasing.json @@ -10,7 +10,7 @@ { "evidence_id": "xy840-current-handoff", "kind": "issue", - "text": "XY-840 trace schema lane uses branch y/elf-xy-840. Before review handoff, run `cargo make checks` after the trace schema update is complete.", + "text": "XY-840 trace schema lane uses branch y/elf-xy-840. 
Before review handoff, run `cargo make check` after the trace schema update is complete.", "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", @@ -39,7 +39,7 @@ "adapter_response": { "adapter_id": "fixture_retrieval", "answer": { - "content": "Use branch y/elf-xy-840 for XY-840 and run `cargo make checks` before review handoff.", + "content": "Use branch y/elf-xy-840 for XY-840 and run `cargo make check` before review handoff.", "claims": [ { "claim_id": "branch", @@ -49,7 +49,7 @@ }, { "claim_id": "gate", - "text": "Run `cargo make checks` before review handoff.", + "text": "Run `cargo make check` before review handoff.", "evidence_ids": ["xy840-current-handoff"], "confidence": "high" } @@ -97,7 +97,7 @@ }, { "claim_id": "gate", - "text": "Run `cargo make checks` before review handoff." + "text": "Run `cargo make check` before review handoff." } ], "must_not_include": [ @@ -124,7 +124,7 @@ "evidence_id": "xy840-current-handoff", "claim_id": "gate", "requirement": "use", - "quote": "run `cargo make checks`" + "quote": "run `cargo make check`" } ], "negative_traps": [ diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_failed_command_recovery.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_failed_command_recovery.json index b6c3da36..5ddbda61 100644 --- a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_failed_command_recovery.json +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_failed_command_recovery.json @@ -10,7 +10,7 @@ { "evidence_id": "xy844-current-command-failure", "kind": "trace", - "text": "Command log: cargo make real-world-job-smoke failed because apps/elf-eval/fixtures/real_world_memory/work_resume/ did not contain the required suite fixtures. Earlier lint warnings had already been repaired. 
The exact recovery action is to create the missing fixtures, then rerun cargo make real-world-job-smoke.", + "text": "Command log: cargo make smoke-real-world-job failed because apps/elf-eval/fixtures/real_world_memory/work_resume/ did not contain the required suite fixtures. Earlier lint warnings had already been repaired. The exact recovery action is to create the missing fixtures, then rerun cargo make smoke-real-world-job.", "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", @@ -39,7 +39,7 @@ "adapter_response": { "adapter_id": "fixture_work_resume", "answer": { - "content": "The current stop is the missing work_resume fixture suite, not the old lint warning. Create the missing fixtures under apps/elf-eval/fixtures/real_world_memory/work_resume/, then rerun `cargo make real-world-job-smoke`.", + "content": "The current stop is the missing work_resume fixture suite, not the old lint warning. Create the missing fixtures under apps/elf-eval/fixtures/real_world_memory/work_resume/, then rerun `cargo make smoke-real-world-job`.", "claims": [ { "claim_id": "current_stop", @@ -49,7 +49,7 @@ }, { "claim_id": "recovery_action", - "text": "Create the missing fixtures, then rerun `cargo make real-world-job-smoke`.", + "text": "Create the missing fixtures, then rerun `cargo make smoke-real-world-job`.", "evidence_ids": ["xy844-current-command-failure"], "confidence": "high" }, @@ -107,7 +107,7 @@ }, { "claim_id": "recovery_action", - "text": "Create the missing fixtures, then rerun `cargo make real-world-job-smoke`." + "text": "Create the missing fixtures, then rerun `cargo make smoke-real-world-job`." }, { "claim_id": "stale_blocker", @@ -139,7 +139,7 @@ "evidence_id": "xy844-current-command-failure", "claim_id": "recovery_action", "requirement": "use", - "quote": "The exact recovery action is to create the missing fixtures, then rerun cargo make real-world-job-smoke." 
+ "quote": "The exact recovery action is to create the missing fixtures, then rerun cargo make smoke-real-world-job." } ], "negative_traps": [ diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_next_action_extraction.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_next_action_extraction.json index 4cbe80c3..1f6ae49e 100644 --- a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_next_action_extraction.json +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_next_action_extraction.json @@ -10,7 +10,7 @@ { "evidence_id": "xy844-next-action-handoff", "kind": "runbook", - "text": "XY-844 handoff note: after adding the six work-resume continuity and capture/integration fixtures, run `cargo make real-world-job-smoke`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing branch y/elf-xy-844.", + "text": "XY-844 handoff note: after adding the six work-resume continuity and capture/integration fixtures, run `cargo make smoke-real-world-job`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing branch y/elf-xy-844.", "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", @@ -39,7 +39,7 @@ "adapter_response": { "adapter_id": "fixture_work_resume", "answer": { - "content": "The exact next action is to add the six work-resume continuity and capture/integration fixtures, run `cargo make real-world-job-smoke`, then run the pre-push gate: `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing y/elf-xy-844.", + "content": "The exact next action is to add the six work-resume continuity and capture/integration fixtures, run `cargo make smoke-real-world-job`, then run the pre-push gate: `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing y/elf-xy-844.", "claims": [ { "claim_id": "exact_next_action", @@ -49,7 +49,7 @@ }, { "claim_id": 
"validation_sequence", - "text": "Run `cargo make real-world-job-smoke`, then `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing.", + "text": "Run `cargo make smoke-real-world-job`, then `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing.", "evidence_ids": ["xy844-next-action-handoff"], "confidence": "high" } @@ -101,7 +101,7 @@ }, { "claim_id": "validation_sequence", - "text": "Run `cargo make real-world-job-smoke`, then `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks` before pushing." + "text": "Run `cargo make smoke-real-world-job`, then `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing." } ], "must_not_include": [ @@ -127,7 +127,7 @@ "evidence_id": "xy844-next-action-handoff", "claim_id": "validation_sequence", "requirement": "use", - "quote": "run `cargo make real-world-job-smoke`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make checks`" + "quote": "run `cargo make smoke-real-world-job`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make check`" } ], "negative_traps": [ diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index ff9d3c6f..a9a6a8f7 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -944,7 +944,7 @@ fn assert_graph_rag_research_gate_records(ragflow: &Value, lightrag: &Value, gra ); assert_eq!( ragflow.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make ragflow-docker-smoke") + Some("cargo make smoke-ragflow-docker") ); assert_eq!( ragflow.pointer("/result/artifact").and_then(Value::as_str), @@ -958,11 +958,11 @@ fn assert_graph_rag_research_gate_records(ragflow: &Value, lightrag: &Value, gra assert_eq!(lightrag.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert_eq!( lightrag.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make 
lightrag-docker-context-smoke") + Some("cargo make smoke-lightrag-docker-context") ); assert_eq!( lightrag.pointer("/run/command").and_then(Value::as_str), - Some("ELF_LIGHTRAG_CONTEXT_START=1 cargo make lightrag-docker-context-smoke") + Some("ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context") ); assert_eq!( lightrag.pointer("/capabilities/3/status").and_then(Value::as_str), @@ -971,7 +971,7 @@ fn assert_graph_rag_research_gate_records(ragflow: &Value, lightrag: &Value, gra assert_eq!(graphrag.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); assert_eq!( graphrag.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make graphrag-docker-smoke") + Some("cargo make smoke-graphrag-docker") ); assert_eq!(graphrag.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded")); } @@ -1389,12 +1389,12 @@ fn assert_graphiti_zep_adapter(adapter: &Value) { assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert_eq!( adapter.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make graphiti-zep-docker-temporal-smoke") + Some("cargo make smoke-graphiti-zep-docker-temporal") ); assert_eq!( adapter.pointer("/run/command").and_then(Value::as_str), Some( - "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke" + "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal" ) ); assert_eq!( @@ -1418,7 +1418,7 @@ fn assert_graphify_adapter(adapter: &Value) -> Result<()> { assert_eq!(adapter.pointer("/result/status").and_then(Value::as_str), Some("wrong_result")); assert_eq!( adapter.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make graphify-docker-graph-report-smoke") + Some("cargo make smoke-graphify-docker-graph-report") ); assert_eq!( adapter.pointer("/suites/0/suite_id").and_then(Value::as_str), @@ -1526,13 +1526,13 @@ fn 
graphify_generated_manifest_keeps_retrieval_unscored() -> Result<()> { "setup": { "status": "pass", "evidence": "setup evidence", - "command": "cargo make graphify-docker-graph-report-smoke", + "command": "cargo make smoke-graphify-docker-graph-report", "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" }, "run": { "status": "pass", "evidence": "run evidence", - "command": "cargo make graphify-docker-graph-report-smoke", + "command": "cargo make smoke-graphify-docker-graph-report", "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" }, "result": { @@ -1559,7 +1559,7 @@ fn graphify_generated_manifest_keeps_retrieval_unscored() -> Result<()> { ], "evidence": [], "execution_metadata": { - "setup_path": "cargo make graphify-docker-graph-report-smoke", + "setup_path": "cargo make smoke-graphify-docker-graph-report", "runtime_boundary": "Docker-only generated graph/report smoke.", "resource_expectation": "Tiny generated corpus only.", "retry_guidance": [], @@ -1673,9 +1673,16 @@ fn graph_rag_representative_fixtures_report_typed_non_pass_states() -> Result<() #[test] fn live_adapter_aggregate_forwards_graph_rag_smoke_controls() -> Result<()> { - let makefile = fs::read_to_string( - Path::new(env!("CARGO_MANIFEST_DIR")).join("..").join("..").join("Makefile.toml"), - )?; + let workspace = workspace_root()?; + let makefile = fs::read_to_string(workspace.join("Makefile.toml"))?; + let docker_script = fs::read_to_string(workspace.join("scripts/real-world-docker.sh"))?; + + assert!( + makefile.contains("[tasks.real-world-memory-live-adapters]") + && makefile.contains("scripts/real-world-docker.sh") + && makefile.contains("memory-live-adapters"), + "Makefile should expose the live-adapter command and delegate Docker details to a script", + ); for env_name in [ "ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW", @@ -1693,17 +1700,17 @@ fn live_adapter_aggregate_forwards_graph_rag_smoke_controls() -> Result<()> { "ELF_GRAPHIFY_SMOKE_RUN", ] { assert!( - 
makefile.contains(&format!("-e {env_name}")), + docker_script.contains(&format!("-e {env_name}")), "real-world-memory-live-adapters must forward {env_name}", ); } assert!( - makefile.contains("--profile lightrag up -d lightrag"), + docker_script.contains("--profile lightrag up -d lightrag"), "aggregate task should start LightRAG profile when ELF_LIGHTRAG_CONTEXT_START=1", ); assert!( - makefile.contains("--profile graphiti-zep up -d graphiti-falkordb"), + docker_script.contains("--profile graphiti-zep up -d graphiti-falkordb"), "aggregate task should start Graphiti/Zep profile when ELF_GRAPHITI_ZEP_SMOKE_START=1", ); @@ -1714,6 +1721,7 @@ fn live_adapter_aggregate_forwards_graph_rag_smoke_controls() -> Result<()> { fn openmemory_ui_export_probe_has_dedicated_docker_task() -> Result<()> { let workspace_root = workspace_root()?; let makefile = fs::read_to_string(workspace_root.join("Makefile.toml"))?; + let docker_script = fs::read_to_string(workspace_root.join("scripts/baseline-docker.sh"))?; let compose = fs::read_to_string(workspace_root.join("docker-compose.baseline.yml"))?; let script = fs::read_to_string(workspace_root.join("scripts/live-baseline-benchmark.sh"))?; let report = serde_json::from_str::(&fs::read_to_string( @@ -1721,7 +1729,9 @@ fn openmemory_ui_export_probe_has_dedicated_docker_task() -> Result<()> { )?)?; assert!(makefile.contains("[tasks.openmemory-ui-export-readback]")); - assert!(makefile.contains("export ELF_BASELINE_PROJECTS=mem0")); + assert!(makefile.contains("scripts/baseline-docker.sh")); + assert!(makefile.contains("openmemory-ui-export-readback")); + assert!(docker_script.contains("export ELF_BASELINE_PROJECTS=mem0")); assert!(compose.contains("ELF_MEM0_OPENMEMORY_EXPORT_USER_ID")); assert!(compose.contains("ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER")); assert!(script.contains("probe_mem0_openmemory_ui_export")); @@ -1756,6 +1766,7 @@ fn openmemory_ui_export_probe_has_dedicated_docker_task() -> Result<()> { fn 
operator_debug_live_adapter_task_is_docker_scoped() -> Result<()> { let workspace = workspace_root()?; let makefile = fs::read_to_string(workspace.join("Makefile.toml"))?; + let docker_script = fs::read_to_string(workspace.join("scripts/real-world-docker.sh"))?; let script = fs::read_to_string( workspace.join("scripts").join("real-world-operator-debug-live-adapters.sh"), )?; @@ -1765,8 +1776,12 @@ fn operator_debug_live_adapter_task_is_docker_scoped() -> Result<()> { fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_job_benchmark.rs"))?; assert!(makefile.contains("[tasks.real-world-job-operator-ux-live-adapters]")); - assert!(makefile.contains("docker compose -f docker-compose.baseline.yml run --build --rm")); - assert!(makefile.contains("scripts/real-world-operator-debug-live-adapters.sh")); + assert!(makefile.contains("scripts/real-world-docker.sh")); + assert!(makefile.contains("job-operator-ux-live-adapters")); + assert!( + docker_script.contains("docker compose -f docker-compose.baseline.yml run --build --rm") + ); + assert!(docker_script.contains("scripts/real-world-operator-debug-live-adapters.sh")); assert!(script.contains("apps/elf-eval/fixtures/real_world_job/operator_debugging_ux")); assert!(script.contains("elf_operator_debug_live")); assert!(script.contains("qmd_operator_debug_live")); @@ -2169,7 +2184,11 @@ fn live_consolidation_report_preserves_reviewable_output_boundaries() -> Result< assert!(benchmark_guide.contains("Current live consolidation increment")); assert!(benchmark_guide.contains("tmp/real-world-memory/live-consolidation/summary.json")); assert!(makefile.contains("[tasks.real-world-memory-live-consolidation]")); - assert!(makefile.contains("scripts/real-world-consolidation-live-adapter.sh")); + assert!(makefile.contains("scripts/real-world-docker.sh")); + + let docker_script = fs::read_to_string(workspace.join("scripts/real-world-docker.sh"))?; + + 
assert!(docker_script.contains("scripts/real-world-consolidation-live-adapter.sh")); assert!(live_script.contains("elf.real_world_consolidation_live_adapter_sweep/v1")); assert!(live_script.contains("real_world_live_adapter -- elf")); assert!(!live_script.contains("real_world_live_adapter -- qmd")); diff --git a/docs/guide/agent-setup.md b/docs/guide/agent-setup.md index e4e81473..57257017 100644 --- a/docs/guide/agent-setup.md +++ b/docs/guide/agent-setup.md @@ -155,7 +155,7 @@ Example: ELF_PG_DSN="postgres://elf_dev:elf_dev_password@127.0.0.1:51888/postgres" \ ELF_QDRANT_GRPC_URL="http://127.0.0.1:51890" \ ELF_QDRANT_HTTP_URL="http://127.0.0.1:51889" \ -cargo make e2e +cargo make test-e2e ``` ## Troubleshooting diff --git a/docs/guide/benchmarking/2026-06-09-live-baseline-report.md b/docs/guide/benchmarking/2026-06-09-live-baseline-report.md index 78df93bb..9551adeb 100644 --- a/docs/guide/benchmarking/2026-06-09-live-baseline-report.md +++ b/docs/guide/benchmarking/2026-06-09-live-baseline-report.md @@ -230,7 +230,7 @@ cargo make baseline-live-report Clean Docker-owned state: ```sh -cargo make baseline-live-docker-clean +cargo make clean-baseline-live-docker ``` The only host report directory is `tmp/live-baseline/`. Raw generated JSON stays there diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 4f960804..12aeeb01 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -99,8 +99,8 @@ results, or lifecycle failures into one aggregate leaderboard. 
| `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `2026-06-11-first-generation-oss-adapter-promotion-report.md` | mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result. | | `cargo make real-world-first-generation-oss` | `2026-06-11-first-generation-oss-continuity-source-store-report.md` | First-generation OSS fixture slice reports 6 jobs: 4 pass, 2 blocked, full evidence/source-ref/quote coverage, and manifest scenario outcomes across win, tie, loss, not_tested, blocked, and non_goal without promoting smoke evidence into live suite passes. | | `cargo make openmemory-ui-export-readback` | `2026-06-11-mem0-openmemory-history-ui-export-report.md` | mem0 local OSS passes preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history; OpenMemory export-helper setup emits a separate blocked artifact with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`, and hosted Platform export remains non-goal. | -| `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. | -| `cargo make graphify-docker-graph-report-smoke` | `2026-06-11-graph-rag-scored-smoke-adapter-report.md` | graphify reaches tiny Docker graph/report scoring but remains wrong_result. | +| `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. | +| `cargo make smoke-graphify-docker-graph-report` | `2026-06-11-graph-rag-scored-smoke-adapter-report.md` | graphify reaches tiny Docker graph/report scoring but remains wrong_result. 
| | `cargo make real-world-memory-graph-rag` | `tmp/real-world-memory/graph-rag/report.json` | Representative graph/RAG fixtures produce typed non-pass reports: RAGFlow, GraphRAG, and Graphiti/Zep blocked; LightRAG incomplete with comparison blocked; graphify wrong_result; llm-wiki not_tested; gbrain blocked; private/hosted profiles non_goal. | | `cargo make baseline-production-synthetic`, `cargo make baseline-backfill-docker`, backup/restore, Qdrant rebuild proof | `2026-06-10-production-adoption-refresh.md` | ELF has provider synthetic, stress, backfill, restore, and rebuild evidence; private-corpus proof is blocked by missing operator-owned manifest. | | `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` plus ELF trace-bundle and qmd CLI replay commands | `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md` | Retrieval correctness remains tied, but qmd wins current immediate top-10/replay artifact ergonomics; ELF trace/admin surfaces are useful but not yet hydrated into the default stress artifact. | diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index c48bdcf2..6402b188 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -90,16 +90,16 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | memsearch | Markdown-first canonical store with rebuildable local index and practical hybrid retrieval. | `live_baseline_only`; XY-925 `fixture_backed`. | `pass`: fresh scoped run `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`, with memsearch `4/4` local checks passing. 
XY-925 adds fixture-backed source-store and retrieval-debug prompts through `cargo make real-world-first-generation-oss`, `tmp/real-world-memory/first-generation-oss/report.json`. | `not_encoded`: no live memsearch runtime adapter executes real-world prompt scoring; memory-evolution prompt adapters remain not encoded; TTL/expiry is unsupported by the current CLI path. | Promote the fixture-backed source-store and retrieval-debug prompts into a live memsearch real-world adapter before any suite-level win/loss claim; keep TTL/expiry as unsupported unless a comparable path exists. | Canonical markdown store, local reindex clarity, and user-inspectable source files. | | OpenViking | Filesystem-like context trajectory, hierarchical retrieval, and staged context loading. | `live_baseline_only`; supporting `fixture_backed` and `research_gate`. | `wrong_result`: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`; `blocked`: checked-in `context_trajectory` fixtures cover staged retrieval, hierarchy selection, and recursive/context expansion gates. | `blocked`: hierarchical context trajectory is encoded but blocked until same-corpus evidence ids match and staged artifacts are materialized. | Make evidence-bearing same-corpus output pass, then score staged trajectory and hierarchy expansion. | `viking://`-style context model, trajectory readback, and staged retrieval planning. | | claude-mem | Progressive disclosure, automatic capture loop, repository-local lifecycle, and local viewer workflow. | `live_baseline_only`; XY-925 `fixture_backed`. | `wrong_result`: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompts through `cargo make real-world-first-generation-oss`, `tmp/real-world-memory/first-generation-oss/report.json`. 
| `blocked`: hook capture and viewer/operator workflows still lack a Docker-contained runner; retrieval remains `wrong_result`, and the repair prompt lists rerun/inspection targets `tmp/live-baseline/claude-mem.log` and `tmp/live-baseline/claude-mem-checks.json`. | Promote durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, and progressive-disclosure prompts into a live claude-mem adapter before any broader UX claim. | Progressive disclosure, automatic capture review loops, and local viewer/operator comfort. | -| RAGFlow | Full RAG application workflow with document, chunk, and reference evidence handles. | `research_gate`. | `blocked`: `ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make ragflow-docker-smoke`, `tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json`. | `blocked`: Docker resource envelope and adapter output mapping still need proof. | XY-885 tiny Docker evidence-smoke adapter mapping `reference.chunks` to scored evidence. | Document/chunk references, resource-envelope reporting, and RAG app evidence handles. | -| LightRAG | Lightweight graph/RAG context export with source file-path citation shape. | `research_gate`. | `blocked`: `ELF_LIGHTRAG_CONTEXT_START=1 cargo make lightrag-docker-context-smoke`, `tmp/real-world-memory/lightrag-context/summary.json`. | `blocked`: Docker service setup and context export are not proven. | XY-886 Docker context-export adapter with explicit provider config and source citation mapping. | Context-only query modes, graph-aware retrieval layout, and file-path citation readback. | -| GraphRAG | GraphRAG indexing, graph summaries, and document/text-unit evidence tables. | `research_gate`. | `blocked`: `ELF_GRAPHRAG_SMOKE_RUN=1 cargo make graphrag-docker-smoke`, `tmp/real-world-memory/graphrag-smoke/summary.json`. | `blocked`: indexing resource envelope and source citation mapping are not proven. 
| XY-887 cost-bounded Docker adapter over a tiny corpus and scored output tables. | Graph summary artifacts, local/global search separation, and source table evidence mapping. | -| Graphiti/Zep | Temporal graph memory with current, historical, and future fact validity windows. | `research_gate`. | `blocked`: `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke`, `tmp/real-world-memory/graphiti-zep-smoke/summary.json`. | `blocked`: Docker graph-store and temporal adapter are not proven. | XY-888 Docker-local temporal graph adapter scoring current/historical fact validity. | Temporal fact windows, invalidation/supersession semantics, and graph fact provenance. | +| RAGFlow | Full RAG application workflow with document, chunk, and reference evidence handles. | `research_gate`. | `blocked`: `ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker`, `tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json`. | `blocked`: Docker resource envelope and adapter output mapping still need proof. | XY-885 tiny Docker evidence-smoke adapter mapping `reference.chunks` to scored evidence. | Document/chunk references, resource-envelope reporting, and RAG app evidence handles. | +| LightRAG | Lightweight graph/RAG context export with source file-path citation shape. | `research_gate`. | `blocked`: `ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context`, `tmp/real-world-memory/lightrag-context/summary.json`. | `blocked`: Docker service setup and context export are not proven. | XY-886 Docker context-export adapter with explicit provider config and source citation mapping. | Context-only query modes, graph-aware retrieval layout, and file-path citation readback. | +| GraphRAG | GraphRAG indexing, graph summaries, and document/text-unit evidence tables. | `research_gate`. 
| `blocked`: `ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker`, `tmp/real-world-memory/graphrag-smoke/summary.json`. | `blocked`: indexing resource envelope and source citation mapping are not proven. | XY-887 cost-bounded Docker adapter over a tiny corpus and scored output tables. | Graph summary artifacts, local/global search separation, and source table evidence mapping. | +| Graphiti/Zep | Temporal graph memory with current, historical, and future fact validity windows. | `research_gate`. | `blocked`: `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal`, `tmp/real-world-memory/graphiti-zep-smoke/summary.json`. | `blocked`: Docker graph-store and temporal adapter are not proven. | XY-888 Docker-local temporal graph adapter scoring current/historical fact validity. | Temporal fact windows, invalidation/supersession semantics, and graph fact provenance. | | Letta | Core memory blocks versus archival memory with explicit operating-context surfaces. | `research_gate`. | `blocked`: the selected comparison contract is a Docker-only benchmark-created agent export that returns core block JSON, archival search/readback JSON, and source ids; no materialized export exists yet. | `blocked`: no Letta materializer currently creates the benchmark agent, imports the ELF `core_archival_memory` fixture corpus, or exports comparable core and archival evidence. | Implement and run the contained export/readback adapter before any Letta win, tie, or loss claim; keep personalization and project-decision scenarios blocked or not tested until that evidence exists. | Core memory block ergonomics, archival separation, and shared operating context readback. | | LangGraph | Checkpoint/replay regression workflow and durable state replay for agent runs. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: not a standalone memory backend adapter. 
| Non-goal for direct win/loss until a standalone memory output contract exists; use replay jobs as benchmark infrastructure reference. | Checkpoint replay, deterministic regression, and state-diff evaluation patterns. | | nanograph | Typed graph schema and query ergonomics for graph-lite developer experience. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: not a memory backend comparison target. | Non-goal for direct win/loss unless a contained memory-backed target emerges; measure ELF graph-lite DX instead. | Typed relation schema, query ergonomics, and small graph developer experience. | | llm-wiki | LLM-maintained wiki or knowledge-page workflow with query-save and lint loops. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: no live service runtime for adapter proof. | Select contained plugin or instruction harness, then score knowledge pages for citations, unsupported claims, rebuild, and stale-source lint. | Maintained wiki workflows, page lint, query-save loops, and topic-scoped navigation. | | gbrain | Operational knowledge brain with compiled_truth pages, timelines, enrichment, and maintenance loops. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `blocked`: Docker-local brain repo and database path are missing. | Prove Docker-local repository/database setup, then encode compiled_truth/timeline and operator-continuity jobs. | Compiled truth pages, timeline maintenance, and human-operable knowledge-brain navigation. | -| graphify | Graph-compressed navigation with `graph.json` and `GRAPH_REPORT` evidence outputs. | Scored tiny `live_real_world` smoke; not broad graph-quality proof. | `wrong_result`: `cargo make graphify-docker-graph-report-smoke`, `tmp/real-world-memory/graphify-smoke/graphify-report.json`. 
| `not_encoded`: broad graph navigation, multimodal, private-corpus, and large-corpus quality remain outside the tiny smoke. | Expand beyond the generated smoke only after graph/report output maps to scored evidence on representative graph/RAG jobs. | Graph compression, source-location graph reports, and navigation hints for large code or document spaces. | +| graphify | Graph-compressed navigation with `graph.json` and `GRAPH_REPORT` evidence outputs. | Scored tiny `live_real_world` smoke; not broad graph-quality proof. | `wrong_result`: `cargo make smoke-graphify-docker-graph-report`, `tmp/real-world-memory/graphify-smoke/graphify-report.json`. | `not_encoded`: broad graph navigation, multimodal, private-corpus, and large-corpus quality remain outside the tiny smoke. | Expand beyond the generated smoke only after graph/report output maps to scored evidence on representative graph/RAG jobs. | Graph compression, source-location graph reports, and navigation hints for large code or document spaces. | ## Scenario Matrix diff --git a/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md b/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md index 542e0839..290092d3 100644 --- a/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md +++ b/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md @@ -39,11 +39,11 @@ contract, not the quality claim. | Project | Scored scenario | Command | Current scored status | Claim boundary | | --- | --- | --- | --- | --- | -| RAGFlow | `retrieval`: reference chunks mapped to generated evidence ids | `cargo make ragflow-docker-smoke` | `blocked` or `incomplete` by execution boundary | Smoke-only. No RAGFlow quality claim until returned reference chunks map to `ragflow-smoke-anchor`. 
| -| LightRAG | `retrieval`: context/source export mapped to fixture evidence ids | `cargo make lightrag-docker-context-smoke` | `incomplete` when the API service is not started | Smoke-only. No graph-RAG quality claim until context or references map to generated evidence ids. | -| GraphRAG | `knowledge_compilation`: output tables mapped to generated evidence ids | `cargo make graphrag-docker-smoke` | `blocked` | Smoke-only. No graph-navigation or synthesis claim until output tables map to generated evidence ids. | -| Graphiti/Zep | `memory_evolution`: current and historical validity facts | `cargo make graphiti-zep-docker-temporal-smoke` | `blocked` before live opt-in; `provider_api_key_missing` when live path is enabled without explicit credentials | Provider-bound. No ELF-over-Graphiti/Zep claim until temporal output maps to scored evidence ids. | -| graphify | `knowledge_compilation`: `graph.json`, `GRAPH_REPORT.md`, and query output mapping | `cargo make graphify-docker-graph-report-smoke` | `wrong_result` after setup/run pass | Scored tiny smoke. The graph/report output maps to evidence ids, but the job remains non-pass; no broad graph-navigation quality claim follows. | +| RAGFlow | `retrieval`: reference chunks mapped to generated evidence ids | `cargo make smoke-ragflow-docker` | `blocked` or `incomplete` by execution boundary | Smoke-only. No RAGFlow quality claim until returned reference chunks map to `ragflow-smoke-anchor`. | +| LightRAG | `retrieval`: context/source export mapped to fixture evidence ids | `cargo make smoke-lightrag-docker-context` | `incomplete` when the API service is not started | Smoke-only. No graph-RAG quality claim until context or references map to generated evidence ids. | +| GraphRAG | `knowledge_compilation`: output tables mapped to generated evidence ids | `cargo make smoke-graphrag-docker` | `blocked` | Smoke-only. No graph-navigation or synthesis claim until output tables map to generated evidence ids. 
| +| Graphiti/Zep | `memory_evolution`: current and historical validity facts | `cargo make smoke-graphiti-zep-docker-temporal` | `blocked` before live opt-in; `provider_api_key_missing` when live path is enabled without explicit credentials | Provider-bound. No ELF-over-Graphiti/Zep claim until temporal output maps to scored evidence ids. | +| graphify | `knowledge_compilation`: `graph.json`, `GRAPH_REPORT.md`, and query output mapping | `cargo make smoke-graphify-docker-graph-report` | `wrong_result` after setup/run pass | Scored tiny smoke. The graph/report output maps to evidence ids, but the job remains non-pass; no broad graph-navigation quality claim follows. | ## Artifact Contract diff --git a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md index a9bee44c..40fca7fa 100644 --- a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md +++ b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md @@ -53,7 +53,7 @@ clear answer and trace. 
| Command | Result | Runtime | Main artifact | | --- | --- | ---: | --- | -| `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke` | typed blocked | 3.5 seconds | `tmp/real-world-memory/graphiti-zep-smoke/summary.json` | +| `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal` | typed blocked | 3.5 seconds | `tmp/real-world-memory/graphiti-zep-smoke/summary.json` | | `ELF_BASELINE_PROJECTS=ELF,mem0 cargo make baseline-live-docker` | pass | 50.14 seconds | `tmp/live-baseline/live-baseline-report.json` | | `cargo make real-world-memory-evolution` | pass | 59.65 seconds | `tmp/real-world-memory/evolution-report.json` | | `cargo make real-world-memory-live-adapters` | pass | 166.61 seconds | `tmp/real-world-memory/live-adapters/` | diff --git a/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md b/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md index 7907c225..f0d5dedd 100644 --- a/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md +++ b/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md @@ -79,16 +79,16 @@ This section is manifest-backed. It records external adapter coverage and blocke | claude-mem | `claude_mem_live_baseline` | `live_baseline_only` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `work_resume`: `not_encoded`
`operator_debugging_ux`: `blocked`
`capture_integration`: `blocked` | setup: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | | qmd | `qmd_deep_profile_gate` | `research_gate` | `not_encoded` | `pass` | `not_encoded` | `not_encoded` | `true` | `retrieval`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker`
result: `docs/research/2026-06-11-qmd-openviking-strength-profile-report.json` | | OpenViking | `openviking_deep_profile_gate` | `research_gate` | `blocked` | `pass` | `blocked` | `blocked` | `true` | `retrieval`: `wrong_result`
`context_trajectory`: `blocked`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`
result: `docs/research/2026-06-11-qmd-openviking-strength-profile-report.json` | -| RAGFlow | `ragflow_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`
`knowledge_compilation`: `not_encoded`
`production_ops`: `blocked` | setup: `cargo make ragflow-docker-smoke`
result: `tmp/real-world-memory/ragflow-smoke/ragflow-report.json` | -| LightRAG | `lightrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`
`memory_evolution`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `cargo make lightrag-docker-context-smoke`
result: `tmp/real-world-memory/lightrag-context/lightrag-report.json` | -| GraphRAG | `graphrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `knowledge_compilation`: `blocked`
`retrieval`: `not_encoded`
`production_ops`: `not_encoded`
`memory_evolution`: `not_encoded` | setup: `cargo make graphrag-docker-smoke`
result: `tmp/real-world-memory/graphrag-smoke/graphrag-report.json` | -| Graphiti/Zep | `graphiti_zep_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `memory_evolution`: `blocked`
`retrieval`: `not_encoded`
`production_ops`: `not_encoded` | setup: `cargo make graphiti-zep-docker-temporal-smoke`
result: `tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json` | +| RAGFlow | `ragflow_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`
`knowledge_compilation`: `not_encoded`
`production_ops`: `blocked` | setup: `cargo make smoke-ragflow-docker`
result: `tmp/real-world-memory/ragflow-smoke/ragflow-report.json` | +| LightRAG | `lightrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`
`memory_evolution`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `cargo make smoke-lightrag-docker-context`
result: `tmp/real-world-memory/lightrag-context/lightrag-report.json` | +| GraphRAG | `graphrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `knowledge_compilation`: `blocked`
`retrieval`: `not_encoded`
`production_ops`: `not_encoded`
`memory_evolution`: `not_encoded` | setup: `cargo make smoke-graphrag-docker`
result: `tmp/real-world-memory/graphrag-smoke/graphrag-report.json` | +| Graphiti/Zep | `graphiti_zep_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `memory_evolution`: `blocked`
`retrieval`: `not_encoded`
`production_ops`: `not_encoded` | setup: `cargo make smoke-graphiti-zep-docker-temporal`
result: `tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json` | | Letta | `letta_research_gate` | `research_gate` | `blocked` | `blocked` | `not_encoded` | `not_encoded` | `true` | `personalization`: `not_encoded`
`project_decisions`: `not_encoded`
`work_resume`: `not_encoded`
`core_archival_memory`: `blocked` | setup: `Letta is D1 reviewed as a core/archival memory reference. The contained comparison contract is a Docker-only benchmark-created agent export that must return core block JSON, archival search readback, and source ids before any scenario claim is scored.`
result: `No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision result is claimed.` | | LangGraph | `langgraph_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `production_ops`: `not_encoded`
`work_resume`: `not_encoded` | setup: `LangGraph is D1 reviewed as a replay/checkpoint reference, not a direct memory backend adapter.`
result: `No production-ops or resume suite result is claimed.` | | nanograph | `nanograph_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `memory_evolution`: `not_encoded`
`retrieval`: `not_encoded` | setup: `nanograph is D1 reviewed as typed graph DX, but no Docker adapter is implemented.`
result: `No graph temporal or retrieval-debug result is claimed.` | | llm-wiki | `llm_wiki_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `knowledge_compilation`: `not_encoded`
`work_resume`: `not_encoded` | setup: `llm-wiki is D1 reviewed as a knowledge-compilation reference, but no plugin or generated-page adapter is implemented.`
result: `No knowledge page citation or lint result is claimed.` | | gbrain | `gbrain_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `knowledge_compilation`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `gbrain is D1 reviewed as a compiled-truth and timeline reference, but no Docker adapter is implemented.`
result: `No knowledge-synthesis or operator-continuity result is claimed.` | -| graphify | `graphify_docker_smoke` | `live_real_world` | `wrong_result` | `pass` | `pass` | `wrong_result` | `true` | `knowledge_compilation`: `wrong_result`
`retrieval`: `blocked`
`work_resume`: `not_encoded` | setup: `cargo make graphify-docker-graph-report-smoke`
result: `tmp/real-world-memory/graphify-smoke/graphify-report.json` | +| graphify | `graphify_docker_smoke` | `live_real_world` | `wrong_result` | `pass` | `pass` | `wrong_result` | `true` | `knowledge_compilation`: `wrong_result`
`retrieval`: `blocked`
`work_resume`: `not_encoded` | setup: `cargo make smoke-graphify-docker-graph-report`
result: `tmp/real-world-memory/graphify-smoke/graphify-report.json` | ### Adapter Capability Details @@ -267,16 +267,16 @@ This section is manifest-backed. It records external adapter coverage and blocke | `openviking_live_baseline` | [OpenViking repository](https://github.com/volcengine/OpenViking/): Official source for OpenViking local context database, resource, and retrieval APIs.
[llama-cpp-python CPU wheel index](https://abetlen.github.io/llama-cpp-python/whl/cpu): Official prebuilt CPU wheel index used by the Docker-local embedding pin. | Run ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker. The runner installs llama-cpp-python==0.3.28 with --only-binary llama-cpp-python from the CPU wheel index before OpenViking add_resource/find. | docker-compose.baseline.yml baseline-runner container; no host-global OpenViking, llama-cpp-python, or model service install is required. | Local embedding setup may download a CPU wheel and model assets; record OpenViking.log, elapsed time, and cache size before claiming adapter quality. | Use the default pinned CPU wheel path first.; Override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the default wheel is unavailable for the Docker platform.; Treat install/import failure as incomplete, not wrong_result; treat add_resource/find evidence misses as wrong_result. | not recorded | | `qmd_deep_profile_gate` | [qmd repository](https://github.com/tobi/qmd): Official qmd source for local hybrid search, CLI setup, and query behavior. | Use the existing Docker baseline qmd install, collection add, update, embed, and query flow with scale or stress profiles. | docker-compose.baseline.yml baseline-runner container with project files and caches inside Docker volumes. | CPU local embedding and rerank cost scale with corpus size; record elapsed time and qmd log artifacts before claims. | Run qmd stress profile in Docker and publish the artifact path.; Map qmd JSON output to retrieval-debug real_world_job scoring before suite claims. | D2 reviewed; deep profile not encoded | | `openviking_deep_profile_gate` | [OpenViking repository](https://github.com/volcengine/OpenViking/): Official source for OpenViking local context database, resource, and retrieval APIs. 
| Use the pinned Docker local embedding path from scripts/live-baseline-benchmark.sh, then run OpenViking add_resource/find before any deep profile scoring. | docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker. | Local embedding setup can download CPU wheels and model assets; record build/import logs, model cache size, and elapsed time. | Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first.; Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform.; Fix evidence-bearing same-corpus output and materialize selected hierarchy/expansion artifacts before converting blocked context_trajectory fixtures into scored jobs. | D2 reviewed; local embedding setup pinned; blocked fixtures encoded | -| `ragflow_research_gate` | [RAGFlow repository](https://github.com/infiniflow/ragflow): Official source for RAGFlow service code and Docker Compose setup.
[RAGFlow docs](https://ragflow.io/docs/): Official deployment and setup documentation.
[RAGFlow HTTP API reference](https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md): Official reference for OpenAI-compatible responses with reference chunks and document metadata. | Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API. | Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs. | Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring. | Run cargo make ragflow-docker-smoke first to produce a typed preflight artifact.; Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.; Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids. | D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output | -| `lightrag_research_gate` | [LightRAG repository](https://github.com/HKUDS/LightRAG): Official source for LightRAG server, Docker, and retrieval modes.
[LightRAG Docker docs](https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md): Official Docker deployment reference.
[LightRAG API server docs](https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md): Official query-mode and context-output reference.
[LightRAG core programming docs](https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md): Official source-id and file-path citation reference. | Run cargo make lightrag-docker-context-smoke for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export. | docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes. | The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts. | Run cargo make lightrag-docker-context-smoke first; a missing API must remain a typed incomplete artifact, not a pass claim.; Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.; Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids. | D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output | -| `graphrag_research_gate` | [GraphRAG repository](https://github.com/microsoft/graphrag): Official Microsoft GraphRAG source and setup reference.
[GraphRAG docs](https://microsoft.github.io/graphrag/): Official documentation for indexing and querying.
[GraphRAG input docs](https://microsoft.github.io/graphrag/index/inputs/): Official input format and document metadata reference.
[GraphRAG output tables](https://microsoft.github.io/graphrag/index/outputs/): Official output schema with document, text unit, community, and relationship identifiers.
[GraphRAG local search docs](https://microsoft.github.io/graphrag/query/local_search/): Official local-search context and graph traversal reference. | Run cargo make graphrag-docker-smoke for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt. | docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke. | The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries. | Run cargo make graphrag-docker-smoke first; missing provider configuration must remain a typed blocked artifact, not a pass claim.; Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.; Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs. | D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output | -| `graphiti_zep_research_gate` | [Graphiti repository](https://github.com/getzep/graphiti): Official open-source temporal context graph engine.
[Zep Graphiti overview](https://www.getzep.com/platform/graphiti/): Official product documentation for temporal context graph behavior.
[Graphiti quick start](https://help.getzep.com/graphiti/getting-started/quick-start): Official setup, episode ingest, and search output reference.
[Graphiti FalkorDB configuration](https://help.getzep.com/graphiti/configuration/falkor-db-configuration): Official Docker-local FalkorDB setup reference.
[Graphiti fact triples](https://help.getzep.com/graphiti/working-with-data/adding-fact-triples): Official manual fact-triple ingest contract. | Run cargo make graphiti-zep-docker-temporal-smoke for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt. | docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke. | Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring. | Run cargo make graphiti-zep-docker-temporal-smoke first to produce a typed blocked artifact.; Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.; Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass. | D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output | +| `ragflow_research_gate` | [RAGFlow repository](https://github.com/infiniflow/ragflow): Official source for RAGFlow service code and Docker Compose setup.
[RAGFlow docs](https://ragflow.io/docs/): Official deployment and setup documentation.
[RAGFlow HTTP API reference](https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md): Official reference for OpenAI-compatible responses with reference chunks and document metadata. | Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API. | Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs. | Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring. | Run cargo make smoke-ragflow-docker first to produce a typed preflight artifact.; Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.; Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids. | D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output | +| `lightrag_research_gate` | [LightRAG repository](https://github.com/HKUDS/LightRAG): Official source for LightRAG server, Docker, and retrieval modes.
[LightRAG Docker docs](https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md): Official Docker deployment reference.
[LightRAG API server docs](https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md): Official query-mode and context-output reference.
[LightRAG core programming docs](https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md): Official source-id and file-path citation reference. | Run cargo make smoke-lightrag-docker-context for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export. | docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes. | The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts. | Run cargo make smoke-lightrag-docker-context first; a missing API must remain a typed incomplete artifact, not a pass claim.; Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.; Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids. | D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output | +| `graphrag_research_gate` | [GraphRAG repository](https://github.com/microsoft/graphrag): Official Microsoft GraphRAG source and setup reference.
[GraphRAG docs](https://microsoft.github.io/graphrag/): Official documentation for indexing and querying.
[GraphRAG input docs](https://microsoft.github.io/graphrag/index/inputs/): Official input format and document metadata reference.
[GraphRAG output tables](https://microsoft.github.io/graphrag/index/outputs/): Official output schema with document, text unit, community, and relationship identifiers.
[GraphRAG local search docs](https://microsoft.github.io/graphrag/query/local_search/): Official local-search context and graph traversal reference. | Run cargo make smoke-graphrag-docker for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt. | docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke. | The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries. | Run cargo make smoke-graphrag-docker first; missing provider configuration must remain a typed blocked artifact, not a pass claim.; Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.; Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs. | D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output | +| `graphiti_zep_research_gate` | [Graphiti repository](https://github.com/getzep/graphiti): Official open-source temporal context graph engine.
[Zep Graphiti overview](https://www.getzep.com/platform/graphiti/): Official product documentation for temporal context graph behavior.
[Graphiti quick start](https://help.getzep.com/graphiti/getting-started/quick-start): Official setup, episode ingest, and search output reference.
[Graphiti FalkorDB configuration](https://help.getzep.com/graphiti/configuration/falkor-db-configuration): Official Docker-local FalkorDB setup reference.
[Graphiti fact triples](https://help.getzep.com/graphiti/working-with-data/adding-fact-triples): Official manual fact-triple ingest contract. | Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt. | docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke. | Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring. | Run cargo make smoke-graphiti-zep-docker-temporal first to produce a typed blocked artifact.; Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.; Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass. | D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output | | `letta_research_gate` | [Letta repository](https://github.com/letta-ai/letta): Official source for Letta stateful agents and memory.
[Letta Docker docs](https://docs.letta.com/guides/docker/): Official Docker deployment guide and embedding configuration boundary. | Use a Docker-only Letta server or CLI flow that creates a benchmark-owned agent, loads the checked-in core_archival_memory fixture corpus, writes core memory and archival memory with fixture source ids, then exports core block JSON plus archival search/readback JSON. | Docker-only Letta server or CLI flow with benchmark-created agents, benchmark-owned storage, no host-global state, and no unstated hosted service dependency. | Embedding model, agent server state, exported core memory, archival search output, and provider boundaries must be explicit in the artifact. | Create a tiny Docker agent with core memory and archival memory loaded from the ELF core_archival_memory fixtures.; Export core block readback, archival search results, source ids, and any audit-equivalent metadata as JSON before scoring.; Score core-versus-archival scenarios only after source evidence can be exported and mapped to the fixture evidence ids. | D1 feasibility verdict: research_only (XY-882); XY-927 selects the contained export/readback contract, but the Letta adapter remains blocked until that artifact exists | | `langgraph_research_gate` | [LangGraph persistence docs](https://docs.langchain.com/oss/python/langgraph/persistence): Official documentation for checkpoints, replay, fork, and persistence behavior. | Build a tiny LangGraph agent with a checkpointer and explicit memory read/write steps before scoring. | Docker-only Python harness with checkpoint store under the artifact directory. | Small runtime expected, but LLM calls and side effects must be stubbed or deterministic before replay claims. | Encode one replay/fork failure recovery job.; Keep LangGraph classified as replay reference unless memory retrieval is actually exercised. 
| D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded | | `nanograph_research_gate` | [nanograph repository](https://github.com/nanograph/nanograph): Official source for on-device typed property graph behavior. | Build or install nanograph inside Docker and load a typed graph fixture from generated corpus facts. | Docker-only CLI run with graph folder under benchmark artifacts. | Light local graph runtime expected; record binary build/install time and graph artifact size. | Define a minimal schema for memory_evolution facts.; Score typed query output only if it cites fixture evidence IDs. | D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded | | `llm_wiki_research_gate` | [llm-wiki repository](https://github.com/nvk/llm-wiki): Official source for the LLM Wiki plugin and knowledge-base workflow. | Research plugin bootstrap inside a Docker-contained Codex or file-based harness, then materialize page artifacts. | Docker-only plugin or fixture materializer; no user-global Codex plugin install. | LLM generation cost depends on page build; record provider boundary and generated artifact size. | Prototype a fixture-only page build with explicit citations.; Do not score until generated sections can be mapped to evidence IDs. | D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded | | `gbrain_research_gate` | [gbrain repository](https://github.com/garrytan/gbrain): Official source for brain repo and retrieval workflow.
[compiled truth guide](https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md): Official guide for compiled truth plus timeline behavior. | Create a Docker-local brain repo fixture, run import/sync, and export compiled truth plus timeline evidence. | Docker-only repository and database state with no operator-owned brain repo. | Postgres-backed sync and embedding choices must be explicit; record DB size and import time. | Prototype a tiny brain repo with one current-truth page and timeline.; Score only if compiled truth cites the source timeline evidence. | D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven | -| `graphify_docker_smoke` | [graphify repository](https://github.com/safishamsi/graphify): Official source for graphify graph extraction and query workflow.
[graphify README](https://github.com/safishamsi/graphify/blob/v3/README.md): Official CLI, output artifact, query, and source-location contract. | Run cargo make graphify-docker-graph-report-smoke to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks. | docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke. | Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior. | Run cargo make graphify-docker-graph-report-smoke first; setup/runtime failures must remain typed artifacts, not pass claims.; Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.; Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids. | D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result | +| `graphify_docker_smoke` | [graphify repository](https://github.com/safishamsi/graphify): Official source for graphify graph extraction and query workflow.
[graphify README](https://github.com/safishamsi/graphify/blob/v3/README.md): Official CLI, output artifact, query, and source-location contract. | Run cargo make smoke-graphify-docker-graph-report to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks. | docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke. | Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior. | Run cargo make smoke-graphify-docker-graph-report first; setup/runtime failures must remain typed artifacts, not pass claims.; Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.; Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids. 
| D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result | ## Capture And Integration Coverage diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index ad839597..9d93a2d6 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -405,7 +405,7 @@ tmp/real-world-memory/live-adapters/summary.json To run the checked-in real-world job smoke fixture and render its Markdown report: ```sh -cargo make real-world-job-smoke +cargo make smoke-real-world-job ``` To run the checked-in work-resume, source-of-truth, lifecycle, redaction, @@ -508,7 +508,7 @@ benchmark artifacts, not source-truth replacements. ## Clean Up ```sh -cargo make baseline-live-docker-clean +cargo make clean-baseline-live-docker ``` This removes Docker-managed Postgres, Qdrant, npm, pip, cargo, and target volumes used diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 969dc125..c4e5c141 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -117,7 +117,7 @@ Recommended first increments: Current checked-in smoke increment: ```sh -cargo make real-world-job-smoke +cargo make smoke-real-world-job ``` This parses `apps/elf-eval/fixtures/real_world_memory/work_resume/`, writes diff --git a/docs/guide/competitive_parity_testing.md b/docs/guide/competitive_parity_testing.md index 0497ae74..328bdd91 100644 --- a/docs/guide/competitive_parity_testing.md +++ b/docs/guide/competitive_parity_testing.md @@ -29,7 +29,7 @@ tmp/parity/competitive-parity-report.json Remove parity containers and Docker-managed volumes: ```sh -cargo make 
parity-docker-clean +cargo make clean-parity-docker ``` The cleanup command removes Postgres, Qdrant, Cargo cache, and Rust target volumes diff --git a/docs/guide/evaluation.md b/docs/guide/evaluation.md index 994ab0af..39441ab9 100644 --- a/docs/guide/evaluation.md +++ b/docs/guide/evaluation.md @@ -172,7 +172,7 @@ To measure cross-scope misranking before and after enabling context boosting, us script: ```bash -cargo make e2e +cargo make test-e2e ``` Or run the script directly: @@ -339,12 +339,6 @@ What it does: To validate the reflection/consolidation loop with stable query assertions, use the harness: -```bash -cargo make e2e-consolidation-harness -``` - -Or run directly: - ```bash scripts/consolidation-harness.sh ``` diff --git a/docs/guide/getting_started.md b/docs/guide/getting_started.md index b630c218..f5ede104 100644 --- a/docs/guide/getting_started.md +++ b/docs/guide/getting_started.md @@ -141,7 +141,7 @@ ELF_PG_DSN="postgres://elf_dev:elf_dev_password@127.0.0.1:51888/postgres" \ ELF_QDRANT_GRPC_URL="http://127.0.0.1:51890" \ ELF_QDRANT_HTTP_URL="http://127.0.0.1:51889" \ ELF_HARNESS_VECTOR_DIM=256 \ -cargo make e2e +cargo make test-e2e ``` ## 8. Development workflow @@ -150,17 +150,17 @@ Use `cargo make` tasks from repository root. ```sh cargo make fmt -cargo make lint -cargo make test -cargo make test-integration -cargo make e2e +cargo make check +cargo make test-rust +cargo make test-rust-integration +cargo make test-e2e ``` Notes: -- `cargo make test-integration` runs ignored tests that require external Postgres and Qdrant. +- `cargo make test-rust-integration` runs ignored tests that require external Postgres and Qdrant. Set `ELF_PG_DSN` and `ELF_QDRANT_GRPC_URL`. -- `cargo make e2e` runs the context misranking harness. +- `cargo make test-e2e` runs the context misranking harness. Set `ELF_PG_DSN`, `ELF_QDRANT_GRPC_URL`, and `ELF_QDRANT_HTTP_URL`. - Stop local dependencies with `docker compose -f docker-compose.yml down`. 
Add `-v` only when you intentionally want to delete the local development volumes. diff --git a/docs/guide/integration-testing.md b/docs/guide/integration-testing.md index c6219b46..336715f9 100644 --- a/docs/guide/integration-testing.md +++ b/docs/guide/integration-testing.md @@ -20,7 +20,7 @@ Run the ignored integration suite (requires external Postgres and Qdrant): ```bash ELF_PG_DSN="postgres://postgres:postgres@127.0.0.1:51888/postgres" \ ELF_QDRANT_GRPC_URL="http://127.0.0.1:51890" \ -cargo make test-integration +cargo make test-rust-integration ``` Run the context misranking harness (creates and drops a dedicated database and collection): @@ -29,7 +29,7 @@ Run the context misranking harness (creates and drops a dedicated database and c ELF_PG_DSN="postgres://postgres:postgres@127.0.0.1:51888/postgres" \ ELF_QDRANT_GRPC_URL="http://127.0.0.1:51890" \ ELF_QDRANT_HTTP_URL="http://127.0.0.1:51889" \ -cargo make e2e +cargo make test-e2e ``` CI also runs this harness as a required check for code changes (see `.github/workflows/e2e.yml`). diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md index 7173ecb1..42a861f8 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/guide/research/comparison_external_projects.md @@ -110,7 +110,7 @@ Project-to-suite map: | llm-wiki | `rw.knowledge-synthesis`, `rw.resume-evidence` | Query/save/lint flows and topic-scoped wiki pages are a useful reference for turning retrieved memory into maintained project knowledge. | Run a corpus-to-wiki job, ask resume/decision questions, require page citations back to source memory, then mutate a stale source and prove lint/repair catches it. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for derived-knowledge fit. | ELF is not yet stronger on derived knowledge pages; llm-wiki should inform rebuildable, evidence-cited dossiers rather than core storage. 
| | gbrain | `rw.knowledge-synthesis`, `rw.operator-continuity` | `compiled_truth`, timeline sections, backlinks, primary-home routing, and enrichment workflows model a living operational brain for project work. | Build or update pages from the real-world corpus, require current-truth plus timeline answers, and prove enrichment/backlink maintenance does not hide unsupported claims. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for operator knowledge UX. | ELF should keep source notes authoritative; gbrain is a reference for presentation, enrichment, and maintenance loops. | | Always-On Memory Agent | `rw.consolidation-review`, `rw.operator-continuity` | The file/API/dashboard ingest loop and timer-based consolidation show how background memory formation becomes a user-visible product surface. | Run scheduled consolidation on a fixed corpus, record source rows and output insights, then score whether consolidation is reviewable, repeatable, and bounded against unsupported claims. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for consolidation workflow reference. | ELF should borrow scheduling and operator controls while keeping deterministic writes and reviewable derived outputs. | -| graphify | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Deterministic code extraction, LLM-assisted graph building, honesty tags, graph reports, and assistant hooks are strong references for graph-compressed navigation over large corpora. | Generate graph/report artifacts from the benchmark corpus, require answers to use graph structure plus source evidence, and prove rebuild behavior after corpus edits. | Scored tiny `live_real_world` smoke: `cargo make graphify-docker-graph-report-smoke` records a Docker-only generated-corpus graph/report artifact and currently scores `wrong_result`; the checked-in manifest does not claim broad graph quality, rebuild strength, or production-quality graph navigation. 
Confidence: medium for adapter feasibility, low for production-quality graph navigation. | ELF is stronger as a memory service; graphify is now a runnable reference for derived graph reports and pre-search guidance, but not yet a stronger end-to-end memory system. | +| graphify | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Deterministic code extraction, LLM-assisted graph building, honesty tags, graph reports, and assistant hooks are strong references for graph-compressed navigation over large corpora. | Generate graph/report artifacts from the benchmark corpus, require answers to use graph structure plus source evidence, and prove rebuild behavior after corpus edits. | Scored tiny `live_real_world` smoke: `cargo make smoke-graphify-docker-graph-report` records a Docker-only generated-corpus graph/report artifact and currently scores `wrong_result`; the checked-in manifest does not claim broad graph quality, rebuild strength, or production-quality graph navigation. Confidence: medium for adapter feasibility, low for production-quality graph navigation. | ELF is stronger as a memory service; graphify is now a runnable reference for derived graph reports and pre-search guidance, but not yet a stronger end-to-end memory system. | | Letta | `rw.core-archival`, `rw.operator-continuity` | Core memory blocks, archival memory, and shared/read-only memory blocks map directly to always-loaded operating context versus retrievable memory. | Build a multi-agent job where core blocks must be attached/detached/shared read-only, while archival memory is retrieved separately and audited. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for memory-semantics reference. | ELF has scoped notes but not first-class core/archival block ergonomics; Letta is the reference dimension. 
| | LangGraph | `rw.replay-regression`, `rw.resume-evidence` | Thread checkpoints, durable execution, replay, fork, and time travel define a strong model for debugging agent-state and memory-regression behavior. | Run an agent job with memory reads across checkpoints, replay/fork the thread after a stale-memory failure, and verify side-effect boundaries. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for replay workflow reference. | ELF traces are useful but do not replace full agent checkpoint replay; LangGraph is the reference for replay-regression jobs. | | Graphiti / Zep | `rw.graph-temporal`, `rw.resume-evidence` | Temporal entities, relations, fact triples, validity windows, and graph search directly target stale/contradictory factual memory. | Add fact triples with validity changes, query current and historical answers, and score invalidation/append behavior under contradiction traps. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium-high for temporal-graph dimension. | ELF graph-lite covers evidence-linked validity windows and current/historical relation context; Graphiti/Zep remains the reference for broader temporal graph workflows. | @@ -124,7 +124,7 @@ XY-882 feasibility verdicts for RAG and graph-memory gates: | LightRAG | `adapter_candidate` | Docker Compose server with explicit LLM, embedding, rerank, storage, workspace, and data-volume configuration. | Context-only query modes can return the context prepared for the LLM; core APIs can insert documents with ids and source file paths. | [XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter); no live pass claim. | | GraphRAG | `adapter_candidate` | Cost-bounded Docker Python CLI/API run over a generated tiny corpus with container-local parquet artifacts. 
| Output tables contain generated UUIDs, human-readable ids, source documents, text units, community reports, and text-unit links for graph summaries and relationships. | [XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter); no live pass claim. | | Graphiti / Zep | `adapter_candidate` | Docker-local FalkorDB or Neo4j plus Python SDK runner with provider config captured under benchmark artifacts. | Search results and fact triples expose UUIDs, fact text, and validity windows (`valid_at` / `invalid_at`) that map to memory-evolution scoring. | [XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter); no live pass claim. | -| graphify | `adapter_candidate` | Docker-only CLI/materializer using `pip install graphifyy` over a mounted corpus; host-global assistant hooks are out of scope. | `graph.json`, `GRAPH_REPORT.md`, and graph query output include edge types, confidence tags, source files, and source locations. | [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter) adds `cargo make graphify-docker-graph-report-smoke`; XY-900 promotes the tiny generated smoke to scored `live_real_world` `wrong_result` evidence while still avoiding broad quality claims. | +| graphify | `adapter_candidate` | Docker-only CLI/materializer using `pip install graphifyy` over a mounted corpus; host-global assistant hooks are out of scope. | `graph.json`, `GRAPH_REPORT.md`, and graph query output include edge types, confidence tags, source files, and source locations. | [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter) adds `cargo make smoke-graphify-docker-graph-report`; XY-900 promotes the tiny generated smoke to scored `live_real_world` `wrong_result` evidence while still avoiding broad quality claims. 
| | Letta | `research_only` | Docker server exists, but current docs require explicit embedding configuration and steer Letta Code evaluation toward non-Docker local/frontier-model exploration. | Core/archival memory and shared blocks remain useful semantics, but no contained evidence export is selected for this adapter batch. | No implementation issue. | | LangGraph | `research_only` | A Docker harness is possible, but the project is an agent-state/checkpoint framework rather than a standalone memory adapter. | Store search and checkpoints are references for replay-regression jobs, not a direct external memory output contract here. | No implementation issue. | | nanograph | `research_only` | Official positioning is one CLI / one folder / no server / no Docker. | Typed schema, query, CDC, and search ergonomics remain graph-lite DX inspiration. | No implementation issue. | diff --git a/docs/guide/testing.md b/docs/guide/testing.md index dbd539e0..480a8c61 100644 --- a/docs/guide/testing.md +++ b/docs/guide/testing.md @@ -10,9 +10,9 @@ Outputs: A consistent test-category name and the matching command or workflow. - `unit` — Tests inside `#[cfg(test)]` modules in `src/`. Run with `cargo make test`. - `integration` — Rust integration tests under `tests/*.rs`. Run with `cargo make test`. -- `integration (ignored)` — Integration tests that require external services and are marked `#[ignore]`. +- `integration (ignored)` — Integration tests that require external services and are marked `#[ignore]`. Run with `cargo make test-rust-integration`. - `acceptance` — The integration suite in `packages/elf-service/tests/acceptance.rs` and `packages/elf-service/tests/acceptance/*.rs`. These are usually `#[ignore]` and require external services. -- `E2E harness` — Deterministic harness scripts for memory retrieval/ranking. Run locally with `cargo make e2e` and in CI via `.github/workflows/e2e.yml`. +- `E2E harness` — Deterministic harness scripts for memory retrieval/ranking. 
Run locally with `cargo make test-e2e` and in CI via `.github/workflows/e2e.yml`. Note: Some integration tests require external services such as Postgres or Qdrant and are marked `#[ignore]`. When requesting those, say "integration (ignored)" so the ignored set is included. diff --git a/docs/plans/2026-02-02-project-cleanup-design.md b/docs/plans/2026-02-02-project-cleanup-design.md index 2199e4ba..4f6d6cf4 100644 --- a/docs/plans/2026-02-02-project-cleanup-design.md +++ b/docs/plans/2026-02-02-project-cleanup-design.md @@ -1,6 +1,6 @@ # Project Cleanup Architecture Design -**Goal:** Restructure each app into a library-plus-binary layout, remove `#[path]` test imports, and make `cargo make lint` pass without suppressing lints. +**Goal:** Restructure each app into a library-plus-binary layout, remove `#[path]` test imports, and make `cargo make lint-rust` pass without suppressing lints. **Scope (Option 2):** - Apply the `lib + bin` layout to `elf-api`, `elf-mcp`, and `elf-worker`. @@ -19,5 +19,5 @@ - Any remaining clippy errors will be fixed by small structural adjustments rather than `#[allow]` attributes. **Testing and Verification:** -- Run `cargo make lint` to confirm workspace linting passes. +- Run `cargo make lint-rust` to confirm workspace linting passes. - Do not change test behavior; only update import paths and shared wiring required by the new layout. diff --git a/docs/plans/2026-02-02-project-cleanup.md b/docs/plans/2026-02-02-project-cleanup.md index 536991c7..a0ef40d4 100644 --- a/docs/plans/2026-02-02-project-cleanup.md +++ b/docs/plans/2026-02-02-project-cleanup.md @@ -2,7 +2,7 @@ > **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. -**Goal:** Refactor each app into a lib+bin layout, remove `#[path]` test imports, and keep CLI/logging behavior unchanged while ensuring `cargo make lint` passes. 
+**Goal:** Refactor each app into a lib+bin layout, remove `#[path]` test imports, and keep CLI/logging behavior unchanged while ensuring `cargo make lint-rust` passes. **Architecture:** Each app exposes a small `lib.rs` with its CLI `Args` and `run` entrypoint plus existing modules. `main.rs` becomes a thin wrapper that parses CLI args and calls the library. Tests import the library modules instead of using `#[path]`. @@ -250,7 +250,7 @@ git commit -m "refactor: move elf-mcp entrypoint into lib" - Modify: None **Step 1: Run lint** -Run: `cargo make lint` +Run: `cargo make lint-rust` Expected: PASS. **Step 2: Run targeted app tests** diff --git a/docs/plans/2026-02-25-ci-services-checks-design.md b/docs/plans/2026-02-25-ci-services-checks-design.md index 359c7017..92b8765d 100644 --- a/docs/plans/2026-02-25-ci-services-checks-design.md +++ b/docs/plans/2026-02-25-ci-services-checks-design.md @@ -43,7 +43,7 @@ Update `.github/workflows/integration.yml` to run on PR and merge queue (in addi In this workflow, run the full workspace test suite including ignored tests: -- `cargo nextest run --workspace --all-targets --all-features --run-ignored all` +- `cargo make test-rust-all` Rationale: @@ -54,7 +54,7 @@ Rationale: Add a new workflow to run the lightweight, deterministic E2E harness: -- `cargo make e2e` (which runs `scripts/context-misranking-harness.sh`) +- `cargo make test-e2e` (which runs `scripts/context-misranking-harness.sh`) Key properties: @@ -73,7 +73,6 @@ Do not change `.github/workflows/nightly-harness-signals.yml` scope: it remains - `Integration Tests` runs with `--run-ignored all` and succeeds on `main`. - A new E2E workflow runs on: - `pull_request`, `merge_group`, `workflow_dispatch` -- E2E job starts Postgres + Qdrant via GitHub Actions services and successfully runs `cargo make e2e` without external secrets. +- E2E job starts Postgres + Qdrant via GitHub Actions services and successfully runs `cargo make test-e2e` without external secrets. 
- Both workflows use `paths-ignore` for docs-only changes (`docs/**`, `**/*.md`, `.gitignore`). - Local docs reflect the updated meaning of “E2E harness” vs “nightly harness signals”. - diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index cfe2f5ca..6404bc35 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -93,12 +93,12 @@ "claim": "mem0 local OSS passes preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history; OpenMemory export-helper setup emits a separate blocked artifact with DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER, and hosted Platform export remains non-goal." }, { - "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", "artifact": "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", "claim": "Graphiti/Zep temporal smoke remains blocked by provider_api_key_missing when live provider execution is explicitly enabled without credentials." }, { - "command": "cargo make graphify-docker-graph-report-smoke", + "command": "cargo make smoke-graphify-docker-graph-report", "artifact": "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md", "claim": "graphify reaches tiny Docker graph/report scoring but remains wrong_result; broad graph/RAG quality is not tested." 
}, diff --git a/docs/research/2026-06-11-temporal-history-competitor-gap-report.json b/docs/research/2026-06-11-temporal-history-competitor-gap-report.json index cb6cd9be..8bfcffd6 100644 --- a/docs/research/2026-06-11-temporal-history-competitor-gap-report.json +++ b/docs/research/2026-06-11-temporal-history-competitor-gap-report.json @@ -7,7 +7,7 @@ "role_boundary": "No ELF optimization implementation is included; this report records evidence, claim boundaries, and future optimization directions.", "commands": [ { - "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", "status": "blocked", "typed_status": "provider_api_key_missing", "runtime_seconds": 3.5, diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index 3de690bd..f74e0d45 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -237,7 +237,7 @@ ], "measured_status": "blocked", "proof": { - "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make ragflow-docker-smoke", + "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" }, "unsupported_or_blocked_status": { @@ -257,7 +257,7 @@ ], "measured_status": "blocked", "proof": { - "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make lightrag-docker-context-smoke", + "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context", "artifact": "tmp/real-world-memory/lightrag-context/summary.json" }, "unsupported_or_blocked_status": { @@ -277,7 +277,7 @@ ], "measured_status": "blocked", "proof": { - "command": 
"ELF_GRAPHRAG_SMOKE_RUN=1 cargo make graphrag-docker-smoke", + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker", "artifact": "tmp/real-world-memory/graphrag-smoke/summary.json" }, "unsupported_or_blocked_status": { @@ -297,7 +297,7 @@ ], "measured_status": "blocked", "proof": { - "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", "artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json" }, "unsupported_or_blocked_status": { @@ -417,7 +417,7 @@ ], "measured_status": "wrong_result", "proof": { - "command": "cargo make graphify-docker-graph-report-smoke", + "command": "cargo make smoke-graphify-docker-graph-report", "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" }, "unsupported_or_blocked_status": { diff --git a/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json b/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json index 612802ff..9bdae08b 100644 --- a/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json +++ b/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json @@ -1847,13 +1847,13 @@ "setup": { "status": "blocked", "evidence": "XY-900 promotes the Docker-safe tiny-corpus evidence smoke into a generated real_world_job report while the checked-in row remains smoke-only research_gate evidence.", - "command": "cargo make ragflow-docker-smoke", + "command": "cargo make smoke-ragflow-docker", "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" }, "run": { "status": "blocked", "evidence": "The live path requires explicit resource-envelope opt-in and a local self-hosted RAGFlow API key; setup failures stay typed in the generated smoke artifact.", - "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make 
ragflow-docker-smoke", + "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", "artifact": "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json" }, "result": { @@ -1965,7 +1965,7 @@ "runtime_boundary": "Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs.", "resource_expectation": "Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring.", "retry_guidance": [ - "Run cargo make ragflow-docker-smoke first to produce a typed preflight artifact.", + "Run cargo make smoke-ragflow-docker first to produce a typed preflight artifact.", "Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.", "Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids." ], @@ -1991,13 +1991,13 @@ "setup": { "status": "blocked", "evidence": "XY-886 adds a Docker-profile context-export smoke command, and XY-900 keeps its generated retrieval fixtures scored through real_world_job_benchmark. 
The checked-in row remains smoke-only research_gate evidence.", - "command": "cargo make lightrag-docker-context-smoke", + "command": "cargo make smoke-lightrag-docker-context", "artifact": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json" }, "run": { "status": "blocked", "evidence": "The default smoke records a typed setup/runtime failure if the LightRAG API is unavailable; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in Docker service profile.", - "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make lightrag-docker-context-smoke", + "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context", "artifact": "tmp/real-world-memory/lightrag-context/summary.json" }, "result": { @@ -2078,7 +2078,7 @@ }, { "kind": "command", - "ref": "cargo make lightrag-docker-context-smoke", + "ref": "cargo make smoke-lightrag-docker-context", "status": "blocked" }, { @@ -2115,11 +2115,11 @@ "evidence": "Official source-id and file-path citation reference." } ], - "setup_path": "Run cargo make lightrag-docker-context-smoke for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export.", + "setup_path": "Run cargo make smoke-lightrag-docker-context for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes.", "resource_expectation": "The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts.", "retry_guidance": [ - "Run cargo make lightrag-docker-context-smoke first; a missing 
API must remain a typed incomplete artifact, not a pass claim.", + "Run cargo make smoke-lightrag-docker-context first; a missing API must remain a typed incomplete artifact, not a pass claim.", "Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.", "Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids." ], @@ -2145,13 +2145,13 @@ "setup": { "status": "blocked", "evidence": "XY-900 promotes the Docker-safe generated-corpus GraphRAG smoke into a scored knowledge_compilation report while the checked-in row remains smoke-only research_gate evidence.", - "command": "cargo make graphrag-docker-smoke", + "command": "cargo make smoke-graphrag-docker", "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json" }, "run": { "status": "blocked", "evidence": "The default smoke records a typed blocked artifact without model calls; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration to attempt live GraphRAG index/query.", - "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make graphrag-docker-smoke", + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker", "artifact": "tmp/real-world-memory/graphrag-smoke/summary.json" }, "result": { @@ -2237,7 +2237,7 @@ }, { "kind": "command", - "ref": "cargo make graphrag-docker-smoke", + "ref": "cargo make smoke-graphrag-docker", "status": "blocked" }, { @@ -2279,11 +2279,11 @@ "evidence": "Official local-search context and graph traversal reference." 
} ], - "setup_path": "Run cargo make graphrag-docker-smoke for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt.", + "setup_path": "Run cargo make smoke-graphrag-docker for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke.", "resource_expectation": "The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries.", "retry_guidance": [ - "Run cargo make graphrag-docker-smoke first; missing provider configuration must remain a typed blocked artifact, not a pass claim.", + "Run cargo make smoke-graphrag-docker first; missing provider configuration must remain a typed blocked artifact, not a pass claim.", "Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.", "Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs." ], @@ -2309,13 +2309,13 @@ "setup": { "status": "blocked", "evidence": "XY-900 promotes the Docker-contained Graphiti/Zep temporal smoke into a scored memory_evolution report while the checked-in row remains smoke-only research_gate evidence.", - "command": "cargo make graphiti-zep-docker-temporal-smoke", + "command": "cargo make smoke-graphiti-zep-docker-temporal", "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json" }, "run": { "status": "blocked", "evidence": "The default smoke records a typed setup/runtime failure if live execution is not explicitly enabled. 
Set ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration to start Docker-local FalkorDB and run Graphiti.", - "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", "artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json" }, "result": { @@ -2396,7 +2396,7 @@ }, { "kind": "command", - "ref": "cargo make graphiti-zep-docker-temporal-smoke", + "ref": "cargo make smoke-graphiti-zep-docker-temporal", "status": "blocked" }, { @@ -2438,11 +2438,11 @@ "evidence": "Official manual fact-triple ingest contract." } ], - "setup_path": "Run cargo make graphiti-zep-docker-temporal-smoke for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", + "setup_path": "Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.", "resource_expectation": "Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring.", "retry_guidance": [ - "Run cargo make graphiti-zep-docker-temporal-smoke first to produce a typed blocked artifact.", + "Run cargo make smoke-graphiti-zep-docker-temporal first to produce a typed blocked artifact.", "Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.", "Treat missing validity 
windows or unmapped current/historical facts as wrong_result, not pass." ], @@ -2954,13 +2954,13 @@ "setup": { "status": "pass", "evidence": "XY-900 validation reached the Docker-only graph/report smoke setup inside the baseline runner without host-global assistant hooks.", - "command": "cargo make graphify-docker-graph-report-smoke", + "command": "cargo make smoke-graphify-docker-graph-report", "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" }, "run": { "status": "pass", "evidence": "The smoke installed graphify in a container-local venv, ran over a generated public corpus, and produced graph/report/query output for scoring.", - "command": "cargo make graphify-docker-graph-report-smoke", + "command": "cargo make smoke-graphify-docker-graph-report", "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" }, "result": { @@ -3041,7 +3041,7 @@ }, { "kind": "command", - "ref": "cargo make graphify-docker-graph-report-smoke", + "ref": "cargo make smoke-graphify-docker-graph-report", "status": "wrong_result" }, { @@ -3068,11 +3068,11 @@ "evidence": "Official CLI, output artifact, query, and source-location contract." 
} ], - "setup_path": "Run cargo make graphify-docker-graph-report-smoke to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks.", + "setup_path": "Run cargo make smoke-graphify-docker-graph-report to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke.", "resource_expectation": "Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior.", "retry_guidance": [ - "Run cargo make graphify-docker-graph-report-smoke first; setup/runtime failures must remain typed artifacts, not pass claims.", + "Run cargo make smoke-graphify-docker-graph-report first; setup/runtime failures must remain typed artifacts, not pass claims.", "Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.", "Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids." ], diff --git a/docs/spec/production_corpus_manifest_v1.md b/docs/spec/production_corpus_manifest_v1.md index 05bc417e..36347823 100644 --- a/docs/spec/production_corpus_manifest_v1.md +++ b/docs/spec/production_corpus_manifest_v1.md @@ -82,7 +82,7 @@ evidence ID. It must not silently fall back to the checked-in synthetic corpus. "evidence_id": "issue-xy123-resume", "category": "issue", "title": "XY-123 Resume State", - "text": "XY-123 resumes on branch y/example with command `cargo make checks`." 
+ "text": "XY-123 resumes on branch y/example with command `cargo make check`." } ], "queries": [ @@ -92,7 +92,7 @@ evidence ID. It must not silently fall back to the checked-in synthetic corpus. "query": "How do I resume XY-123?", "expected_evidence_ids": ["issue-xy123-resume"], "allowed_alternate_evidence_ids": [], - "expected_terms": ["XY-123", "cargo make checks"] + "expected_terms": ["XY-123", "cargo make check"] } ] } diff --git a/scripts/baseline-docker.sh b/scripts/baseline-docker.sh new file mode 100755 index 00000000..a6e38d82 --- /dev/null +++ b/scripts/baseline-docker.sh @@ -0,0 +1,173 @@ +#!/usr/bin/env bash +set -euo pipefail + +profile="${1:-}" +if [ -z "$profile" ]; then + echo "usage: scripts/baseline-docker.sh <profile>" >&2 + exit 2 +fi + +head="$(git rev-parse HEAD)" +if [ -n "$(git status --porcelain)" ]; then + head="$head+dirty" +fi + +run_baseline() { + docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner +} + +selected_projects_or_default() { + local selected_projects + selected_projects="$(printenv ELF_BASELINE_PROJECTS || true)" + if [ -z "$selected_projects" ]; then + selected_projects="ELF" + fi + printf '%s' "$selected_projects" +} + +case "$profile" in +live) + export ELF_BASELINE_ELF_HEAD="$head" + run_baseline + ;; +backfill) + selected_projects="$(selected_projects_or_default)" + selected_profile="$(printenv ELF_BASELINE_PROFILE || true)" + if [ -z "$selected_profile" ]; then + selected_profile="backfill" + fi + backfill_docs="$(printenv ELF_BASELINE_BACKFILL_DOCS || true)" + if [ -z "$backfill_docs" ]; then + backfill_docs="2000" + fi + elf_timeout="$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)" + if [ -z "$elf_timeout" ]; then + elf_timeout="3600" + fi + max_elf_seconds="$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)" + if [ -z "$max_elf_seconds" ]; then + max_elf_seconds="3600" + fi + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS="$selected_projects" + export
ELF_BASELINE_PROFILE="$selected_profile" + export ELF_BASELINE_BACKFILL_DOCS="$backfill_docs" + export ELF_BASELINE_ELF_TIMEOUT_SECONDS="$elf_timeout" + export ELF_BASELINE_MAX_ELF_SECONDS="$max_elf_seconds" + run_baseline + ;; +openmemory-ui-export-readback) + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS=mem0 + run_baseline + ;; +production-synthetic) + selected_projects="$(selected_projects_or_default)" + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS="$selected_projects" + export ELF_BASELINE_PROFILE=production-synthetic + run_baseline + ;; +production-private) + manifest="$(printenv ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST || true)" + if [ -z "$manifest" ]; then + echo "ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is required for baseline-production-private" >&2 + exit 1 + fi + selected_projects="$(selected_projects_or_default)" + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS="$selected_projects" + export ELF_BASELINE_PROFILE=production-private + run_baseline + ;; +production-private-addendum) + manifest="$(printenv ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST || true)" + if [ -z "$manifest" ]; then + echo "ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is required for baseline-production-private-addendum" >&2 + exit 1 + fi + selected_projects="$(selected_projects_or_default)" + addendum="$(printenv ELF_BASELINE_PRIVATE_ADDENDUM || true)" + if [ -z "$addendum" ]; then + addendum="tmp/live-baseline/private-production-addendum.md" + fi + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS="$selected_projects" + export ELF_BASELINE_PROFILE=production-private + run_baseline + ELF_BASELINE_MARKDOWN_REPORT="$addendum" bash scripts/live-baseline-report-to-md.sh + echo "Private production addendum: $addendum" + ;; +backfill-10k) + backfill_docs="$(printenv ELF_BASELINE_BACKFILL_DOCS || true)" + if [ -z "$backfill_docs" ]; then + backfill_docs="10000" + fi + elf_timeout="$(printenv 
ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)" + if [ -z "$elf_timeout" ]; then + elf_timeout="14400" + fi + max_elf_seconds="$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)" + if [ -z "$max_elf_seconds" ]; then + max_elf_seconds="$elf_timeout" + fi + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS=ELF + export ELF_BASELINE_PROFILE=backfill + export ELF_BASELINE_BACKFILL_DOCS="$backfill_docs" + export ELF_BASELINE_ELF_TIMEOUT_SECONDS="$elf_timeout" + export ELF_BASELINE_MAX_ELF_SECONDS="$max_elf_seconds" + run_baseline + ;; +backfill-100k) + enabled="$(printenv ELF_BASELINE_ENABLE_EXPENSIVE || true)" + if [ "$enabled" != "1" ]; then + echo "ELF_BASELINE_ENABLE_EXPENSIVE=1 is required for baseline-backfill-100k-docker" >&2 + exit 1 + fi + backfill_docs="$(printenv ELF_BASELINE_BACKFILL_DOCS || true)" + if [ -z "$backfill_docs" ]; then + backfill_docs="100000" + fi + elf_timeout="$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)" + if [ -z "$elf_timeout" ]; then + elf_timeout="86400" + fi + max_elf_seconds="$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)" + if [ -z "$max_elf_seconds" ]; then + max_elf_seconds="$elf_timeout" + fi + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS=ELF + export ELF_BASELINE_PROFILE=backfill + export ELF_BASELINE_BACKFILL_DOCS="$backfill_docs" + export ELF_BASELINE_ELF_TIMEOUT_SECONDS="$elf_timeout" + export ELF_BASELINE_MAX_ELF_SECONDS="$max_elf_seconds" + run_baseline + ;; +soak) + soak_seconds="$(printenv ELF_BASELINE_SOAK_SECONDS || true)" + if [ -z "$soak_seconds" ]; then + soak_seconds="3600" + fi + elf_timeout="$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)" + if [ -z "$elf_timeout" ]; then + elf_timeout="$((soak_seconds + 1800))" + fi + max_elf_seconds="$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)" + if [ -z "$max_elf_seconds" ]; then + max_elf_seconds="$elf_timeout" + fi + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS=ELF + export ELF_BASELINE_PROFILE=stress 
+ export ELF_BASELINE_SOAK_SECONDS="$soak_seconds" + export ELF_BASELINE_ELF_TIMEOUT_SECONDS="$elf_timeout" + export ELF_BASELINE_MAX_ELF_SECONDS="$max_elf_seconds" + run_baseline + ;; +*) + echo "unknown baseline profile: $profile" >&2 + exit 2 + ;; +esac diff --git a/scripts/check-docs.py b/scripts/check-docs.py new file mode 100755 index 00000000..9f64d34e --- /dev/null +++ b/scripts/check-docs.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import re +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +TASK_RE = re.compile(r"^\[tasks\.([^\]]+)\]", re.MULTILINE) +CARGO_MAKE_RE = re.compile(r"\bcargo\s+make\s+([A-Za-z0-9][A-Za-z0-9_:-]*)") +MARKDOWN_LINK_RE = re.compile(r"!?\[[^\]\n]*\]\(([^)\n]+)\)") + + +def read_text(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +def cargo_make_tasks() -> set[str]: + return set(TASK_RE.findall(read_text(ROOT / "Makefile.toml"))) + + +def iter_reference_files() -> list[Path]: + roots = [ + ROOT / "README.md", + ROOT / "AGENTS.md", + ROOT / "docs", + ROOT / ".github" / "workflows", + ] + files: list[Path] = [] + for root in roots: + if root.is_file(): + files.append(root) + continue + if root.is_dir(): + files.extend( + path + for path in root.rglob("*") + if path.suffix in {".md", ".yml", ".yaml"} + ) + return sorted(files) + + +def iter_markdown_files() -> list[Path]: + return [ + path + for path in iter_reference_files() + if path.suffix == ".md" + ] + + +def normalize_link_target(raw_target: str) -> str: + target = raw_target.strip() + if target.startswith("<") and ">" in target: + target = target[1:target.index(">")] + elif " " in target: + target = target.split(maxsplit=1)[0] + return target + + +def is_external_or_anchor(target: str) -> bool: + return ( + not target + or target.startswith("#") + or target.startswith("/") + or bool(re.match(r"^[A-Za-z][A-Za-z0-9+.-]*:", target)) + ) + + +def check_cargo_make_references(tasks: 
set[str]) -> list[str]: + errors: list[str] = [] + for path in iter_reference_files(): + for line_number, line in enumerate(read_text(path).splitlines(), start=1): + for match in CARGO_MAKE_RE.finditer(line): + task = match.group(1) + if task not in tasks: + rel_path = path.relative_to(ROOT) + errors.append(f"{rel_path}:{line_number}: unknown cargo make task `{task}`") + return errors + + +def check_markdown_links() -> list[str]: + errors: list[str] = [] + for path in iter_markdown_files(): + for line_number, line in enumerate(read_text(path).splitlines(), start=1): + for match in MARKDOWN_LINK_RE.finditer(line): + target = normalize_link_target(match.group(1)) + if is_external_or_anchor(target): + continue + path_part = target.split("#", maxsplit=1)[0] + if not path_part: + continue + candidate = ( + ROOT / path_part.removeprefix("/") + if path_part.startswith("/") + else path.parent / path_part + ) + if not candidate.exists(): + rel_path = path.relative_to(ROOT) + errors.append(f"{rel_path}:{line_number}: broken local link `{target}`") + return errors + + +def main() -> int: + errors = check_cargo_make_references(cargo_make_tasks()) + errors.extend(check_markdown_links()) + if errors: + for error in errors: + print(error, file=sys.stderr) + return 1 + print("check-docs passed") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/graphify-docker-graph-report-smoke.py b/scripts/graphify-docker-graph-report-smoke.py index 0035a1b9..c5ac0cfc 100755 --- a/scripts/graphify-docker-graph-report-smoke.py +++ b/scripts/graphify-docker-graph-report-smoke.py @@ -1209,13 +1209,13 @@ def write_manifest(status: StatusState) -> dict[str, Any]: "setup": { "status": status.setup, "evidence": "The smoke installs graphify in a container-local Python venv and runs with isolated assistant config paths.", - "command": "cargo make graphify-docker-graph-report-smoke", + "command": "cargo make smoke-graphify-docker-graph-report", "artifact": 
rel(OUT), }, "run": { "status": status.run, "evidence": "The live path builds graphify graph/report artifacts from a generated public corpus and runs graphify query over graph.json.", - "command": "cargo make graphify-docker-graph-report-smoke", + "command": "cargo make smoke-graphify-docker-graph-report", "artifact": rel(OUT), }, "result": { @@ -1298,11 +1298,11 @@ def write_manifest(status: StatusState) -> dict[str, Any]: "evidence": "Official package referenced by the graphify README.", }, ], - "setup_path": "Run cargo make graphify-docker-graph-report-smoke to install graphify in a container-local venv and build graph/report artifacts over generated public files.", + "setup_path": "Run cargo make smoke-graphify-docker-graph-report to install graphify in a container-local venv and build graph/report artifacts over generated public files.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner, isolated HOME/config paths, generated corpus, and artifacts under tmp/real-world-memory/graphify-smoke.", "resource_expectation": f"graphify package {GRAPHIFY_REF}, generated_files=4, timeout_seconds={TIMEOUT_SECONDS}, query_budget={QUERY_BUDGET}.", "retry_guidance": [ - "Rerun cargo make graphify-docker-graph-report-smoke after dependency or runtime fixes.", + "Rerun cargo make smoke-graphify-docker-graph-report after dependency or runtime fixes.", "Do not use graphify install hooks, host-global Codex/Claude/Gemini config, or private corpora as proof.", "Score only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids.", ], @@ -1404,7 +1404,7 @@ def main() -> int: status.result = "incomplete" status.overall = "incomplete" status.failure_class = "not_running_in_docker" - status.failure_reason = "graphify smoke must run inside Docker; use cargo make graphify-docker-graph-report-smoke." + status.failure_reason = "graphify smoke must run inside Docker; use cargo make smoke-graphify-docker-graph-report." 
elif not command_available("python3"): status.setup = "incomplete" status.result = "incomplete" diff --git a/scripts/graphiti-zep-docker-temporal-smoke.py b/scripts/graphiti-zep-docker-temporal-smoke.py index 5ba1cc34..065bb78c 100644 --- a/scripts/graphiti-zep-docker-temporal-smoke.py +++ b/scripts/graphiti-zep-docker-temporal-smoke.py @@ -1003,13 +1003,13 @@ def write_manifest(status: StatusState) -> dict[str, Any]: "setup": { "status": status.setup, "evidence": "The smoke runs inside the baseline Docker runner and uses Docker-local FalkorDB plus a container-local Python venv.", - "command": "cargo make graphiti-zep-docker-temporal-smoke", + "command": "cargo make smoke-graphiti-zep-docker-temporal", "artifact": rel(OUT), }, "run": { "status": status.run, "evidence": "The live path adds generated temporal fact triples and searches Graphiti/Zep for UUID, fact, valid_at, invalid_at, and source node evidence.", - "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", "artifact": rel(OUT), }, "result": { @@ -1101,7 +1101,7 @@ def write_manifest(status: StatusState) -> dict[str, Any]: "evidence": "Official manual fact-triple ingest contract.", }, ], - "setup_path": "Run cargo make graphiti-zep-docker-temporal-smoke for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", + "setup_path": "Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under 
tmp/real-world-memory/graphiti-zep-smoke.", "resource_expectation": f"Graphiti package {GRAPHITI_REF}, fact_count=3, timeout_seconds={TIMEOUT_SECONDS}, FalkorDB host={FALKORDB_HOST}:{FALKORDB_PORT}.", "retry_guidance": [ @@ -1185,7 +1185,7 @@ def main() -> int: status.result = "incomplete" status.overall = "incomplete" status.failure_class = "not_running_in_docker" - status.failure_reason = "Graphiti/Zep smoke must run inside Docker; use cargo make graphiti-zep-docker-temporal-smoke." + status.failure_reason = "Graphiti/Zep smoke must run inside Docker; use cargo make smoke-graphiti-zep-docker-temporal." mapping["status"] = status.result mapping["reason"] = status.failure_reason elif not command_available("python3"): diff --git a/scripts/graphrag-docker-smoke.py b/scripts/graphrag-docker-smoke.py index 02be1560..c6b01d45 100755 --- a/scripts/graphrag-docker-smoke.py +++ b/scripts/graphrag-docker-smoke.py @@ -1186,13 +1186,13 @@ def write_manifest(status: StatusState) -> dict[str, Any]: "setup": { "status": status.setup, "evidence": "The smoke runs inside the baseline Docker runner and installs or invokes GraphRAG only in the container-local work directory.", - "command": "cargo make graphrag-docker-smoke", + "command": "cargo make smoke-graphrag-docker", "artifact": rel(OUT), }, "run": { "status": status.run, "evidence": "The live path generates a tiny public corpus, initializes GraphRAG, indexes with bounded inputs, and runs local search when provider config is supplied.", - "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make graphrag-docker-smoke", + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker", "artifact": rel(OUT), }, "result": { @@ -1286,7 +1286,7 @@ def write_manifest(status: StatusState) -> dict[str, Any]: "evidence": "Official local-search context and graph traversal reference.", }, ], - "setup_path": "Run cargo make graphrag-docker-smoke for a typed artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live 
index/query attempt.", + "setup_path": "Run cargo make smoke-graphrag-docker for a typed artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live index/query attempt.", "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke.", "resource_expectation": f"GraphRAG package {GRAPH_RAG_REF}, max_docs={MAX_DOCS}, max_input_chars={MAX_INPUT_CHARS}, timeout_seconds={TIMEOUT_SECONDS}, index_method={INDEX_METHOD}.", "retry_guidance": [ @@ -1378,7 +1378,7 @@ def main() -> int: status.result = "incomplete" status.overall = "incomplete" status.failure_class = "not_running_in_docker" - status.failure_reason = "GraphRAG smoke must run inside Docker; use cargo make graphrag-docker-smoke." + status.failure_reason = "GraphRAG smoke must run inside Docker; use cargo make smoke-graphrag-docker." elif not command_available("python3"): status.setup = "incomplete" status.result = "incomplete" diff --git a/scripts/lightrag-docker-context-smoke.sh b/scripts/lightrag-docker-context-smoke.sh index 6e4d302e..a643d286 100644 --- a/scripts/lightrag-docker-context-smoke.sh +++ b/scripts/lightrag-docker-context-smoke.sh @@ -14,7 +14,7 @@ INDEX_ATTEMPTS="${ELF_LIGHTRAG_INDEX_ATTEMPTS:-60}" INDEX_INTERVAL_SECONDS="${ELF_LIGHTRAG_INDEX_INTERVAL_SECONDS:-2}" if [[ ! -f "/.dockerenv" && "${ELF_LIGHTRAG_CONTEXT_ALLOW_HOST:-0}" != "1" ]]; then - echo "Refusing to run LightRAG context smoke outside Docker. Use cargo make lightrag-docker-context-smoke." >&2 + echo "Refusing to run LightRAG context smoke outside Docker. Use cargo make smoke-lightrag-docker-context." 
>&2 exit 1 fi diff --git a/scripts/parity-docker-gate.sh b/scripts/parity-docker-gate.sh index 99cd5aaf..62fa0ec1 100755 --- a/scripts/parity-docker-gate.sh +++ b/scripts/parity-docker-gate.sh @@ -151,7 +151,7 @@ write_report() { }, cleanup: { status: "documented", - command: "cargo make parity-docker-clean" + command: "cargo make clean-parity-docker" } }, thresholds: { diff --git a/scripts/ragflow-docker-evidence-smoke.sh b/scripts/ragflow-docker-evidence-smoke.sh index 95cd50f5..17dd572f 100755 --- a/scripts/ragflow-docker-evidence-smoke.sh +++ b/scripts/ragflow-docker-evidence-smoke.sh @@ -687,8 +687,8 @@ write_artifact() { }, setup: { status: $setup_status, - command: "cargo make ragflow-docker-smoke", - live_command: "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make ragflow-docker-smoke", + command: "cargo make smoke-ragflow-docker", + live_command: "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", started: ($started == "true"), startup_time_ms: (if $startup_time_ms == "" then null else ($startup_time_ms | tonumber) end), vm_max_map_count: { @@ -847,13 +847,13 @@ write_manifest() { setup: { status: $setup_status, evidence: "Official RAGFlow Docker Compose boundary and resource envelope were evaluated for the tiny evidence smoke.", - command: "cargo make ragflow-docker-smoke", + command: "cargo make smoke-ragflow-docker", artifact: $out_rel }, run: { status: $run_status, evidence: "The smoke attempts dataset creation, empty-document corpus ingest, chunk insert, retrieval query, and reference chunk extraction.", - command: "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make ragflow-docker-smoke", + command: "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", artifact: $out_rel }, result: { diff --git a/scripts/real-world-docker.sh b/scripts/real-world-docker.sh new file mode 100755 index 
00000000..a6413839
--- /dev/null
+++ b/scripts/real-world-docker.sh
@@ -0,0 +1,129 @@
+#!/usr/bin/env bash
+# Run one real-world Docker profile via the `baseline-runner` compose service.
+#
+# Usage: scripts/real-world-docker.sh <profile>
+# Profiles: job-operator-ux-live-adapters, memory-live-consolidation,
+# memory-live-adapters.
+#
+# Every `-e NAME` below has no value, so docker compose forwards NAME from the
+# caller's environment into the container.
+set -euo pipefail
+
+profile="${1:-}"
+if [ -z "$profile" ]; then
+  echo "usage: scripts/real-world-docker.sh <profile>" >&2
+  exit 2
+fi
+
+case "$profile" in
+job-operator-ux-live-adapters)
+  docker compose -f docker-compose.baseline.yml run --build --rm \
+    -e ELF_OPERATOR_DEBUG_LIVE_REPORT_DIR \
+    -e ELF_OPERATOR_DEBUG_LIVE_FIXTURES \
+    -e ELF_OPERATOR_DEBUG_LIVE_WORK_DIR \
+    -e ELF_OPERATOR_DEBUG_QMD_DIR \
+    baseline-runner bash scripts/real-world-operator-debug-live-adapters.sh
+  ;;
+memory-live-consolidation)
+  docker compose -f docker-compose.baseline.yml run --build --rm \
+    -e ELF_CONSOLIDATION_LIVE_REPORT_DIR \
+    -e ELF_CONSOLIDATION_LIVE_FIXTURES \
+    baseline-runner bash scripts/real-world-consolidation-live-adapter.sh
+  ;;
+memory-live-adapters)
+  # Sidecar services are started only when the caller sets the flag to 1.
+  lightrag_start="$(printenv ELF_LIGHTRAG_CONTEXT_START || true)"
+  graphiti_start="$(printenv ELF_GRAPHITI_ZEP_SMOKE_START || true)"
+  status=0
+  if [ "$lightrag_start" = "1" ]; then
+    docker compose -f docker-compose.baseline.yml --profile lightrag up -d lightrag
+  fi
+  if [ "$graphiti_start" = "1" ]; then
+    docker compose -f docker-compose.baseline.yml --profile graphiti-zep up -d graphiti-falkordb
+  fi
+  # `|| status=$?` keeps `set -e` from aborting on a failed run, so the
+  # sidecars started above are still stopped before exiting.
+  docker compose -f docker-compose.baseline.yml run --build --rm \
+    -e ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW \
+    -e ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG \
+    -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG \
+    -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHITI_ZEP \
+    -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHIFY \
+    -e ELF_RAGFLOW_SMOKE_START \
+    -e ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE \
+    -e ELF_RAGFLOW_SMOKE_ALLOW_ARM \
+    -e ELF_RAGFLOW_SMOKE_PULL_IMAGE \
+    -e ELF_RAGFLOW_SMOKE_CLEANUP \
+    -e ELF_RAGFLOW_SMOKE_DEVICE \
+    -e ELF_RAGFLOW_API_PORT \
+    -e ELF_RAGFLOW_API_BASE \
+    -e ELF_RAGFLOW_API_KEY \
+    -e RAGFLOW_API_KEY \
+    -e ELF_RAGFLOW_SMOKE_STARTUP_ATTEMPTS \
+    -e ELF_RAGFLOW_SMOKE_STARTUP_INTERVAL_SECONDS \
+    -e ELF_RAGFLOW_SMOKE_COMPOSE_TIMEOUT_SECONDS \
+    -e ELF_RAGFLOW_REPO_URL \
+    -e ELF_RAGFLOW_REF \
+    -e ELF_RAGFLOW_IMAGE \
+    -e ELF_RAGFLOW_COMPOSE_PROJECT \
+    -e ELF_LIGHTRAG_CONTEXT_START \
+    -e ELF_LIGHTRAG_API_BASE \
+    -e ELF_LIGHTRAG_ADAPTER_ID \
+    -e ELF_LIGHTRAG_ADAPTER_NAME \
+    -e ELF_LIGHTRAG_STARTUP_ATTEMPTS \
+    -e ELF_LIGHTRAG_STARTUP_INTERVAL_SECONDS \
+    -e ELF_LIGHTRAG_INDEX_ATTEMPTS \
+    -e ELF_LIGHTRAG_INDEX_INTERVAL_SECONDS \
+    -e ELF_GRAPHRAG_SMOKE_RUN \
+    -e ELF_GRAPHRAG_SMOKE_WORK_DIR \
+    -e ELF_GRAPHRAG_SMOKE_INSTALL \
+    -e ELF_GRAPHRAG_VERSION \
+    -e ELF_GRAPHRAG_PACKAGE \
+    -e ELF_GRAPHRAG_REF \
+    -e ELF_GRAPHRAG_CHAT_MODEL \
+    -e ELF_GRAPHRAG_EMBEDDING_MODEL \
+    -e ELF_GRAPHRAG_API_BASE \
+    -e ELF_GRAPHRAG_API_KEY \
+    -e ELF_GRAPHRAG_INDEX_METHOD \
+    -e ELF_GRAPHRAG_QUERY_METHOD \
+    -e ELF_GRAPHRAG_TIMEOUT_SECONDS \
+    -e ELF_GRAPHRAG_MAX_DOCS \
+    -e ELF_GRAPHRAG_MAX_INPUT_CHARS \
+    -e ELF_GRAPHITI_ZEP_SMOKE_START \
+    -e ELF_GRAPHITI_ZEP_SMOKE_RUN \
+    -e ELF_GRAPHITI_ZEP_SMOKE_WORK_DIR \
+    -e ELF_GRAPHITI_ZEP_SMOKE_INSTALL \
+    -e ELF_GRAPHITI_ZEP_VERSION \
+    -e ELF_GRAPHITI_ZEP_PACKAGE \
+    -e ELF_GRAPHITI_ZEP_REF \
+    -e ELF_GRAPHITI_ZEP_API_BASE \
+    -e ELF_GRAPHITI_ZEP_API_KEY \
+    -e ELF_GRAPHITI_ZEP_LLM_MODEL \
+    -e ELF_GRAPHITI_ZEP_EMBEDDING_MODEL \
+    -e ELF_GRAPHITI_ZEP_FALKORDB_HOST \
+    -e ELF_GRAPHITI_ZEP_FALKORDB_PORT \
+    -e ELF_GRAPHITI_ZEP_FALKORDB_DATABASE \
+    -e ELF_GRAPHITI_ZEP_TIMEOUT_SECONDS \
+    -e ELF_GRAPHITI_ZEP_STARTUP_ATTEMPTS \
+    -e ELF_GRAPHITI_ZEP_STARTUP_INTERVAL_SECONDS \
+    -e ELF_GRAPHIFY_SMOKE_RUN \
+    -e ELF_GRAPHIFY_SMOKE_WORK_DIR \
+    -e ELF_GRAPHIFY_SMOKE_INSTALL \
+    -e ELF_GRAPHIFY_PACKAGE \
+    -e ELF_GRAPHIFY_REF \
+    -e ELF_GRAPHIFY_TIMEOUT_SECONDS \
+    -e ELF_GRAPHIFY_QUERY_BUDGET \
+    baseline-runner bash scripts/real-world-live-adapters.sh || status=$?
+  if [ "$lightrag_start" = "1" ]; then
+    docker compose -f docker-compose.baseline.yml --profile lightrag stop lightrag lightrag-mock-provider >/dev/null 2>&1 || true
+  fi
+  if [ "$graphiti_start" = "1" ]; then
+    docker compose -f docker-compose.baseline.yml --profile graphiti-zep stop graphiti-falkordb >/dev/null 2>&1 || true
+  fi
+  exit "$status"
+  ;;
+*)
+  echo "unknown real-world Docker profile: $profile" >&2
+  exit 2
+  ;;
+esac
diff --git a/scripts/smoke-docker.sh b/scripts/smoke-docker.sh
new file mode 100755
index 00000000..6aa816a8
--- /dev/null
+++ b/scripts/smoke-docker.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+smoke="${1:-}"
+if [ -z "$smoke" ]; then
+  echo "usage: scripts/smoke-docker.sh <smoke>" >&2
+  exit 2
+fi
+
+case "$smoke" in
+graphify-docker-graph-report)
+  docker compose -f docker-compose.baseline.yml run --build --rm \
+    -e ELF_GRAPHIFY_SMOKE_RUN \
+    -e ELF_GRAPHIFY_SMOKE_REPORT_DIR \
+    -e ELF_GRAPHIFY_SMOKE_WORK_DIR \
+    -e ELF_GRAPHIFY_SMOKE_INSTALL \
+    -e ELF_GRAPHIFY_PACKAGE \
+    -e ELF_GRAPHIFY_REF \
+    -e ELF_GRAPHIFY_TIMEOUT_SECONDS \
+    -e ELF_GRAPHIFY_QUERY_BUDGET \
+    baseline-runner python3 scripts/graphify-docker-graph-report-smoke.py
+  ;;
+graphiti-zep-docker-temporal)
+  start="$(printenv ELF_GRAPHITI_ZEP_SMOKE_START || true)"
+  status=0
+  if [ "$start" = "1" ]; then
+    docker compose -f docker-compose.baseline.yml --profile graphiti-zep up -d graphiti-falkordb
+  fi
+  docker compose -f docker-compose.baseline.yml run --build --rm \
+    -e ELF_GRAPHITI_ZEP_SMOKE_RUN \
+    -e ELF_GRAPHITI_ZEP_SMOKE_REPORT_DIR \
+    -e ELF_GRAPHITI_ZEP_SMOKE_WORK_DIR \
+    -e ELF_GRAPHITI_ZEP_SMOKE_INSTALL \
+    -e ELF_GRAPHITI_ZEP_VERSION \
+    -e ELF_GRAPHITI_ZEP_PACKAGE \
+    -e ELF_GRAPHITI_ZEP_REF \
+    -e ELF_GRAPHITI_ZEP_API_BASE \
+    -e ELF_GRAPHITI_ZEP_API_KEY \
+    -e ELF_GRAPHITI_ZEP_LLM_MODEL \
+    -e ELF_GRAPHITI_ZEP_EMBEDDING_MODEL \
+    -e ELF_GRAPHITI_ZEP_FALKORDB_HOST \
+    -e ELF_GRAPHITI_ZEP_FALKORDB_PORT \
+    -e ELF_GRAPHITI_ZEP_FALKORDB_DATABASE \
+    -e ELF_GRAPHITI_ZEP_TIMEOUT_SECONDS \
+    -e ELF_GRAPHITI_ZEP_STARTUP_ATTEMPTS \
+    -e ELF_GRAPHITI_ZEP_STARTUP_INTERVAL_SECONDS \
+    baseline-runner python3 scripts/graphiti-zep-docker-temporal-smoke.py || status=$?
+  if [ "$start" = "1" ]; then
+    docker compose -f docker-compose.baseline.yml --profile graphiti-zep stop graphiti-falkordb >/dev/null 2>&1 || true
+  fi
+  exit "$status"
+  ;;
+graphrag-docker)
+  docker compose -f docker-compose.baseline.yml run --build --rm \
+    -e ELF_GRAPHRAG_SMOKE_RUN \
+    -e ELF_GRAPHRAG_SMOKE_REPORT_DIR \
+    -e ELF_GRAPHRAG_SMOKE_WORK_DIR \
+    -e ELF_GRAPHRAG_SMOKE_INSTALL \
+    -e ELF_GRAPHRAG_VERSION \
+    -e ELF_GRAPHRAG_PACKAGE \
+    -e ELF_GRAPHRAG_REF \
+    -e ELF_GRAPHRAG_CHAT_MODEL \
+    -e ELF_GRAPHRAG_EMBEDDING_MODEL \
+    -e ELF_GRAPHRAG_API_BASE \
+    -e ELF_GRAPHRAG_API_KEY \
+    -e ELF_GRAPHRAG_INDEX_METHOD \
+    -e ELF_GRAPHRAG_QUERY_METHOD \
+    -e ELF_GRAPHRAG_TIMEOUT_SECONDS \
+    -e ELF_GRAPHRAG_MAX_DOCS \
+    -e ELF_GRAPHRAG_MAX_INPUT_CHARS \
+    baseline-runner python3 scripts/graphrag-docker-smoke.py
+  ;;
+lightrag-docker-context) # NOTE(review): no ELF_LIGHTRAG_* variables are forwarded with -e for this smoke; confirm the compose file provides them.
+  start="$(printenv ELF_LIGHTRAG_CONTEXT_START || true)"
+  status=0
+  if [ "$start" = "1" ]; then
+    docker compose -f docker-compose.baseline.yml --profile lightrag up -d lightrag
+  fi
+  docker compose -f docker-compose.baseline.yml run --build --rm \
+    baseline-runner bash scripts/lightrag-docker-context-smoke.sh || status=$?
+  if [ "$start" = "1" ]; then
+    docker compose -f docker-compose.baseline.yml --profile lightrag stop lightrag lightrag-mock-provider >/dev/null 2>&1 || true
+  fi
+  exit "$status"
+  ;;
+*)
+  echo "unknown smoke: $smoke" >&2
+  exit 2
+  ;;
+esac
diff --git a/scripts/trace-gate.sh b/scripts/trace-gate.sh
new file mode 100755
index 00000000..5cbdd52e
--- /dev/null
+++ b/scripts/trace-gate.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Load the trace-gate schema and fixture into Postgres, then run the
+# trace_regression_gate binary against them.
+#
+# Environment:
+#   TRACE_GATE_PG_DSN       Postgres DSN; falls back to PG_DSN, then a local default.
+#   TRACE_GATE_VECTOR_DIM   Value substituted for <VECTOR_DIM> in the schema (default 4).
+#   TRACE_GATE_REPORT_PATH  Report output path (default tmp/trace_gate.report.json).
+#
+# Paths (sql/, tmp/, .github/fixtures/) are relative to the current directory.
+set -euo pipefail
+
+DSN="${TRACE_GATE_PG_DSN:-${PG_DSN:-postgres://postgres:postgres@127.0.0.1:5432/elf}}"
+VECTOR_DIM="${TRACE_GATE_VECTOR_DIM:-4}"
+SCHEMA_PATH="tmp/trace_gate.schema.sql"
+REPORT_PATH="${TRACE_GATE_REPORT_PATH:-tmp/trace_gate.report.json}"
+
+mkdir -p tmp
+
+# Expand psql `\ir` includes from sql/init.sql inline and fill in the vector
+# dimension, writing one self-contained schema file.
+TRACE_GATE_VECTOR_DIM="${VECTOR_DIM}" python3 - <<'PY' > "${SCHEMA_PATH}"
+import os
+from pathlib import Path
+
+vector_dim = int(os.environ["TRACE_GATE_VECTOR_DIM"])
+root = Path(".")
+sql_dir = root / "sql"
+
+out = []
+for raw_line in (sql_dir / "init.sql").read_text(encoding="utf-8").splitlines():
+    line = raw_line.strip()
+    if line.startswith(r"\ir "):
+        rel = line[len(r"\ir ") :].strip()
+        out.append((sql_dir / rel).read_text(encoding="utf-8"))
+    else:
+        out.append(raw_line)
+
+expanded = "\n".join(out) + "\n"
+print(expanded.replace("<VECTOR_DIM>", str(vector_dim)), end="")
+PY
+
+psql "${DSN}" -v ON_ERROR_STOP=1 -f "${SCHEMA_PATH}"
+psql "${DSN}" -v ON_ERROR_STOP=1 -f .github/fixtures/trace_gate/fixture.sql
+cargo run -p elf-eval --bin trace_regression_gate -- \
+  --config .github/fixtures/trace_gate/config.toml \
+  --gate .github/fixtures/trace_gate/gate.json \
+  --out "${REPORT_PATH}"