From 1c1a5903b4b918e150306ec2c5428adcff82ce44 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 10 Jun 2026 11:10:54 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Encode production-ops real-world memory fixtures","authority":"XY-862"} --- Makefile.toml | 52 ++++ .../backup_restore_cold_start_readback.json | 232 ++++++++++++++++++ ...d_start_missing_dependency_incomplete.json | 187 ++++++++++++++ .../credential_boundary_provider_blocked.json | 199 +++++++++++++++ .../interrupted_import_resume_checkpoint.json | 204 +++++++++++++++ .../private_manifest_absence_blocked.json | 198 +++++++++++++++ .../resource_envelope_budget.json | 194 +++++++++++++++ .../tests/real_world_job_benchmark.rs | 82 ++++++- docs/guide/benchmarking/index.md | 9 +- .../benchmarking/live_baseline_benchmark.md | 13 + .../real_world_agent_memory_benchmark.md | 49 +++- 11 files changed, 1399 insertions(+), 20 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/production_ops/backup_restore_cold_start_readback.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/production_ops/cold_start_missing_dependency_incomplete.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/production_ops/credential_boundary_provider_blocked.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/production_ops/interrupted_import_resume_checkpoint.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/production_ops/private_manifest_absence_blocked.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/production_ops/resource_envelope_budget.json diff --git a/Makefile.toml b/Makefile.toml index 9291ad23..2945dc1c 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -415,6 +415,9 @@ args = [ # | real-world-memory-retrieval | composite | | # | real-world-memory-retrieval-json | command | | # | real-world-memory-retrieval-report | command | | +# | real-world-memory-production-ops | composite | | +# | 
real-world-memory-production-ops-json | command | | +# | real-world-memory-production-ops-report | command | | [tasks.real-world-job-smoke] workspace = false @@ -704,6 +707,55 @@ args = [ "tmp/real-world-memory/retrieval-report.md", ] +[tasks.real-world-memory-production-ops] +workspace = false +dependencies = [ + "real-world-memory-production-ops-report", +] + +[tasks.real-world-memory-production-ops-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/production_ops", + "--run-id", + "real-world-memory-production-ops", + "--adapter-id", + "fixture_production_ops", + "--adapter-name", + "ELF production-ops fixture", + "--out", + "tmp/real-world-memory/production-ops-report.json", +] + +[tasks.real-world-memory-production-ops-report] +workspace = false +dependencies = [ + "real-world-memory-production-ops-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/production-ops-report.json", + "--out", + "tmp/real-world-memory/production-ops-report.md", +] + [tasks.real-world-memory-consolidation] workspace = false dependencies = [ diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/backup_restore_cold_start_readback.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/backup_restore_cold_start_readback.json new file mode 100644 index 00000000..687419fe --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/backup_restore_cold_start_readback.json @@ -0,0 +1,232 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-restore-cold-start-001", + "suite": "production_ops", + "title": "Read back restored memory after Docker cold start and Qdrant rebuild", + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": 
"synthetic", + "items": [ + { + "evidence_id": "restore-search-before", + "kind": "trace", + "text": "Before restore, search returned one result for key single_user_restore_probe with trace 535e49be-250f-483c-8845-b4116e591dac.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "backup_restore_cold_start_readback", + "evidence_id": "restore-search-before" + }, + "locator": { + "quote": "search returned one result for key single_user_restore_probe" + } + }, + "created_at": "2026-06-09T10:00:00Z" + }, + { + "evidence_id": "restore-qdrant-rebuild", + "kind": "trace", + "text": "After restoring Postgres backup, Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "backup_restore_cold_start_readback", + "evidence_id": "restore-qdrant-rebuild" + }, + "locator": { + "quote": "rebuilt_count=1, missing_vector_count=0, error_count=0" + } + }, + "created_at": "2026-06-09T10:03:00Z" + }, + { + "evidence_id": "restore-search-after", + "kind": "trace", + "text": "After cold start and rebuild, search returned one result for key single_user_restore_probe with trace e995263d-8f0-4472-9a32-354d5cceed33.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "backup_restore_cold_start_readback", + "evidence_id": "restore-search-after" + }, + "locator": { + "quote": "After cold start and rebuild, search returned one result" + } + }, + "created_at": "2026-06-09T10:05:00Z" + }, + { + "evidence_id": "qdrant-authority-decoy", + "kind": "adapter_state", + "text": "Decoy: a Qdrant backup alone is the source of truth, so Postgres restore evidence is optional.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "backup_restore_cold_start_readback", + "evidence_id": 
"qdrant-authority-decoy" + } + }, + "created_at": "2026-06-09T09:50:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "The restore proof recovered key single_user_restore_probe after a Docker cold start. Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0, and search after cold start returned one result for the restored key.", + "claims": [ + { + "claim_id": "restore_recovered_key", + "text": "The restore proof recovered key single_user_restore_probe after a Docker cold start.", + "evidence_ids": ["restore-search-before", "restore-search-after"], + "confidence": "high" + }, + { + "claim_id": "qdrant_rebuild_counts", + "text": "Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0.", + "evidence_ids": ["restore-qdrant-rebuild"], + "confidence": "high" + }, + { + "claim_id": "cold_start_readback", + "text": "Search after cold start returned one result for the restored key.", + "evidence_ids": ["restore-search-after"], + "confidence": "high" + } + ], + "evidence_ids": ["restore-search-before", "restore-qdrant-rebuild", "restore-search-after"], + "latency_ms": 2.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "pre-restore-search", + "ts": "2026-06-09T10:00:00Z", + "actor": "tool", + "action": "searched_before_restore", + "evidence_ids": ["restore-search-before"], + "summary": "The proof captured the searchable key before restore." + }, + { + "event_id": "post-restore-rebuild", + "ts": "2026-06-09T10:03:00Z", + "actor": "tool", + "action": "rebuilt_qdrant_from_postgres_vectors", + "evidence_ids": ["restore-qdrant-rebuild"], + "summary": "Qdrant was rebuilt from Postgres-held vectors." 
+ }, + { + "event_id": "post-cold-start-search", + "ts": "2026-06-09T10:05:00Z", + "actor": "tool", + "action": "searched_after_cold_start", + "evidence_ids": ["restore-search-after"], + "summary": "The restored key was searchable after the cold-start path." + } + ], + "prompt": { + "role": "user", + "content": "What evidence shows backup restore and cold-start readback worked?", + "job_mode": "operate", + "constraints": ["cite_evidence", "do_not_treat_qdrant_as_source_of_truth"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "restore_recovered_key", + "text": "The restore proof recovered key single_user_restore_probe after a Docker cold start." + }, + { + "claim_id": "qdrant_rebuild_counts", + "text": "Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0." + }, + { + "claim_id": "cold_start_readback", + "text": "Search after cold start returned one result for the restored key." + } + ], + "must_not_include": ["Qdrant backup alone is the source of truth"], + "evidence_links": { + "restore_recovered_key": ["restore-search-before", "restore-search-after"], + "qdrant_rebuild_counts": ["restore-qdrant-rebuild"], + "cold_start_readback": ["restore-search-after"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "restore-search-before", + "claim_id": "restore_recovered_key", + "requirement": "cite", + "quote": "search returned one result for key single_user_restore_probe" + }, + { + "evidence_id": "restore-qdrant-rebuild", + "claim_id": "qdrant_rebuild_counts", + "requirement": "cite", + "quote": "rebuilt_count=1, missing_vector_count=0, error_count=0" + }, + { + "evidence_id": "restore-search-after", + "claim_id": "cold_start_readback", + "requirement": "cite", + "quote": "After cold start and rebuild, search returned one result" + } + ], + "negative_traps": [ + { + "trap_id": 
"qdrant-source-of-truth-decoy", + "type": "decoy_evidence", + "evidence_ids": ["qdrant-authority-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Explains backup restore, cold start, and rebuild behavior." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites pre-restore, rebuild, and post-restore readback evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not treat Qdrant as authoritative." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "States what the operator can rely on." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "production_ops", "restore", "cold_start", "qdrant_rebuild", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/cold_start_missing_dependency_incomplete.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/cold_start_missing_dependency_incomplete.json new file mode 100644 index 00000000..8fcbfc39 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/cold_start_missing_dependency_incomplete.json @@ -0,0 +1,187 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-cold-start-dependency-001", + "suite": "production_ops", + "title": "Preserve cold-start dependency failure as incomplete instead of pass", + "encoding": { + "status": "incomplete", + "reason": "The fixture records a cold-start dependency failure path that could not reach the behavioral check; this must remain incomplete rather than a silent pass.", + "follow_up": { + "title": 
"[ELF benchmark P0] Pin Docker-compatible local embedding dependency for cold-start adapter checks", + "reason": "The adapter cannot fairly test cold-start recovery until its local embedding dependency can build or import in Docker." + } + }, + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "local-embed-install-failure", + "kind": "adapter_state", + "text": "OpenViking cold-start check could not run because the Docker platform could not build or import llama-cpp-python for the local embedding path; the adapter status is incomplete with retrieval_status=local_embed_install_failed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "cold_start_missing_dependency_incomplete", + "evidence_id": "local-embed-install-failure" + }, + "locator": { + "quote": "could not build or import llama-cpp-python" + } + }, + "created_at": "2026-06-09T08:38:14Z" + }, + { + "evidence_id": "typed-incomplete-policy", + "kind": "runbook", + "text": "Use incomplete when install, import, build, adapter wiring, native dependency support, or local runtime setup failed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "cold_start_missing_dependency_incomplete", + "evidence_id": "typed-incomplete-policy" + }, + "locator": { + "quote": "Use incomplete when install, import, build" + } + }, + "created_at": "2026-06-09T08:40:00Z" + }, + { + "evidence_id": "dependency-pass-decoy", + "kind": "adapter_state", + "text": "Decoy: missing local embedding dependency should be reported as pass because no retrieval mismatch occurred.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "cold_start_missing_dependency_incomplete", + "evidence_id": "dependency-pass-decoy" + } + }, + "created_at": "2026-06-09T08:30:00Z" + } + ], + 
"adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "The cold-start dependency failure is incomplete, not pass. The adapter could not build or import llama-cpp-python, so the behavioral cold-start check did not run and needs dependency repair before a pass claim.", + "claims": [ + { + "claim_id": "cold_start_dependency_incomplete", + "text": "The cold-start dependency failure is incomplete, not pass.", + "evidence_ids": ["local-embed-install-failure", "typed-incomplete-policy"], + "confidence": "high" + } + ], + "evidence_ids": ["local-embed-install-failure", "typed-incomplete-policy"], + "latency_ms": 1.8, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "cold-start-dependency-failed", + "ts": "2026-06-09T08:38:14Z", + "actor": "tool", + "action": "hit_dependency_failure", + "evidence_ids": ["local-embed-install-failure"], + "summary": "The cold-start adapter path stopped before behavioral scoring because a native dependency could not build or import." + }, + { + "event_id": "typed-incomplete-retained", + "ts": "2026-06-09T08:40:00Z", + "actor": "agent", + "action": "classified_failure", + "evidence_ids": ["typed-incomplete-policy"], + "summary": "The report kept dependency failures as incomplete typed states." + } + ], + "prompt": { + "role": "user", + "content": "How should the production-ops suite classify a cold-start check that cannot run because a dependency is missing?", + "job_mode": "operate", + "constraints": ["cite_evidence", "preserve_typed_status", "do_not_claim_pass"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "cold_start_dependency_incomplete", + "text": "The cold-start dependency failure is incomplete, not pass." 
+ } + ], + "must_not_include": ["reported as pass"], + "evidence_links": { + "cold_start_dependency_incomplete": ["local-embed-install-failure", "typed-incomplete-policy"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "local-embed-install-failure", + "claim_id": "cold_start_dependency_incomplete", + "requirement": "cite", + "quote": "could not build or import llama-cpp-python" + }, + { + "evidence_id": "typed-incomplete-policy", + "claim_id": "cold_start_dependency_incomplete", + "requirement": "cite", + "quote": "Use incomplete when install, import, build" + } + ], + "negative_traps": [ + { + "trap_id": "dependency-pass-decoy", + "type": "unsupported_prior", + "evidence_ids": ["dependency-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Would test cold-start behavior only after dependency setup succeeds." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites dependency failure and typed-incomplete policy." + }, + "uncertainty_handling": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "States that no pass claim is allowed." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Avoids dependency-pass decoy." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["incomplete, not pass"], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "production_ops", "cold_start", "dependency_boundary", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/credential_boundary_provider_blocked.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/credential_boundary_provider_blocked.json new file mode 100644 index 00000000..f0a46864 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/credential_boundary_provider_blocked.json @@ -0,0 +1,199 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-credential-boundary-001", + "suite": "production_ops", + "title": "Keep provider credential requirement blocked without committing secrets", + "encoding": { + "status": "blocked", + "reason": "Provider-backed production operations require operator-owned credentials; checked-in fixtures must not include or require secrets.", + "follow_up": { + "title": "Run provider-backed production-ops gate with routed operator credentials", + "reason": "Credential-bound checks need an operator shell with provider environment variables; fixture reports can only encode the boundary." 
+ } + }, + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "provider-credential-boundary", + "kind": "runbook", + "text": "Provider-backed production runs require embedding API credentials supplied through environment variables; reports record provider id, model, dimensions, timeout, API base, and path, but never record the API key.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "credential_boundary_provider_blocked", + "evidence_id": "provider-credential-boundary" + }, + "locator": { + "quote": "never record the API key" + } + }, + "created_at": "2026-06-09T08:10:00Z" + }, + { + "evidence_id": "checked-in-secret-boundary", + "kind": "decision", + "text": "Checked-in production-ops fixtures must not require user secrets or run private corpus data.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "credential_boundary_provider_blocked", + "evidence_id": "checked-in-secret-boundary" + }, + "locator": { + "quote": "must not require user secrets" + } + }, + "created_at": "2026-06-09T08:12:00Z" + }, + { + "evidence_id": "secret-commit-decoy", + "kind": "message", + "text": "Decoy: commit a provider API key into the fixture so the production-ops report can pass in CI.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "credential_boundary_provider_blocked", + "evidence_id": "secret-commit-decoy" + } + }, + "created_at": "2026-06-09T08:09:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "Do not commit or require provider secrets in checked-in fixtures. 
Provider-backed production-ops checks are blocked until operator credentials are supplied, and reports may record provider metadata but never the API key.", + "claims": [ + { + "claim_id": "provider_credentials_blocked", + "text": "Provider-backed production-ops checks are blocked until operator credentials are supplied.", + "evidence_ids": ["provider-credential-boundary"], + "confidence": "high" + }, + { + "claim_id": "no_checked_in_secrets", + "text": "Checked-in production-ops fixtures must not require user secrets.", + "evidence_ids": ["checked-in-secret-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["provider-credential-boundary", "checked-in-secret-boundary"], + "latency_ms": 1.7, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "provider-credential-boundary-recorded", + "ts": "2026-06-09T08:10:00Z", + "actor": "agent", + "action": "recorded_credential_boundary", + "evidence_ids": ["provider-credential-boundary"], + "summary": "The provider path requires environment credentials and redacts API keys from reports." + }, + { + "event_id": "checked-in-secret-boundary-recorded", + "ts": "2026-06-09T08:12:00Z", + "actor": "agent", + "action": "recorded_secret_boundary", + "evidence_ids": ["checked-in-secret-boundary"], + "summary": "Checked-in production-ops fixtures do not require or contain secrets." + } + ], + "prompt": { + "role": "user", + "content": "Should the production-ops suite use checked-in provider credentials to prove provider-backed behavior?", + "job_mode": "operate", + "constraints": ["cite_evidence", "avoid_secret_leakage", "state_blockers"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "provider_credentials_blocked", + "text": "Provider-backed production-ops checks are blocked until operator credentials are supplied." 
+ }, + { + "claim_id": "no_checked_in_secrets", + "text": "Checked-in production-ops fixtures must not require user secrets." + } + ], + "must_not_include": ["commit a provider API key"], + "evidence_links": { + "provider_credentials_blocked": ["provider-credential-boundary"], + "no_checked_in_secrets": ["checked-in-secret-boundary"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "provider-credential-boundary", + "claim_id": "provider_credentials_blocked", + "requirement": "cite", + "quote": "never record the API key" + }, + { + "evidence_id": "checked-in-secret-boundary", + "claim_id": "no_checked_in_secrets", + "requirement": "cite", + "quote": "must not require user secrets" + } + ], + "negative_traps": [ + { + "trap_id": "commit-provider-key-decoy", + "type": "privacy_leak", + "evidence_ids": ["secret-commit-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "uncertainty_handling": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States that credential-bound execution is blocked." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites provider credential and checked-in secret boundaries." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids credential leakage and checked-in secret traps." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Identifies the operator-owned credential follow-up." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "unsafe live/destructive action when the prompt forbids it" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["blocked until operator credentials are supplied", "must not require user secrets"], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "production_ops", "credential_boundary", "blocked", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/interrupted_import_resume_checkpoint.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/interrupted_import_resume_checkpoint.json new file mode 100644 index 00000000..e858e702 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/interrupted_import_resume_checkpoint.json @@ -0,0 +1,204 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-backfill-resume-001", + "suite": "production_ops", + "title": "Resume interrupted generated backfill from checkpoint without duplicate source notes", + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": "generated_public", + "items": [ + { + "evidence_id": "backfill-checkpoint-state", + "kind": "trace", + "text": "Backfill report live-baseline-20260609092144 completed 2000 of 2000 documents, resumed from checkpoint offset 1000 to 2000, and found zero duplicate source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "interrupted_import_resume_checkpoint", + "evidence_id": "backfill-checkpoint-state" + }, + "locator": { + "quote": "resumed from checkpoint offset 1000 to 2000" + } + }, + "created_at": "2026-06-09T09:21:44Z" + }, + { + "evidence_id": "backfill-clean-compare", + "kind": "trace", + "text": "Clean comparison matched all 16 of 16 query 
results after the resumed import.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "interrupted_import_resume_checkpoint", + "evidence_id": "backfill-clean-compare" + }, + "locator": { + "quote": "matched all 16 of 16 query results" + } + }, + "created_at": "2026-06-09T09:22:30Z" + }, + { + "evidence_id": "backfill-restart-decoy", + "kind": "adapter_state", + "text": "Decoy: interrupted imports must restart from zero because the checkpoint duplicated source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "interrupted_import_resume_checkpoint", + "evidence_id": "backfill-restart-decoy" + } + }, + "created_at": "2026-06-09T09:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "Resume from checkpoint offset 1000 to 2000 completed the 2000 document backfill. The resumed backfill found zero duplicate source notes, and search quality after resume matched the clean run for all 16 queries.", + "claims": [ + { + "claim_id": "resume_checkpoint", + "text": "Resume from checkpoint offset 1000 to 2000 completed the 2000 document backfill.", + "evidence_ids": ["backfill-checkpoint-state"], + "confidence": "high" + }, + { + "claim_id": "no_duplicate_sources", + "text": "The resumed backfill found zero duplicate source notes.", + "evidence_ids": ["backfill-checkpoint-state"], + "confidence": "high" + }, + { + "claim_id": "clean_compare_matched", + "text": "Search quality after resume matched the clean run for all 16 queries.", + "evidence_ids": ["backfill-clean-compare"], + "confidence": "high" + } + ], + "evidence_ids": ["backfill-checkpoint-state", "backfill-clean-compare"], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "backfill-interrupted", + "ts": 
"2026-06-09T09:21:44Z", + "actor": "tool", + "action": "interrupted_backfill", + "evidence_ids": ["backfill-checkpoint-state"], + "summary": "The generated public backfill was interrupted at the checkpoint boundary." + }, + { + "event_id": "backfill-resumed", + "ts": "2026-06-09T09:22:30Z", + "actor": "tool", + "action": "resumed_backfill", + "evidence_ids": ["backfill-checkpoint-state", "backfill-clean-compare"], + "summary": "The resumed import completed without duplicate source notes and matched a clean comparison." + } + ], + "prompt": { + "role": "user", + "content": "What does the production-ops fixture prove about interrupted backfill resume behavior?", + "job_mode": "operate", + "constraints": ["cite_evidence", "state_checkpoint", "avoid_restarting_completed_work"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "resume_checkpoint", + "text": "Resume from checkpoint offset 1000 to 2000 completed the 2000 document backfill." + }, + { + "claim_id": "no_duplicate_sources", + "text": "The resumed backfill found zero duplicate source notes." + }, + { + "claim_id": "clean_compare_matched", + "text": "Search quality after resume matched the clean run for all 16 queries." 
+ } + ], + "must_not_include": [ + "interrupted imports must restart from zero", + "the checkpoint duplicated source notes" + ], + "evidence_links": { + "resume_checkpoint": ["backfill-checkpoint-state"], + "no_duplicate_sources": ["backfill-checkpoint-state"], + "clean_compare_matched": ["backfill-clean-compare"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "backfill-checkpoint-state", + "claim_id": "resume_checkpoint", + "requirement": "cite", + "quote": "resumed from checkpoint offset 1000 to 2000" + }, + { + "evidence_id": "backfill-clean-compare", + "claim_id": "clean_compare_matched", + "requirement": "cite", + "quote": "matched all 16 of 16 query results" + } + ], + "negative_traps": [ + { + "trap_id": "restart-from-zero-decoy", + "type": "decoy_evidence", + "evidence_ids": ["backfill-restart-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Uses checkpoint resume and duplicate-source evidence." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites checkpoint and clean-comparison artifacts." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not restart completed work or cite duplicate-source decoys." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Answers with the next operational interpretation." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["generated_public", "production_ops", "backfill_resume", "checkpoint", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/private_manifest_absence_blocked.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/private_manifest_absence_blocked.json new file mode 100644 index 00000000..0c45443c --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/private_manifest_absence_blocked.json @@ -0,0 +1,198 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-private-manifest-blocked-001", + "suite": "production_ops", + "title": "Report missing private production manifest as bounded blocked caveat", + "encoding": { + "status": "blocked", + "reason": "No operator-owned private production corpus manifest is checked in or available to this fixture; no private-corpus pass can be claimed.", + "follow_up": { + "title": "Supply an operator-owned private production corpus manifest", + "reason": "A real private-corpus pass requires a sanitized local manifest supplied outside checked-in fixtures." 
+ } + }, + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": "private_sanitized", + "items": [ + { + "evidence_id": "private-manifest-guard", + "kind": "runbook", + "text": "Private manifest guard: cargo make baseline-production-private failed closed because ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST was unset; no private-corpus pass is claimed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_manifest_absence_blocked", + "evidence_id": "private-manifest-guard" + }, + "locator": { + "quote": "ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST was unset" + } + }, + "created_at": "2026-06-09T08:00:00Z" + }, + { + "evidence_id": "private-bounded-failure-policy", + "kind": "decision", + "text": "If no operator-owned private manifest is supplied, the private-corpus path is a bounded failure, not a pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_manifest_absence_blocked", + "evidence_id": "private-bounded-failure-policy" + }, + "locator": { + "quote": "bounded failure, not a pass" + } + }, + "created_at": "2026-06-09T08:05:00Z" + }, + { + "evidence_id": "private-pass-decoy", + "kind": "decision", + "text": "Decoy: the absence of private data means the private production corpus benchmark passed with zero findings.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_manifest_absence_blocked", + "evidence_id": "private-pass-decoy" + } + }, + "created_at": "2026-06-09T07:55:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "No private-corpus pass is claimed. 
The private production corpus path is blocked until an operator supplies a sanitized manifest, and the current evidence is a bounded failure, not a pass.", + "claims": [ + { + "claim_id": "private_manifest_blocked", + "text": "No private-corpus pass is claimed.", + "evidence_ids": ["private-manifest-guard"], + "confidence": "high" + }, + { + "claim_id": "private_bounded_failure", + "text": "The missing private manifest is a bounded failure, not a pass.", + "evidence_ids": ["private-bounded-failure-policy"], + "confidence": "high" + } + ], + "evidence_ids": ["private-manifest-guard", "private-bounded-failure-policy"], + "latency_ms": 1.6, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "private-manifest-missing", + "ts": "2026-06-09T08:00:00Z", + "actor": "tool", + "action": "failed_closed_manifest_guard", + "evidence_ids": ["private-manifest-guard"], + "summary": "The private-corpus task stopped at the missing manifest guard." + }, + { + "event_id": "bounded-private-caveat-recorded", + "ts": "2026-06-09T08:05:00Z", + "actor": "agent", + "action": "recorded_bounded_caveat", + "evidence_ids": ["private-bounded-failure-policy"], + "summary": "The report kept the missing private corpus as a bounded non-pass state." + } + ], + "prompt": { + "role": "user", + "content": "Can this real-world suite claim private-corpus production-ops coverage without my private manifest?", + "job_mode": "operate", + "constraints": ["cite_evidence", "state_blockers", "do_not_run_private_data"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "private_manifest_blocked", + "text": "No private-corpus pass is claimed." + }, + { + "claim_id": "private_bounded_failure", + "text": "The missing private manifest is a bounded failure, not a pass." 
+ } + ], + "must_not_include": ["private production corpus benchmark passed"], + "evidence_links": { + "private_manifest_blocked": ["private-manifest-guard"], + "private_bounded_failure": ["private-bounded-failure-policy"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "private-manifest-guard", + "claim_id": "private_manifest_blocked", + "requirement": "cite", + "quote": "ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST was unset" + }, + { + "evidence_id": "private-bounded-failure-policy", + "claim_id": "private_bounded_failure", + "requirement": "cite", + "quote": "bounded failure, not a pass" + } + ], + "negative_traps": [ + { + "trap_id": "private-zero-findings-pass-decoy", + "type": "unsupported_prior", + "evidence_ids": ["private-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "uncertainty_handling": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "States the private manifest blocker instead of claiming pass." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites manifest guard and bounded-failure policy." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids zero-findings pass decoy." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Names the operator-owned manifest follow-up." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No private-corpus pass is claimed", "bounded failure, not a pass"], + "fallback_action": "state_blocker" + }, + "tags": ["private_corpus", "production_ops", "manifest_guard", "blocked", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/resource_envelope_budget.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/resource_envelope_budget.json new file mode 100644 index 00000000..0f4a23c9 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/resource_envelope_budget.json @@ -0,0 +1,194 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-resource-envelope-001", + "suite": "production_ops", + "title": "Report generated backfill resource envelope and operator planning caveat", + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": "generated_public", + "items": [ + { + "evidence_id": "resource-envelope-check", + "kind": "trace", + "text": "Resource envelope check measured 2793.629 seconds against a 3600-second limit and 167652 KB RSS against a 1500000 KB limit.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resource_envelope_budget", + "evidence_id": "resource-envelope-check" + }, + "locator": { + "quote": "2793.629 seconds against a 3600-second limit" + } + }, + "created_at": "2026-06-09T09:30:00Z" + }, + { + "evidence_id": "large-import-planning-caveat", + "kind": "runbook", + "text": "Large imports should be planned as batch jobs, not interactive operations.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resource_envelope_budget", + 
"evidence_id": "large-import-planning-caveat" + }, + "locator": { + "quote": "planned as batch jobs" + } + }, + "created_at": "2026-06-09T09:35:00Z" + }, + { + "evidence_id": "interactive-import-decoy", + "kind": "decision", + "text": "Decoy: the 2000 document provider backfill is small enough to treat as an interactive operation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resource_envelope_budget", + "evidence_id": "interactive-import-decoy" + } + }, + "created_at": "2026-06-09T09:20:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "The resource envelope passed: 2793.629 seconds was within the 3600-second limit, and 167652 KB RSS was within the 1500000 KB limit. Large imports should be planned as batch jobs, not interactive operations.", + "claims": [ + { + "claim_id": "resource_envelope_passed", + "text": "The resource envelope passed within the elapsed-time and RSS limits.", + "evidence_ids": ["resource-envelope-check"], + "confidence": "high" + }, + { + "claim_id": "large_import_batch_caveat", + "text": "Large imports should be planned as batch jobs, not interactive operations.", + "evidence_ids": ["large-import-planning-caveat"], + "confidence": "high" + } + ], + "evidence_ids": ["resource-envelope-check", "large-import-planning-caveat"], + "latency_ms": 2.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "resource-envelope-measured", + "ts": "2026-06-09T09:30:00Z", + "actor": "tool", + "action": "measured_resource_envelope", + "evidence_ids": ["resource-envelope-check"], + "summary": "The generated provider backfill stayed within the configured resource envelope." 
+ }, + { + "event_id": "resource-caveat-recorded", + "ts": "2026-06-09T09:35:00Z", + "actor": "operator", + "action": "recorded_planning_caveat", + "evidence_ids": ["large-import-planning-caveat"], + "summary": "The production adoption gate kept the batch-job caveat for large imports." + } + ], + "prompt": { + "role": "user", + "content": "How should I interpret the resource envelope evidence for production backfill?", + "job_mode": "operate", + "constraints": ["cite_evidence", "state_resource_limits", "state_operator_caveat"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "resource_envelope_passed", + "text": "The resource envelope passed within the elapsed-time and RSS limits." + }, + { + "claim_id": "large_import_batch_caveat", + "text": "Large imports should be planned as batch jobs, not interactive operations." + } + ], + "must_not_include": ["treat as an interactive operation"], + "evidence_links": { + "resource_envelope_passed": ["resource-envelope-check"], + "large_import_batch_caveat": ["large-import-planning-caveat"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "resource-envelope-check", + "claim_id": "resource_envelope_passed", + "requirement": "cite", + "quote": "2793.629 seconds against a 3600-second limit" + }, + { + "evidence_id": "large-import-planning-caveat", + "claim_id": "large_import_batch_caveat", + "requirement": "cite", + "quote": "planned as batch jobs" + } + ], + "negative_traps": [ + { + "trap_id": "interactive-import-decoy", + "type": "unsafe_action", + "evidence_ids": ["interactive-import-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "latency_resource": { + "weight": 0.35, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 100.0, + "resource_expectation": "Report elapsed and RSS limits plus caveat." 
+ } + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites resource envelope and caveat evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not encourage interactive large imports." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Interprets the result for operator planning." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "unsafe live/destructive action when the prompt forbids it", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["generated_public", "production_ops", "resource_envelope", "no_live_claim"] +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index a48c3226..496237d7 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -56,6 +56,10 @@ fn knowledge_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("knowledge") } +fn production_ops_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("production_ops") +} + fn run_json_report_from(fixtures: PathBuf) -> Result { let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) .arg("run") @@ -269,7 +273,7 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(32)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(38)); Ok(()) } @@ -631,6 +635,49 @@ fn knowledge_json_report_renders_markdown_metrics() -> Result<()> { Ok(()) } +#[test] +fn 
production_ops_fixtures_report_bounded_typed_states() -> Result<()> { + let report = run_json_report_from(production_ops_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/private_corpus_redaction/private_fixture_count").and_then(Value::as_u64), + Some(1) + ); + + let suites = array_at(&report, "/suites")?; + let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; + + assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + + let jobs = array_at(&report, "/jobs")?; + let backfill = find_by_field(jobs, "/job_id", "production-ops-backfill-resume-001")?; + let restore = find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; + let private_manifest = + find_by_field(jobs, "/job_id", "production-ops-private-manifest-blocked-001")?; + let credentials = find_by_field(jobs, "/job_id", "production-ops-credential-boundary-001")?; + let dependency = find_by_field(jobs, "/job_id", "production-ops-cold-start-dependency-001")?; + + assert_eq!(backfill.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(restore.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(restore.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); + 
assert_eq!(private_manifest.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(credentials.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(dependency.pointer("/status").and_then(Value::as_str), Some("incomplete")); + + Ok(()) +} + fn assert_root_knowledge_summary(report: &Value) { assert_eq!(report.pointer("/summary/knowledge/job_count").and_then(Value::as_u64), Some(2)); assert_eq!(report.pointer("/summary/knowledge/page_count").and_then(Value::as_u64), Some(4)); @@ -641,15 +688,17 @@ fn assert_root_knowledge_summary(report: &Value) { } fn assert_root_aggregate_summary(report: &Value) { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(32)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(31)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(38)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(34)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), - Some(0.968) + Some(0.973) ); assert_eq!( report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), @@ -675,20 +724,20 @@ fn assert_root_aggregate_summary(report: &Value) { assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), - Some(1) + Some(2) ); 
assert_eq!( report.pointer("/summary/qdrant_rebuild_pass_count").and_then(Value::as_u64), - Some(1) + Some(2) ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(69) + Some(82) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(67)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.971)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.971)); - assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.971)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(80)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.976)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.976)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.976)); assert_eq!( report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), Some(1) @@ -750,6 +799,11 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; + + assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + Ok(()) } @@ -759,8 +813,14 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { let redaction = find_by_field(jobs, "/job_id", "capture-redaction-exclusion-001")?; let personalization = find_by_field(jobs, "/job_id", "personalization-scoped-preference-001")?; let stage_job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; + let production_restore = + find_by_field(jobs, "/job_id", 
"production-ops-restore-cold-start-001")?; assert_eq!(rebuild.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); + assert_eq!( + production_restore.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), + Some(true) + ); assert_eq!(redaction.pointer("/redaction_leak_count").and_then(Value::as_u64), Some(0)); assert_eq!(personalization.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); assert_eq!(personalization.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index a0409e6d..e6ea0bff 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -38,8 +38,8 @@ cleanup, use `docs/guide/single_user_production.md`. operator-debugging UX report with trace/viewer links, raw-SQL avoidance, root-cause step counts, dropped-candidate visibility, and repair-action clarity. - `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world - agent memory benchmark contract, including suite taxonomy, typed report states, and - the knowledge-compilation fixture task. + agent memory benchmark contract, including suite taxonomy, typed report states, + knowledge-compilation fixture tasks, and the production-ops fixture target. - `real_world_memory_evolution.md`: run and interpret the checked-in memory evolution jobs for current facts, historical facts, stale traps, conflicts, update rationales, and temporal graph limitations. @@ -51,8 +51,9 @@ cleanup, use `docs/guide/single_user_production.md`. summaries and durable scripts. - Keep generated real-world job smoke JSON and Markdown under `tmp/real-world-job/`; commit fixture schemas, smoke fixtures, runner code, and durable docs only. -- Keep generated real-world memory trust/personalization/knowledge JSON and Markdown - under `tmp/real-world-memory/`; commit fixtures, runner code, and durable docs only. 
+- Keep generated real-world memory trust/personalization/knowledge/production-ops JSON + and Markdown under `tmp/real-world-memory/`; commit fixtures, runner code, and + durable docs only. - Link the newest decision-relevant report from README and this index. - When benchmark semantics change, update `live_baseline_benchmark.md` and the relevant spec before publishing a new result. diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index d419af0c..3b4f9137 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -265,6 +265,19 @@ claim. If no operator-owned private manifest is supplied, the private-corpus path is a bounded failure, not a pass. +For job-level production-ops coverage under the real-world benchmark contract, run: + +```sh +cargo make real-world-memory-production-ops +``` + +That target parses checked-in fixture evidence for interrupted backfill resume, +backup/restore readback, cold-start recovery, resource-envelope interpretation, and +typed private-manifest, credential, and dependency boundaries. It does not run Docker, +private corpus data, or provider-backed credentials, and it must not be used as a +substitute for `baseline-production-private` when making a private-corpus readiness +claim. + ## Publish A Markdown Report After a run writes `tmp/live-baseline/live-baseline-report.json`, render a durable diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index f26afadb..e0cc5c26 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -157,6 +157,10 @@ including the retrieval-quality slice below. The suite currently encodes: expected evidence was filtered, demoted, or selected against. 
- `capture_integration`: write-policy audit behavior for redaction/private exclusion and fixture-backed capture/integration boundary classification. +- `production_ops`: interrupted generated backfill resume, backup/restore plus + cold-start readback, resource-envelope interpretation, missing dependency + `incomplete` classification, missing private manifest `blocked` classification, and + provider credential boundary `blocked` classification. - `personalization`: scoped stable preference correction without temporary or cross-project preference leakage. @@ -166,11 +170,14 @@ count, update rationale availability, temporal validity `not_encoded` count, sco correctness, redaction leak count, capture/integration behavior classes, Qdrant rebuild case/pass counts, expected evidence recall, irrelevant context ratio, latency/cost, answer-type plus caveat/refusal/uncertainty flags, and trace -explainability counters. The fixtures include negative traps -for stale blockers, unsupported prior claims, stale deleted facts, stale historical -facts, cross-project preference leakage, private/redacted text leakage, obsolete -retrieval context, project-decision stale reuse, missing rationale, uncited current -policy claims, overconfident unsupported decision answers, and distractor context. +explainability counters, plus production-ops blocked/incomplete job states and +private-corpus redaction policy. The fixtures include negative traps for stale +blockers, unsupported prior claims, stale deleted facts, stale historical facts, +cross-project preference leakage, private/redacted text leakage, obsolete retrieval +context, project-decision stale reuse, missing rationale, uncited current policy +claims, overconfident unsupported decision answers, distractor context, +index-only restore claims, private-corpus pass claims without a manifest, and +checked-in credential leakage. Current checked-in project-decisions increment: @@ -333,6 +340,38 @@ be explicitly flagged unsupported.
The report publishes citation coverage, stale detection, rebuild determinism, aggregate backlink counts and page coverage, page usefulness, unsupported summary count, and untraced section count. +Current checked-in production-ops increment: + +```sh +cargo make real-world-memory-production-ops +``` + +Artifacts: + +```text +tmp/real-world-memory/production-ops-report.json +tmp/real-world-memory/production-ops-report.md +``` + +The production-ops fixtures live under +`apps/elf-eval/fixtures/real_world_memory/production_ops/`. They encode user-job +readback over existing public benchmark and restore evidence: interrupted backfill +resume from checkpoint, clean-run comparison, backup/restore readback, Qdrant rebuild +from Postgres-held vectors, cold-start search recovery, and resource-envelope +interpretation. + +The same slice deliberately keeps non-pass boundaries typed. A missing private +production manifest is `blocked`, unavailable provider credentials are `blocked`, and +a cold-start adapter dependency failure is `incomplete`. These states are evidence for +operator caveats, not proof of private-corpus or provider-backed production success. + +This suite does not run private corpus data, does not require or publish credentials, +does not perform live Docker restore/backfill work, and does not reinterpret older +live-baseline reports as real-world production-ops wins. For personal production +adoption, cite both the relevant live-baseline or restore proof and this real-world +fixture report; rerun `baseline-production-private` with an operator-owned manifest +before claiming private-corpus retrieval quality. + Do not generate large fixtures or update production-adoption verdicts while adding the contract. The current adoption gate remains an existing benchmark decision until new real-world job reports are implemented and published.