Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 41 additions & 6 deletions apps/elf-api/src/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,13 @@ use elf_service::{
KnowledgePageLintRequest, KnowledgePageLintResponse, KnowledgePageRebuildRequest,
KnowledgePageRebuildResponse, KnowledgePageResponse, KnowledgePageSearchRequest,
KnowledgePageSearchResponse, KnowledgePagesListRequest, KnowledgePagesListResponse,
ListRequest, ListResponse, NoteFetchRequest, NoteFetchResponse, NoteProvenanceBundleResponse,
NoteProvenanceGetRequest, PayloadLevel, PublishNoteRequest, QueryPlan, RankingRequestOverride,
RebuildReport, SearchDetailsRequest, SearchDetailsResult, SearchExplainRequest,
SearchExplainResponse, SearchIndexItem, SearchRequest, SearchResponse, SearchSessionGetRequest,
SearchTimelineGroup, SearchTimelineRequest, SearchTrajectoryResponse, SearchTrajectorySummary,
ShareScope, SpaceGrantRevokeRequest, SpaceGrantRevokeResponse, SpaceGrantUpsertRequest,
ListRequest, ListResponse, MemoryHistoryGetRequest, MemoryHistoryResponse, NoteFetchRequest,
NoteFetchResponse, NoteProvenanceBundleResponse, NoteProvenanceGetRequest, PayloadLevel,
PublishNoteRequest, QueryPlan, RankingRequestOverride, RebuildReport, SearchDetailsRequest,
SearchDetailsResult, SearchExplainRequest, SearchExplainResponse, SearchIndexItem,
SearchRequest, SearchResponse, SearchSessionGetRequest, SearchTimelineGroup,
SearchTimelineRequest, SearchTrajectoryResponse, SearchTrajectorySummary, ShareScope,
SpaceGrantRevokeRequest, SpaceGrantRevokeResponse, SpaceGrantUpsertRequest,
SpaceGrantsListRequest, TextPositionSelector, TextQuoteSelector, TraceBundleGetRequest,
TraceBundleResponse, TraceGetRequest, TraceGetResponse, TraceRecentListRequest,
TraceRecentListResponse, TraceTrajectoryGetRequest, UnpublishNoteRequest, UpdateRequest,
Expand Down Expand Up @@ -154,6 +155,7 @@ const VIEWER_HTML: &str = include_str!("../static/viewer.html");
admin_graph_predicate_alias_add,
admin_graph_predicate_aliases_list,
admin_note_provenance_get,
admin_note_history_get,
),
components(schemas(
AdminIngestionProfileDefaultResponseV2,
Expand Down Expand Up @@ -707,6 +709,7 @@ pub fn admin_router(state: AppState) -> Router {
routing::post(admin_graph_predicate_alias_add).get(admin_graph_predicate_aliases_list),
)
.route("/v2/admin/notes/{note_id}/provenance", routing::get(admin_note_provenance_get))
.route("/v2/admin/notes/{note_id}/history", routing::get(admin_note_history_get))
.with_state(state)
.layer(DefaultBodyLimit::max(MAX_REQUEST_BYTES))
.layer(middleware::from_fn_with_state(auth_state, admin_auth_middleware));
Expand Down Expand Up @@ -2481,6 +2484,38 @@ async fn admin_note_provenance_get(
Ok(Json(response))
}

#[utoipa::path(
    get,
    path = "/v2/admin/notes/{note_id}/history",
    tag = "admin",
    params(("note_id" = Uuid, Path, description = "Note ID.")),
    responses(
        (status = 200, description = "Memory history timeline.", body = Value),
        (status = 400, description = "Invalid request.", body = ErrorBody),
        (status = 401, description = "Authentication required.", body = ErrorBody),
        (status = 403, description = "Admin access required.", body = ErrorBody),
        (status = 404, description = "Note was not found.", body = ErrorBody),
        (status = 500, description = "Internal error.", body = ErrorBody),
    )
)]
async fn admin_note_history_get(
    State(state): State<AppState>,
    headers: HeaderMap,
    Path(note_id): Path<Uuid>,
) -> Result<Json<MemoryHistoryResponse>, ApiError> {
    // Plain `//` comments are used on purpose: utoipa lifts `///` doc comments on a
    // handler into the generated OpenAPI summary/description, which would alter the spec.
    //
    // Tenant and project are taken from the request headers; only the note id comes from
    // the URL path. A header-parsing failure is returned to the caller via `?`.
    let ctx = RequestContext::from_headers(&headers)?;
    let lookup = MemoryHistoryGetRequest {
        tenant_id: ctx.tenant_id,
        project_id: ctx.project_id,
        note_id,
    };

    // Service errors are converted into `ApiError` by `?`; success is wrapped as JSON.
    let history = state.service.memory_history_get(lookup).await?;

    Ok(Json(history))
}

#[utoipa::path(
post,
path = "/v2/admin/consolidation/runs",
Expand Down
52 changes: 52 additions & 0 deletions apps/elf-api/tests/http.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2373,6 +2373,58 @@ async fn admin_note_provenance_includes_request_id_on_success() {
test_db.cleanup().await.expect("Failed to cleanup test database.");
}

#[tokio::test]
#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."]
async fn admin_note_history_includes_request_id_on_success() {
    // `test_env()` yielding `None` ends the test early without failing it — presumably
    // the external services named in the `#[ignore]` reason are not configured.
    let Some((test_db, qdrant_url, collection)) = test_env().await else {
        return;
    };

    let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection);
    config.security.auth_mode = "off".to_string();

    let state = AppState::new(config).await.expect("Failed to initialize app state.");
    let app = routes::admin_router(state.clone());

    let note_id = Uuid::new_v4();
    let request_id = Uuid::new_v4();
    // Rendered once and reused for the outgoing header and both assertions below.
    let expected_request_id = request_id.to_string();

    insert_note(&state, note_id, "agent_private", TEST_AGENT_A, "History integration test note.")
        .await;

    let history_request = Request::builder()
        .uri(format!("/v2/admin/notes/{note_id}/history"))
        .header("X-ELF-Tenant-Id", TEST_TENANT_ID)
        .header("X-ELF-Project-Id", TEST_PROJECT_ID)
        .header("X-ELF-Agent-Id", TEST_AGENT_A)
        .header("X-ELF-Request-Id", expected_request_id.as_str())
        .body(Body::empty())
        .expect("Failed to build history request.");
    let response = app.oneshot(history_request).await.expect("Failed to call admin note history.");

    assert_eq!(response.status(), StatusCode::OK);

    // The request id supplied by the caller must be echoed back in the response headers.
    let echoed_request_id =
        response.headers().get("X-ELF-Request-Id").and_then(|value| value.to_str().ok());

    assert_eq!(echoed_request_id, Some(expected_request_id.as_str()));

    let raw_body = body::to_bytes(response.into_body(), usize::MAX)
        .await
        .expect("Failed to read history response body.");
    let json: serde_json::Value =
        serde_json::from_slice(&raw_body).expect("Failed to parse response.");

    // The payload carries its schema tag and the same request id as the header.
    assert_eq!(json["schema"], "elf.memory_history/v1");
    assert_eq!(json["request_id"], expected_request_id);

    test_db.cleanup().await.expect("Failed to cleanup test database.");
}

#[tokio::test]
#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."]
async fn admin_note_provenance_rejects_invalid_request_id_header() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,11 @@
"required": false,
"encoded": false,
"follow_up": null
},
"history_readback": {
"encoded": true,
"required_event_types": ["add", "update", "ignore"],
"requires_note_version_links": true
}
},
"tags": [
Expand Down
84 changes: 78 additions & 6 deletions apps/elf-eval/src/bin/real_world_job_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ struct MemoryEvolution {
conflicts: Vec<EvolutionConflict>,
update_rationale: Option<UpdateRationale>,
temporal_validity: Option<TemporalValidity>,
history_readback: Option<HistoryReadback>,
}

#[derive(Debug, Deserialize)]
Expand All @@ -324,6 +325,14 @@ struct TemporalValidity {
follow_up: Option<String>,
}

/// History-readback expectations declared for a job under its memory-evolution
/// `history_readback` key.
///
/// NOTE(review): the precise meaning of "history readback" is defined by the benchmark
/// fixtures, not by this type — confirm against the fixture documentation.
#[derive(Debug, Deserialize)]
struct HistoryReadback {
/// Whether the job encodes history readback. The field has no serde default, so it
/// must be present whenever the `history_readback` object is present.
encoded: bool,
/// Event type names the job requires (for example `add`, `update`, `ignore`).
/// Falls back to an empty list when the key is omitted.
#[serde(default)]
required_event_types: Vec<String>,
/// Whether the job requires note-version links. Like `encoded`, this has no serde
/// default and must be present in the payload.
requires_note_version_links: bool,
}

#[derive(Debug, Deserialize)]
struct ScoringRubric {
#[serde(default)]
Expand Down Expand Up @@ -763,6 +772,8 @@ struct ReportSummary {
update_rationale_available_count: usize,
#[serde(default)]
temporal_validity_not_encoded_count: usize,
#[serde(default)]
history_readback_encoded_count: usize,
expected_evidence_total: usize,
expected_evidence_matched: usize,
expected_evidence_recall: f64,
Expand Down Expand Up @@ -865,6 +876,8 @@ struct SuiteReport {
update_rationale_available_count: usize,
#[serde(default)]
temporal_validity_not_encoded_count: usize,
#[serde(default)]
history_readback_encoded_count: usize,
expected_evidence_recall: Option<f64>,
irrelevant_context_ratio: Option<f64>,
trace_explainability_count: usize,
Expand Down Expand Up @@ -896,6 +909,8 @@ struct JobReport {
update_rationale_available: bool,
#[serde(default)]
temporal_validity_not_encoded: bool,
#[serde(default)]
history_readback_encoded: bool,
retrieval_quality: RetrievalQualityReport,
latency_ms: Option<f64>,
cost: Option<CostReport>,
Expand Down Expand Up @@ -1036,6 +1051,7 @@ struct EvolutionSummary {
conflict_detection_count: usize,
update_rationale_available_count: usize,
temporal_validity_not_encoded_count: usize,
history_readback_encoded_count: usize,
}

#[derive(Clone, Debug, Deserialize, Serialize)]
Expand All @@ -1050,6 +1066,9 @@ struct EvolutionJobReport {
temporal_validity_required: bool,
temporal_validity_encoded: bool,
temporal_validity_not_encoded: bool,
history_readback_encoded: bool,
history_event_types: Vec<String>,
history_requires_note_version_links: bool,
#[serde(skip_serializing_if = "Option::is_none")]
follow_up: Option<String>,
}
Expand Down Expand Up @@ -2265,6 +2284,16 @@ fn evolution_job_report(
let temporal_validity_encoded =
evolution.temporal_validity.as_ref().is_some_and(|temporal| temporal.encoded);
let temporal_validity_not_encoded = temporal_validity_required && !temporal_validity_encoded;
let history_readback_encoded =
evolution.history_readback.as_ref().is_some_and(|history| history.encoded);
let history_event_types = evolution
.history_readback
.as_ref()
.map_or_else(Vec::new, |history| history.required_event_types.clone());
let history_requires_note_version_links = evolution
.history_readback
.as_ref()
.is_some_and(|history| history.requires_note_version_links);
let follow_up = evolution
.temporal_validity
.as_ref()
Expand All @@ -2282,6 +2311,9 @@ fn evolution_job_report(
temporal_validity_required,
temporal_validity_encoded,
temporal_validity_not_encoded,
history_readback_encoded,
history_event_types,
history_requires_note_version_links,
follow_up,
})
}
Expand Down Expand Up @@ -2783,6 +2815,10 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport {
.evolution
.as_ref()
.is_some_and(|report| report.temporal_validity_not_encoded),
history_readback_encoded: scoring
.evolution
.as_ref()
.is_some_and(|report| report.history_readback_encoded),
retrieval_quality,
latency_ms: answer.latency_ms,
cost: answer.cost.clone(),
Expand Down Expand Up @@ -3101,6 +3137,7 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport {
conflict_detection_count: 0,
update_rationale_available_count: 0,
temporal_validity_not_encoded_count: 0,
history_readback_encoded_count: 0,
expected_evidence_recall: None,
irrelevant_context_ratio: None,
trace_explainability_count: 0,
Expand All @@ -3118,6 +3155,8 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport {
suite_jobs.iter().filter(|job| job.update_rationale_available).count();
let temporal_validity_not_encoded_count =
suite_jobs.iter().filter(|job| job.temporal_validity_not_encoded).count();
let history_readback_encoded_count =
suite_jobs.iter().filter(|job| job.history_readback_encoded).count();
let trace_explainability_count =
suite_jobs.iter().filter(|job| job.trace_explainability.is_some()).count();

Expand All @@ -3132,6 +3171,7 @@ fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport {
conflict_detection_count,
update_rationale_available_count,
temporal_validity_not_encoded_count,
history_readback_encoded_count,
expected_evidence_recall: Some(expected_evidence_recall_for_jobs(&suite_jobs)),
irrelevant_context_ratio: Some(irrelevant_context_ratio_for_jobs(&suite_jobs)),
trace_explainability_count,
Expand Down Expand Up @@ -3206,6 +3246,10 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary {
.iter()
.filter(|job| job.temporal_validity_not_encoded)
.count(),
history_readback_encoded_count: jobs
.iter()
.filter(|job| job.history_readback_encoded)
.count(),
expected_evidence_total: jobs
.iter()
.map(|job| job.retrieval_quality.expected_evidence_total)
Expand Down Expand Up @@ -3302,6 +3346,10 @@ fn evolution_summary(jobs: &[JobReport]) -> EvolutionSummary {
.iter()
.filter(|job| job.temporal_validity_not_encoded)
.count(),
history_readback_encoded_count: jobs
.iter()
.filter(|job| job.history_readback_encoded)
.count(),
}
}

Expand Down Expand Up @@ -4028,6 +4076,10 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat
"- Temporal validity not encoded: `{}`\n",
report.summary.temporal_validity_not_encoded_count
));
out.push_str(&format!(
"- History readback encoded: `{}`\n",
report.summary.history_readback_encoded_count
));

render_markdown_quality_summary(out, report);

Expand Down Expand Up @@ -4131,13 +4183,13 @@ fn render_markdown_quality_summary(out: &mut String, report: &RealWorldReport) {
fn render_markdown_suites(out: &mut String, report: &RealWorldReport) {
out.push_str("## Suites\n\n");
out.push_str(
"| Suite | Status | Jobs | Score | Evidence Recall | Irrelevant Context | Trace Explain | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | Unsupported Claims | Wrong Results | Reason |\n",
"| Suite | Status | Jobs | Score | Evidence Recall | Irrelevant Context | Trace Explain | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | History Readback | Unsupported Claims | Wrong Results | Reason |\n",
);
out.push_str("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n");
out.push_str("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n");

for suite in &report.suites {
out.push_str(&format!(
"| {} | `{}` | {} | `{}` | `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} |\n",
"| {} | `{}` | {} | `{}` | `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
md_cell(suite.suite_id.as_str()),
status_str(suite.status),
suite.encoded_job_count,
Expand All @@ -4149,6 +4201,7 @@ fn render_markdown_suites(out: &mut String, report: &RealWorldReport) {
suite.conflict_detection_count,
suite.update_rationale_available_count,
suite.temporal_validity_not_encoded_count,
suite.history_readback_encoded_count,
suite.unsupported_claim_count,
suite.wrong_result_count,
md_cell(suite.reason.as_str())
Expand Down Expand Up @@ -4306,16 +4359,20 @@ fn render_markdown_evolution(out: &mut String, report: &RealWorldReport) {
"- Temporal validity not encoded: `{}`\n\n",
report.evolution.temporal_validity_not_encoded_count
));
out.push_str("| Suite | Job | Current Evidence | Historical Evidence | Stale Traps Used | Conflict Count | Detected | Update Rationale | Temporal Validity | Follow-up |\n");
out.push_str("| --- | --- | --- | --- | --- | ---: | ---: | --- | --- | --- |\n");
out.push_str(&format!(
"- History readback encoded: `{}`\n\n",
report.evolution.history_readback_encoded_count
));
out.push_str("| Suite | Job | Current Evidence | Historical Evidence | Stale Traps Used | Conflict Count | Detected | Update Rationale | Temporal Validity | History Readback | Follow-up |\n");
out.push_str("| --- | --- | --- | --- | --- | ---: | ---: | --- | --- | --- | --- |\n");

for job in &report.jobs {
let Some(evolution) = &job.evolution else {
continue;
};

out.push_str(&format!(
"| {} | {} | `{}` | `{}` | `{}` | {} | {} | `{}` | `{}` | {} |\n",
"| {} | {} | `{}` | `{}` | `{}` | {} | {} | `{}` | `{}` | `{}` | {} |\n",
md_cell(job.suite_id.as_str()),
md_cell(job.job_id.as_str()),
md_inline(evolution.current_evidence.join(", ").as_str()),
Expand All @@ -4325,6 +4382,7 @@ fn render_markdown_evolution(out: &mut String, report: &RealWorldReport) {
evolution.conflict_detection_count,
bool_display(evolution.update_rationale_available),
temporal_display(evolution),
history_display(evolution),
md_cell(evolution.follow_up.as_deref().unwrap_or("-"))
));
}
Expand Down Expand Up @@ -4695,6 +4753,20 @@ fn temporal_display(evolution: &EvolutionJobReport) -> &'static str {
}
}

/// Renders the history-readback cell for one job in the evolution markdown table.
///
/// Returns `-` when history readback is not encoded. Otherwise returns
/// `events=<comma-separated event types>`, followed by `;note_version_links=true`
/// when note-version links are required.
fn history_display(evolution: &EvolutionJobReport) -> String {
    if !evolution.history_readback_encoded {
        return String::from("-");
    }

    let mut rendered = format!("events={}", evolution.history_event_types.join(","));

    // Segments are `;`-separated; the links marker is only emitted when the flag is set.
    if evolution.history_requires_note_version_links {
        rendered.push_str(";note_version_links=true");
    }

    rendered
}

fn cost_display(cost: Option<&CostReport>) -> String {
let Some(cost) = cost else {
return "-".to_string();
Expand Down
Loading