Skip to content

Commit 399b22a

Browse files
committed
eval: include structured results, run config, and summary in JSON output
The eval JSON output now includes:
- Run metadata: name, timestamp, duration, config (agent, judge model, concurrency, evals dir), and aggregate summary
- Per-session eval_result with pass/fail, human-readable successes/failures, error, cost, and output tokens
- Structured checks: size (actual vs expected), tool_calls (F1 score), and relevance (per-criterion pass/fail with judge reasoning)
- Relevance results list ALL criteria with a passed bool, not just failures

The output format changes from a bare session array to a RunOutput wrapper. No changes to the session DB schema.

Assisted-By: docker-agent
1 parent cd4881c commit 399b22a

5 files changed

Lines changed: 263 additions & 17 deletions

File tree

pkg/evaluation/eval.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ func Evaluate(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName st
9090
Name: runName,
9191
Timestamp: startTime,
9292
Duration: duration,
93+
Config: cfg,
9394
Results: results,
9495
Summary: summary,
9596
}

pkg/evaluation/save.go

Lines changed: 99 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -374,11 +374,15 @@ func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
374374
return saveJSON(run, filepath.Join(outputDir, run.Name+".json"))
375375
}
376376

377-
// SaveRunSessionsJSON saves all eval sessions to a single JSON file.
378-
// Each session includes its eval criteria in the "evals" field.
379-
// This complements SaveRunSessions which saves to SQLite, providing a
380-
// human-readable format for inspection.
377+
// SaveRunSessionsJSON saves the full evaluation run output to a JSON file.
378+
// The output includes run metadata (config, summary) and all sessions with
379+
// their eval criteria and scoring results (pass/fail, judge reasoning, errors).
381380
func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
381+
// Populate eval results on each session
382+
for i := range run.Results {
383+
populateEvalResult(&run.Results[i])
384+
}
385+
382386
// Collect all sessions from results
383387
var sessions []*session.Session
384388
for i := range run.Results {
@@ -387,8 +391,98 @@ func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
387391
}
388392
}
389393

394+
output := RunOutput{
395+
Name: run.Name,
396+
Timestamp: run.Timestamp,
397+
Duration: run.Duration.Round(time.Millisecond).String(),
398+
Config: RunOutputConfig{
399+
Agent: run.Config.AgentFilename,
400+
JudgeModel: run.Config.JudgeModel,
401+
Concurrency: run.Config.Concurrency,
402+
EvalsDir: run.Config.EvalsDir,
403+
BaseImage: run.Config.BaseImage,
404+
},
405+
Summary: run.Summary,
406+
Sessions: sessions,
407+
}
408+
390409
outputPath := filepath.Join(outputDir, run.Name+".json")
391-
return saveJSON(sessions, outputPath)
410+
return saveJSON(output, outputPath)
411+
}
412+
413+
// populateEvalResult copies scoring data from a Result to its Session's EvalResult field.
414+
func populateEvalResult(result *Result) {
415+
if result.Session == nil {
416+
return
417+
}
418+
419+
successes, failures := result.checkResults()
420+
421+
evalResult := &session.EvalResult{
422+
Passed: len(failures) == 0,
423+
Successes: successes,
424+
Failures: failures,
425+
Error: result.Error,
426+
Cost: result.Cost,
427+
OutputTokens: result.OutputTokens,
428+
}
429+
430+
// Populate size check if size was evaluated
431+
if result.SizeExpected != "" {
432+
evalResult.Checks.Size = &session.SizeCheck{
433+
Passed: result.Size == result.SizeExpected,
434+
Actual: result.Size,
435+
Expected: result.SizeExpected,
436+
}
437+
}
438+
439+
// Populate tool calls check if tool calls were evaluated
440+
if result.ToolCallsExpected > 0 {
441+
evalResult.Checks.ToolCalls = &session.ToolCallsCheck{
442+
Passed: result.ToolCallsScore >= 1.0,
443+
Score: result.ToolCallsScore,
444+
}
445+
}
446+
447+
// Populate relevance check if relevance was evaluated
448+
if result.RelevanceExpected > 0 {
449+
// Build a map of failed criteria for quick lookup
450+
failedMap := make(map[string]string, len(result.FailedRelevance))
451+
for _, fr := range result.FailedRelevance {
452+
failedMap[fr.Criterion] = fr.Reason
453+
}
454+
455+
// Build results for ALL criteria (passed + failed) from the eval input
456+
var criteria []string
457+
if result.Session.Evals != nil {
458+
criteria = result.Session.Evals.Relevance
459+
}
460+
461+
results := make([]session.RelevanceCriterionResult, 0, len(criteria))
462+
for _, criterion := range criteria {
463+
if reason, failed := failedMap[criterion]; failed {
464+
results = append(results, session.RelevanceCriterionResult{
465+
Criterion: criterion,
466+
Passed: false,
467+
Reason: reason,
468+
})
469+
} else {
470+
results = append(results, session.RelevanceCriterionResult{
471+
Criterion: criterion,
472+
Passed: true,
473+
})
474+
}
475+
}
476+
477+
evalResult.Checks.Relevance = &session.RelevanceCheck{
478+
Passed: result.RelevancePassed >= result.RelevanceExpected,
479+
PassedCount: result.RelevancePassed,
480+
Total: result.RelevanceExpected,
481+
Results: results,
482+
}
483+
}
484+
485+
result.Session.EvalResult = evalResult
392486
}
393487

394488
func Save(sess *session.Session, filename string) (string, error) {

pkg/evaluation/save_test.go

Lines changed: 93 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ func TestSaveRunSessionsJSON(t *testing.T) {
131131
sess1.InputTokens = 100
132132
sess1.OutputTokens = 50
133133
sess1.Cost = 0.01
134+
sess1.Evals = &session.EvalCriteria{
135+
Relevance: []string{"mentions Paris", "mentions France"},
136+
}
134137

135138
sess2 := session.New(
136139
session.WithTitle("eval-json-2"),
@@ -139,23 +142,49 @@ func TestSaveRunSessionsJSON(t *testing.T) {
139142
sess2.InputTokens = 80
140143
sess2.OutputTokens = 30
141144
sess2.Cost = 0.005
145+
sess2.Evals = &session.EvalCriteria{
146+
Relevance: []string{"gives the correct answer", "explains the math"},
147+
}
142148

143149
// Create an eval run with sessions and eval criteria
144150
run := &EvalRun{
145151
Name: "test-json-001",
146152
Timestamp: time.Now(),
153+
Duration: 42 * time.Second,
154+
Config: Config{
155+
AgentFilename: "./test-agent.yaml",
156+
JudgeModel: "anthropic/claude-opus-4-5",
157+
Concurrency: 4,
158+
EvalsDir: "./evals",
159+
},
160+
Summary: Summary{
161+
TotalEvals: 3,
162+
FailedEvals: 1,
163+
TotalCost: 0.015,
164+
},
147165
Results: []Result{
148166
{
149-
Title: "eval-json-1",
150-
Question: "What is the capital of France?",
151-
Response: "Paris is the capital of France.",
152-
Session: sess1,
167+
Title: "eval-json-1",
168+
Question: "What is the capital of France?",
169+
Response: "Paris is the capital of France.",
170+
Cost: 0.01,
171+
OutputTokens: 50,
172+
RelevancePassed: 2,
173+
RelevanceExpected: 2,
174+
Session: sess1,
153175
},
154176
{
155-
Title: "eval-json-2",
156-
Question: "What is 2+2?",
157-
Response: "4",
158-
Session: sess2,
177+
Title: "eval-json-2",
178+
Question: "What is 2+2?",
179+
Response: "4",
180+
Cost: 0.005,
181+
OutputTokens: 30,
182+
RelevancePassed: 1,
183+
RelevanceExpected: 2,
184+
FailedRelevance: []RelevanceResult{
185+
{Criterion: "explains the math", Reason: "no explanation given"},
186+
},
187+
Session: sess2,
159188
},
160189
{
161190
// Result without a session (error case)
@@ -176,16 +205,29 @@ func TestSaveRunSessionsJSON(t *testing.T) {
176205
data, err := os.ReadFile(sessionsPath)
177206
require.NoError(t, err)
178207

179-
var loadedSessions []*session.Session
180-
err = json.Unmarshal(data, &loadedSessions)
208+
var output RunOutput
209+
err = json.Unmarshal(data, &output)
181210
require.NoError(t, err)
182211

212+
// Verify run-level metadata
213+
assert.Equal(t, "test-json-001", output.Name)
214+
assert.Equal(t, "42s", output.Duration)
215+
assert.Equal(t, "./test-agent.yaml", output.Config.Agent)
216+
assert.Equal(t, "anthropic/claude-opus-4-5", output.Config.JudgeModel)
217+
assert.Equal(t, 4, output.Config.Concurrency)
218+
assert.Equal(t, "./evals", output.Config.EvalsDir)
219+
220+
// Verify summary
221+
assert.Equal(t, 3, output.Summary.TotalEvals)
222+
assert.Equal(t, 1, output.Summary.FailedEvals)
223+
assert.InDelta(t, 0.015, output.Summary.TotalCost, 0.0001)
224+
183225
// Should have 2 sessions (excluding the error case)
184-
assert.Len(t, loadedSessions, 2)
226+
assert.Len(t, output.Sessions, 2)
185227

186228
// Verify session content
187229
titles := make(map[string]*session.Session)
188-
for _, sess := range loadedSessions {
230+
for _, sess := range output.Sessions {
189231
titles[sess.Title] = sess
190232
}
191233

@@ -198,10 +240,49 @@ func TestSaveRunSessionsJSON(t *testing.T) {
198240
assert.Equal(t, int64(50), sess1Loaded.OutputTokens)
199241
assert.InDelta(t, 0.01, sess1Loaded.Cost, 0.0001)
200242

243+
// Verify eval results are populated
244+
require.NotNil(t, sess1Loaded.EvalResult)
245+
assert.True(t, sess1Loaded.EvalResult.Passed)
246+
assert.NotEmpty(t, sess1Loaded.EvalResult.Successes)
247+
assert.Empty(t, sess1Loaded.EvalResult.Failures)
248+
assert.InDelta(t, 0.01, sess1Loaded.EvalResult.Cost, 0.0001)
249+
assert.Equal(t, int64(50), sess1Loaded.EvalResult.OutputTokens)
250+
251+
// Verify structured relevance check
252+
require.NotNil(t, sess1Loaded.EvalResult.Checks.Relevance)
253+
assert.True(t, sess1Loaded.EvalResult.Checks.Relevance.Passed)
254+
assert.Equal(t, float64(2), sess1Loaded.EvalResult.Checks.Relevance.PassedCount)
255+
assert.Equal(t, float64(2), sess1Loaded.EvalResult.Checks.Relevance.Total)
256+
257+
// No size or tool calls checks were configured
258+
assert.Nil(t, sess1Loaded.EvalResult.Checks.Size)
259+
assert.Nil(t, sess1Loaded.EvalResult.Checks.ToolCalls)
260+
201261
sess2Loaded := titles["eval-json-2"]
202262
assert.Equal(t, int64(80), sess2Loaded.InputTokens)
203263
assert.Equal(t, int64(30), sess2Loaded.OutputTokens)
204264
assert.InDelta(t, 0.005, sess2Loaded.Cost, 0.0001)
265+
266+
// Verify failed eval result
267+
require.NotNil(t, sess2Loaded.EvalResult)
268+
assert.False(t, sess2Loaded.EvalResult.Passed)
269+
assert.NotEmpty(t, sess2Loaded.EvalResult.Failures)
270+
271+
// Verify structured relevance check with per-criterion results
272+
require.NotNil(t, sess2Loaded.EvalResult.Checks.Relevance)
273+
assert.False(t, sess2Loaded.EvalResult.Checks.Relevance.Passed)
274+
assert.Equal(t, float64(1), sess2Loaded.EvalResult.Checks.Relevance.PassedCount)
275+
assert.Equal(t, float64(2), sess2Loaded.EvalResult.Checks.Relevance.Total)
276+
require.Len(t, sess2Loaded.EvalResult.Checks.Relevance.Results, 2)
277+
278+
// First criterion should be passed (not in failed list)
279+
assert.True(t, sess2Loaded.EvalResult.Checks.Relevance.Results[0].Passed)
280+
assert.Empty(t, sess2Loaded.EvalResult.Checks.Relevance.Results[0].Reason)
281+
282+
// Second criterion should be failed with reason
283+
assert.False(t, sess2Loaded.EvalResult.Checks.Relevance.Results[1].Passed)
284+
assert.Equal(t, "explains the math", sess2Loaded.EvalResult.Checks.Relevance.Results[1].Criterion)
285+
assert.Equal(t, "no explanation given", sess2Loaded.EvalResult.Checks.Relevance.Results[1].Reason)
205286
}
206287

207288
func TestSaveRunSessionsWithCost(t *testing.T) {

pkg/evaluation/types.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,30 @@ type EvalRun struct {
9494
Name string `json:"name"`
9595
Timestamp time.Time `json:"timestamp"`
9696
Duration time.Duration `json:"duration"`
97+
Config Config `json:"-"` // Used to build RunOutput, not serialized directly
9798
Results []Result `json:"results"`
9899
Summary Summary `json:"summary"`
99100
}
100101

102+
// RunOutput is the top-level structure for the evaluation run JSON output.
103+
type RunOutput struct {
104+
Name string `json:"name"`
105+
Timestamp time.Time `json:"timestamp"`
106+
Duration string `json:"duration"`
107+
Config RunOutputConfig `json:"config"`
108+
Summary Summary `json:"summary"`
109+
Sessions []*session.Session `json:"sessions"`
110+
}
111+
112+
// RunOutputConfig is the evaluation run configuration as it appears in the
// JSON output.
type RunOutputConfig struct {
	// Always serialized.
	Agent       string `json:"agent"`
	Concurrency int    `json:"concurrency"`
	EvalsDir    string `json:"evals_dir"`

	// Omitted from the JSON when empty.
	JudgeModel string `json:"judge_model,omitempty"`
	BaseImage  string `json:"base_image,omitempty"`
}
120+
101121
// Config holds configuration for evaluation runs.
102122
type Config struct {
103123
AgentFilename string // Path to the agent configuration file

pkg/session/session.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ type Session struct {
7474
// Evals contains evaluation criteria for this session (used by eval framework)
7575
Evals *EvalCriteria `json:"evals,omitempty"`
7676

77+
// EvalResult contains the evaluation scoring outcome (populated after eval run).
78+
EvalResult *EvalResult `json:"eval_result,omitempty"`
79+
7780
// Messages holds the conversation history (messages and sub-sessions)
7881
Messages []Item `json:"messages"`
7982

@@ -229,6 +232,53 @@ func NewSubSessionItem(subSession *Session) Item {
229232
return Item{SubSession: subSession}
230233
}
231234

235+
// EvalResult contains the evaluation scoring outcome for a session.
236+
type EvalResult struct {
237+
Passed bool `json:"passed"`
238+
Successes []string `json:"successes,omitempty"`
239+
Failures []string `json:"failures,omitempty"`
240+
Error string `json:"error,omitempty"`
241+
Cost float64 `json:"cost"`
242+
OutputTokens int64 `json:"output_tokens"`
243+
Checks EvalResultChecks `json:"checks"`
244+
}
245+
246+
// EvalResultChecks groups the individual check results.
247+
// Only checks that were evaluated will be present (omitted if nil).
248+
type EvalResultChecks struct {
249+
Size *SizeCheck `json:"size,omitempty"`
250+
ToolCalls *ToolCallsCheck `json:"tool_calls,omitempty"`
251+
Relevance *RelevanceCheck `json:"relevance,omitempty"`
252+
}
253+
254+
// SizeCheck is the outcome of the response size check.
type SizeCheck struct {
	Passed   bool   `json:"passed"`
	Actual   string `json:"actual"`   // size observed for the response
	Expected string `json:"expected"` // size the eval asked for
}
260+
261+
// ToolCallsCheck is the outcome of the tool calls check, which is scored
// with an F1 score.
type ToolCallsCheck struct {
	Passed bool    `json:"passed"`
	Score  float64 `json:"score"` // F1 score of the tool calls
}
266+
267+
// RelevanceCheck contains the result of the LLM judge relevance check.
268+
type RelevanceCheck struct {
269+
Passed bool `json:"passed"`
270+
PassedCount float64 `json:"passed_count"`
271+
Total float64 `json:"total"`
272+
Results []RelevanceCriterionResult `json:"results"`
273+
}
274+
275+
// RelevanceCriterionResult is the judge's verdict on one relevance criterion.
type RelevanceCriterionResult struct {
	Criterion string `json:"criterion"`
	Passed    bool   `json:"passed"`
	// Reason is only set for failed criteria.
	Reason string `json:"reason,omitempty"`
}
281+
232282
// EvalCriteria contains the evaluation criteria for a session.
233283
type EvalCriteria struct {
234284
Relevance []string `json:"relevance"` // Statements that should be true about the response

0 commit comments

Comments
 (0)