Skip to content

Commit 429fed8

Browse files
authored
Merge pull request #2309 from docker/eval-structured-json-output
eval: include structured results, run config, and summary in JSON output
2 parents cd4881c + b7b134a commit 429fed8

File tree

8 files changed

+292
-55
lines changed

8 files changed

+292
-55
lines changed

pkg/evaluation/eval.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ func Evaluate(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName st
9090
Name: runName,
9191
Timestamp: startTime,
9292
Duration: duration,
93+
Config: cfg,
9394
Results: results,
9495
Summary: summary,
9596
}
@@ -356,12 +357,18 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
356357
if r.judge != nil && len(evals.Relevance) > 0 {
357358
// Use transcript for relevance checking to preserve temporal ordering
358359
transcript := buildTranscript(events)
359-
passed, failed, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
360+
results, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
360361
if err != nil {
361362
return result, fmt.Errorf("relevance check failed: %w", err)
362363
}
363-
result.RelevancePassed = float64(passed)
364-
result.FailedRelevance = failed
364+
var passed float64
365+
for _, rr := range results {
366+
if rr.Passed {
367+
passed++
368+
}
369+
}
370+
result.RelevancePassed = passed
371+
result.RelevanceResults = results
365372
}
366373

367374
slog.Debug("Evaluation complete", "title", evalSess.Title, "duration", time.Since(startTime))

pkg/evaluation/eval_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ func TestResultCheckResults(t *testing.T) {
196196
},
197197
{
198198
name: "relevance failures listed",
199-
result: Result{RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}},
199+
result: Result{RelevanceExpected: 2, RelevancePassed: 0, RelevanceResults: []RelevanceResult{{Criterion: "check A", Passed: false, Reason: "reason A"}, {Criterion: "check B", Passed: false, Reason: "reason B"}}},
200200
wantSuccess: nil,
201201
wantFailures: []string{"relevance: check A (reason: reason A)", "relevance: check B (reason: reason B)"},
202202
},
@@ -658,7 +658,7 @@ func TestProgressBarPrintResult(t *testing.T) {
658658
Size: "S",
659659
RelevanceExpected: 2,
660660
RelevancePassed: 1,
661-
FailedRelevance: []RelevanceResult{{Criterion: "check failed", Reason: "did not meet criteria"}},
661+
RelevanceResults: []RelevanceResult{{Criterion: "check failed", Passed: false, Reason: "did not meet criteria"}},
662662
},
663663
wantContains: []string{
664664
"✗ mixed-session", // overall failed

pkg/evaluation/judge.go

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -97,17 +97,18 @@ func (j *Judge) Validate(ctx context.Context) error {
9797
// RelevanceResult contains the result of a single relevance check.
// It is serialized to JSON under the keys "criterion", "passed", and "reason".
9898
type RelevanceResult struct {
9999
// Criterion is the relevance criterion that was checked.
Criterion string `json:"criterion"`
100+
// Passed reports whether the check passed for this criterion.
Passed bool `json:"passed"`
100101
// Reason is the explanation returned alongside the check's verdict.
Reason string `json:"reason"`
101102
}
102103

103104
// CheckRelevance runs all relevance checks concurrently with the configured concurrency.
104-
// It returns the number of passed checks, a slice of failed results with reasons, and an error
105-
// if any check encountered an error (e.g. judge model misconfiguration). Errors cause a hard
106-
// failure so that configuration issues are surfaced immediately rather than silently producing
107-
// zero-relevance results.
108-
func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (passed int, failed []RelevanceResult, err error) {
105+
// It returns a result for every criterion (both passed and failed, each with a reason from
106+
// the judge model), and an error if any check encountered an error (e.g. judge model
107+
// misconfiguration). Errors cause a hard failure so that configuration issues are surfaced
108+
// immediately rather than silently producing zero-relevance results.
109+
func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (results []RelevanceResult, err error) {
109110
if len(criteria) == 0 {
110-
return 0, nil, nil
111+
return nil, nil
111112
}
112113

113114
// Create work channel
@@ -122,23 +123,23 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
122123
close(work)
123124

124125
// Results slice preserves order
125-
type result struct {
126+
type rawResult struct {
126127
passed bool
127128
reason string
128129
err error
129130
}
130-
results := make([]result, len(criteria))
131+
rawResults := make([]rawResult, len(criteria))
131132

132133
var wg sync.WaitGroup
133134
for range j.concurrency {
134135
wg.Go(func() {
135136
for item := range work {
136137
if ctx.Err() != nil {
137-
results[item.index] = result{err: fmt.Errorf("context cancelled: %w", ctx.Err())}
138+
rawResults[item.index] = rawResult{err: fmt.Errorf("context cancelled: %w", ctx.Err())}
138139
continue
139140
}
140141
pass, reason, checkErr := j.checkSingle(ctx, response, item.criterion)
141-
results[item.index] = result{passed: pass, reason: reason, err: checkErr}
142+
rawResults[item.index] = rawResult{passed: pass, reason: reason, err: checkErr}
142143
}
143144
})
144145
}
@@ -147,26 +148,24 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
147148
// Aggregate results. Any error is fatal: every per-criterion error is collected
148149
// and returned joined together so the caller can fail fast on judge misconfiguration.
149150
var errs []error
150-
for i, r := range results {
151+
results = make([]RelevanceResult, len(criteria))
152+
for i := range results {
153+
results[i].Criterion = criteria[i]
154+
}
155+
for i, r := range rawResults {
151156
if r.err != nil {
152157
errs = append(errs, fmt.Errorf("checking %q: %w", criteria[i], r.err))
153158
continue
154159
}
155-
if r.passed {
156-
passed++
157-
} else {
158-
failed = append(failed, RelevanceResult{
159-
Criterion: criteria[i],
160-
Reason: r.reason,
161-
})
162-
}
160+
results[i].Passed = r.passed
161+
results[i].Reason = r.reason
163162
}
164163

165164
if len(errs) > 0 {
166-
return passed, failed, errors.Join(errs...)
165+
return results, errors.Join(errs...)
167166
}
168167

169-
return passed, failed, nil
168+
return results, nil
170169
}
171170

172171
// checkSingle checks a single relevance criterion against the response.

pkg/evaluation/judge_test.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,9 @@ func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) {
4747
t.Parallel()
4848

4949
judge := NewJudge(nil, 1)
50-
passed, failed, err := judge.CheckRelevance(t.Context(), "some response", nil)
50+
results, err := judge.CheckRelevance(t.Context(), "some response", nil)
5151

52-
assert.Equal(t, 0, passed)
53-
assert.Empty(t, failed)
52+
assert.Empty(t, results)
5453
assert.NoError(t, err)
5554
}
5655

@@ -63,11 +62,10 @@ func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) {
6362
cancel() // Cancel immediately
6463

6564
criteria := []string{"criterion1", "criterion2", "criterion3"}
66-
passed, failed, err := judge.CheckRelevance(ctx, "some response", criteria)
65+
results, err := judge.CheckRelevance(ctx, "some response", criteria)
6766

6867
// All should have errors due to context cancellation
69-
assert.Equal(t, 0, passed)
70-
assert.Empty(t, failed)
68+
assert.Len(t, results, len(criteria))
7169
require.Error(t, err)
7270
assert.Contains(t, err.Error(), "context cancelled")
7371
}

pkg/evaluation/save.go

Lines changed: 80 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -374,11 +374,15 @@ func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
374374
return saveJSON(run, filepath.Join(outputDir, run.Name+".json"))
375375
}
376376

377-
// SaveRunSessionsJSON saves all eval sessions to a single JSON file.
378-
// Each session includes its eval criteria in the "evals" field.
379-
// This complements SaveRunSessions which saves to SQLite, providing a
380-
// human-readable format for inspection.
377+
// SaveRunSessionsJSON saves the full evaluation run output to a JSON file.
378+
// The output includes run metadata (config, summary) and all sessions with
379+
// their eval criteria and scoring results (pass/fail, judge reasoning, errors).
381380
func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
381+
// Populate eval results on each session
382+
for i := range run.Results {
383+
populateEvalResult(&run.Results[i])
384+
}
385+
382386
// Collect all sessions from results
383387
var sessions []*session.Session
384388
for i := range run.Results {
@@ -387,8 +391,79 @@ func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
387391
}
388392
}
389393

394+
output := RunOutput{
395+
Name: run.Name,
396+
Timestamp: run.Timestamp,
397+
Duration: run.Duration.Round(time.Millisecond).String(),
398+
Config: RunOutputConfig{
399+
Agent: run.Config.AgentFilename,
400+
JudgeModel: run.Config.JudgeModel,
401+
Concurrency: run.Config.Concurrency,
402+
EvalsDir: run.Config.EvalsDir,
403+
BaseImage: run.Config.BaseImage,
404+
},
405+
Summary: run.Summary,
406+
Sessions: sessions,
407+
}
408+
390409
outputPath := filepath.Join(outputDir, run.Name+".json")
391-
return saveJSON(sessions, outputPath)
410+
return saveJSON(output, outputPath)
411+
}
412+
413+
// populateEvalResult copies scoring data from a Result to its Session's EvalResult field.
// It is a no-op when the Result has no Session. The overall Passed flag is true only
// when result.checkResults() reports no failures. Per-check details (size, tool calls,
// relevance) are attached only for checks that had an expectation set, and any
// EvalResult already stored on the session is replaced.
414+
func populateEvalResult(result *Result) {
415+
// Without a session there is nothing to attach the scores to.
if result.Session == nil {
416+
return
417+
}
418+
419+
successes, failures := result.checkResults()
420+
421+
evalResult := &session.EvalResult{
422+
Passed: len(failures) == 0,
423+
Successes: successes,
424+
Failures: failures,
425+
Error: result.Error,
426+
Cost: result.Cost,
427+
OutputTokens: result.OutputTokens,
428+
}
429+
430+
// Populate size check if size was evaluated
431+
if result.SizeExpected != "" {
432+
evalResult.Checks.Size = &session.SizeCheck{
433+
Passed: result.Size == result.SizeExpected,
434+
Actual: result.Size,
435+
Expected: result.SizeExpected,
436+
}
437+
}
438+
439+
// Populate tool calls check if tool calls were evaluated
440+
if result.ToolCallsExpected > 0 {
441+
evalResult.Checks.ToolCalls = &session.ToolCallsCheck{
442+
// A score below 1.0 counts as a failed tool-calls check.
Passed: result.ToolCallsScore >= 1.0,
443+
Score: result.ToolCallsScore,
444+
}
445+
}
446+
447+
// Populate relevance check if relevance was evaluated
448+
if result.RelevanceExpected > 0 {
449+
// Copy every per-criterion result, passed and failed alike.
results := make([]session.RelevanceCriterionResult, 0, len(result.RelevanceResults))
450+
for _, rr := range result.RelevanceResults {
451+
results = append(results, session.RelevanceCriterionResult{
452+
Criterion: rr.Criterion,
453+
Passed: rr.Passed,
454+
Reason: rr.Reason,
455+
})
456+
}
457+
458+
evalResult.Checks.Relevance = &session.RelevanceCheck{
459+
// The check passes only when at least as many criteria passed as were expected.
Passed: result.RelevancePassed >= result.RelevanceExpected,
460+
PassedCount: result.RelevancePassed,
461+
Total: result.RelevanceExpected,
462+
Results: results,
463+
}
464+
}
465+
466+
result.Session.EvalResult = evalResult
392467
}
393468

394469
func Save(sess *session.Session, filename string) (string, error) {

0 commit comments

Comments
 (0)