Skip to content

Commit 15d0327

Browse files
committed
eval: include judge reasons for passed relevance criteria
The LLM judge already generates a reason for every criterion (pass and fail), but CheckRelevance was discarding reasons for passed ones.

Changes:
- RelevanceResult now includes a Passed bool field
- CheckRelevance returns []RelevanceResult (all criteria) instead of (passed int, failed []RelevanceResult)
- populateEvalResult copies reasons for all criteria into the JSON output
- The JSON output now shows reason on every criterion, not just failures

Assisted-By: docker-agent
1 parent 399b22a commit 15d0327

8 files changed

Lines changed: 62 additions & 71 deletions

File tree

pkg/evaluation/eval.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -357,12 +357,18 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
357357
if r.judge != nil && len(evals.Relevance) > 0 {
358358
// Use transcript for relevance checking to preserve temporal ordering
359359
transcript := buildTranscript(events)
360-
passed, failed, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
360+
results, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
361361
if err != nil {
362362
return result, fmt.Errorf("relevance check failed: %w", err)
363363
}
364-
result.RelevancePassed = float64(passed)
365-
result.FailedRelevance = failed
364+
var passed float64
365+
for _, rr := range results {
366+
if rr.Passed {
367+
passed++
368+
}
369+
}
370+
result.RelevancePassed = passed
371+
result.RelevanceResults = results
366372
}
367373

368374
slog.Debug("Evaluation complete", "title", evalSess.Title, "duration", time.Since(startTime))

pkg/evaluation/eval_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ func TestResultCheckResults(t *testing.T) {
196196
},
197197
{
198198
name: "relevance failures listed",
199-
result: Result{RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}},
199+
result: Result{RelevanceExpected: 2, RelevancePassed: 0, RelevanceResults: []RelevanceResult{{Criterion: "check A", Passed: false, Reason: "reason A"}, {Criterion: "check B", Passed: false, Reason: "reason B"}}},
200200
wantSuccess: nil,
201201
wantFailures: []string{"relevance: check A (reason: reason A)", "relevance: check B (reason: reason B)"},
202202
},
@@ -658,7 +658,7 @@ func TestProgressBarPrintResult(t *testing.T) {
658658
Size: "S",
659659
RelevanceExpected: 2,
660660
RelevancePassed: 1,
661-
FailedRelevance: []RelevanceResult{{Criterion: "check failed", Reason: "did not meet criteria"}},
661+
RelevanceResults: []RelevanceResult{{Criterion: "check failed", Passed: false, Reason: "did not meet criteria"}},
662662
},
663663
wantContains: []string{
664664
"✗ mixed-session", // overall failed

pkg/evaluation/judge.go

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -97,17 +97,18 @@ func (j *Judge) Validate(ctx context.Context) error {
9797
// RelevanceResult contains the result of a single relevance check.
9898
type RelevanceResult struct {
9999
Criterion string `json:"criterion"`
100+
Passed bool `json:"passed"`
100101
Reason string `json:"reason"`
101102
}
102103

103104
// CheckRelevance runs all relevance checks concurrently with the configured concurrency.
104-
// It returns the number of passed checks, a slice of failed results with reasons, and an error
105-
// if any check encountered an error (e.g. judge model misconfiguration). Errors cause a hard
106-
// failure so that configuration issues are surfaced immediately rather than silently producing
107-
// zero-relevance results.
108-
func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (passed int, failed []RelevanceResult, err error) {
105+
// It returns a result for every criterion (both passed and failed, each with a reason from
106+
// the judge model), and an error if any check encountered an error (e.g. judge model
107+
// misconfiguration). Errors cause a hard failure so that configuration issues are surfaced
108+
// immediately rather than silently producing zero-relevance results.
109+
func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (results []RelevanceResult, err error) {
109110
if len(criteria) == 0 {
110-
return 0, nil, nil
111+
return nil, nil
111112
}
112113

113114
// Create work channel
@@ -122,23 +123,23 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
122123
close(work)
123124

124125
// Results slice preserves order
125-
type result struct {
126+
type rawResult struct {
126127
passed bool
127128
reason string
128129
err error
129130
}
130-
results := make([]result, len(criteria))
131+
rawResults := make([]rawResult, len(criteria))
131132

132133
var wg sync.WaitGroup
133134
for range j.concurrency {
134135
wg.Go(func() {
135136
for item := range work {
136137
if ctx.Err() != nil {
137-
results[item.index] = result{err: fmt.Errorf("context cancelled: %w", ctx.Err())}
138+
rawResults[item.index] = rawResult{err: fmt.Errorf("context cancelled: %w", ctx.Err())}
138139
continue
139140
}
140141
pass, reason, checkErr := j.checkSingle(ctx, response, item.criterion)
141-
results[item.index] = result{passed: pass, reason: reason, err: checkErr}
142+
rawResults[item.index] = rawResult{passed: pass, reason: reason, err: checkErr}
142143
}
143144
})
144145
}
@@ -147,26 +148,24 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
147148
// Aggregate results. Any error is fatal — return it immediately so the
148149
// caller can fail fast on judge misconfiguration.
149150
var errs []error
150-
for i, r := range results {
151+
results = make([]RelevanceResult, len(criteria))
152+
for i, r := range rawResults {
151153
if r.err != nil {
152154
errs = append(errs, fmt.Errorf("checking %q: %w", criteria[i], r.err))
153155
continue
154156
}
155-
if r.passed {
156-
passed++
157-
} else {
158-
failed = append(failed, RelevanceResult{
159-
Criterion: criteria[i],
160-
Reason: r.reason,
161-
})
157+
results[i] = RelevanceResult{
158+
Criterion: criteria[i],
159+
Passed: r.passed,
160+
Reason: r.reason,
162161
}
163162
}
164163

165164
if len(errs) > 0 {
166-
return passed, failed, errors.Join(errs...)
165+
return results, errors.Join(errs...)
167166
}
168167

169-
return passed, failed, nil
168+
return results, nil
170169
}
171170

172171
// checkSingle checks a single relevance criterion against the response.

pkg/evaluation/judge_test.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,9 @@ func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) {
4747
t.Parallel()
4848

4949
judge := NewJudge(nil, 1)
50-
passed, failed, err := judge.CheckRelevance(t.Context(), "some response", nil)
50+
results, err := judge.CheckRelevance(t.Context(), "some response", nil)
5151

52-
assert.Equal(t, 0, passed)
53-
assert.Empty(t, failed)
52+
assert.Empty(t, results)
5453
assert.NoError(t, err)
5554
}
5655

@@ -63,11 +62,10 @@ func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) {
6362
cancel() // Cancel immediately
6463

6564
criteria := []string{"criterion1", "criterion2", "criterion3"}
66-
passed, failed, err := judge.CheckRelevance(ctx, "some response", criteria)
65+
results, err := judge.CheckRelevance(ctx, "some response", criteria)
6766

6867
// All should have errors due to context cancellation
69-
assert.Equal(t, 0, passed)
70-
assert.Empty(t, failed)
68+
assert.Len(t, results, len(criteria))
7169
require.Error(t, err)
7270
assert.Contains(t, err.Error(), "context cancelled")
7371
}

pkg/evaluation/save.go

Lines changed: 7 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -446,32 +446,13 @@ func populateEvalResult(result *Result) {
446446

447447
// Populate relevance check if relevance was evaluated
448448
if result.RelevanceExpected > 0 {
449-
// Build a map of failed criteria for quick lookup
450-
failedMap := make(map[string]string, len(result.FailedRelevance))
451-
for _, fr := range result.FailedRelevance {
452-
failedMap[fr.Criterion] = fr.Reason
453-
}
454-
455-
// Build results for ALL criteria (passed + failed) from the eval input
456-
var criteria []string
457-
if result.Session.Evals != nil {
458-
criteria = result.Session.Evals.Relevance
459-
}
460-
461-
results := make([]session.RelevanceCriterionResult, 0, len(criteria))
462-
for _, criterion := range criteria {
463-
if reason, failed := failedMap[criterion]; failed {
464-
results = append(results, session.RelevanceCriterionResult{
465-
Criterion: criterion,
466-
Passed: false,
467-
Reason: reason,
468-
})
469-
} else {
470-
results = append(results, session.RelevanceCriterionResult{
471-
Criterion: criterion,
472-
Passed: true,
473-
})
474-
}
449+
results := make([]session.RelevanceCriterionResult, 0, len(result.RelevanceResults))
450+
for _, rr := range result.RelevanceResults {
451+
results = append(results, session.RelevanceCriterionResult{
452+
Criterion: rr.Criterion,
453+
Passed: rr.Passed,
454+
Reason: rr.Reason,
455+
})
475456
}
476457

477458
evalResult.Checks.Relevance = &session.RelevanceCheck{

pkg/evaluation/save_test.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,11 @@ func TestSaveRunSessionsJSON(t *testing.T) {
171171
OutputTokens: 50,
172172
RelevancePassed: 2,
173173
RelevanceExpected: 2,
174-
Session: sess1,
174+
RelevanceResults: []RelevanceResult{
175+
{Criterion: "mentions Paris", Passed: true, Reason: "response includes Paris"},
176+
{Criterion: "mentions France", Passed: true, Reason: "response includes France"},
177+
},
178+
Session: sess1,
175179
},
176180
{
177181
Title: "eval-json-2",
@@ -181,8 +185,9 @@ func TestSaveRunSessionsJSON(t *testing.T) {
181185
OutputTokens: 30,
182186
RelevancePassed: 1,
183187
RelevanceExpected: 2,
184-
FailedRelevance: []RelevanceResult{
185-
{Criterion: "explains the math", Reason: "no explanation given"},
188+
RelevanceResults: []RelevanceResult{
189+
{Criterion: "gives the correct answer", Passed: true, Reason: "the response says 4"},
190+
{Criterion: "explains the math", Passed: false, Reason: "no explanation given"},
186191
},
187192
Session: sess2,
188193
},
@@ -275,9 +280,9 @@ func TestSaveRunSessionsJSON(t *testing.T) {
275280
assert.Equal(t, float64(2), sess2Loaded.EvalResult.Checks.Relevance.Total)
276281
require.Len(t, sess2Loaded.EvalResult.Checks.Relevance.Results, 2)
277282

278-
// First criterion should be passed (not in failed list)
283+
// First criterion should be passed with reason
279284
assert.True(t, sess2Loaded.EvalResult.Checks.Relevance.Results[0].Passed)
280-
assert.Empty(t, sess2Loaded.EvalResult.Checks.Relevance.Results[0].Reason)
285+
assert.Equal(t, "the response says 4", sess2Loaded.EvalResult.Checks.Relevance.Results[0].Reason)
281286

282287
// Second criterion should be failed with reason
283288
assert.False(t, sess2Loaded.EvalResult.Checks.Relevance.Results[1].Passed)

pkg/evaluation/types.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ type Result struct {
2626
SizeExpected string `json:"size_expected"`
2727
ToolCallsScore float64 `json:"tool_calls_score"`
2828
ToolCallsExpected float64 `json:"tool_calls_score_expected"`
29-
RelevancePassed float64 `json:"relevance"`
30-
RelevanceExpected float64 `json:"relevance_expected"`
31-
FailedRelevance []RelevanceResult `json:"failed_relevance,omitempty"`
29+
RelevancePassed float64 `json:"relevance"`
30+
RelevanceExpected float64 `json:"relevance_expected"`
31+
RelevanceResults []RelevanceResult `json:"relevance_results,omitempty"`
3232
Error string `json:"error,omitempty"`
3333
RawOutput []map[string]any `json:"raw_output,omitempty"`
3434
Session *session.Session `json:"-"` // Full session for database storage (not in JSON)
@@ -63,11 +63,13 @@ func (r *Result) checkResults() (successes, failures []string) {
6363
if r.RelevancePassed >= r.RelevanceExpected {
6464
successes = append(successes, fmt.Sprintf("relevance %.0f/%.0f", r.RelevancePassed, r.RelevanceExpected))
6565
} else {
66-
for _, result := range r.FailedRelevance {
67-
if result.Reason != "" {
68-
failures = append(failures, fmt.Sprintf("relevance: %s (reason: %s)", result.Criterion, result.Reason))
69-
} else {
70-
failures = append(failures, "relevance: "+result.Criterion)
66+
for _, result := range r.RelevanceResults {
67+
if !result.Passed {
68+
if result.Reason != "" {
69+
failures = append(failures, fmt.Sprintf("relevance: %s (reason: %s)", result.Criterion, result.Reason))
70+
} else {
71+
failures = append(failures, "relevance: "+result.Criterion)
72+
}
7173
}
7274
}
7375
}

pkg/session/session.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ type RelevanceCheck struct {
276276
type RelevanceCriterionResult struct {
277277
Criterion string `json:"criterion"`
278278
Passed bool `json:"passed"`
279-
Reason string `json:"reason,omitempty"` // Only set for failed criteria
279+
Reason string `json:"reason,omitempty"`
280280
}
281281

282282
// EvalCriteria contains the evaluation criteria for a session.

0 commit comments

Comments
 (0)