Skip to content

Commit 7fc9e32

Browse files
authored
Merge pull request #2440 from dgageot/evals-repeat
Add --repeat flag to eval command for running evaluations multiple times
2 parents e5f7cfa + c275d4d commit 7fc9e32

5 files changed

Lines changed: 48 additions & 7 deletions

File tree

cmd/root/eval.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ func newEvalCmd() *cobra.Command {
4545
cmd.Flags().StringVar(&flags.BaseImage, "base-image", "", "Custom base Docker image for running evaluations")
4646
cmd.Flags().BoolVar(&flags.KeepContainers, "keep-containers", false, "Keep containers after evaluation (don't use --rm)")
4747
cmd.Flags().StringSliceVarP(&flags.EnvVars, "env", "e", nil, "Environment variables to pass to container (KEY or KEY=VALUE)")
48+
cmd.Flags().IntVar(&flags.Repeat, "repeat", 1, "Number of times to repeat each evaluation (useful for computing baselines)")
4849

4950
return cmd
5051
}

docs/features/cli/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ $ docker agent eval eval-config.yaml
169169
$ docker agent eval agent.yaml ./evals -c 8 # 8 concurrent evaluations
170170
$ docker agent eval agent.yaml --keep-containers # Keep containers for debugging
171171
$ docker agent eval agent.yaml --only "auth*" # Only run matching evals
172+
$ docker agent eval agent.yaml --repeat 5 # Repeat each eval 5 times
172173
```
173174

174175
### `docker agent alias`

docs/features/evaluation/index.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@ $ docker agent eval agent.yaml -c 8
3333

3434
# Only run evals matching a pattern
3535
$ docker agent eval agent.yaml --only "auth*"
36+
37+
# Repeat each eval 5 times to compute a baseline
38+
$ docker agent eval agent.yaml --repeat 5
39+
40+
# Repeat only the evals matching a pattern 5 times
41+
$ docker agent eval agent.yaml --only "auth*" --repeat 5
3642
```
3743

3844
## Eval Directory Structure
@@ -162,6 +168,7 @@ $ docker agent eval <agent-file>|<registry-ref> [<eval-dir>|./evals]
162168
| `--base-image` | (default) | Custom base Docker image for eval containers |
163169
| `--keep-containers` | `false` | Keep containers after evaluation (don't remove with `--rm`) |
164170
| `-e, --env` | (none) | Environment variables to pass to container (`KEY` or `KEY=VALUE`) |
171+
| `--repeat` | `1` | Total number of times to run each evaluation (useful for computing baselines); values below `1` are treated as `1` |
165172

166173
## Output
167174

pkg/evaluation/eval.go

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,11 @@ func (r *Runner) Run(ctx context.Context, ttyOut, out io.Writer, isTTY bool) ([]
160160
return
161161
}
162162

163-
progress.setRunning(item.eval.Title)
163+
progress.setRunning(item.eval.displayTitle())
164164
result, runErr := r.runSingleEval(ctx, item.eval)
165165
if runErr != nil {
166166
result.Error = runErr.Error()
167-
slog.Error("Evaluation failed", "title", item.eval.Title, "error", runErr)
167+
slog.Error("Evaluation failed", "title", item.eval.displayTitle(), "error", runErr)
168168
}
169169

170170
results[item.index] = result
@@ -227,6 +227,22 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]InputSession, error) {
227227
return cmp.Compare(b.Duration(), a.Duration())
228228
})
229229

230+
// Repeat evals if requested
231+
repeat := max(r.Repeat, 1)
232+
if repeat > 1 {
233+
original := evals
234+
evals = make([]InputSession, 0, len(original)*repeat)
235+
for i := range repeat {
236+
for _, e := range original {
237+
evals = append(evals, InputSession{
238+
Session: e.Session,
239+
SourcePath: e.SourcePath,
240+
RepeatIndex: i + 1,
241+
})
242+
}
243+
}
244+
}
245+
230246
return evals, nil
231247
}
232248

@@ -305,7 +321,9 @@ func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []Inpu
305321

306322
func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Result, error) {
307323
startTime := time.Now()
308-
slog.Debug("Starting evaluation", "title", evalSess.Title)
324+
title := evalSess.displayTitle()
325+
326+
slog.Debug("Starting evaluation", "title", title)
309327

310328
var evals *session.EvalCriteria
311329
if evalSess.Evals != nil {
@@ -318,7 +336,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
318336

319337
result := Result{
320338
InputPath: evalSess.SourcePath,
321-
Title: evalSess.Title,
339+
Title: title,
322340
Question: strings.Join(userMessages, "\n"),
323341
SizeExpected: evals.Size,
324342
RelevanceExpected: float64(len(evals.Relevance)),
@@ -347,9 +365,13 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
347365
result.Size = getResponseSize(result.Response)
348366

349367
// Build session from events for database storage
350-
result.Session = SessionFromEvents(events, evalSess.Title, userMessages)
368+
result.Session = SessionFromEvents(events, title, userMessages)
351369
result.Session.Evals = evals
352370

371+
// Re-apply the display title in case a session_title event overrode it.
372+
// This ensures repeated evals retain their '#N' suffix in stored sessions.
373+
result.Session.Title = title
374+
353375
if len(expectedToolCalls) > 0 || len(actualToolCalls) > 0 {
354376
result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
355377
}
@@ -371,7 +393,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
371393
result.RelevanceResults = results
372394
}
373395

374-
slog.Debug("Evaluation complete", "title", evalSess.Title, "duration", time.Since(startTime))
396+
slog.Debug("Evaluation complete", "title", title, "duration", time.Since(startTime))
375397
return result, nil
376398
}
377399

pkg/evaluation/types.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,16 @@ import (
1111
type InputSession struct {
1212
*session.Session
1313

14-
SourcePath string // Path to the source eval file (not serialized)
14+
SourcePath string // Path to the source eval file (not serialized)
15+
RepeatIndex int // Repeat iteration (1-based); 0 means no repeat
16+
}
17+
18+
// displayTitle returns the title with an optional repeat suffix.
19+
func (s *InputSession) displayTitle() string {
20+
if s.RepeatIndex > 0 {
21+
return fmt.Sprintf("%s #%d", s.Title, s.RepeatIndex)
22+
}
23+
return s.Title
1524
}
1625

1726
// Result contains the evaluation results for a single test case.
@@ -131,6 +140,7 @@ type Config struct {
131140
BaseImage string // Custom base Docker image for running evaluations
132141
KeepContainers bool // If true, don't remove containers after evaluation (skip --rm)
133142
EnvVars []string // Environment variables to pass: KEY (value from env) or KEY=VALUE (explicit)
143+
Repeat int // Number of times to repeat each evaluation (default 1)
134144
}
135145

136146
// Session helper functions

0 commit comments

Comments
 (0)