Add --repeat flag to eval command for running evaluations multiple times

dgageot · dgageot · commit c275d4dc79c4 · 2026-04-15T15:31:16.000+02:00
This is useful for computing baselines by running the same eval(s) N times. Each repeated eval gets a '#N' suffix in its title (e.g. 'my-eval #3').

Assisted-By: docker-agent
diff --git a/cmd/root/eval.go b/cmd/root/eval.go
@@ -45,6 +45,7 @@ func newEvalCmd() *cobra.Command {
 	cmd.Flags().StringVar(&flags.BaseImage, "base-image", "", "Custom base Docker image for running evaluations")
 	cmd.Flags().BoolVar(&flags.KeepContainers, "keep-containers", false, "Keep containers after evaluation (don't use --rm)")
 	cmd.Flags().StringSliceVarP(&flags.EnvVars, "env", "e", nil, "Environment variables to pass to container (KEY or KEY=VALUE)")
+	cmd.Flags().IntVar(&flags.Repeat, "repeat", 1, "Number of times to repeat each evaluation (useful for computing baselines)")
 
 	return cmd
 }
diff --git a/docs/features/cli/index.md b/docs/features/cli/index.md
@@ -169,6 +169,7 @@ $ docker agent eval eval-config.yaml
 $ docker agent eval agent.yaml ./evals -c 8              # 8 concurrent evaluations
 $ docker agent eval agent.yaml --keep-containers         # Keep containers for debugging
 $ docker agent eval agent.yaml --only "auth*"            # Only run matching evals
+$ docker agent eval agent.yaml --repeat 5                # Repeat each eval 5 times
 ```
 
 ### `docker agent alias`
diff --git a/docs/features/evaluation/index.md b/docs/features/evaluation/index.md
@@ -33,6 +33,12 @@ $ docker agent eval agent.yaml -c 8
 
 # Only run evals matching a pattern
 $ docker agent eval agent.yaml --only "auth*"
+
+# Repeat each eval 5 times to compute a baseline
+$ docker agent eval agent.yaml --repeat 5
+
+# Repeat a specific eval 5 times
+$ docker agent eval agent.yaml --only "auth*" --repeat 5
 ```
 
 ## Eval Directory Structure
@@ -162,6 +168,7 @@ $ docker agent eval <agent-file>|<registry-ref> [<eval-dir>|./evals]
 | `--base-image`      | (default)                   | Custom base Docker image for eval containers                      |
 | `--keep-containers` | `false`                     | Keep containers after evaluation (don't remove with `--rm`)       |
 | `-e, --env`         | (none)                      | Environment variables to pass to container (`KEY` or `KEY=VALUE`) |
+| `--repeat`          | `1`                         | Number of times to repeat each evaluation (useful for computing baselines) |
 
 ## Output
 
diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
@@ -160,11 +160,11 @@ func (r *Runner) Run(ctx context.Context, ttyOut, out io.Writer, isTTY bool) ([]
 					return
 				}
 
-				progress.setRunning(item.eval.Title)
+				progress.setRunning(item.eval.displayTitle())
 				result, runErr := r.runSingleEval(ctx, item.eval)
 				if runErr != nil {
 					result.Error = runErr.Error()
-					slog.Error("Evaluation failed", "title", item.eval.Title, "error", runErr)
+					slog.Error("Evaluation failed", "title", item.eval.displayTitle(), "error", runErr)
 				}
 
 				results[item.index] = result
@@ -227,6 +227,22 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]InputSession, error) {
 		return cmp.Compare(b.Duration(), a.Duration())
 	})
 
+	// Repeat evals if requested
+	repeat := max(r.Repeat, 1)
+	if repeat > 1 {
+		original := evals
+		evals = make([]InputSession, 0, len(original)*repeat)
+		for i := range repeat {
+			for _, e := range original {
+				evals = append(evals, InputSession{
+					Session:     e.Session,
+					SourcePath:  e.SourcePath,
+					RepeatIndex: i + 1,
+				})
+			}
+		}
+	}
+
 	return evals, nil
 }
 
@@ -305,7 +321,9 @@ func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []Inpu
 
 func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Result, error) {
 	startTime := time.Now()
-	slog.Debug("Starting evaluation", "title", evalSess.Title)
+	title := evalSess.displayTitle()
+
+	slog.Debug("Starting evaluation", "title", title)
 
 	var evals *session.EvalCriteria
 	if evalSess.Evals != nil {
@@ -318,7 +336,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
 
 	result := Result{
 		InputPath:         evalSess.SourcePath,
-		Title:             evalSess.Title,
+		Title:             title,
 		Question:          strings.Join(userMessages, "\n"),
 		SizeExpected:      evals.Size,
 		RelevanceExpected: float64(len(evals.Relevance)),
@@ -347,9 +365,13 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
 	result.Size = getResponseSize(result.Response)
 
 	// Build session from events for database storage
-	result.Session = SessionFromEvents(events, evalSess.Title, userMessages)
+	result.Session = SessionFromEvents(events, title, userMessages)
 	result.Session.Evals = evals
 
+	// Re-apply the display title in case a session_title event overrode it.
+	// This ensures repeated evals retain their '#N' suffix in stored sessions.
+	result.Session.Title = title
+
 	if len(expectedToolCalls) > 0 || len(actualToolCalls) > 0 {
 		result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
 	}
@@ -371,7 +393,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
 		result.RelevanceResults = results
 	}
 
-	slog.Debug("Evaluation complete", "title", evalSess.Title, "duration", time.Since(startTime))
+	slog.Debug("Evaluation complete", "title", title, "duration", time.Since(startTime))
 	return result, nil
 }
 
diff --git a/pkg/evaluation/types.go b/pkg/evaluation/types.go
@@ -11,7 +11,16 @@ import (
 type InputSession struct {
 	*session.Session
 
-	SourcePath string // Path to the source eval file (not serialized)
+	SourcePath  string // Path to the source eval file (not serialized)
+	RepeatIndex int    // Repeat iteration (1-based); 0 means no repeat
+}
+
+// displayTitle returns the title with an optional repeat suffix.
+func (s *InputSession) displayTitle() string {
+	if s.RepeatIndex > 0 {
+		return fmt.Sprintf("%s #%d", s.Title, s.RepeatIndex)
+	}
+	return s.Title
 }
 
 // Result contains the evaluation results for a single test case.
@@ -131,6 +140,7 @@ type Config struct {
 	BaseImage      string   // Custom base Docker image for running evaluations
 	KeepContainers bool     // If true, don't remove containers after evaluation (skip --rm)
 	EnvVars        []string // Environment variables to pass: KEY (value from env) or KEY=VALUE (explicit)
+	Repeat         int      // Number of times to repeat each evaluation (default 1)
 }
 
 // Session helper functions

Original file line number	Diff line number	Diff line change
`@@ -45,6 +45,7 @@ func newEvalCmd() *cobra.Command {`
`45`	`45`	`cmd.Flags().StringVar(&flags.BaseImage, "base-image", "", "Custom base Docker image for running evaluations")`
`46`	`46`	`cmd.Flags().BoolVar(&flags.KeepContainers, "keep-containers", false, "Keep containers after evaluation (don't use --rm)")`
`47`	`47`	`cmd.Flags().StringSliceVarP(&flags.EnvVars, "env", "e", nil, "Environment variables to pass to container (KEY or KEY=VALUE)")`
	`48`	`+ cmd.Flags().IntVar(&flags.Repeat, "repeat", 1, "Number of times to repeat each evaluation (useful for computing baselines)")`
`48`	`49`
`49`	`50`	`return cmd`
`50`	`51`	`}`