@@ -160,11 +160,11 @@ func (r *Runner) Run(ctx context.Context, ttyOut, out io.Writer, isTTY bool) ([]
160160 return
161161 }
162162
163- progress .setRunning (item .eval .Title )
163+ progress .setRunning (item .eval .displayTitle () )
164164 result , runErr := r .runSingleEval (ctx , item .eval )
165165 if runErr != nil {
166166 result .Error = runErr .Error ()
167- slog .Error ("Evaluation failed" , "title" , item .eval .Title , "error" , runErr )
167+ slog .Error ("Evaluation failed" , "title" , item .eval .displayTitle () , "error" , runErr )
168168 }
169169
170170 results [item .index ] = result
@@ -227,6 +227,22 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]InputSession, error) {
227227 return cmp .Compare (b .Duration (), a .Duration ())
228228 })
229229
230+ // Repeat evals if requested: emit the full list once per round, tagging each copy with a 1-based RepeatIndex.
231+ repeat := max (r .Repeat , 1 )
232+ if repeat > 1 {
233+ original := evals
234+ evals = make ([]InputSession , 0 , len (original )* repeat )
235+ for i := range repeat {
236+ for _ , e := range original {
237+ evals = append (evals , InputSession {
238+ Session : e .Session ,
239+ SourcePath : e .SourcePath ,
240+ RepeatIndex : i + 1 ,
241+ })
242+ }
243+ }
244+ }
245+
230246 return evals , nil
231247}
232248
@@ -305,7 +321,9 @@ func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []Inpu
305321
306322func (r * Runner ) runSingleEval (ctx context.Context , evalSess * InputSession ) (Result , error ) {
307323 startTime := time .Now ()
308- slog .Debug ("Starting evaluation" , "title" , evalSess .Title )
324+ title := evalSess .displayTitle ()
325+
326+ slog .Debug ("Starting evaluation" , "title" , title )
309327
310328 var evals * session.EvalCriteria
311329 if evalSess .Evals != nil {
@@ -318,7 +336,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
318336
319337 result := Result {
320338 InputPath : evalSess .SourcePath ,
321- Title : evalSess . Title ,
339+ Title : title ,
322340 Question : strings .Join (userMessages , "\n " ),
323341 SizeExpected : evals .Size ,
324342 RelevanceExpected : float64 (len (evals .Relevance )),
@@ -347,9 +365,13 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
347365 result .Size = getResponseSize (result .Response )
348366
349367 // Build session from events for database storage
350- result .Session = SessionFromEvents (events , evalSess . Title , userMessages )
368+ result .Session = SessionFromEvents (events , title , userMessages )
351369 result .Session .Evals = evals
352370
371+ // Re-apply the display title in case a session_title event overrode it.
372+ // This ensures repeated evals retain their '#N' suffix in stored sessions.
373+ result .Session .Title = title
374+
353375 if len (expectedToolCalls ) > 0 || len (actualToolCalls ) > 0 {
354376 result .ToolCallsScore = toolCallF1Score (expectedToolCalls , actualToolCalls )
355377 }
@@ -371,7 +393,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
371393 result .RelevanceResults = results
372394 }
373395
374- slog .Debug ("Evaluation complete" , "title" , evalSess . Title , "duration" , time .Since (startTime ))
396+ slog .Debug ("Evaluation complete" , "title" , title , "duration" , time .Since (startTime ))
375397 return result , nil
376398}
377399
0 commit comments