@@ -374,11 +374,15 @@ func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
374374 return saveJSON (run , filepath .Join (outputDir , run .Name + ".json" ))
375375}
376376
377- // SaveRunSessionsJSON saves all eval sessions to a single JSON file.
378- // Each session includes its eval criteria in the "evals" field.
379- // This complements SaveRunSessions which saves to SQLite, providing a
380- // human-readable format for inspection.
377+ // SaveRunSessionsJSON saves the full evaluation run output to a JSON file.
378+ // The output includes run metadata (config, summary) and all sessions with
379+ // their eval criteria and scoring results (pass/fail, judge reasoning, errors).
381380func SaveRunSessionsJSON (run * EvalRun , outputDir string ) (string , error ) {
381+ // Populate eval results on each session
382+ for i := range run .Results {
383+ populateEvalResult (& run .Results [i ])
384+ }
385+
382386 // Collect all sessions from results
383387 var sessions []* session.Session
384388 for i := range run .Results {
@@ -387,8 +391,79 @@ func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
387391 }
388392 }
389393
394+ output := RunOutput {
395+ Name : run .Name ,
396+ Timestamp : run .Timestamp ,
397+ Duration : run .Duration .Round (time .Millisecond ).String (),
398+ Config : RunOutputConfig {
399+ Agent : run .Config .AgentFilename ,
400+ JudgeModel : run .Config .JudgeModel ,
401+ Concurrency : run .Config .Concurrency ,
402+ EvalsDir : run .Config .EvalsDir ,
403+ BaseImage : run .Config .BaseImage ,
404+ },
405+ Summary : run .Summary ,
406+ Sessions : sessions ,
407+ }
408+
390409 outputPath := filepath .Join (outputDir , run .Name + ".json" )
391- return saveJSON (sessions , outputPath )
410+ return saveJSON (output , outputPath )
411+ }
412+
413+ // populateEvalResult copies scoring data from a Result to its Session's EvalResult field.
414+ func populateEvalResult (result * Result ) {
415+ if result .Session == nil {
416+ return
417+ }
418+
419+ successes , failures := result .checkResults ()
420+
421+ evalResult := & session.EvalResult {
422+ Passed : len (failures ) == 0 ,
423+ Successes : successes ,
424+ Failures : failures ,
425+ Error : result .Error ,
426+ Cost : result .Cost ,
427+ OutputTokens : result .OutputTokens ,
428+ }
429+
430+ // Populate size check if size was evaluated
431+ if result .SizeExpected != "" {
432+ evalResult .Checks .Size = & session.SizeCheck {
433+ Passed : result .Size == result .SizeExpected ,
434+ Actual : result .Size ,
435+ Expected : result .SizeExpected ,
436+ }
437+ }
438+
439+ // Populate tool calls check if tool calls were evaluated
440+ if result .ToolCallsExpected > 0 {
441+ evalResult .Checks .ToolCalls = & session.ToolCallsCheck {
442+ Passed : result .ToolCallsScore >= 1.0 ,
443+ Score : result .ToolCallsScore ,
444+ }
445+ }
446+
447+ // Populate relevance check if relevance was evaluated
448+ if result .RelevanceExpected > 0 {
449+ results := make ([]session.RelevanceCriterionResult , 0 , len (result .RelevanceResults ))
450+ for _ , rr := range result .RelevanceResults {
451+ results = append (results , session.RelevanceCriterionResult {
452+ Criterion : rr .Criterion ,
453+ Passed : rr .Passed ,
454+ Reason : rr .Reason ,
455+ })
456+ }
457+
458+ evalResult .Checks .Relevance = & session.RelevanceCheck {
459+ Passed : result .RelevancePassed >= result .RelevanceExpected ,
460+ PassedCount : result .RelevancePassed ,
461+ Total : result .RelevanceExpected ,
462+ Results : results ,
463+ }
464+ }
465+
466+ result .Session .EvalResult = evalResult
392467}
393468
394469func Save (sess * session.Session , filename string ) (string , error ) {
0 commit comments