eval: include structured results, run config, and summary in JSON output

hamza-jeddad · hamza-jeddad · commit 399b22a0fcdd · 2026-04-02T14:36:21.000+02:00
The eval JSON output now includes:
- Run metadata: name, timestamp, duration, config (agent, judge model,
  concurrency, evals dir, base image), and aggregate summary
- Per-session eval_result with pass/fail, human-readable successes/failures,
  error, cost, and output tokens
- Structured checks: size (actual vs expected), tool_calls (F1 score),
  and relevance (per-criterion pass/fail with judge reasoning)
- Relevance results list ALL criteria with a passed bool, not just
  failures; the judge's reason is included only for failed criteria

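Output format changes from a bare session array to a RunOutput wrapper
object (sessions now live under the "sessions" key), so consumers that
parsed the old top-level array must be updated.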
No changes to the session DB schema.

Assisted-By: docker-agent
diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
@@ -90,6 +90,7 @@ func Evaluate(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName st
 		Name:      runName,
 		Timestamp: startTime,
 		Duration:  duration,
+		Config:    cfg,
 		Results:   results,
 		Summary:   summary,
 	}
diff --git a/pkg/evaluation/save.go b/pkg/evaluation/save.go
@@ -374,11 +374,15 @@ func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
 	return saveJSON(run, filepath.Join(outputDir, run.Name+".json"))
 }
 
-// SaveRunSessionsJSON saves all eval sessions to a single JSON file.
-// Each session includes its eval criteria in the "evals" field.
-// This complements SaveRunSessions which saves to SQLite, providing a
-// human-readable format for inspection.
+// SaveRunSessionsJSON saves the full evaluation run output to a JSON file.
+// The output includes run metadata (config, summary) and all sessions with
+// their eval criteria and scoring results (pass/fail, judge reasoning, errors).
 func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
+	// Populate eval results on each session
+	for i := range run.Results {
+		populateEvalResult(&run.Results[i])
+	}
+
 	// Collect all sessions from results
 	var sessions []*session.Session
 	for i := range run.Results {
@@ -387,8 +391,98 @@ func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
 		}
 	}
 
+	output := RunOutput{
+		Name:      run.Name,
+		Timestamp: run.Timestamp,
+		Duration:  run.Duration.Round(time.Millisecond).String(),
+		Config: RunOutputConfig{
+			Agent:       run.Config.AgentFilename,
+			JudgeModel:  run.Config.JudgeModel,
+			Concurrency: run.Config.Concurrency,
+			EvalsDir:    run.Config.EvalsDir,
+			BaseImage:   run.Config.BaseImage,
+		},
+		Summary:  run.Summary,
+		Sessions: sessions,
+	}
+
 	outputPath := filepath.Join(outputDir, run.Name+".json")
-	return saveJSON(sessions, outputPath)
+	return saveJSON(output, outputPath)
+}
+
+// populateEvalResult copies scoring data from a Result to its Session's EvalResult field.
+func populateEvalResult(result *Result) {
+	if result.Session == nil {
+		return
+	}
+
+	successes, failures := result.checkResults()
+
+	evalResult := &session.EvalResult{
+		Passed:       len(failures) == 0,
+		Successes:    successes,
+		Failures:     failures,
+		Error:        result.Error,
+		Cost:         result.Cost,
+		OutputTokens: result.OutputTokens,
+	}
+
+	// Populate size check if size was evaluated
+	if result.SizeExpected != "" {
+		evalResult.Checks.Size = &session.SizeCheck{
+			Passed:   result.Size == result.SizeExpected,
+			Actual:   result.Size,
+			Expected: result.SizeExpected,
+		}
+	}
+
+	// Populate tool calls check if tool calls were evaluated
+	if result.ToolCallsExpected > 0 {
+		evalResult.Checks.ToolCalls = &session.ToolCallsCheck{
+			Passed: result.ToolCallsScore >= 1.0,
+			Score:  result.ToolCallsScore,
+		}
+	}
+
+	// Populate relevance check if relevance was evaluated
+	if result.RelevanceExpected > 0 {
+		// Build a map of failed criteria for quick lookup
+		failedMap := make(map[string]string, len(result.FailedRelevance))
+		for _, fr := range result.FailedRelevance {
+			failedMap[fr.Criterion] = fr.Reason
+		}
+
+		// Build results for ALL criteria (passed + failed) from the eval input
+		var criteria []string
+		if result.Session.Evals != nil {
+			criteria = result.Session.Evals.Relevance
+		}
+
+		results := make([]session.RelevanceCriterionResult, 0, len(criteria))
+		for _, criterion := range criteria {
+			if reason, failed := failedMap[criterion]; failed {
+				results = append(results, session.RelevanceCriterionResult{
+					Criterion: criterion,
+					Passed:    false,
+					Reason:    reason,
+				})
+			} else {
+				results = append(results, session.RelevanceCriterionResult{
+					Criterion: criterion,
+					Passed:    true,
+				})
+			}
+		}
+
+		evalResult.Checks.Relevance = &session.RelevanceCheck{
+			Passed:      result.RelevancePassed >= result.RelevanceExpected,
+			PassedCount: result.RelevancePassed,
+			Total:       result.RelevanceExpected,
+			Results:     results,
+		}
+	}
+
+	result.Session.EvalResult = evalResult
 }
 
 func Save(sess *session.Session, filename string) (string, error) {
diff --git a/pkg/evaluation/save_test.go b/pkg/evaluation/save_test.go
@@ -131,6 +131,9 @@ func TestSaveRunSessionsJSON(t *testing.T) {
 	sess1.InputTokens = 100
 	sess1.OutputTokens = 50
 	sess1.Cost = 0.01
+	sess1.Evals = &session.EvalCriteria{
+		Relevance: []string{"mentions Paris", "mentions France"},
+	}
 
 	sess2 := session.New(
 		session.WithTitle("eval-json-2"),
@@ -139,23 +142,49 @@ func TestSaveRunSessionsJSON(t *testing.T) {
 	sess2.InputTokens = 80
 	sess2.OutputTokens = 30
 	sess2.Cost = 0.005
+	sess2.Evals = &session.EvalCriteria{
+		Relevance: []string{"gives the correct answer", "explains the math"},
+	}
 
 	// Create an eval run with sessions and eval criteria
 	run := &EvalRun{
 		Name:      "test-json-001",
 		Timestamp: time.Now(),
+		Duration:  42 * time.Second,
+		Config: Config{
+			AgentFilename: "./test-agent.yaml",
+			JudgeModel:    "anthropic/claude-opus-4-5",
+			Concurrency:   4,
+			EvalsDir:      "./evals",
+		},
+		Summary: Summary{
+			TotalEvals:  3,
+			FailedEvals: 1,
+			TotalCost:   0.015,
+		},
 		Results: []Result{
 			{
-				Title:    "eval-json-1",
-				Question: "What is the capital of France?",
-				Response: "Paris is the capital of France.",
-				Session:  sess1,
+				Title:             "eval-json-1",
+				Question:          "What is the capital of France?",
+				Response:          "Paris is the capital of France.",
+				Cost:              0.01,
+				OutputTokens:      50,
+				RelevancePassed:   2,
+				RelevanceExpected: 2,
+				Session:           sess1,
 			},
 			{
-				Title:    "eval-json-2",
-				Question: "What is 2+2?",
-				Response: "4",
-				Session:  sess2,
+				Title:             "eval-json-2",
+				Question:          "What is 2+2?",
+				Response:          "4",
+				Cost:              0.005,
+				OutputTokens:      30,
+				RelevancePassed:   1,
+				RelevanceExpected: 2,
+				FailedRelevance: []RelevanceResult{
+					{Criterion: "explains the math", Reason: "no explanation given"},
+				},
+				Session: sess2,
 			},
 			{
 				// Result without a session (error case)
@@ -176,16 +205,29 @@ func TestSaveRunSessionsJSON(t *testing.T) {
 	data, err := os.ReadFile(sessionsPath)
 	require.NoError(t, err)
 
-	var loadedSessions []*session.Session
-	err = json.Unmarshal(data, &loadedSessions)
+	var output RunOutput
+	err = json.Unmarshal(data, &output)
 	require.NoError(t, err)
 
+	// Verify run-level metadata
+	assert.Equal(t, "test-json-001", output.Name)
+	assert.Equal(t, "42s", output.Duration)
+	assert.Equal(t, "./test-agent.yaml", output.Config.Agent)
+	assert.Equal(t, "anthropic/claude-opus-4-5", output.Config.JudgeModel)
+	assert.Equal(t, 4, output.Config.Concurrency)
+	assert.Equal(t, "./evals", output.Config.EvalsDir)
+
+	// Verify summary
+	assert.Equal(t, 3, output.Summary.TotalEvals)
+	assert.Equal(t, 1, output.Summary.FailedEvals)
+	assert.InDelta(t, 0.015, output.Summary.TotalCost, 0.0001)
+
 	// Should have 2 sessions (excluding the error case)
-	assert.Len(t, loadedSessions, 2)
+	assert.Len(t, output.Sessions, 2)
 
 	// Verify session content
 	titles := make(map[string]*session.Session)
-	for _, sess := range loadedSessions {
+	for _, sess := range output.Sessions {
 		titles[sess.Title] = sess
 	}
 
@@ -198,10 +240,49 @@ func TestSaveRunSessionsJSON(t *testing.T) {
 	assert.Equal(t, int64(50), sess1Loaded.OutputTokens)
 	assert.InDelta(t, 0.01, sess1Loaded.Cost, 0.0001)
 
+	// Verify eval results are populated
+	require.NotNil(t, sess1Loaded.EvalResult)
+	assert.True(t, sess1Loaded.EvalResult.Passed)
+	assert.NotEmpty(t, sess1Loaded.EvalResult.Successes)
+	assert.Empty(t, sess1Loaded.EvalResult.Failures)
+	assert.InDelta(t, 0.01, sess1Loaded.EvalResult.Cost, 0.0001)
+	assert.Equal(t, int64(50), sess1Loaded.EvalResult.OutputTokens)
+
+	// Verify structured relevance check
+	require.NotNil(t, sess1Loaded.EvalResult.Checks.Relevance)
+	assert.True(t, sess1Loaded.EvalResult.Checks.Relevance.Passed)
+	assert.Equal(t, float64(2), sess1Loaded.EvalResult.Checks.Relevance.PassedCount)
+	assert.Equal(t, float64(2), sess1Loaded.EvalResult.Checks.Relevance.Total)
+
+	// No size or tool calls checks were configured
+	assert.Nil(t, sess1Loaded.EvalResult.Checks.Size)
+	assert.Nil(t, sess1Loaded.EvalResult.Checks.ToolCalls)
+
 	sess2Loaded := titles["eval-json-2"]
 	assert.Equal(t, int64(80), sess2Loaded.InputTokens)
 	assert.Equal(t, int64(30), sess2Loaded.OutputTokens)
 	assert.InDelta(t, 0.005, sess2Loaded.Cost, 0.0001)
+
+	// Verify failed eval result
+	require.NotNil(t, sess2Loaded.EvalResult)
+	assert.False(t, sess2Loaded.EvalResult.Passed)
+	assert.NotEmpty(t, sess2Loaded.EvalResult.Failures)
+
+	// Verify structured relevance check with per-criterion results
+	require.NotNil(t, sess2Loaded.EvalResult.Checks.Relevance)
+	assert.False(t, sess2Loaded.EvalResult.Checks.Relevance.Passed)
+	assert.Equal(t, float64(1), sess2Loaded.EvalResult.Checks.Relevance.PassedCount)
+	assert.Equal(t, float64(2), sess2Loaded.EvalResult.Checks.Relevance.Total)
+	require.Len(t, sess2Loaded.EvalResult.Checks.Relevance.Results, 2)
+
+	// First criterion should be passed (not in failed list)
+	assert.True(t, sess2Loaded.EvalResult.Checks.Relevance.Results[0].Passed)
+	assert.Empty(t, sess2Loaded.EvalResult.Checks.Relevance.Results[0].Reason)
+
+	// Second criterion should be failed with reason
+	assert.False(t, sess2Loaded.EvalResult.Checks.Relevance.Results[1].Passed)
+	assert.Equal(t, "explains the math", sess2Loaded.EvalResult.Checks.Relevance.Results[1].Criterion)
+	assert.Equal(t, "no explanation given", sess2Loaded.EvalResult.Checks.Relevance.Results[1].Reason)
 }
 
 func TestSaveRunSessionsWithCost(t *testing.T) {
diff --git a/pkg/evaluation/types.go b/pkg/evaluation/types.go
@@ -94,10 +94,30 @@ type EvalRun struct {
 	Name      string        `json:"name"`
 	Timestamp time.Time     `json:"timestamp"`
 	Duration  time.Duration `json:"duration"`
+	Config    Config        `json:"-"` // Used to build RunOutput, not serialized directly
 	Results   []Result      `json:"results"`
 	Summary   Summary       `json:"summary"`
 }
 
+// RunOutput is the top-level structure for the evaluation run JSON output.
+type RunOutput struct {
+	Name      string             `json:"name"`
+	Timestamp time.Time          `json:"timestamp"`
+	Duration  string             `json:"duration"`
+	Config    RunOutputConfig    `json:"config"`
+	Summary   Summary            `json:"summary"`
+	Sessions  []*session.Session `json:"sessions"`
+}
+
+// RunOutputConfig captures the evaluation run configuration.
+type RunOutputConfig struct {
+	Agent       string `json:"agent"`
+	JudgeModel  string `json:"judge_model,omitempty"`
+	Concurrency int    `json:"concurrency"`
+	EvalsDir    string `json:"evals_dir"`
+	BaseImage   string `json:"base_image,omitempty"`
+}
+
 // Config holds configuration for evaluation runs.
 type Config struct {
 	AgentFilename  string   // Path to the agent configuration file
diff --git a/pkg/session/session.go b/pkg/session/session.go
@@ -74,6 +74,9 @@ type Session struct {
 	// Evals contains evaluation criteria for this session (used by eval framework)
 	Evals *EvalCriteria `json:"evals,omitempty"`
 
+	// EvalResult contains the evaluation scoring outcome (populated after eval run).
+	EvalResult *EvalResult `json:"eval_result,omitempty"`
+
 	// Messages holds the conversation history (messages and sub-sessions)
 	Messages []Item `json:"messages"`
 
@@ -229,6 +232,53 @@ func NewSubSessionItem(subSession *Session) Item {
 	return Item{SubSession: subSession}
 }
 
+// EvalResult contains the evaluation scoring outcome for a session.
+type EvalResult struct {
+	Passed       bool              `json:"passed"`
+	Successes    []string          `json:"successes,omitempty"`
+	Failures     []string          `json:"failures,omitempty"`
+	Error        string            `json:"error,omitempty"`
+	Cost         float64           `json:"cost"`
+	OutputTokens int64             `json:"output_tokens"`
+	Checks       EvalResultChecks  `json:"checks"`
+}
+
+// EvalResultChecks groups the individual check results.
+// Only checks that were evaluated will be present (omitted if nil).
+type EvalResultChecks struct {
+	Size      *SizeCheck      `json:"size,omitempty"`
+	ToolCalls *ToolCallsCheck `json:"tool_calls,omitempty"`
+	Relevance *RelevanceCheck `json:"relevance,omitempty"`
+}
+
+// SizeCheck contains the result of the response size check.
+type SizeCheck struct {
+	Passed   bool   `json:"passed"`
+	Actual   string `json:"actual"`
+	Expected string `json:"expected"`
+}
+
+// ToolCallsCheck contains the result of the tool calls F1 score check.
+type ToolCallsCheck struct {
+	Passed bool    `json:"passed"`
+	Score  float64 `json:"score"`
+}
+
+// RelevanceCheck contains the result of the LLM judge relevance check.
+type RelevanceCheck struct {
+	Passed      bool                      `json:"passed"`
+	PassedCount float64                   `json:"passed_count"`
+	Total       float64                   `json:"total"`
+	Results     []RelevanceCriterionResult `json:"results"`
+}
+
+// RelevanceCriterionResult contains the judge's verdict on a single relevance criterion.
+type RelevanceCriterionResult struct {
+	Criterion string `json:"criterion"`
+	Passed    bool   `json:"passed"`
+	Reason    string `json:"reason,omitempty"` // Only set for failed criteria
+}
+
 // EvalCriteria contains the evaluation criteria for a session.
 type EvalCriteria struct {
 	Relevance  []string `json:"relevance"`             // Statements that should be true about the response

Original file line number	Diff line number	Diff line change
`@@ -90,6 +90,7 @@ func Evaluate(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName st`
`90`	`90`	`Name: runName,`
`91`	`91`	`Timestamp: startTime,`
`92`	`92`	`Duration: duration,`
	`93`	`+ Config: cfg,`
`93`	`94`	`Results: results,`
`94`	`95`	`Summary: summary,`
`95`	`96`	`}`