@@ -131,6 +131,9 @@ func TestSaveRunSessionsJSON(t *testing.T) {
131131 sess1 .InputTokens = 100
132132 sess1 .OutputTokens = 50
133133 sess1 .Cost = 0.01
134+ sess1 .Evals = & session.EvalCriteria {
135+ Relevance : []string {"mentions Paris" , "mentions France" },
136+ }
134137
135138 sess2 := session .New (
136139 session .WithTitle ("eval-json-2" ),
@@ -139,23 +142,49 @@ func TestSaveRunSessionsJSON(t *testing.T) {
139142 sess2 .InputTokens = 80
140143 sess2 .OutputTokens = 30
141144 sess2 .Cost = 0.005
145+ sess2 .Evals = & session.EvalCriteria {
146+ Relevance : []string {"gives the correct answer" , "explains the math" },
147+ }
142148
143149 // Create an eval run with sessions and eval criteria
144150 run := & EvalRun {
145151 Name : "test-json-001" ,
146152 Timestamp : time .Now (),
153+ Duration : 42 * time .Second ,
154+ Config : Config {
155+ AgentFilename : "./test-agent.yaml" ,
156+ JudgeModel : "anthropic/claude-opus-4-5" ,
157+ Concurrency : 4 ,
158+ EvalsDir : "./evals" ,
159+ },
160+ Summary : Summary {
161+ TotalEvals : 3 ,
162+ FailedEvals : 1 ,
163+ TotalCost : 0.015 ,
164+ },
147165 Results : []Result {
148166 {
149- Title : "eval-json-1" ,
150- Question : "What is the capital of France?" ,
151- Response : "Paris is the capital of France." ,
152- Session : sess1 ,
167+ Title : "eval-json-1" ,
168+ Question : "What is the capital of France?" ,
169+ Response : "Paris is the capital of France." ,
170+ Cost : 0.01 ,
171+ OutputTokens : 50 ,
172+ RelevancePassed : 2 ,
173+ RelevanceExpected : 2 ,
174+ Session : sess1 ,
153175 },
154176 {
155- Title : "eval-json-2" ,
156- Question : "What is 2+2?" ,
157- Response : "4" ,
158- Session : sess2 ,
177+ Title : "eval-json-2" ,
178+ Question : "What is 2+2?" ,
179+ Response : "4" ,
180+ Cost : 0.005 ,
181+ OutputTokens : 30 ,
182+ RelevancePassed : 1 ,
183+ RelevanceExpected : 2 ,
184+ FailedRelevance : []RelevanceResult {
185+ {Criterion : "explains the math" , Reason : "no explanation given" },
186+ },
187+ Session : sess2 ,
159188 },
160189 {
161190 // Result without a session (error case)
@@ -176,16 +205,29 @@ func TestSaveRunSessionsJSON(t *testing.T) {
176205 data , err := os .ReadFile (sessionsPath )
177206 require .NoError (t , err )
178207
179- var loadedSessions [] * session. Session
180- err = json .Unmarshal (data , & loadedSessions )
208+ var output RunOutput
209+ err = json .Unmarshal (data , & output )
181210 require .NoError (t , err )
182211
212+ // Verify run-level metadata
213+ assert .Equal (t , "test-json-001" , output .Name )
214+ assert .Equal (t , "42s" , output .Duration )
215+ assert .Equal (t , "./test-agent.yaml" , output .Config .Agent )
216+ assert .Equal (t , "anthropic/claude-opus-4-5" , output .Config .JudgeModel )
217+ assert .Equal (t , 4 , output .Config .Concurrency )
218+ assert .Equal (t , "./evals" , output .Config .EvalsDir )
219+
220+ // Verify summary
221+ assert .Equal (t , 3 , output .Summary .TotalEvals )
222+ assert .Equal (t , 1 , output .Summary .FailedEvals )
223+ assert .InDelta (t , 0.015 , output .Summary .TotalCost , 0.0001 )
224+
183225 // Should have 2 sessions (excluding the error case)
184- assert .Len (t , loadedSessions , 2 )
226+ assert .Len (t , output . Sessions , 2 )
185227
186228 // Verify session content
187229 titles := make (map [string ]* session.Session )
188- for _ , sess := range loadedSessions {
230+ for _ , sess := range output . Sessions {
189231 titles [sess .Title ] = sess
190232 }
191233
@@ -198,10 +240,49 @@ func TestSaveRunSessionsJSON(t *testing.T) {
198240 assert .Equal (t , int64 (50 ), sess1Loaded .OutputTokens )
199241 assert .InDelta (t , 0.01 , sess1Loaded .Cost , 0.0001 )
200242
243+ // Verify eval results are populated
244+ require .NotNil (t , sess1Loaded .EvalResult )
245+ assert .True (t , sess1Loaded .EvalResult .Passed )
246+ assert .NotEmpty (t , sess1Loaded .EvalResult .Successes )
247+ assert .Empty (t , sess1Loaded .EvalResult .Failures )
248+ assert .InDelta (t , 0.01 , sess1Loaded .EvalResult .Cost , 0.0001 )
249+ assert .Equal (t , int64 (50 ), sess1Loaded .EvalResult .OutputTokens )
250+
251+ // Verify structured relevance check
252+ require .NotNil (t , sess1Loaded .EvalResult .Checks .Relevance )
253+ assert .True (t , sess1Loaded .EvalResult .Checks .Relevance .Passed )
254+ assert .Equal (t , float64 (2 ), sess1Loaded .EvalResult .Checks .Relevance .PassedCount )
255+ assert .Equal (t , float64 (2 ), sess1Loaded .EvalResult .Checks .Relevance .Total )
256+
257+ // No size or tool calls checks were configured
258+ assert .Nil (t , sess1Loaded .EvalResult .Checks .Size )
259+ assert .Nil (t , sess1Loaded .EvalResult .Checks .ToolCalls )
260+
201261 sess2Loaded := titles ["eval-json-2" ]
202262 assert .Equal (t , int64 (80 ), sess2Loaded .InputTokens )
203263 assert .Equal (t , int64 (30 ), sess2Loaded .OutputTokens )
204264 assert .InDelta (t , 0.005 , sess2Loaded .Cost , 0.0001 )
265+
266+ // Verify failed eval result
267+ require .NotNil (t , sess2Loaded .EvalResult )
268+ assert .False (t , sess2Loaded .EvalResult .Passed )
269+ assert .NotEmpty (t , sess2Loaded .EvalResult .Failures )
270+
271+ // Verify structured relevance check with per-criterion results
272+ require .NotNil (t , sess2Loaded .EvalResult .Checks .Relevance )
273+ assert .False (t , sess2Loaded .EvalResult .Checks .Relevance .Passed )
274+ assert .Equal (t , float64 (1 ), sess2Loaded .EvalResult .Checks .Relevance .PassedCount )
275+ assert .Equal (t , float64 (2 ), sess2Loaded .EvalResult .Checks .Relevance .Total )
276+ require .Len (t , sess2Loaded .EvalResult .Checks .Relevance .Results , 2 )
277+
278+ // First criterion should be passed (not in failed list)
279+ assert .True (t , sess2Loaded .EvalResult .Checks .Relevance .Results [0 ].Passed )
280+ assert .Empty (t , sess2Loaded .EvalResult .Checks .Relevance .Results [0 ].Reason )
281+
282+ // Second criterion should be failed with reason
283+ assert .False (t , sess2Loaded .EvalResult .Checks .Relevance .Results [1 ].Passed )
284+ assert .Equal (t , "explains the math" , sess2Loaded .EvalResult .Checks .Relevance .Results [1 ].Criterion )
285+ assert .Equal (t , "no explanation given" , sess2Loaded .EvalResult .Checks .Relevance .Results [1 ].Reason )
205286}
206287
207288func TestSaveRunSessionsWithCost (t * testing.T ) {
0 commit comments