Skip to content

Commit ae34273

Browse files
authored
Merge pull request #2302 from docker/fix/eval-tool-call-response-format
fix: eval tool_call_response uses correct event field names
2 parents 4e0861d + cc26c3f commit ae34273

4 files changed

Lines changed: 34 additions & 36 deletions

File tree

pkg/evaluation/eval.go

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -575,7 +575,12 @@ func buildTranscript(events []map[string]any) string {
575575
fmt.Fprintf(&transcript, "[Agent %s calls tool %q with arguments: %s]\n\n", cmp.Or(currentAgent, "unknown"), name, args)
576576

577577
case "tool_call_response":
578-
name, _ := getToolCallInfo(event)
578+
// The ToolCallResponseEvent has tool_definition at the top level, not
579+
// nested under "tool_call".
580+
var name string
581+
if td, ok := event["tool_definition"].(map[string]any); ok {
582+
name, _ = td["name"].(string)
583+
}
579584
response, _ := event["response"].(string)
580585
if len(response) > 500 {
581586
response = response[:500] + "...(truncated)"

pkg/evaluation/eval_test.go

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -796,12 +796,11 @@ func TestBuildTranscript(t *testing.T) {
796796
},
797797
},
798798
{
799-
"type": "tool_call_response",
800-
"response": "file contents here",
801-
"tool_call": map[string]any{
802-
"function": map[string]any{
803-
"name": "read_file",
804-
},
799+
"type": "tool_call_response",
800+
"response": "file contents here",
801+
"tool_call_id": "call_123",
802+
"tool_definition": map[string]any{
803+
"name": "read_file",
805804
},
806805
},
807806
},
@@ -814,12 +813,11 @@ func TestBuildTranscript(t *testing.T) {
814813
name: "long tool response truncated",
815814
events: []map[string]any{
816815
{
817-
"type": "tool_call_response",
818-
"response": strings.Repeat("x", 600),
819-
"tool_call": map[string]any{
820-
"function": map[string]any{
821-
"name": "shell",
822-
},
816+
"type": "tool_call_response",
817+
"response": strings.Repeat("x", 600),
818+
"tool_call_id": "call_789",
819+
"tool_definition": map[string]any{
820+
"name": "shell",
823821
},
824822
},
825823
},

pkg/evaluation/save.go

Lines changed: 12 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -192,21 +192,20 @@ func SessionFromEvents(events []map[string]any, title string, questions []string
192192
// Flush any pending assistant message before adding tool response
193193
flushAssistantMessage()
194194

195-
// Add tool response message
196-
if tc, ok := event["tool_call"].(map[string]any); ok {
197-
toolCallID, _ := tc["id"].(string)
198-
response, _ := event["response"].(string)
195+
// The ToolCallResponseEvent serializes tool_call_id as a top-level string field,
196+
// not nested under a "tool_call" map.
197+
toolCallID, _ := event["tool_call_id"].(string)
198+
response, _ := event["response"].(string)
199199

200-
msg := &session.Message{
201-
Message: chat.Message{
202-
Role: chat.MessageRoleTool,
203-
Content: response,
204-
ToolCallID: toolCallID,
205-
CreatedAt: eventTimestamp,
206-
},
207-
}
208-
sess.AddMessage(msg)
200+
msg := &session.Message{
201+
Message: chat.Message{
202+
Role: chat.MessageRoleTool,
203+
Content: response,
204+
ToolCallID: toolCallID,
205+
CreatedAt: eventTimestamp,
206+
},
209207
}
208+
sess.AddMessage(msg)
210209

211210
case "token_usage":
212211
// Update session token usage

pkg/evaluation/save_test.go

Lines changed: 6 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -302,11 +302,9 @@ func TestSessionFromEvents(t *testing.T) {
302302
},
303303
},
304304
{
305-
"type": "tool_call_response",
306-
"tool_call": map[string]any{
307-
"id": "call_123",
308-
},
309-
"response": "file content",
305+
"type": "tool_call_response",
306+
"tool_call_id": "call_123",
307+
"response": "file content",
310308
},
311309
{"type": "agent_choice", "content": "Done!"},
312310
{"type": "stream_stopped"},
@@ -452,11 +450,9 @@ func TestSessionFromEventsWithToolDefinitions(t *testing.T) {
452450
},
453451
},
454452
{
455-
"type": "tool_call_response",
456-
"tool_call": map[string]any{
457-
"id": "call_123",
458-
},
459-
"response": "file content",
453+
"type": "tool_call_response",
454+
"tool_call_id": "call_123",
455+
"response": "file content",
460456
},
461457
{"type": "stream_stopped"},
462458
}

0 commit comments

Comments (0)