Skip to content

Commit cfc7048

Browse files
Add tool support for RULER evaluation (#542)
Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 97671f1 commit cfc7048

1 file changed

Lines changed: 13 additions & 0 deletions

File tree

src/art/rewards/ruler.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@ async def ruler(
5555
judge_model: str = "openai/o3",
5656
extra_litellm_params: dict | None = None,
5757
rubric: str = DEFAULT_RUBRIC,
58+
tools: list | None = None,
5859
*,
5960
debug: bool = False,
6061
) -> list[TrajectoryScore]:
@@ -81,6 +82,8 @@ async def ruler(
8182
extra_litellm_params: Additional parameters to pass to LiteLLM completion.
8283
Can include temperature, max_tokens, etc.
8384
rubric: The grading rubric. The default rubric works well for most tasks.
85+
tools: Optional list of tool definitions available to the agent. When provided,
86+
the judge will see which tools were available when evaluating tool usage.
8487
debug: If True, pretty-print the judge's reasoning to help understand scores.
8588
8689
Returns:
@@ -137,6 +140,12 @@ async def ruler(
137140
"<context>\n" + json.dumps(common_prefix_messages) + "\n</context>\n\n"
138141
)
139142

143+
# Include available tools so the judge knows which tool calls are valid
144+
if tools:
145+
user_text += (
146+
"<available_tools>\n" + json.dumps(tools) + "\n</available_tools>\n\n"
147+
)
148+
140149
# Serialize each trajectory (minus the common prefix) for the judge.
141150
# If all trajectories are identical, only serialize one full trajectory to save tokens.
142151
serialized_trajectories: List[str] = []
@@ -292,13 +301,17 @@ async def ruler_score_group(
292301
message_lists.append(traj.messages())
293302
traj.metrics["independent_reward"] = traj.reward
294303

304+
# Extract tools from first trajectory (they should all be the same)
305+
tools = new_trajectories[0].tools if new_trajectories else None
306+
295307
try:
296308
# Call the core ruler function to get scores
297309
scores = await ruler(
298310
message_lists,
299311
judge_model=judge_model,
300312
extra_litellm_params=extra_litellm_params,
301313
rubric=rubric,
314+
tools=tools,
302315
debug=debug,
303316
)
304317
except Exception as e:

0 commit comments

Comments
 (0)