@@ -55,6 +55,7 @@ async def ruler(
     judge_model: str = "openai/o3",
     extra_litellm_params: dict | None = None,
     rubric: str = DEFAULT_RUBRIC,
+    tools: list | None = None,
     *,
     debug: bool = False,
 ) -> list[TrajectoryScore]:
@@ -81,6 +82,8 @@ async def ruler(
         extra_litellm_params: Additional parameters to pass to LiteLLM completion.
             Can include temperature, max_tokens, etc.
         rubric: The grading rubric. The default rubric works well for most tasks.
+        tools: Optional list of tool definitions available to the agent. When provided,
+            the judge will see which tools were available when evaluating tool usage.
         debug: If True, pretty-print the judge's reasoning to help understand scores.

     Returns:
@@ -137,6 +140,12 @@ async def ruler(
             "<context>\n" + json.dumps(common_prefix_messages) + "\n</context>\n\n"
         )

+    # Include available tools so the judge knows which tool calls are valid
+    if tools:
+        user_text += (
+            "<available_tools>\n" + json.dumps(tools) + "\n</available_tools>\n\n"
+        )
+
     # Serialize each trajectory (minus the common prefix) for the judge.
     # If all trajectories are identical, only serialize one full trajectory to save tokens.
     serialized_trajectories: List[str] = []
@@ -292,13 +301,17 @@ async def ruler_score_group(
         message_lists.append(traj.messages())
         traj.metrics["independent_reward"] = traj.reward

+    # Extract tools from first trajectory (they should all be the same)
+    tools = new_trajectories[0].tools if new_trajectories else None
+
     try:
         # Call the core ruler function to get scores
         scores = await ruler(
             message_lists,
             judge_model=judge_model,
             extra_litellm_params=extra_litellm_params,
             rubric=rubric,
+            tools=tools,
             debug=debug,
         )
     except Exception as e:
0 commit comments