Add tool support for RULER evaluation (#542)

angkywilliam · cursoragent · web-flow · commit cfc70482686a · 2026-01-29T16:57:27.000-08:00
Co-authored-by: Cursor <cursoragent@cursor.com>
diff --git a/src/art/rewards/ruler.py b/src/art/rewards/ruler.py
@@ -55,6 +55,7 @@ async def ruler(
     judge_model: str = "openai/o3",
     extra_litellm_params: dict | None = None,
     rubric: str = DEFAULT_RUBRIC,
+    tools: list | None = None,
     *,
     debug: bool = False,
 ) -> list[TrajectoryScore]:
@@ -81,6 +82,8 @@ async def ruler(
         extra_litellm_params: Additional parameters to pass to LiteLLM completion.
             Can include temperature, max_tokens, etc.
         rubric: The grading rubric. The default rubric works well for most tasks.
+        tools: Optional list of tool definitions available to the agent. When provided,
+            the judge will see which tools were available when evaluating tool usage.
         debug: If True, pretty-print the judge's reasoning to help understand scores.
 
     Returns:
@@ -137,6 +140,12 @@ async def ruler(
             "<context>\n" + json.dumps(common_prefix_messages) + "\n</context>\n\n"
         )
 
+    # Include available tools so the judge knows which tool calls are valid
+    if tools:
+        user_text += (
+            "<available_tools>\n" + json.dumps(tools) + "\n</available_tools>\n\n"
+        )
+
     # Serialize each trajectory (minus the common prefix) for the judge.
     # If all trajectories are identical, only serialize one full trajectory to save tokens.
     serialized_trajectories: List[str] = []
@@ -292,13 +301,17 @@ async def ruler_score_group(
         message_lists.append(traj.messages())
         traj.metrics["independent_reward"] = traj.reward
 
+    # Extract tools from first trajectory (they should all be the same)
+    tools = new_trajectories[0].tools if new_trajectories else None
+
     try:
         # Call the core ruler function to get scores
         scores = await ruler(
             message_lists,
             judge_model=judge_model,
             extra_litellm_params=extra_litellm_params,
             rubric=rubric,
+            tools=tools,
             debug=debug,
         )
     except Exception as e: