gooddata · romrak · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -162,7 +162,45 @@ A dataset is a folder of `.json` files, one per question:
 ```
 
 Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`,
-`search_tool`, `general_question`, `guardrail`.
+`search_tool`, `general_question`, `guardrail`, `dashboard_summary`.
+
+### `dashboard_summary` items
+
+Summary items call the dedicated summary endpoint
+(`POST /api/v1/ai/workspaces/{ws}/summary`) instead of the chat endpoint, so
+they carry an extra `summary_input` block, and the `expected_output` is a
+**rubric** rather than an exact answer (summaries are free text):
+
+```json
+{
+  "id": "summary-001",
+  "dataset_name": "summary_pilot",
+  "test_kind": "dashboard_summary",
+  "question": "Summarize the Sales Overview dashboard.",
+  "summary_input": {
+    "dashboard_id": "sales_overview"
+  },
+  "expected_output": {
+    "must_include":     ["States the overall revenue trend", "Identifies the top segment"],
+    "must_not_include": ["Numbers or segments not present in the visualizations"],
+    "rubric":           ["Reads as a coherent business summary"]
+  }
+}
+```
+
+Only `dashboard_id` is required in `summary_input`; with nothing else set, the
+endpoint summarizes the whole dashboard. Optional fields narrow the scope
+(`visualizations` as a list of ids, `filter_context` as AFM filters, `tab_id`)
+or shape the output (`format_hint`).
+
+The `expected_output` rubric:
+
+- `must_include` — facts a good summary must contain; **all** must pass for the item to pass.
+- `must_not_include` — hallucination/accuracy guards; **any** violation fails the item.
+- `rubric` — soft quality dimensions; they affect `quality_score` but do not gate pass/fail.
+
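+Each criterion is scored independently by the LLM judge, so `quality_score`
+is the fraction of satisfied criteria across all three lists (a
+`must_not_include` criterion counts as satisfied when it is not violated).
+Runnable examples live in
+[`examples/summary_dataset/`](examples/summary_dataset/).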
 
 ## Supported test kinds
 
@@ -174,6 +212,7 @@ Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`,
 | `search_tool` | `search_objects` tool call (correct function called = pass; correct arguments = quality score) | — |
 | `general_question` | Text answer judged by LLM | `[llm-judge]` |
 | `guardrail` | Refusal/redirect (visualization response auto-fails) | `[llm-judge]` |
+| `dashboard_summary` | Dashboard summary (via `/summary` endpoint) scored against a rubric by LLM | `[llm-judge]` |
 
 ## Optional extras
 

@@ -0,0 +1,26 @@
+{
+  "id": "summary-format-hint-brief",
+  "dataset_name": "summary_pilot",
+  "question": "Give a brief executive summary of the Top 10 Products.",
+  "test_kind": "dashboard_summary",
+  "summary_input": {
+    "dashboard_id": "b2f2d436-9831-4fe0-81df-8c59fd33242b",
+    "visualizations": ["top_10_products"],
+    "format_hint": "A brief executive summary: at most 3 sentences, no headings or bullet points."
+  },
+  "expected_output": {
+    "must_include": [
+      "Cites specific numeric values from the Top 10 Products data"
+    ],
+    "must_not_include": [
+      "Uses section headings (such as 'Summary' or 'Key Insights') or bullet/numbered lists",
+      "Reports specific data or findings about visualizations other than Top 10 Products"
+    ],
+    "rubric": [
+      "Respects the requested brevity (roughly three sentences or fewer)",
+      "Conveys the main product or category concentration insight (e.g. Outdoor or Neptide dominance)",
+      "Notes that only the Top 10 Products visualization was analyzed",
+      "Reads as fluent prose rather than a list of raw values"
+    ]
+  }
+}
@@ -0,0 +1,27 @@
+{
+  "id": "summary-full-dashboard",
+  "dataset_name": "summary_pilot",
+  "test_kind": "dashboard_summary",
+  "question": "Summarize the All visualizations 1 with custom filter dashboard.",
+  "summary_input": {
+    "dashboard_id": "b2f2d436-9831-4fe0-81df-8c59fd33242b"
+  },
+  "expected_output": {
+    "must_include": [
+      "Includes a Summary, Key Insights, and Areas to Watch section",
+      "Cites specific numeric values from the dashboard data"
+    ],
+    "must_not_include": [
+      "Includes a markdown heading dedicated to filters (a line such as '## Filter Context' or '### Filters'), separate from the Summary, Key Insights, and Areas to Watch sections. Filters mentioned within the prose of those sections do NOT count."
+    ],
+    "rubric": [
+      "The Summary section captures the overall state across the dashboard",
+      "Key Insights call out top and/or bottom performers and their relative contribution (shares or percentages)",
+      "Key Insights describe a direction of change over time (growing, declining, or flat) for at least one metric",
+      "Areas to Watch highlights at least one risk, decline, or concentration issue and explains why it matters for business decisions",
+      "Findings reference the active filter context (e.g. the time window or selected segments) in the wording",
+      "Synthesizes insights across multiple visualizations rather than describing each one in isolation",
+      "Reads as a cohesive business narrative connecting related metrics, not a list of raw values"
+    ]
+  }
+}
@@ -0,0 +1,26 @@
+{
+  "id": "summary-selected-visualizations",
+  "dataset_name": "summary_pilot",
+  "test_kind": "dashboard_summary",
+  "question": "Summarize only the Top 10 Customers and Top 10 Products on the dashboard.",
+  "summary_input": {
+    "dashboard_id": "b2f2d436-9831-4fe0-81df-8c59fd33242b",
+    "visualizations": ["top_10_customers", "top_10_products"]
+  },
+  "expected_output": {
+    "must_include": [
+      "Includes a Summary, Key Insights, and Areas to Watch section",
+      "Cites specific numeric values for both the customers and the products data"
+    ],
+    "must_not_include": [
+      "Reports specific data or findings about dashboard visualizations other than Top 10 Customers and Top 10 Products"
+    ],
+    "rubric": [
+      "Identifies the top customer (highest-revenue account) with their revenue or contribution",
+      "Identifies the top product with its revenue and share",
+      "Highlights product or category concentration (e.g. a single product or category dominating)",
+      "Notes that only the two requested visualizations were analyzed and other dashboard data was not included",
+      "Reads as a cohesive business narrative connecting customers and products, not a list of raw values"
+    ]
+  }
+}
@@ -16,14 +16,38 @@
 from gooddata_eval.core.connection import ConnectionError_, resolve_connection
 from gooddata_eval.core.dataset.local import load_local_dataset
 from gooddata_eval.core.langfuse.sink import LangfuseSink
-from gooddata_eval.core.models import DatasetItem
+from gooddata_eval.core.models import ChatResult, DatasetItem
 from gooddata_eval.core.reporting.console import render_comparison, render_console
 from gooddata_eval.core.reporting.json_report import write_multi_model_report
 from gooddata_eval.core.runner import ItemReport, run_items
+from gooddata_eval.core.summary.http_client import SummaryClient
 from gooddata_eval.core.workspace import ModelResolutionError, WorkspaceModelController
 
 _EXIT_OK = 0
 _EXIT_OPERATIONAL_ERROR = 2
+_SUMMARY_TEST_KIND = "dashboard_summary"
+
+
+class _RoutingBackend:
+    """Dispatch each item to the right backend by test_kind.
+
+    `dashboard_summary` items go to the dedicated summary endpoint; everything
+    else uses the conversational chat endpoint.
+    """
+
+    def __init__(self, chat: ChatClient, summary: SummaryClient):
+        self._chat = chat
+        self._summary = summary
+
+    def ask(self, item: DatasetItem) -> ChatResult:
+        if item.test_kind == _SUMMARY_TEST_KIND:
+            return self._summary.ask(item)
+        return self._chat.ask(item)
+
+    def close(self) -> None:
+        for backend in (self._chat, self._summary):
+            if hasattr(backend, "close"):
+                backend.close()
 
 
 def _build_parser() -> argparse.ArgumentParser:
@@ -256,7 +280,10 @@ def on_langfuse_item_done(
                 ) -> None:
                     _sink.log_item(report, dataset_item_id=report.id)
 
-            backend = ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id)
+            backend = _RoutingBackend(
+                ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
+                SummaryClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
+            )
             try:
                 report = run_items(
                     items,

@@ -19,7 +19,7 @@
 
 import httpx
 
-from gooddata_eval.core.models import ChatResult
+from gooddata_eval.core.models import ChatResult, DatasetItem
 
 SSE_DATA_PREFIX = "data: "
 
@@ -169,11 +169,11 @@ def _send_message(self, conversation_id: str, question: str) -> ChatResult:
             resp.raise_for_status()
             return parse_sse_lines(resp.iter_lines())
 
-    def ask(self, question: str) -> ChatResult:
+    def ask(self, item: DatasetItem) -> ChatResult:
         """Run one single-turn conversation: create, send, parse, clean up."""
         conversation_id = self._create_conversation()
         try:
-            return self._send_message(conversation_id, question)
+            return self._send_message(conversation_id, item.question)
         finally:
             self._delete_conversation(conversation_id)
 

@@ -20,15 +20,18 @@
     )
 }
 
-# LLM-judge evaluators (general_question, guardrail) require the [llm-judge] extra.
-# Their modules are imported lazily on first use so the CLI starts without openai.
+# LLM-judge evaluators (general_question, guardrail, dashboard_summary) require the
+# [llm-judge] extra. Their modules are imported lazily on first use so the CLI
+# starts without openai.
 _LAZY_EVALUATOR_MODULES: dict[str, str] = {
     "general_question": "gooddata_eval.core.evaluators.general_question",
     "guardrail": "gooddata_eval.core.evaluators.guardrail",
+    "dashboard_summary": "gooddata_eval.core.evaluators.summary",
 }
 _LAZY_EVALUATOR_CLASSES: dict[str, str] = {
     "general_question": "GeneralQuestionEvaluator",
     "guardrail": "GuardrailEvaluator",
+    "dashboard_summary": "DashboardSummaryEvaluator",
 }
 
 

@@ -0,0 +1,96 @@
+# (C) 2026 GoodData Corporation
+"""Evaluator for dashboard_summary: rubric-based LLM-as-judge scoring.
+
+Summaries are free text, so we do not match strings. Instead, `expected_output`
+is a rubric of checkable criteria:
+
+    {
+      "must_include":     ["...facts a good summary must contain..."],
+      "must_not_include": ["...things a good summary must avoid (hallucinations)..."],
+      "rubric":           ["...soft quality dimensions..."]
+    }
+
+Each criterion is scored independently by the judge (True/False), so the
+runner's `quality_score` becomes the fraction of satisfied criteria. The item
+*passes* only when every `must_include` is satisfied and no `must_not_include`
+is violated; `rubric` items contribute to quality but do not gate pass/fail.
+
+As a fallback, an `expected_output` that is not a dict (or is a dict with none
+of the three lists populated) is treated as a single gating `must_include`
+criterion (same pass/fail semantics as `general_question`).
+"""
+
+from typing import Any
+
+from gooddata_eval.core.evaluators._llm_judge import LLMJudge
+from gooddata_eval.core.evaluators._text_utils import extract_text
+from gooddata_eval.core.evaluators.base import ItemEvaluation
+from gooddata_eval.core.models import ChatResult, DatasetItem
+
+_POSITIVE_STEPS = [
+    "Read the INPUT (the user's request) and the EXPECTED OUTPUT (one criterion the summary must satisfy).",
+    "Read the ACTUAL OUTPUT (the generated summary).",
+    "Score 1 if the actual output clearly satisfies the criterion (allow paraphrasing and reasonable numeric tolerance).",
+    "Score 0 if the criterion is missing, contradicted, or only partially addressed.",
+]
+
+# For must_not_include we ask the judge a plain presence question and invert the
+# result in code. Scoring "does the summary AVOID X?" via a field labelled
+# EXPECTED OUTPUT is unreliable: the model reads the forbidden behaviour as
+# desired and flips the verdict. Detecting presence (no negation, no
+# contradictory label) is far more robust.
+_VIOLATION_STEPS = [
+    "Read the CHARACTERISTIC described in EXPECTED OUTPUT.",
+    "Read the ACTUAL OUTPUT (the generated summary).",
+    "Score 1 if the actual output clearly exhibits the described characteristic.",
+    "Score 0 if it does not exhibit it.",
+]
+
+
+class DashboardSummaryEvaluator:
+    test_kind = "dashboard_summary"
+
+    def __init__(self):
+        self._positive_judge = LLMJudge(evaluation_steps=_POSITIVE_STEPS)
+        self._violation_judge = LLMJudge(evaluation_steps=_VIOLATION_STEPS)
+
+    @staticmethod
+    def _criteria(expected_output: Any) -> tuple[list[str], list[str], list[str]]:
+        if isinstance(expected_output, dict):
+            must_include = [str(c) for c in expected_output.get("must_include", [])]
+            must_not_include = [str(c) for c in expected_output.get("must_not_include", [])]
+            rubric = [str(c) for c in expected_output.get("rubric", [])]
+            if must_include or must_not_include or rubric:
+                return must_include, must_not_include, rubric
+        # Fallback: treat the whole expected_output as a single gating criterion
+        # (same pass/fail semantics as general_question).
+        return [str(expected_output)], [], []
+
+    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
+        actual = extract_text(chat_result)
+        must_include, must_not_include, rubric = self._criteria(item.expected_output)
+
+        detail: dict[str, Any] = {"actual_output": actual}
+        passed = True
+
+        for i, criterion in enumerate(must_include):
+            ok, reason = self._positive_judge.score(item.question, criterion, actual)
+            detail[f"include_{i}"] = ok
+            detail[f"include_{i}_reason"] = reason
+            passed = passed and ok
+
+        for i, criterion in enumerate(must_not_include):
+            violated, reason = self._violation_judge.score(item.question, criterion, actual)
+            ok = not violated  # True == characteristic absent == correctly avoided
+            detail[f"exclude_{i}"] = ok
+            detail[f"exclude_{i}_reason"] = reason
+            passed = passed and ok
+
+        for i, criterion in enumerate(rubric):
+            ok, reason = self._positive_judge.score(item.question, criterion, actual)
+            detail[f"rubric_{i}"] = ok
+            detail[f"rubric_{i}_reason"] = reason
+
+        bool_checks = [v for v in detail.values() if isinstance(v, bool)]
+        quality = sum(1 for v in bool_checks if v) / len(bool_checks) if bool_checks else 0.0
+
+        return ItemEvaluation(passed=passed, rank_key=(int(passed), quality), detail=detail)
@@ -85,6 +85,23 @@ class ChatResult(BaseModel):
     tool_call_events: list[ToolCallEvent] = Field(default_factory=list, alias="toolCallEvents")
 
 
+class SummaryInput(BaseModel):
+    """Structured input for the `dashboard_summary` test kind.
+
+    Maps onto the dedicated summary endpoint's request body
+    (`POST /api/v1/ai/workspaces/{ws}/summary`). Authored in snake_case in the
+    dataset; the SummaryClient maps it to the endpoint's camelCase fields.
+    """
+
+    model_config = ConfigDict(extra="ignore")  # unknown keys in the dataset block are ignored, not rejected
+
+    dashboard_id: str  # the only required field
+    visualizations: list[str] | None = None  # optional: visualization ids to narrow the summary to
+    filter_context: list[dict] | None = None  # optional: filters (AFM filters per the README)
+    tab_id: str | None = None  # optional; presumably the dashboard tab to scope to (confirm against endpoint)
+    format_hint: str | None = None  # optional: free-text instruction on the summary's format
+
+
 class DatasetItem(BaseModel):
     """Common dataset envelope. `expected_output` stays raw; each evaluator parses its own shape."""
 
@@ -95,3 +112,5 @@ class DatasetItem(BaseModel):
     test_kind: str
     question: str
     expected_output: Any
+    # Only used by the `dashboard_summary` test kind; ignored by all others.
+    summary_input: SummaryInput | None = None
@@ -32,13 +32,11 @@ def render_console(report: EvalReport, *, console: Console | None = None) -> str
         elif item.pass_at_k:
             result, notes = "PASS", ""
         else:
-            d = item.best_detail
-            failing = [
-                k
-                for k in ("metrics_correct", "dimensions_correct", "filters_correct", "viz_type_hard")
-                if d.get(k) is False
-            ]
-            notes = "failed: " + ", ".join(failing) if failing else "no visualization created"
+            # Evaluator-agnostic: report whichever boolean checks came back False
+            # (visualization uses metrics_correct/…; dashboard_summary uses
+            # include_*/exclude_*/rubric_*). Falls back to a generic message.
+            failing = [k for k, v in item.best_detail.items() if v is False]
+            notes = "failed: " + ", ".join(failing) if failing else "did not pass strict checks"
             result = "FAIL"
         latency = "-" if item.runs == 0 else f"{item.latency_s:.2f}s"
         avg = "-" if item.runs == 0 else f"{item.avg_latency_s:.2f}s"

@@ -12,7 +12,9 @@
 
 
 class ChatBackend(Protocol):
-    def ask(self, question: str) -> ChatResult: ...
+    # Receives the whole item so backends can use per-item context beyond the
+    # question text (e.g. dashboard_summary needs item.summary_input).
+    def ask(self, item: DatasetItem) -> ChatResult: ...
 
 
 @dataclass
@@ -106,7 +108,7 @@ def _run_one_item(
     try:
         for run_index in range(1, runs + 1):
             t0 = time.perf_counter()
-            chat_result = backend.ask(item.question)
+            chat_result = backend.ask(item)
             evaluation = evaluator.evaluate(item, chat_result)
             latency = time.perf_counter() - t0
             report.runs += 1