diff --git a/packages/gooddata-eval/README.md b/packages/gooddata-eval/README.md
index 44733933f..1920747a6 100644
--- a/packages/gooddata-eval/README.md
+++ b/packages/gooddata-eval/README.md
@@ -162,7 +162,45 @@ A dataset is a folder of `.json` files, one per question:
 ```
 
 Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`,
-`search_tool`, `general_question`, `guardrail`.
+`search_tool`, `general_question`, `guardrail`, `dashboard_summary`.
+
+### `dashboard_summary` items
+
+Summary items call the dedicated summary endpoint
+(`POST /api/v1/ai/workspaces/{ws}/summary`) instead of the chat endpoint, so
+they carry an extra `summary_input` block, and the `expected_output` is a
+**rubric** rather than an exact answer (summaries are free text):
+
+```json
+{
+  "id": "summary-001",
+  "dataset_name": "summary_pilot",
+  "test_kind": "dashboard_summary",
+  "question": "Summarize the Sales Overview dashboard.",
+  "summary_input": {
+    "dashboard_id": "sales_overview"
+  },
+  "expected_output": {
+    "must_include":     ["States the overall revenue trend", "Identifies the top segment"],
+    "must_not_include": ["Numbers or segments not present in the visualizations"],
+    "rubric":           ["Reads as a coherent business summary"]
+  }
+}
+```
+
+`summary_input` requires only `dashboard_id`; by default the endpoint summarizes
+the whole dashboard. `visualizations` (list of ids), `filter_context` (AFM filters)
+and `tab_id` narrow the scope; `format_hint` steers the style of the output.
+
+The `expected_output` rubric:
+
+- `must_include` — facts a good summary must contain; **all** must pass for the item to pass.
+- `must_not_include` — hallucination/accuracy guards; **any** violation fails the item.
+- `rubric` — soft quality dimensions; they affect `quality_score` but do not gate pass/fail.
+
+Each criterion is scored independently by the LLM judge, so `quality_score` is
+the fraction of satisfied criteria (a `must_not_include` entry counts as satisfied
+when it is avoided). Runnable examples live in [`examples/summary_dataset/`](examples/summary_dataset/).
 
 ## Supported test kinds
 
@@ -174,6 +212,7 @@ Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`,
 | `search_tool` | `search_objects` tool call (correct function called = pass; correct arguments = quality score) | — |
 | `general_question` | Text answer judged by LLM | `[llm-judge]` |
 | `guardrail` | Refusal/redirect (visualization response auto-fails) | `[llm-judge]` |
+| `dashboard_summary` | Dashboard summary (via `/summary` endpoint) scored against a rubric by LLM | `[llm-judge]` |
 
 ## Optional extras
 
diff --git a/packages/gooddata-eval/examples/summary_dataset/summary_format_hint_brief.json b/packages/gooddata-eval/examples/summary_dataset/summary_format_hint_brief.json
new file mode 100644
index 000000000..b1fcbf58d
--- /dev/null
+++ b/packages/gooddata-eval/examples/summary_dataset/summary_format_hint_brief.json
@@ -0,0 +1,26 @@
+{
+  "id": "summary-format-hint-brief",
+  "dataset_name": "summary_pilot",
+  "question": "Give a brief executive summary of the Top 10 Products.",
+  "test_kind": "dashboard_summary",
+  "summary_input": {
+    "dashboard_id": "b2f2d436-9831-4fe0-81df-8c59fd33242b",
+    "visualizations": ["top_10_products"],
+    "format_hint": "A brief executive summary: at most 3 sentences, no headings or bullet points."
+  },
+  "expected_output": {
+    "must_include": [
+      "Cites specific numeric values from the Top 10 Products data"
+    ],
+    "must_not_include": [
+      "Uses section headings (such as 'Summary' or 'Key Insights') or bullet/numbered lists",
+      "Reports specific data or findings about visualizations other than Top 10 Products"
+    ],
+    "rubric": [
+      "Respects the requested brevity (roughly three sentences or fewer)",
+      "Conveys the main product or category concentration insight (e.g. Outdoor or Neptide dominance)",
+      "Notes that only the Top 10 Products visualization was analyzed",
+      "Reads as fluent prose rather than a list of raw values"
+    ]
+  }
+}
diff --git a/packages/gooddata-eval/examples/summary_dataset/summary_full_dashboard.json b/packages/gooddata-eval/examples/summary_dataset/summary_full_dashboard.json
new file mode 100644
index 000000000..916bb4a8b
--- /dev/null
+++ b/packages/gooddata-eval/examples/summary_dataset/summary_full_dashboard.json
@@ -0,0 +1,27 @@
+{
+  "id": "summary-full-dashboard",
+  "dataset_name": "summary_pilot",
+  "test_kind": "dashboard_summary",
+  "question": "Summarize the All visualizations 1 with custom filter dashboard.",
+  "summary_input": {
+    "dashboard_id": "b2f2d436-9831-4fe0-81df-8c59fd33242b"
+  },
+  "expected_output": {
+    "must_include": [
+      "Includes a Summary, Key Insights, and Areas to Watch section",
+      "Cites specific numeric values from the dashboard data"
+    ],
+    "must_not_include": [
+      "Includes a markdown heading dedicated to filters (a line such as '## Filter Context' or '### Filters'), separate from the Summary, Key Insights, and Areas to Watch sections. Filters mentioned within the prose of those sections do NOT count."
+    ],
+    "rubric": [
+      "The Summary section captures the overall state across the dashboard",
+      "Key Insights call out top and/or bottom performers and their relative contribution (shares or percentages)",
+      "Key Insights describe a direction of change over time (growing, declining, or flat) for at least one metric",
+      "Areas to Watch highlights at least one risk, decline, or concentration issue and explains why it matters for business decisions",
+      "Findings reference the active filter context (e.g. the time window or selected segments) in the wording",
+      "Synthesizes insights across multiple visualizations rather than describing each one in isolation",
+      "Reads as a cohesive business narrative connecting related metrics, not a list of raw values"
+    ]
+  }
+}
diff --git a/packages/gooddata-eval/examples/summary_dataset/summary_selected_visualizations.json b/packages/gooddata-eval/examples/summary_dataset/summary_selected_visualizations.json
new file mode 100644
index 000000000..01433a74b
--- /dev/null
+++ b/packages/gooddata-eval/examples/summary_dataset/summary_selected_visualizations.json
@@ -0,0 +1,26 @@
+{
+  "id": "summary-selected-visualizations",
+  "dataset_name": "summary_pilot",
+  "test_kind": "dashboard_summary",
+  "question": "Summarize only the Top 10 Customers and Top 10 Products on the dashboard.",
+  "summary_input": {
+    "dashboard_id": "b2f2d436-9831-4fe0-81df-8c59fd33242b",
+    "visualizations": ["top_10_customers", "top_10_products"]
+  },
+  "expected_output": {
+    "must_include": [
+      "Includes a Summary, Key Insights, and Areas to Watch section",
+      "Cites specific numeric values for both the customers and the products data"
+    ],
+    "must_not_include": [
+      "Reports specific data or findings about dashboard visualizations other than Top 10 Customers and Top 10 Products"
+    ],
+    "rubric": [
+      "Identifies the top customer (highest-revenue account) with their revenue or contribution",
+      "Identifies the top product with its revenue and share",
+      "Highlights product or category concentration (e.g. a single product or category dominating)",
+      "Notes that only the two requested visualizations were analyzed and other dashboard data was not included",
+      "Reads as a cohesive business narrative connecting customers and products, not a list of raw values"
+    ]
+  }
+}
diff --git a/packages/gooddata-eval/src/gooddata_eval/cli/main.py b/packages/gooddata-eval/src/gooddata_eval/cli/main.py
index ff041f18b..0ba08be9c 100644
--- a/packages/gooddata-eval/src/gooddata_eval/cli/main.py
+++ b/packages/gooddata-eval/src/gooddata_eval/cli/main.py
@@ -16,14 +16,38 @@
 from gooddata_eval.core.connection import ConnectionError_, resolve_connection
 from gooddata_eval.core.dataset.local import load_local_dataset
 from gooddata_eval.core.langfuse.sink import LangfuseSink
-from gooddata_eval.core.models import DatasetItem
+from gooddata_eval.core.models import ChatResult, DatasetItem
 from gooddata_eval.core.reporting.console import render_comparison, render_console
 from gooddata_eval.core.reporting.json_report import write_multi_model_report
 from gooddata_eval.core.runner import ItemReport, run_items
+from gooddata_eval.core.summary.http_client import SummaryClient
 from gooddata_eval.core.workspace import ModelResolutionError, WorkspaceModelController
 
 _EXIT_OK = 0
 _EXIT_OPERATIONAL_ERROR = 2
+_SUMMARY_TEST_KIND = "dashboard_summary"
+
+
+class _RoutingBackend:
+    """Backend facade that picks the real backend per item from `test_kind`.
+
+    Items whose kind is `dashboard_summary` are sent to the summary client;
+    every other kind is sent to the chat client.
+    """
+
+    def __init__(self, chat: ChatClient, summary: SummaryClient):
+        self._chat = chat
+        self._summary = summary
+
+    def ask(self, item: DatasetItem) -> ChatResult:
+        # Choose the target first, then hand it the item unchanged.
+        target = self._summary if item.test_kind == _SUMMARY_TEST_KIND else self._chat
+        return target.ask(item)
+
+    def close(self) -> None:
+        # Chat first, then summary; a backend without close() is skipped.
+        for closable in filter(lambda b: hasattr(b, "close"), (self._chat, self._summary)):
+            closable.close()
 
 
 def _build_parser() -> argparse.ArgumentParser:
@@ -256,7 +280,10 @@ def on_langfuse_item_done(
                 ) -> None:
                     _sink.log_item(report, dataset_item_id=report.id)
 
-            backend = ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id)
+            backend = _RoutingBackend(
+                ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
+                SummaryClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
+            )
             try:
                 report = run_items(
                     items,
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py b/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py
index d36e59440..6548a4ee8 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py
@@ -19,7 +19,7 @@
 
 import httpx
 
-from gooddata_eval.core.models import ChatResult
+from gooddata_eval.core.models import ChatResult, DatasetItem
 
 SSE_DATA_PREFIX = "data: "
 
@@ -169,11 +169,11 @@ def _send_message(self, conversation_id: str, question: str) -> ChatResult:
             resp.raise_for_status()
             return parse_sse_lines(resp.iter_lines())
 
-    def ask(self, question: str) -> ChatResult:
-        """Run one single-turn conversation: create, send, parse, clean up."""
+    def ask(self, item: DatasetItem) -> ChatResult:
+        """Run one single-turn conversation for `item.question`: create, send, parse, clean up."""
         conversation_id = self._create_conversation()
         try:
-            return self._send_message(conversation_id, question)
+            return self._send_message(conversation_id, item.question)
         finally:
             self._delete_conversation(conversation_id)
 
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/__init__.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/__init__.py
index 4da18f876..f2b71955a 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/__init__.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/__init__.py
@@ -20,15 +20,18 @@
     )
 }
 
-# LLM-judge evaluators (general_question, guardrail) require the [llm-judge] extra.
-# Their modules are imported lazily on first use so the CLI starts without openai.
+# LLM-judge evaluators (general_question, guardrail, dashboard_summary) require the
+# [llm-judge] extra. Their modules are imported lazily on first use so the CLI
+# starts without openai.
 _LAZY_EVALUATOR_MODULES: dict[str, str] = {
     "general_question": "gooddata_eval.core.evaluators.general_question",
     "guardrail": "gooddata_eval.core.evaluators.guardrail",
+    "dashboard_summary": "gooddata_eval.core.evaluators.summary",
 }
 _LAZY_EVALUATOR_CLASSES: dict[str, str] = {
     "general_question": "GeneralQuestionEvaluator",
     "guardrail": "GuardrailEvaluator",
+    "dashboard_summary": "DashboardSummaryEvaluator",
 }
 
 
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/summary.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/summary.py
new file mode 100644
index 000000000..b0af2ce42
--- /dev/null
+++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/summary.py
@@ -0,0 +1,126 @@
+# (C) 2026 GoodData Corporation
+"""Evaluator for dashboard_summary: rubric-based LLM-as-judge scoring.
+
+Summaries are free text, so we do not match strings. Instead, `expected_output`
+is a rubric of checkable criteria:
+
+    {
+      "must_include":     ["...facts a good summary must contain..."],
+      "must_not_include": ["...things a good summary must avoid (hallucinations)..."],
+      "rubric":           ["...soft quality dimensions..."]
+    }
+
+Each criterion is scored independently by the judge (True/False), so the
+runner's `quality_score` becomes the fraction of satisfied criteria. The item
+*passes* only when every `must_include` is satisfied and no `must_not_include`
+is violated; `rubric` items contribute to quality but do not gate pass/fail.
+
+Each of the three keys may be omitted, `null`, or a single string instead of a
+list; a single string counts as one criterion.
+
+As a fallback, an `expected_output` that is not a dict, or a dict that yields no
+criteria under those keys, is stringified and used as a single gating
+`must_include` criterion (same pass/fail semantics as `general_question`).
+"""
+
+from typing import Any
+
+from gooddata_eval.core.evaluators._llm_judge import LLMJudge
+from gooddata_eval.core.evaluators._text_utils import extract_text
+from gooddata_eval.core.evaluators.base import ItemEvaluation
+from gooddata_eval.core.models import ChatResult, DatasetItem
+
+_POSITIVE_STEPS = [
+    "Read the INPUT (the user's request) and the EXPECTED OUTPUT (one criterion the summary must satisfy).",
+    "Read the ACTUAL OUTPUT (the generated summary).",
+    "Score 1 if the actual output clearly satisfies the criterion (allow paraphrasing and reasonable numeric tolerance).",
+    "Score 0 if the criterion is missing, contradicted, or only partially addressed.",
+]
+
+# For must_not_include we ask the judge a plain presence question and invert the
+# result in code. Scoring "does the summary AVOID X?" via a field labelled
+# EXPECTED OUTPUT is unreliable: the model reads the forbidden behaviour as
+# desired and flips the verdict. Detecting presence (no negation, no
+# contradictory label) is far more robust.
+_VIOLATION_STEPS = [
+    "Read the CHARACTERISTIC described in EXPECTED OUTPUT.",
+    "Read the ACTUAL OUTPUT (the generated summary).",
+    "Score 1 if the actual output clearly exhibits the described characteristic.",
+    "Score 0 if it does not exhibit it.",
+]
+
+
+def _as_criteria(value: Any) -> list[str]:
+    """Normalize one rubric section into a list of criterion strings.
+
+    `None` (key absent or JSON null) yields no criteria; a list or tuple yields
+    one criterion per element; any other value, notably a bare string, yields a
+    single criterion instead of being iterated character by character.
+    """
+    if value is None:
+        return []
+    if isinstance(value, (list, tuple)):
+        return [str(c) for c in value]
+    return [str(value)]
+
+
+class DashboardSummaryEvaluator:
+    """Scores a generated dashboard summary against the item's rubric via LLM judges."""
+
+    test_kind = "dashboard_summary"
+
+    def __init__(self):
+        # Separate judges: one asks "is the criterion satisfied?", the other
+        # "is the characteristic present?" (inverted in evaluate()).
+        self._positive_judge = LLMJudge(evaluation_steps=_POSITIVE_STEPS)
+        self._violation_judge = LLMJudge(evaluation_steps=_VIOLATION_STEPS)
+
+    @staticmethod
+    def _criteria(expected_output: Any) -> tuple[list[str], list[str], list[str]]:
+        """Split `expected_output` into (must_include, must_not_include, rubric)."""
+        if isinstance(expected_output, dict):
+            must_include = _as_criteria(expected_output.get("must_include"))
+            must_not_include = _as_criteria(expected_output.get("must_not_include"))
+            rubric = _as_criteria(expected_output.get("rubric"))
+            if must_include or must_not_include or rubric:
+                return must_include, must_not_include, rubric
+        # Fallback: treat the whole expected_output as a single gating criterion
+        # (same pass/fail semantics as general_question).
+        return [str(expected_output)], [], []
+
+    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
+        """Judge every criterion and fold the verdicts into one `ItemEvaluation`.
+
+        `passed` requires every `must_include` criterion satisfied and no
+        `must_not_include` characteristic detected; `rubric` verdicts never gate.
+        `rank_key` is `(int(passed), quality)` where quality is the fraction of
+        boolean verdicts in `detail` that are True (0.0 if there are none).
+        """
+        actual = extract_text(chat_result)
+        must_include, must_not_include, rubric = self._criteria(item.expected_output)
+
+        detail: dict[str, Any] = {"actual_output": actual}
+        passed = True
+
+        for i, criterion in enumerate(must_include):
+            ok, reason = self._positive_judge.score(item.question, criterion, actual)
+            detail[f"include_{i}"] = ok
+            detail[f"include_{i}_reason"] = reason
+            passed = passed and ok
+
+        for i, criterion in enumerate(must_not_include):
+            violated, reason = self._violation_judge.score(item.question, criterion, actual)
+            ok = not violated  # True == characteristic absent == correctly avoided
+            detail[f"exclude_{i}"] = ok
+            detail[f"exclude_{i}_reason"] = reason
+            passed = passed and ok
+
+        for i, criterion in enumerate(rubric):
+            ok, reason = self._positive_judge.score(item.question, criterion, actual)
+            detail[f"rubric_{i}"] = ok
+            detail[f"rubric_{i}_reason"] = reason
+
+        bool_checks = [v for v in detail.values() if isinstance(v, bool)]
+        quality = sum(1 for v in bool_checks if v) / len(bool_checks) if bool_checks else 0.0
+
+        return ItemEvaluation(passed=passed, rank_key=(int(passed), quality), detail=detail)
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/models.py b/packages/gooddata-eval/src/gooddata_eval/core/models.py
index 8c1965d30..63587f680 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/models.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/models.py
@@ -85,6 +85,23 @@ class ChatResult(BaseModel):
     tool_call_events: list[ToolCallEvent] = Field(default_factory=list, alias="toolCallEvents")
 
 
+class SummaryInput(BaseModel):
+    """Structured input for the `dashboard_summary` test kind.
+
+    Maps onto the dedicated summary endpoint's request body
+    (`POST /api/v1/ai/workspaces/{ws}/summary`). Authored in snake_case in the
+    dataset; the SummaryClient maps it to the endpoint's camelCase fields.
+    """
+
+    model_config = ConfigDict(extra="ignore")  # unknown keys in the dataset block are ignored
+
+    dashboard_id: str  # the only required field; sent as `dashboardId`
+    visualizations: list[str] | None = None  # visualization ids; sent unchanged when set
+    filter_context: list[dict] | None = None  # sent as `filterContext` when set
+    tab_id: str | None = None  # sent as `tabId` when set
+    format_hint: str | None = None  # sent as `formatHint` when set
+
+
 class DatasetItem(BaseModel):
     """Common dataset envelope. `expected_output` stays raw; each evaluator parses its own shape."""
 
@@ -95,3 +112,5 @@ class DatasetItem(BaseModel):
     test_kind: str
     question: str
     expected_output: Any
+    # Only used by the `dashboard_summary` test kind; ignored by all others.
+    summary_input: SummaryInput | None = None
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/reporting/console.py b/packages/gooddata-eval/src/gooddata_eval/core/reporting/console.py
index 316490a84..d2c79fccd 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/reporting/console.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/reporting/console.py
@@ -32,13 +32,11 @@ def render_console(report: EvalReport, *, console: Console | None = None) -> str
         elif item.pass_at_k:
             result, notes = "PASS", ""
         else:
-            d = item.best_detail
-            failing = [
-                k
-                for k in ("metrics_correct", "dimensions_correct", "filters_correct", "viz_type_hard")
-                if d.get(k) is False
-            ]
-            notes = "failed: " + ", ".join(failing) if failing else "no visualization created"
+            # Evaluator-agnostic: list every detail flag that is exactly False
+            # (visualization: metrics_correct/…; dashboard_summary: include_*/exclude_*
+            # and the non-gating rubric_*). Falls back to a generic message.
+            failing = [k for k, v in item.best_detail.items() if v is False]
+            notes = "failed: " + ", ".join(failing) if failing else "did not pass strict checks"
             result = "FAIL"
         latency = "-" if item.runs == 0 else f"{item.latency_s:.2f}s"
         avg = "-" if item.runs == 0 else f"{item.avg_latency_s:.2f}s"
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/runner.py b/packages/gooddata-eval/src/gooddata_eval/core/runner.py
index 7fb0d36d6..087f142f2 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/runner.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/runner.py
@@ -12,7 +12,9 @@
 
 
 class ChatBackend(Protocol):
-    def ask(self, question: str) -> ChatResult: ...
+    # Receives the whole item so backends can use per-item context beyond the
+    # question text (e.g. dashboard_summary needs item.summary_input).
+    def ask(self, item: DatasetItem) -> ChatResult: ...
 
 
 @dataclass
@@ -106,7 +108,7 @@ def _run_one_item(
     try:
         for run_index in range(1, runs + 1):
             t0 = time.perf_counter()
-            chat_result = backend.ask(item.question)
+            chat_result = backend.ask(item)
             evaluation = evaluator.evaluate(item, chat_result)
             latency = time.perf_counter() - t0
             report.runs += 1
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/summary/__init__.py b/packages/gooddata-eval/src/gooddata_eval/core/summary/__init__.py
new file mode 100644
index 000000000..efe7c60c8
--- /dev/null
+++ b/packages/gooddata-eval/src/gooddata_eval/core/summary/__init__.py
@@ -0,0 +1 @@
+# (C) 2026 GoodData Corporation
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/summary/http_client.py b/packages/gooddata-eval/src/gooddata_eval/core/summary/http_client.py
new file mode 100644
index 000000000..bab285e04
--- /dev/null
+++ b/packages/gooddata-eval/src/gooddata_eval/core/summary/http_client.py
@@ -0,0 +1,71 @@
+# (C) 2026 GoodData Corporation
+"""HTTP client for the dedicated dashboard-summary endpoint.
+
+Unlike the conversational chat skill, this endpoint executes the AFM for each
+visualization server-side and returns a plain synchronous JSON summary — no SSE
+stream and no client-side ``result_id`` wrangling. The response is adapted into
+a ``ChatResult`` (summary text -> ``text_response``) so the existing
+LLM-as-judge evaluators can score it unchanged.
+
+Endpoint (gen-ai service):
+    POST /api/v1/ai/workspaces/{workspace_id}/summary
+
+If the route is ever renamed (e.g. to ``/summarize``), change ``_PATH`` only.
+"""
+
+from typing import Any
+
+import httpx
+
+from gooddata_eval.core.models import ChatResult, DatasetItem
+
+_PATH = "/api/v1/ai/workspaces/{workspace_id}/summary"
+
+# Optional SummaryInput attribute -> request-body field name, in send order.
+_OPTIONAL_FIELDS = (
+    ("visualizations", "visualizations"),
+    ("filter_context", "filterContext"),
+    ("tab_id", "tabId"),
+    ("format_hint", "formatHint"),
+)
+
+
+class SummaryClient:
+    """Single-shot client for the dashboard-summary endpoint."""
+
+    def __init__(self, host: str, token: str, workspace_id: str, *, timeout: float = 300.0):
+        self._url = f"{host.rstrip('/')}{_PATH.format(workspace_id=workspace_id)}"
+        self._auth = {"Authorization": f"Bearer {token}"}
+        self._client = httpx.Client(timeout=timeout)
+
+    def ask(self, item: DatasetItem) -> ChatResult:
+        """Request a summary for one dataset item and adapt it to a ChatResult.
+
+        Raises:
+            ValueError: if the item has no `summary_input`, or the response
+                body is JSON but not an object.
+            httpx.HTTPStatusError: if the endpoint answers with an error status.
+        """
+        si = item.summary_input
+        if si is None:
+            raise ValueError(f"dashboard_summary item '{item.id}' is missing required 'summary_input'.")
+
+        body: dict[str, Any] = {"dashboardId": si.dashboard_id}
+        for attr, field in _OPTIONAL_FIELDS:
+            value = getattr(si, attr)
+            # Unset (None) optionals are left out of the request body.
+            if value is not None:
+                body[field] = value
+
+        resp = self._client.post(self._url, json=body, headers={**self._auth, "Content-Type": "application/json"})
+        resp.raise_for_status()
+        data = resp.json()
+        if not isinstance(data, dict):
+            raise ValueError(f"summary endpoint returned a JSON {type(data).__name__}, expected an object.")
+        # A missing, null or empty `summary` is passed on as empty text.
+        summary = data.get("summary") or ""
+        return ChatResult.model_validate({"textResponse": summary})
+
+    def close(self) -> None:
+        """Close the underlying HTTP client."""
+        self._client.close()
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/workspace.py b/packages/gooddata-eval/src/gooddata_eval/core/workspace.py
index e849c0aea..35b3ee132 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/workspace.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/workspace.py
@@ -4,9 +4,13 @@
 from dataclasses import dataclass, field
 
 import httpx
-from gooddata_api_client.exceptions import ApiException, NotFoundException
+from gooddata_api_client.exceptions import ApiException
 from gooddata_sdk import CatalogWorkspaceSetting, GoodDataSdk
 
+# Default id used only when creating the setting for the first time. The backend
+# enforces a single setting per *type* (ACTIVE_LLM_PROVIDER), and an existing
+# setting may have been created with any id (e.g. by the UI), so reads and
+# updates must locate it by type rather than by this id.
 _SETTING_ID = "activeLlmProvider"
 _SETTING_TYPE = "ACTIVE_LLM_PROVIDER"
 
@@ -142,10 +146,16 @@ def __init__(self, host: str, token: str, workspace_id: str):
         self._workspace_id = workspace_id
         self._sdk = GoodDataSdk.create(host, token)
 
+    def _active_setting(self) -> CatalogWorkspaceSetting | None:
+        """Return the first workspace setting of type ACTIVE_LLM_PROVIDER, or None."""
+        # Matched by type rather than id, because the id of the setting may vary.
+        settings = self._sdk.catalog_workspace.list_workspace_settings(self._workspace_id)
+        matches = (candidate for candidate in settings if candidate.setting_type == _SETTING_TYPE)
+        return next(matches, None)
+
     def get_active(self) -> ActiveLlmProvider | None:
-        try:
-            setting = self._sdk.catalog_workspace.get_workspace_setting(self._workspace_id, _SETTING_ID)
-        except NotFoundException:
+        setting = self._active_setting()
+        if setting is None:
             return None
         content = setting.content or {}
         return ActiveLlmProvider(
@@ -176,8 +186,12 @@ def all_provider_models(self) -> dict[str, list[str]]:
         }
 
     def activate(self, provider_id: str, model_id: str) -> None:
+        # Reuse the existing setting's id so create_or_update performs an UPDATE.
+        # Creating a second ACTIVE_LLM_PROVIDER setting (under a different id)
+        # would be rejected by the backend with HTTP 409 (one per type).
+        existing = self._active_setting()
         setting = CatalogWorkspaceSetting(
-            id=_SETTING_ID,
+            id=existing.id if existing is not None else _SETTING_ID,
             setting_type=_SETTING_TYPE,
             content=active_provider_content(provider_id, model_id),
         )
@@ -195,6 +209,7 @@ def resolve_and_activate(self, requested_model: str | None, requested_provider:
         if requested_model is None and requested_provider is None:
             provider_id, model_id = resolve_model(None, active)
             provider_name = ""
+            provider_type = ""
         else:
             info = self._provider_info()
             providers_models = {
diff --git a/packages/gooddata-eval/tests/test_runner.py b/packages/gooddata-eval/tests/test_runner.py
index ede002043..1ffbe45bd 100644
--- a/packages/gooddata-eval/tests/test_runner.py
+++ b/packages/gooddata-eval/tests/test_runner.py
@@ -32,7 +32,7 @@ def __init__(self, results):
         self._results = results
         self.calls = 0
 
-    def ask(self, question: str) -> ChatResult:
+    def ask(self, item: DatasetItem) -> ChatResult:
         r = self._results[min(self.calls, len(self._results) - 1)]
         self.calls += 1
         return r
@@ -66,7 +66,7 @@ def test_run_items_marks_unsupported_test_kind_skipped():
 
 def test_run_items_records_agent_error_without_passing():
     class _BoomBackend:
-        def ask(self, question: str) -> ChatResult:
+        def ask(self, item: DatasetItem) -> ChatResult:
             raise RuntimeError("network down")
 
     report = run_items([_item()], _BoomBackend(), runs=1)
@@ -109,7 +109,15 @@ def test_run_items_reports_latency_and_per_run_callback():
 
 
 def test_run_items_routes_all_supported_kinds():
-    expected_kinds = {"visualization", "metric_skill", "alert_skill", "search_tool", "general_question", "guardrail"}
+    expected_kinds = {
+        "visualization",
+        "metric_skill",
+        "alert_skill",
+        "search_tool",
+        "general_question",
+        "guardrail",
+        "dashboard_summary",
+    }
     assert expected_kinds == supported_test_kinds()
 
 
diff --git a/packages/gooddata-eval/tests/test_summary_client.py b/packages/gooddata-eval/tests/test_summary_client.py
new file mode 100644
index 000000000..637e03aed
--- /dev/null
+++ b/packages/gooddata-eval/tests/test_summary_client.py
@@ -0,0 +1,98 @@
+# (C) 2026 GoodData Corporation
+import json
+from collections.abc import Callable, Iterator
+from contextlib import contextmanager
+
+import httpx
+import pytest
+from gooddata_eval.core.models import DatasetItem, SummaryInput
+from gooddata_eval.core.summary.http_client import SummaryClient
+
+
+def _item() -> DatasetItem:
+    return DatasetItem(
+        id="s1",
+        dataset_name="d",
+        test_kind="dashboard_summary",
+        question="Summarize the dashboard",
+        expected_output={},
+        summary_input=SummaryInput(
+            dashboard_id="dash1",
+            visualizations=["v1", "v2"],
+            format_hint="3 bullets",
+        ),
+    )
+
+
+@contextmanager
+def _summary_client(handler: Callable[[httpx.Request], httpx.Response] | None = None) -> Iterator[SummaryClient]:
+    """Yield a SummaryClient and close it afterwards.
+
+    With a `handler`, the client's HTTP traffic is answered by an
+    `httpx.MockTransport`; the `httpx.Client` created by the constructor is
+    closed before it is replaced so no open client is left behind.
+    """
+    client = SummaryClient(host="https://h", token="tok", workspace_id="ws")
+    if handler is not None:
+        client._client.close()
+        client._client = httpx.Client(transport=httpx.MockTransport(handler))
+    try:
+        yield client
+    finally:
+        client.close()
+
+
+def test_summary_client_posts_request_and_maps_summary():
+    captured: dict = {}
+
+    def handler(request: httpx.Request) -> httpx.Response:
+        captured["url"] = str(request.url)
+        captured["body"] = json.loads(request.content)
+        captured["auth"] = request.headers.get("authorization")
+        return httpx.Response(200, json={"summary": "Revenue grew QoQ.", "filterContext": []})
+
+    with _summary_client(handler) as client:
+        result = client.ask(_item())
+
+    assert result.text_response == "Revenue grew QoQ."
+    assert captured["url"] == "https://h/api/v1/ai/workspaces/ws/summary"
+    assert captured["auth"] == "Bearer tok"
+    assert captured["body"] == {
+        "dashboardId": "dash1",
+        "visualizations": ["v1", "v2"],
+        "formatHint": "3 bullets",
+    }
+
+
+def test_summary_client_omits_unset_optional_fields():
+    captured: dict = {}
+
+    def handler(request: httpx.Request) -> httpx.Response:
+        captured["body"] = json.loads(request.content)
+        return httpx.Response(200, json={"summary": "ok"})
+
+    item = DatasetItem(
+        id="s2",
+        dataset_name="d",
+        test_kind="dashboard_summary",
+        question="q",
+        expected_output={},
+        summary_input=SummaryInput(dashboard_id="only-dashboard"),
+    )
+    with _summary_client(handler) as client:
+        client.ask(item)
+    assert captured["body"] == {"dashboardId": "only-dashboard"}
+
+
+def test_summary_client_raises_without_summary_input():
+    item = DatasetItem(id="s3", dataset_name="d", test_kind="dashboard_summary", question="q", expected_output={})
+    with _summary_client() as client, pytest.raises(ValueError, match="summary_input"):
+        client.ask(item)
+
+
+def test_summary_client_raises_on_http_error():
+    def handler(request: httpx.Request) -> httpx.Response:
+        return httpx.Response(404, json={"detail": "dashboard not found"})
+
+    with _summary_client(handler) as client, pytest.raises(httpx.HTTPStatusError):
+        client.ask(_item())
diff --git a/packages/gooddata-eval/tests/test_summary_evaluator.py b/packages/gooddata-eval/tests/test_summary_evaluator.py
new file mode 100644
index 000000000..6056047e0
--- /dev/null
+++ b/packages/gooddata-eval/tests/test_summary_evaluator.py
@@ -0,0 +1,87 @@
+# (C) 2026 GoodData Corporation
+from unittest.mock import MagicMock, patch
+
+from gooddata_eval.core.evaluators.summary import DashboardSummaryEvaluator
+from gooddata_eval.core.models import ChatResult, DatasetItem
+
+
+def _make_evaluator() -> DashboardSummaryEvaluator:
+    with patch("openai.OpenAI"), patch.dict("os.environ", {"OPENAI_API_KEY": "sk-test"}):
+        return DashboardSummaryEvaluator()  # both patches are active only during construction
+
+
+def _item(expected_output) -> DatasetItem:
+    # Dashboard-summary fixture: every field is fixed except the expected output under test.
+    fixed_fields = {"id": "s1", "dataset_name": "d", "test_kind": "dashboard_summary"}
+    return DatasetItem(
+        question="Summarize the dashboard",
+        expected_output=expected_output,
+        **fixed_fields,
+    )
+
+
+def _chat(text: str = "Revenue grew QoQ; West is the top region.") -> ChatResult:
+    return ChatResult.model_validate({"textResponse": text})  # `text` goes in under the "textResponse" key
+
+
+def test_passes_when_all_criteria_satisfied():
+    evaluator = _make_evaluator()
+    evaluator._positive_judge.score = MagicMock(return_value=(True, "ok"))
+    evaluator._violation_judge.score = MagicMock(return_value=(False, "characteristic absent"))
+
+    # Two must_include + one must_not_include + one rubric entry = four boolean checks.
+    expectations = {"must_include": ["a", "b"], "must_not_include": ["x"], "rubric": ["r"]}
+    outcome = evaluator.evaluate(_item(expectations), _chat())
+
+    assert outcome.passed is True
+    assert outcome.rank_key == (1, 1.0)  # all four checks hold -> quality 1.0
+
+
+def test_fails_when_must_not_include_violated():
+    evaluator = _make_evaluator()
+    evaluator._positive_judge.score = MagicMock(return_value=(True, "ok"))
+    # The violation judge answers True: the forbidden characteristic is present (avoided=False).
+    evaluator._violation_judge.score = MagicMock(return_value=(True, "has a separate filter section"))
+
+    expectations = {"must_include": ["a"], "must_not_include": ["x"]}
+    outcome = evaluator.evaluate(_item(expectations), _chat())
+
+    # The include check holds but the exclude check does not: 1 of 2 -> quality 0.5.
+    assert outcome.passed is False
+    assert outcome.rank_key == (0, 0.5)
+
+
+def test_fails_when_a_must_include_is_missing():
+    evaluator = _make_evaluator()
+    # Successive positive-judge calls: the first verdict is satisfied, the second is not.
+    evaluator._positive_judge.score = MagicMock(side_effect=[(True, "ok"), (False, "missing")])
+    evaluator._violation_judge.score = MagicMock(return_value=(False, "characteristic absent"))
+
+    outcome = evaluator.evaluate(_item({"must_include": ["a", "b"]}), _chat())
+
+    assert outcome.passed is False
+    assert outcome.rank_key == (0, 0.5)
+
+
+def test_rubric_does_not_gate_pass_but_lowers_quality():
+    evaluator = _make_evaluator()
+    # must_include is judged satisfied; the rubric entry is judged not satisfied.
+    evaluator._positive_judge.score = MagicMock(side_effect=[(True, "ok"), (False, "weak")])
+
+    expectations = {"must_include": ["a"], "rubric": ["nice prose"]}
+    outcome = evaluator.evaluate(_item(expectations), _chat())
+
+    assert outcome.passed is True  # a failed rubric entry alone does not fail the item
+    assert outcome.rank_key == (1, 0.5)  # ...yet the quality score still reflects it
+
+
+def test_non_dict_expected_output_is_single_rubric_criterion():
+    evaluator = _make_evaluator()
+    evaluator._positive_judge.score = MagicMock(return_value=(True, "ok"))
+
+    # expected_output is a plain string here instead of a rubric dict.
+    outcome = evaluator.evaluate(_item("A good summary mentions the overall revenue trend."), _chat())
+
+    assert outcome.passed is True
+    assert outcome.rank_key == (1, 1.0)
+    evaluator._positive_judge.score.assert_called_once()
diff --git a/packages/gooddata-eval/tests/test_workspace.py b/packages/gooddata-eval/tests/test_workspace.py
index fbeb06095..fdeace6df 100644
--- a/packages/gooddata-eval/tests/test_workspace.py
+++ b/packages/gooddata-eval/tests/test_workspace.py
@@ -11,6 +11,7 @@
     resolve_model,
     select_provider_and_model,
 )
+from gooddata_sdk import CatalogWorkspaceSetting
 
 
 def test_active_provider_content_shape():
@@ -111,6 +112,68 @@ def test_resolve_provider_ref_ambiguous_name():
         _resolve_provider_ref("Shared Name", info)
 
 
+def _controller_with_settings(settings: list[CatalogWorkspaceSetting]) -> WorkspaceModelController:
+    # Built via __new__ so __init__ never runs; the mock SDK's settings listing returns `settings`.
+    stub = WorkspaceModelController.__new__(WorkspaceModelController)
+    stub._workspace_id = "demo"
+    stub._sdk = MagicMock(**{"catalog_workspace.list_workspace_settings.return_value": settings})
+    return stub
+
+
+def _setting(setting_id: str, setting_type: str, content: dict) -> CatalogWorkspaceSetting:
+    return CatalogWorkspaceSetting(id=setting_id, setting_type=setting_type, content=content)  # test shorthand
+
+
+def test_get_active_finds_setting_by_type_regardless_of_id():
+    # The active-provider setting sits under a non-default id (e.g. a UI-generated one),
+    # next to an unrelated setting of another type.
+    unrelated = _setting("some-other-setting", "OTHER_TYPE", {})
+    active_setting = _setting("uuid-1234", "ACTIVE_LLM_PROVIDER", {"id": "prov_1", "defaultModelId": "gpt-5.2"})
+    controller = _controller_with_settings([unrelated, active_setting])
+
+    found = controller.get_active()
+
+    assert found == ActiveLlmProvider(provider_id="prov_1", default_model_id="gpt-5.2")
+
+
+def test_get_active_returns_none_when_no_setting_of_type():
+    # The only setting present is of an unrelated type.
+    assert _controller_with_settings([_setting("x", "OTHER_TYPE", {})]).get_active() is None
+
+
+def test_activate_updates_existing_setting_using_its_real_id():
+    existing = _setting("uuid-1234", "ACTIVE_LLM_PROVIDER", {"id": "prov_1", "defaultModelId": "gpt-5.2"})
+    controller = _controller_with_settings([existing])
+
+    controller.activate("prov_2", "gpt-4o")
+    # The written setting is the second of exactly two positional arguments of the SDK call.
+    (_, written), _ = controller._sdk.catalog_workspace.create_or_update_workspace_setting.call_args
+    assert written.id == "uuid-1234"  # the existing id is reused -> UPDATE, no 409
+    assert written.content == active_provider_content("prov_2", "gpt-4o")
+
+
+def test_activate_creates_with_default_id_when_absent():
+    # With no settings listed at all, the setting is written under the default id.
+    controller = _controller_with_settings([])
+    controller.activate("prov_1", "gpt-5.2")
+    (_, written), _ = controller._sdk.catalog_workspace.create_or_update_workspace_setting.call_args
+    assert written.id == "activeLlmProvider"
+
+
+def test_resolve_and_activate_default_path_sets_empty_provider_type():
+    # Regression guard: with neither --model nor --provider given, the default branch
+    # used to leave provider_type unbound (UnboundLocalError); it must be populated.
+    controller = WorkspaceModelController.__new__(WorkspaceModelController)
+    controller._workspace_id = "ws"
+    controller._sdk = MagicMock()
+    controller.get_active = lambda: ActiveLlmProvider(provider_id="prov_1", default_model_id="gpt-5.2")
+    controller.activate = lambda pid, mid: None
+
+    outcome = controller.resolve_and_activate(None, None)
+    assert (outcome.provider_id, outcome.model_id) == ("prov_1", "gpt-5.2")
+    assert (outcome.provider_type, outcome.provider_name) == ("", "")
+
+
 def test_workspace_controller_restore_calls_activate():
     ctrl = WorkspaceModelController.__new__(WorkspaceModelController)
     ctrl._workspace_id = "ws"