diff --git a/packages/gooddata-eval/README.md b/packages/gooddata-eval/README.md index 44733933f..1920747a6 100644 --- a/packages/gooddata-eval/README.md +++ b/packages/gooddata-eval/README.md @@ -162,7 +162,45 @@ A dataset is a folder of `.json` files, one per question: ``` Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`, -`search_tool`, `general_question`, `guardrail`. +`search_tool`, `general_question`, `guardrail`, `dashboard_summary`. + +### `dashboard_summary` items + +Summary items call the dedicated summary endpoint +(`POST /api/v1/ai/workspaces/{ws}/summary`) instead of the chat endpoint, so +they carry an extra `summary_input` block, and the `expected_output` is a +**rubric** rather than an exact answer (summaries are free text): + +```json +{ + "id": "summary-001", + "dataset_name": "summary_pilot", + "test_kind": "dashboard_summary", + "question": "Summarize the Sales Overview dashboard.", + "summary_input": { + "dashboard_id": "sales_overview" + }, + "expected_output": { + "must_include": ["States the overall revenue trend", "Identifies the top segment"], + "must_not_include": ["Numbers or segments not present in the visualizations"], + "rubric": ["Reads as a coherent business summary"] + } +} +``` + +`summary_input` requires only `dashboard_id` (the endpoint summarizes the whole +dashboard). Optional fields narrow the scope: `visualizations` (list of ids), +`filter_context` (AFM filters), `tab_id`, and `format_hint`. + +The `expected_output` rubric: + +- `must_include` — facts a good summary must contain; **all** must pass for the item to pass. +- `must_not_include` — hallucination/accuracy guards; **any** violation fails the item. +- `rubric` — soft quality dimensions; they affect `quality_score` but do not gate pass/fail. + +Each criterion is scored independently by the LLM judge, so `quality_score` +is the fraction of satisfied criteria. 
Runnable examples live in +[`examples/summary_dataset/`](examples/summary_dataset/). ## Supported test kinds @@ -174,6 +212,7 @@ Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`, | `search_tool` | `search_objects` tool call (correct function called = pass; correct arguments = quality score) | — | | `general_question` | Text answer judged by LLM | `[llm-judge]` | | `guardrail` | Refusal/redirect (visualization response auto-fails) | `[llm-judge]` | +| `dashboard_summary` | Dashboard summary (via `/summary` endpoint) scored against a rubric by LLM | `[llm-judge]` | ## Optional extras diff --git a/packages/gooddata-eval/examples/summary_dataset/summary_format_hint_brief.json b/packages/gooddata-eval/examples/summary_dataset/summary_format_hint_brief.json new file mode 100644 index 000000000..b1fcbf58d --- /dev/null +++ b/packages/gooddata-eval/examples/summary_dataset/summary_format_hint_brief.json @@ -0,0 +1,26 @@ +{ + "id": "summary-format-hint-brief", + "dataset_name": "summary_pilot", + "question": "Give a brief executive summary of the Top 10 Products.", + "test_kind": "dashboard_summary", + "summary_input": { + "dashboard_id": "b2f2d436-9831-4fe0-81df-8c59fd33242b", + "visualizations": ["top_10_products"], + "format_hint": "A brief executive summary: at most 3 sentences, no headings or bullet points." + }, + "expected_output": { + "must_include": [ + "Cites specific numeric values from the Top 10 Products data" + ], + "must_not_include": [ + "Uses section headings (such as 'Summary' or 'Key Insights') or bullet/numbered lists", + "Reports specific data or findings about visualizations other than Top 10 Products" + ], + "rubric": [ + "Respects the requested brevity (roughly three sentences or fewer)", + "Conveys the main product or category concentration insight (e.g. 
Outdoor or Neptide dominance)", + "Notes that only the Top 10 Products visualization was analyzed", + "Reads as fluent prose rather than a list of raw values" + ] + } +} diff --git a/packages/gooddata-eval/examples/summary_dataset/summary_full_dashboard.json b/packages/gooddata-eval/examples/summary_dataset/summary_full_dashboard.json new file mode 100644 index 000000000..916bb4a8b --- /dev/null +++ b/packages/gooddata-eval/examples/summary_dataset/summary_full_dashboard.json @@ -0,0 +1,27 @@ +{ + "id": "summary-full-dashboard", + "dataset_name": "summary_pilot", + "test_kind": "dashboard_summary", + "question": "Summarize the All visualizations 1 with custom filter dashboard.", + "summary_input": { + "dashboard_id": "b2f2d436-9831-4fe0-81df-8c59fd33242b" + }, + "expected_output": { + "must_include": [ + "Includes a Summary, Key Insights, and Areas to Watch section", + "Cites specific numeric values from the dashboard data" + ], + "must_not_include": [ + "Includes a markdown heading dedicated to filters (a line such as '## Filter Context' or '### Filters'), separate from the Summary, Key Insights, and Areas to Watch sections. Filters mentioned within the prose of those sections do NOT count." + ], + "rubric": [ + "The Summary section captures the overall state across the dashboard", + "Key Insights call out top and/or bottom performers and their relative contribution (shares or percentages)", + "Key Insights describe a direction of change over time (growing, declining, or flat) for at least one metric", + "Areas to Watch highlights at least one risk, decline, or concentration issue and explains why it matters for business decisions", + "Findings reference the active filter context (e.g. 
the time window or selected segments) in the wording", + "Synthesizes insights across multiple visualizations rather than describing each one in isolation", + "Reads as a cohesive business narrative connecting related metrics, not a list of raw values" + ] + } +} diff --git a/packages/gooddata-eval/examples/summary_dataset/summary_selected_visualizations.json b/packages/gooddata-eval/examples/summary_dataset/summary_selected_visualizations.json new file mode 100644 index 000000000..01433a74b --- /dev/null +++ b/packages/gooddata-eval/examples/summary_dataset/summary_selected_visualizations.json @@ -0,0 +1,26 @@ +{ + "id": "summary-selected-visualizations", + "dataset_name": "summary_pilot", + "test_kind": "dashboard_summary", + "question": "Summarize only the Top 10 Customers and Top 10 Products on the dashboard.", + "summary_input": { + "dashboard_id": "b2f2d436-9831-4fe0-81df-8c59fd33242b", + "visualizations": ["top_10_customers", "top_10_products"] + }, + "expected_output": { + "must_include": [ + "Includes a Summary, Key Insights, and Areas to Watch section", + "Cites specific numeric values for both the customers and the products data" + ], + "must_not_include": [ + "Reports specific data or findings about dashboard visualizations other than Top 10 Customers and Top 10 Products" + ], + "rubric": [ + "Identifies the top customer (highest-revenue account) with their revenue or contribution", + "Identifies the top product with its revenue and share", + "Highlights product or category concentration (e.g. 
a single product or category dominating)", + "Notes that only the two requested visualizations were analyzed and other dashboard data was not included", + "Reads as a cohesive business narrative connecting customers and products, not a list of raw values" + ] + } +} diff --git a/packages/gooddata-eval/src/gooddata_eval/cli/main.py b/packages/gooddata-eval/src/gooddata_eval/cli/main.py index ff041f18b..0ba08be9c 100644 --- a/packages/gooddata-eval/src/gooddata_eval/cli/main.py +++ b/packages/gooddata-eval/src/gooddata_eval/cli/main.py @@ -16,14 +16,38 @@ from gooddata_eval.core.connection import ConnectionError_, resolve_connection from gooddata_eval.core.dataset.local import load_local_dataset from gooddata_eval.core.langfuse.sink import LangfuseSink -from gooddata_eval.core.models import DatasetItem +from gooddata_eval.core.models import ChatResult, DatasetItem from gooddata_eval.core.reporting.console import render_comparison, render_console from gooddata_eval.core.reporting.json_report import write_multi_model_report from gooddata_eval.core.runner import ItemReport, run_items +from gooddata_eval.core.summary.http_client import SummaryClient from gooddata_eval.core.workspace import ModelResolutionError, WorkspaceModelController _EXIT_OK = 0 _EXIT_OPERATIONAL_ERROR = 2 +_SUMMARY_TEST_KIND = "dashboard_summary" + + +class _RoutingBackend: + """Dispatch each item to the right backend by test_kind. + + `dashboard_summary` items go to the dedicated summary endpoint; everything + else uses the conversational chat endpoint. 
+ """ + + def __init__(self, chat: ChatClient, summary: SummaryClient): + self._chat = chat + self._summary = summary + + def ask(self, item: DatasetItem) -> ChatResult: + if item.test_kind == _SUMMARY_TEST_KIND: + return self._summary.ask(item) + return self._chat.ask(item) + + def close(self) -> None: + for backend in (self._chat, self._summary): + if hasattr(backend, "close"): + backend.close() def _build_parser() -> argparse.ArgumentParser: @@ -256,7 +280,10 @@ def on_langfuse_item_done( ) -> None: _sink.log_item(report, dataset_item_id=report.id) - backend = ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id) + backend = _RoutingBackend( + ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id), + SummaryClient(host=config.host, token=config.token, workspace_id=config.workspace_id), + ) try: report = run_items( items, diff --git a/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py b/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py index d36e59440..6548a4ee8 100644 --- a/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py +++ b/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py @@ -19,7 +19,7 @@ import httpx -from gooddata_eval.core.models import ChatResult +from gooddata_eval.core.models import ChatResult, DatasetItem SSE_DATA_PREFIX = "data: " @@ -169,11 +169,11 @@ def _send_message(self, conversation_id: str, question: str) -> ChatResult: resp.raise_for_status() return parse_sse_lines(resp.iter_lines()) - def ask(self, question: str) -> ChatResult: + def ask(self, item: DatasetItem) -> ChatResult: """Run one single-turn conversation: create, send, parse, clean up.""" conversation_id = self._create_conversation() try: - return self._send_message(conversation_id, question) + return self._send_message(conversation_id, item.question) finally: self._delete_conversation(conversation_id) diff --git 
a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/__init__.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/__init__.py index 4da18f876..f2b71955a 100644 --- a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/__init__.py +++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/__init__.py @@ -20,15 +20,18 @@ ) } -# LLM-judge evaluators (general_question, guardrail) require the [llm-judge] extra. -# Their modules are imported lazily on first use so the CLI starts without openai. +# LLM-judge evaluators (general_question, guardrail, dashboard_summary) require the +# [llm-judge] extra. Their modules are imported lazily on first use so the CLI +# starts without openai. _LAZY_EVALUATOR_MODULES: dict[str, str] = { "general_question": "gooddata_eval.core.evaluators.general_question", "guardrail": "gooddata_eval.core.evaluators.guardrail", + "dashboard_summary": "gooddata_eval.core.evaluators.summary", } _LAZY_EVALUATOR_CLASSES: dict[str, str] = { "general_question": "GeneralQuestionEvaluator", "guardrail": "GuardrailEvaluator", + "dashboard_summary": "DashboardSummaryEvaluator", } diff --git a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/summary.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/summary.py new file mode 100644 index 000000000..b0af2ce42 --- /dev/null +++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/summary.py @@ -0,0 +1,96 @@ +# (C) 2026 GoodData Corporation +"""Evaluator for dashboard_summary: rubric-based LLM-as-judge scoring. + +Summaries are free text, so we do not match strings. 
Instead, `expected_output` +is a rubric of checkable criteria: + + { + "must_include": ["...facts a good summary must contain..."], + "must_not_include": ["...things a good summary must avoid (hallucinations)..."], + "rubric": ["...soft quality dimensions..."] + } + +Each criterion is scored independently by the judge (True/False), so the +runner's `quality_score` becomes the fraction of satisfied criteria. The item +*passes* only when every `must_include` is satisfied and no `must_not_include` +is violated; `rubric` items contribute to quality but do not gate pass/fail. + +As a fallback, a non-dict `expected_output` (or a dict that defines no criteria) +is treated as a single gating `must_include` criterion (same pass/fail +behaviour as `general_question`). +""" + +from typing import Any + +from gooddata_eval.core.evaluators._llm_judge import LLMJudge +from gooddata_eval.core.evaluators._text_utils import extract_text +from gooddata_eval.core.evaluators.base import ItemEvaluation +from gooddata_eval.core.models import ChatResult, DatasetItem + +_POSITIVE_STEPS = [ + "Read the INPUT (the user's request) and the EXPECTED OUTPUT (one criterion the summary must satisfy).", + "Read the ACTUAL OUTPUT (the generated summary).", + "Score 1 if the actual output clearly satisfies the criterion (allow paraphrasing and reasonable numeric tolerance).", + "Score 0 if the criterion is missing, contradicted, or only partially addressed.", +] + +# For must_not_include we ask the judge a plain presence question and invert the +# result in code. Scoring "does the summary AVOID X?" via a field labelled +# EXPECTED OUTPUT is unreliable: the model reads the forbidden behaviour as +# desired and flips the verdict. Detecting presence (no negation, no +# contradictory label) is far more robust.
class DashboardSummaryEvaluator:
    """Rubric-based LLM-as-judge evaluator for `dashboard_summary` items.

    `expected_output` is split into three groups of criteria: `must_include`
    and `must_not_include` gate pass/fail, while `rubric` only contributes to
    the quality score. Every criterion is judged independently.
    """

    test_kind = "dashboard_summary"

    def __init__(self):
        # Two judges with different instructions: one checks that a criterion is
        # satisfied, the other detects the presence of a forbidden characteristic.
        self._positive_judge = LLMJudge(evaluation_steps=_POSITIVE_STEPS)
        self._violation_judge = LLMJudge(evaluation_steps=_VIOLATION_STEPS)

    @staticmethod
    def _as_criteria(value: Any) -> list[str]:
        """Normalise one criteria group to a list of strings.

        `None` becomes an empty list and a bare string (or any non-iterable
        scalar) becomes a single criterion. Without this, a string would be
        iterated character by character, producing one judge call per
        character, and `None` would raise `TypeError`.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        try:
            members = iter(value)
        except TypeError:
            return [str(value)]
        return [str(member) for member in members]

    @staticmethod
    def _criteria(expected_output: Any) -> tuple[list[str], list[str], list[str]]:
        """Split `expected_output` into (must_include, must_not_include, rubric).

        Returns:
            The three criteria lists. When `expected_output` is not a dict, or
            is a dict that yields no criteria at all, the whole value is
            stringified and returned as the only `must_include` criterion.
        """
        if isinstance(expected_output, dict):
            normalise = DashboardSummaryEvaluator._as_criteria
            must_include = normalise(expected_output.get("must_include"))
            must_not_include = normalise(expected_output.get("must_not_include"))
            rubric = normalise(expected_output.get("rubric"))
            if must_include or must_not_include or rubric:
                return must_include, must_not_include, rubric
        # Fallback: treat the whole expected_output as a single gating criterion
        # (same pass/fail semantics as general_question).
        return [str(expected_output)], [], []

    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
        """Judge the generated summary against every criterion of `item`.

        Returns:
            An `ItemEvaluation` whose `passed` is True only when every
            `must_include` check succeeded and no `must_not_include`
            characteristic was detected. `rank_key` is
            `(int(passed), quality)`, where quality is the fraction of all
            checks (including `rubric`) that succeeded. `detail` holds the
            summary text plus one verdict and one reason per criterion under
            the `include_N`, `exclude_N` and `rubric_N` keys.
        """
        actual = extract_text(chat_result)
        must_include, must_not_include, rubric = self._criteria(item.expected_output)

        detail: dict[str, Any] = {"actual_output": actual}
        # Verdicts are tracked explicitly so the quality score does not depend
        # on which other values in `detail` happen to be booleans.
        checks: list[bool] = []
        passed = True

        for i, criterion in enumerate(must_include):
            verdict, reason = self._positive_judge.score(item.question, criterion, actual)
            ok = bool(verdict)  # presumably already a bool; coerced defensively
            detail[f"include_{i}"] = ok
            detail[f"include_{i}_reason"] = reason
            checks.append(ok)
            passed = passed and ok

        for i, criterion in enumerate(must_not_include):
            violated, reason = self._violation_judge.score(item.question, criterion, actual)
            ok = not violated  # True == characteristic absent == correctly avoided
            detail[f"exclude_{i}"] = ok
            detail[f"exclude_{i}_reason"] = reason
            checks.append(ok)
            passed = passed and ok

        # Rubric criteria affect quality only; they never change `passed`.
        for i, criterion in enumerate(rubric):
            verdict, reason = self._positive_judge.score(item.question, criterion, actual)
            ok = bool(verdict)
            detail[f"rubric_{i}"] = ok
            detail[f"rubric_{i}_reason"] = reason
            checks.append(ok)

        quality = sum(checks) / len(checks) if checks else 0.0

        return ItemEvaluation(passed=passed, rank_key=(int(passed), quality), detail=detail)
Authored in snake_case in the + dataset; the SummaryClient maps it to the endpoint's camelCase fields. + """ + + model_config = ConfigDict(extra="ignore") + + dashboard_id: str + visualizations: list[str] | None = None + filter_context: list[dict] | None = None + tab_id: str | None = None + format_hint: str | None = None + + class DatasetItem(BaseModel): """Common dataset envelope. `expected_output` stays raw; each evaluator parses its own shape.""" @@ -95,3 +112,5 @@ class DatasetItem(BaseModel): test_kind: str question: str expected_output: Any + # Only used by the `dashboard_summary` test kind; ignored by all others. + summary_input: SummaryInput | None = None diff --git a/packages/gooddata-eval/src/gooddata_eval/core/reporting/console.py b/packages/gooddata-eval/src/gooddata_eval/core/reporting/console.py index 316490a84..d2c79fccd 100644 --- a/packages/gooddata-eval/src/gooddata_eval/core/reporting/console.py +++ b/packages/gooddata-eval/src/gooddata_eval/core/reporting/console.py @@ -32,13 +32,11 @@ def render_console(report: EvalReport, *, console: Console | None = None) -> str elif item.pass_at_k: result, notes = "PASS", "" else: - d = item.best_detail - failing = [ - k - for k in ("metrics_correct", "dimensions_correct", "filters_correct", "viz_type_hard") - if d.get(k) is False - ] - notes = "failed: " + ", ".join(failing) if failing else "no visualization created" + # Evaluator-agnostic: report whichever boolean checks came back False + # (visualization uses metrics_correct/…; dashboard_summary uses + # include_*/exclude_*/rubric_*). Falls back to a generic message. 
+ failing = [k for k, v in item.best_detail.items() if v is False] + notes = "failed: " + ", ".join(failing) if failing else "did not pass strict checks" result = "FAIL" latency = "-" if item.runs == 0 else f"{item.latency_s:.2f}s" avg = "-" if item.runs == 0 else f"{item.avg_latency_s:.2f}s" diff --git a/packages/gooddata-eval/src/gooddata_eval/core/runner.py b/packages/gooddata-eval/src/gooddata_eval/core/runner.py index 7fb0d36d6..087f142f2 100644 --- a/packages/gooddata-eval/src/gooddata_eval/core/runner.py +++ b/packages/gooddata-eval/src/gooddata_eval/core/runner.py @@ -12,7 +12,9 @@ class ChatBackend(Protocol): - def ask(self, question: str) -> ChatResult: ... + # Receives the whole item so backends can use per-item context beyond the + # question text (e.g. dashboard_summary needs item.summary_input). + def ask(self, item: DatasetItem) -> ChatResult: ... @dataclass @@ -106,7 +108,7 @@ def _run_one_item( try: for run_index in range(1, runs + 1): t0 = time.perf_counter() - chat_result = backend.ask(item.question) + chat_result = backend.ask(item) evaluation = evaluator.evaluate(item, chat_result) latency = time.perf_counter() - t0 report.runs += 1 diff --git a/packages/gooddata-eval/src/gooddata_eval/core/summary/__init__.py b/packages/gooddata-eval/src/gooddata_eval/core/summary/__init__.py new file mode 100644 index 000000000..efe7c60c8 --- /dev/null +++ b/packages/gooddata-eval/src/gooddata_eval/core/summary/__init__.py @@ -0,0 +1 @@ +# (C) 2026 GoodData Corporation diff --git a/packages/gooddata-eval/src/gooddata_eval/core/summary/http_client.py b/packages/gooddata-eval/src/gooddata_eval/core/summary/http_client.py new file mode 100644 index 000000000..bab285e04 --- /dev/null +++ b/packages/gooddata-eval/src/gooddata_eval/core/summary/http_client.py @@ -0,0 +1,54 @@ +# (C) 2026 GoodData Corporation +"""HTTP client for the dedicated dashboard-summary endpoint. 
class SummaryClient:
    """Single-shot client for the dashboard-summary endpoint.

    Args:
        host: Base URL of the server; a trailing slash is stripped.
        token: Bearer token sent in the `Authorization` header.
        workspace_id: Workspace id substituted into the endpoint path.
        timeout: Timeout passed to the underlying `httpx.Client`.
    """

    def __init__(self, host: str, token: str, workspace_id: str, *, timeout: float = 300.0):
        self._url = f"{host.rstrip('/')}{_PATH.format(workspace_id=workspace_id)}"
        self._auth = {"Authorization": f"Bearer {token}"}
        self._client = httpx.Client(timeout=timeout)

    def ask(self, item: DatasetItem) -> ChatResult:
        """Request a summary for one dataset item and adapt it to a ChatResult.

        Raises:
            ValueError: If the item has no `summary_input`, or if the endpoint
                responds with JSON that is not an object.
            httpx.HTTPStatusError: If the endpoint answers with an error status
                (raised by `raise_for_status()`).
        """
        si = item.summary_input
        if si is None:
            raise ValueError(f"dashboard_summary item '{item.id}' is missing required 'summary_input'.")

        # snake_case dataset fields -> camelCase request fields. Optional fields
        # are sent only when set, so the endpoint applies its own defaults.
        optional_fields = (
            ("visualizations", si.visualizations),
            ("filterContext", si.filter_context),
            ("tabId", si.tab_id),
            ("formatHint", si.format_hint),
        )
        body: dict = {"dashboardId": si.dashboard_id}
        body.update({key: value for key, value in optional_fields if value is not None})

        resp = self._client.post(self._url, json=body, headers={**self._auth, "Content-Type": "application/json"})
        resp.raise_for_status()
        data = resp.json()
        if not isinstance(data, dict):
            # Fail with a clear message instead of an AttributeError on `.get`.
            raise ValueError(
                f"Summary endpoint returned {type(data).__name__} instead of a JSON object for item '{item.id}'."
            )
        # A missing or null summary is passed on as empty text.
        summary = data.get("summary") or ""
        return ChatResult.model_validate({"textResponse": summary})

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()
def _active_setting(self) -> CatalogWorkspaceSetting | None:
    """Return the first workspace setting whose type is ACTIVE_LLM_PROVIDER.

    The lookup is by `setting_type`, not by id, because the setting's id may
    vary. Returns None when the workspace has no setting of that type.
    """
    workspace_settings = self._sdk.catalog_workspace.list_workspace_settings(self._workspace_id)
    matching = (candidate for candidate in workspace_settings if candidate.setting_type == _SETTING_TYPE)
    return next(matching, None)
+ # Creating a second ACTIVE_LLM_PROVIDER setting (under a different id) + # would be rejected by the backend with HTTP 409 (one per type). + existing = self._active_setting() setting = CatalogWorkspaceSetting( - id=_SETTING_ID, + id=existing.id if existing is not None else _SETTING_ID, setting_type=_SETTING_TYPE, content=active_provider_content(provider_id, model_id), ) @@ -195,6 +209,7 @@ def resolve_and_activate(self, requested_model: str | None, requested_provider: if requested_model is None and requested_provider is None: provider_id, model_id = resolve_model(None, active) provider_name = "" + provider_type = "" else: info = self._provider_info() providers_models = { diff --git a/packages/gooddata-eval/tests/test_runner.py b/packages/gooddata-eval/tests/test_runner.py index ede002043..1ffbe45bd 100644 --- a/packages/gooddata-eval/tests/test_runner.py +++ b/packages/gooddata-eval/tests/test_runner.py @@ -32,7 +32,7 @@ def __init__(self, results): self._results = results self.calls = 0 - def ask(self, question: str) -> ChatResult: + def ask(self, item: DatasetItem) -> ChatResult: r = self._results[min(self.calls, len(self._results) - 1)] self.calls += 1 return r @@ -66,7 +66,7 @@ def test_run_items_marks_unsupported_test_kind_skipped(): def test_run_items_records_agent_error_without_passing(): class _BoomBackend: - def ask(self, question: str) -> ChatResult: + def ask(self, item: DatasetItem) -> ChatResult: raise RuntimeError("network down") report = run_items([_item()], _BoomBackend(), runs=1) @@ -109,7 +109,15 @@ def test_run_items_reports_latency_and_per_run_callback(): def test_run_items_routes_all_supported_kinds(): - expected_kinds = {"visualization", "metric_skill", "alert_skill", "search_tool", "general_question", "guardrail"} + expected_kinds = { + "visualization", + "metric_skill", + "alert_skill", + "search_tool", + "general_question", + "guardrail", + "dashboard_summary", + } assert expected_kinds == supported_test_kinds() diff --git 
a/packages/gooddata-eval/tests/test_summary_client.py b/packages/gooddata-eval/tests/test_summary_client.py new file mode 100644 index 000000000..637e03aed --- /dev/null +++ b/packages/gooddata-eval/tests/test_summary_client.py @@ -0,0 +1,85 @@ +# (C) 2026 GoodData Corporation +import json + +import httpx +import pytest +from gooddata_eval.core.models import DatasetItem, SummaryInput +from gooddata_eval.core.summary.http_client import SummaryClient + + +def _item() -> DatasetItem: + return DatasetItem( + id="s1", + dataset_name="d", + test_kind="dashboard_summary", + question="Summarize the dashboard", + expected_output={}, + summary_input=SummaryInput( + dashboard_id="dash1", + visualizations=["v1", "v2"], + format_hint="3 bullets", + ), + ) + + +def test_summary_client_posts_request_and_maps_summary(): + captured: dict = {} + + def handler(request: httpx.Request) -> httpx.Response: + captured["url"] = str(request.url) + captured["body"] = json.loads(request.content) + captured["auth"] = request.headers.get("authorization") + return httpx.Response(200, json={"summary": "Revenue grew QoQ.", "filterContext": []}) + + client = SummaryClient(host="https://h", token="tok", workspace_id="ws") + client._client = httpx.Client(transport=httpx.MockTransport(handler)) + + result = client.ask(_item()) + + assert result.text_response == "Revenue grew QoQ." 
+ assert captured["url"] == "https://h/api/v1/ai/workspaces/ws/summary" + assert captured["auth"] == "Bearer tok" + assert captured["body"] == { + "dashboardId": "dash1", + "visualizations": ["v1", "v2"], + "formatHint": "3 bullets", + } + + +def test_summary_client_omits_unset_optional_fields(): + captured: dict = {} + + def handler(request: httpx.Request) -> httpx.Response: + captured["body"] = json.loads(request.content) + return httpx.Response(200, json={"summary": "ok"}) + + client = SummaryClient(host="https://h", token="tok", workspace_id="ws") + client._client = httpx.Client(transport=httpx.MockTransport(handler)) + + item = DatasetItem( + id="s2", + dataset_name="d", + test_kind="dashboard_summary", + question="q", + expected_output={}, + summary_input=SummaryInput(dashboard_id="only-dashboard"), + ) + client.ask(item) + assert captured["body"] == {"dashboardId": "only-dashboard"} + + +def test_summary_client_raises_without_summary_input(): + client = SummaryClient(host="https://h", token="tok", workspace_id="ws") + item = DatasetItem(id="s3", dataset_name="d", test_kind="dashboard_summary", question="q", expected_output={}) + with pytest.raises(ValueError, match="summary_input"): + client.ask(item) + + +def test_summary_client_raises_on_http_error(): + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(404, json={"detail": "dashboard not found"}) + + client = SummaryClient(host="https://h", token="tok", workspace_id="ws") + client._client = httpx.Client(transport=httpx.MockTransport(handler)) + with pytest.raises(httpx.HTTPStatusError): + client.ask(_item()) diff --git a/packages/gooddata-eval/tests/test_summary_evaluator.py b/packages/gooddata-eval/tests/test_summary_evaluator.py new file mode 100644 index 000000000..6056047e0 --- /dev/null +++ b/packages/gooddata-eval/tests/test_summary_evaluator.py @@ -0,0 +1,87 @@ +# (C) 2026 GoodData Corporation +from unittest.mock import MagicMock, patch + +from 
gooddata_eval.core.evaluators.summary import DashboardSummaryEvaluator +from gooddata_eval.core.models import ChatResult, DatasetItem + + +def _make_evaluator(): + with patch("openai.OpenAI"), patch.dict("os.environ", {"OPENAI_API_KEY": "sk-test"}): + return DashboardSummaryEvaluator() + + +def _item(expected_output) -> DatasetItem: + return DatasetItem( + id="s1", + dataset_name="d", + test_kind="dashboard_summary", + question="Summarize the dashboard", + expected_output=expected_output, + ) + + +def _chat(text: str = "Revenue grew QoQ; West is the top region.") -> ChatResult: + return ChatResult.model_validate({"textResponse": text}) + + +def test_passes_when_all_criteria_satisfied(): + ev = _make_evaluator() + ev._positive_judge.score = MagicMock(return_value=(True, "ok")) + ev._violation_judge.score = MagicMock(return_value=(False, "characteristic absent")) + + item = _item({"must_include": ["a", "b"], "must_not_include": ["x"], "rubric": ["r"]}) + res = ev.evaluate(item, _chat()) + + assert res.passed is True + # 4 bool checks, all True -> quality 1.0 + assert res.rank_key == (1, 1.0) + + +def test_fails_when_must_not_include_violated(): + ev = _make_evaluator() + ev._positive_judge.score = MagicMock(return_value=(True, "ok")) + # violation judge detects the forbidden characteristic is present -> avoided=False + ev._violation_judge.score = MagicMock(return_value=(True, "has a separate filter section")) + + item = _item({"must_include": ["a"], "must_not_include": ["x"]}) + res = ev.evaluate(item, _chat()) + + assert res.passed is False + # include_0 True, exclude_0 False -> quality 0.5 + assert res.rank_key == (0, 0.5) + + +def test_fails_when_a_must_include_is_missing(): + ev = _make_evaluator() + ev._positive_judge.score = MagicMock(side_effect=[(True, "ok"), (False, "missing")]) + ev._violation_judge.score = MagicMock(return_value=(False, "characteristic absent")) + + item = _item({"must_include": ["a", "b"]}) + res = ev.evaluate(item, _chat()) + + assert 
res.passed is False + assert res.rank_key == (0, 0.5) + + +def test_rubric_does_not_gate_pass_but_lowers_quality(): + ev = _make_evaluator() + # must_include passes; rubric fails. + ev._positive_judge.score = MagicMock(side_effect=[(True, "ok"), (False, "weak")]) + + item = _item({"must_include": ["a"], "rubric": ["nice prose"]}) + res = ev.evaluate(item, _chat()) + + assert res.passed is True # rubric failure does not fail the item + assert res.rank_key == (1, 0.5) # but quality reflects it + + +def test_non_dict_expected_output_is_single_rubric_criterion(): + ev = _make_evaluator() + ev._positive_judge.score = MagicMock(return_value=(True, "ok")) + + item = _item("A good summary mentions the overall revenue trend.") + res = ev.evaluate(item, _chat()) + + assert res.passed is True + assert res.rank_key == (1, 1.0) + ev._positive_judge.score.assert_called_once() diff --git a/packages/gooddata-eval/tests/test_workspace.py b/packages/gooddata-eval/tests/test_workspace.py index fbeb06095..fdeace6df 100644 --- a/packages/gooddata-eval/tests/test_workspace.py +++ b/packages/gooddata-eval/tests/test_workspace.py @@ -11,6 +11,7 @@ resolve_model, select_provider_and_model, ) +from gooddata_sdk import CatalogWorkspaceSetting def test_active_provider_content_shape(): @@ -111,6 +112,68 @@ def test_resolve_provider_ref_ambiguous_name(): _resolve_provider_ref("Shared Name", info) +def _controller_with_settings(settings: list[CatalogWorkspaceSetting]) -> WorkspaceModelController: + controller = WorkspaceModelController.__new__(WorkspaceModelController) + controller._workspace_id = "demo" + controller._sdk = MagicMock() + controller._sdk.catalog_workspace.list_workspace_settings.return_value = settings + return controller + + +def _setting(setting_id: str, setting_type: str, content: dict) -> CatalogWorkspaceSetting: + return CatalogWorkspaceSetting(id=setting_id, setting_type=setting_type, content=content) + + +def test_get_active_finds_setting_by_type_regardless_of_id(): + # 
The setting exists under a non-default id (e.g. a UI-generated one). + controller = _controller_with_settings( + [ + _setting("some-other-setting", "OTHER_TYPE", {}), + _setting("uuid-1234", "ACTIVE_LLM_PROVIDER", {"id": "prov_1", "defaultModelId": "gpt-5.2"}), + ] + ) + active = controller.get_active() + assert active == ActiveLlmProvider(provider_id="prov_1", default_model_id="gpt-5.2") + + +def test_get_active_returns_none_when_no_setting_of_type(): + controller = _controller_with_settings([_setting("x", "OTHER_TYPE", {})]) + assert controller.get_active() is None + + +def test_activate_updates_existing_setting_using_its_real_id(): + controller = _controller_with_settings( + [_setting("uuid-1234", "ACTIVE_LLM_PROVIDER", {"id": "prov_1", "defaultModelId": "gpt-5.2"})] + ) + controller.activate("prov_2", "gpt-4o") + args, _ = controller._sdk.catalog_workspace.create_or_update_workspace_setting.call_args + _, written = args + assert written.id == "uuid-1234" # reuses existing id -> UPDATE, no 409 + assert written.content == active_provider_content("prov_2", "gpt-4o") + + +def test_activate_creates_with_default_id_when_absent(): + controller = _controller_with_settings([]) + controller.activate("prov_1", "gpt-5.2") + args, _ = controller._sdk.catalog_workspace.create_or_update_workspace_setting.call_args + _, written = args + assert written.id == "activeLlmProvider" + + +def test_resolve_and_activate_default_path_sets_empty_provider_type(): + # No --model/--provider: the default branch must still populate provider_type + # (regression: it was left unbound -> UnboundLocalError). 
+ ctrl = WorkspaceModelController.__new__(WorkspaceModelController) + ctrl._workspace_id = "ws" + ctrl._sdk = MagicMock() + ctrl.get_active = lambda: ActiveLlmProvider(provider_id="prov_1", default_model_id="gpt-5.2") + ctrl.activate = lambda pid, mid: None + resolved = ctrl.resolve_and_activate(None, None) + assert (resolved.provider_id, resolved.model_id) == ("prov_1", "gpt-5.2") + assert resolved.provider_type == "" + assert resolved.provider_name == "" + + def test_workspace_controller_restore_calls_activate(): ctrl = WorkspaceModelController.__new__(WorkspaceModelController) ctrl._workspace_id = "ws"