diff --git a/python/packages/core/agent_framework/__init__.py b/python/packages/core/agent_framework/__init__.py index cc517d993e..bd8b66cbb1 100644 --- a/python/packages/core/agent_framework/__init__.py +++ b/python/packages/core/agent_framework/__init__.py @@ -71,6 +71,7 @@ from ._evaluation import ( Evaluator, ExpectedToolCall, LocalEvaluator, + RubricScore, evaluate_agent, evaluate_workflow, evaluator, @@ -460,6 +461,7 @@ __all__ = [ "ResponseStream", "Role", "RoleLiteral", + "RubricScore", "RunContext", "Runner", "RunnerContext", diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py index 64fab0eacb..52bdf90d0f 100644 --- a/python/packages/core/agent_framework/_evaluation.py +++ b/python/packages/core/agent_framework/_evaluation.py @@ -311,12 +311,15 @@ class EvalScoreResult: score: Numeric score from the evaluator. passed: Whether the item passed this evaluator's threshold. sample: Optional raw evaluator output (rationale, metadata). + dimensions: Per-dimension scores when this evaluator is a rubric + evaluator. ``None`` for non-rubric (e.g. built-in) evaluators. """ name: str score: float passed: bool | None = None sample: dict[str, Any] | None = None + dimensions: list[RubricScore] | None = None @experimental(feature_id=ExperimentalFeature.EVALS) @@ -496,6 +499,179 @@ class EvalResults: detail += f" Errored items: {', '.join(summaries)}." raise EvalNotPassedError(detail) + def assert_score_at_least( + self, + min_score: float, + *, + evaluator: str | None = None, + msg: str | None = None, + ) -> None: + """Assert every item's score (optionally filtered by evaluator) is ``>= min_score``. + + Designed for CI gates on generated rubric evaluators (e.g. + ``results.assert_score_at_least(0.80)``). Includes any + sub-results from workflow evaluations. + + Args: + min_score: Minimum acceptable score (inclusive). + evaluator: When set, only check scores from the evaluator + whose ``EvalScoreResult.name`` matches. + msg: Optional custom failure message. + + Raises: + EvalNotPassedError: When any matching score is below the threshold. + """ + offenders: list[str] = [] + + def _check(results: EvalResults) -> None: + for item in results.items: + for score in item.scores: + if evaluator is not None and score.name != evaluator: + continue + if score.score < min_score: + offenders.append(f"{item.item_id}/{score.name}={score.score:.3f}") + for sub in results.sub_results.values(): + _check(sub) + + _check(self) + if offenders: + detail = msg or ( + f"{len(offenders)} score(s) below threshold {min_score}" + f"{' for ' + evaluator if evaluator else ''}: {', '.join(offenders[:5])}" + + (f" (+{len(offenders) - 5} more)" if len(offenders) > 5 else "") + ) + raise EvalNotPassedError(detail) + + def assert_dimension_score_at_least( + self, + dimension_id: str, + min_score: float, + *, + evaluator: str | None = None, + require_applicable: bool = False, + msg: str | None = None, + ) -> None: + """Assert every item's score for a rubric *dimension* is ``>= min_score``. + + Walks ``EvalScoreResult.dimensions`` looking for the named + dimension across all items (and sub-results). Non-applicable + dimensions are skipped by default; pass + ``require_applicable=True`` to fail when no applicable score is + produced. + + Args: + dimension_id: Dimension id (matches the rubric definition). + min_score: Minimum acceptable dimension score (inclusive). + evaluator: When set, only consider scores from the evaluator + whose ``EvalScoreResult.name`` matches. + require_applicable: When ``True``, missing or non-applicable + dimension scores raise. Defaults to ``False`` (skip). + msg: Optional custom failure message. + + Raises: + EvalNotPassedError: When the dimension fails the threshold. + """ + offenders: list[str] = [] + missing_items: list[str] = [] + + def _check(results: EvalResults) -> None: + for item in results.items: + found_applicable = False + for score in item.scores: + if evaluator is not None and score.name != evaluator: + continue + if not score.dimensions: + continue + for rs in score.dimensions: + if rs.id != dimension_id: + continue + if not rs.applicable: + continue + found_applicable = True + if rs.score is None or rs.score < min_score: + offenders.append( + f"{item.item_id}/{score.name}/{dimension_id}=" + f"{rs.score if rs.score is not None else 'None'}" + ) + if require_applicable and not found_applicable: + missing_items.append(item.item_id) + for sub in results.sub_results.values(): + _check(sub) + + _check(self) + problems: list[str] = [] + if offenders: + problems.append( + f"{len(offenders)} dimension score(s) for '{dimension_id}' below {min_score}: " + f"{', '.join(offenders[:5])}" + (f" (+{len(offenders) - 5} more)" if len(offenders) > 5 else "") + ) + if missing_items: + problems.append( + f"Dimension '{dimension_id}' not applicable on {len(missing_items)} item(s): " + f"{', '.join(missing_items[:5])}" + ) + if problems: + raise EvalNotPassedError(msg or "; ".join(problems)) + + def assert_no_failed_items(self, msg: str | None = None) -> None: + """Assert no item ended in ``fail`` or ``error`` status. + + Includes any sub-results from workflow evaluations. + + Args: + msg: Optional custom failure message. + + Raises: + EvalNotPassedError: When any item failed or errored. + """ + bad: list[str] = [] + + def _check(results: EvalResults) -> None: + for item in results.items: + if item.is_failed or item.is_error: + bad.append(f"{item.item_id}:{item.status}") + for sub in results.sub_results.values(): + _check(sub) + + _check(self) + if bad: + detail = msg or ( + f"{len(bad)} item(s) failed or errored: {', '.join(bad[:5])}" + + (f" (+{len(bad) - 5} more)" if len(bad) > 5 else "") + ) + raise EvalNotPassedError(detail) + + +# endregion + +# region Generated rubric evaluators + + +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class RubricScore: + """A single dimension's score from a rubric-based evaluator run. + + Rubric evaluators emit one ``RubricScore`` per dimension per item. + Attached to :class:`EvalScoreResult` as a typed view of the raw + ``properties.rubric_scores`` payload returned by providers such as + Foundry's generated rubric evaluators. + + Attributes: + id: Dimension id (matches the rubric definition). + score: Numeric score, or ``None`` when the dimension was marked + non-applicable for this item. + applicable: Whether the dimension applied to this item. + weight: Dimension weight (mirrors the rubric definition). + reason: Short rationale produced by the evaluator. + """ + + id: str + score: int | None + applicable: bool + weight: int + reason: str + # endregion diff --git a/python/packages/core/agent_framework/foundry/__init__.py b/python/packages/core/agent_framework/foundry/__init__.py index 103bdca8f8..06bd4df17e 100644 --- a/python/packages/core/agent_framework/foundry/__init__.py +++ b/python/packages/core/agent_framework/foundry/__init__.py @@ -34,6 +34,7 @@ _IMPORTS: dict[str, tuple[str, str]] = { "FoundryLocalChatOptions": ("agent_framework_foundry_local", "agent-framework-foundry-local"), "FoundryLocalClient": ("agent_framework_foundry_local", "agent-framework-foundry-local"), "FoundryLocalSettings": ("agent_framework_foundry_local", "agent-framework-foundry-local"), + "GeneratedEvaluatorRef": ("agent_framework_foundry", "agent-framework-foundry"), "RawAnthropicFoundryClient": ("agent_framework_anthropic", "agent-framework-anthropic"), "RawFoundryAgent": ("agent_framework_foundry", "agent-framework-foundry"), "RawFoundryAgentChatClient": ("agent_framework_foundry", "agent-framework-foundry"), diff --git a/python/packages/core/agent_framework/foundry/__init__.pyi b/python/packages/core/agent_framework/foundry/__init__.pyi index 73c3ffe589..08a7fc1b88 100644 --- a/python/packages/core/agent_framework/foundry/__init__.pyi +++ b/python/packages/core/agent_framework/foundry/__init__.pyi @@ -20,6 +20,7 @@ from agent_framework_foundry import ( FoundryEmbeddingSettings, FoundryEvals, FoundryMemoryProvider, + GeneratedEvaluatorRef, RawFoundryAgent, RawFoundryAgentChatClient, RawFoundryChatClient, @@ -52,6 +53,7 @@ __all__ = [ "FoundryLocalClient", "FoundryLocalSettings", "FoundryMemoryProvider", + "GeneratedEvaluatorRef", "RawAnthropicFoundryClient", "RawFoundryAgent", "RawFoundryAgentChatClient", diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py index 96b0e1a391..e60fb35d51 100644 --- a/python/packages/core/tests/core/test_local_eval.py +++ b/python/packages/core/tests/core/test_local_eval.py @@ -11,8 +11,13 @@ import pytest from agent_framework._evaluation import ( CheckResult, EvalItem, + EvalItemResult, + EvalNotPassedError, + EvalResults, + EvalScoreResult, ExpectedToolCall, LocalEvaluator, + RubricScore, _coerce_result, evaluator, keyword_check, @@ -1010,19 +1015,101 @@ class TestAllPassedSubResults: # --------------------------------------------------------------------------- -# r5 review: _build_overall_item with empty outputs +# Rubric assertions (EvalResults.assert_*) # --------------------------------------------------------------------------- -class TestBuildOverallItemEmpty: - """Test _build_overall_item returns None for empty workflow outputs.""" +def _rubric_results(*scores_per_item: list[EvalScoreResult]) -> EvalResults: + items = [ + EvalItemResult(item_id=f"item-{i}", status="pass", scores=scores) for i, scores in enumerate(scores_per_item) + ] + return EvalResults( + provider="test", + eval_id="ev1", + run_id="run1", + result_counts={"passed": len(items), "failed": 0, "errored": 0, "total": len(items)}, + items=items, + ) - def test_returns_none_for_empty_outputs(self): - from unittest.mock import MagicMock - from agent_framework._evaluation import _build_overall_item +class TestRubricAssertions: + """Tests for EvalResults.assert_dimension_score_at_least.""" - mock_result = MagicMock() - mock_result.get_outputs.return_value = [] - item = _build_overall_item("Hello", mock_result) - assert item is None + def test_dimension_at_or_above_threshold_passes(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="policy", + score=0.9, + dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")], + ) + ], + ) + # Should not raise. + results.assert_dimension_score_at_least("clarity", 3) + + def test_dimension_below_threshold_raises(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="policy", + score=0.5, + dimensions=[RubricScore(id="clarity", score=2, applicable=True, weight=1, reason="")], + ) + ], + ) + with pytest.raises(EvalNotPassedError): + results.assert_dimension_score_at_least("clarity", 3) + + def test_non_applicable_skipped_by_default(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="policy", + score=1.0, + dimensions=[RubricScore(id="clarity", score=None, applicable=False, weight=1, reason="n/a")], + ) + ], + ) + # No applicable scores; default behaviour is to skip silently. + results.assert_dimension_score_at_least("clarity", 3) + + def test_require_applicable_raises_when_dimension_absent(self) -> None: + results = _rubric_results( + [EvalScoreResult(name="policy", score=1.0, dimensions=[])], + ) + with pytest.raises(EvalNotPassedError, match="not applicable"): + results.assert_dimension_score_at_least("clarity", 3, require_applicable=True) + + def test_require_applicable_raises_when_filtered_evaluator_missing(self) -> None: + # Regression: previously the (not evaluator or found_any) guard caused + # this case to silently pass even with require_applicable=True. + results = _rubric_results( + [ + EvalScoreResult( + name="other", + score=0.9, + dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")], + ) + ], + ) + with pytest.raises(EvalNotPassedError, match="not applicable"): + results.assert_dimension_score_at_least("clarity", 3, evaluator="policy", require_applicable=True) + + def test_evaluator_filter_isolates_offenders(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="other", + score=0.1, + dimensions=[RubricScore(id="clarity", score=1, applicable=True, weight=1, reason="")], + ), + EvalScoreResult( + name="policy", + score=0.9, + dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")], + ), + ], + ) + # The low-scoring "other" evaluator is filtered out; "policy" passes. + results.assert_dimension_score_at_least("clarity", 3, evaluator="policy") diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py index e6422e72c8..1ee0fc56dd 100644 --- a/python/packages/foundry/agent_framework_foundry/__init__.py +++ b/python/packages/foundry/agent_framework_foundry/__init__.py @@ -12,6 +12,7 @@ from ._embedding_client import ( ) from ._foundry_evals import ( FoundryEvals, + GeneratedEvaluatorRef, evaluate_foundry_target, evaluate_traces, ) @@ -33,6 +34,7 @@ __all__ = [ "FoundryEmbeddingSettings", "FoundryEvals", "FoundryMemoryProvider", + "GeneratedEvaluatorRef", "RawFoundryAgent", "RawFoundryAgentChatClient", "RawFoundryChatClient", diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index eef58b0a04..8059c2ce99 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -28,8 +28,9 @@ from __future__ import annotations import asyncio import logging -from collections.abc import Sequence -from typing import TYPE_CHECKING, Any +from collections.abc import Iterable, Sequence +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, cast from agent_framework._evaluation import ( AgentEvalConverter, @@ -39,6 +40,7 @@ from agent_framework._evaluation import ( EvalItemResult, EvalResults, EvalScoreResult, + RubricScore, ) from agent_framework._feature_stage import ExperimentalFeature, experimental from openai import AsyncOpenAI @@ -51,6 +53,54 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) + +# region Generated rubric evaluator references + + +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class GeneratedEvaluatorRef: + """A reference to a rubric evaluator that already exists in Foundry. + + Pass instances of this class to :class:`FoundryEvals` to score items + with a pre-existing rubric evaluator (manually authored or + auto-generated through the Foundry portal). agent-framework is a + consumer here: it does not create or modify the evaluator definition; + it only references the persisted version by name. + + Pinning ``version`` is strongly recommended so evaluation runs are + reproducible. ``version=None`` resolves to whichever version is + current at execution time; :class:`FoundryEvals` emits a warning when + a versionless reference is used. CI gates should always pass a + concrete version. + + Attributes: + name: Evaluator name as stored in the Foundry project (for + example ``"reservation-policy-rubric"``). Distinct from + built-in evaluators such as ``"builtin.relevance"``. + version: Pinned evaluator version. ``None`` means "latest" — + this is discouraged for CI/repro and :class:`FoundryEvals` + will emit a warning when used. + display_name: Optional human-readable name used in result + summaries. Defaults to ``name`` when unset. + """ + + name: str + version: str | None = None + display_name: str | None = None + + @classmethod + def latest(cls, name: str, *, display_name: str | None = None) -> GeneratedEvaluatorRef: + """Construct a versionless reference (resolves to the latest version at run time). + + Discouraged for reproducible runs. Prefer the constructor with + an explicit ``version`` so CI and replay evaluations stay stable + when the evaluator is updated in Foundry. + """ + return cls(name=name, version=None, display_name=display_name) + + +# endregion # Agent evaluators that accept query/response as conversation arrays. # Maintained manually — check https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk # for the latest evaluator list. These are the evaluators that need conversation-format input. @@ -166,7 +216,7 @@ def _resolve_evaluator(name: str) -> str: def _build_testing_criteria( - evaluators: Sequence[str], + evaluators: Sequence[str | GeneratedEvaluatorRef], model: str, *, include_data_mapping: bool = False, @@ -175,7 +225,9 @@ def _build_testing_criteria( """Build ``testing_criteria`` for ``evals.create()``. Args: - evaluators: Evaluator names. + evaluators: Evaluator names (built-in shorts / fully-qualified + ``builtin.*`` names) or :class:`GeneratedEvaluatorRef` + instances for generated rubric evaluators. model: Model deployment for the LLM judge. include_data_mapping: Whether to include field-level data mapping (required for the JSONL data source, not needed for response-based). @@ -183,7 +235,38 @@ def _build_testing_criteria( definitions. """ criteria: list[dict[str, Any]] = [] - for name in evaluators: + for entry_spec in evaluators: + if isinstance(entry_spec, GeneratedEvaluatorRef): + short = entry_spec.display_name or entry_spec.name + ref_entry: dict[str, Any] = { + "type": "azure_ai_evaluator", + "name": short, + "evaluator_name": entry_spec.name, + "initialization_parameters": {"deployment_name": model}, + } + if entry_spec.version is not None: + ref_entry["evaluator_version"] = entry_spec.version + else: + logger.warning( + "GeneratedEvaluatorRef '%s' has no pinned version; the eval run " + "will resolve to whichever version is current at execution time. " + "Pin the version for reproducible runs.", + entry_spec.name, + ) + if include_data_mapping: + # Rubric evaluators accept conversation arrays like agent + # evaluators, plus tool_definitions when items are tool-aware. + ref_mapping: dict[str, str] = { + "query": "{{item.query_messages}}", + "response": "{{item.response_messages}}", + } + if include_tool_definitions: + ref_mapping["tool_definitions"] = "{{item.tool_definitions}}" + ref_entry["data_mapping"] = ref_mapping + criteria.append(ref_entry) + continue + + name = entry_spec qualified = _resolve_evaluator(name) short = name if not name.startswith("builtin.") else name.split(".")[-1] @@ -247,9 +330,9 @@ def _build_item_schema( def _resolve_default_evaluators( - evaluators: Sequence[str] | None, + evaluators: Sequence[str | GeneratedEvaluatorRef] | None, items: Sequence[EvalItem | dict[str, Any]] | None = None, -) -> list[str]: +) -> list[str | GeneratedEvaluatorRef]: """Resolve evaluators, applying defaults when ``None``. Defaults to relevance + coherence + task_adherence. Automatically adds @@ -258,7 +341,7 @@ def _resolve_default_evaluators( if evaluators is not None: return list(evaluators) - result = list(_DEFAULT_EVALUATORS) + result: list[str | GeneratedEvaluatorRef] = list(_DEFAULT_EVALUATORS) if items is not None: has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items) if has_tools: @@ -267,14 +350,24 @@ def _resolve_default_evaluators( def _filter_tool_evaluators( - evaluators: list[str], + evaluators: list[str | GeneratedEvaluatorRef], items: Sequence[EvalItem | dict[str, Any]], -) -> list[str]: - """Remove tool evaluators if no items have tool definitions.""" +) -> list[str | GeneratedEvaluatorRef]: + """Remove tool evaluators if no items have tool definitions. + + Generated rubric evaluators are tool-aware but not tool-required; they + are preserved regardless of whether items carry tool definitions. + """ has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items) if has_tools: return evaluators - filtered = [e for e in evaluators if _resolve_evaluator(e) not in _TOOL_EVALUATORS] + + def _is_tool_only(spec: str | GeneratedEvaluatorRef) -> bool: + if isinstance(spec, GeneratedEvaluatorRef): + return False + return _resolve_evaluator(spec) in _TOOL_EVALUATORS + + filtered = [e for e in evaluators if not _is_tool_only(e)] if not filtered: raise ValueError( f"All requested evaluators {evaluators} require tool definitions, " @@ -282,7 +375,7 @@ def _filter_tool_evaluators( "or choose evaluators that do not require tools." ) if len(filtered) < len(evaluators): - removed = [e for e in evaluators if _resolve_evaluator(e) in _TOOL_EVALUATORS] + removed = [e for e in evaluators if _is_tool_only(e)] logger.info("Removed tool evaluators %s (no items have tools)", removed) return filtered @@ -354,6 +447,114 @@ def _extract_per_evaluator(run: RunRetrieveResponse) -> dict[str, dict[str, int] return per_eval +_RUBRIC_DIMENSION_KEYS: tuple[str, ...] = ("dimension_scores", "rubric_scores") +"""Property keys that may carry per-dimension rubric breakdowns. + +The published Foundry rubric-evaluator output format uses +``properties.dimension_scores`` (see the Microsoft Learn "Rubric +evaluators" reference). Earlier preview builds and some SDK shapes +used ``rubric_scores``; we accept both for defensive forward/backward +compatibility. +""" + + +def _parse_dimension_entries(raw: Any) -> list[RubricScore]: + """Parse a raw list-like payload into ``RubricScore`` instances. + + Returns an empty list when ``raw`` is falsy, not iterable, or + contains no well-formed entries. + """ + if not raw: + return [] + try: + raw_iter: Iterable[Any] = iter(raw) + except TypeError: + return [] + + parsed: list[RubricScore] = [] + for raw_entry in raw_iter: + entry: Any = raw_entry + try: + rid: Any + score_val: Any + applicable: Any + weight: Any + reason: Any + if isinstance(entry, dict): + entry_any = cast("dict[str, Any]", entry) + rid = entry_any.get("id") + score_val = entry_any.get("score") + applicable = entry_any.get("applicable") + weight = entry_any.get("weight") + reason = entry_any.get("reason", "") + else: + rid = getattr(entry, "id", None) + score_val = getattr(entry, "score", None) + applicable = getattr(entry, "applicable", None) + weight = getattr(entry, "weight", None) + reason = getattr(entry, "reason", "") or "" + if rid is None or weight is None or applicable is None: + continue + parsed.append( + RubricScore( + id=str(rid), + score=int(score_val) if isinstance(score_val, (int, float)) else None, + applicable=bool(applicable), + weight=int(weight), + reason=str(reason) if reason is not None else "", + ) + ) + except (TypeError, ValueError): + logger.debug("Skipping malformed rubric dimension entry: %s", cast("Any", entry), exc_info=True) + return parsed + + +def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None: + """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload. + + Foundry rubric evaluators include a per-dimension breakdown under + ``properties.dimension_scores`` on each result (preview builds used + ``rubric_scores``; both keys are accepted, with the canonical + ``dimension_scores`` taking priority). The exact location may + vary across SDK versions, so this helper accepts a few shapes: + + * The SDK ``sample`` object exposes + ``properties.dimension_scores`` / ``properties.rubric_scores``. + * The ``sample`` is a dict containing the same under + ``properties.``. + * The ``sample`` is a dict with ``dimension_scores`` / + ``rubric_scores`` at the top level. + + Returns ``None`` when no rubric scores are present (i.e. the + evaluator was not a rubric evaluator). + """ + if sample is None: + return None + + containers: list[Any] = [] + properties: Any = getattr(sample, "properties", None) + if properties is not None: + containers.append(properties) + if isinstance(sample, dict): + sample_any = cast("dict[str, Any]", sample) + props_dict: Any = sample_any.get("properties") + if props_dict is not None and props_dict is not properties: + containers.append(props_dict) + containers.append(sample_any) + + for container in containers: + for key in _RUBRIC_DIMENSION_KEYS: + raw: Any = None + if isinstance(container, dict): + raw = cast("dict[str, Any]", container).get(key) + elif hasattr(container, key): + raw = getattr(container, key, None) + parsed = _parse_dimension_entries(raw) + if parsed: + return parsed + return None + + async def _fetch_output_items( client: AsyncOpenAI, eval_id: str, @@ -377,12 +578,15 @@ async def _fetch_output_items( # Extract per-evaluator scores scores: list[EvalScoreResult] = [] for r in oi.results or []: + sample = r.sample + dimensions = _extract_rubric_scores(sample) scores.append( EvalScoreResult( name=r.name, score=r.score, passed=r.passed, - sample=r.sample, + sample=sample, + dimensions=dimensions, ) ) @@ -394,15 +598,18 @@ async def _fetch_output_items( output_text: str | None = None response_id: str | None = None - sample = oi.sample - if sample is not None: # pyright: ignore[reportUnnecessaryComparison] - err = sample.error - if err is not None and (err.code or err.message): # pyright: ignore[reportUnnecessaryComparison] + # mypy infers oi.sample as dict[str, object] | None, but the + # OpenAI SDK actually returns a typed Sample model. Cast to Any so + # both type checkers accept the attribute access pattern. + oi_sample: Any = oi.sample + if oi_sample is not None: + err = oi_sample.error + if err is not None and (err.code or err.message): error_code = err.code or None error_message = err.message or None - usage = sample.usage - if usage is not None and usage.total_tokens: # pyright: ignore[reportUnnecessaryComparison] + usage = oi_sample.usage + if usage is not None and usage.total_tokens: token_usage = { "prompt_tokens": usage.prompt_tokens, "completion_tokens": usage.completion_tokens, @@ -411,13 +618,13 @@ async def _fetch_output_items( } # Extract input/output text - if sample.input: - parts = [si.content for si in sample.input if si.role == "user"] + if oi_sample.input: + parts = [si.content for si in oi_sample.input if si.role == "user"] if parts: input_text = " ".join(parts) - if sample.output: - parts = [so.content or "" for so in sample.output if so.role == "assistant"] + if oi_sample.output: + parts = [so.content or "" for so in oi_sample.output if so.role == "assistant"] if parts: output_text = " ".join(parts) @@ -472,7 +679,7 @@ async def _evaluate_via_responses_impl( *, client: AsyncOpenAI, response_ids: Sequence[str], - evaluators: list[str], + evaluators: list[str | GeneratedEvaluatorRef], model: str, eval_name: str, poll_interval: float, @@ -573,8 +780,11 @@ class FoundryEvals: (from ``azure.ai.projects.aio``). Provide this or *client*. model: Model deployment name for the evaluator LLM judge. Resolved from ``client.model`` when omitted. - evaluators: Evaluator names (e.g. ``["relevance", "tool_call_accuracy"]``). - When ``None`` (default), uses smart defaults based on item data. + evaluators: Evaluator specifications. Entries may be built-in + short names (e.g. ``"relevance"``), fully-qualified + ``"builtin.*"`` names, or :class:`GeneratedEvaluatorRef` + instances for previously generated rubric evaluators. When + ``None`` (default), uses smart defaults based on item data. conversation_split: How to split multi-turn conversations into query/response halves. Defaults to ``LAST_TURN``. Pass a ``ConversationSplit`` enum value or a custom callable — see @@ -623,7 +833,7 @@ class FoundryEvals: client: FoundryChatClient | None = None, project_client: AIProjectClient | None = None, model: str | None = None, - evaluators: Sequence[str] | None = None, + evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None, conversation_split: ConversationSplitter = ConversationSplit.LAST_TURN, poll_interval: float = 5.0, timeout: float = 180.0, @@ -642,7 +852,9 @@ class FoundryEvals: "Model is required. Pass model= explicitly or use a FoundryChatClient that has a model configured." ) self._model = resolved_model - self._evaluators = list(evaluators) if evaluators is not None else None + self._evaluators: list[str | GeneratedEvaluatorRef] | None = ( + list(evaluators) if evaluators is not None else None + ) self._conversation_split = conversation_split self._poll_interval = poll_interval self._timeout = timeout @@ -678,7 +890,7 @@ class FoundryEvals: async def _evaluate_via_dataset( self, items: Sequence[EvalItem], - evaluators: list[str], + evaluators: list[str | GeneratedEvaluatorRef], eval_name: str, ) -> EvalResults: """Evaluate using JSONL dataset upload path.""" diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index a5d9f2e864..8734650aaf 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -25,16 +25,25 @@ from agent_framework._evaluation import ( from agent_framework._workflows._workflow import WorkflowRunResult from openai import AsyncOpenAI +from agent_framework_foundry import GeneratedEvaluatorRef from agent_framework_foundry._foundry_evals import ( + _AGENT_EVALUATORS, + _BUILTIN_EVALUATORS, + _TOOL_EVALUATORS, FoundryEvals, _build_item_schema, _build_testing_criteria, _extract_per_evaluator, _extract_result_counts, + _extract_rubric_scores, + _fetch_output_items, _filter_tool_evaluators, + _poll_eval_run, _resolve_default_evaluators, _resolve_evaluator, _resolve_openai_client, + evaluate_foundry_target, + evaluate_traces, ) @@ -806,6 +815,67 @@ class TestBuildTestingCriteria: for c in criteria: assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions" + def test_generated_evaluator_ref_pinned_version(self) -> None: + + ref = GeneratedEvaluatorRef(name="my-rubric", version="1") + criteria = _build_testing_criteria([ref], "gpt-4o", include_data_mapping=True) + + assert len(criteria) == 1 + c = criteria[0] + assert c["type"] == "azure_ai_evaluator" + assert c["evaluator_name"] == "my-rubric" + assert c["evaluator_version"] == "1" + assert c["name"] == "my-rubric" + assert c["initialization_parameters"] == {"deployment_name": "gpt-4o"} + assert c["data_mapping"] == { + "query": "{{item.query_messages}}", + "response": "{{item.response_messages}}", + } + + def test_generated_evaluator_ref_display_name_used_as_short(self) -> None: + + ref = GeneratedEvaluatorRef(name="my-rubric", version="2", display_name="My Rubric") + criteria = _build_testing_criteria([ref], "gpt-4o") + + assert criteria[0]["name"] == "My Rubric" + assert criteria[0]["evaluator_name"] == "my-rubric" + + def test_generated_evaluator_ref_tool_definitions_added(self) -> None: + + ref = GeneratedEvaluatorRef(name="my-rubric", version="1") + criteria = _build_testing_criteria( + [ref], + "gpt-4o", + include_data_mapping=True, + include_tool_definitions=True, + ) + + assert criteria[0]["data_mapping"]["tool_definitions"] == "{{item.tool_definitions}}" + + def test_generated_evaluator_ref_unpinned_warns(self, caplog: pytest.LogCaptureFixture) -> None: + import logging + + ref = GeneratedEvaluatorRef.latest("my-rubric") + with caplog.at_level(logging.WARNING, logger="agent_framework_foundry._foundry_evals"): + criteria = _build_testing_criteria([ref], "gpt-4o") + + assert "evaluator_version" not in criteria[0] + assert any("no pinned version" in r.message for r in caplog.records) + + def test_generated_evaluator_ref_mixed_with_builtins(self) -> None: + + ref = GeneratedEvaluatorRef(name="my-rubric", version="1") + criteria = _build_testing_criteria( + ["relevance", ref, "task_adherence"], + "gpt-4o", + include_data_mapping=True, + ) + + assert [c["name"] for c in criteria] == ["relevance", "my-rubric", "task_adherence"] + assert criteria[0]["evaluator_name"] == "builtin.relevance" + assert criteria[1]["evaluator_name"] == "my-rubric" + assert criteria[2]["evaluator_name"] == "builtin.task_adherence" + # --------------------------------------------------------------------------- # _build_item_schema @@ -1263,6 +1333,29 @@ class TestFilterToolEvaluators: items, ) + def test_preserves_generated_ref_when_no_tools(self) -> None: + + ref = GeneratedEvaluatorRef(name="rubric", version="1") + items = [ + EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]), + ] + result = _filter_tool_evaluators( + ["relevance", ref, "tool_call_accuracy"], + items, + ) + assert "relevance" in result + assert ref in result + assert "tool_call_accuracy" not in result + + def test_generated_ref_alone_does_not_raise(self) -> None: + + ref = GeneratedEvaluatorRef(name="rubric", version="1") + items = [ + EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]), + ] + result = _filter_tool_evaluators([ref], items) + assert result == [ref] + # --------------------------------------------------------------------------- # EvalResults @@ -2267,7 +2360,6 @@ class TestEvalResultsWithItems: class TestFetchOutputItems: async def test_fetches_and_converts_output_items(self) -> None: - from agent_framework_foundry._foundry_evals import _fetch_output_items # Build mock output items matching the OpenAI SDK schema mock_result = MagicMock() @@ -2329,7 +2421,6 @@ class TestFetchOutputItems: assert item.error_code is None async def test_handles_errored_item(self) -> None: - from agent_framework_foundry._foundry_evals import _fetch_output_items mock_error = MagicMock() mock_error.code = "QueryExtractionError" @@ -2361,7 +2452,6 @@ class TestFetchOutputItems: assert len(item.scores) == 0 async def test_handles_api_failure_gracefully(self) -> None: - from agent_framework_foundry._foundry_evals import _fetch_output_items mock_client = MagicMock() mock_client.evals.runs.output_items.list = AsyncMock(side_effect=TypeError("API error")) @@ -2369,6 +2459,166 @@ class TestFetchOutputItems: items = await _fetch_output_items(mock_client, "eval_1", "run_1") assert items == [] + async def test_extracts_rubric_scores_from_dict_sample(self) -> None: + + mock_result = MagicMock() + mock_result.name = "my-rubric" + mock_result.score = 0.85 + mock_result.passed = True + mock_result.sample = { + "properties": { + "rubric_scores": [ + {"id": "policy", "score": 4, "applicable": True, "weight": 1, "reason": "ok"}, + {"id": "safety", "score": None, "applicable": False, "weight": 1, "reason": "n/a"}, + ] + } + } + + mock_oi = MagicMock() + mock_oi.id = "oi_1" + mock_oi.status = "pass" + mock_oi.results = [mock_result] + mock_oi.sample = None + mock_oi.datasource_item = {} + + mock_client = MagicMock() + mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi])) + + items = await _fetch_output_items(mock_client, "eval_1", "run_1") + + assert len(items) == 1 + scores = items[0].scores + assert len(scores) == 1 + assert scores[0].dimensions is not None + assert len(scores[0].dimensions) == 2 + policy = next(d for d in scores[0].dimensions if d.id == "policy") + assert policy.score == 4 + assert policy.applicable is True + assert policy.weight == 1 + assert policy.reason == "ok" + safety = next(d for d in scores[0].dimensions if d.id == "safety") + assert safety.score is None + assert safety.applicable is False + + async def test_no_rubric_scores_when_absent(self) -> None: + + mock_result = MagicMock() + mock_result.name = "relevance" + mock_result.score = 0.85 + mock_result.passed = True + mock_result.sample = None + + mock_oi = MagicMock() + mock_oi.id = "oi_2" + mock_oi.status = "pass" + mock_oi.results = [mock_result] + mock_oi.sample = None + mock_oi.datasource_item = {} + + mock_client = MagicMock() + mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi])) + + items = await _fetch_output_items(mock_client, "eval_1", "run_1") + + assert items[0].scores[0].dimensions is None + + +class TestExtractRubricScores: + def test_handles_attribute_style_properties(self) -> None: + + rs = MagicMock() + rs.id = "policy" + rs.score = 5 + rs.applicable = True + rs.weight = 2 + rs.reason = "ok" + + sample = MagicMock() + sample.properties = MagicMock() + sample.properties.rubric_scores = [rs] + + result = _extract_rubric_scores(sample) + assert result is not None + assert result[0].id == "policy" + assert result[0].score == 5 + assert result[0].weight == 2 + + def test_top_level_rubric_scores_in_dict(self) -> None: + + sample = {"rubric_scores": [{"id": "a", "score": 3, "applicable": True, "weight": 1, "reason": "r"}]} + result = _extract_rubric_scores(sample) + assert result is not None + assert result[0].id == "a" + + def test_returns_none_when_missing(self) -> None: + + assert _extract_rubric_scores(None) is None + assert _extract_rubric_scores({}) is None + assert _extract_rubric_scores({"properties": {}}) is None + + def test_skips_malformed_entries(self) -> None: + + sample = { + "properties": { + "rubric_scores": [ + {"id": "good", "score": 3, "applicable": True, "weight": 1, "reason": "ok"}, + {"id": "bad-no-weight", "score": 2, "applicable": True, "reason": "x"}, + ] + } + } + result = _extract_rubric_scores(sample) + assert result is not None + assert len(result) == 1 + assert result[0].id == "good" + + def test_canonical_dimension_scores_key_from_docs(self) -> None: + """Per the Microsoft Learn docs, runtime output uses ``properties.dimension_scores``.""" + + sample = { + "properties": { + "dimension_scores": [ + { + "id": "intent_recognition", + "score": 5, + "applicable": True, + "weight": 9, + "reason": "Identified correctly.", + }, + { + "id": "general_quality", + "score": 4, + "applicable": True, + "weight": 5, + "reason": "Strong overall.", + }, + ] + } + } + result = _extract_rubric_scores(sample) + assert result is not None + assert [r.id for r in result] == ["intent_recognition", "general_quality"] + assert [r.score for r in result] == [5, 4] + assert [r.weight for r in result] == [9, 5] + + def test_dimension_scores_via_attribute(self) -> None: + """Canonical key also resolves when properties exposes ``dimension_scores`` as an attr.""" + + rs = MagicMock() + rs.id = "policy_enforcement" + rs.score = 1 + rs.applicable = True + rs.weight = 5 + rs.reason = "violated" + + sample = MagicMock() + sample.properties = MagicMock(spec=["dimension_scores"]) + sample.properties.dimension_scores = [rs] + + result = _extract_rubric_scores(sample) + assert result is not None + assert result[0].id == "policy_enforcement" + assert result[0].score == 1 + # --------------------------------------------------------------------------- # _poll_eval_run — timeout / failed / canceled paths @@ -2378,7 +2628,6 @@ class TestFetchOutputItems: class TestPollEvalRun: async def test_timeout_returns_timeout_status(self) -> None: """Poll timeout returns EvalResults with status='timeout'.""" - from agent_framework_foundry._foundry_evals import _poll_eval_run mock_client = MagicMock() mock_pending = MagicMock() @@ -2392,7 +2641,6 @@ class TestPollEvalRun: async def test_failed_run_returns_error(self) -> None: """Failed run returns EvalResults with error message.""" - from agent_framework_foundry._foundry_evals import _poll_eval_run mock_client = MagicMock() mock_failed = MagicMock() @@ -2410,7 +2658,6 @@ class TestPollEvalRun: async def test_canceled_run_returns_canceled_status(self) -> None: """Canceled run returns EvalResults with status='canceled'.""" - from agent_framework_foundry._foundry_evals import _poll_eval_run mock_client = MagicMock() mock_canceled = MagicMock() @@ -2435,7 +2682,6 @@ class TestPollEvalRun: class TestEvaluateTraces: async def test_raises_without_required_args(self) -> None: """Raises ValueError when no response_ids, trace_ids, or agent_id given.""" - from agent_framework_foundry._foundry_evals import evaluate_traces mock_client = MagicMock() with pytest.raises(ValueError, match="Provide at least one of"): @@ -2446,7 +2692,6 @@ class TestEvaluateTraces: async def test_response_ids_path(self) -> None: """evaluate_traces with response_ids uses the responses API path.""" - from agent_framework_foundry._foundry_evals import evaluate_traces mock_client = MagicMock() @@ -2494,7 +2739,6 @@ class TestEvaluateTraces: async def test_trace_ids_path(self) -> None: """evaluate_traces with trace_ids builds azure_ai_traces data source.""" - from agent_framework_foundry._foundry_evals import evaluate_traces mock_client = MagicMock() @@ -2534,7 +2778,6 @@ class TestEvaluateTraces: class TestEvaluateFoundryTarget: async def test_happy_path(self) -> None: """evaluate_foundry_target creates eval + run and polls to completion.""" - from agent_framework_foundry._foundry_evals import evaluate_foundry_target mock_client = MagicMock() @@ -2670,13 +2913,11 @@ class TestEvaluatorSetConsistency: """Verify that _AGENT_EVALUATORS and _TOOL_EVALUATORS are subsets of _BUILTIN_EVALUATORS.""" def test_agent_evaluators_subset(self): - from agent_framework_foundry._foundry_evals import _AGENT_EVALUATORS, _BUILTIN_EVALUATORS diff = _AGENT_EVALUATORS - set(_BUILTIN_EVALUATORS.values()) assert not diff, f"_AGENT_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}" def test_tool_evaluators_subset(self): - from agent_framework_foundry._foundry_evals import _BUILTIN_EVALUATORS, _TOOL_EVALUATORS diff = _TOOL_EVALUATORS - set(_BUILTIN_EVALUATORS.values()) assert not diff, f"_TOOL_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}" @@ -2690,7 +2931,6 @@ class TestEvaluatorSetConsistency: class TestEvaluateTracesAgentId: async def test_agent_id_only_path(self) -> None: """evaluate_traces with agent_id only builds azure_ai_traces data source.""" - from agent_framework_foundry._foundry_evals import evaluate_traces mock_client = MagicMock() @@ -2748,7 +2988,6 @@ class TestFilterToolEvaluatorsRaises: class TestEvaluateFoundryTargetValidation: async def test_target_without_type_raises(self) -> None: """target dict without 'type' key raises ValueError.""" - from agent_framework_foundry._foundry_evals import evaluate_foundry_target mock_client = MagicMock() with pytest.raises(ValueError, match="'type' key"): diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example index b6a8af233e..388350edea 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example @@ -1,3 +1,12 @@ FOUNDRY_PROJECT_ENDPOINT="" FOUNDRY_MODEL="" +# Only needed for evaluate_with_rubric_sample.py — connects to the +# pre-existing Foundry agent that the rubric evaluator was created against. +FOUNDRY_AGENT_NAME="" +FOUNDRY_AGENT_VERSION="" + +# Only needed for evaluate_with_rubric_sample.py — references a rubric +# evaluator you created in Foundry. Pin the version for reproducible runs. +FOUNDRY_RUBRIC_NAME="" +FOUNDRY_RUBRIC_VERSION="" \ No newline at end of file diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md index 81412a7f0e..e30ce6aa46 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md @@ -35,6 +35,34 @@ Evaluate what already happened — zero changes to agent code: uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py ``` +### Referencing a rubric evaluator created in Foundry + +Foundry users can create rubric evaluators in the Foundry portal (or +through the dedicated SDK / REST surface). Once an evaluator exists, +agent-framework consumes it like any other evaluator: pass a +`GeneratedEvaluatorRef(name=..., version=...)` in the `evaluators=` +list and pin the version for reproducible runs. + +```python +from agent_framework.foundry import FoundryEvals, GeneratedEvaluatorRef + +evals = FoundryEvals( + evaluators=[ + GeneratedEvaluatorRef(name="reservation-policy-rubric", version="3"), + "relevance", + "coherence", + ], +) +``` + +Quality gates on rubric output use the standard `EvalResults` helpers, +including `assert_dimension_score_at_least(...)` for per-dimension +thresholds. + +See [`evaluate_with_rubric_sample.py`](./evaluate_with_rubric_sample.py) +for a runnable end-to-end example that combines a rubric evaluator with +built-in evaluators and gates a per-dimension threshold. + ## Setup Create a `.env` file with configuration as in the `.env.example` file in this folder. @@ -44,3 +72,4 @@ Create a `.env` file with configuration as in the `.env.example` file in this fo - **"I want to test my agent during development"** → `evaluate_agent_sample.py`, Pattern 1 - **"I want to evaluate past agent runs"** → `evaluate_traces_sample.py` - **"I want to inspect/modify eval data before submitting"** → `evaluate_agent_sample.py`, Pattern 2 +- **"I want to score against a custom rubric I created in Foundry"** → `evaluate_with_rubric_sample.py` diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py new file mode 100644 index 0000000000..06ec5c9bdd --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py @@ -0,0 +1,138 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate a Foundry agent against a rubric evaluator that was created in Foundry. + +Rubric evaluators are LLM-as-judge evaluators with custom scoring dimensions +that you define for your domain. agent-framework consumes pre-existing rubric +evaluators — they are authored in the Foundry portal (or via the dedicated +SDK / REST surface) and referenced here by name and version. + +See: https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/rubric-evaluators + +This sample demonstrates: +1. Connecting to a pre-existing Foundry agent (PromptAgent or HostedAgent). +2. Referencing a pre-existing rubric evaluator by ``name`` and ``version``. +3. Mixing the rubric with built-in Foundry evaluators in one run. +4. Asserting per-dimension thresholds with + ``EvalResults.assert_dimension_score_at_least(...)`` for CI quality gates. + +Starting condition / prerequisites: +- An Azure AI Foundry project with a deployed model. +- A registered Foundry agent (PromptAgent or HostedAgent) in that project. + This is the agent the rubric is meant to evaluate. +- A rubric evaluator already created in the Foundry portal against that + agent. Creating rubrics through the portal currently requires picking a + Foundry agent as the generation context, so this prerequisite is implied + by having a rubric at all. +- Set the following in .env (see ``.env.example``): + - ``FOUNDRY_PROJECT_ENDPOINT`` + - ``FOUNDRY_AGENT_NAME`` and ``FOUNDRY_AGENT_VERSION`` for the agent + - ``FOUNDRY_RUBRIC_NAME`` and ``FOUNDRY_RUBRIC_VERSION`` for the rubric + - ``FOUNDRY_MODEL`` for the rubric judge model +""" + +import asyncio +import os + +from agent_framework import EvalNotPassedError, evaluate_agent +from agent_framework.foundry import FoundryAgent, FoundryChatClient, FoundryEvals, GeneratedEvaluatorRef +from azure.identity import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv(override=True) + + +async def main() -> None: + # 1. Connect to the existing Foundry agent that the rubric was created + # against. PromptAgents and HostedAgents are both supported. + credential = AzureCliCredential() + project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] + + agent = FoundryAgent( + project_endpoint=project_endpoint, + agent_name=os.environ["FOUNDRY_AGENT_NAME"], + agent_version=os.environ.get("FOUNDRY_AGENT_VERSION"), + credential=credential, + ) + + # 2. Reference the pre-existing rubric evaluator by name + version. + # Always pin a version for reproducible CI runs; versionless refs + # resolve to "latest" and emit a warning at evaluation time. + rubric_name = os.environ["FOUNDRY_RUBRIC_NAME"] + rubric_version = os.environ["FOUNDRY_RUBRIC_VERSION"] + rubric = GeneratedEvaluatorRef(name=rubric_name, version=rubric_version) + + # 3. Mix the rubric with built-in evaluators in a single FoundryEvals + # config. FoundryEvals talks to Foundry over the project endpoint, so + # we hand it a FoundryChatClient configured with the same credential. + eval_client = FoundryChatClient( + project_endpoint=project_endpoint, + model=os.environ["FOUNDRY_MODEL"], + credential=credential, + ) + evals = FoundryEvals( + client=eval_client, + evaluators=[ + rubric, + FoundryEvals.RELEVANCE, + FoundryEvals.COHERENCE, + ], + ) + + # ========================================================================= + # Run evaluation + # ========================================================================= + print("=" * 60) + print(f"Evaluating '{agent.name}' with rubric '{rubric_name}' (version {rubric_version})") + print("=" * 60) + + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather like in Seattle?", + "Should I bring an umbrella to London tomorrow?", + ], + evaluators=evals, + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("[PASS] All passed") + else: + print(f"[FAIL] {r.failed} failed") + + # ========================================================================= + # Per-dimension quality gate + # ========================================================================= + # Rubric evaluators emit per-dimension scores (1–5) on top of the overall + # weighted score. Use assert_dimension_score_at_least to gate CI on a + # specific dimension — e.g., never ship if a critical dimension drops + # below 3. + # + # The dimension_id must match an id defined on your rubric in Foundry. + # ``general_quality`` is used here because it's the conventional + # ``always_applicable: true`` dimension in the Foundry docs' example + # rubric — swap it for whatever dimension id(s) your rubric actually + # defines. + print() + print("=" * 60) + print("Per-dimension quality gate") + print("=" * 60) + + for r in results: + try: + r.assert_dimension_score_at_least( + "general_quality", + min_score=3.0, + evaluator=rubric_name, + ) + print(f"[PASS] {r.provider}: general_quality >= 3 on every item") + except EvalNotPassedError as exc: + print(f"[FAIL] {r.provider}: dimension gate tripped: {exc}") + + +if __name__ == "__main__": + asyncio.run(main())