diff --git a/python/packages/core/agent_framework/__init__.py b/python/packages/core/agent_framework/__init__.py
index cc517d993e..bd8b66cbb1 100644
--- a/python/packages/core/agent_framework/__init__.py
+++ b/python/packages/core/agent_framework/__init__.py
@@ -71,6 +71,7 @@ from ._evaluation import (
     Evaluator,
     ExpectedToolCall,
     LocalEvaluator,
+    RubricScore,
     evaluate_agent,
     evaluate_workflow,
     evaluator,
@@ -460,6 +461,7 @@ __all__ = [
     "ResponseStream",
     "Role",
     "RoleLiteral",
+    "RubricScore",
     "RunContext",
     "Runner",
     "RunnerContext",
diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py
index 64fab0eacb..52bdf90d0f 100644
--- a/python/packages/core/agent_framework/_evaluation.py
+++ b/python/packages/core/agent_framework/_evaluation.py
@@ -311,12 +311,15 @@ class EvalScoreResult:
         score: Numeric score from the evaluator.
         passed: Whether the item passed this evaluator's threshold.
         sample: Optional raw evaluator output (rationale, metadata).
+        dimensions: Per-dimension scores when this evaluator is a rubric
+            evaluator.  ``None`` for non-rubric (e.g. built-in) evaluators.
     """
 
     name: str
     score: float
     passed: bool | None = None
     sample: dict[str, Any] | None = None
+    dimensions: list[RubricScore] | None = None
 
 
 @experimental(feature_id=ExperimentalFeature.EVALS)
@@ -496,6 +499,179 @@ class EvalResults:
                     detail += f" Errored items: {', '.join(summaries)}."
             raise EvalNotPassedError(detail)
 
+    def assert_score_at_least(
+        self,
+        min_score: float,
+        *,
+        evaluator: str | None = None,
+        msg: str | None = None,
+    ) -> None:
+        """Assert every item's score (optionally filtered by evaluator) is ``>= min_score``.
+
+        Designed for CI gates (e.g. ``results.assert_score_at_least(0.80)``).
+        Recurses into ``sub_results``.  Passes vacuously when no score
+        matches, e.g. a misspelled ``evaluator`` or items without scores.
+
+        Args:
+            min_score: Minimum acceptable score (inclusive).
+            evaluator: When set, only check scores from the evaluator
+                whose ``EvalScoreResult.name`` matches.
+            msg: Optional custom failure message.
+
+        Raises:
+            EvalNotPassedError: When any matching score is below the threshold.
+        """
+        offenders: list[str] = []
+
+        def _check(results: EvalResults) -> None:
+            for item in results.items:
+                for score in item.scores:
+                    if evaluator is not None and score.name != evaluator:
+                        continue
+                    if score.score < min_score:
+                        offenders.append(f"{item.item_id}/{score.name}={score.score:.3f}")
+            for sub in results.sub_results.values():
+                _check(sub)
+
+        _check(self)
+        if offenders:
+            detail = msg or (
+                f"{len(offenders)} score(s) below threshold {min_score}"
+                f"{' for ' + evaluator if evaluator else ''}: {', '.join(offenders[:5])}"
+                + (f" (+{len(offenders) - 5} more)" if len(offenders) > 5 else "")
+            )
+            raise EvalNotPassedError(detail)
+
+    def assert_dimension_score_at_least(
+        self,
+        dimension_id: str,
+        min_score: float,
+        *,
+        evaluator: str | None = None,
+        require_applicable: bool = False,
+        msg: str | None = None,
+    ) -> None:
+        """Assert every item's score for a rubric *dimension* is ``>= min_score``.
+
+        Walks ``EvalScoreResult.dimensions`` looking for the named
+        dimension across all items (and sub-results).  Non-applicable
+        dimensions are skipped by default; pass
+        ``require_applicable=True`` to fail every item that produced no
+        applicable score.  An applicable entry whose score is ``None`` fails.
+
+        Args:
+            dimension_id: Dimension id (matches the rubric definition).
+            min_score: Minimum acceptable dimension score (inclusive).
+            evaluator: When set, only consider scores from the evaluator
+                whose ``EvalScoreResult.name`` matches.
+            require_applicable: When ``True``, missing or non-applicable
+                dimension scores raise.  Defaults to ``False`` (skip).
+            msg: Optional custom failure message.
+
+        Raises:
+            EvalNotPassedError: When the dimension fails the threshold.
+        """
+        offenders: list[str] = []
+        missing_items: list[str] = []
+
+        def _check(results: EvalResults) -> None:
+            for item in results.items:
+                found_applicable = False
+                for score in item.scores:
+                    if evaluator is not None and score.name != evaluator:
+                        continue
+                    for rs in score.dimensions or ():
+                        if rs.id != dimension_id or not rs.applicable:
+                            continue
+                        found_applicable = True
+                        # An applicable dimension with no score counts as a failure.
+                        if rs.score is None or rs.score < min_score:
+                            offenders.append(
+                                f"{item.item_id}/{score.name}/{dimension_id}="
+                                f"{rs.score if rs.score is not None else 'None'}"
+                            )
+                if require_applicable and not found_applicable:
+                    missing_items.append(item.item_id)
+            for sub in results.sub_results.values():
+                _check(sub)
+
+        def _preview(entries: list[str]) -> str:
+            # First five entries, plus a count of any that were truncated.
+            extra = f" (+{len(entries) - 5} more)" if len(entries) > 5 else ""
+            return ", ".join(entries[:5]) + extra
+
+        _check(self)
+        problems: list[str] = []
+        if offenders:
+            problems.append(
+                f"{len(offenders)} dimension score(s) for '{dimension_id}' below {min_score}: {_preview(offenders)}"
+            )
+        if missing_items:
+            problems.append(
+                f"Dimension '{dimension_id}' not applicable on {len(missing_items)} item(s): {_preview(missing_items)}"
+            )
+        if problems:
+            raise EvalNotPassedError(msg or "; ".join(problems))
+
+    def assert_no_failed_items(self, msg: str | None = None) -> None:
+        """Assert no item reports ``is_failed`` or ``is_error``.
+
+        Recurses into ``sub_results``; the default message lists at most five ``item_id:status`` pairs.
+
+        Args:
+            msg: Optional custom failure message.
+
+        Raises:
+            EvalNotPassedError: When any item failed or errored.
+        """
+        bad: list[str] = []
+
+        def _check(results: EvalResults) -> None:
+            for item in results.items:
+                if item.is_failed or item.is_error:
+                    bad.append(f"{item.item_id}:{item.status}")
+            for sub in results.sub_results.values():
+                _check(sub)
+
+        _check(self)
+        if bad:
+            detail = msg or (
+                f"{len(bad)} item(s) failed or errored: {', '.join(bad[:5])}"
+                + (f" (+{len(bad) - 5} more)" if len(bad) > 5 else "")
+            )
+            raise EvalNotPassedError(detail)
+
+
+# endregion
+
+# region Generated rubric evaluators
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class RubricScore:
+    """A single dimension's score from a rubric-based evaluator run.
+
+    Rubric evaluators emit one ``RubricScore`` per dimension per item.
+    Attached to :class:`EvalScoreResult` as a typed view of the raw
+    per-dimension payload returned by providers, e.g. Foundry's
+    ``properties.dimension_scores`` (``rubric_scores`` in preview builds).
+
+    Attributes:
+        id: Dimension id (matches the rubric definition).
+        score: Numeric score, or ``None`` when the dimension was marked
+            non-applicable for this item.
+        applicable: Whether the dimension applied to this item.
+        weight: Dimension weight (mirrors the rubric definition).
+        reason: Short rationale produced by the evaluator.
+    """
+
+    id: str
+    score: int | None
+    applicable: bool
+    weight: int
+    reason: str
+
 
 # endregion
 
diff --git a/python/packages/core/agent_framework/foundry/__init__.py b/python/packages/core/agent_framework/foundry/__init__.py
index 103bdca8f8..06bd4df17e 100644
--- a/python/packages/core/agent_framework/foundry/__init__.py
+++ b/python/packages/core/agent_framework/foundry/__init__.py
@@ -34,6 +34,7 @@ _IMPORTS: dict[str, tuple[str, str]] = {
     "FoundryLocalChatOptions": ("agent_framework_foundry_local", "agent-framework-foundry-local"),
     "FoundryLocalClient": ("agent_framework_foundry_local", "agent-framework-foundry-local"),
     "FoundryLocalSettings": ("agent_framework_foundry_local", "agent-framework-foundry-local"),
+    "GeneratedEvaluatorRef": ("agent_framework_foundry", "agent-framework-foundry"),
     "RawAnthropicFoundryClient": ("agent_framework_anthropic", "agent-framework-anthropic"),
     "RawFoundryAgent": ("agent_framework_foundry", "agent-framework-foundry"),
     "RawFoundryAgentChatClient": ("agent_framework_foundry", "agent-framework-foundry"),
diff --git a/python/packages/core/agent_framework/foundry/__init__.pyi b/python/packages/core/agent_framework/foundry/__init__.pyi
index 73c3ffe589..08a7fc1b88 100644
--- a/python/packages/core/agent_framework/foundry/__init__.pyi
+++ b/python/packages/core/agent_framework/foundry/__init__.pyi
@@ -20,6 +20,7 @@ from agent_framework_foundry import (
     FoundryEmbeddingSettings,
     FoundryEvals,
     FoundryMemoryProvider,
+    GeneratedEvaluatorRef,
     RawFoundryAgent,
     RawFoundryAgentChatClient,
     RawFoundryChatClient,
@@ -52,6 +53,7 @@ __all__ = [
     "FoundryLocalClient",
     "FoundryLocalSettings",
     "FoundryMemoryProvider",
+    "GeneratedEvaluatorRef",
     "RawAnthropicFoundryClient",
     "RawFoundryAgent",
     "RawFoundryAgentChatClient",
diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py
index 96b0e1a391..e60fb35d51 100644
--- a/python/packages/core/tests/core/test_local_eval.py
+++ b/python/packages/core/tests/core/test_local_eval.py
@@ -11,8 +11,13 @@ import pytest
 from agent_framework._evaluation import (
     CheckResult,
     EvalItem,
+    EvalItemResult,
+    EvalNotPassedError,
+    EvalResults,
+    EvalScoreResult,
     ExpectedToolCall,
     LocalEvaluator,
+    RubricScore,
     _coerce_result,
     evaluator,
     keyword_check,
@@ -1010,19 +1015,101 @@ class TestAllPassedSubResults:
 
 
 # ---------------------------------------------------------------------------
-# r5 review: _build_overall_item with empty outputs
+# Rubric assertions (EvalResults.assert_*)
 # ---------------------------------------------------------------------------
 
 
-class TestBuildOverallItemEmpty:
-    """Test _build_overall_item returns None for empty workflow outputs."""
+def _rubric_results(*scores_per_item: list[EvalScoreResult]) -> EvalResults:
+    items = [
+        EvalItemResult(item_id=f"item-{i}", status="pass", scores=scores) for i, scores in enumerate(scores_per_item)
+    ]
+    return EvalResults(
+        provider="test",
+        eval_id="ev1",
+        run_id="run1",
+        result_counts={"passed": len(items), "failed": 0, "errored": 0, "total": len(items)},
+        items=items,
+    )
 
-    def test_returns_none_for_empty_outputs(self):
-        from unittest.mock import MagicMock
 
-        from agent_framework._evaluation import _build_overall_item
+class TestRubricAssertions:
+    """Tests for EvalResults.assert_dimension_score_at_least."""
 
-        mock_result = MagicMock()
-        mock_result.get_outputs.return_value = []
-        item = _build_overall_item("Hello", mock_result)
-        assert item is None
+    def test_dimension_at_or_above_threshold_passes(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="policy",
+                    score=0.9,
+                    dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")],
+                )
+            ],
+        )
+        # Should not raise.
+        results.assert_dimension_score_at_least("clarity", 3)
+
+    def test_dimension_below_threshold_raises(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="policy",
+                    score=0.5,
+                    dimensions=[RubricScore(id="clarity", score=2, applicable=True, weight=1, reason="")],
+                )
+            ],
+        )
+        with pytest.raises(EvalNotPassedError):
+            results.assert_dimension_score_at_least("clarity", 3)
+
+    def test_non_applicable_skipped_by_default(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="policy",
+                    score=1.0,
+                    dimensions=[RubricScore(id="clarity", score=None, applicable=False, weight=1, reason="n/a")],
+                )
+            ],
+        )
+        # No applicable scores; default behaviour is to skip silently.
+        results.assert_dimension_score_at_least("clarity", 3)
+
+    def test_require_applicable_raises_when_dimension_absent(self) -> None:
+        results = _rubric_results(
+            [EvalScoreResult(name="policy", score=1.0, dimensions=[])],
+        )
+        with pytest.raises(EvalNotPassedError, match="not applicable"):
+            results.assert_dimension_score_at_least("clarity", 3, require_applicable=True)
+
+    def test_require_applicable_raises_when_filtered_evaluator_missing(self) -> None:
+        # Regression: previously the (not evaluator or found_any) guard caused
+        # this case to silently pass even with require_applicable=True.
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="other",
+                    score=0.9,
+                    dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")],
+                )
+            ],
+        )
+        with pytest.raises(EvalNotPassedError, match="not applicable"):
+            results.assert_dimension_score_at_least("clarity", 3, evaluator="policy", require_applicable=True)
+
+    def test_evaluator_filter_isolates_offenders(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="other",
+                    score=0.1,
+                    dimensions=[RubricScore(id="clarity", score=1, applicable=True, weight=1, reason="")],
+                ),
+                EvalScoreResult(
+                    name="policy",
+                    score=0.9,
+                    dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")],
+                ),
+            ],
+        )
+        # The low-scoring "other" evaluator is filtered out; "policy" passes.
+        results.assert_dimension_score_at_least("clarity", 3, evaluator="policy")
diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py
index e6422e72c8..1ee0fc56dd 100644
--- a/python/packages/foundry/agent_framework_foundry/__init__.py
+++ b/python/packages/foundry/agent_framework_foundry/__init__.py
@@ -12,6 +12,7 @@ from ._embedding_client import (
 )
 from ._foundry_evals import (
     FoundryEvals,
+    GeneratedEvaluatorRef,
     evaluate_foundry_target,
     evaluate_traces,
 )
@@ -33,6 +34,7 @@ __all__ = [
     "FoundryEmbeddingSettings",
     "FoundryEvals",
     "FoundryMemoryProvider",
+    "GeneratedEvaluatorRef",
     "RawFoundryAgent",
     "RawFoundryAgentChatClient",
     "RawFoundryChatClient",
diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index eef58b0a04..8059c2ce99 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -28,8 +28,9 @@ from __future__ import annotations
 
 import asyncio
 import logging
-from collections.abc import Sequence
-from typing import TYPE_CHECKING, Any
+from collections.abc import Iterable, Sequence
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, cast
 
 from agent_framework._evaluation import (
     AgentEvalConverter,
@@ -39,6 +40,7 @@ from agent_framework._evaluation import (
     EvalItemResult,
     EvalResults,
     EvalScoreResult,
+    RubricScore,
 )
 from agent_framework._feature_stage import ExperimentalFeature, experimental
 from openai import AsyncOpenAI
@@ -51,6 +53,54 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+
+# region Generated rubric evaluator references
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class GeneratedEvaluatorRef:
+    """A reference to a rubric evaluator that already exists in Foundry.
+
+    Pass instances of this class to :class:`FoundryEvals` to score items
+    with a pre-existing rubric evaluator (manually authored or
+    auto-generated through the Foundry portal).  agent-framework is a
+    consumer here: it does not create or modify the evaluator definition;
+    it only references the persisted version by name.
+
+    Pinning ``version`` is strongly recommended so evaluation runs are
+    reproducible.  ``version=None`` resolves to whichever version is
+    current at execution time; :class:`FoundryEvals` emits a warning when
+    a versionless reference is used.  CI gates should always pass a
+    concrete version.
+
+    Attributes:
+        name: Evaluator name as stored in the Foundry project (for
+            example ``"reservation-policy-rubric"``).  Distinct from
+            built-in evaluators such as ``"builtin.relevance"``.
+        version: Pinned evaluator version.  ``None`` means "latest" —
+            this is discouraged for CI/repro and :class:`FoundryEvals`
+            will emit a warning when used.
+        display_name: Optional human-readable label, sent as the testing
+            criterion's ``name``.  Defaults to ``name`` when unset.
+    """
+
+    name: str
+    version: str | None = None
+    display_name: str | None = None
+
+    @classmethod
+    def latest(cls, name: str, *, display_name: str | None = None) -> GeneratedEvaluatorRef:
+        """Construct a versionless reference (resolves to the latest version at run time).
+
+        Discouraged for reproducible runs.  Prefer the constructor with
+        an explicit ``version`` so CI and replay evaluations stay stable
+        when the evaluator is updated in Foundry.
+        """
+        return cls(name=name, version=None, display_name=display_name)
+
+
+# endregion
 # Agent evaluators that accept query/response as conversation arrays.
 # Maintained manually — check https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
 # for the latest evaluator list. These are the evaluators that need conversation-format input.
@@ -166,7 +216,7 @@ def _resolve_evaluator(name: str) -> str:
 
 
 def _build_testing_criteria(
-    evaluators: Sequence[str],
+    evaluators: Sequence[str | GeneratedEvaluatorRef],
     model: str,
     *,
     include_data_mapping: bool = False,
@@ -175,7 +225,9 @@ def _build_testing_criteria(
     """Build ``testing_criteria`` for ``evals.create()``.
 
     Args:
-        evaluators: Evaluator names.
+        evaluators: Evaluator names (built-in shorts / fully-qualified
+            ``builtin.*`` names) or :class:`GeneratedEvaluatorRef`
+            instances for generated rubric evaluators.
         model: Model deployment for the LLM judge.
         include_data_mapping: Whether to include field-level data mapping
             (required for the JSONL data source, not needed for response-based).
@@ -183,7 +235,38 @@ def _build_testing_criteria(
             definitions.
     """
     criteria: list[dict[str, Any]] = []
-    for name in evaluators:
+    for entry_spec in evaluators:
+        if isinstance(entry_spec, GeneratedEvaluatorRef):
+            short = entry_spec.display_name or entry_spec.name
+            ref_entry: dict[str, Any] = {
+                "type": "azure_ai_evaluator",
+                "name": short,
+                "evaluator_name": entry_spec.name,
+                "initialization_parameters": {"deployment_name": model},
+            }
+            if entry_spec.version is not None:
+                ref_entry["evaluator_version"] = entry_spec.version
+            else:
+                logger.warning(
+                    "GeneratedEvaluatorRef '%s' has no pinned version; the eval run "
+                    "will resolve to whichever version is current at execution time. "
+                    "Pin the version for reproducible runs.",
+                    entry_spec.name,
+                )
+            if include_data_mapping:
+                # Rubric evaluators accept conversation arrays like agent
+                # evaluators, plus tool_definitions when items are tool-aware.
+                ref_mapping: dict[str, str] = {
+                    "query": "{{item.query_messages}}",
+                    "response": "{{item.response_messages}}",
+                }
+                if include_tool_definitions:
+                    ref_mapping["tool_definitions"] = "{{item.tool_definitions}}"
+                ref_entry["data_mapping"] = ref_mapping
+            criteria.append(ref_entry)
+            continue
+
+        name = entry_spec
         qualified = _resolve_evaluator(name)
         short = name if not name.startswith("builtin.") else name.split(".")[-1]
 
@@ -247,9 +330,9 @@ def _build_item_schema(
 
 
 def _resolve_default_evaluators(
-    evaluators: Sequence[str] | None,
+    evaluators: Sequence[str | GeneratedEvaluatorRef] | None,
     items: Sequence[EvalItem | dict[str, Any]] | None = None,
-) -> list[str]:
+) -> list[str | GeneratedEvaluatorRef]:
     """Resolve evaluators, applying defaults when ``None``.
 
     Defaults to relevance + coherence + task_adherence. Automatically adds
@@ -258,7 +341,7 @@ def _resolve_default_evaluators(
     if evaluators is not None:
         return list(evaluators)
 
-    result = list(_DEFAULT_EVALUATORS)
+    result: list[str | GeneratedEvaluatorRef] = list(_DEFAULT_EVALUATORS)
     if items is not None:
         has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items)
         if has_tools:
@@ -267,14 +350,24 @@ def _resolve_default_evaluators(
 
 
 def _filter_tool_evaluators(
-    evaluators: list[str],
+    evaluators: list[str | GeneratedEvaluatorRef],
     items: Sequence[EvalItem | dict[str, Any]],
-) -> list[str]:
-    """Remove tool evaluators if no items have tool definitions."""
+) -> list[str | GeneratedEvaluatorRef]:
+    """Remove tool evaluators if no items have tool definitions.
+
+    Generated rubric evaluators are tool-aware but not tool-required; they
+    are preserved regardless of whether items carry tool definitions.
+    """
     has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items)
     if has_tools:
         return evaluators
-    filtered = [e for e in evaluators if _resolve_evaluator(e) not in _TOOL_EVALUATORS]
+
+    def _is_tool_only(spec: str | GeneratedEvaluatorRef) -> bool:
+        if isinstance(spec, GeneratedEvaluatorRef):
+            return False
+        return _resolve_evaluator(spec) in _TOOL_EVALUATORS
+
+    filtered = [e for e in evaluators if not _is_tool_only(e)]
     if not filtered:
         raise ValueError(
             f"All requested evaluators {evaluators} require tool definitions, "
@@ -282,7 +375,7 @@ def _filter_tool_evaluators(
             "or choose evaluators that do not require tools."
         )
     if len(filtered) < len(evaluators):
-        removed = [e for e in evaluators if _resolve_evaluator(e) in _TOOL_EVALUATORS]
+        removed = [e for e in evaluators if _is_tool_only(e)]
         logger.info("Removed tool evaluators %s (no items have tools)", removed)
     return filtered
 
@@ -354,6 +447,114 @@ def _extract_per_evaluator(run: RunRetrieveResponse) -> dict[str, dict[str, int]
     return per_eval
 
 
+_RUBRIC_DIMENSION_KEYS: tuple[str, ...] = ("dimension_scores", "rubric_scores")
+"""Property keys that may carry per-dimension rubric breakdowns.
+
+The published Foundry rubric-evaluator output format uses
+``properties.dimension_scores`` (see the Microsoft Learn "Rubric
+evaluators" reference).  Earlier preview builds and some SDK shapes
+used ``rubric_scores``; we accept both for defensive forward/backward
+compatibility.
+"""
+
+
+def _parse_dimension_entries(raw: Any) -> list[RubricScore]:
+    """Parse a raw list-like payload into ``RubricScore`` instances.
+
+    Skips entries missing ``id``/``weight``/``applicable`` or with non-convertible numbers.
+    Returns an empty list when ``raw`` is falsy, not iterable, or has no well-formed entries.
+    """
+    if not raw:
+        return []
+    try:
+        raw_iter: Iterable[Any] = iter(raw)
+    except TypeError:
+        return []
+
+    parsed: list[RubricScore] = []
+    for raw_entry in raw_iter:
+        entry: Any = raw_entry
+        try:
+            rid: Any
+            score_val: Any
+            applicable: Any
+            weight: Any
+            reason: Any
+            if isinstance(entry, dict):
+                entry_any = cast("dict[str, Any]", entry)
+                rid = entry_any.get("id")
+                score_val = entry_any.get("score")
+                applicable = entry_any.get("applicable")
+                weight = entry_any.get("weight")
+                reason = entry_any.get("reason", "")
+            else:
+                rid = getattr(entry, "id", None)
+                score_val = getattr(entry, "score", None)
+                applicable = getattr(entry, "applicable", None)
+                weight = getattr(entry, "weight", None)
+                reason = getattr(entry, "reason", "") or ""
+            if rid is None or weight is None or applicable is None:
+                continue
+            parsed.append(
+                RubricScore(
+                    id=str(rid),
+                    score=int(score_val) if isinstance(score_val, (int, float)) else None,
+                    applicable=bool(applicable),
+                    weight=int(weight),
+                    reason=str(reason) if reason is not None else "",
+                )
+            )
+        except (TypeError, ValueError, OverflowError):  # OverflowError: int() of an infinite float
+            logger.debug("Skipping malformed rubric dimension entry: %s", cast("Any", entry), exc_info=True)
+    return parsed
+
+
+def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None:
+    """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload.
+
+    Foundry rubric evaluators include a per-dimension breakdown under
+    ``properties.dimension_scores`` on each result (preview builds used
+    ``rubric_scores``; both keys are accepted, with the canonical
+    ``dimension_scores`` taking priority).  The exact location may
+    vary across SDK versions, so this helper accepts a few shapes:
+
+    * The SDK ``sample`` object exposes
+      ``properties.dimension_scores`` / ``properties.rubric_scores``.
+    * The ``sample`` is a dict containing the same under
+      ``properties.<key>``.
+    * The ``sample`` is a dict with ``dimension_scores`` /
+      ``rubric_scores`` at the top level.
+
+    Returns ``None`` when no container yields a well-formed entry (for
+    example when the evaluator was not a rubric evaluator).
+    """
+    if sample is None:
+        return None
+
+    containers: list[Any] = []
+    properties: Any = getattr(sample, "properties", None)
+    if properties is not None:
+        containers.append(properties)
+    if isinstance(sample, dict):
+        sample_any = cast("dict[str, Any]", sample)
+        props_dict: Any = sample_any.get("properties")
+        if props_dict is not None and props_dict is not properties:
+            containers.append(props_dict)
+        containers.append(sample_any)
+
+    for container in containers:
+        for key in _RUBRIC_DIMENSION_KEYS:
+            raw: Any = None
+            if isinstance(container, dict):
+                raw = cast("dict[str, Any]", container).get(key)
+            elif hasattr(container, key):
+                raw = getattr(container, key, None)
+            parsed = _parse_dimension_entries(raw)
+            if parsed:
+                return parsed
+    return None
+
+
 async def _fetch_output_items(
     client: AsyncOpenAI,
     eval_id: str,
@@ -377,12 +578,15 @@ async def _fetch_output_items(
             # Extract per-evaluator scores
             scores: list[EvalScoreResult] = []
             for r in oi.results or []:
+                sample = r.sample
+                dimensions = _extract_rubric_scores(sample)
                 scores.append(
                     EvalScoreResult(
                         name=r.name,
                         score=r.score,
                         passed=r.passed,
-                        sample=r.sample,
+                        sample=sample,
+                        dimensions=dimensions,
                     )
                 )
 
@@ -394,15 +598,18 @@ async def _fetch_output_items(
             output_text: str | None = None
             response_id: str | None = None
 
-            sample = oi.sample
-            if sample is not None:  # pyright: ignore[reportUnnecessaryComparison]
-                err = sample.error
-                if err is not None and (err.code or err.message):  # pyright: ignore[reportUnnecessaryComparison]
+            # mypy infers oi.sample as dict[str, object] | None, but the
+            # OpenAI SDK actually returns a typed Sample model. Cast to Any so
+            # both type checkers accept the attribute access pattern.
+            oi_sample: Any = oi.sample
+            if oi_sample is not None:
+                err = oi_sample.error
+                if err is not None and (err.code or err.message):
                     error_code = err.code or None
                     error_message = err.message or None
 
-                usage = sample.usage
-                if usage is not None and usage.total_tokens:  # pyright: ignore[reportUnnecessaryComparison]
+                usage = oi_sample.usage
+                if usage is not None and usage.total_tokens:
                     token_usage = {
                         "prompt_tokens": usage.prompt_tokens,
                         "completion_tokens": usage.completion_tokens,
@@ -411,13 +618,13 @@ async def _fetch_output_items(
                     }
 
                 # Extract input/output text
-                if sample.input:
-                    parts = [si.content for si in sample.input if si.role == "user"]
+                if oi_sample.input:
+                    parts = [si.content for si in oi_sample.input if si.role == "user"]
                     if parts:
                         input_text = " ".join(parts)
 
-                if sample.output:
-                    parts = [so.content or "" for so in sample.output if so.role == "assistant"]
+                if oi_sample.output:
+                    parts = [so.content or "" for so in oi_sample.output if so.role == "assistant"]
                     if parts:
                         output_text = " ".join(parts)
 
@@ -472,7 +679,7 @@ async def _evaluate_via_responses_impl(
     *,
     client: AsyncOpenAI,
     response_ids: Sequence[str],
-    evaluators: list[str],
+    evaluators: list[str | GeneratedEvaluatorRef],
     model: str,
     eval_name: str,
     poll_interval: float,
@@ -573,8 +780,11 @@ class FoundryEvals:
             (from ``azure.ai.projects.aio``).  Provide this or *client*.
         model: Model deployment name for the evaluator LLM judge.
             Resolved from ``client.model`` when omitted.
-        evaluators: Evaluator names (e.g. ``["relevance", "tool_call_accuracy"]``).
-            When ``None`` (default), uses smart defaults based on item data.
+        evaluators: Evaluator specifications.  Entries may be built-in
+            short names (e.g. ``"relevance"``), fully-qualified
+            ``"builtin.*"`` names, or :class:`GeneratedEvaluatorRef`
+            instances for previously generated rubric evaluators.  When
+            ``None`` (default), uses smart defaults based on item data.
         conversation_split: How to split multi-turn conversations into
             query/response halves.  Defaults to ``LAST_TURN``.  Pass a
             ``ConversationSplit`` enum value or a custom callable — see
@@ -623,7 +833,7 @@ class FoundryEvals:
         client: FoundryChatClient | None = None,
         project_client: AIProjectClient | None = None,
         model: str | None = None,
-        evaluators: Sequence[str] | None = None,
+        evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None,
         conversation_split: ConversationSplitter = ConversationSplit.LAST_TURN,
         poll_interval: float = 5.0,
         timeout: float = 180.0,
@@ -642,7 +852,9 @@ class FoundryEvals:
                 "Model is required. Pass model= explicitly or use a FoundryChatClient that has a model configured."
             )
         self._model = resolved_model
-        self._evaluators = list(evaluators) if evaluators is not None else None
+        self._evaluators: list[str | GeneratedEvaluatorRef] | None = (
+            list(evaluators) if evaluators is not None else None
+        )
         self._conversation_split = conversation_split
         self._poll_interval = poll_interval
         self._timeout = timeout
@@ -678,7 +890,7 @@ class FoundryEvals:
     async def _evaluate_via_dataset(
         self,
         items: Sequence[EvalItem],
-        evaluators: list[str],
+        evaluators: list[str | GeneratedEvaluatorRef],
         eval_name: str,
     ) -> EvalResults:
         """Evaluate using JSONL dataset upload path."""
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index a5d9f2e864..8734650aaf 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -25,16 +25,25 @@ from agent_framework._evaluation import (
 from agent_framework._workflows._workflow import WorkflowRunResult
 from openai import AsyncOpenAI
 
+from agent_framework_foundry import GeneratedEvaluatorRef
 from agent_framework_foundry._foundry_evals import (
+    _AGENT_EVALUATORS,
+    _BUILTIN_EVALUATORS,
+    _TOOL_EVALUATORS,
     FoundryEvals,
     _build_item_schema,
     _build_testing_criteria,
     _extract_per_evaluator,
     _extract_result_counts,
+    _extract_rubric_scores,
+    _fetch_output_items,
     _filter_tool_evaluators,
+    _poll_eval_run,
     _resolve_default_evaluators,
     _resolve_evaluator,
     _resolve_openai_client,
+    evaluate_foundry_target,
+    evaluate_traces,
 )
 
 
@@ -806,6 +815,67 @@ class TestBuildTestingCriteria:
         for c in criteria:
             assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions"
 
+    def test_generated_evaluator_ref_pinned_version(self) -> None:
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="1")
+        criteria = _build_testing_criteria([ref], "gpt-4o", include_data_mapping=True)
+
+        assert len(criteria) == 1
+        c = criteria[0]
+        assert c["type"] == "azure_ai_evaluator"
+        assert c["evaluator_name"] == "my-rubric"
+        assert c["evaluator_version"] == "1"
+        assert c["name"] == "my-rubric"
+        assert c["initialization_parameters"] == {"deployment_name": "gpt-4o"}
+        assert c["data_mapping"] == {
+            "query": "{{item.query_messages}}",
+            "response": "{{item.response_messages}}",
+        }
+
+    def test_generated_evaluator_ref_display_name_used_as_short(self) -> None:
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="2", display_name="My Rubric")
+        criteria = _build_testing_criteria([ref], "gpt-4o")
+
+        assert criteria[0]["name"] == "My Rubric"
+        assert criteria[0]["evaluator_name"] == "my-rubric"
+
+    def test_generated_evaluator_ref_tool_definitions_added(self) -> None:
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="1")
+        criteria = _build_testing_criteria(
+            [ref],
+            "gpt-4o",
+            include_data_mapping=True,
+            include_tool_definitions=True,
+        )
+
+        assert criteria[0]["data_mapping"]["tool_definitions"] == "{{item.tool_definitions}}"
+
+    def test_generated_evaluator_ref_unpinned_warns(self, caplog: pytest.LogCaptureFixture) -> None:
+        import logging
+
+        ref = GeneratedEvaluatorRef.latest("my-rubric")
+        with caplog.at_level(logging.WARNING, logger="agent_framework_foundry._foundry_evals"):
+            criteria = _build_testing_criteria([ref], "gpt-4o")
+
+        assert "evaluator_version" not in criteria[0]
+        assert any("no pinned version" in r.message for r in caplog.records)
+
+    def test_generated_evaluator_ref_mixed_with_builtins(self) -> None:
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="1")
+        criteria = _build_testing_criteria(
+            ["relevance", ref, "task_adherence"],
+            "gpt-4o",
+            include_data_mapping=True,
+        )
+
+        assert [c["name"] for c in criteria] == ["relevance", "my-rubric", "task_adherence"]
+        assert criteria[0]["evaluator_name"] == "builtin.relevance"
+        assert criteria[1]["evaluator_name"] == "my-rubric"
+        assert criteria[2]["evaluator_name"] == "builtin.task_adherence"
+
 
 # ---------------------------------------------------------------------------
 # _build_item_schema
@@ -1263,6 +1333,29 @@ class TestFilterToolEvaluators:
                 items,
             )
 
+    def test_preserves_generated_ref_when_no_tools(self) -> None:
+
+        ref = GeneratedEvaluatorRef(name="rubric", version="1")
+        items = [
+            EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]),
+        ]
+        result = _filter_tool_evaluators(
+            ["relevance", ref, "tool_call_accuracy"],
+            items,
+        )
+        assert "relevance" in result
+        assert ref in result
+        assert "tool_call_accuracy" not in result
+
+    def test_generated_ref_alone_does_not_raise(self) -> None:
+
+        ref = GeneratedEvaluatorRef(name="rubric", version="1")
+        items = [
+            EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]),
+        ]
+        result = _filter_tool_evaluators([ref], items)
+        assert result == [ref]
+
 
 # ---------------------------------------------------------------------------
 # EvalResults
@@ -2267,7 +2360,6 @@ class TestEvalResultsWithItems:
 
 class TestFetchOutputItems:
     async def test_fetches_and_converts_output_items(self) -> None:
-        from agent_framework_foundry._foundry_evals import _fetch_output_items
 
         # Build mock output items matching the OpenAI SDK schema
         mock_result = MagicMock()
@@ -2329,7 +2421,6 @@ class TestFetchOutputItems:
         assert item.error_code is None
 
     async def test_handles_errored_item(self) -> None:
-        from agent_framework_foundry._foundry_evals import _fetch_output_items
 
         mock_error = MagicMock()
         mock_error.code = "QueryExtractionError"
@@ -2361,7 +2452,6 @@ class TestFetchOutputItems:
         assert len(item.scores) == 0
 
     async def test_handles_api_failure_gracefully(self) -> None:
-        from agent_framework_foundry._foundry_evals import _fetch_output_items
 
         mock_client = MagicMock()
         mock_client.evals.runs.output_items.list = AsyncMock(side_effect=TypeError("API error"))
@@ -2369,6 +2459,166 @@ class TestFetchOutputItems:
         items = await _fetch_output_items(mock_client, "eval_1", "run_1")
         assert items == []
 
+    async def test_extracts_rubric_scores_from_dict_sample(self) -> None:
+
+        mock_result = MagicMock()
+        mock_result.name = "my-rubric"
+        mock_result.score = 0.85
+        mock_result.passed = True
+        mock_result.sample = {
+            "properties": {
+                "rubric_scores": [
+                    {"id": "policy", "score": 4, "applicable": True, "weight": 1, "reason": "ok"},
+                    {"id": "safety", "score": None, "applicable": False, "weight": 1, "reason": "n/a"},
+                ]
+            }
+        }
+
+        mock_oi = MagicMock()
+        mock_oi.id = "oi_1"
+        mock_oi.status = "pass"
+        mock_oi.results = [mock_result]
+        mock_oi.sample = None
+        mock_oi.datasource_item = {}
+
+        mock_client = MagicMock()
+        mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi]))
+
+        items = await _fetch_output_items(mock_client, "eval_1", "run_1")
+
+        assert len(items) == 1
+        scores = items[0].scores
+        assert len(scores) == 1
+        assert scores[0].dimensions is not None
+        assert len(scores[0].dimensions) == 2
+        policy = next(d for d in scores[0].dimensions if d.id == "policy")
+        assert policy.score == 4
+        assert policy.applicable is True
+        assert policy.weight == 1
+        assert policy.reason == "ok"
+        safety = next(d for d in scores[0].dimensions if d.id == "safety")
+        assert safety.score is None
+        assert safety.applicable is False
+
+    async def test_no_rubric_scores_when_absent(self) -> None:
+
+        mock_result = MagicMock()
+        mock_result.name = "relevance"
+        mock_result.score = 0.85
+        mock_result.passed = True
+        mock_result.sample = None
+
+        mock_oi = MagicMock()
+        mock_oi.id = "oi_2"
+        mock_oi.status = "pass"
+        mock_oi.results = [mock_result]
+        mock_oi.sample = None
+        mock_oi.datasource_item = {}
+
+        mock_client = MagicMock()
+        mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi]))
+
+        items = await _fetch_output_items(mock_client, "eval_1", "run_1")
+
+        assert items[0].scores[0].dimensions is None
+
+
+class TestExtractRubricScores:
+    def test_handles_attribute_style_properties(self) -> None:
+
+        rs = MagicMock()
+        rs.id = "policy"
+        rs.score = 5
+        rs.applicable = True
+        rs.weight = 2
+        rs.reason = "ok"
+
+        sample = MagicMock()
+        sample.properties = MagicMock()
+        sample.properties.rubric_scores = [rs]
+
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert result[0].id == "policy"
+        assert result[0].score == 5
+        assert result[0].weight == 2
+
+    def test_top_level_rubric_scores_in_dict(self) -> None:
+
+        sample = {"rubric_scores": [{"id": "a", "score": 3, "applicable": True, "weight": 1, "reason": "r"}]}
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert result[0].id == "a"
+
+    def test_returns_none_when_missing(self) -> None:
+
+        assert _extract_rubric_scores(None) is None
+        assert _extract_rubric_scores({}) is None
+        assert _extract_rubric_scores({"properties": {}}) is None
+
+    def test_skips_malformed_entries(self) -> None:
+
+        sample = {
+            "properties": {
+                "rubric_scores": [
+                    {"id": "good", "score": 3, "applicable": True, "weight": 1, "reason": "ok"},
+                    {"id": "bad-no-weight", "score": 2, "applicable": True, "reason": "x"},
+                ]
+            }
+        }
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert len(result) == 1
+        assert result[0].id == "good"
+
+    def test_canonical_dimension_scores_key_from_docs(self) -> None:
+        """Per the Microsoft Learn docs, runtime output uses ``properties.dimension_scores``."""
+
+        sample = {
+            "properties": {
+                "dimension_scores": [
+                    {
+                        "id": "intent_recognition",
+                        "score": 5,
+                        "applicable": True,
+                        "weight": 9,
+                        "reason": "Identified correctly.",
+                    },
+                    {
+                        "id": "general_quality",
+                        "score": 4,
+                        "applicable": True,
+                        "weight": 5,
+                        "reason": "Strong overall.",
+                    },
+                ]
+            }
+        }
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert [r.id for r in result] == ["intent_recognition", "general_quality"]
+        assert [r.score for r in result] == [5, 4]
+        assert [r.weight for r in result] == [9, 5]
+
+    def test_dimension_scores_via_attribute(self) -> None:
+        """Canonical key also resolves when properties exposes ``dimension_scores`` as an attr."""
+
+        rs = MagicMock()
+        rs.id = "policy_enforcement"
+        rs.score = 1
+        rs.applicable = True
+        rs.weight = 5
+        rs.reason = "violated"
+
+        sample = MagicMock()
+        sample.properties = MagicMock(spec=["dimension_scores"])
+        sample.properties.dimension_scores = [rs]
+
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert result[0].id == "policy_enforcement"
+        assert result[0].score == 1
+
 
 # ---------------------------------------------------------------------------
 # _poll_eval_run — timeout / failed / canceled paths
@@ -2378,7 +2628,6 @@ class TestFetchOutputItems:
 class TestPollEvalRun:
     async def test_timeout_returns_timeout_status(self) -> None:
         """Poll timeout returns EvalResults with status='timeout'."""
-        from agent_framework_foundry._foundry_evals import _poll_eval_run
 
         mock_client = MagicMock()
         mock_pending = MagicMock()
@@ -2392,7 +2641,6 @@ class TestPollEvalRun:
 
     async def test_failed_run_returns_error(self) -> None:
         """Failed run returns EvalResults with error message."""
-        from agent_framework_foundry._foundry_evals import _poll_eval_run
 
         mock_client = MagicMock()
         mock_failed = MagicMock()
@@ -2410,7 +2658,6 @@ class TestPollEvalRun:
 
     async def test_canceled_run_returns_canceled_status(self) -> None:
         """Canceled run returns EvalResults with status='canceled'."""
-        from agent_framework_foundry._foundry_evals import _poll_eval_run
 
         mock_client = MagicMock()
         mock_canceled = MagicMock()
@@ -2435,7 +2682,6 @@ class TestPollEvalRun:
 class TestEvaluateTraces:
     async def test_raises_without_required_args(self) -> None:
         """Raises ValueError when no response_ids, trace_ids, or agent_id given."""
-        from agent_framework_foundry._foundry_evals import evaluate_traces
 
         mock_client = MagicMock()
         with pytest.raises(ValueError, match="Provide at least one of"):
@@ -2446,7 +2692,6 @@ class TestEvaluateTraces:
 
     async def test_response_ids_path(self) -> None:
         """evaluate_traces with response_ids uses the responses API path."""
-        from agent_framework_foundry._foundry_evals import evaluate_traces
 
         mock_client = MagicMock()
 
@@ -2494,7 +2739,6 @@ class TestEvaluateTraces:
 
     async def test_trace_ids_path(self) -> None:
         """evaluate_traces with trace_ids builds azure_ai_traces data source."""
-        from agent_framework_foundry._foundry_evals import evaluate_traces
 
         mock_client = MagicMock()
 
@@ -2534,7 +2778,6 @@ class TestEvaluateTraces:
 class TestEvaluateFoundryTarget:
     async def test_happy_path(self) -> None:
         """evaluate_foundry_target creates eval + run and polls to completion."""
-        from agent_framework_foundry._foundry_evals import evaluate_foundry_target
 
         mock_client = MagicMock()
 
@@ -2670,13 +2913,11 @@ class TestEvaluatorSetConsistency:
     """Verify that _AGENT_EVALUATORS and _TOOL_EVALUATORS are subsets of _BUILTIN_EVALUATORS."""
 
     def test_agent_evaluators_subset(self):
-        from agent_framework_foundry._foundry_evals import _AGENT_EVALUATORS, _BUILTIN_EVALUATORS
 
         diff = _AGENT_EVALUATORS - set(_BUILTIN_EVALUATORS.values())
         assert not diff, f"_AGENT_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}"
 
     def test_tool_evaluators_subset(self):
-        from agent_framework_foundry._foundry_evals import _BUILTIN_EVALUATORS, _TOOL_EVALUATORS
 
         diff = _TOOL_EVALUATORS - set(_BUILTIN_EVALUATORS.values())
         assert not diff, f"_TOOL_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}"
@@ -2690,7 +2931,6 @@ class TestEvaluatorSetConsistency:
 class TestEvaluateTracesAgentId:
     async def test_agent_id_only_path(self) -> None:
         """evaluate_traces with agent_id only builds azure_ai_traces data source."""
-        from agent_framework_foundry._foundry_evals import evaluate_traces
 
         mock_client = MagicMock()
 
@@ -2748,7 +2988,6 @@ class TestFilterToolEvaluatorsRaises:
 class TestEvaluateFoundryTargetValidation:
     async def test_target_without_type_raises(self) -> None:
         """target dict without 'type' key raises ValueError."""
-        from agent_framework_foundry._foundry_evals import evaluate_foundry_target
 
         mock_client = MagicMock()
         with pytest.raises(ValueError, match="'type' key"):
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example
index b6a8af233e..388350edea 100644
--- a/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example
@@ -1,3 +1,12 @@
 FOUNDRY_PROJECT_ENDPOINT="<your-project-endpoint>"
 FOUNDRY_MODEL="<your-model-deployment>"
 
+# Only needed for evaluate_with_rubric_sample.py — connects to the
+# pre-existing Foundry agent that the rubric evaluator was created against.
+FOUNDRY_AGENT_NAME="<your-agent-name>"
+FOUNDRY_AGENT_VERSION="<your-agent-version>"
+
+# Only needed for evaluate_with_rubric_sample.py — references a rubric
+# evaluator you created in Foundry. Pin the version for reproducible runs.
+FOUNDRY_RUBRIC_NAME="<your-rubric-name>"
+FOUNDRY_RUBRIC_VERSION="<your-rubric-version>"
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
index 81412a7f0e..e30ce6aa46 100644
--- a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
@@ -35,6 +35,34 @@ Evaluate what already happened — zero changes to agent code:
 uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py
 ```
 
+### Referencing a rubric evaluator created in Foundry
+
+Foundry users can create rubric evaluators in the Foundry portal (or
+through the dedicated SDK / REST surface). Once an evaluator exists,
+agent-framework consumes it like any other evaluator: pass a
+`GeneratedEvaluatorRef(name=..., version=...)` in the `evaluators=`
+list and pin the version for reproducible runs.
+
+```python
+from agent_framework.foundry import FoundryEvals, GeneratedEvaluatorRef
+
+evals = FoundryEvals(
+    client=chat_client,  # a FoundryChatClient with a model configured
+    evaluators=[
+        GeneratedEvaluatorRef(name="reservation-policy-rubric", version="3"),
+        "relevance", "coherence",
+    ],
+)
+```
+
+Quality gates on rubric output use the standard `EvalResults` helpers,
+including `assert_dimension_score_at_least(...)` for per-dimension
+thresholds.
+
+See [`evaluate_with_rubric_sample.py`](./evaluate_with_rubric_sample.py)
+for a runnable end-to-end example that combines a rubric evaluator with
+built-in evaluators and gates a per-dimension threshold.
+
 ## Setup
 
 Create a `.env` file with configuration as in the `.env.example` file in this folder.
@@ -44,3 +72,4 @@ Create a `.env` file with configuration as in the `.env.example` file in this fo
 - **"I want to test my agent during development"** → `evaluate_agent_sample.py`, Pattern 1
 - **"I want to evaluate past agent runs"** → `evaluate_traces_sample.py`
 - **"I want to inspect/modify eval data before submitting"** → `evaluate_agent_sample.py`, Pattern 2
+- **"I want to score against a custom rubric I created in Foundry"** → `evaluate_with_rubric_sample.py`
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py
new file mode 100644
index 0000000000..06ec5c9bdd
--- /dev/null
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py
@@ -0,0 +1,138 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Evaluate a Foundry agent against a rubric evaluator that was created in Foundry.
+
+Rubric evaluators are LLM-as-judge evaluators with custom scoring dimensions
+that you define for your domain. agent-framework consumes pre-existing rubric
+evaluators — they are authored in the Foundry portal (or via the dedicated
+SDK / REST surface) and referenced here by name and version.
+
+See: https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/rubric-evaluators
+
+This sample demonstrates:
+1. Connecting to a pre-existing Foundry agent (PromptAgent or HostedAgent).
+2. Referencing a pre-existing rubric evaluator by ``name`` and ``version``.
+3. Mixing the rubric with built-in Foundry evaluators in one run.
+4. Asserting per-dimension thresholds with
+   ``EvalResults.assert_dimension_score_at_least(...)`` for CI quality gates.
+
+Starting condition / prerequisites:
+- An Azure AI Foundry project with a deployed model.
+- A registered Foundry agent (PromptAgent or HostedAgent) in that project.
+  This is the agent the rubric is meant to evaluate.
+- A rubric evaluator already created in the Foundry portal against that
+  agent. Creating rubrics through the portal currently requires picking a
+  Foundry agent as the generation context, so this prerequisite is implied
+  by having a rubric at all.
+- Set the following in .env (see ``.env.example``):
+    - ``FOUNDRY_PROJECT_ENDPOINT``
+    - ``FOUNDRY_AGENT_NAME`` (required) and ``FOUNDRY_AGENT_VERSION`` (optional) for the agent
+    - ``FOUNDRY_RUBRIC_NAME`` and ``FOUNDRY_RUBRIC_VERSION`` for the rubric
+    - ``FOUNDRY_MODEL`` for the rubric judge model
+"""
+
+import asyncio
+import os
+
+from agent_framework import EvalNotPassedError, evaluate_agent
+from agent_framework.foundry import FoundryAgent, FoundryChatClient, FoundryEvals, GeneratedEvaluatorRef
+from azure.identity import AzureCliCredential
+from dotenv import load_dotenv
+
+load_dotenv(override=True)
+
+
+async def main() -> None:
+    # 1. Connect to the existing Foundry agent that the rubric was created
+    #    against. PromptAgents and HostedAgents are both supported.
+    credential = AzureCliCredential()
+    project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
+
+    agent = FoundryAgent(
+        project_endpoint=project_endpoint,
+        agent_name=os.environ["FOUNDRY_AGENT_NAME"],
+        agent_version=os.environ.get("FOUNDRY_AGENT_VERSION"),
+        credential=credential,
+    )
+
+    # 2. Reference the pre-existing rubric evaluator by name + version.
+    #    Always pin a version for reproducible CI runs; versionless refs
+    #    resolve to "latest" and emit a warning at evaluation time.
+    rubric_name = os.environ["FOUNDRY_RUBRIC_NAME"]
+    rubric_version = os.environ["FOUNDRY_RUBRIC_VERSION"]
+    rubric = GeneratedEvaluatorRef(name=rubric_name, version=rubric_version)
+
+    # 3. Mix the rubric with built-in evaluators in a single FoundryEvals
+    #    config. FoundryEvals talks to Foundry over the project endpoint, so
+    #    we hand it a FoundryChatClient configured with the same credential.
+    eval_client = FoundryChatClient(
+        project_endpoint=project_endpoint,
+        model=os.environ["FOUNDRY_MODEL"],
+        credential=credential,
+    )
+    evals = FoundryEvals(
+        client=eval_client,
+        evaluators=[
+            rubric,
+            FoundryEvals.RELEVANCE,
+            FoundryEvals.COHERENCE,
+        ],
+    )
+
+    # =========================================================================
+    # Run evaluation
+    # =========================================================================
+    print("=" * 60)
+    print(f"Evaluating '{agent.name}' with rubric '{rubric_name}' (version {rubric_version})")
+    print("=" * 60)
+
+    results = await evaluate_agent(
+        agent=agent,
+        queries=[
+            "What's the weather like in Seattle?",
+            "Should I bring an umbrella to London tomorrow?",
+        ],
+        evaluators=evals,
+    )
+
+    for r in results:
+        print(f"Status: {r.status}")
+        print(f"Results: {r.passed}/{r.total} passed")
+        print(f"Portal: {r.report_url}")
+        if r.all_passed:
+            print("[PASS] All passed")
+        else:
+            print(f"[FAIL] {r.failed} failed")
+
+    # =========================================================================
+    # Per-dimension quality gate
+    # =========================================================================
+    # Rubric evaluators emit per-dimension scores (1–5) on top of the overall
+    # weighted score. Use assert_dimension_score_at_least to gate CI on a
+    # specific dimension — e.g., never ship if a critical dimension drops
+    # below 3.
+    #
+    # The dimension_id must match an id defined on your rubric in Foundry.
+    # ``general_quality`` is used here because it's the conventional
+    # ``always_applicable: true`` dimension in the Foundry docs' example
+    # rubric — swap it for whatever dimension id(s) your rubric actually
+    # defines.
+    print()
+    print("=" * 60)
+    print("Per-dimension quality gate")
+    print("=" * 60)
+
+    for r in results:
+        try:
+            r.assert_dimension_score_at_least(
+                "general_quality",
+                min_score=3.0,
+                evaluator=rubric_name,
+            )
+            print(f"[PASS] {r.provider}: general_quality >= 3 on every item")
+        except EvalNotPassedError as exc:
+            print(f"[FAIL] {r.provider}: dimension gate tripped: {exc}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())