Python: feat(evals): add ground_truth support for similarity evaluator (#5234)

* feat(evals): add ground_truth support for similarity evaluator

  - Include expected_output as ground_truth in Foundry JSONL dataset rows
  - Add ground_truth to item schema and data mapping for similarity evaluator
  - Add expected_output parameter to evaluate_workflow
  - Add similarity Pattern 3 to evaluate_agent and evaluate_workflow samples
  - Add tests for ground_truth in dataset, schema, and evaluate_workflow

* Apply suggestions from code review

  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix: wrap long line to satisfy ruff E501

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-06-16 21:04:09 +08:00 · 2026-04-21 12:40:53 -07:00
parent 8f17067383
commit aa582d021d
5 changed files with 270 additions and 7 deletions
@@ -2,9 +2,10 @@

 """Evaluate an agent using Azure AI Foundry's built-in evaluators.

-This sample demonstrates two patterns:
+This sample demonstrates three patterns:
 1. evaluate_agent(responses=...) — Evaluate a response you already have.
 2. evaluate_agent(queries=...) — Run the agent against test queries and evaluate in one call.
+3. Similarity — Compare agent output against ground-truth reference answers.

 See ``evaluate_tool_calls_sample.py`` for tool-call accuracy evaluation.

@@ -149,6 +150,41 @@ async def main() -> None:
        else:
            print(f"[FAIL] {r.failed} failed")

+    # =========================================================================
+    # Pattern 3: Similarity — compare agent output to ground-truth answers
+    # =========================================================================
+    print()
+    print("=" * 60)
+    print("Pattern 3: Similarity evaluation with ground truth")
+    print("=" * 60)
+
+    # Similarity requires expected_output — a reference answer per query
+    # that the evaluator compares against the agent's actual response.
+    results = await evaluate_agent(
+        agent=agent,
+        queries=[
+            "What's the weather like in Seattle?",
+            "How much does a flight from Seattle to Paris cost?",
+        ],
+        expected_output=[
+            "62°F, cloudy with a chance of rain",
+            "Flights from Seattle to Paris: $450 round-trip",
+        ],
+        evaluators=FoundryEvals(
+            client=chat_client,
+            evaluators=[FoundryEvals.SIMILARITY],
+        ),
+    )
+
+    for r in results:
+        print(f"Status: {r.status}")
+        print(f"Results: {r.passed}/{r.total} passed")
+        print(f"Portal: {r.report_url}")
+        if r.all_passed:
+            print("[PASS] All passed")
+        else:
+            print(f"[FAIL] {r.failed} failed")
+

 if __name__ == "__main__":
    asyncio.run(main())
@@ -2,11 +2,12 @@

 """Evaluate a multi-agent workflow using Azure AI Foundry evaluators.

-This sample demonstrates two patterns:
+This sample demonstrates three patterns:
 1. Post-hoc: Run the workflow, then evaluate the result you already have.
 2. Run + evaluate: Pass queries and let evaluate_workflow() run the workflow for you.
+3. Similarity: Evaluate the workflow's final output against ground-truth reference answers.

-Both patterns return a list of results (one per provider), each with a per-agent
+All patterns return a list of results (one per provider); Patterns 1 & 2 add a per-agent
 breakdown in sub_results so you can identify which agent is underperforming.

 Prerequisites:
@@ -79,7 +80,6 @@ async def main() -> None:

    # 4. Create the evaluator — provider config goes here, once
    evals = FoundryEvals(client=client)
-
    # =========================================================================
    # Pattern 1: Post-hoc — evaluate a workflow run you already did
    # =========================================================================
@@ -143,6 +143,43 @@ async def main() -> None:
            if agent_eval.report_url:
                print(f"    Portal: {agent_eval.report_url}")

+    # =========================================================================
+    # Pattern 3: Similarity — compare workflow output to ground-truth answers
+    # =========================================================================
+    # Build a fresh workflow to avoid stale session state from Pattern 2.
+    workflow3 = SequentialBuilder(participants=[researcher, planner]).build()
+
+    print()
+    print("=" * 60)
+    print("Pattern 3: Similarity evaluation with ground truth")
+    print("=" * 60)
+
+    # Similarity compares the final workflow output against a reference answer,
+    # so per-agent breakdown is disabled — individual agents don't have their
+    # own ground-truth targets.
+    eval_results = await evaluate_workflow(
+        workflow=workflow3,
+        queries=[
+            "Plan a trip from Seattle to Paris",
+            "Plan a trip from London to Tokyo",
+        ],
+        expected_output=[
+            "Pack layers and an umbrella for Paris. Flights from Seattle are around $450 round-trip.",
+            "Bring warm clothing for Tokyo in spring. Flights from London are around $500 round-trip.",
+        ],
+        evaluators=FoundryEvals(
+            client=client,
+            evaluators=[FoundryEvals.SIMILARITY],
+        ),
+        include_per_agent=False,
+    )
+
+    for r in eval_results:
+        print(f"\nOverall: {r.status}")
+        print(f"  Passed: {r.passed}/{r.total}")
+        if r.report_url:
+            print(f"  Portal: {r.report_url}")
+

 if __name__ == "__main__":
    asyncio.run(main())
@@ -173,4 +210,12 @@ Overall: completed
 Per-agent breakdown:
  researcher: 2/2 passed
  planner: 2/2 passed
+
+============================================================
+Pattern 3: Similarity evaluation with ground truth
+============================================================
+
+Overall: completed
+  Passed: 2/2
+  Portal: https://ai.azure.com/...
 """