Python: feat(evals): add ground_truth support for similarity evaluator (#5234)

* feat(evals): add ground_truth support for similarity evaluator

- Include expected_output as ground_truth in Foundry JSONL dataset rows
- Add ground_truth to item schema and data mapping for similarity evaluator
- Add expected_output parameter to evaluate_workflow
- Add similarity Pattern 3 to evaluate_agent and evaluate_workflow samples
- Add tests for ground_truth in dataset, schema, and evaluate_workflow

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix: wrap long line to satisfy ruff E501

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
chetantoshniwal
2026-04-21 12:40:53 -07:00
committed by GitHub
Unverified
parent 8f17067383
commit aa582d021d
5 changed files with 270 additions and 7 deletions
@@ -2,9 +2,10 @@
"""Evaluate an agent using Azure AI Foundry's built-in evaluators.
This sample demonstrates two patterns:
This sample demonstrates three patterns:
1. evaluate_agent(responses=...) — Evaluate a response you already have.
2. evaluate_agent(queries=...) — Run the agent against test queries and evaluate in one call.
3. Similarity — Compare agent output against ground-truth reference answers.
See ``evaluate_tool_calls_sample.py`` for tool-call accuracy evaluation.
@@ -149,6 +150,41 @@ async def main() -> None:
else:
print(f"[FAIL] {r.failed} failed")
# =========================================================================
# Pattern 3: Similarity — compare agent output to ground-truth answers
# =========================================================================
print()
print("=" * 60)
print("Pattern 3: Similarity evaluation with ground truth")
print("=" * 60)
# Similarity requires expected_output — a reference answer per query
# that the evaluator compares against the agent's actual response.
results = await evaluate_agent(
agent=agent,
queries=[
"What's the weather like in Seattle?",
"How much does a flight from Seattle to Paris cost?",
],
expected_output=[
"62°F, cloudy with a chance of rain",
"Flights from Seattle to Paris: $450 round-trip",
],
evaluators=FoundryEvals(
client=chat_client,
evaluators=[FoundryEvals.SIMILARITY],
),
)
for r in results:
print(f"Status: {r.status}")
print(f"Results: {r.passed}/{r.total} passed")
print(f"Portal: {r.report_url}")
if r.all_passed:
print("[PASS] All passed")
else:
print(f"[FAIL] {r.failed} failed")
if __name__ == "__main__":
asyncio.run(main())
@@ -2,11 +2,12 @@
"""Evaluate a multi-agent workflow using Azure AI Foundry evaluators.
This sample demonstrates two patterns:
This sample demonstrates three patterns:
1. Post-hoc: Run the workflow, then evaluate the result you already have.
2. Run + evaluate: Pass queries and let evaluate_workflow() run the workflow for you.
3. Similarity: Evaluate the workflow's final output against ground-truth reference answers.
Both patterns return a list of results (one per provider), each with a per-agent
Every pattern returns a list of results (one per provider). Patterns 1 & 2 also
include a per-agent breakdown in sub_results so you can identify which agent is
underperforming; Pattern 3 passes include_per_agent=False, so it reports only the
overall result.
Prerequisites:
@@ -79,7 +80,6 @@ async def main() -> None:
# 4. Create the evaluator — provider config goes here, once
evals = FoundryEvals(client=client)
# =========================================================================
# Pattern 1: Post-hoc — evaluate a workflow run you already did
# =========================================================================
@@ -143,6 +143,43 @@ async def main() -> None:
if agent_eval.report_url:
print(f" Portal: {agent_eval.report_url}")
# =========================================================================
# Pattern 3: Similarity — compare workflow output to ground-truth answers
# =========================================================================
# Build a fresh workflow to avoid stale session state from Pattern 2.
workflow3 = SequentialBuilder(participants=[researcher, planner]).build()
print()
print("=" * 60)
print("Pattern 3: Similarity evaluation with ground truth")
print("=" * 60)
# Similarity compares the final workflow output against a reference answer,
# so per-agent breakdown is disabled — individual agents don't have their
# own ground-truth targets.
eval_results = await evaluate_workflow(
workflow=workflow3,
queries=[
"Plan a trip from Seattle to Paris",
"Plan a trip from London to Tokyo",
],
expected_output=[
"Pack layers and an umbrella for Paris. Flights from Seattle are around $450 round-trip.",
"Bring warm clothing for Tokyo in spring. Flights from London are around $500 round-trip.",
],
evaluators=FoundryEvals(
client=client,
evaluators=[FoundryEvals.SIMILARITY],
),
include_per_agent=False,
)
for r in eval_results:
print(f"\nOverall: {r.status}")
print(f" Passed: {r.passed}/{r.total}")
if r.report_url:
print(f" Portal: {r.report_url}")
if __name__ == "__main__":
asyncio.run(main())
@@ -173,4 +210,12 @@ Overall: completed
Per-agent breakdown:
researcher: 2/2 passed
planner: 2/2 passed
============================================================
Pattern 3: Similarity evaluation with ground truth
============================================================
Overall: completed
Passed: 2/2
Portal: https://ai.azure.com/...
"""