mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
Python: feat(evals): add ground_truth support for similarity evaluator (#5234)
* feat(evals): add ground_truth support for similarity evaluator
  - Include expected_output as ground_truth in Foundry JSONL dataset rows
  - Add ground_truth to item schema and data mapping for similarity evaluator
  - Add expected_output parameter to evaluate_workflow
  - Add similarity Pattern 3 to evaluate_agent and evaluate_workflow samples
  - Add tests for ground_truth in dataset, schema, and evaluate_workflow
* Apply suggestions from code review
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* fix: wrap long line to satisfy ruff E501
---------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
Unverified
parent
8f17067383
commit
aa582d021d
@@ -2,9 +2,10 @@
|
||||
|
||||
"""Evaluate an agent using Azure AI Foundry's built-in evaluators.
|
||||
|
||||
This sample demonstrates two patterns:
|
||||
This sample demonstrates three patterns:
|
||||
1. evaluate_agent(responses=...) — Evaluate a response you already have.
|
||||
2. evaluate_agent(queries=...) — Run the agent against test queries and evaluate in one call.
|
||||
3. Similarity — Compare agent output against ground-truth reference answers.
|
||||
|
||||
See ``evaluate_tool_calls_sample.py`` for tool-call accuracy evaluation.
|
||||
|
||||
@@ -149,6 +150,41 @@ async def main() -> None:
|
||||
else:
|
||||
print(f"[FAIL] {r.failed} failed")
|
||||
|
||||
# =========================================================================
|
||||
# Pattern 3: Similarity — compare agent output to ground-truth answers
|
||||
# =========================================================================
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("Pattern 3: Similarity evaluation with ground truth")
|
||||
print("=" * 60)
|
||||
|
||||
# Similarity requires expected_output — a reference answer per query
|
||||
# that the evaluator compares against the agent's actual response.
|
||||
results = await evaluate_agent(
|
||||
agent=agent,
|
||||
queries=[
|
||||
"What's the weather like in Seattle?",
|
||||
"How much does a flight from Seattle to Paris cost?",
|
||||
],
|
||||
expected_output=[
|
||||
"62°F, cloudy with a chance of rain",
|
||||
"Flights from Seattle to Paris: $450 round-trip",
|
||||
],
|
||||
evaluators=FoundryEvals(
|
||||
client=chat_client,
|
||||
evaluators=[FoundryEvals.SIMILARITY],
|
||||
),
|
||||
)
|
||||
|
||||
for r in results:
|
||||
print(f"Status: {r.status}")
|
||||
print(f"Results: {r.passed}/{r.total} passed")
|
||||
print(f"Portal: {r.report_url}")
|
||||
if r.all_passed:
|
||||
print("[PASS] All passed")
|
||||
else:
|
||||
print(f"[FAIL] {r.failed} failed")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -2,11 +2,12 @@
|
||||
|
||||
"""Evaluate a multi-agent workflow using Azure AI Foundry evaluators.
|
||||
|
||||
This sample demonstrates two patterns:
|
||||
This sample demonstrates three patterns:
|
||||
1. Post-hoc: Run the workflow, then evaluate the result you already have.
|
||||
2. Run + evaluate: Pass queries and let evaluate_workflow() run the workflow for you.
|
||||
3. Similarity: Evaluate the workflow's final output against ground-truth reference answers.
|
||||
|
||||
Both patterns return a list of results (one per provider), each with a per-agent
|
||||
Patterns 1 & 2 return a list of results (one per provider), each with a per-agent
|
||||
breakdown in sub_results so you can identify which agent is underperforming.
|
||||
|
||||
Prerequisites:
|
||||
@@ -79,7 +80,6 @@ async def main() -> None:
|
||||
|
||||
# 4. Create the evaluator — provider config goes here, once
|
||||
evals = FoundryEvals(client=client)
|
||||
|
||||
# =========================================================================
|
||||
# Pattern 1: Post-hoc — evaluate a workflow run you already did
|
||||
# =========================================================================
|
||||
@@ -143,6 +143,43 @@ async def main() -> None:
|
||||
if agent_eval.report_url:
|
||||
print(f" Portal: {agent_eval.report_url}")
|
||||
|
||||
# =========================================================================
|
||||
# Pattern 3: Similarity — compare workflow output to ground-truth answers
|
||||
# =========================================================================
|
||||
# Build a fresh workflow to avoid stale session state from Pattern 2.
|
||||
workflow3 = SequentialBuilder(participants=[researcher, planner]).build()
|
||||
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("Pattern 3: Similarity evaluation with ground truth")
|
||||
print("=" * 60)
|
||||
|
||||
# Similarity compares the final workflow output against a reference answer,
|
||||
# so per-agent breakdown is disabled — individual agents don't have their
|
||||
# own ground-truth targets.
|
||||
eval_results = await evaluate_workflow(
|
||||
workflow=workflow3,
|
||||
queries=[
|
||||
"Plan a trip from Seattle to Paris",
|
||||
"Plan a trip from London to Tokyo",
|
||||
],
|
||||
expected_output=[
|
||||
"Pack layers and an umbrella for Paris. Flights from Seattle are around $450 round-trip.",
|
||||
"Bring warm clothing for Tokyo in spring. Flights from London are around $500 round-trip.",
|
||||
],
|
||||
evaluators=FoundryEvals(
|
||||
client=client,
|
||||
evaluators=[FoundryEvals.SIMILARITY],
|
||||
),
|
||||
include_per_agent=False,
|
||||
)
|
||||
|
||||
for r in eval_results:
|
||||
print(f"\nOverall: {r.status}")
|
||||
print(f" Passed: {r.passed}/{r.total}")
|
||||
if r.report_url:
|
||||
print(f" Portal: {r.report_url}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -173,4 +210,12 @@ Overall: completed
|
||||
Per-agent breakdown:
|
||||
researcher: 2/2 passed
|
||||
planner: 2/2 passed
|
||||
|
||||
============================================================
|
||||
Pattern 3: Similarity evaluation with ground truth
|
||||
============================================================
|
||||
|
||||
Overall: completed
|
||||
Passed: 2/2
|
||||
Portal: https://ai.azure.com/...
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user