Files
agent-framework/python/samples/02-agents/evaluation/evaluate_with_expected.py
alliscode 45527eed29 Foundry Evals integration for Python
Merged and refactored eval module per Eduard's PR review:

- Merge _eval.py + _local_eval.py into single _evaluation.py
- Convert EvalItem from dataclass to regular class
- Rename to_dict() to to_eval_data()
- Convert _AgentEvalData to TypedDict
- Simplify check system: unified async pattern with isawaitable
- Parallelize checks and evaluators with asyncio.gather
- Add all/any mode to tool_called_check
- Fix bool(passed) truthy bug in _coerce_result
- Remove deprecated function_evaluator/async_function_evaluator aliases
- Remove _MinimalAgent, tighten evaluate_agent signature
- Set self.name in __init__ (LocalEvaluator, FoundryEvals)
- Limit FoundryEvals to AsyncOpenAI only
- Type project_client as AIProjectClient
- Remove NotImplementedError continuous eval code
- Add evaluation samples in 02-agents/ and 03-workflows/
- Update all imports and tests (167 passing)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 14:24:21 -07:00

65 lines
1.7 KiB
Python

# Copyright (c) Microsoft. All rights reserved.
"""Evaluate an agent with expected outputs and tool call checks.
Demonstrates ground-truth comparison and tool usage evaluation:
1. Provide expected outputs alongside queries
2. Use built-in tool_calls_present for tool verification
3. Combine multiple evaluation criteria
Usage:
uv run python samples/02-agents/evaluation/evaluate_with_expected.py
"""
import asyncio
from agent_framework import (
Agent,
LocalEvaluator,
evaluate_agent,
evaluator,
tool_calls_present,
)
@evaluator
def response_matches_expected(response: str, expected_output: str) -> float:
    """Score a response by word overlap with the expected output.

    Both texts are lower-cased, split on whitespace, and each word has
    surrounding sentence/markup punctuation removed, so ``"4."`` or
    ``"**4**"`` in a response still matches an expected ``"4"``.

    Args:
        response: The text produced by the agent.
        expected_output: The ground-truth text to compare against.

    Returns:
        The fraction of distinct expected words that also occur in the
        response, in the range 0.0-1.0. Returns 1.0 when there is nothing
        to match (``expected_output`` is empty or whitespace-only).
    """
    # Only sentence/markup punctuation is stripped; signs and operators such
    # as "-", "+", "=" and "%" are kept so "-5" and "5" remain distinct.
    edge_punctuation = ".,;:!?\"'`*()[]{}"

    def tokenize(text: str) -> set[str]:
        # ``or word`` keeps tokens made up solely of punctuation (e.g. "...")
        # instead of collapsing them to an empty string.
        return {word.strip(edge_punctuation) or word for word in text.lower().split()}

    expected_words = tokenize(expected_output)
    if not expected_words:
        # Nothing to compare against: treat as a trivial pass.
        return 1.0

    response_words = tokenize(response)
    return len(response_words & expected_words) / len(expected_words)
async def main() -> None:
    """Evaluate a math-tutor agent against expected answers and print results.

    Builds an ``Agent``, wraps the ``response_matches_expected`` scorer and the
    built-in ``tool_calls_present`` check in a ``LocalEvaluator``, runs
    ``evaluate_agent`` over two queries, then prints a pass summary for each
    result followed by one line per evaluated item.
    """
    agent = Agent(
        model="gpt-4o-mini",
        instructions="You are a math tutor. Answer concisely.",
    )

    local = LocalEvaluator(
        response_matches_expected,
        tool_calls_present,  # verifies expected tools were called
    )

    results = await evaluate_agent(
        agent=agent,
        queries=["What is 2 + 2?", "What is the square root of 144?"],
        # Ground-truth answers; presumably paired with ``queries`` by position.
        expected_output=["4", "12"],
        expected_tool_calls=[
            [],  # no tools expected for simple math
            [],
        ],
        evaluators=local,
    )

    for r in results:
        print(f"{r.provider}: {r.passed}/{r.total} passed")
        for item in r.items:
            # Keep a visible separator between the query and the (truncated)
            # response; without it the two run together, e.g. "What is 2 + 2?4".
            print(f" [{item.status}] {item.input_text} -> {item.output_text[:80]}")
# Script entry point: run the async sample only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())