mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
45527eed29
Merged and refactored eval module per Eduard's PR review: - Merge _eval.py + _local_eval.py into single _evaluation.py - Convert EvalItem from dataclass to regular class - Rename to_dict() to to_eval_data() - Convert _AgentEvalData to TypedDict - Simplify check system: unified async pattern with isawaitable - Parallelize checks and evaluators with asyncio.gather - Add all/any mode to tool_called_check - Fix bool(passed) truthy bug in _coerce_result - Remove deprecated function_evaluator/async_function_evaluator aliases - Remove _MinimalAgent, tighten evaluate_agent signature - Set self.name in __init__ (LocalEvaluator, FoundryEvals) - Limit FoundryEvals to AsyncOpenAI only - Type project_client as AIProjectClient - Remove NotImplementedError continuous eval code - Add evaluation samples in 02-agents/ and 03-workflows/ - Update all imports and tests (167 passing) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
69 lines
1.9 KiB
Python
69 lines
1.9 KiB
Python
# Copyright (c) Microsoft. All rights reserved.
|
|
|
|
"""Evaluate an agent with local checks — no API keys needed.
|
|
|
|
Demonstrates the simplest evaluation workflow:
|
|
1. Define checks using the @evaluator decorator
|
|
2. Run evaluate_agent() which calls agent.run() under the covers
|
|
3. Assert results in CI or inspect interactively
|
|
|
|
Usage:
|
|
uv run python samples/02-agents/evaluation/evaluate_agent.py
|
|
"""
|
|
|
|
import asyncio
|
|
|
|
from agent_framework import (
|
|
Agent,
|
|
LocalEvaluator,
|
|
evaluate_agent,
|
|
evaluator,
|
|
keyword_check,
|
|
)
|
|
|
|
|
|
# A custom check — parameter names determine what data you receive
|
|
@evaluator
|
|
def is_helpful(response: str) -> bool:
    """Check the response isn't empty or a refusal.

    Args:
        response: Response text to check.

    Returns:
        True when the response, ignoring leading/trailing whitespace, is
        longer than 10 characters and contains none of the refusal phrases
        (matched case-insensitively); False otherwise.
    """
    refusals = ("i can't", "i'm not able", "i don't know")

    # Measure the stripped text so whitespace padding cannot make an
    # effectively empty response look long enough.
    text = response.strip()
    if len(text) <= 10:
        return False

    # Generated text often uses the typographic apostrophe (U+2019); fold it
    # to the ASCII one so phrases such as "I can’t" are still recognised.
    normalized = text.replace("\u2019", "'").lower()
    return not any(phrase in normalized for phrase in refusals)
|
|
|
|
|
|
async def main() -> None:
    """Run the sample end to end and print the evaluation results.

    Creates a weather-assistant agent, evaluates it on three queries using
    a built-in keyword check together with the custom ``is_helpful`` check,
    then prints one summary line per result followed by every evaluated
    item and its individual scores.
    """
    queries = [
        "What's the weather like in Seattle?",
        "Will it rain in London tomorrow?",
        "What should I wear for 30°C weather?",
    ]

    weather_agent = Agent(
        model="gpt-4o-mini",
        instructions="You are a helpful weather assistant.",
    )

    # Built-in check: response must mention "weather".
    mentions_weather = keyword_check("weather")
    # Bundle the built-in check with the custom one defined in this file.
    local_checks = LocalEvaluator(mentions_weather, is_helpful)

    # evaluate_agent() calls agent.run() for each query, then evaluates.
    results = await evaluate_agent(
        agent=weather_agent,
        queries=queries,
        evaluators=local_checks,
    )

    for r in results:
        print(f"{r.provider}: {r.passed}/{r.total} passed")
        for item in r.items:
            print(f" [{item.status}] Q: {item.input_text[:50]} A: {item.output_text[:50]}...")
            for score in item.scores:
                print(f" {score.name}: {'✓' if score.passed else '✗'}")

    # Use in CI: will raise AssertionError if any check fails
    # results[0].assert_passed()
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the async sample on a fresh event loop.
    asyncio.run(main())
|