agent-framework/python/samples/02-agents/evaluation/evaluate_agent.py

# Copyright (c) Microsoft. All rights reserved.

"""Evaluate an agent with local checks — no API keys needed.

Demonstrates the simplest evaluation workflow:
1. Define checks using the @evaluator decorator
2. Run evaluate_agent() which calls agent.run() under the covers
3. Assert results in CI or inspect interactively

Usage:
    uv run python samples/02-agents/evaluation/evaluate_agent.py
"""

import asyncio

from agent_framework import (
    Agent,
    LocalEvaluator,
    evaluate_agent,
    evaluator,
    keyword_check,
)


# A custom check — parameter names determine what data you receive
@evaluator
def is_helpful(response: str) -> bool:
    """Check the response isn't empty or a refusal."""
    refusals = ["i can't", "i'm not able", "i don't know"]
    return len(response) > 10 and not any(r in response.lower() for r in refusals)


async def main():
    agent = Agent(
        model="gpt-4o-mini",
        instructions="You are a helpful weather assistant.",
    )

    # Combine built-in and custom checks
    local = LocalEvaluator(
        keyword_check("weather"),  # response must mention "weather"
        is_helpful,  # custom check
    )

    # evaluate_agent() calls agent.run() for each query, then evaluates
    results = await evaluate_agent(
        agent=agent,
        queries=[
            "What's the weather like in Seattle?",
            "Will it rain in London tomorrow?",
            "What should I wear for 30°C weather?",
        ],
        evaluators=local,
    )

    for r in results:
        print(f"{r.provider}: {r.passed}/{r.total} passed")
        for item in r.items:
            print(f"  [{item.status}] Q: {item.input_text[:50]}  A: {item.output_text[:50]}...")
            for score in item.scores:
                print(f"    {score.name}: {'✓' if score.passed else '✗'}")

    # Use in CI: will raise AssertionError if any check fails
    # results[0].assert_passed()


if __name__ == "__main__":
    asyncio.run(main())