agent-framework/python/samples/02-agents/evaluation/evaluate_multimodal.py

# Copyright (c) Microsoft. All rights reserved.

"""Evaluate multimodal (image) conversations locally.

Demonstrates that the evaluation pipeline preserves image content:
1. Build EvalItems with image content in conversations
2. Use @evaluator checks that inspect multimodal content
3. Verify images flow through the eval pipeline intact

Usage:
    uv run python samples/02-agents/evaluation/evaluate_multimodal.py
"""

import asyncio
import base64

from agent_framework import (
    Content,
    EvalItem,
    LocalEvaluator,
    Message,
    evaluator,
)

# -- Custom evaluators that inspect multimodal content --


@evaluator
def has_image_content(conversation: list) -> bool:
    """Report whether any message in the conversation carries an image.

    A content part counts as an image when its ``type`` is ``"data"`` or
    ``"uri"`` and its ``media_type`` starts with ``"image/"``.

    Args:
        conversation: Messages to scan. Each message's ``contents`` is read;
            a falsy ``contents`` (``None`` or empty) is treated as no parts.

    Returns:
        ``True`` as soon as one image part is found, otherwise ``False``.
    """
    for message in conversation:
        for part in message.contents or []:
            # Only inline-data and URI parts can be images; skip everything else.
            if part.type not in ("data", "uri"):
                continue
            # media_type may be unset, so guard before calling startswith.
            if part.media_type and part.media_type.startswith("image/"):
                return True
    return False


@evaluator
def response_describes_image(response: str) -> bool:
    """Report whether the assistant response appears to talk about an image.

    The check is a case-insensitive *substring* search for a small set of
    image-related keywords, so a keyword embedded in a longer word also
    counts (for example ``"see"`` matches ``"seems"``).

    Args:
        response: The assistant's response text.

    Returns:
        ``True`` if at least one keyword occurs in the response, else ``False``.
    """
    lowered = response.lower()
    for keyword in ("image", "picture", "photo", "shows", "depicts", "see"):
        if keyword in lowered:
            return True
    return False


@evaluator
def image_count(conversation: list) -> float:
    """Score a conversation by how many image parts it contains.

    A content part counts as an image when its ``type`` is ``"data"`` or
    ``"uri"`` and its ``media_type`` starts with ``"image/"``.

    Args:
        conversation: Messages to scan. Each message's ``contents`` is read;
            a falsy ``contents`` (``None`` or empty) contributes nothing.

    Returns:
        The number of image parts across all messages, as a ``float``.
    """
    total = 0
    for message in conversation:
        for part in message.contents or []:
            is_media_part = part.type in ("data", "uri")
            # media_type may be unset, so guard before calling startswith.
            if is_media_part and part.media_type and part.media_type.startswith("image/"):
                total += 1
    return float(total)


# A tiny 1x1 red PNG for demonstration (no external dependencies needed).
# The image is embedded as base64 text and decoded to raw bytes once, at import
# time; main() passes these bytes to Content.from_data as an inline "image/png"
# payload.
_TINY_PNG: bytes = base64.b64decode(
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
)


async def main() -> None:
    """Evaluate three hand-built conversations locally and print the scores.

    The eval items are constructed directly (no agent run is needed): one with
    an image referenced by URL, one with inline image bytes, and one text-only
    conversation. All three are scored by ``has_image_content``,
    ``response_describes_image`` and ``image_count`` through a
    ``LocalEvaluator``, and a per-item, per-check summary is written to stdout.
    """
    # Case 1: the user sends an image URL together with a question.
    image_url_case = EvalItem(
        conversation=[
            Message(
                "user",
                [
                    Content.from_text("What do you see in this image?"),
                    Content.from_uri(
                        "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/300px-PNG_transparency_demonstration_1.png",
                        media_type="image/png",
                    ),
                ],
            ),
            Message("assistant", ["The image shows two dice on a transparent background."]),
        ]
    )

    # Case 2: the user sends the image as inline bytes rather than a URL.
    inline_bytes_case = EvalItem(
        conversation=[
            Message(
                "user",
                [
                    Content.from_text("Describe this picture"),
                    Content.from_data(data=_TINY_PNG, media_type="image/png"),
                ],
            ),
            Message("assistant", ["I see a small red image — it appears to be a single pixel."]),
        ]
    )

    # Case 3: text-only conversation (should fail has_image_content).
    text_only_case = EvalItem(
        conversation=[
            Message("user", ["Tell me about cats"]),
            Message("assistant", ["Cats are wonderful pets."]),
        ]
    )

    items = [image_url_case, inline_bytes_case, text_only_case]

    checker = LocalEvaluator(has_image_content, response_describes_image, image_count)
    outcome = await checker.evaluate(items)

    # Overall summary first, then one section per evaluated item.
    print(f"\n{outcome.provider}: {outcome.passed}/{outcome.total} passed")
    for entry in outcome.items:
        # Only the first 60 characters of the question are shown.
        print(f"\n  [{entry.status}] Q: {entry.input_text[:60]}...")
        for check in entry.scores:
            if check.passed:
                verdict = "PASS"
            else:
                verdict = "FAIL"
            print(f"    {verdict} {check.name}: {check.score}")


# Run the async sample only when this file is executed as a script, so that
# importing the module does not trigger an evaluation run.
if __name__ == "__main__":
    asyncio.run(main())