# agent-framework/python/samples/02-agents/multimodal_input/azure_responses_multimodal.py

# Copyright (c) Microsoft. All rights reserved.

import asyncio
from pathlib import Path

from agent_framework import Content, Message
from agent_framework.foundry import FoundryChatClient
from azure.identity import AzureCliCredential
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Directory that holds the shared sample assets: "shared/sample_assets" under this
# file's third ancestor directory (parents[2], i.e. two levels above the folder that
# contains this file). The path is resolved to an absolute path first.
ASSETS_DIR = Path(__file__).resolve().parents[2] / "shared" / "sample_assets"


def load_sample_pdf() -> bytes:
    """Return the raw bytes of ``sample.pdf`` located in ``ASSETS_DIR``.

    Returns:
        The full contents of the sample PDF file as ``bytes``.
    """
    return (ASSETS_DIR / "sample.pdf").read_bytes()


def create_sample_image() -> str:
    """Build a ``data:`` URI that embeds a tiny hard-coded 1x1 PNG.

    Returns:
        A string of the form ``data:image/png;base64,<payload>`` that can be
        passed wherever an image URI is expected.
    """
    # Base64 payload of a single-pixel PNG (the sample describes it as yellow).
    encoded_pixel = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
    uri_prefix = "data:image/png;base64,"
    return uri_prefix + encoded_pixel


async def test_image() -> None:
    """Send a text question plus an inline PNG to the chat client and print the reply."""
    # Authentication: run `az login` in a terminal first, or swap AzureCliCredential for
    # another credential type. The client is created without an explicit model here, so
    # its settings come from the environment; a model can instead be passed directly:
    # client = FoundryChatClient(credential=AzureCliCredential(), model="your-deployment-name")
    # NOTE(review): this sample names AZURE_OPENAI_ENDPOINT and FOUNDRY_MODEL as the
    # required environment variables - confirm those names against the
    # FoundryChatClient documentation.
    chat_client = FoundryChatClient(credential=AzureCliCredential())

    # The image travels inline as a base64 data URI rather than as a remote URL.
    png_data_uri = create_sample_image()
    prompt_parts = [
        Content.from_text(text="What's in this image?"),
        Content.from_uri(uri=png_data_uri, media_type="image/png"),
    ]
    user_message = Message(role="user", contents=prompt_parts)

    reply = await chat_client.get_response([user_message])
    print(f"Image Response: {reply}")


async def test_pdf() -> None:
    """Send a text question plus the sample PDF to the chat client and print the reply."""
    chat_client = FoundryChatClient(credential=AzureCliCredential())

    document_bytes = load_sample_pdf()
    # The PDF is attached as raw bytes; the filename is supplied as an additional property.
    pdf_attachment = Content.from_data(
        data=document_bytes,
        media_type="application/pdf",
        additional_properties={"filename": "sample.pdf"},
    )
    question = Content.from_text(text="What information can you extract from this document?")
    user_message = Message(role="user", contents=[question, pdf_attachment])

    reply = await chat_client.get_response([user_message])
    print(f"PDF Response: {reply}")


async def main() -> None:
    """Print a short banner, then run the image demo followed by the PDF demo."""
    banner_lines = (
        "=== Testing Azure OpenAI Responses API Multimodal ===",
        "The Responses API supports both images AND PDFs",
    )
    for banner_line in banner_lines:
        print(banner_line)

    # Run the demos one after another so their output is not interleaved.
    for demo in (test_image, test_pdf):
        await demo()


# Run the demos only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())