agent-framework/python/samples/02-agents/multimodal_input/openai_chat_multimodal.py

# Copyright (c) Microsoft. All rights reserved.

import asyncio
import base64
import struct
from pathlib import Path

from agent_framework import Content, Message
from agent_framework.openai import OpenAIChatClient, OpenAIChatCompletionClient
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Directory containing the bundled sample assets (such as sample.pdf).
# parents[0] is the folder holding this file, so parents[2] is two levels
# above that folder; the assets live in "shared/sample_assets" beneath it.
ASSETS_DIR = Path(__file__).resolve().parents[2] / "shared" / "sample_assets"

"""
Leverage multimodal capabilities of different models.

Uses the OpenAIChatClient and OpenAIChatCompletionClient to demonstrate multimodal input handling with the gpt-4o and gpt-4o-audio-preview models, respectively. The sample includes demonstrations for image, audio, and PDF inputs, showcasing how to create appropriate Content objects and send them in messages to the chat clients.

"""


def load_sample_pdf() -> bytes:
    """Return the raw bytes of the bundled ``sample.pdf`` asset.

    The file is looked up inside ``ASSETS_DIR``; any error raised while
    reading it (for example a missing file) propagates to the caller.
    """
    return (ASSETS_DIR / "sample.pdf").read_bytes()


def create_sample_image() -> str:
    """Return a ``data:`` URI wrapping a simple 1x1 pixel PNG for testing.

    Returns:
        The string ``data:image/png;base64,<payload>`` where the payload is a
        hard-coded, already base64-encoded PNG file.
    """
    # Base64 text of a tiny yellow pixel in PNG format.
    encoded_pixel = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
    return "data:image/png;base64," + encoded_pixel


def create_sample_audio() -> str:
    """Create a minimal WAV file for testing (0.1 seconds of silence).

    The audio is uncompressed PCM: mono, 16 bits per sample, 8 kHz, with every
    sample set to zero.

    Returns:
        A ``data:audio/wav;base64,<payload>`` URI whose payload is the complete
        base64-encoded WAV file (44-byte header followed by 1600 data bytes).
    """
    channels = 1
    sample_rate = 8000
    bits_per_sample = 16
    block_align = channels * bits_per_sample // 8  # bytes per frame
    byte_rate = sample_rate * block_align  # bytes per second of audio

    pcm_silence = b"\x00" * (byte_rate // 10)  # 0.1 sec silence (1600 bytes)

    fmt_chunk = b"fmt " + struct.pack(
        "<IHHIIHH",
        16,  # size of the fmt payload that follows
        1,  # audio format 1 = PCM
        channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
    )
    data_chunk = b"data" + struct.pack("<I", len(pcm_silence)) + pcm_silence
    riff_body = b"WAVE" + fmt_chunk + data_chunk

    # The RIFF size field must count every byte that follows it (file length
    # minus the 8-byte "RIFF"+size prefix). Deriving it from the real payload
    # keeps the header consistent; a too-small value makes chunk-aware parsers
    # stop reading before the end of the audio data.
    wav_bytes = b"RIFF" + struct.pack("<I", len(riff_body)) + riff_body

    audio_b64 = base64.b64encode(wav_bytes).decode()
    return f"data:audio/wav;base64,{audio_b64}"


async def test_image() -> None:
    """Ask the gpt-4o model to describe the sample PNG and print its reply.

    Builds one user message holding a text prompt and the image data URI,
    sends it through ``OpenAIChatClient``, and prints the response.
    """
    chat_client = OpenAIChatClient(model="gpt-4o")

    png_uri = create_sample_image()
    prompt_part = Content.from_text(text="What's in this image?")
    image_part = Content.from_uri(uri=png_uri, media_type="image/png")
    user_message = Message(role="user", contents=[prompt_part, image_part])

    reply = await chat_client.get_response([user_message])
    print(f"Image Response: {reply}")


async def test_audio() -> None:
    """Ask the audio-preview model about the sample WAV and print its reply.

    Builds one user message holding a text prompt and the audio data URI,
    sends it through ``OpenAIChatCompletionClient``, and prints the response.
    """
    chat_client = OpenAIChatCompletionClient(model="gpt-4o-audio-preview-2025-06-03")

    wav_uri = create_sample_audio()
    prompt_part = Content.from_text(text="What do you hear in this audio?")
    audio_part = Content.from_uri(uri=wav_uri, media_type="audio/wav")
    user_message = Message(role="user", contents=[prompt_part, audio_part])

    reply = await chat_client.get_response([user_message])
    print(f"Audio Response: {reply}")


async def test_pdf() -> None:
    """Ask the gpt-4o model to extract information from the sample PDF.

    Builds one user message holding a text prompt and the PDF bytes (tagged
    with a filename via ``additional_properties``), sends it through
    ``OpenAIChatClient``, and prints the response.
    """
    chat_client = OpenAIChatClient(model="gpt-4o")

    document_bytes = load_sample_pdf()
    prompt_part = Content.from_text(text="What information can you extract from this document?")
    document_part = Content.from_data(
        data=document_bytes,
        media_type="application/pdf",
        additional_properties={"filename": "employee_report.pdf"},
    )
    user_message = Message(role="user", contents=[prompt_part, document_part])

    reply = await chat_client.get_response([user_message])
    print(f"PDF Response: {reply}")


async def main() -> None:
    """Print a banner, then run the image, audio, and PDF demos in that order."""
    print("=== Testing OpenAI Multimodal ===")
    # Each demo is awaited to completion before the next one starts.
    for run_demo in (test_image, test_audio, test_pdf):
        await run_demo()


# Run the demos only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())