mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
4b9856e66f
* updated azure ai inference sample * openai multimodel fix * update language
116 lines
3.5 KiB
Python
116 lines
3.5 KiB
Python
# Copyright (c) Microsoft. All rights reserved.
|
|
|
|
import asyncio
|
|
import base64
|
|
import struct
|
|
from pathlib import Path
|
|
|
|
from agent_framework import Content, Message
|
|
from agent_framework.openai import OpenAIChatClient, OpenAIChatCompletionClient
|
|
from dotenv import load_dotenv
|
|
|
|
# Read variables from a .env file into the environment before anything else runs.
load_dotenv()

# Sample assets live in "shared/sample_assets", two levels above this file's directory.
ASSETS_DIR = Path(__file__).resolve().parents[2].joinpath("shared", "sample_assets")
|
|
|
|
"""
|
|
Leverage multimodal capabilities of different models.
|
|
|
|
Uses the OpenAIChatClient and OpenAIChatCompletionClient to demonstrate multimodal input handling with the gpt-4o and gpt-4o-audio-preview models, respectively. The sample includes demonstrations for image, audio, and PDF inputs, showcasing how to create appropriate Content objects and send them in messages to the chat clients.
|
|
|
|
"""
|
|
|
|
|
|
def load_sample_pdf() -> bytes:
    """Return the contents of ``sample.pdf`` from the shared sample-assets directory.

    Returns:
        The raw bytes of the file located at ``ASSETS_DIR / "sample.pdf"``.

    Raises:
        OSError: If the file cannot be opened or read (for example, it is missing).
    """
    return ASSETS_DIR.joinpath("sample.pdf").read_bytes()
|
|
|
|
|
|
def create_sample_image() -> str:
    """Return an embedded 1x1-pixel PNG as a base64 ``data:`` URI.

    The image bytes are hard-coded, so no file or network access is involved.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    # Base64 encoding of a complete single-pixel PNG file (a tiny yellow pixel).
    encoded_pixel = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
    return "data:image/png;base64," + encoded_pixel
|
|
|
|
|
|
def create_sample_audio() -> str:
    """Create a minimal WAV file (0.1 seconds of silence) as a base64 ``data:`` URI.

    The clip is 16-bit mono PCM at 8 kHz, so 0.1 s corresponds to 800 frames of
    2 bytes each, i.e. 1600 zero bytes of sample data.

    All size fields are derived from the actual payload. In particular the RIFF
    chunk size is the total file length minus the 8-byte ``RIFF`` + size prefix;
    a too-small value makes strict readers (including the stdlib ``wave``
    module) truncate the audio data.

    Returns:
        A string of the form ``data:audio/wav;base64,<payload>``.
    """
    channels = 1
    sample_rate = 8000
    bits_per_sample = 16
    block_align = channels * bits_per_sample // 8  # bytes per frame
    byte_rate = sample_rate * block_align

    pcm_silence = b"\x00" * 1600  # 800 frames == 0.1 s at 8 kHz

    # "fmt " chunk: 16-byte PCM format description (format tag 1 == PCM).
    fmt_chunk = b"fmt " + struct.pack(
        "<IHHIIHH", 16, 1, channels, sample_rate, byte_rate, block_align, bits_per_sample
    )
    # "data" chunk: declared size must match the number of sample bytes that follow.
    data_chunk = b"data" + struct.pack("<I", len(pcm_silence)) + pcm_silence

    # Everything after the RIFF size field; its length IS the RIFF chunk size.
    riff_payload = b"WAVE" + fmt_chunk + data_chunk
    wav_bytes = b"RIFF" + struct.pack("<I", len(riff_payload)) + riff_payload

    audio_b64 = base64.b64encode(wav_bytes).decode()
    return f"data:audio/wav;base64,{audio_b64}"
|
|
|
|
|
|
async def test_image() -> None:
    """Ask the gpt-4o model to describe the sample PNG and print its reply.

    Builds a single user message holding a text question plus the image data
    URI, sends it through ``OpenAIChatClient``, and prints the response.
    """
    chat_client = OpenAIChatClient(model="gpt-4o")

    # One user turn with two parts: the question and the image it refers to.
    question = Content.from_text(text="What's in this image?")
    picture = Content.from_uri(uri=create_sample_image(), media_type="image/png")
    user_message = Message(role="user", contents=[question, picture])

    reply = await chat_client.get_response([user_message])
    print(f"Image Response: {reply}")
|
|
|
|
|
|
async def test_audio() -> None:
    """Ask the audio-preview model what it hears in the sample WAV and print its reply.

    Builds a single user message holding a text question plus the audio data
    URI, sends it through ``OpenAIChatCompletionClient``, and prints the response.
    """
    chat_client = OpenAIChatCompletionClient(model="gpt-4o-audio-preview-2025-06-03")

    # One user turn with two parts: the question and the audio clip it refers to.
    question = Content.from_text(text="What do you hear in this audio?")
    clip = Content.from_uri(uri=create_sample_audio(), media_type="audio/wav")
    user_message = Message(role="user", contents=[question, clip])

    reply = await chat_client.get_response([user_message])
    print(f"Audio Response: {reply}")
|
|
|
|
|
|
async def test_pdf() -> None:
    """Ask the gpt-4o model to extract information from the sample PDF and print its reply.

    Builds a single user message holding a text question plus the PDF bytes
    (tagged with a filename), sends it through ``OpenAIChatClient``, and prints
    the response.
    """
    chat_client = OpenAIChatClient(model="gpt-4o")

    # One user turn with two parts: the question and the document it refers to.
    question = Content.from_text(text="What information can you extract from this document?")
    document = Content.from_data(
        data=load_sample_pdf(),
        media_type="application/pdf",
        additional_properties={"filename": "employee_report.pdf"},
    )
    user_message = Message(role="user", contents=[question, document])

    reply = await chat_client.get_response([user_message])
    print(f"PDF Response: {reply}")
|
|
|
|
|
|
async def main() -> None:
    """Run the image, audio, and PDF demonstrations one after another."""
    print("=== Testing OpenAI Multimodal ===")
    # Awaited sequentially, so each response is printed before the next request is sent.
    for demo in (test_image, test_audio, test_pdf):
        await demo()


# Only start the event loop when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|