Python: Add PDF file support to OpenAI content parser with filename mapping (#1121)

* Add mapping for application media type in OpenAI responses client
* Enhance multimodal input samples: Add PDF testing functionality and fix image sample
* Standardize filename handling and add multimodal samples
  - Standardized filename extraction logic between chat and responses clients
  - Both clients now omit filename when not provided (no default fallback)
  - Added Azure Responses API multimodal sample with PDF support
  - Cleaned up Azure Chat sample to focus on supported features only
  - Fixed test comment placement for better code documentation
  - Updated README with clear API capability differences
* Enhance multimodal input samples with image and PDF handling
  - Refactor image and PDF handling in `azure_chat_multimodal.py` and `openai_chat_multimodal.py` to use new utility functions.
  - Add `load_sample_pdf` and `create_sample_image` functions for better test asset management.
  - Remove redundant code for creating sample images and PDFs.
  - Introduce a sample PDF file in `sample_assets` for testing purposes.
* Fix formatting in OpenAI chat client

---------

Co-authored-by: Dmytro Struk <13853051+dmytrostruk@users.noreply.github.com>
2026-06-16 21:04:09 +08:00 · 2025-10-07 19:04:33 +02:00
parent 1d27b57672
commit 8bb1266020
8 changed files with 268 additions and 77 deletions
@@ -419,27 +419,24 @@ class OpenAIBaseChatClient(OpenAIBase, BaseChatClient):
                        "format": audio_format,
                    },
                }
-            case DataContent() | UriContent() if content.media_type and content.media_type.startswith("application/"):
-                if content.media_type == "application/pdf":
-                    if content.uri.startswith("data:"):
-                        filename = (
-                            getattr(content, "filename", None)
-                            or content.additional_properties.get("filename", "document.pdf")
-                            if hasattr(content, "additional_properties") and content.additional_properties
-                            else "document.pdf"
-                        )
-                        return {
-                            "type": "file",
-                            "file": {
-                                "file_data": content.uri,  # Send full data URI
-                                "filename": filename,
-                            },
-                        }
-
-                    return content.to_dict(exclude_none=True)
-
-                return content.to_dict(exclude_none=True)
+            case DataContent() | UriContent() if content.has_top_level_media_type(
+                "application"
+            ) and content.uri.startswith("data:"):
+                # All application/* media types should be treated as files for OpenAI
+                filename = getattr(content, "filename", None) or (
+                    content.additional_properties.get("filename")
+                    if hasattr(content, "additional_properties") and content.additional_properties
+                    else None
+                )
+                file_obj = {"file_data": content.uri}
+                if filename:
+                    file_obj["filename"] = filename
+                return {
+                    "type": "file",
+                    "file": file_obj,
+                }
            case _:
+                # Default fallback for all other content types
                return content.to_dict(exclude_none=True)

    @override
@@ -454,6 +454,19 @@ class OpenAIBaseResponsesClient(OpenAIBase, BaseChatClient):
                            "format": format,
                        },
                    }
+                if content.has_top_level_media_type("application"):
+                    filename = getattr(content, "filename", None) or (
+                        content.additional_properties.get("filename")
+                        if hasattr(content, "additional_properties") and content.additional_properties
+                        else None
+                    )
+                    file_obj = {
+                        "type": "input_file",
+                        "file_data": content.uri,
+                    }
+                    if filename:
+                        file_obj["filename"] = filename
+                    return file_obj
                return {}
            case FunctionCallContent():
                return {
@@ -756,7 +756,12 @@ def test_openai_content_parser_data_content_image(openai_unit_test_env: dict[str
    assert result["input_audio"]["data"] == "//uQAAAAWGluZwAAAA8AAAACAAACcQ=="
    assert result["input_audio"]["format"] == "mp3"

-    # Test DataContent with PDF file
+
+def test_openai_content_parser_document_file_mapping(openai_unit_test_env: dict[str, str]) -> None:
+    """Test _openai_content_parser converts document files (PDF, DOCX, etc.) to OpenAI file format."""
+    client = OpenAIChatClient()
+
+    # Test PDF without filename - should omit filename in OpenAI payload
    pdf_data_content = DataContent(
        uri="data:application/pdf;base64,JVBERi0xLjQKJcfsj6IKNSAwIG9iago8PC9UeXBlL0NhdGFsb2cvUGFnZXMgMiAwIFI+PgplbmRvYmoKMiAwIG9iago8PC9UeXBlL1BhZ2VzL0tpZHNbMyAwIFJdL0NvdW50IDE+PgplbmRvYmoKMyAwIG9iago8PC9UeXBlL1BhZ2UvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXS9QYXJlbnQgMiAwIFIvUmVzb3VyY2VzPDwvRm9udDw8L0YxIDQgMCBSPj4+Pi9Db250ZW50cyA1IDAgUj4+CmVuZG9iago0IDAgb2JqCjw8L1R5cGUvRm9udC9TdWJ0eXBlL1R5cGUxL0Jhc2VGb250L0hlbHZldGljYT4+CmVuZG9iago1IDAgb2JqCjw8L0xlbmd0aCA0ND4+CnN0cmVhbQpCVApxCjcwIDUwIFRECi9GMSA4IFRmCihIZWxsbyBXb3JsZCEpIFRqCkVUCmVuZHN0cmVhbQplbmRvYmoKeHJlZgowIDYKMDAwMDAwMDAwMCA2NTUzNSBmIAowMDAwMDAwMDA5IDAwMDAwIG4gCjAwMDAwMDAwNTggMDAwMDAgbiAKMDAwMDAwMDExNSAwMDAwMCBuIAowMDAwMDAwMjQ1IDAwMDAwIG4gCjAwMDAwMDAzMDcgMDAwMDAgbiAKdHJhaWxlcgo8PC9TaXplIDYvUm9vdCAxIDAgUj4+CnN0YXJ0eHJlZgo0MDUKJSVFT0Y=",
        media_type="application/pdf",
@@ -764,14 +769,15 @@ def test_openai_content_parser_data_content_image(openai_unit_test_env: dict[str

    result = client._openai_content_parser(pdf_data_content)  # type: ignore

-    # Should convert to OpenAI file format
+    # Should convert to OpenAI file format without filename
    assert result["type"] == "file"
-    assert result["file"]["filename"] == "document.pdf"
+    assert "filename" not in result["file"]  # No filename provided, so none should be set
    assert "file_data" in result["file"]
    # Base64 data should be the full data URI (OpenAI requirement)
    assert result["file"]["file_data"].startswith("data:application/pdf;base64,")
+    assert result["file"]["file_data"] == pdf_data_content.uri

-    # Test DataContent with PDF and custom filename
+    # Test PDF with custom filename via additional_properties
    pdf_with_filename = DataContent(
        uri="data:application/pdf;base64,JVBERi0xLjQ=",
        media_type="application/pdf",
@@ -783,17 +789,75 @@ def test_openai_content_parser_data_content_image(openai_unit_test_env: dict[str
    # Should use custom filename
    assert result["type"] == "file"
    assert result["file"]["filename"] == "report.pdf"
+    assert result["file"]["file_data"] == "data:application/pdf;base64,JVBERi0xLjQ="

+    # Test different application/* media types - all should now be mapped to file format
+    test_cases = [
+        {
+            "media_type": "application/json",
+            "filename": "data.json",
+            "base64": "eyJrZXkiOiJ2YWx1ZSJ9",
+        },
+        {
+            "media_type": "application/xml",
+            "filename": "config.xml",
+            "base64": "PD94bWwgdmVyc2lvbj0iMS4wIj8+",
+        },
+        {
+            "media_type": "application/octet-stream",
+            "filename": "binary.bin",
+            "base64": "AQIDBAUGBwgJCg==",
+        },
+    ]

-def test_openai_chat_client_with_callable_api_key() -> None:
-    """Test OpenAIChatClient initialization with callable API key."""
+    for case in test_cases:
+        # Test without filename
+        doc_content = DataContent(
+            uri=f"data:{case['media_type']};base64,{case['base64']}",
+            media_type=case["media_type"],
+        )

-    async def get_api_key() -> str:
-        return "test-api-key-123"
+        result = client._openai_content_parser(doc_content)  # type: ignore

-    client = OpenAIChatClient(model_id="gpt-4o", api_key=get_api_key)
+        # All application/* types should now be mapped to file format
+        assert result["type"] == "file"
+        assert "filename" not in result["file"]  # Should omit filename when not provided
+        assert result["file"]["file_data"] == doc_content.uri

-    # Verify client was created successfully
-    assert client.model_id == "gpt-4o"
-    # OpenAI SDK now manages callable API keys internally
-    assert client.client is not None
+        # Test with filename - should now use file format with filename
+        doc_with_filename = DataContent(
+            uri=f"data:{case['media_type']};base64,{case['base64']}",
+            media_type=case["media_type"],
+            additional_properties={"filename": case["filename"]},
+        )
+
+        result = client._openai_content_parser(doc_with_filename)  # type: ignore
+
+        # Should now use file format with filename
+        assert result["type"] == "file"
+        assert result["file"]["filename"] == case["filename"]
+        assert result["file"]["file_data"] == doc_with_filename.uri
+
+    # Test edge case: empty additional_properties dict
+    pdf_empty_props = DataContent(
+        uri="data:application/pdf;base64,JVBERi0xLjQ=",
+        media_type="application/pdf",
+        additional_properties={},
+    )
+
+    result = client._openai_content_parser(pdf_empty_props)  # type: ignore
+
+    assert result["type"] == "file"
+    assert "filename" not in result["file"]
+
+    # Test edge case: None filename in additional_properties
+    pdf_none_filename = DataContent(
+        uri="data:application/pdf;base64,JVBERi0xLjQ=",
+        media_type="application/pdf",
+        additional_properties={"filename": None},
+    )
+
+    result = client._openai_content_parser(pdf_none_filename)  # type: ignore
+
+    assert result["type"] == "file"
+    assert "filename" not in result["file"]  # None filename should be omitted
@@ -10,11 +10,17 @@ This folder contains examples demonstrating how to send multimodal content (imag
 - **Description**: Shows how to send images, audio, and PDF files to OpenAI's Chat Completions API
 - **Supported formats**: PNG/JPEG images, WAV/MP3 audio, PDF documents

-### Azure Chat Client
+### Azure OpenAI Chat Client

 - **File**: `azure_chat_multimodal.py`
- **Description**: Shows how to send multimodal content to Azure OpenAI service
- **Supported formats**: PNG/JPEG images, WAV/MP3 audio, PDF documents
+- **Description**: Shows how to send images to Azure OpenAI Chat Completions API
+- **Supported formats**: PNG/JPEG images (PDF files are NOT supported by the Azure OpenAI Chat Completions API)
+
+### Azure OpenAI Responses Client
+
+- **File**: `azure_responses_multimodal.py`
+- **Description**: Shows how to send images and PDF files to Azure OpenAI Responses API
+- **Supported formats**: PNG/JPEG images, PDF documents (full multimodal support)

 ## Environment Variables

@@ -24,8 +30,10 @@ Set the following environment variables before running the examples:
 - `OPENAI_API_KEY`: Your OpenAI API key

 **For Azure OpenAI:**
+
 - `AZURE_OPENAI_ENDPOINT`: Your Azure OpenAI endpoint
 - `AZURE_OPENAI_CHAT_DEPLOYMENT_NAME`: The name of your Azure OpenAI chat model deployment
+- `AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME`: The name of your Azure OpenAI responses model deployment

 Optionally for Azure OpenAI:
 - `AZURE_OPENAI_API_VERSION`: The API version to use (default is `2024-10-21`)
@@ -51,8 +59,11 @@ The Azure example uses `AzureCliCredential` for authentication. Run `az login` i
 # Run OpenAI example
 python openai_chat_multimodal.py

-# Run Azure example (requires az login or API key)
+# Run Azure Chat example (requires az login or API key)
 python azure_chat_multimodal.py
+
+# Run Azure Responses example (requires az login or API key)
+python azure_responses_multimodal.py
 ```

 ## Using Your Own Files
@@ -101,8 +112,8 @@ DataContent(

 ## API Differences

- **Chat Completions API**: Supports images, audio, and PDF files
- **Assistants API**: Only supports text and images (no audio/PDF)
- **Responses API**: Similar to Chat Completions
+- **OpenAI Chat Completions API**: Supports images, audio, and PDF files
+- **Azure OpenAI Chat Completions API**: Supports images only (no PDF/audio file types)
+- **Azure OpenAI Responses API**: Supports images and PDF files (full multimodal support)

-Choose the appropriate client based on your multimodal needs.
+Choose the appropriate client based on your multimodal needs and available APIs.
@@ -1,16 +1,20 @@
 # Copyright (c) Microsoft. All rights reserved.

 import asyncio
-import base64

-import requests
 from agent_framework import ChatMessage, DataContent, Role, TextContent
 from agent_framework.azure import AzureOpenAIChatClient
 from azure.identity import AzureCliCredential


+def create_sample_image() -> str:
+    """Create a simple 1x1 pixel PNG image for testing."""
+    # This is a tiny red pixel in PNG format
+    png_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
+    return f"data:image/png;base64,{png_data}"
+
 async def test_image() -> None:
-    """Test image analysis with Azure."""
+    """Test image analysis with Azure OpenAI."""
    # For authentication, run `az login` command in terminal or replace AzureCliCredential with preferred
    # authentication option. Requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_CHAT_DEPLOYMENT_NAME
    # environment variables to be set.
@@ -18,15 +22,10 @@ async def test_image() -> None:
    # client = AzureOpenAIChatClient(credential=AzureCliCredential(), deployment_name="your-deployment-name")
    client = AzureOpenAIChatClient(credential=AzureCliCredential())

-    # Fetch image from httpbin
-    image_url = "https://httpbin.org/image/jpeg"
-    response = requests.get(image_url)
-    image_b64 = base64.b64encode(response.content).decode()
-    image_uri = f"data:image/jpeg;base64,{image_b64}"
-
+    image_uri = create_sample_image()
    message = ChatMessage(
        role=Role.USER,
-        contents=[TextContent(text="What's in this image?"), DataContent(uri=image_uri, media_type="image/jpeg")],
+        contents=[TextContent(text="What's in this image?"), DataContent(uri=image_uri, media_type="image/png")],
    )

    response = await client.get_response(message)
@@ -34,9 +33,9 @@ async def test_image() -> None:


 async def main() -> None:
-    print("=== Testing Azure Multimodal ===")
+    print("=== Testing Azure OpenAI Multimodal ===")
+    print("Testing image analysis (supported by Chat Completions API)")
    await test_image()

-
 if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,74 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import asyncio
+from pathlib import Path
+
+from agent_framework import ChatMessage, DataContent, Role, TextContent
+from agent_framework.azure import AzureOpenAIResponsesClient
+from azure.identity import AzureCliCredential
+
+ASSETS_DIR = Path(__file__).resolve().parent.parent / "sample_assets"
+
+
+def load_sample_pdf() -> bytes:
+    """Read the bundled sample PDF for tests."""
+    pdf_path = ASSETS_DIR / "sample.pdf"
+    return pdf_path.read_bytes()
+
+
+def create_sample_image() -> str:
+    """Create a simple 1x1 pixel PNG image for testing."""
+    # This is a tiny red pixel in PNG format
+    png_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
+    return f"data:image/png;base64,{png_data}"
+
+
+async def test_image() -> None:
+    """Test image analysis with Azure OpenAI Responses API."""
+    # For authentication, run `az login` command in terminal or replace AzureCliCredential with preferred
+    # authentication option. Requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME
+    # environment variables to be set.
+    # Alternatively, you can pass deployment_name explicitly:
+    # client = AzureOpenAIResponsesClient(credential=AzureCliCredential(), deployment_name="your-deployment-name")
+    client = AzureOpenAIResponsesClient(credential=AzureCliCredential())
+
+    image_uri = create_sample_image()
+    message = ChatMessage(
+        role=Role.USER,
+        contents=[TextContent(text="What's in this image?"), DataContent(uri=image_uri, media_type="image/png")],
+    )
+
+    response = await client.get_response(message)
+    print(f"Image Response: {response}")
+
+
+async def test_pdf() -> None:
+    """Test PDF document analysis with Azure OpenAI Responses API."""
+    client = AzureOpenAIResponsesClient(credential=AzureCliCredential())
+
+    pdf_bytes = load_sample_pdf()
+    message = ChatMessage(
+        role=Role.USER,
+        contents=[
+            TextContent(text="What information can you extract from this document?"),
+            DataContent(
+                data=pdf_bytes,
+                media_type="application/pdf",
+                additional_properties={"filename": "sample.pdf"},
+            ),
+        ],
+    )
+
+    response = await client.get_response(message)
+    print(f"PDF Response: {response}")
+
+
+async def main() -> None:
+    print("=== Testing Azure OpenAI Responses API Multimodal ===")
+    print("The Responses API supports both images AND PDFs")
+    await test_image()
+    await test_pdf()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -3,36 +3,29 @@
 import asyncio
 import base64
 import struct
+from pathlib import Path

-import requests
 from agent_framework import ChatMessage, DataContent, Role, TextContent
 from agent_framework.openai import OpenAIChatClient

-
-async def test_image() -> None:
-    """Test image analysis with OpenAI."""
-    client = OpenAIChatClient(model_id="gpt-4o")
-
-    # Fetch image from httpbin
-    image_url = "https://httpbin.org/image/jpeg"
-    response = requests.get(image_url)
-    image_b64 = base64.b64encode(response.content).decode()
-    image_uri = f"data:image/jpeg;base64,{image_b64}"
-
-    message = ChatMessage(
-        role=Role.USER,
-        contents=[TextContent(text="What's in this image?"), DataContent(uri=image_uri, media_type="image/jpeg")],
-    )
-
-    response = await client.get_response(message)
-    print(f"Image Response: {response}")
+ASSETS_DIR = Path(__file__).resolve().parent.parent / "sample_assets"


-async def test_audio() -> None:
-    """Test audio analysis with OpenAI."""
-    client = OpenAIChatClient(model_id="gpt-4o-audio-preview")
+def load_sample_pdf() -> bytes:
+    """Read the bundled sample PDF for tests."""
+    pdf_path = ASSETS_DIR / "sample.pdf"
+    return pdf_path.read_bytes()

-    # Create minimal WAV file (0.1 seconds of silence)
+
+def create_sample_image() -> str:
+    """Create a simple 1x1 pixel PNG image for testing."""
+    # This is a tiny red pixel in PNG format
+    png_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
+    return f"data:image/png;base64,{png_data}"
+
+
+def create_sample_audio() -> str:
+    """Create a minimal WAV file for testing (0.1 seconds of silence)."""
    wav_header = (
        b"RIFF"
        + struct.pack("<I", 44)  # file size
@@ -44,8 +37,28 @@ async def test_audio() -> None:
        + b"\x00" * 1600  # 0.1 sec silence
    )
    audio_b64 = base64.b64encode(wav_header).decode()
-    audio_uri = f"data:audio/wav;base64,{audio_b64}"
+    return f"data:audio/wav;base64,{audio_b64}"

+
+async def test_image() -> None:
+    """Test image analysis with OpenAI."""
+    client = OpenAIChatClient(model_id="gpt-4o")
+
+    image_uri = create_sample_image()
+    message = ChatMessage(
+        role=Role.USER,
+        contents=[TextContent(text="What's in this image?"), DataContent(uri=image_uri, media_type="image/png")],
+    )
+
+    response = await client.get_response(message)
+    print(f"Image Response: {response}")
+
+
+async def test_audio() -> None:
+    """Test audio analysis with OpenAI."""
+    client = OpenAIChatClient(model_id="gpt-4o-audio-preview")
+
+    audio_uri = create_sample_audio()
    message = ChatMessage(
        role=Role.USER,
        contents=[
@@ -58,10 +71,30 @@ async def test_audio() -> None:
    print(f"Audio Response: {response}")


+async def test_pdf() -> None:
+    """Test PDF document analysis with OpenAI."""
+    client = OpenAIChatClient(model_id="gpt-4o")
+
+    pdf_bytes = load_sample_pdf()
+    message = ChatMessage(
+        role=Role.USER,
+        contents=[
+            TextContent(text="What information can you extract from this document?"),
+            DataContent(
+                data=pdf_bytes, media_type="application/pdf", additional_properties={"filename": "employee_report.pdf"}
+            ),
+        ],
+    )
+
+    response = await client.get_response(message)
+    print(f"PDF Response: {response}")
+
+
 async def main() -> None:
    print("=== Testing OpenAI Multimodal ===")
    await test_image()
    await test_audio()
+    await test_pdf()


 if __name__ == "__main__":