Python: Fix Eval samples (#4033)

* fix red team sample
* Updated self-reflection
* fix for workflow eval sample
* fix test
2026-06-16 21:04:09 +08:00 · 2026-02-18 20:50:33 +01:00
parent 6a39d5a652
commit aab80d9ed9
38 changed files with 536 additions and 2629 deletions
@@ -129,9 +129,7 @@ def test_azure_assistants_client_init_validation_fail() -> None:
 def test_azure_assistants_client_init_missing_deployment_name(azure_openai_unit_test_env: dict[str, str]) -> None:
    """Test AzureOpenAIAssistantsClient initialization with missing deployment name."""
    with pytest.raises(ServiceInitializationError):
-        AzureOpenAIAssistantsClient(
-            api_key=azure_openai_unit_test_env.get("AZURE_OPENAI_API_KEY", "test-key")
-        )
+        AzureOpenAIAssistantsClient(api_key=azure_openai_unit_test_env.get("AZURE_OPENAI_API_KEY", "test-key"))


 def test_azure_assistants_client_init_with_default_headers(azure_openai_unit_test_env: dict[str, str]) -> None:
@@ -94,15 +94,13 @@ def test_init_endpoint(azure_openai_unit_test_env: dict[str, str]) -> None:
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]], indirect=True)
 def test_init_with_empty_deployment_name(azure_openai_unit_test_env: dict[str, str]) -> None:
    with pytest.raises(ServiceInitializationError):
-        AzureOpenAIChatClient(
-        )
+        AzureOpenAIChatClient()


@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_BASE_URL"]], indirect=True)
 def test_init_with_empty_endpoint_and_base_url(azure_openai_unit_test_env: dict[str, str]) -> None:
    with pytest.raises(ServiceInitializationError):
-        AzureOpenAIChatClient(
-        )
+        AzureOpenAIChatClient()


@pytest.mark.parametrize("override_env_param_dict", [{"AZURE_OPENAI_ENDPOINT": "http://test.com"}], indirect=True)
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft. All rights reserved.

 import json
+import logging
 import os
 from typing import Annotated, Any
 from unittest.mock import MagicMock
@@ -30,6 +31,8 @@ skip_if_azure_integration_tests_disabled = pytest.mark.skipif(
    else "Integration tests are disabled.",
 )

+logger = logging.getLogger(__name__)
+

 class OutputStruct(BaseModel):
    """A structured output for testing purposes."""
@@ -111,8 +114,7 @@ def test_init_with_default_header(azure_openai_unit_test_env: dict[str, str]) ->
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME"]], indirect=True)
 def test_init_with_empty_model_id(azure_openai_unit_test_env: dict[str, str]) -> None:
    with pytest.raises(ServiceInitializationError):
-        AzureOpenAIResponsesClient(
-        )
+        AzureOpenAIResponsesClient()


 def test_init_with_project_client(azure_openai_unit_test_env: dict[str, str]) -> None:
@@ -2700,3 +2700,74 @@ async def test_conversation_id_updated_in_options_between_tool_iterations():
    assert conversation_ids_received[1] == "stream_conv_after_first", (
        "streaming: conversation_id should be updated in options after receiving new conversation_id from API"
    )
+
+
+async def test_streaming_function_calling_response_includes_reasoning_and_tool_results(
+    chat_client_base: SupportsChatGetResponse,
+):
+    """Test that the finalized streaming response keeps all content kinds in its messages.
+
+    The messages of the final response must contain the reasoning, function_call,
+    function_result, and final text contents. This is critical for workflow
+    chaining: when one agent's response is passed as input to the next agent, the
+    conversation must include all items so the API can validate the history.
+    """
+
+    @tool(name="search", approval_mode="never_require")
+    def search_func(query: str) -> str:
+        return f"Found results for {query}"
+
+    chat_client_base.streaming_responses = [
+        [
+            # First streamed turn: a reasoning update, then a call to the "search" tool
+            ChatResponseUpdate(
+                contents=[
+                    Content.from_text_reasoning(
+                        text="Let me search for that",
+                        additional_properties={"reasoning_id": "rs_test123", "status": "completed"},
+                    )
+                ],
+                role="assistant",
+            ),
+            ChatResponseUpdate(
+                contents=[
+                    Content.from_function_call(
+                        call_id="call_1",
+                        name="search",
+                        arguments='{"query": "test"}',
+                        additional_properties={"fc_id": "fc_test456"},
+                    )
+                ],
+                role="assistant",
+            ),
+        ],
+        [
+            # Second streamed turn: the final text answer
+            ChatResponseUpdate(
+                contents=[Content.from_text(text="Here are the results")],
+                role="assistant",
+            ),
+        ],
+    ]
+
+    stream = chat_client_base.get_response(
+        "search for test", options={"tool_choice": "auto", "tools": [search_func]}, stream=True
+    )
+    # Drain the stream first; `updates` is collected but not asserted on below.
+    updates = []
+    async for update in stream:
+        updates.append(update)
+    response = await stream.get_final_response()
+
+    # Flatten the content types across all response messages, then check each expected kind
+    all_content_types = [c.type for msg in response.messages for c in msg.contents]
+    assert "text_reasoning" in all_content_types, "Reasoning must be preserved in response messages"
+    assert "function_call" in all_content_types, "Function call must be preserved in response messages"
+    assert "function_result" in all_content_types, "Function result must be in response messages for chaining"
+    assert "text" in all_content_types, "Final text must be in response messages"
+
+    # The first reasoning content must still carry the reasoning_id set in additional_properties
+    reasoning_contents = [c for msg in response.messages for c in msg.contents if c.type == "text_reasoning"]
+    assert len(reasoning_contents) >= 1
+    assert reasoning_contents[0].additional_properties is not None
+    assert reasoning_contents[0].additional_properties.get("reasoning_id") == "rs_test123"
@@ -154,9 +154,7 @@ def test_init_validation_fail() -> None:
 def test_init_missing_model_id(openai_unit_test_env: dict[str, str]) -> None:
    """Test OpenAIAssistantsClient initialization with missing model ID."""
    with pytest.raises(ServiceInitializationError):
-        OpenAIAssistantsClient(
-            api_key=openai_unit_test_env.get("OPENAI_API_KEY", "test-key")
-        )
+        OpenAIAssistantsClient(api_key=openai_unit_test_env.get("OPENAI_API_KEY", "test-key"))


@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -97,8 +97,7 @@ def test_init_base_url_from_settings_env() -> None:
@pytest.mark.parametrize("exclude_list", [["OPENAI_CHAT_MODEL_ID"]], indirect=True)
 def test_init_with_empty_model_id(openai_unit_test_env: dict[str, str]) -> None:
    with pytest.raises(ServiceInitializationError):
-        OpenAIChatClient(
-        )
+        OpenAIChatClient()


@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -139,8 +139,7 @@ def test_init_with_default_header(openai_unit_test_env: dict[str, str]) -> None:
@pytest.mark.parametrize("exclude_list", [["OPENAI_RESPONSES_MODEL_ID"]], indirect=True)
 def test_init_with_empty_model_id(openai_unit_test_env: dict[str, str]) -> None:
    with pytest.raises(ServiceInitializationError):
-        OpenAIResponsesClient(
-        )
+        OpenAIResponsesClient()


@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -816,7 +815,101 @@ def test_prepare_message_for_openai_with_function_approval_response() -> None:
    assert prepared_message["approve"] is True


-def test_chat_message_with_error_content() -> None:
+def test_prepare_message_for_openai_includes_reasoning_with_function_call() -> None:
+    """Test _prepare_message_for_openai includes reasoning items alongside function_calls.
+
+    Reasoning models require reasoning items to be present in the input when
+    function_call items are included. Stripping reasoning causes a 400 error:
+    "function_call was provided without its required reasoning item".
+    """
+    client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")
+
+    reasoning = Content.from_text_reasoning(
+        text="Let me analyze the request",
+        additional_properties={"status": "completed", "reasoning_id": "rs_abc123"},
+    )
+    function_call = Content.from_function_call(
+        call_id="call_123",
+        name="search_hotels",
+        arguments='{"city": "Paris"}',
+    )
+    # One assistant message carries both contents; the id mapping passed in starts out empty.
+    message = Message(role="assistant", contents=[reasoning, function_call])
+    call_id_to_id: dict[str, str] = {}
+
+    result = client._prepare_message_for_openai(message, call_id_to_id)
+
+    # Both contents must show up as separate typed items when iterating the result
+    types = [item["type"] for item in result]
+    assert "reasoning" in types, "Reasoning items must be included for reasoning models"
+    assert "function_call" in types
+    # NOTE(review): "summary" is read as a mapping; the Responses API appears to document a list of parts - confirm.
+    reasoning_item = next(item for item in result if item["type"] == "reasoning")
+    assert reasoning_item["summary"]["text"] == "Let me analyze the request"
+    assert reasoning_item["id"] == "rs_abc123", "Reasoning id must be preserved for the API"
+
+
+def test_prepare_messages_for_openai_full_conversation_with_reasoning() -> None:
+    """Test _prepare_messages_for_openai serializes a full conversation with reasoning.
+
+    The conversation holds a user text, an assistant reasoning + function_call, a
+    tool function_result, and a final assistant text. This simulates the history
+    passed between agents in a workflow; the API requires reasoning items alongside function_calls.
+    """
+    client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")
+
+    messages = [
+        Message(role="user", contents=[Content.from_text(text="search for hotels")]),
+        Message(
+            role="assistant",
+            contents=[
+                Content.from_text_reasoning(
+                    text="I need to search for hotels",
+                    additional_properties={"reasoning_id": "rs_test123", "status": "completed"},
+                ),
+                Content.from_function_call(
+                    call_id="call_1",
+                    name="search_hotels",
+                    arguments='{"city": "Paris"}',
+                    additional_properties={"fc_id": "fc_test456"},
+                ),
+            ],
+        ),
+        Message(
+            role="tool",
+            contents=[
+                Content.from_function_result(
+                    call_id="call_1",
+                    result="Found 3 hotels in Paris",
+                ),
+            ],
+        ),
+        Message(role="assistant", contents=[Content.from_text(text="I found hotels for you")]),
+    ]
+
+    result = client._prepare_messages_for_openai(messages)
+    # Collect each serialized item's "type" (None when the key is absent) for the checks below.
+    types = [item.get("type") for item in result]
+    assert "message" in types, "User/assistant messages should be present"
+    assert "reasoning" in types, "Reasoning items must be present"
+    assert "function_call" in types, "Function call items must be present"
+    assert "function_call_output" in types, "Function call output must be present"
+
+    # The first reasoning item exposes the input reasoning_id as its "id"
+    reasoning_items = [item for item in result if item.get("type") == "reasoning"]
+    assert reasoning_items[0]["id"] == "rs_test123"
+
+    # The first function_call item exposes the input fc_id as its "id"
+    fc_items = [item for item in result if item.get("type") == "function_call"]
+    assert fc_items[0]["id"] == "fc_test456"
+
+    # Ordering: the first reasoning item must precede the first function_call item
+    reasoning_idx = types.index("reasoning")
+    fc_idx = types.index("function_call")
+    assert reasoning_idx < fc_idx, "Reasoning must come before function_call"
+
+
+def test_prepare_message_for_openai_filters_error_content() -> None:
    """Test that error content in messages is handled properly."""
    client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")