Python: Fix Eval samples (#4033)

* fix red team sample
* Updated self-reflection
* fix for workflow eval sample
* fix test
2026-06-16 21:04:09 +08:00 · 2026-02-18 20:50:33 +01:00
parent 6a39d5a652
commit aab80d9ed9
38 changed files with 536 additions and 2629 deletions
@@ -788,6 +788,9 @@ class RawOpenAIResponsesClient(  # type: ignore[misc]
        request_input = self._prepare_messages_for_openai(messages)
        if not request_input:
            raise ServiceInvalidRequestError("Messages are required for chat completions")
+
+        conversation_id = self._get_current_conversation_id(options, **kwargs)
+
        run_options["input"] = request_input

        # model id
@@ -911,8 +914,11 @@ class RawOpenAIResponsesClient(  # type: ignore[misc]
        for content in message.contents:
            match content.type:
                case "text_reasoning":
-                    # Don't send reasoning content back to model
-                    continue
+                    # Reasoning items must be sent back as top-level input items
+                    # for reasoning models that require them alongside function_calls
+                    reasoning = self._prepare_content_for_openai(message.role, content, call_id_to_id)  # type: ignore[arg-type]
+                    if reasoning:
+                        all_messages.append(reasoning)
                case "function_result":
                    new_args: dict[str, Any] = {}
                    new_args.update(self._prepare_content_for_openai(message.role, content, call_id_to_id))  # type: ignore[arg-type]
@@ -967,6 +973,8 @@ class RawOpenAIResponsesClient(  # type: ignore[misc]
                }
                props: dict[str, Any] | None = getattr(content, "additional_properties", None)
                if props:
+                    if reasoning_id := props.get("reasoning_id"):
+                        ret["id"] = reasoning_id
                    if status := props.get("status"):
                        ret["status"] = status
                    if reasoning_text := props.get("reasoning_text"):
@@ -1184,22 +1192,29 @@ class RawOpenAIResponsesClient(  # type: ignore[misc]
                                    )
                                )
                case "reasoning":  # ResponseOutputReasoning
+                    reasoning_id = getattr(item, "id", None)
                    if hasattr(item, "content") and item.content:
                        for index, reasoning_content in enumerate(item.content):
-                            additional_properties = None
+                            additional_properties: dict[str, Any] = {}
+                            if reasoning_id:
+                                additional_properties["reasoning_id"] = reasoning_id
                            if hasattr(item, "summary") and item.summary and index < len(item.summary):
-                                additional_properties = {"summary": item.summary[index]}
+                                additional_properties["summary"] = item.summary[index]
                            contents.append(
                                Content.from_text_reasoning(
                                    text=reasoning_content.text,
                                    raw_representation=reasoning_content,
-                                    additional_properties=additional_properties,
+                                    additional_properties=additional_properties or None,
                                )
                            )
                    if hasattr(item, "summary") and item.summary:
                        for summary in item.summary:
                            contents.append(
-                                Content.from_text_reasoning(text=summary.text, raw_representation=summary)  # type: ignore[arg-type]
+                                Content.from_text_reasoning(
+                                    text=summary.text,
+                                    raw_representation=summary,  # type: ignore[arg-type]
+                                    additional_properties={"reasoning_id": reasoning_id} if reasoning_id else None,
+                                )
                            )
                case "code_interpreter_call":  # ResponseOutputCodeInterpreterCall
                    call_id = getattr(item, "call_id", None) or getattr(item, "id", None)
@@ -1413,16 +1428,40 @@ class RawOpenAIResponsesClient(  # type: ignore[misc]
                contents.append(Content.from_text(text=event.delta, raw_representation=event))
                metadata.update(self._get_metadata_from_response(event))
            case "response.reasoning_text.delta":
-                contents.append(Content.from_text_reasoning(text=event.delta, raw_representation=event))
+                contents.append(
+                    Content.from_text_reasoning(
+                        text=event.delta,
+                        raw_representation=event,
+                        additional_properties={"reasoning_id": event.item_id},
+                    )
+                )
                metadata.update(self._get_metadata_from_response(event))
            case "response.reasoning_text.done":
-                contents.append(Content.from_text_reasoning(text=event.text, raw_representation=event))
+                contents.append(
+                    Content.from_text_reasoning(
+                        text=event.text,
+                        raw_representation=event,
+                        additional_properties={"reasoning_id": event.item_id},
+                    )
+                )
                metadata.update(self._get_metadata_from_response(event))
            case "response.reasoning_summary_text.delta":
-                contents.append(Content.from_text_reasoning(text=event.delta, raw_representation=event))
+                contents.append(
+                    Content.from_text_reasoning(
+                        text=event.delta,
+                        raw_representation=event,
+                        additional_properties={"reasoning_id": event.item_id},
+                    )
+                )
                metadata.update(self._get_metadata_from_response(event))
            case "response.reasoning_summary_text.done":
-                contents.append(Content.from_text_reasoning(text=event.text, raw_representation=event))
+                contents.append(
+                    Content.from_text_reasoning(
+                        text=event.text,
+                        raw_representation=event,
+                        additional_properties={"reasoning_id": event.item_id},
+                    )
+                )
                metadata.update(self._get_metadata_from_response(event))
            case "response.code_interpreter_call_code.delta":
                call_id = getattr(event, "call_id", None) or getattr(event, "id", None) or event.item_id
@@ -1593,20 +1632,23 @@ class RawOpenAIResponsesClient(  # type: ignore[misc]
                            )
                        )
                    case "reasoning":  # ResponseOutputReasoning
+                        reasoning_id = getattr(event_item, "id", None)
                        if hasattr(event_item, "content") and event_item.content:
                            for index, reasoning_content in enumerate(event_item.content):
-                                additional_properties = None
+                                additional_properties: dict[str, Any] = {}
+                                if reasoning_id:
+                                    additional_properties["reasoning_id"] = reasoning_id
                                if (
                                    hasattr(event_item, "summary")
                                    and event_item.summary
                                    and index < len(event_item.summary)
                                ):
-                                    additional_properties = {"summary": event_item.summary[index]}
+                                    additional_properties["summary"] = event_item.summary[index]
                                contents.append(
                                    Content.from_text_reasoning(
                                        text=reasoning_content.text,
                                        raw_representation=reasoning_content,
-                                        additional_properties=additional_properties,
+                                        additional_properties=additional_properties or None,
                                    )
                                )
                    case _:
@@ -129,9 +129,7 @@ def test_azure_assistants_client_init_validation_fail() -> None:
 def test_azure_assistants_client_init_missing_deployment_name(azure_openai_unit_test_env: dict[str, str]) -> None:
    """Test AzureOpenAIAssistantsClient initialization with missing deployment name."""
    with pytest.raises(ServiceInitializationError):
-        AzureOpenAIAssistantsClient(
-            api_key=azure_openai_unit_test_env.get("AZURE_OPENAI_API_KEY", "test-key")
-        )
+        AzureOpenAIAssistantsClient(api_key=azure_openai_unit_test_env.get("AZURE_OPENAI_API_KEY", "test-key"))


 def test_azure_assistants_client_init_with_default_headers(azure_openai_unit_test_env: dict[str, str]) -> None:
@@ -94,15 +94,13 @@ def test_init_endpoint(azure_openai_unit_test_env: dict[str, str]) -> None:
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]], indirect=True)
 def test_init_with_empty_deployment_name(azure_openai_unit_test_env: dict[str, str]) -> None:
    with pytest.raises(ServiceInitializationError):
-        AzureOpenAIChatClient(
-        )
+        AzureOpenAIChatClient()


@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_BASE_URL"]], indirect=True)
 def test_init_with_empty_endpoint_and_base_url(azure_openai_unit_test_env: dict[str, str]) -> None:
    with pytest.raises(ServiceInitializationError):
-        AzureOpenAIChatClient(
-        )
+        AzureOpenAIChatClient()


@pytest.mark.parametrize("override_env_param_dict", [{"AZURE_OPENAI_ENDPOINT": "http://test.com"}], indirect=True)
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft. All rights reserved.

 import json
+import logging
 import os
 from typing import Annotated, Any
 from unittest.mock import MagicMock
@@ -30,6 +31,8 @@ skip_if_azure_integration_tests_disabled = pytest.mark.skipif(
    else "Integration tests are disabled.",
 )

+logger = logging.getLogger(__name__)
+

 class OutputStruct(BaseModel):
    """A structured output for testing purposes."""
@@ -111,8 +114,7 @@ def test_init_with_default_header(azure_openai_unit_test_env: dict[str, str]) ->
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME"]], indirect=True)
 def test_init_with_empty_model_id(azure_openai_unit_test_env: dict[str, str]) -> None:
    with pytest.raises(ServiceInitializationError):
-        AzureOpenAIResponsesClient(
-        )
+        AzureOpenAIResponsesClient()


 def test_init_with_project_client(azure_openai_unit_test_env: dict[str, str]) -> None:
@@ -2700,3 +2700,74 @@ async def test_conversation_id_updated_in_options_between_tool_iterations():
    assert conversation_ids_received[1] == "stream_conv_after_first", (
        "streaming: conversation_id should be updated in options after receiving new conversation_id from API"
    )
+
+
+async def test_streaming_function_calling_response_includes_reasoning_and_tool_results(
+    chat_client_base: SupportsChatGetResponse,
+):
+    """Verify a finalized streaming response keeps every content item.
+
+    The final response's messages must contain the reasoning, function_call,
+    function_result, and final text contents. This matters for workflow
+    chaining: when one agent's response is passed as input to the next agent,
+    the API validates the history and needs all of those items present.
+    """
+
+    @tool(name="search", approval_mode="never_require")
+    def search_func(query: str) -> str:
+        return f"Found results for {query}"
+
+    chat_client_base.streaming_responses = [
+        [
+            # First response: reasoning + function_call
+            ChatResponseUpdate(
+                contents=[
+                    Content.from_text_reasoning(
+                        text="Let me search for that",
+                        additional_properties={"reasoning_id": "rs_test123", "status": "completed"},
+                    )
+                ],
+                role="assistant",
+            ),
+            ChatResponseUpdate(
+                contents=[
+                    Content.from_function_call(
+                        call_id="call_1",
+                        name="search",
+                        arguments='{"query": "test"}',
+                        additional_properties={"fc_id": "fc_test456"},
+                    )
+                ],
+                role="assistant",
+            ),
+        ],
+        [
+            # Second response: final text
+            ChatResponseUpdate(
+                contents=[Content.from_text(text="Here are the results")],
+                role="assistant",
+            ),
+        ],
+    ]
+
+    stream = chat_client_base.get_response(
+        "search for test", options={"tool_choice": "auto", "tools": [search_func]}, stream=True
+    )
+
+    # Consume every update before asking for the final response; the updates themselves are not inspected.
+    async for _ in stream:
+        pass
+    response = await stream.get_final_response()
+
+    # Verify all content types are in the response messages
+    all_content_types = [c.type for msg in response.messages for c in msg.contents]
+    assert "text_reasoning" in all_content_types, "Reasoning must be preserved in response messages"
+    assert "function_call" in all_content_types, "Function call must be preserved in response messages"
+    assert "function_result" in all_content_types, "Function result must be in response messages for chaining"
+    assert "text" in all_content_types, "Final text must be in response messages"
+
+    # Verify reasoning has the reasoning_id preserved
+    reasoning_contents = [c for msg in response.messages for c in msg.contents if c.type == "text_reasoning"]
+    assert len(reasoning_contents) >= 1
+    assert reasoning_contents[0].additional_properties is not None
+    assert reasoning_contents[0].additional_properties.get("reasoning_id") == "rs_test123"
@@ -154,9 +154,7 @@ def test_init_validation_fail() -> None:
 def test_init_missing_model_id(openai_unit_test_env: dict[str, str]) -> None:
    """Test OpenAIAssistantsClient initialization with missing model ID."""
    with pytest.raises(ServiceInitializationError):
-        OpenAIAssistantsClient(
-            api_key=openai_unit_test_env.get("OPENAI_API_KEY", "test-key")
-        )
+        OpenAIAssistantsClient(api_key=openai_unit_test_env.get("OPENAI_API_KEY", "test-key"))


@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -97,8 +97,7 @@ def test_init_base_url_from_settings_env() -> None:
@pytest.mark.parametrize("exclude_list", [["OPENAI_CHAT_MODEL_ID"]], indirect=True)
 def test_init_with_empty_model_id(openai_unit_test_env: dict[str, str]) -> None:
    with pytest.raises(ServiceInitializationError):
-        OpenAIChatClient(
-        )
+        OpenAIChatClient()


@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -139,8 +139,7 @@ def test_init_with_default_header(openai_unit_test_env: dict[str, str]) -> None:
@pytest.mark.parametrize("exclude_list", [["OPENAI_RESPONSES_MODEL_ID"]], indirect=True)
 def test_init_with_empty_model_id(openai_unit_test_env: dict[str, str]) -> None:
    with pytest.raises(ServiceInitializationError):
-        OpenAIResponsesClient(
-        )
+        OpenAIResponsesClient()


@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -816,7 +815,101 @@ def test_prepare_message_for_openai_with_function_approval_response() -> None:
    assert prepared_message["approve"] is True


-def test_chat_message_with_error_content() -> None:
+def test_prepare_message_for_openai_includes_reasoning_with_function_call() -> None:
+    """Check that reasoning content is serialized next to its function_call.
+
+    When function_call items are part of the input, reasoning models expect the
+    matching reasoning items as well; stripping them causes a 400 error:
+    "function_call was provided without its required reasoning item".
+    """
+    responses_client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")
+
+    assistant_message = Message(
+        role="assistant",
+        contents=[
+            Content.from_text_reasoning(
+                text="Let me analyze the request",
+                additional_properties={"status": "completed", "reasoning_id": "rs_abc123"},
+            ),
+            Content.from_function_call(
+                call_id="call_123",
+                name="search_hotels",
+                arguments='{"city": "Paris"}',
+            ),
+        ],
+    )
+
+    prepared_items = responses_client._prepare_message_for_openai(assistant_message, {})
+    # The reasoning entry and the function_call entry must both be emitted as top-level items
+    emitted_types = {entry["type"] for entry in prepared_items}
+    assert "reasoning" in emitted_types, "Reasoning items must be included for reasoning models"
+    assert "function_call" in emitted_types
+    first_reasoning = next(entry for entry in prepared_items if entry["type"] == "reasoning")
+    assert first_reasoning["summary"]["text"] == "Let me analyze the request"
+    assert first_reasoning["id"] == "rs_abc123", "Reasoning id must be preserved for the API"
+
+
+def test_prepare_messages_for_openai_full_conversation_with_reasoning() -> None:
+    """Serialize a whole conversation and check every item kind survives.
+
+    The history mirrors what one workflow agent hands to the next: a user
+    request, an assistant turn with reasoning + function_call, the tool's
+    function_result, and the closing assistant text. The API requires the
+    reasoning items alongside function_calls.
+    """
+    responses_client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")
+
+    user_turn = Message(role="user", contents=[Content.from_text(text="search for hotels")])
+    tool_call_turn = Message(
+        role="assistant",
+        contents=[
+            Content.from_text_reasoning(
+                text="I need to search for hotels",
+                additional_properties={"reasoning_id": "rs_test123", "status": "completed"},
+            ),
+            Content.from_function_call(
+                call_id="call_1",
+                name="search_hotels",
+                arguments='{"city": "Paris"}',
+                additional_properties={"fc_id": "fc_test456"},
+            ),
+        ],
+    )
+    tool_result_turn = Message(
+        role="tool",
+        contents=[
+            Content.from_function_result(
+                call_id="call_1",
+                result="Found 3 hotels in Paris",
+            ),
+        ],
+    )
+    closing_turn = Message(role="assistant", contents=[Content.from_text(text="I found hotels for you")])
+
+    conversation = [user_turn, tool_call_turn, tool_result_turn, closing_turn]
+    prepared = responses_client._prepare_messages_for_openai(conversation)
+
+    item_types = [entry.get("type") for entry in prepared]
+    assert "message" in item_types, "User/assistant messages should be present"
+    assert "reasoning" in item_types, "Reasoning items must be present"
+    assert "function_call" in item_types, "Function call items must be present"
+    assert "function_call_output" in item_types, "Function call output must be present"
+
+    # The first reasoning item carries the original reasoning id
+    first_reasoning = next(entry for entry in prepared if entry.get("type") == "reasoning")
+    assert first_reasoning["id"] == "rs_test123"
+
+    # The first function_call item carries the original function-call id
+    first_function_call = next(entry for entry in prepared if entry.get("type") == "function_call")
+    assert first_function_call["id"] == "fc_test456"
+
+    # Ordering: the reasoning item is emitted ahead of the function_call
+    assert item_types.index("reasoning") < item_types.index("function_call"), (
+        "Reasoning must come before function_call"
+    )
+
+
+def test_prepare_message_for_openai_filters_error_content() -> None:
    """Test that error content in messages is handled properly."""
    client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")