Python: Fix Eval samples (#4033)

* fix red team sample

* Updated self-reflection

* fix for workflow eval sample

* fix test
This commit is contained in:
Eduard van Valkenburg
2026-02-18 20:50:33 +01:00
committed by GitHub
Unverified
parent 6a39d5a652
commit aab80d9ed9
38 changed files with 536 additions and 2629 deletions
@@ -129,9 +129,7 @@ def test_azure_assistants_client_init_validation_fail() -> None:
# Constructing AzureOpenAIAssistantsClient with only an api_key must raise
# ServiceInitializationError.
# NOTE(review): the three-line and the one-line constructor calls below look like the
# before/after forms of one statement as rendered by a diff view - confirm against the real file.
def test_azure_assistants_client_init_missing_deployment_name(azure_openai_unit_test_env: dict[str, str]) -> None:
"""Test AzureOpenAIAssistantsClient initialization with missing deployment name."""
with pytest.raises(ServiceInitializationError):
AzureOpenAIAssistantsClient(
api_key=azure_openai_unit_test_env.get("AZURE_OPENAI_API_KEY", "test-key")
)
AzureOpenAIAssistantsClient(api_key=azure_openai_unit_test_env.get("AZURE_OPENAI_API_KEY", "test-key"))
def test_azure_assistants_client_init_with_default_headers(azure_openai_unit_test_env: dict[str, str]) -> None:
@@ -94,15 +94,13 @@ def test_init_endpoint(azure_openai_unit_test_env: dict[str, str]) -> None:
# Constructing AzureOpenAIChatClient with no arguments must raise ServiceInitializationError
# when AZURE_OPENAI_CHAT_DEPLOYMENT_NAME is passed as exclude_list.
# NOTE(review): presumably the fixture removes the listed variables from the test
# environment - confirm in conftest.
# NOTE(review): the split and the one-line constructor calls below look like the before/after
# forms of one statement as rendered by a diff view - confirm against the real file.
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]], indirect=True)
def test_init_with_empty_deployment_name(azure_openai_unit_test_env: dict[str, str]) -> None:
with pytest.raises(ServiceInitializationError):
AzureOpenAIChatClient(
)
AzureOpenAIChatClient()
# Constructing AzureOpenAIChatClient with no arguments must raise ServiceInitializationError
# when both AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_BASE_URL are passed as exclude_list.
# NOTE(review): presumably the fixture removes the listed variables from the test
# environment - confirm in conftest.
# NOTE(review): the split and the one-line constructor calls below look like the before/after
# forms of one statement as rendered by a diff view - confirm against the real file.
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_BASE_URL"]], indirect=True)
def test_init_with_empty_endpoint_and_base_url(azure_openai_unit_test_env: dict[str, str]) -> None:
with pytest.raises(ServiceInitializationError):
AzureOpenAIChatClient(
)
AzureOpenAIChatClient()
@pytest.mark.parametrize("override_env_param_dict", [{"AZURE_OPENAI_ENDPOINT": "http://test.com"}], indirect=True)
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft. All rights reserved.
import json
import logging
import os
from typing import Annotated, Any
from unittest.mock import MagicMock
@@ -30,6 +31,8 @@ skip_if_azure_integration_tests_disabled = pytest.mark.skipif(
else "Integration tests are disabled.",
)
logger = logging.getLogger(__name__)
class OutputStruct(BaseModel):
"""A structured output for testing purposes."""
@@ -111,8 +114,7 @@ def test_init_with_default_header(azure_openai_unit_test_env: dict[str, str]) ->
# Constructing AzureOpenAIResponsesClient with no arguments must raise
# ServiceInitializationError when AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME is passed as exclude_list.
# NOTE(review): presumably the fixture removes the listed variables from the test
# environment - confirm in conftest.
# NOTE(review): the split and the one-line constructor calls below look like the before/after
# forms of one statement as rendered by a diff view - confirm against the real file.
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME"]], indirect=True)
def test_init_with_empty_model_id(azure_openai_unit_test_env: dict[str, str]) -> None:
with pytest.raises(ServiceInitializationError):
AzureOpenAIResponsesClient(
)
AzureOpenAIResponsesClient()
def test_init_with_project_client(azure_openai_unit_test_env: dict[str, str]) -> None:
@@ -2700,3 +2700,74 @@ async def test_conversation_id_updated_in_options_between_tool_iterations():
assert conversation_ids_received[1] == "stream_conv_after_first", (
"streaming: conversation_id should be updated in options after receiving new conversation_id from API"
)
async def test_streaming_function_calling_response_includes_reasoning_and_tool_results(
    chat_client_base: SupportsChatGetResponse,
):
    """Check that a finalized streaming response keeps every item of a tool-calling turn.

    The final response messages must carry the reasoning, the function_call, the
    function_result and the closing text. Workflow chaining depends on this: one
    agent's response is passed as input to the next agent, and the API validates
    that history, so reasoning, function_call and function_call_output must all
    be present.
    """

    @tool(name="search", approval_mode="never_require")
    def search_func(query: str) -> str:
        return f"Found results for {query}"

    # First response: a reasoning item followed by the call to the search tool.
    reasoning_update = ChatResponseUpdate(
        contents=[
            Content.from_text_reasoning(
                text="Let me search for that",
                additional_properties={"reasoning_id": "rs_test123", "status": "completed"},
            )
        ],
        role="assistant",
    )
    tool_call_update = ChatResponseUpdate(
        contents=[
            Content.from_function_call(
                call_id="call_1",
                name="search",
                arguments='{"query": "test"}',
                additional_properties={"fc_id": "fc_test456"},
            )
        ],
        role="assistant",
    )
    # Second response: the final text.
    final_text_update = ChatResponseUpdate(
        contents=[Content.from_text(text="Here are the results")],
        role="assistant",
    )
    chat_client_base.streaming_responses = [
        [reasoning_update, tool_call_update],
        [final_text_update],
    ]

    request_options = {"tool_choice": "auto", "tools": [search_func]}
    stream = chat_client_base.get_response("search for test", options=request_options, stream=True)

    # Consume every update before asking the stream for its final response.
    async for _ in stream:
        pass
    response = await stream.get_final_response()

    # Flatten once; both the type checks and the reasoning lookup work off this list.
    response_contents = [content for message in response.messages for content in message.contents]
    observed_types = [content.type for content in response_contents]

    required_types = (
        ("text_reasoning", "Reasoning must be preserved in response messages"),
        ("function_call", "Function call must be preserved in response messages"),
        ("function_result", "Function result must be in response messages for chaining"),
        ("text", "Final text must be in response messages"),
    )
    for content_type, failure_message in required_types:
        assert content_type in observed_types, failure_message

    # The reasoning item has to keep the reasoning_id it was streamed with.
    reasoning_contents = [content for content in response_contents if content.type == "text_reasoning"]
    assert len(reasoning_contents) >= 1
    reasoning_properties = reasoning_contents[0].additional_properties
    assert reasoning_properties is not None
    assert reasoning_properties.get("reasoning_id") == "rs_test123"
@@ -154,9 +154,7 @@ def test_init_validation_fail() -> None:
# Constructing OpenAIAssistantsClient with only an api_key must raise ServiceInitializationError.
# NOTE(review): the three-line and the one-line constructor calls below look like the
# before/after forms of one statement as rendered by a diff view - confirm against the real file.
def test_init_missing_model_id(openai_unit_test_env: dict[str, str]) -> None:
"""Test OpenAIAssistantsClient initialization with missing model ID."""
with pytest.raises(ServiceInitializationError):
OpenAIAssistantsClient(
api_key=openai_unit_test_env.get("OPENAI_API_KEY", "test-key")
)
OpenAIAssistantsClient(api_key=openai_unit_test_env.get("OPENAI_API_KEY", "test-key"))
@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -97,8 +97,7 @@ def test_init_base_url_from_settings_env() -> None:
# Constructing OpenAIChatClient with no arguments must raise ServiceInitializationError
# when OPENAI_CHAT_MODEL_ID is passed as exclude_list.
# NOTE(review): presumably the fixture removes the listed variables from the test
# environment - confirm in conftest.
# NOTE(review): the split and the one-line constructor calls below look like the before/after
# forms of one statement as rendered by a diff view - confirm against the real file.
@pytest.mark.parametrize("exclude_list", [["OPENAI_CHAT_MODEL_ID"]], indirect=True)
def test_init_with_empty_model_id(openai_unit_test_env: dict[str, str]) -> None:
with pytest.raises(ServiceInitializationError):
OpenAIChatClient(
)
OpenAIChatClient()
@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -139,8 +139,7 @@ def test_init_with_default_header(openai_unit_test_env: dict[str, str]) -> None:
# Constructing OpenAIResponsesClient with no arguments must raise ServiceInitializationError
# when OPENAI_RESPONSES_MODEL_ID is passed as exclude_list.
# NOTE(review): presumably the fixture removes the listed variables from the test
# environment - confirm in conftest.
# NOTE(review): the split and the one-line constructor calls below look like the before/after
# forms of one statement as rendered by a diff view - confirm against the real file.
@pytest.mark.parametrize("exclude_list", [["OPENAI_RESPONSES_MODEL_ID"]], indirect=True)
def test_init_with_empty_model_id(openai_unit_test_env: dict[str, str]) -> None:
with pytest.raises(ServiceInitializationError):
OpenAIResponsesClient(
)
OpenAIResponsesClient()
@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -816,7 +815,101 @@ def test_prepare_message_for_openai_with_function_approval_response() -> None:
assert prepared_message["approve"] is True
def test_chat_message_with_error_content() -> None:
def test_prepare_message_for_openai_includes_reasoning_with_function_call() -> None:
    """Check that _prepare_message_for_openai emits reasoning next to function_call items.

    Reasoning models need the reasoning item in the input whenever a function_call
    item is included. If reasoning is stripped the API answers with a 400 error:
    "function_call was provided without its required reasoning item".
    """
    client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")

    assistant_message = Message(
        role="assistant",
        contents=[
            Content.from_text_reasoning(
                text="Let me analyze the request",
                additional_properties={"status": "completed", "reasoning_id": "rs_abc123"},
            ),
            Content.from_function_call(
                call_id="call_123",
                name="search_hotels",
                arguments='{"city": "Paris"}',
            ),
        ],
    )

    # The second argument is the call_id -> id mapping; it starts out empty here.
    prepared_items = client._prepare_message_for_openai(assistant_message, {})

    # Reasoning and function_call are both expected as top-level items.
    item_types = [item["type"] for item in prepared_items]
    assert "reasoning" in item_types, "Reasoning items must be included for reasoning models"
    assert "function_call" in item_types

    # Inspect the first reasoning item that was produced.
    reasoning_item = prepared_items[item_types.index("reasoning")]
    assert reasoning_item["summary"]["text"] == "Let me analyze the request"
    assert reasoning_item["id"] == "rs_abc123", "Reasoning id must be preserved for the API"
def test_prepare_messages_for_openai_full_conversation_with_reasoning() -> None:
    """Check that _prepare_messages_for_openai serializes a complete tool-calling conversation.

    The conversation holds reasoning + function_call + function_result + final text,
    which simulates the history handed from one agent to the next in a workflow.
    The API requires reasoning items alongside function_calls.
    """
    client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")

    user_request = Message(role="user", contents=[Content.from_text(text="search for hotels")])
    assistant_tool_turn = Message(
        role="assistant",
        contents=[
            Content.from_text_reasoning(
                text="I need to search for hotels",
                additional_properties={"reasoning_id": "rs_test123", "status": "completed"},
            ),
            Content.from_function_call(
                call_id="call_1",
                name="search_hotels",
                arguments='{"city": "Paris"}',
                additional_properties={"fc_id": "fc_test456"},
            ),
        ],
    )
    tool_output = Message(
        role="tool",
        contents=[
            Content.from_function_result(
                call_id="call_1",
                result="Found 3 hotels in Paris",
            ),
        ],
    )
    assistant_answer = Message(role="assistant", contents=[Content.from_text(text="I found hotels for you")])

    result = client._prepare_messages_for_openai(
        [user_request, assistant_tool_turn, tool_output, assistant_answer]
    )

    types = [item.get("type") for item in result]
    required_types = (
        ("message", "User/assistant messages should be present"),
        ("reasoning", "Reasoning items must be present"),
        ("function_call", "Function call items must be present"),
        ("function_call_output", "Function call output must be present"),
    )
    for item_type, failure_message in required_types:
        assert item_type in types, failure_message

    # Positions of the first reasoning and the first function_call item.
    reasoning_idx = types.index("reasoning")
    fc_idx = types.index("function_call")

    # The ids supplied through additional_properties must come back on the items.
    assert result[reasoning_idx]["id"] == "rs_test123"
    assert result[fc_idx]["id"] == "fc_test456"

    # Ordering matters: the reasoning item has to precede the function_call.
    assert reasoning_idx < fc_idx, "Reasoning must come before function_call"
def test_prepare_message_for_openai_filters_error_content() -> None:
"""Test that error content in messages is handled properly."""
client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")