Python: Fix Eval samples (#4033)

* fix red team sample

* Updated self-reflection

* fix for workflow eval sample

* fix test
This commit is contained in:
Eduard van Valkenburg
2026-02-18 20:50:33 +01:00
committed by GitHub
Unverified
parent 6a39d5a652
commit aab80d9ed9
38 changed files with 536 additions and 2629 deletions
@@ -788,6 +788,9 @@ class RawOpenAIResponsesClient( # type: ignore[misc]
request_input = self._prepare_messages_for_openai(messages)
if not request_input:
raise ServiceInvalidRequestError("Messages are required for chat completions")
conversation_id = self._get_current_conversation_id(options, **kwargs)
run_options["input"] = request_input
# model id
@@ -911,8 +914,11 @@ class RawOpenAIResponsesClient( # type: ignore[misc]
for content in message.contents:
match content.type:
case "text_reasoning":
# Don't send reasoning content back to model
continue
# Reasoning items must be sent back as top-level input items
# for reasoning models that require them alongside function_calls
reasoning = self._prepare_content_for_openai(message.role, content, call_id_to_id) # type: ignore[arg-type]
if reasoning:
all_messages.append(reasoning)
case "function_result":
new_args: dict[str, Any] = {}
new_args.update(self._prepare_content_for_openai(message.role, content, call_id_to_id)) # type: ignore[arg-type]
@@ -967,6 +973,8 @@ class RawOpenAIResponsesClient( # type: ignore[misc]
}
props: dict[str, Any] | None = getattr(content, "additional_properties", None)
if props:
if reasoning_id := props.get("reasoning_id"):
ret["id"] = reasoning_id
if status := props.get("status"):
ret["status"] = status
if reasoning_text := props.get("reasoning_text"):
@@ -1184,22 +1192,29 @@ class RawOpenAIResponsesClient( # type: ignore[misc]
)
)
case "reasoning": # ResponseOutputReasoning
reasoning_id = getattr(item, "id", None)
if hasattr(item, "content") and item.content:
for index, reasoning_content in enumerate(item.content):
additional_properties = None
additional_properties: dict[str, Any] = {}
if reasoning_id:
additional_properties["reasoning_id"] = reasoning_id
if hasattr(item, "summary") and item.summary and index < len(item.summary):
additional_properties = {"summary": item.summary[index]}
additional_properties["summary"] = item.summary[index]
contents.append(
Content.from_text_reasoning(
text=reasoning_content.text,
raw_representation=reasoning_content,
additional_properties=additional_properties,
additional_properties=additional_properties or None,
)
)
if hasattr(item, "summary") and item.summary:
for summary in item.summary:
contents.append(
Content.from_text_reasoning(text=summary.text, raw_representation=summary) # type: ignore[arg-type]
Content.from_text_reasoning(
text=summary.text,
raw_representation=summary, # type: ignore[arg-type]
additional_properties={"reasoning_id": reasoning_id} if reasoning_id else None,
)
)
case "code_interpreter_call": # ResponseOutputCodeInterpreterCall
call_id = getattr(item, "call_id", None) or getattr(item, "id", None)
@@ -1413,16 +1428,40 @@ class RawOpenAIResponsesClient( # type: ignore[misc]
contents.append(Content.from_text(text=event.delta, raw_representation=event))
metadata.update(self._get_metadata_from_response(event))
case "response.reasoning_text.delta":
contents.append(Content.from_text_reasoning(text=event.delta, raw_representation=event))
contents.append(
Content.from_text_reasoning(
text=event.delta,
raw_representation=event,
additional_properties={"reasoning_id": event.item_id},
)
)
metadata.update(self._get_metadata_from_response(event))
case "response.reasoning_text.done":
contents.append(Content.from_text_reasoning(text=event.text, raw_representation=event))
contents.append(
Content.from_text_reasoning(
text=event.text,
raw_representation=event,
additional_properties={"reasoning_id": event.item_id},
)
)
metadata.update(self._get_metadata_from_response(event))
case "response.reasoning_summary_text.delta":
contents.append(Content.from_text_reasoning(text=event.delta, raw_representation=event))
contents.append(
Content.from_text_reasoning(
text=event.delta,
raw_representation=event,
additional_properties={"reasoning_id": event.item_id},
)
)
metadata.update(self._get_metadata_from_response(event))
case "response.reasoning_summary_text.done":
contents.append(Content.from_text_reasoning(text=event.text, raw_representation=event))
contents.append(
Content.from_text_reasoning(
text=event.text,
raw_representation=event,
additional_properties={"reasoning_id": event.item_id},
)
)
metadata.update(self._get_metadata_from_response(event))
case "response.code_interpreter_call_code.delta":
call_id = getattr(event, "call_id", None) or getattr(event, "id", None) or event.item_id
@@ -1593,20 +1632,23 @@ class RawOpenAIResponsesClient( # type: ignore[misc]
)
)
case "reasoning": # ResponseOutputReasoning
reasoning_id = getattr(event_item, "id", None)
if hasattr(event_item, "content") and event_item.content:
for index, reasoning_content in enumerate(event_item.content):
additional_properties = None
additional_properties: dict[str, Any] = {}
if reasoning_id:
additional_properties["reasoning_id"] = reasoning_id
if (
hasattr(event_item, "summary")
and event_item.summary
and index < len(event_item.summary)
):
additional_properties = {"summary": event_item.summary[index]}
additional_properties["summary"] = event_item.summary[index]
contents.append(
Content.from_text_reasoning(
text=reasoning_content.text,
raw_representation=reasoning_content,
additional_properties=additional_properties,
additional_properties=additional_properties or None,
)
)
case _:
@@ -129,9 +129,7 @@ def test_azure_assistants_client_init_validation_fail() -> None:
def test_azure_assistants_client_init_missing_deployment_name(azure_openai_unit_test_env: dict[str, str]) -> None:
"""Test AzureOpenAIAssistantsClient initialization with missing deployment name."""
with pytest.raises(ServiceInitializationError):
AzureOpenAIAssistantsClient(
api_key=azure_openai_unit_test_env.get("AZURE_OPENAI_API_KEY", "test-key")
)
AzureOpenAIAssistantsClient(api_key=azure_openai_unit_test_env.get("AZURE_OPENAI_API_KEY", "test-key"))
def test_azure_assistants_client_init_with_default_headers(azure_openai_unit_test_env: dict[str, str]) -> None:
@@ -94,15 +94,13 @@ def test_init_endpoint(azure_openai_unit_test_env: dict[str, str]) -> None:
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]], indirect=True)
def test_init_with_empty_deployment_name(azure_openai_unit_test_env: dict[str, str]) -> None:
with pytest.raises(ServiceInitializationError):
AzureOpenAIChatClient(
)
AzureOpenAIChatClient()
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_BASE_URL"]], indirect=True)
def test_init_with_empty_endpoint_and_base_url(azure_openai_unit_test_env: dict[str, str]) -> None:
with pytest.raises(ServiceInitializationError):
AzureOpenAIChatClient(
)
AzureOpenAIChatClient()
@pytest.mark.parametrize("override_env_param_dict", [{"AZURE_OPENAI_ENDPOINT": "http://test.com"}], indirect=True)
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft. All rights reserved.
import json
import logging
import os
from typing import Annotated, Any
from unittest.mock import MagicMock
@@ -30,6 +31,8 @@ skip_if_azure_integration_tests_disabled = pytest.mark.skipif(
else "Integration tests are disabled.",
)
logger = logging.getLogger(__name__)
class OutputStruct(BaseModel):
"""A structured output for testing purposes."""
@@ -111,8 +114,7 @@ def test_init_with_default_header(azure_openai_unit_test_env: dict[str, str]) ->
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME"]], indirect=True)
def test_init_with_empty_model_id(azure_openai_unit_test_env: dict[str, str]) -> None:
with pytest.raises(ServiceInitializationError):
AzureOpenAIResponsesClient(
)
AzureOpenAIResponsesClient()
def test_init_with_project_client(azure_openai_unit_test_env: dict[str, str]) -> None:
@@ -2700,3 +2700,74 @@ async def test_conversation_id_updated_in_options_between_tool_iterations():
assert conversation_ids_received[1] == "stream_conv_after_first", (
"streaming: conversation_id should be updated in options after receiving new conversation_id from API"
)
async def test_streaming_function_calling_response_includes_reasoning_and_tool_results(
    chat_client_base: SupportsChatGetResponse,
):
    """Check that a finalized streaming response keeps every item of a tool-calling turn.

    The fake client streams a reasoning item and a function call first, then a
    final text answer. The finalized response messages are expected to contain
    the reasoning, the function call, the function result and the final text,
    and the reasoning content must still carry its ``reasoning_id``.

    This matters for workflow chaining: when one agent's response is passed as
    input to the next agent, the conversation must include all of these items
    so the API can validate the history.
    """

    @tool(name="search", approval_mode="never_require")
    def search_func(query: str) -> str:
        return f"Found results for {query}"

    # First streamed response: a reasoning item followed by the function call.
    reasoning_update = ChatResponseUpdate(
        contents=[
            Content.from_text_reasoning(
                text="Let me search for that",
                additional_properties={"reasoning_id": "rs_test123", "status": "completed"},
            )
        ],
        role="assistant",
    )
    function_call_update = ChatResponseUpdate(
        contents=[
            Content.from_function_call(
                call_id="call_1",
                name="search",
                arguments='{"query": "test"}',
                additional_properties={"fc_id": "fc_test456"},
            )
        ],
        role="assistant",
    )
    # Second streamed response: the final text answer.
    final_text_update = ChatResponseUpdate(
        contents=[Content.from_text(text="Here are the results")],
        role="assistant",
    )
    chat_client_base.streaming_responses = [
        [reasoning_update, function_call_update],
        [final_text_update],
    ]

    request_options = {"tool_choice": "auto", "tools": [search_func]}
    stream = chat_client_base.get_response("search for test", options=request_options, stream=True)

    # Iterate the whole stream, then ask it for the finalized response.
    updates = [update async for update in stream]
    response = await stream.get_final_response()

    # Flatten the contents of every response message once and reuse the result below.
    flattened_contents = [content for message in response.messages for content in message.contents]
    all_content_types = [content.type for content in flattened_contents]

    # Every content type of the tool-calling turn has to show up in the response messages.
    required_types = (
        ("text_reasoning", "Reasoning must be preserved in response messages"),
        ("function_call", "Function call must be preserved in response messages"),
        ("function_result", "Function result must be in response messages for chaining"),
        ("text", "Final text must be in response messages"),
    )
    for content_type, failure_message in required_types:
        assert content_type in all_content_types, failure_message

    # The reasoning content must keep the reasoning_id it was streamed with.
    reasoning_contents = [content for content in flattened_contents if content.type == "text_reasoning"]
    assert len(reasoning_contents) >= 1
    first_reasoning = reasoning_contents[0]
    assert first_reasoning.additional_properties is not None
    assert first_reasoning.additional_properties.get("reasoning_id") == "rs_test123"
@@ -154,9 +154,7 @@ def test_init_validation_fail() -> None:
def test_init_missing_model_id(openai_unit_test_env: dict[str, str]) -> None:
"""Test OpenAIAssistantsClient initialization with missing model ID."""
with pytest.raises(ServiceInitializationError):
OpenAIAssistantsClient(
api_key=openai_unit_test_env.get("OPENAI_API_KEY", "test-key")
)
OpenAIAssistantsClient(api_key=openai_unit_test_env.get("OPENAI_API_KEY", "test-key"))
@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -97,8 +97,7 @@ def test_init_base_url_from_settings_env() -> None:
@pytest.mark.parametrize("exclude_list", [["OPENAI_CHAT_MODEL_ID"]], indirect=True)
def test_init_with_empty_model_id(openai_unit_test_env: dict[str, str]) -> None:
with pytest.raises(ServiceInitializationError):
OpenAIChatClient(
)
OpenAIChatClient()
@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -139,8 +139,7 @@ def test_init_with_default_header(openai_unit_test_env: dict[str, str]) -> None:
@pytest.mark.parametrize("exclude_list", [["OPENAI_RESPONSES_MODEL_ID"]], indirect=True)
def test_init_with_empty_model_id(openai_unit_test_env: dict[str, str]) -> None:
with pytest.raises(ServiceInitializationError):
OpenAIResponsesClient(
)
OpenAIResponsesClient()
@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
@@ -816,7 +815,101 @@ def test_prepare_message_for_openai_with_function_approval_response() -> None:
assert prepared_message["approve"] is True
def test_chat_message_with_error_content() -> None:
def test_prepare_message_for_openai_includes_reasoning_with_function_call() -> None:
    """Check that _prepare_message_for_openai emits reasoning next to function_call items.

    Reasoning models require the reasoning item to be present in the input
    whenever function_call items are included. Dropping it leads to a 400 error:
    "function_call was provided without its required reasoning item".
    """
    client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")

    # One assistant message holding a reasoning content followed by a function call.
    assistant_contents = [
        Content.from_text_reasoning(
            text="Let me analyze the request",
            additional_properties={"status": "completed", "reasoning_id": "rs_abc123"},
        ),
        Content.from_function_call(
            call_id="call_123",
            name="search_hotels",
            arguments='{"city": "Paris"}',
        ),
    ]
    assistant_message = Message(role="assistant", contents=assistant_contents)
    call_id_to_id: dict[str, str] = {}

    prepared_items = client._prepare_message_for_openai(assistant_message, call_id_to_id)

    # Both the reasoning and the function_call should be present as top-level items.
    item_types = [prepared["type"] for prepared in prepared_items]
    assert "reasoning" in item_types, "Reasoning items must be included for reasoning models"
    assert "function_call" in item_types

    # The serialized reasoning item keeps its text and its original id.
    reasoning_item = next(prepared for prepared in prepared_items if prepared["type"] == "reasoning")
    assert reasoning_item["summary"]["text"] == "Let me analyze the request"
    assert reasoning_item["id"] == "rs_abc123", "Reasoning id must be preserved for the API"
def test_prepare_messages_for_openai_full_conversation_with_reasoning() -> None:
    """Check that _prepare_messages_for_openai serializes a complete tool-calling conversation.

    The conversation contains a user request, an assistant turn with reasoning
    plus a function call, the tool result, and the final assistant text. This
    mirrors the history handed from one agent to the next in a workflow, where
    the API requires reasoning items alongside function_calls.
    """
    client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")

    user_request = Message(role="user", contents=[Content.from_text(text="search for hotels")])
    assistant_tool_turn = Message(
        role="assistant",
        contents=[
            Content.from_text_reasoning(
                text="I need to search for hotels",
                additional_properties={"reasoning_id": "rs_test123", "status": "completed"},
            ),
            Content.from_function_call(
                call_id="call_1",
                name="search_hotels",
                arguments='{"city": "Paris"}',
                additional_properties={"fc_id": "fc_test456"},
            ),
        ],
    )
    tool_output = Message(
        role="tool",
        contents=[
            Content.from_function_result(
                call_id="call_1",
                result="Found 3 hotels in Paris",
            ),
        ],
    )
    assistant_answer = Message(role="assistant", contents=[Content.from_text(text="I found hotels for you")])

    result = client._prepare_messages_for_openai(
        [user_request, assistant_tool_turn, tool_output, assistant_answer]
    )

    def items_of_type(item_type: str) -> list:
        """Return the serialized items whose "type" equals item_type, in output order."""
        return [item for item in result if item.get("type") == item_type]

    # Every kind of item in the conversation must survive serialization.
    types = [item.get("type") for item in result]
    expected_types = (
        ("message", "User/assistant messages should be present"),
        ("reasoning", "Reasoning items must be present"),
        ("function_call", "Function call items must be present"),
        ("function_call_output", "Function call output must be present"),
    )
    for item_type, failure_message in expected_types:
        assert item_type in types, failure_message

    # The reasoning item and the function_call item each keep their id.
    assert items_of_type("reasoning")[0]["id"] == "rs_test123"
    assert items_of_type("function_call")[0]["id"] == "fc_test456"

    # Ordering: the reasoning item is emitted ahead of the function_call.
    assert types.index("reasoning") < types.index("function_call"), "Reasoning must come before function_call"
def test_prepare_message_for_openai_filters_error_content() -> None:
"""Test that error content in messages is handled properly."""
client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")
@@ -44,9 +44,7 @@ def test_foundry_local_settings_missing_model_id(foundry_local_unit_test_env: di
def test_foundry_local_settings_explicit_overrides_env(foundry_local_unit_test_env: dict[str, str]) -> None:
"""Test that explicit values override environment variables."""
settings = load_settings(
FoundryLocalSettings, env_prefix="FOUNDRY_LOCAL_", model_id="override-model-id"
)
settings = load_settings(FoundryLocalSettings, env_prefix="FOUNDRY_LOCAL_", model_id="override-model-id")
assert settings["model_id"] == "override-model-id"
assert settings["model_id"] != foundry_local_unit_test_env["FOUNDRY_LOCAL_MODEL_ID"]