Files
agent-framework/python/samples/demos/workflow_evaluation/run_evaluation.py
T
Eduard van Valkenburg 838a7fd61d Python: [BREAKING] Types API Review improvements (#3647)
* Replace Role and FinishReason classes with NewType + Literal

- Remove EnumLike metaclass from _types.py
- Replace Role class with NewType('Role', str) + RoleLiteral
- Replace FinishReason class with NewType('FinishReason', str) + FinishReasonLiteral
- Update all usages across codebase to use string literals
- Remove .value access patterns (direct string comparison now works)
- Add backward compatibility for legacy dict serialization format
- Update tests to reflect new string-based types

Addresses #3591, #3615

* Simplify ChatResponse and AgentResponse type hints (#3592)

- Remove overloads from ChatResponse.__init__
- Remove text parameter from ChatResponse.__init__
- Remove | dict[str, Any] from finish_reason and usage_details params
- Remove **kwargs from AgentResponse.__init__
- Both now accept ChatMessage | Sequence[ChatMessage] | None for messages
- Update docstrings and examples to reflect changes
- Fix tests that were using removed kwargs
- Fix Role type hint usage in ag-ui utils

* Remove text parameter from ChatResponseUpdate and AgentResponseUpdate (#3597)

- Remove text parameter from ChatResponseUpdate.__init__
- Remove text parameter from AgentResponseUpdate.__init__
- Remove **kwargs from both update classes
- Simplify contents parameter type to Sequence[Content] | None
- Update all usages to use contents=[Content.from_text(...)] pattern
- Fix imports in test files
- Update docstrings and examples

* Rename from_chat_response_updates to from_updates (#3593)

- ChatResponse.from_chat_response_updates → ChatResponse.from_updates
- ChatResponse.from_chat_response_generator → ChatResponse.from_update_generator
- AgentResponse.from_agent_run_response_updates → AgentResponse.from_updates

* Remove try_parse_value method from ChatResponse and AgentResponse (#3595)

- Remove try_parse_value method from ChatResponse
- Remove try_parse_value method from AgentResponse
- Remove try_parse_value calls from from_updates and from_update_generator methods
- Update samples to use try/except with response.value instead
- Update tests to use response.value pattern
- Users should now use response.value with try/except for safe parsing

* Add agent_id to AgentResponse and clarify author_name documentation (#3596)

- Add agent_id parameter to AgentResponse class
- Document that author_name is on ChatMessage objects, not responses
- Update ChatResponse docstring with author_name note
- Update AgentResponse docstring with author_name note

* Simplify ChatMessage.__init__ signature (#3618)

- Make contents a positional argument accepting Sequence[Content | str]
- Auto-convert strings in contents to TextContent
- Remove overloads, keep text kwarg for backward compatibility with serialization
- Update _parse_content_list to handle string items
- Update all usages across codebase to use new format: ChatMessage("role", ["text"])

* Allow Content as input on run and get_response

- Update prepare_messages and normalize_messages to accept Content
- Update type signatures in _agents.py and _clients.py
- Add tests for Content input handling

* Fix ChatMessage usage across packages and samples

Update all remaining ChatMessage(role=..., text=...) to use new
ChatMessage('role', ['text']) signature.

* Fix Role string usage and response format parsing

- Fix redis provider: remove .value access on string literals
- Fix durabletask ensure_response_format: set _response_format before accessing .value

* Fix ollama .value and ai_model_id issues, handle None in content list

- Fix ollama _chat_client: remove .value on string literals
- Fix ollama _chat_client: rename ai_model_id to model_id
- Fix _parse_content_list: skip None values gracefully

* Fix A2AAgent type signature to include Content

* Fix Role/FinishReason NewType dict annotations and improve test coverage to 95%

* Fix mypy errors for Role/FinishReason NewType usage

* Fix Role.TOOL and Role.ASSISTANT usage in _orchestrator_helpers.py

* Fix Role NewType usage in durabletask _models.py
2026-02-04 10:13:23 +00:00

220 lines
7.0 KiB
Python

# Copyright (c) Microsoft. All rights reserved.
"""
Script to run the multi-agent travel planning workflow and evaluate agent responses.
This script:
1. Executes the multi-agent workflow
2. Displays response data summary
3. Fetches and previews the final response of each selected agent
4. Creates an evaluation with multiple evaluators
5. Runs the evaluation on the selected agent responses
6. Monitors evaluation progress and displays results
"""
import asyncio
import os
import time
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
from create_workflow import create_and_run_workflow
from dotenv import load_dotenv
def print_section(title: str):
    """Print *title* framed above and below by an 80-character ``=`` rule.

    A blank line is emitted before the top rule so consecutive sections are
    visually separated.
    """
    rule = "=" * 80
    for line in (f"\n{rule}", f"{title}", rule):
        print(line)
async def run_workflow():
    """Run the multi-agent travel planning workflow and hand back its data.

    Returns:
        Whatever ``create_and_run_workflow`` produces; the rest of this script
        treats it as a dictionary of workflow data with agent response IDs.
    """
    print_section("Step 1: Running Workflow")
    for notice in (
        "Executing multi-agent travel planning workflow...",
        "This may take a few minutes...",
    ):
        print(notice)

    result = await create_and_run_workflow()

    print("Workflow execution completed")
    return result
def display_response_summary(workflow_data: dict):
    """Print the query and a per-agent response count.

    Reads ``workflow_data["query"]`` and ``workflow_data["agents"]``, where
    each agent entry provides a ``"response_count"`` value.
    """
    print_section("Step 2: Response Data Summary")
    print(f"Query: {workflow_data['query']}")

    tracked = workflow_data["agents"]
    print(f"\nAgents tracked: {len(tracked)}")
    for name in tracked:
        count = tracked[name]["response_count"]
        print(f" {name}: {count} response(s)")
def fetch_agent_responses(openai_client, workflow_data: dict, agent_names: list):
    """Retrieve and preview the last recorded response of each listed agent.

    Agents that are not present in ``workflow_data["agents"]`` or that have no
    response IDs are skipped silently. Retrieval problems are reported on
    stdout and do not stop the loop.
    """
    print_section("Step 3: Fetching Agent Responses")

    preview_limit = 300
    for name in agent_names:
        tracked = workflow_data["agents"]
        if name not in tracked or not tracked[name]["response_ids"]:
            continue

        last_id = tracked[name]["response_ids"][-1]
        print(f"\n{name}")
        print(f" Response ID: {last_id}")

        # Best-effort preview: any failure is printed rather than raised.
        try:
            retrieved = openai_client.responses.retrieve(response_id=last_id)
            text = retrieved.output[-1].content[-1].text
            if len(text) > preview_limit:
                preview = text[:preview_limit] + "..."
            else:
                preview = text
            print(f" Content preview: {preview}")
        except Exception as err:
            print(f" Error: {err}")
def create_evaluation(openai_client, model_deployment: str):
    """Create an evaluation that applies four built-in evaluators.

    Every evaluator is initialised with ``model_deployment`` as its
    ``deployment_name``.

    Returns:
        The object returned by ``openai_client.evals.create``.
    """
    print_section("Step 4: Creating Evaluation")

    # All criteria share one shape; only the evaluator name varies.
    criteria = [
        {
            "type": "azure_ai_evaluator",
            "name": evaluator,
            "evaluator_name": f"builtin.{evaluator}",
            "initialization_parameters": {"deployment_name": model_deployment},
        }
        for evaluator in (
            "relevance",
            "groundedness",
            "tool_call_accuracy",
            "tool_output_utilization",
        )
    ]

    created = openai_client.evals.create(
        name="Travel Workflow Multi-Evaluator Assessment",
        data_source_config={"type": "azure_ai_source", "scenario": "responses"},
        testing_criteria=criteria,
    )

    labels = [entry["name"] for entry in criteria]
    print(f"Evaluation created: {created.id}")
    print(f"Evaluators ({len(labels)}): {', '.join(labels)}")
    return created
def run_evaluation(openai_client, eval_object, workflow_data: dict, agent_names: list):
    """Start an evaluation run over the last response of each listed agent.

    Agents missing from ``workflow_data["agents"]`` or without response IDs
    contribute nothing to the run.

    Returns:
        The object returned by ``openai_client.evals.runs.create``.
    """
    print_section("Step 5: Running Evaluation")

    tracked = workflow_data["agents"]
    chosen_ids = [
        tracked[name]["response_ids"][-1]
        for name in agent_names
        if name in tracked and tracked[name]["response_ids"]
    ]
    print(f"Selected {len(chosen_ids)} responses for evaluation")

    # Each selected ID becomes one inline item; the mapping template tells the
    # service which item field carries the response ID.
    items = [{"item": {"resp_id": resp_id}} for resp_id in chosen_ids]
    run_source = {
        "type": "azure_ai_responses",
        "item_generation_params": {
            "type": "response_retrieval",
            "data_mapping": {"response_id": "{{item.resp_id}}"},
            "source": {"type": "file_content", "content": items},
        },
    }

    started = openai_client.evals.runs.create(
        eval_id=eval_object.id,
        name="Multi-Agent Response Evaluation",
        data_source=run_source,
    )
    print(f"Evaluation run created: {started.id}")
    return started
def monitor_evaluation(openai_client, eval_object, eval_run, poll_interval: float = 5):
    """Poll an evaluation run until it reaches a terminal status, then report it.

    Args:
        openai_client: Client exposing ``evals.runs.retrieve``.
        eval_object: Evaluation that owns the run; only its ``id`` is read.
        eval_run: Run to monitor; its ``id`` and ``status`` are read.
        poll_interval: Seconds to wait between status checks (default 5).
    """
    print_section("Step 6: Monitoring Evaluation")
    print("Waiting for evaluation to complete...")

    # "canceled" must count as terminal as well: a run that is canceled never
    # becomes "completed" or "failed", so without it this loop would never end.
    terminal_statuses = ("completed", "failed", "canceled")

    while eval_run.status not in terminal_statuses:
        eval_run = openai_client.evals.runs.retrieve(
            run_id=eval_run.id,
            eval_id=eval_object.id,
        )
        print(f"Status: {eval_run.status}")
        # Wait only when another poll is actually needed, so the final result
        # is reported as soon as it is observed.
        if eval_run.status not in terminal_statuses:
            time.sleep(poll_interval)

    if eval_run.status == "completed":
        print("\nEvaluation completed successfully")
        print(f"Result counts: {eval_run.result_counts}")
        print(f"\nReport URL: {eval_run.report_url}")
    else:
        # Prints "Evaluation failed" for a failed run and names the actual
        # status for any other non-successful terminal state.
        print(f"\nEvaluation {eval_run.status}")
async def main():
    """Run the workflow, then create, execute and monitor its evaluation.

    Raises:
        KeyError: If ``AZURE_AI_PROJECT_ENDPOINT`` is not set in the
            environment (after loading ``.env``).
    """
    load_dotenv()
    print("Travel Planning Workflow Evaluation")

    # Resolve configuration before the workflow starts: the workflow can take
    # minutes, so a missing endpoint should fail here rather than afterwards.
    endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"]
    model_deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o-mini")

    workflow_data = await run_workflow()
    display_response_summary(workflow_data)

    agents_to_evaluate = ["hotel-search-agent", "flight-search-agent", "activity-search-agent"]

    # Context managers ensure the credential and both clients are closed even
    # when an evaluation step raises.
    with DefaultAzureCredential() as credential, AIProjectClient(
        endpoint=endpoint,
        credential=credential,
        api_version="2025-11-15-preview",
    ) as project_client, project_client.get_openai_client() as openai_client:
        fetch_agent_responses(openai_client, workflow_data, agents_to_evaluate)
        eval_object = create_evaluation(openai_client, model_deployment)
        eval_run = run_evaluation(openai_client, eval_object, workflow_data, agents_to_evaluate)
        monitor_evaluation(openai_client, eval_object, eval_run)

    print_section("Complete")
if __name__ == "__main__":
    # Script entry point: drive the async pipeline on a fresh event loop.
    pipeline = main()
    asyncio.run(pipeline)