mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
b05fc9e849
* Initial plan * Add load_dotenv() to 303 Python samples for environment variable loading Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Update SAMPLE_GUIDELINES.md to document load_dotenv() requirement Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Update samples README.md to document .env file usage Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Run ruff format on all changed sample files Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Clarify load_dotenv() usage in README - local dev vs production Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Remove deprecated getting_started folder as requested Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Document env_file_path parameter for per-client configuration Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Merge main branch to resolve conflicts Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Fix run_evaluation.py file that was empty in merge commit Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Remove dotnet changes from merge - out of scope for this PR Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Remove package and test changes from merge - only sample changes needed Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Remove test_func_utils.py - only sample changes needed Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Revert sample files not in original changeset - keep only load_dotenv additions Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Move load_dotenv() outside snippet tag in 06_host_your_agent.py Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Fix comment placement - move load_dotenv before code comments 
Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Fix load_dotenv() placement across all samples - after docstring, before code comments Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Merge latest main branch with load_dotenv changes Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Remove non-sample changes from merge - keep only load_dotenv additions Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Revert non-load_dotenv sample changes from merge Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Fix run_evaluation.py - use main's improved version (file already had load_dotenv) Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> * Manual update * Manual update 2 * Fix Role usage and load_dotenv placement per PR review feedback Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com> * Fix Role usage - use string literals not enum attributes Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com> * Fix SAMPLE_GUIDELINES.md example - load_dotenv before docstring per guidance Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com> * Move load_dotenv() before docstrings in all samples per SAMPLE_GUIDELINES ordering Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com> * Address PR review: rename files, fix placement, add session usage, remove note Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com> * Update Redis README to reference renamed file redis_history_provider.py Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com> Co-authored-by: Tao Chen <taochen@microsoft.com> 
Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com> Co-authored-by: Eduard van Valkenburg <eavanvalkenburg@users.noreply.github.com>
241 lines
8.2 KiB
Python
241 lines
8.2 KiB
Python
# Copyright (c) Microsoft. All rights reserved.
|
|
# type: ignore
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import os
|
|
import time
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
from azure.ai.projects import AIProjectClient
|
|
from azure.identity import DefaultAzureCredential
|
|
from create_workflow import create_and_run_workflow
|
|
from dotenv import load_dotenv
|
|
|
|
if TYPE_CHECKING:
|
|
from openai import OpenAI
|
|
from openai.types import EvalCreateResponse
|
|
from openai.types.evals import RunCreateResponse
|
|
|
|
"""
|
|
Script to run multi-agent travel planning workflow and evaluate agent responses.
|
|
|
|
This script:
|
|
1. Runs the multi-agent travel planning workflow
|
|
2. Displays a summary of tracked agent responses
|
|
3. Fetches and previews final agent responses
|
|
4. Creates an evaluation with multiple evaluators
|
|
5. Runs the evaluation on selected agent responses
|
|
6. Monitors evaluation progress and displays results
|
|
"""
|
|
|
|
|
|
def create_openai_client() -> OpenAI:
    """Build an OpenAI client bound to the configured Azure AI project.

    The project endpoint is read from the ``AZURE_AI_PROJECT_ENDPOINT``
    environment variable and authentication uses ``DefaultAzureCredential``.

    Returns:
        The client returned by ``AIProjectClient.get_openai_client()``.

    Raises:
        KeyError: If ``AZURE_AI_PROJECT_ENDPOINT`` is not set.
    """
    endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"]
    credential = DefaultAzureCredential()
    return AIProjectClient(endpoint=endpoint, credential=credential).get_openai_client()
|
|
|
|
|
|
def print_section(title: str):
    """Print ``title`` framed by two 80-character ``=`` rules.

    A blank line is emitted before the top rule so consecutive sections are
    visually separated.

    Args:
        title: Text shown between the two rules.
    """
    rule = "=" * 80
    print(f"\n{rule}\n{title}\n{rule}")
|
|
|
|
|
|
async def run_workflow(deployment_name: str | None = None) -> dict[str, Any]:
    """Run the multi-agent travel planning workflow and report progress.

    Progress messages are printed before and after the workflow is awaited.

    Args:
        deployment_name: Optional model deployment name forwarded to
            ``create_and_run_workflow`` for the workflow agents.

    Returns:
        Whatever ``create_and_run_workflow`` returns; the rest of this script
        reads its ``"query"`` and ``"agents"`` entries.
    """
    print(
        "Executing multi-agent travel planning workflow...",
        "This may take a few minutes...",
        sep="\n",
    )

    result = await create_and_run_workflow(deployment_name=deployment_name)
    print("Workflow execution completed")
    return result
|
|
|
|
|
|
def display_response_summary(workflow_data: dict) -> None:
    """Print the query and a per-agent response count.

    Args:
        workflow_data: Mapping with a ``"query"`` entry and an ``"agents"``
            mapping of agent name to a dict holding ``"response_count"``.
    """
    print(f"Query: {workflow_data['query']}")

    tracked = workflow_data["agents"]
    print(f"\nAgents tracked: {len(tracked)}")

    for name, info in tracked.items():
        print(f" {name}: {info['response_count']} response(s)")
|
|
|
|
|
|
def fetch_agent_responses(openai_client: OpenAI, workflow_data: dict[str, Any], agent_names: list[str]) -> None:
    """Print a preview of the last recorded response of each requested agent.

    Agents that are absent from ``workflow_data["agents"]`` or that have no
    response IDs are skipped silently. Any exception raised while retrieving
    or reading a response is printed and does not stop the loop, because this
    step is only a best-effort preview.

    Args:
        openai_client: Client whose ``responses.retrieve`` is used to fetch
            each response by ID.
        workflow_data: Mapping with an ``"agents"`` entry of agent name to a
            dict holding a ``"response_ids"`` list.
        agent_names: Names of the agents to preview, in display order.
    """
    for name in agent_names:
        tracked = workflow_data["agents"]
        response_ids = tracked[name]["response_ids"] if name in tracked else None
        if not response_ids:
            continue

        # Only the most recent response of each agent is previewed.
        latest_id = response_ids[-1]
        print(f"\n{name}")
        print(f" Response ID: {latest_id}")

        try:
            retrieved = openai_client.responses.retrieve(response_id=latest_id)
            text = retrieved.output[-1].content[-1].text
            # Previews are capped at 300 characters.
            preview = text if len(text) <= 300 else text[:300] + "..."
            print(f" Content preview: {preview}")
        except Exception as exc:
            print(f" Error: {exc}")
|
|
|
|
|
|
def create_evaluation(openai_client: OpenAI, deployment_name: str | None = "gpt-5.2") -> EvalCreateResponse:
    """Create an evaluation that scores responses with four built-in evaluators.

    The evaluators are relevance, groundedness, tool call accuracy and tool
    output utilization; each is initialized with the same model deployment.

    Args:
        openai_client: Client whose ``evals.create`` is called.
        deployment_name: Model deployment used by the evaluators. Note that
            the ``AZURE_AI_MODEL_DEPLOYMENT_NAME`` environment variable, when
            set, takes precedence over this argument.

    Returns:
        The object returned by ``openai_client.evals.create``.
    """
    # The environment variable wins over the argument when it is present.
    judge_model = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", deployment_name)

    builtin_evaluators = (
        "relevance",
        "groundedness",
        "tool_call_accuracy",
        "tool_output_utilization",
    )
    criteria = [
        {
            "type": "azure_ai_evaluator",
            "name": evaluator,
            "evaluator_name": f"builtin.{evaluator}",
            "initialization_parameters": {"deployment_name": judge_model},
        }
        for evaluator in builtin_evaluators
    ]

    created = openai_client.evals.create(
        name="Travel Workflow Multi-Evaluator Assessment",
        data_source_config={"type": "azure_ai_source", "scenario": "responses"},
        testing_criteria=criteria,
    )

    print(f"Evaluation created: {created.id}")
    print(f"Evaluators ({len(criteria)}): {', '.join(entry['name'] for entry in criteria)}")
    return created
|
|
|
|
|
|
def run_evaluation(
    openai_client: OpenAI, eval_object: EvalCreateResponse, workflow_data: dict[str, Any], agent_names: list[str]
) -> RunCreateResponse:
    """Start an evaluation run over the last response of each selected agent.

    Agents missing from ``workflow_data["agents"]`` or without any response
    IDs contribute nothing to the run.

    Args:
        openai_client: Client whose ``evals.runs.create`` is called.
        eval_object: Previously created evaluation; only its ``id`` is used.
        workflow_data: Mapping with an ``"agents"`` entry of agent name to a
            dict holding a ``"response_ids"`` list.
        agent_names: Names of the agents whose final response is evaluated.

    Returns:
        The object returned by ``openai_client.evals.runs.create``.
    """
    chosen_ids = [
        workflow_data["agents"][name]["response_ids"][-1]
        for name in agent_names
        if name in workflow_data["agents"] and workflow_data["agents"][name]["response_ids"]
    ]
    print(f"Selected {len(chosen_ids)} responses for evaluation")

    # Each response ID is wrapped as an inline item; the data mapping template
    # below refers to it as ``item.resp_id``.
    inline_source = {
        "type": "file_content",
        "content": [{"item": {"resp_id": response_id}} for response_id in chosen_ids],
    }
    generation_params = {
        "type": "response_retrieval",
        "data_mapping": {"response_id": "{{item.resp_id}}"},
        "source": inline_source,
    }
    run_source = {"type": "azure_ai_responses", "item_generation_params": generation_params}

    started_run = openai_client.evals.runs.create(
        eval_id=eval_object.id,
        name="Multi-Agent Response Evaluation",
        data_source=run_source,
    )
    print(f"Evaluation run created: {started_run.id}")
    return started_run
|
|
|
|
|
|
def monitor_evaluation(
    openai_client: OpenAI,
    eval_object: EvalCreateResponse,
    eval_run: RunCreateResponse,
    *,
    poll_interval: float = 5.0,
) -> None:
    """Poll an evaluation run until it reaches a terminal status, then report it.

    The run is re-fetched and its status printed on every iteration. Polling
    stops as soon as the status is ``"completed"``, ``"failed"`` or
    ``"canceled"``; treating ``"canceled"`` as terminal prevents an endless
    loop when a run is canceled. No sleep happens after a terminal status has
    been observed.

    Args:
        openai_client: Client whose ``evals.runs.retrieve`` is used to refresh
            the run.
        eval_object: Evaluation the run belongs to; only its ``id`` is used.
        eval_run: Run to monitor; its ``id`` and ``status`` are read.
        poll_interval: Seconds to wait between two status checks.
    """
    terminal_statuses = ("completed", "failed", "canceled")

    print("Waiting for evaluation to complete...")

    while eval_run.status not in terminal_statuses:
        eval_run = openai_client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id)
        print(f"Status: {eval_run.status}")
        if eval_run.status in terminal_statuses:
            # Finished: do not delay the report by one more interval.
            break
        # NOTE: this is a blocking sleep; the function is synchronous.
        time.sleep(poll_interval)

    if eval_run.status == "completed":
        print("\nEvaluation completed successfully")
        print(f"Result counts: {eval_run.result_counts}")
        print(f"\nReport URL: {eval_run.report_url}")
    elif eval_run.status == "failed":
        print("\nEvaluation failed")
    else:
        # Any other terminal status (e.g. canceled) is reported as-is instead
        # of being mislabeled as a failure.
        print(f"\nEvaluation ended with status: {eval_run.status}")
|
|
|
|
|
|
async def main():
    """Run the sample end to end.

    Loads environment variables from a ``.env`` file, runs the travel planning
    workflow, previews the final responses of the selected agents, then
    creates, runs and monitors an evaluation over those responses.
    """
    load_dotenv()
    client = create_openai_client()

    # Deployment names come from the environment, with defaults as fallback.
    model_for_workflow = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME_WORKFLOW", "gpt-4.1-nano")
    model_for_eval = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME_EVAL", "gpt-5.2")

    # Focus on these agents, uncomment other ones you want to have evals run on
    selected_agents = [
        "hotel-search-agent",
        "flight-search-agent",
        "activity-search-agent",
        # "booking-payment-agent",
        # "booking-info-aggregation-agent",
        # "travel-request-handler",
        # "booking-confirmation-agent",
    ]

    print_section("Travel Planning Workflow Evaluation")

    print_section("Step 1: Running Workflow")
    results = await run_workflow(deployment_name=model_for_workflow)

    print_section("Step 2: Response Data Summary")
    display_response_summary(results)

    print_section("Step 3: Fetching Agent Responses")
    fetch_agent_responses(client, results, selected_agents)

    print_section("Step 4: Creating Evaluation")
    evaluation = create_evaluation(client, deployment_name=model_for_eval)

    print_section("Step 5: Running Evaluation")
    started_run = run_evaluation(client, evaluation, results, selected_agents)

    print_section("Step 6: Monitoring Evaluation")
    monitor_evaluation(client, evaluation, started_run)

    print_section("Complete")
|
|
|
|
|
|
# Script entry point: only run the sample when executed directly, not on import.
if __name__ == "__main__":
    asyncio.run(main())
|