Files
Copilot b05fc9e849 Python: Add load_dotenv() to samples for .env file support (#4043)
* Initial plan

* Add load_dotenv() to 303 Python samples for environment variable loading

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Update SAMPLE_GUIDELINES.md to document load_dotenv() requirement

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Update samples README.md to document .env file usage

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Run ruff format on all changed sample files

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Clarify load_dotenv() usage in README - local dev vs production

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Remove deprecated getting_started folder as requested

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Document env_file_path parameter for per-client configuration

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Merge main branch to resolve conflicts

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Fix run_evaluation.py file that was empty in merge commit

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Remove dotnet changes from merge - out of scope for this PR

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Remove package and test changes from merge - only sample changes needed

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Remove test_func_utils.py - only sample changes needed

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Revert sample files not in original changeset - keep only load_dotenv additions

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Move load_dotenv() outside snippet tag in 06_host_your_agent.py

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Fix comment placement - move load_dotenv before code comments

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Fix load_dotenv() placement across all samples - after docstring, before code comments

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Merge latest main branch with load_dotenv changes

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Remove non-sample changes from merge - keep only load_dotenv additions

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Revert non-load_dotenv sample changes from merge

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Fix run_evaluation.py - use main's improved version (file already had load_dotenv)

Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>

* Manual update

* Manual update 2

* Fix Role usage and load_dotenv placement per PR review feedback

Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com>

* Fix Role usage - use string literals not enum attributes

Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com>

* Fix SAMPLE_GUIDELINES.md example - load_dotenv before docstring per guidance

Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com>

* Move load_dotenv() before docstrings in all samples per SAMPLE_GUIDELINES ordering

Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com>

* Address PR review: rename files, fix placement, add session usage, remove note

Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com>

* Update Redis README to reference renamed file redis_history_provider.py

Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: TaoChenOSU <12570346+TaoChenOSU@users.noreply.github.com>
Co-authored-by: Tao Chen <taochen@microsoft.com>
Co-authored-by: eavanvalkenburg <13749212+eavanvalkenburg@users.noreply.github.com>
Co-authored-by: Eduard van Valkenburg <eavanvalkenburg@users.noreply.github.com>
2026-02-19 10:55:13 +00:00

241 lines
8.2 KiB
Python

# Copyright (c) Microsoft. All rights reserved.
# type: ignore
from __future__ import annotations
import asyncio
import os
import time
from typing import TYPE_CHECKING, Any
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
from create_workflow import create_and_run_workflow
from dotenv import load_dotenv
if TYPE_CHECKING:
from openai import OpenAI
from openai.types import EvalCreateResponse
from openai.types.evals import RunCreateResponse
"""
Script to run multi-agent travel planning workflow and evaluate agent responses.
This script:
1. Runs the multi-agent travel planning workflow
2. Displays a summary of tracked agent responses
3. Fetches and previews final agent responses
4. Creates an evaluation with multiple evaluators
5. Runs the evaluation on selected agent responses
6. Monitors evaluation progress and displays results
"""
def create_openai_client() -> OpenAI:
    """Build an OpenAI-compatible client through the Azure AI project.

    The project endpoint is read from the ``AZURE_AI_PROJECT_ENDPOINT``
    environment variable and the project client is authenticated with
    ``DefaultAzureCredential``.

    Returns:
        The client produced by ``AIProjectClient.get_openai_client()``.

    Raises:
        KeyError: If ``AZURE_AI_PROJECT_ENDPOINT`` is not set.
    """
    endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"]
    credential = DefaultAzureCredential()
    azure_project = AIProjectClient(endpoint=endpoint, credential=credential)
    return azure_project.get_openai_client()
def print_section(title: str):
    """Print ``title`` framed above and below by an 80-character ``=`` rule.

    A blank line is emitted before the top rule so consecutive sections are
    visually separated.

    Args:
        title: Text shown between the two rules.
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print(f"{title}")
    print(rule)
async def run_workflow(deployment_name: str | None = None) -> dict[str, Any]:
    """Run the multi-agent travel planning workflow and hand back its data.

    Progress messages are printed before and after the workflow runs.

    Args:
        deployment_name: Optional model deployment name, forwarded unchanged
            to ``create_and_run_workflow`` for the workflow agents.

    Returns:
        The workflow data returned by ``create_and_run_workflow`` (a
        dictionary that includes the agents' response IDs).
    """
    for notice in ("Executing multi-agent travel planning workflow...", "This may take a few minutes..."):
        print(notice)

    result = await create_and_run_workflow(deployment_name=deployment_name)

    print("Workflow execution completed")
    return result
def display_response_summary(workflow_data: dict) -> None:
    """Print the workflow query followed by a per-agent response count.

    Args:
        workflow_data: Mapping with a ``"query"`` entry and an ``"agents"``
            mapping from agent name to a dict containing
            ``"response_count"``.
    """
    print(f"Query: {workflow_data['query']}")

    tracked_agents = workflow_data["agents"]
    print(f"\nAgents tracked: {len(tracked_agents)}")
    for name, details in tracked_agents.items():
        print(f" {name}: {details['response_count']} response(s)")
def fetch_agent_responses(openai_client: OpenAI, workflow_data: dict[str, Any], agent_names: list[str]) -> None:
    """Retrieve and preview the last recorded response of each listed agent.

    Agents that are absent from ``workflow_data["agents"]`` or have no
    recorded response IDs are skipped silently. Retrieval is best-effort: any
    exception raised while fetching or unpacking a response is printed and
    the loop moves on to the next agent.

    Args:
        openai_client: Client whose ``responses.retrieve`` is used to fetch a
            response by ID.
        workflow_data: Workflow data holding an ``"agents"`` mapping from
            agent name to a dict with a ``"response_ids"`` list.
        agent_names: Names of the agents to preview, in display order.
    """
    for name in agent_names:
        tracked = workflow_data["agents"]
        if name in tracked and tracked[name]["response_ids"]:
            last_response_id = tracked[name]["response_ids"][-1]

            print(f"\n{name}")
            print(f" Response ID: {last_response_id}")

            try:
                reply = openai_client.responses.retrieve(response_id=last_response_id)
                # Preview the text of the last content part of the last output item.
                text = reply.output[-1].content[-1].text
                if len(text) > 300:
                    preview = text[:300] + "..."
                else:
                    preview = text
                print(f" Content preview: {preview}")
            except Exception as err:
                print(f" Error: {err}")
def create_evaluation(openai_client: OpenAI, deployment_name: str | None = "gpt-5.2") -> EvalCreateResponse:
    """Create an evaluation that scores responses with four built-in evaluators.

    The evaluators are relevance, groundedness, tool call accuracy and tool
    output utilization; each is initialised with the same model deployment.

    Args:
        openai_client: Client whose ``evals.create`` is called.
        deployment_name: Model deployment used by the evaluators. NOTE: when
            the ``AZURE_AI_MODEL_DEPLOYMENT_NAME`` environment variable is
            set, its value takes precedence over this argument.

    Returns:
        The evaluation object returned by ``openai_client.evals.create``.
    """
    # The environment variable, if present, overrides the caller's argument.
    deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", deployment_name)

    # Display name -> built-in evaluator id, in the order they are registered.
    evaluators = {
        "relevance": "builtin.relevance",
        "groundedness": "builtin.groundedness",
        "tool_call_accuracy": "builtin.tool_call_accuracy",
        "tool_output_utilization": "builtin.tool_output_utilization",
    }
    testing_criteria = [
        {
            "type": "azure_ai_evaluator",
            "name": display_name,
            "evaluator_name": evaluator_id,
            "initialization_parameters": {"deployment_name": deployment_name},
        }
        for display_name, evaluator_id in evaluators.items()
    ]

    eval_object = openai_client.evals.create(
        name="Travel Workflow Multi-Evaluator Assessment",
        data_source_config={"type": "azure_ai_source", "scenario": "responses"},
        testing_criteria=testing_criteria,
    )

    print(f"Evaluation created: {eval_object.id}")
    print(f"Evaluators ({len(evaluators)}): {', '.join(evaluators)}")
    return eval_object
def run_evaluation(
    openai_client: OpenAI, eval_object: EvalCreateResponse, workflow_data: dict[str, Any], agent_names: list[str]
) -> RunCreateResponse:
    """Start an evaluation run over the last response of each selected agent.

    Agents missing from ``workflow_data["agents"]`` or without any recorded
    response IDs contribute nothing to the run.

    Args:
        openai_client: Client whose ``evals.runs.create`` is called.
        eval_object: Evaluation to run; only its ``id`` is used.
        workflow_data: Workflow data holding an ``"agents"`` mapping from
            agent name to a dict with a ``"response_ids"`` list.
        agent_names: Names of the agents whose final responses are evaluated.

    Returns:
        The run object returned by ``openai_client.evals.runs.create``.
    """
    selected_response_ids = [
        workflow_data["agents"][name]["response_ids"][-1]
        for name in agent_names
        if name in workflow_data["agents"] and workflow_data["agents"][name]["response_ids"]
    ]
    print(f"Selected {len(selected_response_ids)} responses for evaluation")

    # Each item carries one response ID; the data mapping tells the service
    # which item field to use as the response ID to retrieve.
    items = [{"item": {"resp_id": resp_id}} for resp_id in selected_response_ids]
    generation_params = {
        "type": "response_retrieval",
        "data_mapping": {"response_id": "{{item.resp_id}}"},
        "source": {"type": "file_content", "content": items},
    }
    data_source = {"type": "azure_ai_responses", "item_generation_params": generation_params}

    eval_run = openai_client.evals.runs.create(
        eval_id=eval_object.id,
        name="Multi-Agent Response Evaluation",
        data_source=data_source,
    )
    print(f"Evaluation run created: {eval_run.id}")
    return eval_run
def monitor_evaluation(
    openai_client: OpenAI,
    eval_object: EvalCreateResponse,
    eval_run: RunCreateResponse,
    *,
    poll_interval_seconds: float = 5.0,
):
    """Poll an evaluation run until it reaches a terminal status, then report.

    The run is re-fetched and its status printed on every poll. Polling stops
    as soon as the status is ``"completed"``, ``"failed"`` or ``"canceled"``;
    treating ``"canceled"`` as terminal prevents an endless loop when a run is
    canceled. No sleep is performed after the terminal status has been seen.

    Args:
        openai_client: Client whose ``evals.runs.retrieve`` is used to poll.
        eval_object: Evaluation the run belongs to; only its ``id`` is used.
        eval_run: The run to monitor; its ``id`` and ``status`` are used.
        poll_interval_seconds: Delay between polls while the run is still in
            progress. Defaults to 5 seconds.

    Returns:
        None. Results are printed: result counts and the report URL on
        success, otherwise a failure or final-status message.
    """
    terminal_statuses = ("completed", "failed", "canceled")

    print("Waiting for evaluation to complete...")
    while eval_run.status not in terminal_statuses:
        eval_run = openai_client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id)
        print(f"Status: {eval_run.status}")
        # Only wait when another poll is actually needed.
        if eval_run.status not in terminal_statuses:
            time.sleep(poll_interval_seconds)

    if eval_run.status == "completed":
        print("\nEvaluation completed successfully")
        print(f"Result counts: {eval_run.result_counts}")
        print(f"\nReport URL: {eval_run.report_url}")
    elif eval_run.status == "failed":
        print("\nEvaluation failed")
    else:
        # Terminal but neither success nor failure (e.g. canceled).
        print(f"\nEvaluation ended with status: {eval_run.status}")
async def main():
    """Run the sample end to end: workflow, response preview, then evaluation.

    Environment variables are loaded with ``load_dotenv()`` first, so a local
    ``.env`` file can supply the settings read below.
    """
    load_dotenv()
    client = create_openai_client()

    # Model configuration
    workflow_model = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME_WORKFLOW", "gpt-4.1-nano")
    evaluator_model = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME_EVAL", "gpt-5.2")

    # Focus on these agents, uncomment other ones you want to have evals run on
    target_agents = [
        "hotel-search-agent",
        "flight-search-agent",
        "activity-search-agent",
        # "booking-payment-agent",
        # "booking-info-aggregation-agent",
        # "travel-request-handler",
        # "booking-confirmation-agent",
    ]

    print_section("Travel Planning Workflow Evaluation")

    # Step 1: execute the workflow and collect the tracked response data.
    print_section("Step 1: Running Workflow")
    workflow_data = await run_workflow(deployment_name=workflow_model)

    # Step 2: show how many responses each agent produced.
    print_section("Step 2: Response Data Summary")
    display_response_summary(workflow_data)

    # Step 3: preview the final response of every targeted agent.
    print_section("Step 3: Fetching Agent Responses")
    fetch_agent_responses(client, workflow_data, target_agents)

    # Step 4: register the evaluation and its evaluators.
    print_section("Step 4: Creating Evaluation")
    evaluation = create_evaluation(client, deployment_name=evaluator_model)

    # Step 5: start a run over the targeted agents' final responses.
    print_section("Step 5: Running Evaluation")
    evaluation_run = run_evaluation(client, evaluation, workflow_data, target_agents)

    # Step 6: poll the run until it finishes and print the outcome.
    print_section("Step 6: Monitoring Evaluation")
    monitor_evaluation(client, evaluation, evaluation_run)

    print_section("Complete")
# Run the async entry point only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())