mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
1543370027
* Lab: Updates to GAIA module * update * emoj! * fix lint * update lab test workflow to only trigger for python changes * lint * lint * Fix broken OpenAI agents JS documentation link
296 lines
11 KiB
Python
296 lines
11 KiB
Python
# Copyright (c) Microsoft. All rights reserved.
|
|
|
|
"""GAIA Benchmark Sample.
|
|
|
|
Run the GAIA (General AI Assistant) benchmark with configurable agent providers,
|
|
telemetry options, and benchmark parameters.
|
|
|
|
Agent Providers:
|
|
- Azure AI (default): See azure_ai_agent.py for required environment variables
|
|
- OpenAI: See openai_agent.py for required environment variables
|
|
|
|
Prerequisites:
|
|
1. Set HF_TOKEN environment variable with your Hugging Face token:
|
|
- Get token: https://huggingface.co/settings/tokens
|
|
- Request dataset access: https://huggingface.co/datasets/gaia-benchmark/GAIA
|
|
- Set: export HF_TOKEN="your-huggingface-token"
|
|
|
|
2. Configure your chosen agent provider (see agent module files for details)
|
|
|
|
Telemetry:
|
|
When using --otlp-endpoint or --trace-file, OpenTelemetry will export trace data
|
|
in JSON format to the console in addition to the configured endpoints. This is
|
|
expected behavior from the OpenTelemetry SDK and provides visibility into the
|
|
telemetry being captured. The traces are also exported to:
|
|
- OTLP endpoint (e.g., Aspire Dashboard) if --otlp-endpoint is specified
|
|
- Local file if --trace-file is specified
|
|
|
|
To suppress console output, redirect stderr: `python gaia_sample.py 2>/dev/null`
|
|
|
|
Usage:
|
|
# Run with default settings (Azure AI agent)
|
|
uv run python gaia_sample.py
|
|
|
|
# Run with OpenAI agent
|
|
uv run python gaia_sample.py --agent-provider openai
|
|
|
|
# Run with telemetry export to Aspire Dashboard
|
|
uv run python gaia_sample.py --otlp-endpoint http://localhost:4318
|
|
|
|
# See all options
|
|
uv run python gaia_sample.py --help
|
|
"""
|
|
|
|
import argparse
|
|
|
|
from agent_framework.lab.gaia import GAIA, Evaluation, GAIATelemetryConfig, Prediction, Task
|
|
|
|
|
|
async def evaluate_task(task: Task, prediction: Prediction) -> Evaluation:
    """Evaluate the prediction for a given task.

    Uses a simple case-insensitive containment check: the prediction is
    considered correct when the task's reference answer appears anywhere in
    the predicted text.

    Args:
        task: The GAIA task; only ``task.answer`` is read.
        prediction: The agent output; only ``prediction.prediction`` is read.

    Returns:
        An ``Evaluation`` with ``is_correct`` set and ``score`` of 1 or 0.
    """
    expected = (task.answer or "").strip().lower()
    actual = (prediction.prediction or "").lower()

    # An empty string is a substring of every string, so a task without a
    # reference answer must never count as a match.
    is_correct = bool(expected) and expected in actual
    return Evaluation(is_correct=is_correct, score=1 if is_correct else 0)
|
|
|
|
|
|
async def main(
|
|
otlp_endpoint: str | None = None,
|
|
trace_file: str | None = None,
|
|
result_file: str | None = None,
|
|
data_dir: str | None = None,
|
|
agent_provider: str = "azure-ai",
|
|
level: int | list[int] = 1,
|
|
max_n: int = 2,
|
|
parallel: int = 1,
|
|
timeout: int = 120,
|
|
) -> None:
|
|
"""Run GAIA benchmark with telemetry configuration.
|
|
|
|
Args:
|
|
otlp_endpoint: Optional OTLP endpoint URL for exporting traces (e.g., http://localhost:4318)
|
|
trace_file: Optional file path to export traces to. If None, traces won't be saved to file.
|
|
result_file: Optional file path to save benchmark results. If None, results won't be saved to file.
|
|
data_dir: Directory to cache GAIA dataset. If None, uses temp directory.
|
|
agent_provider: Agent provider to use: 'azure-ai' or 'openai' (default: 'azure-ai')
|
|
level: GAIA level(s) to run (1, 2, or 3)
|
|
max_n: Maximum number of tasks to run per level
|
|
parallel: Number of parallel tasks to run
|
|
timeout: Timeout per task in seconds
|
|
"""
|
|
# Check for required Hugging Face token
|
|
import logging
|
|
import os
|
|
|
|
# Suppress console logging for traces and verbose SDK output
|
|
logging.getLogger("opentelemetry").setLevel(logging.ERROR)
|
|
logging.getLogger("azure").setLevel(logging.WARNING)
|
|
logging.getLogger("agent_framework").setLevel(logging.WARNING)
|
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
|
|
|
# Suppress OpenTelemetry exporters console output
|
|
import os as _os
|
|
|
|
_os.environ.setdefault("OTEL_PYTHON_LOG_LEVEL", "error")
|
|
|
|
# Print trace export configuration
|
|
print("\n=== Telemetry Configuration ===")
|
|
if trace_file:
|
|
print(f"📁 Trace file: {os.path.abspath(trace_file)}")
|
|
else:
|
|
print("📁 Trace file: disabled")
|
|
|
|
if otlp_endpoint:
|
|
print(f"🌐 OTLP endpoint: {otlp_endpoint}")
|
|
else:
|
|
print("🌐 OTLP endpoint: disabled")
|
|
|
|
if result_file:
|
|
print(f"📊 Results file: {os.path.abspath(result_file)}")
|
|
else:
|
|
print("📊 Results file: disabled")
|
|
|
|
print("\n=== Run Configuration ===")
|
|
print(f"🤖 Agent provider: {agent_provider}")
|
|
if data_dir:
|
|
print(f"📂 Data directory: {os.path.abspath(data_dir)}")
|
|
else:
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
default_data_dir = Path(tempfile.gettempdir()) / "data_gaia_hub"
|
|
print(f"📂 Data directory: {default_data_dir} (default)")
|
|
print(f"🎯 Level: {level}")
|
|
print(f"🔢 Max tasks: {max_n}")
|
|
print(f"⚡ Parallel: {parallel}")
|
|
print(f"⏱️ Timeout: {timeout}s")
|
|
print()
|
|
|
|
# Import the appropriate agent factory based on provider
|
|
if agent_provider == "azure-ai":
|
|
from azure_ai_agent import create_gaia_agent
|
|
elif agent_provider == "openai":
|
|
from openai_agent import create_gaia_agent
|
|
else:
|
|
raise ValueError(f"Unknown agent provider: {agent_provider}. Use 'azure-ai' or 'openai'.")
|
|
|
|
# Configure telemetry for tracing
|
|
telemetry_config = GAIATelemetryConfig(
|
|
enable_tracing=True, # Enable OpenTelemetry tracing
|
|
trace_to_file=trace_file is not None, # Export traces to local file only if path provided
|
|
file_path=trace_file, # Custom file path for traces (can be None)
|
|
otlp_endpoint=otlp_endpoint, # Optional OTLP endpoint for Aspire Dashboard or other collectors
|
|
)
|
|
|
|
# Create a single agent once and reuse it for all tasks
|
|
async with create_gaia_agent() as agent:
|
|
|
|
async def run_task(task: Task) -> Prediction:
|
|
"""Run a single GAIA task and return the prediction using the shared agent."""
|
|
input_message = f"Task: {task.question}"
|
|
if task.file_name:
|
|
input_message += f"\nFile: {task.file_name}"
|
|
result = await agent.run(input_message)
|
|
return Prediction(prediction=result.text, messages=result.messages)
|
|
|
|
# Create the GAIA benchmark runner with telemetry configuration
|
|
runner = GAIA(
|
|
evaluator=evaluate_task,
|
|
telemetry_config=telemetry_config,
|
|
data_dir=data_dir,
|
|
)
|
|
|
|
# Run the benchmark with the task runner.
|
|
# By default, this will check for locally cached benchmark data and checkout
|
|
# the latest version from HuggingFace if not found.
|
|
# Note: The GAIA dataset has been updated to use Parquet format.
|
|
# If you encounter issues, try using validation split which has labeled data.
|
|
results = await runner.run(
|
|
run_task,
|
|
level=level,
|
|
max_n=max_n,
|
|
parallel=parallel,
|
|
timeout=timeout,
|
|
out=result_file, # Output file to save results including detailed traces (optional, None = no file output)
|
|
)
|
|
|
|
# Print summary similar to the viewer in gaia.py
|
|
total = len(results)
|
|
correct = sum(1 for r in results if r.evaluation.is_correct)
|
|
accuracy = correct / total if total > 0 else 0.0
|
|
avg_runtime = sum(r.runtime_seconds or 0 for r in results) / total if total > 0 else 0.0
|
|
|
|
print("\n=== GAIA Benchmark Summary ===")
|
|
print(f"📝 Total: {total}, ✅ Correct: {correct}, 🎯 Accuracy: {accuracy:.3f}")
|
|
print(f"⏱️ Average runtime: {avg_runtime:.2f}s")
|
|
if result_file:
|
|
print(f"💾 Detailed results saved to: {result_file}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import asyncio
|
|
|
|
# Parse command line arguments
|
|
parser = argparse.ArgumentParser(
|
|
description="Run GAIA benchmark with optional telemetry export to OTLP endpoint and/or file",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Examples:
|
|
# Run with default settings
|
|
python gaia_sample.py
|
|
|
|
# Run with custom data directory
|
|
python gaia_sample.py --data-dir ./gaia_data
|
|
|
|
# Run with OpenAI agent provider
|
|
python gaia_sample.py --agent-provider openai
|
|
|
|
# Run with trace file export
|
|
python gaia_sample.py --trace-file gaia_benchmark_traces.jsonl
|
|
|
|
# Run level 2 tasks with 5 maximum tasks
|
|
python gaia_sample.py --level 2 --max-n 5
|
|
|
|
# Run with OTLP export to Aspire Dashboard and custom settings
|
|
python gaia_sample.py --otlp-endpoint http://localhost:4318 --level 1 --max-n 10 --parallel 2
|
|
|
|
# Run with all options configured
|
|
python gaia_sample.py --agent-provider openai \
|
|
--trace-file traces.jsonl \
|
|
--result-file results.jsonl \
|
|
--otlp-endpoint http://localhost:4318 --level 1 --max-n 5 --parallel 2 --timeout 180
|
|
""",
|
|
)
|
|
parser.add_argument(
|
|
"--otlp-endpoint",
|
|
type=str,
|
|
default=None,
|
|
help="OTLP endpoint URL for exporting traces (e.g., http://localhost:4318 for Aspire Dashboard)",
|
|
)
|
|
parser.add_argument(
|
|
"--trace-file",
|
|
type=str,
|
|
default=None,
|
|
help="File path to export traces to (e.g., gaia_benchmark_traces.jsonl). "
|
|
"If not set, traces won't be saved to file.",
|
|
)
|
|
parser.add_argument(
|
|
"--result-file",
|
|
type=str,
|
|
default="gaia_results_level1.jsonl",
|
|
help="File path to save benchmark results (default: gaia_results_level1.jsonl)",
|
|
)
|
|
parser.add_argument(
|
|
"--data-dir",
|
|
type=str,
|
|
default=None,
|
|
help="Directory to cache GAIA dataset. If not set, uses system temp directory.",
|
|
)
|
|
parser.add_argument(
|
|
"--agent-provider",
|
|
type=str,
|
|
default="azure-ai",
|
|
choices=["azure-ai", "openai"],
|
|
help="Agent provider to use: 'azure-ai' or 'openai' (default: 'azure-ai')",
|
|
)
|
|
parser.add_argument(
|
|
"--level",
|
|
type=int,
|
|
default=1,
|
|
choices=[1, 2, 3],
|
|
help="GAIA benchmark level to run: 1, 2, or 3 (default: 1)",
|
|
)
|
|
parser.add_argument(
|
|
"--max-n",
|
|
type=int,
|
|
default=2,
|
|
help="Maximum number of tasks to run per level (default: 2)",
|
|
)
|
|
parser.add_argument(
|
|
"--parallel",
|
|
type=int,
|
|
default=1,
|
|
help="Number of parallel tasks to run (default: 1)",
|
|
)
|
|
parser.add_argument(
|
|
"--timeout",
|
|
type=int,
|
|
default=120,
|
|
help="Timeout per task in seconds (default: 120)",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
asyncio.run(
|
|
main(
|
|
otlp_endpoint=args.otlp_endpoint,
|
|
trace_file=args.trace_file,
|
|
result_file=args.result_file,
|
|
data_dir=args.data_dir,
|
|
agent_provider=args.agent_provider,
|
|
level=args.level,
|
|
max_n=args.max_n,
|
|
parallel=args.parallel,
|
|
timeout=args.timeout,
|
|
)
|
|
)
|