mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
514d0209a8
* consolidate lab packages into a single one; update contribution guidelines * update dep list * add poe tasks; fix tests and lint errors * add lab tests for CI * fix test * update root pyproject.toml
79 lines
3.2 KiB
Python
79 lines
3.2 KiB
Python
# Copyright (c) Microsoft. All rights reserved.
|
|
|
|
"""GAIA Benchmark Sample.
|
|
|
|
To run this sample, execute it from the root directory of the agent-framework repository:
|
|
cd /path/to/agent-framework
|
|
uv run python python/packages/lab/gaia/gaia_sample.py
|
|
|
|
This avoids namespace package conflicts that occur when running from within the gaia package directory.
|
|
"""
|
|
|
|
from agent_framework.azure import AzureAIAgentClient
|
|
from agent_framework.lab.gaia import GAIA, Evaluation, GAIATelemetryConfig, Prediction, Task
|
|
from azure.identity.aio import AzureCliCredential
|
|
|
|
|
|
def evaluate_task(task: Task, prediction: Prediction) -> Evaluation:
    """Evaluate the prediction for a given task.

    Uses a simple, case-insensitive containment check: the prediction counts
    as correct when the task's reference answer appears anywhere in the
    predicted text.

    Args:
        task: The task being scored. Its ``answer`` is the reference answer
            and may be missing or empty.
        prediction: The prediction whose ``prediction`` text is checked.

    Returns:
        An ``Evaluation`` with ``is_correct`` set and a ``score`` of 1 when
        correct, 0 otherwise.
    """
    expected = (task.answer or "").strip().lower()
    # A missing prediction text is treated as empty rather than raising.
    predicted = (prediction.prediction or "").lower()
    # An empty string is a substring of every string, so a task without a
    # reference answer must be scored as incorrect explicitly; otherwise
    # every prediction would be reported as a match.
    is_correct = bool(expected) and expected in predicted
    return Evaluation(is_correct=is_correct, score=1 if is_correct else 0)
|
|
|
|
|
|
async def main() -> None:
    """Run a small GAIA benchmark pass with tracing enabled and print each result."""
    # Telemetry: turn on OpenTelemetry tracing and also export the traces
    # to a local file at a custom path.
    tracing = GAIATelemetryConfig(
        enable_tracing=True,
        trace_to_file=True,
        file_path="gaia_benchmark_traces.jsonl",
    )

    # A single agent is created once and shared by every task; the credential
    # and the agent both stay open for the whole benchmark run.
    async with AzureCliCredential() as credential:
        async with AzureAIAgentClient(async_credential=credential).create_agent(
            name="GaiaAgent",
            instructions="Solve tasks to your best ability.",
        ) as agent:

            async def run_task(task: Task) -> Prediction:
                """Send one GAIA task to the shared agent and wrap its reply as a Prediction."""
                # The attached file name is only mentioned when the task has one.
                prompt = f"Task: {task.question}" + (f"\nFile: {task.file_name}" if task.file_name else "")
                reply = await agent.run(prompt)
                return Prediction(prediction=reply.text, messages=reply.messages)

            # Benchmark runner wired to the evaluator and the telemetry settings.
            benchmark = GAIA(evaluator=evaluate_task, telemetry_config=tracing)

            # By default the runner checks for locally cached benchmark data
            # and checks out the latest version from HuggingFace if none is found.
            outcomes = await benchmark.run(
                run_task,
                level=1,  # 1, 2, or 3, or several levels such as [1, 2]
                max_n=5,  # upper bound on tasks run per level
                parallel=2,  # number of tasks run in parallel
                timeout=60,  # per-task timeout in seconds
                out="gaia_results_level1.jsonl",  # optional results file, including detailed traces
            )

            # Report every task outcome on stdout.
            print("\n=== GAIA Benchmark Results ===")
            for outcome in outcomes:
                print(f"\n--- Task ID: {outcome.task_id} ---")
                # Only the first 100 characters of the question are shown.
                print(f"Task: {outcome.task.question[:100]}...")
                print(f"Prediction: {outcome.prediction.prediction}")
                verdict = outcome.evaluation
                print(f"Evaluation: Correct={verdict.is_correct}, Score={verdict.score}")
|
|
|
|
|
|
if __name__ == "__main__":
    # asyncio is only needed when the file is executed as a script, so it is
    # imported here rather than at module level.
    from asyncio import run as run_event_loop

    # Drive the async entry point to completion on a fresh event loop.
    run_event_loop(main())
|