agent-framework/python/packages/hyperlight/samples/codeact_benchmark.py

# Copyright (c) Microsoft. All rights reserved.

"""Benchmark CodeAct vs. traditional tool-calling for a multi-tool-call task.

This sample runs the same prompt against the same FoundryChatClient twice:

1. **Traditional tool-calling**: the five business tools are passed directly to
   the agent, so the model calls each tool individually via the LLM tool-call
   interface.
2. **CodeAct**: the same tools are registered on a HyperlightCodeActProvider
   and the model sees a single ``execute_code`` tool that calls them from
   inside the Hyperlight sandbox via ``call_tool(...)``.

The task (computing grand totals per user) naturally requires many tool calls
to complete. At the end, the sample prints elapsed time and token usage for
each run so the two approaches can be compared.

Run with:
    cd python
    uv run --directory packages/hyperlight python samples/codeact_benchmark.py

Required environment variables (loaded from ``.env`` if present):
    FOUNDRY_PROJECT_ENDPOINT
    FOUNDRY_MODEL
"""

from __future__ import annotations

import asyncio
import os
import time
from typing import Annotated, Any, Literal

from agent_framework import Agent, AgentResponse, UsageDetails
from agent_framework.foundry import FoundryChatClient
from azure.identity import AzureCliCredential
from dotenv import load_dotenv
from pydantic import BaseModel, Field

from agent_framework_hyperlight import HyperlightCodeActProvider

load_dotenv()


# 1. Deterministic "business" data and tools.

_USERS: list[dict[str, Any]] = [
    {"id": 1, "name": "Alice", "region": "EU", "tier": "gold"},
    {"id": 2, "name": "Bob", "region": "US", "tier": "silver"},
    {"id": 3, "name": "Charlie", "region": "US", "tier": "gold"},
    {"id": 4, "name": "Diana", "region": "APAC", "tier": "bronze"},
    {"id": 5, "name": "Evan", "region": "EU", "tier": "silver"},
    {"id": 6, "name": "Fiona", "region": "US", "tier": "gold"},
    {"id": 7, "name": "George", "region": "APAC", "tier": "gold"},
    {"id": 8, "name": "Hana", "region": "EU", "tier": "bronze"},
]

_ORDERS: dict[int, list[dict[str, Any]]] = {
    1: [{"product": "Widget", "qty": 3, "unit_price": 9.99}, {"product": "Gadget", "qty": 1, "unit_price": 19.99}],
    2: [{"product": "Widget", "qty": 1, "unit_price": 9.99}],
    3: [{"product": "Gadget", "qty": 2, "unit_price": 19.99}, {"product": "Thingamajig", "qty": 4, "unit_price": 4.50}],
    4: [{"product": "Widget", "qty": 10, "unit_price": 9.99}],
    5: [{"product": "Gadget", "qty": 1, "unit_price": 19.99}],
    6: [{"product": "Widget", "qty": 2, "unit_price": 9.99}, {"product": "Thingamajig", "qty": 5, "unit_price": 4.50}],
    7: [{"product": "Gadget", "qty": 3, "unit_price": 19.99}],
    8: [{"product": "Thingamajig", "qty": 2, "unit_price": 4.50}],
}

_DISCOUNTS: dict[str, float] = {"gold": 0.20, "silver": 0.10, "bronze": 0.05}
_TAX_RATES: dict[str, float] = {"EU": 0.21, "US": 0.08, "APAC": 0.10}


def list_users() -> list[dict[str, Any]]:
    """Return all users as a list of dictionaries.

    Each entry has keys: id (int), name (str), region (str), tier (str).
    """
    return _USERS


def get_orders_for_user(
    user_id: Annotated[int, "The user id whose orders to retrieve."],
) -> list[dict[str, Any]]:
    """Return the user's orders as a list of dictionaries.

    Each entry has keys: product (str), qty (int), unit_price (float).
    """
    return _ORDERS.get(user_id, [])


def get_discount_rate(
    tier: Annotated[Literal["gold", "silver", "bronze"], "The customer tier."],
) -> float:
    """Return the discount rate as a float fraction (e.g. 0.2 for 20%)."""
    return _DISCOUNTS[tier]


def get_tax_rate(
    region: Annotated[Literal["EU", "US", "APAC"], "The region code."],
) -> float:
    """Return the tax rate as a float fraction (e.g. 0.21 for 21%)."""
    return _TAX_RATES[region]


def compute_line_total(
    qty: Annotated[int, "Line item quantity."],
    unit_price: Annotated[float, "Line item unit price."],
    discount_rate: Annotated[float, "Discount rate as a fraction (e.g. 0.2 for 20%)."],
    tax_rate: Annotated[float, "Tax rate as a fraction (e.g. 0.21 for 21%)."],
) -> float:
    """Compute a single order line total.

    Formula: qty * unit_price * (1 - discount_rate) * (1 + tax_rate), rounded to 2 decimals.
    """
    subtotal = qty * unit_price
    discounted = subtotal * (1.0 - discount_rate)
    return round(discounted * (1.0 + tax_rate), 2)


TOOLS = [list_users, get_orders_for_user, get_discount_rate, get_tax_rate, compute_line_total]


# 2. Structured output schema shared between both runs.


class UserTotal(BaseModel):
    """A user's grand total of all their orders."""

    user_id: int = Field(description="The user's id.")
    name: str = Field(description="The user's display name.")
    grand_total: float = Field(description="Sum of all line totals, rounded to 2 decimals.")


class UserGrandTotals(BaseModel):
    """Structured output schema for both runs."""

    results: list[UserTotal] = Field(description="One entry per user, sorted by grand_total descending.")


INSTRUCTIONS = "You are a careful assistant. Use the provided tools for every lookup and computation."

BENCHMARK_PROMPT = (
    "For every user in our system (there are 8 of them), compute the grand total of all their orders. "
    "Use the compute_line_total tool for each user's orders, after looking up the relevant discount and "
    "tax rates for that user. "
    "Use the provided tools for EVERY data lookup (users, orders, discount rates, tax rates) and for EVERY "
    "line-total computation via compute_line_total — do not invent values or hardcode any numbers. "
    "The total per order item should apply the discount first and then the tax "
    "(e.g. total = qty * unit_price * (1-discount) * (1+tax)). "
    "Return one entry per user, sorted by grand_total descending."
)


def get_client() -> FoundryChatClient:
    """Create a FoundryChatClient from environment variables."""
    return FoundryChatClient(
        project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"],
        model=os.environ["FOUNDRY_MODEL"],
        credential=AzureCliCredential(),
    )


# 3. Two runners that share the same tools, prompt, and structured output schema.


async def _run_traditional() -> tuple[float, AgentResponse]:
    agent = Agent(
        client=get_client(),
        name="TraditionalAgent",
        instructions=INSTRUCTIONS,
        tools=TOOLS,
        default_options={"response_format": UserGrandTotals},
    )
    start = time.perf_counter()
    result = await agent.run(BENCHMARK_PROMPT)
    elapsed = time.perf_counter() - start
    return elapsed, result


async def _run_codeact() -> tuple[float, AgentResponse]:
    codeact = HyperlightCodeActProvider(
        tools=TOOLS,
        approval_mode="never_require",
    )
    agent = Agent(
        client=get_client(),
        name="CodeActAgent",
        instructions=INSTRUCTIONS,
        context_providers=[codeact],
        default_options={"response_format": UserGrandTotals},
    )
    start = time.perf_counter()
    result = await agent.run(BENCHMARK_PROMPT)
    elapsed = time.perf_counter() - start
    return elapsed, result


# 4. Report results side by side.


def _print_section(title: str) -> None:
    bar = "=" * 70
    print(f"\n{bar}\n{title}\n{bar}")


def _format_usage(usage: UsageDetails | None) -> str:
    if usage is None:
        return "usage=<none>"
    return (
        f"input={usage.get('input_token_count') or 0:>6} "
        f"output={usage.get('output_token_count') or 0:>6} "
        f"total={usage.get('total_token_count') or 0:>6}"
    )


def _print_results(result: AgentResponse) -> None:
    if result.value is not None:
        for row in result.value.results:
            print(f"  user_id={row.user_id:>2}  name={row.name:<8}  grand_total={row.grand_total:>8.2f}")
    else:
        print(result.text)


async def main() -> None:
    """Run the benchmark and print a comparison."""
    trad_time, trad_result = await _run_traditional()
    code_time, code_result = await _run_codeact()

    _print_section("Traditional tool-calling")
    print(f"time={trad_time:7.2f}s  {_format_usage(trad_result.usage_details)}")
    _print_results(trad_result)

    _print_section("CodeAct (HyperlightCodeActProvider)")
    print(f"time={code_time:7.2f}s  {_format_usage(code_result.usage_details)}")
    _print_results(code_result)

    _print_section("Comparison")
    trad_total = (trad_result.usage_details or {}).get("total_token_count") or 0
    code_total = (code_result.usage_details or {}).get("total_token_count") or 0

    def pct(new: float, old: float) -> str:
        if old == 0:
            return "n/a"
        delta = (new - old) / old * 100
        sign = "+" if delta >= 0 else ""
        return f"{sign}{delta:.1f}%"

    print(f"time   : traditional={trad_time:7.2f}s   codeact={code_time:7.2f}s   delta={pct(code_time, trad_time)}")
    print(f"tokens : traditional={trad_total:7d}    codeact={code_total:7d}    delta={pct(code_total, trad_total)}")


if __name__ == "__main__":
    asyncio.run(main())