mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
Python: Flatten hyperlight execute_code output (#5333)
* small fix for hyperlight * improved sandbox dependency
This commit is contained in:
committed by
GitHub
Unverified
parent
495e1dad6b
commit
69894eded8
@@ -431,7 +431,7 @@ def _build_execution_contents(
|
||||
outputs.append(Content.from_text(stderr, raw_representation=result))
|
||||
if not outputs:
|
||||
outputs.append(Content.from_text("Code executed successfully without output."))
|
||||
return [Content.from_code_interpreter_tool_result(outputs=outputs, raw_representation=result)]
|
||||
return outputs
|
||||
|
||||
error_details = stderr or "Unknown sandbox error"
|
||||
outputs.append(
|
||||
@@ -441,12 +441,16 @@ def _build_execution_contents(
|
||||
raw_representation=result,
|
||||
)
|
||||
)
|
||||
return [Content.from_code_interpreter_tool_result(outputs=outputs, raw_representation=result)]
|
||||
return outputs
|
||||
|
||||
|
||||
def _make_sandbox_callback(tool_obj: FunctionTool) -> Callable[..., Any]:
|
||||
sandbox_tool = copy.copy(tool_obj)
|
||||
sandbox_tool.result_parser = _passthrough_result_parser
|
||||
# Auto-assign a passthrough parser so the raw return value round-trips through
|
||||
# `ast.literal_eval` in the sandbox callback below. User-supplied parsers are
|
||||
# left in place so callers can customize how results are exposed to the guest.
|
||||
if sandbox_tool.result_parser is None:
|
||||
sandbox_tool.result_parser = _passthrough_result_parser
|
||||
|
||||
def _callback(**kwargs: Any) -> Any:
|
||||
async def _invoke() -> list[Content]:
|
||||
@@ -765,6 +769,7 @@ class HyperlightExecuteCodeTool(FunctionTool):
|
||||
return build_codeact_instructions(
|
||||
tools=config.tools,
|
||||
tools_visible_to_model=tools_visible_to_model,
|
||||
filesystem_enabled=config.filesystem_enabled,
|
||||
)
|
||||
|
||||
def create_run_tool(self) -> HyperlightExecuteCodeTool:
|
||||
|
||||
@@ -68,6 +68,7 @@ def build_codeact_instructions(
|
||||
*,
|
||||
tools: Sequence[FunctionTool],
|
||||
tools_visible_to_model: bool,
|
||||
filesystem_enabled: bool = False,
|
||||
) -> str:
|
||||
"""Build dynamic CodeAct instructions for the effective sandbox state."""
|
||||
usage_note = (
|
||||
@@ -77,12 +78,24 @@ def build_codeact_instructions(
|
||||
else "Provider-owned sandbox tools are not exposed separately; use `execute_code` when you need them."
|
||||
)
|
||||
|
||||
output_note = (
|
||||
"To surface results from `execute_code`, end the code with `print(...)`; the sandbox does not "
|
||||
"return the value of the last expression."
|
||||
)
|
||||
if filesystem_enabled:
|
||||
output_note += (
|
||||
" For larger artifacts, write them to `/output/<filename>` instead — returned files will be "
|
||||
"attached to the tool result."
|
||||
)
|
||||
|
||||
return f"""You have one primary tool: execute_code.
|
||||
|
||||
Prefer one execute_code call per request when possible.
|
||||
Its tool description contains the current `call_tool(...)` guidance, sandbox
|
||||
tool registry, and capability limits.
|
||||
|
||||
{output_note}
|
||||
|
||||
{usage_note}
|
||||
"""
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ classifiers = [
|
||||
dependencies = [
|
||||
"agent-framework-core>=1.0.0,<2",
|
||||
"hyperlight-sandbox>=0.3.0,<0.4",
|
||||
"hyperlight-sandbox-backend-wasm>=0.3.0,<0.4 ; (sys_platform == 'linux' or sys_platform == 'win32') and python_version < '3.14'",
|
||||
"hyperlight-sandbox-backend-wasm>=0.3.0,<0.4 ; ((sys_platform == 'linux' and platform_machine == 'x86_64') or (sys_platform == 'win32' and platform_machine == 'AMD64')) and python_version < '3.14'",
|
||||
"hyperlight-sandbox-python-guest>=0.3.0,<0.4",
|
||||
]
|
||||
|
||||
|
||||
@@ -0,0 +1,253 @@
|
||||
# Copyright (c) Microsoft. All rights reserved.
|
||||
|
||||
"""Benchmark CodeAct vs. traditional tool-calling for a multi-tool-call task.
|
||||
|
||||
This sample runs the same prompt against the same FoundryChatClient twice:
|
||||
|
||||
1. **Traditional tool-calling**: the five business tools are passed directly to
|
||||
the agent, so the model calls each tool individually via the LLM tool-call
|
||||
interface.
|
||||
2. **CodeAct**: the same tools are registered on a HyperlightCodeActProvider
|
||||
and the model sees a single ``execute_code`` tool that calls them from
|
||||
inside the Hyperlight sandbox via ``call_tool(...)``.
|
||||
|
||||
The task (computing grand totals per user) naturally requires many tool calls
|
||||
to complete. At the end, the sample prints elapsed time and token usage for
|
||||
each run so the two approaches can be compared.
|
||||
|
||||
Run with:
|
||||
cd python
|
||||
uv run --directory packages/hyperlight python samples/codeact_benchmark.py
|
||||
|
||||
Required environment variables (loaded from ``.env`` if present):
|
||||
FOUNDRY_PROJECT_ENDPOINT
|
||||
FOUNDRY_MODEL
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
from typing import Annotated, Any, Literal
|
||||
|
||||
from agent_framework import Agent, AgentResponse, UsageDetails
|
||||
from agent_framework.foundry import FoundryChatClient
|
||||
from azure.identity import AzureCliCredential
|
||||
from dotenv import load_dotenv
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from agent_framework_hyperlight import HyperlightCodeActProvider
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
# 1. Deterministic "business" data and tools.
|
||||
|
||||
# Fixed in-memory user records; each has id, name, region and tier.
_USERS: list[dict[str, Any]] = [
    {"id": 1, "name": "Alice", "region": "EU", "tier": "gold"},
    {"id": 2, "name": "Bob", "region": "US", "tier": "silver"},
    {"id": 3, "name": "Charlie", "region": "US", "tier": "gold"},
    {"id": 4, "name": "Diana", "region": "APAC", "tier": "bronze"},
    {"id": 5, "name": "Evan", "region": "EU", "tier": "silver"},
    {"id": 6, "name": "Fiona", "region": "US", "tier": "gold"},
    {"id": 7, "name": "George", "region": "APAC", "tier": "gold"},
    {"id": 8, "name": "Hana", "region": "EU", "tier": "bronze"},
]

# Order lines keyed by user id; each line has product, qty and unit_price.
_ORDERS: dict[int, list[dict[str, Any]]] = {
    1: [{"product": "Widget", "qty": 3, "unit_price": 9.99}, {"product": "Gadget", "qty": 1, "unit_price": 19.99}],
    2: [{"product": "Widget", "qty": 1, "unit_price": 9.99}],
    3: [{"product": "Gadget", "qty": 2, "unit_price": 19.99}, {"product": "Thingamajig", "qty": 4, "unit_price": 4.50}],
    4: [{"product": "Widget", "qty": 10, "unit_price": 9.99}],
    5: [{"product": "Gadget", "qty": 1, "unit_price": 19.99}],
    6: [{"product": "Widget", "qty": 2, "unit_price": 9.99}, {"product": "Thingamajig", "qty": 5, "unit_price": 4.50}],
    7: [{"product": "Gadget", "qty": 3, "unit_price": 19.99}],
    8: [{"product": "Thingamajig", "qty": 2, "unit_price": 4.50}],
}

# Discount fraction per customer tier (0.20 means 20%).
_DISCOUNTS: dict[str, float] = {"gold": 0.20, "silver": 0.10, "bronze": 0.05}
# Tax fraction per region code (0.21 means 21%).
_TAX_RATES: dict[str, float] = {"EU": 0.21, "US": 0.08, "APAC": 0.10}
|
||||
|
||||
|
||||
def list_users() -> list[dict[str, Any]]:
    """Return all users as a list of dictionaries.

    Each entry has keys: id (int), name (str), region (str), tier (str).

    The returned list and its dictionaries are fresh copies, so a caller that
    mutates the result cannot alter the module-level fixture that later calls
    (and the second benchmark run) read from.
    """
    return [dict(user) for user in _USERS]
|
||||
|
||||
|
||||
def get_orders_for_user(
    user_id: Annotated[int, "The user id whose orders to retrieve."],
) -> list[dict[str, Any]]:
    """Return the user's orders as a list of dictionaries.

    Each entry has keys: product (str), qty (int), unit_price (float).
    An unknown ``user_id`` yields an empty list rather than an error.

    The order lines are copied before being returned so callers cannot
    mutate the shared fixture data.
    """
    return [dict(line) for line in _ORDERS.get(user_id, ())]
|
||||
|
||||
|
||||
def get_discount_rate(
    tier: Annotated[Literal["gold", "silver", "bronze"], "The customer tier."],
) -> float:
    """Return the discount rate as a float fraction (e.g. 0.2 for 20%).

    Raises:
        KeyError: If ``tier`` is not one of the known tiers.
    """
    return _DISCOUNTS[tier]
|
||||
|
||||
|
||||
def get_tax_rate(
    region: Annotated[Literal["EU", "US", "APAC"], "The region code."],
) -> float:
    """Return the tax rate as a float fraction (e.g. 0.21 for 21%).

    Raises:
        KeyError: If ``region`` is not one of the known region codes.
    """
    return _TAX_RATES[region]
|
||||
|
||||
|
||||
def compute_line_total(
    qty: Annotated[int, "Line item quantity."],
    unit_price: Annotated[float, "Line item unit price."],
    discount_rate: Annotated[float, "Discount rate as a fraction (e.g. 0.2 for 20%)."],
    tax_rate: Annotated[float, "Tax rate as a fraction (e.g. 0.21 for 21%)."],
) -> float:
    """Compute a single order line total.

    Formula: qty * unit_price * (1 - discount_rate) * (1 + tax_rate), rounded to 2 decimals.
    """
    subtotal = qty * unit_price
    # The discount is applied first; tax is then charged on the discounted amount.
    discounted = subtotal * (1.0 - discount_rate)
    return round(discounted * (1.0 + tax_rate), 2)
|
||||
|
||||
|
||||
TOOLS = [list_users, get_orders_for_user, get_discount_rate, get_tax_rate, compute_line_total]
|
||||
|
||||
|
||||
# 2. Structured output schema shared between both runs.
|
||||
|
||||
|
||||
class UserTotal(BaseModel):
    """A user's grand total of all their orders."""

    user_id: int = Field(description="The user's id.")
    name: str = Field(description="The user's display name.")
    grand_total: float = Field(description="Sum of all line totals, rounded to 2 decimals.")
|
||||
|
||||
|
||||
class UserGrandTotals(BaseModel):
    """Structured output schema for both runs."""

    results: list[UserTotal] = Field(description="One entry per user, sorted by grand_total descending.")
|
||||
|
||||
|
||||
# System instructions shared by both agents.
INSTRUCTIONS = "You are a careful assistant. Use the provided tools for every lookup and computation."

# The single user prompt sent to both agents; adjacent literals are joined into one string.
BENCHMARK_PROMPT = (
    "For every user in our system (there are 8 of them), compute the grand total of all their orders. "
    "Use the compute_line_total tool for each user's orders, after looking up the relevant discount and "
    "tax rates for that user. "
    "Use the provided tools for EVERY data lookup (users, orders, discount rates, tax rates) and for EVERY "
    "line-total computation via compute_line_total — do not invent values or hardcode any numbers. "
    "The total per order item should apply the discount first and then the tax "
    "(e.g. total = qty * unit_price * (1-discount) * (1+tax)). "
    "Return one entry per user, sorted by grand_total descending."
)
|
||||
|
||||
|
||||
def get_client() -> FoundryChatClient:
    """Create a FoundryChatClient from environment variables.

    Reads ``FOUNDRY_PROJECT_ENDPOINT`` and ``FOUNDRY_MODEL`` from the process
    environment and authenticates with ``AzureCliCredential``.

    Raises:
        KeyError: If either environment variable is not set.
    """
    return FoundryChatClient(
        project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"],
        model=os.environ["FOUNDRY_MODEL"],
        credential=AzureCliCredential(),
    )
|
||||
|
||||
|
||||
# 3. Two runners that share the same tools, prompt, and structured output schema.
|
||||
|
||||
|
||||
async def _run_traditional() -> tuple[float, AgentResponse]:
    """Run the benchmark prompt with the tools passed directly to the agent.

    Returns:
        A tuple of (elapsed seconds for the ``agent.run`` call, the agent response).
    """
    agent = Agent(
        client=get_client(),
        name="TraditionalAgent",
        instructions=INSTRUCTIONS,
        tools=TOOLS,
        default_options={"response_format": UserGrandTotals},
    )
    # Only the run itself is timed; agent construction is excluded.
    start = time.perf_counter()
    result = await agent.run(BENCHMARK_PROMPT)
    elapsed = time.perf_counter() - start
    return elapsed, result
|
||||
|
||||
|
||||
async def _run_codeact() -> tuple[float, AgentResponse]:
    """Run the benchmark prompt with the tools registered on a HyperlightCodeActProvider.

    The provider is attached as a context provider instead of passing the
    tools to the agent directly.

    Returns:
        A tuple of (elapsed seconds for the ``agent.run`` call, the agent response).
    """
    codeact = HyperlightCodeActProvider(
        tools=TOOLS,
        approval_mode="never_require",
    )
    agent = Agent(
        client=get_client(),
        name="CodeActAgent",
        instructions=INSTRUCTIONS,
        context_providers=[codeact],
        default_options={"response_format": UserGrandTotals},
    )
    # Only the run itself is timed; provider and agent construction are excluded.
    start = time.perf_counter()
    result = await agent.run(BENCHMARK_PROMPT)
    elapsed = time.perf_counter() - start
    return elapsed, result
|
||||
|
||||
|
||||
# 4. Report results side by side.
|
||||
|
||||
|
||||
def _print_section(title: str) -> None:
    """Print ``title`` framed above and below by a 70-character ``=`` rule."""
    bar = "=" * 70
    print(f"\n{bar}\n{title}\n{bar}")
|
||||
|
||||
|
||||
def _format_usage(usage: UsageDetails | None) -> str:
    """Format token usage as a single fixed-width summary string.

    Args:
        usage: Usage mapping read via ``.get`` for the input, output and total
            token counts, or ``None`` when no usage was reported.

    Returns:
        ``"usage=<none>"`` when ``usage`` is ``None``; otherwise the three
        counts, each right-aligned to width 6. A missing or falsy count is
        shown as 0.
    """
    if usage is None:
        return "usage=<none>"
    return (
        f"input={usage.get('input_token_count') or 0:>6} "
        f"output={usage.get('output_token_count') or 0:>6} "
        f"total={usage.get('total_token_count') or 0:>6}"
    )
|
||||
|
||||
|
||||
def _print_results(result: AgentResponse) -> None:
    """Print one line per user total, or the raw response text as a fallback.

    When ``result.value`` is set, each entry of ``result.value.results`` is
    printed with its ``user_id``, ``name`` and ``grand_total``. Otherwise
    ``result.text`` is printed unchanged.
    """
    if result.value is not None:
        for row in result.value.results:
            print(f" user_id={row.user_id:>2} name={row.name:<8} grand_total={row.grand_total:>8.2f}")
    else:
        print(result.text)
|
||||
|
||||
|
||||
async def main() -> None:
    """Run the benchmark and print a comparison.

    The traditional run is awaited first, then the CodeAct run. Each run's
    elapsed time, token usage and results are printed in its own section,
    followed by a comparison of time and total tokens.
    """
    trad_time, trad_result = await _run_traditional()
    code_time, code_result = await _run_codeact()

    _print_section("Traditional tool-calling")
    print(f"time={trad_time:7.2f}s {_format_usage(trad_result.usage_details)}")
    _print_results(trad_result)

    _print_section("CodeAct (HyperlightCodeActProvider)")
    print(f"time={code_time:7.2f}s {_format_usage(code_result.usage_details)}")
    _print_results(code_result)

    _print_section("Comparison")
    # Missing usage details or a missing total both count as 0 tokens.
    trad_total = (trad_result.usage_details or {}).get("total_token_count") or 0
    code_total = (code_result.usage_details or {}).get("total_token_count") or 0

    def pct(new: float, old: float) -> str:
        """Return the signed percentage change from ``old`` to ``new``, or "n/a" when ``old`` is 0."""
        if old == 0:
            return "n/a"
        delta = (new - old) / old * 100
        sign = "+" if delta >= 0 else ""
        return f"{sign}{delta:.1f}%"

    print(f"time : traditional={trad_time:7.2f}s codeact={code_time:7.2f}s delta={pct(code_time, trad_time)}")
    print(f"tokens : traditional={trad_total:7d} codeact={code_total:7d} delta={pct(code_total, trad_total)}")
|
||||
|
||||
|
||||
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -72,15 +72,11 @@ async def log_function_calls(
|
||||
|
||||
result = context.result
|
||||
if function_name == "execute_code" and isinstance(result, list):
|
||||
for item in result:
|
||||
if item.type != "code_interpreter_tool_result":
|
||||
continue
|
||||
|
||||
for output in item.outputs or []:
|
||||
if output.type == "text" and output.text:
|
||||
print(f"{_GREEN}stdout:\n{output.text}{_RESET}")
|
||||
if output.type == "error" and output.error_details:
|
||||
print(f"{_YELLOW}stderr:\n{output.error_details}{_RESET}")
|
||||
for output in result:
|
||||
if output.type == "text" and output.text:
|
||||
print(f"{_GREEN}stdout:\n{output.text}{_RESET}")
|
||||
elif output.type == "error" and output.error_details:
|
||||
print(f"{_YELLOW}stderr:\n{output.error_details}{_RESET}")
|
||||
else:
|
||||
print(f"{_YELLOW}◀ {function_name} → {result!r}{_RESET}")
|
||||
|
||||
|
||||
@@ -289,38 +289,20 @@ class _FakeSessionContext:
|
||||
self.tools.append((source_id, tools))
|
||||
|
||||
|
||||
def _extract_execute_code_result(function_result: Content) -> Content:
|
||||
def _extract_text_output(function_result: Content) -> str:
|
||||
assert function_result.type == "function_result"
|
||||
assert function_result.exception is None, (
|
||||
f"execute_code raised {function_result.exception!r} with items={function_result.items!r}"
|
||||
)
|
||||
|
||||
code_result = next(
|
||||
(item for item in function_result.items or [] if item.type == "code_interpreter_tool_result"),
|
||||
text_output = next(
|
||||
(item for item in function_result.items or [] if item.type == "text" and item.text is not None),
|
||||
None,
|
||||
)
|
||||
if code_result is not None:
|
||||
return code_result
|
||||
|
||||
text_outputs = [item for item in function_result.items or [] if item.type == "text"]
|
||||
if text_outputs:
|
||||
return Content.from_code_interpreter_tool_result(outputs=text_outputs)
|
||||
|
||||
if text_output is not None and text_output.text is not None:
|
||||
return text_output.text
|
||||
if function_result.result:
|
||||
return Content.from_code_interpreter_tool_result(outputs=[Content.from_text(function_result.result)])
|
||||
|
||||
raise AssertionError(f"execute_code returned no usable outputs: {function_result.items!r}")
|
||||
|
||||
|
||||
def _extract_text_output(result_content: Content) -> str:
|
||||
code_result = _extract_execute_code_result(result_content)
|
||||
text_output = next(
|
||||
(item for item in code_result.outputs or [] if item.type == "text" and item.text is not None), None
|
||||
)
|
||||
assert text_output is not None and text_output.text is not None, (
|
||||
f"Expected text output from execute_code, got {code_result.outputs!r}"
|
||||
)
|
||||
return text_output.text
|
||||
return function_result.result
|
||||
raise AssertionError(f"Expected text output from execute_code, got {function_result.items!r}")
|
||||
|
||||
|
||||
class _FakeCodeActChatClient(FunctionInvocationLayer[Any], BaseChatClient[Any]):
|
||||
@@ -432,7 +414,7 @@ async def test_execute_code_tool_populates_input_dir_with_workspace_and_file_mou
|
||||
)
|
||||
result = await execute_code.invoke(arguments={"code": "None"})
|
||||
|
||||
assert result[0].type == "code_interpreter_tool_result"
|
||||
assert result[0].type == "text"
|
||||
assert _FakeSandbox.instances[0].input_dir is not None
|
||||
|
||||
input_root = Path(_FakeSandbox.instances[0].input_dir)
|
||||
@@ -493,11 +475,9 @@ async def test_execute_code_tool_executes_with_structured_content(monkeypatch: p
|
||||
|
||||
result = await execute_code.invoke(arguments={"code": "create-output"})
|
||||
|
||||
assert result[0].type == "code_interpreter_tool_result"
|
||||
assert result[0].outputs is not None
|
||||
assert result[0].outputs[0].type == "text"
|
||||
assert result[0].outputs[0].text == "done\n"
|
||||
assert any(item.type == "data" for item in result[0].outputs)
|
||||
assert result[0].type == "text"
|
||||
assert result[0].text == "done\n"
|
||||
assert any(item.type == "data" for item in result)
|
||||
assert _FakeSandbox.instances[0].allowed_domains == [("api.example.com", ["GET"])]
|
||||
assert "compute" in _FakeSandbox.instances[0].registered_tools
|
||||
|
||||
@@ -512,11 +492,8 @@ async def test_execute_code_tool_collects_output_files_without_backend_listing(
|
||||
)
|
||||
result = await execute_code.invoke(arguments={"code": "create-output"})
|
||||
|
||||
assert result[0].type == "code_interpreter_tool_result"
|
||||
assert result[0].outputs is not None
|
||||
assert any(
|
||||
item.type == "data" and item.additional_properties["path"] == "/output/report.txt" for item in result[0].outputs
|
||||
)
|
||||
assert result[0].type == "text"
|
||||
assert any(item.type == "data" and item.additional_properties["path"] == "/output/report.txt" for item in result)
|
||||
|
||||
|
||||
async def test_execute_code_tool_waits_for_unlisted_output_files_to_appear(
|
||||
@@ -535,11 +512,7 @@ async def test_execute_code_tool_waits_for_unlisted_output_files_to_appear(
|
||||
for writer_thread in _FakeSandboxWithDelayedUnlistedOutput.writer_threads:
|
||||
writer_thread.join()
|
||||
|
||||
assert result[0].type == "code_interpreter_tool_result"
|
||||
assert result[0].outputs is not None
|
||||
assert any(
|
||||
item.type == "data" and item.additional_properties["path"] == "/output/report.txt" for item in result[0].outputs
|
||||
)
|
||||
assert any(item.type == "data" and item.additional_properties["path"] == "/output/report.txt" for item in result)
|
||||
|
||||
|
||||
async def test_execute_code_tool_failure_returns_error_content(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
@@ -549,10 +522,8 @@ async def test_execute_code_tool_failure_returns_error_content(monkeypatch: pyte
|
||||
execute_code = HyperlightExecuteCodeTool()
|
||||
result = await execute_code.invoke(arguments={"code": "fail"})
|
||||
|
||||
assert result[0].type == "code_interpreter_tool_result"
|
||||
assert result[0].outputs is not None
|
||||
assert result[0].outputs[0].type == "error"
|
||||
assert result[0].outputs[0].error_details == "sandbox boom"
|
||||
assert result[0].type == "error"
|
||||
assert result[0].error_details == "sandbox boom"
|
||||
|
||||
|
||||
async def test_execute_code_tool_retries_allowed_domains_with_urls_when_backend_rejects_host_targets(
|
||||
@@ -596,7 +567,7 @@ async def test_execute_code_tool_retries_allowed_domains_with_urls_when_backend_
|
||||
execute_code = HyperlightExecuteCodeTool(allowed_domains=[("127.0.0.1:8080", "get")])
|
||||
result = await execute_code.invoke(arguments={"code": "None"})
|
||||
|
||||
assert result[0].type == "code_interpreter_tool_result"
|
||||
assert result[0].type == "text"
|
||||
assert len(_FakeStrictNetworkSandbox.instances) == 2
|
||||
assert _FakeStrictNetworkSandbox.instances[0].allowed_domains == [("127.0.0.1:8080", ["GET"])]
|
||||
assert _FakeStrictNetworkSandbox.instances[1].allowed_domains == [
|
||||
@@ -731,8 +702,7 @@ async def test_provider_run_tool_writes_files_with_real_sandbox(tmp_path: Path)
|
||||
}
|
||||
)
|
||||
|
||||
assert result[0].type == "code_interpreter_tool_result"
|
||||
outputs = result[0].outputs or []
|
||||
outputs = result
|
||||
error_outputs = [
|
||||
f"{item.message}: {item.error_details}"
|
||||
for item in outputs
|
||||
@@ -795,8 +765,7 @@ async def test_provider_run_tool_pings_bing_with_real_sandbox() -> None:
|
||||
}
|
||||
)
|
||||
|
||||
assert result[0].type == "code_interpreter_tool_result"
|
||||
outputs = result[0].outputs or []
|
||||
outputs = result
|
||||
error_outputs = [
|
||||
f"{item.message}: {item.error_details}"
|
||||
for item in outputs
|
||||
@@ -823,9 +792,7 @@ async def test_sandbox_runs_simple_code(restored_sandbox) -> None:
|
||||
|
||||
@skip_if_hyperlight_integration_tests_disabled
|
||||
async def test_sandbox_stdout_and_stderr_captured(restored_sandbox) -> None:
|
||||
result = restored_sandbox.run(
|
||||
'import sys\nprint("out")\nprint("err", file=sys.stderr)'
|
||||
)
|
||||
result = restored_sandbox.run('import sys\nprint("out")\nprint("err", file=sys.stderr)')
|
||||
assert result.success
|
||||
assert "out" in result.stdout
|
||||
assert "err" in result.stderr
|
||||
@@ -910,24 +877,17 @@ async def test_output_dir_cleared_between_invocations() -> None:
|
||||
|
||||
# First invocation: write a file
|
||||
result1 = await run_tool.invoke(
|
||||
arguments={
|
||||
"code": (
|
||||
'with open("/output/stale.txt", "w") as f:\n'
|
||||
' f.write("first")\n'
|
||||
'print("wrote")\n'
|
||||
)
|
||||
}
|
||||
arguments={"code": ('with open("/output/stale.txt", "w") as f:\n f.write("first")\nprint("wrote")\n')}
|
||||
)
|
||||
assert result1[0].type == "code_interpreter_tool_result"
|
||||
outputs1 = result1[0].outputs or []
|
||||
assert result1[0].type == "text" or result1[0].type == "data"
|
||||
outputs1 = result1
|
||||
assert any(
|
||||
item.type == "data" and "stale.txt" in (item.additional_properties or {}).get("path", "")
|
||||
for item in outputs1
|
||||
item.type == "data" and "stale.txt" in (item.additional_properties or {}).get("path", "") for item in outputs1
|
||||
), "First invocation should produce stale.txt"
|
||||
|
||||
# Second invocation: no file writes
|
||||
result2 = await run_tool.invoke(arguments={"code": 'print("clean")\n'})
|
||||
outputs2 = result2[0].outputs or []
|
||||
outputs2 = result2
|
||||
stale_files = [
|
||||
item
|
||||
for item in outputs2
|
||||
@@ -971,11 +931,9 @@ async def test_run_code_does_not_block_event_loop() -> None:
|
||||
concurrent_ran = True
|
||||
release.set()
|
||||
|
||||
code_task = asyncio.create_task(
|
||||
run_tool.invoke(arguments={"code": 'print("done")\n'})
|
||||
)
|
||||
code_task = asyncio.create_task(run_tool.invoke(arguments={"code": 'print("done")\n'}))
|
||||
await _concurrent_task()
|
||||
result = await code_task
|
||||
|
||||
assert concurrent_ran, "Event loop was blocked during sandbox execution"
|
||||
assert result[0].type == "code_interpreter_tool_result"
|
||||
assert result[0].type == "text"
|
||||
|
||||
Reference in New Issue
Block a user