mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
178 lines
7.6 KiB
C#
178 lines
7.6 KiB
C#
// Copyright (c) Microsoft. All rights reserved.
|
|
|
|
// This sample shows how to use Computer Use Tool with AI Agents.
|
|
|
|
using Azure.AI.Projects;
|
|
using Azure.AI.Projects.OpenAI;
|
|
using Azure.Identity;
|
|
using Microsoft.Agents.AI;
|
|
using Microsoft.Extensions.AI;
|
|
using OpenAI.Responses;
|
|
|
|
namespace Demo.ComputerUse;
|
|
|
|
internal sealed class Program
|
|
{
|
|
private static async Task Main(string[] args)
|
|
{
|
|
string endpoint = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_ENDPOINT") ?? throw new InvalidOperationException("AZURE_FOUNDRY_PROJECT_ENDPOINT is not set.");
|
|
string deploymentName = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME") ?? "computer-use-preview";
|
|
|
|
// WARNING: DefaultAzureCredential is convenient for development but requires careful consideration in production.
|
|
// In production, consider using a specific credential (e.g., ManagedIdentityCredential) to avoid
|
|
// latency issues, unintended credential probing, and potential security risks from fallback mechanisms.
|
|
// Get a client to create/retrieve/delete server side agents with Azure Foundry Agents.
|
|
AIProjectClient aiProjectClient = new(new Uri(endpoint), new DefaultAzureCredential());
|
|
const string AgentInstructions = @"
|
|
You are a computer automation assistant.
|
|
|
|
Be direct and efficient. When you reach the search results page, read and describe the actual search result titles and descriptions you can see.
|
|
";
|
|
|
|
const string AgentNameMEAI = "ComputerAgent-MEAI";
|
|
const string AgentNameNative = "ComputerAgent-NATIVE";
|
|
|
|
// Option 1 - Using ComputerUseTool + AgentOptions (MEAI + AgentFramework)
|
|
// Create AIAgent directly
|
|
AIAgent agentOption1 = await aiProjectClient.CreateAIAgentAsync(
|
|
name: AgentNameMEAI,
|
|
model: deploymentName,
|
|
instructions: AgentInstructions,
|
|
description: "Computer automation agent with screen interaction capabilities.",
|
|
tools: [
|
|
ResponseTool.CreateComputerTool(ComputerToolEnvironment.Browser, 1026, 769).AsAITool(),
|
|
]);
|
|
|
|
// Option 2 - Using PromptAgentDefinition SDK native type
|
|
// Create the server side agent version
|
|
AIAgent agentOption2 = await aiProjectClient.CreateAIAgentAsync(
|
|
name: AgentNameNative,
|
|
creationOptions: new AgentVersionCreationOptions(
|
|
new PromptAgentDefinition(model: deploymentName)
|
|
{
|
|
Instructions = AgentInstructions,
|
|
Tools = { ResponseTool.CreateComputerTool(
|
|
environment: new ComputerToolEnvironment("windows"),
|
|
displayWidth: 1026,
|
|
displayHeight: 769) }
|
|
})
|
|
);
|
|
|
|
// Either invoke option1 or option2 agent, should have same result
|
|
// Option 1
|
|
await InvokeComputerUseAgentAsync(agentOption1);
|
|
|
|
// Option 2
|
|
//await InvokeComputerUseAgentAsync(agentOption2);
|
|
|
|
// Cleanup by agent name removes the agent version created.
|
|
await aiProjectClient.Agents.DeleteAgentAsync(agentOption1.Name);
|
|
await aiProjectClient.Agents.DeleteAgentAsync(agentOption2.Name);
|
|
}
|
|
|
|
private static async Task InvokeComputerUseAgentAsync(AIAgent agent)
|
|
{
|
|
// Load screenshot assets
|
|
Dictionary<string, byte[]> screenshots = ComputerUseUtil.LoadScreenshotAssets();
|
|
|
|
ChatOptions chatOptions = new();
|
|
CreateResponseOptions responseCreationOptions = new()
|
|
{
|
|
TruncationMode = ResponseTruncationMode.Auto
|
|
};
|
|
chatOptions.RawRepresentationFactory = (_) => responseCreationOptions;
|
|
ChatClientAgentRunOptions runOptions = new(chatOptions)
|
|
{
|
|
AllowBackgroundResponses = true,
|
|
};
|
|
|
|
AgentSession session = await agent.CreateSessionAsync();
|
|
|
|
ChatMessage message = new(ChatRole.User, [
|
|
new TextContent("I need you to help me search for 'OpenAI news'. Please type 'OpenAI news' and submit the search. Once you see search results, the task is complete."),
|
|
new DataContent(new BinaryData(screenshots["browser_search"]), "image/png")
|
|
]);
|
|
|
|
// Initial request with screenshot - start with Bing search page
|
|
Console.WriteLine("Starting computer automation session (initial screenshot: cua_browser_search.png)...");
|
|
|
|
AgentResponse response = await agent.RunAsync(message, session: session, options: runOptions);
|
|
|
|
// Main interaction loop
|
|
const int MaxIterations = 10;
|
|
int iteration = 0;
|
|
// Initialize state machine
|
|
SearchState currentState = SearchState.Initial;
|
|
string initialCallId = string.Empty;
|
|
|
|
while (true)
|
|
{
|
|
// Poll until the response is complete.
|
|
while (response.ContinuationToken is { } token)
|
|
{
|
|
// Wait before polling again.
|
|
await Task.Delay(TimeSpan.FromSeconds(2));
|
|
|
|
// Continue with the token.
|
|
runOptions.ContinuationToken = token;
|
|
|
|
response = await agent.RunAsync(session, runOptions);
|
|
}
|
|
|
|
Console.WriteLine($"Agent response received (ID: {response.ResponseId})");
|
|
|
|
if (iteration >= MaxIterations)
|
|
{
|
|
Console.WriteLine($"\nReached maximum iterations ({MaxIterations}). Stopping.");
|
|
break;
|
|
}
|
|
|
|
iteration++;
|
|
Console.WriteLine($"\n--- Iteration {iteration} ---");
|
|
|
|
// Check for computer calls in the response
|
|
IEnumerable<ComputerCallResponseItem> computerCallResponseItems = response.Messages
|
|
.SelectMany(x => x.Contents)
|
|
.Where(c => c.RawRepresentation is ComputerCallResponseItem and not null)
|
|
.Select(c => (ComputerCallResponseItem)c.RawRepresentation!);
|
|
|
|
ComputerCallResponseItem? firstComputerCall = computerCallResponseItems.FirstOrDefault();
|
|
if (firstComputerCall is null)
|
|
{
|
|
Console.WriteLine("No computer call actions found. Ending interaction.");
|
|
Console.WriteLine($"Final Response: {response}");
|
|
break;
|
|
}
|
|
|
|
// Process the first computer call response
|
|
ComputerCallAction action = firstComputerCall.Action;
|
|
string currentCallId = firstComputerCall.CallId;
|
|
|
|
// Set the initial computer call ID for tracking and subsequent responses.
|
|
if (string.IsNullOrEmpty(initialCallId))
|
|
{
|
|
initialCallId = currentCallId;
|
|
}
|
|
|
|
Console.WriteLine($"Processing computer call (ID: {currentCallId})");
|
|
|
|
// Simulate executing the action and taking a screenshot
|
|
(SearchState CurrentState, byte[] ImageBytes) screenInfo = ComputerUseUtil.HandleComputerActionAndTakeScreenshot(action, currentState, screenshots);
|
|
currentState = screenInfo.CurrentState;
|
|
|
|
Console.WriteLine("Sending action result back to agent...");
|
|
|
|
AIContent content = new()
|
|
{
|
|
RawRepresentation = new ComputerCallOutputResponseItem(
|
|
initialCallId,
|
|
output: ComputerCallOutput.CreateScreenshotOutput(new BinaryData(screenInfo.ImageBytes), "image/png"))
|
|
};
|
|
|
|
// Follow-up message with action result and new screenshot
|
|
message = new(ChatRole.User, [content]);
|
|
response = await agent.RunAsync(message, session: session, options: runOptions);
|
|
}
|
|
}
|
|
}
|