mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
Merge branch 'main' into copilot/add-unit-tests-workflows-routebuilder
This commit is contained in:
@@ -298,6 +298,7 @@
|
||||
</Folder>
|
||||
<Folder Name="/Samples/03-workflows/Evaluation/">
|
||||
<Project Path="samples/03-workflows/Evaluation/Evaluation_WorkflowEval/Evaluation_WorkflowEval.csproj" />
|
||||
<Project Path="samples/03-workflows/Evaluation/Evaluation_WorkflowExpectedOutputs/Evaluation_WorkflowExpectedOutputs.csproj" />
|
||||
</Folder>
|
||||
<Folder Name="/Samples/04-hosting/">
|
||||
</Folder>
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
<Project>
|
||||
<PropertyGroup>
|
||||
<!-- Central version prefix - applies to all nuget packages. -->
|
||||
<VersionPrefix>1.5.0</VersionPrefix>
|
||||
<VersionPrefix>1.6.0</VersionPrefix>
|
||||
<RCNumber>1</RCNumber>
|
||||
<DateSuffix>260507</DateSuffix>
|
||||
<DateSuffix>260512</DateSuffix>
|
||||
<PackageVersion Condition="'$(IsReleaseCandidate)' == 'true'">$(VersionPrefix)-rc$(RCNumber)</PackageVersion>
|
||||
<PackageVersion Condition="'$(IsReleaseCandidate)' != 'true' AND '$(VersionSuffix)' != ''">$(VersionPrefix)-$(VersionSuffix).$(DateSuffix).1</PackageVersion>
|
||||
<PackageVersion Condition="'$(IsReleaseCandidate)' != 'true' AND '$(VersionSuffix)' == ''">$(VersionPrefix)-preview.$(DateSuffix).1</PackageVersion>
|
||||
<PackageVersion Condition="'$(IsReleased)' == 'true'">$(VersionPrefix)</PackageVersion>
|
||||
<GitTag>1.5.0</GitTag>
|
||||
<GitTag>1.6.0</GitTag>
|
||||
|
||||
<Configurations>Debug;Release;Publish</Configurations>
|
||||
<IsPackable>true</IsPackable>
|
||||
|
||||
+16
@@ -0,0 +1,16 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFrameworks>net10.0</TargetFrameworks>
|
||||
|
||||
<Nullable>enable</Nullable>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\..\..\src\Microsoft.Agents.AI.Foundry\Microsoft.Agents.AI.Foundry.csproj" />
|
||||
<ProjectReference Include="..\..\..\..\src\Microsoft.Agents.AI.Workflows\Microsoft.Agents.AI.Workflows.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
@@ -0,0 +1,76 @@
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
|
||||
// This sample demonstrates evaluating a multi-agent workflow against a
|
||||
// golden answer using Foundry's reference-based Similarity evaluator.
|
||||
|
||||
using Azure.AI.Projects;
|
||||
using Azure.Identity;
|
||||
using Microsoft.Agents.AI;
|
||||
using Microsoft.Agents.AI.Workflows;
|
||||
using Microsoft.Extensions.AI;
|
||||
using FoundryEvals = Microsoft.Agents.AI.Foundry.FoundryEvals;
|
||||
|
||||
// Configuration: the project endpoint is mandatory, the model deployment falls back to a default.
string? configuredEndpoint = Environment.GetEnvironmentVariable("AZURE_AI_PROJECT_ENDPOINT");
if (configuredEndpoint is null)
{
    throw new InvalidOperationException("AZURE_AI_PROJECT_ENDPOINT is not set.");
}

string endpoint = configuredEndpoint;
string deploymentName = Environment.GetEnvironmentVariable("AZURE_AI_MODEL_DEPLOYMENT_NAME") ?? "gpt-4o-mini";

// WARNING: DefaultAzureCredential is convenient for development but requires careful consideration in production.
// In production, consider using a specific credential (e.g., ManagedIdentityCredential) to avoid
// latency issues, unintended credential probing, and potential security risks from fallback mechanisms.
var projectUri = new Uri(endpoint);
AIProjectClient projectClient = new(projectUri, new DefaultAzureCredential());

// Both agents are bound with EmitAgentResponseEvents turned on, so the run carries an
// AgentResponseEvent per agent; EvaluateAsync reads those to locate the final answer.
var hostOptions = new AIAgentHostOptions { EmitAgentResponseEvents = true };

// Stage 1: the researcher drafts a short factual answer.
AIAgent researcher = projectClient.AsAIAgent(
    model: deploymentName,
    instructions: "You research questions and produce a short factual draft answer.",
    name: "researcher");

// Stage 2: the editor turns the draft into the final response that is compared to ground truth.
AIAgent editor = projectClient.AsAIAgent(
    model: deploymentName,
    instructions: "You take a draft answer and produce the final concise response.",
    name: "editor");

ExecutorBinding researcherExecutor = researcher.BindAsExecutor(hostOptions);
ExecutorBinding editorExecutor = editor.BindAsExecutor(hostOptions);

// researcher -> editor
Workflow workflow = new WorkflowBuilder(researcherExecutor)
    .AddEdge(researcherExecutor, editorExecutor)
    .Build();

// The question to ask and the golden answer to score against.
const string Query = "What is the capital of France?";
const string GroundTruth = "Paris";

ChatMessage userMessage = new(ChatRole.User, Query);
await using Run run = await InProcessExecution.RunAsync(workflow, userMessage);

// Score the workflow's final answer with the reference-based Similarity evaluator.
// 'expectedOutput' is stamped onto the overall EvalItem.ExpectedOutput and reaches
// Foundry as `ground_truth` in the underlying JSONL payload.
//
// includePerAgent is false on purpose: the ground truth describes the final answer only.
// Per-agent items carry no ExpectedOutput, so running Similarity on them would fail validation.
FoundryEvals similarity = new(projectClient, deploymentName, FoundryEvals.Similarity);

AgentEvaluationResults results = await run.EvaluateAsync(
    similarity,
    includePerAgent: false,
    expectedOutput: GroundTruth);

// Summary.
Console.WriteLine($"Query: {Query}");
Console.WriteLine($"Expected: {GroundTruth}");
Console.WriteLine($"Provider: {results.ProviderName}");
Console.WriteLine($"Passed: {results.Passed}/{results.Total}");

// The report link is optional; print it only when the provider returned one.
if (results.ReportUrl is { } reportUrl)
{
    Console.WriteLine($"Report: {reportUrl}");
}
|
||||
@@ -0,0 +1,37 @@
|
||||
# Evaluation - Workflow Expected Outputs
|
||||
|
||||
This sample demonstrates evaluating a multi-agent workflow's final answer
|
||||
against a golden expected output using Foundry's reference-based **Similarity**
|
||||
evaluator.
|
||||
|
||||
## What this sample demonstrates
|
||||
|
||||
- Building a small researcher → editor workflow
|
||||
- Running the workflow and obtaining a `Run`
|
||||
- Calling `run.EvaluateAsync(evaluator, expectedOutput: ...)` to attach a
|
||||
ground-truth answer to the overall workflow item
|
||||
- Using `FoundryEvals.Similarity`, which requires a `ground_truth` value
|
||||
per item
|
||||
|
||||
The `expectedOutput` value is stamped onto the overall `EvalItem.ExpectedOutput`
|
||||
and is surfaced to Foundry as `ground_truth` in the JSONL payload sent to the
|
||||
Evals API.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- .NET 10 SDK or later
|
||||
- Azure CLI installed and authenticated (`az login`)
|
||||
|
||||
Set the following environment variables:
|
||||
|
||||
```powershell
|
||||
$env:AZURE_AI_PROJECT_ENDPOINT="https://your-foundry-service.services.ai.azure.com/api/projects/your-foundry-project"
|
||||
$env:AZURE_AI_MODEL_DEPLOYMENT_NAME="gpt-4o-mini"
|
||||
```
|
||||
|
||||
## Run the sample
|
||||
|
||||
```powershell
|
||||
cd dotnet/samples/03-workflows/Evaluation
|
||||
dotnet run --project .\Evaluation_WorkflowExpectedOutputs
|
||||
```
|
||||
@@ -130,6 +130,7 @@ internal static class FoundryEvalConverter
|
||||
QueryMessages = ConvertMessages(queryMessages),
|
||||
ResponseMessages = ConvertMessages(responseMessages),
|
||||
Context = item.Context,
|
||||
GroundTruth = item.ExpectedOutput,
|
||||
ToolDefinitions = item.Tools is { Count: > 0 }
|
||||
? item.Tools
|
||||
.OfType<AIFunction>()
|
||||
@@ -185,6 +186,11 @@ internal static class FoundryEvalConverter
|
||||
dataMapping["context"] = "{{item.context}}";
|
||||
}
|
||||
|
||||
if (GroundTruthEvaluators.Contains(qualified))
|
||||
{
|
||||
dataMapping["ground_truth"] = "{{item.ground_truth}}";
|
||||
}
|
||||
|
||||
if (ToolEvaluators.Contains(qualified))
|
||||
{
|
||||
dataMapping["tool_definitions"] = "{{item.tool_definitions}}";
|
||||
@@ -206,7 +212,7 @@ internal static class FoundryEvalConverter
|
||||
/// <summary>
|
||||
/// Builds the <c>item_schema</c> for custom JSONL eval definitions.
|
||||
/// </summary>
|
||||
internal static WireItemSchema BuildItemSchema(bool hasContext = false, bool hasTools = false)
|
||||
internal static WireItemSchema BuildItemSchema(bool hasContext = false, bool hasTools = false, bool hasGroundTruth = false)
|
||||
{
|
||||
var properties = new Dictionary<string, WireSchemaProperty>
|
||||
{
|
||||
@@ -221,6 +227,11 @@ internal static class FoundryEvalConverter
|
||||
properties["context"] = new WireSchemaProperty { Type = "string" };
|
||||
}
|
||||
|
||||
if (hasGroundTruth)
|
||||
{
|
||||
properties["ground_truth"] = new WireSchemaProperty { Type = "string" };
|
||||
}
|
||||
|
||||
if (hasTools)
|
||||
{
|
||||
properties["tool_definitions"] = new WireSchemaProperty { Type = "array" };
|
||||
@@ -233,6 +244,31 @@ internal static class FoundryEvalConverter
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
/// Lists the requested evaluators that depend on a ground-truth (reference) value
/// when the caller reports that ground truth is not available.
/// </summary>
/// <param name="evaluators">
/// Evaluator names to inspect. Each name is passed through <c>ResolveEvaluator</c>
/// before being looked up in <c>GroundTruthEvaluators</c>.
/// </param>
/// <param name="hasGroundTruth">
/// When <see langword="true"/>, nothing is reported and <paramref name="evaluators"/> is not enumerated.
/// </param>
/// <returns>
/// The entries of <paramref name="evaluators"/> (as supplied, in their original order) whose
/// resolved name is a ground-truth evaluator; an empty list when <paramref name="hasGroundTruth"/>
/// is <see langword="true"/> or no entry qualifies.
/// </returns>
internal static List<string> FindMissingGroundTruthEvaluators(
    IEnumerable<string> evaluators,
    bool hasGroundTruth)
{
    return hasGroundTruth
        ? new List<string>()
        : evaluators
            .Where(candidate => GroundTruthEvaluators.Contains(ResolveEvaluator(candidate)))
            .ToList();
}
|
||||
|
||||
/// <summary>
|
||||
/// Resolves a short evaluator name to its fully-qualified <c>builtin.*</c> form.
|
||||
/// </summary>
|
||||
@@ -277,6 +313,12 @@ internal static class FoundryEvalConverter
|
||||
"builtin.tool_call_success",
|
||||
};
|
||||
|
||||
// Fully-qualified names of the evaluators that need a ground_truth (reference) value on each item.
// Membership checks ignore case (ordinal comparison).
internal static readonly HashSet<string> GroundTruthEvaluators =
    new HashSet<string>(StringComparer.OrdinalIgnoreCase) { "builtin.similarity" };
|
||||
|
||||
// Short name → fully-qualified name mapping.
|
||||
internal static readonly Dictionary<string, string> BuiltinEvaluators = new(StringComparer.OrdinalIgnoreCase)
|
||||
{
|
||||
|
||||
@@ -103,6 +103,9 @@ internal sealed class WireEvalItemPayload
|
||||
[JsonPropertyName("context")]
|
||||
public string? Context { get; init; }
|
||||
|
||||
[JsonPropertyName("ground_truth")]
|
||||
public string? GroundTruth { get; init; }
|
||||
|
||||
[JsonPropertyName("tool_definitions")]
|
||||
public List<WireToolDefinition>? ToolDefinitions { get; init; }
|
||||
}
|
||||
|
||||
@@ -145,6 +145,8 @@ public sealed class FoundryEvals : IAgentEvaluator
|
||||
|
||||
bool hasContext = payloads.Any(p => p.Context is not null);
|
||||
bool hasTools = payloads.Any(p => p.ToolDefinitions is { Count: > 0 });
|
||||
bool hasGroundTruth = payloads.Any(p => p.GroundTruth is not null);
|
||||
bool allHaveGroundTruth = payloads.Count > 0 && payloads.All(p => p.GroundTruth is not null);
|
||||
|
||||
// Filter out tool evaluators if no items have tools; auto-add ToolCallAccuracy if tools present
|
||||
var evaluators = FilterToolEvaluators(this._evaluatorNames, hasTools);
|
||||
@@ -153,13 +155,27 @@ public sealed class FoundryEvals : IAgentEvaluator
|
||||
evaluators = [.. evaluators, ToolCallAccuracy];
|
||||
}
|
||||
|
||||
// Fail fast if a ground-truth evaluator (e.g. similarity) is requested but not
|
||||
// every item carries an ExpectedOutput. Reference-based evaluators score each
|
||||
// item against its own ground truth, so even one missing value will surface as
|
||||
// a provider-side validation error. Catch it here with a clearer message.
|
||||
var missingGroundTruth = FoundryEvalConverter.FindMissingGroundTruthEvaluators(evaluators, allHaveGroundTruth);
|
||||
if (missingGroundTruth.Count > 0)
|
||||
{
|
||||
throw new InvalidOperationException(
|
||||
"The following evaluator(s) require a ground-truth/expected output on every item but " +
|
||||
$"at least one item is missing an {nameof(EvalItem.ExpectedOutput)}: {string.Join(", ", missingGroundTruth)}. " +
|
||||
"Provide an expected output per item (for example via the 'expectedOutput' parameter on EvaluateAsync), " +
|
||||
"or set 'includePerAgent: false' so the evaluator only runs on the overall item.");
|
||||
}
|
||||
|
||||
// 2. Create the evaluation definition
|
||||
var createEvalPayload = new WireCreateEvalRequest
|
||||
{
|
||||
Name = evalName,
|
||||
DataSourceConfig = new WireCustomDataSourceConfig
|
||||
{
|
||||
ItemSchema = FoundryEvalConverter.BuildItemSchema(hasContext, hasTools),
|
||||
ItemSchema = FoundryEvalConverter.BuildItemSchema(hasContext, hasTools, hasGroundTruth),
|
||||
},
|
||||
TestingCriteria = FoundryEvalConverter.BuildTestingCriteria(
|
||||
evaluators, this._model, includeDataMapping: true),
|
||||
@@ -822,15 +838,15 @@ public sealed class FoundryEvals : IAgentEvaluator
|
||||
var result = new EvalItemResult(itemId, status, scores);
|
||||
|
||||
// Extract error info from sample
|
||||
if (outputItem.TryGetProperty("sample", out var sample))
|
||||
if (outputItem.TryGetProperty("sample", out var sample) && sample.ValueKind == JsonValueKind.Object)
|
||||
{
|
||||
if (sample.TryGetProperty("error", out var errObj))
|
||||
if (sample.TryGetProperty("error", out var errObj) && errObj.ValueKind == JsonValueKind.Object)
|
||||
{
|
||||
result.ErrorCode = errObj.TryGetProperty("code", out var code) ? code.GetString() : null;
|
||||
result.ErrorMessage = errObj.TryGetProperty("message", out var msg) ? msg.GetString() : null;
|
||||
}
|
||||
|
||||
if (sample.TryGetProperty("usage", out var usage) && usage.TryGetProperty("total_tokens", out var tt) && tt.ValueKind == JsonValueKind.Number)
|
||||
if (sample.TryGetProperty("usage", out var usage) && usage.ValueKind == JsonValueKind.Object && usage.TryGetProperty("total_tokens", out var tt) && tt.ValueKind == JsonValueKind.Number)
|
||||
{
|
||||
var tokenUsage = new Dictionary<string, int>();
|
||||
if (usage.TryGetProperty("prompt_tokens", out var pt) && pt.ValueKind == JsonValueKind.Number)
|
||||
@@ -886,7 +902,7 @@ public sealed class FoundryEvals : IAgentEvaluator
|
||||
}
|
||||
|
||||
// Extract response_id from datasource_item
|
||||
if (outputItem.TryGetProperty("datasource_item", out var dsItem))
|
||||
if (outputItem.TryGetProperty("datasource_item", out var dsItem) && dsItem.ValueKind == JsonValueKind.Object)
|
||||
{
|
||||
if (dsItem.TryGetProperty("resp_id", out var respId))
|
||||
{
|
||||
|
||||
+111
-21
@@ -28,6 +28,17 @@ public static class WorkflowEvaluationExtensions
|
||||
/// Use <see cref="ConversationSplitters.LastTurn"/>, <see cref="ConversationSplitters.Full"/>,
|
||||
/// or a custom <see cref="IConversationSplitter"/> implementation.
|
||||
/// </param>
|
||||
/// <param name="expectedOutput">
|
||||
/// Optional ground-truth/expected output for the workflow's overall final answer.
|
||||
/// When provided, it is stamped onto the overall <see cref="EvalItem.ExpectedOutput"/>
|
||||
/// so reference-based evaluators (for example, similarity) can compare the
|
||||
/// workflow's response against a golden answer. Ground truth is only applied
|
||||
/// to the overall item; per-agent items are intentionally left without an
|
||||
/// expected output, since ground truth is defined against the final response.
|
||||
/// When using a reference-based evaluator that requires ground truth, set
|
||||
/// <paramref name="includePerAgent"/> to <see langword="false"/> to avoid
|
||||
/// invoking the evaluator on per-agent items that have no expected output.
|
||||
/// </param>
|
||||
/// <param name="cancellationToken">Cancellation token.</param>
|
||||
/// <returns>Evaluation results with optional per-agent sub-results.</returns>
|
||||
public static async Task<AgentEvaluationResults> EvaluateAsync(
|
||||
@@ -37,6 +48,7 @@ public static class WorkflowEvaluationExtensions
|
||||
bool includePerAgent = true,
|
||||
string evalName = "Workflow Eval",
|
||||
IConversationSplitter? splitter = null,
|
||||
string? expectedOutput = null,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
var events = run.OutgoingEvents.ToList();
|
||||
@@ -48,28 +60,26 @@ public static class WorkflowEvaluationExtensions
|
||||
var overallItems = new List<EvalItem>();
|
||||
if (includeOverall)
|
||||
{
|
||||
var finalResponse = events.OfType<AgentResponseEvent>().LastOrDefault();
|
||||
if (finalResponse is not null)
|
||||
var overallItem = BuildOverallItem(events, splitter, expectedOutput);
|
||||
if (overallItem is not null)
|
||||
{
|
||||
var firstInvoked = events.OfType<ExecutorInvokedEvent>().FirstOrDefault();
|
||||
var query = firstInvoked?.Data switch
|
||||
{
|
||||
ChatMessage cm => cm.Text ?? string.Empty,
|
||||
IReadOnlyList<ChatMessage> msgs => msgs.LastOrDefault(m => m.Role == ChatRole.User)?.Text ?? string.Empty,
|
||||
string s => s,
|
||||
_ => firstInvoked?.Data?.ToString() ?? string.Empty,
|
||||
};
|
||||
var conversation = new List<ChatMessage>
|
||||
{
|
||||
new(ChatRole.User, query),
|
||||
};
|
||||
|
||||
conversation.AddRange(finalResponse.Response.Messages);
|
||||
|
||||
overallItems.Add(new EvalItem(query, finalResponse.Response.Text, conversation)
|
||||
{
|
||||
Splitter = splitter,
|
||||
});
|
||||
overallItems.Add(overallItem);
|
||||
}
|
||||
else
|
||||
{
|
||||
// The caller asked for an overall evaluation but we couldn't find a final
|
||||
// response to score — almost always because the workflow's agents weren't
|
||||
// built with EmitAgentResponseEvents enabled (so no AgentResponseEvent was
|
||||
// emitted) and no terminal ExecutorCompletedEvent carried an AgentResponse
|
||||
// / ChatMessage / string payload. Fail loudly instead of silently returning
|
||||
// 0/0 (or skipping evaluation against a supplied expectedOutput).
|
||||
throw new InvalidOperationException(
|
||||
"Cannot evaluate the overall workflow output: no AgentResponseEvent or " +
|
||||
"ExecutorCompletedEvent with an AgentResponse/ChatMessage/string payload " +
|
||||
"was found in the run. Bind agents with " +
|
||||
"AIAgentHostOptions { EmitAgentResponseEvents = true } " +
|
||||
"(for example via agent.BindAsExecutor(new AIAgentHostOptions { EmitAgentResponseEvents = true })) " +
|
||||
"so the workflow surfaces the final agent response, or set 'includeOverall: false'.");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,6 +107,86 @@ public static class WorkflowEvaluationExtensions
|
||||
return overallResult;
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the single "overall" <see cref="EvalItem"/> for a workflow run from its recorded events.
/// </summary>
/// <param name="events">The run's events, in the order they were recorded.</param>
/// <param name="splitter">Optional conversation splitter copied onto the resulting item.</param>
/// <param name="expectedOutput">Optional ground truth copied onto <see cref="EvalItem.ExpectedOutput"/>.</param>
/// <returns>
/// The overall item, or <see langword="null"/> when the events contain neither an
/// <see cref="AgentResponseEvent"/> nor a non-internal <see cref="ExecutorCompletedEvent"/>
/// whose payload is an <see cref="AgentResponse"/>, a <see cref="ChatMessage"/> or a string.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The selected completion event carries a payload type other than the three handled ones.
/// </exception>
internal static EvalItem? BuildOverallItem(
    IReadOnlyList<WorkflowEvent> events,
    IConversationSplitter? splitter,
    string? expectedOutput)
{
    // The query is derived from whatever was handed to the first executor that was invoked.
    object? initialInput = events.OfType<ExecutorInvokedEvent>().FirstOrDefault()?.Data;
    string query;
    if (initialInput is ChatMessage singleMessage)
    {
        query = singleMessage.Text ?? string.Empty;
    }
    else if (initialInput is IReadOnlyList<ChatMessage> history)
    {
        // For a list of messages, the most recent user turn is the question being answered.
        query = history.LastOrDefault(m => m.Role == ChatRole.User)?.Text ?? string.Empty;
    }
    else if (initialInput is string rawText)
    {
        query = rawText;
    }
    else
    {
        query = initialInput?.ToString() ?? string.Empty;
    }

    // The conversation always opens with the user query; the final answer is appended below.
    var transcript = new List<ChatMessage> { new ChatMessage(ChatRole.User, query) };
    string answer;

    // First choice: the last AgentResponseEvent (only emitted when
    // AIAgentHostOptions.EmitAgentResponseEvents is enabled).
    AgentResponseEvent? responseEvent = events.OfType<AgentResponseEvent>().LastOrDefault();
    if (responseEvent is not null)
    {
        answer = responseEvent.Response.Text;
        transcript.AddRange(responseEvent.Response.Messages);
    }
    else
    {
        // Fallback: the last completion event from a non-internal executor that carries an
        // AgentResponse / ChatMessage / string payload.
        ExecutorCompletedEvent? terminal = events
            .OfType<ExecutorCompletedEvent>()
            .LastOrDefault(candidate =>
                !IsInternalExecutor(candidate.ExecutorId)
                && candidate.Data is AgentResponse or ChatMessage or string);

        if (terminal is null)
        {
            // Nothing usable to score.
            return null;
        }

        var payload = terminal.Data;
        if (payload is AgentResponse agentResponse)
        {
            answer = agentResponse.Text;
            transcript.AddRange(agentResponse.Messages);
        }
        else if (payload is ChatMessage message)
        {
            answer = message.Text ?? string.Empty;
            transcript.Add(message);
        }
        else if (payload is string text)
        {
            answer = text;
            transcript.Add(new ChatMessage(ChatRole.Assistant, text));
        }
        else
        {
            // Not expected to happen: the filter above only admits the three payload types
            // handled here. Throw so a future mismatch is visible rather than silently
            // dropping the overall item.
            throw new InvalidOperationException(
                "BuildOverallItem: unexpected ExecutorCompletedEvent.Data type " +
                $"'{terminal.Data?.GetType().FullName ?? "null"}'. Expected " +
                $"{nameof(AgentResponse)}, {nameof(ChatMessage)}, or string.");
        }
    }

    return new EvalItem(query, answer, transcript)
    {
        Splitter = splitter,
        ExpectedOutput = expectedOutput,
    };
}
|
||||
|
||||
internal static Dictionary<string, List<EvalItem>> ExtractAgentData(
|
||||
List<WorkflowEvent> events,
|
||||
IConversationSplitter? splitter)
|
||||
|
||||
@@ -179,6 +179,35 @@ public sealed class FoundryEvalConverterTests
|
||||
Assert.Null(payload.Context);
|
||||
}
|
||||
|
||||
[Fact]
public void ConvertEvalItem_WithExpectedOutput_PopulatesGroundTruth()
{
    // Arrange: an item whose expected output should be carried onto the wire payload.
    const string Golden = "the golden answer";
    var source = new EvalItem(query: "q", response: "r") { ExpectedOutput = Golden };

    // Act
    var wire = FoundryEvalConverter.ConvertEvalItem(source);

    // Assert
    Assert.Equal(Golden, wire.GroundTruth);
}
|
||||
|
||||
[Fact]
public void ConvertEvalItem_WithoutExpectedOutput_OmitsGroundTruth()
{
    // Arrange + Act: convert an item that never had ExpectedOutput assigned.
    var wire = FoundryEvalConverter.ConvertEvalItem(new EvalItem(query: "q", response: "r"));

    // Assert
    Assert.Null(wire.GroundTruth);
}
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// FoundryEvalConverter.BuildTestingCriteria tests
|
||||
// ---------------------------------------------------------------
|
||||
@@ -239,6 +268,33 @@ public sealed class FoundryEvalConverterTests
|
||||
Assert.Equal("{{item.context}}", mapping["context"]);
|
||||
}
|
||||
|
||||
[Fact]
public void BuildTestingCriteria_SimilarityEvaluator_IncludesGroundTruth()
{
    // Act: request only the similarity evaluator, with data mappings enabled.
    var criteria = FoundryEvalConverter.BuildTestingCriteria(
        ["similarity"], "gpt-4o-mini", includeDataMapping: true);

    // Assert: exactly one criterion, resolved to its builtin name.
    var criterion = Assert.Single(criteria);
    Assert.Equal("builtin.similarity", criterion.EvaluatorName);

    // Assert: its mapping routes ground_truth from the item.
    var mapping = criterion.DataMapping;
    Assert.NotNull(mapping);
    Assert.True(mapping.TryGetValue("ground_truth", out var template));
    Assert.Equal("{{item.ground_truth}}", template);
}
|
||||
|
||||
[Fact]
public void BuildTestingCriteria_NonGroundTruthEvaluator_OmitsGroundTruth()
{
    // Act: relevance is not a reference-based evaluator.
    var relevanceCriterion = FoundryEvalConverter.BuildTestingCriteria(
        ["relevance"], "gpt-4o-mini", includeDataMapping: true)[0];

    // Assert: a mapping exists but has no ground_truth entry.
    var mapping = relevanceCriterion.DataMapping;
    Assert.NotNull(mapping);
    Assert.False(mapping.ContainsKey("ground_truth"));
}
|
||||
|
||||
[Fact]
|
||||
public void BuildTestingCriteria_WithoutDataMapping_OmitsMappingField()
|
||||
{
|
||||
@@ -282,6 +338,59 @@ public sealed class FoundryEvalConverterTests
|
||||
Assert.True(schema.Properties.ContainsKey("tool_definitions"));
|
||||
}
|
||||
|
||||
[Fact]
public void BuildItemSchema_WithGroundTruth_IncludesGroundTruthProperty()
{
    // Act
    var properties = FoundryEvalConverter.BuildItemSchema(hasGroundTruth: true).Properties;

    // Assert: the property is present and typed as a string.
    Assert.True(properties.ContainsKey("ground_truth"));
    Assert.Equal("string", properties["ground_truth"].Type);
}
|
||||
|
||||
[Fact]
public void BuildItemSchema_WithoutGroundTruth_OmitsGroundTruthProperty()
{
    // Act: all flags left at their defaults.
    var properties = FoundryEvalConverter.BuildItemSchema().Properties;

    // Assert
    Assert.False(properties.ContainsKey("ground_truth"));
}
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// FoundryEvalConverter.FindMissingGroundTruthEvaluators tests
|
||||
// ---------------------------------------------------------------
|
||||
|
||||
[Fact]
public void FindMissingGroundTruthEvaluators_NoGroundTruth_ReturnsSimilarity()
{
    // Act: one reference-based evaluator and one that is not, with no ground truth available.
    var missing = FoundryEvalConverter.FindMissingGroundTruthEvaluators(
        ["similarity", "relevance"], hasGroundTruth: false);

    // Assert: only similarity is reported, under the name that was passed in.
    var reported = Assert.Single(missing);
    Assert.Equal("similarity", reported);
}
|
||||
|
||||
[Fact]
public void FindMissingGroundTruthEvaluators_HasGroundTruth_ReturnsEmpty()
{
    // Act: ground truth is available, so similarity has what it needs.
    string[] requested = ["similarity"];
    var missing = FoundryEvalConverter.FindMissingGroundTruthEvaluators(requested, hasGroundTruth: true);

    // Assert
    Assert.Empty(missing);
}
|
||||
|
||||
[Fact]
public void FindMissingGroundTruthEvaluators_NoGroundTruthEvaluators_ReturnsEmpty()
{
    // Act: neither requested evaluator is reference-based, so missing ground truth is irrelevant.
    string[] requested = ["relevance", "coherence"];
    var missing = FoundryEvalConverter.FindMissingGroundTruthEvaluators(requested, hasGroundTruth: false);

    // Assert
    Assert.Empty(missing);
}
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// FoundryEvalConverter.ConvertMessage DataContent test
|
||||
// ---------------------------------------------------------------
|
||||
|
||||
+9
-4
@@ -36,13 +36,18 @@ public sealed class InputWaiterTests : IDisposable
|
||||
{
|
||||
Task waitTask = this._waiter.WaitForInputAsync(TimeSpan.FromSeconds(5));
|
||||
|
||||
await Task.Delay(50);
|
||||
waitTask.IsCompleted.Should().BeFalse("the waiter should block until input is signaled");
|
||||
Task completedBeforeSignal = await Task.WhenAny(waitTask, Task.Delay(100));
|
||||
completedBeforeSignal.Should().NotBeSameAs(
|
||||
waitTask,
|
||||
"the waiter should not complete before input is signaled");
|
||||
|
||||
this._waiter.SignalInput();
|
||||
|
||||
Task completed = await Task.WhenAny(waitTask, Task.Delay(TimeSpan.FromSeconds(1)));
|
||||
completed.Should().BeSameAs(waitTask, "the wait task should complete after being signaled");
|
||||
Task completedAfterSignal = await Task.WhenAny(waitTask, Task.Delay(TimeSpan.FromSeconds(1)));
|
||||
completedAfterSignal.Should().BeSameAs(
|
||||
waitTask,
|
||||
"the wait task should complete after being signaled");
|
||||
|
||||
await waitTask;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.AI;
|
||||
@@ -290,6 +291,121 @@ public sealed class WorkflowEvaluationTests
|
||||
Assert.DoesNotContain("end", result.Keys);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// BuildOverallItem tests (expected output / ground truth)
|
||||
// ---------------------------------------------------------------
|
||||
|
||||
[Fact]
public void BuildOverallItem_NoCompletedExecutorWithResponse_ReturnsNull()
{
    // Arrange: an invocation only, with no AgentResponseEvent and no ExecutorCompletedEvent at all.
    var invocationOnly = new List<WorkflowEvent>();
    invocationOnly.Add(new ExecutorInvokedEvent("agent-1", "query"));

    // Act
    var overall = WorkflowEvaluationExtensions.BuildOverallItem(
        invocationOnly,
        splitter: null,
        expectedOutput: null);

    // Assert
    Assert.Null(overall);
}
|
||||
|
||||
[Fact]
public void BuildOverallItem_NoAgentResponseEvent_FallsBackToLastExecutorCompleted()
{
    // Arrange: completion events only (the default when EmitAgentResponseEvents is false).
    // The researcher finishes first with a draft, then the editor finishes with the final answer.
    var draft = new AgentResponse(new ChatMessage(ChatRole.Assistant, "draft"));
    var final = new AgentResponse(new ChatMessage(ChatRole.Assistant, "Paris"));
    var timeline = new List<WorkflowEvent>
    {
        new ExecutorInvokedEvent("researcher", "What is the capital of France?"),
        new ExecutorCompletedEvent("researcher", draft),
        new ExecutorInvokedEvent("editor", "draft"),
        new ExecutorCompletedEvent("editor", final),
    };

    // Act
    var overall = WorkflowEvaluationExtensions.BuildOverallItem(
        timeline,
        splitter: null,
        expectedOutput: "Paris");

    // Assert: query comes from the first invocation, response from the last completion.
    Assert.NotNull(overall);
    Assert.Equal("What is the capital of France?", overall.Query);
    Assert.Equal("Paris", overall.Response);
    Assert.Equal("Paris", overall.ExpectedOutput);
}
|
||||
|
||||
[Fact]
public void BuildOverallItem_WithFinalResponseAndExpectedOutput_StampsExpectedOutput()
{
    // Arrange: a single agent that both completes and emits an AgentResponseEvent.
    const string Question = "How many plans does Netlife offer?";
    const string Answer = "Ofrece 41 planes";
    var agentReply = new AgentResponse(new ChatMessage(ChatRole.Assistant, Answer));
    var timeline = new List<WorkflowEvent>
    {
        new ExecutorInvokedEvent("agent-1", Question),
        new ExecutorCompletedEvent("agent-1", agentReply),
        new AgentResponseEvent("agent-1", agentReply),
    };

    // Act
    var overall = WorkflowEvaluationExtensions.BuildOverallItem(
        timeline,
        splitter: null,
        expectedOutput: Answer);

    // Assert
    Assert.NotNull(overall);
    Assert.Equal(Question, overall.Query);
    Assert.Equal(Answer, overall.Response);
    Assert.Equal(Answer, overall.ExpectedOutput);
}
|
||||
|
||||
[Fact]
public void BuildOverallItem_WithFinalResponseAndNoExpectedOutput_LeavesExpectedOutputNull()
{
    // Arrange: a final response is available, but the caller supplies no ground truth.
    var agentReply = new AgentResponse(new ChatMessage(ChatRole.Assistant, "answer"));
    var timeline = new List<WorkflowEvent>
    {
        new ExecutorInvokedEvent("agent-1", "query"),
        new ExecutorCompletedEvent("agent-1", agentReply),
        new AgentResponseEvent("agent-1", agentReply),
    };

    // Act
    var overall = WorkflowEvaluationExtensions.BuildOverallItem(
        timeline,
        splitter: null,
        expectedOutput: null);

    // Assert: the item is built, with ExpectedOutput left unset.
    Assert.NotNull(overall);
    Assert.Null(overall.ExpectedOutput);
}
|
||||
|
||||
[Fact]
public async Task EvaluateAsync_WithIncludeOverallButNoFinalResponse_ThrowsAsync()
{
    // Arrange — the agent host is NOT bound with EmitAgentResponseEvents=true, so the run has
    // no AgentResponseEvent, and the host's ExecutorCompletedEvent carries null Data. In that
    // situation BuildOverallItem returns null. Asking for an overall evaluation
    // (includeOverall: true) must then fail fast instead of quietly producing empty results,
    // whether or not an expectedOutput was supplied.
    var workflow = AgentWorkflowBuilder.BuildSequential(new TestEchoAgent(name: "echo"));
    var userTurn = new List<ChatMessage> { new ChatMessage(ChatRole.User, "Hello") };
    var alwaysPass = new LocalEvaluator(
        FunctionEvaluator.Create("noop", (EvalItem _) => true));

    await using var run = await InProcessExecution.RunAsync(workflow, userTurn);

    // Act + Assert — throws even though no expectedOutput is passed.
    Task EvaluateOverallOnly() =>
        run.EvaluateAsync(
            alwaysPass,
            includeOverall: true,
            includePerAgent: false);

    var failure = await Assert.ThrowsAsync<InvalidOperationException>(EvaluateOverallOnly);

    // The message should point the caller at the missing host option.
    Assert.Contains("EmitAgentResponseEvents", failure.Message);
}
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// EvaluateAsync integration test
|
||||
// ---------------------------------------------------------------
|
||||
|
||||
@@ -10,7 +10,7 @@ import logging
|
||||
import re
|
||||
import sys
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Callable, Collection, Sequence
|
||||
from collections.abc import Callable, Collection, Coroutine, Sequence
|
||||
from contextlib import AsyncExitStack, _AsyncGeneratorContextManager # type: ignore
|
||||
from datetime import timedelta
|
||||
from functools import partial
|
||||
@@ -264,6 +264,7 @@ class MCPTool:
|
||||
self.is_connected: bool = False
|
||||
self._tools_loaded: bool = False
|
||||
self._prompts_loaded: bool = False
|
||||
self._pending_reload_tasks: set[asyncio.Task[None]] = set()
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"MCPTool(name={self.name}, description={self.description})"
|
||||
@@ -905,12 +906,47 @@ class MCPTool:
|
||||
if isinstance(message, types.ServerNotification):
|
||||
match message.root.method:
|
||||
case "notifications/tools/list_changed":
|
||||
await self.load_tools()
|
||||
self._schedule_reload(self.load_tools())
|
||||
case "notifications/prompts/list_changed":
|
||||
await self.load_prompts()
|
||||
self._schedule_reload(self.load_prompts())
|
||||
case _:
|
||||
logger.debug("Unhandled notification: %s", message.root.method)
|
||||
|
||||
def _schedule_reload(self, coro: Coroutine[Any, Any, None]) -> None:
    """Schedule a reload coroutine as a background task.

    Reloads (load_tools / load_prompts) triggered by MCP server
    notifications must NOT be awaited inside the message handler because
    the handler runs on the MCP SDK's single-threaded receive loop.
    Awaiting a session request (e.g. ``list_tools``) from within that loop
    deadlocks: the receive loop cannot read the response while it is
    blocked waiting for the handler to return.

    Instead we fire the reload as an independent ``asyncio.Task`` and keep
    a strong reference in ``_pending_reload_tasks`` so it is not garbage-
    collected before completion. Only one reload per kind (tools / prompts)
    is kept in flight; a new notification cancels the previous pending task
    for the same coroutine name to avoid unbounded growth.

    Args:
        coro: The reload coroutine object to run in the background. It is
            always consumed: either awaited by the background task, or
            closed when the task finishes without ever having started it.
    """
    # Cancel-and-replace: only one reload per kind should be in flight.
    reload_name = f"mcp-reload:{self.name}:{coro.__qualname__}"
    for existing in list(self._pending_reload_tasks):
        if existing.get_name() == reload_name and not existing.done():
            logger.debug("Cancelling in-flight reload %s; superseded by new notification", reload_name)
            existing.cancel()

    async def _safe_reload() -> None:
        try:
            await coro
        except asyncio.CancelledError:
            # Cancellation must propagate so the task is marked as cancelled.
            raise
        except Exception:
            # Failures stay out of the receive loop; they are only logged.
            logger.warning("Background MCP reload failed", exc_info=True)

    def _on_done(finished: "asyncio.Task[None]") -> None:
        self._pending_reload_tasks.discard(finished)
        # A task cancelled before its first step never executes _safe_reload,
        # so ``coro`` would be left un-awaited (leaking it and emitting a
        # "coroutine ... was never awaited" RuntimeWarning). close() is a
        # no-op for a coroutine that has already run to completion.
        coro.close()

    task = asyncio.create_task(_safe_reload(), name=reload_name)
    self._pending_reload_tasks.add(task)
    task.add_done_callback(_on_done)
|
||||
|
||||
def _determine_approval_mode(
|
||||
self,
|
||||
*candidate_names: str,
|
||||
@@ -1047,6 +1083,14 @@ class MCPTool:
|
||||
params = types.PaginatedRequestParams(cursor=tool_list.nextCursor)
|
||||
|
||||
async def _close_on_owner(self) -> None:
|
||||
# Cancel any pending reload tasks before tearing down the session.
|
||||
tasks = list(self._pending_reload_tasks)
|
||||
for task in tasks:
|
||||
task.cancel()
|
||||
self._pending_reload_tasks.clear()
|
||||
if tasks:
|
||||
await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
await self._safe_close_exit_stack()
|
||||
self._exit_stack = AsyncExitStack()
|
||||
self.session = None
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# Copyright (c) Microsoft. All rights reserved.
|
||||
# type: ignore[reportPrivateUsage]
|
||||
import asyncio
|
||||
import contextlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -1615,7 +1616,7 @@ async def test_mcp_connection_reset_integration():
|
||||
|
||||
async def test_mcp_tool_message_handler_notification():
|
||||
"""Test that message_handler correctly processes tools/list_changed and prompts/list_changed
|
||||
notifications."""
|
||||
notifications by scheduling reloads as background tasks."""
|
||||
tool = MCPStdioTool(name="test_tool", command="python")
|
||||
|
||||
# Mock the load_tools and load_prompts methods
|
||||
@@ -1629,6 +1630,8 @@ async def test_mcp_tool_message_handler_notification():
|
||||
|
||||
result = await tool.message_handler(tools_notification)
|
||||
assert result is None
|
||||
# The reload is scheduled as a background task; let it run.
|
||||
await asyncio.sleep(0)
|
||||
tool.load_tools.assert_called_once()
|
||||
|
||||
# Reset mock
|
||||
@@ -1641,6 +1644,7 @@ async def test_mcp_tool_message_handler_notification():
|
||||
|
||||
result = await tool.message_handler(prompts_notification)
|
||||
assert result is None
|
||||
await asyncio.sleep(0)
|
||||
tool.load_prompts.assert_called_once()
|
||||
|
||||
# Test unhandled notification
|
||||
@@ -1664,6 +1668,112 @@ async def test_mcp_tool_message_handler_error():
|
||||
assert result is None
|
||||
|
||||
|
||||
async def test_mcp_tool_message_handler_does_not_block_receive_loop():
    """message_handler must return without waiting for the tool reload.

    Regression test for https://github.com/microsoft/agent-framework/issues/4828.
    A ``notifications/tools/list_changed`` notification must not make the
    handler await ``load_tools()`` inline: doing so would stall the
    single-threaded MCP receive loop, which could then never deliver the
    ``list_tools`` response that ``load_tools()`` is waiting for.
    """
    list_changed = Mock(spec=types.ServerNotification)
    list_changed.root = Mock()
    list_changed.root.method = "notifications/tools/list_changed"

    # The gate keeps the fake load_tools suspended until the test opens it,
    # standing in for a session response the receive loop has yet to deliver.
    gate = asyncio.Event()

    async def gated_load_tools():
        await gate.wait()

    tool = MCPStdioTool(name="test_tool", command="python")
    tool.load_tools = gated_load_tools  # type: ignore[assignment]

    # Must return right away even though the reload itself cannot finish yet.
    await tool.message_handler(list_changed)

    # Reaching this point proves the handler did not await the reload inline;
    # the reload has to be tracked as a pending background task instead.
    assert len(tool._pending_reload_tasks) == 1

    # Open the gate and wait for the background task(s) so their done-callbacks
    # get the chance to drop them from _pending_reload_tasks.
    gate.set()
    outstanding = asyncio.gather(*tool._pending_reload_tasks)
    await asyncio.wait_for(outstanding, timeout=1)
    assert not tool._pending_reload_tasks
|
||||
|
||||
|
||||
async def test_mcp_tool_message_handler_reload_failure_is_logged(caplog: pytest.LogCaptureFixture):
    """A failing background reload is logged rather than raised into the receive loop."""
    failing_loader = AsyncMock(side_effect=RuntimeError("connection lost"))
    tool = MCPStdioTool(name="test_tool", command="python")
    tool.load_tools = failing_loader

    list_changed = Mock(spec=types.ServerNotification)
    list_changed.root = Mock()
    list_changed.root.method = "notifications/tools/list_changed"

    await tool.message_handler(list_changed)

    # Take a snapshot of the scheduled tasks and wait for them; the failure must
    # stay inside the task, and waiting guarantees the done-callbacks have run.
    in_flight = [*tool._pending_reload_tasks]
    if in_flight:
        await asyncio.wait_for(asyncio.gather(*in_flight, return_exceptions=True), timeout=1)
    failing_loader.assert_called_once()
    assert not tool._pending_reload_tasks

    # Exactly one warning, carrying the exception details, must have been logged.
    failure_records = [record for record in caplog.records if "Background MCP reload failed" in record.message]
    assert len(failure_records) == 1
    logged = failure_records[0]
    assert logged.levelname == "WARNING"
    assert logged.exc_info is not None
|
||||
|
||||
|
||||
async def test_mcp_tool_message_handler_cancel_and_replace():
    """A second notification arriving right after the first cancels the first reload task."""
    list_changed = Mock(spec=types.ServerNotification)
    list_changed.root = Mock()
    list_changed.root.method = "notifications/tools/list_changed"

    gate = asyncio.Event()
    call_count = 0

    async def gated_load_tools():
        nonlocal call_count
        call_count += 1
        await gate.wait()

    tool = MCPStdioTool(name="test_tool", command="python")
    tool.load_tools = gated_load_tools  # type: ignore[assignment]

    # The first notification schedules a reload that cannot finish yet.
    await tool.message_handler(list_changed)
    assert len(tool._pending_reload_tasks) == 1
    (original_task,) = tuple(tool._pending_reload_tasks)

    # The second notification is expected to cancel that reload and schedule a new one.
    await tool.message_handler(list_changed)
    # Awaiting the superseded task hands control to the event loop so that
    # the cancellation actually gets processed.
    with contextlib.suppress(asyncio.CancelledError):
        await original_task

    assert original_task.cancelled()

    assert len(tool._pending_reload_tasks) == 1
    (replacement_task,) = tuple(tool._pending_reload_tasks)
    assert replacement_task is not original_task

    # Open the gate so the replacement reload can run to completion.
    gate.set()
    outstanding = asyncio.gather(*tool._pending_reload_tasks)
    await asyncio.wait_for(outstanding, timeout=1)
    assert not tool._pending_reload_tasks
|
||||
|
||||
|
||||
async def test_mcp_tool_sampling_callback_no_client():
|
||||
"""Test sampling callback error path when no chat client is available."""
|
||||
tool = MCPStdioTool(name="test_tool", command="python")
|
||||
|
||||
Reference in New Issue
Block a user