diff --git a/dotnet/agent-framework-dotnet.slnx b/dotnet/agent-framework-dotnet.slnx
index c684fcf883..87e6d9d3c6 100644
--- a/dotnet/agent-framework-dotnet.slnx
+++ b/dotnet/agent-framework-dotnet.slnx
@@ -298,6 +298,7 @@
+
diff --git a/dotnet/nuget/nuget-package.props b/dotnet/nuget/nuget-package.props
index d97f72eea3..cb1713e5e9 100644
--- a/dotnet/nuget/nuget-package.props
+++ b/dotnet/nuget/nuget-package.props
@@ -1,14 +1,14 @@
- 1.5.0
+ 1.6.0
1
- 260507
+ 260512
$(VersionPrefix)-rc$(RCNumber)
$(VersionPrefix)-$(VersionSuffix).$(DateSuffix).1
$(VersionPrefix)-preview.$(DateSuffix).1
$(VersionPrefix)
- 1.5.0
+ 1.6.0
Debug;Release;Publish
true
diff --git a/dotnet/samples/03-workflows/Evaluation/Evaluation_WorkflowExpectedOutputs/Evaluation_WorkflowExpectedOutputs.csproj b/dotnet/samples/03-workflows/Evaluation/Evaluation_WorkflowExpectedOutputs/Evaluation_WorkflowExpectedOutputs.csproj
new file mode 100644
index 0000000000..adbcde8572
--- /dev/null
+++ b/dotnet/samples/03-workflows/Evaluation/Evaluation_WorkflowExpectedOutputs/Evaluation_WorkflowExpectedOutputs.csproj
@@ -0,0 +1,16 @@
+
+
+
+ Exe
+ net10.0
+
+ enable
+ enable
+
+
+
+
+
+
+
+
diff --git a/dotnet/samples/03-workflows/Evaluation/Evaluation_WorkflowExpectedOutputs/Program.cs b/dotnet/samples/03-workflows/Evaluation/Evaluation_WorkflowExpectedOutputs/Program.cs
new file mode 100644
index 0000000000..30fa79faa8
--- /dev/null
+++ b/dotnet/samples/03-workflows/Evaluation/Evaluation_WorkflowExpectedOutputs/Program.cs
@@ -0,0 +1,76 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+// This sample demonstrates evaluating a multi-agent workflow against a
+// golden answer using Foundry's reference-based Similarity evaluator.
+
+using Azure.AI.Projects;
+using Azure.Identity;
+using Microsoft.Agents.AI;
+using Microsoft.Agents.AI.Workflows;
+using Microsoft.Extensions.AI;
+using FoundryEvals = Microsoft.Agents.AI.Foundry.FoundryEvals;
+
+string endpoint = Environment.GetEnvironmentVariable("AZURE_AI_PROJECT_ENDPOINT")
+ ?? throw new InvalidOperationException("AZURE_AI_PROJECT_ENDPOINT is not set.");
+string deploymentName = Environment.GetEnvironmentVariable("AZURE_AI_MODEL_DEPLOYMENT_NAME") ?? "gpt-4o-mini";
+
+// WARNING: DefaultAzureCredential is convenient for development but requires careful consideration in production.
+// In production, consider using a specific credential (e.g., ManagedIdentityCredential) to avoid
+// latency issues, unintended credential probing, and potential security risks from fallback mechanisms.
+AIProjectClient projectClient = new(new Uri(endpoint), new DefaultAzureCredential());
+
+// Build a two-agent workflow: a researcher writes a draft answer, then an
+// editor polishes it into the final response that we compare to ground truth.
+// EmitAgentResponseEvents is enabled so the workflow surfaces an AgentResponseEvent
+// for each agent — EvaluateAsync prefers the last such event as the overall final answer.
+var hostOptions = new AIAgentHostOptions { EmitAgentResponseEvents = true };
+
+AIAgent researcher = projectClient.AsAIAgent(
+ model: deploymentName,
+ instructions: "You research questions and produce a short factual draft answer.",
+ name: "researcher");
+
+AIAgent editor = projectClient.AsAIAgent(
+ model: deploymentName,
+ instructions: "You take a draft answer and produce the final concise response.",
+ name: "editor");
+
+ExecutorBinding researcherExecutor = researcher.BindAsExecutor(hostOptions);
+ExecutorBinding editorExecutor = editor.BindAsExecutor(hostOptions);
+
+Workflow workflow = new WorkflowBuilder(researcherExecutor)
+ .AddEdge(researcherExecutor, editorExecutor)
+ .Build();
+
+// Run the workflow against the user question.
+const string Query = "What is the capital of France?";
+const string GroundTruth = "Paris";
+
+await using Run run = await InProcessExecution.RunAsync(
+ workflow,
+ new ChatMessage(ChatRole.User, Query));
+
+// Evaluate the overall workflow output against a golden answer using the
+// reference-based Similarity evaluator. The 'expectedOutput' value is stamped
+// onto the overall EvalItem.ExpectedOutput and is surfaced to Foundry as
+// `ground_truth` in the underlying JSONL payload.
+//
+// Per-agent breakdown is disabled here: ground truth applies to the workflow's
+// final answer, not to each sub-agent's intermediate output. Without
+// includePerAgent: false, the evaluator would be invoked for per-agent items
+// (which have no ExpectedOutput) and Similarity would fail validation.
+FoundryEvals similarity = new(projectClient, deploymentName, FoundryEvals.Similarity);
+
+AgentEvaluationResults results = await run.EvaluateAsync(
+ similarity,
+ includePerAgent: false,
+ expectedOutput: GroundTruth);
+
+Console.WriteLine($"Query: {Query}");
+Console.WriteLine($"Expected: {GroundTruth}");
+Console.WriteLine($"Provider: {results.ProviderName}");
+Console.WriteLine($"Passed: {results.Passed}/{results.Total}");
+if (results.ReportUrl is not null)
+{
+ Console.WriteLine($"Report: {results.ReportUrl}");
+}
diff --git a/dotnet/samples/03-workflows/Evaluation/Evaluation_WorkflowExpectedOutputs/README.md b/dotnet/samples/03-workflows/Evaluation/Evaluation_WorkflowExpectedOutputs/README.md
new file mode 100644
index 0000000000..9390e91e4c
--- /dev/null
+++ b/dotnet/samples/03-workflows/Evaluation/Evaluation_WorkflowExpectedOutputs/README.md
@@ -0,0 +1,37 @@
+# Evaluation - Workflow Expected Outputs
+
+This sample demonstrates evaluating a multi-agent workflow's final answer
+against a golden expected output using Foundry's reference-based **Similarity**
+evaluator.
+
+## What this sample demonstrates
+
+- Building a small researcher → editor workflow
+- Running the workflow and obtaining a `Run`
+- Calling `run.EvaluateAsync(evaluator, includePerAgent: false, expectedOutput: ...)`
+  to attach a ground-truth answer to the overall workflow item only
+- Using `FoundryEvals.Similarity`, which requires a `ground_truth` value
+ per item
+
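+The `expectedOutput` value is stamped onto the overall `EvalItem.ExpectedOutput`
+and is surfaced to Foundry as `ground_truth` in the JSONL payload sent to the
+Evals API. Per-agent items carry no expected output, so the sample passes
+`includePerAgent: false` to keep Similarity from being run against them.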
+
+## Prerequisites
+
+- .NET 10 SDK or later
+- Azure CLI installed and authenticated (`az login`)
+
+Set the following environment variables:
+
+```powershell
+$env:AZURE_AI_PROJECT_ENDPOINT="https://your-foundry-service.services.ai.azure.com/api/projects/your-foundry-project"
+$env:AZURE_AI_MODEL_DEPLOYMENT_NAME="gpt-4o-mini"
+```
+
+## Run the sample
+
+```powershell
+cd dotnet/samples/03-workflows/Evaluation
+dotnet run --project .\Evaluation_WorkflowExpectedOutputs
+```
diff --git a/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalConverter.cs b/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalConverter.cs
index c539175ed2..0754e2bc76 100644
--- a/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalConverter.cs
+++ b/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalConverter.cs
@@ -130,6 +130,7 @@ internal static class FoundryEvalConverter
QueryMessages = ConvertMessages(queryMessages),
ResponseMessages = ConvertMessages(responseMessages),
Context = item.Context,
+ GroundTruth = item.ExpectedOutput,
ToolDefinitions = item.Tools is { Count: > 0 }
? item.Tools
.OfType()
@@ -185,6 +186,11 @@ internal static class FoundryEvalConverter
dataMapping["context"] = "{{item.context}}";
}
+ if (GroundTruthEvaluators.Contains(qualified))
+ {
+ dataMapping["ground_truth"] = "{{item.ground_truth}}";
+ }
+
if (ToolEvaluators.Contains(qualified))
{
dataMapping["tool_definitions"] = "{{item.tool_definitions}}";
@@ -206,7 +212,7 @@ internal static class FoundryEvalConverter
///
/// Builds the item_schema for custom JSONL eval definitions.
///
- internal static WireItemSchema BuildItemSchema(bool hasContext = false, bool hasTools = false)
+ internal static WireItemSchema BuildItemSchema(bool hasContext = false, bool hasTools = false, bool hasGroundTruth = false)
{
var properties = new Dictionary
{
@@ -221,6 +227,11 @@ internal static class FoundryEvalConverter
properties["context"] = new WireSchemaProperty { Type = "string" };
}
+ if (hasGroundTruth)
+ {
+ properties["ground_truth"] = new WireSchemaProperty { Type = "string" };
+ }
+
if (hasTools)
{
properties["tool_definitions"] = new WireSchemaProperty { Type = "array" };
@@ -233,6 +244,31 @@ internal static class FoundryEvalConverter
};
}
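+ /// <summary>
+ /// Returns the subset of <paramref name="evaluators"/> that require a ground-truth
+ /// (reference) value when <paramref name="hasGroundTruth"/> is <see langword="false"/>;
+ /// returns an empty list otherwise. The caller chooses what the flag means
+ /// (FoundryEvals passes whether every item carries a ground truth).
+ /// </summary>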
+ internal static List FindMissingGroundTruthEvaluators(
+ IEnumerable evaluators,
+ bool hasGroundTruth)
+ {
+ if (hasGroundTruth)
+ {
+ return [];
+ }
+
+ var missing = new List();
+ foreach (var name in evaluators)
+ {
+ if (GroundTruthEvaluators.Contains(ResolveEvaluator(name)))
+ {
+ missing.Add(name);
+ }
+ }
+
+ return missing;
+ }
+
///
/// Resolves a short evaluator name to its fully-qualified builtin.* form.
///
@@ -277,6 +313,12 @@ internal static class FoundryEvalConverter
"builtin.tool_call_success",
};
+ // Evaluators that require a ground_truth (reference) value per item.
+ internal static readonly HashSet GroundTruthEvaluators = new(StringComparer.OrdinalIgnoreCase)
+ {
+ "builtin.similarity",
+ };
+
// Short name → fully-qualified name mapping.
internal static readonly Dictionary BuiltinEvaluators = new(StringComparer.OrdinalIgnoreCase)
{
diff --git a/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalWireModels.cs b/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalWireModels.cs
index 4438b35807..c05232575c 100644
--- a/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalWireModels.cs
+++ b/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalWireModels.cs
@@ -103,6 +103,9 @@ internal sealed class WireEvalItemPayload
[JsonPropertyName("context")]
public string? Context { get; init; }
+ [JsonPropertyName("ground_truth")]
+ public string? GroundTruth { get; init; }
+
[JsonPropertyName("tool_definitions")]
public List? ToolDefinitions { get; init; }
}
diff --git a/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvals.cs b/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvals.cs
index d91b69c1e1..675ae38dfe 100644
--- a/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvals.cs
+++ b/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvals.cs
@@ -145,6 +145,8 @@ public sealed class FoundryEvals : IAgentEvaluator
bool hasContext = payloads.Any(p => p.Context is not null);
bool hasTools = payloads.Any(p => p.ToolDefinitions is { Count: > 0 });
+ bool hasGroundTruth = payloads.Any(p => p.GroundTruth is not null);
+ bool allHaveGroundTruth = payloads.Count > 0 && payloads.All(p => p.GroundTruth is not null);
// Filter out tool evaluators if no items have tools; auto-add ToolCallAccuracy if tools present
var evaluators = FilterToolEvaluators(this._evaluatorNames, hasTools);
@@ -153,13 +155,27 @@ public sealed class FoundryEvals : IAgentEvaluator
evaluators = [.. evaluators, ToolCallAccuracy];
}
+ // Fail fast if a ground-truth evaluator (e.g. similarity) is requested but not
+ // every item carries an ExpectedOutput. Reference-based evaluators score each
+ // item against its own ground truth, so even one missing value will surface as
+ // a provider-side validation error. Catch it here with a clearer message.
+ var missingGroundTruth = FoundryEvalConverter.FindMissingGroundTruthEvaluators(evaluators, allHaveGroundTruth);
+ if (missingGroundTruth.Count > 0)
+ {
+ throw new InvalidOperationException(
+ "The following evaluator(s) require a ground-truth/expected output on every item but " +
+ $"at least one item is missing an {nameof(EvalItem.ExpectedOutput)}: {string.Join(", ", missingGroundTruth)}. " +
+ "Provide an expected output per item (for example via the 'expectedOutput' parameter on EvaluateAsync), " +
+ "or set 'includePerAgent: false' so the evaluator only runs on the overall item.");
+ }
+
// 2. Create the evaluation definition
var createEvalPayload = new WireCreateEvalRequest
{
Name = evalName,
DataSourceConfig = new WireCustomDataSourceConfig
{
- ItemSchema = FoundryEvalConverter.BuildItemSchema(hasContext, hasTools),
+ ItemSchema = FoundryEvalConverter.BuildItemSchema(hasContext, hasTools, hasGroundTruth),
},
TestingCriteria = FoundryEvalConverter.BuildTestingCriteria(
evaluators, this._model, includeDataMapping: true),
@@ -822,15 +838,15 @@ public sealed class FoundryEvals : IAgentEvaluator
var result = new EvalItemResult(itemId, status, scores);
// Extract error info from sample
- if (outputItem.TryGetProperty("sample", out var sample))
+ if (outputItem.TryGetProperty("sample", out var sample) && sample.ValueKind == JsonValueKind.Object)
{
- if (sample.TryGetProperty("error", out var errObj))
+ if (sample.TryGetProperty("error", out var errObj) && errObj.ValueKind == JsonValueKind.Object)
{
result.ErrorCode = errObj.TryGetProperty("code", out var code) ? code.GetString() : null;
result.ErrorMessage = errObj.TryGetProperty("message", out var msg) ? msg.GetString() : null;
}
- if (sample.TryGetProperty("usage", out var usage) && usage.TryGetProperty("total_tokens", out var tt) && tt.ValueKind == JsonValueKind.Number)
+ if (sample.TryGetProperty("usage", out var usage) && usage.ValueKind == JsonValueKind.Object && usage.TryGetProperty("total_tokens", out var tt) && tt.ValueKind == JsonValueKind.Number)
{
var tokenUsage = new Dictionary();
if (usage.TryGetProperty("prompt_tokens", out var pt) && pt.ValueKind == JsonValueKind.Number)
@@ -886,7 +902,7 @@ public sealed class FoundryEvals : IAgentEvaluator
}
// Extract response_id from datasource_item
- if (outputItem.TryGetProperty("datasource_item", out var dsItem))
+ if (outputItem.TryGetProperty("datasource_item", out var dsItem) && dsItem.ValueKind == JsonValueKind.Object)
{
if (dsItem.TryGetProperty("resp_id", out var respId))
{
diff --git a/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs b/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs
index 31cbf08273..223378b787 100644
--- a/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs
+++ b/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs
@@ -28,6 +28,17 @@ public static class WorkflowEvaluationExtensions
/// Use , ,
/// or a custom implementation.
///
+ /// <param name="expectedOutput">
+ /// Optional ground-truth/expected output for the workflow's overall final answer.
+ /// When provided, it is stamped onto the overall <see cref="EvalItem.ExpectedOutput"/>
+ /// so reference-based evaluators (for example, similarity) can compare the
+ /// workflow's response against a golden answer. Ground truth is only applied
+ /// to the overall item; per-agent items are intentionally left without an
+ /// expected output, since ground truth is defined against the final response.
+ /// When using a reference-based evaluator that requires ground truth, set
+ /// <paramref name="includePerAgent"/> to <see langword="false"/> to avoid
+ /// invoking the evaluator on per-agent items that have no expected output.
+ /// </param>
/// Cancellation token.
/// Evaluation results with optional per-agent sub-results.
public static async Task EvaluateAsync(
@@ -37,6 +48,7 @@ public static class WorkflowEvaluationExtensions
bool includePerAgent = true,
string evalName = "Workflow Eval",
IConversationSplitter? splitter = null,
+ string? expectedOutput = null,
CancellationToken cancellationToken = default)
{
var events = run.OutgoingEvents.ToList();
@@ -48,28 +60,26 @@ public static class WorkflowEvaluationExtensions
var overallItems = new List();
if (includeOverall)
{
- var finalResponse = events.OfType().LastOrDefault();
- if (finalResponse is not null)
+ var overallItem = BuildOverallItem(events, splitter, expectedOutput);
+ if (overallItem is not null)
{
- var firstInvoked = events.OfType().FirstOrDefault();
- var query = firstInvoked?.Data switch
- {
- ChatMessage cm => cm.Text ?? string.Empty,
- IReadOnlyList msgs => msgs.LastOrDefault(m => m.Role == ChatRole.User)?.Text ?? string.Empty,
- string s => s,
- _ => firstInvoked?.Data?.ToString() ?? string.Empty,
- };
- var conversation = new List
- {
- new(ChatRole.User, query),
- };
-
- conversation.AddRange(finalResponse.Response.Messages);
-
- overallItems.Add(new EvalItem(query, finalResponse.Response.Text, conversation)
- {
- Splitter = splitter,
- });
+ overallItems.Add(overallItem);
+ }
+ else
+ {
+ // The caller asked for an overall evaluation but we couldn't find a final
+ // response to score — almost always because the workflow's agents weren't
+ // built with EmitAgentResponseEvents enabled (so no AgentResponseEvent was
+ // emitted) and no terminal ExecutorCompletedEvent carried an AgentResponse
+ // / ChatMessage / string payload. Fail loudly instead of silently returning
+ // 0/0 (or skipping evaluation against a supplied expectedOutput).
+ throw new InvalidOperationException(
+ "Cannot evaluate the overall workflow output: no AgentResponseEvent or " +
+ "ExecutorCompletedEvent with an AgentResponse/ChatMessage/string payload " +
+ "was found in the run. Bind agents with " +
+ "AIAgentHostOptions { EmitAgentResponseEvents = true } " +
+ "(for example via agent.BindAsExecutor(new AIAgentHostOptions { EmitAgentResponseEvents = true })) " +
+ "so the workflow surfaces the final agent response, or set 'includeOverall: false'.");
}
}
@@ -97,6 +107,86 @@ public static class WorkflowEvaluationExtensions
return overallResult;
}
+ internal static EvalItem? BuildOverallItem(
+ IReadOnlyList events,
+ IConversationSplitter? splitter,
+ string? expectedOutput)
+ {
+ var firstInvoked = events.OfType().FirstOrDefault();
+ var query = firstInvoked?.Data switch
+ {
+ ChatMessage cm => cm.Text ?? string.Empty,
+ IReadOnlyList msgs => msgs.LastOrDefault(m => m.Role == ChatRole.User)?.Text ?? string.Empty,
+ string s => s,
+ _ => firstInvoked?.Data?.ToString() ?? string.Empty,
+ };
+
+ var conversation = new List
+ {
+ new(ChatRole.User, query),
+ };
+
+ // Prefer AgentResponseEvent (only emitted when AIAgentHostOptions.EmitAgentResponseEvents
+ // is enabled). Otherwise fall back to the last ExecutorCompletedEvent from a non-internal
+ // executor that carries an AgentResponse / ChatMessage / string payload — these events
+ // are always emitted by the runtime.
+ var finalResponse = events.OfType().LastOrDefault();
+ string responseText;
+ if (finalResponse is not null)
+ {
+ responseText = finalResponse.Response.Text;
+ conversation.AddRange(finalResponse.Response.Messages);
+ }
+ else
+ {
+ ExecutorCompletedEvent? finalCompleted = null;
+ for (int i = events.Count - 1; i >= 0; i--)
+ {
+ if (events[i] is ExecutorCompletedEvent completed
+ && !IsInternalExecutor(completed.ExecutorId)
+ && completed.Data is AgentResponse or ChatMessage or string)
+ {
+ finalCompleted = completed;
+ break;
+ }
+ }
+
+ if (finalCompleted is null)
+ {
+ return null;
+ }
+
+ switch (finalCompleted.Data)
+ {
+ case AgentResponse ar:
+ responseText = ar.Text;
+ conversation.AddRange(ar.Messages);
+ break;
+ case ChatMessage cm:
+ responseText = cm.Text ?? string.Empty;
+ conversation.Add(cm);
+ break;
+ case string s:
+ responseText = s;
+ conversation.Add(new ChatMessage(ChatRole.Assistant, s));
+ break;
+ default:
+ // Unreachable — the for-loop above already constrains Data to one of the
+ // three handled types. Throw if the contract drifts so the bug is visible
+ // instead of silently dropping the overall item.
+ throw new InvalidOperationException(
+ "BuildOverallItem: unexpected ExecutorCompletedEvent.Data type " +
+ $"'{finalCompleted.Data?.GetType().FullName ?? "null"}'. Expected " +
+ $"{nameof(AgentResponse)}, {nameof(ChatMessage)}, or string.");
+ }
+ }
+
+ return new EvalItem(query, responseText, conversation)
+ {
+ Splitter = splitter,
+ ExpectedOutput = expectedOutput,
+ };
+ }
+
internal static Dictionary> ExtractAgentData(
List events,
IConversationSplitter? splitter)
diff --git a/dotnet/tests/Microsoft.Agents.AI.Foundry.UnitTests/FoundryEvalConverterTests.cs b/dotnet/tests/Microsoft.Agents.AI.Foundry.UnitTests/FoundryEvalConverterTests.cs
index aa0df10200..aea1459e5e 100644
--- a/dotnet/tests/Microsoft.Agents.AI.Foundry.UnitTests/FoundryEvalConverterTests.cs
+++ b/dotnet/tests/Microsoft.Agents.AI.Foundry.UnitTests/FoundryEvalConverterTests.cs
@@ -179,6 +179,35 @@ public sealed class FoundryEvalConverterTests
Assert.Null(payload.Context);
}
+ [Fact]
+ public void ConvertEvalItem_WithExpectedOutput_PopulatesGroundTruth()
+ {
+ // Arrange
+ var item = new EvalItem(query: "q", response: "r")
+ {
+ ExpectedOutput = "the golden answer",
+ };
+
+ // Act
+ var payload = FoundryEvalConverter.ConvertEvalItem(item);
+
+ // Assert
+ Assert.Equal("the golden answer", payload.GroundTruth);
+ }
+
+ [Fact]
+ public void ConvertEvalItem_WithoutExpectedOutput_OmitsGroundTruth()
+ {
+ // Arrange
+ var item = new EvalItem(query: "q", response: "r");
+
+ // Act
+ var payload = FoundryEvalConverter.ConvertEvalItem(item);
+
+ // Assert
+ Assert.Null(payload.GroundTruth);
+ }
+
// ---------------------------------------------------------------
// FoundryEvalConverter.BuildTestingCriteria tests
// ---------------------------------------------------------------
@@ -239,6 +268,33 @@ public sealed class FoundryEvalConverterTests
Assert.Equal("{{item.context}}", mapping["context"]);
}
+ [Fact]
+ public void BuildTestingCriteria_SimilarityEvaluator_IncludesGroundTruth()
+ {
+ // Act
+ var criteria = FoundryEvalConverter.BuildTestingCriteria(
+ ["similarity"], "gpt-4o-mini", includeDataMapping: true);
+
+ // Assert
+ Assert.Single(criteria);
+ Assert.Equal("builtin.similarity", criteria[0].EvaluatorName);
+ var mapping = criteria[0].DataMapping;
+ Assert.NotNull(mapping);
+ Assert.True(mapping.ContainsKey("ground_truth"));
+ Assert.Equal("{{item.ground_truth}}", mapping["ground_truth"]);
+ }
+
+ [Fact]
+ public void BuildTestingCriteria_NonGroundTruthEvaluator_OmitsGroundTruth()
+ {
+ var criteria = FoundryEvalConverter.BuildTestingCriteria(
+ ["relevance"], "gpt-4o-mini", includeDataMapping: true);
+
+ var mapping = criteria[0].DataMapping;
+ Assert.NotNull(mapping);
+ Assert.False(mapping.ContainsKey("ground_truth"));
+ }
+
[Fact]
public void BuildTestingCriteria_WithoutDataMapping_OmitsMappingField()
{
@@ -282,6 +338,59 @@ public sealed class FoundryEvalConverterTests
Assert.True(schema.Properties.ContainsKey("tool_definitions"));
}
+ [Fact]
+ public void BuildItemSchema_WithGroundTruth_IncludesGroundTruthProperty()
+ {
+ // Act
+ var schema = FoundryEvalConverter.BuildItemSchema(hasGroundTruth: true);
+
+ // Assert
+ Assert.True(schema.Properties.ContainsKey("ground_truth"));
+ Assert.Equal("string", schema.Properties["ground_truth"].Type);
+ }
+
+ [Fact]
+ public void BuildItemSchema_WithoutGroundTruth_OmitsGroundTruthProperty()
+ {
+ var schema = FoundryEvalConverter.BuildItemSchema();
+
+ Assert.False(schema.Properties.ContainsKey("ground_truth"));
+ }
+
+ // ---------------------------------------------------------------
+ // FoundryEvalConverter.FindMissingGroundTruthEvaluators tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public void FindMissingGroundTruthEvaluators_NoGroundTruth_ReturnsSimilarity()
+ {
+ // Act
+ var missing = FoundryEvalConverter.FindMissingGroundTruthEvaluators(
+ ["similarity", "relevance"], hasGroundTruth: false);
+
+ // Assert
+ Assert.Single(missing);
+ Assert.Equal("similarity", missing[0]);
+ }
+
+ [Fact]
+ public void FindMissingGroundTruthEvaluators_HasGroundTruth_ReturnsEmpty()
+ {
+ var missing = FoundryEvalConverter.FindMissingGroundTruthEvaluators(
+ ["similarity"], hasGroundTruth: true);
+
+ Assert.Empty(missing);
+ }
+
+ [Fact]
+ public void FindMissingGroundTruthEvaluators_NoGroundTruthEvaluators_ReturnsEmpty()
+ {
+ var missing = FoundryEvalConverter.FindMissingGroundTruthEvaluators(
+ ["relevance", "coherence"], hasGroundTruth: false);
+
+ Assert.Empty(missing);
+ }
+
// ---------------------------------------------------------------
// FoundryEvalConverter.ConvertMessage DataContent test
// ---------------------------------------------------------------
diff --git a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/InputWaiterAndOutputFilterTests.cs b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/InputWaiterAndOutputFilterTests.cs
index 77c0160200..dead5454b4 100644
--- a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/InputWaiterAndOutputFilterTests.cs
+++ b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/InputWaiterAndOutputFilterTests.cs
@@ -36,13 +36,18 @@ public sealed class InputWaiterTests : IDisposable
{
Task waitTask = this._waiter.WaitForInputAsync(TimeSpan.FromSeconds(5));
- await Task.Delay(50);
- waitTask.IsCompleted.Should().BeFalse("the waiter should block until input is signaled");
+ Task completedBeforeSignal = await Task.WhenAny(waitTask, Task.Delay(100));
+ completedBeforeSignal.Should().NotBeSameAs(
+ waitTask,
+ "the waiter should not complete before input is signaled");
this._waiter.SignalInput();
- Task completed = await Task.WhenAny(waitTask, Task.Delay(TimeSpan.FromSeconds(1)));
- completed.Should().BeSameAs(waitTask, "the wait task should complete after being signaled");
+ Task completedAfterSignal = await Task.WhenAny(waitTask, Task.Delay(TimeSpan.FromSeconds(1)));
+ completedAfterSignal.Should().BeSameAs(
+ waitTask,
+ "the wait task should complete after being signaled");
+
await waitTask;
}
diff --git a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs
index cc4f8338d5..fe7052d440 100644
--- a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs
+++ b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs
@@ -1,5 +1,6 @@
// Copyright (c) Microsoft. All rights reserved.
+using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using Microsoft.Extensions.AI;
@@ -290,6 +291,121 @@ public sealed class WorkflowEvaluationTests
Assert.DoesNotContain("end", result.Keys);
}
+ // ---------------------------------------------------------------
+ // BuildOverallItem tests (expected output / ground truth)
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public void BuildOverallItem_NoCompletedExecutorWithResponse_ReturnsNull()
+ {
+ // Arrange — no ExecutorCompletedEvent with usable response data and no AgentResponseEvent
+ var events = new List
+ {
+ new ExecutorInvokedEvent("agent-1", "query"),
+ };
+
+ // Act
+ var item = WorkflowEvaluationExtensions.BuildOverallItem(events, splitter: null, expectedOutput: null);
+
+ // Assert
+ Assert.Null(item);
+ }
+
+ [Fact]
+ public void BuildOverallItem_NoAgentResponseEvent_FallsBackToLastExecutorCompleted()
+ {
+ // Arrange — only ExecutorCompletedEvent (the default when EmitAgentResponseEvents is false)
+ var finalResponse = new AgentResponse(new ChatMessage(ChatRole.Assistant, "Paris"));
+ var events = new List
+ {
+ new ExecutorInvokedEvent("researcher", "What is the capital of France?"),
+ new ExecutorCompletedEvent("researcher", new AgentResponse(new ChatMessage(ChatRole.Assistant, "draft"))),
+ new ExecutorInvokedEvent("editor", "draft"),
+ new ExecutorCompletedEvent("editor", finalResponse),
+ };
+
+ // Act
+ var item = WorkflowEvaluationExtensions.BuildOverallItem(
+ events, splitter: null, expectedOutput: "Paris");
+
+ // Assert
+ Assert.NotNull(item);
+ Assert.Equal("What is the capital of France?", item.Query);
+ Assert.Equal("Paris", item.Response);
+ Assert.Equal("Paris", item.ExpectedOutput);
+ }
+
+ [Fact]
+ public void BuildOverallItem_WithFinalResponseAndExpectedOutput_StampsExpectedOutput()
+ {
+ // Arrange
+ var finalResponse = new AgentResponse(new ChatMessage(ChatRole.Assistant, "Ofrece 41 planes"));
+ var events = new List
+ {
+ new ExecutorInvokedEvent("agent-1", "How many plans does Netlife offer?"),
+ new ExecutorCompletedEvent("agent-1", finalResponse),
+ new AgentResponseEvent("agent-1", finalResponse),
+ };
+
+ // Act
+ var item = WorkflowEvaluationExtensions.BuildOverallItem(
+ events, splitter: null, expectedOutput: "Ofrece 41 planes");
+
+ // Assert
+ Assert.NotNull(item);
+ Assert.Equal("How many plans does Netlife offer?", item.Query);
+ Assert.Equal("Ofrece 41 planes", item.Response);
+ Assert.Equal("Ofrece 41 planes", item.ExpectedOutput);
+ }
+
+ [Fact]
+ public void BuildOverallItem_WithFinalResponseAndNoExpectedOutput_LeavesExpectedOutputNull()
+ {
+ // Arrange
+ var finalResponse = new AgentResponse(new ChatMessage(ChatRole.Assistant, "answer"));
+ var events = new List
+ {
+ new ExecutorInvokedEvent("agent-1", "query"),
+ new ExecutorCompletedEvent("agent-1", finalResponse),
+ new AgentResponseEvent("agent-1", finalResponse),
+ };
+
+ // Act
+ var item = WorkflowEvaluationExtensions.BuildOverallItem(events, splitter: null, expectedOutput: null);
+
+ // Assert
+ Assert.NotNull(item);
+ Assert.Null(item.ExpectedOutput);
+ }
+
+ [Fact]
+ public async Task EvaluateAsync_WithIncludeOverallButNoFinalResponse_ThrowsAsync()
+ {
+ // Arrange — build a workflow whose AIAgentHostExecutor is NOT bound with
+ // EmitAgentResponseEvents=true, so no AgentResponseEvent is emitted, and the
+ // ExecutorCompletedEvent for the host carries null Data. That is the scenario
+ // where BuildOverallItem returns null. When the caller asks for an overall
+ // evaluation (includeOverall: true), we should fail fast rather than silently
+ // returning empty results — regardless of whether expectedOutput was supplied.
+ var agent = new TestEchoAgent(name: "echo");
+ var workflow = AgentWorkflowBuilder.BuildSequential(agent);
+ var input = new List { new(ChatRole.User, "Hello") };
+
+ var evaluator = new LocalEvaluator(
+ FunctionEvaluator.Create("noop", (EvalItem _) => true));
+
+ await using var run = await InProcessExecution.RunAsync(workflow, input);
+
+ // Act + Assert — throws even without expectedOutput
+ var ex = await Assert.ThrowsAsync(() =>
+ run.EvaluateAsync(
+ evaluator,
+ includeOverall: true,
+ includePerAgent: false));
+
+ Assert.Contains("EmitAgentResponseEvents", ex.Message);
+ }
+
// ---------------------------------------------------------------
// EvaluateAsync integration test
// ---------------------------------------------------------------
diff --git a/python/packages/core/agent_framework/_mcp.py b/python/packages/core/agent_framework/_mcp.py
index 0d85b1699a..a7a3f1a796 100644
--- a/python/packages/core/agent_framework/_mcp.py
+++ b/python/packages/core/agent_framework/_mcp.py
@@ -10,7 +10,7 @@ import logging
import re
import sys
from abc import abstractmethod
-from collections.abc import Callable, Collection, Sequence
+from collections.abc import Callable, Collection, Coroutine, Sequence
from contextlib import AsyncExitStack, _AsyncGeneratorContextManager # type: ignore
from datetime import timedelta
from functools import partial
@@ -264,6 +264,7 @@ class MCPTool:
self.is_connected: bool = False
self._tools_loaded: bool = False
self._prompts_loaded: bool = False
+ self._pending_reload_tasks: set[asyncio.Task[None]] = set()
def __str__(self) -> str:
return f"MCPTool(name={self.name}, description={self.description})"
@@ -905,12 +906,47 @@ class MCPTool:
if isinstance(message, types.ServerNotification):
match message.root.method:
case "notifications/tools/list_changed":
- await self.load_tools()
+ self._schedule_reload(self.load_tools())
case "notifications/prompts/list_changed":
- await self.load_prompts()
+ self._schedule_reload(self.load_prompts())
case _:
logger.debug("Unhandled notification: %s", message.root.method)
+ def _schedule_reload(self, coro: Coroutine[Any, Any, None]) -> None:
+ """Schedule a reload coroutine as a background task.
+
+ Reloads (load_tools / load_prompts) triggered by MCP server
+ notifications must NOT be awaited inside the message handler because
+ the handler runs on the MCP SDK's single-threaded receive loop.
+ Awaiting a session request (e.g. ``list_tools``) from within that loop
+ deadlocks: the receive loop cannot read the response while it is
+ blocked waiting for the handler to return.
+
+ Instead we fire the reload as an independent ``asyncio.Task`` and keep
+ a strong reference in ``_pending_reload_tasks`` so it is not garbage-
+ collected before completion. Only one reload per kind (tools / prompts)
+ is kept in flight; a new notification cancels the previous pending task
+ for the same coroutine name to avoid unbounded growth.
+ """
+ # Cancel-and-replace: only one reload per kind should be in flight.
+ reload_name = f"mcp-reload:{self.name}:{coro.__qualname__}"
+ for existing in list(self._pending_reload_tasks):
+ if existing.get_name() == reload_name and not existing.done():
+ logger.debug("Cancelling in-flight reload %s; superseded by new notification", reload_name)
+ existing.cancel()
+
+ async def _safe_reload() -> None:
+ try:
+ await coro
+ except asyncio.CancelledError:
+ raise
+ except Exception:
+ logger.warning("Background MCP reload failed", exc_info=True)
+
+ task = asyncio.create_task(_safe_reload(), name=reload_name)
+ self._pending_reload_tasks.add(task)
+ task.add_done_callback(self._pending_reload_tasks.discard)
+
def _determine_approval_mode(
self,
*candidate_names: str,
@@ -1047,6 +1083,14 @@ class MCPTool:
params = types.PaginatedRequestParams(cursor=tool_list.nextCursor)
async def _close_on_owner(self) -> None:
+ # Cancel any pending reload tasks before tearing down the session.
+ tasks = list(self._pending_reload_tasks)
+ for task in tasks:
+ task.cancel()
+ self._pending_reload_tasks.clear()
+ if tasks:
+ await asyncio.gather(*tasks, return_exceptions=True)
+
await self._safe_close_exit_stack()
self._exit_stack = AsyncExitStack()
self.session = None
diff --git a/python/packages/core/tests/core/test_mcp.py b/python/packages/core/tests/core/test_mcp.py
index 487331e3f0..cd3173a7d3 100644
--- a/python/packages/core/tests/core/test_mcp.py
+++ b/python/packages/core/tests/core/test_mcp.py
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft. All rights reserved.
# type: ignore[reportPrivateUsage]
import asyncio
+import contextlib
import json
import logging
import os
@@ -1615,7 +1616,7 @@ async def test_mcp_connection_reset_integration():
async def test_mcp_tool_message_handler_notification():
"""Test that message_handler correctly processes tools/list_changed and prompts/list_changed
- notifications."""
+ notifications by scheduling reloads as background tasks."""
tool = MCPStdioTool(name="test_tool", command="python")
# Mock the load_tools and load_prompts methods
@@ -1629,6 +1630,8 @@ async def test_mcp_tool_message_handler_notification():
result = await tool.message_handler(tools_notification)
assert result is None
+ # The reload is scheduled as a background task; let it run.
+ await asyncio.sleep(0)
tool.load_tools.assert_called_once()
# Reset mock
@@ -1641,6 +1644,7 @@ async def test_mcp_tool_message_handler_notification():
result = await tool.message_handler(prompts_notification)
assert result is None
+ await asyncio.sleep(0)
tool.load_prompts.assert_called_once()
# Test unhandled notification
@@ -1664,6 +1668,112 @@ async def test_mcp_tool_message_handler_error():
assert result is None
+async def test_mcp_tool_message_handler_does_not_block_receive_loop():
+ """Test that message_handler does not deadlock the MCP receive loop.
+
+ Regression test for https://github.com/microsoft/agent-framework/issues/4828.
+ When the MCP server sends a ``notifications/tools/list_changed``
+ notification, the handler must NOT await ``load_tools()`` synchronously
+ because that would block the single-threaded MCP receive loop, preventing
+ it from delivering the ``list_tools`` response — a classic deadlock.
+ """
+ tool = MCPStdioTool(name="test_tool", command="python")
+
+ # Use an event to make load_tools block until we release it.
+ # This simulates load_tools waiting for a session response that the
+ # receive loop would need to deliver.
+ release = asyncio.Event()
+
+ async def slow_load_tools():
+ await release.wait()
+
+ tool.load_tools = slow_load_tools # type: ignore[assignment]
+
+ tools_notification = Mock(spec=types.ServerNotification)
+ tools_notification.root = Mock()
+ tools_notification.root.method = "notifications/tools/list_changed"
+
+ # message_handler must return immediately even though load_tools blocks.
+ await tool.message_handler(tools_notification)
+
+ # If the handler had awaited load_tools synchronously, we would never
+ # reach this line (deadlock). Verify the reload task is pending.
+ assert len(tool._pending_reload_tasks) == 1
+
+ # Unblock the reload so the background task finishes cleanly.
+ release.set()
+ # Wait for the pending reload task(s) to complete so their done-callbacks
+ # have a chance to remove them from _pending_reload_tasks.
+ await asyncio.wait_for(asyncio.gather(*tool._pending_reload_tasks), timeout=1)
+ assert len(tool._pending_reload_tasks) == 0
+
+
+async def test_mcp_tool_message_handler_reload_failure_is_logged(caplog: pytest.LogCaptureFixture):
+ """Background reload errors are logged, not raised into the receive loop."""
+ tool = MCPStdioTool(name="test_tool", command="python")
+ tool.load_tools = AsyncMock(side_effect=RuntimeError("connection lost"))
+
+ tools_notification = Mock(spec=types.ServerNotification)
+ tools_notification.root = Mock()
+ tools_notification.root.method = "notifications/tools/list_changed"
+
+ await tool.message_handler(tools_notification)
+ # Let the background task run — it should not propagate the exception.
+ # Snapshot tasks and await them to ensure done-callbacks fire.
+ pending = list(tool._pending_reload_tasks)
+ if pending:
+ await asyncio.wait_for(asyncio.gather(*pending, return_exceptions=True), timeout=1)
+ tool.load_tools.assert_called_once()
+ assert len(tool._pending_reload_tasks) == 0
+
+ # Verify the warning was actually logged with exception info.
+ reload_warnings = [r for r in caplog.records if "Background MCP reload failed" in r.message]
+ assert len(reload_warnings) == 1
+ assert reload_warnings[0].levelname == "WARNING"
+ assert reload_warnings[0].exc_info is not None
+
+
+async def test_mcp_tool_message_handler_cancel_and_replace():
+ """Sending two notifications in quick succession cancels the first reload task."""
+ tool = MCPStdioTool(name="test_tool", command="python")
+
+ release = asyncio.Event()
+ call_count = 0
+
+ async def blocking_load_tools():
+ nonlocal call_count
+ call_count += 1
+ await release.wait()
+
+ tool.load_tools = blocking_load_tools # type: ignore[assignment]
+
+ notification = Mock(spec=types.ServerNotification)
+ notification.root = Mock()
+ notification.root.method = "notifications/tools/list_changed"
+
+ # First notification — starts a blocking reload task.
+ await tool.message_handler(notification)
+ assert len(tool._pending_reload_tasks) == 1
+ first_task = next(iter(tool._pending_reload_tasks))
+
+ # Second notification — should cancel the first and replace it.
+ await tool.message_handler(notification)
+ # Yield to the event loop so the cancellation is processed.
+ with contextlib.suppress(asyncio.CancelledError):
+ await first_task
+
+ assert first_task.cancelled()
+
+ assert len(tool._pending_reload_tasks) == 1
+ second_task = next(iter(tool._pending_reload_tasks))
+ assert second_task is not first_task
+
+ # Unblock and let the second task finish.
+ release.set()
+ await asyncio.wait_for(asyncio.gather(*tool._pending_reload_tasks), timeout=1)
+ assert len(tool._pending_reload_tasks) == 0
+
+
async def test_mcp_tool_sampling_callback_no_client():
"""Test sampling callback error path when no chat client is available."""
tool = MCPStdioTool(name="test_tool", command="python")